def read(strin, degap=False, style=None, preprocess=None):
    """
    Parse fasta-format text and return a list of Seq objects.

    * strin - one or more fasta format sequences. The first
      non-whitespace character must be >.
    * degap (bool) - if True, Non-alphanumeric characters are removed
      (the sequence is filtered with removeAllButAlpha); otherwise
      only removeWhitespace is applied
    * style - None (no change to input), 'upper', or 'lower'
    * preprocess - None or a function accepting a single argument
      "seqstr" for modifying the input sequence string
      arbitrarily. It receives the raw text following the header
      line (line breaks included). If a function is provided, degap
      and style are ignored.

    Each record is returned as Seq(name, seq, header), where name is
    the first whitespace-delimited word of the header line and header
    is the remainder of that line ('' if there is none).

    Raises FastaFormatError if a record consists of a single line
    only, or if the number of ">" characters in strin differs from
    the number of sequences parsed.
    """

    gt_count = strin.count('>')

    # Whatever precedes the first '>' is discarded; for well-formed
    # input this is just the empty string.
    records = strin.strip().split('>')[1:]

    seqlist = []
    for f in records:
        try:
            firstline, rawseq = f.strip().split('\n', 1)
        except ValueError:
            raise FastaFormatError(
                'The input fasta sequence appears to be improperly formatted (characters 1-50):\n%s'
                % repr(f[:50]))

        try:
            name, header = firstline.split(None, 1)
        except ValueError:
            # header line holds the name only
            name, header = firstline.strip(), ''

        if preprocess is None:
            if degap:
                seq = removeAllButAlpha(rawseq)
            else:
                seq = removeWhitespace(rawseq)

            if style == 'upper':
                seq = seq.upper()
            elif style == 'lower':
                seq = seq.lower()
        else:
            # The callback must be given the raw sequence text: no
            # 'seq' has been computed for this record at this point.
            seq = preprocess(rawseq)

        seqlist.append(Seq(name, seq, header))

    # sanity check: a stray '>' inside a header or sequence splits a
    # record in two and shows up as a count mismatch
    if gt_count != len(seqlist):
        msg = 'input string contained %s ">" characters but only %s sequences were found '
        raise FastaFormatError(msg % (gt_count, len(seqlist)))

    return seqlist
def _writeEMBLSeq(seqStr, linelength):
    """
    Return seqStr formatted as an 'SQ' sequence section.

    * seqStr - sequence string; it is filtered through
      removeAllButAlpha before being written
    * linelength - total width of each sequence line, including the
      5-character left margin

    The result is an 'SQ' header line reporting the filtered sequence
    length, followed by the sequence broken into indented chunks of
    (linelength - 5) characters and a closing '//' line, all joined
    with os.linesep.
    """

    margin = 5
    residues = removeAllButAlpha(seqStr)
    width = linelength - margin
    indent = ' ' * margin

    rows = ['SQ sequence length %s;' % len(residues)]
    rows.extend(indent + residues[start:start + width]
                for start in range(0, len(residues), width))
    rows.append('//')

    return os.linesep.join(rows)
def read(strin, degap=False, style=None):
    """
    Parse fasta-format text and return a list of Seq objects.

    * strin - one or more fasta format sequences. The string must
      begin with > (checked with an assertion).
    * degap (bool) - if True, Non-alphanumeric characters are removed
      (the raw sequence is filtered with removeAllButAlpha)
    * style - None (no change to input), 'upper', or 'lower'

    Unless degap is set, every character of the sequence other than a
    letter or '-' is replaced with '-'. Line breaks and other
    whitespace are dropped before that substitution so that a sequence
    wrapped over several lines does not gain a gap character at each
    line break.

    Each record is returned as Seq(name, seq, header), where name is
    the first whitespace-delimited word of the header line and header
    is the remainder of that line ('' if there is none). Empty
    records are skipped.

    Raises FastaFormatError if a record consists of a single line
    only.
    """

    # NOTE(review): assert statements are skipped under "python -O";
    # kept as-is so callers continue to see AssertionError here.
    assert strin.startswith('>')

    flist = strin[1:].split('\n>')

    seqlist = []
    for f in flist:
        if f.strip() == '':
            continue

        try:
            firstline, rawseq = f.strip().split('\n', 1)
        except ValueError:
            raise FastaFormatError(
                'The input fasta sequence appears to be improperly formatted (characters 1-50):\n%s'
                % repr(f[:50]))

        try:
            name, header = firstline.split(None, 1)
        except ValueError:
            # header line holds the name only
            name, header = firstline.strip(), ''

        if degap:
            seq = removeAllButAlpha(rawseq)
        else:
            # Strip whitespace first: rawseq still contains the line
            # breaks of a wrapped sequence, and those must not be
            # turned into '-' by the substitution below.
            seq = re.sub(r'[^a-zA-Z-]', '-', ''.join(rawseq.split()))

        if style == 'upper':
            seq = seq.upper()
        elif style == 'lower':
            seq = seq.lower()

        seqlist.append(Seq(name, seq, header))

    return seqlist
def read(input, degap=False, case=None, keep_struct=True, keep_ref=True, check_duplicates=True):
    """
    Parse a stockholm format sequence alignment.

    * input - filename or string containing stockholm format sequence alignment
      (input is treated as a filename only if it is shorter than 50
      characters and names an existing path)
    * degap (bool) - if True, Non-alphanumeric characters are removed
    * case - specify "upper" or "lower" to force sequences into either
    * keep_struct - keep structural model (#=GC SS_cons element)
    * keep_ref - keep reference sequence (#=GC RF element)
    * check_duplicates - if True, verify that every name occurs on the
      same number of lines

    Raise a ValueError if sequence names are not unique (only when
    check_duplicates is true).

    return a list of Seq objects
    """

    if len(input) < 50 and os.access(input, os.F_OK):
        # read everything up front so the file is closed promptly
        with open(input) as infile:
            lines = infile.readlines()
    else:
        lines = input.splitlines()

    seqdata = {}
    names = []  # names in order of first appearance
    linecounts = defaultdict(int)
    for line in lines:
        if not line.strip():
            continue
        elif line.startswith("#=GC"):
            _, name, seqstr = line.split()
        elif line.startswith("#") or line.startswith("//"):
            continue
        else:
            name, seqstr = line.split()

        linecounts[name] += 1
        if name not in seqdata:
            names.append(name)
        # a name seen again has its new fragment appended to what
        # has been accumulated so far
        seqdata[name] = seqdata.get(name, "") + seqstr

    if check_duplicates:
        lcset = set(linecounts.values())
        # all names should have the same number of lines
        if len(lcset) > 1:
            log.error("The following sequence names are not unique:")
            expected = min(lcset)
            not_unique = []
            for name, count in linecounts.items():
                if count != expected:
                    # floor division keeps the reported count an integer
                    log.error("%s appears %s times" % (name, count // expected))
                    not_unique.append(name)
            msg = "The following sequence names are not unique: %s" % ",".join(not_unique)
            raise ValueError(msg)

    seqlist = []
    for name in names:
        seq = seqdata[name]
        if name == "SS_cons":
            # kept verbatim (no character substitution) when retained
            if not keep_struct:
                continue
        elif name == "RF" and not keep_ref:
            continue
        else:
            # everything else, including a retained RF line, has
            # characters other than letters and '-' replaced by '-'
            seq = re.sub(r"[^a-zA-Z-]", "-", seq)

        if degap:
            seq = removeAllButAlpha(seq)

        if case == "upper":
            seq = seq.upper()
        elif case == "lower":
            seq = seq.lower()

        seqlist.append(Seq(name, seq))

    log.info("writing %s of %s sequences" % (len(seqlist), len(names)))
    return seqlist
def read(input, namefun=lambda d: d['ACCESSION'][0].split()[0], keep_origin=False):
    """
    Parse Genbank format sequence records.

    * input - filename or a string containing Genbank format
      sequence records (input is treated as a filename only if it
      contains no newline and names an existing path)
    * namefun - a function that operates on the dict contained in the
      data attribute (see below) to generate a string to be used as the
      sequence name.
    * keep_origin - if True, retains 'ORIGIN' element in data (the raw
      sequence string)

    return a generator of gbSeq objects, with sequence name set
    according to 'namefun'. When input is a file, the file is closed
    once the generator is exhausted or closed.

    The data attribute of each seq object contains all of the data
    from the genbank record represented as nested dicts and tuples::

        {'ACCESSION': ('AB512777', {}),
         'AUTHORS': ('Watanabe,K., Chao,S.-H., Sasamoto,M., Kudo,Y. and Fujimoto,J.', {}),
         'AUTHORS-1': ('Watanabe,K., Chao,S.-H. and Fujimoto,J.', {}),
         'DEFINITION': ('Lactobacillus hammesii gene for 16S rRNA, partial sequence, strain: YIT 12110.', {}),
         'FEATURES': ('Location/Qualifiers',
                      {'rRNA': ('<1..>1553',
                                {'product': ('16S ribosomal RNA', {})}),
                       'source': ('1..1553',
                                  {'db_xref': ('taxon:267633', {}),
                                   'mol_type': ('genomic DNA', {}),
                                   'note': ('type strain of Lactobacillus hammesii', {}),
                                   'organism': ('Lactobacillus hammesii', {}),
                                   'strain': ('YIT 12110', {})})}),
         'JOURNAL': ('Unpublished', {}),
         'JOURNAL-1': ('Submitted (15-JUL-2009) Contact:Koichi Watanabe Yakult Central Institute for Microbiological Research, Culture Collection and Microbial Systematics; 1796 Yaho, Kunitachi, Tokyo 186-8650, Japan', {}),
         'KEYWORDS': ('.', {}),
         'LOCUS': ('AB512777 1553 bp DNA linear BCT 17-SEP-2009', {}),
         'ORGANISM': ('Lactobacillus hammesii Bacteria; Firmicutes; Lactobacillales; Lactobacillaceae; Lactobacillus.', {}),
         'REFERENCE': ('1', {}),
         'REFERENCE-1': ('2 (bases 1 to 1553)', {}),
         'SOURCE': ('Lactobacillus hammesii', {}),
         'TITLE': ('Novel Lactobacillus species isolated from stinky tofu brine', {}),
         'TITLE-1': ('Direct Submission', {}),
         'VERSION': ('AB512777.1 GI:258612363', {})}
    """

    seqdelim = r'//'
    leadingblank = ' '*10

    handle = None
    if input.find('\n') == -1 and os.access(input, os.F_OK):
        handle = open(input)
        lines = handle
    else:
        lines = input.splitlines()

    # record is a list of top-level entries; each entry is a list
    # beginning with [key, val] and followed by continuation lines,
    # [qualifier, value] pairs and nested sub-entries. addto always
    # points at the list currently being filled.
    record = []
    addto = None
    try:
        for line in lines:
            line = line.rstrip()
            if not line:
                continue

            if line.startswith(leadingblank):
                # continuation of the current entry
                line = line.strip()
                if line.startswith(r'/'):
                    # qualifier of the form /key="value" or a bare /key
                    if '="' in line:
                        k, v = line[1:].split('=', 1)
                        v = v.strip('" ')
                    else:
                        k, v = line[1:].strip(), ''
                    addto.append([k, v])
                else:
                    addto.append(line)
            else:
                try:
                    key, val = line.split(None, 1)
                except ValueError:
                    key, val = line, ''

                if line.strip() == seqdelim:
                    # end of record: emit it and start collecting anew
                    d = _as_dict(record)
                    seqstr = removeAllButAlpha(d['ORIGIN'][0])
                    if not keep_origin:
                        del d['ORIGIN']
                    yield gbSeq(name=namefun(d), seq=seqstr, data=d)
                    record = []

                if key.isupper():
                    # an all-uppercase key opens a new top-level entry
                    record.append([])
                    addto = record[-1]
                elif key[0].islower():
                    # a lowercase key opens a sub-entry of the most
                    # recent top-level entry
                    record[-1].append([])
                    addto = record[-1][-1]

                # Keys matching neither test (e.g. the position
                # numbers starting sequence lines) extend the current
                # entry.
                # NOTE(review): the '//' delimiter line also reaches
                # this point and is appended to the list left over
                # from the record just emitted.
                addto.extend([key, val])
    finally:
        # runs when the generator is exhausted or closed, so the
        # file opened above is not leaked
        if handle is not None:
            handle.close()