def test_NameToInfo_invalid_label(self):
    """NameToInfo: rejects malformed labels when strict, degrades otherwise"""
    seq_str = 'AA'

    # strict (the default): labels missing the '/' or the '-' separator
    # must raise RecordError
    for bad_label in ['U17136.1898-984', 'U17136.1/898984']:
        self.assertRaises(RecordError, NameToInfo,
                          Sequence(seq_str, Name=bad_label))

    # '/' present but one side empty: the missing part is reported as None
    no_positions = 'U17136.1/'   # missing start/end positions
    no_genbank = '/898-984'      # missing genbank id
    observed = NameToInfo(Sequence(seq_str, Name=no_positions))
    expected = Info({'GenBank': 'U17136.1', 'Start': None, 'End': None})
    self.assertEqual(observed, expected)
    observed = NameToInfo(Sequence(seq_str, Name=no_genbank))
    expected = Info({'GenBank': None, 'Start': 897, 'End': 984})
    self.assertEqual(observed, expected)

    # strict=False: no error is raised; whatever could be extracted is
    # returned and everything else is None
    no_slash = 'U17136.1898-984'
    no_dash = 'U17136.1/898984'
    observed = NameToInfo(Sequence(seq_str, Name=no_slash), strict=False)
    expected = Info({'GenBank': None, 'Start': None, 'End': None})
    self.assertEqual(observed, expected)
    observed = NameToInfo(Sequence(seq_str, Name=no_dash), strict=False)
    expected = Info({'GenBank': 'U17136.1', 'Start': None, 'End': None})
    self.assertEqual(observed, expected)
def test_single_constructor(self):
    """RdbParser should use constructors if supplied"""
    def to_dna(x, Info):
        # Sequence constructor: converts U to T before building the
        # DnaSequence. Parameter names are kept exactly as in the original
        # lambda in case the parser passes them by keyword.
        return DnaSequence(str(x).replace('U', 'T'), Info=Info)

    # custom sequence constructor only
    records = list(RdbParser(self.oneseq, to_dna))
    self.assertEqual(len(records), 1)
    record = records[0]
    self.assertEqual(record, 'AGTCATCTAGATHCATHC')
    self.assertEqual(record.Info, Info({'Species': 'H.Sapiens',
                                        'OriginalSeq': 'AGUCAUCUAGAUHCAUHC'}))

    def alternativeConstr(header_lines):
        # Header constructor: upper-cases names and values and does no
        # field-name translation.
        info = Info()
        for line in header_lines:
            name, colon, value = line.strip().partition(':')
            # skip empty lines, lines without name, lines without colon
            if not name or not colon:
                continue
            info[name.upper()] = value.strip().upper()
        return info

    # custom sequence constructor and custom header constructor
    records = list(RdbParser(self.oneseq, to_dna, alternativeConstr))
    self.assertEqual(len(records), 1)
    record = records[0]
    self.assertEqual(record, 'AGTCATCTAGATHCATHC')
    exp_info = Info({'OriginalSeq': 'AGUCAUCUAGAUHCAUHC',
                     'Refs': {}, 'SEQ': 'H.SAPIENS'})
    self.assertEqual(record.Info, exp_info)
def test_rich_label(self):
    """rich label correctly constructs label strings"""
    # two labels compare equal when their templates render to the same
    # string, even if the underlying Info objects differ
    species_only = RichLabel(Info(species="rat"), "%(species)s")
    with_extra_field = RichLabel(Info(species="rat", seq_id="xy5"),
                                 "%(species)s")
    self.assertEqual(species_only, with_extra_field)

    # the template decides which Info components appear, and in which order
    label = RichLabel(Info(species="rat", seq_id="xy5"),
                      "%(seq_id)s:%(species)s")
    self.assertEqual(label, "xy5:rat")
    label = RichLabel(Info(species="rat", seq_id="xy5"),
                      "%(species)s:%(seq_id)s")
    self.assertEqual(label, "rat:xy5")

    # Info components the template does not mention are left out of the
    # string
    label = RichLabel(Info(species="rat", seq_id="xy5"), "%(species)s")
    self.assertEqual(label, "rat")

    # ...but remain reachable through the label's Info object
    self.assertEqual(label.Info.species, "rat")
    self.assertEqual(label.Info.seq_id, "xy5")

    # a RichLabel can also be built from a plain string
    self.assertEqual(RichLabel('a'), 'a')
def test_init_data(self):
    """Info init with data should put items in correct places"""
    # Checks init, setting, and resetting of attributes that belong in the
    # Info object and attributes that belong in Info.Refs, plus
    # __getitem__, __setitem__, and __contains__.
    d = Info({'x': 3, 'GO': 12345})
    self.assertEqual(d.x, 3)
    # database references are wrapped in a list and shared with d.Refs
    self.assertEqual(d.GO, [12345])
    self.assertEqual(d.Refs.GO, [12345])

    # the Refs key is required and must not be deletable
    try:
        del d.Refs
    except AttributeError:
        pass
    else:
        # call form of raise is valid on both Python 2 and Python 3
        raise Exception("Failed to prevent deletion of required key Refs")

    # sequences assigned to a reference field become lists
    d.GenBank = ('qaz', 'wsx')
    self.assertEqual(d.GenBank, ['qaz', 'wsx'])
    self.assertContains(d.Refs, 'GenBank')
    self.assertContains(d, 'GenBank')

    # scalars assigned to a reference field become one-element lists
    d.GenBank = 'xyz'
    self.assertEqual(d.GenBank, ['xyz'])
    self.assertSameObj(d.GenBank, d.Refs.GenBank)

    d.GO = 'x'
    self.assertEqual(d.GO, ['x'])
    d.GO.append('y')
    self.assertEqual(d.GO, ['x', 'y'])

    # other attributes are stored as-is and stay out of Refs
    d.ZZZ = 'zzz'
    self.assertEqual(d.ZZZ, 'zzz')
    self.assertNotContains(d.Refs, 'ZZZ')

    # unknown attributes read as None
    self.assertNotContains(d, 'XXX')
    self.assertEqual(d.XXX, None)
def CutgSpeciesParser(infile, strict=True, constructor=CodonUsage):
    """Yields successive records from infile as CodonUsage objects.

    Arguments:
        - infile: data source handed to CutgSpeciesFinder, which is expected
          to yield (label, counts) records.
        - strict: if True (default), raises RecordError when a record has no
          counts, has an invalid species label, or has counts that cannot be
          converted. If False, such records are silently skipped.
        - constructor: callable invoked as constructor(pairs, Info=info),
          where pairs couples codon_order with the integer counts.

    Each yielded object carries an Info with 'Species' and 'NumGenes'.
    """
    if not strict:
        # best-effort mode: any record that fails to parse is skipped
        for rec in CutgSpeciesFinder(infile):
            try:
                label, counts = rec
                if not is_cutg_species_label(label):
                    continue
                species, genes = species_label_splitter(label)
                info = Info({'Species': species, 'NumGenes': int(genes)})
                freqs = constructor(zip(codon_order,
                                        map(int, counts.split())), Info=info)
            except Exception:
                # Exception rather than a bare except, so KeyboardInterrupt
                # and SystemExit still propagate
                continue
            # yield outside the try block: closing the generator raises
            # GeneratorExit at the yield, which must not be swallowed
            yield freqs
    else:
        for rec in CutgSpeciesFinder(infile):
            try:
                label, counts = rec
            except ValueError:  # can't have got any counts
                raise RecordError("Found label without sequences: %s" % rec)

            if not is_cutg_species_label(label):
                raise RecordError("Found CUTG record without label: %s" % rec)

            species, genes = species_label_splitter(label)
            info = Info({'Species': species, 'NumGenes': int(genes)})
            try:
                pairs = zip(codon_order, map(int, counts.split()))
                freqs = constructor(pairs, Info=info)
            except Exception:
                raise RecordError("Unable to convert counts: %s" % counts)
            yield freqs
def test_full(self):
    """InfoMaker should return Info object with name, value pairs"""
    # the last three entries (empty line, line without colon, line without
    # a name) contribute nothing to the expected result
    header = [
        'acc: X3402', 'abc:1', 'mty: ssu', 'seq: Mit. X3402',
        '', 'nonsense', ':no_name',
    ]
    observed = InfoMaker(header)

    # 'acc', 'mty' and 'seq' are expected under translated names;
    # 'abc' is expected unchanged
    expected = Info()
    for field, value in (('rRNA', 'X3402'),
                         ('abc', '1'),
                         ('Species', 'Mit. X3402'),
                         ('Gene', 'ssu')):
        setattr(expected, field, value)

    self.assertEqual(observed, expected)
def GroupFastaParser(data, label_to_name, group_key="Group", aligned=False,
                     moltype=ASCII, done_groups=None, DEBUG=False):
    """yields related sequences as a separate seq collection

    Arguments:
        - data: line iterable data source
        - label_to_name: LabelParser callback
        - group_key: name of group key in RichLabel.Info object
        - aligned: whether sequences are to be considered aligned
        - moltype: default is ASCII
        - done_groups: series of group keys to be excluded
        - DEBUG: if True, prints each label and the keys of each yielded
          collection

    Each yielded collection has an Info whose Group is the group key.
    Yields nothing when data contains no sequences.
    """
    done_groups = [] if done_groups is None else done_groups
    parser = MinimalFastaParser(data, label_to_name=label_to_name,
                                finder=XmfaFinder)
    group_ids = []
    current_collection = {}
    for label, seq in parser:
        seq = moltype.makeSequence(seq, Name=label, Info=label.Info)
        if DEBUG:
            # single pre-formatted string keeps the output identical on
            # Python 2 and Python 3
            print("str(label)  %s repr(label) %s" % (str(label), repr(label)))
        # NOTE(review): membership is tested against every group seen so
        # far, not only the current one, so a group that reappears later is
        # merged into whichever collection is open -- confirm intended.
        if not group_ids or label.Info[group_key] in group_ids:
            current_collection[label] = seq
            if not group_ids:
                group_ids.append(label.Info[group_key])
        else:
            # a new group starts: emit the finished one unless excluded
            if group_ids[-1] not in done_groups:
                if DEBUG:
                    print("GroupParser collection keys %s" %
                          (list(current_collection.keys()),))
                seqs = cogent.LoadSeqs(data=current_collection,
                                       moltype=moltype, aligned=aligned)
                seqs.Info = Info(Group=group_ids[-1])
                yield seqs
            current_collection = {label: seq}
            group_ids.append(label.Info[group_key])

    # emit the last group; skip it when there was no input at all or when
    # the group is excluded, matching the handling of earlier groups
    if group_ids and group_ids[-1] not in done_groups:
        seqs = cogent.LoadSeqs(data=current_collection, moltype=moltype,
                               aligned=aligned)
        seqs.Info = Info(Group=group_ids[-1])
        yield seqs
def NcbiFastaLabelParser(line):
    """Parses an NCBI FASTA label line; returns a (gi, Info) tuple.

    The line is split on '|' into five fields: an ignored leading field, the
    GI, the database tag, the database reference and the description (which
    may itself contain '|'). The Info object gets GI, Description, and the
    database reference stored under NcbiLabels[db].

    Raises RecordError if the line does not split into five fields.
    NOTE(review): a database tag missing from NcbiLabels is not converted to
    RecordError; presumably it surfaces as KeyError -- confirm with callers.

    As of 11/12/03, all records in genpept.fsa and the human RefSeq fasta
    files were consistent with this format.
    """
    info = Info()
    try:
        ignore, gi, db, db_ref, description = map(strip, line.split('|', 4))
    except ValueError:  # wrong number of fields
        raise RecordError("Unable to parse label line %s" % line)
    info.GI = gi
    info[NcbiLabels[db]] = db_ref
    info.Description = description
    return gi, info
def test_single(self):
    """RdbParser should turn a single record into one sequence with Info"""
    def check_single_record(lines, expected_seq):
        # exactly one record; the sequence and its Info must match
        records = list(RdbParser(lines))
        self.assertEqual(len(records), 1)
        record = records[0]
        self.assertEqual(record, Sequence(expected_seq))
        self.assertEqual(record.Info, Info({'Species': 'H.Sapiens',
                                            'OriginalSeq': expected_seq}))

    # record from the oneseq fixture
    check_single_record(self.oneseq, 'AGUCAUCUAGAUHCAUHC')
    # record from the multiline fixture
    check_single_record(self.multiline, 'AGUCAUUAGAUHCAUHC')
def parse_header(header_lines):
    """Return Info object from header information.

    header_lines -- list of lines or anything that behaves like it.

    Every line is examined. A line starting with 'Citation' is handled
    specially: only its last whitespace-separated token is kept, under the
    key 'Citation'. Any other line containing a colon is split on the first
    colon into "key: value", both sides stripped. Lines without a colon are
    ignored. There is no further error checking.
    """
    collected = {}
    for header_line in header_lines:
        if header_line.startswith('Citation'):
            # the citation line is parsed differently: keep the final token
            last_token = header_line.split()[-1]
            collected['Citation'] = last_token.strip()
            continue
        if ':' not in header_line:
            # no interesting header line
            continue
        try:
            key, content = map(strip, header_line.split(':', 1))
        except ValueError:
            continue
        collected[key] = content
    return Info(collected)
def test_init_empty(self):
    """Info empty init should work as expected"""
    empty = Info()
    # a fresh Info holds exactly one entry: the Refs container
    self.assertEqual(len(empty), 1)
    self.assertContains(empty, 'Refs')
    # and that container is an empty DbRefs instance
    refs = empty.Refs
    self.assertEqual(refs, DbRefs())
    self.assertTrue(isinstance(refs, DbRefs))
def setUp(self):
    """Setup for Fasta tests."""
    self.strings = ['AAAA', 'CCCC', 'gggg', 'uuuu']
    self.labels = ['1st', '2nd', '3rd', '4th']
    self.infos = ["Dog", "Cat", "Mouse", "Rat"]

    # Build real lists. On Python 3 map() returns a one-shot iterator that
    # the zip loop below would exhaust, leaving the fixtures empty.
    self.sequences_with_labels = [Sequence(s) for s in self.strings]
    self.sequences_with_names = [Sequence(s) for s in self.strings]
    for label, labelled, named in zip(self.labels,
                                      self.sequences_with_labels,
                                      self.sequences_with_names):
        labelled.Label = label
        named.Name = label

    # expected FASTA renderings
    self.fasta_no_label = '>0\nAAAA\n>1\nCCCC\n>2\ngggg\n>3\nuuuu'
    self.fasta_with_label = \
        '>1st\nAAAA\n>2nd\nCCCC\n>3rd\nGGGG\n>4th\nUUUU'
    self.fasta_with_label_lw2 = \
        '>1st\nAA\nAA\n>2nd\nCC\nCC\n>3rd\nGG\nGG\n>4th\nUU\nUU'

    # alignment fixture whose sequences carry a species in their Info
    self.alignment_dict = {
        '1st': 'AAAA',
        '2nd': 'CCCC',
        '3rd': 'GGGG',
        '4th': 'UUUU',
    }
    self.alignment_object = Alignment(self.alignment_dict)
    for label, species in zip(self.labels, self.infos):
        self.alignment_object.NamedSeqs[label].Info = Info(species=species)
    self.fasta_with_label_species = \
        '>1st:Dog\nAAAA\n>2nd:Cat\nCCCC\n>3rd:Mouse\nGGGG\n>4th:Rat\nUUUU'
    self.alignment_object.RowOrder = ['1st', '2nd', '3rd', '4th']
def alternativeConstr(header_lines):
    """Builds an Info from "name: value" lines, upper-casing both parts.

    Empty lines, lines with an empty name and lines without a colon are
    skipped. Lines are split on the first colon only.
    """
    info = Info()
    for raw_line in header_lines:
        name, colon, value = raw_line.strip().partition(':')
        if not name or not colon:
            # nothing usable on this line
            continue
        info[name.upper()] = value.strip().upper()
    return info
def test_RfamParser_single_family(self):
    """RfamParser: should work on a single family in stockholm format"""
    expected_header = Info()
    expected_alignment = {
        'K02120.1/628-682':
            'AUGGGAAAUUCCCCCUCCUAUAACCCCCCCGCUGGUAUCUCCCCCUCAGACUGGC',
        'D00647.1/629-683':
            'AUGGGAAACUCCCCCUCCUAUAACCCCCCCGCUGGCAUCUCCCCCUCAGACUGGC',
    }
    expected_structure = \
        '<<<<<<.........>>>>>>.........<<<<<<.............>>>>>>'

    # the first parsed record is a (header, alignment, structure) triple
    families = list(RfamParser(self.single_family))
    header, alignment, structure = families[0]

    self.assertEqual(header, expected_header)
    self.assertEqual(alignment, expected_alignment)
    self.assertEqual(structure, expected_structure)
def test_init_data(self):
    """Info init with data should put items in correct places"""
    # Checks init, setting, and resetting of attributes that belong in the
    # Info object and attributes that belong in Info.Refs, plus
    # __getitem__, __setitem__, and __contains__.
    d = Info({'x': 3, 'GO': 12345})
    self.assertEqual(d.x, 3)
    # database references are wrapped in a list and shared with d.Refs
    self.assertEqual(d.GO, [12345])
    self.assertEqual(d.Refs.GO, [12345])

    # the Refs key is required and must not be deletable
    try:
        del d.Refs
    except AttributeError:
        pass
    else:
        # call form of raise is valid on both Python 2 and Python 3
        raise Exception("Failed to prevent deletion of required key Refs")

    # sequences assigned to a reference field become lists
    d.GenBank = ('qaz', 'wsx')
    self.assertEqual(d.GenBank, ['qaz', 'wsx'])
    self.assertContains(d.Refs, 'GenBank')
    self.assertContains(d, 'GenBank')

    # scalars assigned to a reference field become one-element lists
    d.GenBank = 'xyz'
    self.assertEqual(d.GenBank, ['xyz'])
    self.assertSameObj(d.GenBank, d.Refs.GenBank)

    d.GO = 'x'
    self.assertEqual(d.GO, ['x'])
    d.GO.append('y')
    self.assertEqual(d.GO, ['x', 'y'])

    # other attributes are stored as-is and stay out of Refs
    d.ZZZ = 'zzz'
    self.assertEqual(d.ZZZ, 'zzz')
    self.assertNotContains(d.Refs, 'ZZZ')

    # unknown attributes read as None
    self.assertNotContains(d, 'XXX')
    self.assertEqual(d.XXX, None)
def test_full(self):
    """RdbParser: full data, valid and invalid"""
    expected_first = RnaSequence(
        "-??GG-UGAA--CGCU---ACGU-N???---",
        Info=Info({'Species': "unidentified Thermus OPB AF027020",
                   'Refs': {'rRNA': ['AF027020']},
                   'OriginalSeq': '-o[oGG-U{G}AA--C^GC]U---ACGU-Nooo---'}))
    expected_second = RnaSequence(
        "---CGAUCG--UAUACG-N???-",
        Info=Info({'Species': 'Thermus silvanus X84211',
                   'Refs': {'rRNA': ['X84211']},
                   'OriginalSeq': '---CGAU[C(G){--UA}U]ACG-Nooo-'}))

    def check_record(observed, expected):
        # same sequence object, same string form, same Info
        self.assertEqual(observed, expected)
        self.assertEqual(str(observed), str(expected))
        self.assertEqual(observed.Info, expected.Info)

    good_lines = RDB_LINES_ONLY_GOOD.split('\n')
    mixed_lines = RDB_LINES_GOOD_BAD.split('\n')

    # with only good records the result does not depend on strict
    parsed = list(RdbParser(good_lines, strict=True))
    self.assertEqual(len(parsed), 2)
    check_record(parsed[0], expected_first)
    check_record(parsed[1], expected_second)

    parsed = list(RdbParser(good_lines, strict=False))
    self.assertEqual(len(parsed), 2)
    check_record(parsed[0], expected_first)

    # strict: an invalid record raises RecordError once the parser is consumed
    strict_parser = RdbParser(mixed_lines, strict=True)
    self.assertRaises(RecordError, list, strict_parser)

    # not strict: the invalid record is skipped, the good ones survive
    parsed = list(RdbParser(mixed_lines, strict=False))
    self.assertEqual(len(parsed), 2)
    check_record(parsed[0], expected_first)
    check_record(parsed[1], expected_second)
def test_multiple_constructor_bad(self):
    """RdbParser should complain or skip bad records w/ constructor"""
    def dnastrict(x, **kwargs):
        # sequence constructor that reports any conversion failure as a
        # RecordError, the error type the assertions below expect
        try:
            return DnaSequence(x, **kwargs)
        except Exception:
            # call form of raise is valid on both Python 2 and Python 3
            raise RecordError("Could not convert sequence")

    # strict (default): the bad record makes the parser raise
    self.assertRaises(RecordError, list, RdbParser(self.oneX, dnastrict))

    # not strict: the bad record is dropped, the two good ones remain
    records = list(RdbParser(self.oneX, dnastrict, strict=False))
    self.assertEqual(len(records), 2)
    first, second = records

    self.assertEqual(first, 'ACT')
    self.assertEqual(first.Info,
                     Info({'Species': 'mit', 'OriginalSeq': 'ACT'}))
    self.assertEqual(second, 'AAA')
    self.assertEqual(second.Info,
                     Info({'Species': 'pla', 'OriginalSeq': 'AAA'}))
def call(label):
    """Converts a raw label string into a RichLabel.

    A leading '>' is dropped and the label is split with sep. For each
    (index, name, converter) entry of field_formatters, the field at that
    index is stored in an Info under name, passed through converter when
    converter is callable.

    Raises IndexError, naming the property and index, when a converted
    field is missing from the split label.
    """
    # drop the FASTA marker if present
    label = label[1:] if label[0] == ">" else label
    label = sep.split(label)
    if DEBUG:
        print(label)
    info = Info()
    for index, name, converter in field_formatters:
        # builtin callable() replaces collections.Callable, which was
        # removed from the collections namespace in Python 3.10
        if callable(converter):
            try:
                info[name] = converter(label[index])
            except IndexError:
                raise IndexError(
                    'parsing label %s failed for property %s at index %s'
                    % (label, name, index))
        else:
            info[name] = label[index]
    return RichLabel(info, display_template)
def InfoMaker(header_lines):
    """Returns an Info object constructed from the header lines.

    Each line is split on its first colon into a field name and a value.
    Names found in _field_names are replaced by their mapped name, others
    are kept as they are. Values are stripped. Empty lines, lines with an
    empty name and lines without a colon are skipped.
    """
    info = Info()
    for raw_line in header_lines:
        key, colon, rest = raw_line.strip().partition(':')
        if not key or not colon:
            # nothing usable on this line
            continue
        try:
            field = _field_names[key]
        except KeyError:
            field = key
        info[field] = rest.strip()
    return info
def call(label):
    """Converts a raw label string into a RichLabel.

    A leading '>' is dropped and the label is split with sep. For each
    (index, name, converter) entry of field_formatters, the field at that
    index is stored in an Info under name, passed through converter when
    converter is callable.

    If a converted field is missing from the split label, the label, index
    and name are printed and the IndexError is re-raised.
    """
    # drop the FASTA marker if present
    label = label[1:] if label[0] == ">" else label
    label = sep.split(label)
    if DEBUG:
        print(label)
    info = Info()
    for index, name, converter in field_formatters:
        if callable(converter):
            try:
                info[name] = converter(label[index])
            except IndexError:
                # single pre-formatted string keeps the output identical on
                # Python 2 and Python 3
                print("%s %s %s" % (label, index, name))
                raise
        else:
            info[name] = label[index]
    return RichLabel(info, display_template)
def InfoFromLabel(line):
    """Takes a CUTG codon description line and returns an Info object.

    The line is split on backslashes. The first field holds the locus
    (after a leading '>') and optionally '#<cds number>'; the CDS number
    defaults to '1'. The remaining fields are paired with field_order. The
    'Description' field is then split on '/' and every key=value item in it
    is added to the result; db_xref items are stored under the database
    name they reference.

    Raises RecordError if wrong number of fields etc.
    """
    try:
        raw_fields = line.split('\\')
        result = Info(dict(zip(field_order, map(strip, raw_fields[1:]))))

        # extra processing for first field
        first = raw_fields[0]
        if '#' in first:
            locus, cds_num = map(strip, first.split('#'))
        else:
            locus, cds_num = first, '1'
        result['Locus'] = locus[1:]  # remove leading '>'
        result['CdsNumber'] = cds_num

        # additional processing for last field: mostly key="value" pairs
        for item in result['Description'].split('/'):
            if '=' not in item:
                continue
            # split once only: there might be '=' in the value
            key, val = map(strip, item.split('=', 1))
            # cut off leading and trailing " if present, but _not_ internal!
            if val.startswith('"'):
                val = val[1:]
            if val.endswith('"'):
                val = val[:-1]
            if key == 'db_xref':  # handle cross-refs specially
                try:
                    key, val = val.split(':')
                except ValueError:  # missing actual reference?
                    continue  # just skip the bad db records
                try:
                    if result[key]:
                        result[key].append(val)
                    else:
                        result[key] = [val]
                except (KeyError, TypeError):  # didn't recognize database
                    result[key] = val
            else:
                # remember to convert the key to MixedCase naming convention
                result[cfu(key)] = val
        return result
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not disguised as parse errors
        raise RecordError("Failed to read label line:\n%s" % line)
def test_full(self):
    """InfoMaker should return Info object with name, value pairs"""
    usable_lines = ['acc: X3402', 'abc:1', 'mty: ssu', 'seq: Mit. X3402']
    # empty line, line without colon, line without a name: none of these
    # appears in the expected result
    ignored_lines = ['', 'nonsense', ':no_name']
    observed = InfoMaker(usable_lines + ignored_lines)

    # 'acc', 'mty' and 'seq' are expected under translated names;
    # 'abc' is expected unchanged
    expected = Info()
    expected.rRNA = 'X3402'
    expected.abc = '1'
    expected.Species = 'Mit. X3402'
    expected.Gene = 'ssu'

    self.assertEqual(observed, expected)
def NameToInfo(sequence, strict=True):
    """Returns an Info object constructed from the sequence Name

    sequence: Sequence object with a Name attribute
    strict: if True (default), raises RecordError when the label cannot be
        split into genbank id and positions, or the positions cannot be
        split into start and end. If False, the parts that could not be
        extracted are set to None instead.

    The label is expected to look like '<genbank id>/<start>-<end>'. It is
    split on the first '/' and the positions on the first '-'. Empty parts
    become None. The start coordinate is shifted down by one, since in
    Python the first position is 0.

    NOTE(review): non-numeric coordinates still raise ValueError from int()
    in both modes; confirm whether callers want RecordError there.
    """
    label = sequence.Name
    try:
        gb, pos = label.split('/', 1)  # split genbank label and pos
    except Exception:
        # no '/' in the label, or a Name that cannot be split at all;
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit still propagate
        if strict:
            raise RecordError(
                "Failed to extract genbank id and positions from label %s"
                % label)
        gb = None
        pos = None
    else:
        gb = gb or None
        pos = pos or None

    start = None
    end = None
    if pos:
        try:
            start, end = pos.split('-', 1)  # split start and end pos
        except ValueError:  # no '-' in the position part
            if strict:
                raise RecordError(
                    "Failed to extract genbank id and positions from label %s"
                    % label)
            start = None
            end = None

    # An empty coordinate (e.g. '898-') is normalised to None, in line with
    # the handling of an empty genbank id; start is shifted to 0-based.
    start = int(start) - 1 if start else None
    end = int(end) if end else None

    return Info({'GenBank': gb, 'Start': start, 'End': end})
def HeaderToInfo(header, strict=True):
    """Returns an Info object constructed from the header lines.

    header: list of lines that contain header information. Blank lines are
        skipped. Every other line must have the form
        '#=GF <two-letter label> <content>'.
    strict: if True (default), a line not in that form raises RecordError;
        if False, such lines are skipped.

    Labels BM, DR, RM and CC can occur several times and are collected in a
    list; any other label keeps a single value (the last one seen). Labels
    are then renamed through _field_names where a mapping exists, and the
    values of the field named 'Comment' are joined by ' ' into one string.
    """
    multi_value_labels = ('BM', 'DR', 'RM', 'CC')

    # first pass: collect the raw label -> content information
    initial_info = {}
    for line in header:
        line = line.strip()
        if not line:
            continue
        fields = line.split(' ', 2)
        well_formed = (len(fields) == 3 and fields[0] == '#=GF'
                       and len(fields[1]) == 2)
        if not well_formed:
            if strict:
                raise RecordError(
                    "Failed to extract label and content information "
                    "from line %s" % (line))
            continue
        label = fields[1]
        content = fields[2].strip()
        if label in multi_value_labels:
            initial_info.setdefault(label, []).append(content)
        else:
            initial_info[label] = content

    # second pass: rename the labels and join the comment lines
    final_info = {}
    for key, value in initial_info.items():
        name = _field_names.get(key, key)
        if name == 'Comment':
            value = ' '.join(value)
        final_info[name] = value

    return Info(final_info)
def TreeAlign(model, seqs, tree=None, indel_rate=0.01, indel_length=0.01,
              ui=None, ests_from_pairwise=True, param_vals=None):
    """Returns a multiple alignment and tree.

    Uses the provided substitution model and a tree for determining the
    progressive order. If a tree is not provided a Neighbour Joining tree is
    constructed from pairwise distances estimated from pairwise aligning the
    sequences. If running in parallel, only the distance estimation is
    parallelised and only the master CPU returns the alignment and tree,
    other CPU's return None, None.

    Arguments:
        - model: a substitution model
        - seqs: a sequence collection, or a dict keyed by sequence name
        - tree: optional tree; its tip names must match the sequence names
        - indel_rate, indel_length: parameters for the progressive pair-HMM
        - ui: optional progress reporter with a display(message) method;
          when None no progress message is shown
        - ests_from_pairwise: if no tree provided and True, the median value
          of the substitution model parameters are used
        - param_vals: named key, value pairs for model parameters. These
          override ests_from_pairwise.

    The returned alignment carries an Info whose "AlignParams" entry holds
    the parameter values used, including indel_length and indel_rate.
    """
    _exclude_params = ['mprobs', 'rate', 'bin_switch']
    # work on a copy so the caller's mapping is never modified
    param_vals = dict(param_vals) if param_vals else {}

    if isinstance(seqs, dict):
        seq_names = list(seqs.keys())
    else:
        seq_names = seqs.getSeqNames()

    two_seqs = len(seq_names) == 2

    if tree:
        # sorted() copies, so the lists handed out by tree/seqs stay intact
        tip_names = sorted(tree.getTipNames())
        seq_names = sorted(seq_names)
        assert tip_names == seq_names, \
            "names don't match between seqs and tree: tree=%s; seqs=%s" % \
            (tip_names, seq_names)
        ests_from_pairwise = False
    elif two_seqs:
        # use the names gathered above: seqs may be a plain dict, which has
        # no getSeqNames()
        tree = LoadTree(tip_names=seq_names)
        ests_from_pairwise = False
    else:
        if ests_from_pairwise:
            est_params = [param for param in model.getParamList()
                          if param not in _exclude_params]
        else:
            est_params = None

        dcalc = EstimateDistances(seqs, model, do_pair_align=True,
                                  est_params=est_params)
        dcalc.run()
        dists = dcalc.getPairwiseDistances()
        tree = NJ.nj(dists)

    LF = model.makeLikelihoodFunction(tree.bifurcating(name_unnamed=True),
                                      aligned=False)
    if ests_from_pairwise and not param_vals:
        # we use the Median to avoid the influence of outlier pairs
        param_vals = {}
        for param in est_params:
            numbers = dcalc.getParamValues(param)
            print("Param Estimate Summary Stats: %s" % param)
            print(numbers.summarize())
            param_vals[param] = numbers.Median

    # ui defaults to None, so only report progress when one was supplied
    if ui is not None:
        ui.display("Doing %s alignment" %
                   ["progressive", "pairwise"][two_seqs])

    with LF.updatesPostponed():
        for param, val in param_vals.items():
            LF.setParamRule(param, value=val, is_constant=True)
        LF.setParamRule('indel_rate', value=indel_rate, is_constant=True)
        LF.setParamRule('indel_length', value=indel_length, is_constant=True)
        LF.setSequences(seqs)

    edge = LF.getLogLikelihood().edge
    align = edge.getViterbiPath().getAlignment()
    info = Info()
    info["AlignParams"] = param_vals
    info["AlignParams"].update(dict(indel_length=indel_length,
                                    indel_rate=indel_rate))
    align.Info = info
    return align, tree
def test_empty(self):
    """InfoMaker: should return empty Info from empty header"""
    # no header lines in, nothing but a default Info out
    no_lines = []
    observed = InfoMaker(no_lines)
    expected = Info()
    self.assertEqual(observed, expected)
def test_identity(self):
    """Info should get its own new Refs when created"""
    first = Info()
    second = Info()
    # two separately created objects...
    self.assertNotSameObj(first, second)
    # ...which must not share one Refs container
    self.assertNotSameObj(first.Refs, second.Refs)
def RichGenbankParser(handle, info_excludes=None, moltype=None,
                      skip_contigs=False):
    """Yields (locus, annotated sequence) pairs from a GenBank formatted file.

    Arguments:
        - handle: data source passed to MinimalGenbankParser
        - info_excludes: a series of fields to be excluded from the Info
          object
        - moltype: a MolType instance, such as PROTEIN, DNA. Default is
          ASCII. A record whose mol_type is 'protein' or 'DNA' uses PROTEIN
          or DNA instead, for that record only.
        - skip_contigs: ignores records with no actual sequence data,
          typically a genomic contig. When False, such records are yielded
          as (locus, contig), (locus, WGS) or (locus, None).

    Every feature with a location, other than 'source' and 'organism'
    features, is added to the sequence as an annotation, named after the
    first of its gene, note, product or clone fields.
    """
    info_excludes = info_excludes or []
    default_moltype = moltype or ASCII
    try:
        string_types = basestring  # Python 2
    except NameError:
        string_types = str  # Python 3
    for rec in MinimalGenbankParser(handle):
        info = Info()
        # populate the Info object, excluding the sequence
        for label, value in rec.items():
            if label in info_excludes:
                continue
            info[label] = value

        # Choose the moltype per record. Rebinding the moltype argument here
        # would make one record's override leak into every later record.
        if rec['mol_type'] == 'protein':  # which it doesn't for genbank
            rec_moltype = PROTEIN
        elif rec['mol_type'] == 'DNA':
            rec_moltype = DNA
        else:
            rec_moltype = default_moltype

        try:
            seq = rec_moltype.makeSequence(rec['sequence'].upper(),
                                           Info=info, Name=rec['locus'])
        except KeyError:
            # no sequence data in this record
            if not skip_contigs:
                if 'contig' in rec:
                    yield rec['locus'], rec['contig']
                elif 'WGS' in rec:
                    yield rec['locus'], rec['WGS']
                else:
                    yield rec['locus'], None
            continue

        for feature in rec['features']:
            if feature['location'] is None or \
                    feature['type'] in ['source', 'organism']:
                continue
            spans = []
            on_reverse = None
            for location in feature['location']:
                lo, hi = location.first() - 1, location.last()
                # the asserts require all locations of one feature to lie
                # on the same strand
                if location.Strand == -1:
                    lo, hi = hi, lo
                    assert on_reverse is not False
                    on_reverse = True
                else:
                    assert on_reverse is not True
                    on_reverse = False
                # ensure we don't put in a span that starts beyond the
                # sequence
                if lo > len(seq):
                    continue
                # or that's longer than the sequence
                hi = min(hi, len(seq))
                spans.append((lo, hi))

            if on_reverse:
                spans.reverse()

            # name the feature after the first identifying field it has
            for id_field in ['gene', 'note', 'product', 'clone']:
                if id_field in feature:
                    name = feature[id_field]
                    if not isinstance(name, string_types):
                        name = ' '.join(name)
                    break
            else:
                name = None
            seq.addAnnotation(Feature, feature['type'], name, spans)
        yield (rec['locus'], seq)