def _make_position(location_string, offset=0):
    """Convert a Swiss location token into a SeqFeature position (PRIVATE).

    ``offset`` is added to the parsed number; an offset of -1 is used with a
    start location to make it pythonic.  The shifted value is never allowed
    to drop below zero.

    Accepted forms:

    * ``"?"`` alone gives an UnknownPosition.
    * Anything ``int()`` accepts gives an ExactPosition.
    * A number prefixed with ``"<"``, ``">"`` or ``"?"`` gives a
      BeforePosition, AfterPosition or UncertainPosition respectively.

    Anything else raises NotImplementedError.
    """
    if location_string == "?":
        return SeqFeature.UnknownPosition()

    def shifted(text):
        # Hack so that feature from 0 to 0 becomes 0 to 0, not -1 to 0.
        return max(0, offset + int(text))

    try:
        return SeqFeature.ExactPosition(shifted(location_string))
    except ValueError:
        # Not a plain number; try the prefixed (fuzzy) forms below.
        pass

    prefixed_kinds = (
        ("<", "BeforePosition"),
        (">", "AfterPosition"),  # e.g. ">13"
        ("?", "UncertainPosition"),  # e.g. "?22"
    )
    for prefix, class_name in prefixed_kinds:
        if location_string.startswith(prefix):
            try:
                return getattr(SeqFeature, class_name)(
                    shifted(location_string[1:]))
            except ValueError:
                # Known prefix, but what follows it is not a number.
                break
    raise NotImplementedError("Cannot parse location '%s'" % location_string)
def _parse_feature(element):
    """Build a SeqFeature from a feature element and store it on the record.

    Every XML attribute of ``element`` is copied into the feature's
    qualifiers; the ``type`` attribute (default ``''``) becomes the feature
    type and the ``id`` attribute, when present, becomes the feature id.

    Child elements are handled as follows:

    * A ``location`` child supplies the feature location.  If it contains a
      ``position`` element, that single element yields both start and end;
      otherwise the first ``begin`` and first ``end`` elements are used.
      The start is parsed with an offset of -1.
    * Any other child is stored as a qualifier keyed by its tag with the
      ``NS`` prefix removed; children that cannot be handled are skipped.

    The finished feature is appended to ``self.ParsedSeqRecord.features``.
    ``self``, ``NS`` and ``_parse_position`` are not defined here and must
    be provided by the surrounding scope.

    Raises IndexError if a ``location`` child has neither ``position`` nor
    both ``begin`` and ``end`` children.
    """
    feature = SeqFeature.SeqFeature()
    for k, v in element.attrib.items():
        feature.qualifiers[k] = v
    feature.type = element.attrib.get('type', '')
    if 'id' in element.attrib:
        feature.id = element.attrib['id']
    for feature_element in element:
        if feature_element.tag == NS + 'location':
            position_elements = feature_element.findall(NS + 'position')
            # Distinct local names are used below so that the ``element``
            # parameter being iterated over is never rebound.
            if position_elements:
                position_element = position_elements[0]
                start_position = _parse_position(position_element, -1)
                end_position = _parse_position(position_element)
            else:
                begin_element = feature_element.findall(NS + 'begin')[0]
                start_position = _parse_position(begin_element, -1)
                end_element = feature_element.findall(NS + 'end')[0]
                end_position = _parse_position(end_element)
            feature.location = SeqFeature.FeatureLocation(start_position,
                                                          end_position)
        else:
            try:
                qualifier_name = feature_element.tag.replace(NS, '')
                feature.qualifiers[qualifier_name] = feature_element.text
            except Exception:
                # Best effort: skip unparsable tag.  ``Exception`` rather
                # than a bare ``except`` so that KeyboardInterrupt and
                # SystemExit are no longer swallowed.
                pass
    self.ParsedSeqRecord.features.append(feature)
def _make_seqfeature(name, from_res, to_res, description, ft_id):
    """Assemble a SeqFeature from one parsed feature entry (PRIVATE).

    ``from_res`` and ``to_res`` are converted with ``_make_position``; the
    start uses an offset of -1 and the end an offset of 0.  ``name`` becomes
    the feature type and ``description`` is stored under the
    ``"description"`` qualifier.  A false ``ft_id`` is replaced by
    ``"<unknown id>"``.
    """
    location = SeqFeature.FeatureLocation(
        _make_position(from_res, -1),
        _make_position(to_res, 0),
    )
    # "<unknown id>" is the default in the SeqFeature object.
    feature_id = ft_id or "<unknown id>"
    qualifiers = {"description": description}
    return SeqFeature.SeqFeature(location, type=name, id=feature_id,
                                 qualifiers=qualifiers)
def _parse_reference(element):
    """Convert a reference element into a SeqFeature.Reference annotation.

    Child elements are processed as follows:

    * ``citation``: its attributes give the publication type (with
      ``' to the ' + db`` appended for submissions), journal name, date,
      volume and first/last page.  Its children give the title, the author
      names and database cross references.  Each cross reference is also
      appended to ``self.ParsedSeqRecord.dbxrefs`` as ``"type:id"``, and
      PubMed / MEDLINE identifiers are stored on the reference.
    * ``scope``: the text is collected into the comment.
    * ``source``: the text of each ``tissue`` child is collected into the
      comment.

    The finished reference is handed to
    ``append_to_annotations('references', reference)``.  ``self``, ``NS``,
    ``REFERENCE_JOURNAL`` and ``append_to_annotations`` are not defined here
    and must be provided by the surrounding scope.

    Raises KeyError if a citation lacks a ``type`` attribute, a submission
    lacks ``db``, an author entry lacks ``name``, or a dbReference lacks
    ``type`` or ``id``.
    """
    reference = SeqFeature.Reference()
    authors = []
    scopes = []
    tissues = []
    journal_name = ''
    pub_type = ''
    pub_date = ''
    # Initialised up front so the journal formatting below never depends on
    # a citation element having been seen.
    j_volume = ''
    j_first = ''
    j_last = ''
    for ref_element in element:
        if ref_element.tag == NS + 'citation':
            pub_type = ref_element.attrib['type']
            if pub_type == 'submission':
                pub_type += ' to the ' + ref_element.attrib['db']
            if 'name' in ref_element.attrib:
                journal_name = ref_element.attrib['name']
            pub_date = ref_element.attrib.get('date', '')
            j_volume = ref_element.attrib.get('volume', '')
            j_first = ref_element.attrib.get('first', '')
            j_last = ref_element.attrib.get('last', '')
            for cit_element in ref_element:
                if cit_element.tag == NS + 'title':
                    reference.title = cit_element.text
                elif cit_element.tag == NS + 'authorList':
                    for person_element in cit_element:
                        authors.append(person_element.attrib['name'])
                elif cit_element.tag == NS + 'dbReference':
                    db_type = cit_element.attrib['type']
                    db_id = cit_element.attrib['id']
                    self.ParsedSeqRecord.dbxrefs.append(db_type + ':' + db_id)
                    if db_type == 'PubMed':
                        reference.pubmed_id = db_id
                    elif db_type == 'MEDLINE':
                        # The database name lives on the dbReference element
                        # itself, not on the enclosing citation (whose
                        # ``type`` is the publication type).
                        reference.medline_id = db_id
        elif ref_element.tag == NS + 'scope':
            scopes.append(ref_element.text)
        elif ref_element.tag == NS + 'source':
            for source_element in ref_element:
                if source_element.tag == NS + 'tissue':
                    tissues.append(source_element.text)
    if scopes:
        scopes_str = 'Scope: ' + ', '.join(scopes)
    else:
        scopes_str = ''
    if tissues:
        tissues_str = 'Tissue: ' + ', '.join(tissues)
    else:
        tissues_str = ''

    # locations cannot be parsed since they are actually written in
    # free text inside scopes so all the references are put in the
    # annotation.
    reference.location = []
    reference.authors = ', '.join(authors)
    if journal_name:
        if pub_date and j_volume and j_first and j_last:
            # Full citation only when every component is available.
            reference.journal = REFERENCE_JOURNAL % dict(name=journal_name,
                                                         volume=j_volume,
                                                         first=j_first,
                                                         last=j_last,
                                                         pub_date=pub_date)
        else:
            reference.journal = journal_name
    reference.comment = ' | '.join((pub_type, pub_date,
                                    scopes_str, tissues_str))
    append_to_annotations('references', reference)
def _parse_position(element, offset=0):
    """Turn a position-like element into a SeqFeature position object.

    The ``position`` attribute, when present, is converted with ``int()``
    and shifted by ``offset``; when absent the position is ``None``.  The
    ``status`` attribute (default ``''``) selects the result:

    * ``'unknown'``: UnknownPosition (no ``position`` attribute is expected)
    * empty: ExactPosition
    * ``'greater than'``: AfterPosition
    * ``'less than'``: BeforePosition
    * ``'uncertain'``: UncertainPosition

    Raises NotImplementedError for any other status and ValueError when the
    ``position`` attribute is not an integer.
    """
    attributes = element.attrib
    if 'position' in attributes:
        position = int(attributes['position']) + offset
    else:
        position = None

    status = attributes.get('status', '')
    if status == 'unknown':
        assert position is None
        return SeqFeature.UnknownPosition()
    if not status:
        return SeqFeature.ExactPosition(position)

    # Class names are resolved on SeqFeature only once a status matches.
    fuzzy_class_names = {
        'greater than': 'AfterPosition',
        'less than': 'BeforePosition',
        'uncertain': 'UncertainPosition',
    }
    if status in fuzzy_class_names:
        return getattr(SeqFeature, fuzzy_class_names[status])(position)
    raise NotImplementedError("Position status %r" % status)
def _parse_dbReference(element):
    """Record a dbReference element as a ``"type:id"`` cross reference.

    The string is appended to ``self.ParsedSeqRecord.dbxrefs``.  For PDB
    references the ``property`` children (method, resolution, chains) are
    additionally parsed into SeqFeature objects, one per chain group whose
    range is not ``'-'``.  Those features are currently discarded because
    the statement that would store them is commented out.

    ``self`` and ``NS`` are not defined here and must be provided by the
    surrounding scope.  Raises KeyError if ``type`` or ``id`` is missing.
    """
    db_type = element.attrib['type']
    self.ParsedSeqRecord.dbxrefs.append(db_type + ':' + element.attrib['id'])
    # e.g.
    # <dbReference type="PDB" key="11" id="2GEZ">
    #   <property value="X-ray" type="method"/>
    #   <property value="2.60 A" type="resolution"/>
    #   <property value="A/C/E/G=1-192, B/D/F/H=193-325" type="chains"/>
    # </dbReference>
    if db_type == 'PDB':
        method = ""
        resolution = ""
        for prop in element:
            if prop.tag != NS + 'property':
                continue
            prop_kind = prop.attrib['type']
            if prop_kind == 'method':
                method = prop.attrib['value']
            elif prop_kind == 'resolution':
                resolution = prop.attrib['value']
            elif prop_kind == 'chains':
                for chunk in prop.attrib['value'].split(','):
                    fields = chunk.strip().split('=')
                    if fields[1] == '-':
                        continue
                    # TODO - How best to store these, do SeqFeatures make
                    # sense?
                    feature = SeqFeature.SeqFeature()
                    feature.type = db_type
                    feature.qualifiers['name'] = element.attrib['id']
                    feature.qualifiers['method'] = method
                    feature.qualifiers['resolution'] = resolution
                    feature.qualifiers['chains'] = fields[0].split('/')
                    bounds = fields[1].split('-')
                    # The first bound is shifted by -1; the second is used
                    # as given.
                    start = int(bounds[0]) - 1
                    end = int(bounds[1])
                    feature.location = SeqFeature.FeatureLocation(start, end)
                    # self.ParsedSeqRecord.features.append(feature)

    for prop in element:
        if prop.tag == NS + 'property':
            # this data cannot be fitted in a seqrecord object with a simple
            # list. however at least ensembl and EMBL parsing can be
            # improved to add entries in dbxrefs
            pass
def SwissIterator(handle):
    """Iterate over a Swiss-Prot/UniProt flat file, yielding SeqRecord objects.

    Each section of the file, from its ID line to the terminating //, is
    turned into one SeqRecord carrying the associated annotation and
    features.

    This handles the flat file "swiss" format as used by:

    * Swiss-Prot aka SwissProt
    * TrEMBL
    * UniProtKB aka UniProt Knowledgebase

    The name "swiss" is used for consistency with BioPerl and EMBOSS.

    See also the SeqIO support for "uniprot-xml" format.

    Raises ValueError if a reference carries a cross-reference key other
    than PubMed, MEDLINE, DOI or AGRICOLA.
    """
    for entry in SwissProt.parse(handle):
        # Convert the SwissProt record to a SeqRecord
        sequence = Seq.Seq(entry.sequence, Alphabet.generic_protein)
        record = SeqRecord.SeqRecord(
            sequence,
            id=entry.accessions[0],
            name=entry.entry_name,
            description=entry.description,
            features=[_make_seqfeature(*ft) for ft in entry.features],
        )
        record.description = entry.description

        # Cross references become unique "database:accession" strings.
        for xref in entry.cross_references:
            if len(xref) >= 2:
                db_name, db_accession = xref[:2]
                label = "%s:%s" % (db_name, db_accession)
                if label not in record.dbxrefs:
                    record.dbxrefs.append(label)

        annotations = record.annotations
        annotations['accessions'] = entry.accessions

        # Only the first item of each date field is kept, and only when the
        # field is non-empty.
        dated_fields = (
            ('date', entry.created),
            ('date_last_sequence_update', entry.sequence_update),
            ('date_last_annotation_update', entry.annotation_update),
        )
        for annotation_key, stamp in dated_fields:
            if stamp:
                annotations[annotation_key] = stamp[0]

        if entry.gene_name:
            annotations['gene_name'] = entry.gene_name
        annotations['organism'] = entry.organism.rstrip(".")
        annotations['taxonomy'] = entry.organism_classification
        annotations['ncbi_taxid'] = entry.taxonomy_id
        if entry.host_organism:
            annotations['organism_host'] = entry.host_organism
        if entry.host_taxonomy_id:
            annotations['host_ncbi_taxid'] = entry.host_taxonomy_id
        if entry.comments:
            annotations['comment'] = "\n".join(entry.comments)

        if entry.references:
            converted = annotations['references'] = []
            for citation in entry.references:
                ref = SeqFeature.Reference()
                ref.comment = " ".join("%s=%s;" % pair
                                       for pair in citation.comments)
                for key, value in citation.references:
                    if key == 'PubMed':
                        ref.pubmed_id = value
                    elif key == 'MEDLINE':
                        ref.medline_id = value
                    elif key not in ('DOI', 'AGRICOLA'):
                        # DOI and AGRICOLA are recognised but not stored.
                        raise ValueError(
                            "Unknown key %s found in references" % key)
                ref.authors = citation.authors
                ref.title = citation.title
                ref.journal = citation.location
                converted.append(ref)

        if entry.keywords:
            annotations['keywords'] = entry.keywords
        yield record