def parse_annotation_from_column_9(col9):
    """
    Build a bioannotation.FunctionalAnnotation from a column-9 attribute string.

    The string is parsed with column_9_dict() and each attribute is handled
    by key:

      product_name   -> stored as the annotation's product_name
      gene_symbol    -> stored as the annotation's gene_symbol
      Dbxref         -> values starting with "EC" become ECAnnotation objects,
                        all others are passed to add_dbxref()
      Ontology_term  -> values starting with "GO" become GOAnnotation objects,
                        anything else is ignored
      ID, Parent     -> skipped
      anything else  -> copied into annotation.other_attributes

    Dbxref and Ontology_term may each hold either a single string or an
    iterable of strings.

    Returns the populated FunctionalAnnotation.
    """
    annotation = bioannotation.FunctionalAnnotation()
    attributes = column_9_dict(col9)

    # Attributes which may appear in column 9 but are not part of the
    # functional annotation itself.
    ignored = ['ID', 'Parent']

    for key in attributes:
        if key == 'product_name':
            annotation.product_name = attributes[key]
            continue

        if key == 'gene_symbol':
            annotation.gene_symbol = attributes[key]
            continue

        if key == 'Dbxref':
            raw_xrefs = attributes['Dbxref']
            # A lone string is treated as a one-element collection
            xrefs = [raw_xrefs] if isinstance(raw_xrefs, str) else raw_xrefs
            ec_numbers = []

            for xref in xrefs:
                if xref.startswith("EC"):
                    ec_numbers.append(xref)
                else:
                    annotation.add_dbxref(xref)

            # EC numbers are attached only after every dbxref has been seen
            for number in ec_numbers:
                annotation.add_ec_number(bioannotation.ECAnnotation(number=number))
            continue

        if key == 'Ontology_term':
            raw_terms = attributes['Ontology_term']
            terms = [raw_terms] if isinstance(raw_terms, str) else raw_terms
            go_ids = [term for term in terms if term.startswith("GO")]

            for go_id in go_ids:
                annotation.add_go_annotation(bioannotation.GOAnnotation(go_id=go_id))
            continue

        if key not in ignored:
            ## just save any other attributes provided
            annotation.other_attributes[key] = attributes[key]

    return annotation
def get_hmmdb_go_terms( acc, c ):
    """
    Look up the GO terms associated with an HMM accession.

    Executes a parameterized query on cursor `c` joining hmm_go to hmm and
    filtering on hmm.accession = acc.

    Returns a list of bioannotation:GOAnnotation objects, one per row, each
    created with ev_code 'ISM' and with_from set to `acc`.
    """
    qry = "SELECT hg.go_id FROM hmm_go hg JOIN hmm ON hg.hmm_id=hmm.id WHERE hmm.accession = ?"
    c.execute(qry, (acc,))

    return [
        bioannotation.GOAnnotation(go_id=row[0], ev_code='ISM', with_from=acc)
        for row in c
    ]
def get_uniref_go_terms( acc, c ):
    """
    Look up the GO terms stored for a uniref_go ID.

    Executes a parameterized query on cursor `c` against the uniref_go table,
    filtering on id = acc.

    Returns a list of bioannotation:GOAnnotation objects, one per row, each
    created with ev_code 'ISA' and with_from set to `acc`.
    """
    qry = """ SELECT us_go.go_id FROM uniref_go us_go WHERE us_go.id = ? """
    c.execute(qry, (acc,))

    return [
        bioannotation.GOAnnotation(go_id=row[0], ev_code='ISA', with_from=acc)
        for row in c
    ]
def get_uspdb_go_terms( acc, c ):
    """
    Look up the GO terms associated with a uniprot_sprot accession.

    Executes a parameterized query on cursor `c` joining uniprot_sprot_go to
    uniprot_sprot_acc and filtering on the accession column.

    Returns a list of bioannotation:GOAnnotation objects, one per row, each
    created with ev_code 'ISA' and with_from set to `acc`.
    """
    qry = """ SELECT us_go.go_id FROM uniprot_sprot_go us_go JOIN uniprot_sprot_acc us_acc ON us_go.id=us_acc.id WHERE us_acc.accession = ? """
    c.execute(qry, (acc,))

    return [
        bioannotation.GOAnnotation(go_id=row[0], ev_code='ISA', with_from=acc)
        for row in c
    ]
def parse_annotation_from_column_9(col9):
    """
    Build a bioannotation.FunctionalAnnotation from a column-9 attribute string.

    The string is parsed with column_9_dict() and three attributes are used:

      product_name   -> stored as the annotation's product_name
      Dbxref         -> values starting with "EC" become ECAnnotation objects;
                        other values are ignored
      Ontology_term  -> values starting with "GO" become GOAnnotation objects;
                        other values are ignored

    Dbxref and Ontology_term may each hold either a single string or an
    iterable of strings.  All other attributes are ignored.

    Returns the populated FunctionalAnnotation.
    """
    annotation = bioannotation.FunctionalAnnotation()
    attributes = column_9_dict(col9)

    if 'product_name' in attributes:
        annotation.product_name = attributes['product_name']

    if 'Dbxref' in attributes:
        raw_xrefs = attributes['Dbxref']
        # A lone string is treated as a one-element collection
        xrefs = [raw_xrefs] if isinstance(raw_xrefs, str) else raw_xrefs
        ec_numbers = [xref for xref in xrefs if xref.startswith("EC")]

        for number in ec_numbers:
            annotation.add_ec_number(bioannotation.ECAnnotation(number=number))

    if 'Ontology_term' in attributes:
        raw_terms = attributes['Ontology_term']
        terms = [raw_terms] if isinstance(raw_terms, str) else raw_terms
        go_ids = [term for term in terms if term.startswith("GO")]

        for go_id in go_ids:
            annotation.add_go_annotation(bioannotation.GOAnnotation(go_id=go_id))

    return annotation
def _apply_tmhmm_result(log_fh, polypeptides, qry_id, helix_count, min_spans, product_name):
    """
    Apply the TMHMM result for a single query to its polypeptide annotation.

    Does nothing unless a query ID has been seen and helix_count reaches
    min_spans.  Otherwise the product name is replaced with product_name if
    it is still DEFAULT_PRODUCT_NAME, the decision is written to log_fh, and
    the membrane GO annotation is added regardless of the product name.

    Raises KeyError if qry_id is not a key of polypeptides.
    """
    if qry_id is None or helix_count < min_spans:
        return

    annot = polypeptides[qry_id].annotation

    if annot.product_name == DEFAULT_PRODUCT_NAME:
        annot.product_name = product_name
        log_fh.write("INFO: {0}: Updated product name to '{1}' because it had {2} TMHelix domains predicted by TMHMM\n".format(qry_id, annot.product_name, helix_count))
    else:
        log_fh.write("INFO: {0}: TMHMM predicted {1} TMHelix domains but gene product name unchanged because of previous assignment\n".format(qry_id, helix_count))

    ## we add the GO terms no matter what
    annot.add_go_annotation( bioannotation.GOAnnotation(go_id='0016021') )


def parse_tmhmm_evidence( log_fh, polypeptides, htab_list ):
    '''
    Reads a list of raw TMHMM evidence and a dict of polypeptides, adding
    annotation attributes where possible.

    Arguments:
      log_fh        writable handle; one INFO line is written per query that
                    reaches the helix threshold
      polypeptides  mapping of query ID -> object with an .annotation attribute
      htab_list     path to a list file (read with biocodeutils.read_list_file)
                    naming the TMHMM output files to process

    Every query in every file is evaluated, including the last one in each
    file (which has no following header line to trigger it).

    Notes from the esteemed M Giglio:
    The GO term to use would be GO:0016021 "integral component of membrane"
    Or if you want to be more conservative you could go with GO:0016020 "membrane"

    Depends on the evidence. For the prok pipe we are pretty conservative, we
    require five TMHMM domains and then we call it putative integral membrane
    protein.

    On ECO - in fact Marcus and I are the developers of ECO.  It is an ontology
    of evidence types.  An annotation to an ECO term is used in conjunction with
    another annotation, like a GO term (but many other types of annotation can,
    and are, used with ECO). It provides additional information about the
    annotation. In fact for GO, the assignment of an evidence term along with a
    GO term is a required part of a GO annotation. (ECO terms are the "evidence
    codes" in GO.)

    INPUT: Expected TMHMM input (all HTML lines are skipped)
    # CHARM010_V2.mRNA.887 Length: 904
    # CHARM010_V2.mRNA.887 Number of predicted TMHs:  6
    # CHARM010_V2.mRNA.887 Exp number of AAs in TMHs: 133.07638
    # CHARM010_V2.mRNA.887 Exp number, first 60 AAs:  21.83212
    # CHARM010_V2.mRNA.887 Total prob of N-in:        0.99994
    # CHARM010_V2.mRNA.887 POSSIBLE N-term signal sequence
    CHARM010_V2.mRNA.887	TMHMM2.0	inside	     1    11
    CHARM010_V2.mRNA.887	TMHMM2.0	TMhelix	    12    34
    CHARM010_V2.mRNA.887	TMHMM2.0	outside	    35   712
    CHARM010_V2.mRNA.887	TMHMM2.0	TMhelix	   713   735
    CHARM010_V2.mRNA.887	TMHMM2.0	inside	   736   755
    CHARM010_V2.mRNA.887	TMHMM2.0	TMhelix	   756   773
    CHARM010_V2.mRNA.887	TMHMM2.0	outside	   774   782
    CHARM010_V2.mRNA.887	TMHMM2.0	TMhelix	   783   805
    CHARM010_V2.mRNA.887	TMHMM2.0	inside	   806   809
    CHARM010_V2.mRNA.887	TMHMM2.0	TMhelix	   810   832
    CHARM010_V2.mRNA.887	TMHMM2.0	outside	   833   871
    CHARM010_V2.mRNA.887	TMHMM2.0	TMhelix	   872   894
    CHARM010_V2.mRNA.887	TMHMM2.0	inside	   895   904
    '''
    # The number of helices spanning the membrane required before counted as
    # a membrane protein
    MIN_HELICAL_SPANS = 3

    # For successful matches, this is the product name which gets applied
    GENE_PRODUCT_NAME = 'Putative integral membrane protein'

    # Header line which opens the record for a new query sequence
    header_pattern = re.compile(r"# (.+?)\s+Length: \d+")

    for tmhmm_path in biocodeutils.read_list_file(htab_list):
        last_qry_id = None
        current_helix_count = 0

        with open(tmhmm_path) as tmhmm_fh:
            for line in tmhmm_fh:
                # HTML lines are skipped
                if line.startswith('<'):
                    continue

                m = header_pattern.match(line)

                if m:
                    # A new header closes the previous query's record
                    _apply_tmhmm_result(log_fh, polypeptides, last_qry_id,
                                        current_helix_count, MIN_HELICAL_SPANS,
                                        GENE_PRODUCT_NAME)

                    # reset
                    last_qry_id = m.group(1)
                    current_helix_count = 0
                    continue

                cols = line.split()

                if len(cols) == 5 and cols[2] == 'TMhelix':
                    current_helix_count += 1

        # The final query in a file has no following header to trigger it,
        # so it has to be handled explicitly here.
        _apply_tmhmm_result(log_fh, polypeptides, last_qry_id,
                            current_helix_count, MIN_HELICAL_SPANS,
                            GENE_PRODUCT_NAME)