def get_hmmdb_go_terms( acc, c ):
    """
    Look up the GO terms linked to an HMM accession.

    Runs a parameterized query on cursor `c` joining hmm_go to hmm on the
    HMM id, filtered by accession `acc`, and returns a list of
    annotation.GOAnnotation objects (one per returned row) carrying the
    evidence code 'ISM' and with_from set to `acc`.  The list is empty when
    the query yields no rows.
    """
    qry = "SELECT hg.go_id FROM hmm_go hg JOIN hmm ON hg.hmm_id=hmm.id WHERE hmm.accession = ?"
    c.execute(qry, (acc,))

    # The cursor is iterated directly; column 0 of each row is the GO id
    return [
        annotation.GOAnnotation(go_id=record[0], ev_code='ISM', with_from=acc)
        for record in c
    ]
def get_uniref_go_terms( acc, c ):
    """
    Look up the GO terms stored for a UniRef identifier.

    Runs a parameterized query on cursor `c` against the uniref_go table,
    matching its id column to `acc`, and returns a list of
    annotation.GOAnnotation objects (one per returned row) carrying the
    evidence code 'ISA' and with_from set to `acc`.  The list is empty when
    the query yields no rows.
    """
    qry = """ SELECT us_go.go_id FROM uniref_go us_go WHERE us_go.id = ? """
    c.execute(qry, (acc,))

    # The cursor is iterated directly; column 0 of each row is the GO id
    return [
        annotation.GOAnnotation(go_id=record[0], ev_code='ISA', with_from=acc)
        for record in c
    ]
def get_uspdb_go_terms( acc, c ):
    """
    Look up the GO terms linked to a UniProt/SwissProt accession.

    Runs a parameterized query on cursor `c` joining uniprot_sprot_go to
    uniprot_sprot_acc on their id columns, filtered by accession `acc`, and
    returns a list of annotation.GOAnnotation objects (one per returned row)
    carrying the evidence code 'ISA' and with_from set to `acc`.  The list
    is empty when the query yields no rows.
    """
    qry = """ SELECT us_go.go_id FROM uniprot_sprot_go us_go JOIN uniprot_sprot_acc us_acc ON us_go.id=us_acc.id WHERE us_acc.accession = ? """
    c.execute(qry, (acc,))

    # The cursor is iterated directly; column 0 of each row is the GO id
    return [
        annotation.GOAnnotation(go_id=record[0], ev_code='ISA', with_from=acc)
        for record in c
    ]
def get_go_annotations(feat):
    """
    Collect GO annotations from the db_xref qualifiers of a feature.

    Every entry of feat.qualifiers['db_xref'] that starts with 'GO:' yields
    one annotation.GOAnnotation whose go_id is the text after that prefix.
    An empty list is returned when the feature has no db_xref qualifier or
    none of its entries is a GO reference.

    Example of the kind of section this reads:

         CDS             join(3366667..3366969,3463389..3463463)
                         /gene="ENSMUSG00000040653.6"
                         /protein_id="ENSMUSP00000149688.1"
                         /db_xref="EMBL:AC162384"
                         /db_xref="EMBL:CH466562"
                         /db_xref="GO:0005737"
                         /db_xref="GO:0042325"
    """
    if 'db_xref' not in feat.qualifiers:
        return []

    # Lazily match each cross-reference; non-GO entries produce None
    hits = (re.match('GO:(.+)', xref) for xref in feat.qualifiers['db_xref'])

    return [annotation.GOAnnotation(go_id=hit.group(1)) for hit in hits if hit]
def parse_tmhmm_evidence(log_fh, polypeptides, htab_list):
    '''
    Reads a list of raw TMHMM evidence and a dict of polypeptides, adding annotation
    attributes where possible.

    Arguments:
        log_fh:       object with a write() method; one INFO line is written for
                      every polypeptide that reaches the helix threshold.
        polypeptides: mapping indexed by the query ID found in the TMHMM output.
                      Each value must expose an `annotation` attribute offering
                      `product_name` and `add_go_annotation()`.
        htab_list:    handed to utils.read_list_file(); every item it yields is
                      opened as a TMHMM output file.

    Returns nothing; the polypeptide annotations are modified in place.  A query
    ID that reaches the threshold but is missing from `polypeptides` raises
    KeyError.

    Notes from the esteemed M Giglio:
    The GO term to use would be GO:0016021 "integral component of membrane"
    Or if you want to be more conservative you could go with GO:0016020 "membrane"

    Depends on the evidence. For the prok pipe we are pretty conservative, we require five TMHMM
    domains and then we call it putative integral membrane protein.

    On ECO - in fact Marcus and I are the developers of ECO.  It is an ontology of evidence types.
    An annotation to an ECO term is used in conjunction with another annotation, like a GO term
    (but many other types of annotation can, and are, used with ECO). It provides additional
    information about the annotation. In fact for GO, the assignment of an evidence term along
    with a GO term is a required part of a GO annotation. (ECO terms are the "evidence codes" in GO.)

    INPUT: Expected TMHMM input (all HTML lines are skipped)
    # CHARM010_V2.mRNA.887 Length: 904
    # CHARM010_V2.mRNA.887 Number of predicted TMHs:  6
    # CHARM010_V2.mRNA.887 Exp number of AAs in TMHs: 133.07638
    # CHARM010_V2.mRNA.887 Exp number, first 60 AAs:  21.83212
    # CHARM010_V2.mRNA.887 Total prob of N-in:        0.99994
    # CHARM010_V2.mRNA.887 POSSIBLE N-term signal sequence
    CHARM010_V2.mRNA.887	TMHMM2.0	inside	     1    11
    CHARM010_V2.mRNA.887	TMHMM2.0	TMhelix	    12    34
    CHARM010_V2.mRNA.887	TMHMM2.0	outside	    35   712
    CHARM010_V2.mRNA.887	TMHMM2.0	TMhelix	   713   735
    CHARM010_V2.mRNA.887	TMHMM2.0	inside	   736   755
    CHARM010_V2.mRNA.887	TMHMM2.0	TMhelix	   756   773
    CHARM010_V2.mRNA.887	TMHMM2.0	outside	   774   782
    CHARM010_V2.mRNA.887	TMHMM2.0	TMhelix	   783   805
    CHARM010_V2.mRNA.887	TMHMM2.0	inside	   806   809
    CHARM010_V2.mRNA.887	TMHMM2.0	TMhelix	   810   832
    CHARM010_V2.mRNA.887	TMHMM2.0	outside	   833   871
    CHARM010_V2.mRNA.887	TMHMM2.0	TMhelix	   872   894
    CHARM010_V2.mRNA.887	TMHMM2.0	inside	   895   904
    '''
    # The number of helices spanning the membrane required before counted as a membrane protein
    MIN_HELICAL_SPANS = 3

    # For successful matches, this is the product name which gets applied
    GENE_PRODUCT_NAME = 'Putative integral membrane protein'

    # Header line which opens the result section of one query sequence
    header_pattern = re.compile(r"# (.+?)\s+Length: \d+")

    def apply_result(qry_id, helix_count):
        # Annotate one finished query if it has enough predicted helices
        if qry_id is None or helix_count < MIN_HELICAL_SPANS:
            # Nothing collected yet (or helix rows seen before any header),
            # or too few helices to call it a membrane protein
            return

        annot = polypeptides[qry_id].annotation

        if annot.product_name == DEFAULT_PRODUCT_NAME:
            annot.product_name = GENE_PRODUCT_NAME
            log_fh.write(
                "INFO: {0}: Updated product name to '{1}' because it had {2} TMHelix domains predicted by TMHMM\n"
                .format(qry_id, annot.product_name, helix_count))
        else:
            log_fh.write(
                "INFO: {0}: TMHMM predicted {1} TMHelix domains but gene product name unchanged because of previous assignment\n"
                .format(qry_id, helix_count))

        ## we add the GO terms no matter what
        annot.add_go_annotation(annotation.GOAnnotation(go_id='0016021'))

    for tmhmm_path in utils.read_list_file(htab_list):
        last_qry_id = None
        current_helix_count = 0

        # The context manager closes the file even if a lookup below raises
        with open(tmhmm_path) as tmhmm_fh:
            for line in tmhmm_fh:
                # HTML wrapper lines carry no evidence
                if line.startswith('<'):
                    continue

                m = header_pattern.match(line)

                if m:
                    # A new header closes the previous query: purge its result
                    apply_result(last_qry_id, current_helix_count)

                    # reset
                    last_qry_id = m.group(1)
                    current_helix_count = 0
                    continue

                cols = line.split()
                if len(cols) == 5 and cols[2] == 'TMhelix':
                    current_helix_count += 1

        # No header follows the last query of a file, so it has to be purged
        # here or it would never be evaluated.
        apply_result(last_qry_id, current_helix_count)

        # Zero the counter so this record cannot be applied a second time
        current_helix_count = 0