def parse_hmm_evidence(log_fh, polypeptides, htab_list, cursor):
    '''
    Reads a list file of HMM evidence and dict of polypeptides, populating each with
    Annotation evidence where appropriate.  Each file in the list can have results
    for multiple queries, but it's assumed that ALL candidate matches for any given
    query are grouped together.

    Currently only the top hit for any given query polypeptide is used.

    Arguments:
      log_fh       - writable handle; receives one INFO line per updated query
      polypeptides - dict keyed on query ID whose values expose an .annotation attribute
      htab_list    - path to a list file, expanded with utils.read_list_file()
      cursor       - database cursor passed through to the get_hmmdb_* lookups

    Columns used (0-based, tab-delimited): 0 = HMM accession, 5 = query ID,
    12 = total score, 15 = product name, 17 = total trusted cutoff.

    Raises ValueError if the score or cutoff column of a row is not numeric.
    '''
    # raw string so the regex escapes are not interpreted as Python string escapes
    pfam_re = re.compile(r"^(PF\d+)\.\d+")

    for path in utils.read_list_file(htab_list):
        last_qry_id = None

        # the context manager closes each evidence file as soon as it has been read
        with open(path) as htab_fh:
            for line in htab_fh:
                cols = line.rstrip().split("\t")

                ## only consider the row if the total score reaches the total trusted
                #  cutoff.  The columns are text, so they must be compared as numbers
                #  (a string comparison would rank '9.0' above '10.0').
                if float(cols[12]) < float(cols[17]):
                    continue

                this_qry_id = cols[5]

                ## the HMM hits are sorted already with the top hit for each query first
                if last_qry_id == this_qry_id:
                    continue

                accession = cols[0]

                # PFAM accessions look like PF00001.12; the lookups below use the
                # accession with the version suffix removed
                m = pfam_re.match(accession)
                if m:
                    accession = m.group(1)

                ## save it
                annot = polypeptides[this_qry_id].annotation
                annot.product_name = cols[15]
                log_fh.write(
                    "INFO: {0}: Updated product name to '{1}' based on HMM hit to accession '{2}'\n"
                    .format(this_qry_id, annot.product_name, accession))

                # does our hmm database provide GO terms for this accession?
                for go_annot in get_hmmdb_go_terms(accession, cursor):
                    annot.add_go_annotation(go_annot)

                # do we have a gene symbol for this accession?
                annot.gene_symbol = get_hmmdb_gene_symbol(accession, cursor)

                # do we have an EC number?
                for ec_annot in get_hmmdb_ec_nums(accession, cursor):
                    annot.add_ec_number(ec_annot)

                ## remember the ID we just saw
                last_qry_id = this_qry_id
def parse_hmm_evidence( log_fh, polypeptides, htab_list, cursor ):
    '''
    Reads a list file of HMM evidence and dict of polypeptides, populating each with
    Annotation evidence where appropriate.  Each file in the list can have results
    for multiple queries, but it's assumed that ALL candidate matches for any given
    query are grouped together.

    Currently only the top hit for any given query polypeptide is used.

    Arguments:
      log_fh       - writable handle; receives one INFO line per updated query
      polypeptides - dict keyed on query ID whose values expose an .annotation attribute
      htab_list    - path to a list file, expanded with utils.read_list_file()
      cursor       - database cursor passed through to the get_hmmdb_* lookups

    Columns used (0-based, tab-delimited): 0 = HMM accession, 5 = query ID,
    12 = total score, 15 = product name, 17 = total trusted cutoff.

    Raises ValueError if the score or cutoff column of a row is not numeric.
    '''
    # raw string so the regex escapes are not interpreted as Python string escapes
    pfam_re = re.compile(r"^(PF\d+)\.\d+")

    for path in utils.read_list_file(htab_list):
        last_qry_id = None

        # the context manager closes each evidence file as soon as it has been read
        with open(path) as htab_fh:
            for line in htab_fh:
                cols = line.rstrip().split("\t")

                ## only consider the row if the total score reaches the total trusted
                #  cutoff.  The columns are text, so they must be compared as numbers
                #  (a string comparison would rank '9.0' above '10.0').
                if float(cols[12]) < float(cols[17]):
                    continue

                this_qry_id = cols[5]

                ## the HMM hits are sorted already with the top hit for each query first
                if last_qry_id == this_qry_id:
                    continue

                accession = cols[0]

                # PFAM accessions look like PF00001.12; the lookups below use the
                # accession with the version suffix removed
                m = pfam_re.match(accession)
                if m:
                    accession = m.group(1)

                ## save it
                annot = polypeptides[this_qry_id].annotation
                annot.product_name = cols[15]
                log_fh.write("INFO: {0}: Updated product name to '{1}' based on HMM hit to accession '{2}'\n".format(this_qry_id, annot.product_name, accession))

                # does our hmm database provide GO terms for this accession?
                for go_annot in get_hmmdb_go_terms( accession, cursor ):
                    annot.add_go_annotation(go_annot)

                # do we have a gene symbol for this accession?
                annot.gene_symbol = get_hmmdb_gene_symbol( accession, cursor )

                # do we have an EC number?
                for ec_annot in get_hmmdb_ec_nums( accession, cursor ):
                    annot.add_ec_number(ec_annot)

                ## remember the ID we just saw
                last_qry_id = this_qry_id
def parse_trembl_blast_evidence(polypeptides, blast_list, eval_cutoff):
    '''
    Reads a list file of NCBI BLAST evidence against TrEMBL and a dict of polypeptides,
    populating each with Annotation evidence where appropriate.  Only attaches evidence if
    the product name is the default.

    Currently only considers the top BLAST hit for each query which doesn't have
    'uncharacterized' in the product name.

    Arguments:
      polypeptides - dict keyed on query ID whose values expose an .annotation attribute
      blast_list   - path to a list file, expanded with utils.read_list_file()
      eval_cutoff  - rows whose E-value (column 19) is greater than this are skipped

    Columns used (0-based, tab-delimited): 0 = query ID, 15 = product name,
    19 = E-value.
    '''
    for path in utils.read_list_file(blast_list):
        last_qry_id = None

        # the context manager closes each evidence file as soon as it has been read
        with open(path) as blast_fh:
            for line in blast_fh:
                cols = line.rstrip().split("\t")
                product_field = cols[15]

                # We're going to ignore any lines which have 'uncharacterized' in the name
                # (first character left off so either capitalization matches)
                if 'ncharacterized' in product_field:
                    continue

                # skip this line if it doesn't meet the cutoff
                if float(cols[19]) > eval_cutoff:
                    continue

                this_qry_id = cols[0]

                # the BLAST hits are sorted already with the top hit for each query first
                if last_qry_id == this_qry_id:
                    continue

                annot = polypeptides[this_qry_id].annotation

                # save it, unless the gene product name has already changed from the default
                if annot.product_name == DEFAULT_PRODUCT_NAME:
                    # current hack until DB is updated:
                    # some products look like this:
                    #    Coatomer subunit gamma-2 OS=Bos taurus GN=COPG2 PE=2 SV=1
                    # take off everything after the OS=
                    m = re.search("(.+) OS=", product_field)
                    annot.product_name = m.group(1) if m else product_field

                # remember the ID we just saw
                last_qry_id = this_qry_id
def parse_trembl_blast_evidence(polypeptides, blast_list, eval_cutoff):
    '''
    Reads a list file of NCBI BLAST evidence against TrEMBL and a dict of polypeptides,
    populating each with Annotation evidence where appropriate.  Only attaches evidence if
    the product name is the default.

    Currently only considers the top BLAST hit for each query which doesn't have
    'uncharacterized' in the product name.

    Arguments:
      polypeptides - dict keyed on query ID whose values expose an .annotation attribute
      blast_list   - path to a list file, expanded with utils.read_list_file()
      eval_cutoff  - rows whose E-value (column 19) is greater than this are skipped

    Columns used (0-based, tab-delimited): 0 = query ID, 15 = product name,
    19 = E-value.
    '''
    for path in utils.read_list_file(blast_list):
        last_qry_id = None

        # the context manager closes each evidence file as soon as it has been read
        with open(path) as blast_fh:
            for line in blast_fh:
                cols = line.rstrip().split("\t")
                product_field = cols[15]

                # We're going to ignore any lines which have 'uncharacterized' in the name
                # (first character left off so either capitalization matches)
                if 'ncharacterized' in product_field:
                    continue

                # skip this line if it doesn't meet the cutoff
                if float(cols[19]) > eval_cutoff:
                    continue

                this_qry_id = cols[0]

                # the BLAST hits are sorted already with the top hit for each query first
                if last_qry_id == this_qry_id:
                    continue

                annot = polypeptides[this_qry_id].annotation

                # save it, unless the gene product name has already changed from the default
                if annot.product_name == DEFAULT_PRODUCT_NAME:
                    # current hack until DB is updated:
                    # some products look like this:
                    #    Coatomer subunit gamma-2 OS=Bos taurus GN=COPG2 PE=2 SV=1
                    # take off everything after the OS=
                    m = re.search("(.+) OS=", product_field)
                    annot.product_name = m.group(1) if m else product_field

                # remember the ID we just saw
                last_qry_id = this_qry_id
def parse_uniref100_blast_evidence(log_fh, polypeptides, blast_list, cursor,
                                   eval_cutoff, algorithm,
                                   uniref100_fasta_path):
    '''
    Reads a list file of NCBI BLAST evidence and a dict of polypeptides, populating
    each with Annotation evidence where appropriate.  Only attaches evidence if
    the product name is the default.

    Currently only considers the top BLAST hit for each query.

    Arguments:
      log_fh               - writable handle; receives an INFO line when a product name
                             is updated from a 'blast' hit
      polypeptides         - dict keyed on query ID whose values expose an .annotation attribute
      blast_list           - path to a list file, expanded with utils.read_list_file()
      cursor               - database cursor passed through to the get_uniref_* lookups
      eval_cutoff          - rows with an E-value greater than this are skipped
      algorithm            - 'blast' or 'rapsearch2'; selects which columns are read
      uniref100_fasta_path - UniRef100 FASTA file; only read when algorithm is 'rapsearch2'

    Raises Exception if algorithm is not recognized, or (for 'blast') if the product
    column does not have the expected 'n=' / 'RepID=' layout.
    '''
    if algorithm not in ['blast', 'rapsearch2']:
        raise Exception(
            "algorithm argument must be either blast or rapsearch2")

    # Hits whose product contains any of these are ignored (blast only).
    # First character left off for initcap reasons
    skip_products = ('ncharacterized', 'ypothetical', 'enomic scaffold')

    ## need to load the UniRef100 to TREMBL accession lookup from the FASTA
    # like UniRef100_K1T359 -> K1T359_9ZZZZ
    uniref2acc = dict()
    print("INFO: parsing UniRef100 FASTA headers for annotation")
    if algorithm == 'rapsearch2':
        # raw string; same pattern as before without the unneeded escapes
        header_re = re.compile(r">(\S+) (.+) n=.+RepID=(\S+)")

        with open(uniref100_fasta_path) as fasta_fh:
            for line in fasta_fh:
                if line[0] == '>':
                    m = header_re.match(line)
                    if m:
                        uniref2acc[m.group(1)] = {
                            'acc': m.group(3),
                            'prod': m.group(2)
                        }

    for path in utils.read_list_file(blast_list):
        last_qry_id = None

        # the context manager closes each evidence file as soon as it has been read
        with open(path) as blast_fh:
            for line in blast_fh:
                # 0 indexing is faster than startswith()
                if line[0] == '#':
                    continue

                line = line.rstrip()
                cols = line.split("\t")
                this_qry_id = cols[0]

                if algorithm == 'blast':
                    # We're going to ignore any lines which have a few keywords in the name
                    if any(keyword in cols[15] for keyword in skip_products):
                        continue

                    e_value = float(cols[19])
                else:
                    ## rapsearch2 can actually report values outside of python's double range.  Handle these
                    # NOTE(review): math.pow only overflows for large positive exponents
                    #  (i.e. very large E-values), yet 0 passes any cutoff - confirm intended
                    try:
                        e_value = math.pow(10, float(cols[10]))
                    except OverflowError:
                        print(
                            "WARN: couldn't handle E-value math on the following line (setting to 0):\n{0}"
                            .format(line))
                        e_value = 0

                # skip this line if it doesn't meet the cutoff
                if e_value > eval_cutoff:
                    continue

                # the BLAST hits are sorted already with the top hit for each query first
                if last_qry_id == this_qry_id:
                    continue

                annot = polypeptides[this_qry_id].annotation

                # get the accession then process for known accession types
                if algorithm == 'blast':
                    # UniRef100_K1T359 -> K1T359_9ZZZZ
                    m = re.search(r"RepID=(\S+)", cols[15])
                    if not m:
                        raise Exception(
                            "ERROR: Unexpected product format in UniRef BLAST results: {0}"
                            .format(cols[15]))
                    accession = m.group(1)
                else:
                    accession = uniref2acc[cols[1]]['acc']

                assertions = get_uniref_annot(accession, cursor)

                # save it, unless the gene product name has already changed from the default
                if annot.product_name == DEFAULT_PRODUCT_NAME:
                    if algorithm == 'blast':
                        # these hits look like this:
                        #  AD-specific glutamate dehydrogenase n=1 Tax=Ceriporiopsis subvermispora (strain B) RepID=M2RLB9_CERS8
                        m = re.match(r"(.+) n=.+", cols[15])
                        if not m:
                            raise Exception(
                                "ERROR: Unexpected product format in UniRef BLAST results: {0}"
                                .format(cols[15]))
                        annot.product_name = m.group(1)

                        log_fh.write(
                            "INFO: {0}: Updated product name to '{1}' based on BLAST hit to UniRef100 accession '{2}'\n"
                            .format(this_qry_id, annot.product_name,
                                    accession))
                    else:
                        annot.product_name = uniref2acc[cols[1]]['prod']

                # if no EC numbers have been set, they can inherit from this
                if len(annot.ec_numbers) == 0:
                    for ec_annot in get_uniref_ec_nums(accession, cursor):
                        annot.add_ec_number(ec_annot)

                # if no GO IDs have been set, they can inherit from this
                if len(annot.go_annotations) == 0:
                    for go_annot in get_uniref_go_terms(accession, cursor):
                        annot.add_go_annotation(go_annot)

                # if no gene symbol has been set, it can inherit from this
                if annot.gene_symbol is None:
                    annot.gene_symbol = assertions['symbol']

                # remember the ID we just saw
                last_qry_id = this_qry_id
def parse_tmhmm_evidence(log_fh, polypeptides, htab_list):
    '''
    Reads a list of raw TMHMM evidence and a dict of polypeptides, adding annotation
    attributes where possible.

    A query with at least MIN_HELICAL_SPANS (3) TMhelix rows always receives
    GO:0016021; its product name is only replaced if it is still the default.

    Arguments:
      log_fh       - writable handle; receives one INFO line per qualifying query
      polypeptides - dict keyed on query ID whose values expose an .annotation attribute
      htab_list    - path to a list file, expanded with utils.read_list_file()

    Notes from the esteemed M Giglio:
    The GO term to use would be GO:0016021 "integral component of membrane"
    Or if you want to be more conservative you could go with GO:0016020 "membrane"

    Depends on the evidence. For the prok pipe we are pretty conservative, we require five TMHMM
    domains and then we call it putative integral membrane protein.

    On ECO - in fact Marcus and I are the developers of ECO.  It is an ontology of evidence types.
    An annotation to an ECO term is used in conjunction with another annotation, like a GO term
    (but many other types of annotation can, and are, used with ECO). It provides additional
    information about the annotation. In fact for GO, the assignment of an evidence term along
    with a GO term is a required part of a GO annotation. (ECO terms are the "evidence codes" in GO.)

    INPUT: Expected TMHMM input (all HTML lines are skipped)
    # CHARM010_V2.mRNA.887 Length: 904
    # CHARM010_V2.mRNA.887 Number of predicted TMHs:  6
    # CHARM010_V2.mRNA.887 Exp number of AAs in TMHs: 133.07638
    # CHARM010_V2.mRNA.887 Exp number, first 60 AAs:  21.83212
    # CHARM010_V2.mRNA.887 Total prob of N-in:        0.99994
    # CHARM010_V2.mRNA.887 POSSIBLE N-term signal sequence
    CHARM010_V2.mRNA.887	TMHMM2.0	inside	     1    11
    CHARM010_V2.mRNA.887	TMHMM2.0	TMhelix	    12    34
    CHARM010_V2.mRNA.887	TMHMM2.0	outside	    35   712
    CHARM010_V2.mRNA.887	TMHMM2.0	TMhelix	   713   735
    CHARM010_V2.mRNA.887	TMHMM2.0	inside	   736   755
    CHARM010_V2.mRNA.887	TMHMM2.0	TMhelix	   756   773
    CHARM010_V2.mRNA.887	TMHMM2.0	outside	   774   782
    CHARM010_V2.mRNA.887	TMHMM2.0	TMhelix	   783   805
    CHARM010_V2.mRNA.887	TMHMM2.0	inside	   806   809
    CHARM010_V2.mRNA.887	TMHMM2.0	TMhelix	   810   832
    CHARM010_V2.mRNA.887	TMHMM2.0	outside	   833   871
    CHARM010_V2.mRNA.887	TMHMM2.0	TMhelix	   872   894
    CHARM010_V2.mRNA.887	TMHMM2.0	inside	   895   904
    '''
    # The number of helices spanning the membrane required before counted as a membrane protein
    MIN_HELICAL_SPANS = 3

    # For successful matches, this is the product name which gets applied
    GENE_PRODUCT_NAME = 'Putative integral membrane protein'

    # The '# <id> Length: N' row opens the record of each query
    header_re = re.compile(r"# (.+?)\s+Length: \d+")

    def record_result(qry_id, helix_count):
        # Applies the finished helix tally of one query to its annotation
        if qry_id is None or helix_count < MIN_HELICAL_SPANS:
            return

        annot = polypeptides[qry_id].annotation

        if annot.product_name == DEFAULT_PRODUCT_NAME:
            annot.product_name = GENE_PRODUCT_NAME
            log_fh.write(
                "INFO: {0}: Updated product name to '{1}' because it had {2} TMHelix domains predicted by TMHMM\n"
                .format(qry_id, annot.product_name, helix_count))
        else:
            log_fh.write(
                "INFO: {0}: TMHMM predicted {1} TMHelix domains but gene product name unchanged because of previous assignment\n"
                .format(qry_id, helix_count))

        ## we add the GO terms no matter what
        annot.add_go_annotation(annotation.GOAnnotation(go_id='0016021'))

    for path in utils.read_list_file(htab_list):
        last_qry_id = None
        current_helix_count = 0

        # the context manager closes each evidence file as soon as it has been read
        with open(path) as tmhmm_fh:
            for line in tmhmm_fh:
                if line.startswith('<'):
                    continue

                m = header_re.match(line)

                if m:
                    # a new record starts, so the previous one is complete
                    record_result(last_qry_id, current_helix_count)

                    # reset
                    last_qry_id = m.group(1)
                    current_helix_count = 0
                    continue

                cols = line.split()
                if len(cols) == 5 and cols[2] == 'TMhelix':
                    current_helix_count += 1

        # the last record of a file has no following header to trigger it, so it
        # has to be recorded here or it would be silently dropped
        record_result(last_qry_id, current_helix_count)
def parse_sprot_blast_evidence(log_fh, polypeptides, blast_org, blast_list,
                               cursor, eval_cutoff, algorithm):
    '''
    Reads a list file of NCBI BLAST evidence and a dict of polypeptides, populating
    each with Annotation evidence where appropriate.  Only attaches evidence if
    the product name is the default.

    Currently only considers the top BLAST hit for each query.

    Arguments:
      log_fh       - writable handle; receives one INFO line per updated product name
      polypeptides - dict keyed on query ID whose values expose an .annotation attribute
      blast_org    - dict updated in place: query ID -> organism of its top hit
      blast_list   - path to a list file, expanded with utils.read_list_file()
      cursor       - database cursor passed through to the get_uspdb_* lookups
      eval_cutoff  - rows with an E-value greater than this are skipped
      algorithm    - 'blast' or 'rapsearch2'; selects which columns are read

    Raises Exception if algorithm is not recognized.
    '''
    if algorithm not in ['blast', 'rapsearch2']:
        raise Exception(
            "algorithm argument must be either blast or rapsearch2")

    for path in utils.read_list_file(blast_list):
        last_qry_id = None

        # the context manager closes each evidence file as soon as it has been read
        with open(path) as blast_fh:
            for line in blast_fh:
                # 0 indexing is faster than startswith()
                if line[0] == '#':
                    continue

                line = line.rstrip()
                cols = line.split("\t")
                this_qry_id = cols[0]

                if algorithm == 'blast':
                    e_value = float(cols[19])
                else:
                    ## rapsearch2 can actually report values outside of python's double range.  Handle these
                    # NOTE(review): math.pow only overflows for large positive exponents
                    #  (i.e. very large E-values), yet 0 passes any cutoff - confirm intended
                    try:
                        e_value = math.pow(10, float(cols[10]))
                    except OverflowError:
                        print(
                            "WARN: couldn't handle E-value math on the following line (setting to 0):\n{0}"
                            .format(line))
                        e_value = 0

                # skip this line if it doesn't meet the cutoff
                if e_value > eval_cutoff:
                    continue

                # the BLAST hits are sorted already with the top hit for each query first
                if last_qry_id == this_qry_id:
                    continue

                annot = polypeptides[this_qry_id].annotation

                # the subject accession is in a different column for each algorithm
                accession = cols[5] if algorithm == 'blast' else cols[1]

                if accession.startswith('sp|'):
                    # pluck the second part out of this:
                    #  sp|Q4PEV8|EIF3M_USTMA
                    accession = accession.split('|')[1]

                assertions = get_uspdb_annot(accession, cursor)
                blast_org[this_qry_id] = assertions['organism']

                # save it, unless the gene product name has already changed from the default
                if annot.product_name == DEFAULT_PRODUCT_NAME:
                    if algorithm == 'blast':
                        # current hack until DB is updated:
                        # some products look like this:
                        #    Coatomer subunit gamma-2 OS=Bos taurus GN=COPG2 PE=2 SV=1
                        # take off everything after the OS=
                        m = re.search("(.+) OS=", cols[15])
                        annot.product_name = m.group(1) if m else cols[15]
                    else:
                        annot.product_name = assertions['product']

                    log_fh.write(
                        "INFO: {0}: Updated product name to '{1}' based on BLAST hit to SPROT accession '{2}'\n"
                        .format(this_qry_id, annot.product_name, accession))

                # if no EC numbers have been set, they can inherit from this
                if len(annot.ec_numbers) == 0:
                    for ec_annot in get_uspdb_ec_nums(accession, cursor):
                        annot.add_ec_number(ec_annot)

                # if no GO IDs have been set, they can inherit from this
                if len(annot.go_annotations) == 0:
                    for go_annot in get_uspdb_go_terms(accession, cursor):
                        annot.add_go_annotation(go_annot)

                # if no gene symbol has been set, it can inherit from this
                if annot.gene_symbol is None:
                    annot.gene_symbol = assertions['symbol']

                # remember the ID we just saw
                last_qry_id = this_qry_id
def parse_kegg_blast_evidence(log_fh, polypeptides, blast_list, eval_cutoff):
    '''
    Reads a list file of NCBI BLAST evidence against KEGG and a dict of polypeptides,
    populating each with Annotation evidence where appropriate.  Only attaches evidence if
    the product name is the default.

    Currently only considers the top BLAST hit for each query which doesn't have
    'uncharacterized' or hypothetical in the product name.

    Arguments:
      log_fh       - writable handle; receives one INFO line per updated product name
      polypeptides - dict keyed on query ID whose values expose an .annotation attribute
      blast_list   - path to a list file, expanded with utils.read_list_file()
      eval_cutoff  - rows whose E-value (column 19) is greater than this are skipped

    Columns used (0-based, tab-delimited): 0 = query ID, 5 = subject accession,
    15 = product field, 19 = E-value.
    '''
    # the product field looks like this:
    # dam; adenine-specific DNA methyltransferase; K06223 DNA adenine methylase [EC:2.1.1.72]
    # troponin I type 1 (skeletal, slow); K10371 troponin I, slow skeletal muscle
    # (raw strings; same patterns as before without the unneeded escapes)
    kegg_ec_re = re.compile(r"; (K\d+)\s+(.+) \[EC:(.+)\]")
    kegg_re = re.compile(r"; (K\d+)\s+(.+)")

    for path in utils.read_list_file(blast_list):
        last_qry_id = None

        # the context manager closes each evidence file as soon as it has been read
        with open(path) as blast_fh:
            for line in blast_fh:
                cols = line.rstrip().split("\t")
                product_field = cols[15]

                # We're going to ignore any lines which have a few keywords in the name
                # First character left off for initcap reasons
                if 'ncharacterized' in product_field or 'ypothetical' in product_field:
                    continue

                this_qry_id = cols[0]

                # skip this line if it doesn't meet the cutoff
                if float(cols[19]) > eval_cutoff:
                    continue

                # the BLAST hits are sorted already with the top hit for each query first
                if last_qry_id == this_qry_id:
                    continue

                annot = polypeptides[this_qry_id].annotation

                # get the accession from the cols[5]
                accession = cols[5]

                # save it, unless the gene product name has already changed from the default
                if annot.product_name == DEFAULT_PRODUCT_NAME:
                    if ' [EC' in product_field and product_field.endswith(']'):
                        m = kegg_ec_re.search(product_field)
                    else:
                        m = kegg_re.search(product_field)

                    if m:
                        kegg_id = m.group(1)

                        # only the EC-aware pattern has a third group
                        ec_num = m.group(3) if m.re.groups == 3 else None

                        annot.product_name = m.group(2)
                        log_fh.write(
                            "INFO: {0}: Updated product name to '{1}' based on BLAST hit to KEGG accession '{2}'\n"
                            .format(this_qry_id, annot.product_name,
                                    accession))

                        # truthiness covers both None and the empty string; the old
                        # "is not ''" test compared object identity, not value
                        # NOTE(review): the bracket content is passed on whole - confirm
                        #  whether it can hold more than one EC number
                        if ec_num:
                            ec = annotation.ECAnnotation(number=ec_num)
                            annot.add_ec_number(ec)

                        kegg_dbxref = annotation.Dbxref(db='KEGG',
                                                        identifier=kegg_id)
                        annot.add_dbxref(kegg_dbxref)

                # remember the ID we just saw
                last_qry_id = this_qry_id
def parse_uniref100_blast_evidence( log_fh, polypeptides, blast_list, cursor, eval_cutoff, algorithm, uniref100_fasta_path ):
    '''
    Reads a list file of NCBI BLAST evidence and a dict of polypeptides, populating
    each with Annotation evidence where appropriate.  Only attaches evidence if
    the product name is the default.

    Currently only considers the top BLAST hit for each query.

    Arguments:
      log_fh               - writable handle; receives an INFO line when a product name
                             is updated from a 'blast' hit
      polypeptides         - dict keyed on query ID whose values expose an .annotation attribute
      blast_list           - path to a list file, expanded with utils.read_list_file()
      cursor               - database cursor passed through to the get_uniref_* lookups
      eval_cutoff          - rows with an E-value greater than this are skipped
      algorithm            - 'blast' or 'rapsearch2'; selects which columns are read
      uniref100_fasta_path - UniRef100 FASTA file; only read when algorithm is 'rapsearch2'

    Raises Exception if algorithm is not recognized, or (for 'blast') if the product
    column does not have the expected 'n=' / 'RepID=' layout.
    '''
    if algorithm not in ['blast', 'rapsearch2']:
        raise Exception("algorithm argument must be either blast or rapsearch2")

    # Hits whose product contains any of these are ignored (blast only).
    # First character left off for initcap reasons
    skip_products = ('ncharacterized', 'ypothetical', 'enomic scaffold')

    ## need to load the UniRef100 to TREMBL accession lookup from the FASTA
    # like UniRef100_K1T359 -> K1T359_9ZZZZ
    uniref2acc = dict()
    print("INFO: parsing UniRef100 FASTA headers for annotation")
    if algorithm == 'rapsearch2':
        # raw string; same pattern as before without the unneeded escapes
        header_re = re.compile(r">(\S+) (.+) n=.+RepID=(\S+)")

        with open(uniref100_fasta_path) as fasta_fh:
            for line in fasta_fh:
                if line[0] == '>':
                    m = header_re.match(line)
                    if m:
                        uniref2acc[m.group(1)] = {'acc': m.group(3), 'prod': m.group(2)}

    for path in utils.read_list_file(blast_list):
        last_qry_id = None

        # the context manager closes each evidence file as soon as it has been read
        with open(path) as blast_fh:
            for line in blast_fh:
                # 0 indexing is faster than startswith()
                if line[0] == '#':
                    continue

                line = line.rstrip()
                cols = line.split("\t")
                this_qry_id = cols[0]

                if algorithm == 'blast':
                    # We're going to ignore any lines which have a few keywords in the name
                    if any(keyword in cols[15] for keyword in skip_products):
                        continue

                    e_value = float(cols[19])
                else:
                    ## rapsearch2 can actually report values outside of python's double range.  Handle these
                    # NOTE(review): math.pow only overflows for large positive exponents
                    #  (i.e. very large E-values), yet 0 passes any cutoff - confirm intended
                    try:
                        e_value = math.pow(10, float(cols[10]))
                    except OverflowError:
                        print("WARN: couldn't handle E-value math on the following line (setting to 0):\n{0}".format(line))
                        e_value = 0

                # skip this line if it doesn't meet the cutoff
                if e_value > eval_cutoff:
                    continue

                # the BLAST hits are sorted already with the top hit for each query first
                if last_qry_id == this_qry_id:
                    continue

                annot = polypeptides[this_qry_id].annotation

                # get the accession then process for known accession types
                if algorithm == 'blast':
                    # UniRef100_K1T359 -> K1T359_9ZZZZ
                    m = re.search(r"RepID=(\S+)", cols[15])
                    if not m:
                        raise Exception("ERROR: Unexpected product format in UniRef BLAST results: {0}".format(cols[15]))
                    accession = m.group(1)
                else:
                    accession = uniref2acc[cols[1]]['acc']

                assertions = get_uniref_annot( accession, cursor )

                # save it, unless the gene product name has already changed from the default
                if annot.product_name == DEFAULT_PRODUCT_NAME:
                    if algorithm == 'blast':
                        # these hits look like this:
                        #  AD-specific glutamate dehydrogenase n=1 Tax=Ceriporiopsis subvermispora (strain B) RepID=M2RLB9_CERS8
                        m = re.match(r"(.+) n=.+", cols[15])
                        if not m:
                            raise Exception("ERROR: Unexpected product format in UniRef BLAST results: {0}".format(cols[15]))
                        annot.product_name = m.group(1)

                        log_fh.write("INFO: {0}: Updated product name to '{1}' based on BLAST hit to UniRef100 accession '{2}'\n".format(this_qry_id, annot.product_name, accession))
                    else:
                        annot.product_name = uniref2acc[cols[1]]['prod']

                # if no EC numbers have been set, they can inherit from this
                if len(annot.ec_numbers) == 0:
                    for ec_annot in get_uniref_ec_nums( accession, cursor ):
                        annot.add_ec_number(ec_annot)

                # if no GO IDs have been set, they can inherit from this
                if len(annot.go_annotations) == 0:
                    for go_annot in get_uniref_go_terms( accession, cursor ):
                        annot.add_go_annotation(go_annot)

                # if no gene symbol has been set, it can inherit from this
                if annot.gene_symbol is None:
                    annot.gene_symbol = assertions['symbol']

                # remember the ID we just saw
                last_qry_id = this_qry_id
def parse_tmhmm_evidence( log_fh, polypeptides, htab_list ):
    '''
    Reads a list of raw TMHMM evidence and a dict of polypeptides, adding annotation
    attributes where possible.

    Arguments:
      log_fh       - writable handle which receives one INFO line per updated query
      polypeptides - dict-like, keyed on query ID, whose values expose an .annotation
      htab_list    - path to a list file naming the raw TMHMM output files to read

    Every query whose record contains at least MIN_HELICAL_SPANS 'TMhelix' rows gets the
    GO annotation added, and additionally gets GENE_PRODUCT_NAME as its product name if
    the product name is still the default.  A query's record is closed either by the
    header line of the next query or by the end of the file, so the last query in each
    file is evaluated too.

    Notes from the esteemed M Giglio:
    The GO term to use would be GO:0016021 "integral component of membrane"
    Or if you want to be more conservative you could go with GO:0016020 "membrane"
    
    Depends on the evidence. For the prok pipe we are pretty conservative, we require five TMHMM
    domains and then we call it putative integral membrane protein. 

    On ECO - in fact Marcus and I are the developers of ECO.  It is an ontology of evidence types.
    An annotation to an ECO term is used in conjunction with another annotation, like a GO term
    (but many other types of annotation can, and are, used with ECO). It provides additional
    information about the annotation. In fact for GO, the assignment of an evidence term along
    with a GO term is a required part of a GO annotation. (ECO terms are the "evidence codes" in GO.)

    INPUT: Expected TMHMM input (all HTML lines are skipped)
    # CHARM010_V2.mRNA.887 Length: 904
    # CHARM010_V2.mRNA.887 Number of predicted TMHs:  6
    # CHARM010_V2.mRNA.887 Exp number of AAs in TMHs: 133.07638
    # CHARM010_V2.mRNA.887 Exp number, first 60 AAs:  21.83212
    # CHARM010_V2.mRNA.887 Total prob of N-in:        0.99994
    # CHARM010_V2.mRNA.887 POSSIBLE N-term signal sequence
    CHARM010_V2.mRNA.887	TMHMM2.0	inside	     1    11
    CHARM010_V2.mRNA.887	TMHMM2.0	TMhelix	    12    34
    CHARM010_V2.mRNA.887	TMHMM2.0	outside	    35   712
    CHARM010_V2.mRNA.887	TMHMM2.0	TMhelix	   713   735
    CHARM010_V2.mRNA.887	TMHMM2.0	inside	   736   755
    CHARM010_V2.mRNA.887	TMHMM2.0	TMhelix	   756   773
    CHARM010_V2.mRNA.887	TMHMM2.0	outside	   774   782
    CHARM010_V2.mRNA.887	TMHMM2.0	TMhelix	   783   805
    CHARM010_V2.mRNA.887	TMHMM2.0	inside	   806   809
    CHARM010_V2.mRNA.887	TMHMM2.0	TMhelix	   810   832
    CHARM010_V2.mRNA.887	TMHMM2.0	outside	   833   871
    CHARM010_V2.mRNA.887	TMHMM2.0	TMhelix	   872   894
    CHARM010_V2.mRNA.887	TMHMM2.0	inside	   895   904
    '''
    # The number of helices spanning the membrane required before counted as a membrane protein
    MIN_HELICAL_SPANS = 3

    # For successful matches, this is the product name which gets applied
    GENE_PRODUCT_NAME = 'Putative integral membrane protein'

    # The '# <id> Length: <n>' line is the first line of every query's record
    header_re = re.compile(r"# (.+?)\s+Length: \d+")

    def _save_result(qry_id, helix_count):
        # Applies the finished helix tally of a single query to its annotation
        if helix_count < MIN_HELICAL_SPANS:
            return

        annot = polypeptides[qry_id].annotation

        if annot.product_name == DEFAULT_PRODUCT_NAME:
            annot.product_name = GENE_PRODUCT_NAME
            log_fh.write("INFO: {0}: Updated product name to '{1}' because it had {2} TMHelix domains predicted by TMHMM\n".format(qry_id, annot.product_name, helix_count))
        else:
            log_fh.write("INFO: {0}: TMHMM predicted {1} TMHelix domains but gene product name unchanged because of previous assignment\n".format(qry_id, helix_count))

        ## we add the GO terms no matter what
        annot.add_go_annotation(annotation.GOAnnotation(go_id='0016021'))

    for path in utils.read_list_file(htab_list):
        last_qry_id = None
        current_helix_count = 0

        # the context manager makes sure each evidence file is closed again
        with open(path) as tmhmm_fh:
            for line in tmhmm_fh:
                # HTML lines are not part of the results
                if line.startswith('<'):
                    continue

                m = header_re.match(line)

                if m:
                    # a new record starts here, so the previous one is complete
                    _save_result(last_qry_id, current_helix_count)

                    # reset
                    last_qry_id = m.group(1)
                    current_helix_count = 0
                    continue

                cols = line.split()
                if len(cols) == 5 and cols[2] == 'TMhelix':
                    current_helix_count += 1

        # The last record of a file has no following header line to trigger the
        #  save, so it has to be handled once the file is exhausted.
        _save_result(last_qry_id, current_helix_count)
def parse_sprot_blast_evidence( log_fh, polypeptides, blast_org, blast_list, cursor, eval_cutoff, algorithm ):
    '''
    Reads a list file of NCBI BLAST evidence and a dict of polypeptides, populating
    each with Annotation evidence where appropriate.  Only attaches evidence if
    the product name is the default.

    Currently only considers the top BLAST hit for each query.

    Arguments:
      log_fh       - writable handle which receives one INFO line per product name update
      polypeptides - dict-like, keyed on query ID, whose values expose an .annotation
      blast_org    - dict-like which gets the hit's organism stored under the query ID
      blast_list   - path to a list file naming the tab-delimited result files to read
      cursor       - passed through to the get_uspdb_* lookup functions
      eval_cutoff  - hits with an E-value greater than this are skipped
      algorithm    - 'blast' or 'rapsearch2'; decides which columns are read

    Columns read ('blast'):      0 = query ID, 5 = accession, 15 = product, 19 = E-value
    Columns read ('rapsearch2'): 0 = query ID, 1 = accession, 10 = exponent of the E-value
                                 (the E-value is computed as 10 ** cols[10])

    Raises an Exception if algorithm is not one of the two supported values.
    '''
    if algorithm not in ['blast', 'rapsearch2']:
        raise Exception("algorithm argument must be either blast or rapsearch2")
    
    for path in utils.read_list_file(blast_list):
        last_qry_id = None

        # the context manager makes sure each evidence file is closed again
        with open(path) as blast_fh:
            for line in blast_fh:
                # 0 indexing is faster than startswith()
                if line[0] == '#':
                    continue

                line = line.rstrip()
                cols = line.split("\t")
                this_qry_id = cols[0]

                if algorithm == 'blast':
                    e_value = float(cols[19])
                elif algorithm == 'rapsearch2':
                    ## rapsearch2 can actually report values outside of python's double range.
                    #  math.pow() only raises OverflowError when the result is too LARGE for a
                    #  double (results which are too small silently become 0.0), so a failure
                    #  here means the E-value is enormous and must not be mistaken for a
                    #  perfect hit.
                    try:
                        e_value = math.pow(10, float(cols[10]))
                    except OverflowError:
                        print("WARN: couldn't handle E-value math on the following line (treating E-value as infinite):\n{0}".format(line))
                        e_value = float('inf')

                # skip this line if it doesn't meet the cutoff
                if e_value > eval_cutoff:
                    continue

                # the BLAST hits are sorted already with the top hit for each query first
                if last_qry_id != this_qry_id:
                    annot = polypeptides[this_qry_id].annotation

                    # get the accession from the column appropriate for the algorithm,
                    #  then process for known accession types
                    if algorithm == 'blast':
                        accession = cols[5]
                    elif algorithm == 'rapsearch2':
                        accession = cols[1]

                    if accession.startswith('sp|'):
                        # pluck the second part out of this:
                        #  sp|Q4PEV8|EIF3M_USTMA
                        accession = accession.split('|')[1]

                    assertions = get_uspdb_annot( accession, cursor )

                    # the organism is recorded even if nothing else gets updated below
                    blast_org[this_qry_id] = assertions['organism']

                    # save it, unless the gene product name has already changed from the default
                    if annot.product_name == DEFAULT_PRODUCT_NAME:
                        if algorithm == 'blast':
                            # current hack until DB is updated:
                            # some products look like this:
                            #    Coatomer subunit gamma-2 OS=Bos taurus GN=COPG2 PE=2 SV=1
                            # take off everything after the OS=
                            m = re.search("(.+) OS=", cols[15])

                            if m:
                                annot.product_name = m.group(1)
                            else:
                                annot.product_name = cols[15]
                        elif algorithm == 'rapsearch2':
                            annot.product_name = assertions['product']

                        log_fh.write("INFO: {0}: Updated product name to '{1}' based on BLAST hit to SPROT accession '{2}'\n".format(this_qry_id, annot.product_name, accession))

                    # if no EC numbers have been set, they can inherit from this
                    if len(annot.ec_numbers) == 0:
                        for ec_annot in get_uspdb_ec_nums( accession, cursor ):
                            annot.add_ec_number(ec_annot)

                    # if no GO IDs have been set, they can inherit from this
                    if len(annot.go_annotations) == 0:
                        for go_annot in get_uspdb_go_terms( accession, cursor ):
                            annot.add_go_annotation(go_annot)

                    # if no gene symbol has been set, it can inherit from this
                    if annot.gene_symbol is None:
                        annot.gene_symbol = assertions['symbol']

                    # remember the ID we just saw
                    last_qry_id = this_qry_id
def parse_kegg_blast_evidence(log_fh, polypeptides, blast_list, eval_cutoff):
    '''
    Reads a list file of NCBI BLAST evidence against KEGG and a dict of polypeptides,
    populating each with Annotation evidence where appropriate.  Only attaches evidence if
    the product name is the default.

    Currently only considers the top BLAST hit for each query which doesn't have
    'uncharacterized' or hypothetical in the product name.

    Arguments:
      log_fh       - writable handle which receives one INFO line per product name update
      polypeptides - dict-like, keyed on query ID, whose values expose an .annotation
      blast_list   - path to a list file naming the tab-delimited result files to read
      eval_cutoff  - hits with an E-value (column 19) greater than this are skipped

    Columns read: 0 = query ID, 5 = accession, 15 = product, 19 = E-value

    When the product field can be parsed, the annotation gets the product name, a KEGG
    Dbxref for the K number and, if the field ends in an [EC:...] section, an EC number.
    '''
    # the product field looks like this:
    # dam; adenine-specific DNA methyltransferase; K06223 DNA adenine methylase [EC:2.1.1.72]
    # troponin I type 1 (skeletal, slow); K10371 troponin I, slow skeletal muscle
    product_with_ec_re = re.compile(r"; (K\d+)\s+(.+) \[EC:(.+)\]")
    product_without_ec_re = re.compile(r"; (K\d+)\s+(.+)")

    for path in utils.read_list_file(blast_list):
        last_qry_id = None

        # the context manager makes sure each evidence file is closed again
        with open(path) as blast_fh:
            for line in blast_fh:
                line = line.rstrip()
                cols = line.split("\t")

                # We're going to ignore any lines which have a few keywords in the name
                # First character left off for initcap reasons
                if 'ncharacterized' in cols[15] or 'ypothetical' in cols[15]:
                    continue

                this_qry_id = cols[0]

                # skip this line if it doesn't meet the cutoff
                if float(cols[19]) > eval_cutoff:
                    continue

                # the BLAST hits are sorted already with the top hit for each query first
                if last_qry_id != this_qry_id:
                    annot = polypeptides[this_qry_id].annotation

                    # get the accession from the cols[5]
                    accession = cols[5]

                    # save it, unless the gene product name has already changed from the default
                    if annot.product_name == DEFAULT_PRODUCT_NAME:
                        if ' [EC' in cols[15] and cols[15].endswith(']'):
                            m = product_with_ec_re.search(cols[15])
                        else:
                            m = product_without_ec_re.search(cols[15])

                        if m:
                            kegg_id = m.group(1)
                            product = m.group(2)

                            # only the pattern with the EC section has a third group
                            if len(m.groups()) == 3:
                                ec_num = m.group(3)
                            else:
                                ec_num = None

                            annot.product_name = product
                            log_fh.write("INFO: {0}: Updated product name to '{1}' based on BLAST hit to KEGG accession '{2}'\n".format(this_qry_id, annot.product_name, accession))

                            # truthiness covers both None and the empty string; comparing
                            #  against '' with 'is' tests identity rather than equality
                            if ec_num:
                                ec = annotation.ECAnnotation(number=ec_num)
                                annot.add_ec_number(ec)

                            kegg_dbxref = annotation.Dbxref(db='KEGG', identifier=kegg_id)
                            annot.add_dbxref(kegg_dbxref)

                    # remember the ID we just saw
                    last_qry_id = this_qry_id
# Example #13
# 0
    PROJECT_DIR = Project_Directory(
        ARGS.project_dir, ARGS.project_name,
        ["summary", "logs", "history", "minimum_spanning_set"],
        ["patterns", "flags", "history"])

    config_logging(
        os.path.join(
            PROJECT_DIR.get_sub_directory("logs"),
            "{0}_pattern_selection.log".format(PROJECT_DIR.project_name)),
        ARGS.log)

    LOGGER = logging.getLogger(__name__)

    try:
        if ARGS.req_loci_file is not None:
            ARGS.required_loci = read_list_file(ARGS.req_loci_file)
    except IOError:
        LOGGER.error("Cannot open required loci file: %s", ARGS.req_loci_file)
        raise

    try:
        if ARGS.excl_loci_file is not None:
            ARGS.exclude_loci = read_list_file(ARGS.excl_loci_file)
    except IOError:
        LOGGER.error("Cannot open excluded loci file: %s", ARGS.excl_loci_file)
        raise

    try:
        if ARGS.excl_strains_file is not None:
            ARGS.exclude_strains = read_list_file(ARGS.excl_strains_file)
    except IOError: