Пример #1
0
def parse_blastxml(input_path, augustus_mapping, feature_table,
                   annotation_count_with_putative_function, gene_counter,
                   locus_tag, min_coverage, min_ident):
    """Write one feature-table record per BLASTX query of an Augustus run.

    The file at ``input_path`` is read with ``NCBIXML.parse``.  For every
    BLASTX entry the gene and CDS coordinates are looked up in
    ``augustus_mapping`` and a gene/CDS/mRNA record is written to
    ``feature_table``.  If at least one HSP passes the identity and coverage
    thresholds, the record built from the HSP with the highest bit score is
    written; otherwise a 'hypothetical protein' record is written.

    Parsing stops at the first entry that was not produced by BLASTX or
    whose query length is not a multiple of three.

    Args:
        input_path: Path of the BLAST XML file.  Its base name without the
            extension becomes the name on the '>Feature' header line.
        augustus_mapping: Mapping from query id (first whitespace-separated
            token of the query title) to an object exposing ``mRNA`` (with
            ``seq_type``, ``start``, ``stop``) and ``exons`` (a sequence of
            regions with ``start`` and ``stop``).
        feature_table: Object with a ``write`` method receiving the text.
        annotation_count_with_putative_function: Running count, incremented
            for each written best-hit record that does not contain the
            'hypothetical protein' product line.
        gene_counter: Running gene number, incremented once per processed
            entry and used in locus tags and protein/transcript ids.
        locus_tag: Prefix for locus tags and ids.
        min_coverage: Factor applied to one third of the query length; the
            HSP query span must be at least that long.
        min_ident: Percent identity an HSP has to exceed (strictly).

    Returns:
        tuple: ``(gene_counter, annotation_count_with_putative_function)``
        after all processed entries.

    Raises:
        AssertionError: If the mapped ``mRNA`` is not of ``seq_type``
            'gene', or (no qualifying HSP) the first exon is not of
            ``seq_type`` 'CDS'.
    """
    hypothetical_product = '\t\t\tproduct\thypothetical protein\n'

    # The feature header is the file name without directory and extension.
    seq_name = os.path.splitext(os.path.split(input_path)[-1])[0]
    feature_table.write('>Feature %s\n' % seq_name)

    with open(input_path) as blast_handle:
        for entry in NCBIXML.parse(blast_handle):
            if entry.application != "BLASTX":
                break
            # Test divisibility on the raw length.  Checking whether
            # ``length / 3`` is a float can never succeed with integer
            # (floor) division and always succeeds with true division.
            if entry.query_length % 3:
                print("Query length is not a multiple of three")
                break
            # NOTE(review): HSP query spans are compared against a third of
            # the query length; if the HSP coordinates are nucleotide
            # positions the effective threshold is three times looser than
            # min_coverage suggests -- confirm.
            query_length = entry.query_length // 3

            query_info = augustus_mapping[entry.query.split()[0]]
            assert query_info.mRNA.seq_type == 'gene'
            gene_start = query_info.mRNA.start
            gene_end = query_info.mRNA.stop
            cds = query_info.exons

            gene_counter += 1

            # Lines shared by every record variant of this gene.
            gene_line = '%i\t%i\tgene\n' % (gene_start, gene_end)
            locus_line = '\t\t\tlocus_tag\t%s%04d\n' % (locus_tag,
                                                        gene_counter)
            id_lines = (
                '\t\t\tprotein_id\tgnl|PBUF|%s%04d\n' % (locus_tag,
                                                         gene_counter) +
                '\t\t\ttranscript_id\tgnl|PBUF|%smrna%04d\n' % (locus_tag,
                                                                gene_counter))

            # Candidate records keyed by bit score.  A later HSP with an
            # equal score replaces the record of an earlier one.
            records_by_bitscore = {}
            for alignment in entry.alignments:
                for hsp in alignment.hsps:
                    ident = (100 * float(hsp.identities) /
                             float(hsp.align_length))
                    span = int(hsp.query_end) - int(hsp.query_start) + 1
                    covered = span >= min_coverage * query_length
                    if not (ident > min_ident and covered):
                        continue

                    # The hit definition is expected to look like
                    # 'RecName: Full=<name>; Short=...; AltName: Full=...'.
                    print(alignment.hit_def)
                    hit_def = alignment.hit_def
                    if not isinstance(hit_def, str):
                        hit_def = hit_def.encode('utf8')
                    fields = [field.strip() for field in hit_def.split(';')]
                    rec_names = [field for field in fields
                                 if field.startswith('RecName:')]
                    # Without a 'RecName:' field fall back to the first
                    # field instead of failing with an IndexError.
                    description = rec_names[0] if rec_names else fields[0]
                    accession = change_according_reviewer(
                        description.split('Full=')[-1], note_line=False)

                    parts = [gene_line, locus_line]

                    if check_short_introns(cds):
                        # Short introns: the record consists of the gene
                        # lines plus pseudo/note only, without CDS or mRNA
                        # intervals.  It has no product line, so if it wins
                        # it is counted as having a putative function.
                        parts.append('\t\t\tpseudo\n')
                        parts.append(
                            '\t\t\tnote\tnonfunctional; similar to %s\n' %
                            accession)
                        records_by_bitscore[hsp.bits] = ''.join(parts)
                        continue

                    if (accession.startswith(('hypothetical protein',
                                              'predicted protein')) or
                            accession in ('', 'protein')):
                        product_line = hypothetical_product
                    else:
                        product_line = ('\t\t\tproduct\tputative %s\n' %
                                        (accession))

                    # CDS and mRNA sections list the same intervals and
                    # carry the same qualifiers.
                    first_span = '%i\t%i' % (cds[0].start, cds[0].stop)
                    more_spans = ''.join(
                        '%i\t%i\n' % (region.start, region.stop)
                        for region in cds[1:])
                    inference = """ab initio prediction:Augustus:2.5.5"""
                    note = ("""similar to UniProtKB/Swiss-Prot Entry: %s""" %
                            (alignment.accession,))

                    parts.extend([
                        first_span + '\tCDS\n',
                        more_spans,
                        product_line,
                        id_lines,
                        first_span + '\tmRNA\n',
                        more_spans,
                        product_line,
                        id_lines,
                        '\t\t\tinference\t%s\n' % (inference),
                        '\t\t\tnote\t%s\n' % (note),
                    ])
                    records_by_bitscore[hsp.bits] = ''.join(parts)

            if not records_by_bitscore:
                # No HSP passed the thresholds: emit a hypothetical protein.
                assert cds[0].seq_type == 'CDS'
                parts = [gene_line, locus_line]
                if check_short_introns(cds):
                    parts.append('\t\t\tpseudo\n')
                    parts.append('\t\t\tnote\tnonfunctional\n')

                first_span = '%i\t%i' % (cds[0].start, cds[0].stop)
                more_spans = ''.join(
                    '%i\t%i\n' % (region.start, region.stop)
                    for region in cds[1:])
                parts.extend([
                    first_span + '\tCDS\n',
                    more_spans,
                    hypothetical_product,
                    id_lines,
                    '\t\t\tnote\tpredicted with Augustus 2.5.5\n',
                    first_span + '\tmRNA\n',
                    more_spans,
                    hypothetical_product,
                    id_lines,
                ])
                feature_table.write(''.join(parts))
            else:
                # Only the record of the highest-scoring HSP is written.
                best_record = records_by_bitscore[max(records_by_bitscore)]
                feature_table.write(best_record)
                if hypothetical_product not in best_record:
                    annotation_count_with_putative_function += 1

    return (gene_counter, annotation_count_with_putative_function)
Пример #2
0
def parse_blastxml(input_path, glimmer_mapping, feature_table,
                   annotation_count_with_putative_function, gene_counter,
                   locus_tag, min_coverage, min_ident):
    """Write one feature-table record per BLASTX query of a Glimmer run.

    The file at ``input_path`` is read with ``NCBIXML.parse``.  For every
    BLASTX entry the gene coordinates are looked up in ``glimmer_mapping``
    and a gene/CDS record is written to ``feature_table``.  If at least one
    HSP passes the identity and coverage thresholds, the record built from
    the HSP with the highest bit score is written; otherwise a
    'hypothetical protein' record is written.

    Parsing stops at the first entry that was not produced by BLASTX.

    Args:
        input_path: Path of the BLAST XML file.  The text between the last
            'Seq' and '.blastxml' is used on the '>Feature Seq...' header.
        glimmer_mapping: Mapping from query id (first whitespace-separated
            token of the query title) to a sequence whose first two items
            are the start and end coordinate (converted with ``int``).
        feature_table: Object with a ``write`` method receiving the text.
        annotation_count_with_putative_function: Running count, incremented
            for each written best-hit record that does not contain the
            'hypothetical protein' product line.
        gene_counter: Running gene number, incremented once per processed
            entry and used in locus tags and protein ids.
        locus_tag: Prefix for locus tags and protein ids.
        min_coverage: Factor applied to the query length; the HSP query
            span must be at least that long.
        min_ident: Percent identity an HSP has to exceed (strictly).

    Returns:
        tuple: ``(gene_counter, annotation_count_with_putative_function)``
        after all processed entries.

    Raises:
        AssertionError: If ``change_according_reviewer`` fails its two
            built-in sanity checks.
    """
    hypothetical_product = '\t\t\tproduct\thypothetical protein\n'

    # Sanity checks of the name clean-up helper.  They do not depend on the
    # input, so they run once per call instead of once per accepted HSP.
    assert change_according_reviewer(
        'Pimelyl-[acyl-carrier protein] methyl ester esterase',
        note_line=False
    ) == 'Pimelyl-[acyl-carrier protein] methyl ester esterase'
    assert change_according_reviewer(
        'putative D-malate dehydrogenase [decarboxylating] [gnl|PBUF|STVIR_0046:1-352] [gnl|PBUF|STVIR_0046: raw, aa len= 352]',
        note_line=False) == 'D-malate dehydrogenase'

    # Extract the sequence number from the file name.
    seq_number = input_path.split('Seq')[-1].split('.blastxml')[0]
    feature_table.write('>Feature Seq%s\n' % seq_number)

    with open(input_path) as blast_handle:
        for entry in NCBIXML.parse(blast_handle):
            if entry.application != "BLASTX":
                break
            query_length = entry.query_length
            # NOTE(review): this only fires when the parser reports a float
            # length; for integer lengths it is dead code.  The message
            # suggests a ``% 3`` test may have been intended -- confirm
            # before enabling it.
            if isinstance(query_length, float):
                print("Query length is not a multiple of three")
                break
            query_info = glimmer_mapping[entry.query.split()[0]]
            query_start = int(query_info[0])
            query_end = int(query_info[1])

            gene_counter += 1

            # Lines shared by every record variant of this gene.
            record_head = (
                '%i\t%i\tgene\n' % (query_start, query_end) +
                '\t\t\tlocus_tag\t%s%04d\n' % (locus_tag, gene_counter) +
                '%i\t%i\tCDS\n' % (query_start, query_end))
            protein_id_line = '\t\t\tprotein_id\tgnl|PBUF|%s%04d\n' % (
                locus_tag, gene_counter)

            # Candidate records keyed by bit score.  A later HSP with an
            # equal score replaces the record of an earlier one.
            records_by_bitscore = {}
            for alignment in entry.alignments:
                for hsp in alignment.hsps:
                    ident = (100 * float(hsp.identities) /
                             float(hsp.align_length))
                    span = int(hsp.query_end) - int(hsp.query_start) + 1
                    covered = span >= min_coverage * query_length
                    if not (ident > min_ident and covered):
                        continue

                    # The hit definition is expected to look like
                    # 'RecName: Full=<name>; Short=...; AltName: Full=...'.
                    print(alignment.hit_def)
                    hit_def = alignment.hit_def
                    if not isinstance(hit_def, str):
                        hit_def = hit_def.encode('utf8')
                    fields = [field.strip() for field in hit_def.split(';')]
                    rec_names = [field for field in fields
                                 if field.startswith('RecName:')]
                    # Without a 'RecName:' field fall back to the first
                    # field instead of failing with an IndexError.
                    description = rec_names[0] if rec_names else fields[0]
                    accession = change_according_reviewer(
                        description.split('Full=')[-1], note_line=False)

                    if (accession.startswith(('hypothetical protein',
                                              'predicted protein')) or
                            accession in ('', 'protein')):
                        product_line = hypothetical_product
                    else:
                        product_line = ('\t\t\tproduct\tputative %s\n' %
                                        (accession))

                    inference = """ab initio prediction:Glimmer:3"""
                    note = ("""similar to UniProtKB/Swiss-Prot Entry: %s""" %
                            (alignment.accession,))

                    records_by_bitscore[hsp.bits] = ''.join([
                        record_head,
                        product_line,
                        protein_id_line,
                        '\t\t\tinference\t%s\n' % (inference),
                        '\t\t\tnote\t%s\n' % (note),
                    ])

            if not records_by_bitscore:
                # No HSP reached the requested identity and coverage:
                # emit a hypothetical protein.
                feature_table.write(''.join([
                    record_head,
                    hypothetical_product,
                    protein_id_line,
                    '\t\t\tnote\tab initio prediction:Glimmer3\n',
                ]))
            else:
                # Only the record of the highest-scoring HSP is written.
                best_record = records_by_bitscore[max(records_by_bitscore)]
                feature_table.write(best_record)
                if hypothetical_product not in best_record:
                    annotation_count_with_putative_function += 1

    return (gene_counter, annotation_count_with_putative_function)
def parse_blastxml(input_path, augustus_mapping, feature_table, annotation_count_with_putative_function, gene_counter, locus_tag, min_coverage, min_ident):
    """Write one feature-table record per BLASTX query of an Augustus run.

    The file at ``input_path`` is read with ``NCBIXML.parse``.  For every
    BLASTX entry the gene and CDS coordinates are looked up in
    ``augustus_mapping`` and a gene/CDS/mRNA record is written to
    ``feature_table``.  If at least one HSP passes the identity and coverage
    thresholds, the record built from the HSP with the highest bit score is
    written; otherwise a 'hypothetical protein' record is written.

    Parsing stops at the first entry that was not produced by BLASTX or
    whose query length is not a multiple of three.

    Args:
        input_path: Path of the BLAST XML file.  Its base name without the
            extension becomes the name on the '>Feature' header line.
        augustus_mapping: Mapping from query id (first whitespace-separated
            token of the query title) to an object exposing ``mRNA`` (with
            ``seq_type``, ``start``, ``stop``) and ``exons`` (a sequence of
            regions with ``start`` and ``stop``).
        feature_table: Object with a ``write`` method receiving the text.
        annotation_count_with_putative_function: Running count, incremented
            for each written best-hit record that does not contain the
            'hypothetical protein' product line.
        gene_counter: Running gene number, incremented once per processed
            entry and used in locus tags and protein/transcript ids.
        locus_tag: Prefix for locus tags and ids.
        min_coverage: Factor applied to one third of the query length; the
            HSP query span must be at least that long.
        min_ident: Percent identity an HSP has to exceed (strictly).

    Returns:
        tuple: ``(gene_counter, annotation_count_with_putative_function)``
        after all processed entries.

    Raises:
        AssertionError: If the mapped ``mRNA`` is not of ``seq_type``
            'gene', or (no qualifying HSP) the first exon is not of
            ``seq_type`` 'CDS'.
    """
    hypothetical_product = '\t\t\tproduct\thypothetical protein\n'

    # The feature header is the file name without directory and extension.
    seq_name = os.path.splitext(os.path.split(input_path)[-1])[0]
    feature_table.write('>Feature %s\n' % seq_name)

    with open(input_path) as blast_handle:
        for entry in NCBIXML.parse(blast_handle):
            if entry.application != "BLASTX":
                break
            # Test divisibility on the raw length.  Checking whether
            # ``length / 3`` is a float can never succeed with integer
            # (floor) division and always succeeds with true division.
            if entry.query_length % 3:
                print("Query length is not a multiple of three")
                break
            # NOTE(review): HSP query spans are compared against a third of
            # the query length; if the HSP coordinates are nucleotide
            # positions the effective threshold is three times looser than
            # min_coverage suggests -- confirm.
            query_length = entry.query_length // 3

            query_info = augustus_mapping[entry.query.split()[0]]
            assert query_info.mRNA.seq_type == 'gene'
            gene_start = query_info.mRNA.start
            gene_end = query_info.mRNA.stop
            cds = query_info.exons

            gene_counter += 1

            # Lines shared by every record variant of this gene.
            gene_line = '%i\t%i\tgene\n' % (gene_start, gene_end)
            locus_line = '\t\t\tlocus_tag\t%s%04d\n' % (locus_tag, gene_counter)
            id_lines = (
                '\t\t\tprotein_id\tgnl|PBUF|%s%04d\n' % (locus_tag, gene_counter) +
                '\t\t\ttranscript_id\tgnl|PBUF|%smrna%04d\n' % (locus_tag, gene_counter))

            # Candidate records keyed by bit score.  A later HSP with an
            # equal score replaces the record of an earlier one.
            records_by_bitscore = {}
            for alignment in entry.alignments:
                for hsp in alignment.hsps:
                    ident = 100 * float(hsp.identities) / float(hsp.align_length)
                    span = int(hsp.query_end) - int(hsp.query_start) + 1
                    covered = span >= min_coverage * query_length
                    if not (ident > min_ident and covered):
                        continue

                    # The hit definition is expected to look like
                    # 'RecName: Full=<name>; Short=...; AltName: Full=...'.
                    print(alignment.hit_def)
                    hit_def = alignment.hit_def
                    if not isinstance(hit_def, str):
                        hit_def = hit_def.encode('utf8')
                    fields = [field.strip() for field in hit_def.split(';')]
                    rec_names = [field for field in fields if field.startswith('RecName:')]
                    # Without a 'RecName:' field fall back to the first
                    # field instead of failing with an IndexError.
                    description = rec_names[0] if rec_names else fields[0]
                    accession = change_according_reviewer(description.split('Full=')[-1], note_line = False)

                    parts = [gene_line, locus_line]

                    if check_short_introns(cds):
                        # Short introns: the record consists of the gene
                        # lines plus pseudo/note only, without CDS or mRNA
                        # intervals.  It has no product line, so if it wins
                        # it is counted as having a putative function.
                        parts.append('\t\t\tpseudo\n')
                        parts.append('\t\t\tnote\tnonfunctional; similar to %s\n' % accession)
                        records_by_bitscore[hsp.bits] = ''.join(parts)
                        continue

                    if (accession.startswith(('hypothetical protein', 'predicted protein')) or
                            accession in ('', 'protein')):
                        product_line = hypothetical_product
                    else:
                        product_line = '\t\t\tproduct\tputative %s\n' % (accession)

                    # CDS and mRNA sections list the same intervals and
                    # carry the same qualifiers.
                    first_span = '%i\t%i' % (cds[0].start, cds[0].stop)
                    more_spans = ''.join('%i\t%i\n' % (region.start, region.stop) for region in cds[1:])
                    inference = """ab initio prediction:Augustus:2.5.5"""
                    note = """similar to UniProtKB/Swiss-Prot Entry: %s""" % (alignment.accession,)

                    parts.extend([
                        first_span + '\tCDS\n',
                        more_spans,
                        product_line,
                        id_lines,
                        first_span + '\tmRNA\n',
                        more_spans,
                        product_line,
                        id_lines,
                        '\t\t\tinference\t%s\n' % (inference),
                        '\t\t\tnote\t%s\n' % (note),
                    ])
                    records_by_bitscore[hsp.bits] = ''.join(parts)

            if not records_by_bitscore:
                # No HSP passed the thresholds: emit a hypothetical protein.
                assert cds[0].seq_type == 'CDS'
                parts = [gene_line, locus_line]
                if check_short_introns(cds):
                    parts.append('\t\t\tpseudo\n')
                    parts.append('\t\t\tnote\tnonfunctional\n')

                first_span = '%i\t%i' % (cds[0].start, cds[0].stop)
                more_spans = ''.join('%i\t%i\n' % (region.start, region.stop) for region in cds[1:])
                parts.extend([
                    first_span + '\tCDS\n',
                    more_spans,
                    hypothetical_product,
                    id_lines,
                    '\t\t\tnote\tpredicted with Augustus 2.5.5\n',
                    first_span + '\tmRNA\n',
                    more_spans,
                    hypothetical_product,
                    id_lines,
                ])
                feature_table.write(''.join(parts))
            else:
                # Only the record of the highest-scoring HSP is written.
                best_record = records_by_bitscore[max(records_by_bitscore)]
                feature_table.write(best_record)
                if hypothetical_product not in best_record:
                    annotation_count_with_putative_function += 1

    return (gene_counter, annotation_count_with_putative_function)
def _glimmer_feature_header(query_start, query_end, locus_tag, gene_counter, product):
    """Return the gene/CDS lines shared by every Glimmer feature-table entry.

    The result holds five lines: the gene span, its locus_tag qualifier, the
    CDS span, the product qualifier and the protein_id qualifier. The
    locus tag and protein id are both built from ``locus_tag`` followed by
    ``gene_counter`` zero-padded to four digits.
    """
    locus_id = '%s%04d' % (locus_tag, gene_counter)
    return ''.join((
        '%i\t%i\tgene\n' % (query_start, query_end),
        '\t\t\tlocus_tag\t%s\n' % (locus_id,),
        '%i\t%i\tCDS\n' % (query_start, query_end),
        '\t\t\tproduct\t%s\n' % (product,),
        '\t\t\tprotein_id\tgnl|PBUF|%s\n' % (locus_id,),
    ))


def parse_blastxml(input_path, glimmer_mapping, feature_table, annotation_count_with_putative_function, gene_counter, locus_tag, min_coverage, min_ident):
    """Write the feature-table section for one BLASTX XML file of Glimmer genes.

    A ``>Feature Seq<number>`` header is written first, then one gene/CDS
    entry per BLAST record. For every record the qualifying HSP with the
    highest bit score supplies the product name; if no HSP qualifies, a
    'hypothetical protein' entry is written instead.

    Args:
        input_path: Path of the BLAST XML file. The text between the last
            'Seq' and '.blastxml' is used as the sequence number in the header.
        glimmer_mapping: Mapping from query id (first whitespace-separated
            token of the record's query line) to a sequence whose first two
            items are the gene start and end.
        feature_table: Writable object (anything with ``write``) receiving
            the feature-table text.
        annotation_count_with_putative_function: Running count; incremented
            for every gene whose written entry is not 'hypothetical protein'.
        gene_counter: Running gene number; incremented once per BLASTX record
            and used to number locus tags and protein ids.
        locus_tag: Prefix for locus tags and protein ids.
        min_coverage: Fraction of the query length the HSP's query span must
            reach (``>=``) for the hit to be used.
        min_ident: Percent identity a hit must exceed (``>``) to be used.

    Returns:
        Tuple ``(gene_counter, annotation_count_with_putative_function)``
        with the updated counters.

    Raises:
        KeyError: If a query id is missing from ``glimmer_mapping``.
        AssertionError: If the ``change_according_reviewer`` self-checks fail.
    """
    # extract the sequence number
    seq_number = input_path.split('Seq')[-1].split('.blastxml')[0]
    feature_table.write('>Feature Seq%s\n' % seq_number)

    hypothetical_product_line = '\t\t\tproduct\thypothetical protein\n'

    with open(input_path) as blast_handle:
        for entry in NCBIXML.parse(blast_handle):
            # Processing stops (not skips) at the first non-BLASTX record.
            if entry.application != "BLASTX":
                break

            query_length = entry.query_length
            # NOTE(review): the length is used as reported by the record, so
            # this guard presumably never fires -- confirm whether a
            # ``% 3`` check was intended.
            if isinstance(query_length, float):
                print("Query length is not a multiple of three")
                break
            query_id = entry.query.split()[0]
            query_info = glimmer_mapping[query_id]
            query_start = int(query_info[0])
            query_end = int(query_info[1])

            gene_counter += 1

            # Entry text per qualifying HSP, keyed by bit score; a later HSP
            # with an identical bit score replaces the earlier one.
            feature_table_text = {}

            for alignment in entry.alignments:
                for hsp in alignment.hsps:
                    ident = 100 * float(hsp.identities) / float(hsp.align_length)
                    covered = int(hsp.query_end) - int(hsp.query_start) + 1

                    # only annotate hits above the identity threshold that
                    # also span the required fraction of the query
                    if not (ident > min_ident and covered >= min_coverage * query_length):
                        continue

                    accession = alignment.hit_def.split('OS=')[0].strip()
                    # Self-checks of change_according_reviewer, kept where the
                    # original ran them (once per qualifying HSP).
                    assert change_according_reviewer('Pimelyl-[acyl-carrier protein] methyl ester esterase', note_line = False) == 'Pimelyl-[acyl-carrier protein] methyl ester esterase'
                    assert change_according_reviewer('putative D-malate dehydrogenase [decarboxylating] [gnl|PBUF|STVIR_0046:1-352] [gnl|PBUF|STVIR_0046: raw, aa len= 352]', note_line = False) == 'D-malate dehydrogenase'
                    accession = change_according_reviewer(accession, note_line = False)

                    if accession.startswith(('hypothetical protein', 'predicted protein')) \
                            or accession in ('', 'protein'):
                        product = 'hypothetical protein'
                    else:
                        product = 'putative ' + accession

                    feature_table_text[hsp.bits] = ''.join((
                        _glimmer_feature_header(query_start, query_end, locus_tag, gene_counter, product),
                        '\t\t\tinference\tab initio prediction:Glimmer:3\n',
                        '\t\t\tnote\tsimilar to UniProtKB/Swiss-Prot Entry: %s\n' % (alignment.accession,),
                    ))

            if not feature_table_text:
                # No HSP met the identity and coverage thresholds: insert a
                # hypothetical protein.
                feature_table.write(
                    _glimmer_feature_header(query_start, query_end, locus_tag, gene_counter, 'hypothetical protein')
                    + '\t\t\tnote\tab initio prediction:Glimmer3\n'
                )
            else:
                # max() over the keys works on Python 2 lists and Python 3
                # dict views alike, unlike keys().sort().
                best_text = feature_table_text[max(feature_table_text)]
                feature_table.write(best_text)
                if hypothetical_product_line not in best_text:
                    annotation_count_with_putative_function += 1

    return (gene_counter, annotation_count_with_putative_function)