def initialize_polypeptides( log_fh, fasta_file ):
    '''
    Loads the sequences found in fasta_file and returns a dict, keyed by sequence ID, holding
    one things.Polypeptide per sequence.

    Each polypeptide is created with the sequence as its residues (and the sequence length as
    its length) and is given its own annotation.FunctionalAnnotation whose product name is
    DEFAULT_PRODUCT_NAME.  One INFO line per sequence, recording that initial product name,
    is written to log_fh.
    '''
    fasta_entries = utils.fasta_dict_from_file(fasta_file)
    polypeptides_by_id = dict()

    for entry_id in fasta_entries:
        # the sequence itself is stored under the 's' key of each FASTA entry
        residues = fasta_entries[entry_id]['s']

        new_polypeptide = things.Polypeptide(id=entry_id, length=len(residues), residues=residues)
        default_annotation = annotation.FunctionalAnnotation(product_name=DEFAULT_PRODUCT_NAME)

        log_line = "INFO: {0}: Set initial product name to '{1}'\n".format(entry_id, DEFAULT_PRODUCT_NAME)
        log_fh.write(log_line)

        new_polypeptide.annotation = default_annotation
        polypeptides_by_id[entry_id] = new_polypeptide

    return polypeptides_by_id
def main():
    '''
    Command-line entry point: converts a GenBank flat file to GFF3.

    Options:
        -i/--input_file   GenBank file to read (required)
        -o/--output_file  GFF3 file to create; the GFF3 goes to STDOUT if omitted
        --with_fasta      append a ##FASTA section with the record sequences (default)
        --no_fasta        omit the ##FASTA section

    Skipped-feature warnings are printed to STDOUT when the GFF3 is written to a file.  When
    the GFF3 itself goes to STDOUT they are sent to STDERR instead, so that they can't end up
    interleaved with the GFF3 lines.

    Both the input handle and any output file opened here are closed before returning, even
    if the conversion raises.
    '''
    parser = argparse.ArgumentParser(
        description='Convert GenBank flat files to GFF3 format')

    ## input and output files
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to an input GBK file')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=False,
                        help='Path to an output GFF file to be created')
    parser.add_argument(
        '--with_fasta',
        dest='fasta',
        action='store_true',
        help=
        'Include the FASTA section with genomic sequence at end of file.  (default)'
    )
    parser.add_argument('--no_fasta', dest='fasta', action='store_false')
    parser.set_defaults(fasta=True)
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    if args.output_file is None:
        ofh = sys.stdout
        # the GFF3 and the warnings would otherwise share STDOUT
        msg_fh = sys.stderr
    else:
        ofh = open(args.output_file, 'wt')
        msg_fh = sys.stdout

    try:
        ofh.write("##gff-version 3\n")

        with open(args.input_file, "r") as ifh:
            _write_genbank_as_gff3(ifh, ofh, msg_fh, args.fasta)
    finally:
        # only close what was opened here, never STDOUT itself
        if ofh is not sys.stdout:
            ofh.close()


def _write_genbank_as_gff3(ifh, ofh, msg_fh, write_fasta):
    '''
    Private helper for main(): reads GenBank records from ifh and writes GFF3 to ofh.

    ifh         - open handle on the GenBank file (handed to SeqIO.parse)
    ofh         - handle the GFF3 (and optional ##FASTA section) is written to; the
                  "##gff-version 3" header is expected to have been written already
    msg_fh      - handle that skipped-feature warnings are printed to
    write_fasta - if true, and at least one record had sequence, a ##FASTA section with
                  every assembly is appended after the last gene

    'gene' features start a new things.Gene; the previous gene is printed at that point and
    the last one after all records are read.  mRNA/tRNA/rRNA features are attached to the
    current gene, and every part of a CDS location becomes a CDS plus an exon on the current
    RNA (an mRNA and polypeptide are created on the fly if the CDS is met before any RNA).
    'source' features are ignored; any other feature type is reported on msg_fh and counted.

    Raises Exception for an unstranded feature, for an RNA/CDS feature met before any gene
    feature, and for two RNAs that end up with the same generated ID.
    '''
    assemblies = dict()
    current_gene = None
    current_RNA = None

    rna_count_by_gene = defaultdict(int)
    exon_count_by_RNA = defaultdict(int)

    seqs_pending_writes = False

    features_skipped_count = 0

    def start_exon_count(rna_id):
        # registers a newly created RNA, refusing an ID that has been used before
        if rna_id in exon_count_by_RNA:
            raise Exception(
                "ERROR: two different RNAs found with same ID: {0}".
                format(rna_id))
        exon_count_by_RNA[rna_id] = 0

    # each gb_record is a SeqRecord object
    for gb_record in SeqIO.parse(ifh, "genbank"):
        mol_id = gb_record.name

        if mol_id not in assemblies:
            assemblies[mol_id] = things.Assembly(id=mol_id)

        current_assembly = assemblies[mol_id]

        residues = str(gb_record.seq)
        if len(residues) > 0:
            seqs_pending_writes = True
            current_assembly.residues = residues
            current_assembly.length = len(residues)

        # each feat is a SeqFeature object
        for feat in gb_record.features:
            fmin = int(feat.location.start)
            fmax = int(feat.location.end)

            if feat.location.strand == 1:
                strand = '+'
            elif feat.location.strand == -1:
                strand = '-'
            else:
                raise Exception(
                    "ERROR: unstranded feature encountered: {0}".format(feat))

            if feat.type == 'source':
                continue

            # everything below a gene needs a gene to hang from; fail with a clear message
            #  rather than an unbound-variable error
            if feat.type in ('mRNA', 'tRNA', 'rRNA', 'CDS') and current_gene is None:
                raise Exception(
                    "ERROR: {0} feature found before any gene feature: {1}".format(
                        feat.type, feat))

            if feat.type == 'gene':
                # print the previous gene (if there is one)
                if current_gene is not None:
                    current_gene.print_as(fh=ofh, source='GenBank', format='gff3')

                locus_tag = feat.qualifiers['locus_tag'][0]
                current_gene = things.Gene(id=locus_tag, locus_tag=locus_tag)
                current_gene.locate_on(target=current_assembly,
                                       fmin=fmin,
                                       fmax=fmax,
                                       strand=strand)
                current_RNA = None

            elif feat.type == 'mRNA':
                locus_tag = feat.qualifiers['locus_tag'][0]
                rna_count_by_gene[locus_tag] += 1
                feat_id = "{0}.mRNA.{1}".format(locus_tag,
                                                rna_count_by_gene[locus_tag])

                mRNA = things.mRNA(id=feat_id,
                                   parent=current_gene,
                                   locus_tag=locus_tag)
                mRNA.locate_on(target=current_assembly,
                               fmin=fmin,
                               fmax=fmax,
                               strand=strand)
                current_gene.add_mRNA(mRNA)
                current_RNA = mRNA
                start_exon_count(feat_id)

            elif feat.type == 'tRNA':
                locus_tag = feat.qualifiers['locus_tag'][0]
                rna_count_by_gene[locus_tag] += 1
                feat_id = "{0}.tRNA.{1}".format(locus_tag,
                                                rna_count_by_gene[locus_tag])

                # the product qualifier (when present) is what gets stored as the anticodon
                if 'product' in feat.qualifiers:
                    anticodon = feat.qualifiers['product'][0]
                else:
                    anticodon = None

                tRNA = things.tRNA(id=feat_id,
                                   parent=current_gene,
                                   anticodon=anticodon)
                tRNA.locate_on(target=current_assembly,
                               fmin=fmin,
                               fmax=fmax,
                               strand=strand)
                current_gene.add_tRNA(tRNA)
                current_RNA = tRNA
                start_exon_count(feat_id)

            elif feat.type == 'rRNA':
                locus_tag = feat.qualifiers['locus_tag'][0]
                rna_count_by_gene[locus_tag] += 1
                feat_id = "{0}.rRNA.{1}".format(locus_tag,
                                                rna_count_by_gene[locus_tag])

                if 'product' in feat.qualifiers:
                    product = feat.qualifiers['product'][0]
                else:
                    product = None

                annot = annotation.FunctionalAnnotation(product_name=product)

                rRNA = things.rRNA(id=feat_id,
                                   parent=current_gene,
                                   annotation=annot)
                rRNA.locate_on(target=current_assembly,
                               fmin=fmin,
                               fmax=fmax,
                               strand=strand)
                current_gene.add_rRNA(rRNA)
                current_RNA = rRNA
                start_exon_count(feat_id)

            elif feat.type == 'CDS':
                locus_tag = feat.qualifiers['locus_tag'][0]
                # If processing a prokaryotic GBK, we'll encounter CDS before mRNA, so we have to
                #  manually make one
                if current_RNA is None:
                    feat_id = "{0}.mRNA.{1}".format(
                        locus_tag, rna_count_by_gene[locus_tag])
                    mRNA = things.mRNA(id=feat_id, parent=current_gene)
                    mRNA.locate_on(target=current_assembly,
                                   fmin=fmin,
                                   fmax=fmax,
                                   strand=strand)
                    current_gene.add_mRNA(mRNA)
                    current_RNA = mRNA

                    if 'product' in feat.qualifiers:
                        product = feat.qualifiers['product'][0]
                    else:
                        product = None

                    if 'gene' in feat.qualifiers:
                        gene_symbol = feat.qualifiers['gene'][0]
                    else:
                        gene_symbol = None

                    annot = annotation.FunctionalAnnotation(
                        product_name=product, gene_symbol=gene_symbol)

                    if 'db_xref' in feat.qualifiers:
                        for dbxref in feat.qualifiers['db_xref']:
                            annot.add_dbxref(dbxref)

                    polypeptide_id = "{0}.polypeptide.{1}".format(
                        locus_tag, rna_count_by_gene[locus_tag])
                    polypeptide = things.Polypeptide(id=polypeptide_id,
                                                     parent=mRNA,
                                                     annotation=annot)
                    mRNA.add_polypeptide(polypeptide)

                exon_count_by_RNA[current_RNA.id] += 1
                # all parts of this CDS feature share one CDS ID
                cds_id = "{0}.CDS.{1}".format(
                    current_RNA.id, exon_count_by_RNA[current_RNA.id])
                current_CDS_phase = 0

                for loc in feat.location.parts:
                    subfmin = int(loc.start)
                    subfmax = int(loc.end)

                    CDS = things.CDS(id=cds_id, parent=current_RNA)
                    CDS.locate_on(target=current_assembly,
                                  fmin=subfmin,
                                  fmax=subfmax,
                                  strand=strand,
                                  phase=current_CDS_phase)
                    current_RNA.add_CDS(CDS)

                    # calculate the starting phase for the next CDS feature (in case there is one)
                    # 0 + 6 = 0     TTGCAT
                    # 0 + 7 = 2     TTGCATG
                    # 1 + 6 = 1     TTGCAT
                    # 2 + 7 = 1     TTGCATG
                    # general: 3 - ((length - previous phase) % 3)
                    current_CDS_phase = 3 - ((
                        (subfmax - subfmin) - current_CDS_phase) % 3)
                    if current_CDS_phase == 3:
                        current_CDS_phase = 0

                    exon_id = "{0}.exon.{1}".format(
                        current_RNA.id, exon_count_by_RNA[current_RNA.id])
                    exon = things.Exon(id=exon_id, parent=current_RNA)
                    exon.locate_on(target=current_assembly,
                                   fmin=subfmin,
                                   fmax=subfmax,
                                   strand=strand)
                    current_RNA.add_exon(exon)
                    exon_count_by_RNA[current_RNA.id] += 1

            else:
                print(
                    "WARNING: The following feature was skipped:\n{0}".format(
                        feat),
                    file=msg_fh)
                features_skipped_count += 1

    # don't forget to do the last gene, if there were any
    if current_gene is not None:
        current_gene.print_as(fh=ofh, source='GenBank', format='gff3')

    if write_fasta and seqs_pending_writes:
        ofh.write("##FASTA\n")
        for assembly_id in assemblies:
            ofh.write(">{0}\n".format(assembly_id))
            ofh.write("{0}\n".format(
                utils.wrapped_fasta(assemblies[assembly_id].residues)))

    if features_skipped_count > 0:
        print("Warning: {0} unsupported feature types were skipped".format(
            features_skipped_count),
              file=msg_fh)
Exemplo n.º 3
0
def main():
    '''
    One-off correction script with hard-coded file names.

    Loads the gene models from 'canonical.flawed.gff3', reads the polypeptide features of
    'Theileria-all-Theileria1_ourids.gff' (keyed by their Parent attribute), and uses each
    polypeptide's coordinates to delete or trim the CDS features of the mRNA it belongs to.
    Every gene is then written to 'canonical.corrected.gff3' with 'GenBank' as the source.

    An mRNA that no polypeptide names as its parent is reported with a DEBUG line and its
    CDSs are left as they are.

    Progress (INFO/DEBUG lines) is printed to STDOUT.  Both files opened here are closed
    before returning.
    '''
    flawed_gff_file = 'canonical.flawed.gff3'
    ilri_gff = 'Theileria-all-Theileria1_ourids.gff'
    source = 'GenBank'
    out_gff = 'canonical.corrected.gff3'

    with open(out_gff, mode='wt', encoding='utf-8') as fout:
        fout.write("##gff-version 3\n")

        (assemblies, features) = gff.get_gff3_features(flawed_gff_file)

        print("INFO: loaded {0} assemblies and {1} features".format(
            len(assemblies), len(features)))

        # polypeptides from the ILRI file, keyed by the ID of their parent feature
        polypeptides = dict()

        with open(ilri_gff) as ilri_fh:
            for line in ilri_fh:
                cols = line.split("\t")

                if len(cols) != 9 or cols[2] != 'polypeptide':
                    continue

                polypeptide_id = gff.column_9_value(cols[8], 'ID')
                parent_id = gff.column_9_value(cols[8], 'Parent')
                polypeptides[parent_id] = things.Polypeptide(id=polypeptide_id,
                                                             parent=parent_id)
                # the GFF start column has 1 subtracted to give fmin; the end is used as-is
                polypeptides[parent_id].locate_on(target=assemblies[cols[0]],
                                                  fmin=int(cols[3]) - 1,
                                                  fmax=int(cols[4]),
                                                  strand=cols[6])

        print("DEBUG: loaded {0} polypeptides from ILRI file".format(
            len(polypeptides)))

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    if mRNA.id not in polypeptides:
                        print(
                            "DEBUG: {0} not found as a parent to any polypeptide".
                            format(mRNA.id))
                        # nothing to trim against; in particular, don't reuse the
                        #  polypeptide of the previous mRNA
                        continue

                    polypeptide = polypeptides[mRNA.id]

                    # iterate over a snapshot, since deleting a CDS may modify the
                    #  collection the mRNA hands back
                    for CDS in list(mRNA.CDSs()):
                        # NOTE(review): the comparison operators are defined by the things
                        #  classes; presumably < / > mean "entirely outside" and <= / >= mean
                        #  "overhangs that end" of the polypeptide - confirm
                        if CDS < polypeptide:
                            mRNA.delete_CDS(CDS)
                        elif CDS <= polypeptide:
                            CDS.location().fmin = polypeptide.location().fmin
                        if CDS > polypeptide:
                            mRNA.delete_CDS(CDS)
                        elif CDS >= polypeptide:
                            CDS.location().fmax = polypeptide.location().fmax

                gene.print_as(fh=fout, source=source, format='gff3')
Exemplo n.º 4
0
def main():
    '''
    Command-line entry point: converts a Metagenemark GFF file to GFF3.

    Options:
        -i/--input          GFF file created by Metagenemark (required)
        -o/--output         GFF3 file to create (required)
        -p/--prefix         prefix used when generating feature IDs (required)
        -pf/--protein_fasta optional protein FASTA file to create from the ##Protein blocks

    All files opened here are closed before returning, even if the conversion raises.
    '''
    parser = argparse.ArgumentParser( description='Metagenemark GFF -> GFF3 conversion script')

    ## input and output files
    parser.add_argument('-i', '--input', type=str, required=True, help='Path to a GFF file created by Metagenemark' )
    parser.add_argument('-o', '--output', type=str, required=True, help='Path to an output file to be created' )
    parser.add_argument('-p', '--prefix', type=str, required=True, help='Prefix to use in ID generation')
    parser.add_argument('-pf', '--protein_fasta', type=str, required=False, help='Optional protein FASTA to be written')
    args = parser.parse_args()

    with open(args.output, mode='wt', encoding='utf-8') as fout:
        fout.write("##gff-version 3\n")

        # stays None when no protein FASTA was requested
        protein_out = None
        if args.protein_fasta is not None:
            protein_out = open(args.protein_fasta, mode='wt', encoding='utf-8')

        try:
            with open(args.input) as in_fh:
                _convert_metagenemark_gff(in_fh, fout, protein_out, args.prefix)
        finally:
            if protein_out is not None:
                protein_out.close()


def _convert_metagenemark_gff(in_fh, fout, protein_out, prefix):
    '''
    Private helper for main(): reads Metagenemark GFF lines from in_fh and writes GFF3 to fout.

    in_fh       - open handle on the Metagenemark GFF
    fout        - handle the GFF3 is written to; the "##gff-version 3" header is expected to
                  have been written already
    protein_out - handle for the protein FASTA, or None if none is wanted
    prefix      - prefix for the generated IDs, e.g. "<prefix>.gene.<N>"

    Every 9-column feature line (type 'gene' or 'CDS') becomes a gene with one mRNA, CDS, exon
    and polypeptide, all sharing the coordinates and strand of the input line.  Comment lines
    are buffered and written after the next feature line; whatever is still buffered after the
    last feature line is written at the end.  The input's own ##gff-version line is dropped.
    Lines between "##Protein N" and "##end-Protein" are also copied (minus their first two
    characters) to protein_out under the polypeptide ID generated for gene N.

    Raises Exception for an unexpected feature type, a 9th column that doesn't start with a
    gene_id, or a malformed ##Protein line.
    '''
    protein_header_re = re.compile(r"##Protein (\d+)")
    mol_id_re = re.compile(r'^(\S+) ')
    gene_id_re = re.compile(r'gene_id[ =](\d+)')

    assemblies = dict()

    # key like 2 = SRS014890.polypeptide.2
    polypeptide_lookup = dict()
    writing_protein = False

    current_gene_comment_lines = list()

    for line in in_fh:
        if line.startswith("#"):
            if line.startswith("##FASTA"):
                # written out with one more leading '#'
                current_gene_comment_lines.append("#{0}".format(line))

            elif line.startswith("##end-Protein"):
                writing_protein = False
                current_gene_comment_lines.append(line)

            # since we're already doing our own header, don't duplicate the old one
            elif line.startswith("##gff-version"):
                continue
            else:
                if line.startswith("##Protein "):
                    m = protein_header_re.match(line)
                    if m:
                        writing_protein = True
                        if protein_out is not None:
                            protein_out.write(">{0}\n".format(polypeptide_lookup[m.group(1)]))
                    else:
                        raise Exception("ERROR: Expected line to match: ##Protein N")
                elif writing_protein and protein_out is not None:
                    # drop the leading '##' of the sequence line
                    protein_out.write(line[2:])

                current_gene_comment_lines.append(line)

            continue

        cols = line.split("\t")

        if len(cols) != 9:
            continue

        # only the part of the molecule ID before the first space is kept
        mol_id = cols[0]
        mol_id_m = mol_id_re.match(mol_id)

        if mol_id_m:
            # NOTE(review): looks like leftover debugging output - confirm before removing
            print("MATCH!")
            mol_id = mol_id_m.group(1)

        feat_type = cols[2]

        ## we expect only gene types here
        if feat_type not in ['gene', 'CDS']:
            raise Exception("ERROR: expected only 'gene' or 'CDS' feature types as input (depending on metagenemark version).")

        m_gene = gene_id_re.match(cols[8])

        if m_gene:
            gene_num = m_gene.group(1)
        else:
            raise Exception("ERROR: expected 9th column to have gene ids like: gene_id 5")

        ## initialize this assembly if we haven't seen it yet
        if mol_id not in assemblies:
            assemblies[mol_id] = things.Assembly(id=mol_id)

        current_assembly = assemblies[mol_id]

        # every feature made from this line shares these coordinates; the GFF start column
        #  has 1 subtracted to give fmin
        fmin = int(cols[3]) - 1
        fmax = int(cols[4])
        strand = cols[6]

        gene = things.Gene(id="{0}.gene.{1}".format(prefix, gene_num))
        gene.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )

        mRNA = things.mRNA(id="{0}.mRNA.{1}".format(prefix, gene_num), parent=gene.id)
        mRNA.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
        gene.add_mRNA(mRNA)

        CDS = things.CDS(id="{0}.CDS.{1}".format(prefix, gene_num), parent=mRNA.id)
        CDS.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand, phase=int(cols[7]) )
        mRNA.add_CDS(CDS)

        exon = things.Exon(id="{0}.exon.{1}".format(prefix, gene_num), parent=mRNA.id)
        exon.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
        mRNA.add_exon(exon)

        polypeptide_id = "{0}.polypeptide.{1}".format(prefix, gene_num)
        polypeptide = things.Polypeptide(id=polypeptide_id, parent=mRNA.id)
        polypeptide.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
        mRNA.add_polypeptide(polypeptide)
        # remembered so a later "##Protein N" line can be given this ID in the FASTA
        polypeptide_lookup[gene_num] = polypeptide_id

        gene.print_as(fh=fout, source='GeneMark.hmm', format='gff3')
        fout.write( "".join(current_gene_comment_lines) )
        current_gene_comment_lines = list()

    # comment lines are otherwise only flushed after a feature line, so anything that follows
    #  the last feature would be silently dropped
    fout.write( "".join(current_gene_comment_lines) )
def check_and_add_polypeptide(mRNA):
    '''
    Makes sure the passed mRNA has a polypeptide.  If mRNA.polypeptides() is empty, a new
    things.Polypeptide with the ID "<mRNA id>.polypeptide" and the mRNA's ID as its parent is
    added to the mRNA; otherwise the mRNA is left untouched.  Nothing is returned.
    '''
    # already has at least one polypeptide, so there is nothing to add
    if len(mRNA.polypeptides()) > 0:
        return

    new_id = "{0}.polypeptide".format(mRNA.id)
    mRNA.add_polypeptide(things.Polypeptide(id=new_id, parent=mRNA.id))