def main():
    """Convert a GenBank flat file into GFF3.

    Command-line options:
        -i/--input_file   GenBank file to read (required).
        -o/--output_file  GFF3 file to create; STDOUT is used when omitted.
        --with_fasta / --no_fasta
                          Whether to append a ##FASTA section (on by default).

    Features are handled in file order.  A 'gene' feature starts a new gene
    and causes the previous one to be printed; mRNA, tRNA and rRNA features
    are attached to the current gene; every part of a CDS location becomes one
    CDS and one exon on the current RNA.  A CDS seen while no RNA is current
    gets an mRNA and a polypeptide created for it.  'source' features are
    ignored and any other feature type is skipped with a warning.

    Warnings are written to STDERR so they can never end up inside the GFF3
    stream when the output is STDOUT.

    Raises:
        Exception: if a (non-source) feature has no strand, or if two RNAs
            end up with the same generated ID.
    """
    parser = argparse.ArgumentParser(
        description='Convert GenBank flat files to GFF3 format')

    ## input / output paths and FASTA toggle
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to an input GBK file')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=False,
                        help='Path to an output GFF file to be created')
    parser.add_argument(
        '--with_fasta',
        dest='fasta',
        action='store_true',
        help=
        'Include the FASTA section with genomic sequence at end of file.  (default)'
    )
    parser.add_argument('--no_fasta', dest='fasta', action='store_false')
    parser.set_defaults(fasta=True)
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    try:
        ofh.write("##gff-version 3\n")

        assemblies = dict()
        current_assembly = None
        current_gene = None
        current_RNA = None

        rna_count_by_gene = defaultdict(int)
        exon_count_by_RNA = defaultdict(int)

        seqs_pending_writes = False
        features_skipped_count = 0

        def register_rna(rna_id):
            # Every explicitly annotated RNA must have a unique ID; its
            # CDS/exon counter starts at zero.
            if rna_id in exon_count_by_RNA:
                raise Exception(
                    "ERROR: two different RNAs found with same ID: {0}".
                    format(rna_id))
            exon_count_by_RNA[rna_id] = 0

        with open(args.input_file, "r") as ifh:
            # each gb_record is a SeqRecord object
            for gb_record in SeqIO.parse(ifh, "genbank"):
                mol_id = gb_record.name

                if mol_id not in assemblies:
                    assemblies[mol_id] = things.Assembly(id=mol_id)

                residues = str(gb_record.seq)
                if len(residues) > 0:
                    seqs_pending_writes = True
                    assemblies[mol_id].residues = residues
                    assemblies[mol_id].length = len(residues)

                current_assembly = assemblies[mol_id]

                # each feat is a SeqFeature object
                for feat in gb_record.features:
                    # 'source' features are never converted, so skip them
                    # before the strand check can reject them.
                    if feat.type == 'source':
                        continue

                    fmin = int(feat.location.start)
                    fmax = int(feat.location.end)

                    if feat.location.strand == 1:
                        strand = '+'
                    elif feat.location.strand == -1:
                        strand = '-'
                    else:
                        raise Exception(
                            "ERROR: unstranded feature encountered: {0}".
                            format(feat))

                    if feat.type == 'gene':
                        # print the previous gene (if there is one)
                        if current_gene is not None:
                            current_gene.print_as(fh=ofh,
                                                  source='GenBank',
                                                  format='gff3')

                        locus_tag = feat.qualifiers['locus_tag'][0]
                        current_gene = things.Gene(id=locus_tag,
                                                   locus_tag=locus_tag)
                        current_gene.locate_on(target=current_assembly,
                                               fmin=fmin,
                                               fmax=fmax,
                                               strand=strand)
                        current_RNA = None

                    elif feat.type == 'mRNA':
                        locus_tag = feat.qualifiers['locus_tag'][0]
                        rna_count_by_gene[locus_tag] += 1
                        feat_id = "{0}.mRNA.{1}".format(
                            locus_tag, rna_count_by_gene[locus_tag])

                        mRNA = things.mRNA(id=feat_id,
                                           parent=current_gene,
                                           locus_tag=locus_tag)
                        mRNA.locate_on(target=current_assembly,
                                       fmin=fmin,
                                       fmax=fmax,
                                       strand=strand)
                        current_gene.add_mRNA(mRNA)
                        current_RNA = mRNA
                        register_rna(feat_id)

                    elif feat.type == 'tRNA':
                        locus_tag = feat.qualifiers['locus_tag'][0]
                        rna_count_by_gene[locus_tag] += 1
                        feat_id = "{0}.tRNA.{1}".format(
                            locus_tag, rna_count_by_gene[locus_tag])

                        # the 'product' qualifier is what gets stored as the
                        # tRNA's anticodon value
                        if 'product' in feat.qualifiers:
                            anticodon = feat.qualifiers['product'][0]
                        else:
                            anticodon = None

                        tRNA = things.tRNA(id=feat_id,
                                           parent=current_gene,
                                           anticodon=anticodon)
                        tRNA.locate_on(target=current_assembly,
                                       fmin=fmin,
                                       fmax=fmax,
                                       strand=strand)
                        current_gene.add_tRNA(tRNA)
                        current_RNA = tRNA
                        register_rna(feat_id)

                    elif feat.type == 'rRNA':
                        locus_tag = feat.qualifiers['locus_tag'][0]
                        rna_count_by_gene[locus_tag] += 1
                        feat_id = "{0}.rRNA.{1}".format(
                            locus_tag, rna_count_by_gene[locus_tag])

                        if 'product' in feat.qualifiers:
                            product = feat.qualifiers['product'][0]
                        else:
                            product = None

                        annot = annotation.FunctionalAnnotation(
                            product_name=product)

                        rRNA = things.rRNA(id=feat_id,
                                           parent=current_gene,
                                           annotation=annot)
                        rRNA.locate_on(target=current_assembly,
                                       fmin=fmin,
                                       fmax=fmax,
                                       strand=strand)
                        current_gene.add_rRNA(rRNA)
                        current_RNA = rRNA
                        register_rna(feat_id)

                    elif feat.type == 'CDS':
                        locus_tag = feat.qualifiers['locus_tag'][0]
                        # If processing a prokaryotic GBK, we'll encounter CDS
                        #  before mRNA, so we have to manually make one
                        if current_RNA is None:
                            feat_id = "{0}.mRNA.{1}".format(
                                locus_tag, rna_count_by_gene[locus_tag])
                            mRNA = things.mRNA(id=feat_id, parent=current_gene)
                            mRNA.locate_on(target=current_assembly,
                                           fmin=fmin,
                                           fmax=fmax,
                                           strand=strand)
                            current_gene.add_mRNA(mRNA)
                            current_RNA = mRNA

                            if 'product' in feat.qualifiers:
                                product = feat.qualifiers['product'][0]
                            else:
                                product = None

                            if 'gene' in feat.qualifiers:
                                gene_symbol = feat.qualifiers['gene'][0]
                            else:
                                gene_symbol = None

                            annot = annotation.FunctionalAnnotation(
                                product_name=product, gene_symbol=gene_symbol)

                            if 'db_xref' in feat.qualifiers:
                                for dbxref in feat.qualifiers['db_xref']:
                                    annot.add_dbxref(dbxref)

                            polypeptide_id = "{0}.polypeptide.{1}".format(
                                locus_tag, rna_count_by_gene[locus_tag])
                            polypeptide = things.Polypeptide(
                                id=polypeptide_id,
                                parent=mRNA,
                                annotation=annot)
                            mRNA.add_polypeptide(polypeptide)

                        exon_count_by_RNA[current_RNA.id] += 1
                        # All parts of this CDS feature share a single CDS ID
                        cds_id = "{0}.CDS.{1}".format(
                            current_RNA.id, exon_count_by_RNA[current_RNA.id])
                        current_CDS_phase = 0

                        for loc in feat.location.parts:
                            subfmin = int(loc.start)
                            subfmax = int(loc.end)

                            CDS = things.CDS(id=cds_id, parent=current_RNA)
                            CDS.locate_on(target=current_assembly,
                                          fmin=subfmin,
                                          fmax=subfmax,
                                          strand=strand,
                                          phase=current_CDS_phase)
                            current_RNA.add_CDS(CDS)

                            # calculate the starting phase for the next CDS
                            # feature (in case there is one)
                            # 0 + 6 = 0     TTGCAT
                            # 0 + 7 = 2     TTGCATG
                            # 1 + 6 = 1     TTGCAT
                            # 2 + 7 = 1     TTGCATG
                            # general: 3 - ((length - previous phase) % 3)
                            current_CDS_phase = 3 - ((
                                (subfmax - subfmin) - current_CDS_phase) % 3)
                            if current_CDS_phase == 3:
                                current_CDS_phase = 0

                            exon_id = "{0}.exon.{1}".format(
                                current_RNA.id,
                                exon_count_by_RNA[current_RNA.id])
                            exon = things.Exon(id=exon_id, parent=current_RNA)
                            exon.locate_on(target=current_assembly,
                                           fmin=subfmin,
                                           fmax=subfmax,
                                           strand=strand)
                            current_RNA.add_exon(exon)
                            exon_count_by_RNA[current_RNA.id] += 1

                    else:
                        # Diagnostics go to STDERR: STDOUT may be the GFF3
                        # destination.
                        print(
                            "WARNING: The following feature was skipped:\n{0}".
                            format(feat),
                            file=sys.stderr)
                        features_skipped_count += 1

        # don't forget to do the last gene, if there were any
        if current_gene is not None:
            current_gene.print_as(fh=ofh, source='GenBank', format='gff3')

        if args.fasta is True and seqs_pending_writes is True:
            ofh.write("##FASTA\n")
            for assembly_id in assemblies:
                ofh.write(">{0}\n".format(assembly_id))
                ofh.write("{0}\n".format(
                    utils.wrapped_fasta(assemblies[assembly_id].residues)))

        if features_skipped_count > 0:
            print("Warning: {0} unsupported feature types were skipped".format(
                features_skipped_count),
                  file=sys.stderr)
    finally:
        # Close the file we opened ourselves, but never STDOUT
        if ofh is not sys.stdout:
            ofh.close()
示例#2
0
def main():
    """Convert Prodigal GFF output into GFF3 gene models.

    Command-line options:
        -i/--input   GFF file created by Prodigal (required).
        -o/--output  GFF3 file to create (required).

    Each 9-column CDS row yields a gene, one mRNA (ID suffixed with '.t1'),
    one CDS and one exon that all share the row's coordinates and strand; the
    gene is written as soon as it is built.  Comment lines, rows without
    exactly nine tab-separated columns, and rows whose type is not CDS are
    ignored.

    Raises:
        Exception: if two CDS rows would generate the same mRNA ID.
    """
    parser = argparse.ArgumentParser(
        description='Convert GFF output from Prodigal into GFF3 format')

    ## input and output paths
    parser.add_argument('-i',
                        '--input',
                        type=str,
                        required=True,
                        help='Path to a GFF file created by Prodigal')
    parser.add_argument('-o',
                        '--output',
                        type=str,
                        required=True,
                        help='Path to an output file to be created')
    args = parser.parse_args()

    assemblies = dict()

    ## Used for tracking the exon count for each mRNA (for ID purposes) and
    ## for detecting duplicated mRNA IDs
    exon_count_by_mRNA = dict()

    with open(args.output, mode='wt', encoding='utf-8') as fout, \
            open(args.input) as fin:
        fout.write("##gff-version 3\n")

        for line in fin:
            if line.startswith("#"):
                continue

            cols = line.split("\t")

            if len(cols) != 9:
                continue

            # Only CDS rows produce a gene model; there is nothing to print
            # for any other feature type.
            if cols[2] != "CDS":
                continue

            mol_id = cols[0]
            feat_id = gff.column_9_value(cols[8], 'ID')
            fmin = int(cols[3]) - 1
            fmax = int(cols[4])
            strand = cols[6]

            ## initialize this assembly if we haven't seen it yet
            if mol_id not in assemblies:
                assemblies[mol_id] = things.Assembly(id=mol_id)

            current_assembly = assemblies[mol_id]

            # The key checked here must be the same one that is stored below
            mRNA_id = feat_id + '.t1'
            if mRNA_id in exon_count_by_mRNA:
                raise Exception(
                    "ERROR: two different mRNAs found with same ID: {0}".
                    format(mRNA_id))
            exon_count_by_mRNA[mRNA_id] = 0

            # gene
            gene = things.Gene(id=feat_id)
            gene.locate_on(target=current_assembly,
                           fmin=fmin,
                           fmax=fmax,
                           strand=strand)

            # mRNA
            mRNA = things.mRNA(id=mRNA_id, parent=gene)
            mRNA.locate_on(target=current_assembly,
                           fmin=fmin,
                           fmax=fmax,
                           strand=strand)
            gene.add_mRNA(mRNA)

            # CDS
            CDS = things.CDS(id=mRNA_id + '.cds', parent=mRNA)
            CDS.locate_on(target=current_assembly,
                          fmin=fmin,
                          fmax=fmax,
                          strand=strand,
                          phase=int(cols[7]))
            mRNA.add_CDS(CDS)

            # exons weren't explicitly defined in the input file, so we need
            # to derive new IDs for them
            exon_count_by_mRNA[mRNA_id] += 1
            exon_id = "{0}.exon{1}".format(mRNA_id,
                                           exon_count_by_mRNA[mRNA_id])

            exon = things.Exon(id=exon_id, parent=mRNA)
            exon.locate_on(target=current_assembly,
                           fmin=fmin,
                           fmax=fmax,
                           strand=strand)
            mRNA.add_exon(exon)

            gene.print_as(fh=fout, source='Prodigal_v2.6.3', format='gff3')
示例#3
0
def main():
    """Convert CEGMA GFF output into spec-legal GFF3.

    Command-line options:
        -i/--input_file   CEGMA GFF file to parse (required).
        -o/--output_file  GFF3 file to create (required).

    A row of type 'Single' or 'First' starts a new gene (and its one mRNA)
    and causes the previous gene to be written.  Every row of type 'First',
    'Internal', 'Terminal' or 'Single' adds one CDS and one exon to the
    current mRNA and widens the gene's span to include it.  The gene and mRNA
    are only located on the assembly when they are written, once their full
    span is known.  IDs are generated from running counters.

    Comment lines and rows without exactly nine tab-separated columns are
    ignored.
    """
    parser = argparse.ArgumentParser(
        description='Converts CEGMA GFF output to spec-legal GFF3')

    # input and output paths
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to an input file to parse')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=True,
                        help='Path to an output file to be created')

    args = parser.parse_args()

    def write_gene(gene, mRNA, assembly, fmin, fmax, strand, fh):
        # The gene and its mRNA share the span accumulated from their rows
        gene.locate_on(target=assembly, fmin=fmin, fmax=fmax, strand=strand)
        mRNA.locate_on(target=assembly, fmin=fmin, fmax=fmax, strand=strand)
        gene.print_as(fh=fh, source='cegma', format='gff3')

    assemblies = dict()
    current_assembly = None
    current_gene = None
    current_mRNA = None

    current_gene_fmin = None
    current_gene_fmax = None
    current_gene_strand = None

    next_id_nums = {'gene': 1, 'mRNA': 1, 'CDS': 1, 'exon': 1}
    exon_column_types = ['First', 'Internal', 'Terminal', 'Single']

    with open(args.output_file, 'w') as fout, \
            open(args.input_file, 'r') as fin:
        fout.write("##gff-version 3\n")

        for line in fin:
            if line.startswith('#'):
                continue

            cols = line.split("\t")

            if len(cols) != 9:
                continue

            mol_id = cols[0]
            feat_type = cols[2]
            feat_fmin = int(cols[3]) - 1
            feat_fmax = int(cols[4])

            if feat_type == 'Single' or feat_type == 'First':
                # If there's an existing gene already, print it out
                if current_gene is not None:
                    write_gene(current_gene, current_mRNA, current_assembly,
                               current_gene_fmin, current_gene_fmax,
                               current_gene_strand, fout)

                # initialize this assembly if we haven't seen it yet
                if mol_id not in assemblies:
                    assemblies[mol_id] = things.Assembly(id=mol_id)

                current_assembly = assemblies[mol_id]

                feat_id = "cegma.gene.{0}".format(next_id_nums['gene'])
                next_id_nums['gene'] += 1
                current_gene = things.Gene(id=feat_id)
                current_gene_strand = cols[6]
                current_gene_fmin = feat_fmin
                current_gene_fmax = feat_fmax

                mRNA_id = "cegma.mRNA.{0}".format(next_id_nums['mRNA'])
                next_id_nums['mRNA'] += 1

                current_mRNA = things.mRNA(id=mRNA_id, parent=current_gene)
                current_gene.add_mRNA(current_mRNA)

            # CEGMA versions < 2.5 had two rows for each exon.  We don't need
            #  to process both of them, so we skip the Exon one because its
            #  phase information is incorrect.
            if feat_type in exon_column_types:
                CDS_id = "cegma.CDS.{0}".format(next_id_nums['CDS'])
                next_id_nums['CDS'] += 1
                CDS = things.CDS(id=CDS_id, parent=current_mRNA)
                # phase is forwarded as the raw column-8 text (not an int)
                CDS.locate_on(target=current_assembly,
                              fmin=feat_fmin,
                              fmax=feat_fmax,
                              strand=cols[6],
                              phase=cols[7])
                current_mRNA.add_CDS(CDS)

                exon_id = "cegma.exon.{0}".format(next_id_nums['exon'])
                next_id_nums['exon'] += 1
                exon = things.Exon(id=exon_id, parent=current_mRNA)
                exon.locate_on(target=current_assembly,
                               fmin=feat_fmin,
                               fmax=feat_fmax,
                               strand=cols[6])
                current_mRNA.add_exon(exon)

                # grow the gene span to cover this exon
                if feat_fmin < current_gene_fmin:
                    current_gene_fmin = feat_fmin

                if feat_fmax > current_gene_fmax:
                    current_gene_fmax = feat_fmax

        # don't forget the last gene
        if current_gene is not None:
            write_gene(current_gene, current_mRNA, current_assembly,
                       current_gene_fmin, current_gene_fmax,
                       current_gene_strand, fout)
def main():
    """Convert Augustus output (native GTF or --gff style) into GFF3.

    Command-line options:
        -i/--input   File created by Augustus (required).
        -o/--output  GFF3 file to create (required).

    Only 'gene', 'transcript' and 'CDS' rows are used.  When a row's 9th
    column does not already start with 'ID' or 'Parent' it is treated as GTF
    and rewritten into GFF3 attribute form first.  Each CDS row also yields
    an exon with a derived ID.  Comment lines are buffered; a gene (preceded
    by its buffered comments) is written only when a '# end gene ' comment is
    reached.

    Raises:
        Exception: on a GTF row whose 9th column has an unexpected format, on
            a duplicated transcript ID, or on a CDS whose parent transcript
            has not been seen in the current gene.
    """
    parser = argparse.ArgumentParser( description='Convert native (GTF) or GFF output from Augustus into GFF3 format')

    ## input and output paths
    parser.add_argument('-i', '--input', type=str, required=True, help='Path to a GFF file created by Augustus' )
    parser.add_argument('-o', '--output', type=str, required=True, help='Path to an output file to be created' )
    args = parser.parse_args()

    assemblies = dict()

    gene = None
    mRNAs = dict()
    current_gene_comment_lines = list()

    ## Used for tracking the exon count for each mRNA (for ID purposes)
    exon_count_by_mRNA = dict()

    ## GTF 9th-column layouts:
    #   g1  ->  ID=g1
    #   g1.t1  ->  ID=g1.t1;Parent=g1
    #   transcript_id "g1.t1"; gene_id "g1";  ->  ID=g1.t1.cds;Parent=g1.t1
    gtf_gene_re = re.compile(r'(g\d+)')
    gtf_transcript_re = re.compile(r'((g\d+)\.t\d+)')
    gtf_cds_re = re.compile(r'transcript_id "(g\d+\.t\d+)"; gene_id "g\d+";')

    with open(args.output, mode='wt', encoding='utf-8') as fout, open(args.input) as fin:
        fout.write("##gff-version 3\n")

        for line in fin:
            if line.startswith("#"):
                current_gene_comment_lines.append(line)

                if line.startswith("# end gene "):
                    ## purge the comments, then write the gene
                    fout.write( "".join(current_gene_comment_lines) )

                    # an end marker without a preceding gene row has nothing to print
                    if gene is not None:
                        gene.print_as(fh=fout, source='AUGUSTUS', format='gff3')

                    gene = None
                    mRNAs = dict()
                    current_gene_comment_lines = list()

                continue

            cols = line.split("\t")

            if len(cols) != 9:
                continue

            mol_id = cols[0]
            feat_type = cols[2]

            if feat_type not in ['gene', 'transcript', 'CDS']:
                continue

            # the input can be in GTF or GFF.  We need to reformat the 9th column for the GTF entries
            if not cols[8].startswith('ID') and not cols[8].startswith('Parent'):
                if feat_type == 'gene':
                    m_gene = gtf_gene_re.match(cols[8])
                    if m_gene:
                        cols[8] = "ID={0}".format(m_gene.group(1))
                    else:
                        raise Exception("ERROR: GTF detected but gene row has bad 9th column format: {0}".format(cols[8]))
                elif feat_type == 'transcript':
                    m_transcript = gtf_transcript_re.match(cols[8])
                    if m_transcript:
                        cols[8] = "ID={0};Parent={1}".format(m_transcript.group(1), m_transcript.group(2))
                    else:
                        raise Exception("ERROR: GTF detected but transcript row has bad 9th column format: {0}".format(cols[8]))
                elif feat_type == 'CDS':
                    m_CDS = gtf_cds_re.match(cols[8])
                    if m_CDS:
                        cols[8] = "ID={0}.cds;Parent={0}".format(m_CDS.group(1))
                    else:
                        raise Exception("ERROR: GTF detected but CDS row has bad 9th column format: {0}".format(cols[8]))

            feat_id = gff.column_9_value(cols[8], 'ID')
            fmin = int(cols[3]) - 1
            fmax = int(cols[4])
            strand = cols[6]

            ## initialize this assembly if we haven't seen it yet
            if mol_id not in assemblies:
                assemblies[mol_id] = things.Assembly(id=mol_id)

            current_assembly = assemblies[mol_id]

            if feat_type == "gene":
                gene = things.Gene(id=feat_id)
                gene.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )

            elif feat_type == "transcript":
                mRNA = things.mRNA(id=feat_id, parent=gene)
                mRNA.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                gene.add_mRNA(mRNA)
                mRNAs[mRNA.id] = mRNA

                if feat_id in exon_count_by_mRNA:
                    raise Exception( "ERROR: two different mRNAs found with same ID: {0}".format(feat_id) )
                else:
                    exon_count_by_mRNA[feat_id] = 0

            elif feat_type == "CDS":
                parent_id = gff.column_9_value(cols[8], 'Parent')

                ## sanity check that we've seen this parent
                if parent_id not in mRNAs:
                    raise Exception("ERROR: Found CDS column with parent ({0}) mRNA not yet in the file".format(parent_id))

                # Attach to the transcript named by Parent, which is not
                # necessarily the transcript row seen most recently.
                parent_mRNA = mRNAs[parent_id]

                CDS = things.CDS(id=feat_id, parent=parent_mRNA)
                CDS.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand, phase=int(cols[7]) )
                parent_mRNA.add_CDS(CDS)

                ## exons weren't explicitly defined in the input file, so we need to derive new IDs for them
                exon_count_by_mRNA[parent_id] += 1
                exon_id = "{0}.exon{1}".format(parent_id, exon_count_by_mRNA[parent_id])

                exon = things.Exon(id=exon_id, parent=parent_mRNA)
                exon.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                parent_mRNA.add_exon(exon)
示例#5
0
def main():
    """Convert StringTie GTF output into GFF3.

    Command-line options:
        -i/--input_file   GTF file to read (required).
        -o/--output_file  GFF3 file to create; STDOUT is used when omitted.

    A 'transcript' row adds an mRNA to the current gene, starting a new gene
    (and printing the previous one) whenever the row's gene_id differs from
    the current gene's ID.  Each 'exon' row adds both an exon and a CDS to
    the current mRNA, with the CDS phase carried forward from the lengths of
    the preceding exons of that transcript.

    Lines without exactly nine tab-separated columns are reported on STDERR
    and skipped, so that diagnostics never mix with GFF3 written to STDOUT.
    """
    parser = argparse.ArgumentParser( description='A GTF -> GFF3 conversion script for StringTie output')

    ## input and output paths
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input GTF file' )
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Path to an output GFF file to be created' )
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    try:
        ofh.write("##gff-version 3\n")

        assemblies = dict()
        current_assembly = None
        current_gene = None
        current_RNA = None
        current_CDS_phase = 0

        exon_count_by_RNA = defaultdict(int)

        with open(args.input_file, "r") as ifh:
            for line in ifh:
                cols = line.split("\t")

                if len(cols) != 9:
                    print("SKIPPING: {0}".format(line), file=sys.stderr)
                    continue

                mol_id = cols[0]

                if mol_id not in assemblies:
                    assemblies[mol_id] = things.Assembly(id=mol_id)

                current_assembly = assemblies[mol_id]
                ftype  = cols[2]
                fmin = int(cols[3]) - 1
                fmax = int(cols[4])
                strand = cols[6]

                # this makes it look like GFF column 9 so gff.column_9_value(str, key) can be used
                col9 = cols[8].replace(' "', '="')
                gene_id       = gff.column_9_value(col9, 'gene_id').replace('"', '')
                transcript_id = gff.column_9_value(col9, 'transcript_id').replace('"', '')

                if ftype == 'transcript':
                    starts_new_gene = current_gene is None or current_gene.id != gene_id

                    if starts_new_gene:
                        # flush the gene we were building before replacing it
                        if current_gene is not None:
                            current_gene.print_as(fh=ofh, source='StringTie', format='gff3')

                        current_gene = things.Gene(id=gene_id)
                        current_gene.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )

                    mRNA = things.mRNA(id=transcript_id, parent=current_gene)
                    mRNA.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                    current_gene.add_mRNA(mRNA)
                    current_RNA = mRNA
                    exon_count_by_RNA[transcript_id] = 0
                    current_CDS_phase = 0

                elif ftype == 'exon':
                    exon_count_by_RNA[transcript_id] += 1

                    cds_id = "{0}.CDS.{1}".format( current_RNA.id, exon_count_by_RNA[current_RNA.id] )
                    CDS = things.CDS(id=cds_id, parent=current_RNA)
                    CDS.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand, phase=current_CDS_phase )
                    current_RNA.add_CDS(CDS)

                    # calculate the starting phase for the next CDS feature (in case there is one)
                    current_CDS_phase = 3 - (((fmax - fmin) - current_CDS_phase) % 3)
                    if current_CDS_phase == 3:
                        current_CDS_phase = 0

                    exon_id = "{0}.exon.{1}".format( current_RNA.id, exon_count_by_RNA[current_RNA.id] )
                    exon = things.Exon(id=exon_id, parent=current_RNA)
                    exon.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                    current_RNA.add_exon(exon)

        # don't forget to do the last gene, if there were any
        if current_gene is not None:
            current_gene.print_as(fh=ofh, source='StringTie', format='gff3')
    finally:
        # Close the file we opened ourselves, but never STDOUT
        if ofh is not sys.stdout:
            ofh.close()
示例#6
0
def main():
    """Convert Metagenemark GFF output into GFF3, optionally extracting proteins.

    Command-line options:
        -i/--input           GFF file created by Metagenemark (required).
        -o/--output          GFF3 file to create (required).
        -p/--prefix          Prefix used when generating feature IDs (required).
        -pf/--protein_fasta  Optional protein FASTA file to create.

    Every 9-column feature row (type 'gene' or 'CDS') yields a gene, mRNA,
    CDS, exon and polypeptide sharing the row's coordinates, with IDs built
    from the prefix and the row's gene_id number.  The gene is printed
    immediately, followed by the comment lines buffered since the previous
    gene.  '##Protein N' comment blocks are copied into the protein FASTA
    (when one was requested) under the polypeptide ID recorded for gene N.

    NOTE(review): comment lines that follow the last feature row are buffered
    but never written - confirm whether that is intended.

    Raises:
        Exception: on an unexpected feature type, a 9th column without a
            numeric gene_id, or a malformed '##Protein ' line.
    """
    parser = argparse.ArgumentParser( description='Metagenemark GFF -> GFF3 conversion script')

    ## input, output and ID-generation options
    parser.add_argument('-i', '--input', type=str, required=True, help='Path to a GFF file created by Metagenemark' )
    parser.add_argument('-o', '--output', type=str, required=True, help='Path to an output file to be created' )
    parser.add_argument('-p', '--prefix', type=str, required=True, help='Prefix to use in ID generation')
    parser.add_argument('-pf', '--protein_fasta', type=str, required=False, help='Optional protein FASTA to be written')
    args = parser.parse_args()

    assemblies = dict()

    # key like 2 = SRS014890.polypeptide.2
    polypeptide_lookup = dict()
    writing_protein = False

    current_gene_comment_lines = list()

    protein_header_re = re.compile(r"##Protein (\d+)")
    mol_id_re = re.compile(r'^(\S+) ')
    gene_id_re = re.compile(r'gene_id[ =](\d+)')

    # Stays None when no protein FASTA was requested, so protein blocks are
    # simply not exported in that case.
    protein_out = None
    if args.protein_fasta is not None:
        protein_out = open(args.protein_fasta, mode='wt', encoding='utf-8')

    try:
        with open(args.output, mode='wt', encoding='utf-8') as fout, open(args.input) as fin:
            fout.write("##gff-version 3\n")

            for line in fin:
                if line.startswith("#"):
                    if line.startswith("##FASTA"):
                        # an extra '#' is prepended to the directive before it is kept
                        current_gene_comment_lines.append("#{0}".format(line))

                    elif line.startswith("##end-Protein"):
                        writing_protein = False
                        current_gene_comment_lines.append(line)

                    # since we're already doing our own header, don't duplicate the old one
                    elif line.startswith("##gff-version"):
                        continue
                    else:
                        if line.startswith("##Protein "):
                            m = protein_header_re.match(line)
                            if m:
                                writing_protein = True
                                if protein_out is not None:
                                    protein_out.write(">{0}\n".format(polypeptide_lookup[m.group(1)]))
                            else:
                                raise Exception("ERROR: Expected line to match: ##Protein N")
                        elif writing_protein:
                            # drop the first two characters of the line (the comment prefix)
                            if protein_out is not None:
                                protein_out.write(line[2:])

                        current_gene_comment_lines.append(line)

                    continue

                cols = line.split("\t")

                if len(cols) != 9:
                    continue

                # keep only the first whitespace-delimited token of the molecule name
                mol_id = cols[0]
                mol_id_m = mol_id_re.match(mol_id)

                if mol_id_m:
                    mol_id = mol_id_m.group(1)

                feat_type = cols[2]

                ## we expect only gene types here
                if feat_type not in ['gene', 'CDS']:
                    raise Exception("ERROR: expected only 'gene' or 'CDS' feature types as input (depending on metagenemark version).")

                m_gene = gene_id_re.match(cols[8])

                if m_gene:
                    gene_num = m_gene.group(1)
                else:
                    raise Exception("ERROR: expected 9th column to have gene ids like: gene_id 5")

                ## initialize this assembly if we haven't seen it yet
                if mol_id not in assemblies:
                    assemblies[mol_id] = things.Assembly(id=mol_id)

                current_assembly = assemblies[mol_id]

                fmin = int(cols[3]) - 1
                fmax = int(cols[4])
                strand = cols[6]

                gene = things.Gene(id="{0}.gene.{1}".format(args.prefix, gene_num))
                gene.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )

                mRNA = things.mRNA(id="{0}.mRNA.{1}".format(args.prefix, gene_num), parent=gene.id)
                mRNA.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                gene.add_mRNA(mRNA)

                CDS = things.CDS(id="{0}.CDS.{1}".format(args.prefix, gene_num), parent=mRNA.id)
                CDS.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand, phase=int(cols[7]) )
                mRNA.add_CDS(CDS)

                exon = things.Exon(id="{0}.exon.{1}".format(args.prefix, gene_num), parent=mRNA.id)
                exon.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                mRNA.add_exon(exon)

                polypeptide_id = "{0}.polypeptide.{1}".format(args.prefix, gene_num)
                polypeptide = things.Polypeptide(id=polypeptide_id, parent=mRNA.id)
                polypeptide.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                mRNA.add_polypeptide(polypeptide)
                polypeptide_lookup[gene_num] = polypeptide_id

                gene.print_as(fh=fout, source='GeneMark.hmm', format='gff3')
                fout.write( "".join(current_gene_comment_lines) )
                current_gene_comment_lines = list()
    finally:
        if protein_out is not None:
            protein_out.close()
示例#7
0
def parse_annotation_line(line, genes, molecules):
    """Parse one row of the tab-delimited annotation file into feature objects.

    The row must contain exactly 10 tab-separated columns.  Columns read
    here (0-based): 0 = transcript ID, 1 = CDS ID, 5 = gene product name
    (falling back to column 3 when column 5 is empty), 8 = comma-separated
    GO terms, 9 = comma-separated EC numbers.

    A things.mRNA is created for the transcript and attached to its gene,
    and a things.CDS carrying the functional annotation is attached to that
    mRNA.

    Args:
        line: One raw line from the annotation file.
        genes: dict of gene ID -> things.Gene.  A new Gene is stored here
            the first time its ID is encountered.
        molecules: Container of known molecule IDs; only used for a
            membership test on the transcript ID.

    Returns:
        False when the line is skipped because it doesn't have 10 columns.
        Otherwise None; the results are reachable through ``genes``.

    Raises:
        Exception: if the transcript ID is not present in ``molecules``.
    """
    cols = line.split("\t")

    if len(cols) != 10:
        print(
            "WARNING: Ignoring the following line because I expected 10 columns:\n{0}"
            .format(line))
        return False

    # the last column still carries the line terminator
    cols[9] = cols[9].rstrip()

    transcript_id = cols[0]
    CDS_id = cols[1]
    gene_id = get_gene_id_from_transcript(transcript_id)

    # str.split() never yields None, so a missing value shows up as an
    # empty string.  Prefer column 5 and fall back to column 3.
    gene_product_name = cols[5] if cols[5] else cols[3]

    if transcript_id not in molecules:
        raise Exception(
            "ERROR: found molecule {0} in referenced in annotation tab file but not in genomic_fasta file"
            .format(transcript_id))

    if gene_id in genes:
        gene = genes[gene_id]
    else:
        gene = things.Gene(id=gene_id)
        genes[gene_id] = gene

    mRNA = things.mRNA(id=transcript_id)
    gene.add_mRNA(mRNA)

    # NOTE: this local must not be called 'annotation'.  Binding that name
    # anywhere in the function makes it local for the whole body, which
    # hides the 'annotation' module and raises UnboundLocalError here.
    annot = annotation.FunctionalAnnotation(product_name=gene_product_name)

    # NOTE(review): the '.' is unescaped, so this accepts a digit run
    # followed by ANY character - confirm whether r'\d+\.' was intended.
    ec_num_pattern = re.compile(r'\d+.')

    # an empty column splits to [''], which simply fails the pattern check
    for ec_num in cols[9].split(','):
        if ec_num_pattern.search(ec_num):
            annot.add_ec_number(annotation.ECAnnotation(number=ec_num))

    go_pattern = re.compile(r'(\d+)')

    for go_term in cols[8].split(','):
        if go_pattern.search(go_term):
            annot.add_go_annotation(annotation.GOAnnotation(go_id=go_term))

    CDS = things.CDS(id=CDS_id, annotation=annot)
    mRNA.add_CDS(CDS)
示例#8
0
def _convert_cufflinks_gtf(ifh, ofh, export_mode):
    """Read Cufflinks GTF rows from ifh and write GFF3 features to ofh.

    Only 'transcript' and 'exon' rows are acted on.  Rows that don't have
    exactly nine tab-separated columns are reported on STDERR and skipped.
    Columns read (0-based): 0 = molecule ID, 2 = feature type, 3 = start
    (stored as fmin after subtracting one), 4 = end, 6 = strand,
    8 = attributes.

    Args:
        ifh: Iterable of GTF lines (e.g. an open text file).
        ofh: Writable handle passed as ``fh`` to each feature's print_as().
        export_mode: 'model' builds gene/mRNA/CDS/exon graphs; 'cDNA_match'
            builds match/match_part features.
    """
    assemblies = dict()
    current_gene = None
    current_RNA = None
    current_match = None
    current_CDS_phase = 0

    exon_count_by_RNA = defaultdict(int)

    for line in ifh:
        cols = line.split("\t")

        if len(cols) != 9:
            # diagnostics go to STDERR so they can't end up inside the
            # GFF3 stream when the output is STDOUT
            print("SKIPPING: {0}".format(line), file=sys.stderr)
            continue

        mol_id = cols[0]

        if mol_id not in assemblies:
            assemblies[mol_id] = things.Assembly(id=mol_id)

        current_assembly = assemblies[mol_id]
        ftype = cols[2]
        fmin = int(cols[3]) - 1
        fmax = int(cols[4])
        strand = cols[6]

        # rewrite 'key "value"' as 'key="value"' so that the GFF3 column 9
        # helper can be used on GTF attributes
        col9 = cols[8].replace(' "', '="')
        gene_id = gff.column_9_value(col9, 'gene_id').replace('"', '')
        transcript_id = gff.column_9_value(col9,
                                           'transcript_id').replace('"', '')

        if ftype == 'transcript':
            if export_mode == 'model':
                if current_gene is None or current_gene.id != gene_id:
                    # a new gene is starting: flush the finished one first
                    if current_gene is not None:
                        current_gene.print_as(fh=ofh,
                                              source='Cufflinks',
                                              format='gff3')

                    current_gene = things.Gene(id=gene_id)
                    current_gene.locate_on(target=current_assembly,
                                           fmin=fmin,
                                           fmax=fmax,
                                           strand=strand)

                current_RNA = things.mRNA(id=transcript_id,
                                          parent=current_gene)
                current_RNA.locate_on(target=current_assembly,
                                      fmin=fmin,
                                      fmax=fmax,
                                      strand=strand)
                current_gene.add_mRNA(current_RNA)
                exon_count_by_RNA[transcript_id] = 0
                current_CDS_phase = 0

            elif export_mode == 'cDNA_match':
                # flush the previous match before starting the next one
                if current_match is not None and current_match.id != transcript_id:
                    current_match.print_as(fh=ofh,
                                           source='Cufflinks',
                                           format='gff3')

                current_match = things.Match(id=transcript_id,
                                             subclass='cDNA_match',
                                             length=fmax - fmin)
                current_match.locate_on(target=current_assembly,
                                        fmin=fmin,
                                        fmax=fmax,
                                        strand=strand)

        elif ftype == 'exon':
            exon_number = gff.column_9_value(col9,
                                             'exon_number').replace('"', '')

            if export_mode == 'model':
                exon_count_by_RNA[transcript_id] += 1

                # every exon row yields a CDS and an exon with identical
                # coordinates
                cds_id = "{0}.CDS.{1}".format(
                    current_RNA.id, exon_count_by_RNA[current_RNA.id])
                CDS = things.CDS(id=cds_id, parent=current_RNA)
                CDS.locate_on(target=current_assembly,
                              fmin=fmin,
                              fmax=fmax,
                              strand=strand,
                              phase=current_CDS_phase)
                current_RNA.add_CDS(CDS)

                # calculate the starting phase for the next CDS feature (in case there is one)
                current_CDS_phase = 3 - ((
                    (fmax - fmin) - current_CDS_phase) % 3)
                if current_CDS_phase == 3:
                    current_CDS_phase = 0

                exon_id = "{0}.exon.{1}".format(
                    current_RNA.id, exon_count_by_RNA[current_RNA.id])
                exon = things.Exon(id=exon_id, parent=current_RNA)
                exon.locate_on(target=current_assembly,
                               fmin=fmin,
                               fmax=fmax,
                               strand=strand)
                current_RNA.add_exon(exon)

            elif export_mode == 'cDNA_match':
                mp_id = "{0}.match_part.{1}".format(transcript_id, exon_number)
                mp = things.MatchPart(id=mp_id,
                                      parent=current_match,
                                      length=fmax - fmin)
                mp.locate_on(target=current_assembly,
                             fmin=fmin,
                             fmax=fmax,
                             strand=strand)
                current_match.add_part(mp)

    # don't forget to do the last gene/match, if there were any
    if export_mode == 'model':
        if current_gene is not None:
            current_gene.print_as(fh=ofh, source='Cufflinks', format='gff3')

    elif export_mode == 'cDNA_match':
        if current_match is not None:
            current_match.print_as(fh=ofh, source='Cufflinks', format='gff3')


def main():
    """Command-line entry point: convert a Cufflinks GTF file to GFF3.

    Options: -i/--input_file (required), -o/--output_file (defaults to
    STDOUT) and -e/--export_mode ('model', the default, or 'cDNA_match').

    Raises:
        Exception: if --export_mode is neither 'model' nor 'cDNA_match'.
    """
    parser = argparse.ArgumentParser(
        description='A GTF -> GFF3 conversion script for Cufflinks output')

    ## output file to be written
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to an input GTF file')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=False,
                        help='Path to an output GFF file to be created')
    parser.add_argument('-e',
                        '--export_mode',
                        type=str,
                        required=False,
                        default='model',
                        help='Export mode for results (model or cDNA_match)')
    args = parser.parse_args()

    if args.export_mode not in ['model', 'cDNA_match']:
        raise Exception(
            "ERROR: the only valid values for --export_mode are 'model' or 'cDNA_match'"
        )

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    try:
        ofh.write("##gff-version 3\n")

        with open(args.input_file, "r") as ifh:
            _convert_cufflinks_gtf(ifh, ofh, args.export_mode)
    finally:
        # only close what was opened here, never the interpreter's STDOUT
        if ofh is not sys.stdout:
            ofh.close()