Пример #1
0
def add_aragorn_features(assemblies, features, aragorn_file):
    '''
    Parses an ARAGORN output file and registers a gene plus an RNA feature for
    each prediction found in it.

    Lines starting with '>' select the current assembly by the first token after
    the '>'.  If that ID is not yet a key of `assemblies`, a things.Assembly with
    empty residues is created and stored there.

    Any other line is split on whitespace and only used when it has exactly five
    columns; all other lines are ignored.  For a five-column line:

        cols[1]  product name, which must start with 'tRNA', 'tmRNA' or 'mtRNA'
        cols[2]  coordinates as '[start,end]', prefixed with 'c' for strand -1
        cols[4]  anticodon wrapped in one character on each side, e.g. '(cat)'

    Args:
        assemblies:   dict of assembly ID -> things.Assembly, updated in place
        features:     dict of feature ID -> feature object, updated in place with
                      both the gene and the RNA created for each prediction
        aragorn_file: path to the ARAGORN output file to read

    Raises:
        Exception: if a '>' line carries no ID, if cols[1] has an unexpected
                   prefix, or if cols[2] is not in the expected coordinate format
    '''
    header_pattern = re.compile(r'>(\S+)')
    coord_pattern = re.compile(r'(c*)\[(\d+),(\d+)\]')
    current_assembly_id = None

    # the context manager guarantees the handle is closed, even when a
    #  malformed line makes us raise part-way through the file
    with open(aragorn_file) as aragorn_fh:
        for line in aragorn_fh:
            line = line.rstrip()

            if line.startswith('>'):
                m = header_pattern.match(line)
                if m is None:
                    raise Exception("ERROR: assembly header line without an ID: {0}".format(line))

                current_assembly_id = m.group(1)

                if current_assembly_id not in assemblies:
                    assembly = things.Assembly(id=current_assembly_id, residues='')
                    assemblies[current_assembly_id] = assembly

                continue

            cols = line.split()

            # only five-column lines describe a prediction
            if len(cols) != 5:
                continue

            if cols[1].startswith('tRNA'):
                feat_type = 'tRNA'
            elif cols[1].startswith('tmRNA'):
                feat_type = 'tmRNA'
            elif cols[1].startswith('mtRNA'):
                feat_type = 'mtRNA'
            else:
                raise Exception("Unexpected type in ARAGORN, column value: {0}".format(cols[1]))

            # IDs are made unique with a random UUID shared by the gene and its RNA
            feat_base = "{0}_{1}".format(feat_type, uuid.uuid4())
            gene_id = "{0}_gene".format(feat_base)
            RNA_id = "{0}_{1}".format(feat_base, feat_type)

            m = coord_pattern.match(cols[2])
            if m is None:
                raise Exception("ERROR: unexpected coordinate format: {0}".format(cols[2]))

            # the start is decremented by one (presumably converting a 1-based
            #  start to 0-based interbase coordinates -- confirm against things)
            rfmin = int(m.group(2)) - 1
            rfmax = int(m.group(3))

            # a leading 'c' marks the prediction as being on the reverse strand
            rstrand = -1 if m.group(1) else 1

            current_assembly = assemblies[current_assembly_id]
            gene = things.Gene(id=gene_id)
            gene.locate_on(target=current_assembly, fmin=rfmin, fmax=rfmax, strand=rstrand)
            features[gene_id] = gene
            current_assembly.add_gene(gene)

            if feat_type == 'tRNA':
                # strip the wrapping characters around the anticodon: '(cat)' -> 'CAT'
                RNA = things.tRNA(id=RNA_id, parent=gene, anticodon=cols[4][1:4].upper())
                gene.add_tRNA(RNA)
            else:
                # both tmRNA and mtRNA predictions are stored as tmRNA objects
                RNA = things.tmRNA(id=RNA_id, parent=gene)
                gene.add_tmRNA(RNA)

            RNA.locate_on(target=current_assembly, fmin=rfmin, fmax=rfmax, strand=rstrand)
            RNA.annotation = annotation.FunctionalAnnotation(product_name=cols[1])
            features[RNA_id] = RNA
Пример #2
0
def process_gb_gene(gene):
    '''
    Builds a FunctionalAnnotation from a GenBank gene feature.

    The first value of the feature's 'gene' qualifier is stored as the
    'ensembl_version' attribute.  Everything before its trailing '.<digits>'
    suffix is stored as 'ensembl_id'.  The gene symbol, product name and 'loc'
    attribute are filled in by get_gene_sym(), get_product() and
    get_coordinates() respectively.

    Args:
        gene: feature object exposing a `qualifiers` mapping with a 'gene' entry

    Returns:
        the populated annotation.FunctionalAnnotation

    Raises:
        Exception: if the 'gene' qualifier value has no '.version' suffix
    '''
    annot = annotation.FunctionalAnnotation()
    ensembl_version = gene.qualifiers['gene'][0]
    annot.other_attributes['ensembl_version'] = ensembl_version

    # raw string: '\.' and '\d' are not valid escapes in a normal string literal
    m = re.match(r'(.+)\.\d+', ensembl_version)
    if m is None:
        raise Exception("Found an ensembl id ({0}) without a .version".format(
            ensembl_version))

    annot.other_attributes['ensembl_id'] = m.group(1)

    annot.gene_symbol = get_gene_sym(gene)
    annot.product_name = get_product(gene)
    annot.other_attributes['loc'] = get_coordinates(gene)

    return annot
def initialize_polypeptides( log_fh, fasta_file ):
    '''
    Loads the sequences in a FASTA file (presumably polypeptides) and returns a
    dict mapping each sequence ID to a things.Polypeptide.  Every polypeptide
    gets an annotation.FunctionalAnnotation whose product name is
    DEFAULT_PRODUCT_NAME, and one INFO line per sequence is written to log_fh.
    '''
    polypeptides = {}
    fasta_entries = utils.fasta_dict_from_file(fasta_file)

    for seq_id in fasta_entries:
        # the residue string lives under the 's' key of each FASTA entry
        residues = fasta_entries[seq_id]['s']
        new_polypeptide = things.Polypeptide(id=seq_id, length=len(residues), residues=residues)

        initial_annotation = annotation.FunctionalAnnotation(product_name=DEFAULT_PRODUCT_NAME)
        log_fh.write("INFO: {0}: Set initial product name to '{1}'\n".format(seq_id, DEFAULT_PRODUCT_NAME))
        new_polypeptide.annotation = initial_annotation

        polypeptides[seq_id] = new_polypeptide

    return polypeptides
def _require_gene(current_gene, feat):
    '''Raises a descriptive error if `feat` appears before any gene feature.'''
    if current_gene is None:
        raise Exception(
            "ERROR: {0} feature encountered before any gene feature: {1}".format(
                feat.type, feat))


def _register_rna_id(exon_count_by_RNA, feat_id):
    '''Starts the exon counter for a new RNA ID, rejecting IDs already in use.'''
    if feat_id in exon_count_by_RNA:
        raise Exception(
            "ERROR: two different RNAs found with same ID: {0}".format(feat_id))

    exon_count_by_RNA[feat_id] = 0


def _convert_gbk_to_gff3(input_file, ofh, include_fasta):
    '''
    Reads the GenBank file at `input_file` and writes GFF3 to the open handle
    `ofh`.  When `include_fasta` is True and at least one record carried
    sequence, a ##FASTA section with every assembly is appended.

    Each gene is printed when the next gene feature is reached (or at the end
    of the input), so all of its child features are attached before printing.
    Warnings about skipped features go to STDERR so that they can never end up
    inside GFF3 written to STDOUT.
    '''
    ofh.write("##gff-version 3\n")

    assemblies = dict()
    current_assembly = None
    current_gene = None
    current_RNA = None

    rna_count_by_gene = defaultdict(int)
    exon_count_by_RNA = defaultdict(int)

    seqs_pending_writes = False

    features_skipped_count = 0

    with open(input_file, "r") as ifh:
        # each gb_record is a SeqRecord object
        for gb_record in SeqIO.parse(ifh, "genbank"):
            mol_id = gb_record.name

            if mol_id not in assemblies:
                assemblies[mol_id] = things.Assembly(id=mol_id)

            residues = str(gb_record.seq)
            if len(residues) > 0:
                seqs_pending_writes = True
                assemblies[mol_id].residues = residues
                assemblies[mol_id].length = len(residues)

            current_assembly = assemblies[mol_id]

            # each feat is a SeqFeature object
            for feat in gb_record.features:
                fmin = int(feat.location.start)
                fmax = int(feat.location.end)

                # NOTE: the strand check runs before 'source' features are
                #  skipped, so an unstranded source feature raises as well
                if feat.location.strand == 1:
                    strand = '+'
                elif feat.location.strand == -1:
                    strand = '-'
                else:
                    raise Exception(
                        "ERROR: unstranded feature encountered: {0}".format(feat))

                if feat.type == 'source':
                    continue

                if feat.type == 'gene':
                    # print the previous gene (if there is one)
                    if current_gene is not None:
                        current_gene.print_as(fh=ofh, source='GenBank', format='gff3')

                    locus_tag = feat.qualifiers['locus_tag'][0]
                    current_gene = things.Gene(id=locus_tag, locus_tag=locus_tag)
                    current_gene.locate_on(target=current_assembly,
                                           fmin=fmin,
                                           fmax=fmax,
                                           strand=strand)
                    current_RNA = None

                elif feat.type == 'mRNA':
                    _require_gene(current_gene, feat)
                    locus_tag = feat.qualifiers['locus_tag'][0]
                    rna_count_by_gene[locus_tag] += 1
                    feat_id = "{0}.mRNA.{1}".format(locus_tag,
                                                    rna_count_by_gene[locus_tag])

                    mRNA = things.mRNA(id=feat_id,
                                       parent=current_gene,
                                       locus_tag=locus_tag)
                    mRNA.locate_on(target=current_assembly,
                                   fmin=fmin,
                                   fmax=fmax,
                                   strand=strand)
                    current_gene.add_mRNA(mRNA)
                    current_RNA = mRNA

                    _register_rna_id(exon_count_by_RNA, feat_id)

                elif feat.type == 'tRNA':
                    _require_gene(current_gene, feat)
                    locus_tag = feat.qualifiers['locus_tag'][0]
                    rna_count_by_gene[locus_tag] += 1
                    feat_id = "{0}.tRNA.{1}".format(locus_tag,
                                                    rna_count_by_gene[locus_tag])

                    # the product qualifier, when present, is what gets stored
                    #  in the anticodon attribute
                    if 'product' in feat.qualifiers:
                        anticodon = feat.qualifiers['product'][0]
                    else:
                        anticodon = None

                    tRNA = things.tRNA(id=feat_id,
                                       parent=current_gene,
                                       anticodon=anticodon)
                    tRNA.locate_on(target=current_assembly,
                                   fmin=fmin,
                                   fmax=fmax,
                                   strand=strand)
                    current_gene.add_tRNA(tRNA)
                    current_RNA = tRNA

                    _register_rna_id(exon_count_by_RNA, feat_id)

                elif feat.type == 'rRNA':
                    _require_gene(current_gene, feat)
                    locus_tag = feat.qualifiers['locus_tag'][0]
                    rna_count_by_gene[locus_tag] += 1
                    feat_id = "{0}.rRNA.{1}".format(locus_tag,
                                                    rna_count_by_gene[locus_tag])

                    if 'product' in feat.qualifiers:
                        product = feat.qualifiers['product'][0]
                    else:
                        product = None

                    annot = annotation.FunctionalAnnotation(product_name=product)

                    rRNA = things.rRNA(id=feat_id,
                                       parent=current_gene,
                                       annotation=annot)
                    rRNA.locate_on(target=current_assembly,
                                   fmin=fmin,
                                   fmax=fmax,
                                   strand=strand)
                    current_gene.add_rRNA(rRNA)
                    current_RNA = rRNA

                    _register_rna_id(exon_count_by_RNA, feat_id)

                elif feat.type == 'CDS':
                    locus_tag = feat.qualifiers['locus_tag'][0]
                    # If processing a prokaryotic GBK, we'll encounter CDS before mRNA, so we have to
                    #  manually make one
                    if current_RNA is None:
                        _require_gene(current_gene, feat)
                        feat_id = "{0}.mRNA.{1}".format(
                            locus_tag, rna_count_by_gene[locus_tag])
                        mRNA = things.mRNA(id=feat_id, parent=current_gene)
                        mRNA.locate_on(target=current_assembly,
                                       fmin=fmin,
                                       fmax=fmax,
                                       strand=strand)
                        current_gene.add_mRNA(mRNA)
                        current_RNA = mRNA

                        if 'product' in feat.qualifiers:
                            product = feat.qualifiers['product'][0]
                        else:
                            product = None

                        if 'gene' in feat.qualifiers:
                            gene_symbol = feat.qualifiers['gene'][0]
                        else:
                            gene_symbol = None

                        annot = annotation.FunctionalAnnotation(
                            product_name=product, gene_symbol=gene_symbol)

                        if 'db_xref' in feat.qualifiers:
                            for dbxref in feat.qualifiers['db_xref']:
                                annot.add_dbxref(dbxref)

                        polypeptide_id = "{0}.polypeptide.{1}".format(
                            locus_tag, rna_count_by_gene[locus_tag])
                        polypeptide = things.Polypeptide(id=polypeptide_id,
                                                         parent=mRNA,
                                                         annotation=annot)
                        mRNA.add_polypeptide(polypeptide)

                    exon_count_by_RNA[current_RNA.id] += 1
                    cds_id = "{0}.CDS.{1}".format(
                        current_RNA.id, exon_count_by_RNA[current_RNA.id])
                    current_CDS_phase = 0

                    # one CDS and one exon are created per location part; all
                    #  CDS parts of this feature share the same cds_id
                    for loc in feat.location.parts:
                        subfmin = int(loc.start)
                        subfmax = int(loc.end)

                        CDS = things.CDS(id=cds_id, parent=current_RNA)
                        CDS.locate_on(target=current_assembly,
                                      fmin=subfmin,
                                      fmax=subfmax,
                                      strand=strand,
                                      phase=current_CDS_phase)
                        current_RNA.add_CDS(CDS)

                        # calculate the starting phase for the next CDS feature (in case there is one)
                        # 0 + 6 = 0     TTGCAT
                        # 0 + 7 = 2     TTGCATG
                        # 1 + 6 = 1     TTGCAT
                        # 2 + 7 = 1     TTGCATG
                        # general: 3 - ((length - previous phase) % 3)
                        current_CDS_phase = 3 - ((
                            (subfmax - subfmin) - current_CDS_phase) % 3)
                        if current_CDS_phase == 3:
                            current_CDS_phase = 0

                        exon_id = "{0}.exon.{1}".format(
                            current_RNA.id, exon_count_by_RNA[current_RNA.id])
                        exon = things.Exon(id=exon_id, parent=current_RNA)
                        exon.locate_on(target=current_assembly,
                                       fmin=subfmin,
                                       fmax=subfmax,
                                       strand=strand)
                        current_RNA.add_exon(exon)
                        exon_count_by_RNA[current_RNA.id] += 1

                else:
                    print(
                        "WARNING: The following feature was skipped:\n{0}".format(
                            feat),
                        file=sys.stderr)
                    features_skipped_count += 1

    # don't forget to do the last gene, if there were any
    if current_gene is not None:
        current_gene.print_as(fh=ofh, source='GenBank', format='gff3')

    if include_fasta is True and seqs_pending_writes is True:
        ofh.write("##FASTA\n")
        for assembly_id in assemblies:
            ofh.write(">{0}\n".format(assembly_id))
            ofh.write("{0}\n".format(
                utils.wrapped_fasta(assemblies[assembly_id].residues)))

    if features_skipped_count > 0:
        print("Warning: {0} unsupported feature types were skipped".format(
            features_skipped_count),
            file=sys.stderr)


def main():
    '''
    Command-line entry point: converts a GenBank flat file to GFF3.

    Options:
        -i / --input_file    path to the input GBK file (required)
        -o / --output_file   path of the GFF file to create; STDOUT if omitted
        --with_fasta         append the ##FASTA section (default)
        --no_fasta           omit the ##FASTA section
    '''
    parser = argparse.ArgumentParser(
        description='Convert GenBank flat files to GFF3 format')

    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to an input GBK file')
    ## output file to be written
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=False,
                        help='Path to an output GFF file to be created')
    parser.add_argument(
        '--with_fasta',
        dest='fasta',
        action='store_true',
        help=
        'Include the FASTA section with genomic sequence at end of file.  (default)'
    )
    parser.add_argument('--no_fasta', dest='fasta', action='store_false')
    parser.set_defaults(fasta=True)
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    writing_to_file = args.output_file is not None
    ofh = open(args.output_file, 'wt') if writing_to_file else sys.stdout

    try:
        _convert_gbk_to_gff3(args.input_file, ofh, args.fasta)
    finally:
        # only close handles we opened ourselves; never close STDOUT
        if writing_to_file:
            ofh.close()
Пример #5
0
def add_aragorn_features(assemblies, features, aragorn_file):
    '''
    Parses an ARAGORN output file and registers a gene plus an RNA feature for
    each prediction found in it, including predictions that span the origin of
    a circular molecule.

    Lines starting with '>' select the current assembly by the first token after
    the '>'.  If that ID is not yet a key of `assemblies`, a things.Assembly with
    empty residues is created and stored there.

    Any other line is split on whitespace and only used when it has exactly five
    columns; all other lines are ignored.  For a five-column line:

        cols[1]  product name, which must start with 'tRNA', 'tmRNA' or 'mtRNA'
        cols[2]  coordinates as '[start,end]', prefixed with 'c' for strand -1
        cols[4]  anticodon wrapped in one character on each side, e.g. '(cat)'

    Args:
        assemblies:   dict of assembly ID -> things.Assembly, updated in place;
                      an assembly is flagged is_circular when a prediction on it
                      has an end coordinate smaller than its start
        features:     dict of feature ID -> feature object, updated in place with
                      both the gene and the RNA created for each prediction
        aragorn_file: path to the ARAGORN output file to read

    Raises:
        Exception: if a '>' line carries no ID, if cols[1] has an unexpected
                   prefix, or if cols[2] is not in the expected coordinate format
    '''
    header_pattern = re.compile(r'>(\S+)')
    coord_pattern = re.compile(r'(c*)\[(\d+),(\d+)\]')
    current_assembly_id = None

    # the context manager guarantees the handle is closed, even when a
    #  malformed line makes us raise part-way through the file
    with open(aragorn_file) as aragorn_fh:
        for line in aragorn_fh:
            line = line.rstrip()

            if line.startswith('>'):
                m = header_pattern.match(line)
                if m is None:
                    raise Exception("ERROR: assembly header line without an ID: {0}".format(line))

                current_assembly_id = m.group(1)

                if current_assembly_id not in assemblies:
                    assembly = things.Assembly(id=current_assembly_id, residues='')
                    assemblies[current_assembly_id] = assembly

                continue

            cols = line.split()

            # only five-column lines describe a prediction
            if len(cols) != 5:
                continue

            if cols[1].startswith('tRNA'):
                feat_type = 'tRNA'
            elif cols[1].startswith('tmRNA'):
                feat_type = 'tmRNA'
            elif cols[1].startswith('mtRNA'):
                feat_type = 'mtRNA'
            else:
                raise Exception("Unexpected type in ARAGORN, column value: {0}".format(cols[1]))

            # IDs are made unique with a random UUID shared by the gene and its RNA
            feat_base = "{0}_{1}".format(feat_type, uuid.uuid4())
            gene_id = "{0}_gene".format(feat_base)
            RNA_id = "{0}_{1}".format(feat_base, feat_type)

            m = coord_pattern.match(cols[2])
            if m is None:
                raise Exception("ERROR: unexpected coordinate format: {0}".format(cols[2]))

            # the start is decremented by one (presumably converting a 1-based
            #  start to 0-based interbase coordinates -- confirm against things)
            rfmin = int(m.group(2)) - 1
            rfmax = int(m.group(3))

            # For predictions spanning the origin of circular molecules, fmax needs to be adjusted
            #  https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md
            # Sub-heading: Circular Genomes
            #
            # Example input lines:
            #  62  tRNA-Met                 c[2764659,18]      29      (cat)
            #  1   tRNA-Ile                  [2697442,47]      35      (gat)
            #
            # NOTE(review): the GFF3 convention adds the landmark length to the
            #  end coordinate; here the 1-based start (rfmin + 1) is added
            #  instead, presumably because the molecule length is not known at
            #  this point -- confirm
            if rfmax < rfmin:
                assemblies[current_assembly_id].is_circular = True
                rfmax = rfmin + rfmax + 1

            # a leading 'c' marks the prediction as being on the reverse strand
            rstrand = -1 if m.group(1) else 1

            current_assembly = assemblies[current_assembly_id]
            gene = things.Gene(id=gene_id)
            gene.locate_on(target=current_assembly, fmin=rfmin, fmax=rfmax, strand=rstrand)
            features[gene_id] = gene
            current_assembly.add_gene(gene)

            if feat_type == 'tRNA':
                # strip the wrapping characters around the anticodon: '(cat)' -> 'CAT'
                RNA = things.tRNA(id=RNA_id, parent=gene, anticodon=cols[4][1:4].upper())
                gene.add_tRNA(RNA)
            else:
                # both tmRNA and mtRNA predictions are stored as tmRNA objects
                RNA = things.tmRNA(id=RNA_id, parent=gene)
                gene.add_tmRNA(RNA)

            RNA.locate_on(target=current_assembly, fmin=rfmin, fmax=rfmax, strand=rstrand)
            RNA.annotation = annotation.FunctionalAnnotation(product_name=cols[1])
            features[RNA_id] = RNA