Пример #1
0
def gff_to_bed(gff_file, bed_fh=sys.stdout, cds=True):
    """Write tab-separated BED-style rows for features of a GFF file.

    Every feature of every parsed record is visited.  Features of type
    "chromosome" or "protein" are ignored.  A feature is treated as coding
    when its type is "gene" and at least one of its direct sub-features is
    an "mRNA" or a "CDS".  A row is written only when that coding status
    equals ``cds``.

    Each row holds: record id, feature start, feature end, feature id and
    feature type.

    Args:
        gff_file: GFF input handed to ``GFFParser.parse``.
        bed_fh: writable file-like object that receives the rows
            (standard output by default).
        cds: select coding features when true, the remaining ones when false.
    """
    ignored_types = ("chromosome", "protein")
    coding_child_types = ("mRNA", "CDS")

    for record in GFFParser().parse(gff_file, None):
        for feature in record.features:
            children = feature.sub_features
            if feature.type in ignored_types:
                continue
            has_coding_child = any(
                child.type in coding_child_types for child in children)
            coding = has_coding_child and feature.type == "gene"
            if cds == coding:
                columns = (record.id, feature.location.start,
                           feature.location.end, feature.id, feature.type)
                row = "\t".join(str(column) for column in columns)
                bed_fh.write(row + "\n")
Пример #2
0
 def t_flat_features(self):
     """Flat PCR_product features end up on the expected records.

     The test GFF file is parsed against the sequence dictionary with the
     query limited to three PCR_product source/type pairs; the number of
     features attached to records 'I' and 'X' is then checked.
     """
     pcr_pairs = [(source, 'PCR_product')
                  for source in ('Orfeome', 'GenePair_STS', 'Promoterome')]
     sequences = self._get_seq_dict()
     records = GFFParser().parse(self._test_gff_file, sequences,
                                 limit_info={'gff_source_type': pcr_pairs})
     by_id = SeqIO.to_dict(records)
     for rec_id, expected_count in (('I', 4), ('X', 5)):
         assert len(by_id[rec_id].features) == expected_count
Пример #3
0
 def t_gff3_iterator_limit(self):
     """Limit query on a GFF3 file parsed without a sequence dictionary.

     Only Coding_transcript gene/mRNA/CDS lines for id 'I' are requested,
     so a single record is expected and every child of its first
     transcript must be a CDS.
     """
     coding_pairs = [('Coding_transcript', kind)
                     for kind in ('gene', 'mRNA', 'CDS')]
     limits = {'gff_source_type': coding_pairs, 'gff_id': ['I']}
     records = GFFParser().parse(self._test_gff_file, limit_info=limits)
     by_id = SeqIO.to_dict(records)
     assert len(by_id) == 1
     first_gene = by_id["I"].features[0]
     transcript = first_gene.sub_features[0]
     for child in transcript.sub_features:
         assert child.type == "CDS", child
Пример #4
0
 def t_basic_solid_parse(self):
     """Check the first feature parsed for one read of a SOLiD GFF file.
     """
     records = SeqIO.to_dict(GFFParser().parse(self._test_gff_file))
     read = records['3_341_424_F3'].features[0]

     # Coordinates, strand and feature type.
     span = read.location
     assert span.nofuzzy_start == 102716
     assert span.nofuzzy_end == 102736
     assert read.strand == -1
     assert read.type == 'read'

     # Qualifier count and selected qualifier values.
     quals = read.qualifiers
     assert len(quals) == 7
     expected_quals = (('score', ['10.6']),
                       ('source', ['solid']),
                       ('g', ['T2203031313223113212']))
     for key, expected in expected_quals:
         assert quals[key] == expected
     assert len(quals['q']) == 20
def main(seq_file, gff_file):
    """Parse a FASTA/GFF pair and print the first feature of each record.

    Args:
        seq_file: path of the FASTA file supplying the sequences.
        gff_file: GFF input passed to ``GFFParser.parse`` together with the
            dictionary of FASTA records.
    """
    print("Parsing FASTA sequence file...")
    with open(seq_file) as seq_handle:
        seq_dict = SeqIO.to_dict(SeqIO.parse(seq_handle, "fasta"))

    print("Parsing GFF data file...")
    for rec in GFFParser().parse(gff_file, seq_dict):
        # Indexing [0] on an empty feature list would raise IndexError and
        # abort the whole run, so records without features are skipped.
        if not rec.features:
            continue
        print(rec.features[0])
Пример #6
0
 def t_line_adjust(self):
     """Rewrite parsed lines through a ``line_adjust_fn`` callback.

     The callback moves the original record id into a 'read_name'
     qualifier and uses the first value of the 'i' qualifier as the new
     record id, so all features are expected on one record with id '1'.
     """
     def swap_id_and_index(line_info):
         original_id = line_info['rec_id']
         line_info['rec_id'] = line_info['quals']['i'][0]
         line_info['quals']['read_name'] = [original_id]
         return line_info

     parser = GFFParser(line_adjust_fn=swap_id_and_index)
     records = list(parser.parse(self._test_gff_file))
     assert len(records) == 1
     merged = records[0]
     assert merged.id == '1'
     assert len(merged.features) == 112
     first_read_name = merged.features[0].qualifiers['read_name']
     assert first_read_name == ['3_336_815_F3']
Пример #7
0
def main(seq_file, gff_file):
    """Load selected GFF features, with their sequences, into BioSQL.

    The FASTA file is read into a dictionary of records, the GFF file is
    parsed against it with a source/type limit, and the resulting records
    are loaded into a freshly created BioSQL sub-database.  An existing
    sub-database with the same name is removed first.

    Args:
        seq_file: path of the FASTA file supplying the sequences.
        gff_file: GFF input passed to ``GFFParser.parse``.
    """
    # -- To be customized
    # Connection settings: update these to point to your local database.
    # XXX demo example could be swapped to use SQLite when that is integrated
    user = "******"
    passwd = "cdev"
    host = "localhost"
    db_name = "wb199_gff"
    biodb_name = "wb199_gff_cds_pcr"
    # Source/type pairs to pull out of the GFF file.  Use limit_info=None
    # to parse everything, but only for small files: memory use may become
    # a problem otherwise.
    rnai_types = [(source, 'PCR_product')
                  for source in ('Orfeome', 'GenePair_STS', 'Promoterome')]
    gene_types = [('Non_coding_transcript', 'gene')]
    gene_types += [('Coding_transcript', kind)
                   for kind in ('gene', 'mRNA', 'CDS')]
    limit_info = {'gff_source_type': rnai_types + gene_types}
    # --
    print("Parsing FASTA sequence file...")
    with open(seq_file) as seq_handle:
        seq_dict = SeqIO.to_dict(SeqIO.parse(seq_handle, "fasta"))

    print("Parsing GFF data file...")
    recs = GFFParser().parse(gff_file, seq_dict, limit_info=limit_info)

    print("Writing to BioSQL database...")
    server = BioSeqDatabase.open_database(
        driver="MySQLdb", user=user, passwd=passwd, host=host, db=db_name)
    try:
        # Start from an empty sub-database: drop a previous one if present.
        if biodb_name in server.keys():
            server.remove_database(biodb_name)
            server.adaptor.commit()
        server.new_database(biodb_name)
        server[biodb_name].load(recs)
        server.adaptor.commit()
    except:
        # Deliberately bare: roll back on any failure, interrupts included,
        # then let the original exception propagate.
        server.adaptor.rollback()
        raise
Пример #8
0
 def t_nested_features(self):
     """Gene, mRNA and CDS features nest three levels deep on record 'I'.
     """
     coding_pairs = [('Coding_transcript', kind)
                     for kind in ('gene', 'mRNA', 'CDS')]
     limits = {'gff_source_type': coding_pairs, 'gff_id': ['I']}
     sequences = self._get_seq_dict()
     records = GFFParser().parse(self._test_gff_file, sequences,
                                 limit_info=limits)
     genes = SeqIO.to_dict(records)['I'].features
     # two gene features in total; the first one is the plain case
     assert len(genes) == 2
     transcripts = genes[0].sub_features
     # a single transcript ...
     assert len(transcripts) == 1
     # ... carrying 15 CDS regions
     assert len(transcripts[0].sub_features) == 15
Пример #9
0
def main(gff_file,th_fasta):
    """Write FASTA entries for condensed features that pass the type filter.

    Records are parsed from ``gff_file``; the features of each record are
    passed through ``condens_transcript`` and every resulting feature that
    is not filtered out is written to the file 'this_is_a_test' as a
    '>id' header line followed by the slice of the matching sequence.

    Args:
        gff_file: GFF input passed to ``GFFParser.parse``.
        th_fasta: FASTA input passed to ``Fasta``; sequences are looked up
            by the lower-cased record id.
    """
    parser = GFFParser()
    seqids = parser.parse(gff_file, None)
    fasta = Fasta(th_fasta, flatten_inplace=True)
    # The context manager guarantees the output is flushed and closed, also
    # when parsing or a sequence lookup fails part-way through.
    with open('this_is_a_test', 'w') as out_fasta:
        for seqid in seqids:
            for feat in condens_transcript(seqid.features):
                # A feature is skipped when it has at least one sub-feature
                # and its OWN type is CDS, gene or protein.
                # NOTE(review): the original loop iterated the sub-features
                # but compared feat.type, not the sub-feature's type; that
                # behaviour is kept here -- confirm whether the sub-feature
                # type was meant instead.
                excluded_type = (str(feat.type) == 'CDS'
                                 or feat.type == 'gene'
                                 or feat.type == 'protein')
                has_children = any(True for _ in feat.sub_features)
                if has_children and excluded_type:
                    continue
                start = int(feat.location.start)
                end = int(feat.location.end)
                sequence = fasta[seqid.id.lower()][start:end]
                out_fasta.write('>%s' % feat.id + '\n')
                out_fasta.write(str(sequence) + '\n')
Пример #10
0
 def t_nested_multiparent_features(self):
     """CDS features shared by several transcripts are nested under each.
     """
     coding_pairs = [('Coding_transcript', kind)
                     for kind in ('gene', 'mRNA', 'CDS')]
     limits = {'gff_source_type': coding_pairs, 'gff_id': ['I']}
     sequences = self._get_seq_dict()
     records = GFFParser().parse(self._test_gff_file, sequences,
                                 limit_info=limits)
     genes = SeqIO.to_dict(records)['I'].features
     # two gene features; the second one is the multi-parent case
     assert len(genes) == 2
     transcripts = genes[1].sub_features
     assert len(transcripts) == 3  # three transcripts
     # the first two transcripts hold the same CDS features ...
     first_cds = transcripts[0].sub_features
     second_cds = transcripts[1].sub_features
     assert len(first_cds) == 6
     assert len(second_cds) == 6
     # ... as the very same object, not as copies
     assert first_cds[0] is second_cds[0]
Пример #11
0
def main(gff_file, outdir, th_fasta):
    """Write sequences of features that have a non-coding RNA sub-feature.

    Records are parsed from ``gff_file``; the features of each record are
    passed through ``conden_transcripts``.  A resulting feature is written
    to ``<outdir>/at_no_cds.fasta`` when at least one of its direct
    sub-features has type tRNA, rRNA, miRNA, snoRNA, ncRNA or snRNA.  Each
    entry is a '>id' header line followed by the slice of the matching
    sequence.

    Args:
        gff_file: GFF input passed to ``GFFParser.parse``.
        outdir: directory in which 'at_no_cds.fasta' is created.
        th_fasta: FASTA input passed to ``Fasta``; sequences are looked up
            by the lower-cased record id.
    """
    # Built once instead of once per sub-feature.
    noncoding_types = frozenset(
        ['tRNA', 'rRNA', 'miRNA', 'snoRNA', 'ncRNA', 'snRNA'])

    parser = GFFParser()
    seqids = parser.parse(gff_file, None)

    fasta = Fasta(th_fasta, flatten_inplace=True)
    # The context manager guarantees the output is flushed and closed, also
    # when parsing or a sequence lookup fails part-way through.
    with open(outdir + "/at_no_cds.fasta", "w") as out_fasta:
        for seqid in seqids:
            for feat in conden_transcripts(seqid.features):
                is_noncoding = any(subf.type in noncoding_types
                                   for subf in feat.sub_features)
                if not is_noncoding:
                    continue
                start = int(feat.location.start)
                end = int(feat.location.end)
                sequence = fasta[seqid.id.lower()][start:end]
                out_fasta.write(">%s" % feat.id + "\n")
                out_fasta.write(str(sequence) + "\n")