Python GFFParser.parse示例，BCBio.GFF.GFFParser.parse Python示例

示例#1

0

显示文件

文件： test_GFFSeqIOFeatureAdder.py 项目： jamescasbon/bcbb

    def t_gff3_to_gff3(self):
        """Read in and write out GFF3 without any loss of information.

        The test GFF file is parsed, written to an in-memory handle with
        GFF3Writer and parsed again from that output. One record from the
        first pass is then compared, feature by feature (using the string
        form of each feature), with the record of the same id from the
        second pass.
        """
        parser = GFFParser()
        recs = SeqIO.to_dict(parser.parse(self._test_gff_file))
        out_handle = StringIO.StringIO()
        writer = GFF3Writer()
        writer.write(recs.values(), out_handle)
        wrote_handle = StringIO.StringIO(out_handle.getvalue())
        recs_two = SeqIO.to_dict(parser.parse(wrote_handle))

        orig_rec = recs.values()[0]
        # Fetch the matching record from the *re-parsed* output. Taking it
        # from ``recs`` again would compare the original with itself and the
        # test could never fail. Looking it up by id avoids relying on the
        # two dictionaries listing their values in the same order.
        re_rec = recs_two[orig_rec.id]
        assert len(orig_rec.features) == len(re_rec.features)
        for orig_f, re_f in zip(orig_rec.features, re_rec.features):
            assert str(orig_f) == str(re_f)

示例#2

0

显示文件

文件： test_GFFSeqIOFeatureAdder.py 项目： jamescasbon/bcbb

 def t_unknown_seq(self):
     """Check that records parsed without sequence data get the right length.

     Only the GFF file is handed to the parser; the sequences attached to
     records "I" and "X" must still report the expected lengths.
     """
     records = SeqIO.to_dict(GFFParser().parse(self._test_gff_file))
     expected_lengths = (("I", 12766937), ("X", 17718531))
     for rec_id, length in expected_lengths:
         assert len(records[rec_id].seq) == length

示例#3

0

显示文件

def gff_to_bed(gff_file,
               bed_fh=sys.stdout,
               cds=True,
               species=None,
               rename=False):
    """Write one tab-separated BED-style line per selected top-level feature.

    Each written line holds: sequence name, start + 1, end and gene name.

    Args:
        gff_file: GFF input passed straight to ``GFFParser.parse``.
        bed_fh: Writable file-like object receiving the lines
            (defaults to standard output).
        cds: When true, only ``gene`` features owning at least one ``mRNA``
            or ``CDS`` sub-feature are written; when false, only the
            remaining features are written. ``chromosome`` and ``protein``
            features are always skipped.
        species: Optional prefix. When given, the sequence name becomes
            ``species`` followed by the last two characters of the record id.
        rename: When true, genes are named ``<sequence name>g<NNNNN>`` using
            a counter that restarts at 1 on every new record id; otherwise
            the feature id is used.
    """
    parser = GFFParser()
    seqids = parser.parse(gff_file, None)

    cur_chr = None
    cur_gene_order = 0
    for seqid in seqids:
        for feat in seqid.features:
            if feat.type in ("chromosome", "protein"):
                continue
            is_cds = feat.type == "gene" and any(
                f.type in ("mRNA", "CDS") for f in feat.sub_features)
            if cds != is_cds:
                continue

            cur_gene_order += 1
            if species is not None:
                seqid_final = species + seqid.id[-2:]  # hard coded
            else:
                seqid_final = seqid.id

            if rename:
                if seqid.id != cur_chr:
                    # First selected gene on a new record: restart numbering.
                    cur_gene_order = 1
                    cur_chr = seqid.id
                gene_name = seqid_final + 'g' + str(cur_gene_order).zfill(5)
            else:
                gene_name = feat.id

            # +1 is hard coded to current BCBio.GFF
            fields = (seqid_final, int(str(feat.location.start)) + 1,
                      feat.location.end, gene_name)
            # A plain write() emits the same bytes as ``print >>bed_fh`` and
            # runs on both Python 2 and Python 3.
            bed_fh.write("\t".join(str(x) for x in fields) + "\n")

示例#4

0

显示文件

文件： gff_to_bed.py 项目： Jingping/BiteTools

def gff_to_bed(gff_file, bed_fh=sys.stdout, cds=True, species=None, rename=False):
    """Convert the top-level features of a GFF file into BED-style lines.

    A line is made of four tab-separated columns: sequence name,
    start + 1, end and gene name.

    Args:
        gff_file: GFF input handed to ``GFFParser.parse``.
        bed_fh: Writable file-like object for the output lines
            (standard output by default).
        cds: Selects which features are written. True keeps only ``gene``
            features that have an ``mRNA`` or ``CDS`` sub-feature; False
            keeps only the others. ``chromosome`` and ``protein`` features
            are never written.
        species: Optional name prefix; when set, the sequence name is
            ``species`` plus the last two characters of the record id.
        rename: True to generate names of the form
            ``<sequence name>g<NNNNN>`` from a per-record counter, False to
            keep the feature id.
    """
    parser = GFFParser()
    seqids = parser.parse(gff_file, None)

    cur_chr = None
    cur_gene_order = 0
    for seqid in seqids:
        for feat in seqid.features:
            if feat.type in ("chromosome", "protein"):
                continue
            has_coding_child = any(f.type in ("mRNA", "CDS") for f in feat.sub_features)
            is_cds = has_coding_child and feat.type == "gene"
            if cds != is_cds:
                continue

            cur_gene_order += 1
            # Identity test is the idiomatic way to check for "no species".
            seqid_final = seqid.id if species is None else species + seqid.id[-2:]  # hard coded

            if rename:
                if seqid.id != cur_chr:
                    # The counter restarts with every new record id.
                    cur_gene_order = 1
                    cur_chr = seqid.id
                gene_name = "%sg%s" % (seqid_final, str(cur_gene_order).zfill(5))
            else:
                gene_name = feat.id

            # +1 is hard coded to current BCBio.GFF
            start = int(str(feat.location.start)) + 1
            columns = (seqid_final, start, feat.location.end, gene_name)
            # write() replaces the Python 2 only ``print >> bed_fh`` form
            # while producing identical output.
            bed_fh.write("\t".join(str(x) for x in columns) + "\n")

示例#5

0

显示文件

文件： test_GFFSeqIOFeatureAdder.py 项目： jamescasbon/bcbb

    def not_t_full_celegans(self):
        """Test the full C elegans chromosome and GFF files.

        This is used to test GFF on large files and is not run as a standard
        test. You will need to download the files and adjust the paths
        to run this.

        The FASTA records are loaded into a dictionary first and handed to
        the parser together with a source/type limit; the parsed records are
        iterated only to exercise the parser, nothing is asserted on them.
        """
        seq_file = os.path.join(self._full_dir, "c_elegans.WS199.dna.fa")
        gff_file = os.path.join(self._full_dir, "c_elegans.WS199.gff3")
        # Read the sequence information. The context manager closes the
        # handle even when FASTA parsing raises.
        with open(seq_file) as seq_handle:
            seq_dict = SeqIO.to_dict(SeqIO.parse(seq_handle, "fasta"))
        # To list the limits the GFF file offers, something like this was
        # used during development:
        #with open(gff_file) as gff_handle:
        #    possible_limits = feature_adder.available_limits(gff_handle)
        #    pprint.pprint(possible_limits)
        rnai_types = [('Orfeome', 'PCR_product'),
                      ('GenePair_STS', 'PCR_product'),
                      ('Promoterome', 'PCR_product')]
        gene_types = [('Non_coding_transcript', 'gene'),
                      ('Coding_transcript', 'gene'),
                      ('Coding_transcript', 'mRNA'),
                      ('Coding_transcript', 'CDS')]
        limit_info = dict(gff_source_type=rnai_types + gene_types)
        parser = GFFParser()
        # Drain the iterator so the whole file is actually parsed.
        for rec in parser.parse(gff_file, seq_dict, limit_info=limit_info):
            pass

示例#6

0

显示文件

文件： test_GFFSeqIOFeatureAdder.py 项目： jamescasbon/bcbb

 def t_ensembl_nested_features(self):
     """Check feature nesting for GFF2 files that use transcript_id.

     Record "I" of the Ensembl file must expose two top-level features,
     the first one carrying 32 sub-features.
     """
     records = SeqIO.to_dict(GFFParser().parse(self._ensembl_file))
     top_level = records["I"].features
     assert len(top_level) == 2
     first_transcript = top_level[0]
     assert len(first_transcript.sub_features) == 32

示例#7

0

显示文件

 def t_gff3_noval_attrib(self):
     """Parse an NCBI GFF3 file holding an attribute key without a value.

     The single parsed record's first feature must report the ``pseudo``
     qualifier as ``["true"]``.
     """
     records = SeqIO.to_dict(GFFParser().parse(self._test_ncbi))
     assert len(records) == 1
     only_record = list(records.values())[0]
     first_feature = only_record.features[0]
     assert first_feature.qualifiers["pseudo"] == ["true"]

示例#8

0

显示文件

文件： test_GFFSeqIOFeatureAdder.py 项目： jamescasbon/bcbb

 def t_fasta_directive(self):
     """Read FASTA sequence data embedded in a GFF3 file.

     Exactly one record is expected, and the sequence of 'chr17' must match
     the known bases.
     """
     records = SeqIO.to_dict(GFFParser().parse(self._gff_file))
     assert len(records) == 1
     chr17 = records['chr17']
     assert str(chr17.seq) == "GATTACAGATTACA"

示例#9

0

显示文件

def parse(gff_content, source=None):
    """Parse GFF text and return the model built from its first record.

    Every record found in ``gff_content`` is converted with
    ``_create_record_model(record, source)``; only the first converted
    record is returned. ``None`` is returned when the text yields no records.
    """
    gff_parser = GFFParser()
    parsed = gff_parser.parse(io.StringIO(gff_content))

    # Convert all records (not just the first) exactly as before.
    models = [_create_record_model(rec, source) for rec in parsed]
    if not models:
        return None
    return models[0]

示例#10

0

显示文件

 def t_gff_annotations(self):
     """Verify GFF annotations that apply to a whole sequence.

     Record 'I' must carry exactly two annotation keys with the expected
     values.
     """
     records = SeqIO.to_dict(GFFParser().parse(self._test_gff_ann_file))
     annotations = records['I'].annotations
     assert len(annotations.keys()) == 2
     expected = (('source', ['Expr_profile']),
                 ('expr_profile', ['B0019.1']))
     for key, value in expected:
         assert annotations[key] == value

示例#11

0

显示文件

def gff_to_bed(gff_file, bed_fh=sys.stdout, cds=True):
    """Dump every top-level feature of a GFF file as a tab-separated line.

    Columns are: record id, start, end, feature id and feature type.
    ``bed_fh`` is the writable target (standard output by default).
    The ``cds`` argument is accepted but not used by this variant.
    """
    for record in GFFParser().parse(gff_file, None):
        for feature in record.features:
            columns = (record.id, feature.location.start,
                       feature.location.end, feature.id, feature.type)
            line = "\t".join(str(value) for value in columns)
            bed_fh.write(line + "\n")

示例#12

0

显示文件

文件： gff_parsing_tools.py 项目： dwheelerau/modules

def get_feature_cord(gff_file, user_feature="gene"):
    """Return feature coordinates from a GFF file, keyed by gene id.

    Only ``gene`` top-level features are inspected. The shape of each
    dictionary value depends on ``user_feature``:

    user_feature="gene"
        One ``(start, stop, strand, record_id)`` tuple per gene, e.g.
        a_dict['FBgn0031208'] = [(7528, 9484, 1, '2L')]
        A gene id seen twice triggers an AssertionError.

    user_feature="mRNA"
        One ``(start, stop, strand, mRNA_id, record_id)`` tuple per mRNA
        sub-feature, e.g.
        a_dict['FBgn0031208'] = [(7528, 9484, 1, 'FBtr0300689', '2L'),
                                 (7528, 9484, 1, 'FBtr0300690', '2L'),
                                 (7528, 9484, 1, 'FBtr0330654', '2L')]

    any other value (intended: "CDS")
        One ``([(start, stop), ...], strand, mRNA_id, record_id)`` tuple per
        mRNA sub-feature, listing the CDS segments found below it, e.g.
        a_dict['FBgn0031208'] = [
            ([(7679, 8116), (8192, 8610)], 1, 'FBtr0300689', '2L'),
            ([(7679, 8116), (8192, 8589), (8667, 9276)], 1, 'FBtr0300690', '2L'),
            ([(7679, 8116), (8228, 8610)], 1, 'FBtr0330654', '2L')]

    Parsing is limited to the GFF types 'protein', 'gene', 'mRNA', 'CDS'
    and 'exon'.
    """
    limit_info = dict(gff_type=['protein', 'gene', 'mRNA', 'CDS', 'exon'])
    feature_dict = {}
    parser = GFFParser()
    # The context manager closes the file even if parsing or the duplicate
    # gene assertion below raises.
    with open(gff_file) as in_handle:
        for rec in parser.parse(in_handle, limit_info=limit_info):
            rec_id = rec.id
            for feat in rec.features:
                if feat.type != "gene":
                    continue
                gene_id = feat.id
                if user_feature == "gene":
                    assert gene_id not in feature_dict
                    feature_dict[gene_id] = [(feat.location.start.position,
                                              feat.location.end.position,
                                              feat.strand, rec_id)]
                    continue
                for sub in feat.sub_features:
                    if sub.type != "mRNA":
                        continue
                    if user_feature == "mRNA":
                        info = (sub.location.start.position,
                                sub.location.end.position,
                                sub.strand, sub.id, rec_id)
                    else:
                        # Gather the CDS segments belonging to this mRNA.
                        codons = [(sub_sub.location.start.position,
                                   sub_sub.location.end.position)
                                  for sub_sub in sub.sub_features
                                  if sub_sub.type == "CDS"]
                        info = (codons, sub.strand, sub.id, rec_id)
                    feature_dict.setdefault(gene_id, []).append(info)
    return feature_dict

示例#13

0

显示文件

 def t_gff3_multiple_ids(self):
     """Handle GFF3 with repeated ID attributes, using the NCBI example.
     """
     records = SeqIO.to_dict(GFFParser().parse(self._test_ncbi))
     assert len(records) == 1
     only_record = list(records.values())[0]
     # Skipping the first feature leaves 4 feature sets that share an ID
     # but differ in position and attributes.
     repeated = only_record.features[1:]
     assert len(repeated) == 4
     for feature in repeated:
         assert len(feature.sub_features) == 3

示例#14

0

显示文件

 def t_no_dict_error(self):
     """A parser created with create_missing=False must raise KeyError
     when no sequence dictionary is supplied.
     """
     strict_parser = GFFParser(create_missing=False)
     try:
         for _ in strict_parser.parse(self._test_gff_file):
             pass
     except KeyError:
         # Expected outcome: the parser refused to invent records.
         return
     # Parsing finished without complaint -- that is the failure case.
     raise AssertionError('Did not complain with missing dictionary')

示例#15

0

显示文件

文件： test_GFFSeqIOFeatureAdder.py 项目： jamescasbon/bcbb

 def t_wb_cds_nested_features(self):
     """Nest GFF2 features that use a flat CDS key/value pair.
     """
     records = SeqIO.to_dict(GFFParser().parse(self._wb_alt_file))
     assert len(records) == 2
     second_record = list(records.values())[1]
     top_level = second_record.features
     assert len(top_level) == 1
     parent = top_level[0]
     assert parent.id == "cr01.sctg102.wum.2.1"
     assert len(parent.sub_features) == 7

示例#16

0

显示文件

文件： gff_parsing_tools.py 项目： dwheelerau/modules

def extract_seq(gff_file, outfile):
    """Write the records of a GFF file (with attached sequence) as FASTA.

    Args:
        gff_file: Path of the GFF file to read.
        outfile: Path of the FASTA file to create; an existing file is
            overwritten.
    """
    parser = GFFParser()
    # Both handles are closed on exit, including when parsing or writing
    # raises part-way through.
    with open(gff_file) as in_handle, open(outfile, "w") as fasta_file:
        for rec in parser.parse(in_handle):
            SeqIO.write(rec, fasta_file, "fasta")

示例#17

0

显示文件

文件： test_GFFSeqIOFeatureAdder.py 项目： jamescasbon/bcbb

 def t_local_map_reduce(self):
     """Exercise the general map reduce framework without parallelization.

     Parsing is limited to gene, mRNA and CDS lines of record 'I', which
     must end up with 32 top-level features.
     """
     limits = {"gff_type": ["gene", "mRNA", "CDS"], "gff_id": ['I']}
     parsed = GFFParser().parse(self._test_gff_file, limit_info=limits)
     records = SeqIO.to_dict(parsed)
     chrom = records['I']
     assert len(chrom.features) == 32

示例#18

0

显示文件

文件： test_GFFSeqIOFeatureAdder.py 项目： jamescasbon/bcbb

 def t_jgi_gff(self):
     """Parse JGI formatted GFF2, nested using transcriptId and proteinID.
     """
     records = SeqIO.to_dict(GFFParser().parse(self._jgi_file))
     parent = records['chr_1'].features[0]
     span = parent.location
     assert span.nofuzzy_start == 37060
     assert span.nofuzzy_end == 38216
     assert parent.type == 'inferred_parent'
     assert len(parent.sub_features) == 6
     # Spot-check the qualifiers of the second nested feature.
     child_quals = parent.sub_features[1].qualifiers
     assert child_quals['proteinId'] == ['873']
     assert child_quals['phase'] == ['0']

示例#19

0

显示文件

文件： test_GFFSeqIOFeatureAdder.py 项目： jamescasbon/bcbb

    def t_basic_directives(self):
        """Read the top level meta-data directives of a GFF3 file.

        The directives are expected as annotations on record 'chr17'.
        """
        records = SeqIO.to_dict(GFFParser().parse(self._gff_file))
        directives = records['chr17'].annotations
        expected = (
            ('gff-version', ['3']),
            ('attribute-ontology', ['baz']),
            ('feature-ontology', ['bar']),
            ('source-ontology', ['boo']),
            ('sequence-region', [('foo', '1', '100'),
                                 ('chr17', '62467934', '62469545')]),
        )
        for key, value in expected:
            assert directives[key] == value

示例#20

0

显示文件

def gff_to_bed(gff_file, bed_fh=sys.stdout, cds=True):
    """Write selected top-level GFF features as tab-separated lines.

    Columns are: record id, start, end, feature id and feature type.
    ``chromosome`` and ``protein`` features are skipped. With ``cds`` true
    only ``gene`` features having an ``mRNA`` or ``CDS`` sub-feature are
    written; with ``cds`` false only the remaining features are written.
    """
    for record in GFFParser().parse(gff_file, None):
        for feature in record.features:
            children = feature.sub_features
            if feature.type in ("chromosome", "protein"):
                continue
            has_coding_child = any(
                child.type == "mRNA" or child.type == "CDS"
                for child in children)
            is_cds = has_coding_child and feature.type == "gene"
            if cds != is_cds:
                continue
            columns = (record.id, feature.location.start,
                       feature.location.end, feature.id, feature.type)
            bed_fh.write("\t".join(str(value) for value in columns) + "\n")

示例#21

0

显示文件

文件： test_GFFSeqIOFeatureAdder.py 项目： jamescasbon/bcbb

 def t_tricky_semicolons(self):
     """Cope with awkwardly placed semi-colons in WormBase GFF2.

     Only ('Genomic_canonical', 'region') lines are parsed; the single
     resulting feature on record 'I' must keep its Note values intact.
     """
     region_only = {"gff_source_type": [('Genomic_canonical', 'region')]}
     parsed = GFFParser().parse(self._wormbase_file, limit_info=region_only)
     chrom = SeqIO.to_dict(parsed)['I']
     assert len(chrom.features) == 1
     notes = chrom.features[0].qualifiers['Note']
     expected_note = 'Clone cTel33B; Genbank AC199162'
     assert notes == [expected_note, expected_note]

示例#22

0

显示文件

 def t_flat_features(self):
     """Add flat, non-nested features to several records at once.

     Parsing is restricted to the three PCR_product sources; records 'I'
     and 'X' must receive 4 and 5 features respectively.
     """
     seq_dict = self._get_seq_dict()
     pcr_sources = ('Orfeome', 'GenePair_STS', 'Promoterome')
     pcr_limit_info = {
         "gff_source_type": [(source, 'PCR_product') for source in pcr_sources],
     }
     parsed = GFFParser().parse(self._test_gff_file, seq_dict,
                                limit_info=pcr_limit_info)
     records = SeqIO.to_dict(parsed)
     for rec_id, feature_count in (('I', 4), ('X', 5)):
         assert len(records[rec_id].features) == feature_count

示例#23

0

显示文件

文件： test_GFFSeqIOFeatureAdder.py 项目： GunioRobot/bcbb

 def t_gff3_iterator_limit(self):
     """Use the iterated interface with a limit query on a GFF3 file.

     With parsing limited to Coding_transcript gene/mRNA/CDS lines of
     record 'I', every sub-feature below the first transcript of the first
     gene must be a CDS.
     """
     coding_types = [('Coding_transcript', kind)
                     for kind in ('gene', 'mRNA', 'CDS')]
     limits = {"gff_source_type": coding_types, "gff_id": ['I']}
     parsed = GFFParser().parse(self._test_gff_file, limit_info=limits)
     records = SeqIO.to_dict(parsed)
     assert len(records) == 1
     first_gene = records["I"].features[0]
     first_transcript = first_gene.sub_features[0]
     for child in first_transcript.sub_features:
         assert child.type == "CDS", child

示例#24

0

显示文件

 def t_basic_solid_parse(self):
     """Parse a SOLiD GFF results file and inspect one read feature.
     """
     records = SeqIO.to_dict(GFFParser().parse(self._test_gff_file))
     read = records['3_341_424_F3'].features[0]
     span = read.location
     assert span.nofuzzy_start == 102716
     assert span.nofuzzy_end == 102736
     quals = read.qualifiers
     assert len(quals) == 7
     assert quals['score'] == ['10.6']
     assert quals['source'] == ['solid']
     assert read.strand == -1
     assert read.type == 'read'
     assert quals['g'] == ['T2203031313223113212']
     assert len(quals['q']) == 20

示例#25

0

显示文件

文件： test_GFFSeqIOFeatureAdder.py 项目： jamescasbon/bcbb

 def t_wormbase_nested_features(self):
     """Check nesting of GFF2 features that use Transcript only.

     Record "I" must hold exactly one Transcript parent, no
     inferred_parent features, and 46 sub-features under the Transcript.
     """
     records = SeqIO.to_dict(GFFParser().parse(self._wormbase_file))
     assert len(records) == 3

     def features_of_type(wanted):
         # Top-level features of record "I" having the requested type.
         return [feat for feat in records["I"].features
                 if feat.type == wanted]

     transcripts = features_of_type("Transcript")
     assert len(transcripts) == 1
     inferred = features_of_type("inferred_parent")
     assert len(inferred) == 0
     transcript = transcripts[0]
     assert transcript.qualifiers["WormPep"][0] == "WP:CE40797"
     assert len(transcript.sub_features) == 46

示例#26

0

显示文件

文件： gff_to_biosql.py 项目： rumanubhardwaj/BioSQL-Extensions

def main(seq_file, gff_file):
    """Parse a FASTA file plus a GFF file and print each record's first feature.

    The FASTA records are loaded into a dictionary and passed to the GFF
    parser along with the GFF file.
    """
    # -- To be customized
    # You need to update these parameters to point to your local database
    # XXX demo example could be swapped to use SQLite when that is integrated
    # NOTE: neither name is used further down in this function.
    db_name = "orphan.db"
    biodb_name = 'metagenomic_database'

    print("Parsing FASTA sequence file...")
    with open(seq_file) as seq_handle:
        fasta_records = SeqIO.parse(seq_handle, "fasta")
        seq_dict = SeqIO.to_dict(fasta_records)

    print("Parsing GFF data file...")
    for record in GFFParser().parse(gff_file, seq_dict):
        print(record.features[0])

示例#27

0

显示文件

文件： test_GFFSeqIOFeatureAdder.py 项目： JCVI-Cloud/VICVB

 def t_gff3_iterator_limit(self):
     """Run a limit query through the iterated interface on GFF3 input.

     Only Coding_transcript gene, mRNA and CDS lines of record 'I' are
     requested; the first transcript of the first gene must then contain
     CDS sub-features only.
     """
     limit_query = {
         "gff_source_type": [('Coding_transcript', 'gene'),
                             ('Coding_transcript', 'mRNA'),
                             ('Coding_transcript', 'CDS')],
         "gff_id": ['I'],
     }
     record_iter = GFFParser().parse(self._test_gff_file,
                                     limit_info=limit_query)
     by_id = SeqIO.to_dict(record_iter)
     assert len(by_id) == 1
     transcript = by_id["I"].features[0].sub_features[0]
     for part in transcript.sub_features:
         assert part.type == "CDS", part

示例#28

0

显示文件

def main(seq_file, gff_file):
    """Load the features of a GFF file into a BioSQL sub-database.

    Sequences are read from ``seq_file`` (FASTA), features from
    ``gff_file`` restricted to the source/type pairs listed below. An
    existing sub-database of the same name is removed and recreated before
    the records are loaded. Any failure rolls the transaction back and is
    re-raised.
    """
    # -- To be customized
    # You need to update these parameters to point to your local database
    # XXX demo example could be swapped to use SQLite when that is integrated
    # NOTE: connection credentials are hard-coded in this demo.
    user = "******"
    passwd = "cdev"
    host = "localhost"
    db_name = "wb199_gff"
    biodb_name = "wb199_gff_cds_pcr"
    # These need to be updated to reflect what you would like to parse
    # out of the GFF file. Set limit_info=None to parse everything, but
    # be sure the file is small or you may deal with memory issues.
    source_types = [
        # RNAi reagents
        ('Orfeome', 'PCR_product'),
        ('GenePair_STS', 'PCR_product'),
        ('Promoterome', 'PCR_product'),
        # genes and their parts
        ('Non_coding_transcript', 'gene'),
        ('Coding_transcript', 'gene'),
        ('Coding_transcript', 'mRNA'),
        ('Coding_transcript', 'CDS'),
    ]
    limit_info = {"gff_source_type": source_types}
    # --
    print("Parsing FASTA sequence file...")
    with open(seq_file) as seq_handle:
        seq_dict = SeqIO.to_dict(SeqIO.parse(seq_handle, "fasta"))

    print("Parsing GFF data file...")
    recs = GFFParser().parse(gff_file, seq_dict, limit_info=limit_info)

    print("Writing to BioSQL database...")
    server = BioSeqDatabase.open_database(
        driver="MySQLdb", user=user, passwd=passwd, host=host, db=db_name)
    try:
        if biodb_name in server.keys():
            # Start from a clean slate: drop the old sub-database first.
            server.remove_database(biodb_name)
            server.adaptor.commit()
        server.new_database(biodb_name)
        server[biodb_name].load(recs)
        server.adaptor.commit()
    except:
        # Deliberately catches everything: undo partial work, then re-raise.
        server.adaptor.rollback()
        raise

示例#29

0

显示文件

 def t_line_adjust(self):
     """Rewrite parsed lines on the fly to fix potential GFF problems.

     The adjustment function moves the original record id into a
     'read_name' qualifier and uses the first 'i' qualifier value as the
     new record id, so all features end up on a single record '1'.
     """
     def swap_id_with_index(line_info):
         quals = line_info['quals']
         new_id = quals['i'][0]
         quals['read_name'] = [line_info['rec_id']]
         line_info['rec_id'] = new_id
         return line_info

     adjusting_parser = GFFParser(line_adjust_fn=swap_id_with_index)
     records = list(adjusting_parser.parse(self._test_gff_file))
     assert len(records) == 1
     merged = records[0]
     assert merged.id == '1'
     assert len(merged.features) == 112
     first_quals = merged.features[0].qualifiers
     assert first_quals['read_name'] == ['3_336_815_F3']

示例#30

0

显示文件

文件： gff_to_biosql.py 项目： wbtxd2004/Blast_database

def main(seq_file, gff_file):
    """Parse FASTA and GFF input and store the records in BioSQL.

    Only the GFF source/type pairs configured below are parsed. The target
    sub-database is created, replacing any existing one with the same name,
    and the parsed records are loaded into it. On any error the pending
    transaction is rolled back and the error propagates.
    """
    # -- To be customized
    # You need to update these parameters to point to your local database
    # XXX demo example could be swapped to use SQLite when that is integrated
    # NOTE: connection credentials are hard-coded in this demo.
    user = "******"
    passwd = "wubin"
    host = "localhost"
    db_name = "volvoxdb"
    biodb_name = "volvoxdb_cds_pcr"
    # These need to be updated to reflect what you would like to parse
    # out of the GFF file. Set limit_info=None to parse everything, but
    # be sure the file is small or you may deal with memory issues.
    rnai_types = [(source, 'PCR_product')
                  for source in ('Orfeome', 'GenePair_STS', 'Promoterome')]
    gene_types = [('Non_coding_transcript', 'gene')]
    gene_types.extend(('Coding_transcript', kind)
                      for kind in ('gene', 'mRNA', 'CDS'))
    limit_info = {"gff_source_type": rnai_types + gene_types}
    # --
    print("Parsing FASTA sequence file...")
    with open(seq_file) as seq_handle:
        fasta_iter = SeqIO.parse(seq_handle, "fasta")
        seq_dict = SeqIO.to_dict(fasta_iter)

    print("Parsing GFF data file...")
    gff_parser = GFFParser()
    recs = gff_parser.parse(gff_file, seq_dict, limit_info=limit_info)

    print("Writing to BioSQL database...")
    connection = dict(driver="MySQLdb", user=user, passwd=passwd,
                      host=host, db=db_name)
    server = BioSeqDatabase.open_database(**connection)
    try:
        already_there = biodb_name in server.keys()
        if already_there:
            # Replace the previous contents rather than appending to them.
            server.remove_database(biodb_name)
            server.adaptor.commit()
        server.new_database(biodb_name)
        target_db = server[biodb_name]
        target_db.load(recs)
        server.adaptor.commit()
    except:
        # Catch-all on purpose: roll back, then let the error surface.
        server.adaptor.rollback()
        raise

示例#31

0

显示文件

文件： locus_tag2ffn.py 项目： chrisgulvik/genomics_scripts

def locus_tag2record_info(want, infile):
    """Locate a gene by locus_tag in a GFF file and return its record info.

    Args:
        want: The locus_tag value to look for.
        infile: Path of the GFF file to search.

    Returns:
        A tuple ``(start, end, strand, record_id, sequence)`` for the gene
        whose ``locus_tag`` qualifier contains ``want``; the sequence is the
        whole record sequence as a string. If several genes match, the last
        one wins. A tuple of five ``None`` values is returned when nothing
        matches.
    """
    parser = GFFParser()
    limit_info = dict(gff_type=["gene", "CDS", "locus_tag", "product"])
    record_info = (None, None, None, None, None)
    with open(infile, "r") as gff:
        for record in parser.parse(gff, limit_info=limit_info):
            for feature in record.features:
                if feature.type != "gene":
                    continue
                if "locus_tag" not in feature.qualifiers:
                    continue
                if want in feature.qualifiers["locus_tag"]:
                    record_info = (
                        feature.location.start.position,
                        feature.location.end.position,
                        feature.strand,
                        record.id,
                        # str() is the supported spelling; Seq.tostring()
                        # is absent from current Biopython releases.
                        str(record.seq),
                    )
    return record_info

示例#32

0

显示文件

文件： test_GFFSeqIOFeatureAdder.py 项目： jamescasbon/bcbb

 def t_basic_attributes(self):
     """Read the basic GFF2 attributes of an Ensembl GTF exon.

     Parsing is limited to ('snoRNA', 'exon') lines; the single feature on
     record 'I' must expose the expected qualifier keys and values.
     """
     exon_only = {"gff_source_type": [('snoRNA', 'exon')]}
     parsed = GFFParser().parse(self._ensembl_file, limit_info=exon_only)
     chrom = SeqIO.to_dict(parsed)['I']
     assert len(chrom.features) == 1
     quals = chrom.features[0].qualifiers
     expected_keys = ['Parent', 'exon_number', 'gene_id', 'gene_name',
                      'source', 'transcript_id', 'transcript_name']
     assert sorted(quals.keys()) == expected_keys
     assert quals['source'] == ['snoRNA']
     assert quals['transcript_name'] == ['NR_001477.2']
     assert quals['exon_number'] == ['1']

示例#33

0

显示文件

 def t_nested_features(self):
     """Verify gene -> mRNA -> CDS nesting three levels deep.
     """
     seq_dict = self._get_seq_dict()
     coding_kinds = ('gene', 'mRNA', 'CDS')
     cds_limit_info = {
         "gff_source_type": [('Coding_transcript', kind)
                             for kind in coding_kinds],
         "gff_id": ['I'],
     }
     parsed = GFFParser().parse(self._test_gff_file, seq_dict,
                                limit_info=cds_limit_info)
     genes = SeqIO.to_dict(parsed)['I'].features
     assert len(genes) == 2  # two gene features
     # The first gene is the plain one: a single transcript ...
     transcripts = genes[0].sub_features
     assert len(transcripts) == 1
     # ... which ends up with 15 CDS regions.
     assert len(transcripts[0].sub_features) == 15

示例#34

0

显示文件

def main(gff_file, th_fasta):
    """Write the sequence of selected GFF features to 'this_is_a_test'.

    For every record of ``gff_file`` the features returned by
    ``condens_transcript`` are examined. A feature is skipped when it has at
    least one sub-feature and its own type is 'CDS', 'gene' or 'protein';
    every other feature is written as a FASTA entry (header line with the
    feature id, then the slice of the record's sequence between the
    feature's start and end).

    Args:
        gff_file: GFF input handed to ``GFFParser.parse``.
        th_fasta: FASTA file opened with ``Fasta``; sequences are looked up
            by the lower-cased record id.
    """
    parser = GFFParser()
    seqids = parser.parse(gff_file, None)
    fasta = Fasta(th_fasta, flatten_inplace=True)
    # The context manager flushes and closes the output, which the previous
    # version never did.
    with open('this_is_a_test', 'w') as out_fasta:
        for seqid in seqids:
            for feat in condens_transcript(seqid.features):
                # NOTE(review): the type test looks at the parent feature,
                # not at its sub-features; the sub-features only decide
                # whether the test applies at all. Kept as-is -- confirm
                # whether sub-feature types were meant instead.
                has_cds = bool(feat.sub_features) and (
                    str(feat.type) == 'CDS'
                    or feat.type == 'gene'
                    or feat.type == 'protein')
                if has_cds:
                    continue
                start = int(feat.location.start)
                end = int(feat.location.end)
                out_fasta.write('>%s' % feat.id + '\n')
                out_fasta.write(str(fasta[seqid.id.lower()][start:end]) + '\n')

示例#35

0

显示文件

文件： find_rna.py 项目： PMSeitzer/find_cns

def main(gff_file, outdir, th_fasta):
    """Write features that lack CDS sub-features to '<outdir>/at_no_cds.fasta'.

    For every record of ``gff_file`` the features returned by
    ``conden_transcripts`` are examined. Features having a sub-feature of
    type 'CDS' or 'chromosome' are skipped; each remaining feature is
    written as a FASTA entry: a header with the feature id followed by the
    slice of the record's sequence between the feature's start and end.

    Args:
        gff_file: GFF input handed to ``GFFParser.parse``.
        outdir: Directory receiving ``at_no_cds.fasta``.
        th_fasta: FASTA file opened with ``Fasta``; sequences are looked up
            by the lower-cased record id.
    """
    parser = GFFParser()
    seqids = parser.parse(gff_file, None)

    fasta = Fasta(th_fasta, flatten_inplace=True)
    skipped_types = ('CDS', 'chromosome')
    # Close (and thereby flush) the output deterministically; it used to be
    # left open.
    with open(outdir + "/at_no_cds.fasta", "w") as out_fasta:
        for seqid in seqids:
            for feat in conden_transcripts(seqid.features):
                has_cds = any(subf.type in skipped_types
                              for subf in feat.sub_features)
                if has_cds:
                    continue
                start = int(feat.location.start)
                end = int(feat.location.end)
                out_fasta.write(">%s" % feat.id + "\n")
                out_fasta.write(str(fasta[seqid.id.lower()][start:end]) + "\n")

示例#36

0

显示文件

文件： find_rna.py 项目： yuzhenpeng/find_cns

def main(gff_file, outdir, th_fasta):
    """Write features with RNA sub-features to '<outdir>/at_no_cds.fasta'.

    For every record of ``gff_file`` the features returned by
    ``conden_transcripts`` are examined. A feature is kept only when at
    least one of its sub-features has type tRNA, rRNA, miRNA, snoRNA, ncRNA
    or snRNA; kept features are written as FASTA entries: a header with the
    feature id followed by the slice of the record's sequence between the
    feature's start and end.

    Args:
        gff_file: GFF input handed to ``GFFParser.parse``.
        outdir: Directory receiving ``at_no_cds.fasta``.
        th_fasta: FASTA file opened with ``Fasta``; sequences are looked up
            by the lower-cased record id.
    """
    parser = GFFParser()
    seqids = parser.parse(gff_file, None)

    fasta = Fasta(th_fasta, flatten_inplace=True)
    # Built once instead of once per inspected sub-feature.
    rna_types = frozenset(
        ['tRNA', 'rRNA', 'miRNA', 'snoRNA', 'ncRNA', 'snRNA'])
    # Close (and thereby flush) the output deterministically; it used to be
    # left open.
    with open(outdir + "/at_no_cds.fasta", "w") as out_fasta:
        for seqid in seqids:
            for feat in conden_transcripts(seqid.features):
                has_rna = any(subf.type in rna_types
                              for subf in feat.sub_features)
                if not has_rna:
                    continue
                start = int(feat.location.start)
                end = int(feat.location.end)
                out_fasta.write(">%s" % feat.id + "\n")
                out_fasta.write(str(fasta[seqid.id.lower()][start:end]) + "\n")

示例#37

0

显示文件

 def t_nested_multiparent_features(self):
     """Check that features with several parents are nested correctly.
     """
     seq_dict = self._get_seq_dict()
     cds_limit_info = {
         "gff_source_type": [('Coding_transcript', 'gene'),
                             ('Coding_transcript', 'mRNA'),
                             ('Coding_transcript', 'CDS')],
         "gff_id": ['I'],
     }
     parsed = GFFParser().parse(self._test_gff_file, seq_dict,
                                limit_info=cds_limit_info)
     genes = SeqIO.to_dict(parsed)['I'].features
     assert len(genes) == 2  # two gene features
     # The second gene is the multi-parent one.
     transcripts = genes[1].sub_features
     assert len(transcripts) == 3  # three transcripts
     # The first two transcripts share their CDSs: same count, and the very
     # same feature object is nested under both.
     first_cds, second_cds = transcripts[0].sub_features, transcripts[1].sub_features
     assert len(first_cds) == 6
     assert len(second_cds) == 6
     assert first_cds[0] is second_cds[0]