Example #1
def gff_to_bed(gff_file,
               bed_fh=sys.stdout,
               cds=True,
               species=None,
               rename=False):

    parser = GFFParser()
    seqids = parser.parse(gff_file, None)

    cur_chr = None
    cur_gene_order = 0
    for seqid in seqids:
        for feat in seqid.features:
            subf = feat.sub_features
            if feat.type in ("chromosome", "protein"): continue
            is_cds = any(f.type=="mRNA" or f.type=="CDS" for f in subf) and\
                    feat.type=="gene"
            if cds == is_cds:
                cur_gene_order += 1
                if species is not None:
                    seqid_final = species + seqid.id[-2:]  # hard coded
                else:
                    seqid_final = seqid.id
                if rename:
                    if seqid.id != cur_chr:
                        cur_gene_order = 1
                        cur_chr = seqid.id
                    gene_name = seqid_final + 'g' + '0' * (
                        5 - len(str(cur_gene_order))) + str(cur_gene_order)
                else:
                    gene_name = feat.id

                print >>bed_fh, "\t".join(str(x) for x in (seqid_final, int(str(feat.location.start))+1, \
                        feat.location.end, gene_name))  # +1 is hard coded to current BCBio.GFF
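The snippet above omits its imports. A minimal sketch of what it assumes (GFFParser comes from the bcbio-gff package, and the print >> syntax means Python 2), plus a hypothetical invocation with placeholder file names and species prefix:

import sys
from BCBio.GFF import GFFParser

if __name__ == "__main__":
    # Placeholder paths and species prefix, for illustration only.
    with open("genes.bed", "w") as bed_fh:
        gff_to_bed("annotation.gff3", bed_fh=bed_fh, species="At", rename=True)
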
Example #2
    def not_t_full_celegans(self):
        """Test the full C elegans chromosome and GFF files.

        This is used to test GFF on large files and is not run as a standard
        test. You will need to download the files and adjust the paths
        to run this.
        """
        # read the sequence information
        seq_file = os.path.join(self._full_dir, "c_elegans.WS199.dna.fa")
        gff_file = os.path.join(self._full_dir, "c_elegans.WS199.gff3")
        seq_handle = open(seq_file)
        seq_dict = SeqIO.to_dict(SeqIO.parse(seq_handle, "fasta"))
        seq_handle.close()
        #with open(gff_file) as gff_handle:
        #    possible_limits = feature_adder.available_limits(gff_handle)
        #    pprint.pprint(possible_limits)
        rnai_types = [('Orfeome', 'PCR_product'),
                    ('GenePair_STS', 'PCR_product'),
                    ('Promoterome', 'PCR_product')]
        gene_types = [('Non_coding_transcript', 'gene'),
                      ('Coding_transcript', 'gene'),
                      ('Coding_transcript', 'mRNA'),
                      ('Coding_transcript', 'CDS')]
        limit_info = dict(gff_source_type = rnai_types + gene_types)
        parser = GFFParser()
        for rec in parser.parse(gff_file, seq_dict, limit_info=limit_info):
            pass
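Like most of the test snippets below, this method relies on imports that the excerpt leaves out; a sketch of what they would look like (self._full_dir and the other fixture paths are defined on the test class):

import os
import pprint

from Bio import SeqIO
from BCBio.GFF import GFFParser, GFF3Writer
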
Example #3
def gff_to_bed(gff_file, bed_fh=sys.stdout, cds=True, species=None, rename=False):

    parser = GFFParser()
    seqids = parser.parse(gff_file, None)

    cur_chr = None
    cur_gene_order = 0
    for seqid in seqids:
        for feat in seqid.features:
            subf = feat.sub_features
            if feat.type in ("chromosome", "protein"):
                continue
            is_cds = any(f.type == "mRNA" or f.type == "CDS" for f in subf) and feat.type == "gene"
            if cds == is_cds:
                cur_gene_order += 1
                if species is not None:
                    seqid_final = species + seqid.id[-2:]  # hard coded
                else:
                    seqid_final = seqid.id
                if rename:
                    if seqid.id != cur_chr:
                        cur_gene_order = 1
                        cur_chr = seqid.id
                    gene_name = seqid_final + "g" + "0" * (5 - len(str(cur_gene_order))) + str(cur_gene_order)
                else:
                    gene_name = feat.id

                print >> bed_fh, "\t".join(
                    str(x) for x in (seqid_final, int(str(feat.location.start)) + 1, feat.location.end, gene_name)
                )  # +1 is hard coded to current BCBio.GFF
Example #4
 def t_unknown_seq(self):
     """Prepare unknown base sequences with the correct length.
     """
     parser = GFFParser()
     rec_dict = SeqIO.to_dict(parser.parse(self._test_gff_file))
     assert len(rec_dict["I"].seq) == 12766937
     assert len(rec_dict["X"].seq) == 17718531
Example #5
 def t_fasta_directive(self):
     """Parse FASTA sequence information contained in a GFF3 file.
     """
     parser = GFFParser()
     recs = SeqIO.to_dict(parser.parse(self._gff_file))
     assert len(recs) == 1
     test_rec = recs['chr17']
     assert str(test_rec.seq) == "GATTACAGATTACA"
Example #6
 def t_ensembl_nested_features(self):
     """Test nesting of features with GFF2 files using transcript_id.
     """
     parser = GFFParser()
     rec_dict = SeqIO.to_dict(parser.parse(self._ensembl_file))
     assert len(rec_dict["I"].features) == 2
     t_feature = rec_dict["I"].features[0]
     assert len(t_feature.sub_features) == 32
Example #7
 def t_gff3_noval_attrib(self):
     """Parse GFF3 file from NCBI with a key/value pair with no value.
     """
     parser = GFFParser()
     rec_dict = SeqIO.to_dict(parser.parse(self._test_ncbi))
     assert len(rec_dict) == 1
     t_feature = rec_dict.values()[0].features[0]
     assert t_feature.qualifiers["pseudo"] == ["true"]
Example #8
def parse(gff_content, source=None):
    gff_parser = GFFParser()
    gff = gff_parser.parse(io.StringIO(gff_content))

    records = []
    for record in gff:
        records.append(_create_record_model(record, source))
    if len(records) >= 1:
        return records[0]
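A hypothetical call; _create_record_model is assumed to be defined elsewhere in the same module, and the file name is a placeholder (io.open is used so the content is text under both Python 2 and 3):

import io

with io.open("annotation.gff3", encoding="utf-8") as handle:
    record = parse(handle.read(), source="example")
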
Example #9
 def t_gff_annotations(self):
     """Check GFF annotations placed on an entire sequence.
     """
     parser = GFFParser()
     rec_dict = SeqIO.to_dict(parser.parse(self._test_gff_ann_file))
     final_rec = rec_dict['I']
     assert len(final_rec.annotations.keys()) == 2
     assert final_rec.annotations['source'] == ['Expr_profile']
     assert final_rec.annotations['expr_profile'] == ['B0019.1']
Example #10
 def t_gff3_iterator(self):
     """Iterated parsing in GFF3 files with nested features.
     """
     parser = GFFParser()
     recs = [r for r in parser.parse_in_parts(self._test_gff_file,
         target_lines=70)]
     # should be one big set because we don't have a good place to split
     assert len(recs) == 6
     assert len(recs[0].features) == 59
Example #11
 def t_solid_iterator(self):
     """Iterated parsing in a flat file without nested features.
     """
     parser = GFFParser()
     feature_sizes = []
     for rec in parser.parse_in_parts(self._test_gff_file, target_lines=5):
         feature_sizes.append(len(rec.features))
     assert len(feature_sizes) == 112
     assert max(feature_sizes) == 1
Example #12
def gff_to_bed(gff_file, bed_fh=sys.stdout, cds=True):

    parser = GFFParser()
    seqids = parser.parse(gff_file, None)

    for seqid in seqids:
        for feat in seqid.features:
            print >> bed_fh, "\t".join(
                str(x) for x in (seqid.id, feat.location.start,
                                 feat.location.end, feat.id, feat.type))
Example #13
 def t_solid_iterator(self):
     """Iterated parsing in a flat file without nested features.
     """
     parser = GFFParser()
     feature_sizes = []
     for rec in parser.parse_in_parts(self._test_gff_file,
             target_lines=5):
         feature_sizes.append(len(rec.features))
     assert len(feature_sizes) == 112
     assert max(feature_sizes) == 1
Example #14
 def t_gff2_iteration(self):
     """Test iterated features with GFF2 files, breaking without parents.
     """
     parser = GFFParser()
     recs = []
     for rec in parser.parse_in_parts(self._wormbase_file, target_lines=15):
         recs.append(rec)
     assert len(recs) == 4
     assert recs[0].features[0].type == 'region'
     assert recs[0].features[1].type == 'SAGE_tag'
     assert len(recs[0].features[2].sub_features) == 29
Example #15
 def t_wb_cds_nested_features(self):
     """Nesting of GFF2 features with a flat CDS key value pair.
     """
     parser = GFFParser()
     rec_dict = SeqIO.to_dict(parser.parse(self._wb_alt_file))
     assert len(rec_dict) == 2
     features = rec_dict.values()[1].features
     assert len(features) == 1
     tfeature = features[0]
     assert tfeature.id == "cr01.sctg102.wum.2.1"
     assert len(tfeature.sub_features) == 7
Example #16
def main(in_file):
    base, ext = os.path.splitext(in_file)
    out_file = "%s.gff3" % (base)
    in_handle = open(in_file)
    out_handle = open(out_file, "w")
    reader = GFFParser()
    writer = GFF3Writer()
    writer.write(reader.parse_in_parts(in_handle, target_lines=25000),
            out_handle)
    in_handle.close()
    out_handle.close()
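A hedged usage sketch with a placeholder input name; parse_in_parts yields records built from chunks of target_lines GFF lines, so the conversion streams through large files instead of holding everything in memory:

main("c_elegans.WS199.gff")  # writes c_elegans.WS199.gff3 next to the input
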
Example #17
def get_feature_cord(gff_file, user_feature="gene"):
    """Return (start, stop, orientation) tuples for a feature, keyed by gene id.

    Results are collected in a list per gene to account for multi-part CDS.
    The feature can be one of 'protein', 'gene', 'mRNA', 'CDS', 'exon'.

    user_feature="gene":
      a_dict['FBgn0031208'] = [(7528, 9484, 1, '2L')]
    user_feature="mRNA":
      a_dict['FBgn0031208'] = [(7528, 9484, 1, 'FBtr0300689', '2L'),
                               (7528, 9484, 1, 'FBtr0300690', '2L'),
                               (7528, 9484, 1, 'FBtr0330654', '2L')]
    user_feature="CDS":
      a_dict['FBgn0031208'] = [([(7679, 8116), (8192, 8610)], 1, 'FBtr0300689', '2L'),
                               ([(7679, 8116), (8192, 8589), (8667, 9276)], 1, 'FBtr0300690', '2L'),
                               ([(7679, 8116), (8228, 8610)], 1, 'FBtr0330654', '2L')]
    """
    limit_info = dict(gff_type = ['protein','gene','mRNA','CDS','exon'])
    feature_dict = {}
    parser = GFFParser()
    in_handle = open(gff_file)
    for rec in parser.parse(in_handle,limit_info=limit_info):
        rec_id = rec.id
        for feat in rec.features:
            if feat.type == "gene":
                gene_id = feat.id
                if user_feature == "gene":
                    #gene_id = feat.id
                    assert gene_id not in feature_dict
                    feature_dict[gene_id]=[(feat.location.start.position,
                        feat.location.end.position,feat.strand,rec_id)]
                else:
                    for sub in feat.sub_features:
                        if sub.type == "mRNA":
                            if user_feature == "mRNA":
                                info = (sub.location.start.position,
                                        sub.location.end.position,sub.strand,sub.id,rec_id)
                                if gene_id in feature_dict:
                                    feature_dict[gene_id].append(info)
                                else:
                                    feature_dict[gene_id] = [info]
                            else:
                                codons = []
                                for sub_sub in sub.sub_features:
                                    if sub_sub.type == "CDS":
                                        st = sub_sub.location.start.position
                                        end = sub_sub.location.end.position
                                        codons.append((st,end))
                                info = (codons,sub.strand,sub.id,rec_id)
                                if gene_id in feature_dict:
                                    feature_dict[gene_id].append(info)
                                else:
                                    feature_dict[gene_id] = [info]
    in_handle.close()
    return feature_dict
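A hypothetical call with a placeholder path, collecting the per-transcript CDS layout described in the docstring:

cds_by_gene = get_feature_cord("dmel-all.gff", user_feature="CDS")
for gene_id, transcripts in cds_by_gene.items():
    print gene_id, len(transcripts)
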
Example #18
 def t_no_dict_error(self):
     """Ensure an error is raised when no dictionary to map to is present.
     """
     parser = GFFParser(create_missing=False)
     try:
         for rec in parser.parse(self._test_gff_file):
             pass
         # no error -- problem
         raise AssertionError('Did not complain with missing dictionary')
     except KeyError:
         pass
Example #19
def extract_seq(gff_file, outfile):
    '''For a GFF file with FASTA sequence attached, parse each record and
    write its sequence to a new file in FASTA format.'''
    in_handle = open(gff_file)
    fasta_file = open(outfile, "w")
    parser = GFFParser()
    for rec in parser.parse(in_handle):  # a limit_info dict could restrict what is parsed
        SeqIO.write(rec, fasta_file, "fasta")
    in_handle.close()
    fasta_file.close()
Example #20
 def t_gff3_multiple_ids(self):
     """Deal with GFF3 with non-unique ID attributes, using NCBI example.
     """
     parser = GFFParser()
     rec_dict = SeqIO.to_dict(parser.parse(self._test_ncbi))
     assert len(rec_dict) == 1
     t_features = rec_dict.values()[0].features[1:]
     # 4 feature sets, same ID, different positions, different attributes
     assert len(t_features) == 4
     for f in t_features:
         assert len(f.sub_features) == 3
Example #21
 def t_simple_parsing(self):
     """Parse GFF into a simple line by line dictionary without nesting.
     """
     parser = GFFParser()
     num_lines = 0
     for line_info in parser.parse_simple(self._test_gff_file):
         num_lines += 1
     assert num_lines == 177, num_lines
     line_info = line_info['child'][0]
     assert line_info['quals']['confirmed_est'] == \
             ['yk1055g06.5', 'OSTF085G5_1']
     assert line_info['location'] == [4582718, 4583189]
Example #22
 def t_local_map_reduce(self):
     """General map reduce framework without parallelization.
     """
     cds_limit_info = dict(
             gff_type = ["gene", "mRNA", "CDS"],
             gff_id = ['I']
             )
     parser = GFFParser()
     rec_dict = SeqIO.to_dict(parser.parse(self._test_gff_file,
         limit_info=cds_limit_info))
     test_rec = rec_dict['I']
     assert len(test_rec.features) == 32
Example #23
 def t_jgi_gff(self):
     """Parsing of JGI formatted GFF2, nested using transcriptId and proteinID
     """
     parser = GFFParser()
     rec_dict = SeqIO.to_dict(parser.parse(self._jgi_file))
     tfeature = rec_dict['chr_1'].features[0]
     assert tfeature.location.nofuzzy_start == 37060
     assert tfeature.location.nofuzzy_end == 38216
     assert tfeature.type == 'inferred_parent'
     assert len(tfeature.sub_features) == 6
     sfeature = tfeature.sub_features[1]
     assert sfeature.qualifiers['proteinId'] == ['873']
     assert sfeature.qualifiers['phase'] == ['0']
Example #24
    def t_basic_directives(self):
        """Parse out top level meta-data supplied in a GFF3 file.
        """

        parser = GFFParser()
        recs = SeqIO.to_dict(parser.parse(self._gff_file))
        anns = recs['chr17'].annotations
        assert anns['gff-version'] == ['3']
        assert anns['attribute-ontology'] == ['baz']
        assert anns['feature-ontology'] == ['bar']
        assert anns['source-ontology'] == ['boo']
        assert anns['sequence-region'] == [('foo', '1', '100'), ('chr17',
            '62467934', '62469545')]
Example #25
 def t_tricky_semicolons(self):
     """Parsing of tricky semi-colon positions in WormBase GFF2.
     """
     limit_info = dict(
             gff_source_type = [('Genomic_canonical', 'region')]
             )
     parser = GFFParser()
     rec_dict = SeqIO.to_dict(parser.parse(self._wormbase_file,
         limit_info=limit_info))
     work_rec = rec_dict['I']
     assert len(work_rec.features) == 1
     test_feature = work_rec.features[0]
     assert test_feature.qualifiers['Note'] == \
       ['Clone cTel33B; Genbank AC199162', 'Clone cTel33B; Genbank AC199162']
Example #26
 def t_flat_features(self):
     """Check addition of flat non-nested features to multiple records.
     """
     seq_dict = self._get_seq_dict()
     pcr_limit_info = dict(
         gff_source_type = [('Orfeome', 'PCR_product'),
                      ('GenePair_STS', 'PCR_product'),
                      ('Promoterome', 'PCR_product')]
         )
     parser = GFFParser()
     rec_dict = SeqIO.to_dict(parser.parse(self._test_gff_file, seq_dict,
         limit_info=pcr_limit_info))
     assert len(rec_dict['I'].features) == 4
     assert len(rec_dict['X'].features) == 5
Example #27
 def t_gff3_iterator_limit(self):
     """Iterated interface using a limit query on GFF3 files.
     """
     cds_limit_info = dict(gff_source_type=[('Coding_transcript', 'gene'),
                                            ('Coding_transcript', 'mRNA'),
                                            ('Coding_transcript', 'CDS')],
                           gff_id=['I'])
     parser = GFFParser()
     rec_dict = SeqIO.to_dict(
         parser.parse(self._test_gff_file, limit_info=cds_limit_info))
     assert len(rec_dict) == 1
     tfeature = rec_dict["I"].features[0].sub_features[0]
     for sub_test in tfeature.sub_features:
         assert sub_test.type == "CDS", sub_test
Example #28
def gff_to_bed(gff_file, bed_fh=sys.stdout, cds=True):

    parser = GFFParser()
    seqids = parser.parse(gff_file, None)

    for seqid in seqids:
        for feat in seqid.features:
            subf = feat.sub_features
            if feat.type in ("chromosome", "protein"): continue
            is_cds = any(f.type=="mRNA" or f.type=="CDS" for f in subf) and\
                    feat.type=="gene"
            if cds == is_cds:
                print >>bed_fh, "\t".join(str(x) for x in (seqid.id, feat.location.start, \
                        feat.location.end, feat.id, feat.type))
Example #29
 def t_basic_solid_parse(self):
     """Basic parsing of SOLiD GFF results files.
     """
     parser = GFFParser()
     rec_dict = SeqIO.to_dict(parser.parse(self._test_gff_file))
     test_feature = rec_dict['3_341_424_F3'].features[0]
     assert test_feature.location.nofuzzy_start == 102716
     assert test_feature.location.nofuzzy_end == 102736
     assert len(test_feature.qualifiers) == 7
     assert test_feature.qualifiers['score'] == ['10.6']
     assert test_feature.qualifiers['source'] == ['solid']
     assert test_feature.strand == -1
     assert test_feature.type == 'read'
     assert test_feature.qualifiers['g'] == ['T2203031313223113212']
     assert len(test_feature.qualifiers['q']) == 20
Example #30
 def t_wormbase_nested_features(self):
     """Test nesting of features with GFF2 files using Transcript only.
     """
     parser = GFFParser()
     rec_dict = SeqIO.to_dict(parser.parse(self._wormbase_file))
     assert len(rec_dict) == 3
     parent_features = [f for f in rec_dict["I"].features if f.type ==
             "Transcript"]
     assert len(parent_features) == 1
     inferred_features = [f for f in rec_dict["I"].features if f.type ==
             "inferred_parent"]
     assert len(inferred_features) == 0
     tfeature = parent_features[0]
     assert tfeature.qualifiers["WormPep"][0] == "WP:CE40797"
     assert len(tfeature.sub_features) == 46
Example #31
def main(seq_file, gff_file):
    # -- To be customized
    # You need to update these parameters to point to your local database
    # XXX demo example could be swapped to use SQLite when that is integrated
    db_name = "orphan.db"
    biodb_name = 'metagenomic_database'

    print "Parsing FASTA sequence file..."
    with open(seq_file) as seq_handle:
        seq_dict = SeqIO.to_dict(SeqIO.parse(seq_handle, "fasta"))

    print "Parsing GFF data file..."
    parser = GFFParser()
    recs = parser.parse(gff_file, seq_dict)  # limit_info could be passed here to restrict parsing
    for r in recs:
        print r.features[0]
Example #32
    def t_gff3_to_gff3(self):
        """Read in and write out GFF3 without any loss of information.
        """
        parser = GFFParser()
        recs = SeqIO.to_dict(parser.parse(self._test_gff_file))
        out_handle = StringIO.StringIO()
        writer = GFF3Writer()
        writer.write(recs.values(), out_handle)
        wrote_handle = StringIO.StringIO(out_handle.getvalue())
        recs_two = SeqIO.to_dict(parser.parse(wrote_handle))

        orig_rec = recs.values()[0]
        re_rec = recs_two.values()[0]
        assert len(orig_rec.features) == len(re_rec.features)
        for i, orig_f in enumerate(orig_rec.features):
            assert str(orig_f) == str(re_rec.features[i])
Example #33
 def t_gff3_iterator_limit(self):
     """Iterated interface using a limit query on GFF3 files.
     """
     cds_limit_info = dict(
             gff_source_type = [('Coding_transcript', 'gene'),
                          ('Coding_transcript', 'mRNA'),
                          ('Coding_transcript', 'CDS')],
             gff_id = ['I']
             )
     parser = GFFParser()
     rec_dict = SeqIO.to_dict(parser.parse(self._test_gff_file,
         limit_info=cds_limit_info))
     assert len(rec_dict) == 1
     tfeature = rec_dict["I"].features[0].sub_features[0]
     for sub_test in tfeature.sub_features:
         assert sub_test.type == "CDS", sub_test
Example #34
 def t_line_adjust(self):
     """Adjust lines during parsing to fix potential GFF problems.
     """
     def adjust_fn(results):
         rec_index = results['quals']['i'][0]
         read_name = results['rec_id']
         results['quals']['read_name'] = [read_name]
         results['rec_id'] = rec_index
         return results
     parser = GFFParser(line_adjust_fn=adjust_fn)
     recs = [r for r in parser.parse(self._test_gff_file)]
     assert len(recs) == 1
     work_rec = recs[0]
     assert work_rec.id == '1'
     assert len(work_rec.features) == 112
     assert work_rec.features[0].qualifiers['read_name'] == \
             ['3_336_815_F3']
Example #35
def main(seq_file, gff_file):
    # -- To be customized
    # You need to update these parameters to point to your local database
    # XXX demo example could be swapped to use SQLite when that is integrated
    user = "******"
    passwd = "cdev"
    host = "localhost"
    db_name = "wb199_gff"
    biodb_name = "wb199_gff_cds_pcr"
    # These need to be updated to reflect what you would like to parse
    # out of the GFF file. Set limit_info=None to parse everything, but
    # be sure the file is small or you may deal with memory issues.
    rnai_types = [('Orfeome', 'PCR_product'), ('GenePair_STS', 'PCR_product'),
                  ('Promoterome', 'PCR_product')]
    gene_types = [('Non_coding_transcript', 'gene'),
                  ('Coding_transcript', 'gene'), ('Coding_transcript', 'mRNA'),
                  ('Coding_transcript', 'CDS')]
    limit_info = dict(gff_source_type=rnai_types + gene_types)
    # --
    print "Parsing FASTA sequence file..."
    with open(seq_file) as seq_handle:
        seq_dict = SeqIO.to_dict(SeqIO.parse(seq_handle, "fasta"))

    print "Parsing GFF data file..."
    parser = GFFParser()
    recs = parser.parse(gff_file, seq_dict, limit_info=limit_info)

    print "Writing to BioSQL database..."
    server = BioSeqDatabase.open_database(driver="MySQLdb",
                                          user=user,
                                          passwd=passwd,
                                          host=host,
                                          db=db_name)
    try:
        if biodb_name not in server.keys():
            server.new_database(biodb_name)
        else:
            server.remove_database(biodb_name)
            server.adaptor.commit()
            server.new_database(biodb_name)
        db = server[biodb_name]
        db.load(recs)
        server.adaptor.commit()
    except:
        server.adaptor.rollback()
        raise
Example #36
def main(seq_file, gff_file):
    # -- To be customized
    # You need to update these parameters to point to your local database
    # XXX demo example could be swapped to use SQLite when that is integrated
    user = "******"
    passwd = "wubin"
    host = "localhost"
    db_name = "volvoxdb"
    biodb_name = "volvoxdb_cds_pcr"
    # These need to be updated to reflect what you would like to parse
    # out of the GFF file. Set limit_info=None to parse everything, but
    # be sure the file is small or you may deal with memory issues.
    rnai_types = [('Orfeome', 'PCR_product'),
                ('GenePair_STS', 'PCR_product'),
                ('Promoterome', 'PCR_product')]
    gene_types = [('Non_coding_transcript', 'gene'),
                  ('Coding_transcript', 'gene'),
                  ('Coding_transcript', 'mRNA'),
                  ('Coding_transcript', 'CDS')]
    limit_info = dict(gff_source_type = rnai_types + gene_types)
    # --
    print "Parsing FASTA sequence file..."
    with open(seq_file) as seq_handle:
        seq_dict = SeqIO.to_dict(SeqIO.parse(seq_handle, "fasta"))

    print "Parsing GFF data file..."
    parser = GFFParser()
    recs = parser.parse(gff_file, seq_dict, limit_info=limit_info)

    print "Writing to BioSQL database..."
    server = BioSeqDatabase.open_database(driver="MySQLdb", user=user,
            passwd=passwd, host=host, db=db_name)
    try:
        if biodb_name not in server.keys():
            server.new_database(biodb_name)
        else:
            server.remove_database(biodb_name)
            server.adaptor.commit()
            server.new_database(biodb_name)
        db = server[biodb_name]
        db.load(recs)
        server.adaptor.commit()
    except:
        server.adaptor.rollback()
        raise
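Scripts like Examples #35 and #36 are normally run from the command line; a sketch of the entry point they assume, with a placeholder script name (the argument order follows the main signature):

if __name__ == "__main__":
    import sys
    main(*sys.argv[1:])  # e.g. python gff_to_biosql.py volvox.fa volvox.gff3
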
Example #37
 def t_nested_features(self):
     """Check three-deep nesting of features with gene, mRNA and CDS.
     """
     seq_dict = self._get_seq_dict()
     cds_limit_info = dict(
             gff_source_type = [('Coding_transcript', 'gene'),
                          ('Coding_transcript', 'mRNA'),
                          ('Coding_transcript', 'CDS')],
             gff_id = ['I']
             )
     parser = GFFParser()
     rec_dict = SeqIO.to_dict(parser.parse(self._test_gff_file, seq_dict,
         limit_info=cds_limit_info))
     final_rec = rec_dict['I']
     # first gene feature is plain
     assert len(final_rec.features) == 2 # two gene features
     assert len(final_rec.features[0].sub_features) == 1 # one transcript
     # 15 final CDS regions
     assert len(final_rec.features[0].sub_features[0].sub_features) == 15
Example #38
def main(gff_file, th_fasta):
    parser = GFFParser()
    #parser = GFFExaminer()
    seqids = parser.parse(gff_file, None)
    #seqids = parser.parent_child_map(gff_file)
    fasta = Fasta(th_fasta, flatten_inplace=True)
    out_fasta = open('this_is_a_test','w')
    for seqid in seqids:
        ss = condens_transcript(seqid.features)
        for feat in ss:
            ids = []
            has_cds = False
            ids.append(feat.id)
            for subf in feat.sub_features:
                # check the sub-feature type, not the parent feature
                if subf.type in ('CDS', 'gene', 'protein'):
                    has_cds = True
            if has_cds: continue
            print >>out_fasta, '>%s' % ids[0]
            print >>out_fasta, fasta[seqid.id.lower()][int(feat.location.start):int(feat.location.end)]
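A hedged note on the helpers this snippet leaves undefined: Fasta(..., flatten_inplace=True) matches the pyfasta API, and condens_transcript comes from elsewhere in the original module. A hypothetical invocation with placeholder paths:

main("annotation.gff3", "genome.fasta")
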
Example #39
def main(gff_file, outdir, th_fasta):
    """empty docstring"""
    parser = GFFParser()
    seqids = parser.parse(gff_file, None)

    fasta = Fasta(th_fasta, flatten_inplace=True)
    out_fasta = open(outdir + "/at_no_cds.fasta", "w")
    for seqid in seqids:
        seq_features = conden_transcripts(seqid.features)
        for feat in seq_features:
            has_cds = True
            ids = []
            ids.append(feat.id)
            for subf in feat.sub_features:
                if subf.type in set(
                    ['tRNA', 'rRNA', 'miRNA', 'snoRNA', 'ncRNA', 'snRNA']):
                    has_cds = False
            if has_cds: continue
            #non_cds_feats.append(feat)
%s" % ids[0]">
            print >> out_fasta, ">%s" % ids[0]
            print >> out_fasta, fasta[seqid.id.lower()][int(feat.location.start):int(feat.location.end)]
Example #40
 def t_nested_multiparent_features(self):
     """Verify correct nesting of features with multiple parents.
     """
     seq_dict = self._get_seq_dict()
     cds_limit_info = dict(
             gff_source_type = [('Coding_transcript', 'gene'),
                          ('Coding_transcript', 'mRNA'),
                          ('Coding_transcript', 'CDS')],
             gff_id = ['I']
             )
     parser = GFFParser()
     rec_dict = SeqIO.to_dict(parser.parse(self._test_gff_file, seq_dict,
         limit_info=cds_limit_info))
     final_rec = rec_dict['I']
     # second gene feature is multi-parent
     assert len(final_rec.features) == 2 # two gene features
     cur_subs = final_rec.features[1].sub_features
     assert len(cur_subs) == 3 # three transcripts
     # the first and second transcript have the same CDSs
     assert len(cur_subs[0].sub_features) == 6
     assert len(cur_subs[1].sub_features) == 6
     assert cur_subs[0].sub_features[0] is cur_subs[1].sub_features[0]