示例#1
0
def rebase(parent, child, interpro=False, protein2dna=False):
    """Rebase child features onto matching parent features and print GFF.

    Child features are loaded with ``__get_features`` and looked up by the
    ``id`` of each top-level feature of every record parsed from ``parent``.
    Each matching child feature has its location rewritten by
    ``__update_feature_location``. The record's features are then replaced
    by the rebased child features and the record is written to standard
    output.

    Args:
        parent: GFF input handed to ``GFF.parse``.
        child: Input handed to ``__get_features``.
        interpro: Forwarded to ``__get_features``; when true, the ``status``
            and ``Target`` qualifiers are also removed from every rebased
            feature.
        protein2dna: Forwarded to ``__update_feature_location``.
    """
    child_features = __get_features(child, interpro=interpro)

    for rec in GFF.parse(parent):
        # TODO, replace with recursion in case it's matched against a
        # non-parent feature. We're cheating a bit here right now...
        replacement_features = []
        for feature in rec.features:
            if feature.id not in child_features:
                continue
            # TODO: update starts
            for x in child_features[feature.id]:
                # Then update the location of the actual feature
                __update_feature_location(x, feature, protein2dna)

                if interpro:
                    for key in ('status', 'Target'):
                        # A missing qualifier is fine; only that case is
                        # ignored (previously a bare except hid all errors).
                        x.qualifiers.pop(key, None)

                replacement_features.append(x)
        # We do this so we don't include the original set of features that we
        # were rebasing against in our result.
        rec.features = replacement_features
        GFF.write([rec], sys.stdout)
示例#2
0
文件: io.py 项目: Chris7/edge
    def to_gff(self, filename):
        """
        Export the genome to a GFF file at ``filename``.

        One record is built per fragment of the genome, named after the
        fragment, with one feature per annotation. Every feature is written
        on the forward strand. The FASTA sequence is included in the output.
        """
        def build_feature(annotation):
            # FeatureLocation first bp is AfterPosition, so -1
            location = FeatureLocation(annotation.base_first-1, annotation.base_last)
            qualifiers = {'name': annotation.feature.name}
            return SeqFeature(location,
                              type=annotation.feature.type,
                              strand=1,
                              qualifiers=qualifiers)

        records = []
        for fragment in self.__genome.fragments.all():
            indexed = fragment.indexed_fragment()
            rec = SeqRecord(Seq(indexed.sequence), "%s" % (indexed.name,))
            rec.features = [build_feature(a) for a in indexed.annotations()]
            records.append(rec)

        with open(filename, "w") as out_handle:
            GFF.write(records, out_handle, include_fasta=True)
示例#3
0
def CpGIslandsToGFF(island_location):
    """Write methylation regions (CpG islands) to a GFF file.

    The output path is the current working directory plus the module-level
    ``base`` name with its extension replaced by ``.gff``. The sequence and
    record name come from the module-level ``cur_record``. A single
    ``region`` feature spanning the whole record is written, with one
    ``CpG_island`` sub-feature per entry of ``island_location``.

    Args:
        island_location: Iterable of indexable items; each island spans
            ``int(i[0] - i[1]/2)`` to ``int(i[0] + i[1]/2)``. Presumably
            ``i[0]`` is the centre and ``i[1]`` the length -- confirm
            against the caller.
    """
    stem = os.path.splitext(base)[0]
    out_file = os.getcwd() + '/' + stem + '.gff'

    rec = SeqRecord(cur_record.seq, "ID1")

    region_qualifiers = {"source": "bssimulation", "score": '.', "ID": cur_record.name}
    island_qualifiers = {"source": "bssimulation"}
    region = SeqFeature(
        FeatureLocation(0, len(cur_record)),
        type="region",
        strand=0,
        qualifiers=region_qualifiers)

    for island in island_location:
        first = int(island[0] - island[1]/2)
        last = int(island[0] + island[1]/2)
        island_feature = SeqFeature(
            FeatureLocation(first, last),
            type="CpG_island",
            strand=0,
            qualifiers=island_qualifiers)
        region.sub_features.append(island_feature)

    rec.features = [region]

    with open(out_file, "w") as out_handle:
        GFF.write([rec], out_handle)
示例#4
0
文件: io.py 项目: ginkgobioworks/edge
    def to_gff_file(self, file):
        """
        Export the genome in GFF format to the provided file-like object.

        One record is built per fragment of the genome, named after the
        fragment, with one feature per annotation. An annotation whose
        feature has no strand (``None``) is written with strand 0. The FASTA
        sequence is included in the output.
        """
        def build_feature(annotation):
            # FeatureLocation first bp is AfterPosition, so -1
            location = FeatureLocation(annotation.base_first - 1, annotation.base_last)
            qualifiers = {'name': annotation.feature.name}
            strand = annotation.feature.strand
            if strand is None:
                strand = 0
            return SeqFeature(location,
                              type=annotation.feature.type,
                              strand=strand,
                              qualifiers=qualifiers)

        records = []
        for fragment in self.__genome.fragments.all():
            indexed = fragment.indexed_fragment()
            rec = SeqRecord(Seq(indexed.sequence), "%s" % (indexed.name,))
            rec.features = [build_feature(a) for a in indexed.annotations()]
            records.append(rec)

        GFF.write(records, file, include_fasta=True)
 def t_write_from_recs(self):
     """Check the GFF3 lines written for a SeqRecord with nested features.

     A 20 bp record holding one gene with two exon sub-features is written
     to an in-memory handle. The two header directives, the gene line and
     the first exon line are compared against their expected values.
     """
     gene_qualifiers = {"source": "prediction", "score": 10.0,
                        "other": ["Some", "annotations"], "ID": "gene1"}
     exon_qualifiers = {"source": "prediction"}
     gene = SeqFeature(FeatureLocation(0, 20), type="gene", strand=1,
                       qualifiers=gene_qualifiers)
     # Both exons share one qualifier dictionary.
     gene.sub_features = [
         SeqFeature(FeatureLocation(start, end), type="exon", strand=1,
                    qualifiers=exon_qualifiers)
         for start, end in ((0, 5), (15, 20))]
     rec = SeqRecord(Seq("GATCGATCGATCGATCGATC"), "ID1")
     rec.features = [gene]

     out_handle = StringIO.StringIO()
     GFF.write([rec], out_handle)
     written = out_handle.getvalue().split("\n")

     expected_gene = ['ID1', 'prediction', 'gene', '1', '20', '10.0', '+',
                      '.', 'other=Some,annotations;ID=gene1']
     expected_exon = ['ID1', 'prediction', 'exon', '1', '5', '.', '+', '.',
                      'Parent=gene1']
     assert written[0] == "##gff-version 3"
     assert written[1] == "##sequence-region ID1 1 20"
     assert written[2].split("\t") == expected_gene
     assert written[3].split("\t") == expected_exon
示例#6
0
def main():
    """Command-line entry point: convert a SAM file to GFF3 with BCBio GFF.

    The output is written next to the input, with the ``.sam`` suffix
    replaced by ``.gff3``. When ``--input_fasta`` is given, a mapping of
    sequence id to sequence length is built from it and passed to
    ``GMAPSAMReader`` as ``query_len_dict``. Records for which
    ``convert_sam_rec_to_gff3_rec`` returns ``None`` are dropped.

    Exits with status -1 when the input name does not end in ``.sam``.
    """
    from argparse import ArgumentParser

    parser = ArgumentParser("Convert SAM to GFF3 format using BCBio GFF")
    parser.add_argument("sam_filename")
    parser.add_argument("-i", "--input_fasta", default=None, help="(Optional) input fasta. If given, coverage will be calculated.")
    parser.add_argument("-s", "--source", required=True, help="source name (ex: hg38, mm10)")

    args = parser.parse_args()

    if not args.sam_filename.endswith('.sam'):
        # sys.stderr.write works on both Python 2 and 3, unlike ``print >>``.
        sys.stderr.write("Only accepts files ending in .sam. Abort!\n")
        sys.exit(-1)

    prefix = args.sam_filename[:-4]
    output_gff3 = prefix + '.gff3'

    q_dict = None
    if args.input_fasta is not None:
        # Close the FASTA handle once the lengths have been collected.
        with open(args.input_fasta) as fasta_handle:
            q_dict = dict((r.id, len(r.seq)) for r in SeqIO.parse(fasta_handle, 'fasta'))

    # Convert before opening the output so that a failure during conversion
    # does not leave an empty, truncated GFF3 file behind.
    reader = GMAPSAMReader(args.sam_filename, True, query_len_dict=q_dict)
    recs = [convert_sam_rec_to_gff3_rec(r0, args.source) for r0 in reader]
    kept = [rec for rec in recs if rec is not None]

    with open(output_gff3, 'w') as f:
        BCBio_GFF.write(kept, f)

    sys.stderr.write("Output written to {0}.\n".format(output_gff3))
示例#7
0
def rebase(parent, child, interpro=False, protein2dna=False):
    """Rebase child features onto matching parent features and print GFF.

    Child features are loaded with ``__get_features``. For every record
    parsed from ``parent``, ``feature_lambda`` selects the features whose
    ``ID`` qualifier is one of the child feature keys. The child features
    stored under each selected feature's ``id`` have their locations
    rewritten by ``__update_feature_location``. The record's features are
    replaced by the rebased child features, its annotations are cleared, and
    it is written to standard output.

    Args:
        parent: GFF input handed to ``GFF.parse``.
        child: Input handed to ``__get_features``.
        interpro: Forwarded to ``__get_features``; when true, the ``status``
            and ``Target`` qualifiers are also removed from every rebased
            feature.
        protein2dna: Forwarded to ``__update_feature_location``.
    """
    child_features = __get_features(child, interpro=interpro)

    for rec in GFF.parse(parent):
        replacement_features = []
        matching_parents = feature_lambda(
            rec.features,
            feature_test_qual_value,
            {
                'qualifier': 'ID',
                'attribute_list': child_features.keys(),
            },
            subfeatures=False)

        for feature in matching_parents:
            for x in child_features[feature.id]:
                # Then update the location of the actual feature
                __update_feature_location(x, feature, protein2dna)

                if interpro:
                    for key in ('status', 'Target'):
                        # A missing qualifier is fine; only that case is
                        # ignored (previously a bare except hid all errors).
                        x.qualifiers.pop(key, None)

                replacement_features.append(x)
        # We do this so we don't include the original set of features that we
        # were rebasing against in our result.
        rec.features = replacement_features
        rec.annotations = {}
        GFF.write([rec], sys.stdout)
def load_gff(gff):
    """Read a GFF file line by line and group its features by gene name.

    Lines starting with ``#`` or containing ``contig`` are skipped. Every
    other line is wrapped in ``GFF`` and given an ``id``: the ``ID``
    attribute by default, or ``<exon id>_CDS`` for a CDS lying entirely
    inside a previously seen exon of the same gene (the last such exon
    wins). A message is printed when no id could be determined.

    Args:
        gff: Path of the GFF file.

    Returns:
        A ``defaultdict`` mapping gene name to the list of its features.

    Exits the program (``sys.exit()``) after printing a message if an
    ``IOError`` occurs while loading.
    """
    genes = defaultdict(list)
    # gene name -> exon id -> (start, end)
    exon_spans_by_gene = defaultdict(lambda: defaultdict(tuple))
    try:
        with open(gff) as handle:
            for line in handle:
                if line.startswith('#') or 'contig' in line:
                    continue
                feature = GFF(line)
                gene_id = get_gene_attribute(feature, "ID")
                kind = feature.featuretype
                if kind == 'exon':
                    exon_spans_by_gene[feature.genename][gene_id] = (feature.start, feature.end)
                if kind == 'CDS':
                    for exon_id, (lo, hi) in exon_spans_by_gene[feature.genename].items():
                        starts_inside = lo <= feature.start <= hi
                        ends_inside = lo <= feature.end <= hi
                        if starts_inside and ends_inside:
                            gene_id = exon_id + "_CDS"
                if gene_id is None:
                    print("No gene id for CDS found", feature, end="")
                feature.id = gene_id
                genes[feature.genename].append(feature)

    except IOError:
        print("Failed to load GFF file {}".format(gff))
        sys.exit()

    return genes
示例#9
0
 def genbank_to_gff(self,
         genbank_file):
     """Convert a GenBank file to GFF (with FASTA) next to the input.

     The output path is the input path with its extension replaced by
     ``.gff``.

     Returns:
         A dictionary with the single key ``gff_file`` holding that path.
     """
     from Bio import SeqIO
     from BCBio import GFF
     stem = os.path.splitext(genbank_file)[0]
     gff_file = "{0}.gff".format(stem)
     with open(gff_file, "w") as out_handle:
         records = SeqIO.parse(genbank_file, "genbank")
         GFF.write(records, out_handle, include_fasta=True)
     return {"gff_file": gff_file}
示例#10
0
def main(gb_file,include_fasta=None):
    """Convert a GenBank file to a ``.gff`` file next to the input.

    Args:
        gb_file: Path of the GenBank input; the output path is the same path
            with its extension replaced by ``.gff``.
        include_fasta: Optional string flag. FASTA sequence is included only
            when it equals ``true``, ``yes`` or ``1`` (case-insensitive).
    """
    stem = os.path.splitext(gb_file)[0]
    out_file = "{0}.gff".format(stem)
    truthy = ("true","yes","1")
    inc_fasta = include_fasta is not None and include_fasta.lower() in truthy

    with open(out_file, "w") as out_handle:
        records = SeqIO.parse(gb_file, "genbank")
        GFF.write(records, out_handle, inc_fasta)
示例#11
0
def genbank_to_gff(gb_file):
    """Convert a GenBank file to a GFF3 file for IGV display.

    The output path is the input path with its extension replaced by
    ``.gff3``. Nothing is done when that file already exists. Records are
    passed through ``_filter_features`` with a size limit of 1e4 before
    being written.
    """
    max_size = 1e4
    gff_file = "{0}.gff3".format(os.path.splitext(gb_file)[0])
    if os.path.exists(gff_file):
        # Already converted on an earlier run.
        return
    with open(gb_file) as in_handle, open(gff_file, "w") as out_handle:
        records = SeqIO.parse(in_handle, "genbank")
        GFF.write(_filter_features(records, max_size), out_handle)
示例#12
0
def embl2gff(dat, org, gff):
    """
    Parse an EMBL file and write the records of one organism as GFF.

    Records whose name starts with ``org`` are kept, their ``id`` is set to
    their ``name``, and they are written to ``gff`` with ``GFF.write``.
    """
    selected = []
    for record in SeqIO.parse(dat, "embl"):
        # keep organism-specific miRNAs only
        if record.name.startswith(org):
            selected.append(record)
    for record in selected:
        record.id = record.name
    GFF.write(selected, gff)
示例#13
0
def to_GFF(args):
    """
    Convert a sequence file (e.g. GenBank or EMBL) to GFF.

    Biopython does not natively support GFF, so the records parsed by
    ``SeqIO`` are written with ``GFF.write``.

    Can be useful for QUAST (Quality Assessment Tool for Genome Assemblies)

    :param args: an argparse args list providing ``inFormat`` (lower-cased
        and used as the ``SeqIO`` format name), ``input`` and ``output``
        (file paths)
    """
    source_format = args.inFormat.lower()
    with open(args.input) as source:
        with open(args.output, "w") as sink:
            records = SeqIO.parse(source, source_format)
            GFF.write(records, sink)
 def t_write_seqrecord(self):
     """Write a single SeqRecord and check the record id on the GFF line.

     The record is written with FASTA included; the third output line is
     expected to be the feature line, whose first column is the record id.
     """
     gene_qualifiers = {"source": "prediction", "score": 10.0,
                        "other": ["Some", "annotations"], "ID": "gene1"}
     gene = SeqFeature(FeatureLocation(0, 20), type="gene", strand=1,
                       qualifiers=gene_qualifiers)
     rec = SeqRecord(Seq("GATCGATCGATCGATCGATC"), "ID1")
     rec.features = [gene]

     out_handle = StringIO.StringIO()
     GFF.write([rec], out_handle, include_fasta=True)
     written = out_handle.getvalue().split("\n")

     first_column = written[2].split("\t")[0]
     assert first_column == "ID1"
    def t_gff3_to_gff3(self):
        """Read in and write out GFF3 without any loss of information.

        The test file is parsed, written to an in-memory handle and parsed
        again. One record from the first parse is then compared feature by
        feature with the record of the same id from the second parse.
        """
        recs = SeqIO.to_dict(GFF.parse(self._test_gff_file))
        out_handle = StringIO.StringIO()
        GFF.write(recs.values(), out_handle)
        wrote_handle = StringIO.StringIO(out_handle.getvalue())
        recs_two = SeqIO.to_dict(GFF.parse(wrote_handle))

        # list() keeps this working where dict.values() is a view.
        orig_rec = list(recs.values())[0]
        # Compare against the re-parsed record. Previously both sides came
        # from ``recs``, so the record was only compared with itself.
        re_rec = recs_two[orig_rec.id]
        assert len(orig_rec.features) == len(re_rec.features)
        for i, orig_f in enumerate(orig_rec.features):
            assert str(orig_f) == str(re_rec.features[i])
示例#16
0
    def handle(self, *args, **options):
        """Load an organism, its reference sequences and its genes.

        Reads these keys from ``options``: ``organism_name``, ``taxon``,
        ``ebi_id``, ``fasta`` and ``gff3``. An ``Organism`` is fetched or
        created, then one ``RefSeq`` per FASTA record, then one ``Gene`` per
        top-level feature of type ``gene`` in the GFF3 file. Each GFF3
        record must match an existing ``RefSeq`` by name and organism.
        """
        organism, _organism_created = Organism.objects.get_or_create(
            common_name=options['organism_name'],
            taxon=options['taxon'],
            ebi_id=options['ebi_id'],
        )

        for record in SeqIO.parse(options['fasta'], "fasta"):
            RefSeq.objects.get_or_create(
                name=record.id,
                length=len(record.seq),
                organism=organism,
            )

        for rec in GFF.parse(options['gff3']):
            rs = RefSeq.objects.get(name=rec.id, organism=organism)
            gene_features = (feat for feat in rec.features if feat.type == 'gene')
            for feat in gene_features:
                Gene.objects.get_or_create(
                    start=feat.location.start,
                    end=feat.location.end,
                    strand=feat.location.strand,
                    refseq=rs,
                    db_object_id=feat.id,
                    db_object_symbol=feat.id,
                )
示例#17
0
def get_gff_dict(gfffile):
    """Creates a dictionary with product information from given gff file.

    For every record, each feature that has a non-empty ``product``
    qualifier contributes one entry of the form
    ``featuretype;product;product``. Entries are joined with ``,``, e.g.
    ``CDS;protein3;protein31,CDS;protein3``. Features without a ``product``
    qualifier are skipped.

    Returns dictionary. Dictionary key is the contig id, values are products
    for the contig, or ``"N/A"`` when the record has no feature with a
    product.
    """
    out_dict = {}

    for rec in GFF.parse(gfffile):
        entries = []
        for f in rec.features:
            # .get() tolerates features that carry no product qualifier.
            products = f.qualifiers.get('product', [])
            if len(products) > 0:
                entries.append(";".join([f.type] + products))

        # An empty feature list and "no feature with a product" both end up
        # as "N/A"; no list-vs-int comparison is needed to detect either.
        out_dict[rec.id] = ",".join(entries) if entries else "N/A"

    return out_dict
示例#18
0
def gene_to_early_exons(gene_name, num_exons):
    """Collect the first exons of a gene from the bundled GTF annotation.

    Exon records are read one at a time from
    ``crispr_app/Homo_sapiens.GRCh38.84.gtf``. Scanning stops once more
    matching exons have been seen than the highest ``exon_number`` so far
    (the original author's "only first version of gene" cut-off), or once
    more than ``num_exons`` exons have matched.

    Args:
        gene_name: Value compared with the first ``gene_name`` qualifier of
            each exon record.
        num_exons: Maximum number of exons to collect.

    Returns:
        A tuple ``(exons, chromosome)``. ``exons`` maps the exon number (as
        a string) to ``[start, end, strand]``. ``chromosome`` is the id of
        the record holding the last matching exon, or ``None`` when the
        gene was not found (previously this raised ``UnboundLocalError``).
    """
    # Initialize variables
    exons = {}
    exonCount = 0
    maxExons = 0
    chromosome = None

    annotation_file = 'crispr_app/Homo_sapiens.GRCh38.84.gtf'
    limit_info = dict(gff_type=["exon"])

    # The context manager closes the annotation file even if parsing raises.
    with open(annotation_file) as annotation_handle:
        # Parse through annotated data, searching for matching gene names & exons
        for rec in GFF.parse(annotation_handle, limit_info=limit_info, target_lines=1):
            feature = rec.features[0]
            qualifiers = feature.qualifiers

            # Qualifier values are lists; compare the first value directly.
            # Records without a gene_name qualifier are skipped.
            if qualifiers.get('gene_name', [None])[0] != gene_name:
                continue

            # Once matching gene is found, determine the exon regions and chromosome
            chromosome = rec.id
            strand = feature.strand

            # Get only first version of gene in annotated data
            exonNum = qualifiers['exon_number'][0]
            maxExons = max(maxExons, int(exonNum))
            exonCount += 1
            if exonCount > maxExons:
                break
            if exonCount > num_exons:
                break
            exons[exonNum] = [int(feature.location.start), int(feature.location.end), strand]

    return exons, chromosome
    def not_t_full_celegans(self):
        """Parse the full C elegans chromosome and GFF files.

        This exercises GFF parsing on large files and is not run as a
        standard test. You will need to download the files and adjust the
        paths to run this.
        """
        seq_file = os.path.join(self._full_dir, "c_elegans.WS199.dna.fa")
        gff_file = os.path.join(self._full_dir, "c_elegans.WS199.gff3")
        # read the sequence information
        with open(seq_file) as seq_handle:
            seq_dict = SeqIO.to_dict(SeqIO.parse(seq_handle, "fasta"))
        # Restrict parsing to these (source, type) pairs.
        rnai_types = [(source, 'PCR_product')
                      for source in ('Orfeome', 'GenePair_STS', 'Promoterome')]
        gene_types = [('Non_coding_transcript', 'gene')]
        gene_types += [('Coding_transcript', kind)
                       for kind in ('gene', 'mRNA', 'CDS')]
        limit_info = dict(gff_source_type=rnai_types + gene_types)
        # Only checks that the whole file parses; the records are discarded.
        for _rec in GFF.parse(gff_file, seq_dict, limit_info=limit_info):
            pass
示例#20
0
def gene_positions(genefile, include_chromosome=True, include_strand=True, coding_only=False, ignore_strange_cases=False):
    """ Return a gene_ID:(chromosome, strand, start_pos, end_pos) dictionary based on GFF input file.

    If include_chromosome and/or include_strand is False, the corresponding values are left out of the output tuples.
    The start/end values come from get_feature_start_end, or from get_gene_start_end_excluding_UTRs when coding_only is True.
    The original author documents the positions as 1-based, end-inclusive.

    If coding_only is True and the helper raises NoRNAError or MultipleRNAError for a gene,
     the error propagates, unless ignore_strange_cases is True, in which case that gene is left out of the output.
    """
    # if coding_only is False, only look at genes, not sub-features
    if coding_only:
        parsing_limits = {}
    else:
        parsing_limits = {'gff_type': ['gene']}

    positions_by_gene = {}
    with open(os.path.expanduser(genefile)) as GENEFILE:
        for chromosome_record in GFF.parse(GENEFILE, limit_info=parsing_limits):
            for gene_record in chromosome_record.features:
                pos_info = (chromosome_record.id,) if include_chromosome else ()
                if include_strand:
                    pos_info += (GFF_strands[gene_record.strand],)
                if coding_only:
                    try:
                        start_end = get_gene_start_end_excluding_UTRs(gene_record)
                    except (NoRNAError, MultipleRNAError):
                        if ignore_strange_cases:
                            continue
                        raise
                else:
                    start_end = get_feature_start_end(gene_record)
                positions_by_gene[gene_record.id] = pos_info + start_end
    return positions_by_gene
示例#21
0
def read_gff_transcripts(fobj, fname="", min_exons=1, merge=0):
    """Read transcripts and their exons from a GFF file.

    Args:
        fobj: GFF input handed to ``GFF.parse``.
        fname: Label stored with every transcript and used in log messages.
        min_exons: Transcripts with fewer exons than this are dropped.
        merge: Exon merging is not implemented for GFF; a warning is logged
            when this is greater than 0.

    Returns:
        A list of ``[transcript_id, fname, exons]`` entries, where ``exons``
        is a list of ``[chrom, start, end, strand]`` with strand ``"+"`` or
        ``"-"`` (a missing strand is reported as ``"+"``).
    """
    logger = logging.getLogger('pita')

    if merge > 0:
        logger.warning("Merging exons not yet implemented for GFF files!")

    # Strand values as found on the features -> strand symbol.
    smap = {"1":"+",1:"+","-1":"-",-1:"-", None:"+"}

    transcripts = []
    for rec in GFF.parse(fobj):
        chrom = rec.id
        for feature in rec.features:
            for gene in _gff_type_iterator(feature, ['mRNA', 'transcript', 'inferred_parent']):
                exons = [
                    [
                        chrom,
                        int(sub.location.start.position),
                        int(sub.location.end.position),
                        smap[sub.strand],
                    ]
                    for sub in gene.sub_features
                    if sub.type == 'exon'
                ]
                logger.debug("%s: %s - %s exons", fname, gene.id, len(exons))
                if len(exons) < min_exons:
                    continue
                transcripts.append([gene.id, fname, exons])

    return transcripts
示例#22
0
 def t_key_whitespace(self):
     """Check that keys with problematic whitespace are parsed cleanly.

     Every parsed line after the first three must expose the qualifier
     ``foo`` with the value ``["bar"]``.
     """
     tfile = os.path.join(self._test_dir, "spaces.gff3")
     for index, parsed in enumerate(GFF.parse_simple(tfile)):
         if index <= 2:
             continue
         assert parsed["quals"]["foo"] == ["bar"]
示例#23
0
def prepareSample(filter_matrix, gff_path):
    """Pad ``filter_matrix`` with randomly chosen genes from a GFF file.

    Every top-level ``gene`` feature is counted; those whose first
    ``locus_tag`` is not already a key of ``filter_matrix`` become
    candidates. Randomly drawn candidates are then added with the value
    ``(0, 0)``.

    Args:
        filter_matrix: Mapping keyed by locus tag; modified in place.
        gff_path: Path of the GFF file to read.

    Returns:
        The same ``filter_matrix`` object, after modification.
    """
    random.seed()
    candidate_list = []
    gene_count = 0
    # The context manager closes the file even if parsing raises.
    with open(gff_path, 'r') as handle:
        for record in GFF.parse(handle):
            for feature in record.features:
                if feature.type != 'gene':
                    continue
                locus_tag = feature.qualifiers['locus_tag'][0]
                gene_count += 1
                if locus_tag not in filter_matrix:
                    candidate_list.append(locus_tag)

    countToAdd = round(gene_count / 2) - len(filter_matrix)
    if countToAdd > 0:
        # NOTE(review): range(1, countToAdd) adds countToAdd - 1 entries.
        # Kept as in the original; confirm whether an off-by-one is intended.
        for i in range(1, countToAdd):
            list_id = random.randint(0, len(candidate_list) - 1)
            # pop() removes the drawn entry without searching the list again.
            locus_str = candidate_list.pop(list_id)
            filter_matrix[locus_str] = (0, 0)

    return(filter_matrix)
 def t_ensembl_nested_features(self):
     """Check nesting of GFF2 features that are grouped by transcript_id.

     Record ``I`` of the Ensembl test file must have 2 top-level features,
     the first of which has 32 sub-features.
     """
     records = SeqIO.to_dict(GFF.parse(self._ensembl_file))
     top_level = records["I"].features
     assert len(top_level) == 2
     first_transcript = records["I"].features[0]
     assert len(first_transcript.sub_features) == 32
 def t_write_fasta(self):
     """Check that FASTA records are included in GFF output on request.

     After the two header lines and one feature line, the output must hold
     the ``##FASTA`` directive, the FASTA header and the sequence itself.
     """
     seq = Seq("GATCGATCGATCGATCGATC")
     gene_qualifiers = {"source": "prediction", "score": 10.0,
                        "other": ["Some", "annotations"], "ID": "gene1"}
     gene = SeqFeature(FeatureLocation(0, 20), type="gene", strand=1,
                       qualifiers=gene_qualifiers)
     rec = SeqRecord(seq, "ID1")
     rec.features = [gene]

     out_handle = StringIO.StringIO()
     GFF.write([rec], out_handle, include_fasta=True)
     written = out_handle.getvalue().split("\n")

     directive, header, sequence = written[3:][:3]
     assert directive == "##FASTA"
     assert header == ">ID1 <unknown description>"
     assert sequence == str(seq)
 def t_fasta_directive(self):
     """Check that FASTA sequence embedded in a GFF3 file is parsed.

     The test file must yield exactly one record, ``chr17``, whose sequence
     is ``GATTACAGATTACA``.
     """
     records = SeqIO.to_dict(GFF.parse(self._gff_file))
     assert len(records) == 1
     chr17 = records['chr17']
     assert str(chr17.seq) == "GATTACAGATTACA"
示例#27
0
def doWork( args ):
    """Draw one feature track per GFF file and save the resulting panel.

    Only the first record of each file in ``args.gffs`` is used. CDS
    features are coloured grey when their first ``product`` qualifier is
    ``hypothetical protein`` and blue otherwise. ``source`` features are
    drawn fully transparent, and all other features use ``'0.0'`` for both
    face and edge colour. The panel is saved to ``args.outfile`` with an
    x-range from 0 to the length of the longest record.
    """
    panel=Panel(fig_width=900, padding = 25, grid=None, xmin=0)
    seq_length = 0
    for gff in args.gffs:
        # Builtin next() works on Python 2.6+ and 3; the generator method
        # .next() exists on Python 2 only.
        seqrecord = next(GFF.parse(gff))
        seq_length = max(seq_length, len(seqrecord))
        cds_track = tracks.BaseTrack(sort_by = 'collapse')
        for feature in seqrecord.features:
            if feature.type == 'CDS':
                if feature.qualifiers['product'][0] == 'hypothetical protein':
                    style = dict(fc='#BDBDBD')
                else:
                    style = dict(fc='#2B8CBE')
            elif feature.type == 'source':
                style = dict(alpha=0.0, fc='1.0', ec='1.0')
            else:
                style = dict(fc='0.0', ec='0.0')
            cds_track.append(features.GenericSeqFeature(
                feature, color_by_cm=False, **style))
        panel.add_track(cds_track)
    panel.save(args.outfile, xmin=0,xmax=seq_length)
示例#28
0
def main(gff_file, fasta_file = None):
    """Convert a GFF file (optionally with a FASTA file) to GenBank.

    The output path is ``gff_file`` with its extension replaced by ``.gb``.
    The parsed records are passed through ``_fix_ncbi_id`` and
    ``_check_gff`` before being written.

    Args:
        gff_file: Path of the GFF input.
        fasta_file: Optional path of a FASTA file supplying the sequences.

    Exits with status 64 after printing an error when either input file is
    empty or cannot be stat'ed.
    """
    import sys

    # Use splitext to remove the extension of the original input file
    out_file = "%s.gb" % os.path.splitext(gff_file)[0]

    def _unusable(path):
        # The file size lives in st_size; comparing the stat result itself
        # with 0 is never true, and its truth value is always True.
        try:
            return os.stat(path).st_size == 0
        except OSError:
            return True

    inputs = [gff_file] if fasta_file is None else [gff_file, fasta_file]
    if any(_unusable(path) for path in inputs):
        sys.stdout.write("ERROR: Empty file provided or cannot stat files\n")
        sys.exit(64)

    # Parser will differ slightly if fasta file is given
    if fasta_file is None:
        gff_iter = GFF.parse(gff_file) #Parser/generator object
    else:
        fasta_input = SeqIO.to_dict(SeqIO.parse(fasta_file, "fasta", generic_dna)) # Process fasta file
        gff_iter = GFF.parse(gff_file, fasta_input) # Give fasta file to parser

    # One line to call all the checking function and to write in genbank format
    SeqIO.write(_check_gff(_fix_ncbi_id(gff_iter)), out_file, "genbank")
def shortrna_regions(mirna_gff, star_csv, seq_file):
    """Return miRNA sequences with corresponding guide and star regions.
    """
    seq_index = SeqIO.index(seq_file, "fasta")
    mirna_seqs = dict()
    with open(star_csv) as in_handle:
        for name, guide, star in csv.reader(in_handle):
            mirna_seqs[name] = (guide.strip(), star.strip())

    for rec in GFF.parse(mirna_gff):
        cur_seq = str(seq_index[rec.id].seq)
        for f in rec.features:
            name = f.qualifiers["ID"][0]
            start, end = (f.location.nofuzzy_start, f.location.nofuzzy_end)
            yield (rec.id, start, end, name)
            #guide, star = mirna_seqs.get(name, ("", ""))
            for seq_name, guide, star in [(n, g, s) for n, (g, s) in
                    mirna_seqs.iteritems() if n.startswith(name)]:
                for find_seq, ext in [(guide, "guide"), (star, "star")]:
                    if find_seq:
                        if f.strand == -1:
                            find_seq = str(Seq(find_seq).reverse_complement())
                        region = cur_seq[start:end]
                        pos = region.find(find_seq)
                        if pos > -1:
                            yield (rec.id, start + pos, start + pos + len(find_seq),
                                    "%s_%s" % (seq_name, ext))
                        else:
                            print f.strand, name, ext, pos, find_seq, region
                            raise NotImplementedError
def load_gff(gff):
    """Parses a single GFF file and returns a chromosome-indexed dict for
       that file.

    Only entries whose first feature has the type ``chromosome`` are kept.
    Paths ending in ``.gz`` are read through gzip.

    Arguments
    ---------
    gff: str
        Filepath to GFF

    Returns
    -------
    dict: A dictionary representation of the GFF entries, indexed by
            chromosome ID
    """
    if gff.endswith('.gz'):
        import gzip
        from io import TextIOWrapper
        fp = TextIOWrapper(gzip.open(gff))
    else:
        fp = open(gff)

    annotations = {}
    # The context manager closes the handle even if parsing raises.
    with fp:
        for entry in GFF.parse(fp):
            if entry.features and entry.features[0].type == 'chromosome':
                annotations[entry.id] = entry

    return annotations
示例#31
0
def gb2gff(infile, outfile):
    """Translate GenBank file to GFF3 file. TODO: the procedure now does not
    handle join correctly

    Args:
        infile (str): input GenBank file
        outfile (str): output GFF3 file
    Returns:
        Whatever ``GFF.write`` returns. The original author documents this
        as the number of records written. NOTE(review): confirm against the
        BCBio version in use.
    """
    # Both handles are closed on exit, including when conversion fails;
    # the input handle was previously never closed.
    with open(infile, 'r') as gb_handle, open(outfile, 'w') as gff_handle:
        res = GFF.write(SeqIO.parse(gb_handle, "gb"), gff_handle)
    return (res)
示例#32
0
 def t_wormbase_nested_features(self):
     """Check nesting of GFF2 features that are grouped by Transcript only.

     The WormBase test file must yield 3 records. Record ``I`` must have
     exactly one ``Transcript`` feature and no ``inferred_parent`` feature.
     That transcript must carry the expected WormPep id and have 46
     sub-features.
     """
     rec_dict = SeqIO.to_dict(GFF.parse(self._wormbase_file))
     assert len(rec_dict) == 3

     def of_type(wanted):
         return [f for f in rec_dict["I"].features if f.type == wanted]

     transcripts = of_type("Transcript")
     assert len(transcripts) == 1
     inferred = of_type("inferred_parent")
     assert len(inferred) == 0
     transcript = transcripts[0]
     assert transcript.qualifiers["WormPep"][0] == "WP:CE40797"
     assert len(transcript.sub_features) == 46
示例#33
0
def working_stuff():
    """Parse the Coding_transcript features of chromosome I from a fixed
    C. elegans WS199 GFF3 file.

    Parsing is limited to the types gene, mRNA, CDS and exon with the source
    ``Coding_transcript`` and to the sequence id ``I``.

    Returns:
        A list of the records produced by ``GFF.parse``.
    """
    from BCBio import GFF

    gff_type = ["gene", "mRNA", "CDS", "exon"]
    # Materialise the pairs: on Python 3 zip() is a one-shot iterator.
    # NOTE(review): presumably the parser tests membership per line, which
    # would exhaust a bare iterator -- confirm against BCBio.
    source_type = list(zip(["Coding_transcript"] * len(gff_type), gff_type))

    filter_type = dict(gff_source_type=source_type, gff_id="I")

    gff_path = "/fml/ag-raetsch/share/databases/genomes/C_elegans/elegans_WS199/annotation/c_elegans.WS199.gff3"
    # Parsing is lazy, so collect the records before the handle is closed.
    with open(gff_path) as gff_handle:
        element = list(GFF.parse(gff_handle, limit_info=filter_type))

    return element
示例#34
0
 def t_extra_comma(self):
     """Check that GFF3 files with extra trailing commas are handled.

     The last record of the test file is searched, two levels below its
     first feature, for the sub-feature named
     ``CDS:NC_000083.5:LOC100040603``. That sub-feature must have exactly
     one ``Parent`` value, and it must be found at least once.
     """
     tfile = os.path.join(self._test_dir, "mouse_extra_comma.gff3")
     with open(tfile) as in_handle:
         # Exhaust the parser; ``rec`` keeps the last record afterwards.
         for rec in GFF.parse(in_handle):
             pass
     wanted_name = ["CDS:NC_000083.5:LOC100040603"]
     tested = False
     for sub_top in rec.features[0].sub_features:
         for sub in sub_top.sub_features:
             if sub.qualifiers.get("Name", "") != wanted_name:
                 continue
             tested = True
             assert len(sub.qualifiers["Parent"]) == 1
     assert tested, "Did not find sub-feature to test"
示例#35
0
def get_features_from_file(handle: IO) -> Dict[str, List[SeqFeature]]:
    """ Generates new SeqFeatures from a GFF file.

        CDS features are used directly; any other top-level feature is
        expanded with check_sub and skipped if that yields nothing. Each
        resulting feature gets a 'gene' qualifier taken from the first of
        the parent's 'gene', 'name' or 'Name' qualifiers that contains no
        space, falling back to the parent's id. When a parent yields
        several features, the name is suffixed with '_<index>'. The parent's
        'locus_tag' qualifier, if present, is copied across.

        Arguments:
            handle: a file handle/stream with the GFF contents

        Returns:
            a dictionary mapping record ID to a list of SeqFeatures for that record

        Raises:
            AntismashInputError: if the GFF contents cannot be parsed
    """
    try:
        gff_records = list(GFF.parse(handle))
    except Exception as err:
        raise AntismashInputError("could not parse records from GFF3 file") from err

    results = {}
    for gff_record in gff_records:
        collected = []
        for feature in gff_record.features:
            if feature.type == 'CDS':
                candidates = [feature]
            else:
                candidates = check_sub(feature)
            if not candidates:
                continue

            name = feature.id
            locus_tag = feature.qualifiers.get("locus_tag")

            for qtype in ["gene", "name", "Name"]:
                if qtype not in feature.qualifiers:
                    continue
                candidate_name = feature.qualifiers[qtype][0]
                # Assume name/Name to be sane if they don't contain a space
                if " " not in candidate_name:
                    name = candidate_name
                    break

            needs_suffix = len(candidates) > 1
            for index, candidate in enumerate(candidates):
                variant = "{0}_{1}".format(name, index) if needs_suffix else name
                candidate.qualifiers['gene'] = [variant]
                if locus_tag is not None:
                    candidate.qualifiers["locus_tag"] = locus_tag
                collected.append(candidate)
        results[gff_record.id] = collected
    return results
示例#36
0
def runbarrnap(genome,outfilePath = '.'):
    '''
    Run barrnap on a genome and save the detected rRNA sequences as fasta.

    The genome is first copied (gunzipped when it is a gzip archive) to
    "asmID.tmp" in the current working directory, so write permission there
    is assumed. The temporary copy is removed again before returning.

    :param genome: path to genome assumes assembly file name ("_genomic.fna.gz")
    :param outfilePath: directory receiving "asmID_rRNA.fasta"; every record is
    named "asmID:ribosomalSubunit:score:start-end:dir:partial|full"
    :return: True if successfully run, false if not
    '''
    if not os.path.isfile(genome):
        return False

    fileName = os.path.split(genome)[1]
    asmID = fileName.split('.')[0]
    outfileName = asmID + '_rRNA.fasta'
    tempGenome = asmID + '.tmp'
    try:
        # gzip hands back bytes, so the copy has to be written in binary mode
        with gzip.open(genome) as in_file, open(tempGenome,'wb') as out_file:
            shutil.copyfileobj(in_file,out_file)
    except IOError:
        # not a gzip archive: use the genome file as it is
        shutil.copy(genome, tempGenome)

    command = ['barrnap','--incseq',tempGenome]
    try:
        # text-mode pipe so the GFF parser receives str lines, not bytes
        barrnapProc = subprocess.Popen(command,stdout=subprocess.PIPE,
                                       universal_newlines=True)
        seqs = []
        for seq in GFF.parse(barrnapProc.stdout):
            for feat in seq.features:
                rnaSeq = feat.extract(seq)
                rSU = feat.qualifiers['name'][0]
                score = float(feat.qualifiers['score'][0])
                start = feat.location.start
                end = feat.location.end
                strand = feat.location.strand
                # features carrying a note are labelled partial, all others full
                if 'note' in feat.qualifiers:
                    partial = 'partial'
                else:
                    partial = 'full'
                header = "{}:{}:{}:{}-{}:{}:{}".format(asmID,rSU,score,start,end,strand,partial)
                rnaSeq.name = header
                rnaSeq.id = header
                rnaSeq.description = ''
                seqs.append(rnaSeq)
        barrnapProc.stdout.close()
        # reap the child process and remember whether it exited cleanly
        exitCode = barrnapProc.wait()
        with open(os.path.join(outfilePath,outfileName),'w') as outfile:
            SeqIO.write(seqs,outfile,'fasta')
    except Exception as e:
        print(e)
        return False
    finally:
        os.remove(tempGenome)
    return exitCode == 0
示例#37
0
def repair(fasta, gff3):
    """Rewrite fasta sequences at the features of a GFF3 file and print them.

    Only sequences whose ID has a GFF3 record are processed and written.
    For every feature but the first of such a record, the three bases at the
    CDS start are replaced with the output of break_start when that differs.
    Otherwise the SD region (with each boundary that is not a multiple of
    three adjusted via next_first_frame) is replaced with the output of
    break_sd, again only when that differs. The result is written to stdout
    in fasta format.
    """
    records_by_id = {record.id: record for record in GFF.parse(gff3)}

    repaired = []
    for seq in SeqIO.parse(fasta, "fasta"):
        if seq.id not in records_by_id:
            continue

        # the first feature is the full one, so it is left out
        for feat in records_by_id[seq.id].features[1:]:
            cds, sd = get_CDS_and_SD(feat)
            codon_begin = cds.location.start
            start_codon = seq.seq[codon_begin:codon_begin + 3]
            new_codon = break_start(start_codon)

            # try to break start sequence while keeping amino acid the same
            if start_codon != new_codon:
                seq.seq = (seq.seq[0:codon_begin] + new_codon +
                           seq.seq[codon_begin + 3:])
                continue

            # start could not be changed, so the SD has to be broken instead
            sd_begin = sd.location.start
            sd_finish = sd.location.end
            shift_begin = next_first_frame(sd_begin, 1) if sd_begin % 3 != 0 else 0
            shift_finish = next_first_frame(sd_finish, -1) if sd_finish % 3 != 0 else 0
            window_begin = sd_begin - shift_begin
            window_finish = sd_finish - shift_finish

            sd_window = seq.seq[window_begin:window_finish]
            new_window = break_sd(sd_window)
            if sd_window != new_window:
                seq.seq = (seq.seq[0:window_begin] + new_window +
                           seq.seq[window_finish:])

        repaired.append(seq)

    SeqIO.write(repaired, sys.stdout, "fasta")
示例#38
0
def main(seqFilepath, gffFilepath, outFilepath):
    """Sum overlap lengths per relative lane over a range of ORF length thresholds.

    Sequences are read from the fasta file and the CDS records of the GFF
    file are grouped by sequence. For each threshold from 50 to 1000 in steps
    of 50, the ORFs at least that long are compared with the CDSs and the
    overlap lengths ("oend" - "ostart") are accumulated per "relLane" column.
    The resulting table is written as CSV.

    Arguments:
        seqFilepath: path of the fasta file
        gffFilepath: path of the GFF file
        outFilepath: path of the CSV file to write
    """
    # load fasta
    seqRec_lst = list(SeqIO.parse(seqFilepath, "fasta"))
    seqName_lst = [seqRec.id for seqRec in seqRec_lst]
    print("LOADED {} seqs from {}".format(len(seqRec_lst), seqFilepath))

    # name -> position lookup, built once instead of scanning the list for
    # every GFF line; setdefault keeps the first occurrence like list.index
    idx_by_name = {}
    for idx, seqName in enumerate(seqName_lst):
        idx_by_name.setdefault(seqName, idx)

    # load gff and distribute CDS
    cds_lstlst = [[] for _ in seqName_lst]
    with open(gffFilepath) as f:
        for rec in GFF.parse(f, target_lines=1):
            assert len(rec.features) == 1
            if rec.features[0].type != "CDS":
                continue
            idx = idx_by_name.get(rec.id)
            # records of sequences missing from the fasta file are ignored
            if idx is not None:
                cds_lstlst[idx].append(rec)

    for idx, seqName in enumerate(seqName_lst):
        print("\tLOADED {0} CDSs in {1}".format(len(cds_lstlst[idx]), seqName))

    thres_lst = list(range(50, 1000 + 1, 50))
    columns = ["+1", "+2", "+3", "-1", "-2", "-3"]
    out_mat = np.zeros((len(thres_lst), len(columns))).astype(int)

    for seqRec, cds_lst, seqName in zip(seqRec_lst, cds_lstlst, seqName_lst):
        orf_df = get_orf_df(seqRec)

        for i, thres in enumerate(thres_lst):
            filtered_df = orf_df[orf_df["length"] >= thres]
            pos_lst = get_pos_lst(cds_lst, filtered_df)
            overlap_dctdct = get_overlap_dctdct(pos_lst)

            for dct in overlap_dctdct.values():
                out_mat[i, columns.index(dct["relLane"])] += (dct["oend"] -
                                                              dct["ostart"])
        print("\tDONE with {}".format(seqName))

    out_df = pd.DataFrame(out_mat, columns=columns)
    out_df["thres"] = thres_lst
    out_df = out_df[["thres"] + columns]
    out_df.to_csv(outFilepath, index=False)
    print("OUTPUT to {}".format(outFilepath))
示例#39
0
    def __init__(self, fasta, gtf):
        """Load the chromosomes from a FASTA file and attach the CDS features of a GTF file.

        Arguments:
            fasta: FASTA file, passed to flexi_open
            gtf: GTF file, passed to flexi_open
        """
        self.fasta = fasta
        self.gtf = gtf

        sys.stderr.write("Reading FASTA file...\n")
        with flexi_open(fasta, 'rU') as fasta_handle:
            chromosomes = SeqIO.to_dict(SeqIO.parse(fasta_handle, 'fasta'))

        self.intervaldict = {}

        sys.stderr.write("Reading GTF file (this will take some time)...\n")
        cds_only = {'gff_type': ('CDS', )}
        with flexi_open(gtf, 'r') as gtf_handle:
            parsed = GFF.parse(gtf_handle,
                               limit_info=cds_only,
                               base_dict=chromosomes)
            for rec in parsed:
                # Fix strand info: the GFF parser can leave the strand of a
                # top-level CDS unset, in which case the first strand defined
                # on a subfeature is used.
                # Note: may be bad assumption in weird species (ciliates??).
                for feature in rec.features:
                    if not hasattr(feature, 'strand'):
                        continue
                    if feature.strand is not None:
                        continue
                    inherited = next(
                        (sub.strand for sub in get_subfeatures(feature)
                         if sub.strand is not None),
                        None)
                    if inherited is not None:
                        feature.strand = inherited

                # The parsed record replaces the plain FASTA record, so each
                # chromosome now carries features whose extract method gives
                # access to the underlying DNA sequence. See feature.qualifiers
                # for the IDs (gene name, ID etc) associated with a feature.
                chromosomes[rec.id] = rec

                # Interval trees are needed to identify affected CDS features.
                self._index_record_in_intervaldict(rec)

        self.chromosomes = chromosomes

        self._precompute_chrlens()

        sys.stderr.write("Object initialisation complete.\n")
def record_with_extracted_annotations_generator(
        gff_file, white_list_of_annotation_types):
    """Yield copies of GFF records reduced to the selected annotations.

    A top-level feature is kept when its ID is in annotation_ids (a name
    taken from the enclosing scope, not a parameter) and its type is in
    white_list_of_annotation_types. Records without any kept feature are
    not yielded.

    Arguments:
        gff_file: path of the GFF file to read
        white_list_of_annotation_types: container of accepted feature types
    """
    # the with block closes the file once the generator finishes or is closed
    with open(gff_file) as gff_handle:
        for record in GFF.parse(gff_handle):
            new_record = deepcopy(record)
            new_record.features = [
                feature for feature in record.features
                if feature.id in annotation_ids
                and feature.type in white_list_of_annotation_types
            ]
            if new_record.features:
                yield new_record
示例#41
0
    def do_import(self):
        """Import every record of the GFF/FASTA file as a fragment of the genome.

        The file is closed and the debug cursor is switched back on even when
        importing a record raises.
        """
        with open(self.__gff_fasta_fn) as in_handle:
            # In DEBUG=True mode, Django keeps list of queries and blows up
            # memory usage when doing a big import. The following line
            # disables this logging.
            connection.use_debug_cursor = False
            try:
                for rec in GFF.parse(in_handle):
                    f = GFFFragmentImporter(rec).do_import()
                    self.__genome.genome_fragment_set.create(fragment=f,
                                                             inherited=False)
            finally:
                # Be nice and turn debug cursor back on
                connection.use_debug_cursor = True
示例#42
0
def __get_features(child, interpro=False):
    """Group the top-level features of a GFF file by the ID they belong to.

    The key is the ID of the record holding the feature. With interpro set,
    'polypeptide' features are left out and everything up to and including
    the first underscore of the record ID is dropped from the key.

    Returns a dictionary mapping each key to its list of features.
    """
    grouped = {}
    for rec in GFF.parse(child):
        for feature in rec.features:
            if interpro and feature.type == 'polypeptide':
                continue

            key = rec.id
            if interpro and '_' in key:
                key = key.split('_', 1)[1]

            grouped.setdefault(key, []).append(feature)
    return grouped
示例#43
0
def FindInitSites():
    """Scan both strands of the first record of the GFF file and write the results.

    The plus strand is scanned on the record sequence, the minus strand on
    its reverse complement with the density counts and gene plot reversed.
    The rows returned by INIT_scan are collected below a header row and
    handed to writelisttoexcel.
    """
    GFFgen = GFF.parse(
        '/users/buskirk/documents/profiling/GFF/MG1655/coli3.gff')
    # the builtin next() works on Python 2 and 3, the .next() method only on 2
    chrom = next(GFFgen)
    f_genome = chrom.seq
    r_genome = chrom.seq.reverse_complement()

    ORFlist = []
    ORFlist.append([
        'start', 'stop', 'retapa', 'onc112', 'codon1st', 'codonlast',
        'peptide', 'strand'
    ])

    pathi = "/users/buskirk/documents/profiling/projects/sORFs/wigfiles/"

    # for the plus strand
    f1 = 'retapa'  #retapamulin data from Shura Mankin
    density_filestring1 = pathi + f1
    counts_f1 = readwig(density_filestring1 + "_plus")

    f2 = 'onc'  #Onc112 data
    density_filestring2 = pathi + f2
    counts_f2 = readwig(density_filestring2 + "_plus")

    gp_plus = geneplot(chrom, 1)  # 1 is plus

    plusORFs = INIT_scan(f_genome, gp_plus, counts_f1, counts_f2, 'plus')
    ORFlist.extend(plusORFs)

    # for the minus strand
    counts_f1 = readwig(density_filestring1 + "_minus")
    counts_f1.reverse()

    counts_f2 = readwig(density_filestring2 + "_minus")
    counts_f2.reverse()

    gp_minus = geneplot(chrom, -1)  # -1 is minus
    gp_minus.reverse()

    minusORFs = INIT_scan(r_genome, gp_minus, counts_f1, counts_f2, 'minus')
    ORFlist.extend(minusORFs)

    writelisttoexcel(
        ORFlist,
        '/users/buskirk/documents/profiling/projects/sORFs/init_sites')
示例#44
0
def get_features_from_file(
    seq_record,
    handle,
    limit_to_seq_id: Union[bool, Dict[str, List[str]]] = False
) -> List[SeqFeature]:
    """ Builds SeqFeatures for a Record out of a GFF file.

        Top-level CDS features are used directly, any other feature is handed
        to check_sub together with the record and skipped when that yields
        nothing. Each resulting feature receives a 'gene' qualifier (suffixed
        with its index when several features came from one parent) and the
        parent's locus_tag, if it had one.

        Arguments:
            seq_record: the record that features belong to
            handle: the GFF input handed to GFF.parse
            limit_to_seq_id: False or a dictionary of GFF.parse options

        Returns:
            a list of SeqFeatures parsed from the GFF file
    """
    collected = []
    for gff_record in GFF.parse(handle, limit_info=limit_to_seq_id):
        for parent in gff_record.features:
            if parent.type == 'CDS':
                derived = [parent]
            else:
                derived = check_sub(parent, seq_record)
            if not derived:
                continue

            locus_tag = parent.qualifiers.get("locus_tag")

            # Default to the feature ID, then prefer the first of
            # gene/name/Name whose value contains no space
            label = parent.id
            for key in ("gene", "name", "Name"):
                if key not in parent.qualifiers:
                    continue
                proposed = parent.qualifiers[key][0]
                if " " not in proposed:
                    label = proposed
                    break

            several = len(derived) > 1
            for position, derived_feature in enumerate(derived):
                gene_name = "{0}_{1}".format(label, position) if several else label
                derived_feature.qualifiers['gene'] = [gene_name]
                if locus_tag is not None:
                    derived_feature.qualifiers["locus_tag"] = locus_tag
                collected.append(derived_feature)
    return collected
示例#45
0
def __get_features(child, interpro=False):
    """Collect the top-level features of a GFF file, keyed by record ID.

    Every record ID is logged while parsing. With interpro set,
    "polypeptide" features are skipped and the key is the record ID with
    everything up to and including its first underscore removed.

    Returns a dictionary mapping each key to the list of its features.
    """
    features_by_parent = {}
    for rec in GFF.parse(child):
        log.info("Parsing %s", rec.id)
        for feature in rec.features:
            if interpro and feature.type == "polypeptide":
                continue

            parent_id = rec.id
            if interpro and "_" in parent_id:
                parent_id = parent_id.split("_", 1)[1]

            if parent_id in features_by_parent:
                features_by_parent[parent_id].append(feature)
            else:
                features_by_parent[parent_id] = [feature]
    return features_by_parent
示例#46
0
def examine(gff_file, fasta_file):
    """Print every top-level GFF feature as a fasta entry.

    The header of each entry is "record id|Name|product" and the sequence
    is extracted from the fasta sequence that has the record's ID.

    Arguments:
        gff_file: path of the GFF file
        fasta_file: path of the fasta file holding the sequences
    """
    # both files are closed again even when a feature lacks a qualifier
    with open(gff_file) as gff_handle, open(fasta_file) as fasta_handle:
        fasta_dict = SeqIO.to_dict(SeqIO.parse(fasta_handle, "fasta"))

        for rec in GFF.parse(gff_handle):
            for one_feature in rec.features:
                locus = one_feature.qualifiers["Name"][0]
                product = one_feature.qualifiers["product"][0]
                seq = one_feature.extract(fasta_dict[rec.id].seq)
                out = ">%s|%s|%s\n%s" % (rec.id, locus, product, seq)
                # the call form prints the same text on Python 2 and 3
                print(out)
示例#47
0
文件: uniprot.py 项目: feiranl/ssbio
    def features(self):
        """list: Features read from the feature file, else the metadata file, else memory"""
        if self.feature_file:
            log.debug('{}: reading features from feature file {}'.format(self.id, self.feature_path))
            with open(self.feature_path) as gff_handle:
                parsed_records = list(GFF.parse(gff_handle))
            if len(parsed_records) <= 1:
                return parsed_records[0].features
            # more than one record: only a warning is logged, nothing is returned
            log.warning('Too many sequences in GFF')
            return None

        if self.metadata_file:
            log.debug('{}: reading features from metadata file {}'.format(self.id, self.metadata_path))
            return SeqIO.read(self.metadata_path, 'uniprot-xml').features

        return self._features
示例#48
0
    def generator(gff_file, fasta_dict, exon_fd):
        """Yield one sequence record per gene and log transcripts on the way.

        Every "transcript" feature is printed together with its sub-features
        and its ID, location and sub-features are written to exon_fd. Every
        "gene" feature is extracted from the fasta record with the GFF
        record's ID, renamed to its gene_id and yielded with an empty
        description.
        """
        with open(gff_file, "r") as gff_fd:
            for record in GFF.parse(gff_fd, target_lines=100000):
                for feature in record.features:
                    if feature.type == "transcript":
                        print(feature)
                        print(feature.sub_features)
                        exon_fd.write(feature.id + "\n")
                        exon_fd.write(str(feature.location))
                        exon_fd.write("\n")
                        exon_fd.write(str(feature.sub_features) + "\n")
                    elif feature.type == "gene":
                        gene_record = feature.extract(fasta_dict[record.id])
                        gene_record.id = feature.qualifiers["gene_id"][0]
                        gene_record.description = ""
                        yield gene_record
示例#49
0
def tableify(gff3, fasta):
    """Print a tab separated table of internal starts per sequence.

    Each printed line holds the sequence ID, the number of features of its
    GFF3 record minus one, and a comma separated list of the start codon
    positions that differ from 1. Sequences without a GFF3 record are not
    printed.

    Arguments:
        gff3: GFF3 input handed to GFF.parse
        fasta: fasta input handed to SeqIO.parse
    """
    names = {fasta_rec.id: [] for fasta_rec in SeqIO.parse(fasta, 'fasta')}

    for gff_rec in GFF.parse(gff3):
        # number of internal starts
        names[gff_rec.id].append(str(len(gff_rec.features) - 1))
        starts = []
        for feat in gff_rec.features:
            # floor division keeps the position an integer on Python 3 too
            feat_start = (feat.location.start - 9) // 3 + 1
            # compare by value: identity of equal integers is not guaranteed
            if feat_start != 1:  # start codon position of each internal start
                starts.append(str(feat_start))
        names[gff_rec.id].append(starts)

    for n in sorted(names):
        if names[n]:
            print('\t'.join([n, names[n][0], ', '.join(names[n][1])]))
示例#50
0
 def t_basic_attributes(self):
     """Parse out basic attributes of GFF2 from Ensembl GTF.

     Only snoRNA exon lines are parsed; the single resulting feature of
     record 'I' must carry the expected qualifier keys and values.
     """
     limit_info = dict(
             gff_source_type = [('snoRNA', 'exon')]
             )
     rec_dict = SeqIO.to_dict(GFF.parse(self._ensembl_file,
         limit_info=limit_info))
     work_rec = rec_dict['I']
     assert len(work_rec.features) == 1
     test_feature = work_rec.features[0]
     # sorted() copes with keys() being a list (Python 2) or a view (Python 3)
     qual_keys = sorted(test_feature.qualifiers.keys())
     assert qual_keys == ['Parent', 'exon_number', 'gene_id', 'gene_name',
             'source', 'transcript_id', 'transcript_name']
     assert test_feature.qualifiers['source'] == ['snoRNA']
     assert test_feature.qualifiers['transcript_name'] == ['NR_001477.2']
     assert test_feature.qualifiers['exon_number'] == ['1']
示例#51
0
    def ParseRecord(self, cn):
        """Yield a WebApolloSeqRecord for each record of the GFF3 export of organism cn.

        The organism is looked up by its common name and selected as the
        current sequence before the export is requested.
        """
        org = self._wa.organisms.findOrganismByCn(cn)
        self._wa.annotations.setSequence(org['commonName'], org['id'])

        export_options = {
            'exportType': 'GFF3',
            'seqType': 'genomic',
            'exportAllSequences': False,
            'exportGff3Fasta': True,
            'output': "text",
            'exportFormat': "text",
            'sequences': cn,
        }
        exported_text = self._wa.io.write(**export_options)

        data = io.StringIO(exported_text)
        data.seek(0)

        for record in GFF.parse(data):
            yield WebApolloSeqRecord(record, self._wa)
示例#52
0
def basic_parsing(gffFile):
    """ Parse a GFF3 file and yield a new SeqFeature per top-level feature.

    Each yielded feature takes over type, location, strand and qualifiers
    of the parsed feature and gets its own list of the same sub-features.
    """
    with open(gffFile) as gff_handle:
        for parsed_record in GFF.parse(gff_handle):
            for parsed_feature in parsed_record.features:
                # new list object holding the same sub-features
                children = list(parsed_feature.sub_features)
                yield SeqFeature(
                    type=parsed_feature.type,
                    location=parsed_feature.location,
                    strand=parsed_feature.strand,
                    qualifiers=parsed_feature.qualifiers,
                    sub_features=children,
                )
示例#53
0
def require_shinefind(gff3, fasta):
    """Yield the GFF3 records reduced to genes with an SD site upstream.

    For every gene the CDS sub-feature with the lowest start is tested with
    NaiveSDCaller.testFeatureUpstream (sd_min=5, sd_max=15). When at least
    one site is found, the first feature built from the sites is appended to
    the gene's sub-features and the gene is kept. Each record is yielded
    with its features replaced by the kept genes.
    """
    sd_finder = NaiveSDCaller()
    # Load up sequence(s) for GFF3 data
    seq_dict = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))

    for record in GFF.parse(gff3, base_dict=seq_dict):
        # materialise the genes first, their sub-features are extended below
        gene_features = list(
            feature_lambda(record.features,
                           feature_test_type, {"type": "gene"},
                           subfeatures=True))
        genes_with_sd = []
        for gene in gene_features:
            cds_by_start = sorted(
                feature_lambda(
                    gene.sub_features,
                    feature_test_type,
                    {"type": "CDS"},
                    subfeatures=False,
                ),
                key=lambda candidate: candidate.location.start,
            )
            if len(cds_by_start) == 0:
                continue

            first_cds = cds_by_start[0]
            sds, start, end, seq = sd_finder.testFeatureUpstream(first_cds,
                                                                 record,
                                                                 sd_min=5,
                                                                 sd_max=15)
            if len(sds) < 1:
                continue

            sd_features = sd_finder.to_features(sds,
                                                gene.location.strand,
                                                start,
                                                end,
                                                feature_id=gene.id)
            gene.sub_features.append(sd_features[0])
            genes_with_sd.append(gene)

        record.features = genes_with_sd
        yield record
示例#54
0
def FindrRNA(path, tempFile):
    """Write the 23S and 16S rRNA sequences annotated in a GFF file as fasta.

    A feature is selected when the first value of its "product" qualifier
    contains '23S ribosomal RNA' or '16S ribosomal RNA'. Its sequence is
    written to a file named after tempFile (without its last four
    characters) plus "_23S_rRNA.fasta" or "_16S_rRNA.fasta", so a later
    match of the same kind overwrites an earlier one. limit_info is taken
    from the enclosing scope, it is not a parameter.

    Arguments:
        path: path of the GFF file to read
        tempFile: file name from which the record ID and output names are built
    """
    out_prefix = "/home/junyuchen/Lab/16S-Prediction/" + tempFile[:-4]

    # the with block closes the file even when a feature has no product
    with open(path) as in_handle:
        for record in GFF.parse(in_handle, limit_info=limit_info):
            for feature in record.features:
                product = feature.qualifiers["product"][0]
                if '23S ribosomal RNA' in product:
                    suffix = "_23S_rRNA.fasta"
                elif '16S ribosomal RNA' in product:
                    suffix = "_16S_rRNA.fasta"
                else:
                    continue

                description = str(record.id + '-' + feature.id + '-' +
                                  product + '-' + str(len(record.seq)))
                rrna = SeqRecord(feature.location.extract(record.seq),
                                 id=tempFile[:-4],
                                 description=description)
                SeqIO.write(rrna, out_prefix + suffix, "fasta")
示例#55
0
def main(fasta, gff3):
    """Count the first three bases of every CDS and print the counts as a table.

    Arguments:
        fasta: fasta input holding the sequences of the GFF3 records
        gff3: GFF3 input handed to GFF.parse
    """
    seq_dict = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))

    codon_usage = {}

    for rec in GFF.parse(gff3, base_dict=seq_dict):
        for feat in feature_lambda(rec.features,
                                   feature_test_type, {"type": "CDS"},
                                   subfeatures=True):
            seq = str(feat.extract(rec).seq)[0:3]
            codon_usage[seq] = codon_usage.get(seq, 0) + 1

    # TODO: print all actg combinations? Or just ones that are there
    # the call form prints the same text on Python 2 and 3
    print("# Codon\tCount")
    for key in sorted(codon_usage):
        print("\t".join((key, str(codon_usage[key]))))
示例#56
0
def get_SNP_regions(my_genes, in_file="gencode.v24.annotation.gff3"):
    """Return the regions of the genes in the GFF3 file whose name is in my_genes.

    Arguments:
        my_genes: iterable of gene names; a name listed more than once
            produces one region per listing
        in_file: path of the GFF3 file to read

    Returns:
        a list of dictionaries with the keys 'chr' (record ID), 'start' and
        'end' (location of the gene feature)
    """
    regions = []
    # the with block closes the file even when a gene has no gene_name
    with open(in_file) as in_handle:
        for rec in GFF.parse(in_handle, target_lines=130):
            for x in rec.features:
                if x.type != 'gene':
                    continue

                for y in my_genes:
                    if y == x.qualifiers['gene_name'][0]:
                        regions.append({
                            'chr': rec.id,
                            'start': x.location.start,
                            'end': x.location.end
                        })

    return regions
示例#57
0
def load_annotations(target_gff):
    """Loads genome annotations from specified GFF(s).

    Arguments:
        target_gff: path of the GFF file to read

    Returns:
        a dictionary mapping record ID to record, holding only the records
        that have at least one feature
    """
    # For TriTrypDB 29 and above, there are no longer chromosome entries
    # in the GFF files, so any entry with features is accepted.
    # The with block closes the file even when parsing fails.
    with open(target_gff) as annotations_fp:
        return {
            entry.id: entry
            for entry in GFF.parse(annotations_fp)
            if len(entry.features) > 0
        }
示例#58
0
    def record_with_extracted_transcripts_generator(gff_file,
                                                    transcript_ids):
        """Yield copies of GFF records reduced to the selected transcripts.

        A top-level "mRNA" or "transcript" feature is kept when its ID is in
        transcript_ids. A "gene" feature is kept as a copy holding only its
        selected "mRNA"/"transcript" sub-features, and dropped when none is
        selected. Records without any kept feature are not yielded.

        Arguments:
            gff_file: path of the GFF file to read
            transcript_ids: container of the transcript IDs to keep
        """
        transcript_types = ("mRNA", "transcript")
        # the with block closes the file once the generator finishes or is closed
        with open(gff_file) as gff_handle:
            for record in GFF.parse(gff_handle):
                new_record = deepcopy(record)
                new_record.features = []
                for feature in record.features:
                    if feature.type in transcript_types and feature.id in transcript_ids:
                        new_record.features.append(feature)
                    elif feature.type == "gene":
                        new_feature = deepcopy(feature)
                        new_feature.sub_features = [
                            subfeature for subfeature in feature.sub_features
                            if subfeature.type in transcript_types
                            and subfeature.id in transcript_ids
                        ]
                        if new_feature.sub_features:
                            new_record.features.append(new_feature)
                if new_record.features:
                    yield new_record
示例#59
0
def run(sequence, options):
    """Append the CDS features of a GFF3 file to a sequence record.

    Top-level CDS features are used directly, any other feature is handed to
    check_sub and skipped when that yields nothing. Each resulting feature
    receives a 'gene' qualifier (suffixed with its index when several
    features came from one parent) and the parent's locus_tag, if it had one.

    Arguments:
        sequence: the record whose features list is extended
        options: object providing gff3 (path of the GFF3 file) and
            single_entries

    Raises:
        ValueError: when a feature ID is longer than 40 characters
    """
    # the with block closes the file, also when a feature ID is rejected
    with open(options.gff3) as handle:
        # If there's only one sequence in both, read all, otherwise, read only appropriate part of GFF3.
        if options.single_entries:
            limit_info = False
        else:
            limit_info = dict(gff_id=[sequence.id])

        for record in GFF.parse(handle, limit_info=limit_info):
            for feature in record.features:
                if feature.type == 'CDS':
                    new_features = [feature]
                else:
                    new_features = check_sub(feature, sequence)
                    if not new_features:
                        continue

                name = feature.id
                if len(name) > 40:
                    raise ValueError(
                        "Feature ID too long, < 40 characters required: %s" % name)
                locus_tag = None
                if "locus_tag" in feature.qualifiers:
                    locus_tag = feature.qualifiers["locus_tag"]

                for qtype in ["gene", "name", "Name"]:
                    if qtype in feature.qualifiers:
                        name_tmp = feature.qualifiers[qtype][0]
                        # Assume name/Name to be sane if they don't contain a space
                        if " " in name_tmp:
                            continue
                        name = name_tmp
                        break

                for i, n in enumerate(new_features):
                    variant = name
                    if len(new_features) > 1:
                        variant = "{0}_{1}".format(name, i)
                    n.qualifiers['gene'] = [variant]
                    if locus_tag is not None:
                        n.qualifiers["locus_tag"] = locus_tag
                    sequence.features.append(n)
示例#60
0
def require_shinefind(gff3, fasta):
    """Yield the GFF3 records reduced to genes with an SD site upstream.

    For every gene the first CDS sub-feature returned by feature_lambda is
    tested with NaiveSDCaller.testFeatureUpstream (sd_min=5, sd_max=15).
    When at least one site is found, the first feature built from the sites
    is appended to the gene's sub-features and the gene is kept. Each record
    is yielded with its features replaced by the kept genes.
    """
    sd_finder = NaiveSDCaller()
    # Load up sequence(s) for GFF3 data
    seq_dict = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))

    for record in GFF.parse(gff3, base_dict=seq_dict):
        # materialise the genes first, their sub-features are extended below
        gene_features = list(
            feature_lambda(record.features,
                           feature_test_type, {'type': 'gene'},
                           subfeatures=True))
        kept_genes = []
        for gene in gene_features:
            gene_cdss = list(
                feature_lambda(gene.sub_features,
                               feature_test_type, {'type': 'CDS'},
                               subfeatures=False))
            if len(gene_cdss) == 0:
                continue

            # only the first CDS is considered
            leading_cds = gene_cdss[0]

            sds, start, end, seq = sd_finder.testFeatureUpstream(leading_cds,
                                                                 record,
                                                                 sd_min=5,
                                                                 sd_max=15)
            if len(sds) < 1:
                continue

            sd_features = sd_finder.to_features(sds,
                                                gene.location.strand,
                                                start,
                                                end,
                                                feature_id=gene.id)
            gene.sub_features.append(sd_features[0])
            kept_genes.append(gene)

        record.features = kept_genes
        yield record