def rebase(parent, child, interpro=False, protein2dna=False):
    child_features = __get_features(child, interpro=interpro)
    for rec in GFF.parse(parent):
        # TODO: replace with recursion in case it's matched against a
        # non-parent feature. We're cheating a bit here right now...
        replacement_features = []
        for feature in rec.features:
            if feature.id in child_features:
                new_subfeatures = child_features[feature.id]
                # TODO: update starts
                fixed_subfeatures = []
                for x in new_subfeatures:
                    # Then update the location of the actual feature
                    __update_feature_location(x, feature, protein2dna)
                    if interpro:
                        for y in ('status', 'Target'):
                            try:
                                del x.qualifiers[y]
                            except KeyError:
                                pass
                    fixed_subfeatures.append(x)
                replacement_features.extend(fixed_subfeatures)
        # We do this so we don't include the original set of features that we
        # were rebasing against in our result.
        rec.features = replacement_features
        GFF.write([rec], sys.stdout)
def to_gff(self, filename):
    """
    Export to GFF format, saving to the specified filename.
    """
    records = []
    for fragment in self.__genome.fragments.all():
        fragment = fragment.indexed_fragment()
        seq = Seq(fragment.sequence)
        rec = SeqRecord(seq, "%s" % (fragment.name,))
        features = []
        for annotation in fragment.annotations():
            # FeatureLocation first bp is AfterPosition, so -1
            loc = FeatureLocation(annotation.base_first - 1,
                                  annotation.base_last)
            qualifiers = {'name': annotation.feature.name}
            feature = SeqFeature(loc, type=annotation.feature.type,
                                 strand=1, qualifiers=qualifiers)
            features.append(feature)
        rec.features = features
        records.append(rec)

    with open(filename, "w") as out_handle:
        GFF.write(records, out_handle, include_fasta=True)
def CpGIslandsToGFF(island_location):
    # Output methylation regions (namely CpG islands) to a GFF3-compliant
    # file. Note: 'base' and 'cur_record' come from the enclosing scope.
    out_file = os.path.join(os.getcwd(), os.path.splitext(base)[0] + '.gff')
    seq = cur_record.seq
    rec = SeqRecord(seq, "ID1")
    qualifiers = {"source": "bssimulation", "score": '.',
                  "ID": cur_record.name}
    sub_qualifiers = {"source": "bssimulation"}
    top_feature = SeqFeature(FeatureLocation(0, len(cur_record)),
                             type="region", strand=0, qualifiers=qualifiers)
    for i in island_location:
        begin = int(i[0] - i[1] / 2)
        end = int(i[0] + i[1] / 2)
        top_feature.sub_features.append(
            SeqFeature(FeatureLocation(begin, end), type="CpG_island",
                       strand=0, qualifiers=sub_qualifiers))
    rec.features = [top_feature]
    with open(out_file, "w") as out_handle:
        GFF.write([rec], out_handle)
def to_gff_file(self, file):
    """
    Export to GFF format, saving to the provided file-like object.
    """
    records = []
    for fragment in self.__genome.fragments.all():
        fragment = fragment.indexed_fragment()
        seq = Seq(fragment.sequence)
        rec = SeqRecord(seq, "%s" % (fragment.name,))
        features = []
        for annotation in fragment.annotations():
            # FeatureLocation first bp is AfterPosition, so -1
            loc = FeatureLocation(annotation.base_first - 1,
                                  annotation.base_last)
            qualifiers = {'name': annotation.feature.name}
            strand = annotation.feature.strand
            feature = SeqFeature(loc, type=annotation.feature.type,
                                 strand=0 if strand is None else strand,
                                 qualifiers=qualifiers)
            features.append(feature)
        rec.features = features
        records.append(rec)
    GFF.write(records, file, include_fasta=True)
def t_write_from_recs(self):
    """Write out GFF3 from SeqRecord inputs.
    """
    seq = Seq("GATCGATCGATCGATCGATC")
    rec = SeqRecord(seq, "ID1")
    qualifiers = {"source": "prediction", "score": 10.0,
                  "other": ["Some", "annotations"], "ID": "gene1"}
    sub_qualifiers = {"source": "prediction"}
    top_feature = SeqFeature(FeatureLocation(0, 20), type="gene", strand=1,
                             qualifiers=qualifiers)
    top_feature.sub_features = [
        SeqFeature(FeatureLocation(0, 5), type="exon", strand=1,
                   qualifiers=sub_qualifiers),
        SeqFeature(FeatureLocation(15, 20), type="exon", strand=1,
                   qualifiers=sub_qualifiers)]
    rec.features = [top_feature]
    out_handle = StringIO.StringIO()
    GFF.write([rec], out_handle)
    wrote_info = out_handle.getvalue().split("\n")
    assert wrote_info[0] == "##gff-version 3"
    assert wrote_info[1] == "##sequence-region ID1 1 20"
    assert wrote_info[2].split("\t") == ['ID1', 'prediction', 'gene', '1',
                                         '20', '10.0', '+', '.',
                                         'other=Some,annotations;ID=gene1']
    assert wrote_info[3].split("\t") == ['ID1', 'prediction', 'exon', '1',
                                         '5', '.', '+', '.', 'Parent=gene1']
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser("Convert SAM to GFF3 format using BCBio GFF")
    parser.add_argument("sam_filename")
    parser.add_argument("-i", "--input_fasta", default=None,
                        help="(Optional) input fasta. If given, coverage will be calculated.")
    parser.add_argument("-s", "--source", required=True,
                        help="source name (ex: hg38, mm10)")
    args = parser.parse_args()

    if not args.sam_filename.endswith('.sam'):
        print("Only accepts files ending in .sam. Abort!", file=sys.stderr)
        sys.exit(-1)

    prefix = args.sam_filename[:-4]
    output_gff3 = prefix + '.gff3'

    q_dict = None
    if args.input_fasta is not None:
        q_dict = dict((r.id, len(r.seq))
                      for r in SeqIO.parse(open(args.input_fasta), 'fasta'))

    with open(output_gff3, 'w') as f:
        recs = [convert_sam_rec_to_gff3_rec(r0, args.source)
                for r0 in GMAPSAMReader(args.sam_filename, True,
                                        query_len_dict=q_dict)]
        BCBio_GFF.write(filter(lambda x: x is not None, recs), f)
    print("Output written to {0}.".format(output_gff3), file=sys.stderr)
def rebase(parent, child, interpro=False, protein2dna=False):
    child_features = __get_features(child, interpro=interpro)
    for rec in GFF.parse(parent):
        replacement_features = []
        for feature in feature_lambda(
                rec.features,
                feature_test_qual_value,
                {
                    'qualifier': 'ID',
                    'attribute_list': child_features.keys(),
                },
                subfeatures=False):
            new_subfeatures = child_features[feature.id]
            fixed_subfeatures = []
            for x in new_subfeatures:
                # Then update the location of the actual feature
                __update_feature_location(x, feature, protein2dna)
                if interpro:
                    for y in ('status', 'Target'):
                        try:
                            del x.qualifiers[y]
                        except KeyError:
                            pass
                fixed_subfeatures.append(x)
            replacement_features.extend(fixed_subfeatures)
        # We do this so we don't include the original set of features that we
        # were rebasing against in our result.
        rec.features = replacement_features
        rec.annotations = {}
        GFF.write([rec], sys.stdout)
def load_gff(gff):
    genes = defaultdict(list)
    gene_exon_positions = defaultdict(lambda: defaultdict(tuple))
    try:
        with open(gff) as g:
            for line in g:
                if line.startswith('#') or 'contig' in line:
                    continue
                feature = GFF(line)
                gene_id = get_gene_attribute(feature, "ID")
                if feature.featuretype == 'exon':
                    gene_exon_positions[feature.genename][gene_id] = \
                        (feature.start, feature.end)
                if feature.featuretype == 'CDS':
                    for exon in gene_exon_positions[feature.genename]:
                        e = gene_exon_positions[feature.genename][exon]
                        if e[0] <= feature.start <= e[1] and \
                                e[0] <= feature.end <= e[1]:
                            gene_id = exon + "_CDS"
                    if gene_id is None:
                        print("No gene id for CDS found", feature, end="")
                feature.id = gene_id
                genes[feature.genename].append(feature)
    except IOError:
        print("Failed to load GFF file {}".format(gff))
        sys.exit()
    return genes
def genbank_to_gff(self, genbank_file):
    from Bio import SeqIO
    from BCBio import GFF
    gff_file = "%s.gff" % (os.path.splitext(genbank_file)[0],)
    with open(gff_file, "w") as out_handle:
        GFF.write(SeqIO.parse(genbank_file, "genbank"), out_handle,
                  include_fasta=True)
    return dict(gff_file=gff_file)
def main(gb_file, include_fasta=None):
    out_file = "%s.gff" % os.path.splitext(gb_file)[0]
    inc_fasta = False
    if include_fasta is not None:
        if include_fasta.lower() in ("true", "yes", "1"):
            inc_fasta = True
    with open(out_file, "w") as out_handle:
        GFF.write(SeqIO.parse(gb_file, "genbank"), out_handle, inc_fasta)
def genbank_to_gff(gb_file):
    """Convert GenBank file to GFF for IGV display.
    """
    max_size = 1e4
    gff_file = "%s.gff3" % os.path.splitext(gb_file)[0]
    if not os.path.exists(gff_file):
        with open(gb_file) as in_handle:
            with open(gff_file, "w") as out_handle:
                gb_iterator = SeqIO.parse(in_handle, "genbank")
                GFF.write(_filter_features(gb_iterator, max_size),
                          out_handle)
def embl2gff(dat, org, gff):
    """Parse an EMBL file and extract mature miRNA location information.
    """
    # extract records
    dat_parser = SeqIO.parse(dat, "embl")
    # extract organism-specific miRNAs
    org_mirnas = [mirna for mirna in dat_parser if mirna.name.startswith(org)]
    for mirna in org_mirnas:
        mirna.id = mirna.name
    GFF.write(org_mirnas, gff)
def to_GFF(args):
    """
    Convert a GenBank or EMBL file to GFF.

    Biopython does not natively support GFF output, so BCBio's writer is
    used instead. Can be useful for QUAST (Quality Assessment Tool for
    Genome Assemblies).

    :param args: an argparse args list
    """
    in_type = args.inFormat.lower()
    with open(args.input) as fin, open(args.output, "w") as fout:
        GFF.write(SeqIO.parse(fin, in_type), fout)
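# A hypothetical driver for to_GFF above, sketching the argparse wiring the
# function assumes. The inFormat/input/output attribute names are inferred
# from the function body, not taken from the original source.
from argparse import ArgumentParser


def build_to_gff_parser():
    parser = ArgumentParser(description="Convert a GenBank or EMBL file to GFF")
    parser.add_argument("--inFormat", default="genbank",
                        help="input format: genbank or embl")
    parser.add_argument("--input", required=True, help="input sequence file")
    parser.add_argument("--output", required=True, help="output GFF file")
    return parser


if __name__ == "__main__":
    to_GFF(build_to_gff_parser().parse_args())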
def t_write_seqrecord(self):
    """Write single SeqRecords.
    """
    seq = Seq("GATCGATCGATCGATCGATC")
    rec = SeqRecord(seq, "ID1")
    qualifiers = {"source": "prediction", "score": 10.0,
                  "other": ["Some", "annotations"], "ID": "gene1"}
    rec.features = [SeqFeature(FeatureLocation(0, 20), type="gene", strand=1,
                               qualifiers=qualifiers)]
    out_handle = StringIO.StringIO()
    GFF.write([rec], out_handle, include_fasta=True)
    wrote_info = out_handle.getvalue().split("\n")
    gff_line = wrote_info[2]
    assert gff_line.split("\t")[0] == "ID1"
def t_gff3_to_gff3(self):
    """Read in and write out GFF3 without any loss of information.
    """
    recs = SeqIO.to_dict(GFF.parse(self._test_gff_file))
    out_handle = StringIO.StringIO()
    GFF.write(recs.values(), out_handle)
    wrote_handle = StringIO.StringIO(out_handle.getvalue())
    recs_two = SeqIO.to_dict(GFF.parse(wrote_handle))
    orig_rec = recs.values()[0]
    re_rec = recs_two.values()[0]
    assert len(orig_rec.features) == len(re_rec.features)
    for i, orig_f in enumerate(orig_rec.features):
        assert str(orig_f) == str(re_rec.features[i])
def handle(self, *args, **options):
    organism, created = Organism.objects.get_or_create(
        common_name=options['organism_name'],
        taxon=options['taxon'],
        ebi_id=options['ebi_id']
    )

    for record in SeqIO.parse(options['fasta'], "fasta"):
        refseq, created = RefSeq.objects.get_or_create(
            name=record.id,
            length=len(record.seq),
            organism=organism
        )

    for rec in GFF.parse(options['gff3']):
        rs = RefSeq.objects.get(name=rec.id, organism=organism)
        for feat in rec.features:
            if feat.type != 'gene':
                continue
            gene, created = Gene.objects.get_or_create(
                start=feat.location.start,
                end=feat.location.end,
                strand=feat.location.strand,
                refseq=rs,
                db_object_id=feat.id,
                db_object_symbol=feat.id
            )
def get_gff_dict(gfffile):
    """Creates a dictionary with product information from the given GFF
    file. The dictionary key is the contig id; values are the products for
    that contig."""
    out_dict = {}

    for rec in GFF.parse(gfffile):
        # Add features if there are any
        if len(rec.features) > 0:
            gff_info = None

            # Add all features.
            # Features are separated by ','
            # example:
            # featuretype;product;product,featuretype;product
            # or
            # CDS;protein3;protein31,CDS;protein3
            for f in rec.features:
                if len(f.qualifiers['product']) > 0:
                    # if gff_info is None, do not add ',' separator
                    try:
                        gff_info += ",%s" % ";".join(
                            [f.type] + f.qualifiers['product'])
                    except TypeError:
                        gff_info = ";".join([f.type] + f.qualifiers['product'])

            # Test if there were any features with a product
            if gff_info is None:
                gff_info = "N/A"
        else:
            gff_info = "N/A"

        out_dict[rec.id] = gff_info

    return out_dict
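# A minimal usage sketch for get_gff_dict above (the GFF path is a
# placeholder): the result maps each contig id to a comma-separated
# "type;product" string, or "N/A" when nothing had a product.
if __name__ == "__main__":
    for contig_id, gff_info in sorted(get_gff_dict("annotations.gff").items()):
        print("%s\t%s" % (contig_id, gff_info))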
def gene_to_early_exons(gene_name, num_exons):
    # Initialize variables
    exons = {}
    exonCount = 0
    maxExons = 0

    # Open annotation file
    annotation_file = 'crispr_app/Homo_sapiens.GRCh38.84.gtf'
    limit_info = dict(gff_type=["exon"])
    annotation_handle = open(annotation_file)

    # Parse the annotated data, searching for matching gene names and exons
    strand = ''
    for rec in GFF.parse(annotation_handle, limit_info=limit_info,
                         target_lines=1):
        feature = rec.features[0]
        qualifiers = feature.qualifiers
        # Once a matching gene is found, record the exon regions and
        # chromosome
        if str(qualifiers['gene_name']).strip('[').strip(']').strip('\'') == gene_name:
            chromosome = rec.id
            strand = feature.strand
            # Use only the first version of the gene in the annotated data
            exonNum = str(qualifiers['exon_number']).strip('[').strip(']').strip('\'')
            maxExons = max(maxExons, int(exonNum))
            exonCount += 1
            if exonCount > maxExons:
                break
            if exonCount > num_exons:
                break
            exons[exonNum] = [int(feature.location.start),
                              int(feature.location.end), strand]
    annotation_handle.close()
    return exons, chromosome
def not_t_full_celegans(self):
    """Test the full C. elegans chromosome and GFF files.

    This is used to test GFF on large files and is not run as a standard
    test. You will need to download the files and adjust the paths to run
    this.
    """
    # read the sequence information
    seq_file = os.path.join(self._full_dir, "c_elegans.WS199.dna.fa")
    gff_file = os.path.join(self._full_dir, "c_elegans.WS199.gff3")
    seq_handle = open(seq_file)
    seq_dict = SeqIO.to_dict(SeqIO.parse(seq_handle, "fasta"))
    seq_handle.close()
    #with open(gff_file) as gff_handle:
    #    possible_limits = feature_adder.available_limits(gff_handle)
    #    pprint.pprint(possible_limits)
    rnai_types = [('Orfeome', 'PCR_product'),
                  ('GenePair_STS', 'PCR_product'),
                  ('Promoterome', 'PCR_product')]
    gene_types = [('Non_coding_transcript', 'gene'),
                  ('Coding_transcript', 'gene'),
                  ('Coding_transcript', 'mRNA'),
                  ('Coding_transcript', 'CDS')]
    limit_info = dict(gff_source_type=rnai_types + gene_types)
    for rec in GFF.parse(gff_file, seq_dict, limit_info=limit_info):
        pass
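# The commented-out available_limits call above survives in BCBio as
# GFFExaminer. A small sketch of using it to discover which (source, type)
# tuples a file offers before building a limit_info dict like the one in
# not_t_full_celegans; the file path is a placeholder.
import pprint

from BCBio.GFF import GFFExaminer


def show_available_limits(gff_file):
    examiner = GFFExaminer()
    with open(gff_file) as in_handle:
        pprint.pprint(examiner.available_limits(in_handle))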
def gene_positions(genefile, include_chromosome=True, include_strand=True,
                   coding_only=False, ignore_strange_cases=False):
    """ Return a gene_ID:(chromosome, strand, start_pos, end_pos) dictionary
    based on a GFF input file.

    The positions are 1-based, end-inclusive. If include_chromosome and/or
    include_strand is False, the corresponding values are missing from the
    output tuples.

    If coding_only is True, the start/end positions are the start and end of
    the first and last exon (i.e. excluding the UTRs). In that case, if a
    gene doesn't have an mRNA with exons, or has multiple mRNAs, raise an
    exception, unless ignore_strange_cases is True; then just don't include
    it in the output.
    """
    gene_positions = {}
    with open(os.path.expanduser(genefile)) as GENEFILE:
        # if coding_only is False, only look at genes, not sub-features
        genefile_parsing_limits = {'gff_type': ['gene']} if not coding_only else {}
        for chromosome_record in GFF.parse(GENEFILE,
                                           limit_info=genefile_parsing_limits):
            for gene_record in chromosome_record.features:
                # BCBio uses 0-based, end-exclusive positions (first-third
                # base is bases 0,1,2, i.e. range 0-3) - convert to 1-based
                # end-inclusive (so first-third base is bases 1,2,3, i.e.
                # range 1-3)
                if include_chromosome:
                    full_pos_info = (chromosome_record.id,)
                else:
                    full_pos_info = ()
                if include_strand:
                    full_pos_info += (GFF_strands[gene_record.strand],)
                if not coding_only:
                    full_pos_info += get_feature_start_end(gene_record)
                else:
                    try:
                        start_end = get_gene_start_end_excluding_UTRs(gene_record)
                    except (NoRNAError, MultipleRNAError):
                        if ignore_strange_cases:
                            continue
                        else:
                            raise
                    full_pos_info += start_end
                gene_positions[gene_record.id] = full_pos_info
    return gene_positions
def read_gff_transcripts(fobj, fname="", min_exons=1, merge=0):
    # Setup logging
    logger = logging.getLogger('pita')

    if merge > 0:
        logger.warning("Merging exons not yet implemented for GFF files!")

    #limits = dict(gff_type = ["mRNA", "exon"])
    smap = {"1": "+", 1: "+", "-1": "-", -1: "-", None: "+"}
    transcripts = []
    for rec in GFF.parse(fobj):
        chrom = rec.id
        for feature in rec.features:
            #logger.debug("feature: {0}", feature)
            for gene in _gff_type_iterator(feature, ['mRNA', 'transcript',
                                                     'inferred_parent']):
                #logger.debug("Adding gene: {0}", gene)
                exons = []
                #logger.debug("subfeatures: {0}", gene.sub_features)
                for exon in [f for f in gene.sub_features if f.type == 'exon']:
                    #link[gene.id] = link.setdefault(gene.id, 0) + 1
                    start = int(exon.location.start.position)  # - 1
                    end = int(exon.location.end.position)
                    strand = smap[exon.strand]
                    exons.append([chrom, start, end, strand])
                logger.debug("%s: %s - %s exons", fname, gene.id, len(exons))
                if len(exons) >= min_exons:
                    transcripts.append([gene.id, fname, exons])

    return transcripts
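# _gff_type_iterator is not defined in this collection; a plausible sketch,
# assuming it walks a feature tree depth-first and yields features whose
# type matches, which is consistent with how read_gff_transcripts uses it.
def _gff_type_iterator(feature, ftypes):
    if feature.type in ftypes:
        yield feature
    else:
        for subfeature in feature.sub_features:
            yield from _gff_type_iterator(subfeature, ftypes)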
def t_key_whitespace(self):
    """Fix keys with problematic whitespace.
    """
    tfile = os.path.join(self._test_dir, "spaces.gff3")
    for i, line_info in enumerate(GFF.parse_simple(tfile)):
        if i > 2:
            assert line_info["quals"]["foo"] == ["bar"]
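# GFF.parse_simple yields one plain dict per GFF line instead of building
# SeqRecords. A small sketch of inspecting the "quals" mapping exercised by
# t_key_whitespace above; the file path is a placeholder, and each value in
# "quals" is a list of strings.
from BCBio import GFF


def dump_attribute_keys(gff_file):
    for line_info in GFF.parse_simple(gff_file):
        for key, values in sorted(line_info["quals"].items()):
            print("%s=%s" % (key, ",".join(values)))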
def prepareSample(filter_matrix, gff_path):
    random.seed()
    candidate_list = []
    handle = open(gff_path, 'r')
    gene_count = 0
    for record in GFF.parse(handle):
        for feature in record.features:
            if feature.type == 'gene':
                locus_tag = feature.qualifiers['locus_tag'][0]
                gene_count += 1
                if locus_tag not in filter_matrix:
                    candidate_list.append(locus_tag)
    countToAdd = round(gene_count / 2) - len(filter_matrix)
    if countToAdd > 0:
        for i in range(countToAdd):
            list_len = len(candidate_list)
            list_id = random.randint(0, list_len - 1)
            locus_str = candidate_list[list_id]
            filter_matrix[locus_str] = (0, 0)
            candidate_list.remove(locus_str)
    handle.close()
    return filter_matrix
def t_ensembl_nested_features(self):
    """Test nesting of features with GFF2 files using transcript_id.
    """
    rec_dict = SeqIO.to_dict(GFF.parse(self._ensembl_file))
    assert len(rec_dict["I"].features) == 2
    t_feature = rec_dict["I"].features[0]
    assert len(t_feature.sub_features) == 32
def t_write_fasta(self):
    """Include FASTA records in GFF output.
    """
    seq = Seq("GATCGATCGATCGATCGATC")
    rec = SeqRecord(seq, "ID1")
    qualifiers = {"source": "prediction", "score": 10.0,
                  "other": ["Some", "annotations"], "ID": "gene1"}
    rec.features = [SeqFeature(FeatureLocation(0, 20), type="gene", strand=1,
                               qualifiers=qualifiers)]
    out_handle = StringIO.StringIO()
    GFF.write([rec], out_handle, include_fasta=True)
    wrote_info = out_handle.getvalue().split("\n")
    fasta_parts = wrote_info[3:]
    assert fasta_parts[0] == "##FASTA"
    assert fasta_parts[1] == ">ID1 <unknown description>"
    assert fasta_parts[2] == str(seq)
def t_fasta_directive(self):
    """Parse FASTA sequence information contained in a GFF3 file.
    """
    recs = SeqIO.to_dict(GFF.parse(self._gff_file))
    assert len(recs) == 1
    test_rec = recs['chr17']
    assert str(test_rec.seq) == "GATTACAGATTACA"
def doWork(args):
    panel = Panel(fig_width=900, padding=25, grid=None, xmin=0)
    seq_length = 0
    for gff in args.gffs:
        seqrecord = next(GFF.parse(gff))
        if len(seqrecord) > seq_length:
            seq_length = len(seqrecord)
    #seqrecord = next(SeqIO.parse(args.infile, "genbank"))
    cds_track = tracks.BaseTrack(sort_by='collapse')
    for feature in seqrecord.features:
        if feature.type == 'CDS':
            #print(feature.qualifiers['product'])
            if feature.qualifiers['product'][0] == 'hypothetical protein':
                col = '#BDBDBD'
            else:
                col = '#2B8CBE'
            feat = features.GenericSeqFeature(feature, color_by_cm=False,
                                              fc=col)
            cds_track.append(feat)
        elif feature.type == 'source':
            cds_track.append(features.GenericSeqFeature(
                feature, color_by_cm=False, alpha=0.0, fc='1.0', ec='1.0'))
        else:
            cds_track.append(features.GenericSeqFeature(
                feature, color_by_cm=False, fc='0.0', ec='0.0'))
    panel.add_track(cds_track)
    panel.save(args.outfile, xmin=0, xmax=seq_length)
def main(gff_file, fasta_file=None):
    # Use splitext to remove the extension of the original input file
    out_file = "%s.gb" % os.path.splitext(gff_file)[0]

    # The parser differs slightly depending on whether a FASTA file is given
    if os.stat(gff_file).st_size == 0 or \
            ((fasta_file is not None) and os.stat(fasta_file).st_size == 0):
        print("ERROR: Empty file provided or cannot stat files")
        exit(64)
    elif fasta_file is None:
        gff_iter = GFF.parse(gff_file)  # Parser/generator object
    else:
        # Process the FASTA file and give it to the parser
        fasta_input = SeqIO.to_dict(SeqIO.parse(fasta_file, "fasta",
                                                generic_dna))
        gff_iter = GFF.parse(gff_file, fasta_input)

    # One line to call all the checking functions and write in GenBank format
    SeqIO.write(_check_gff(_fix_ncbi_id(gff_iter)), out_file, "genbank")
def shortrna_regions(mirna_gff, star_csv, seq_file):
    """Return miRNA sequences with corresponding guide and star regions.
    """
    seq_index = SeqIO.index(seq_file, "fasta")
    mirna_seqs = dict()
    with open(star_csv) as in_handle:
        for name, guide, star in csv.reader(in_handle):
            mirna_seqs[name] = (guide.strip(), star.strip())

    for rec in GFF.parse(mirna_gff):
        cur_seq = str(seq_index[rec.id].seq)
        for f in rec.features:
            name = f.qualifiers["ID"][0]
            start, end = (f.location.nofuzzy_start, f.location.nofuzzy_end)
            yield (rec.id, start, end, name)
            #guide, star = mirna_seqs.get(name, ("", ""))
            for seq_name, guide, star in [(n, g, s) for n, (g, s)
                                          in mirna_seqs.items()
                                          if n.startswith(name)]:
                for find_seq, ext in [(guide, "guide"), (star, "star")]:
                    if find_seq:
                        if f.strand == -1:
                            find_seq = str(Seq(find_seq).reverse_complement())
                        region = cur_seq[start:end]
                        pos = region.find(find_seq)
                        if pos > -1:
                            yield (rec.id, start + pos,
                                   start + pos + len(find_seq),
                                   "%s_%s" % (seq_name, ext))
                        else:
                            print(f.strand, name, ext, pos, find_seq, region)
                            raise NotImplementedError
def load_gff(gff):
    """Parses a single GFF file and returns a chromosome-indexed dict for
    that file.

    Arguments
    ---------
    gff: str
        Filepath to GFF

    Returns
    -------
    dict: A dictionary representation of the GFF entries, indexed by
          chromosome ID
    """
    annotations = {}

    if gff.endswith('.gz'):
        import gzip
        from io import TextIOWrapper
        fp = TextIOWrapper(gzip.open(gff))
    else:
        fp = open(gff)

    for entry in GFF.parse(fp):
        if len(entry.features) > 0 and entry.features[0].type == 'chromosome':
            annotations[entry.id] = entry

    fp.close()

    return annotations
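# A usage sketch for load_gff above (the path is a placeholder): the result
# is keyed by chromosome id, so individual records can be looked up directly.
if __name__ == "__main__":
    annotations = load_gff("annotations.gff.gz")
    for chrom_id, record in sorted(annotations.items()):
        print("%s: %d top-level features" % (chrom_id, len(record.features)))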
def gb2gff(infile, outfile):
    """Translate a GenBank file to a GFF3 file.

    TODO: the procedure does not yet handle join locations correctly.

    Args:
        infile (str): input GenBank file
        outfile (str): output GFF3 file

    Returns:
        Number of records written
    """
    gb_handle = open(infile, 'r')
    gff_handle = open(outfile, 'w')
    res = GFF.write(SeqIO.parse(gb_handle, "gb"), gff_handle)
    gff_handle.close()
    return res
def t_wormbase_nested_features(self):
    """Test nesting of features with GFF2 files using Transcript only.
    """
    rec_dict = SeqIO.to_dict(GFF.parse(self._wormbase_file))
    assert len(rec_dict) == 3

    parent_features = [f for f in rec_dict["I"].features
                       if f.type == "Transcript"]
    assert len(parent_features) == 1
    inferred_features = [f for f in rec_dict["I"].features
                         if f.type == "inferred_parent"]
    assert len(inferred_features) == 0
    tfeature = parent_features[0]
    assert tfeature.qualifiers["WormPep"][0] == "WP:CE40797"
    assert len(tfeature.sub_features) == 46
def working_stuff():
    from BCBio import GFF

    gff_type = ["gene", "mRNA", "CDS", "exon"]
    source_type = list(zip(["Coding_transcript"] * len(gff_type), gff_type))
    filter_type = dict(gff_source_type=source_type, gff_id=["I"])
    gff_handle = open(
        "/fml/ag-raetsch/share/databases/genomes/C_elegans/elegans_WS199/annotation/c_elegans.WS199.gff3"
    )
    element = [e for e in GFF.parse(gff_handle, limit_info=filter_type)]
    return element
def t_extra_comma(self):
    """Correctly handle GFF3 files with extra trailing commas.
    """
    tfile = os.path.join(self._test_dir, "mouse_extra_comma.gff3")
    in_handle = open(tfile)
    for rec in GFF.parse(in_handle):
        pass
    in_handle.close()
    tested = False
    for sub_top in rec.features[0].sub_features:
        for sub in sub_top.sub_features:
            if sub.qualifiers.get("Name", "") == ["CDS:NC_000083.5:LOC100040603"]:
                tested = True
                assert len(sub.qualifiers["Parent"]) == 1
    assert tested, "Did not find sub-feature to test"
def get_features_from_file(handle: IO) -> Dict[str, List[SeqFeature]]:
    """ Generates new SeqFeatures from a GFF file.

        Arguments:
            handle: a file handle/stream with the GFF contents

        Returns:
            a dictionary mapping record ID to a list of SeqFeatures for
            that record
    """
    try:
        gff_records = list(GFF.parse(handle))
    except Exception as err:
        raise AntismashInputError("could not parse records from GFF3 file") from err

    results = {}
    for gff_record in gff_records:
        features = []
        for feature in gff_record.features:
            if feature.type == 'CDS':
                new_features = [feature]
            else:
                new_features = check_sub(feature)
                if not new_features:
                    continue
            name = feature.id
            locus_tag = feature.qualifiers.get("locus_tag")

            for qtype in ["gene", "name", "Name"]:
                if qtype in feature.qualifiers:
                    name_tmp = feature.qualifiers[qtype][0]
                    # Assume name/Name to be sane if they don't contain a space
                    if " " in name_tmp:
                        continue
                    name = name_tmp
                    break

            for i, new_feature in enumerate(new_features):
                variant = name
                if len(new_features) > 1:
                    variant = "{0}_{1}".format(name, i)
                new_feature.qualifiers['gene'] = [variant]
                if locus_tag is not None:
                    new_feature.qualifiers["locus_tag"] = locus_tag
                features.append(new_feature)
        results[gff_record.id] = features
    return results
def runbarrnap(genome, outfilePath='.'):
    '''
    Assumes write permission in the current working directory.

    :param genome: path to genome; assumes an assembly file name
        ("_genomic.fna.gz")
    :param outfilePath: directory for a FASTA file of rRNA sequences
        detected, with headers ">asmID:ribosomalSubunit:start-end:dir:acc"
    :return: True if successfully run, False if not
    '''
    if not os.path.isfile(genome):
        return False
    fileName = os.path.split(genome)[1]
    asmID = fileName.split('.')[0]
    outfileName = asmID + '_rRNA.fasta'
    tempGenome = asmID + '.tmp'
    try:
        with gzip.open(genome, 'rb') as in_file, \
                open(tempGenome, 'wb') as out_file:
            shutil.copyfileobj(in_file, out_file)
    except OSError:
        shutil.copy(genome, tempGenome)  # not gzipped, copy as-is
    command = ['barrnap', '--incseq', tempGenome]
    try:
        barrnapProc = subprocess.Popen(command, stdout=subprocess.PIPE,
                                       universal_newlines=True)
        handle = GFF.parse(barrnapProc.stdout)
        seqs = []
        for seq in handle:
            for feat in seq.features:
                rnaSeq = feat.extract(seq)
                rSU = feat.qualifiers['name'][0]
                score = float(feat.qualifiers['score'][0])
                start = feat.location.start
                end = feat.location.end
                strand = feat.location.strand
                if 'note' in feat.qualifiers:
                    partial = 'partial'
                else:
                    partial = 'full'
                header = "{}:{}:{}:{}-{}:{}:{}".format(
                    asmID, rSU, score, start, end, strand, partial)
                rnaSeq.name = header
                rnaSeq.id = header
                rnaSeq.description = ''
                seqs.append(rnaSeq)
        with open(os.path.join(outfilePath, outfileName), 'w') as outfile:
            SeqIO.write(seqs, outfile, 'fasta')
        success = True
    except Exception as e:
        print(e)
        success = False
    os.remove(tempGenome)
    return success
def repair(fasta, gff3):
    recs = {}
    # seqids = {}
    for record in GFF.parse(gff3):
        # seqids[record.id] = ''
        recs[record.id] = record

    seqs = []
    for seq in SeqIO.parse(fasta, "fasta"):
        if seq.id not in recs:
            continue
        current = recs[seq.id]
        for num, feat in enumerate(current.features):
            if num == 0:  # ignore first feature bc that's the full one
                continue
            cds, sd = get_CDS_and_SD(feat)
            cds_start = seq.seq[cds.location.start:cds.location.start + 3]
            broken_start = break_start(cds_start)
            if cds_start != broken_start:
                # try to break the start sequence while keeping the amino
                # acid the same
                seq.seq = (seq.seq[0:cds.location.start] + broken_start +
                           seq.seq[cds.location.start + 3:])
            else:
                # if the start couldn't be changed, break the SD instead
                mod_sd_start = 0
                mod_sd_end = 0
                if (sd.location.start % 3) + 1 != 1:
                    mod_sd_start = next_first_frame(sd.location.start, 1)
                if (sd.location.end % 3) + 1 != 1:
                    mod_sd_end = next_first_frame(sd.location.end, -1)
                sd_seq = seq.seq[sd.location.start - mod_sd_start:
                                 sd.location.end - mod_sd_end]
                broken_sd = break_sd(sd_seq)
                if sd_seq != broken_sd:
                    seq.seq = (seq.seq[0:(sd.location.start - mod_sd_start)] +
                               broken_sd +
                               seq.seq[(sd.location.end - mod_sd_end):])
        seqs.append(seq)
    SeqIO.write(seqs, sys.stdout, "fasta")
def main(seqFilepath, gffFilepath, outFilepath):
    # load fasta
    seqRec_lst = []
    seqName_lst = []
    for seqRec in SeqIO.parse(seqFilepath, "fasta"):
        seqRec_lst.append(seqRec)
        seqName_lst.append(seqRec.id)
    print("LOADED {} seqs from {}".format(len(seqRec_lst), seqFilepath))

    # load gff and distribute CDS
    cds_lstlst = [[] for _ in range(len(seqName_lst))]
    with open(gffFilepath) as f:
        for rec in GFF.parse(f, target_lines=1):
            assert len(rec.features) == 1
            if rec.features[0].type == "CDS":
                try:
                    idx = seqName_lst.index(rec.id)
                    cds_lstlst[idx].append(rec)
                except ValueError:
                    pass  # corresponding sequence does not exist in seqRec_lst
    for idx, seqName in enumerate(seqName_lst):
        print("\tLOADED {0} CDSs in {1}".format(len(cds_lstlst[idx]), seqName))

    thres_lst = list(range(50, 1000 + 1, 50))
    columns = ["+1", "+2", "+3", "-1", "-2", "-3"]
    out_mat = np.zeros((len(thres_lst), len(columns))).astype(int)
    for seqRec, cds_lst, seqName in zip(seqRec_lst, cds_lstlst, seqName_lst):
        orf_df = get_orf_df(seqRec)
        for i, thres in enumerate(thres_lst):
            filtered_df = orf_df[orf_df["length"] >= thres]
            pos_lst = get_pos_lst(cds_lst, filtered_df)
            overlap_dctdct = get_overlap_dctdct(pos_lst)
            for _, dct in overlap_dctdct.items():
                out_mat[i, columns.index(dct["relLane"])] += \
                    (dct["oend"] - dct["ostart"])
        print("\tDONE with {}".format(seqName))

    out_df = pd.DataFrame(out_mat, columns=columns)
    out_df["thres"] = thres_lst
    out_df = out_df[["thres"] + columns]
    out_df.to_csv(outFilepath, index=False)
    print("OUTPUT to {}".format(outFilepath))
def __init__(self, fasta, gtf):
    self.fasta = fasta
    self.gtf = gtf
    sys.stderr.write("Reading FASTA file...\n")
    with flexi_open(fasta, 'r') as handle:
        chromosomes = SeqIO.to_dict(SeqIO.parse(handle, 'fasta'))
    self.intervaldict = dict()
    sys.stderr.write("Reading GTF file (this will take some time)...\n")
    limit_info = dict(gff_type=('CDS', ))
    with flexi_open(gtf, 'r') as handle:
        for rec in GFF.parse(handle, limit_info=limit_info,
                             base_dict=chromosomes):
            # Fix strand info.
            for feature in rec.features:  # Each top-level CDS
                if hasattr(feature, 'strand'):
                    if feature.strand is None:
                        # An unfortunate effect of the GFF parser.
                        # Check subfeatures, take the first defined strand.
                        # Note: may be a bad assumption in weird species
                        # (ciliates??).
                        for subfeat in get_subfeatures(feature):
                            if subfeat.strand is not None:
                                feature.strand = subfeat.strand
                                break
            # Now, chromosomes['X'] is a SeqRecord with features, each of
            # which has a feature.extract method which can be used to
            # access the underlying DNA sequence. CDS features are nested
            # automatically. See also feature.qualifiers for a list of IDs
            # (gene name, ID etc) associated with it.
            chromosomes[rec.id] = rec
            # We need to create some interval trees to identify
            # affected CDS features etc.
            self._index_record_in_intervaldict(rec)
    self.chromosomes = chromosomes
    self._precompute_chrlens()
    sys.stderr.write("Object initialisation complete.\n")
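# get_subfeatures is referenced above but not defined in this collection; a
# plausible sketch, assuming it simply returns a feature's direct children
# (bcbio-gff attaches them as the sub_features attribute).
def get_subfeatures(feature):
    return getattr(feature, "sub_features", [])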
def record_with_extracted_annotations_generator(gff_file,
                                                white_list_of_annotation_types):
    # Note: annotation_ids comes from the enclosing scope.
    for record in GFF.parse(open(gff_file)):
        #print("Extracting annotations from %s" % record.id)
        new_record = deepcopy(record)
        new_record.features = []
        for feature in record.features:
            #print("%s\t%s" % (record.id, feature.id))
            if (feature.id in annotation_ids) and \
                    (feature.type in white_list_of_annotation_types):
                new_record.features.append(feature)
        if len(new_record.features) > 0:
            yield new_record
def do_import(self):
    in_file = self.__gff_fasta_fn
    in_handle = open(in_file)

    # In DEBUG=True mode, Django keeps a list of queries and blows up memory
    # usage when doing a big import. The following line disables this
    # logging.
    connection.use_debug_cursor = False

    for rec in GFF.parse(in_handle):
        f = GFFFragmentImporter(rec).do_import()
        self.__genome.genome_fragment_set.create(fragment=f, inherited=False)

    # Be nice and turn the debug cursor back on
    connection.use_debug_cursor = True
    in_handle.close()
def __get_features(child, interpro=False):
    child_features = {}
    for rec in GFF.parse(child):
        for feature in rec.features:
            parent_feature_id = rec.id
            if interpro:
                if feature.type == 'polypeptide':
                    continue
                if '_' in parent_feature_id:
                    parent_feature_id = parent_feature_id[
                        parent_feature_id.index('_') + 1:]
            try:
                child_features[parent_feature_id].append(feature)
            except KeyError:
                child_features[parent_feature_id] = [feature]
    return child_features
def FindInitSites():
    GFFgen = GFF.parse(
        '/users/buskirk/documents/profiling/GFF/MG1655/coli3.gff')
    chrom = next(GFFgen)
    f_genome = chrom.seq
    r_genome = chrom.seq.reverse_complement()

    ORFlist = []
    ORFlist.append([
        'start', 'stop', 'retapa', 'onc112', 'codon1st', 'codonlast',
        'peptide', 'strand'
    ])

    # for the plus strand
    f1 = 'retapa'  # retapamulin data from Shura Mankin
    pathi = "/users/buskirk/documents/profiling/projects/sORFs/wigfiles/"
    density_filestring1 = pathi + f1
    counts_f1 = readwig(density_filestring1 + "_plus")

    f2 = 'onc'  # Onc112 data
    pathi = "/users/buskirk/documents/profiling/projects/sORFs/wigfiles/"
    density_filestring2 = pathi + f2
    counts_f2 = readwig(density_filestring2 + "_plus")

    gp_plus = geneplot(chrom, 1)  # 1 is plus
    plusORFs = INIT_scan(f_genome, gp_plus, counts_f1, counts_f2, 'plus')
    ORFlist.extend(plusORFs)

    # for the minus strand
    counts_f1 = readwig(density_filestring1 + "_minus")
    counts_f1.reverse()
    counts_f2 = readwig(density_filestring2 + "_minus")
    counts_f2.reverse()

    gp_minus = geneplot(chrom, -1)  # -1 is minus
    gp_minus.reverse()
    minusORFs = INIT_scan(r_genome, gp_minus, counts_f1, counts_f2, 'minus')
    ORFlist.extend(minusORFs)

    writelisttoexcel(
        ORFlist,
        '/users/buskirk/documents/profiling/projects/sORFs/init_sites')
def get_features_from_file(
        seq_record, handle,
        limit_to_seq_id: Union[bool, Dict[str, List[str]]] = False
) -> List[SeqFeature]:
    """ Generates new SeqFeatures from a Record and a GFF file.

        Arguments:
            seq_record: the record that features belong to
            handle: a file handle/stream with the GFF contents
            limit_to_seq_id: False or a dictionary of GFF.parse options

        Returns:
            a list of SeqFeatures parsed from the GFF file
    """
    features = []
    for record in GFF.parse(handle, limit_info=limit_to_seq_id):
        for feature in record.features:
            if feature.type == 'CDS':
                new_features = [feature]
            else:
                new_features = check_sub(feature, seq_record)
                if not new_features:
                    continue
            name = feature.id
            locus_tag = feature.qualifiers.get("locus_tag")

            for qtype in ["gene", "name", "Name"]:
                if qtype in feature.qualifiers:
                    name_tmp = feature.qualifiers[qtype][0]
                    # Assume name/Name to be sane if they don't contain a space
                    if " " in name_tmp:
                        continue
                    name = name_tmp
                    break

            for i, new_feature in enumerate(new_features):
                variant = name
                if len(new_features) > 1:
                    variant = "{0}_{1}".format(name, i)
                new_feature.qualifiers['gene'] = [variant]
                if locus_tag is not None:
                    new_feature.qualifiers["locus_tag"] = locus_tag
                features.append(new_feature)
    return features
def __get_features(child, interpro=False):
    child_features = {}
    for rec in GFF.parse(child):
        log.info("Parsing %s", rec.id)
        for feature in rec.features:
            parent_feature_id = rec.id
            if interpro:
                if feature.type == "polypeptide":
                    continue
                if "_" in parent_feature_id:
                    parent_feature_id = parent_feature_id[
                        parent_feature_id.index("_") + 1:]
            try:
                child_features[parent_feature_id].append(feature)
            except KeyError:
                child_features[parent_feature_id] = [feature]
    return child_features
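# __update_feature_location is not included in this collection. A rough
# sketch of what the rebase functions above imply, assuming child
# coordinates are relative to the start of the parent feature and that
# protein2dna scales protein coordinates to nucleotides. This is an
# illustration only: it ignores minus-strand parents and nested
# subfeatures, which the real helper would have to handle.
from Bio.SeqFeature import FeatureLocation


def __update_feature_location(feature, parent, protein2dna):
    start = int(feature.location.start)
    end = int(feature.location.end)
    if protein2dna:
        # one amino acid covers three bases
        start *= 3
        end *= 3
    feature.location = FeatureLocation(
        parent.location.start + start,
        parent.location.start + end,
        strand=parent.location.strand,
    )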
def examine(gff_file, fasta_file):
    gff_handle = open(gff_file)
    fasta_handle = open(fasta_file)
    fasta_dict = SeqIO.to_dict(SeqIO.parse(fasta_handle, "fasta"))
    for rec in GFF.parse(gff_handle):
        #print(rec.id)
        for one_feature in rec.features:
            #print(one_feature.qualifiers.keys())
            locus = one_feature.qualifiers["Name"][0]
            product = one_feature.qualifiers["product"][0]
            seq = one_feature.extract(fasta_dict[rec.id].seq)
            out = ">%s|%s|%s\n%s" % (rec.id, locus, product, seq)
            print(out)
    gff_handle.close()
    fasta_handle.close()
def features(self):
    """list: Get the features from the feature file, metadata file, or in
    memory"""
    if self.feature_file:
        log.debug('{}: reading features from feature file {}'.format(
            self.id, self.feature_path))
        with open(self.feature_path) as handle:
            feats = list(GFF.parse(handle))
        if len(feats) > 1:
            log.warning('Too many sequences in GFF')
        return feats[0].features
    elif self.metadata_file:
        log.debug('{}: reading features from metadata file {}'.format(
            self.id, self.metadata_path))
        tmp_sr = SeqIO.read(self.metadata_path, 'uniprot-xml')
        return tmp_sr.features
    else:
        return self._features
def generator(gff_file, fasta_dict, exon_fd):
    with open(gff_file, "r") as gff_fd:
        for record in GFF.parse(gff_fd, target_lines=100000):
            for feature in record.features:
                #print(feature.type)
                if feature.type == "transcript":
                    print(feature)
                    print(feature.sub_features)
                    exon_fd.write(feature.id + "\n")
                    exon_fd.write(str(feature.location))
                    exon_fd.write("\n")
                    exon_fd.write(str(feature.sub_features) + "\n")
                if feature.type == "gene":
                    feature_record = feature.extract(fasta_dict[record.id])
                    feature_record.id = feature.qualifiers["gene_id"][0]
                    feature_record.description = ""
                    yield feature_record
def tableify(gff3, fasta):
    names = {}
    for fasta_rec in SeqIO.parse(fasta, 'fasta'):
        names[fasta_rec.id] = []

    for gff_rec in GFF.parse(gff3):
        # number of internal starts
        names[gff_rec.id].append(str(len(gff_rec.features) - 1))
        starts = []
        for feat in gff_rec.features:
            feat_start = (feat.location.start - 9) // 3 + 1
            if feat_start != 1:
                # start codon position of each internal start
                starts.append(str(feat_start))
        names[gff_rec.id].append(starts)

    for n in sorted(names):
        if len(names[n]):
            print('\t'.join([n, names[n][0], ', '.join(names[n][1])]))
def t_basic_attributes(self):
    """Parse out basic attributes of GFF2 from Ensembl GTF.
    """
    limit_info = dict(gff_source_type=[('snoRNA', 'exon')])
    rec_dict = SeqIO.to_dict(GFF.parse(self._ensembl_file,
                                       limit_info=limit_info))
    work_rec = rec_dict['I']
    assert len(work_rec.features) == 1
    test_feature = work_rec.features[0]
    qual_keys = test_feature.qualifiers.keys()
    qual_keys.sort()
    assert qual_keys == ['Parent', 'exon_number', 'gene_id', 'gene_name',
                         'source', 'transcript_id', 'transcript_name']
    assert test_feature.qualifiers['source'] == ['snoRNA']
    assert test_feature.qualifiers['transcript_name'] == ['NR_001477.2']
    assert test_feature.qualifiers['exon_number'] == ['1']
def ParseRecord(self, cn):
    org = self._wa.organisms.findOrganismByCn(cn)
    self._wa.annotations.setSequence(org['commonName'], org['id'])

    data = io.StringIO(
        self._wa.io.write(
            exportType='GFF3',
            seqType='genomic',
            exportAllSequences=False,
            exportGff3Fasta=True,
            output="text",
            exportFormat="text",
            sequences=cn,
        ))
    data.seek(0)

    for record in GFF.parse(data):
        yield WebApolloSeqRecord(record, self._wa)
def basic_parsing(gffFile):
    """
    GFF3 parse, extract information
    """
    with open(gffFile) as in_handle:
        for rec in GFF.parse(in_handle):
            # iterate features
            for feature in rec.features:
                # collect sub features
                sub_features_temp = list(feature.sub_features)
                yield SeqFeature(
                    type=feature.type,
                    location=feature.location,
                    strand=feature.strand,
                    qualifiers=feature.qualifiers,
                    sub_features=sub_features_temp,
                )
def require_shinefind(gff3, fasta):
    sd_finder = NaiveSDCaller()
    # Load up sequence(s) for GFF3 data
    seq_dict = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))
    # Parse GFF3 records
    for record in GFF.parse(gff3, base_dict=seq_dict):
        # Reopen
        genes = list(
            feature_lambda(record.features, feature_test_type,
                           {"type": "gene"}, subfeatures=True))
        good_genes = []
        for gene in genes:
            cdss = sorted(
                list(
                    feature_lambda(
                        gene.sub_features,
                        feature_test_type,
                        {"type": "CDS"},
                        subfeatures=False,
                    )),
                key=lambda x: x.location.start,
            )
            if len(cdss) == 0:
                continue
            cds = cdss[0]
            sds, start, end, seq = sd_finder.testFeatureUpstream(
                cds, record, sd_min=5, sd_max=15)
            if len(sds) >= 1:
                sd_features = sd_finder.to_features(
                    sds, gene.location.strand, start, end,
                    feature_id=gene.id)
                gene.sub_features.append(sd_features[0])
                good_genes.append(gene)
        record.features = good_genes
        yield record
def FindrRNA(path, tempFile):
    # Note: limit_info comes from the enclosing scope.
    in_handle = open(path)
    for record in GFF.parse(in_handle, limit_info=limit_info):
        if len(record.features) != 0:
            for i in range(len(record.features)):
                product = record.features[i].qualifiers["product"][0]
                description = str(record.id + '-' + record.features[i].id +
                                  '-' + product + '-' + str(len(record.seq)))
                if '23S ribosomal RNA' in product:
                    seq_23S = SeqRecord(
                        record.features[i].location.extract(record.seq),
                        id=tempFile[:-4],
                        description=description)
                    SeqIO.write(seq_23S,
                                "/home/junyuchen/Lab/16S-Prediction/" +
                                tempFile[:-4] + "_23S_rRNA.fasta", "fasta")
                elif '16S ribosomal RNA' in product:
                    seq_16S = SeqRecord(
                        record.features[i].location.extract(record.seq),
                        id=tempFile[:-4],
                        description=description)
                    SeqIO.write(seq_16S,
                                "/home/junyuchen/Lab/16S-Prediction/" +
                                tempFile[:-4] + "_16S_rRNA.fasta", "fasta")
    in_handle.close()
def main(fasta, gff3):
    seq_dict = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))

    codon_usage = {}

    for rec in GFF.parse(gff3, base_dict=seq_dict):
        for feat in feature_lambda(rec.features, feature_test_type,
                                   {"type": "CDS"}, subfeatures=True):
            seq = str(feat.extract(rec).seq)[0:3]
            try:
                codon_usage[seq] += 1
            except KeyError:
                codon_usage[seq] = 1

    # TODO: print all ACTG combinations? Or just the ones that are present
    print("# Codon\tCount")
    for key in sorted(codon_usage):
        print("\t".join((key, str(codon_usage[key]))))
def get_SNP_regions(my_genes, in_file="gencode.v24.annotation.gff3"):
    in_handle = open(in_file)
    regions = []
    for index, rec in enumerate(GFF.parse(in_handle, target_lines=130)):
        for x in rec.features:
            if x.type == 'gene':
                for y in my_genes:
                    if y == x.qualifiers['gene_name'][0]:
                        regions.append({
                            'chr': rec.id,
                            'start': x.location.start,
                            'end': x.location.end
                        })
    in_handle.close()
    return regions
def load_annotations(target_gff):
    """Loads genome annotations from the specified GFF(s)."""
    # Get chromosomes/contigs from the GFF file
    chromosomes = {}

    # Load existing gene annotations
    annotations_fp = open(target_gff)

    for entry in GFF.parse(annotations_fp):
        # For TriTrypDB 29 and above, there are no longer chromosome entries
        # in the GFF files
        # if len(entry.features) > 0 and entry.features[0].type in ['chromosome', 'contig']:
        if len(entry.features) > 0:
            chromosomes[entry.id] = entry

    # clean up
    annotations_fp.close()

    return chromosomes
def record_with_extracted_transcripts_generator(gff_file, transcript_ids):
    for record in GFF.parse(open(gff_file)):
        new_record = deepcopy(record)
        new_record.features = []
        for feature in record.features:
            if feature.type in ("mRNA", "transcript") and \
                    feature.id in transcript_ids:
                new_record.features.append(feature)
            elif feature.type == "gene":
                new_feature = deepcopy(feature)
                new_feature.sub_features = []
                for subfeature in feature.sub_features:
                    if subfeature.type in ("mRNA", "transcript") and \
                            subfeature.id in transcript_ids:
                        new_feature.sub_features.append(subfeature)
                if len(new_feature.sub_features) > 0:
                    new_record.features.append(new_feature)
        if len(new_record.features) > 0:
            yield new_record
def run(sequence, options):
    handle = open(options.gff3)

    # If there's only one sequence in both, read everything; otherwise, read
    # only the appropriate part of the GFF3.
    if options.single_entries:
        limit_info = False
    else:
        limit_info = dict(gff_id=[sequence.id])

    for record in GFF.parse(handle, limit_info=limit_info):
        for feature in record.features:
            if feature.type == 'CDS':
                new_features = [feature]
            else:
                new_features = check_sub(feature, sequence)
                if not new_features:
                    continue
            name = feature.id
            if len(name) > 40:
                raise ValueError(
                    "Feature ID too long, < 40 characters required: %s" % name)
            locus_tag = None
            if "locus_tag" in feature.qualifiers:
                locus_tag = feature.qualifiers["locus_tag"]
            for qtype in ["gene", "name", "Name"]:
                if qtype in feature.qualifiers:
                    name_tmp = feature.qualifiers[qtype][0]
                    # Assume name/Name to be sane if they don't contain a space
                    if " " in name_tmp:
                        continue
                    name = name_tmp
                    break
            for i, n in enumerate(new_features):
                variant = name
                if len(new_features) > 1:
                    variant = "{0}_{1}".format(name, i)
                n.qualifiers['gene'] = [variant]
                if locus_tag is not None:
                    n.qualifiers["locus_tag"] = locus_tag
                sequence.features.append(n)
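# check_sub is used by several snippets above but not defined here. A
# simplified sketch, assuming it collects CDS subfeatures from a gene/mRNA
# tree and joins multi-exon CDSs into a single feature with a
# CompoundLocation; the real antiSMASH helper does considerably more
# validation and qualifier handling.
from Bio.SeqFeature import CompoundLocation, SeqFeature


def check_sub(feature, sequence=None):
    new_features = []
    locations = []
    qualifiers = {}
    for sub in feature.sub_features:
        if sub.sub_features:  # recurse into e.g. gene -> mRNA -> CDS
            new_features.extend(check_sub(sub, sequence))
        elif sub.type == 'CDS':
            locations.append(sub.location)
            qualifiers.update(sub.qualifiers)
    if len(locations) == 1:
        new_features.append(SeqFeature(locations[0], type='CDS',
                                       qualifiers=qualifiers))
    elif len(locations) > 1:
        new_features.append(SeqFeature(CompoundLocation(locations),
                                       type='CDS', qualifiers=qualifiers))
    return new_features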
def require_shinefind(gff3, fasta):
    sd_finder = NaiveSDCaller()
    # Load up sequence(s) for GFF3 data
    seq_dict = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))
    # Parse GFF3 records
    for record in GFF.parse(gff3, base_dict=seq_dict):
        # Reopen
        genes = list(
            feature_lambda(record.features, feature_test_type,
                           {'type': 'gene'}, subfeatures=True))
        good_genes = []
        for gene in genes:
            cdss = list(
                feature_lambda(gene.sub_features, feature_test_type,
                               {'type': 'CDS'}, subfeatures=False))
            if len(cdss) == 0:
                continue
            # Someday this will bite me in the arse.
            cds = cdss[0]
            sds, start, end, seq = sd_finder.testFeatureUpstream(
                cds, record, sd_min=5, sd_max=15)
            if len(sds) >= 1:
                # TODO
                # Double plus yuck
                sd_features = sd_finder.to_features(
                    sds, gene.location.strand, start, end,
                    feature_id=gene.id)
                gene.sub_features.append(sd_features[0])
                good_genes.append(gene)
        # Yuck!
        record.features = good_genes
        yield record