def _get(relative_path, genome=None):
    """
    :param relative_path: relative path of the file inside the repository
    :param genome: genome name. Can contain a chromosome name after a dash,
        like hg19-chr20; for a BED file, the returned BedTool is filtered to that chromosome.
    :return: BedTool object if it's a BED file, otherwise the file path
    """
    chrom = None
    if genome:
        if '-chr' in genome:
            genome, chrom = genome.split('-')
        check_genome(genome)
        relative_path = relative_path.format(genome=genome)

    path = abspath(join(dirname(__file__), relative_path))
    if not isfile(path) and isfile(path + '.gz'):
        path += '.gz'

    if path.endswith('.bed') or path.endswith('.bed.gz'):
        if path.endswith('.bed.gz'):
            bedtools = which('bedtools')
            if not bedtools:
                critical('bedtools not found in PATH: ' + str(os.environ['PATH']))
            debug('BED is compressed, creating BedTool')
            bed = BedTool(path)
        else:
            debug('BED is uncompressed, creating BedTool')
            bed = BedTool(path)
        if chrom:
            debug('Filtering BedTool for chrom ' + chrom)
            bed = bed.filter(lambda r: r.chrom == chrom)
        return bed
    else:
        return path
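# Hedged usage sketch (not part of the original module): _get either returns a BedTool,
# optionally filtered to one chromosome, or a plain file path. The relative paths and the
# 'hg19-chr20' genome spec below are hypothetical and assume check_genome accepts 'hg19'.
def _example_get_usage():
    capture = _get('data/{genome}/capture.bed', genome='hg19-chr20')   # BedTool filtered to chr20
    fasta_path = _get('data/{genome}/reference.fa', genome='hg19')     # plain file path
    return capture, fasta_path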
def annotate_peaks(notsif, beds, names):
    """Takes notsif, transforms it to BED, and outputs an annotation of where
    the miRNA seed is interrogating via a Cytoscape edge attribute file.
    """
    strand = find_strand_from_filename(notsif)
    mirna_bed = BedTool(notsif_to_bed(notsif, strand), from_string=True)
    # create the reference beds
    reference = {}
    for name, bed in izip(names, beds):
        reference[name] = BedTool(bed)
    for name in names:
        # intersect the mirna bed with the reference annotations
        for hit in mirna_bed.intersect(reference[name], s=True, stream=True):
            # name field returned from notsif_to_bed is delimited by "|"
            mirna_name = hit.name.split("|")[0]
            gene_name = hit.name.split("|")[1]
            # Cytoscape formatting
            seed_length = "(%s)" % hit.score
            fields = (mirna_name, seed_length, gene_name, "=", name)
            print " ".join(map(str, fields))
def gene_regions(vf, af):
    v = BedTool(vf)
    feats = BedTool(af)
    # first establish all the columns in the annotation file
    cols = set(f[4] for f in feats)
    results = {}
    intersection = v.intersect(feats, wb=True)
    if len(intersection) > 0:
        # sort_cmd1 = 'awk -F \'\t\' \'{print $1"\t"$2"\t"$3"\t"$4"\t"$9"\t"$5"_"$6"_"$7"_"$8"_"$9}\' %s 1<>%s' % (intersection.fn, intersection.fn)
        # call(sort_cmd1, shell=True)
        tempfile1 = tempfile.mktemp()
        sort_cmd2 = 'awk -F \'\t\' \'{print $1"\t"$2"\t"$3"\t"$4"\t"$9"\t"$5"_"$6"_"$7"_"$8"_"$9}\' %s > %s' % (intersection.fn, tempfile1)
        call(sort_cmd2, shell=True)
        intersection = BedTool(tempfile1)
        annots = intersection.groupby(g=[1, 2, 3, 4, 5], c=6, ops='collapse')
        for entry in annots:
            regions = {}
            regions[entry[4]] = entry[5]
            results[entry.name] = Series(regions)
    df = DataFrame(results, index=cols)
    return df.T.fillna(0)
def xstream(a, b, distance, updown, out):
    """
    find all things in b that are within distance of a
    in the given direction (up or down-stream)
    """
    direction = dict(u="l", d="r")[updown[0]]
    kwargs = {'sw': True, direction: distance}
    if "l" in kwargs:
        kwargs["r"] = 0
    else:
        kwargs["l"] = 0
    a = BedTool(a).saveas()
    kwargs['stream'] = True
    c = a.window(b, **kwargs)
    afields = a.field_count()

    seen = collections.defaultdict(set)
    for feat in c:
        key = "\t".join(feat[:afields])
        # keep track of all the feature names that overlap this one
        seen[key].update((feat[afields + 3],))

    # the entries that did appear in the window
    for row in seen:
        out.write(row + "\t" + ",".join(sorted(seen[row])) + "\n")

    # write the entries that did not appear in the window'ed Bed
    for row in a:
        key = "\t".join(row[:afields])
        if key in seen:
            continue
        out.write(str(row) + "\t.\n")
    out.flush()
    assert len(BedTool(out.name)) == len(a)
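# Hedged usage sketch (not part of the original script; file names are hypothetical):
# xstream appends, to every record of "a", the names of "b" features found within the
# given distance in one direction. "b" is assumed to have a name column (field 4).
if __name__ == "__main__":
    with open("peaks.upstream.bed", "w") as out:
        xstream("peaks.bed", "genes.bed", distance=5000, updown="up", out=out)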
def sequence_from_bedfile(fastafile, features=None, bedfile=None, pad5=0, pad3=0):
    """Fasta sequences from a set of genomic features in a bed file.

    Args:
        fastafile: fasta file with genomic sequence
        features: dataframe of features/coords with bed file column names
        bedfile: optionally provide a bed file instead
        pad5, pad3: flanking sequence at the 5' or 3' ends
    Returns:
        a pandas dataframe with name, sequence and coord columns
    """
    from pybedtools import BedTool
    if bedfile is not None:
        features = utils.bed_to_dataframe(bedfile)
    new = []
    for n, r in features.iterrows():
        if r.strand == '+':
            coords = (r.chr, r.chromStart - pad5, r.chromEnd + pad3)
            seq = str(BedTool.seq(coords, fastafile))
        else:
            # reverse strand
            coords = (r.chr, r.chromStart - pad3, r.chromEnd + pad5)
            seq = str(BedTool.seq(coords, fastafile))
            seq = HTSeq.Sequence(seq).get_reverse_complement()
        # print n, coords, r['name']
        new.append([r['name'], str(seq), coords])
    new = pd.DataFrame(new, columns=['name', 'seq', 'coords'])
    return new
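# Hedged usage sketch (file names are hypothetical): pull padded sequences for the
# features in a BED file and write them to CSV.
if __name__ == '__main__':
    seqs = sequence_from_bedfile('genome.fa', bedfile='features.bed', pad5=10, pad3=10)
    seqs.to_csv('feature_sequences.csv', index=False)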
def calculate_ovl(nbedfile, obedfile, opts, scoresfile):
    nbedtool = BedTool(nbedfile)
    obedtool = BedTool(obedfile)
    ab = nbedtool.intersect(obedtool, wao=True, f=opts.f, r=opts.r, s=opts.s)
    cmd = """cut -f4,5,10,13 | awk -F $'\t' 'BEGIN { OFS = FS } ($3 != "."){ print $1,$3,$2,$4; }'"""
    sh(cmd, infile=ab.fn, outfile=scoresfile)
def segmentations(vf, af):
    v = BedTool(vf)
    feats = BedTool(af)
    results = {}
    intersection = v.intersect(feats, wb=True)
    if len(intersection) > 0:
        sort_cmd1 = 'awk -F \'\t\' \'{print $1"\t"$2"\t"$3"\t"$4"\t"$8"\t"$5"_"$6"_"$7"_"$8"_"$9}\' %s 1<>%s' % (intersection.fn, intersection.fn)
        call(sort_cmd1, shell=True)
        annots = intersection.groupby(g=[1, 2, 3, 4, 5], c=6, ops='collapse')
        for entry in annots:
            regions = {}
            regions[entry[4]] = entry[5]
            results[entry.name] = Series(regions)
    names = {
        'CTCF': 'CTCF_REG',
        'E': 'ENH',
        'PF': 'TSS_FLANK',
        'R': 'REP',
        'T': 'TRAN',
        'TSS': 'TSS',
        'WE': 'WEAK_ENH'
    }
    return DataFrame(results, index=names.keys()).T.rename(columns=names)
def generate_bed_file_annotations(bed_directory, output_directory, loci):
    """
    Generates the annotation file for every bed file in the bed_directory folder.
    """
    # Loop over the bed files in the bed directory.
    bed_file_list = glob.glob(os.path.join(bed_directory, "*.bed"))
    logging.info("Start to generate BED file annotations")
    logging.info("Writing annotation to: {0}/".format(output_directory))
    for locus in loci:
        zscore = os.path.join(output_directory, locus)
        bed_lines, rsids = _bed_from_zscore(zscore)
        # write the temporary BED and close it before handing it to BedTool
        with open("tmp.bed", "w") as tmp_bed:
            tmp_bed.writelines(bed_lines)
        snps = BedTool("tmp.bed")
        no_snps = _get_line_number(zscore)
        a_matrix = AnnotateLociMatrix(len(bed_file_list), no_snps)
        logging.info("Annotating locus: {0}, using VCF file {1}".format(locus, zscore))
        for beds in bed_file_list:
            test_annotation = BedTool(beds)
            inter = snps.intersect(test_annotation)
            idxs = []
            for inte in inter:
                idxs.append(rsids.index(inte.name))
            zeroes = np.zeros(len(rsids))
            for idx in idxs:
                zeroes[idx] = 1
            a_matrix.add_annotation(zeroes, beds)
        annotations_file = os.path.join(output_directory, locus + ".annotations")
        logging.info("Writing annotation matrix to: {0}".format(annotations_file))
        a_matrix.write_annotations(annotations_file)
    os.remove("tmp.bed")
def calc_origin_bkgd_freqs(bedtool, strand, fasta_filename, verbose):
    # add strand to bedtool
    if strand == 'pos':
        strand_char = '+'
    elif strand == 'neg':
        strand_char = '-'
    intervals = []
    for row in bedtool:
        # input is BED6, output needs BED6
        row.strand = strand_char
        intervals.append(row)
    stranded_bedtool = BedTool(intervals)
    fastatool = stranded_bedtool.sequence(fi=fasta_filename, s=True)
    kwargs = {'region_size_min': 1,
              'region_size_max': 1,
              'ignore_chroms': [],
              'only_chroms': [],
              'verbose': verbose}
    if verbose:
        print >>sys.stderr, ">> calculating background freqs ..."
    result = calc_bkgd_counts(fastatool.seqfn, **kwargs)
    return result
def gene_regions(vf, af):
    v = BedTool(vf)
    feats = BedTool(af)
    # first establish all the columns in the annotation file
    cols = set(f[4] for f in feats)
    results = {}
    intersection = v.intersect(feats, wb=True)
    if len(intersection) > 0:
        annots = intersection.groupby(g=[1, 2, 3, 4], c=9, ops='collapse')
        for entry in annots:
            regions = {}
            for region in entry[4].split(','):
                if region in regions:
                    regions[region] += 1
                else:
                    regions[region] = 1
            results[entry.name] = Series(regions)
    df = DataFrame(results, index=cols)
    return df.T.fillna(0)
def main():
    p = argparse.ArgumentParser(description=__doc__,
                                formatter_class=argparse.RawDescriptionHelpFormatter)
    p.add_argument('bed', help='bed with miRNA as name')
    p.add_argument('--reference-beds', dest='reference', nargs='+',
                   help='reference beds for each feature to annotate')
    p.add_argument('--names', nargs='+',
                   help='names corresponding to reference files')
    args = p.parse_args()
    if not args.names and not args.reference:
        sys.exit(p.print_help())

    bed = BedTool(args.bed)
    # create the reference beds
    reference = {}
    for refname, refbed in izip(args.names, args.reference):
        reference[refname] = BedTool(refbed)
    for refname in args.names:
        # intersect the mirna bed with the reference annotations
        for b in bed.intersect(reference[refname], s=True, stream=True):
            # Cytoscape formatting
            fields = (b.name, "=", refname)
            print " ".join(map(str, fields))
def getNegativeDatasetFASTA(config):
    try:
        coordinates = BedTool(config['negativesBedFile'])
        genome = BedTool(config['maize_genome_filepath'])
        dataset = coordinates.sequence(fi=genome, fo=config['negative_dataset_output'])
    except ValueError:
        print 'getNegativeDatasetFASTA; File ', config['maize_genome_filepath'], ' not found'
def filter_bed(bedfile, snp_list, outfile=sys.stdout):
    """Filter a bedfile to only include snps in snp_list, print to outfile.

    :bedfile:  A bed file of all the SNPs, can be gzipped.
    :snp_list: List/tuple/set/frozenset of snp names.
    :outfile:  Something .bed or .bed.gz, default STDOUT.
    :returns:  0 on success, -1 on failure
    """
    try:
        from pybedtools import BedTool
    except ImportError:
        logme.log('pybedtools is not installed.\n' +
                  'Please install and try again. You can get it from here:\n' +
                  'https://github.com/daler/pybedtools',
                  level='error')
        return -1
    if not isinstance(snp_list, (tuple, list, set, frozenset)):
        raise Exception('snp_list must be tuple/list/set/frozenset ' +
                        'it is: {}'.format(type(snp_list)))
    bed = BedTool(bedfile)
    filtered = bed.filter(lambda a: a.name in snp_list)
    with open_zipped(outfile, 'w') as fout:
        fout.write(str(filtered))
    return 0
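# Hedged usage sketch (not part of the original module; file and SNP names are hypothetical):
# keep only a handful of SNPs from a (possibly gzipped) BED and write them to a new file.
if __name__ == '__main__':
    keep = frozenset(['rs123', 'rs456', 'rs789'])
    filter_bed('all_snps.bed.gz', keep, outfile='selected_snps.bed')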
def get_coverage(bed_prefix, directory, file_prefix, bam): """ Coverage at all positions is calculated. This is then used for coverage analysis and to determine read depth at any false negative sites :param bed_prefix: all regions in the bed files submitted are in a file generated during intersections :param directory: location of patient results :param file_prefix: prefix used for all files in pipeline i.e. worklist-patient :return out: filename for coverage stats """ #TODO change BAM path so filename is not required print 'Generating coverage stats.' whole_bed = '/results/Analysis/MiSeq/MasterBED/GIAB/' + bed_prefix + '.whole.bed' out = directory + '/giab_results/whole_bed_coverage.txt' command = '/results/Pipeline/program/sambamba/build/sambamba depth base --min-coverage=0 -q29 -m -L ' + whole_bed + \ ' ' + bam + ' > ' + out + '.tmp' try: subprocess.check_call(command, shell=True) except subprocess.CalledProcessError as e: print 'Error executing command:' + str(e.returncode) exit(1) print 'Sambamba complete.' #issue with sambamba that leaves out regions that have 0 coverage - intersect regions to find missing and add # them to the file at coverage 0 temp_bed = out.replace('.txt', '.bed.tmp') command = 'awk \'{print($1"\\t"$2"\\t"$2+1"\\t"$3)}\' ' + out + '.tmp | grep -v "COV" > ' + temp_bed print command try: subprocess.check_call(command, shell=True) print 'BED coordinates extracted.' except subprocess.CalledProcessError as e: print 'Error executing command:' + str(e.returncode) exit(1) coverage_bed = BedTool(temp_bed) print 'BED tool created' whole_bedtool = BedTool(whole_bed) print 'Intersecting' missing_regions = whole_bedtool.intersect(coverage_bed, v=True) missing_file = directory + '/giab_results/regions_missing' missing_regions.moveto(missing_file) print 'Generating file' sample_split = file_prefix.split('-') sample = sample_split[1] + '-' + sample_split[2] command = '''while read i; do start=`echo "$i"|cut -f2`; end=`echo "$i"|cut -f3`; chr=`echo "$i"|cut -f1`; end_true=`echo "${end} - 1" | bc`; for j in $(seq $start $end_true); do new_end=`echo -e "${j} + 1" | bc`; echo -e "$chr\\t${j}\\t0\\t0\\t0\\t0\\t0\\t0\\t0\\t''' + sample + '''";done;done < ''' + missing_file + '> ' + directory + '/to_add' print command try: subprocess.check_call(command, shell=True) except subprocess.CalledProcessError as e: print 'Error executing command:' + str(e.returncode) exit(1) command = 'cat ' + out + '.tmp ' + directory + '/to_add > ' + out try: subprocess.check_call(command, shell=True) except subprocess.CalledProcessError as e: print 'Error executing command:' + str(e.returncode) exit(1) print 'fix complete.' return out
def main(): p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) p.add_argument('peaks', help='peaks bed') p.add_argument('exons', help='refseq exons from UCSC') p.add_argument('gtf', help='refseq gtf with feature of interest') p.add_argument('feature', help='feature of interest in the gtf') p.add_argument('-v', '--verbose', action="store_true", help='maximum verbosity') args = p.parse_args() if args.verbose: sys.stderr.write(">> building exon library...\n") exon_lib = make_exon_lib(args.exons) peaks = BedTool(args.peaks) exons = BedTool(args.exons) full_ref = BedTool(args.gtf) if args.verbose: sys.stderr.write(">> filtering for feature...\n") filtered_ref = full_ref.filter(lambda gtf: gtf[2] == args.feature) if args.verbose: sys.stderr.write(">> selecting exonic peaks...\n") exonic_peaks = peaks.intersect(exons, wo=True) if args.verbose: sys.stderr.write(">> calculating distance fractions...\n") # D for distance (returns negative if upstream) for peak in exonic_peaks.closest(filtered_ref, D="a"): try: p = ComplexLine(peak) corrected_distance = 0.0 total_exon_length = 0.0 # parse gtf attrs gene_id = p.gtfattrs.split(';')[0].rstrip('"').lstrip('gene_id "') # looking downstream wrt peak if p.gtfdistance > 0: # exon with peak corrected_distance = p.exonstop - p.peakstop for exon in exon_lib[p.exoninfo.name]: # add downstream exon lengths if exon > p.exoninfo.number: corrected_distance += exon_lib[p.exoninfo.name][exon] # looking upstream wrt peak else: # exon with peak corrected_distance = p.peakstart - p.exonstart for exon in exon_lib[p.exoninfo.name]: # add upstream exon lengths if exon < p.exoninfo.number: corrected_distance += exon_lib[p.exoninfo.name][exon] for exon in exon_lib[p.exoninfo.name]: total_exon_length += exon_lib[p.exoninfo.name][exon] # fraction print (corrected_distance / total_exon_length) except ValueError: continue
def getPositiveDatasetFASTA(config):
    if not os.path.isfile(config['positive_dataset_output']):
        try:
            coordinates = BedTool(config['bed_file_post'])
            genome = BedTool(config['maize_genome_filepath'])
            dataset = coordinates.sequence(fi=genome, fo=config['positive_dataset_output'])
        except ValueError:
            print 'getPositiveDatasetFASTA; File ', config['maize_genome_filepath'], ' not found'
def cpg_islands(vf, af):
    print "inside cpg_islands"
    v = BedTool(vf)
    cpg = BedTool(af)
    overlap = v.intersect(cpg, wb=True)
    results = dict([(r.name, 1) for r in overlap])
    print "exit cpg_islands"
    return Series(results, name="cpg_island")
def motifs(vf, af):
    print "inside motif"
    v = BedTool(vf)
    cpg = BedTool(af)
    overlap = v.intersect(cpg, wb=True)
    results = dict([(r.name, 1) for r in overlap])
    print "exit motif"
    return Series(results, name="pwm")
def feat_dist(vf, af, name):
    print "inside feat_dist"
    v = BedTool(vf)
    a = BedTool(af)
    closest = v.closest(a, D="b")
    results = dict([(r.name, int(r[len(r.fields) - 1])) for r in closest])
    print "exiting feat_dist"
    return Series(results, name=name)
def gc_content(vf, fa, flank=50):
    print "inside gc_content"
    v = BedTool(vf)
    flanks = v.slop(g=pybedtools.chromsizes('hg19'), b=flank)
    nc = flanks.nucleotide_content(fi=fa)
    results = dict([(r.name, float(r[5])) for r in nc])
    print "exiting gc_content"
    return Series(results, name="GC")
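# Hedged sketch (not from the original pipeline): each annotation helper above returns a
# pandas Series indexed by variant name, so they can be combined column-wise into one
# annotation table. The argument names and files below are hypothetical.
def build_annotation_table(vf, cpg_bed, motif_bed, tss_bed, fasta):
    import pandas as pd
    columns = [
        cpg_islands(vf, cpg_bed),
        motifs(vf, motif_bed),
        feat_dist(vf, tss_bed, name='tss_dist'),
        gc_content(vf, fasta),
    ]
    # variants missing from one annotation get 0 for that column
    return pd.concat(columns, axis=1).fillna(0)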
def getCDSs(bedfilename, reffilename, strand):
    """Return an iterator of coding sequences."""
    bed = BedTool(bedfilename)
    bed = bed.filter(lambda x: x.strand == strand)
    fasta = reffilename
    bed = bed.sequence(fi=fasta, s=True)
    return SeqIO.parse(bed.seqfn, "fasta")
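# Hedged usage sketch (file names are hypothetical): iterate over plus-strand coding
# sequences and report their lengths.
if __name__ == "__main__":
    for record in getCDSs("cds.bed", "genome.fa", "+"):
        print("%s\t%d" % (record.id, len(record.seq)))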
def build_vcf_intervals(reads, vcf_recs, bam_handle):
    """
    Find if any of these reads match a known SUN/indel by simple bedtools intersections.
    """
    vcf_bed_recs = [ChromosomeInterval(x.CHROM, x.start, x.end, None) for x in vcf_recs]
    vcf_bed = BedTool(vcf_bed_recs)
    reads_bed_recs = [(bam_handle.getrname(x.tid), x.positions[0], x.positions[-1])
                      for x in reads if len(x.positions) > 2]
    reads_bed = BedTool(reads_bed_recs)
    return list(vcf_bed.intersect(reads_bed))
def calc_signals(bam_filename, region_bed_filename, signal_colnum, region_type, normalize, verbose): ''' generator to calculate signals from BED regions mapped onto positive and negative strand data.''' region_bedtool = BedTool(region_bed_filename) # bedtools.map operations operations = ('sum','count') signal_type = 'raw' if normalize: signal_type = 'norm' for signal_strand in STRANDS: signal_bedtool = load_coverage(bam_filename, strand=signal_strand, verbose=verbose) for oper in operations: map_bedtool = region_bedtool.map(signal_bedtool, o=oper, c=signal_colnum, null=0) for region_row, signal_row in izip(region_bedtool, map_bedtool): try: region_name = region_row[3] region_score = region_row[4] region_strand = region_row[5] except IndexError: region_name = '%s-%s-%d-%d' % (region_type, region_row.chrom, region_row.start, region_row.end) region_score = 0 # default region_strand = 'none' if region_strand == '+': region_strand = 'pos' elif region_strand == '-': region_strand = 'neg' # last field is the calculated signal signal = float(signal_row[-1]) if normalize and signal != 0: region_size = float(region_row.end - region_row.start) signal = signal / region_size result = (region_name, region_score, 'region-'+region_strand, region_type, 'signal-'+signal_strand, oper, signal, signal_type) yield result
def calc_intersection(bedtools, verbose):
    intersect_tool = BedTool()
    if verbose:
        print >>sys.stderr, ">> generating intersection ... "
    result = intersect_tool.multi_intersect(i=[bt.fn for bt in bedtools])
    return result
def check_bed(self, bed_file, stream):
    bed = BedTool(bed_file, from_string=stream)
    try:
        sorted_bed = bed.sort()
        merged_bed = sorted_bed.merge(c="4", o="distinct")
        return merged_bed
    except Exception as exception:
        print("ERROR: " + str(exception))
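# Hedged usage sketch (not part of the original class): check_bed is written as a method
# but does not use self, so for illustration it is called directly with None. The inline
# BED content below is hypothetical.
if __name__ == "__main__":
    bed_text = "chr1\t10\t100\tfeatA\nchr1\t50\t150\tfeatB\n"
    merged = check_bed(None, bed_text, stream=True)
    print(str(merged))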
def folding_analysis(bedfilename, fastafilename, verbose):
    bedtool = BedTool(bedfilename)
    for region in bedtool:
        # fetch the sequence for this region from the fasta file
        region_seq = BedTool.seq((region.chrom, region.start, region.end), fastafilename)
        struct, mfe = RNA.fold(region_seq)
        for pos, nuc in enumerate(region_seq):
            struct_char = struct[pos]
def filterReadsByLength(inbam, minlength, maxlength):
    '''
    Takes a bam file and selects intervals that are within the defined lengths.
    Input: bam file and min/max lengths
    Output: BedTool
    '''
    # convert bam to bed
    intervals = BedTool(inbam).bam_to_bed()
    filt = intervals.filter(lambda x: len(x) > minlength and len(x) < maxlength).saveas()
    # print filt
    return filt
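# Hedged usage sketch (the BAM path is hypothetical): keep read intervals between
# 100 and 300 bp and write them out as BED.
if __name__ == "__main__":
    kept = filterReadsByLength("sample.bam", minlength=100, maxlength=300)
    kept.saveas("sample.filtered.bed")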
def transcripts_list_to_bed6(self, file_name=None, save_in_file=False):
    bed6_trans = [trans_to_b6.get_bed6() for trans_to_b6 in self.transcripts_list()]
    # some filtering system should go here
    bed6_trans = BedTool('\n'.join(bed6_trans), from_string=True).sort()
    if not save_in_file:
        return bed6_trans
    else:
        if file_name:
            return bed6_trans.saveas(fn=file_name,
                                     trackline="track name='Transcripts {}' color=128,0,0".format(file_name.split('/')[-1]))
        else:
            raise IOError('\nfile_name needs a name or a complete file path with a name.\n')
def repeats(vf, af):
    v = BedTool(vf)
    feats = BedTool(af)
    intersection = v.intersect(feats, wb=True)
    results = {}
    if len(intersection) > 0:
        annots = intersection.groupby(g=[1, 2, 3, 4], c=8, ops='collapse')
        for entry in annots:
            types = entry[4].split(',')
            results[entry.name] = len(types)
    return Series(results, name='repeat')
def main(): p = optparse.OptionParser(__doc__) p.add_option("-a", dest="a", help="file to annotate. first 3 columns are " "chrom start stop") p.add_option("-b", dest="b", help="superbed to annotate with") p.add_option("--header", dest="header", help="a file has a header", action="store_true", default=False) p.add_option("-N","--no-near", dest="no_near", help="dont find the nearest gene, just the up/downstream", action="store_true", default=False) p.add_option("--upstream", dest="upstream", type=int, default=None, help="distance upstream of [a] to look for [b]") p.add_option("--downstream", dest="downstream", type=int, default=None, help="distance downstream of [a] to look for [b]") p.add_option("--transcripts", dest="transcripts", action="store_true", default=False, help="use transcript names in output as well as" " gene name. default is just gene name") opts, args = p.parse_args() if (opts.a is None or opts.b is None): sys.exit(not p.print_help()) b = opts.b if not opts.transcripts: b = remove_transcripts(b) if not (opts.upstream or opts.downstream): superanno(opts.a, b, opts.header, opts.no_near, sys.stdout) else: out = open(BedTool._tmp(), "w") superanno(opts.a, b, opts.header, opts.no_near, out) out.close() new_header = [] out_fh = open(out.name) new_header = [out_fh.readline().rstrip("\r\n")] if opts.header else [] for xdir in ("upstream", "downstream"): dist = getattr(opts, xdir) if dist is None: continue new_out = open(BedTool._tmp(), "w") xstream(out_fh, b, dist, xdir, new_out) new_header.append("%s_%i" % (xdir, dist)) new_out.close() out_fh = open(new_out.name) if opts.header: print "\t".join(new_header) for line in open(out_fh.name): sys.stdout.write(line)
def getFlank(): """ Change the position in tabbed file in order to include flank region. For vcf file, we need to generate a new temporary tabbed file which contain the ID, start position and stop position. If the argument --notempfile is passed, it create the file [tabbed_file_name]_out.[ext] in the directory ./result. """ lenChr = parseFa() if (args.typeA != None and ext == "gff3") or change: fileTab2 = BedTool(tabO) else: fileTab2 = BedTool(args.tabinput) if args.verbose != 0 and args.notempf: print("\n ----- Creating file '" + tabOP + "'. ----- ") if (fileTab2.file_type == "bed"): with open(lenChr.name, "r") as lenC: res = "" for feature in fileTab2: fjoin = ("__".join(str(feature).split("\t"))).replace( " ", "\\s") for line in lenC: lenghtC = re.search(feature.chrom + "\t(\d+)", line) if lenghtC: break lenC.seek(0) if feature.stop + args.flank > int(lenghtC.group( 1)): # TODO : unreadable, gotta change that stop = int(lenghtC.group(1)) else: stop = feature.stop + args.flank if feature.start - args.flank < 0: start = 0 else: start = feature.start - args.flank res += feature.chrom + "\t" + str(start) + "\t" + str( stop) + "\t" + fjoin + "\n" BedTool(res, from_string=True, deli="\t").saveas(tabOP) #fileTab2.slop(b=args.flank, g=lenChr.name ,output=tabOP, header=True) if (fileTab2.file_type == "gff"): with open(lenChr.name, "r") as lenC: res = "" countCds = 0 for feature in fileTab2: taline = str(feature).split("\t") if taline[2] == "CDS": countCds += 1 taline[-1] = taline[-1][0:-1] + ";Note=" + str(countCds) else: countCds = 0 fjoin = ("__".join(taline)).replace(" ", "\\s") for line in lenC: lenghtC = re.search(feature.chrom + "\t(\d+)", line) if lenghtC: break lenC.seek(0) if feature.stop + args.flank > int(lenghtC.group( 1)): # TODO : unreadable, gotta change that stop = int(lenghtC.group(1)) else: stop = feature.stop + args.flank if feature.start - args.flank < 0: start = 0 else: start = feature.start + 1 - args.flank res += feature.chrom + "\t" + str(start) + "\t" + str( stop) + "\t" + fjoin + "\n" BedTool(res, from_string=True, deli="\t").saveas(tabOP) #fileTab2.slop(b=args.flank, g=lenChr.name ,output=tabOP, header=True) elif (fileTab2.file_type == "vcf"): with open(lenChr.name, "r") as lenC: res = "" for feature in fileTab2: fjoin = ("__".join(str(feature).split("\t"))).replace( " ", "\\s") for line in lenC: lenghtC = re.search(feature.chrom + "\t(\d+)", line) if lenghtC: break lenC.seek(0) if feature.stop + args.flank - 1 + (len(feature[3]) - 1) > int( lenghtC.group( 1)): # TODO : unreadable, gotta change that stop = int(lenghtC.group(1)) else: stop = feature.stop + args.flank - 1 + (len(feature[3]) - 1) if feature.start - args.flank - 1 < 0: start = 0 else: start = feature.start - args.flank - 1 res += feature.chrom + "\t" + str(start) + "\t" + str( stop) + "\t" + fjoin + "\n" BedTool(res, from_string=True, deli="\t").saveas(tabOP) return (tabOP)
def make_features_multiTask(positive_windows, y_positive, nonnegative_regions_bed, bigwig_files, bigwig_names, genome, epochs, valid_chroms, test_chroms): chroms, chroms_sizes, genome_bed = get_genome_bed() train_chroms = chroms for chrom in valid_chroms + test_chroms: train_chroms.remove(chrom) genome_bed_train, genome_bed_valid, genome_bed_test = \ [subset_chroms(chroms_set, genome_bed) for chroms_set in (train_chroms, valid_chroms, test_chroms)] positive_windows_train = [] positive_windows_valid = [] positive_windows_test = [] positive_data_train = [] positive_data_valid = [] positive_data_test = [] import pdb print('Splitting positive windows into training, validation, and testing sets') for positive_window, target_array in itertools.izip(positive_windows, y_positive): if len(positive_window.chrom) > 8: pdb.set_trace() chrom = positive_window.chrom start = int(positive_window.start) stop = int(positive_window.stop) if chrom in test_chroms: positive_windows_test.append(positive_window) positive_data_test.append((chrom, start, stop, shift_size, bigwig_files, [], target_array)) elif chrom in valid_chroms: positive_windows_valid.append(positive_window) positive_data_valid.append((chrom, start, stop, shift_size, bigwig_files, [], target_array)) else: positive_windows_train.append(positive_window) positive_data_train.append((chrom, start, stop, shift_size, bigwig_files, [], target_array)) positive_windows_train = BedTool(positive_windows_train) positive_windows_valid = BedTool(positive_windows_valid) positive_windows_test = BedTool(positive_windows_test) import pdb print('Getting negative training examples') negative_windows_train = BedTool.cat(*(epochs*[positive_windows]), postmerge=False) #negative_windows_train = BedTool.cat(*(10*[positive_windows]), postmerge=False) #pdb.set_trace() negative_windows_train = negative_windows_train.shuffle(g=genome_sizes_file, incl=genome_bed_train.fn, excl=nonnegative_regions_bed.fn, noOverlapping=False, seed=np.random.randint(-214783648, 2147483647)) #seed=np.random.randint(-21478364, 21474836)) print('Getting negative validation examples') negative_windows_valid = positive_windows_valid.shuffle(g=genome_sizes_file, incl=genome_bed_valid.fn, excl=nonnegative_regions_bed.fn, noOverlapping=False, seed=np.random.randint(-214783648, 2147483647)) #seed=np.random.randint(-21478364, 21474836)) print('Getting negative testing examples') negative_windows_test = positive_windows_test.shuffle(g=genome_sizes_file, incl=genome_bed_test.fn, excl=nonnegative_regions_bed.fn, noOverlapping=False, seed=np.random.randint(-214783648, 2147483647)) #seed=np.random.randint(-21478364, 21474836)) # Train print('Extracting data from negative training BEDs') negative_targets = np.zeros(y_positive.shape[1]) negative_data_train = [(window.chrom, window.start, window.stop, shift_size, bigwig_files, [], negative_targets) for window in negative_windows_train] # Validation print('Extracting data from negative validation BEDs') negative_data_valid = [(window.chrom, window.start, window.stop, shift_size, bigwig_files, [], negative_targets) for window in negative_windows_valid] # Test print('Extracting data from negative testing BEDs') negative_data_test = [(window.chrom, window.start, window.stop, shift_size, bigwig_files, [], negative_targets) for window in negative_windows_test] num_positive_train_windows = len(positive_data_train) data_valid = negative_data_valid + positive_data_valid data_test = negative_data_test + positive_data_test print('Shuffling training data') data_train = [] 
for i in xrange(epochs): epoch_data = [] epoch_data.extend(positive_data_train) epoch_data.extend(negative_data_train[i*num_positive_train_windows:(i+1)*num_positive_train_windows]) np.random.shuffle(epoch_data) data_train.extend(epoch_data) print('Generating data iterators') bigwig_rc_order = get_bigwig_rc_order(bigwig_names) datagen_train = DataIterator(data_train, genome, batch_size, L, bigwig_rc_order) datagen_valid = DataIterator(data_valid, genome, batch_size, L, bigwig_rc_order) datagen_test = DataIterator(data_test, genome, batch_size, L, bigwig_rc_order) print(len(datagen_train), 'training samples') print(len(datagen_valid), 'validation samples') print(len(datagen_test), 'test samples') return datagen_train, datagen_valid, datagen_test, data_valid,data_test
def overlap(bed_file, gtf_file, result_file):
    variants_bed = BedTool(bed_file)
    gtf = BedTool(gtf_file)
    variants_bed.intersect(gtf, wb=True).moveto(result_file)
def radloci(bedgz): ''' Use pybedtools to get estimated callable RAD loci positions. Return number of loci. ''' bedcov = BedTool(bedgz) filtcov = [] locidic = {} chrsize = {} chrset = set() for line in bedcov: mychr = line[0] chrset.add(mychr) # Locus must have minimum coverage of 4 if int(line[3]) > 3: bedrow = (line[0], int(line[1]), int(line[2])) filtcov.append(bedrow) endpoint = int(line[2]) if mychr in chrsize: endpoints = chrsize[mychr] endpoints.append(endpoint) chrsize[mychr] = endpoints else: chrsize[mychr] = [endpoint] # Replace list of endpoints with max chr position for key, value in chrsize.items(): maxpos = max(value) chrsize[key] = maxpos # The chrsize dict now contains chr lengths filtbed = BedTool(filtcov) # Merge regions to get loci # Distance of 100 should merge # properly paired reads with insert size < 300 loci = filtbed.merge(d=100) # Counter list for all loci locicnt = 0 # Counter list for chr loci chrcnts = [] # Dic to hold chr loci stats for l in loci: locicnt = locicnt + 1 scaff = l[0] start = int(l[1]) stop = int(l[2]) locmid = int((start + stop) / 2) if scaff in locidic: scaffcnt = locidic[scaff][0] + 1 locpos = locidic[scaff][1] locpos.append(locmid) locidic[scaff] = [scaffcnt, locpos] else: locidic[scaff] = [1, [locmid]] for key, value in locidic.items(): # Append loci per chr to list # Ignore unplaced and scaff if 'npla' not in key and 'scaff' not in key: chrcnts.append(value[0]) # Get chr len from dict chrlen = chrsize[key] # Calculate max bin size with chr len toprange = math.ceil(chrlen / 100000.0) * 100000.0 # Number of 100kb bins binnum = int(toprange / 100000.0) # Calculate histogram hist, bins = np.histogram(value[1], bins=binnum, range=(0, toprange)) hist = np.ndarray.tolist(hist) mhist = round(statistics.mean(hist), 4) sdhist = round(statistics.stdev(hist), 4) lchrdic = {} lchrdic['Loci'] = value[0] lchrdic['Mean loci per 100kb'] = mhist lchrdic['StDev loci per 100kb'] = sdhist locidic[key] = lchrdic try: lchrdic = {} lchrdic['Loci'] = locicnt lchrdic['Mean loci per chr'] = round(statistics.mean(chrcnts), 4) lchrdic['StDev loci per chr'] = round(statistics.stdev(chrcnts), 4) locidic['total'] = lchrdic except: lchrdic['Loci'] = 'NA' lchrdic['Mean loci per chr'] = 'NA' lchrdic['StDev loci per chr'] = 'NA' locidic['total'] = lchrdic for chrom in chrset: if chrom not in locidic: emptydic = {} emptydic['Loci'] = 0 emptydic['Mean loci per 100kb'] = 0 emptydic['StDev loci per 100kb'] = 0 locidic[chrom] = emptydic return locidic
    if len(lstGenes) == 0:
        exit("[Nk_makeBED] No gene found in genes list file `" + pathGenes + "`")
    # Search genes id
    lstMissingGene = lstGenes
    for gene in gffutilsDB.features_of_type("gene"):
        if gene.attributes["Name"][0] in lstGenes:
            setGene.add(gene)
            lstMissingGene.remove(gene.attributes["Name"][0])
    # Error if any gene not found
    if len(lstMissingGene) > 0:
        exit("[Nk_makeBED] Unable to find following gene(s): `" + ",".join(lstMissingGene) + "`")
# 3rd case: bed & no genes => search intersected gene identifier
else:
    bed = BedTool(pathBed)
    genes = BedTool(pathGff)
    # Search gff exons intersection
    for intersect_elem in genes + bed:
        if intersect_elem.fields[2] == "exon":
            exon = gffutilsDB[intersect_elem.attrs["ID"]]
            # retrieve corresponding transcript
            for rna in gffutilsDB.parents(exon, order_by='start'):
                for gene in gffutilsDB.parents(rna, featuretype='gene', order_by='start'):
                    setGene.add(gene)

# delete created temp file
cleanup(remove_all=True)

#***** CONSTRUCT ANNOTATED REGIONS BED FILE *****#
for s in range(0, len(genesnps), 1): if genesnps[s] in snptable: tempsnp = genesnps[s] ac_sum = ac_sum + int(snptable[tempsnp][1]) hom_sum = hom_sum + int(snptable[tempsnp][2]) return [ac_sum, hom_sum] #Make list of all SNPs across all genes present in snpfile allsnplist = makesnplist(options.snpfilename) #Make a hashtable with keys as each SNP, and stores a list of indices of carriers for that SNP count_table = {} #Open vcf file vcffile = BedTool(options.vcffilename) if options.bedfilename is not None: bed = BedTool(options.bedfilename) vcffile_temp = vcffile.intersect(bed) else: if chrformat == "chr": dummy_bed = BedTool('chr1000 100000000 100000001', from_string=True) else: dummy_bed = BedTool('1000 100000000 100000001', from_string=True) vcffile_temp = vcffile.subtract(dummy_bed) for line_vcf1 in open(vcffile_temp.fn): line_vcf = line_vcf1.rstrip().split('\t') if line_vcf[0][0] != "#" and ("," not in line_vcf[4]): if not (options.passfilter and line_vcf[6] != "PASS"): if options.snpformat == "VCFID":
def overlap_pe(variants_bedpe_file, gtf_file, result_file):
    variants_bed = BedTool(variants_bedpe_file)
    gtf = BedTool(gtf_file)
    olaps = variants_bed.pair_to_bed(gtf, stream=True).moveto(result_file)
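# Hedged usage sketch (file names are hypothetical): annotate variant intervals and
# paired breakpoints against the same GTF using the two helpers above.
if __name__ == "__main__":
    overlap("variants.bed", "annotation.gtf", "variants_vs_genes.txt")
    overlap_pe("variants.bedpe", "annotation.gtf", "breakpoints_vs_genes.txt")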
def get_muts_tracks_info(muts_input_file, tracks_dir, muts_dir_out, split_muts_file_by_chr=True): muts_tracks_files = [] tracks_files = [x for x in os.listdir(tracks_dir) if x.endswith('.bed')] if split_muts_file_by_chr: muts_files = [] chr_ext = "." + tracks_files[0].split('.')[-1] if os.path.exists(muts_dir_out): muts_tracks_files = [ muts_dir_out + '/' + x for x in os.listdir(muts_dir_out) if x.endswith('_overlapping_tracks.bed10') ] if len(muts_tracks_files) > 0: return muts_tracks_files else: os.mkdir(muts_dir_out) muts_files = [ muts_dir_out + '/' + x for x in os.listdir(muts_dir_out) if x.endswith(chr_ext) ] if len(muts_files) <= 0: os.system( """awk '{{print $0 >> "{muts_dir}/"$1"{chr_ext}"}}' {muts_file} """ .format(muts_dir=muts_dir_out, chr_ext=chr_ext, muts_file=muts_input_file)) muts_files = [ muts_dir_out + '/' + x for x in os.listdir(muts_dir_out) if x.endswith(chr_ext) ] print('muts_files: ', muts_files) print('tracks_files: ', tracks_files) for muts_file in muts_files: if muts_file.split('/')[-1] in tracks_files: muts_tracks_file = muts_file + "_overlapping_tracks.bed10" if not os.path.exists(muts_tracks_file): print("Intersecting and Grouping: ", muts_tracks_file) BedTool(muts_file).intersect( BedTool(tracks_dir + '/' + tracks_files[ tracks_files.index(muts_file.split('/')[-1])]), wo=True, loj=True).groupby(g=[1, 2, 3, 4, 5, 6, 7, 8, 9], c=13, o=['collapse' ]).saveas(muts_tracks_file) muts_tracks_files.append(muts_tracks_file) else: for tracks_file in tracks_files: if not os.path.exists(muts_dir_out): os.mkdir(muts_dir_out) muts_tracks_file = muts_dir_out + '/' + tracks_file + "_overlapping_tracks.bed10" if not os.path.exists(muts_tracks_file): print("Intersecting and Grouping: ", muts_tracks_file) BedTool(muts_input_file).intersect( BedTool(tracks_dir + '/' + tracks_file), wo=True, loj=True).groupby(g=[1, 2, 3, 4, 5, 6, 7, 8, 9], c=13, o=['collapse']).saveas(muts_tracks_file) muts_tracks_files.append(muts_tracks_file) print('muts_tracks_files: ', muts_tracks_files) return muts_tracks_files
def get_data(df): Popen('mkdir -p ./' + args.outfile + ".datamatrix/temp/", shell=True) bedtool = BedTool.from_dataframe(df).sort().saveas( args.outfile + '.datamatrix/bedtool_df.bed') a = nuc_cont(bedtool) if args.var_files: var_files_list = list(args.var_files) for i in range(len(var_files_list)): var = get_var_counts(bedtool, var_files_list[i]) a = a.merge(var, on='name') elif not args.var_files: pass if args.bw_files: bw_files_list = list(args.bw_files) for i in range(len(bw_files_list)): bw = get_bigwig_scores(bw_files_list[i], df) a = a.merge(bw, on='name') elif not args.con_files: pass if args.kmer_list: print() print("Starting K-mer counting") get_kmer_counts(args.kmer_list) elif not args.kmer_list: pass if args.rnafold == True: print() print("Starting RNAfold for MFE scoring") get_MFE_scores() elif args.rnafold == False: pass if args.qgrs_mapper == True: print() print("Starting QGRS Mapper for G-Quadruplex scoring") get_QGRS_scores() elif args.qgrs_mapper == False: pass if str(args.nuc_info) == 'full': z = a.drop( ['length', 'seqname', 'start', 'end', 'score', 'strand', 'seq'], 1) else: z = a.drop('seq', 1) if (args.rnafold == True) or (args.qgrs_mapper == True) or (args.kmer_list is not False): z.to_csv(args.outfile + ".datamatrix/temp/data_generic_results.csv", index=False) temp_files = glob.glob(args.outfile + ".datamatrix/temp/*.csv") z_list = [] for i in range(len(temp_files)): df = pd.read_csv(temp_files[i], index_col=0) if ((args.nuc_info == 'full') & ("data_generic_results.csv" in temp_files[i])): df = df.set_index('name') else: pass z_list.append(df) #with concurrent.futures.ProcessPoolExecutor(max_workers=1) as executor: # z = executor.submit(pd.concat, z_list, axis=1, join='outer').result() z = pd.concat(z_list, axis=1, join='outer', sort=False) z = z.reset_index().rename(columns={'index': 'name'}).fillna(0) else: pass z = filter_columns(z) return z.drop_duplicates()
sys.exit(1) if args.version: getVersion() sys.exit(1) if args.warn: warnings.filterwarnings("ignore") if args.fasta1 == None: sys.exit("ERROR : Argument --fasta1 (-f1) is missing.") if args.fasta2 == None: sys.exit("ERROR : Argument --fasta2 (-f2) is missing.") if args.tabinput == None: sys.exit("ERROR : Argument --tabinput (-ti) is missing.") global ext ext = BedTool(args.tabinput).file_type if ext == "gff": ext = "gff3" if args.directory[-1] == "/": args.directory = args.directory[:-1] try: os.mkdir(args.directory) if args.verbose != 0: print("\n ----- Creating directory '" + args.directory + "/'. -----") except: pass if args.mismatch == None: if args.percentage: warnings.warn(
def main(): """ Start the TeddyPi pipeline, loads main configuration file, collects input files and parses TE/SV caller specific configuration. This module returns filtered and integrated datasets for tpi_ortho.py. """ options = parse_args(argv[1:]) modulename = "TeddyPi" print u"TeddyPi - Transposable Element detection and discovery for Phylogenetic Inference" print u"---------------------------------------------------------------------------------\n" print u"[ {} ] Initialize configuration from {}...".format(modulename, options.config), # Load main configuration with open(options.config) as fin: config = yaml.load(fin) programs = config['programs'] print u"done." tpi_helpers.create_out_path(config['out_dir']) # Create output directory transposons = config['refte'] # Load reference TE file # 1. Filter operations for each program and species filtered_files = defaultdict(dict) for samplename in config['samples']: print u"[ {} ] Loading data for sample {}; ".format(modulename, samplename) print u"[ {} ] Config has info on these TE/SV callers: {}".format(modulename, ",".join([elem['name'] for elem in programs])) per_sample_files = (fname for fname in os.listdir(config['data_dir']) if fname.startswith(samplename) and fname.endswith( ".vcf")) # TODO avoid reloading processed files for sample_file in per_sample_files: # print "%s, " % sample_file per_sample_vcf = tpi_filter.LoadVCF(data_dir=config['data_dir'], out_dir=config['out_dir'], fname=sample_file, sname=samplename) simple_source = per_sample_vcf.vcf_source.split(" ")[0].lower() if config['programs'] == "auto" or simple_source in [elem['name'] for elem in programs]: per_sample_vcf.skip = False # flag to skip filtering per_sample_vcf.filter_variants() print u"[ {} ] Filtered variants written to: {}\n".format(modulename, per_sample_vcf.out_fname) filtered_files[samplename][simple_source] = per_sample_vcf.out_fname else: print u"[ {} ] Error: Auto-detection of TE/SV callers disabled and VCF-source {} not mentioned in " \ u"config.\nskipping...".format(modulename, simple_source) # 2. Integrate SV-deletions and convert to Ref+ TE calls # tpi_svintegration.py if 'call_operations' in config.keys(): print u"[ {} ] Call operations found in configfile".format(modulename) for op, sources in config['call_operations'].iteritems(): try: assert (set([elem['name'] for elem in programs]) >= (set(sources))) except AssertionError: print u"VCF sources for operations have not been parsed." print u"[ {} ] For operation {}, sources {} were not parsed. 
Check \' programs \' parameter in {}" \ .format(modulename, op, ",".join(sources), options.config) continue if op == "non_redundant": print u"[ {} ] Starting operation {} on sources {} over all samples,".format(modulename, op, ",".join(sources)) for sample in filtered_files.keys(): print u"[ %s ] %s " % (op, sample) sets = (BedTool(os.path.join(config['out_dir'], filtered_files[sample][src])) for src in sources) nr = nonredundant_2_sets(sets) nr_set_outfile = "{s}.{t}.nr.bed".format(s=sample, t="DEL") nr_set_outfile = os.path.join(config['out_dir'], nr_set_outfile) nr.saveas(nr_set_outfile) print u"[ {} ] non_redundant set saved to {}".format(op, nr_set_outfile) te_isect_outfile = "{s}.{t}.bed".format(s=sample, t="TE") sv_set = nr.window(transposons, w=config['ortho_merge_distance']).saveas( os.path.join(config['out_dir'], te_isect_outfile)) print u"[ {} ] TE intersected set saved to {}".format(op, os.path.join(config['out_dir'], te_isect_outfile)) te_cls_outfile = "{s}.{t}.cls.bed".format(s=sample, t="TE") sv_set = BedTool(cluster_calls(sv_set)).saveas(os.path.join(config['out_dir'], te_cls_outfile)) print u"[ {} ] clustered set saved to {}".format(op, os.path.join(config['out_dir'], te_cls_outfile)) elif op == "intersection": print u"[ {} ] Starting operation {} on sources {} over all samples,".format(modulename, op, ",".join(sources)) for sample in filtered_files.keys(): print u"[ %s ] %s " % (op, sample) sets = (BedTool(os.path.join(config['out_dir'], filtered_files[sample][src])) for src in sources) isect = sets.next().window(sets.next(), w=100, u=True).sort() isect_set_outfile = "{s}.{t}.is.vcf".format(s=sample, t="NONREF_ISEC") isect.saveas(os.path.join(config['out_dir'], isect_set_outfile)) print u"[ {} ] intersected set saved to {}".format(op, os.path.join(config['out_dir'], isect_set_outfile)) elif op == "te_intersect": print u"[ {} ] Starting operation {} on sources {} over all samples,".format(modulename, op, ",".join(sources)) for sample in filtered_files.keys(): print u"[ %s ] %s " % (op, sample) assert len(sources) == 1 src = sources[0] bt_set = tpi_helpers.make_BED_fromVCF(os.path.join(config['out_dir'], filtered_files[sample][src])) te_isect_outfile = "{s}.{t}.bed".format(s=sample, t="TE") sv_set = bt_set.window(transposons, w=50).saveas( os.path.join(config['out_dir'], te_isect_outfile)) print u"[ {} ] TE intersected set saved to {}".format(op, os.path.join(config['out_dir'], te_isect_outfile)) te_cls_outfile = "{s}.{t}.cls.bed".format(s=sample, t="TE") sv_set = BedTool(cluster_calls(sv_set)).saveas(os.path.join(config['out_dir'], te_cls_outfile)) print u"[ {} ] clustered set saved to {}".format(op, os.path.join(config['out_dir'], te_cls_outfile)) else: print u"[ {} ] Operation '{}' not known. Nothing will be done. Check the configuration file.".format( modulename, op) return 1
def main(): args = check_options(get_options()) # jellyfish par jfsize = '100M' # ?build bwa index bwaindexfile = os.path.basename(args.genome) tmpfolder = args.tmp bwatestindex = os.path.join(tmpfolder, bwaindexfile + '.sa') bwaindex = os.path.join(tmpfolder, bwaindexfile) bwabuild = True if os.path.isfile(bwatestindex): bwabuild = False if bwabuild: # build bwa index bwa.bwaindex(args.bwa, args.genome, tmpfolder) print("bwa index build finished ...") else: print("Use", bwatestindex) sampleinfor = dict() names = args.names.split(',') reads1 = args.reads1.split(',') reads2 = args.reads2.split(',') cnsfile = os.path.join(args.saved, '_'.join(names) + '_cns_probe.csv') print(cnsfile) cnsio = open(cnsfile, 'w') for i in range(len(names)): name = names[i] read1 = reads1[i] read2 = reads2[i] bamfile = os.path.join(tmpfolder, name + '.bam') bcffile = os.path.join(tmpfolder, name + '.bcf') jffile = os.path.join(tmpfolder, name + '.jf') cnsprobe = os.path.join(args.saved, name + '_probe.txt') # new add indel indelNprobe = os.path.join(args.saved, name + '_indel_probe.txt') mindepth = os.path.join(tmpfolder, name + '_mindepth.bed') if name in sampleinfor: print("error same name:", name) else: sampleinfor[name] = dict() sampleinfor[name]['read1'] = read1 sampleinfor[name]['read2'] = read2 sampleinfor[name]['bamfile'] = bamfile sampleinfor[name]['bcffile'] = bcffile sampleinfor[name]['jffile'] = jffile # sampleinfor[name]['kmerscore'] = kmerscore # # sampleinfor[name]['kmerscoreio'] = open(kmerscore, 'w') sampleinfor[name]['cnsprobe'] = cnsprobe sampleinfor[name]['cnsprobeio'] = open(cnsprobe, 'w') # new add indel sampleinfor[name]['indelNprobelist'] = list() sampleinfor[name]['indelNprobeio'] = open(indelNprobe, 'w') sampleinfor[name]['mindepth'] = mindepth # run bwa mem bwa.bwamem_paired(bwabin=args.bwa, samtoolsbin=args.samtools, reffile=bwaindex, outfile=bamfile, inputfile1=read1, inputfile2=read2, samplename=name, threadnumber=args.threads) print("bwa mem", name, 'finished') # get min depth bed file bamdepth.bamdepthtobed(bamfile=bamfile, outbed=mindepth, mindepth=args.mindepth, minlength=200) print(mindepth, 'done') # generate bcf file from bam file bcftools.bamtobcf(bcfbin=args.bcftools, reffile=bwaindex, bamfile=bamfile, outbcf=bcffile) print(bcffile, "done") # generate jf file jellyfish.makegenerator(filenames=[read1, read2], type='gz', generators='generators') jellyfish.jfgeneratorscount(jfpath=args.jellyfish, mer=args.length, output=jffile, generators='generators', threads=args.threads, size='100M') print(jffile, "done") probe = BedTool(args.probe).sort() for name in sampleinfor: nowprobe = BedTool(sampleinfor[name]['mindepth']).sort() probe = probe.intersect(nowprobe, wa=True, u=True) # cnsprobe for name in sampleinfor: bcfpool = Pool(args.threads) bcfrunerlist = list() consensusprobelist = list() for i in probe: probestr = str(i).rstrip() bcfconsensusruner = bcftools.BcfConsensusRuner( probestr=probestr, bcftoolspath=args.bcftools, bcffile=sampleinfor[name]['bcffile'], sample=name) bcfrunerlist.append(bcfconsensusruner) # consensusprobe = bcftools.probestrtoconsensus(bcfconsensusruner) # # print(probestr, consensusprobe, sep='\t') reslist = list() for res in bcfpool.imap_unordered(bcftools.probestrtoconsensus, bcfrunerlist): # print(res['probestr'], name, res['consensusprobe'], sep='\t', file=sampleinfor[name]['cnsprobeio']) if len(res['consensusprobe']) != args.length: sampleinfor[name]['indelNprobelist'].append(res) elif 'N' in res['consensusprobe']: continue else: 
consensusprobelist.append(res['consensusprobe']) # consensusprobelist.append(res) reslist.append(res) bcfpool.close() consensusprobekmerscore = jellyfish.jfquerylist( jfkmerfile=sampleinfor[name]['jffile'], jfpath=args.jellyfish, seqlist=consensusprobelist) kmerscoredict = dict() kmerscorelist = list() for score in consensusprobekmerscore: # print(score, file=sampleinfor[name]['kmerscoreio']) (subseq, kmerscore) = score.split(',') if 'N' not in subseq: kmerscoredict[subseq] = int(kmerscore) kmerscorelist.append(int(kmerscore)) maxkmer = pd.Series(kmerscorelist).quantile(0.9) minkmer = args.minkmer for consensusprobe in reslist: probestr = consensusprobe['probestr'] consensusprobeseq = consensusprobe['consensusprobe'] if consensusprobeseq in kmerscoredict: if kmerscoredict[consensusprobeseq] <= maxkmer: if kmerscoredict[consensusprobeseq] >= minkmer: print(probestr, consensusprobeseq, kmerscoredict[consensusprobeseq], sep='\t', file=sampleinfor[name]['cnsprobeio']) for name in sampleinfor: sampleinfor[name]['cnsprobeio'].close() # sampleinfor[name]['kmerscoreio'].close() # print(sampleinfor) for res in sampleinfor[name]['indelNprobelist']: print(res['probestr'], name, res['consensusprobe'], sep='\t', file=sampleinfor[name]['indelNprobeio']) sampleinfor[name]['indelNprobeio'].close() probdict = dict() for name in sampleinfor: with open(sampleinfor[name]['cnsprobe']) as inio: for infor in inio: infor = infor.rstrip() inforlist = infor.split('\t') orgprb = inforlist[3] if orgprb in probdict: probdict[orgprb][name] = infor else: probdict[orgprb] = dict() probdict[orgprb][name] = infor print('chrom', 'start', 'end', 'refseq', ','.join(sampleinfor), 'consensusprobe', 'consensusscore', 'consensussite', 'consensusdiff', sep=',', file=cnsio) for orgprb in probdict: sharecount = len(probdict[orgprb]) values_view = probdict[orgprb].values() value_iterator = iter(values_view) first_value = next(value_iterator).split('\t') outinfo = first_value[0:3] if len(sampleinfor) == sharecount: # print(sampleinfor, sharecount) # print(orgprb, len(probdict[orgprb])) probelist = list() namelist = list() namelist.append('refseq') probelist.append(orgprb) for name in sampleinfor: infor = probdict[orgprb][name].split('\t') speciesprobe = infor[-2] namelist.append(name) if len(speciesprobe) == len(orgprb): probelist.append(speciesprobe) if len(namelist) == len(probelist): # print(namelist, probelist) res = probecompare.getconsensusprobe(probelist) outinfo.extend(probelist) print(','.join(outinfo), res['consensusprobe'], res['consensusscore'], res['consensussite'], res['consensusdiff'], sep=',', file=cnsio) cnsio.close() print("finished")
def __init__(self, intervals_file, fasta_file, dnase_file, cell_line=None, RNAseq_PC_file=None, mappability_file=None, GENCODE_dir=None, use_linecache=True): # intervals if use_linecache: linecache.clearcache() BT = BedToolLinecache else: BT = BedTool self.bt = BT(intervals_file) # Fasta self.fasta_file = fasta_file self.fasta_extractor = None # initialize later # DNase self.dnase_file = dnase_file self.dnase_extractor = None # mappability if mappability_file is None: # download the mappability file if not existing common_dl_dir = os.path.join(this_dir, "../../template/downloaded/dataloader_files") makedir_exist_ok(common_dl_dir) rf = RemoteFile(url="http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeMapability/wgEncodeDukeMapabilityUniqueness35bp.bigWig", md5="1d15ddafe2c8df51cf08495db96679e7") mappability_file = os.path.join(common_dl_dir, "wgEncodeDukeMapabilityUniqueness35bp.bigWig") if not os.path.exists(mappability_file) or not rf.validate(mappability_file): # download the path rf.get_file(mappability_file) self.mappability_file = mappability_file self.mappability_extractor = None # Gencode features if GENCODE_dir is None: gp = os.path.join(this_dir, "../../template/downloaded/dataloader_files/gencode_features/") else: gp = GENCODE_dir download_gencode_dir(gp) # download files self.gencode_beds = [ ("cpg", BedTool(gp + '/cpgisland.bed.gz')), ("cds", BedTool(gp + '/wgEncodeGencodeBasicV19.cds.merged.bed.gz')), ("intron", BedTool(gp + '/wgEncodeGencodeBasicV19.intron.merged.bed.gz')), ("promoter", BedTool(gp + '/wgEncodeGencodeBasicV19.promoter.merged.bed.gz')), ("utr5", BedTool(gp + '/wgEncodeGencodeBasicV19.utr5.merged.bed.gz')), ("utr3", BedTool(gp + '/wgEncodeGencodeBasicV19.utr3.merged.bed.gz')), ] # Overlap beds - could be done incrementally print("Overlapping all the bed-files") # The BT() and .fn are there in order to leverage BedToolLinecache self.overlap_beds = [(b, BT(self.bt.intersect(v, wa=True, c=True).fn)) for b, v in self.gencode_beds] print("Assesing the file") assert len(self.overlap_beds[1][1]) == len(self.bt) # Get the metadata features if cell_line is None: if RNAseq_PC_file is None: raise ValueError("RNAseq_PC_file has to be specified when cell_line=None") assert os.path.exists(RNAseq_PC_file) else: # Using the pre-defined cell-line output_dir = os.path.join(this_dir, "../../template/downloaded/dataloader_files/RNAseq_features/") makedir_exist_ok(output_dir) RNAseq_PC_file = os.path.join(output_dir, cell_line, "meta.txt") url_template = ('https://s3.eu-central-1.amazonaws.com/kipoi-models/dataloader_files/' 'FactorNet/dataloader_files/RNAseq_features/{}/meta.txt') # rf = RemoteFile(url=url_template.format(cell_line)) if not os.path.exists(RNAseq_PC_file): # or not rf.validate(mappability_file): # download the path download_url(url_template.format(cell_line), os.path.join(output_dir, cell_line), "meta.txt") # rf.get_file(RNAseq_PC_file) self.meta_feat = pd.read_csv(RNAseq_PC_file, sep="\t", header=None)[0].values
def samToTab(): """ In order to get the result of alignment usable for further use, we create a new tabbed file in the same format as the inputted tabbed file. This file is the final output of this program. First of all, we extract the start and stop position and remove the flank region to get the initial annotation length. Then we generate the output file with these new position and keep all the information of the original tabbed file. The name of output file is by defaut "[fasta2_name]_out.[ext]" or the name that user specify with argument --output (but still with [ext] as extension). It return the name of the file created. """ tabou = "" if (ext == "gff3" and args.typeA != None) or change: args.tabinput = tabO with open(args.tabinput, "r") as tabi, open(alnN, "r") as sam: for i in tabi: # tabi = tabbed file after all modification line = i.split("\t") if line[0][0] == "#": tabou += i continue tabou += "# File generated the " + datetime.datetime.now( ).strftime( "%d %b %Y" ) + " with following command line : \n" + "# " + " ".join( sys.argv) + "\n" break for f in sam: # samf = alignment file inside Bedtools object isMd = True samf = f.split("\t") if samf[0][0] == "@": continue if int(samf[1]) != 0: # Ignoring complementary match (flag 2048) continue leng = 0 res = re.findall("\d+\w", samf[5]) for i in res: if i[-1] in ["M", "=", "X", "I", "S"]: leng += int(i[:-1]) if leng > args.flank: leng = leng - args.flank else: leng = 0 countM = mdParser( f) # countM is the number of perfect match for a alignment if args.mismatch != None: if not args.percentage: if countM >= leng - args.mismatch: isMd = True else: isMd = False else: if countM >= leng - ((args.mismatch * leng) / 100): isMd = True else: isMd = False tab = samf[0].replace("\\s", " ").split("__") if ext == "gff3" and isMd: tab[3] = int(samf[3]) + args.flank tab[4] = int(tab[3]) + leng - args.flank tabou += "\s".join(map(str, tab)) + "\n" elif ext == "bed" and isMd: tab[1] = int(samf[3]) + args.flank tab[2] = int(tab[1]) + leng - args.flank tabou += "\s".join(map(str, tab)) + "\n" elif ext == "vcf" and isMd: #samf[5]==str(len(tab[3])+(args.flank*2))+"M" : tab[1] = int(samf[3]) + args.flank tabou += "\s".join(map(str, tab)) + "\n" #if f[11][-1]!="0" and f[5]=="101M": # show ID of sequence which contain a missmatch # print(f[0].split(":")[1]+"\t"+f[12]) #if ext == "vcf" and samf[5]=="101M" and samf[5]==samf[12].split(":")[-1]+"M" and okw : # perfect match only for snp #if ext == "vcf" and samf[5]==str(len(samf[0].split("__")[3])+(args.flank*2))+"M" and okw : if args.verbose != 0: print(" ----- Creating file '" + args.out + "'. ----- \n") BedTool(tabou, from_string=True, deli="\s").saveas(args.out) return (args.out)
os.makedirs(model_dir) if not os.path.exists(log_dir): os.makedirs(log_dir) if not os.path.exists(srv_dir): os.makedirs(srv_dir) # Train/val/test intervals DATA_DIR = '/srv/scratch/jesikmin' train_dir, val_dir, test_dir = os.path.join(DATA_DIR, 'train_interval'),\ os.path.join(DATA_DIR, 'val_interval'),\ os.path.join(DATA_DIR, 'test_interval') print train_dir, val_dir, test_dir # Get train/val/test intervals train_intervals = list(BedTool(train_dir)) val_intervals = list(BedTool(val_dir)) test_intervals = list(BedTool(test_dir)) print '# of Train Intervals: {}'.format(len(train_intervals)) print '# of Val Intervals: {}'.format(len(val_intervals)) print '# of Test Intervals: {}'.format(len(test_intervals)) # Get input/output data directories data = Data_Directories() print data.intervals.keys() print data.input_atac[day].keys() print data.output_histone[day].keys() # Extract input candidates # Create an ArrayExtractor for ATAC-seq of a given day and specified fragment length input_candidates = ArrayExtractor(data.input_atac[day][frag])
def isComplete(samtotabOut): """ This function call the function getPosCds(tab) with both inputted tabbed file and newly generated tabbed file and check if CDS in the newly generated file are in the same position within the mRNA (or gene). It take the name of the tabbed file newly generated in argument. """ if ext == "gff3": dicoPos1 = getPosCds(args.tabinput) dicoPos2 = getPosCds(samtotabOut) outTab = samtotabOut.split("/")[-1] geneInt = [] #lastG=0 geneOk = 0 ok = 0 countG = 0 selectable = False filtered = "# File generated the " + datetime.datetime.now().strftime( "%d %b %Y" ) + " with following command line : \n" + "# " + " ".join( sys.argv) + "\n" for key1 in dicoPos1.keys(): for key2 in dicoPos2.keys(): if key2[0] == key1[0]: if len(dicoPos1[key1]) == len(dicoPos2[key2]): geneInt.append(key2[1]) # for v in range (0,len(dicoPos1[key1])) : # print(dicoPos1[key1][v]) # if dicoPos1[key1][v] == dicoPos2[key2][v] : # TODO : c'est de la merde. # geneOk+=1 # print(geneOk) # if geneOk >= len(dicoPos1[key1]) : # here we can add/rm condition to accept or not the mRNA/gene # # add the mRNA/gene number to the list of "acceptable mRNA/gene to select" # geneOk = 0 # else : # geneOk = 0 if "gene" in typeAclean: typeC = "gene" elif "mrna" in typeAclean: typeC = "mrna" with open(samtotabOut, "r") as tabou: for line in tabou: if line[0] == "#": continue lineS = line.strip().split("\t") if lineS[2].lower() == typeC: # TODO : unreadable resTag = re.search("ID=(\w+)", lineS[-1]) if resTag: geneId = resTag.group(1) if lineS[2] == "CDS": resTagCds = re.search("Parent=(\w+)", lineS[-1]) if resTagCds: cdsId = resTagCds.group(1) if lineS[2].lower() == typeC: countG += 1 if countG in geneInt: selectable = True else: selectable = False if lineS[2] == "CDS" and geneId != cdsId: selectable = False if selectable: filtered += ("\s".join(lineS)) + "\n" countG = 0 if args.verbose != 0: print(" ----- Generating filtered GFF file '" + (args.directory + "/filtered_" + outTab) + "'. -----\n") BedTool(filtered, from_string=True, deli="\s").saveas(args.directory + "/filtered_" + outTab) return
    detected = [(int(x.name), float(x.attrs['topcoverage'])) for x in BedTool(path)]
    detected.sort(key=lambda x: x[0])
    recovery = find_closest(detected, original)
    true_total = len(original)
    discovered_total = len(detected)
    true_positive = len([x for x in recovery if x[2] <= maxd])
    print(true_positive)
    false_positive = discovered_total - true_positive
    return true_positive / true_total, 1 - false_positive / discovered_total  # sensitivity, specificity


original = [(int(x.name), float(x.score)) for x in BedTool(args.original)]
original.sort(key=lambda x: x[0])
#process_detected(args.detected, original, args.maxd)

name2stat = []
for path in [x for x in get_only_files(args.detected) if 'annotated' in x]:
    name = get_name(path, args.mode)
    print(name)
    sens, spec = process_detected(path, original, args.maxd)
    name2stat.append((name, sens * 100, spec * 100))
name2stat.sort(key=lambda x: int(x[0]))

data = [x[1] for x in name2stat], [x[2] for x in name2stat]
labels = [x[0] for x in name2stat]
fontsize = 24
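# The return expression above reports sensitivity over the reference set and a
# precision-style specificity over the detected set. A toy restatement with explicit
# names (the helper and the numbers are illustrative only):
def sensitivity_specificity(true_positive, true_total, discovered_total):
    false_positive = discovered_total - true_positive
    sensitivity = true_positive / float(true_total)              # fraction of reference peaks recovered
    specificity = 1 - false_positive / float(discovered_total)   # fraction of detections that are real
    return sensitivity, specificity

print(sensitivity_specificity(80, 100, 90))  # -> (0.8, 0.888...)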
def load_data(n_row=None, cleaned=True):
    # https://lncipedia.org/download
    data_dict = {
        'id': [],
        'name': [],
        'length': [],
        'ratio_g': [],
        'ratio_t': [],
        'ratio_c': [],
        'ratio_a': [],
        'number_exons': [],
        'chromosom': [],
        'start_pos': [],
        'end_pos': [],
        'length_from_pos': [],
        'number_introns': [],
        'mean_exon_length': [],
        'mfe': []
    }
    fasta_data = SeqIO.parse("data/lncipedia_5_2.fasta", "fasta")
    bed_raw_data = BedTool('data/lncipedia.bed')
    examiner = GFFExaminer()
    in_handle = open('data/lncipedia_5_2_hg38.gff')

    annotation_data = {}
    for i, rec in enumerate(GFF.parse(in_handle)):  # chromosome, e.g. chr1
        for feature in rec.features:  # lncRNA, e.g. LNC1725
            if not feature.type == 'lnc_RNA':
                break
            exon_locations = []
            lnc_id = feature.id
            for sub_feature in feature.sub_features:
                if sub_feature.type == 'exon':
                    exon = (sub_feature.location.start, sub_feature.location.end)
                    exon_locations.append(exon)
            annotation_data[lnc_id] = exon_locations
    in_handle.close()

    bed_data = {}
    for record in bed_raw_data:
        bed_data[record.name] = {
            'number_exons': int(record.fields[9]),
            'chromosom': record.fields[0],
            'start_pos': int(record.fields[1]),  # in the BED file this is -1 compared to the GFF and the online records
            'end_pos': int(record.fields[2])
        }

    for i, record in enumerate(fasta_data):
        length = len(record.seq)
        data_dict['length'].append(length)
        data_dict['id'].append(record.id)
        data_dict['name'].append(record.name)
        if record.name in bed_data:
            for bed_feature in ['number_exons', 'chromosom', 'start_pos', 'end_pos']:
                data_dict[bed_feature].append(bed_data[record.name][bed_feature])
            end_pos = bed_data[record.name]['end_pos']
            start_pos = bed_data[record.name]['start_pos']
            exon_locations = annotation_data[record.id]
            data_dict['length_from_pos'].append(end_pos - start_pos)
            data_dict['number_introns'].append(
                calc_number_introns(start_pos, end_pos, exon_locations))
            data_dict['mean_exon_length'].append(
                calc_mean_exon_length(exon_locations))
        else:
            for feature in ['number_exons', 'chromosom', 'start_pos', 'end_pos',
                            'length_from_pos', 'number_introns', 'mean_exon_length']:
                data_dict[feature].append(-1)
        count_g = 0
        count_a = 0
        count_t = 0
        count_c = 0
        for c in record.seq:
            if c == 'G':
                count_g += 1
            elif c == 'T':
                count_t += 1
            elif c == 'C':
                count_c += 1
            elif c == 'A':
                count_a += 1
        data_dict['ratio_g'].append(count_g / length * 100)
        data_dict['ratio_t'].append(count_t / length * 100)
        data_dict['ratio_c'].append(count_c / length * 100)
        data_dict['ratio_a'].append(count_a / length * 100)
        if n_row:
            if i == n_row:
                break

    list_of_lmfes = pickle.load(open("data/list_of_mfes2.pickle", "rb"))
    data_dict['mfe'].extend(list_of_lmfes)
    df = pd.DataFrame.from_dict(data_dict)
    # run only for rows where we have valid chromosomes
    df['chromosom'].loc[df['chromosom'] != -1] = df['chromosom'].loc[
        df['chromosom'] != -1].apply(lambda x: x.split('chr')[1])
    if cleaned:
        df = df[(df['chromosom'] != 'X') & (df['chromosom'] != 'Y')]
        df['chromosom'] = pd.to_numeric(df['chromosom'])
        # Also remove rows with invalid mfe and chromosomes
        df = df.loc[df['chromosom'] != -1].loc[df['mfe'] != -1].iloc[:, 2:].apply(
            lambda x: (x - x.mean()) / x.std(), axis=0)
    return df
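# load_data() accumulates per-base counts in an explicit loop. An equivalent, shorter way
# to get the same percentages with collections.Counter (a sketch only, assuming plain
# A/C/G/T sequences; base_ratios is an illustrative helper name):
from collections import Counter

def base_ratios(seq):
    counts = Counter(seq.upper())
    total = float(len(seq))
    return {base: counts.get(base, 0) / total * 100 for base in "ACGT"}

print(base_ratios("ACGTACGG"))  # -> {'A': 25.0, 'C': 25.0, 'G': 37.5, 'T': 12.5}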
    # bedtools map -a 3chase.bed -b 3chase.bedGraph -c 4 -o mean
    bed_annotated_int_comp = bed_annotated_int_bt.complement(g=chr_file)
    bed_annotated_int_bt.map(bedGraph_var_bt, c=4, o=statistic, null=0).saveas(
        'values_' + id_worm + '_' + var_traj + '.txt')
    bed_annotated_int_comp.map(bedGraph_var_bt, c=4, o=statistic, null=0).saveas(
        'values_' + id_worm + '_' + var_traj + '.comp.txt')
    bedGraph_var_bt.intersect(bed_annotated_int_bt).saveas(
        'values_' + id_worm + '_' + var_traj + '.bedGraph')
    bedGraph_var_bt.intersect(bed_annotated_int_comp).saveas(
        'values_' + id_worm + '_' + var_traj + '.comp.bedGraph')
## if no annotations are present, return the bedGraph for plotting and an empty bedGraph for the annotated regions
else:
    bedGraph_var_bt.saveas('values_' + id_worm + '_' + var_traj + '.comp.bedGraph')
    bed_no_intervals = BedTool(list_no_intervals).saveas(
        'values_' + id_worm + '_' + var_traj + '.bedGraph')
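# A self-contained sketch of the map/complement pattern used above, on toy data rather
# than the worm bedGraphs: mean signal over annotated intervals and over everything else.
import tempfile
from pybedtools import BedTool

genome = tempfile.NamedTemporaryFile(mode='w', suffix='.genome', delete=False)
genome.write("chr1\t1000\n")   # chromosome sizes, required by complement()
genome.close()

annotated = BedTool("chr1\t100\t200\nchr1\t400\t500\n", from_string=True)
signal = BedTool("chr1\t0\t300\t1.0\nchr1\t300\t600\t3.0\nchr1\t600\t1000\t5.0\n",
                 from_string=True)

# Mean of bedGraph column 4 over each annotated interval (0 where nothing overlaps)...
print(annotated.map(signal, c=4, o='mean', null=0))
# ...and the same statistic over the unannotated complement.
print(annotated.complement(g=genome.name).map(signal, c=4, o='mean', null=0))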
# In[ ]:

from pybedtools import genome_registry
from pygtftk.gtf_interface import GTF

# In[ ]:

from pybedtools import BedTool

grch38gff = '/home/drew/Desktop/IPyNB-Variant-Analysis/data/cuffcmp.combined.gtf'
snps = BedTool('snps.bed.gz')   # [1]
genes = BedTool(grch38gff)      # [1]

# In[ ]:

get_ipython().run_cell_magic('bash', '', 'ln -P /home/drew/Desktop/IPyNB-Variant-Analysis/data\nln -P /media/drew/easystore/ReferenceGenomes/GCA_000001405.15_GRCh38_no_alt_analysis_set/\nln -P /media/drew/easystore/ReferenceGenomes/GRCh38/')

# In[ ]:

intergenic_snps = snps.subtract(genes)                        # [2]
nearby = genes.closest(intergenic_snps, d=True, stream=True)  # [2, 3]

for gene in nearby:  # [4]
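# The subtract/closest idiom above, shown end to end on toy data (coordinates and names
# are illustrative, not the GRCh38 inputs used in the notebook cells above).
from pybedtools import BedTool

genes = BedTool("chr1\t100\t200\tgeneA\t0\t+\nchr1\t500\t600\tgeneB\t0\t-\n", from_string=True)
snps = BedTool("chr1\t150\t151\trs1\nchr1\t300\t301\trs2\n", from_string=True)

# Drop SNPs that fall inside genes, then report each gene's nearest intergenic SNP
# together with its distance in bp (d=True appends the distance column).
intergenic = snps.subtract(genes)
for gene in genes.closest(intergenic, d=True):
    print(gene.fields[3], gene.fields[-1])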
def expand(self, svs, nbp):
    return BedTool(svs).slop(b=nbp, g='config/' + self._gt + '.genome')
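# expand() above is a thin wrapper around slop(). A runnable sketch with an inline
# chromosome-sizes file (the coordinates and the 1000 bp chromosome are made up):
import tempfile
from pybedtools import BedTool

genome = tempfile.NamedTemporaryFile(mode='w', suffix='.genome', delete=False)
genome.write("chr1\t1000\n")
genome.close()

svs = BedTool("chr1\t100\t200\tDEL1\nchr1\t950\t980\tDEL2\n", from_string=True)
# Pad every interval by 60 bp on each side; slop clips DEL2 at the 1000 bp chromosome end.
print(svs.slop(b=60, g=genome.name))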
window_size = 5001
process_all = False
sample_num = 1000

# In[3]:

# retrieve data
data = Data_Directories()
print data.intervals.keys()
print data.input_atac['day0'].keys()
print data.output_histone['day0'].keys()

# In[4]:

# get intervals for day0 data
day0_intervals = list(BedTool(data.intervals['day0']))
print '# of Intervals Extracted for day0: {}'.format(len(day0_intervals))

# In[5]:

# create an ArrayExtractor for ATAC-seq for day0 with 140 base pairs
bw_140bp_day0 = ArrayExtractor(data.input_atac['day0']['140'])
print 'Finished extracting bigwig for day0, 140bp'

# In[6]:

# create a BigwigExtractor for histone mark 'H3K27ac' for day0
bw_histone_mark_day0 = BigwigExtractor(data.output_histone['day0']['H3K27ac'])
print 'Finished extracting bigwig for day0, H3K27ac'

# In[7]:
def load_beddata(genome, bed_file, use_meta, use_gencode, input_dir, is_sorted, chrom=None):
    bed = BedTool(bed_file)
    if not is_sorted:
        print('Sorting BED file')
        bed = bed.sort()
        is_sorted = True
    blacklist = make_blacklist()

    print('Determining which windows are valid')
    bed_intersect_blacklist_count = bed.intersect(blacklist, wa=True, c=True, sorted=is_sorted)
    if chrom:
        nonblacklist_bools = np.array([i.chrom == chrom and i.count == 0
                                       for i in bed_intersect_blacklist_count])
    else:
        nonblacklist_bools = np.array([i.count == 0 for i in bed_intersect_blacklist_count])

    print('Filtering away blacklisted windows')
    bed_filtered = bed.intersect(blacklist, wa=True, v=True, sorted=is_sorted)
    if chrom:
        print('Filtering away windows not in chromosome:', chrom)
        bed_filtered = subset_chroms([chrom], bed_filtered)

    print('Generating test data iterator')
    bigwig_names, bigwig_files_list = load_bigwigs([input_dir])
    bigwig_files = bigwig_files_list[0]
    if use_meta:
        meta_names, meta_list = load_meta([input_dir])
        meta = meta_list[0]
    else:
        meta = []
        meta_names = None

    shift = 0
    if use_gencode:
        cpg_bed = BedTool('resources/cpgisland.bed.gz')
        cds_bed = BedTool('resources/wgEncodeGencodeBasicV19.cds.merged.bed.gz')
        intron_bed = BedTool('resources/wgEncodeGencodeBasicV19.intron.merged.bed.gz')
        promoter_bed = BedTool('resources/wgEncodeGencodeBasicV19.promoter.merged.bed.gz')
        utr5_bed = BedTool('resources/wgEncodeGencodeBasicV19.utr5.merged.bed.gz')
        utr3_bed = BedTool('resources/wgEncodeGencodeBasicV19.utr3.merged.bed.gz')

        peaks_cpg_bedgraph = bed_filtered.intersect(cpg_bed, wa=True, c=True)
        peaks_cds_bedgraph = bed_filtered.intersect(cds_bed, wa=True, c=True)
        peaks_intron_bedgraph = bed_filtered.intersect(intron_bed, wa=True, c=True)
        peaks_promoter_bedgraph = bed_filtered.intersect(promoter_bed, wa=True, c=True)
        peaks_utr5_bedgraph = bed_filtered.intersect(utr5_bed, wa=True, c=True)
        peaks_utr3_bedgraph = bed_filtered.intersect(utr3_bed, wa=True, c=True)

        data_bed = [(window.chrom, window.start, window.stop, 0, bigwig_files,
                     np.append(meta, np.array([cpg.count, cds.count, intron.count,
                                               promoter.count, utr5.count, utr3.count],
                                              dtype=bool)))
                    for window, cpg, cds, intron, promoter, utr5, utr3
                    in itertools.izip(bed_filtered, peaks_cpg_bedgraph, peaks_cds_bedgraph,
                                      peaks_intron_bedgraph, peaks_promoter_bedgraph,
                                      peaks_utr5_bedgraph, peaks_utr3_bedgraph)]
    else:
        data_bed = [(window.chrom, window.start, window.stop, shift, bigwig_files, meta)
                    for window in bed_filtered]

    from data_iter import DataIterator
    bigwig_rc_order = get_bigwig_rc_order(bigwig_names)
    datagen_bed = DataIterator(data_bed, genome, 100, L, bigwig_rc_order, shuffle=False)
    return bigwig_names, meta_names, datagen_bed, nonblacklist_bools
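# The blacklist handling above uses intersect twice: once with c=True to count overlaps
# per window, once with v=True to keep only clean windows. A toy sketch of both calls:
from pybedtools import BedTool

windows = BedTool("chr1\t0\t100\nchr1\t100\t200\nchr1\t200\t300\n", from_string=True)
blacklist = BedTool("chr1\t150\t160\n", from_string=True)

for w in windows.intersect(blacklist, wa=True, c=True):
    print(w.chrom, w.start, w.stop, w.count)   # number of blacklist hits per window
clean = windows.intersect(blacklist, wa=True, v=True)
print(len(clean))  # -> 2 windows survive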
def download_and_unify_datasets(cell_name, assay_type, assay_info_dict, target_cellinfo_dirs_path, number_of_votes_from_highquality_datasets=1, number_of_votes_from_lowquality_datasets=2, number_of_files_to_consider_from_highquality_datasets='all', number_of_files_to_consider_from_lowquality_datasets='all', dont_consider_low_quality_datasets_when_highquality_datasets_available=True, consider_peak_score_from_peak_file = True, peak_score_index=6): current_dir = os.getcwd() final_dataset_of_this_assay_cell = cell_name+"_"+assay_type+".bed4" final_datasets_of_this_assay_cell = [] if not os.path.exists(target_cellinfo_dirs_path+'/'+cell_name): os.mkdir(target_cellinfo_dirs_path+'/'+cell_name) if not os.path.exists(target_cellinfo_dirs_path+'/'+cell_name+'/'+assay_type): os.mkdir(target_cellinfo_dirs_path+'/'+cell_name+'/'+assay_type) os.chdir(target_cellinfo_dirs_path+'/'+cell_name+'/'+assay_type) print(target_cellinfo_dirs_path+'/'+cell_name+'/'+assay_type) if not os.path.exists(final_dataset_of_this_assay_cell): for factor in assay_info_dict.keys(): peak_score_from_peak_file_exists = True final_dataset = factor+".bed4" if os.path.exists(final_dataset): #if the final merged file of this factor was already available then no need to do any more operations final_datasets_of_this_assay_cell.append(final_dataset) continue list_of_high_quality_datasets_from_this_factor = [] list_of_low_quality_datasets_from_this_factor = [] print(assay_info_dict[factor]) if 'high' in assay_info_dict[factor]: if number_of_files_to_consider_from_highquality_datasets=='all': list_of_high_quality_datasets_from_this_factor=assay_info_dict[factor][assay_info_dict[factor].index("high")+1] else: for i in range(0, number_of_files_to_consider_from_highquality_datasets): if i < len(assay_info_dict[factor][assay_info_dict[factor].index("high")+1]): list_of_high_quality_datasets_from_this_factor.append(assay_info_dict[factor][assay_info_dict[factor].index("high")+1][i]) else: break if 'low' in assay_info_dict[factor]: if 'high' in assay_info_dict[factor] and dont_consider_low_quality_datasets_when_highquality_datasets_available: pass #in case no high quality dataset was available or it was specifically asked to include even with the availablility of high quality datasets then use the low quality datasets as well else: if number_of_files_to_consider_from_lowquality_datasets=='all': list_of_low_quality_datasets_from_this_factor=assay_info_dict[factor][assay_info_dict[factor].index("low")+1] else: for i in range(0, number_of_files_to_consider_from_lowquality_datasets): if i < len(assay_info_dict[factor][assay_info_dict[factor].index("low")+1]): list_of_low_quality_datasets_from_this_factor.append(assay_info_dict[factor][assay_info_dict[factor].index("low")+1][i]) else: break #process the datasets from the high quality list list_of_high_quality_peakfiles_from_this_factor = [] final_dataset_high_quality = factor+"_high" + ".bed" final_dataset_high_quality_name = open(final_dataset_high_quality, 'w') for dataset in list_of_high_quality_datasets_from_this_factor: dataset_name = "" if "ENCFF" in dataset: dataset_path = "https://www.encodeproject.org/files/"+dataset+"/@@download/"+dataset+".bed.gz" dataset_name = factor+"_"+dataset+".bed" if not os.path.exists(dataset_name): if not os.path.exists(dataset_name+".gz"): downloaded_obj = urlopen(dataset_path) print("downloading.... 
" + dataset_path) with open(os.path.basename(dataset_name+".gz"), 'wb') as local_file: local_file.write(downloaded_obj.read()) with gzip.open(dataset_name+".gz", 'rb') as dataset_name_zip, open(dataset_name, 'w') as dataset_name_unzipped: dataset_name_unzipped.write(dataset_name_zip.read()) #os.system("gunzip " + dataset_name+".gz") elif dataset.startswith("http://") or dataset.startswith("ftp://"): dataset_path = dataset dataset_name = factor+"_"+dataset.strip().split('/')[-1] dataset_name_unzipped = dataset_name if "." in dataset_name: if dataset_name.split('.')[-1]=="gz": dataset_name_unzipped = '.'.join(dataset_name.split('.')[0:-1]) if os.path.exists(dataset_name_unzipped):#this could be the gzip or the unzipped file dataset_name = '.'.join(dataset_name.split('.')[0:-1]) else: if not os.path.exists(dataset_name): downloaded_obj = urlopen(dataset_path) print("downloading.... " + dataset_path) with open(dataset_name, 'wb') as local_file: local_file.write(downloaded_obj.read()) if "." in dataset_name: if dataset_name.split('.')[-1]=="gz": with gzip.open(dataset_name, 'rb') as dataset_name_unzip_read, open(dataset_name_unzipped, 'wb') as dataset_name_unzipped_write: dataset_name_unzipped_write.write(dataset_name_unzip_read.read()) #os.system("gunzip " + dataset_name) dataset_name = '.'.join(dataset_name.split('.')[0:-1]) else:#path to a local file dataset_path = dataset dataset_name = factor+"_"+dataset.strip().split('/')[-1] if not os.path.exists(dataset_name): shutil.copy(dataset_name, "./") if "." in dataset_name: if dataset_name.split('.')[-1]=="gz": with gzip.open(dataset_name, 'rb') as dataset_name_unzip_read, open('.'.join(dataset_name.split('.')[0:-1]), 'wb') as dataset_name_unzip_write: dataset_name_unzip_write.write(dataset_name_unzip_read.read()) #os.system("gunzip " + dataset_name) dataset_name = '.'.join(dataset_name.split('.')[0:-1]) if dataset_name!="": dataset_sort_bedtools = pybedtools.BedTool(dataset_name) sorting_result = dataset_sort_bedtools.sort() list_of_high_quality_peakfiles_from_this_factor.append(sorting_result.fn) #Combine all high quality peak files into one peak_score_from_peak_file_exists = True if len(list_of_high_quality_peakfiles_from_this_factor)!=0: print(cell_name + ": high: " + assay_type + ":" + factor + ": " + ','.join(list_of_high_quality_peakfiles_from_this_factor)) #merge the high quality datasets if len(list_of_high_quality_peakfiles_from_this_factor)==1: if assay_type == "ChromatinStates": final_dataset_high_quality = list_of_high_quality_peakfiles_from_this_factor[0] else: merged_output = open(list_of_high_quality_peakfiles_from_this_factor[0], 'r').readlines() try:#in case the line didn't have col index or the value of col index was not convertable to float then it's an indication of no score availabbility peak_score = float(merged_output[0][peak_score_index]) except (IndexError, ValueError) as e: repr( e ) peak_score_from_peak_file_exists = False if peak_score_from_peak_file_exists and consider_peak_score_from_peak_file: for line in merged_output: final_dataset_high_quality_name.write('\t'.join(line.strip().split('\t')[0:3]) + '\t' + line.strip().split('\t')[peak_score_index] +"\n") else: for line in merged_output: final_dataset_high_quality_name.write('\t'.join(line.strip().split('\t')[0:3]) +"\n") final_dataset_high_quality_name.close() elif len(list_of_high_quality_peakfiles_from_this_factor)>1: if assay_type == "ChromatinStates": #write all the files into one with open(final_dataset_high_quality, 'w') as concatenated_file_write: for 
file_name in list_of_high_quality_peakfiles_from_this_factor: with open(file_name, 'r') as infile: concatenated_file_write.write(infile.read()) else: bedTools_obj = BedTool() merging_all = bedTools_obj.multi_intersect(i=list_of_high_quality_peakfiles_from_this_factor).filter(lambda x: int(x[3]) >= number_of_votes_from_highquality_datasets).sort().merge() #in case the line didn't have col index or the value of col index was not convertable to float then it's an indication of no score availabbility for file_i in list_of_high_quality_peakfiles_from_this_factor:#check if all the files have peak scores with open(file_i) as read_file_i: try: h = read_file_i.readline() peak_score = float(h.strip().split('\t')[peak_score_index]) except (IndexError, ValueError) as e: repr( e ) peak_score_from_peak_file_exists = False if peak_score_from_peak_file_exists and consider_peak_score_from_peak_file: #tmp_dir = './tmp_dir_to_remove_{}'.format(list_of_high_quality_peakfiles_from_this_factor[0].split('/')[-1]) #os.makedirs(tmp_dir) list_of_high_quality_peakfiles_from_this_factor_updated = [] for i_file in list_of_high_quality_peakfiles_from_this_factor: with open(i_file, 'r') as ifile, open(i_file + "_tmp", 'w') as ofile: for line in ifile.readlines(): ofile.write('\t'.join(line.strip().split('\t')[0:3]) + '\t{}\n'.format(line.strip().split('\t')[peak_score_index])) list_of_high_quality_peakfiles_from_this_factor_updated.append(i_file + "_tmp") peak_score_index_updated = 3 #os.system('cp ' + merging_all.fn + ' . ' ) merging_all = merging_all.intersect(list_of_high_quality_peakfiles_from_this_factor_updated, wo=True).sort().groupby(g=[1,2,3], c=peak_score_index_updated+1+4, o=['mean'])#4 cols from the mergeBed and one extra from the intersection then it follows the cols from each file #os.system('cp ' + merging_all.fn + ' . ' ) for l in list_of_high_quality_peakfiles_from_this_factor_updated: os.remove(l) merged_output = open(merging_all.fn, 'r').readlines() for line in merged_output: final_dataset_high_quality_name.write('\t'.join(line.strip().split('\t')[0::]) +"\n") final_dataset_high_quality_name.close() #handling peak files from low quality datasets list_of_low_quality_peakfiles_from_this_factor = [] final_dataset_low_quality = factor+"_low" + ".bed" final_dataset_low_quality_name = open(final_dataset_low_quality, 'w') for dataset in list_of_low_quality_datasets_from_this_factor: dataset_name = "" if "ENCFF" in dataset: dataset_path = "https://www.encodeproject.org/files/"+dataset+"/@@download/"+dataset+".bed.gz" dataset_name = factor+"_"+dataset+".bed" if not os.path.exists(dataset_name): if not os.path.exists(dataset_name+".gz"): downloaded_obj = urlopen(dataset_path) print("downloading.... " + dataset_path) with open(os.path.basename(dataset_name+".gz"), 'wb') as local_file: local_file.write(downloaded_obj.read()) with gzip.open(dataset_name+".gz", 'rb') as dataset_name_zip, open(dataset_name, 'w') as dataset_name_unzipped: dataset_name_unzipped.write(dataset_name_zip.read()) #os.system("gunzip " + dataset_name+".gz") elif dataset.startswith("http://") or dataset.startswith("ftp://"): dataset_path = dataset dataset_name = factor+"_"+dataset.strip().split('/')[-1] dataset_name_unzipped = dataset_name if "." 
in dataset_name: if dataset_name.split('.')[-1]=="gz": dataset_name_unzipped = '.'.join(dataset_name.split('.')[0:-1]) if os.path.exists(dataset_name_unzipped):#this could be the gzip or the unzipped file dataset_name = '.'.join(dataset_name.split('.')[0:-1]) else: if not os.path.exists(dataset_name): downloaded_obj = urlopen(dataset_path) print("downloading.... " + dataset_path) with open(dataset_name, 'wb') as local_file: local_file.write(downloaded_obj.read()) if "." in dataset_name: if dataset_name.split('.')[-1]=="gz": with gzip.open(dataset_name, 'rb') as dataset_name_unzip_read, open(dataset_name_unzipped, 'wb') as dataset_name_unzipped_write: dataset_name_unzipped_write.write(dataset_name_unzip_read.read()) #os.system("gunzip " + dataset_name) dataset_name = '.'.join(dataset_name.split('.')[0:-1]) else:#path to a local file dataset_path = dataset dataset_name = factor+"_"+dataset.strip().split('/')[-1] if not os.path.exists(dataset_name): shutil.copy(dataset, "./"+dataset_name) if "." in dataset_name: if dataset_name.split('.')[-1]=="gz": with gzip.open(dataset_name, 'rb') as dataset_name_unzip_read, open('.'.join(dataset_name.split('.')[0:-1]), 'wb') as dataset_name_unzip_write: dataset_name_unzip_write.write(dataset_name_unzip_read.read()) #os.system("gunzip " + dataset_name) dataset_name = '.'.join(dataset_name.split('.')[0:-1]) if dataset_name!="": dataset_sort_bedtools = pybedtools.BedTool(dataset_name) sorting_result = dataset_sort_bedtools.sort() list_of_low_quality_peakfiles_from_this_factor.append(sorting_result.fn) #Combine all low quality peak files into one peak_score_from_peak_file_exists = True if len(list_of_low_quality_peakfiles_from_this_factor)!=0: print(cell_name + ": low: " + assay_type + ":" + factor + ": " + ','.join(list_of_low_quality_peakfiles_from_this_factor)) #merge the low quality datasets if len(list_of_low_quality_peakfiles_from_this_factor)==1: if assay_type == "ChromatinStates": final_dataset_low_quality = list_of_low_quality_peakfiles_from_this_factor[0] else: merged_output = open(list_of_low_quality_peakfiles_from_this_factor[0], 'r').readlines() try:#in case the line didn't have col index or the value of col index was not convertable to float then it's an indication of no score availabbility peak_score = float(merged_output[0][peak_score_index]) except (IndexError, ValueError) as e: repr( e ) peak_score_from_peak_file_exists = False if peak_score_from_peak_file_exists and consider_peak_score_from_peak_file: for line in merged_output: final_dataset_low_quality_name.write('\t'.join(line.strip().split('\t')[0:3]) + '\t' + line.strip().split('\t')[peak_score_index] + "\n") else: for line in merged_output: final_dataset_low_quality_name.write('\t'.join(line.strip().split('\t')[0:3]) + "\n") final_dataset_low_quality_name.close() elif len(list_of_low_quality_peakfiles_from_this_factor)>1: if assay_type == "ChromatinStates": #write all the files into one with open(final_dataset_low_quality, 'w') as concatenated_file_write: for file_name in list_of_low_quality_peakfiles_from_this_factor: with open(file_name, 'r') as infile: concatenated_file_write.write(infile.read()) else: bedTools_obj = BedTool() merging_all = bedTools_obj.multi_intersect(i=list_of_low_quality_peakfiles_from_this_factor).filter(lambda x: int(x[3]) >= number_of_votes_from_lowquality_datasets).sort().merge() for file_i in list_of_low_quality_peakfiles_from_this_factor:#check if all the files have peak scores with open(file_i) as read_file_i: try: h = read_file_i.readline() peak_score = 
float(h.strip().split('\t')[peak_score_index]) except (IndexError, ValueError) as e: repr( e ) peak_score_from_peak_file_exists = False if peak_score_from_peak_file_exists and consider_peak_score_from_peak_file: list_of_low_quality_datasets_from_this_factor_updated = [] for i_file in list_of_low_quality_peakfiles_from_this_factor: with open(i_file, 'r') as ifile, open(i_file + "_tmp", 'w') as ofile: for line in ifile.readlines(): ofile.write('\t'.join(line.strip().split('\t')[0:3]) + '\t{}\n'.format(line.strip().split('\t')[peak_score_index])) list_of_low_quality_datasets_from_this_factor_updated.append(i_file + "_tmp") peak_score_index_updated = 3 merging_all = merging_all.intersect(list_of_low_quality_datasets_from_this_factor_updated, wo=True).sort().groupby(g=[1,2,3], c=peak_score_index_updated+1+4, o=['mean'])#4 cols from the mergeBed and one extra from the intersection then it follows the cols from each file for l in list_of_low_quality_datasets_from_this_factor_updated: os.remove(l) merged_output = open(merging_all.fn, 'r').readlines() for line in merged_output: final_dataset_low_quality_name.write('\t'.join(line.strip().split('\t')[0::]) +"\n") final_dataset_low_quality_name.close() #Combine results of low and high quality peak files and merge them with adding the factor name peak_score_from_peak_file_exists = True final_file = "" if os.stat(final_dataset_high_quality).st_size==0 and os.stat(final_dataset_low_quality).st_size==0: continue else: merge_final_lines = [] highlow_combined = "highlow_combined" os.system("cat " + final_dataset_high_quality + " " + final_dataset_low_quality + " > " + highlow_combined) if assay_type == "ChromatinStates":#because the chromatinstates are defined for all genome bins merging them would cause create 25 regions only since all the bins are starting consequentively final_file = highlow_combined else: with open(highlow_combined) as read_file_i: try: h = read_file_i.readline() peak_score = float(h.strip().split('\t')[3]) except (IndexError, ValueError) as e: repr( e ) peak_score_from_peak_file_exists = False highlow_combined_obj = BedTool(highlow_combined) merge_final = "" if peak_score_from_peak_file_exists and consider_peak_score_from_peak_file: merge_final = highlow_combined_obj.sort().merge(c=4, o='mean') else: merge_final = highlow_combined_obj.sort().merge() final_file = merge_final.fn with open(final_file, 'r') as merge_final_read: merge_final_lines = merge_final_read.readlines() with open(final_dataset, 'w') as final_dataset_writer: if assay_type == "ChromatinStates": for line in merge_final_lines: final_dataset_writer.write('\t'.join(line.strip().split('\t')[0:3]) + '\t' + cell_name+"#ChromHMM#"+line.strip().split('\t')[3].replace(" ", "-") + '\n') elif assay_type == "ChIP-seq": if peak_score_from_peak_file_exists and consider_peak_score_from_peak_file: for line in merge_final_lines: peak_score = "#"+str(line.strip().split('\t')[3]) final_dataset_writer.write('\t'.join(line.strip().split('\t')[0:3]) + '\t' + cell_name+"#TFBinding#"+factor.replace(" ", "-")+peak_score + '\n') else: for line in merge_final_lines: peak_score = "" final_dataset_writer.write('\t'.join(line.strip().split('\t')[0:3]) + '\t' + cell_name+"#TFBinding#"+factor.replace(" ", "-")+peak_score + '\n') else: if peak_score_from_peak_file_exists and consider_peak_score_from_peak_file: for line in merge_final_lines: peak_score = "#"+str(line.strip().split('\t')[3]) final_dataset_writer.write('\t'.join(line.strip().split('\t')[0:3]) + '\t' + cell_name+"#"+factor.replace(" ", 
"-")+peak_score + '\n') else: peak_score = "" for line in merge_final_lines: final_dataset_writer.write('\t'.join(line.strip().split('\t')[0:3]) + '\t' + cell_name+"#"+factor.replace(" ", "-")+peak_score + '\n') final_datasets_of_this_assay_cell.append(final_dataset) os.remove(highlow_combined) os.remove(factor+"_high" + ".bed") os.remove(factor+"_low" + ".bed") #combine peak files of all the factors into one with open(final_dataset_of_this_assay_cell, 'w') as final_dataset_of_this_cell_out: for peak_file in final_datasets_of_this_assay_cell: with open(peak_file, 'r') as infile: final_dataset_of_this_cell_out.write(infile.read()) os.chdir(current_dir) return final_dataset_of_this_assay_cell, final_datasets_of_this_assay_cell
def nonnegative_wrapper(a, bl_file):
    bl = BedTool(bl_file)
    a_slop = a.slop(g=genome_sizes_file, b=genome_window_size)
    return bl.cat(a_slop).fn
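# nonnegative_wrapper() pads the positive windows and concatenates them with the blacklist;
# cat() merges overlapping intervals by default, so the result is one combined exclusion set.
# A toy sketch (the genome size and coordinates are illustrative):
import tempfile
from pybedtools import BedTool

genome = tempfile.NamedTemporaryFile(mode='w', suffix='.genome', delete=False)
genome.write("chr1\t1000\n")
genome.close()

blacklist = BedTool("chr1\t0\t50\n", from_string=True)
positives = BedTool("chr1\t100\t200\n", from_string=True)

excluded = blacklist.cat(positives.slop(b=100, g=genome.name))
print(excluded)  # -> chr1 0 300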
    newvals = [x for x in np.arange(0, max(yvals), scale)]
    newpos = np.arange(0, step * len(newvals), step)
    #print(sum(ylabels))
    ax.set_yticks(newpos)
    ax.set_yticklabels(["%d" % (x * 100) for x in newvals])


def check(region, mincov):
    return len([x for x in region.attrs['topcoverage'].split(",")[:3]
                if float(x) > mincov]) > 1


annpeaks = [x for x in BedTool(args.path) if check(x, args.mincov)]

fontsize = 24
linewidth = 5

scores = [float(x.attrs['tss']) for x in annpeaks if x.attrs['tss'] != 'nan']
scores.sort()
selected_scores = [x for x in scores if x <= 300 and x >= -100]
#print(min(scores))

fig, axes = plt.subplots(ncols=2, figsize=(22, 7), frameon=False)
fig.tight_layout(rect=[0.05, 0.1, 1, 1])
fig.subplots_adjust(wspace=0.2)
for data, ax in zip([scores, selected_scores], axes):
    _, bins, _ = ax.hist(data, bins=20, density=True)
    ax.set_xlabel('TSS distance', fontsize=fontsize)
def intersect_count(chip_bed, windows_file):
    windows = BedTool(windows_file)
    chip_bedgraph = windows.intersect(chip_bed, wa=True, c=True,
                                      f=1.0 * (genome_window_size / 2 + 1) / genome_window_size,
                                      sorted=True)
    bed_counts = [i.count for i in chip_bedgraph]
    return bed_counts
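# intersect_count() only counts a peak if it covers at least about half of the window,
# via the f= fraction argument. A toy sketch of that behaviour (window/peak coordinates
# and the 0.5 threshold are illustrative):
from pybedtools import BedTool

windows = BedTool("chr1\t0\t200\nchr1\t200\t400\n", from_string=True)
peaks = BedTool("chr1\t0\t150\nchr1\t390\t400\n", from_string=True)

# c=True appends a per-window overlap count; f=0.5 requires the overlap to span at least
# half of the window (the A feature) before it is counted.
for w in windows.intersect(peaks, wa=True, c=True, f=0.5):
    print(w.chrom, w.start, w.stop, w.count)
# -> chr1 0 200 1   (150/200 of the window is covered)
# -> chr1 200 400 0 (only 10/200 is covered)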
def consolidate(nbedfile, obedfile, cbedfile):
    from pybedtools import BedTool

    nbedtool = BedTool(nbedfile)
    obedtool = BedTool(obedfile)

    ab = nbedtool.intersect(obedtool, s=True, u=True)
    ba = obedtool.intersect(nbedtool, s=True, u=True)

    cmd = "cat {0} {1} | sort -k1,1 -k2,2n".format(ab.fn, ba.fn)
    fp = popen(cmd)
    ovl = BedTool(fp.readlines())

    abmerge = ovl.merge(s=True, nms=True, scores="mean").sort()
    cmd = "cat {0}".format(abmerge.fn)
    fp = popen(cmd, debug=False)
    ovl = BedTool(fp.readlines())

    notovl = nbedtool.intersect(ovl.sort(), s=True, v=True)

    infile = "{0} {1}".format(notovl.fn, ovl.fn)
    tmpfile = "/tmp/reformat.{0}.bed".format(os.getpid())
    cmd = "sort -k1,1 -k2,2n"
    sh(cmd, infile=infile, outfile=tmpfile)

    fp = open(cbedfile, "w")
    bed = Bed(tmpfile)
    for b in bed:
        if ";" in b.accn:
            accns = set()
            for accn in b.accn.split(";"):
                accns.add(accn)
            b.accn = ";".join(accns)
        print(b, file=fp)
    fp.close()

    os.remove(tmpfile)
    sort([cbedfile, "-i"])
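# consolidate() relies on strand-aware reciprocal intersections: u=True keeps features
# with at least one same-strand overlap, v=True keeps those with none. A toy sketch:
from pybedtools import BedTool

new = BedTool("chr1\t100\t200\tnew1\t0\t+\nchr1\t500\t600\tnew2\t0\t-\n", from_string=True)
old = BedTool("chr1\t150\t250\told1\t0\t+\nchr1\t500\t600\told2\t0\t+\n", from_string=True)

print(new.intersect(old, s=True, u=True))  # new1: overlaps old1 on the same strand
print(new.intersect(old, s=True, v=True))  # new2: its only overlap (old2) is on the other strand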
        print_compiled(current_selection, size)
        stat_counts.append(len(current_selection))
        current_selection = [m]
    else:
        print_compiled(current_selection, size)
        stat_counts.append(len(current_selection))
    return stat_counts


########################################################################################################
### Execution Section

file_dict = gather_files(args.path, args.replicates, args.name)
for dname, files in file_dict.items():
    blist = [BedTool(x) for x in files]
    size = len(blist)
    res_total, stat_total_counts = find_shared_peaks(blist, args.maxd)
    with open(os.path.join(args.outdir, "%s.gff" % dname), 'w') as f:
        f.write("# %s\n" % ",".join([os.path.basename(x).split(".")[0] for x in files]))
        for compiled in res_total:
            f.write(print_compiled(compiled, size))
    sys.stderr.write("\n%s\n" % dname)
    sys.stderr.write(shared_peaks_stat_to_string(stat_total_counts, size))

#import argparse
#import os
if not args.debug:
    pass
else:
    print()
    print(("Running in debug mode. Only the first " + str(args.debug) + " entries will be used."))
    print()

print("Starting datamatrix assembly process")
Popen('mkdir ' + args.outfile + '.datamatrix', shell=True)

print()
print("Sorting input bed file.")
input_bed = BedTool(args.input_file).sort().saveas(args.outfile + '.datamatrix/input_list.bed')

if 'strand' in list(BedTool(input_bed[0:1]).saveas().to_dataframe().columns):
    print("Strand information found in input file. Running in stranded mode.")
    print()
    strd = True
else:
    print("Strand information NOT found in input file. Running in unstranded mode.")
    print()
    strd = False

## Load the genome file that matches the version of the GTF you are using. Pysam will be used to build an index of
## the FASTA file.
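# The stranded-mode check above inspects the columns that to_dataframe() assigns, which
# follow BED naming conventions, so a 'strand' column only appears for BED6+ input.
# A toy check of the same test on both a BED6 and a BED3 record:
from pybedtools import BedTool

bed6 = BedTool("chr1\t10\t20\tfeat1\t0\t+\n", from_string=True)
bed3 = BedTool("chr1\t10\t20\n", from_string=True)

for bt in (bed6, bed3):
    stranded = 'strand' in BedTool(bt[0:1]).saveas().to_dataframe().columns
    print(stranded)
# -> True for the BED6 input, False for the BED3 input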