def build_labels(regions_fname, idr_peaks_dir, relaxed_peaks_dir, output_dir,
                 overwrite_existing=False):
    """Pair IDR and relaxed peak files per (sample, factor) and build labels.

    Filenames are expected to encode sample and factor as dot-separated
    fields 1 and 2 (e.g. "ChIPseq.<field1>.<field2>....").  Every relaxed
    peak file must correspond to an already-indexed IDR peak file.

    Args:
        regions_fname: bed file of regions to label.
        idr_peaks_dir: directory containing conservative (IDR) peak files.
        relaxed_peaks_dir: directory containing relaxed peak files.
        output_dir: directory the worker writes label arrays into.
        overwrite_existing: forwarded to build_labels_for_sample_and_factor
            as its fifth argument (build_hidden_test_set_arrays passes True
            there).  BUGFIX: previously accepted but silently ignored.
    """
    matched_peaks = defaultdict(lambda: {'idr': None, 'relaxed': None})
    # index the IDR peaks; each (sample, factor) may appear at most once
    for fname in os.listdir(idr_peaks_dir):
        sample, factor = fname.split(".")[1:3]
        assert matched_peaks[(sample, factor)]['idr'] is None
        matched_peaks[(sample, factor)]['idr'] = os.path.join(
            idr_peaks_dir, fname)
    # relaxed peaks must match an IDR peak file seen above, once each
    for fname in os.listdir(relaxed_peaks_dir):
        sample, factor = fname.split(".")[1:3]
        assert matched_peaks[(sample, factor)]['idr'] is not None
        assert matched_peaks[(sample, factor)]['relaxed'] is None
        matched_peaks[(sample, factor)]['relaxed'] = os.path.join(
            relaxed_peaks_dir, fname)
    all_args = []
    for (sample, factor), fnames in matched_peaks.iteritems():
        # BUGFIX: pass overwrite_existing through to the worker
        all_args.append((regions_fname, fnames['idr'], fnames['relaxed'],
                         output_dir, overwrite_existing))
    run_in_parallel(16, build_labels_for_sample_and_factor, all_args)
    return
def estimate_bootstrapped_scores_from_final_submissions_dir(
        DB, path, score_callback, label_dir, nthreads):
    """Score every final submission under `path`, inserting results into `DB`.

    Submission filenames are expected to be dot-separated:
    <principalId>.<submissionId>.<_>.<factor>.<sample>.<ext>.<ext>
    For each (principal, factor, sample) only one submission is scored —
    the first one encountered in submissionId-descending order, i.e. the
    newest.

    Args:
        DB: sqlite database filename.
        path: directory of submission files.
        score_callback: scoring function forwarded to the workers.
        label_dir: label directory forwarded to the workers.
        nthreads: number of parallel scoring workers.
    """
    # make sure the results table exists before any worker tries to insert
    conn = sqlite3.connect(DB)
    c = conn.cursor()
    c.execute('''
        CREATE TABLE IF NOT EXISTS scores (
            factor text,
            sample text,
            principalId int,
            submission_date text,
            submission_fname text,
            bootstrap_index int,
            recall_at_10_fdr real,
            recall_at_50_fdr real,
            auPRC real,
            auROC real,
            rank int
        );''')
    c.close()
    conn.commit()
    conn.close()

    # sort by submissionId (field 1) descending so that, per
    # (principal, factor, sample), the first file seen is the newest
    fnames = sorted(
        os.listdir(path), key=lambda x: x.split(".")[1], reverse=True)

    submission_args = []
    done = []
    # BUGFIX: iterate the sorted list -- the original looped over
    # os.listdir(path) a second time, so the sort had no effect and the
    # de-duplication kept an arbitrary submission instead of the newest
    for fname in fnames:
        principal_id, submissionId, _, factor, sample, _, _ = \
            os.path.basename(fname).split(".")
        if (principal_id, factor, sample) in done:
            print("SKIPPING:", principal_id, factor, sample, fname)
            continue
        done.append((principal_id, factor, sample))
        full_fname = os.path.abspath(os.path.join(path, fname))
        submission_args.append([
            DB, factor, sample, principal_id, str(datetime.now()),
            full_fname, score_callback, label_dir
        ])
    run_in_parallel(nthreads, calc_and_insert_new_results, submission_args)
    return
def main():
    """Score every binding model against the peak sequences in parallel."""
    # NOTE(review): this file defines `main` several times; later definitions
    # shadow this one at import time.
    genome = pysam.FastaFile('hg19.genome.fa')
    models = load_binding_models_from_db()
    peaks = load_peaks(sys.argv[1])
    # NOTE(review): stop+1 widens the fetch by one base -- presumably the
    # peak coordinates are end-inclusive; confirm against load_peaks
    sequences = FixedLengthDNASequences(
        genome.fetch(contig, start, stop + 1)
        for contig, start, stop in peaks)
    with ThreadSafeFile("output.txt", "w") as ofp:
        run_in_parallel(
            24, score_model_worker,
            [(ofp, model, sequences, peaks) for model in models])
    return
def main():
    """Entry point: load models and peaks, then score each model in parallel."""
    genome = pysam.FastaFile('hg19.genome.fa')
    models = load_binding_models_from_db()
    peaks = load_peaks(sys.argv[1])

    def iter_peak_seqs():
        # NOTE(review): stop+1 -- peak stops appear to be end-inclusive;
        # confirm against load_peaks
        for contig, start, stop in peaks:
            yield genome.fetch(contig, start, stop + 1)

    seqs = FixedLengthDNASequences(iter_peak_seqs())
    with ThreadSafeFile("output.txt", "w") as ofp:
        all_args = []
        for model in models:
            all_args.append((ofp, model, seqs, peaks))
        run_in_parallel(24, score_model_worker, all_args)
    return
def main():
    """Group FOXA2/ATF3 label arrays by (factor, split) and build tsv files."""
    # NOTE(review): reads module-level `labels_dir` and `regions` globals
    grouped = defaultdict(list)
    for fname in os.listdir(labels_dir):
        if not fname.endswith(".npy"):
            continue
        sample, factor, split_name, _, _ = fname.split(".")
        # this run is restricted to the two factors of interest
        if factor not in ('FOXA2', 'ATF3'):
            continue
        ofname = "%s.%s.labels.tsv" % (factor, split_name)
        out_key = os.path.join(labels_dir, "..", ofname)
        grouped[out_key].append((sample, os.path.join(labels_dir, fname)))
    worker_args = [[out_key, entries, regions]
                   for out_key, entries in grouped.iteritems()]
    run_in_parallel(16, build_labels_tsv, worker_args)
    return
def build_labels_tsvs(regions, labels_dir, output_dir):
    """Build one labels tsv per (factor, split) from the .npy label arrays.

    Args:
        regions: regions argument forwarded to build_labels_tsv.
        labels_dir: directory containing per-sample .npy label files named
            "<sample>.<factor>.<split>.<_>.npy".
        output_dir: directory the tsv files are written into.
    """
    grouped_by_ofname = defaultdict(list)
    for fname in os.listdir(labels_dir):
        if fname.endswith(".npy"):
            sample, factor, split_name, _, _ = fname.split(".")
            ofname = "%s.%s.labels.tsv" % (factor, split_name)
            grouped_by_ofname[os.path.join(output_dir, ofname)].append(
                (sample, os.path.join(labels_dir, fname)))
    worker_args = []
    for item in grouped_by_ofname.iteritems():
        worker_args.append(list(item) + [regions])
    run_in_parallel(16, build_labels_tsv, worker_args)
    return
def build_hidden_test_set_arrays():
    """Build label arrays for every hidden-test-set (TF, cell-type) pair.

    The trailing True in each worker-argument list requests that existing
    arrays be overwritten.
    """
    regions_fname = "/mnt/data/TF_binding/DREAM_challenge/public_data/annotations/test_regions.blacklistfiltered.bed.gz"
    idr_peaks_dir = "/mnt/data/TF_binding/DREAM_challenge/hidden_test_set_chipseq_data/idr/"
    relaxed_peaks_dir = "/mnt/data/TF_binding/DREAM_challenge/hidden_test_set_chipseq_data/relaxed/"
    output_dir = "/mnt/data/TF_binding/DREAM_challenge/hidden_test_set_chipseq_data/arrays/"

    metadata = load_metadata()
    hidden_samples = set(
        (x.TF, x.CELL_TYPE) for x in metadata if x.HIDDEN_TEST_SET is True)

    worker_args = []
    for tf_name, cell_type in hidden_samples:
        # NOTE(review): field order here is <cell_type>.<tf_name>, whereas
        # copy_chipseq_data writes <factor>.<sample> -- verify which layout
        # the peak files on disk actually use
        idr_fname = os.path.join(
            idr_peaks_dir,
            "ChIPseq.{sample_name}.{tf_name}.conservative.train.narrowPeak.gz".format(
                sample_name=cell_type, tf_name=tf_name))
        relaxed_peaks_fname = os.path.join(
            relaxed_peaks_dir,
            "ChIPseq.{sample_name}.{tf_name}.relaxed.narrowPeak.gz".format(
                sample_name=cell_type, tf_name=tf_name))
        worker_args.append(
            [regions_fname, idr_fname, relaxed_peaks_fname, output_dir, True])
    run_in_parallel(16, build_labels_for_sample_and_factor, worker_args)
def copy_chipseq_data(sample_dirs,
                      idr_peaks_output_dir,
                      relaxed_peaks_output_dir,
                      fold_change_output_dir,
                      regions_bed_fname):
    """Copy region-filtered ChIP-seq peaks and fold-change bigwigs.

    Groups IDR optimal peaks by (factor, sample), asserts there is exactly
    one experiment per pair, then builds and runs shell commands that
    (1) intersect conservative and relaxed peaks with regions_bed_fname
    (via bedtools | pigz), and (2) mask fold-change bigwigs to the regions
    (via bwtool), writing into the three output directories.

    NOTE(review): this function shells out via os.system and mixes
    Python-2 print statements with the Python-3-only FileNotFoundError
    name -- see inline notes.
    """
    ## first group the IDR optimal peaks, and when there are alternates choose
    ## the experiment that has the largest number of peaks
    optimal_peaks = defaultdict(list)
    for sample_name, pk_fname in find_idr_peaks(
            sample_dirs,
            "/mnt/data/TF_binding/DREAM_challenge/all_data/chipseq/output/DREAM_challenge/{}/out/peak/idr/optimal_set/",
            ".IDR0.05.filt.narrowPeak.gz"):
        optimal_peaks[sample_and_factor_from_samplename(sample_name)].append(
            (sample_name, pk_fname))
    # the new sample_dirs has a single entry for each TF,sample_type combo
    sample_dirs = []
    for (factor, sample), sample_and_fnames in optimal_peaks.iteritems():
        if len(sample_and_fnames) == 1:
            sample_dirs.append(sample_and_fnames[0][0])
        elif len(sample_and_fnames) > 1:
            assert False, "There shouldn't be any samples with more than 1 replicate"
            # NOTE(review): everything below this assert is dead code (the
            # assert always raises); it looks like a disabled
            # pick-the-largest-peak-file fallback
            #print sample_and_fnames[0][0], len(sample_and_fnames)
            best_sample, most_num_lines = None, 0
            for sample_name, fname in sample_and_fnames:
                num_lines = find_num_peaks(fname)
                if num_lines > most_num_lines:
                    best_sample = sample_name
                    most_num_lines = num_lines
            print best_sample, most_num_lines
        else:
            assert False, "It shouldn't be possible to have zero fnames"
    ## now that we've refined the sample list, copy all of the peaks and wiggles
    ## into the correct directory
    # copy the IDR peaks
    cmds = []
    for sample_name, pk_fname in find_idr_peaks(
            sample_dirs,
            "/mnt/data/TF_binding/DREAM_challenge/all_data/chipseq/output/DREAM_challenge/{}/out/peak/idr/optimal_set/",
            ".IDR0.05.filt.narrowPeak.gz"):
        sample, factor = sample_and_factor_from_samplename(sample_name)
        # NOTE(review): output names are <factor>.<sample>, but
        # build_hidden_test_set_arrays expects <cell_type>.<tf_name> --
        # confirm which order downstream consumers rely on
        ofname = "ChIPseq.{factor}.{sample}.conservative.train.narrowPeak.gz".format(
            factor=factor, sample=sample)
        cmd = "bedtools intersect -wa -u -a {pk_fname} -b {regions_fname} | pigz > {ofname} ".format(
            pk_fname=pk_fname, regions_fname=regions_bed_fname,
            ofname=os.path.join(idr_peaks_output_dir, ofname))
        cmds.append([ cmd, ])
    # copy the relaxed peaks
    for sample_name, pk_fname in find_idr_peaks(
            sample_dirs,
            "/mnt/data/TF_binding/DREAM_challenge/all_data/chipseq/output/DREAM_challenge/{}/out/peak/idr/pooled_pseudo_reps/",
            "unthresholded-peaks.txt.gz"):
        sample, factor = sample_and_factor_from_samplename(sample_name)
        ofname = "ChIPseq.{factor}.{sample}.relaxed.narrowPeak.gz".format(
            factor=factor, sample=sample)
        cmd = "bedtools intersect -wa -u -a {pk_fname} -b {regions_fname} | pigz > {ofname} ".format(
            pk_fname=pk_fname, regions_fname=regions_bed_fname,
            ofname=os.path.join(relaxed_peaks_output_dir, ofname))
        cmds.append([ cmd, ])
    # run all of the peaks intersection cmds
    run_in_parallel(16, os.system, cmds)
    # copy the fold change wiggles
    print sample_dirs
    fold_change_bigwigs = list(
        find_idr_peaks(
            sample_dirs,
            "/mnt/data/TF_binding/DREAM_challenge/all_data/chipseq/output/DREAM_challenge/{}/out/signal/macs2/pooled_rep/",
            ".fc.signal.bw"))
    cmds = []
    for i, (sample_name, pk_fname) in enumerate(fold_change_bigwigs):
        print i, len(fold_change_bigwigs)
        try:
            sample, factor = sample_and_factor_from_samplename(sample_name)
            ofname = "ChIPseq.{factor}.{sample}.fc.signal.train.bw".format(
                factor=factor, sample=sample)
            # existence check: only queue a bwtool command when the output
            # file cannot be opened
            # NOTE(review): the handle returned by open() is never closed
            try:
                open(ofname)
            except IOError:
                cmd = "bwtool remove mask -inverse {} {} {}".format(
                    regions_bed_fname, pk_fname,
                    os.path.join(fold_change_output_dir, ofname))
                cmds.append([ cmd, ])
            else:
                pass
                ### the old copy command
                #os.system("cp -u {} {}".format(
                #    pk_fname, os.path.join(fold_change_output_dir, ofname)))
        # NOTE(review): FileNotFoundError does not exist in Python 2 -- if an
        # exception propagates here this clause itself raises NameError; also
        # `cmd` may be unbound in the handler if the failure happened before
        # the assignment above
        except FileNotFoundError:
            print "Can't run:", cmd
    run_in_parallel(16, os.system, cmds)