Example #1
def build_labels(regions_fname,
                 idr_peaks_dir,
                 relaxed_peaks_dir,
                 output_dir,
                 overwrite_existing=False):
    matched_peaks = defaultdict(lambda: {'idr': None, 'relaxed': None})
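    # index the conservative (IDR) peak files by (sample, factor), parsed from
    # names like ChIPseq.<sample>.<factor>.conservative.train.narrowPeak.gz,
    # then attach the matching relaxed peak file in the second loop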
    for fname in os.listdir(idr_peaks_dir):
        sample, factor = fname.split(".")[1:3]
        assert matched_peaks[(sample, factor)]['idr'] is None
        matched_peaks[(sample,
                       factor)]['idr'] = os.path.join(idr_peaks_dir, fname)

    for fname in os.listdir(relaxed_peaks_dir):
        sample, factor = fname.split(".")[1:3]
        assert matched_peaks[(sample, factor)]['idr'] is not None
        assert matched_peaks[(sample, factor)]['relaxed'] is None
        matched_peaks[(sample, factor)]['relaxed'] = os.path.join(
            relaxed_peaks_dir, fname)

    all_args = []
    for (sample, factor), fnames in matched_peaks.items():
        all_args.append(
            (regions_fname, fnames['idr'], fnames['relaxed'], output_dir))
    run_in_parallel(16, build_labels_for_sample_and_factor, all_args)
    return
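All of these examples dispatch their work through a project-local run_in_parallel(nthreads, fn, all_args) helper that is not shown on this page. A minimal sketch of what such a helper could look like, assuming it simply runs fn over each argument tuple on a pool of worker threads (a thread pool would also let workers share objects such as the ThreadSafeFile handle used further down; the real helper may differ):

from concurrent.futures import ThreadPoolExecutor

def run_in_parallel(nthreads, fn, all_args):
    # sketch of an assumed helper, not the original implementation:
    # run fn(*args) for every argument tuple on nthreads worker threads
    with ThreadPoolExecutor(max_workers=nthreads) as pool:
        futures = [pool.submit(fn, *args) for args in all_args]
        return [f.result() for f in futures]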
Example #2
def estimate_bootstrapped_scores_from_final_submissions_dir(
        DB, path, score_callback, label_dir, nthreads):
    conn = sqlite3.connect(DB)
    c = conn.cursor()
    c.execute('''
    CREATE TABLE IF NOT EXISTS scores  (
        factor text,
        sample text,

        principalId int,
        submission_date text,
        submission_fname text,
        
        bootstrap_index int,

        recall_at_10_fdr real, 
        recall_at_50_fdr real, 
        auPRC real, 
        auROC real,
        
        rank int
    );''')
    c.close()
    conn.commit()
    conn.close()
    
    submission_args = []
    # sort by submission id (the second dot-separated field), descending, so
    # that only the highest submission id for each (principal, factor, sample)
    # gets scored
    fnames = sorted(os.listdir(path), key=lambda x: x.split(".")[1], reverse=True)
    done = []
    for fname in fnames:
        # file names look like:
        # 3343330.7998005.F.NANOG.induced_pluripotent_stem_cell.tab.gz
        principal_id, submission_id, _, factor, sample, _, _ = \
            os.path.basename(fname).split(".")
        if (principal_id, factor, sample) in done:
            print("SKIPPING:", principal_id, factor, sample, fname)
            continue
        else:
            done.append((principal_id, factor, sample))

        full_fname = os.path.abspath(os.path.join(path, fname))
        submission_args.append([
            DB, 
            factor, sample, 
            principal_id, str(datetime.now()), full_fname,
            score_callback, label_dir
        ])

    run_in_parallel(nthreads, calc_and_insert_new_results, submission_args)
    return
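calc_and_insert_new_results is also not shown on this page; whatever it computes, its insert has to line up with the scores table created above. A hypothetical sketch of just that insert step (the helper name, the metrics dict, and the rank argument are all assumptions):

import sqlite3

def insert_score_row(DB, factor, sample, principal_id, submission_date,
                     submission_fname, bootstrap_index, metrics, rank=None):
    # hypothetical helper, not part of the original code; metrics is assumed
    # to be a dict holding the four score columns from the schema above
    conn = sqlite3.connect(DB)
    with conn:  # commits on success
        conn.execute(
            "INSERT INTO scores VALUES (?,?,?,?,?,?,?,?,?,?,?)",
            (factor, sample, principal_id, submission_date, submission_fname,
             bootstrap_index,
             metrics['recall_at_10_fdr'], metrics['recall_at_50_fdr'],
             metrics['auPRC'], metrics['auROC'], rank))
    conn.close()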
Example #3
def main():
    #print load_genome_metadata(1)
    genome = pysam.FastaFile('hg19.genome.fa')
    #models = load_selex_models_from_db()
    models = load_binding_models_from_db()
    peaks = load_peaks(sys.argv[1])
    seqs_iter = ( genome.fetch(contig, start, stop+1)
                  for contig, start, stop in peaks )
    seqs = FixedLengthDNASequences(seqs_iter)
    with ThreadSafeFile("output.txt", "w") as ofp:
        all_args = [(ofp, model, seqs, peaks) for model in models]
        run_in_parallel(24, score_model_worker, all_args)
    return
Example #4
def main():
    #print load_genome_metadata(1)
    genome = pysam.FastaFile('hg19.genome.fa')
    #models = load_selex_models_from_db()
    models = load_binding_models_from_db()
    peaks = load_peaks(sys.argv[1])
    seqs_iter = (genome.fetch(contig, start, stop + 1)
                 for contig, start, stop in peaks)
    seqs = FixedLengthDNASequences(seqs_iter)
    with ThreadSafeFile("output.txt", "w") as ofp:
        all_args = [(ofp, model, seqs, peaks) for model in models]
        run_in_parallel(24, score_model_worker, all_args)
    return
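ThreadSafeFile is another project helper that does not appear on this page; a minimal sketch, assuming it only needs to serialize writes from concurrent workers (the real class may do more):

import threading

class ThreadSafeFile:
    # sketch of an assumed helper: a file wrapper whose writes are guarded by
    # a lock so that output lines from different workers do not interleave
    def __init__(self, fname, mode):
        self._fp = open(fname, mode)
        self._lock = threading.Lock()

    def write(self, data):
        with self._lock:
            self._fp.write(data)
            self._fp.flush()

    def __enter__(self):
        return self

    def __exit__(self, *exc):
        self._fp.close()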
Example #5
def main():
    sample_grpd_labels = defaultdict(list)
    for fname in os.listdir(labels_dir):
        if not fname.endswith(".npy"): continue
        sample, factor, split_name, _, _ = fname.split(".")
        if factor != 'FOXA2' and factor != 'ATF3': continue
        ofname = "%s.%s.labels.tsv" % (factor, split_name)
        sample_grpd_labels[os.path.join(labels_dir, "..", ofname)].append(
            (sample, os.path.join(labels_dir, fname)))

    args = [list(x) + [regions] for x in sample_grpd_labels.items()]
    #print args
    run_in_parallel(16, build_labels_tsv, args)
    return
Example #6
def build_labels_tsvs(regions, labels_dir, output_dir):
    sample_grpd_labels = defaultdict(list)
    for fname in os.listdir(labels_dir):
        if not fname.endswith(".npy"): continue
        sample, factor, split_name, _, _ = fname.split(".")
        ofname = "%s.%s.labels.tsv" % (factor, split_name)
        sample_grpd_labels[os.path.join(output_dir, ofname)].append(
            (sample, os.path.join(labels_dir, fname)))

    args = [list(x) + [regions] for x in sample_grpd_labels.items()]
    run_in_parallel(16, build_labels_tsv, args)
    return
Example #7
def build_hidden_test_set_arrays():
    regions_fname = \
        "/mnt/data/TF_binding/DREAM_challenge/public_data/annotations/test_regions.blacklistfiltered.bed.gz"
    idr_peaks_dir = "/mnt/data/TF_binding/DREAM_challenge/hidden_test_set_chipseq_data/idr/"
    relaxed_peaks_dir = "/mnt/data/TF_binding/DREAM_challenge/hidden_test_set_chipseq_data/relaxed/"
    output_dir = "/mnt/data/TF_binding/DREAM_challenge/hidden_test_set_chipseq_data/arrays/"

    metadata = load_metadata()
    leaderboard_samples = set(
        (x.TF, x.CELL_TYPE) for x in metadata if x.HIDDEN_TEST_SET is True)
    args = []
    for tf_name, cell_type in leaderboard_samples:
        idr_fname = "ChIPseq.{sample_name}.{tf_name}.conservative.train.narrowPeak.gz".format(
            sample_name=cell_type, tf_name=tf_name)
        idr_fname = os.path.join(idr_peaks_dir, idr_fname)
        relaxed_peaks_fname = "ChIPseq.{sample_name}.{tf_name}.relaxed.narrowPeak.gz".format(
            sample_name=cell_type, tf_name=tf_name)
        relaxed_peaks_fname = os.path.join(relaxed_peaks_dir,
                                           relaxed_peaks_fname)
        args.append(
            [regions_fname, idr_fname, relaxed_peaks_fname, output_dir, True])
    run_in_parallel(16, build_labels_for_sample_and_factor, args)
Example #8
def copy_chipseq_data(sample_dirs, idr_peaks_output_dir,
                      relaxed_peaks_output_dir, fold_change_output_dir,
                      regions_bed_fname):
    ## first group the IDR optimal peaks, and when there are alternates choose
    ## the experiment that has the largest number of peaks
    optimal_peaks = defaultdict(list)
    for sample_name, pk_fname in find_idr_peaks(
            sample_dirs,
            "/mnt/data/TF_binding/DREAM_challenge/all_data/chipseq/output/DREAM_challenge/{}/out/peak/idr/optimal_set/",
            ".IDR0.05.filt.narrowPeak.gz"):
        optimal_peaks[sample_and_factor_from_samplename(sample_name)].append(
            (sample_name, pk_fname))
    # the new sample_dirs has a single entry for each TF,sample_type combo
    sample_dirs = []
    for (factor, sample), sample_and_fnames in optimal_peaks.items():
        if len(sample_and_fnames) == 1:
            sample_dirs.append(sample_and_fnames[0][0])
        elif len(sample_and_fnames) > 1:
            assert False, "There shouldn't be any samples with more than 1 replicate"
            #print sample_and_fnames[0][0], len(sample_and_fnames)
            best_sample, most_num_lines = None, 0
            for sample_name, fname in sample_and_fnames:
                num_lines = find_num_peaks(fname)
                if num_lines > most_num_lines:
                    best_sample = sample_name
                    most_num_lines = num_lines
            print(best_sample, most_num_lines)
        else:
            assert False, "It shouldn't be possible to have zero fnames"
    ## now that we've refined the sample list, copy all of the peaks and wiggles
    ## into the correct directory
    # copy the IDR peaks
    cmds = []
    for sample_name, pk_fname in find_idr_peaks(
            sample_dirs,
            "/mnt/data/TF_binding/DREAM_challenge/all_data/chipseq/output/DREAM_challenge/{}/out/peak/idr/optimal_set/",
            ".IDR0.05.filt.narrowPeak.gz"):
        sample, factor = sample_and_factor_from_samplename(sample_name)
        ofname = "ChIPseq.{factor}.{sample}.conservative.train.narrowPeak.gz".format(
            factor=factor, sample=sample)
        cmd = "bedtools intersect -wa -u -a {pk_fname} -b {regions_fname} | pigz > {ofname} ".format(
            pk_fname=pk_fname,
            regions_fname=regions_bed_fname,
            ofname=os.path.join(idr_peaks_output_dir, ofname))
        cmds.append([
            cmd,
        ])
    # copy the relaxed peaks
    for sample_name, pk_fname in find_idr_peaks(
            sample_dirs,
            "/mnt/data/TF_binding/DREAM_challenge/all_data/chipseq/output/DREAM_challenge/{}/out/peak/idr/pooled_pseudo_reps/",
            "unthresholded-peaks.txt.gz"):
        sample, factor = sample_and_factor_from_samplename(sample_name)
        ofname = "ChIPseq.{factor}.{sample}.relaxed.narrowPeak.gz".format(
            factor=factor, sample=sample)
        cmd = "bedtools intersect -wa -u -a {pk_fname} -b {regions_fname} | pigz > {ofname} ".format(
            pk_fname=pk_fname,
            regions_fname=regions_bed_fname,
            ofname=os.path.join(relaxed_peaks_output_dir, ofname))
        cmds.append([
            cmd,
        ])

    # run all of the peaks intersection cmds
    run_in_parallel(16, os.system, cmds)

    # copy the fold change wiggles
    print(sample_dirs)
    fold_change_bigwigs = list(
        find_idr_peaks(
            sample_dirs,
            "/mnt/data/TF_binding/DREAM_challenge/all_data/chipseq/output/DREAM_challenge/{}/out/signal/macs2/pooled_rep/",
            ".fc.signal.bw"))
    cmds = []
    for i, (sample_name, pk_fname) in enumerate(fold_change_bigwigs):
        print(i, len(fold_change_bigwigs))
        try:
            sample, factor = sample_and_factor_from_samplename(sample_name)
            ofname = "ChIPseq.{factor}.{sample}.fc.signal.train.bw".format(
                factor=factor, sample=sample)
            try:
                open(ofname)
            except IOError:
                cmd = "bwtool remove mask -inverse {} {} {}".format(
                    regions_bed_fname, pk_fname,
                    os.path.join(fold_change_output_dir, ofname))
                cmds.append([
                    cmd,
                ])
            else:
                pass
            ### the old copy command
            #os.system("cp -u {} {}".format(
            #    pk_fname, os.path.join(fold_change_output_dir, ofname)))
        except FileNotFoundError:
            print "Can't run:", cmd
    run_in_parallel(16, os.system, cmds)