def upload(self, uploader):
    """Send the peak files and their bigBed companions through ``uploader``.

    The overlapping and rejected peak files are passed through
    ``common.compress`` before being uploaded; the two ``*_bb_fn`` files are
    uploaded as they are.  Whatever ``uploader.upload`` returns is stored on
    the attribute named like the ``*_fn`` attribute without its suffix.
    """
    send = uploader.upload

    # Peaks that passed the overlap test, then their bigBed.
    compressed_overlapping = common.compress(self.overlapping_peaks_fn)
    self.overlapping_peaks = send(compressed_overlapping)
    self.overlapping_peaks_bb = send(self.overlapping_peaks_bb_fn)

    # Peaks that were rejected, then their bigBed.
    compressed_rejected = common.compress(self.rejected_peaks_fn)
    self.rejected_peaks = send(compressed_rejected)
    self.rejected_peaks_bb = send(self.rejected_peaks_bb_fn)
Exemplo n.º 2
0
    # Fragment: the enclosing scope starts above this excerpt.  It turns the
    # collected (termID, docID) pairs into one sorted posting list per term
    # and writes them out through common.writePostingsList.

    # Sort the termID-docID pairs by termID
    term_doc_list.sort(key=lambda tuple: tuple[0])
    # Group the docIDs under their termID.
    combined_term_doc_list = {}
    for termID, docID in term_doc_list:
        combined_term_doc_list.setdefault(termID, []).append(docID)
    print >> sys.stderr, 'print posting list to disc for dir:' + dir
    terms = combined_term_doc_list.keys()
    # Sort in place: the original author found sorted() to be much slower
    # than .sort() here.
    # NOTE(review): .sort() on the result of .keys() requires keys() to
    # return a list (Python 2 behaviour) -- confirm the target interpreter.
    terms.sort()
    for termID in terms:
        # Write the posting list to block_pl for this current block: the
        # docIDs are sorted, passed through common.compress and written
        # together with their count.
        combined_term_doc_list[termID].sort()
        numPostings = len(combined_term_doc_list[termID])
        common.writePostingsList(
            block_dict, block_pl, termID,
            common.compress(combined_term_doc_list[termID]), numPostings)
    # Both output files of this block are complete.
    block_pl.close()
    block_dict.close()

# Progress banners on stderr: block construction is done, merging starts.
print >> sys.stderr, '######\nposting list construction finished!\n##########'

print >> sys.stderr, '\nMerging postings...'
# Take the two blocks at the front of block_q on every pass and stop once at
# most one block is left in the queue.
# NOTE(review): the remainder of the loop body is not visible in this excerpt.
while True:
    if len(block_q) <= 1:
        break
    b1 = block_q.popleft()
    b2 = block_q.popleft()
    print >> sys.stderr, 'merging %s and %s' % (b1, b2)
    # Open both postings files and the first block's dictionary file for
    # binary reading; all of them live under out_dir.
    b1_f = open(out_dir + '/' + b1 + '.postings', 'rb')
    b2_f = open(out_dir + '/' + b2 + '.postings', 'rb')
    b1_dict_file = open(out_dir + '/' + b1 + '.dict', 'rb')
Exemplo n.º 3
0
def main(experiment,
         reps_peaks,
         r1pr_peaks,
         r2pr_peaks,
         pooledpr_peaks,
         chrom_sizes,
         as_file,
         blacklist=None):
    """Assemble the final conservative and optimal peak sets.

    The four peak files, the chromosome sizes file and the ``.as`` file are
    downloaded (plus the blacklist when one is given), the peak files are
    passed through ``common.uncompress`` and their lines are counted.

    * The conservative set is taken from the true-replicate peaks.
    * The optimal set is taken from whichever of the true-replicate and
      pooled-pseudoreplicate files has more lines (true replicates on a tie).
    * With a blacklist, both sets are written through ``blacklist_filter``
      to ``<experiment>_final_{conservative,optimal}.narrowPeak``; without
      one, the input file itself is used as the set.

    ``reproducibility_test`` is 'fail' when both the rescue ratio and the
    self-consistency ratio exceed 2, 'borderline' when exactly one does and
    'pass' otherwise.

    Returns:
        dict with the line counts ("Nt", "N1", "N2", "Np"), links to the
        compressed "conservative_set" and "optimal_set", "rescue_ratio",
        "self_consistency_ratio", "reproducibility_test" and, only when
        ``common.bed2bb`` returned a file name, "conservative_set_bb" /
        "optimal_set_bb".
    """
    # TODO for now just taking the peak files.  This applet should actually
    # call IDR instead of putting that in the workflow populator script

    # Platform handlers for every input file.
    true_reps_dx = dxpy.DXFile(reps_peaks)
    rep1_pr_dx = dxpy.DXFile(r1pr_peaks)
    rep2_pr_dx = dxpy.DXFile(r2pr_peaks)
    pooled_pr_dx = dxpy.DXFile(pooledpr_peaks)
    chrom_sizes_dx = dxpy.DXFile(chrom_sizes)
    as_dx = dxpy.DXFile(as_file)

    filtering = blacklist is not None
    if filtering:
        blacklist_dx = dxpy.DXFile(blacklist)
        blacklist_path = 'blacklist_%s' % (blacklist_dx.name)
        dxpy.download_dxfile(blacklist_dx.get_id(), blacklist_path)
        blacklist_path = common.uncompress(blacklist_path)

    # Local file names.  The peak files get a distinct prefix so that inputs
    # sharing a platform name cannot overwrite one another.
    true_reps_path = 'true_%s' % (true_reps_dx.name)
    rep1_pr_path = 'r1pr_%s' % (rep1_pr_dx.name)
    rep2_pr_path = 'r2pr_%s' % (rep2_pr_dx.name)
    pooled_pr_path = 'pooledpr_%s' % (pooled_pr_dx.name)
    chrom_sizes_path = chrom_sizes_dx.name
    as_path = as_dx.name

    # Download the file inputs to the local file system.
    for handler, destination in (
            (true_reps_dx, true_reps_path),
            (rep1_pr_dx, rep1_pr_path),
            (rep2_pr_dx, rep2_pr_path),
            (pooled_pr_dx, pooled_pr_path),
            (chrom_sizes_dx, chrom_sizes_path),
            (as_dx, as_path)):
        dxpy.download_dxfile(handler.get_id(), destination)

    print(subprocess.check_output('ls -l', shell=True))

    true_reps_path = common.uncompress(true_reps_path)
    rep1_pr_path = common.uncompress(rep1_pr_path)
    rep2_pr_path = common.uncompress(rep2_pr_path)
    pooled_pr_path = common.uncompress(pooled_pr_path)

    # Count and report the lines of each peak file, in this fixed order.
    line_counts = []
    for report, path in (
            ("%d peaks from true replicates", true_reps_path),
            ("%d peaks from rep1 self-pseudoreplicates", rep1_pr_path),
            ("%d peaks from rep2 self-pseudoreplicates", rep2_pr_path),
            ("%d peaks from pooled pseudoreplicates", pooled_pr_path)):
        n_lines = common.count_lines(path)
        print(report % (n_lines))
        line_counts.append(n_lines)
    Nt, N1, N2, Np = line_counts

    # Conservative set: always derived from the true-replicate peaks.
    conservative_path = '%s_final_conservative.narrowPeak' % (experiment)
    if filtering:
        blacklist_filter(true_reps_path, conservative_path, blacklist_path)
    else:
        conservative_path = true_reps_path
    Ncb = common.count_lines(conservative_path)
    print("%d peaks blacklisted from the conservative set" % (Nt - Ncb))

    # Optimal set: derived from the longer of the two candidate lists.
    if Np > Nt:
        unfiltered_optimal_path, No = pooled_pr_path, Np
    else:
        unfiltered_optimal_path, No = true_reps_path, Nt

    optimal_path = '%s_final_optimal.narrowPeak' % (experiment)
    if filtering:
        blacklist_filter(unfiltered_optimal_path, optimal_path,
                         blacklist_path)
    else:
        optimal_path = unfiltered_optimal_path
    Nob = common.count_lines(optimal_path)
    print("%d peaks blacklisted from the optimal set" % (No - Nob))

    rescue_ratio = float(max(Np, Nt)) / float(min(Np, Nt))
    self_consistency_ratio = float(max(N1, N2)) / float(min(N1, N2))

    # 0, 1 or 2 ratios above the threshold -> pass, borderline or fail.
    ratios_exceeded = int(rescue_ratio > 2) + int(self_consistency_ratio > 2)
    reproducibility = ('pass', 'borderline', 'fail')[ratios_exceeded]

    output = {}

    # bedtobigbed often fails, so skip creating the bb if it does
    conservative_bb_path = common.bed2bb(conservative_path,
                                         chrom_sizes_path,
                                         as_path)
    optimal_bb_path = common.bed2bb(optimal_path,
                                    chrom_sizes_path,
                                    as_path)
    if conservative_bb_path:
        conservative_bb_dx = dxpy.upload_local_file(conservative_bb_path)
        output["conservative_set_bb"] = dxpy.dxlink(conservative_bb_dx)
    if optimal_bb_path:
        optimal_bb_dx = dxpy.upload_local_file(optimal_bb_path)
        output["optimal_set_bb"] = dxpy.dxlink(optimal_bb_dx)

    # Compress and upload the two peak sets, conservative first.
    conservative_link = dxpy.dxlink(
        dxpy.upload_local_file(common.compress(conservative_path)))
    optimal_link = dxpy.dxlink(
        dxpy.upload_local_file(common.compress(optimal_path)))

    output.update({
        "Nt": Nt,
        "N1": N1,
        "N2": N2,
        "Np": Np,
        "conservative_set": conservative_link,
        "optimal_set": optimal_link,
        "rescue_ratio": rescue_ratio,
        "self_consistency_ratio": self_consistency_ratio,
        "reproducibility_test": reproducibility
    })

    logging.info("Exiting with output: %s", output)
    return output
Exemplo n.º 4
0
def internal_pseudoreplicate_IDR(experiment, r1pr_peaks, rep1_ta, rep1_xcor,
                                 paired_end, chrom_sizes, as_file, blacklist,
                                 rep1_signal, fragment_length=None):
    """Build the stable peak set and FRiP metrics for a single replicate.

    The peaks from the replicate's self-pseudoreplicates are downloaded and
    written to ``<experiment>_stable.narrowPeak`` -- through
    ``blacklist_filter`` when a blacklist is given, as a plain copy
    otherwise.  ``common.frip`` is then run on the stable set and the set is
    uploaded compressed, plus as bigBed when ``common.bed2bb`` returns a
    file name.

    Args:
        experiment: Prefix of the stable-set file name.
        r1pr_peaks: Platform file with the self-pseudoreplicate peaks.
        rep1_ta: Platform file handed to ``common.frip`` as its first
            argument.
        rep1_xcor: Platform file read with ``common.xcor_fraglen``; only
            downloaded when ``fragment_length`` is None.
        paired_end: Not used by this function.
        chrom_sizes: Platform file with the chromosome sizes.
        as_file: Platform file passed to ``common.bed2bb``.
        blacklist: Platform file of regions to filter out, or None.
        rep1_signal: Copied to the output unchanged when truthy.
        fragment_length: Optional fragment length that overrides the value
            read from ``rep1_xcor``.

    Returns:
        dict with "rep1_frip_nreads", "rep1_frip_nreads_in_peaks", "F1",
        "fragment_length_used_rep1", "fragment_length_given_by_user", "N1",
        "Ns", "stable_set" and, when available, "pre_bl_stable_set",
        "stable_set_bb" and "rep1_signal".
    """
    r1pr_peaks_file = dxpy.DXFile(r1pr_peaks)
    rep1_ta = dxpy.DXFile(rep1_ta)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)
    if blacklist is not None:
        blacklist_file = dxpy.DXFile(blacklist)
        blacklist_filename = 'blacklist_%s' % (blacklist_file.name)
        dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename)
        blacklist_filename = common.uncompress(blacklist_filename)

    # Need to prepend something to ensure the local filenames will be unique
    r1pr_peaks_filename = 'r1pr_%s' % (r1pr_peaks_file.name)
    rep1_ta_filename = 'r1ta_%s' % (rep1_ta.name)
    chrom_sizes_filename = chrom_sizes_file.name
    as_file_filename = as_file_file.name
    dxpy.download_dxfile(r1pr_peaks_file.get_id(), r1pr_peaks_filename)
    dxpy.download_dxfile(rep1_ta.get_id(), rep1_ta_filename)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    # A user-supplied fragment length wins; the cross-correlation file is
    # only fetched and read when no override was given.
    if fragment_length is not None:
        rep1_xcor_filename = None
        fragment_length_used_rep1 = fragment_length
        fragment_length_given_by_user = True
    else:
        rep1_xcor = dxpy.DXFile(rep1_xcor)
        rep1_xcor_filename = 'r1xc_%s' % (rep1_xcor.name)
        dxpy.download_dxfile(rep1_xcor.get_id(), rep1_xcor_filename)
        fragment_length_used_rep1 = common.xcor_fraglen(rep1_xcor_filename)
        fragment_length_given_by_user = False

    # Log the working directory listing instead of discarding it.
    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename)
    N1 = common.count_lines(r1pr_peaks_filename)
    logger.info("%d peaks from rep1 self-pseudoreplicates (N1)", N1)
    stable_set_filename = "%s_stable.narrowPeak" % (experiment)
    if blacklist is not None:
        blacklist_filter(r1pr_peaks_filename, stable_set_filename,
                         blacklist_filename)
        Nsb = common.count_lines(stable_set_filename)
        logger.info("%d peaks blacklisted from the stable set", N1 - Nsb)
    else:
        # Pass cp an argv list rather than re-tokenising an interpolated
        # string, so file names containing spaces or quotes survive intact.
        subprocess.check_output(
            ['cp', r1pr_peaks_filename, stable_set_filename])
        Nsb = N1
        logger.info("No blacklist filter applied to the stable set")

    # calculate FRiP

    n_reads, n_reads_in_peaks, frip_score = common.frip(
        rep1_ta_filename, rep1_xcor_filename, stable_set_filename,
        chrom_sizes_filename, fragment_length_used_rep1)

    output = {
        "rep1_frip_nreads": n_reads,
        "rep1_frip_nreads_in_peaks": n_reads_in_peaks,
        "F1": frip_score,
        "fragment_length_used_rep1": fragment_length_used_rep1,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    # These are optional outputs to see what's being removed by the blacklist
    if blacklist:
        output.update({
            "pre_bl_stable_set":
                dxpy.dxlink(dxpy.upload_local_file(common.compress(
                    r1pr_peaks_filename)))}
        )

    # bedtobigbed often fails, so skip creating the bb if it does
    stable_set_bb_filename = \
        common.bed2bb(stable_set_filename, chrom_sizes_filename,
                      as_file_filename)
    if stable_set_bb_filename:
        stable_set_bb_output = \
            dxpy.upload_local_file(stable_set_bb_filename)
        output.update(
            {"stable_set_bb": dxpy.dxlink(stable_set_bb_output)})

    output.update({
        "N1": N1,
        "stable_set": dxpy.dxlink(
            dxpy.upload_local_file(common.compress(stable_set_filename))),
        "Ns": Nsb
    })

    # These are just passed through for convenience so that signals and tracks
    # are available in one place.  Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})

    return output
Exemplo n.º 5
0
def replicated_IDR(experiment,
                   reps_peaks, r1pr_peaks, r2pr_peaks, pooledpr_peaks,
                   rep1_ta, rep1_xcor, rep2_ta, rep2_xcor,
                   paired_end, chrom_sizes, as_file, blacklist,
                   rep1_signal, rep2_signal, pooled_signal,
                   fragment_length=None):
    """Build the conservative/optimal peak sets and FRiP metrics for two replicates.

    Steps, in order:

    1. Download all inputs (and the blacklist when given) and uncompress the
       four peak files.
    2. Run the ``pool`` applet on the two ``*_ta`` inputs and wait for it.
    3. Unless ``fragment_length`` is given, run ``xcor_only`` on the pooled
       output and read fragment lengths from the three cross-correlation
       files with ``common.xcor_fraglen``.
    4. Count the lines of the four peak files (Nt, N1, N2, Np).
    5. Write the conservative set (from the true-replicate peaks) and the
       optimal set (from the longer of true-replicate / pooled-pseudoreplicate
       peaks, true replicates on a tie) -- through ``blacklist_filter`` when
       a blacklist is given, as plain copies otherwise.
    6. Compute the rescue and self-consistency ratios; the reproducibility
       test is 'fail' when both exceed 2, 'borderline' when one does,
       'pass' otherwise.
    7. Run ``common.frip`` four times and upload the results.

    Args:
        experiment: Prefix of the output peak-set file names.
        reps_peaks, r1pr_peaks, r2pr_peaks, pooledpr_peaks: Platform files
            with the peaks from true replicates, each replicate's
            self-pseudoreplicates and the pooled pseudoreplicates.
        rep1_ta, rep2_ta: Platform files handed to the ``pool`` applet and
            to ``common.frip``.
        rep1_xcor, rep2_xcor: Platform files with cross-correlation output.
        paired_end: Forwarded to ``xcor_only``.
        chrom_sizes: Platform file with the chromosome sizes.
        as_file: Platform file passed to ``common.bed2bb``.
        blacklist: Platform file of regions to filter out, or None.
        rep1_signal, rep2_signal, pooled_signal: Copied to the output
            unchanged when truthy.
        fragment_length: Optional fragment length overriding the values
            read from the cross-correlation files.

    Returns:
        dict of counts, ratios, FRiP values, fragment lengths and links to
        the uploaded files (see the ``output`` assignments below).
    """
    # TODO for now just taking the peak files.  This applet should actually
    # call IDR instead of putting that in the workflow populator script

    reps_peaks_file = dxpy.DXFile(reps_peaks)
    r1pr_peaks_file = dxpy.DXFile(r1pr_peaks)
    r2pr_peaks_file = dxpy.DXFile(r2pr_peaks)
    pooledpr_peaks_file = dxpy.DXFile(pooledpr_peaks)
    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep2_ta_file = dxpy.DXFile(rep2_ta)
    # NOTE(review): the per-replicate cross-correlation files are wrapped and
    # downloaded even when fragment_length is supplied -- confirm these
    # inputs are always present.
    rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    rep2_xcor_file = dxpy.DXFile(rep2_xcor)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)
    if blacklist is not None:
        blacklist_file = dxpy.DXFile(blacklist)
        blacklist_filename = 'blacklist_%s' % (blacklist_file.name)
        dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename)
        blacklist_filename = common.uncompress(blacklist_filename)

    # Need to prepend something to ensure the local filenames will be unique
    reps_peaks_filename = 'true_%s' % (reps_peaks_file.name)
    r1pr_peaks_filename = 'r1pr_%s' % (r1pr_peaks_file.name)
    r2pr_peaks_filename = 'r2pr_%s' % (r2pr_peaks_file.name)
    pooledpr_peaks_filename = 'pooledpr_%s' % (pooledpr_peaks_file.name)
    rep1_ta_filename = 'r1ta_%s' % (rep1_ta_file.name)
    rep2_ta_filename = 'r2ta_%s' % (rep2_ta_file.name)
    rep1_xcor_filename = 'r1cc_%s' % (rep1_xcor_file.name)
    rep2_xcor_filename = 'r2cc_%s' % (rep2_xcor_file.name)
    chrom_sizes_filename = chrom_sizes_file.name
    as_file_filename = as_file_file.name

    dxpy.download_dxfile(reps_peaks_file.get_id(), reps_peaks_filename)
    dxpy.download_dxfile(r1pr_peaks_file.get_id(), r1pr_peaks_filename)
    dxpy.download_dxfile(r2pr_peaks_file.get_id(), r2pr_peaks_filename)
    dxpy.download_dxfile(pooledpr_peaks_file.get_id(), pooledpr_peaks_filename)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_filename)
    dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_filename)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_filename)
    dxpy.download_dxfile(rep2_xcor_file.get_id(), rep2_xcor_filename)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    reps_peaks_filename = common.uncompress(reps_peaks_filename)
    r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename)
    r2pr_peaks_filename = common.uncompress(r2pr_peaks_filename)
    pooledpr_peaks_filename = common.uncompress(pooledpr_peaks_filename)

    # Pool the two replicates with the project's 'pool' applet.
    pool_applet = dxpy.find_one_data_object(
        classname='applet',
        name='pool',
        project=dxpy.PROJECT_CONTEXT_ID,
        zero_ok=False,
        more_ok=False,
        return_handler=True)
    pool_replicates_subjob = \
        pool_applet.run(
            {"inputs": [rep1_ta, rep2_ta],
             "prefix": 'pooled_reps'},
            name='Pool replicates')
    # TODO: this wait could possibly be issued later, closer to where the
    # pooled output is first needed, to save time.
    pool_replicates_subjob.wait_on_done()

    # If fragment_length is not given, calculate the fragment_length
    # using crosscorrelation. Else use the override value. Set the
    # pool_xcor_filename to None to accommodate common.frip calls.
    # Calculate, or set, actually used fragment lengths for different
    # cases. Set the flag indicating whether the fragment length
    # was given by the user.
    if fragment_length is not None:
        pool_xcor_filename = None
        fragment_length_used_rep1 = fragment_length
        fragment_length_used_rep2 = fragment_length
        fragment_length_used_pool = fragment_length
        fragment_length_given_by_user = True
    else:
        pooled_replicates_xcor_subjob = \
            xcor_only(
                pool_replicates_subjob.get_output_ref("pooled"),
                paired_end,
                spp_version=None,
                name='Pool cross-correlation')
        pooled_replicates_xcor_subjob.wait_on_done()
        pool_xcor_link = \
            pooled_replicates_xcor_subjob.describe()['output'].get(
                "CC_scores_file")
        pool_xcor_file = dxpy.get_handler(pool_xcor_link)
        pool_xcor_filename = 'poolcc_%s' % (pool_xcor_file.name)
        dxpy.download_dxfile(pool_xcor_file.get_id(), pool_xcor_filename)
        fragment_length_used_rep1 = common.xcor_fraglen(rep1_xcor_filename)
        fragment_length_used_rep2 = common.xcor_fraglen(rep2_xcor_filename)
        fragment_length_used_pool = common.xcor_fraglen(pool_xcor_filename)
        fragment_length_given_by_user = False

    # Fetch the pooled output of the pool subjob.
    pool_ta_link = pool_replicates_subjob.describe()['output'].get("pooled")
    pool_ta_file = dxpy.get_handler(pool_ta_link)
    pool_ta_filename = 'poolta_%s' % (pool_ta_file.name)
    dxpy.download_dxfile(pool_ta_file.get_id(), pool_ta_filename)

    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    Nt = common.count_lines(reps_peaks_filename)
    logger.info("%d peaks from true replicates (Nt)", Nt)
    N1 = common.count_lines(r1pr_peaks_filename)
    logger.info("%d peaks from rep1 self-pseudoreplicates (N1)", N1)
    N2 = common.count_lines(r2pr_peaks_filename)
    logger.info("%d peaks from rep2 self-pseudoreplicates (N2)", N2)
    Np = common.count_lines(pooledpr_peaks_filename)
    logger.info("%d peaks from pooled pseudoreplicates (Np)", Np)

    # generate the conservative set, which is always based on the IDR peaks
    # from true replicates
    conservative_set_filename = \
        '%s_final_conservative.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(reps_peaks_filename, conservative_set_filename,
                         blacklist_filename)
        Ncb = common.count_lines(conservative_set_filename)
        logger.info(
            "%d peaks blacklisted from the conservative set", Nt - Ncb)
    else:
        # Pass cp an argv list rather than re-tokenising an interpolated
        # string, so file names containing spaces or quotes survive intact.
        subprocess.check_output(
            ['cp', reps_peaks_filename, conservative_set_filename])
        Ncb = Nt
        logger.info("No blacklist filter applied to the conservative set")

    # generate the optimal set, which is based on the longest of IDR peaks
    # list from true reps or the IDR peaks from the pseudoreplicates of the
    # pool
    if Nt >= Np:
        peaks_to_filter_filename = reps_peaks_filename
        No = Nt
    else:
        peaks_to_filter_filename = pooledpr_peaks_filename
        No = Np
    optimal_set_filename = '%s_final_optimal.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(peaks_to_filter_filename, optimal_set_filename,
                         blacklist_filename)
        Nob = common.count_lines(optimal_set_filename)
        logger.info("%d peaks blacklisted from the optimal set", No - Nob)
    else:
        # Same argv-list form of cp as for the conservative set.
        subprocess.check_output(
            ['cp', peaks_to_filter_filename, optimal_set_filename])
        Nob = No
        logger.info("No blacklist filter applied to the optimal set")

    # NOTE(review): both divisions raise ZeroDivisionError when the smaller
    # count is 0 -- confirm whether empty peak files can reach this point.
    rescue_ratio = float(max(Np, Nt)) / float(min(Np, Nt))
    self_consistency_ratio = float(max(N1, N2)) / float(min(N1, N2))

    if rescue_ratio > 2 and self_consistency_ratio > 2:
        reproducibility = 'fail'
    elif rescue_ratio > 2 or self_consistency_ratio > 2:
        reproducibility = 'borderline'
    else:
        reproducibility = 'pass'

    # FRiP (fraction reads in peaks)
    # NOTE(review): the raw fragment_length argument (possibly None) is
    # passed here, not the fragment_length_used_* values; presumably
    # common.frip falls back to the cross-correlation file when it is
    # None -- confirm.
    # rep1 stable peaks comparing internal pseudoreplicates
    rep1_n_reads, rep1_n_reads_in_peaks, rep1_frip_score = common.frip(
        rep1_ta_filename, rep1_xcor_filename, r1pr_peaks_filename,
        chrom_sizes_filename, fragment_length)
    # rep2 stable peaks comparing internal pseudoreplicates
    rep2_n_reads, rep2_n_reads_in_peaks, rep2_frip_score = common.frip(
        rep2_ta_filename, rep2_xcor_filename, r2pr_peaks_filename,
        chrom_sizes_filename, fragment_length)
    # comparing true reps
    true_n_reads, true_n_reads_in_peaks, true_frip_score = common.frip(
        pool_ta_filename, pool_xcor_filename, reps_peaks_filename,
        chrom_sizes_filename, fragment_length)
    # comparing pooled pseudoreplicates
    pr_n_reads, pr_n_reads_in_peaks, pr_frip_score = common.frip(
        pool_ta_filename, pool_xcor_filename, pooledpr_peaks_filename,
        chrom_sizes_filename, fragment_length)

    output = {
        "rep1_frip_nreads": rep1_n_reads,
        "rep1_frip_nreads_in_peaks": rep1_n_reads_in_peaks,
        "F1": rep1_frip_score,
        "rep2_frip_nreads": rep2_n_reads,
        "rep2_frip_nreads_in_peaks": rep2_n_reads_in_peaks,
        "F2": rep2_frip_score,
        "true_frip_nreads": true_n_reads,
        "true_frip_nreads_in_peaks": true_n_reads_in_peaks,
        "Ft": true_frip_score,
        "pr_frip_nreads": pr_n_reads,
        "pr_frip_nreads_in_peaks": pr_n_reads_in_peaks,
        "Fp": pr_frip_score,
        "fragment_length_used_rep1": fragment_length_used_rep1,
        "fragment_length_used_rep2": fragment_length_used_rep2,
        "fragment_length_used_pool": fragment_length_used_pool,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    # These are optional outputs to see what's being removed by the blacklist
    if blacklist:
        output.update({
            "pre_bl_conservative_set":
                dxpy.dxlink(dxpy.upload_local_file(common.compress(
                    reps_peaks_filename))),
            "pre_bl_optimal_set":
                dxpy.dxlink(dxpy.upload_local_file(common.compress(
                    peaks_to_filter_filename)))}
        )

    # bedtobigbed often fails, so skip creating the bb if it does
    conservative_set_bb_filename = \
        common.bed2bb(conservative_set_filename, chrom_sizes_filename,
                      as_file_filename)
    optimal_set_bb_filename = \
        common.bed2bb(optimal_set_filename, chrom_sizes_filename,
                      as_file_filename)
    if conservative_set_bb_filename:
        conservative_set_bb_output = \
            dxpy.upload_local_file(conservative_set_bb_filename)
        output.update(
            {"conservative_set_bb": dxpy.dxlink(conservative_set_bb_output)})
    if optimal_set_bb_filename:
        optimal_set_bb_output = dxpy.upload_local_file(optimal_set_bb_filename)
        output.update(
            {"optimal_set_bb": dxpy.dxlink(optimal_set_bb_output)})

    output.update({
        "Nt": Nt,
        "N1": N1,
        "N2": N2,
        "Np": Np,
        "conservative_set": dxpy.dxlink(
            dxpy.upload_local_file(
                common.compress(conservative_set_filename))),
        "optimal_set": dxpy.dxlink(
            dxpy.upload_local_file(common.compress(optimal_set_filename))),
        "rescue_ratio": rescue_ratio,
        "self_consistency_ratio": self_consistency_ratio,
        "reproducibility_test": reproducibility,
        "No": Nob,
        "Nc": Ncb
    })

    # These are just passed through for convenience so that signals and tracks
    # are available in one place.  Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})
    if rep2_signal:
        output.update({"rep2_signal": rep2_signal})
    if pooled_signal:
        output.update({"pooled_signal": pooled_signal})

    return output
Exemplo n.º 6
0
def _peak_type_settings(peak_type):
    """Return ``(awk_command, cut_command, bed_type)`` for ``peak_type``.

    The peak types differ only in how many columns a record has, which
    moves the column positions used by the awk filter, the column range
    kept by ``cut`` and the bed type handed to ``common.bed2bb``.

    Raises:
        AssertionError: if ``peak_type`` is not narrowPeak, gappedPeak or
            broadPeak.  Raised explicitly (rather than through ``assert``)
            so the check also runs under ``python -O``.
    """
    settings = {
        "narrowPeak": (
            r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'""",
            'cut -f 1-10',
            'bed6+4'),
        "gappedPeak": (
            r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'""",
            'cut -f 1-15',
            'bed12+3'),
        "broadPeak": (
            r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'""",
            'cut -f 1-9',
            'bed6+3'),
    }
    if peak_type not in settings:
        raise AssertionError(
            "%s is unrecognized.  peak_type should be narrowPeak, gappedPeak or broadPeak." % (
                peak_type))
    return settings[peak_type]


def main(rep1_peaks,
         rep2_peaks,
         pooled_peaks,
         pooledpr1_peaks,
         pooledpr2_peaks,
         chrom_sizes,
         as_file,
         peak_type,
         prefix=None,
         rep1_signal=None,
         rep2_signal=None,
         pooled_signal=None):
    """Split the pooled peaks into replicated and rejected peaks.

    A pooled peak is kept when it overlaps a peak in both true replicates,
    or a peak in both pooled pseudoreplicates.  The awk filter accepts an
    overlap when it covers at least half of either peak of the pair.  All
    remaining pooled peaks are written to the rejected file.  Both files
    are converted with ``common.bed2bb`` and uploaded.

    Args:
        rep1_peaks, rep2_peaks: Platform files with each replicate's peaks.
        pooled_peaks: Platform file with the peaks called on the pool.
        pooledpr1_peaks, pooledpr2_peaks: Platform files with the peaks of
            the two pooled pseudoreplicates.
        chrom_sizes: Platform file with the chromosome sizes.
        as_file: Platform file passed to ``common.bed2bb``.
        peak_type: "narrowPeak", "gappedPeak" or "broadPeak".
        prefix: Base name of the output files; when falsy it is derived
            from the pooled peaks file name.
        rep1_signal, rep2_signal, pooled_signal: Copied to the output
            unchanged when truthy.

    Returns:
        dict with links to "overlapping_peaks", "overlapping_peaks_bb",
        "rejected_peaks" and "rejected_peaks_bb", the counts "npeaks_in",
        "npeaks_out" and "npeaks_rejected", and any signals passed through.

    Raises:
        AssertionError: if ``peak_type`` is not supported; raised before
            any file is downloaded.
    """
    # The only difference between the peak_types is how the extra columns
    # are handled.  Resolve this first so a bad peak_type fails immediately.
    awk_command, cut_command, bed_type = _peak_type_settings(peak_type)

    # Initialize data object inputs on the platform
    # into dxpy.DXDataObject instances

    rep1_peaks = dxpy.DXFile(rep1_peaks)
    rep2_peaks = dxpy.DXFile(rep2_peaks)
    pooled_peaks = dxpy.DXFile(pooled_peaks)
    pooledpr1_peaks = dxpy.DXFile(pooledpr1_peaks)
    pooledpr2_peaks = dxpy.DXFile(pooledpr2_peaks)
    chrom_sizes = dxpy.DXFile(chrom_sizes)
    as_file = dxpy.DXFile(as_file)

    # Input filenames - necessary to define each explicitly because input
    # files could have the same name, in which case subsequent file would
    # overwrite previous file
    rep1_peaks_fn = 'rep1-%s' % (rep1_peaks.name)
    rep2_peaks_fn = 'rep2-%s' % (rep2_peaks.name)
    pooled_peaks_fn = 'pooled-%s' % (pooled_peaks.name)
    pooledpr1_peaks_fn = 'pooledpr1-%s' % (pooledpr1_peaks.name)
    pooledpr2_peaks_fn = 'pooledpr2-%s' % (pooledpr2_peaks.name)
    chrom_sizes_fn = 'chrom.sizes'
    as_file_fn = '%s.as' % (peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        # strip off the peak and compression extensions
        m = re.match(
            r'(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
            pooled_peaks.name)
        if m:
            basename = m.group(1)
        else:
            basename = pooled_peaks.name

    overlapping_peaks_fn = '%s.replicated.%s' % (basename, peak_type)
    rejected_peaks_fn = '%s.rejected.%s' % (basename, peak_type)

    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
    overlap_pr_fn = 'replicated_pr.%s' % (peak_type)

    # Download file inputs to the local file system with local filenames

    dxpy.download_dxfile(rep1_peaks.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(pooledpr1_peaks.get_id(), pooledpr1_peaks_fn)
    dxpy.download_dxfile(pooledpr2_peaks.get_id(), pooledpr2_peaks_fn)
    dxpy.download_dxfile(chrom_sizes.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file.get_id(), as_file_fn)

    # Find pooled peaks that overlap Rep1 and Rep2, where overlap is defined
    # as the fractional overlap wrt any one of the overlapping peak pairs
    # being >= 0.5.  Each intersect is followed by the awk filter, a cut
    # back to the original columns and a de-duplication.
    common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, rep1_peaks_fn),
        awk_command, cut_command, 'sort -u',
        'intersectBed -wo -a stdin -b %s' % (rep2_peaks_fn),
        awk_command, cut_command, 'sort -u'
    ], overlap_tr_fn)
    print("%d peaks overlap with both true replicates" % (
        common.count_lines(overlap_tr_fn)))

    # Find pooled peaks that overlap PseudoRep1 and PseudoRep2 with the same
    # overlap definition.
    common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, pooledpr1_peaks_fn),
        awk_command, cut_command, 'sort -u',
        'intersectBed -wo -a stdin -b %s' % (pooledpr2_peaks_fn),
        awk_command, cut_command, 'sort -u'
    ], overlap_pr_fn)
    print("%d peaks overlap with both pooled pseudoreplicates" % (
        common.count_lines(overlap_pr_fn)))

    # Combine peak lists
    common.run_pipe(
        ['cat %s %s' % (overlap_tr_fn, overlap_pr_fn), 'sort -u'],
        overlapping_peaks_fn)
    print("%d peaks overlap with true replicates or with pooled pseudorepliates" % (
        common.count_lines(overlapping_peaks_fn)))

    # Rejected peaks: pooled peaks with no counterpart in the combined list
    common.run_pipe([
        'intersectBed -wa -v -a %s -b %s' %
        (pooled_peaks_fn, overlapping_peaks_fn)
    ], rejected_peaks_fn)
    print("%d peaks were rejected" % (common.count_lines(rejected_peaks_fn)))

    npeaks_in = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected = common.count_lines(rejected_peaks_fn)

    # Make bigBed files for visualization
    # NOTE(review): the return values of common.bed2bb are uploaded without
    # checking them -- confirm whether bed2bb can return a falsy value on
    # failure and a guard is wanted here.
    overlapping_peaks_bb_fn = common.bed2bb(overlapping_peaks_fn,
                                            chrom_sizes_fn,
                                            as_file_fn,
                                            bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(rejected_peaks_fn,
                                         chrom_sizes_fn,
                                         as_file_fn,
                                         bed_type=bed_type)

    # Upload file outputs from the local file system.

    overlapping_peaks = dxpy.upload_local_file(
        common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb = dxpy.upload_local_file(rejected_peaks_bb_fn)

    output = {
        "overlapping_peaks": dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb": dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks": dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb": dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in": npeaks_in,
        "npeaks_out": npeaks_out,
        'npeaks_rejected': npeaks_rejected
    }

    # These are just passed through for convenience so that signals and tracks
    # are available in one place.  Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})
    if rep2_signal:
        output.update({"rep2_signal": rep2_signal})
    if pooled_signal:
        output.update({"pooled_signal": pooled_signal})

    return output
Exemplo n.º 7
0
  # Fragment: the enclosing scope starts above this excerpt.  It turns the
  # collected (termID, docID) pairs into one sorted posting list per term
  # and writes them out through common.writePostingsList.
  print >> sys.stderr, 'sorting term doc list for dir:' + dir
  # Materialise the pairs as a list so they can be sorted in place.
  term_doc_list = list(term_doc_list)
  # Sort the termID-docID pairs by termID
  term_doc_list.sort(key=lambda tuple:tuple[0])
  # Group the docIDs under their termID.
  combined_term_doc_list = {}
  for termID,docID in term_doc_list:
      combined_term_doc_list.setdefault(termID,[]).append(docID)
  print >> sys.stderr, 'print posting list to disc for dir:' + dir
  terms = combined_term_doc_list.keys()
  # Sort in place: the original author found sorted() to be much slower
  # than .sort() here.
  # NOTE(review): .sort() on the result of .keys() requires keys() to
  # return a list (Python 2 behaviour) -- confirm the target interpreter.
  terms.sort()
  for termID in terms:
    # Write the posting list to block_pl for this current block: the docIDs
    # are sorted, passed through common.compress and written together with
    # their count.
    combined_term_doc_list[termID].sort()
    numPostings = len(combined_term_doc_list[termID])
    common.writePostingsList( block_dict, block_pl, termID, common.compress(combined_term_doc_list[termID]), numPostings)
  # Both output files of this block are complete.
  block_pl.close()
  block_dict.close()

 


# Progress banners on stderr: block construction is done, merging starts.
print >> sys.stderr, '######\nposting list construction finished!\n##########'

print >> sys.stderr, '\nMerging postings...'
# Take the two blocks at the front of block_q on every pass and stop once at
# most one block is left in the queue.
# NOTE(review): the remainder of the loop body is not visible in this excerpt.
while True:
  if len(block_q) <= 1:
    break
  b1 = block_q.popleft()
  b2 = block_q.popleft()
  print >> sys.stderr, 'merging %s and %s' % (b1, b2)
Exemplo n.º 8
0
def internal_pseudoreplicate_overlap(rep1_peaks, rep2_peaks, pooled_peaks,
                                     rep1_ta, rep1_xcor,
                                     paired_end, chrom_sizes, as_file,
                                     peak_type, prefix, fragment_length=None):
    """Keep the pooled peaks that overlap both ``rep1_peaks`` and ``rep2_peaks``.

    This is the "simplicate" analysis: there is no second peak list to
    combine with, so the replicated set is just the pooled peaks that
    overlap both supplied peak sets.  The function downloads its inputs,
    runs the ``intersectBed`` pipelines through ``common.run_pipe``,
    computes FRiP with ``common.frip``, builds bigBed files with
    ``common.bed2bb`` and uploads the results.

    Args:
        rep1_peaks, rep2_peaks, pooled_peaks: peak files, given as whatever
            ``dxpy.DXFile`` accepts (file ID or link).
        rep1_ta: tagAlign file used for the FRiP calculation.
        rep1_xcor: cross-correlation scores file; the fragment length is
            read from it when ``fragment_length`` is None.
        paired_end: accepted for interface compatibility; not used here.
        chrom_sizes: chromosome sizes file.
        as_file: autoSql file passed to ``common.bed2bb``.
        peak_type: one of 'narrowPeak', 'gappedPeak' or 'broadPeak'.
        prefix: basename for the output files.  When falsy, the basename is
            derived from the pooled peaks file name.
        fragment_length: optional user-supplied fragment length.

    Returns:
        dict with links to the uploaded peak/bigBed files, the peak counts,
        the FRiP statistics and the fragment length that was used.

    Raises:
        AssertionError: if ``peak_type`` is not a recognized type.
    """
    rep1_peaks_file = dxpy.DXFile(rep1_peaks)
    rep2_peaks_file = dxpy.DXFile(rep2_peaks)
    pooled_peaks_file = dxpy.DXFile(pooled_peaks)
    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)

    # Input filenames - necessary to define each explicitly because input files
    # could have the same name, in which case subsequent
    # file would overwrite previous file
    rep1_peaks_fn = 'rep1-%s' % (rep1_peaks_file.name)
    rep2_peaks_fn = 'rep2-%s' % (rep2_peaks_file.name)
    pooled_peaks_fn = 'pooled-%s' % (pooled_peaks_file.name)
    rep1_ta_fn = 'r1ta_%s' % (rep1_ta_file.name)
    rep1_xcor_fn = 'r1xc_%s' % (rep1_xcor_file.name)
    chrom_sizes_fn = 'chrom.sizes'
    as_file_fn = '%s.as' % (peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        # Strip off the peak and compression extensions.  The name must come
        # from the DXFile handler: ``pooled_peaks`` itself is only the raw
        # file reference and has no ``name`` attribute.
        pooled_name = pooled_peaks_file.name
        m = re.match(
            r'(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
            pooled_name)
        basename = m.group(1) if m else pooled_name

    overlapping_peaks_fn = '%s.replicated.%s' % (basename, peak_type)
    rejected_peaks_fn = '%s.rejected.%s' % (basename, peak_type)

    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)

    # Download file inputs to the local file system with local filenames

    dxpy.download_dxfile(rep1_peaks_file.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks_file.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks_file.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_fn)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_fn)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_fn)

    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    # the only difference between the peak_types is how the extra columns are
    # handled
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        # Raised explicitly (rather than via ``assert``) so the check is not
        # stripped under ``python -O``; the exception type is unchanged.
        raise AssertionError(
            "%s is unrecognized.  peak_type should be narrowPeak, gappedPeak or broadPeak." % (peak_type))

    # Find pooled peaks that overlap Rep1 and Rep2 where overlap is defined as
    # the fractional overlap wrt any one of the overlapping peak pairs >= 0.5
    common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, rep1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (rep2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_tr_fn)
    print(
        "%d peaks overlap with both true replicates"
        % (common.count_lines(overlap_tr_fn)))

    # this is a simplicate analysis
    # overlapping peaks are just based on pseudoreps of the one pool
    common.run_pipe([
        'cat %s' % (overlap_tr_fn),
        'sort -u'
        ], overlapping_peaks_fn)
    print(
        "%d peaks overlap"
        % (common.count_lines(overlapping_peaks_fn)))

    # rejected peaks
    common.run_pipe([
        'intersectBed -wa -v -a %s -b %s' % (pooled_peaks_fn, overlapping_peaks_fn)
        ], rejected_peaks_fn)
    print("%d peaks were rejected" % (common.count_lines(rejected_peaks_fn)))

    # calculate FRiP (Fraction of Reads in Peaks)

    # Extract the fragment length estimate from column 3 of the
    # cross-correlation scores file or use the user-defined
    # fragment_length if given.
    if fragment_length is not None:
        fraglen = fragment_length
        fragment_length_given_by_user = True
    else:
        fraglen = common.xcor_fraglen(rep1_xcor_fn)
        fragment_length_given_by_user = False

    # FRiP
    reads_in_peaks_fn = 'reads_in_%s.ta' % (peak_type)
    n_reads, n_reads_in_peaks, frip_score = common.frip(
        rep1_ta_fn, rep1_xcor_fn, overlapping_peaks_fn,
        chrom_sizes_fn, fraglen,
        reads_in_peaks_fn=reads_in_peaks_fn)

    # count peaks
    npeaks_in = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected = common.count_lines(rejected_peaks_fn)

    # make bigBed files for visualization; bed2bb returns the bigBed filename
    overlapping_peaks_bb_fn = common.bed2bb(
        overlapping_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(
        rejected_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)

    # Upload file outputs from the local file system.

    overlapping_peaks = dxpy.upload_local_file(common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb = dxpy.upload_local_file(rejected_peaks_bb_fn)

    output = {
        "overlapping_peaks": dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb": dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks": dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb": dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in": npeaks_in,
        "npeaks_out": npeaks_out,
        "npeaks_rejected": npeaks_rejected,
        "frip_nreads": n_reads,
        "frip_nreads_in_peaks": n_reads_in_peaks,
        "frip_score": frip_score,
        "fragment_length_used": fraglen,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    return output
Exemplo n.º 9
0
def replicated_overlap(rep1_peaks,
                       rep2_peaks,
                       pooled_peaks,
                       pooledpr1_peaks,
                       pooledpr2_peaks,
                       rep1_ta,
                       rep1_xcor,
                       rep2_ta,
                       rep2_xcor,
                       paired_end,
                       chrom_sizes,
                       as_file,
                       peak_type,
                       prefix,
                       fragment_length=None):
    """Select pooled peaks supported by true replicates or pooled pseudoreps.

    A pooled peak is kept when it overlaps both true replicates, or both
    pooled pseudoreplicates; the two lists are concatenated and
    de-duplicated.  The function also launches the ``pool`` applet on the
    two tagAlign inputs, computes FRiP on the pooled reads with
    ``common.frip``, builds bigBed files with ``common.bed2bb`` and uploads
    the results.

    Args:
        rep1_peaks, rep2_peaks: peak files of the true replicates.
        pooled_peaks: peak file called on the pooled data.
        pooledpr1_peaks, pooledpr2_peaks: peak files of the pooled
            pseudoreplicates.
        rep1_ta, rep2_ta: tagAlign files, pooled by the ``pool`` applet.
        rep1_xcor, rep2_xcor: cross-correlation files (downloaded here).
        paired_end: forwarded to ``xcor_only`` when the fragment length has
            to be estimated.
        chrom_sizes: chromosome sizes file.
        as_file: autoSql file passed to ``common.bed2bb``.
        peak_type: one of 'narrowPeak', 'gappedPeak' or 'broadPeak'.
        prefix: basename for the output files.  When falsy, the basename is
            derived from the pooled peaks file name.
        fragment_length: optional user-supplied fragment length; when given,
            the pooled cross-correlation subjob is skipped.

    All file arguments are given as whatever ``dxpy.DXFile`` accepts.

    Returns:
        dict with links to the uploaded peak/bigBed files, the peak counts,
        the FRiP statistics and the fragment length that was used.

    Raises:
        AssertionError: if ``peak_type`` is not a recognized type.
    """
    rep1_peaks_file = dxpy.DXFile(rep1_peaks)
    rep2_peaks_file = dxpy.DXFile(rep2_peaks)
    pooled_peaks_file = dxpy.DXFile(pooled_peaks)
    pooledpr1_peaks_file = dxpy.DXFile(pooledpr1_peaks)
    pooledpr2_peaks_file = dxpy.DXFile(pooledpr2_peaks)
    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep2_ta_file = dxpy.DXFile(rep2_ta)
    rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    rep2_xcor_file = dxpy.DXFile(rep2_xcor)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)

    # Input filenames - necessary to define each explicitly because input files
    # could have the same name, in which case subsequent
    # file would overwrite previous file
    rep1_peaks_fn = 'rep1-%s' % (rep1_peaks_file.name)
    rep2_peaks_fn = 'rep2-%s' % (rep2_peaks_file.name)
    pooled_peaks_fn = 'pooled-%s' % (pooled_peaks_file.name)
    pooledpr1_peaks_fn = 'pooledpr1-%s' % (pooledpr1_peaks_file.name)
    pooledpr2_peaks_fn = 'pooledpr2-%s' % (pooledpr2_peaks_file.name)
    rep1_ta_fn = 'r1ta_%s' % (rep1_ta_file.name)
    rep2_ta_fn = 'r2ta_%s' % (rep2_ta_file.name)
    rep1_xcor_fn = 'r1cc_%s' % (rep1_xcor_file.name)
    rep2_xcor_fn = 'r2cc_%s' % (rep2_xcor_file.name)
    chrom_sizes_fn = 'chrom.sizes'
    as_file_fn = '%s.as' % (peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        # Strip off the peak and compression extensions.  The name must come
        # from the DXFile handler: ``pooled_peaks`` itself is only the raw
        # file reference and has no ``name`` attribute.
        pooled_name = pooled_peaks_file.name
        m = re.match(r'(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
                     pooled_name)
        basename = m.group(1) if m else pooled_name

    overlapping_peaks_fn = '%s.replicated.%s' % (basename, peak_type)
    rejected_peaks_fn = '%s.rejected.%s' % (basename, peak_type)

    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
    overlap_pr_fn = 'replicated_pr.%s' % (peak_type)

    # Download file inputs to the local file system with local filenames

    dxpy.download_dxfile(rep1_peaks_file.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks_file.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks_file.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(pooledpr1_peaks_file.get_id(), pooledpr1_peaks_fn)
    dxpy.download_dxfile(pooledpr2_peaks_file.get_id(), pooledpr2_peaks_fn)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_fn)
    dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_fn)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_fn)
    dxpy.download_dxfile(rep2_xcor_file.get_id(), rep2_xcor_fn)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_fn)

    pool_applet = dxpy.find_one_data_object(classname='applet',
                                            name='pool',
                                            project=dxpy.PROJECT_CONTEXT_ID,
                                            zero_ok=False,
                                            more_ok=False,
                                            return_handler=True)
    pool_replicates_subjob = \
        pool_applet.run(
            {"inputs": [rep1_ta, rep2_ta],
             "prefix": 'pooled_reps'},
            name='Pool replicates')
    # If fragment length was given by user we skip pooled_replicates
    # _xcor_subjob, set the pool_xcor_filename to None, and update
    # the flag fragment_length_given_by_user. Otherwise, run the subjob
    # to be able to extract the fragment length from cross-correlations.
    if fragment_length is not None:
        pool_xcor_filename = None
        fraglen = fragment_length
        fragment_length_given_by_user = True
    else:
        pooled_replicates_xcor_subjob = \
            xcor_only(
                pool_replicates_subjob.get_output_ref("pooled"),
                paired_end,
                spp_version=None,
                name='Pool cross-correlation')
        pooled_replicates_xcor_subjob.wait_on_done()
        pool_xcor_link = pooled_replicates_xcor_subjob.describe(
        )['output'].get("CC_scores_file")
        pool_xcor_file = dxpy.get_handler(pool_xcor_link)
        pool_xcor_filename = 'poolcc_%s' % (pool_xcor_file.name)
        dxpy.download_dxfile(pool_xcor_file.get_id(), pool_xcor_filename)
        fraglen = common.xcor_fraglen(pool_xcor_filename)
        fragment_length_given_by_user = False

    # The pooled tagAlign is needed for FRiP in either branch above.
    pool_replicates_subjob.wait_on_done()
    pool_ta_link = pool_replicates_subjob.describe()['output'].get("pooled")
    pool_ta_file = dxpy.get_handler(pool_ta_link)
    pool_ta_filename = 'poolta_%s' % (pool_ta_file.name)
    dxpy.download_dxfile(pool_ta_file.get_id(), pool_ta_filename)

    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    # the only difference between the peak_types is how the extra columns are
    # handled
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        # Raised explicitly (rather than via ``assert``) so the check is not
        # stripped under ``python -O``; the exception type is unchanged.
        raise AssertionError(
            "%s is unrecognized.  peak_type should be narrowPeak, gappedPeak or broadPeak." % (
                peak_type))

    # Find pooled peaks that overlap Rep1 and Rep2 where overlap is defined as
    # the fractional overlap wrt any one of the overlapping peak pairs >= 0.5
    common.run_pipe([
        'intersectBed -wo -a %s -b %s' %
        (pooled_peaks_fn, rep1_peaks_fn), awk_command, cut_command, 'sort -u',
        'intersectBed -wo -a stdin -b %s' %
        (rep2_peaks_fn), awk_command, cut_command, 'sort -u'
    ], overlap_tr_fn)
    print("%d peaks overlap with both true replicates" %
          (common.count_lines(overlap_tr_fn)))

    # Find pooled peaks that overlap PseudoRep1 and PseudoRep2 where
    # overlap is defined as the fractional overlap wrt any one of the
    # overlapping peak pairs >= 0.5
    common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, pooledpr1_peaks_fn),
        awk_command, cut_command, 'sort -u',
        'intersectBed -wo -a stdin -b %s' %
        (pooledpr2_peaks_fn), awk_command, cut_command, 'sort -u'
    ], overlap_pr_fn)
    print("%d peaks overlap with both pooled pseudoreplicates" %
          (common.count_lines(overlap_pr_fn)))

    # Combine peak lists
    common.run_pipe(
        ['cat %s %s' % (overlap_tr_fn, overlap_pr_fn), 'sort -u'],
        overlapping_peaks_fn)
    print(
        "%d peaks overlap with true replicates or with pooled pseudoreplicates"
        % (common.count_lines(overlapping_peaks_fn)))

    # rejected peaks
    common.run_pipe([
        'intersectBed -wa -v -a %s -b %s' %
        (pooled_peaks_fn, overlapping_peaks_fn)
    ], rejected_peaks_fn)
    print("%d peaks were rejected" % (common.count_lines(rejected_peaks_fn)))

    # calculate FRiP (Fraction of Reads in Peaks)
    # pool_xcor_filename is None when the user supplied fragment_length.
    reads_in_peaks_fn = 'reads_in_%s.ta' % (peak_type)
    n_reads, n_reads_in_peaks, frip_score = common.frip(
        pool_ta_filename,
        pool_xcor_filename,
        overlapping_peaks_fn,
        chrom_sizes_fn,
        fraglen,
        reads_in_peaks_fn=reads_in_peaks_fn)

    # count peaks
    npeaks_in = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected = common.count_lines(rejected_peaks_fn)

    # make bigBed files for visualization; bed2bb returns the bigBed filename
    overlapping_peaks_bb_fn = common.bed2bb(overlapping_peaks_fn,
                                            chrom_sizes_fn,
                                            as_file_fn,
                                            bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(rejected_peaks_fn,
                                         chrom_sizes_fn,
                                         as_file_fn,
                                         bed_type=bed_type)

    # Upload file outputs from the local file system.

    overlapping_peaks = dxpy.upload_local_file(
        common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb = dxpy.upload_local_file(rejected_peaks_bb_fn)

    output = {
        "overlapping_peaks": dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb": dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks": dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb": dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in": npeaks_in,
        "npeaks_out": npeaks_out,
        "npeaks_rejected": npeaks_rejected,
        "frip_nreads": n_reads,
        "frip_nreads_in_peaks": n_reads_in_peaks,
        "frip_score": frip_score,
        "fragment_length_used": fraglen,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    return output
Exemplo n.º 10
0
def main(experiment, reps_peaks, r1pr_peaks, r2pr_peaks, pooledpr_peaks, chrom_sizes, as_file, blacklist=None):
    """Build the final conservative and optimal peak sets from IDR peak files.

    The conservative set is based on the true-replicate peaks; the optimal
    set is based on whichever of the true-replicate or pooled-pseudoreplicate
    lists has more peaks.  Both are blacklist-filtered when ``blacklist`` is
    given.  Rescue and self-consistency ratios are computed from the peak
    counts and turned into a pass/borderline/fail reproducibility call.

    Args:
        experiment: name used as the prefix of the final peak filenames.
        reps_peaks: peaks from the true replicates.
        r1pr_peaks, r2pr_peaks: peaks from the rep1 / rep2
            self-pseudoreplicates.
        pooledpr_peaks: peaks from the pooled pseudoreplicates.
        chrom_sizes: chromosome sizes file.
        as_file: autoSql file passed to ``common.bed2bb``.
        blacklist: optional blacklist file; None disables filtering.

    All file arguments are given as whatever ``dxpy.DXFile`` accepts.

    Returns:
        dict with the peak counts, the two ratios, the reproducibility call
        and links to the uploaded peak sets (plus bigBed files when
        ``common.bed2bb`` produced them).
    """
    # TODO for now just taking the peak files.  This applet should actually call IDR instead of
    # putting that in the workflow populator script

    # Initialize the data object inputs on the platform into
    # dxpy.DXDataObject instances.

    reps_peaks_file = dxpy.DXFile(reps_peaks)
    r1pr_peaks_file = dxpy.DXFile(r1pr_peaks)
    r2pr_peaks_file = dxpy.DXFile(r2pr_peaks)
    pooledpr_peaks_file = dxpy.DXFile(pooledpr_peaks)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)
    if blacklist is not None:
        blacklist_file = dxpy.DXFile(blacklist)
        blacklist_filename = 'blacklist_%s' % (blacklist_file.name)
        dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename)
        blacklist_filename = common.uncompress(blacklist_filename)

    # Download the file inputs to the local file system.

    # Need to prepend something to ensure the local filenames will be unique
    reps_peaks_filename = 'true_%s' % (reps_peaks_file.name)
    r1pr_peaks_filename = 'r1pr_%s' % (r1pr_peaks_file.name)
    r2pr_peaks_filename = 'r2pr_%s' % (r2pr_peaks_file.name)
    pooledpr_peaks_filename = 'pooledpr_%s' % (pooledpr_peaks_file.name)
    chrom_sizes_filename = chrom_sizes_file.name
    as_file_filename = as_file_file.name

    dxpy.download_dxfile(reps_peaks_file.get_id(), reps_peaks_filename)
    dxpy.download_dxfile(r1pr_peaks_file.get_id(), r1pr_peaks_filename)
    dxpy.download_dxfile(r2pr_peaks_file.get_id(), r2pr_peaks_filename)
    dxpy.download_dxfile(pooledpr_peaks_file.get_id(), pooledpr_peaks_filename)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print(subprocess.check_output('ls -l', shell=True))

    reps_peaks_filename = common.uncompress(reps_peaks_filename)
    r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename)
    r2pr_peaks_filename = common.uncompress(r2pr_peaks_filename)
    pooledpr_peaks_filename = common.uncompress(pooledpr_peaks_filename)

    Nt = common.count_lines(reps_peaks_filename)
    print("%d peaks from true replicates" % (Nt))
    N1 = common.count_lines(r1pr_peaks_filename)
    print("%d peaks from rep1 self-pseudoreplicates" % (N1))
    N2 = common.count_lines(r2pr_peaks_filename)
    print("%d peaks from rep2 self-pseudoreplicates" % (N2))
    Np = common.count_lines(pooledpr_peaks_filename)
    print("%d peaks from pooled pseudoreplicates" % (Np))

    # NOTE(review): without a blacklist the "final" sets below are just
    # aliases of the input files, so both may name the same file when
    # Nt >= Np -- confirm common.compress tolerates being called on it twice.
    conservative_set_filename = '%s_final_conservative.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(reps_peaks_filename, conservative_set_filename, blacklist_filename)
    else:
        conservative_set_filename = reps_peaks_filename
    Ncb = common.count_lines(conservative_set_filename)
    print("%d peaks blacklisted from the conservative set" % (Nt-Ncb))

    # The optimal set starts from whichever list has more peaks.
    if Nt >= Np:
        peaks_to_filter_filename = reps_peaks_filename
        No = Nt
    else:
        peaks_to_filter_filename = pooledpr_peaks_filename
        No = Np

    optimal_set_filename = '%s_final_optimal.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(peaks_to_filter_filename, optimal_set_filename, blacklist_filename)
    else:
        optimal_set_filename = peaks_to_filter_filename
    Nob = common.count_lines(optimal_set_filename)
    print("%d peaks blacklisted from the optimal set" % (No-Nob))

    # NOTE(review): these raise ZeroDivisionError when the smaller count of a
    # pair is 0 -- confirm that is the intended failure mode.
    rescue_ratio = float(max(Np, Nt)) / float(min(Np, Nt))
    self_consistency_ratio = float(max(N1, N2)) / float(min(N1, N2))

    if rescue_ratio > 2 and self_consistency_ratio > 2:
        reproducibility = 'fail'
    elif rescue_ratio > 2 or self_consistency_ratio > 2:
        reproducibility = 'borderline'
    else:
        reproducibility = 'pass'

    output = {}

    # bedtobigbed often fails, so skip creating the bb if it does
    conservative_set_bb_filename = common.bed2bb(conservative_set_filename, chrom_sizes_filename, as_file_filename)
    optimal_set_bb_filename = common.bed2bb(optimal_set_filename, chrom_sizes_filename, as_file_filename)
    if conservative_set_bb_filename:
        conservative_set_bb_output = dxpy.upload_local_file(conservative_set_bb_filename)
        output.update({"conservative_set_bb": dxpy.dxlink(conservative_set_bb_output)})
    if optimal_set_bb_filename:
        optimal_set_bb_output = dxpy.upload_local_file(optimal_set_bb_filename)
        output.update({"optimal_set_bb": dxpy.dxlink(optimal_set_bb_output)})

    output.update({
        "Nt": Nt,
        "N1": N1,
        "N2": N2,
        "Np": Np,
        "conservative_set": dxpy.dxlink(dxpy.upload_local_file(common.compress(conservative_set_filename))),
        "optimal_set": dxpy.dxlink(dxpy.upload_local_file(common.compress(optimal_set_filename))),
        "rescue_ratio": rescue_ratio,
        "self_consistency_ratio": self_consistency_ratio,
        "reproducibility_test": reproducibility,
        "No": Nob,
        "Nc": Ncb
    })

    logging.info("Exiting with output: %s", output)
    return output
Exemplo n.º 11
0
def main(experiment, reps_peaks, r1pr_peaks, r2pr_peaks, pooledpr_peaks,
         chrom_sizes, as_file, blacklist=None,
         rep1_signal=None, rep2_signal=None, pooled_signal=None):
    """Build the final conservative and optimal peak sets from IDR peak files.

    The conservative set is always based on the true-replicate peaks; the
    optimal set is based on whichever of the true-replicate or
    pooled-pseudoreplicate lists has more peaks.  Each set is
    blacklist-filtered when ``blacklist`` is given and otherwise copied to
    its final filename.  Rescue and self-consistency ratios are computed
    from the peak counts and turned into a pass/borderline/fail call.

    Args:
        experiment: name used as the prefix of the final peak filenames.
        reps_peaks: peaks from the true replicates.
        r1pr_peaks, r2pr_peaks: peaks from the rep1 / rep2
            self-pseudoreplicates.
        pooledpr_peaks: peaks from the pooled pseudoreplicates.
        chrom_sizes: chromosome sizes file.
        as_file: autoSql file passed to ``common.bed2bb``.
        blacklist: optional blacklist file; None disables filtering.
        rep1_signal, rep2_signal, pooled_signal: optional values passed
            through unchanged to the output when truthy.

    File arguments are given as whatever ``dxpy.DXFile`` accepts.

    Returns:
        dict with the peak counts, the two ratios, the reproducibility call,
        links to the uploaded peak sets, and the optional bigBed,
        pre-blacklist and signal entries.
    """
    # TODO for now just taking the peak files.  This applet should actually
    # call IDR instead of putting that in the workflow populator script

    reps_peaks_file = dxpy.DXFile(reps_peaks)
    r1pr_peaks_file = dxpy.DXFile(r1pr_peaks)
    r2pr_peaks_file = dxpy.DXFile(r2pr_peaks)
    pooledpr_peaks_file = dxpy.DXFile(pooledpr_peaks)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)
    if blacklist is not None:
        blacklist_file = dxpy.DXFile(blacklist)
        blacklist_filename = 'blacklist_%s' % (blacklist_file.name)
        dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename)
        blacklist_filename = common.uncompress(blacklist_filename)

    # Need to prepend something to ensure the local filenames will be unique
    reps_peaks_filename = 'true_%s' % (reps_peaks_file.name)
    r1pr_peaks_filename = 'r1pr_%s' % (r1pr_peaks_file.name)
    r2pr_peaks_filename = 'r2pr_%s' % (r2pr_peaks_file.name)
    pooledpr_peaks_filename = 'pooledpr_%s' % (pooledpr_peaks_file.name)
    chrom_sizes_filename = chrom_sizes_file.name
    as_file_filename = as_file_file.name

    dxpy.download_dxfile(reps_peaks_file.get_id(), reps_peaks_filename)
    dxpy.download_dxfile(r1pr_peaks_file.get_id(), r1pr_peaks_filename)
    dxpy.download_dxfile(r2pr_peaks_file.get_id(), r2pr_peaks_filename)
    dxpy.download_dxfile(pooledpr_peaks_file.get_id(), pooledpr_peaks_filename)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    # Log the directory listing; previously the captured output was dropped.
    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    reps_peaks_filename = common.uncompress(reps_peaks_filename)
    r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename)
    r2pr_peaks_filename = common.uncompress(r2pr_peaks_filename)
    pooledpr_peaks_filename = common.uncompress(pooledpr_peaks_filename)

    Nt = common.count_lines(reps_peaks_filename)
    logger.info("%d peaks from true replicates (Nt)" % (Nt))
    N1 = common.count_lines(r1pr_peaks_filename)
    logger.info("%d peaks from rep1 self-pseudoreplicates (N1)" % (N1))
    N2 = common.count_lines(r2pr_peaks_filename)
    logger.info("%d peaks from rep2 self-pseudoreplicates (N2)" % (N2))
    Np = common.count_lines(pooledpr_peaks_filename)
    logger.info("%d peaks from pooled pseudoreplicates (Np)" % (Np))

    # generate the conservative set, which is always based on the IDR peaks
    # from true replicates
    conservative_set_filename = \
        '%s_final_conservative.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(reps_peaks_filename, conservative_set_filename,
                         blacklist_filename)
        Ncb = common.count_lines(conservative_set_filename)
        logger.info(
            "%d peaks blacklisted from the conservative set" % (Nt-Ncb))
    else:
        # Argument list instead of a split command string, so filenames
        # containing whitespace or quotes are passed to cp intact.
        subprocess.check_output(
            ['cp', reps_peaks_filename, conservative_set_filename])
        Ncb = Nt
        logger.info("No blacklist filter applied to the conservative set")

    # generate the optimal set, which is based on the longest of IDR peaks
    # list from true reps or the IDR peaks from the pseudoreplicates of the
    # pool
    if Nt >= Np:
        peaks_to_filter_filename = reps_peaks_filename
        No = Nt
    else:
        peaks_to_filter_filename = pooledpr_peaks_filename
        No = Np
    optimal_set_filename = '%s_final_optimal.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(peaks_to_filter_filename, optimal_set_filename,
                         blacklist_filename)
        Nob = common.count_lines(optimal_set_filename)
        logger.info("%d peaks blacklisted from the optimal set" % (No-Nob))
    else:
        subprocess.check_output(
            ['cp', peaks_to_filter_filename, optimal_set_filename])
        Nob = No
        logger.info("No blacklist filter applied to the optimal set")

    # NOTE(review): these raise ZeroDivisionError when the smaller count of a
    # pair is 0 -- confirm that is the intended failure mode.
    rescue_ratio = float(max(Np, Nt)) / float(min(Np, Nt))
    self_consistency_ratio = float(max(N1, N2)) / float(min(N1, N2))

    if rescue_ratio > 2 and self_consistency_ratio > 2:
        reproducibility = 'fail'
    elif rescue_ratio > 2 or self_consistency_ratio > 2:
        reproducibility = 'borderline'
    else:
        reproducibility = 'pass'

    output = {}

    # These are optional outputs to see what's being removed by the blacklist
    if blacklist:
        output.update({
            "pre_bl_conservative_set":
                dxpy.dxlink(dxpy.upload_local_file(common.compress(
                    reps_peaks_filename))),
            "pre_bl_optimal_set":
                dxpy.dxlink(dxpy.upload_local_file(common.compress(
                    peaks_to_filter_filename)))}
        )

    # bedtobigbed often fails, so skip creating the bb if it does
    conservative_set_bb_filename = \
        common.bed2bb(conservative_set_filename, chrom_sizes_filename,
                      as_file_filename)
    optimal_set_bb_filename = \
        common.bed2bb(optimal_set_filename, chrom_sizes_filename,
                      as_file_filename)
    if conservative_set_bb_filename:
        conservative_set_bb_output = \
            dxpy.upload_local_file(conservative_set_bb_filename)
        output.update(
            {"conservative_set_bb": dxpy.dxlink(conservative_set_bb_output)})
    if optimal_set_bb_filename:
        optimal_set_bb_output = dxpy.upload_local_file(optimal_set_bb_filename)
        output.update(
            {"optimal_set_bb": dxpy.dxlink(optimal_set_bb_output)})

    output.update({
        "Nt": Nt,
        "N1": N1,
        "N2": N2,
        "Np": Np,
        "conservative_set": dxpy.dxlink(dxpy.upload_local_file(common.compress(conservative_set_filename))),
        "optimal_set": dxpy.dxlink(dxpy.upload_local_file(common.compress(optimal_set_filename))),
        "rescue_ratio": rescue_ratio,
        "self_consistency_ratio": self_consistency_ratio,
        "reproducibility_test": reproducibility,
        "No": Nob,
        "Nc": Ncb
    })

    # These are just passed through for convenience so that signals and tracks
    # are available in one place.  Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})
    if rep2_signal:
        output.update({"rep2_signal": rep2_signal})
    if pooled_signal:
        output.update({"pooled_signal": pooled_signal})

    # Use the same logger as the rest of this function.
    logger.info("Exiting with output: %s", output)
    return output
Exemplo n.º 12
0
def main(rep1_peaks, rep2_peaks, pooled_peaks, pooledpr1_peaks, pooledpr2_peaks,
         chrom_sizes, as_file, peak_type, prefix=None,
         rep1_signal=None, rep2_signal=None, pooled_signal=None):
    """Split the pooled peaks into a replicated set and a rejected set.

    A pooled peak is kept when it overlaps a rep1 peak and a rep2 peak, or a
    pooledpr1 peak and a pooledpr2 peak.  For every overlapping pair the awk
    filter keeps the pooled peak only if the last ``intersectBed -wo`` column
    (presumably the overlap length in bases -- confirm against the bedtools
    docs) is at least half the length of either peak of the pair.  Pooled
    peaks that do not intersect the kept set are written to the rejected set.
    Both sets are compressed with ``common.compress``, converted to bigBed
    with ``common.bed2bb`` and uploaded.

    Args:
        rep1_peaks, rep2_peaks, pooled_peaks, pooledpr1_peaks,
        pooledpr2_peaks, chrom_sizes, as_file: values accepted by
            ``dxpy.DXFile`` (presumably file links or IDs -- confirm with
            the applet's input spec).
        peak_type: "narrowPeak", "gappedPeak" or "broadPeak".
        prefix: basename for the output files; when falsy, it is derived
            from the pooled peaks file name.
        rep1_signal, rep2_signal, pooled_signal: optional values copied
            unchanged into the output when truthy.

    Returns:
        dict with dxlinks under "overlapping_peaks", "overlapping_peaks_bb",
        "rejected_peaks" and "rejected_peaks_bb", the counts "npeaks_in",
        "npeaks_out" and "npeaks_rejected", plus any signal pass-throughs.

    Raises:
        AssertionError: if ``peak_type`` is not one of the supported values.
            Raised before any platform or file-system work is done.
    """
    # The only difference between the peak types is how the extra columns
    # are handled: which columns hold the second peak's start/end and the
    # overlap, how many columns belong to the pooled peak itself (cut), and
    # the bed layout handed to common.bed2bb.
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        # Raised explicitly (not via an assert statement) so the check
        # survives "python -O"; AssertionError is kept so the failure type
        # is unchanged for existing callers.
        raise AssertionError(
            "%s is unrecognized.  peak_type should be narrowPeak, gappedPeak or broadPeak." % (peak_type))

    # Initialize data object inputs on the platform
    # into dxpy.DXDataObject instances
    rep1_peaks = dxpy.DXFile(rep1_peaks)
    rep2_peaks = dxpy.DXFile(rep2_peaks)
    pooled_peaks = dxpy.DXFile(pooled_peaks)
    pooledpr1_peaks = dxpy.DXFile(pooledpr1_peaks)
    pooledpr2_peaks = dxpy.DXFile(pooledpr2_peaks)
    chrom_sizes = dxpy.DXFile(chrom_sizes)
    as_file = dxpy.DXFile(as_file)

    # Input filenames - each gets its own prefix because input files could
    # have the same name, in which case a later download would overwrite an
    # earlier one.
    rep1_peaks_fn = 'rep1-%s' % (rep1_peaks.name)
    rep2_peaks_fn = 'rep2-%s' % (rep2_peaks.name)
    pooled_peaks_fn = 'pooled-%s' % (pooled_peaks.name)
    pooledpr1_peaks_fn = 'pooledpr1-%s' % (pooledpr1_peaks.name)
    pooledpr2_peaks_fn = 'pooledpr2-%s' % (pooledpr2_peaks.name)
    chrom_sizes_fn = 'chrom.sizes'
    as_file_fn = '%s.as' % (peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        # Strip off the peak and compression extensions.  The name is used
        # unchanged when it does not end in <peak_type>.<compression ext>.
        m = re.match(
            r'(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
            pooled_peaks.name)
        basename = m.group(1) if m else pooled_peaks.name

    overlapping_peaks_fn = '%s.replicated.%s' % (basename, peak_type)
    rejected_peaks_fn = '%s.rejected.%s' % (basename, peak_type)

    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
    overlap_pr_fn = 'replicated_pr.%s' % (peak_type)

    # Download file inputs to the local file system with local filenames
    dxpy.download_dxfile(rep1_peaks.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(pooledpr1_peaks.get_id(), pooledpr1_peaks_fn)
    dxpy.download_dxfile(pooledpr2_peaks.get_id(), pooledpr2_peaks_fn)
    dxpy.download_dxfile(chrom_sizes.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file.get_id(), as_file_fn)

    # Find pooled peaks that overlap Rep1 and Rep2 where overlap is defined
    # as the fractional overlap wrt any one of the overlapping peak pairs
    # >= 0.5.  After each filter, cut trims the row back to the pooled
    # peak's own columns and sort -u removes duplicates.
    common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, rep1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (rep2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_tr_fn)
    print("%d peaks overlap with both true replicates" % (common.count_lines(overlap_tr_fn)))

    # Same selection against PseudoRep1 and PseudoRep2 of the pool.
    common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, pooledpr1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (pooledpr2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_pr_fn)
    print("%d peaks overlap with both pooled pseudoreplicates" % (common.count_lines(overlap_pr_fn)))

    # Combine peak lists
    common.run_pipe([
        'cat %s %s' % (overlap_tr_fn, overlap_pr_fn),
        'sort -u'
        ], overlapping_peaks_fn)
    print("%d peaks overlap with true replicates or with pooled pseudorepliates" % (common.count_lines(overlapping_peaks_fn)))

    # Rejected peaks: pooled peaks with no overlap with the combined set (-v)
    common.run_pipe([
        'intersectBed -wa -v -a %s -b %s' % (pooled_peaks_fn, overlapping_peaks_fn)
        ], rejected_peaks_fn)
    print("%d peaks were rejected" % (common.count_lines(rejected_peaks_fn)))

    npeaks_in = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected = common.count_lines(rejected_peaks_fn)

    # Make bigBed files for visualization; bed2bb returns the name to upload.
    overlapping_peaks_bb_fn = common.bed2bb(
        overlapping_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(
        rejected_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)

    # Upload file outputs from the local file system.
    overlapping_peaks = dxpy.upload_local_file(common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb = dxpy.upload_local_file(rejected_peaks_bb_fn)

    output = {
        "overlapping_peaks": dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb": dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks": dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb": dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in": npeaks_in,
        "npeaks_out": npeaks_out,
        'npeaks_rejected': npeaks_rejected
    }

    # These are just passed through for convenience so that signals and tracks
    # are available in one place.  Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})
    if rep2_signal:
        output.update({"rep2_signal": rep2_signal})
    if pooled_signal:
        output.update({"pooled_signal": pooled_signal})

    return output
Exemplo n.º 13
0
def internal_pseudoreplicate_IDR(experiment,
                                 r1pr_peaks,
                                 rep1_ta,
                                 rep1_xcor,
                                 paired_end,
                                 chrom_sizes,
                                 as_file,
                                 blacklist,
                                 rep1_signal,
                                 fragment_length=None):
    """Build the stable peak set for an unreplicated experiment.

    The rep1 self-pseudoreplicate peaks are optionally blacklist-filtered
    into ``<experiment>_stable.narrowPeak``, FRiP is computed for that set
    with ``common.frip``, and the set is uploaded (compressed, plus a bigBed
    when ``common.bed2bb`` returns a filename).

    Args:
        experiment: string used as the prefix of the stable set filename.
        r1pr_peaks, rep1_ta, chrom_sizes, as_file: values accepted by
            ``dxpy.DXFile`` (presumably file links -- confirm with callers).
        rep1_xcor: cross-correlation scores file; only opened and downloaded
            when ``fragment_length`` is None.
        paired_end: not used by this function.
        blacklist: optional file of regions to filter out, or None.
        rep1_signal: optional value copied into the output when truthy.
        fragment_length: user override; when None the fragment length is
            taken from ``common.xcor_fraglen`` on the rep1 xcor file.

    Returns:
        dict with the FRiP numbers, the fragment length used, "N1"/"Ns"
        peak counts, a dxlink to "stable_set", and optionally
        "pre_bl_stable_set", "stable_set_bb" and "rep1_signal".
    """
    r1pr_peaks_file = dxpy.DXFile(r1pr_peaks)
    rep1_ta_file = dxpy.DXFile(rep1_ta)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)

    has_blacklist = blacklist is not None
    if has_blacklist:
        blacklist_file = dxpy.DXFile(blacklist)
        blacklist_filename = 'blacklist_%s' % (blacklist_file.name)
        dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename)
        blacklist_filename = common.uncompress(blacklist_filename)

    # Need to prepend something to ensure the local filenames will be unique
    r1pr_peaks_filename = 'r1pr_%s' % (r1pr_peaks_file.name)
    rep1_ta_filename = 'r1ta_%s' % (rep1_ta_file.name)
    chrom_sizes_filename = chrom_sizes_file.name
    as_file_filename = as_file_file.name
    dxpy.download_dxfile(r1pr_peaks_file.get_id(), r1pr_peaks_filename)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_filename)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    # A user-supplied fragment length overrides the cross-correlation
    # estimate; in that case no xcor file is fetched and None is handed to
    # common.frip in its place.
    fragment_length_given_by_user = fragment_length is not None
    if fragment_length_given_by_user:
        rep1_xcor_filename = None
        fragment_length_used_rep1 = fragment_length
    else:
        rep1_xcor_file = dxpy.DXFile(rep1_xcor)
        rep1_xcor_filename = 'r1xc_%s' % (rep1_xcor_file.name)
        dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_filename)
        fragment_length_used_rep1 = common.xcor_fraglen(rep1_xcor_filename)

    # Directory listing for debugging; logged so it is not silently lost.
    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename)
    N1 = common.count_lines(r1pr_peaks_filename)
    logger.info("%d peaks from rep1 self-pseudoreplicates (N1)", N1)

    stable_set_filename = "%s_stable.narrowPeak" % (experiment)
    if has_blacklist:
        blacklist_filter(r1pr_peaks_filename, stable_set_filename,
                         blacklist_filename)
        Nsb = common.count_lines(stable_set_filename)
        logger.info("%d peaks blacklisted from the stable set", N1 - Nsb)
    else:
        # argv list instead of a formatted string through shlex.split, so
        # filenames containing whitespace or quotes are copied correctly.
        subprocess.check_output(
            ['cp', r1pr_peaks_filename, stable_set_filename])
        Nsb = N1
        logger.info("No blacklist filter applied to the stable set")

    # calculate FRiP
    n_reads, n_reads_in_peaks, frip_score = common.frip(
        rep1_ta_filename, rep1_xcor_filename, stable_set_filename,
        chrom_sizes_filename, fragment_length_used_rep1)

    output = {
        "rep1_frip_nreads": n_reads,
        "rep1_frip_nreads_in_peaks": n_reads_in_peaks,
        "F1": frip_score,
        "fragment_length_used_rep1": fragment_length_used_rep1,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    # Optional output to see what's being removed by the blacklist.  Uses
    # the same condition as the filtering above.
    if has_blacklist:
        output.update({
            "pre_bl_stable_set":
            dxpy.dxlink(
                dxpy.upload_local_file(common.compress(r1pr_peaks_filename)))
        })

    # bedtobigbed often fails, so skip creating the bb if it does
    stable_set_bb_filename = common.bed2bb(
        stable_set_filename, chrom_sizes_filename, as_file_filename)
    if stable_set_bb_filename:
        stable_set_bb_output = dxpy.upload_local_file(stable_set_bb_filename)
        output.update({"stable_set_bb": dxpy.dxlink(stable_set_bb_output)})

    output.update({
        "N1": N1,
        "stable_set": dxpy.dxlink(
            dxpy.upload_local_file(common.compress(stable_set_filename))),
        "Ns": Nsb
    })

    # These are just passed through for convenience so that signals and tracks
    # are available in one place.  Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})

    return output
Exemplo n.º 14
0
def replicated_IDR(experiment,
                   reps_peaks,
                   r1pr_peaks,
                   r2pr_peaks,
                   pooledpr_peaks,
                   rep1_ta,
                   rep1_xcor,
                   rep2_ta,
                   rep2_xcor,
                   paired_end,
                   chrom_sizes,
                   as_file,
                   blacklist,
                   rep1_signal,
                   rep2_signal,
                   pooled_signal,
                   fragment_length=None):
    """Build the conservative and optimal peak sets for a replicated experiment.

    The conservative set comes from the true-replicate peaks.  The optimal
    set comes from whichever of the true-replicate or pooled-pseudoreplicate
    peak lists is longer (true replicates win ties).  Each set is blacklist
    filtered when a blacklist is given, otherwise copied as is.  The function
    also pools the two tagAligns through the "pool" applet, computes FRiP for
    four peak sets with ``common.frip``, and derives the rescue ratio, the
    self-consistency ratio and the resulting reproducibility verdict.

    Args:
        experiment: string used as the prefix of the output peak filenames.
        reps_peaks, r1pr_peaks, r2pr_peaks, pooledpr_peaks: peak files from
            true replicates, rep1/rep2 self-pseudoreplicates and pooled
            pseudoreplicates.
        rep1_ta, rep2_ta: tagAlign inputs; also passed to the pool applet.
        rep1_xcor, rep2_xcor: cross-correlation scores files.
        paired_end: forwarded to ``xcor_only`` for the pooled reads.
        chrom_sizes, as_file: inputs for FRiP and bigBed conversion.
        blacklist: optional file of regions to filter out, or None.
        rep1_signal, rep2_signal, pooled_signal: optional values copied
            into the output when truthy.
        fragment_length: user override; when None, fragment lengths are
            read from the xcor files with ``common.xcor_fraglen``.

    Returns:
        dict of FRiP numbers, fragment lengths used, peak counts ("Nt",
        "N1", "N2", "Np", "No", "Nc"), the two ratios,
        "reproducibility_test", dxlinks to "conservative_set" and
        "optimal_set", and the optional bigBed, pre-blacklist and signal
        entries.

    Raises:
        ZeroDivisionError: if Np, Nt, N1 or N2 is zero (see the note at the
            ratio computation).
    """
    # TODO for now just taking the peak files.  This applet should actually
    # call IDR instead of putting that in the workflow populator script

    reps_peaks_file = dxpy.DXFile(reps_peaks)
    r1pr_peaks_file = dxpy.DXFile(r1pr_peaks)
    r2pr_peaks_file = dxpy.DXFile(r2pr_peaks)
    pooledpr_peaks_file = dxpy.DXFile(pooledpr_peaks)
    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep2_ta_file = dxpy.DXFile(rep2_ta)
    rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    rep2_xcor_file = dxpy.DXFile(rep2_xcor)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)

    has_blacklist = blacklist is not None
    if has_blacklist:
        blacklist_file = dxpy.DXFile(blacklist)
        blacklist_filename = 'blacklist_%s' % (blacklist_file.name)
        dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename)
        blacklist_filename = common.uncompress(blacklist_filename)

    # Need to prepend something to ensure the local filenames will be unique
    reps_peaks_filename = 'true_%s' % (reps_peaks_file.name)
    r1pr_peaks_filename = 'r1pr_%s' % (r1pr_peaks_file.name)
    r2pr_peaks_filename = 'r2pr_%s' % (r2pr_peaks_file.name)
    pooledpr_peaks_filename = 'pooledpr_%s' % (pooledpr_peaks_file.name)
    rep1_ta_filename = 'r1ta_%s' % (rep1_ta_file.name)
    rep2_ta_filename = 'r2ta_%s' % (rep2_ta_file.name)
    rep1_xcor_filename = 'r1cc_%s' % (rep1_xcor_file.name)
    rep2_xcor_filename = 'r2cc_%s' % (rep2_xcor_file.name)
    chrom_sizes_filename = chrom_sizes_file.name
    as_file_filename = as_file_file.name

    dxpy.download_dxfile(reps_peaks_file.get_id(), reps_peaks_filename)
    dxpy.download_dxfile(r1pr_peaks_file.get_id(), r1pr_peaks_filename)
    dxpy.download_dxfile(r2pr_peaks_file.get_id(), r2pr_peaks_filename)
    dxpy.download_dxfile(pooledpr_peaks_file.get_id(), pooledpr_peaks_filename)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_filename)
    dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_filename)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_filename)
    dxpy.download_dxfile(rep2_xcor_file.get_id(), rep2_xcor_filename)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    reps_peaks_filename = common.uncompress(reps_peaks_filename)
    r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename)
    r2pr_peaks_filename = common.uncompress(r2pr_peaks_filename)
    pooledpr_peaks_filename = common.uncompress(pooledpr_peaks_filename)

    # Pool the two replicates' reads with the project's "pool" applet.
    pool_applet = dxpy.find_one_data_object(classname='applet',
                                            name='pool',
                                            project=dxpy.PROJECT_CONTEXT_ID,
                                            zero_ok=False,
                                            more_ok=False,
                                            return_handler=True)
    pool_replicates_subjob = pool_applet.run(
        {"inputs": [rep1_ta, rep2_ta],
         "prefix": 'pooled_reps'},
        name='Pool replicates')
    # TODO (from the original author): this wait might be avoidable to save
    # time.
    pool_replicates_subjob.wait_on_done()

    # If fragment_length is not given, calculate the fragment lengths using
    # cross-correlation, otherwise use the override value for all three.
    # pool_xcor_filename is set to None in the override case to accommodate
    # the common.frip calls below.
    fragment_length_given_by_user = fragment_length is not None
    if fragment_length_given_by_user:
        pool_xcor_filename = None
        fragment_length_used_rep1 = fragment_length
        fragment_length_used_rep2 = fragment_length
        fragment_length_used_pool = fragment_length
    else:
        pooled_replicates_xcor_subjob = xcor_only(
            pool_replicates_subjob.get_output_ref("pooled"),
            paired_end,
            spp_version=None,
            name='Pool cross-correlation')
        pooled_replicates_xcor_subjob.wait_on_done()
        pool_xcor_link = pooled_replicates_xcor_subjob.describe(
        )['output'].get("CC_scores_file")
        pool_xcor_file = dxpy.get_handler(pool_xcor_link)
        pool_xcor_filename = 'poolcc_%s' % (pool_xcor_file.name)
        dxpy.download_dxfile(pool_xcor_file.get_id(), pool_xcor_filename)
        fragment_length_used_rep1 = common.xcor_fraglen(rep1_xcor_filename)
        fragment_length_used_rep2 = common.xcor_fraglen(rep2_xcor_filename)
        fragment_length_used_pool = common.xcor_fraglen(pool_xcor_filename)

    pool_ta_link = pool_replicates_subjob.describe()['output'].get("pooled")
    pool_ta_file = dxpy.get_handler(pool_ta_link)
    pool_ta_filename = 'poolta_%s' % (pool_ta_file.name)
    dxpy.download_dxfile(pool_ta_file.get_id(), pool_ta_filename)

    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    Nt = common.count_lines(reps_peaks_filename)
    logger.info("%d peaks from true replicates (Nt)", Nt)
    N1 = common.count_lines(r1pr_peaks_filename)
    logger.info("%d peaks from rep1 self-pseudoreplicates (N1)", N1)
    N2 = common.count_lines(r2pr_peaks_filename)
    logger.info("%d peaks from rep2 self-pseudoreplicates (N2)", N2)
    Np = common.count_lines(pooledpr_peaks_filename)
    logger.info("%d peaks from pooled pseudoreplicates (Np)", Np)

    def _blacklist_or_copy(source_filename, target_filename, n_source,
                           set_label):
        """Write target from source, filtered if a blacklist was given.

        Returns the number of peaks in the target file.
        """
        if has_blacklist:
            blacklist_filter(source_filename, target_filename,
                             blacklist_filename)
            n_target = common.count_lines(target_filename)
            logger.info("%d peaks blacklisted from the %s set",
                        n_source - n_target, set_label)
            return n_target
        # argv list instead of a formatted string through shlex.split, so
        # filenames containing whitespace or quotes are copied correctly.
        subprocess.check_output(['cp', source_filename, target_filename])
        logger.info("No blacklist filter applied to the %s set", set_label)
        return n_source

    # generate the conservative set, which is always based on the IDR peaks
    # from true replicates
    conservative_set_filename = \
        '%s_final_conservative.narrowPeak' % (experiment)
    Ncb = _blacklist_or_copy(reps_peaks_filename, conservative_set_filename,
                             Nt, "conservative")

    # generate the optimal set, which is based on the longest of IDR peaks
    # list from true reps or the IDR peaks from the pseudoreplicates of the
    # pool
    if Nt >= Np:
        peaks_to_filter_filename = reps_peaks_filename
        No = Nt
    else:
        peaks_to_filter_filename = pooledpr_peaks_filename
        No = Np
    optimal_set_filename = '%s_final_optimal.narrowPeak' % (experiment)
    Nob = _blacklist_or_copy(peaks_to_filter_filename, optimal_set_filename,
                             No, "optimal")

    # NOTE(review): both ratios raise ZeroDivisionError when the smaller
    # peak count is 0 (an empty peak list).  Left as is because the right
    # replacement value is a pipeline decision.
    rescue_ratio = float(max(Np, Nt)) / float(min(Np, Nt))
    self_consistency_ratio = float(max(N1, N2)) / float(min(N1, N2))

    # Both ratios above 2 fails, exactly one above 2 is borderline.
    rescue_high = rescue_ratio > 2
    self_consistency_high = self_consistency_ratio > 2
    if rescue_high and self_consistency_high:
        reproducibility = 'fail'
    elif rescue_high or self_consistency_high:
        reproducibility = 'borderline'
    else:
        reproducibility = 'pass'

    # FRiP (fraction reads in peaks)
    # fragment_length is passed through as given (None when not supplied);
    # presumably common.frip then falls back to the xcor file -- confirm.
    # rep1 stable peaks comparing internal pseudoreplicates
    rep1_n_reads, rep1_n_reads_in_peaks, rep1_frip_score = common.frip(
        rep1_ta_filename, rep1_xcor_filename, r1pr_peaks_filename,
        chrom_sizes_filename, fragment_length)
    # rep2 stable peaks comparing internal pseudoreplicates
    rep2_n_reads, rep2_n_reads_in_peaks, rep2_frip_score = common.frip(
        rep2_ta_filename, rep2_xcor_filename, r2pr_peaks_filename,
        chrom_sizes_filename, fragment_length)
    # comparing true reps
    true_n_reads, true_n_reads_in_peaks, true_frip_score = common.frip(
        pool_ta_filename, pool_xcor_filename, reps_peaks_filename,
        chrom_sizes_filename, fragment_length)
    # comparing pooled pseudoreplicates
    pr_n_reads, pr_n_reads_in_peaks, pr_frip_score = common.frip(
        pool_ta_filename, pool_xcor_filename, pooledpr_peaks_filename,
        chrom_sizes_filename, fragment_length)

    output = {
        "rep1_frip_nreads": rep1_n_reads,
        "rep1_frip_nreads_in_peaks": rep1_n_reads_in_peaks,
        "F1": rep1_frip_score,
        "rep2_frip_nreads": rep2_n_reads,
        "rep2_frip_nreads_in_peaks": rep2_n_reads_in_peaks,
        "F2": rep2_frip_score,
        "true_frip_nreads": true_n_reads,
        "true_frip_nreads_in_peaks": true_n_reads_in_peaks,
        "Ft": true_frip_score,
        "pr_frip_nreads": pr_n_reads,
        "pr_frip_nreads_in_peaks": pr_n_reads_in_peaks,
        "Fp": pr_frip_score,
        "fragment_length_used_rep1": fragment_length_used_rep1,
        "fragment_length_used_rep2": fragment_length_used_rep2,
        "fragment_length_used_pool": fragment_length_used_pool,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    # Optional outputs to see what's being removed by the blacklist.  Uses
    # the same condition as the filtering above.
    if has_blacklist:
        output.update({
            "pre_bl_conservative_set":
            dxpy.dxlink(
                dxpy.upload_local_file(common.compress(reps_peaks_filename))),
            "pre_bl_optimal_set":
            dxpy.dxlink(
                dxpy.upload_local_file(
                    common.compress(peaks_to_filter_filename)))
        })

    # bedtobigbed often fails, so skip creating the bb if it does
    conservative_set_bb_filename = common.bed2bb(
        conservative_set_filename, chrom_sizes_filename, as_file_filename)
    optimal_set_bb_filename = common.bed2bb(
        optimal_set_filename, chrom_sizes_filename, as_file_filename)
    if conservative_set_bb_filename:
        conservative_set_bb_output = \
            dxpy.upload_local_file(conservative_set_bb_filename)
        output.update(
            {"conservative_set_bb": dxpy.dxlink(conservative_set_bb_output)})
    if optimal_set_bb_filename:
        optimal_set_bb_output = dxpy.upload_local_file(optimal_set_bb_filename)
        output.update({"optimal_set_bb": dxpy.dxlink(optimal_set_bb_output)})

    output.update({
        "Nt": Nt,
        "N1": N1,
        "N2": N2,
        "Np": Np,
        "conservative_set": dxpy.dxlink(
            dxpy.upload_local_file(
                common.compress(conservative_set_filename))),
        "optimal_set": dxpy.dxlink(
            dxpy.upload_local_file(common.compress(optimal_set_filename))),
        "rescue_ratio": rescue_ratio,
        "self_consistency_ratio": self_consistency_ratio,
        "reproducibility_test": reproducibility,
        "No": Nob,
        "Nc": Ncb
    })

    # These are just passed through for convenience so that signals and tracks
    # are available in one place.  Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})
    if rep2_signal:
        output.update({"rep2_signal": rep2_signal})
    if pooled_signal:
        output.update({"pooled_signal": pooled_signal})

    return output
Exemplo n.º 15
0
  print >> sys.stderr, 'sorting term doc list for dir:' + dir
  term_doc_list = list(term_doc_list)
  # Sort the termID-docID pairs by termID
  term_doc_list.sort(key=lambda tuple:tuple[0])
  combined_term_doc_list = {}
  for termID,docID in term_doc_list:
      combined_term_doc_list.setdefault(termID,[]).append(docID)
  print >> sys.stderr, 'print posting list to disc for dir:' + dir
  terms = combined_term_doc_list.keys()
  # Since sorted() takes much longer  than .sort()
  terms.sort()
  for termID in terms:
    # write the posting lists to block_pl for this current block
    combined_term_doc_list[termID].sort()
    numPostings  = len(combined_term_doc_list[termID])
    common.writePostingsList( block_dict, block_pl, termID, common.compress(combined_term_doc_list[termID]), numPostings)
  block_pl.close()
  block_dict.close()

 


print >> sys.stderr, '######\nposting list construction finished!\n##########'

print >> sys.stderr, '\nMerging postings...'
while True:
  if len(block_q) <= 1:
    break
  b1 = block_q.popleft()
  b2 = block_q.popleft()
  print >> sys.stderr, 'merging %s and %s' % (b1, b2)
Exemplo n.º 16
0
def internal_pseudoreplicate_overlap(rep1_peaks,
                                     rep2_peaks,
                                     pooled_peaks,
                                     rep1_ta,
                                     rep1_xcor,
                                     paired_end,
                                     chrom_sizes,
                                     as_file,
                                     peak_type,
                                     prefix,
                                     fragment_length=None):
    """Select pooled peaks supported by both input peak sets and compute FRiP.

    A pooled peak is kept when it overlaps a peak in ``rep1_peaks`` and a
    peak in ``rep2_peaks`` (per the original comments this is the
    single-replicate analysis, so these are the pseudoreplicates of one
    replicate).  For every overlapping pair the awk filter keeps the pooled
    peak only if the last ``intersectBed -wo`` column (presumably the
    overlap length in bases -- confirm against the bedtools docs) is at
    least half the length of either peak of the pair.  Pooled peaks that do
    not intersect the kept set are written to the rejected set.  FRiP is
    computed on the kept set with ``common.frip``.

    Args:
        rep1_peaks, rep2_peaks, pooled_peaks, rep1_ta, rep1_xcor,
        chrom_sizes, as_file: values accepted by ``dxpy.DXFile``
            (presumably file links -- confirm with callers).
        paired_end: not used by this function.
        peak_type: "narrowPeak", "gappedPeak" or "broadPeak".
        prefix: basename for the output files; when falsy, it is derived
            from the pooled peaks file name.
        fragment_length: user override; when None the fragment length is
            taken from ``common.xcor_fraglen`` on the rep1 xcor file.

    Returns:
        dict with dxlinks to the overlapping/rejected peak files and their
        bigBeds, the peak counts, the FRiP numbers, the fragment length
        used and whether the user supplied it.

    Raises:
        AssertionError: if ``peak_type`` is not one of the supported values.
            Raised before any platform or file-system work is done.
    """
    # The only difference between the peak_types is how the extra columns
    # are handled: which columns hold the second peak's start/end and the
    # overlap, how many columns belong to the pooled peak itself (cut), and
    # the bed layout handed to common.bed2bb.
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        # Raised explicitly (not via an assert statement) so the check
        # survives "python -O"; AssertionError is kept so the failure type
        # is unchanged for existing callers.
        raise AssertionError(
            "%s is unrecognized.  peak_type should be narrowPeak, gappedPeak or broadPeak." % (
                peak_type))

    rep1_peaks_file = dxpy.DXFile(rep1_peaks)
    rep2_peaks_file = dxpy.DXFile(rep2_peaks)
    pooled_peaks_file = dxpy.DXFile(pooled_peaks)
    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)

    # Input filenames - each gets its own prefix because input files could
    # have the same name, in which case a later download would overwrite an
    # earlier one.
    rep1_peaks_fn = 'rep1-%s' % (rep1_peaks_file.name)
    rep2_peaks_fn = 'rep2-%s' % (rep2_peaks_file.name)
    pooled_peaks_fn = 'pooled-%s' % (pooled_peaks_file.name)
    rep1_ta_fn = 'r1ta_%s' % (rep1_ta_file.name)
    rep1_xcor_fn = 'r1xc_%s' % (rep1_xcor_file.name)
    chrom_sizes_fn = 'chrom.sizes'
    as_file_fn = '%s.as' % (peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        # Strip off the peak and compression extensions.  The name comes
        # from the DXFile handler: the raw pooled_peaks argument has no
        # .name attribute.  The name is used unchanged when it does not end
        # in <peak_type>.<compression ext>.
        pooled_name = pooled_peaks_file.name
        m = re.match(r'(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
                     pooled_name)
        basename = m.group(1) if m else pooled_name

    overlapping_peaks_fn = '%s.replicated.%s' % (basename, peak_type)
    rejected_peaks_fn = '%s.rejected.%s' % (basename, peak_type)

    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)

    # Download file inputs to the local file system with local filenames
    dxpy.download_dxfile(rep1_peaks_file.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks_file.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks_file.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_fn)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_fn)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_fn)

    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    # Find pooled peaks that overlap Rep1 and Rep2 where overlap is defined
    # as the fractional overlap wrt any one of the overlapping peak pairs
    # >= 0.5.  After each filter, cut trims the row back to the pooled
    # peak's own columns and sort -u removes duplicates.
    common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, rep1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (rep2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
    ], overlap_tr_fn)
    print("%d peaks overlap with both true replicates" %
          (common.count_lines(overlap_tr_fn)))

    # this is a simplicate analysis
    # overlapping peaks are just based on pseudoreps of the one pool
    common.run_pipe(['cat %s' % (overlap_tr_fn), 'sort -u'],
                    overlapping_peaks_fn)
    print("%d peaks overlap" % (common.count_lines(overlapping_peaks_fn)))

    # rejected peaks: pooled peaks with no overlap with the kept set (-v)
    common.run_pipe([
        'intersectBed -wa -v -a %s -b %s' %
        (pooled_peaks_fn, overlapping_peaks_fn)
    ], rejected_peaks_fn)
    print("%d peaks were rejected" % (common.count_lines(rejected_peaks_fn)))

    # calculate FRiP (Fraction of Reads in Peaks)

    # Use the user-defined fragment_length if given, otherwise the estimate
    # from the cross-correlation scores file.
    fragment_length_given_by_user = fragment_length is not None
    if fragment_length_given_by_user:
        fraglen = fragment_length
    else:
        fraglen = common.xcor_fraglen(rep1_xcor_fn)

    # FRiP
    reads_in_peaks_fn = 'reads_in_%s.ta' % (peak_type)
    n_reads, n_reads_in_peaks, frip_score = common.frip(
        rep1_ta_fn,
        rep1_xcor_fn,
        overlapping_peaks_fn,
        chrom_sizes_fn,
        fraglen,
        reads_in_peaks_fn=reads_in_peaks_fn)

    # count peaks
    npeaks_in = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected = common.count_lines(rejected_peaks_fn)

    # make bigBed files for visualization; bed2bb returns the name to upload
    overlapping_peaks_bb_fn = common.bed2bb(overlapping_peaks_fn,
                                            chrom_sizes_fn,
                                            as_file_fn,
                                            bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(rejected_peaks_fn,
                                         chrom_sizes_fn,
                                         as_file_fn,
                                         bed_type=bed_type)

    # Upload file outputs from the local file system.
    overlapping_peaks = dxpy.upload_local_file(
        common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb = dxpy.upload_local_file(rejected_peaks_bb_fn)

    output = {
        "overlapping_peaks": dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb": dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks": dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb": dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in": npeaks_in,
        "npeaks_out": npeaks_out,
        "npeaks_rejected": npeaks_rejected,
        "frip_nreads": n_reads,
        "frip_nreads_in_peaks": n_reads_in_peaks,
        "frip_score": frip_score,
        "fragment_length_used": fraglen,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    return output
Exemplo n.º 17
0
def _overlap_filter_commands(peak_type):
    """Return ``(awk_command, cut_command, bed_type)`` for *peak_type*.

    The three supported peak formats differ only in how many columns each
    record has, which shifts the column numbers used by the awk filter and
    the number of leading columns kept by ``cut``.

    Raises:
        AssertionError: if *peak_type* is not narrowPeak, gappedPeak or
            broadPeak.  Raised explicitly (rather than via ``assert``) so the
            check is not stripped when Python runs with ``-O``.
    """
    # In every awk filter below, s1 and s2 are the lengths of the two
    # intersected records and the last column of the ``intersectBed -wo``
    # output is divided by each; a line is kept when either ratio is >= 0.5.
    # NOTE(review): the last column is presumably the overlap size in bp, as
    # appended by ``-wo`` -- confirm against the bedtools version in use.
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        raise AssertionError(
            "%s is unrecognized.  peak_type should be narrowPeak, gappedPeak or broadPeak." % (peak_type))
    return awk_command, cut_command, bed_type


def _write_peaks_overlapping_both(pooled_fn, first_fn, second_fn,
                                  awk_command, cut_command, out_fn):
    """Write to *out_fn* the pooled peaks that overlap both other peak sets.

    The pooled peaks are intersected with *first_fn*, filtered by
    *awk_command*, trimmed back to the original columns by *cut_command* and
    de-duplicated; the survivors then go through the same steps against
    *second_fn*.
    """
    # presumably the second argument of common.run_pipe is the file that
    # receives the pipeline's stdout; its return value is not needed here.
    common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_fn, first_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (second_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], out_fn)


def replicated_overlap(rep1_peaks, rep2_peaks, pooled_peaks,
                       pooledpr1_peaks, pooledpr2_peaks,
                       rep1_ta, rep1_xcor, rep2_ta, rep2_xcor,
                       paired_end, chrom_sizes, as_file, peak_type, prefix,
                       fragment_length=None):
    """Select pooled peaks supported by replicates or pooled pseudoreplicates.

    A pooled peak is kept when it passes the overlap filter against both true
    replicates, or against both pooled pseudoreplicates.  Pooled peaks that
    do not intersect any kept peak are written out as rejected.  FRiP is then
    computed for the kept peaks against the pooled tagAlign, and bigBed
    versions of both peak sets are produced.

    Args:
        rep1_peaks, rep2_peaks, pooled_peaks, pooledpr1_peaks,
        pooledpr2_peaks: references accepted by ``dxpy.DXFile`` for the peak
            files of each replicate, the pool and the pooled pseudoreplicates.
        rep1_ta, rep2_ta: references to the replicate tagAlign files; they are
            also passed as the ``inputs`` of the 'pool' applet.
        rep1_xcor, rep2_xcor: references to the replicate cross-correlation
            files.
        paired_end: forwarded to ``xcor_only`` for the pooled reads.
        chrom_sizes: reference to the chromosome sizes file.
        as_file: reference to the autoSql file handed to ``common.bed2bb``.
        peak_type: 'narrowPeak', 'gappedPeak' or 'broadPeak'.
        prefix: basename for the output files.  When falsy, the basename is
            the pooled peaks file name with its peak and compression
            extensions stripped.
        fragment_length: when not None it is used as the fragment length and
            the pooled cross-correlation subjob is skipped.

    Returns:
        dict with dxlinks to the uploaded files (``overlapping_peaks``,
        ``overlapping_peaks_bb``, ``rejected_peaks``, ``rejected_peaks_bb``),
        the peak counts (``npeaks_in``, ``npeaks_out``, ``npeaks_rejected``),
        the FRiP values (``frip_nreads``, ``frip_nreads_in_peaks``,
        ``frip_score``) and ``fragment_length_used`` /
        ``fragment_length_given_by_user``.

    Raises:
        AssertionError: if *peak_type* is not one of the supported types.
    """

    rep1_peaks_file      = dxpy.DXFile(rep1_peaks)
    rep2_peaks_file      = dxpy.DXFile(rep2_peaks)
    pooled_peaks_file    = dxpy.DXFile(pooled_peaks)
    pooledpr1_peaks_file = dxpy.DXFile(pooledpr1_peaks)
    pooledpr2_peaks_file = dxpy.DXFile(pooledpr2_peaks)
    rep1_ta_file         = dxpy.DXFile(rep1_ta)
    rep2_ta_file         = dxpy.DXFile(rep2_ta)
    rep1_xcor_file       = dxpy.DXFile(rep1_xcor)
    rep2_xcor_file       = dxpy.DXFile(rep2_xcor)
    chrom_sizes_file     = dxpy.DXFile(chrom_sizes)
    as_file_file         = dxpy.DXFile(as_file)

    # Input filenames - necessary to define each explicitly because input files
    # could have the same name, in which case subsequent
    # file would overwrite previous file
    rep1_peaks_fn      = 'rep1-%s' % (rep1_peaks_file.name)
    rep2_peaks_fn      = 'rep2-%s' % (rep2_peaks_file.name)
    pooled_peaks_fn    = 'pooled-%s' % (pooled_peaks_file.name)
    pooledpr1_peaks_fn = 'pooledpr1-%s' % (pooledpr1_peaks_file.name)
    pooledpr2_peaks_fn = 'pooledpr2-%s' % (pooledpr2_peaks_file.name)
    rep1_ta_fn         = 'r1ta_%s' % (rep1_ta_file.name)
    rep2_ta_fn         = 'r2ta_%s' % (rep2_ta_file.name)
    rep1_xcor_fn       = 'r1cc_%s' % (rep1_xcor_file.name)
    rep2_xcor_fn       = 'r2cc_%s' % (rep2_xcor_file.name)
    chrom_sizes_fn     = 'chrom.sizes'
    as_file_fn         = '%s.as' % (peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        # Strip off the peak and compression extensions.  The name must come
        # from the DXFile handler: pooled_peaks itself is only the input
        # reference and has no .name attribute.
        pooled_name = pooled_peaks_file.name
        m = re.match(
            r'(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
            pooled_name)
        basename = m.group(1) if m else pooled_name

    # The bigBed filenames are not built here; common.bed2bb returns them.
    overlapping_peaks_fn = '%s.replicated.%s' % (basename, peak_type)
    rejected_peaks_fn    = '%s.rejected.%s' % (basename, peak_type)

    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
    overlap_pr_fn = 'replicated_pr.%s' % (peak_type)

    # Download file inputs to the local file system with local filenames.
    # NOTE(review): the replicate tagAlign and cross-correlation files are
    # downloaded but not read again in this function -- kept as-is.
    downloads = [
        (rep1_peaks_file, rep1_peaks_fn),
        (rep2_peaks_file, rep2_peaks_fn),
        (pooled_peaks_file, pooled_peaks_fn),
        (pooledpr1_peaks_file, pooledpr1_peaks_fn),
        (pooledpr2_peaks_file, pooledpr2_peaks_fn),
        (rep1_ta_file, rep1_ta_fn),
        (rep2_ta_file, rep2_ta_fn),
        (rep1_xcor_file, rep1_xcor_fn),
        (rep2_xcor_file, rep2_xcor_fn),
        (chrom_sizes_file, chrom_sizes_fn),
        (as_file_file, as_file_fn),
    ]
    for handler, local_fn in downloads:
        dxpy.download_dxfile(handler.get_id(), local_fn)

    pool_applet = dxpy.find_one_data_object(
            classname='applet',
            name='pool',
            project=dxpy.PROJECT_CONTEXT_ID,
            zero_ok=False,
            more_ok=False,
            return_handler=True)
    pool_replicates_subjob = \
        pool_applet.run(
            {"inputs": [rep1_ta, rep2_ta],
             "prefix": 'pooled_reps'},
            name='Pool replicates')
    # If fragment length was given by user we skip pooled_replicates
    # _xcor_subjob, set the pool_xcor_filename to None, and update
    # the flag fragment_length_given_by_user. Otherwise, run the subjob
    # to be able to extract the fragment length from cross-correlations.
    if fragment_length is not None:
        pool_xcor_filename = None
        fraglen = fragment_length
        fragment_length_given_by_user = True
    else:
        pooled_replicates_xcor_subjob = \
            xcor_only(
                pool_replicates_subjob.get_output_ref("pooled"),
                paired_end,
                spp_version=None,
                name='Pool cross-correlation')
        pooled_replicates_xcor_subjob.wait_on_done()
        pool_xcor_link = pooled_replicates_xcor_subjob.describe()['output'].get("CC_scores_file")
        pool_xcor_file = dxpy.get_handler(pool_xcor_link)
        pool_xcor_filename = 'poolcc_%s' % (pool_xcor_file.name)
        dxpy.download_dxfile(pool_xcor_file.get_id(), pool_xcor_filename)
        fraglen = common.xcor_fraglen(pool_xcor_filename)
        fragment_length_given_by_user = False

    # The pooled tagAlign is needed locally for the FRiP calculation below.
    pool_replicates_subjob.wait_on_done()
    pool_ta_link = pool_replicates_subjob.describe()['output'].get("pooled")
    pool_ta_file = dxpy.get_handler(pool_ta_link)
    pool_ta_filename = 'poolta_%s' % (pool_ta_file.name)
    dxpy.download_dxfile(pool_ta_file.get_id(), pool_ta_filename)

    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    # the only difference between the peak_types is how the extra columns are
    # handled
    awk_command, cut_command, bed_type = _overlap_filter_commands(peak_type)

    # Find pooled peaks that overlap Rep1 and Rep2 where overlap is defined as
    # the fractional overlap wrt any one of the overlapping peak pairs  > 0.5
    _write_peaks_overlapping_both(
        pooled_peaks_fn, rep1_peaks_fn, rep2_peaks_fn,
        awk_command, cut_command, overlap_tr_fn)
    print(
        "%d peaks overlap with both true replicates"
        % (common.count_lines(overlap_tr_fn)))

    # Find pooled peaks that overlap PseudoRep1 and PseudoRep2 where
    # overlap is defined as the fractional overlap wrt any one of the
    # overlapping peak pairs  > 0.5
    _write_peaks_overlapping_both(
        pooled_peaks_fn, pooledpr1_peaks_fn, pooledpr2_peaks_fn,
        awk_command, cut_command, overlap_pr_fn)
    print(
        "%d peaks overlap with both pooled pseudoreplicates"
        % (common.count_lines(overlap_pr_fn)))

    # Combine peak lists
    common.run_pipe([
        'cat %s %s' % (overlap_tr_fn, overlap_pr_fn),
        'sort -u'
        ], overlapping_peaks_fn)
    print(
        "%d peaks overlap with true replicates or with pooled pseudoreplicates"
        % (common.count_lines(overlapping_peaks_fn)))

    # rejected peaks: pooled peaks with no overlap with any accepted peak
    common.run_pipe([
        'intersectBed -wa -v -a %s -b %s' % (pooled_peaks_fn, overlapping_peaks_fn)
        ], rejected_peaks_fn)
    print("%d peaks were rejected" % (common.count_lines(rejected_peaks_fn)))

    # calculate FRiP (Fraction of Reads in Peaks)
    # pool_xcor_filename is None when the user supplied the fragment length.
    reads_in_peaks_fn = 'reads_in_%s.ta' % (peak_type)
    n_reads, n_reads_in_peaks, frip_score = common.frip(
        pool_ta_filename, pool_xcor_filename, overlapping_peaks_fn,
        chrom_sizes_fn, fraglen, reads_in_peaks_fn=reads_in_peaks_fn)

    # count peaks
    npeaks_in        = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out       = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected  = common.count_lines(rejected_peaks_fn)

    # make bigBed files for visualization
    overlapping_peaks_bb_fn = common.bed2bb(
        overlapping_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(
        rejected_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)

    # Upload file outputs from the local file system.

    overlapping_peaks       = dxpy.upload_local_file(common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb    = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks          = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb       = dxpy.upload_local_file(rejected_peaks_bb_fn)

    output = {
        "overlapping_peaks"     : dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb"  : dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks"        : dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb"     : dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in"             : npeaks_in,
        "npeaks_out"            : npeaks_out,
        "npeaks_rejected"       : npeaks_rejected,
        "frip_nreads"           : n_reads,
        "frip_nreads_in_peaks"  : n_reads_in_peaks,
        "frip_score"            : frip_score,
        "fragment_length_used"  : fraglen,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    return output