Example #1
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)

    # make the needed directories
    map(safe_makedir, config["dir"].values())

    stage_dict = {"download_encode": _download_encode,
                  "fastqc": _run_fastqc}

    curr_files = config["encode_file"]

    for stage in config["run"]:
        if stage == "download_encode":
            curr_files = _download_encode(config["encode_file"], config)
        elif stage == "fastqc":
            _run_fastqc(curr_files, config)
        elif stage == "trim":
            _run_trim(curr_files, config)
        elif stage == "align":
            _run_tophat(curr_files, config)


    cell_types = _get_cell_types(config["encode_file"])
    logger.info("files: %s" % (curr_files))
    logger.info("types: %s" % (cell_types))

    # end gracefully
    stop_cluster()
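
The pipeline in Example #1 is driven entirely by the parsed YAML file: it expects a dir mapping of directories to create, an encode_file entry, and an ordered run list of stage names. A minimal sketch of the dict that yaml.load would hand back is below; only the key names come from the code above, the concrete paths and stage names are illustrative assumptions.

# sketch of the parsed config Example #1 reads; values are assumptions,
# only the key names appear in the code above
config = {
    "dir": {"data": "data", "results": "results"},           # directories to create
    "encode_file": "encode_files.csv",                        # starting input
    "run": ["download_encode", "fastqc", "trim", "align"],    # stages, in order
}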
Example #2
File: slim.py  Project: Xiuying/projects
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    setup_logging(config)
    from rkinf.log import logger
    start_cluster(config)

    from rkinf.cluster import view
    input_files = [os.path.join(config["dir"]["data"], x) for x in
                   config["input"]]
    results_dir = config["dir"]["results"]

    map(safe_makedir, config["dir"].values())

    curr_files = input_files

    for stage in config["run"]:
        if stage == "fastqc":
            nfiles = len(curr_files)
            logger.info("Running %s on %s" % (stage, str(curr_files)))
            fastqc_config = _get_stage_config(config, stage)
            fastqc_outputs = view.map(fastqc.run, curr_files,
                                      [fastqc_config] * nfiles,
                                      [config] * nfiles)

        if stage == "cutadapt":
            nfiles = len(curr_files)
            cutadapt_config = _get_stage_config(config, stage)
            cutadapt_outputs = view.map(cutadapt_tool.run,
                                        curr_files,
                                        [cutadapt_config] * nfiles,
                                        [config] * nfiles)
            curr_files = cutadapt_outputs

        if stage == "novoalign":
            nfiles = len(curr_files)
            novoalign_config = _get_stage_config(config, stage)
            #db = novoindex.run(config["ref"],
            #                   _get_stage_config(config, "novoindex"),
            #                   config)
            db = config["genome"]["file"]
            novoalign_outputs = view.map(novoalign.run, curr_files,
                                         [db] * nfiles,
                                         [novoalign_config] * nfiles,
                                         [config] * nfiles)
            curr_files = novoalign_outputs

        if stage == "htseq-count":
            nfiles = len(curr_files)
            htseq_config = _get_stage_config(config, stage)
            htseq_outputs = view.map(htseq_count.run_with_config,
                                     curr_files,
                                     [config] * nfiles,
                                     [stage] * nfiles)
            combined_out = htseq_count.combine_counts(htseq_outputs,
                                                      "combined.counts")

    stop_cluster()
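
Examples #2 and #3 call a _get_stage_config helper that is not shown on this page. Judging from the later examples, which read config["stage"][stage] directly, a plausible minimal version is sketched below; the real helper in the rkinf project may differ.

def _get_stage_config(config, stage):
    # return the per-stage options block from the YAML config, or an empty
    # dict when no options are defined for that stage (an assumption about
    # how the missing case is handled)
    return config.get("stage", {}).get(stage, {})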
Example #3
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    setup_logging(config)
    from rkinf.log import logger
    start_cluster(config)

    from rkinf.cluster import view
    input_files = [
        os.path.join(config["dir"]["data"], x) for x in config["input"]
    ]
    results_dir = config["dir"]["results"]

    map(safe_makedir, config["dir"].values())

    curr_files = input_files

    for stage in config["run"]:
        if stage == "fastqc":
            nfiles = len(curr_files)
            logger.info("Running %s on %s" % (stage, str(curr_files)))
            fastqc_config = _get_stage_config(config, stage)
            fastqc_outputs = view.map(fastqc.run, curr_files,
                                      [fastqc_config] * nfiles,
                                      [config] * nfiles)

        if stage == "cutadapt":
            nfiles = len(curr_files)
            cutadapt_config = _get_stage_config(config, stage)
            cutadapt_outputs = view.map(cutadapt_tool.run, curr_files,
                                        [cutadapt_config] * nfiles,
                                        [config] * nfiles)
            curr_files = cutadapt_outputs

        if stage == "novoalign":
            nfiles = len(curr_files)
            novoalign_config = _get_stage_config(config, stage)
            #db = novoindex.run(config["ref"],
            #                   _get_stage_config(config, "novoindex"),
            #                   config)
            db = config["genome"]["file"]
            novoalign_outputs = view.map(novoalign.run, curr_files,
                                         [db] * nfiles,
                                         [novoalign_config] * nfiles,
                                         [config] * nfiles)
            curr_files = novoalign_outputs

        if stage == "htseq-count":
            nfiles = len(curr_files)
            htseq_config = _get_stage_config(config, stage)
            htseq_outputs = view.map(htseq_count.run_with_config, curr_files,
                                     [config] * nfiles, [stage] * nfiles)
            combined_out = htseq_count.combine_counts(htseq_outputs,
                                                      "combined.counts")

    stop_cluster()
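
The [config] * nfiles idiom in these examples simply broadcasts a constant argument across the parallel calls: IPython's view.map behaves like the builtin map applied element-wise over several equally long argument lists. A self-contained toy version of the pattern, with a stand-in for fastqc.run, is shown below.

def run(path, stage_config, config):
    # stand-in for fastqc.run; just records which arguments each call received
    return (path, stage_config, config)

files = ["a.fastq", "b.fastq"]
nfiles = len(files)
stage_config = {"threads": 1}
config = {"dir": {"results": "results"}}

# element-wise over the three lists, like view.map but serial and local
results = list(map(run, files, [stage_config] * nfiles, [config] * nfiles))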
Example #4
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    setup_logging(config)
    from rkinf.log import logger
    start_cluster(config)

    # after the cluster is up, import a view to it
    from rkinf.cluster import view
    in_file = config.get("query")

    # de-parallelize for now
    blast_results = []
    for stage in config["run"]:
        if config["stage"][stage]["program"] == "blastn":
            blastn_config = config["stage"][stage]
            blast_results = [blastn.run(in_file, ref, blastn_config, config) for
                             ref in config["refs"]]

    for identity in config["min_identity"]:
        filtered_results = []
        for blast_result in blast_results:
            filtered_results.append(blastn.filter_results_by_length(
                blast_result, identity))

        fasta_hits = set()
        for filtered_result in filtered_results:
            fasta_hits.update(blastn.get_id_of_hits(filtered_result))

        def in_set_predicate(x):
            return x.id in fasta_hits

        outfile = os.path.join(build_results_dir(blastn_config, config),
                               append_stem(os.path.basename(in_file),
                                           str(identity) + "_filt"))

        fasta_filtered = fasta.filter_fasta(in_file,
                                            in_set_predicate,
                                            outfile)

        trimmed = _trim(fasta_filtered, filtered_results)
        org_names = [x["name"] for x in config["refs"]]
        logger.info(trimmed)
        logger.info(filtered_results)
        logger.info(org_names)
        combined = _make_combined_csv(trimmed, filtered_results, org_names)

    stop_cluster()
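
Example #4 builds a set of BLAST hit ids and then keeps only the FASTA records whose id is in that set, via the in_set_predicate closure and fasta.filter_fasta. The helper itself is not shown; a hypothetical equivalent using Biopython's SeqIO would look roughly like this.

from Bio import SeqIO

def filter_fasta_by_ids(in_file, keep_ids, out_file):
    # keep only the records whose id is in keep_ids, mirroring what
    # in_set_predicate plus fasta.filter_fasta do above (a sketch, not the
    # project's actual implementation)
    records = (rec for rec in SeqIO.parse(in_file, "fasta")
               if rec.id in keep_ids)
    SeqIO.write(records, out_file, "fasta")
    return out_file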
Example #5
def main(config_file):

    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    setup_logging(config)
    from rkinf.log import logger
    start_cluster(config)
    from rkinf.cluster import view

    input_dir = config["dir"]["input_dir"]
    results_dir = config["dir"]["results"]
    input_files = glob.glob(os.path.join(input_dir, "*.bam"))

    """ example running with macs
    macs.run_with_config(input_file, config, control_file=None, stage=None)
    """

    curr_files = input_files
    for stage in config["run"]:
        # for now just run macs on all of these files without the control
        # file
        if stage == "macs":
            nfiles = len(curr_files)
            out_files = view.map(macs.run_with_config, curr_files,
                                 [config] * nfiles,
                                 [None] * nfiles,
                                 [stage] * nfiles)
            # just use the peak files going forward
            peak_files = [x[0] for x in out_files]
            curr_files = peak_files

        if stage == "intersect":
            """ 1) loop over the ids in the negative group
            """ for each one pick out the files that match it
            """ combine them into one file
            """ output it as the union
            """ 2) loop over the ids in the positive and test group
            """ find intersections of the ones that match the same id:
            """ intersectBed -wao -bed -f fraction -r -a bed1 -b -bed2
            """ might have to try a range of f

    stop_cluster()
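
The intersect stage in Example #5 is only sketched as comments, but the intersectBed call it describes maps directly onto bedtools. A hedged sketch of that call through subprocess is below; the fraction value and the choice to capture stdout into a file are assumptions.

import subprocess

def intersect_peaks(bed_a, bed_b, out_file, fraction=0.5):
    # intersectBed -wao -f <fraction> -r -a bed_a -b bed_b, as described in
    # the comments of the intersect stage; bedtools writes the overlaps to
    # stdout, which is captured into out_file here
    cmd = ["intersectBed", "-wao", "-f", str(fraction), "-r",
           "-a", bed_a, "-b", bed_b]
    with open(out_file, "w") as out_handle:
        subprocess.check_call(cmd, stdout=out_handle)
    return out_file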
Example #6
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    setup_logging(config)
    from rkinf.log import logger
    start_cluster(config)

    # after the cluster is up, import a view to it
    from rkinf.cluster import view
    in_file = config.get("query")
    org_names = [x["name"] for x in config["refs"]]

    curr_files = in_file

    for stage in config["run"]:
        if stage == "blastn":
            logger.info("Running %s on %s." % (stage, curr_files))
            blastn_config = config["stage"][stage]
            refs = config["refs"]
            args = zip(*itertools.product([curr_files], refs,
                                          [blastn_config], [config]))
            blastn_results = view.map(blastn.run, *args)
            curr_files = blastn_results

        if stage == "annotate":
            logger.info("Running %s on %s." % (stage, curr_files))
            # annotate the data frames
            args = zip(*itertools.product(curr_files, ["sseqid"],
                                          org_names))
            annotated = view.map(_annotate_df, *args)
            curr_files = annotated

        if stage == "combine":
            out_fname = os.path.join(os.path.dirname(curr_files[0]),
                                     append_stem(in_file, "combined"))
            logger.info("Combining %s into %s." % (curr_files, out_fname))
            org_names = [x["name"] for x in config["refs"]]
            #       combined = _make_combined_csv(curr_files, org_names, out_fname)

    stop_cluster()
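
The zip(*itertools.product(...)) expression in Example #6 crosses every query file with every reference and then transposes the resulting tuples into one list per positional argument, which is the shape view.map expects. A small self-contained illustration:

import itertools

files = ["query.fa"]
refs = [{"name": "hg19"}, {"name": "mm9"}]

# product() yields one tuple per (file, ref, stage_config, config) combination;
# zip(*...) transposes those rows into one column per argument position
rows = list(itertools.product(files, refs, ["stage_cfg"], ["cfg"]))
cols = list(zip(*rows))
# cols[0] holds the files, cols[1] the refs, and so on; each column has
# len(files) * len(refs) entries, so view.map(blastn.run, *cols) would run
# every file against every reference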
Example #7
def main(config_file):

    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    setup_logging(config)
    from rkinf.log import logger
    start_cluster(config)

    # after the cluster is up, import the view to it
    from rkinf.cluster import view
    input_files = config["input"]
    results_dir = config["dir"]["results"]

    # make the needed directories
    map(safe_makedir, config["dir"].values())

    curr_files = input_files

    ## qc steps
    for stage in config["run"]:
        if stage == "fastqc":
            # run the basic fastqc
            logger.info("Running %s on %s" % (stage, str(curr_files)))
            fastqc_config = config["stage"][stage]
            fastqc_outputs = view.map(fastqc.run, curr_files,
                                      [fastqc_config] * len(curr_files),
                                      [config] * len(curr_files))
            # this does nothing for now, not implemented yet
            summary_file = _combine_fastqc(fastqc_outputs)

        if stage == "trim":
            logger.info("Trimming poor quality ends "
                        " from %s" % (str(curr_files)))
            nlen = len(curr_files)
            min_length = str(config["stage"][stage].get("min_length", 20))

            # trim low quality ends of reads
            # do this dirty for now
            out_dir = os.path.join(results_dir, "trimmed")
            safe_makedir(out_dir)
            out_files = [append_stem(os.path.basename(x), "trim") for
                         x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            # XXX remove the magic number of 10, the length of the
            # minimum read to keep
            out_files = view.map(sickle.run, curr_files,
                                 ["se"] * nlen,
                                 ["sanger"] * nlen,
                                 [min_length] * nlen,
                                 out_files)
            curr_files = out_files

        if stage == "tagdust":
            input_files = curr_files
            # remove tags matching the other miRNA tested
            logger.info("Running %s on %s." % (stage, input_files))
            tagdust_config = config["stage"][stage]
            tagdust_outputs = view.map(tagdust.run, input_files,
                                       [tagdust_config] * len(input_files),
                                       [config] * len(input_files))
            curr_files = [x[0] for x in tagdust_outputs]

        if stage == "filter_length":
            # filter out reads below or above a certain length
            filter_config = config["stage"][stage]
            min_length = filter_config.get("min_length", 0)
            max_length = filter_config.get("max_length", MAX_READ_LENGTH)

            # length predicate
            def length_filter(x):
                return min_length < len(x.seq) < max_length

            # filter the input reads based on length
            # parallelizing this doesn't seem to work
            # ipython can't accept closures as an argument to view.map()
            """
            filtered_fastq = view.map(filter_seqio, tagdust_outputs,
                                      [length_filter] * len(tagdust_outputs),
                                      ["filt"] * len(tagdust_outputs),
                                      ["fastq"] * len(tagdust_outputs))"""
            out_files = [append_stem(os.path.basename(input_file[0]),
                         "filt") for input_file in tagdust_outputs]
            out_dir = os.path.join(config["dir"]["results"],
                                   "length_filtered")
            safe_makedir(out_dir)
            out_files = [os.path.join(out_dir, x) for x in out_files]

            filtered_fastq = [filter_seqio(x[0], length_filter, y, "fastq")
                              for x, y in zip(tagdust_outputs, out_files)]

            curr_files = filtered_fastq

        if stage == "count_ends":
            logger.info("Compiling nucleotide counts at 3' and 5' ends.")
            # count the nucleotide at the end of each read
            def count_ends(x, y):
                """ keeps a running count of an arbitrary set of keys
                during the reduce step """
                x[y] = x.get(y, 0) + 1
                return x

            def get_3prime_end(x):
                return str(x.seq[-1])

            def get_5prime_end(x):
                return str(x.seq[0])

            def output_counts(end_function, count_file):
                # if the count_file already exists, skip
                outdir = os.path.join(config["dir"]["results"], stage)
                safe_makedir(outdir)
                count_file = os.path.join(outdir, count_file)
                if os.path.exists(count_file):
                    return count_file
                # outputs a tab file of the counts at the end
                # of the fastq files
                counts = [reduce(count_ends,
                                 apply_seqio(x, end_function, kind="fastq"),
                                 {}) for x in curr_files]
                df = pd.DataFrame(counts,
                                  index=map(_short_name, curr_files))
                df = df.astype(float)
                total = df.sum(axis=1)
                df = df.div(total, axis=0)
                df["total"] = total
                df.to_csv(count_file, sep="\t")

            output_counts(get_3prime_end, "3prime_counts.tsv")
            output_counts(get_5prime_end, "5prime_counts.tsv")

        if stage == "tophat":
            tophat_config = config["stage"][stage]
            logger.info("Running tophat on %s" % (str(curr_files)))
            nlen = len(curr_files)
            pair_file = None
            ref_file = tophat_config["annotation"]
            out_base = os.path.join(results_dir, "mirna")
            align_dir = os.path.join(results_dir, "tophat")
            config = config
            tophat_files = view.map(tophat.align,
                                    curr_files,
                                    [pair_file] * nlen,
                                    [ref_file] * nlen,
                                    [out_base] * nlen,
                                    [align_dir] * nlen,
                                    [config] * nlen)
            curr_files = tophat_files

        if stage == "novoalign":
            logger.info("Running novoalign on %s" % (str(curr_files)))
            # align
            ref = config["genome"]["file"]
            novoalign_config = config["stage"][stage]
            aligned_outputs = view.map(novoalign.run, curr_files,
                                       [ref] * len(curr_files),
                                       [novoalign_config] * len(curr_files),
                                       [config] * len(curr_files))
            # convert sam to bam, sort and index
            picard = BroadRunner(config["program"]["picard"])
            bamfiles = view.map(picardrun.picard_formatconverter,
                                [picard] * len(aligned_outputs),
                                aligned_outputs)
            sorted_bf = view.map(picardrun.picard_sort,
                                 [picard] * len(bamfiles),
                                 bamfiles)
            # these files are the new starting point for the downstream
            # analyses, so copy them over into the data dir and set
            # them to read only
            data_dir = os.path.join(config["dir"]["data"], stage)
            safe_makedir(data_dir)
            view.map(shutil.copy, sorted_bf, [data_dir] * len(sorted_bf))
            new_files = [os.path.join(data_dir, x) for x in
                         map(os.path.basename, sorted_bf)]
            [os.chmod(x, stat.S_IREAD | stat.S_IRGRP) for x in new_files]
            # index the bam files for later use
            view.map(picardrun.picard_index, [picard] * len(new_files),
                     new_files)

            curr_files = new_files

        if stage == "new_coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = blastn.prepare_ref_file(config["stage"][stage]["ref"],
                                          config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"])
            out_dir = os.path.join(results_dir, "new_coverage")
            safe_makedir(out_dir)
            out_files = [replace_suffix(os.path.basename(x),
                                        "metrics") for x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun,
                                 curr_files,
                                 [ref] * nrun,
                                 [ribo] * nrun,
                                 out_files)
            curr_files = out_files

        if stage == "coverage":
            gtf = blastn.prepare_ref_file(config["annotation"], config)
            logger.info("Calculating coverage of features in %s for %s"
                        % (gtf, str(sorted_bf)))
            out_files = [replace_suffix(x, "counts.bed") for
                         x in sorted_bf]
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            logger.info(out_files)
            out_files = [os.path.join(out_dir,
                                      os.path.basename(x)) for x in out_files]
            logger.info(out_files)
            view.map(bedtools.count_overlaps, sorted_bf,
                     [gtf] * len(sorted_bf),
                     out_files)

    stop_cluster()
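
The filter_length stage in Example #7 falls back to a serial loop because, as the comment notes, view.map ships its arguments to the engines by pickling, and a closure such as length_filter cannot be pickled. A short demonstration of the difference, and one possible workaround (hoist the predicate to module level and pass the bounds as extra broadcast arguments), is sketched below; whether the project's filter_seqio accepts extra arguments that way is an assumption.

import pickle

def make_filter(min_length, max_length):
    # a closure like length_filter above; pickling it fails because nested
    # functions cannot be looked up by name
    def length_filter(record):
        return min_length < len(record.seq) < max_length
    return length_filter

def length_between(record, min_length, max_length):
    # module-level predicate: picklable, so it could be passed through
    # view.map along with [min_length] * n and [max_length] * n lists
    return min_length < len(record.seq) < max_length

# pickle.dumps(make_filter(0, 50))   # raises: can't pickle the nested function
pickle.dumps(length_between)          # fine: top-level functions pickle by name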