Example #1
def main(fwd_reads, rev_reads, ref_genome):
    fwd_bam = dxpy.new_dxjob(
        {
            'reads': fwd_reads,
            'ref_genome': ref_genome
        },
        'run_bwa',
        name='Map forward reads').get_output_ref('output_bam')
    rev_bam = dxpy.new_dxjob(
        {
            'reads': rev_reads,
            'ref_genome': ref_genome
        },
        'run_bwa',
        name='Map reverse reads').get_output_ref('output_bam')

    merge_job = dxpy.new_dxjob({
        'fwd_bam': fwd_bam,
        'rev_bam': rev_bam
    },
                               'combine_bams',
                               name='Combine bams')

    output = {
        'output_bam': merge_job.get_output_ref('output_bam'),
        'output_bai': merge_job.get_output_ref('output_bai')
    }

    return output
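The example above treats 'run_bwa' and 'combine_bams' as entry points of the same applet but does not show them. A minimal sketch of what 'combine_bams' could look like, assuming samtools is available on the worker (the merge commands and local filenames are illustrative, not taken from the original applet; only the output field names come from the example):

import subprocess
import dxpy

@dxpy.entry_point('combine_bams')
def combine_bams(fwd_bam, rev_bam):
    # Download the two mapped BAMs produced by the 'run_bwa' subjobs.
    dxpy.download_dxfile(fwd_bam, 'fwd.bam')
    dxpy.download_dxfile(rev_bam, 'rev.bam')
    # Merge and index them locally.
    subprocess.check_call(['samtools', 'merge', 'combined.bam', 'fwd.bam', 'rev.bam'])
    subprocess.check_call(['samtools', 'index', 'combined.bam'])
    # Upload the results; the field names match what main() expects.
    return {'output_bam': dxpy.dxlink(dxpy.upload_local_file('combined.bam')),
            'output_bai': dxpy.dxlink(dxpy.upload_local_file('combined.bam.bai'))}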
Example #2
def main(DX_APP_WIZARD_INPUT_SIGNATURE):
DX_APP_WIZARD_INITIALIZE_INPUT
DX_APP_WIZARD_DOWNLOAD_ANY_FILES
    # Split your work into parallel tasks.  As an example, the
    # following generates 10 subjobs running with the same dummy
    # input.

    subjobs = []
    for i in range(10):
        subjob_input = { "input1": True }
        subjobs.append(dxpy.new_dxjob(subjob_input, "process"))

    # The following line creates the job that will perform the
    # "postprocess" step of your app.  We've given it an input field
    # that is a list of job-based object references created from the
    # "process" jobs we just created.  Assuming those jobs have an
    # output field called "output", these values will be passed to the
    # "postprocess" job.  Because these values are not ready until the
    # "process" jobs finish, the "postprocess" job WILL NOT RUN until
    # all job-based object references have been resolved (i.e. the
    # jobs they reference have finished running).
    #
    # If you do not plan to have the "process" jobs create output that
    # the "postprocess" job will require, then you can explicitly list
    # the dependencies to wait for those jobs to finish by setting the
    # "depends_on" field to the list of subjobs to wait for (it
    # accepts either dxpy handlers or string IDs in the list).  We've
    # included this parameter in the line below as well for
    # completeness, though it is unnecessary if you are providing
    # job-based object references in the input that refer to the same
    # set of jobs.

    postprocess_job = dxpy.new_dxjob(fn_input={ "process_outputs": [subjob.get_output_ref("output") for subjob in subjobs] },
                                     fn_name="postprocess",
                                     depends_on=subjobs)
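The comments above refer to "process" and "postprocess" entry points that are not part of this snippet. A minimal sketch of how they might be declared, assuming the usual @dxpy.entry_point registration of a DNAnexus Python applet (the bodies are placeholders; only the field names "output" and "process_outputs" come from the example, and "answer" follows the wizard's own comments):

import dxpy

@dxpy.entry_point("process")
def process(input1):
    # Real per-chunk work would happen here; the sketch just echoes its input.
    return {"output": input1}

@dxpy.entry_point("postprocess")
def postprocess(process_outputs):
    # By the time this runs, every job-based object reference in
    # process_outputs has been resolved to a concrete value.
    return {"answer": process_outputs}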
Example #3
def geneBody_coverage(BAM_file, BED_file):
    dxpy.download_dxfile(BED_file, "genes.bed")
    dxpy.download_dxfile(BAM_file, "mappings.bam")

    # split mappings into chunks that can be done on a single worker
    # all mappings are loaded into RAM so can only do 5 million at a time
    run_shell(" ".join(["samtools", "view", "mappings.bam", "|", "split", "-l 10000000", "-", "split_map"]))
    run_shell(" ".join(["samtools", "view", "-H", "mappings.bam", ">", "header_only.sam"]))
    files = os.listdir(".")
    jobs = []
    for f in files:
        if f.startswith("split_map"):
            # add header 
            run_shell(" ".join(["cat", "header_only.sam", f, ">", "temp.sam"]))
            # convert to BAM
            run_shell(" ".join(["samtools", "view", "-S", "-b", "temp.sam", ">", "temp.bam"]))
            # upload file
            split_bam = dxpy.upload_local_file("temp.bam")
            # run analysis
            jobs.append(dxpy.new_dxjob({"BAM_file":dxpy.dxlink(split_bam.get_id()), "BED_file":BED_file}, "run_gbc"))
            
    run_shell( "ls -l" )

    gbc_agg_input = {"sub_reports":[]}
    for j in jobs:
        gbc_agg_input["sub_reports"].append({"job":j.get_id(), "field":"file"})

    agg_job = dxpy.new_dxjob(gbc_agg_input, "gbc_agg").get_id()
    
    return {"results":{"job":agg_job, "field":"cover"}}
Example #4
def main(DX_APP_WIZARD_INPUT_SIGNATURE):
DX_APP_WIZARD_INITIALIZE_INPUT
DX_APP_WIZARD_DOWNLOAD_ANY_FILES
    # We first create the "scatter" job which will scatter some input
    # (replace with your own input as necessary).
    input_to_scatter = "placeholder value"
    scatter_job = dxpy.new_dxjob(fn_input={ "input_to_scatter": input_to_scatter },
                                 fn_name="scatter")

    # We will want to call "process" on each output of "scatter", so
    # we call the "map" entry point to do so.  We can also provide
    # here additional input that we want each "process" entry point to
    # receive, e.g. a file ID to which the "process" function should
    # add rows of data.
    map_input = {
        "array_of_scattered_input": scatter_job.get_output_ref("array_of_scattered_input"),
        "process_input": { "additional_input": "file ID, for example" }
        }
    map_job = dxpy.new_dxjob(fn_input=map_input, fn_name="map")

    # Finally, we want the "postprocess" job to run after "map" is
    # done calling "process" on each of its inputs.  Note that a job
    # is marked as "done" only after all of its child jobs are also
    # marked "done".
    postprocess_input = {
        "process_outputs": map_job.get_output_ref("process_outputs"),
        "additional_input": "file ID, for example"
        }
    postprocess_job = dxpy.new_dxjob(fn_input=postprocess_input,
                                     fn_name="postprocess",
                                     depends_on=[map_job])
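The "scatter" entry point used above is not shown. A sketch of one possible shape for it, assuming it only needs to return an array under the key "array_of_scattered_input" (the splitting logic is a placeholder):

import dxpy

@dxpy.entry_point("scatter")
def scatter(input_to_scatter):
    # Break the input into pieces for the "map" entry point to iterate over;
    # here we simply wrap the single input in a list.
    return {"array_of_scattered_input": [input_to_scatter]}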
Example #5
def main(DX_APP_WIZARD_INPUT_SIGNATURE):
DX_APP_WIZARD_INITIALIZE_INPUT
DX_APP_WIZARD_DOWNLOAD_ANY_FILES
    # Split your work into parallel tasks.  As an example, the
    # following generates 10 subjobs running with the same dummy
    # input.

    subjobs = []
    for i in range(10):
        subjob_input = { "input1": True }
        subjobs.append(dxpy.new_dxjob(subjob_input, "process"))

    # The following line creates the job that will perform the
    # "postprocess" step of your app.  We've given it an input field
    # that is a list of job-based object references created from the
    # "process" jobs we just created.  Assuming those jobs have an
    # output field called "output", these values will be passed to the
    # "postprocess" job.  Because these values are not ready until the
    # "process" jobs finish, the "postprocess" job WILL NOT RUN until
    # all job-based object references have been resolved (i.e. the
    # jobs they reference have finished running).
    #
    # If you do not plan to have the "process" jobs create output that
    # the "postprocess" job will require, then you can explicitly list
    # the dependencies to wait for those jobs to finish by setting the
    # "depends_on" field to the list of subjobs to wait for (it
    # accepts either dxpy handlers or string IDs in the list).  We've
    # included this parameter in the line below as well for
    # completeness, though it is unnecessary if you are providing
    # job-based object references in the input that refer to the same
    # set of jobs.

    postprocess_job = dxpy.new_dxjob(fn_input={ "process_outputs": [subjob.get_output_ref("output") for subjob in subjobs] },
                                     fn_name="postprocess",
                                     depends_on=subjobs)
Example #6
def main(workers, max_files_per_worker=None, threads_per_worker=8, worker_launch_delay_seconds=0, smallest=False):
    mkdirs()

    worker_instance_type = "mem2_hdd2_x4"
    if smallest:
        # debugging - run on default instances
        worker_instance_type = None

    # launch workers, each to process a subset of the files
    subjobs = []
    for i in range(workers):
        subjob_input = { "workers": workers, "max_files_per_worker": max_files_per_worker, "whoami": i,
                         "threads_per_worker": threads_per_worker, "smallest": smallest }
        subjobs.append(dxpy.new_dxjob(subjob_input, "process", instance_type=worker_instance_type))
        if worker_launch_delay_seconds > 0 and i < (workers-1):
            # delay launching each worker to smooth out the load on the remote
            # server
            time.sleep(worker_launch_delay_seconds)

    # schedule postprocessing to reduce statistics
    output_fields = ["files_skipped", "files_transferred", "bytes_transferred"]
    postprocess_job = dxpy.new_dxjob(fn_input={k:[subjob.get_output_ref(k) for subjob in subjobs] for k in output_fields},
                                     fn_name="postprocess")

    return {k:postprocess_job.get_output_ref(k) for k in output_fields}
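The dictionary comprehension above gathers the per-worker counters files_skipped, files_transferred and bytes_transferred into lists. A plausible "postprocess" reducer would simply sum them (a sketch, not the applet's actual code):

import dxpy

@dxpy.entry_point("postprocess")
def postprocess(files_skipped, files_transferred, bytes_transferred):
    # Each argument arrives as a list with one resolved value per worker.
    return {"files_skipped": sum(files_skipped),
            "files_transferred": sum(files_transferred),
            "bytes_transferred": sum(bytes_transferred)}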
Example #7
def main(DX_APP_WIZARD_INPUT_SIGNATURE):
DX_APP_WIZARD_INITIALIZE_INPUT
DX_APP_WIZARD_DOWNLOAD_ANY_FILES
    # Split your input to be solved by the next stage of your app.
    # The following assumes you are splitting the input by giving
    # 100000 rows of a GenomicTable per subjob running the "process"
    # entry point.

    num_rows = DX_APP_WIZARD_||_INPUT.describe()["length"]

    row_chunk_size = 100000  # rows handled per "process" subjob, per the comment above
    subjobs = []

    for i in range(num_rows / row_chunk_size + (0 if num_rows % row_chunk_size == 0 else 1)):
        subjob_input = { "gtable_id": DX_APP_WIZARD_||_INPUT.get_id(),
                         "start_row": row_chunk_size * i,
                         "end_row": min(row_chunk_size * (i + 1), num_rows)}
        subjobs.append(dxpy.new_dxjob(subjob_input, "process"))

    # The following line creates the job that will perform the
    # "postprocess" step of your app.  We've given it an input field
    # that is a list of job-based object references created from the
    # "process" jobs we just created.  Assuming those jobs have an
    # output field called "output", these values will be passed to the
    # "postprocess" job.  Because these values are not ready until the
    # "process" jobs finish, the "postprocess" job WILL NOT RUN until
    # all job-based object references have been resolved (i.e. the
    # jobs they reference have finished running).
    #
    # If you do not plan to have the "process" jobs create output that
    # the "postprocess" job will require, then you can explicitly list
    # the dependencies to wait for those jobs to finish by setting the
    # "depends_on" field to the list of subjobs to wait for (it
    # accepts either dxpy handlers or string IDs in the list).  We've
    # included this parameter in the line below as well for
    # completeness, though it is unnecessary if you are providing
    # job-based object references in the input that refer to the same
    # set of jobs.

    postprocess_job = dxpy.new_dxjob(fn_input={"process_outputs": [subjob.get_output_ref("output") for subjob in subjobs]},
                                     fn_name="postprocess",
                                     depends_on=subjobs)

    # If you would like to include any of the output fields from the
    # postprocess_job as the output of your app, you should return it
    # here using a job-based object reference.  If the output field is
    # called "answer", you can pass that on here as follows:
    #
    # return {"app_output_field": postprocess_job.get_output_ref("answer"), ...}
    #
    # Tip: you can include in your output at this point any open
    # objects (such as GTables) which are closed by a job that
    # finishes later.  The system will check to make sure that the
    # output object is closed and will attempt to clone it out as
    # output into the parent container only after all subjobs have
    # finished.

    output = {}
Example #8
def main(worker_max, f_ids, bandwidth, species_name=None):
    """
    Input variables removed:
    """
    _run_cmd('aws --version', True)
    print('file ids: ' + str(f_ids))

    # Remove any files which are already symlinks
    f_ids = filter(lambda x: not _is_symlink(x), f_ids)

    if species_name is None:
        species_name = _get_species_name()

    # Set upload root to user specified directory or project
    projdx = dxpy.DXProject(os.environ['DX_PROJECT_CONTEXT_ID'])
    dir_file = os.path.join(S3_ROOT_FOLDER, species_name, projdx.name)

    # Trim trailing / in upload dir
    dir_file = dir_file.strip('/')
    print('Upload directory: ' + dir_file)

    # Programmatically split files into equal lists based on size and max workers
    split_list_dxlinks = _split_partition(f_ids, worker_max)

    # Select instance type based on user input
    trans_worker_inst = instance_from_bandwidth(bandwidth)

    # Run subjobs on list
    uploadjobs = [dxpy.new_dxjob(
                  fn_input={'target_s3': TARGET_S3,
                            'assigned_files': f_group,
                            'up_dir': dir_file},
                  fn_name='s3_upload',
                  instance_type=trans_worker_inst)
                  for f_group in split_list_dxlinks]

    # Merge S3 status upload reports from subjobs
    report_fileDXLinks = [subjob.get_output_ref('report_file_link')
                          for subjob in uploadjobs]

    print('Creating S3 upload report')
    report_job = dxpy.new_dxjob(
        fn_input={'filelinks': report_fileDXLinks}, fn_name='create_upload_report')

    # Output merged report
    print('Output final report')
    finalreportDXLink = report_job.get_output_ref('reportDXLink')
    output = {}
    output['upload_report'] = finalreportDXLink

    return output
Example #9
def concat_pdfs_link(pdf_refs, name):
    job = dxpy.new_dxjob(fn_name='concat_pdfs',
                         fn_input={
                             'pdfs': pdf_refs,
                             'name': name
                         })
    return job.get_output_ref('pdf')
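Because concat_pdfs_link returns a job-based object reference rather than a finished file, a caller can drop it straight into its own output without waiting for the subjob. For example (a hypothetical caller; page_pdfs and the output field name are made up for illustration):

# page_pdfs would be a list of dxlinks / output refs to PDF files produced earlier.
output = {"report_pdf": concat_pdfs_link(page_pdfs, "combined_report.pdf")}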
Example #10
def main(bam_set, map_report_set, dme_ix, uncompress_bam=True):

    # tool_versions.py --applet $script_name --appver $script_ver
    props = {}
    if os.path.isfile('/usr/bin/tool_versions.py'): 
        sw_versions = subprocess.check_output(['tool_versions.py', '--dxjson', 'dnanexus-executable.json'])
        props["SW"] = sw_versions
    
    print "* Value of bam_set:        '" + str(bam_set) + "'"
    print "* Value of map_report_set: '" + str(map_report_set) + "'"
    print "* Value of dme_ix:         '" + str(dme_ix) + "'"
    print "* Value of uncompress_bam: '" + str(uncompress_bam) + "'"

    print "* Calling merge_extract()..."
    inp = {
        'bam_set':        bam_set,
        'map_report_set': map_report_set, 
        'dme_ix_dxlink':  dme_ix,
        'uncompress_bam': uncompress_bam,
        'props':          props
    }
    extract_job = dxpy.new_dxjob(inp, "merge_extract")
    print "* Kicked off extract() and waiting..."
    extract_job.wait_on_done() # Wait because we want the qc_metrics to pass to other jobs.
    extract_out = extract_job.describe()['output']
    target_root = extract_out['target_root']
    qc_metrics = extract_out['qc_metrics']

    print "* Calling post_extraction()..."
    post_extraction_out = post_extraction(extract_out["CpG_context_dxlink"], \
                                          extract_out["CHG_context_dxlink"], \
                                          extract_out["CHH_context_dxlink"], \
                                          dme_ix, target_root, qc_metrics, props)

    print "* Check storage..."
    run_cmd('ls -l')
    run_cmd('df -k .')

    print "* Finished."

    return {
        # from extract() 
        #"bam_biorep":    extract_out['biorep_bam_dxlink'], 
        "bam_biorep_qc": extract_out['biorep_bam_qc_dxlink'], 
        "map_biorep":    extract_out['biorep_map_dxlink'],
        "mbias_report":  extract_out["mbias_report_dxlink"],
        
        # from post_extraction() 
        "signal": post_extraction_out["bigWig_dxlink"],
        
        "CpG_bed": post_extraction_out["CpG_bed_dxlink"],
        "CHG_bed": post_extraction_out["CHG_bed_dxlink"],
        "CHH_bed": post_extraction_out["CHH_bed_dxlink"],
        
        "CpG_bb": post_extraction_out["CpG_bb_dxlink"],
        "CHG_bb": post_extraction_out["CHG_bb_dxlink"],
        "CHH_bb": post_extraction_out["CHH_bb_dxlink"],

        "metadata": json.dumps(qc_metrics) 
        }
Example #11
def main(bam_set, map_report_set, dme_ix, uncompress_bam=True):

    # tool_versions.py --applet $script_name --appver $script_ver
    props = {}
    if os.path.isfile('/usr/bin/tool_versions.py'): 
        sw_versions = subprocess.check_output(['tool_versions.py', '--dxjson', 'dnanexus-executable.json'])
        props["SW"] = sw_versions
    
    print "* Value of bam_set:        '" + str(bam_set) + "'"
    print "* Value of map_report_set: '" + str(map_report_set) + "'"
    print "* Value of dme_ix:         '" + str(dme_ix) + "'"
    print "* Value of uncompress_bam: '" + str(uncompress_bam) + "'"

    print "* Calling merge_extract()..."
    inp = {
        'bam_set':        bam_set,
        'map_report_set': map_report_set, 
        'dme_ix_dxlink':  dme_ix,
        'uncompress_bam': uncompress_bam,
        'props':          props
    }
    extract_job = dxpy.new_dxjob(inp, "merge_extract")
    print "* Kicked off extract() and waiting..."
    extract_job.wait_on_done() # Wait because we want the qc_metrics to pass to other jobs.
    extract_out = extract_job.describe()['output']
    target_root = extract_out['target_root']
    qc_metrics = extract_out['qc_metrics']

    print "* Calling post_extraction()..."
    post_extraction_out = post_extraction(extract_out["CpG_context_dxlink"], \
                                          extract_out["CHG_context_dxlink"], \
                                          extract_out["CHH_context_dxlink"], \
                                          dme_ix, target_root, qc_metrics, props)

    print "* Check storage..."
    run_cmd('ls -l')
    run_cmd('df -k .')

    print "* Finished."

    return {
        # from extract() 
        #"bam_biorep":    extract_out['biorep_bam_dxlink'], 
        "bam_biorep_qc": extract_out['biorep_bam_qc_dxlink'], 
        "map_biorep":    extract_out['biorep_map_dxlink'],
        "mbias_report":  extract_out["mbias_report_dxlink"],
        
        # from post_extraction() 
        "signal": post_extraction_out["bigWig_dxlink"],
        
        "CpG_bed": post_extraction_out["CpG_bed_dxlink"],
        "CHG_bed": post_extraction_out["CHG_bed_dxlink"],
        "CHH_bed": post_extraction_out["CHH_bed_dxlink"],
        
        "CpG_bb": post_extraction_out["CpG_bb_dxlink"],
        "CHG_bb": post_extraction_out["CHG_bb_dxlink"],
        "CHH_bb": post_extraction_out["CHH_bb_dxlink"],

        "metadata": json.dumps(qc_metrics) 
        }
Example #12
def main(**job_inputs):
    # If we weren't provided a mmi index for the reference, generate it.
    if 'genome_mmi' not in job_inputs:
        mmi_input = {'genome_fastagz': job_inputs['genome_fastagz']}
        minimap_index_job = dxpy.new_dxjob(mmi_input, 'run_minimap_index')
        job_inputs['genome_mmi'] = minimap_index_job.get_output_ref(
            'genome_mmi')
    output = {'genome_mmi': job_inputs['genome_mmi']}

    # check if we're dealing with pacbio or ONT reads and what the filetype is
    datatype = job_inputs['datatype']
    one_reads_file = dxpy.DXFile(job_inputs['reads'][0]).describe()['name']
    try:
        file_ext = re.search("(fastq|fasta|fa|fq){1}(.gz)?$",
                             one_reads_file,
                             flags=re.I).group(1).lower()
    except AttributeError:
        raise dxpy.AppError("Invalid filetype extension supplied.")

    # for fasta and fastq inputs, run jobs using native minimap2
    jobs = run_minimap2_subjobs(job_inputs)

    output['bam_files'] = [j.get_output_ref('mapped_reads') for j in jobs]
    output['bai_files'] = [
        j.get_output_ref('mapped_reads_index') for j in jobs
    ]

    return output
Example #13
def map_entry_point(array_of_scattered_input, process_input):
    # The following calls "process" for each of the items in
    # *array_of_scattered_input*, using as input the item in the
    # array, as well as the rest of the fields in *process_input*.
    if DEBUG:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    logger.debug("** in map entry point with %s *" % process_input)
    process_jobs = []
    for item in array_of_scattered_input:
        logger.debug("** scattering: %s *" % item)
        process_input["scattered_input"] = item
        process_jobs.append(
            dxpy.new_dxjob(fn_input=process_input, fn_name="process"))

    logger.info("* %s scatter jobs started *" % len(array_of_scattered_input))
    bams = []
    reports = []
    for subjob in process_jobs:
        bams.append(subjob.get_output_ref('bam_file'))
        reports.append(subjob.get_output_ref('report_file'))

    return {
        "bam_files": bams,
        "report_files": reports,
    }
Example #14
def main(fastq_gz_left_reads, fastq_gz_right_reads, indexed_reference, reads_per_chunk=25000000, aln_params="", sampe_params="-r '@RG\tID:1\tPL:ILLUMINA\tPU:None\tLB:1\tSM:1'"):

    picard_merge = applet("picard_merge_sam_files")
    if picard_merge is None:
        raise dxpy.AppError("unable to find applet called 'picard_merge_sam_files'.  Please copy into your project from the collection of developer applets")

    splitter = applet("fastq_splitter")
    if splitter is None:
        raise dxpy.AppError("unable to find applet called 'fastq_splitter'.  Please copy into your project from the collection of developer applets")

    bwa_aligner = applet("bwa_aligner")
    if bwa_aligner is None:
        raise dxpy.AppError("unable to find applet called 'bwa_aligner'.  Please copy into your project from the collection of developer applets")


    bwa_controller_input = {"left_reads": [], "right_reads": [], "indexed_reference": indexed_reference, "aln_params":aln_params, "sampe_params":sampe_params, "bwa_aligner": bwa_aligner.get_id()}
    bwa_subjobs = []
    for x, y in zip(fastq_gz_left_reads, fastq_gz_right_reads):
        left_job = splitter.run({"fastqgz": x, "reads_per_chunk": reads_per_chunk})
        right_job = splitter.run({"fastqgz": y, "reads_per_chunk": reads_per_chunk})
        bwa_controller_input["left_reads"].append(left_job.get_id())
        bwa_controller_input["right_reads"].append(right_job.get_id())
        bwa_subjobs.extend([left_job, right_job])

    bwa_controller_job = dxpy.new_dxjob(fn_input=bwa_controller_input, fn_name='bwa_controller', depends_on=bwa_subjobs)

    picard_merge_job = picard_merge.run({"BAMs": {"job": bwa_controller_job.get_id(), "field": "BAMs"}})

    print picard_merge_job.get_id()

    output = {"BAM": {"job": picard_merge_job.get_id(), "field": "BAM"}}

    return output
Example #15
def map_entry_point(array_of_scattered_input, process_input):
    # The following calls "process" for each of the items in
    # *array_of_scattered_input*, using as input the item in the
    # array, as well as the rest of the fields in *process_input*.
    if DEBUG:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    logger.debug("** in map entry point with %s *" % process_input)
    process_jobs = []
    for item in array_of_scattered_input:
        logger.debug("** scattering: %s *" % item)
        process_input["scattered_input"] = item
        process_jobs.append(dxpy.new_dxjob(fn_input=process_input, fn_name="process"))

    logger.info("* %s scatter jobs started *" % len(array_of_scattered_input))
    bams = []
    reports = []
    for subjob in process_jobs:
        bams.append(subjob.get_output_ref('bam_file'))
        reports.append(subjob.get_output_ref('report_file'))

    return {
        "bam_files": bams,
        "report_files": reports,
    }
Example #16
def run_pbmm2_subjobs(job_inputs):
    pbi_filenames = {}
    if job_inputs.get('reads_indices'):
        filenames = _get_filenames(job_inputs['reads_indices'])

        for indx, name in enumerate(filenames):
            pbi_filenames[name] = job_inputs['reads_indices'][indx]
    else:
        pbi_filenames = {}

    # now set up and run pbmm2 subjobs for mapping reads
    # group inputs into filesizes
    jobs = []
    # set default target size to 5GB
    for group in _group_movies(job_inputs['reads'], job_inputs['chunk_size']):
        group_fns = _get_filenames(group)
        group_pbis = [pbi_filenames.get(f + '.pbi') for f in group_fns]
        map_reads_input = {
            'bam_files': group,
            'pbi_files': group_pbis,
            'genome_fastagz': job_inputs['genome_fastagz'],
            'genome_mmi': job_inputs['genome_mmi']
        }
        job = dxpy.new_dxjob(map_reads_input, 'map_reads_pbmm2')
        jobs.append(job)
    return jobs
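The helpers _get_filenames and _group_movies used above are not included in this listing. A minimal sketch of _get_filenames, assuming it resolves each input file link to its name on the platform (the same describe-based lookup other examples on this page use):

import dxpy

def _get_filenames(dxlinks):
    # Look up the platform name of every input file, preserving order.
    return [dxpy.describe(link)['name'] for link in dxlinks]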
Example #17
def map_entry_point(array_of_scattered_input, process_input):
    # The following calls "process" for each of the items in
    # *array_of_scattered_input*, using as input the item in the
    # array, as well as the rest of the fields in *process_input*.
    process_jobs = []
    for item in array_of_scattered_input:
        process_input["scattered_input"] = item
        process_jobs.append(dxpy.new_dxjob(fn_input=process_input, fn_name="process"))
    return { "process_outputs": [subjob.get_output_ref("process_output") for subjob in process_jobs] }
Example #18
def main(reads1, reference_tar, bwa_aln_params, bwa_version, samtools_version, reads2=None):

    # Main entry-point.  Parameter defaults assumed to come from dxapp.json.
    # reads1, reference_tar, reads2 are links to DNAnexus files or None

    # This spawns only one or two subjobs for single- or paired-end,
    # respectively.  It could also download the files, chunk the reads,
    # and spawn multiple subjobs.

    # Files are downloaded later by subjobs into their own filesystems
    # and uploaded to the project.

    # Initialize file handlers for input files.

    paired_end = reads2 is not None
    unmapped_reads = [r for r in [reads1, reads2] if r]
    
    subjobs = []
    for reads in unmapped_reads:
        subjob_input = {"reads_file": reads,
                        "reference_tar": reference_tar,
                        "bwa_aln_params": bwa_aln_params,
                        "bwa_version": bwa_version}
        print "Submitting:"
        print subjob_input
        subjobs.append(dxpy.new_dxjob(subjob_input, "process"))

    # Create the job that will perform the "postprocess" step.  depends_on=subjobs, so blocks on all subjobs

    postprocess_job = dxpy.new_dxjob(fn_input={ "indexed_reads": [subjob.get_output_ref("output") for subjob in subjobs],
                                                "unmapped_reads": unmapped_reads,
                                                "reference_tar": reference_tar,
                                                "bwa_version": bwa_version,
                                                "samtools_version": samtools_version },
                                     fn_name="postprocess",
                                     depends_on=subjobs)

    mapped_reads = postprocess_job.get_output_ref("mapped_reads")
    mapping_statistics = postprocess_job.get_output_ref("mapping_statistics")

    output = { "mapped_reads": mapped_reads, "mapping_statistics": mapping_statistics, "paired_end": paired_end }
    print "Exiting with output: %s" %(output)
    return output
Example #19
def geneBody_coverage(BAM_file, BED_file):
    dxpy.download_dxfile(BED_file, "genes.bed")
    dxpy.download_dxfile(BAM_file, "mappings.bam")

    # split mappings into chunks that can be done on a single worker
    # all mappings are loaded into RAM so can only do 5 million at a time
    run_shell(" ".join([
        "samtools", "view", "mappings.bam", "|", "split", "-l 10000000", "-",
        "split_map"
    ]))
    run_shell(" ".join(
        ["samtools", "view", "-H", "mappings.bam", ">", "header_only.sam"]))
    files = os.listdir(".")
    jobs = []
    for f in files:
        if f.startswith("split_map"):
            # add header
            run_shell(" ".join(["cat", "header_only.sam", f, ">", "temp.sam"]))
            # convert to BAM
            run_shell(" ".join(
                ["samtools", "view", "-S", "-b", "temp.sam", ">", "temp.bam"]))
            # upload file
            split_bam = dxpy.upload_local_file("temp.bam")
            # run analysis
            jobs.append(
                dxpy.new_dxjob(
                    {
                        "BAM_file": dxpy.dxlink(split_bam.get_id()),
                        "BED_file": BED_file
                    }, "run_gbc"))

    run_shell("ls -l")

    gbc_agg_input = {"sub_reports": []}
    for j in jobs:
        gbc_agg_input["sub_reports"].append({
            "job": j.get_id(),
            "field": "file"
        })

    agg_job = dxpy.new_dxjob(gbc_agg_input, "gbc_agg").get_id()

    return {"results": {"job": agg_job, "field": "cover"}}
Example #20
def main(nSimulations, nWorkers):

    # To achieve a target of nSimulations total simulations,
    # an approximately equal share of simulations is delegated to each
    # worker initiated

    subjobs = []
    workerLoads = splitIntoGroups(nSimulations, nWorkers)

    
    for load in workerLoads:
        subjob_input = { "workerLoad": load}
        subjobs.append(dxpy.new_dxjob(subjob_input, "process"))

    # The postprocess job depends on all subjobs being "done"

    postprocess_job = dxpy.new_dxjob(fn_input={ "process_outputs": [subjob.get_output_ref("output") for subjob in subjobs],
                                                "total_count": nSimulations },
                                     fn_name="postprocess",
                                     depends_on=subjobs)

    output = {"estimatedPi": postprocess_job.get_output_ref("answer")}
    return output
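splitIntoGroups is not shown on this page. A sketch consistent with how it is used above would divide nSimulations into nWorkers near-equal integer shares:

def splitIntoGroups(nSimulations, nWorkers):
    # Distribute the total as evenly as possible, e.g. 10 over 3 -> [4, 3, 3].
    base, remainder = divmod(nSimulations, nWorkers)
    return [base + (1 if i < remainder else 0) for i in range(nWorkers)]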
Example #21
def run_minimap2_subjobs(job_inputs):
    # group subjobs by filesize chunks
    files_and_filesizes = zip(job_inputs['reads'],
                              _get_filesizes(job_inputs['reads']))
    jobs = []
    for group in dx_utils.schedule_lpt(files_and_filesizes,
                                       job_inputs['chunk_size']):
        map_reads_input = {
            'reads': group,
            'genome_fastagz': job_inputs['genome_fastagz'],
            'genome_mmi': job_inputs['genome_mmi'],
            'datatype': job_inputs['datatype']
        }
        job = dxpy.new_dxjob(map_reads_input, 'map_reads_minimap2')
        jobs.append(job)

    return jobs
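Example #21 leans on two helpers that are not shown: dx_utils.schedule_lpt, which packs files into size-balanced chunks, and _get_filesizes. A sketch of the latter, assuming it simply reads each file's size from its describe record:

import dxpy

def _get_filesizes(dxlinks):
    # Return the byte size of each file, in the same order as the input links.
    return [dxpy.describe(link)['size'] for link in dxlinks]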
Example #22
def main(**job_inputs):
    # If we weren't provided a mmi index for the reference, generate it.
    if 'genome_mmi' not in job_inputs:
        mmi_input = {'genome_fastagz': job_inputs['genome_fastagz']}
        minimap_index_job = dxpy.new_dxjob(mmi_input, 'run_minimap_index')
        job_inputs['genome_mmi'] = minimap_index_job.get_output_ref(
            'genome_mmi')
    output = {'genome_mmi': job_inputs['genome_mmi']}

    # check if we're dealing with pacbio or ONT reads and what the filetype is
    datatype = job_inputs['datatype']
    one_reads_file = dxpy.DXFile(job_inputs['reads'][0]).describe()['name']
    try:
        file_ext = re.search("(bam|fastq|fasta|fa|fq){1}(.gz)?$",
                             one_reads_file,
                             flags=re.I).group(1).lower()
    except AttributeError:
        raise dxpy.AppError("Unknown filetype extension supplied.")

    if file_ext == 'bam':
        # input bam files must be pacbio raw reads
        if datatype == 'ONT':
            raise dxpy.AppError("Invalid file input for provided datatype.")

        # for bam input, run jobs using pbmm2
        jobs = run_pbmm2_subjobs(job_inputs)

    else:
        # for fasta and fastq inputs, run jobs using native minimap2
        if job_inputs['pbbamify']:
            print(
                'WARNING: The "Run pbbamify" option is only valid for BAM input'
            )
        jobs = run_minimap2_subjobs(job_inputs)

    output['bam_files'] = [j.get_output_ref('mapped_reads') for j in jobs]
    output['bai_files'] = [
        j.get_output_ref('mapped_reads_index') for j in jobs
    ]

    return output
Example #23
def map_contaminant(Contig, Reads):
    # get ID of our mapper
    try:
        bwa = dxpy.DXApp(dxpy.find_apps(name="bwa").next()['id'])
    except StopIteration:
        raise dxpy.AppError("Unable to find app 'bwa'.  Please install it to enable contaminant mapping")

    # TODO: find optimal chunk size so we don't launch too many bwa jobs
    map_job = bwa.run({"reads":Reads, "reference": Contig, "discard_unmapped_rows":True, "chunk_size":10000000})

    total_reads = 0
    for r in Reads:
        desc = dxpy.DXGTable(r).describe()
        current_reads = desc['length']
        if 'sequence2' in desc['columns']:
            current_reads *= 2
        total_reads += current_reads

    # launch a job that waits for the mapping and calculates what % has mapped
    calc_job = dxpy.new_dxjob({"num_reads":total_reads, "mappings":{"job":map_job.get_id(), "field":"mappings"}}, "calc_contam")

    return calc_job.get_id()
Example #24
def map_contaminant(Contig, Reads):
    # get ID of our mapper
    try:
        bwa = dxpy.DXApp(
            dxpy.find_apps(name="bwa_mem_fastq_read_mapper").next()['id'])
    except StopIteration:
        raise dxpy.AppError(
            "Unable to find app 'bwa_mem_fastq_read_mapper'.  Please install it to enable contaminant mapping"
        )

    # TODO: find optimal chunk size so we don't launch too many bwa jobs
    map_job = bwa.run({
        "reads": Reads,
        "reference": Contig,
        "discard_unmapped_rows": True,
        "chunk_size": 10000000
    })

    total_reads = 0
    for r in Reads:
        desc = dxpy.DXGTable(r).describe()
        current_reads = desc['length']
        if 'sequence2' in desc['columns']:
            current_reads *= 2
        total_reads += current_reads

    # launch a job that waits for the mapping and calculates what % has mapped
    calc_job = dxpy.new_dxjob(
        {
            "num_reads": total_reads,
            "mappings": {
                "job": map_job.get_id(),
                "field": "mappings"
            }
        }, "calc_contam")

    return calc_job.get_id()
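The "calc_contam" entry point launched here is not shown. A sketch of what it might do, given that it receives the total read count plus a job-based reference to the mapper's "mappings" output, and that Example #30 below reads a "percent_mapped" field from this job (the row-count arithmetic is an assumption):

import dxpy

@dxpy.entry_point("calc_contam")
def calc_contam(num_reads, mappings):
    # By the time this entry point runs, the bwa job has finished and
    # *mappings* resolves to the mappings table it produced.
    mapped_rows = dxpy.DXGTable(mappings).describe()['length']
    percent_mapped = 100.0 * mapped_rows / num_reads if num_reads else 0.0
    return {"percent_mapped": percent_mapped}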
Example #25
def main(exp_acc, files_to_fetch=None, skipvalidate=True, key='www', debug=False):

    # Splits the work into parallel tasks: one for each file to fetch.

    if debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    proj_id = os.environ['DX_PROJECT_CONTEXT_ID']
    project = dxpy.DXProject(proj_id)  ## should be default

    logger.debug("* Project: " + proj_id)

    file_objs = []
    if files_to_fetch is not None:
        logger.debug("* f2f_json: " + files_to_fetch)
        file_objs = json.loads(files_to_fetch.encode('ascii'))  # Expect [ {},{},{},... ]
        logger.debug(file_objs)

    # f_obj = { "accession": ,"dx_folder": ,"dx_file_name": ,"enc_file_name": ,"bucket_url": }

    subjobs = []
    if file_objs:
        for f_obj in file_objs:
            skipvalidate_this = skipvalidate
            dx_file_name = f_obj["dx_file_name"]
            if dx_file_name.endswith(".fastq.gz") or dx_file_name.endswith(".fq.gz"):
                skipvalidate_this = True

            logger.debug(f_obj["bucket_url"] + " " + f_obj["enc_file_name"])

            #process(f_obj["enc_file_name"], f_obj["bucket_url"], project.get_id(), f_obj["dx_folder"], f_obj["accession"], \
            #                                                                    f_obj["dx_file_name"], skipvalidate_this)
            subjob_input = {
                "enc_file_name": f_obj["enc_file_name"],
                "bucket_url": f_obj["bucket_url"],
                "proj_id": project.get_id(),
                "dx_folder": f_obj["dx_folder"],
                "file_acc": f_obj["accession"],
                "dx_file_name": f_obj["dx_file_name"],
                "skipvalidate": skipvalidate_this
            }
            subjobs.append(dxpy.new_dxjob(subjob_input, "process"))
            #subjobs.append(dxpy.new_dxjob(subjob_input, "noop"))

    # This does not wait for subjob completion as I thought.
    files_fetched = [subjob.get_output_ref("file") for subjob in subjobs]
    logger.debug("Attempting to fetch %d file(s)" % (len(files_fetched)))

    if skipvalidate:
        output = {
                    "fetched_count": len(files_fetched),
                    "files": files_fetched
        }
    else:
        output = {
                    "fetched_count": len(files_fetched),
                    "files": files_fetched,
                    "reports": [subjob.get_output_ref("report") for subjob in subjobs],
                    "summaries": [subjob.get_output_ref("summary") for subjob in subjobs],
                    "zips": [subjob.get_output_ref("zip") for subjob in subjobs],
        }


    return output
Example #26
def RunWithBamInput(kwargs):  
    mappings_ids = kwargs["mappings_files"]
    mappings_names = sorted([dxpy.describe(id)["name"] for id in mappings_ids])
    num_threads = kwargs["num_threads_per_instance"]
    bam_config_fn = "bam_config.txt"
    
    if "bam_config_file" in kwargs:
        print "\nInput has a BAM config file. Need to download and validate bam config file"
        dxpy.download_dxfile(kwargs["bam_config_file"], bam_config_fn)
        ValidateBamConfig(bam_config_fn=bam_config_fn, bam_name_array=mappings_names)
    else:
        if "insert_size" not in kwargs:
            raise dxpy.AppError("Input files are bam files but neither a bam configuration file, nor an insert size was given as an app input.")
        if kwargs["bam_not_produced_by_bwa"]:
            return RunWithPindelInput(kwargs, sam2pindel=True)
        else:
            bam_config_fn = WriteConfigFile(mappings_names=mappings_names, fn=bam_config_fn,  insert_size=kwargs["insert_size"])   
        
    need_to_index = True
    if "bam_index_files" in kwargs:
        bam_idx_ids = kwargs["bam_index_files"]
        idx_names = sorted([dxpy.describe(id)["name"] for id in bam_idx_ids])
        if CheckBamIdxMatch(bam_names=mappings_names, idx_names=idx_names):
            need_to_index = False
            mappings_names = DownloadFilesFromArray(mappings_ids)
            bam_idx_names = DownloadFilesFromArray(bam_idx_ids)
            
    if need_to_index:
        mappings_names = DownloadFilesFromArray(mappings_ids)
        if not kwargs["assume_sorted"]:
            mappings_names = SortBams(bam_names=mappings_names, num_threads=num_threads)
        mappings_names, bam_idx_names = IndexBams(mappings_names) 
    
    chrom = kwargs["chromosome"] if "chromosome" in kwargs else "ALL"
    
    if "chromosome" in kwargs or kwargs["num_instances"] == 1:        
        command, output_path = BuildPindelCommand(kwargs=kwargs, chrom=chrom, input_fn=bam_config_fn, is_pindel_input_type=False)
        output_path = RunPindel(kwargs=kwargs, pindel_command=command, output_path=output_path)    
        app_outputs = UploadPindelOutputs(kwargs=kwargs, output_path=output_path)
        if kwargs["export_vcf"]:
            app_outputs["vcf"] = ExportVCF(kwargs=kwargs, output_path=output_path, ref_fn="reference_fasta")   

    else: 
        subjob_ids = SplitBamForSubjobs(kwargs, mappings_names, bam_config_fn)
        postprocess_inputs = {"subjob_outputs": [job.get_output_ref("subjob_output") for job in subjob_ids], "kwargs": kwargs}
        postprocess_job = dxpy.new_dxjob(fn_input = postprocess_inputs, fn_name = "postprocess")
        
        app_outputs = {"deletions" : {"job": postprocess_job.get_id(), "field": "deletions"},
                       "short_inserts" : {"job": postprocess_job.get_id(), "field": "short_inserts"}, 
                       "tandem_duplications" : {"job": postprocess_job.get_id(), "field": "tandem_duplications"},
                       "large_inserts" : {"job": postprocess_job.get_id(), "field": "large_inserts"},
                       "inversions" : {"job": postprocess_job.get_id(), "field": "inversions"},
                       "breakpoints" : {"job": postprocess_job.get_id(), "field": "breakpoints"}
                       }
        if kwargs["report_close_mapped_reads"] or kwargs["report_only_close_mapped_reads"]:
            app_outputs["close_mapped_reads"] = {"job": postprocess_job.get_id(), "field": "close_mapped_reads"}
        if kwargs["export_vcf"]:
            app_outputs["vcf"] = {"job": postprocess_job.get_id(), "field": "vcf"}   
        #if "breakdancer_calls_file" in kwargs:
        #    app_outputs["breakdancer_outputs"] = {"job": postprocess_job.get_id(), "field": "breakdancer_outputs"}
    
    dxlinks = []
    if need_to_index:
        if not kwargs["assume_sorted"]:
            for bam in mappings_names:
                uploaded_bam = dxpy.upload_local_file(bam, name=bam.rstrip('.bam')+"_sorted.bam")
                dxlinks.append(dxpy.dxlink(uploaded_bam))
        for idx in bam_idx_names:
            uploaded_idx = dxpy.upload_local_file(idx, name=idx.rstrip('.bam.bai')+"_sorted.bam.bai")
            dxlinks.append(dxpy.dxlink(uploaded_idx))
        app_outputs["sortedbam_and_index_files"] = dxlinks
        
    return app_outputs
Example #27
def main(reads1,
         crop_length,
         reference_tar,
         bwa_version,
         bwa_aln_params,
         samtools_version,
         debug,
         reads2=None):

    # Main entry-point.  Parameter defaults assumed to come from dxapp.json.
    # reads1, reference_tar, reads2 are links to DNAnexus files or None

    if debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    # This spawns only one or two subjobs for single- or paired-end,
    # respectively.  It could also download the files, chunk the reads,
    # and spawn multiple subjobs.

    # Files are downloaded later by subjobs into their own filesystems
    # and uploaded to the project.

    # Initialize file handlers for input files.

    paired_end = reads2 is not None

    if crop_length == 'native':
        crop_subjob = None
        unmapped_reads = [reads1, reads2]
    else:
        crop_subjob_input = {
            "reads1_file": reads1,
            "reads2_file": reads2,
            "crop_length": crop_length,
            "debug": debug
        }
        logger.info("Crop job input: %s" % (crop_subjob_input))
        crop_subjob = dxpy.new_dxjob(crop_subjob_input, "crop")
        unmapped_reads = [crop_subjob.get_output_ref("cropped_reads1")]
        if paired_end:
            unmapped_reads.append(crop_subjob.get_output_ref("cropped_reads2"))
        else:
            unmapped_reads.append(None)

    unmapped_reads = [r for r in unmapped_reads if r]

    mapping_subjobs = []
    for reads in unmapped_reads:
        mapping_subjob_input = {
            "reads_file": reads,
            "reference_tar": reference_tar,
            "bwa_aln_params": bwa_aln_params,
            "bwa_version": bwa_version,
            "debug": debug
        }
        logger.info("Mapping job input: %s" % (mapping_subjob_input))
        if crop_subjob:
            mapping_subjobs.append(
                dxpy.new_dxjob(fn_input=mapping_subjob_input,
                               fn_name="process",
                               depends_on=[crop_subjob]))
        else:
            mapping_subjobs.append(
                dxpy.new_dxjob(fn_input=mapping_subjob_input,
                               fn_name="process"))

    # Create the job that will perform the "postprocess" step.
    # depends_on=mapping_subjobs, so blocks on all mapping subjobs

    postprocess_job = dxpy.new_dxjob(fn_input={
        "indexed_reads": [
            subjob.get_output_ref("suffix_array_index")
            for subjob in mapping_subjobs
        ],
        "unmapped_reads":
        unmapped_reads,
        "reference_tar":
        reference_tar,
        "bwa_version":
        bwa_version,
        "samtools_version":
        samtools_version,
        "debug":
        debug
    },
                                     fn_name="postprocess",
                                     depends_on=mapping_subjobs)

    mapped_reads = postprocess_job.get_output_ref("mapped_reads")
    mapping_statistics = postprocess_job.get_output_ref("mapping_statistics")
    n_mapped_reads = postprocess_job.get_output_ref("n_mapped_reads")

    output = {
        "mapped_reads": mapped_reads,
        "crop_length": crop_length,
        "mapping_statistics": mapping_statistics,
        "paired_end": paired_end,
        "n_mapped_reads": n_mapped_reads
    }
    logger.info("Exiting with output: %s" % (output))
    return output
Example #28
def main(bam_set, map_report_set, dme_ix, uncompress_bam=True):

    # tool_versions.py --applet $script_name --appver $script_ver
    props = {}
    if os.path.isfile('/usr/bin/tool_versions.py'):
        sw_versions = subprocess.check_output(
            ['tool_versions.py', '--dxjson', 'dnanexus-executable.json'])
        props["SW"] = sw_versions

    print "* Value of bam_set:        '" + str(bam_set) + "'"
    print "* Value of map_report_set: '" + str(map_report_set) + "'"
    print "* Value of dme_ix:         '" + str(dme_ix) + "'"
    print "* Value of uncompress_bam: '" + str(uncompress_bam) + "'"

    print "* Calling merge_extract_full()..."
    inp = {
        'bam_set': bam_set,
        'map_report_set': map_report_set,
        'dme_ix_dxlink': dme_ix,
        'uncompress_bam': uncompress_bam,
        'props': props
    }
    extract_job = dxpy.new_dxjob(inp, "merge_extract_full")
    print "* Kicked off extract() and waiting..."
    extract_job.wait_on_done(
    )  # Wait because we want the qc_metrics to pass to other jobs.
    extract_out = extract_job.describe()['output']
    target_root = extract_out['target_root']
    qc_metrics = extract_out['qc_metrics']

    print "* Calling bedmethyl()..."
    # What is cheaper?  bedmethyl and signal in main or farm one out to a separate process?
    bedmethyl_out = bedmethyl_io(extract_out["cx_report_dxlink"],
                                 extract_out["chrom_sizes_dxlink"],
                                 target_root, qc_metrics, props)
    #inp = {
    #    'cx_report_dxlink':   extract_out["cx_report_dxlink"],
    #    'chrom_sizes_dxlink': extract_out["chrom_sizes_dxlink"],
    #    'target_root':        target_root,
    #    'qc_metrics':         qc_metrics,
    #    'props':              props
    #}
    #bedmethyl_job = dxpy.new_dxjob(inp, "bedmethyl_io")
    #print "* Kicked off bedmethyl() but not waiting waiting..."

    print "* Calling signal()..."
    signal_out = signal_io(extract_out["bedgraph_gz_dxlink"],
                           extract_out["chrom_sizes_dxlink"], target_root,
                           qc_metrics, props)

    print "* Check storage..."
    run_cmd('ls -l')
    run_cmd('df -k .')

    #bedmethyl_job.wait_on_done() # Wait because we want the qc_metrics to pass to other jobs.
    #bedmethyl_out = bedmethyl_job.describe()['output']
    print "* Finished."

    return {
        # from extract()
        #"bam_biorep":    extract_out['biorep_bam_dxlink'],
        "bam_biorep_qc": extract_out['biorep_bam_qc_dxlink'],
        "map_biorep": extract_out['biorep_map_dxlink'],
        "mbias_report": extract_out["mbias_report_dxlink"],

        # from signal()
        "signal": signal_out["bigWig_dxlink"],

        # from bedmethyl()
        "CpG_bed": bedmethyl_out["CpG_bed_dxlink"],
        "CHG_bed": bedmethyl_out["CHG_bed_dxlink"],
        "CHH_bed": bedmethyl_out["CHH_bed_dxlink"],
        "CpG_bb": bedmethyl_out["CpG_bb_dxlink"],
        "CHG_bb": bedmethyl_out["CHG_bb_dxlink"],
        "CHH_bb": bedmethyl_out["CHH_bb_dxlink"],
        "metadata": json.dumps(qc_metrics)
    }
Example #29
def main(accession, key=None, debug=False, skipvalidate=False):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    #files = [dxpy.DXFile(item) for item in files]

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.

    #for i, f in enumerate(files):
    #    dxpy.download_dxfile(f.get_id(), "files-" + str(i))

    # Split your work into parallel tasks.  As an example, the
    # following generates 10 subjobs running with the same dummy
    # input.

    if debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    (AUTHID, AUTHPW, SERVER) = processkey(key)

    url = SERVER + 'experiments/%s/?format=json&frame=embedded' % (accession)
    #get the experiment object
    logger.debug("%s - %s" % (url, AUTHID))
    response = encoded_get(url, AUTHID, AUTHPW)
    logger.debug(response)

    exp = response.json()
    reps = exp.get('replicates')
    # for some reason cannot write exp json to STDERR/logger
    logger.debug(reps or "No replicates")
    '''
    Derive replicate structure and make directories
    '''
    project = dxpy.DXProject(
        os.environ['DX_PROJECT_CONTEXT_ID'])  ## should be default
    exp_folder = "%s/%s" % (ROOT_FOLDER, accession)
    #rf = find_or_create_folder(project, ROOT_FOLDER)
    #project.new_fo
    #f = find_or_create_folder(project, exp_folder, root_folder='/'+ROOT_FOLDER)
    for rep in exp['replicates']:
        rep_folder = "%s/rep%s_%s" % (exp_folder,
                                      rep['biological_replicate_number'],
                                      rep['technical_replicate_number'])
        project.new_folder(rep_folder, parents=True)

    subjobs = []
    files = exp.get('files')
    if reps and files:
        for ff in files:
            if ff['file_format'] == 'fastq':
                folder = "%s/rep%s_%s" % (
                    exp_folder, ff['replicate']['biological_replicate_number'],
                    ff['replicate']['technical_replicate_number'])
                file_name, bucket_url = get_bucket(SERVER, AUTHID, AUTHPW, ff)
                subjob_input = {
                    "filename": file_name,
                    "bucket_url": bucket_url,
                    "project": project.get_id(),
                    "folder": folder,
                    "skipvalidate": skipvalidate
                }
                subjobs.append(dxpy.new_dxjob(subjob_input, "process"))

    # The following line creates the job that will perform the
    # "postprocess" step of your app.  We've given it an input field
    # that is a list of job-based object references created from the
    # "process" jobs we just created.  Assuming those jobs have an
    # output field called "output", these values will be passed to the
    # "postprocess" job.  Because these values are not ready until the
    # "process" jobs finish, the "postprocess" job WILL NOT RUN until
    # all job-based object references have been resolved (i.e. the
    # jobs they reference have finished running).
    #
    # If you do not plan to have the "process" jobs create output that
    # the "postprocess" job will require, then you can explicitly list
    # the dependencies to wait for those jobs to finish by setting the
    # "depends_on" field to the list of subjobs to wait for (it
    # accepts either dxpy handlers or string IDs in the list).  We've
    # included this parameter in the line below as well for
    # completeness, though it is unnecessary if you are providing
    # job-based object references in the input that refer to the same
    # set of jobs.
    # If you would like to include any of the output fields from the
    # postprocess_job as the output of your app, you should return it
    # here using a job-based object reference.  If the output field in
    # the postprocess function is called "answer", you can pass that
    # on here as follows:
    #
    #return { "FastQC_reports": [ dxpy.dxlink(item) for item in postprocess_job.get_output_ref("report") ]}
    #
    # Tip: you can include in your output at this point any open
    # objects (such as gtables) which will be closed by a job that
    # finishes later.  The system will check to make sure that the
    # output object is closed and will attempt to clone it out as
    # output into the parent container only after all subjobs have
    # finished.

    if skipvalidate:
        output = {
            "files": [subjob.get_output_ref("file") for subjob in subjobs]
        }
    else:
        output = {
            "files": [subjob.get_output_ref("file") for subjob in subjobs],
            "reports": [subjob.get_output_ref("report") for subjob in subjobs],
            "summaries":
            [subjob.get_output_ref("summary") for subjob in subjobs],
            "zips": [subjob.get_output_ref("zip") for subjob in subjobs],
        }

    return output
Example #30
def main(**job_inputs):
    output = {}
    reportInput = {}
    
    run_shell("dx-spans-to-bed --output genes.bed " + job_inputs["gene_model"]["$dnanexus_link"])
    bed_id = dxpy.upload_local_file("genes.bed").get_id()
    mappings_id = job_inputs["mappings"]["$dnanexus_link"]

    # get contaminant mapping started if we're doing it:
    if "contaminants" in job_inputs:
        if not "original_reads" in job_inputs:
            raise dxpy.AppError("Original Reads must be input to calculate contamination levels. Please also supply the reads object that corresponds to these RNA-Seq mappings")

        name_input = []
        contam_input = []

        #spawn mappings job for each ContigSet
        for contaminant in job_inputs['contaminants']:
            calc_job = map_contaminant(Reads=job_inputs['original_reads'], Contig=contaminant)

            name_input.append(dxpy.DXRecord(contaminant).describe()['name'])
            contam_input.append({"job":calc_job, "field":"percent_mapped"})
    
        reportInput['contam'] = contam_input
        reportInput['names'] = name_input
    else:
        reportInput['contam'] = None
        reportInput['names'] = None

    # output mappings as SAM for analysis modules
    run_shell(" ".join(["dx-mappings-to-sam", "--discard_unmapped", "--output mappings.sam", mappings_id]))
    run_shell(" ".join(["samtools", "view", "-S", "-b", "mappings.sam", ">", "mappings.bam"]))
    bam_id = dxpy.upload_local_file("mappings.bam", wait_on_close=True).get_id()

    job1 = dxpy.new_dxjob( {'BED_file':bed_id, "BAM_file":dxpy.dxlink(bam_id)}, "geneBody_coverage" )

    # if paired then do inner distance calculation
    if "chr2" in dxpy.DXGTable(mappings_id).get_col_names():
        job2 = dxpy.new_dxjob( {'BED_file':bed_id, "BAM_file":dxpy.dxlink(bam_id)}, "inner_distance" )
    else:
        job2 = None

    job3 = dxpy.new_dxjob( {'BED_file':bed_id, "BAM_file":dxpy.dxlink(bam_id)}, "junction_annotation" )

    job4 = dxpy.new_dxjob( {"BAM_file":dxpy.dxlink(bam_id)}, "read_duplication" )

    # implement this one when we can request a large RAM instance - requires 19GB for human genome
    job5 = dxpy.new_dxjob( {'BED_file':bed_id, "BAM_file":dxpy.dxlink(bam_id)}, "read_distribution")
    #                       {"systemRequirements": {"instanceType":"dx_m2.2xlarge"}} )

    reportInput['geneBody'] = {"job":job1.get_id(), "field":"results"}
    if job2 != None:
        reportInput['inner_dist'] = {"job":job2.get_id(), "field":"results"}
    else:
        reportInput['inner_dist'] = None

    reportInput['junc_ann'] = {"job":job3.get_id(), "field":"results"}
    reportInput['read_dup'] = {"job":job4.get_id(), "field":"results"}
    reportInput['read_dist'] = {"job":job5.get_id(), "field":"results"}
    reportInput['mappings'] = job_inputs["mappings"]

    reportJob = dxpy.new_dxjob( reportInput, "generate_report" )

    output['report'] = {"job":reportJob.get_id(), "field": "Report"}
    
    return output
Example #31
def main(reads, dme_ix, ncpus, splitsize):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    #dx_reads = [dxpy.DXFile(item) for item in reads]

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.

    # We first create the "scatter" job which will scatter some input
    # (replace with your own input as necessary).
    logger.info("* Start Scatter with %d files %sM read splits *" %
                (len(reads), splitsize))

    scatter_job = dxpy.new_dxjob(fn_input={
        'orig_reads': reads,
        'split_size': splitsize,
    },
                                 fn_name="scatter")

    # We will want to call "process" on each output of "scatter", so
    # we call the "map" entry point to do so.  We can also provide
    # here additional input that we want each "process" entry point to
    # receive, e.g. a GTable ID to which the "process" function should
    # add rows of data.

    reads_root = simplify_name() or strip_extensions(
        dxpy.describe(reads[0])['name'], STRIP_EXTENSIONS)

    map_input = {
        "array_of_scattered_input":
        scatter_job.get_output_ref("array_of_scattered_input"),
        "process_input": {
            "reads_root": reads_root,
            "ncpus": ncpus,
            "dme_ix": dme_ix
        }
    }
    logger.info("* Start Map with: %s *" % map_input)
    map_job = dxpy.new_dxjob(fn_input=map_input, fn_name="map")

    # Finally, we want the "postprocess" job to run after "map" is
    # done calling "process" on each of its inputs.  Note that a job
    # is marked as "done" only after all of its child jobs are also
    # marked "done".
    logger.info("* Waiting for map job to finish...")
    postprocess_input = {
        "bam_files": map_job.get_output_ref("bam_files"),
        "report_files": map_job.get_output_ref("report_files"),
        "bam_root": reads_root + '_techrep'
    }
    logger.info("* Start Post process with: %s *" % postprocess_input)
    postprocess_job = dxpy.new_dxjob(fn_input=postprocess_input,
                                     fn_name="postprocess",
                                     depends_on=[map_job])

    # If you would like to include any of the output fields from the
    # postprocess_job as the output of your app, you should return it
    # here using a job-based object reference.
    #
    # return { "app_output_field": postprocess_job.get_output_ref("final_output"), ...}
    #
    # Tip: you can include in your output at this point any open
    # objects (such as gtables) which will be closed by a job that
    # finishes later.  The system will check to make sure that the
    # output object is closed and will attempt to clone it out as
    # output into the parent container only after all subjobs have
    # finished.

    output = {}
    output["bam_techrep"] = dxpy.dxlink(
        postprocess_job.get_output_ref("bam_techrep"))
    output["bam_techrep_qc"] = dxpy.dxlink(
        postprocess_job.get_output_ref("bam_techrep_qc"))
    output["map_techrep"] = dxpy.dxlink(
        postprocess_job.get_output_ref("map_techrep"))
    output["reads"] = postprocess_job.get_output_ref("reads")
    output["metadata"] = postprocess_job.get_output_ref("metadata")

    return output
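
For context, a sketch of what the "map" entry point launched above might look like: it starts one "process" subjob per scattered chunk and returns job-based references for the "postprocess" step. The output field names "bam_files" and "report_files" come from the call site above; the per-subjob field names and the shape of each scattered element are assumptions.

import dxpy

@dxpy.entry_point("map")
def map_entry_point(array_of_scattered_input, process_input):
    # One "process" subjob per scattered element; each subjob is assumed to
    # return "bam_file" and "report_file" outputs (hypothetical field names).
    process_jobs = []
    for scattered_input in array_of_scattered_input:
        subjob_input = dict(process_input)
        subjob_input["scattered_input"] = scattered_input
        process_jobs.append(dxpy.new_dxjob(subjob_input, "process"))

    return {
        "bam_files": [job.get_output_ref("bam_file") for job in process_jobs],
        "report_files": [job.get_output_ref("report_file") for job in process_jobs],
    }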
Example #32
def concat_pdfs_link(pdf_refs, name):
    job = dxpy.new_dxjob(fn_name='concat_pdfs',
            fn_input={'pdfs': pdf_refs, 'name': name})
    return job.get_output_ref('pdf')
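
A short usage sketch (sample_jobs and the output field name are hypothetical): because concat_pdfs_link() returns a job-based reference, it can be placed directly into the caller's output without waiting for the concatenation job to finish.

# Hypothetical caller: merge per-sample PDFs produced by earlier subjobs.
pdf_refs = [job.get_output_ref("pdf") for job in sample_jobs]
output = {"combined_pdf": concat_pdfs_link(pdf_refs, "combined_report.pdf")}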
Example #33
def SplitBamForSubjobs(kwargs, bam_names, bam_config_fn=None):
    num_threads = kwargs["num_threads_per_instance"]
    print "\nSplitting bam for subjobs"

    # Assuming that all bam files have the same chromosomes (is this safe?)
    subprocess.check_output("samtools view -H {input_bam} > header.txt".format(
        input_bam=bam_names[0]),
                            shell=True)
    with open('header.txt') as fh:
        header = [line.rstrip('\n') for line in fh]
    print "Input header: "
    for line in header:
        print line

    print "Save unmapped reads as bam files to merge into subjob files"
    unmapped = {}
    for bam in bam_names:
        fn = bam.rstrip('.bam') + '_unmapped'
        command = "samtools view -@ {n} -u -b -f 4 {bam} > {unmapped}".format(
            n=num_threads, bam=bam, unmapped=fn)
        print command
        subprocess.check_call(command, shell=True)
        unmapped[bam] = fn

    groups = SplitGenomeFromSam(header, kwargs["num_instances"])
    subjobs = []
    subjob_no = 0
    for group in groups:
        group = " ".join(group)
        subjob_bam_fn = []

        for bam in bam_names:
            start_time = time.time()
            print "\nMerging {bam} with unmapped reads for pindel subjobs".format(
                bam=bam)
            out_fn = bam.rstrip('.bam') + '_' + str(subjob_no) + '.bam'

            command = "samtools view -@ {n} -bh {bam} {group} > tmp.bam".format(
                n=num_threads, bam=bam, group=group)
            subprocess.check_call(command, shell=True)

            split_command = "samtools merge -@ {n} {out} {unmapped} tmp.bam ".format(
                n=num_threads, out=out_fn, unmapped=unmapped[bam])
            print split_command
            subprocess.check_call(split_command, shell=True)

            print "Samtools view and merge ran in: {min} minutes".format(
                min=float((time.time() - start_time) / 60))
            subjob_bam_fn.append(out_fn)

        subjob_kwargs = kwargs.copy()
        subjob_bam_fn, subjob_bam_idx_fn = IndexBams(bam_names=subjob_bam_fn)

        print "Uploading split bam files: " + str(subjob_bam_fn)
        subjob_bam_ids = [
            dxpy.dxlink(dxpy.upload_local_file(bam)) for bam in subjob_bam_fn
        ]
        print "Uploading split bam index files: " + str(subjob_bam_idx_fn)
        subjob_bam_idx_ids = [
            dxpy.dxlink(dxpy.upload_local_file(idx))
            for idx in subjob_bam_idx_fn
        ]

        subjob_kwargs["mappings_files"] = subjob_bam_ids
        subjob_kwargs["bam_index_files"] = subjob_bam_idx_ids

        print "Updating bam config file for subjob"
        if bam_config_fn:
            new_config_fn = "subjob_config_" + str(subjob_no) + '.txt'
            with open(bam_config_fn,
                      'r') as config_fh, open(new_config_fn, 'w') as write_fh:
                for line in config_fh:
                    line = line.split('\t')
                    bam_name = line[0]
                    out_fn = bam_name.rstrip('.bam') + '_' + str(
                        subjob_no) + '.bam'
                    write_fh.write(out_fn + '\t' + "\t".join(line[1:]) + '\n')

            print "Uploading new config file: " + str(new_config_fn)
            subjob_kwargs["bam_config_file"] = dxpy.dxlink(
                dxpy.upload_local_file(new_config_fn))

        job = dxpy.new_dxjob(subjob_kwargs, "process")
        print "Started subjob #{n}: {job_id}".format(n=subjob_no,
                                                     job_id=job.get_id())
        subjobs.append(job)
        subjob_no += 1

    return subjobs
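
SplitGenomeFromSam() is called above but not shown. A minimal sketch of one way it could work, grouping reference sequences from the SAM header into roughly equal region groups (purely illustrative, not the applet's actual implementation):

def SplitGenomeFromSam(header, num_groups):
    # Pull reference sequence names from the @SQ lines of the SAM header and
    # distribute them round-robin into num_groups groups of region strings.
    chroms = [field[3:] for line in header if line.startswith("@SQ")
              for field in line.split("\t") if field.startswith("SN:")]
    groups = [[] for _ in range(num_groups)]
    for i, chrom in enumerate(chroms):
        groups[i % num_groups].append(chrom)
    return [group for group in groups if group]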
def main(bam_set, map_report_set, dme_ix, uncompress_bam=True):

    # tool_versions.py --applet $script_name --appver $script_ver
    props = {}
    if os.path.isfile('/usr/bin/tool_versions.py'): 
        sw_versions = subprocess.check_output(['tool_versions.py', '--dxjson', 'dnanexus-executable.json'])
        props["SW"] = sw_versions
    
    print "* Value of bam_set:        '" + str(bam_set) + "'"
    print "* Value of map_report_set: '" + str(map_report_set) + "'"
    print "* Value of dme_ix:         '" + str(dme_ix) + "'"
    print "* Value of uncompress_bam: '" + str(uncompress_bam) + "'"

    print "* Calling merge_extract_full()..."
    inp = {
        'bam_set':        bam_set,
        'map_report_set': map_report_set, 
        'dme_ix_dxlink':  dme_ix,
        'uncompress_bam': uncompress_bam,
        'props':          props
    }
    extract_job = dxpy.new_dxjob(inp, "merge_extract_full")
    print "* Kicked off extract() and waiting..."
    extract_job.wait_on_done() # Wait because we want the qc_metrics to pass to other jobs.
    extract_out = extract_job.describe()['output']
    target_root = extract_out['target_root']
    qc_metrics = extract_out['qc_metrics']


    print "* Calling bedmethyl()..."
    # What is cheaper?  bedmethyl and signal in main or farm one out to a separate process?
    bedmethyl_out = bedmethyl_io(extract_out["cx_report_dxlink"], extract_out["chrom_sizes_dxlink"], target_root, qc_metrics, props)
    #inp = {
    #    'cx_report_dxlink':   extract_out["cx_report_dxlink"],
    #    'chrom_sizes_dxlink': extract_out["chrom_sizes_dxlink"],
    #    'target_root':        target_root,
    #    'qc_metrics':         qc_metrics,
    #    'props':              props
    #}
    #bedmethyl_job = dxpy.new_dxjob(inp, "bedmethyl_io")
    #print "* Kicked off bedmethyl() but not waiting waiting..."

    print "* Calling signal()..."
    signal_out = signal_io(extract_out["bedgraph_gz_dxlink"],extract_out["chrom_sizes_dxlink"],target_root,qc_metrics,props)

    print "* Check storage..."
    run_cmd('ls -l')
    run_cmd('df -k .')

    #bedmethyl_job.wait_on_done() # Wait because we want the qc_metrics to pass to other jobs.
    #bedmethyl_out = bedmethyl_job.describe()['output']
    print "* Finished."

    return {
        # from extract() 
        #"bam_biorep":    extract_out['biorep_bam_dxlink'], 
        "bam_biorep_qc": extract_out['biorep_bam_qc_dxlink'], 
        "map_biorep":    extract_out['biorep_map_dxlink'],
        "mbias_report":  extract_out["mbias_report_dxlink"],
                
        # from signal() 
        "signal": signal_out["bigWig_dxlink"],
        
        # from bedmethyl() 
        "CpG_bed": bedmethyl_out["CpG_bed_dxlink"],
        "CHG_bed": bedmethyl_out["CHG_bed_dxlink"],
        "CHH_bed": bedmethyl_out["CHH_bed_dxlink"],
        "CpG_bb":  bedmethyl_out["CpG_bb_dxlink"],
        "CHG_bb":  bedmethyl_out["CHG_bb_dxlink"],
        "CHH_bb":  bedmethyl_out["CHH_bb_dxlink"],

        "metadata": json.dumps(qc_metrics) 
        }
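
run_cmd() above (like run_shell() in the earlier examples) is not shown; presumably it is a thin wrapper around the shell along these lines (a sketch, not the applet's actual helper):

import subprocess

def run_cmd(cmd):
    # Echo the command into the job log, run it through a shell, and raise
    # (failing the job) on a non-zero exit status.
    print("* Running: " + cmd)
    subprocess.check_call(cmd, shell=True)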
Example #35
def RunWithBamInput(kwargs):
    mappings_ids = kwargs["mappings_files"]
    mappings_names = sorted([dxpy.describe(id)["name"] for id in mappings_ids])
    num_threads = kwargs["num_threads_per_instance"]
    bam_config_fn = "bam_config.txt"

    if "bam_config_file" in kwargs:
        print "\nInput has a BAM config file. Need to download and validate bam config file"
        dxpy.download_dxfile(kwargs["bam_config_file"], bam_config_fn)
        ValidateBamConfig(bam_config_fn=bam_config_fn,
                          bam_name_array=mappings_names)
    else:
        if "insert_size" not in kwargs:
            raise dxpy.AppError(
                "Input files are bam files but neither a bam configuration file, nor an insert size was given as an app input."
            )
        if kwargs["bam_not_produced_by_bwa"]:
            return RunWithPindelInput(kwargs, sam2pindel=True)
        else:
            bam_config_fn = WriteConfigFile(mappings_names=mappings_names,
                                            fn=bam_config_fn,
                                            insert_size=kwargs["insert_size"])

    need_to_index = True
    if "bam_index_files" in kwargs:
        bam_idx_ids = kwargs["bam_index_files"]
        idx_names = sorted([dxpy.describe(id)["name"] for id in bam_idx_ids])
        if CheckBamIdxMatch(bam_names=mappings_names, idx_names=idx_names):
            need_to_index = False
            mappings_names = DownloadFilesFromArray(mappings_ids)
            bam_idx_names = DownloadFilesFromArray(bam_idx_ids)

    if need_to_index:
        mappings_names = DownloadFilesFromArray(mappings_ids)
        if not kwargs["assume_sorted"]:
            mappings_names = SortBams(bam_names=mappings_names,
                                      num_threads=num_threads)
        mappings_names, bam_idx_names = IndexBams(mappings_names)

    chrom = kwargs["chromosome"] if "chromosome" in kwargs else "ALL"

    if "chromosome" in kwargs or kwargs["num_instances"] == 1:
        command, output_path = BuildPindelCommand(kwargs=kwargs,
                                                  chrom=chrom,
                                                  input_fn=bam_config_fn,
                                                  is_pindel_input_type=False)
        output_path = RunPindel(kwargs=kwargs,
                                pindel_command=command,
                                output_path=output_path)
        app_outputs = UploadPindelOutputs(kwargs=kwargs,
                                          output_path=output_path)
        if kwargs["export_vcf"]:
            app_outputs["vcf"] = ExportVCF(kwargs=kwargs,
                                           output_path=output_path,
                                           ref_fn="reference_fasta")

    else:
        subjob_ids = SplitBamForSubjobs(kwargs, mappings_names, bam_config_fn)
        postprocess_inputs = {
            "subjob_outputs":
            [job.get_output_ref("subjob_output") for job in subjob_ids],
            "kwargs":
            kwargs
        }
        postprocess_job = dxpy.new_dxjob(fn_input=postprocess_inputs,
                                         fn_name="postprocess")

        app_outputs = {
            "deletions": {
                "job": postprocess_job.get_id(),
                "field": "deletions"
            },
            "short_inserts": {
                "job": postprocess_job.get_id(),
                "field": "short_inserts"
            },
            "tandem_duplications": {
                "job": postprocess_job.get_id(),
                "field": "tandem_duplications"
            },
            "large_inserts": {
                "job": postprocess_job.get_id(),
                "field": "large_inserts"
            },
            "inversions": {
                "job": postprocess_job.get_id(),
                "field": "inversions"
            },
            "breakpoints": {
                "job": postprocess_job.get_id(),
                "field": "breakpoints"
            }
        }
        if kwargs["report_close_mapped_reads"] or kwargs[
                "report_only_close_mapped_reads"]:
            app_outputs["close_mapped_reads"] = {
                "job": postprocess_job.get_id(),
                "field": "close_mapped_reads"
            }
        if kwargs["export_vcf"]:
            app_outputs["vcf"] = {
                "job": postprocess_job.get_id(),
                "field": "vcf"
            }
        #if "breakdancer_calls_file" in kwargs:
        #    app_outputs["breakdancer_outputs"] = {"job": postprocess_job.get_id(), "field": "breakdancer_outputs"}

    dxlinks = []
    if need_to_index:
        if not kwargs["assume_sorted"]:
            for bam in mappings_names:
                uploaded_bam = dxpy.upload_local_file(bam,
                                                      name=bam.rstrip('.bam') +
                                                      "_sorted.bam")
                dxlinks.append(dxpy.dxlink(uploaded_bam))
        for idx in bam_idx_names:
            uploaded_idx = dxpy.upload_local_file(idx,
                                                  name=idx.rstrip('.bam.bai') +
                                                  "_sorted.bam.bai")
            dxlinks.append(dxpy.dxlink(uploaded_idx))
        app_outputs["sortedbam_and_index_files"] = dxlinks

    return app_outputs
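
The "postprocess" entry point launched above receives one "subjob_output" reference per region subjob. A sketch of how such a gather step could look, assuming each element resolves to a dict of file links keyed by output type; only the "deletions" output is merged here for brevity:

import dxpy

@dxpy.entry_point("postprocess")
def postprocess(subjob_outputs, kwargs):
    # Concatenate the per-region "deletions" files and upload the merged result;
    # the real applet would handle every Pindel output type the same way.
    merged_fn = "all_deletions.txt"
    with open(merged_fn, "w") as out_fh:
        for i, subjob_output in enumerate(subjob_outputs):
            local_fn = "deletions_%d.txt" % i
            dxpy.download_dxfile(subjob_output["deletions"], local_fn)
            with open(local_fn) as in_fh:
                out_fh.write(in_fh.read())
    return {"deletions": dxpy.dxlink(dxpy.upload_local_file(merged_fn))}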
Example #36
def main(files):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    #files = [dxpy.DXFile(item) for item in files]

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.

    #for i, f in enumerate(files):
    #    dxpy.download_dxfile(f.get_id(), "files-" + str(i))

    # Split your work into parallel tasks.  As an example, the
    # following generates 10 subjobs running with the same dummy
    # input.

    subjobs = []
    for fastq in files:
        subjob_input = {"fastq": fastq}
        subjobs.append(dxpy.new_dxjob(subjob_input, "process"))

    # The following line creates the job that will perform the
    # "postprocess" step of your app.  We've given it an input field
    # that is a list of job-based object references created from the
    # "process" jobs we just created.  Assuming those jobs have an
    # output field called "output", these values will be passed to the
    # "postprocess" job.  Because these values are not ready until the
    # "process" jobs finish, the "postprocess" job WILL NOT RUN until
    # all job-based object references have been resolved (i.e. the
    # jobs they reference have finished running).
    #
    # If you do not plan to have the "process" jobs create output that
    # the "postprocess" job will require, then you can explicitly list
    # the dependencies to wait for those jobs to finish by setting the
    # "depends_on" field to the list of subjobs to wait for (it
    # accepts either dxpy handlers or string IDs in the list).  We've
    # included this parameter in the line below as well for
    # completeness, though it is unnecessary if you are providing
    # job-based object references in the input that refer to the same
    # set of jobs.
    '''
    postprocess_job = dxpy.new_dxjob(fn_input={
                "report": [subjob.get_output_ref("report") for subjob in subjobs],
                "summary": [subjob.get_output_ref("summary") for subjob in subjobs],
                "zips": [subjob.get_output_ref("zips") for subjob in subjobs],
                },
                fn_name="postprocess",
                depends_on=subjobs)
    '''
    # If you would like to include any of the output fields from the
    # postprocess_job as the output of your app, you should return it
    # here using a job-based object reference.  If the output field in
    # the postprocess function is called "answer", you can pass that
    # on here as follows:
    #
    #return { "FastQC_reports": [ dxpy.dxlink(item) for item in postprocess_job.get_output_ref("report") ]}
    #
    # Tip: you can include in your output at this point any open
    # objects (such as gtables) which will be closed by a job that
    # finishes later.  The system will check to make sure that the
    # output object is closed and will attempt to clone it out as
    # output into the parent container only after all subjobs have
    # finished.

    output = {
        "reports": [subjob.get_output_ref("report") for subjob in subjobs],
        "summaries": [subjob.get_output_ref("summary") for subjob in subjobs],
        "zips": [subjob.get_output_ref("zip") for subjob in subjobs],
    }
    '''
    for job in postprocess_job.get_output_ref("reports"):
        item = dxpy.dxlink(job)
        output['FastQC_reports'].append(item['report'])
        output['FastQC_zip'].append(item['zip'])
        output['FastQC_summary'].append(item['summary'])
    '''

    #    output["FastQC_reports"] = [ dxpy.dxlink(item)  for item in FastQC_reports]
    #    output["FastQC_reports"] = FastQC_reports
    #    output["FastQC_zip"] = FastQC_zip
    #    output["FastQC_summary"] = FastQC_summary

    return output
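
A sketch of the per-file "process" entry point assumed by the loop above, returning the "report", "summary" and "zip" fields that main() collects. The FastQC invocation and the extracted file layout are assumptions:

import subprocess
import dxpy

@dxpy.entry_point("process")
def process(fastq):
    # Download one FASTQ, run FastQC on it, and upload the resulting files.
    fastq_file = dxpy.DXFile(fastq)
    local_fn = fastq_file.name
    dxpy.download_dxfile(fastq_file.get_id(), local_fn)

    subprocess.check_call(["fastqc", "--extract", local_fn])

    # FastQC names its outputs after the input with known extensions stripped.
    base = local_fn.replace(".fastq.gz", "").replace(".fastq", "") + "_fastqc"
    return {
        "report": dxpy.dxlink(dxpy.upload_local_file(base + "/fastqc_report.html")),
        "summary": dxpy.dxlink(dxpy.upload_local_file(base + "/summary.txt")),
        "zip": dxpy.dxlink(dxpy.upload_local_file(base + ".zip")),
    }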
Example #37
def main(fastq_gz_left_reads,
         fastq_gz_right_reads,
         indexed_reference,
         reads_per_chunk=25000000,
         aln_params="",
         sampe_params="-r '@RG\tID:1\tPL:ILLUMINA\tPU:None\tLB:1\tSM:1'"):

    picard_merge = applet("picard_merge_sam_files")
    if picard_merge == None:
        raise dxpy.AppError(
            "unable to find applet called 'picard_merge_sam_files'.  Please copy into your project from the collection of developer applets"
        )

    splitter = applet("fastq_splitter")
    if splitter == None:
        raise dxpy.AppError(
            "unable to find applet called 'fastq_splitter'.  Please copy into your project from the collection of developer applets"
        )

    bwa_aligner = applet("bwa_aligner")
    if bwa_aligner == None:
        raise dxpy.AppError(
            "unable to find applet called 'bwa_aligner'.  Please copy into your project from the collection of developer applets"
        )

    bwa_controller_input = {
        "left_reads": [],
        "right_reads": [],
        "indexed_reference": indexed_reference,
        "aln_params": aln_params,
        "sampe_params": sampe_params,
        "bwa_aligner": bwa_aligner.get_id()
    }
    bwa_subjobs = []
    for x, y in zip(fastq_gz_left_reads, fastq_gz_right_reads):
        left_job = splitter.run({
            "fastqgz": x,
            "reads_per_chunk": reads_per_chunk
        })
        right_job = splitter.run({
            "fastqgz": y,
            "reads_per_chunk": reads_per_chunk
        })
        bwa_controller_input["left_reads"].append(left_job.get_id())
        bwa_controller_input["right_reads"].append(right_job.get_id())
        bwa_subjobs.extend([left_job, right_job])

    bwa_controller_job = dxpy.new_dxjob(fn_input=bwa_controller_input,
                                        fn_name='bwa_controller',
                                        depends_on=bwa_subjobs)

    picard_merge_job = picard_merge.run(
        {"BAMs": {
            "job": bwa_controller_job.get_id(),
            "field": "BAMs"
        }})

    print picard_merge_job.get_id()

    output = {"BAM": {"job": picard_merge_job.get_id(), "field": "BAM"}}

    return output
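
The "bwa_controller" entry point is launched above with depends_on=bwa_subjobs, so by the time it runs the splitter jobs have finished and their outputs can be read directly. A rough sketch follows, in which the splitter output field "fastqgz_chunks" and the bwa_aligner input/output names are assumptions:

import dxpy

@dxpy.entry_point("bwa_controller")
def bwa_controller(left_reads, right_reads, indexed_reference, aln_params,
                   sampe_params, bwa_aligner):
    # Pair up the chunk outputs of each splitter job and run the bwa_aligner
    # applet on every left/right chunk pair, collecting references to the BAMs.
    aligner = dxpy.DXApplet(bwa_aligner)
    bams = []
    for left_job_id, right_job_id in zip(left_reads, right_reads):
        left_chunks = dxpy.DXJob(left_job_id).describe()["output"]["fastqgz_chunks"]
        right_chunks = dxpy.DXJob(right_job_id).describe()["output"]["fastqgz_chunks"]
        for left_chunk, right_chunk in zip(left_chunks, right_chunks):
            job = aligner.run({"left_reads": left_chunk,
                               "right_reads": right_chunk,
                               "indexed_reference": indexed_reference,
                               "aln_params": aln_params,
                               "sampe_params": sampe_params})
            bams.append(job.get_output_ref("BAM"))
    return {"BAMs": bams}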
Example #38
def main(bam_set, map_report_set, dme_ix, uncompress_bam=True):

    # tool_versions.py --applet $script_name --appver $script_ver
    props = {}
    if os.path.isfile('/usr/bin/tool_versions.py'): 
        sw_versions = subprocess.check_output(['tool_versions.py', '--dxjson', 'dnanexus-executable.json'])
        props["SW"] = sw_versions
    
    print "* Value of bam_set:        '" + str(bam_set) + "'"
    print "* Value of map_report_set: '" + str(map_report_set) + "'"
    print "* Value of dme_ix:         '" + str(dme_ix) + "'"
    print "* Value of uncompress_bam: '" + str(uncompress_bam) + "'"

    print "* Calling merge_extract()..."
    inp = {
        'bam_set':        bam_set,
        'map_report_set': map_report_set, 
        'dme_ix_dxlink':  dme_ix,
        'uncompress_bam': uncompress_bam,
        'props':          props
    }
    extract_job = dxpy.new_dxjob(inp, "merge_extract")
    print "* Kicked off extract() and waiting..."
    extract_job.wait_on_done() # Wait because we want the qc_metrics to pass to other jobs.
    extract_out = extract_job.describe()['output']
    target_root = extract_out['target_root']
    qc_metrics = extract_out['qc_metrics']

    print "* Calling coverage()..."
    inp = {
        'CpG_context_dxlink': extract_out["CpG_context_dxlink"],
        'CHG_context_dxlink': extract_out["CHG_context_dxlink"],
        'CHH_context_dxlink': extract_out["CHH_context_dxlink"],
        'dme_ix_dxlink':      dme_ix,
        'target_root':        target_root,
        #'qc_metrics':        extract_job.get_output_ref("qc_metrics"),
        #'props':             props
    }
    coverage_job = dxpy.new_dxjob(inp, "coverage")
    print "* Kicked off coverage() and waiting..."

    print "* Calling bedmethyl()..."
    inp = {
        'cx_report_dxlink': coverage_job.get_output_ref("cx_report_dxlink"),
        #'cx_report_dxlink': extract_job.get_output_ref("cx_report_dxlink"),
        'chrom_sizes_dxlink': extract_out["chrom_sizes_dxlink"],
        'target_root': target_root,
        'qc_metrics': extract_out["qc_metrics"],
        'props': props,
    }
    bedmethyl_job = dxpy.new_dxjob(inp, "bedmethyl_io")
    print "* Kicked off bedmethyl() but not waiting waiting..."

    coverage_job.wait_on_done() # Already finished by this point
    coverage_out = coverage_job.describe()['output']

    print "* Calling signal()..."
    # No need for a separate instance unless storage is limited or one can use an instance cheaper than mem3_hdd2_x8!
    signal_out = signal_io(coverage_out["bedgraph_gz_dxlink"],extract_out["chrom_sizes_dxlink"],target_root,qc_metrics,props)

    print "* Check storage..."
    run_cmd('ls -l')
    run_cmd('df -k .')

    print "* Finished."

    return {
        # from extract() 
        #"bam_biorep":    extract_out['biorep_bam_dxlink'], 
        "bam_biorep_qc": extract_out['biorep_bam_qc_dxlink'], 
        "map_biorep":    extract_out['biorep_map_dxlink'],
        "mbias_report":  extract_out["mbias_report_dxlink"],
                
        # from signal() 
        "signal": signal_out["bigWig_dxlink"],
        
        # from bedmethyl() 
        "CpG_bed": bedmethyl_job.get_output_ref("CpG_bed_dxlink"),
        "CHG_bed": bedmethyl_job.get_output_ref("CHG_bed_dxlink"),
        "CHH_bed": bedmethyl_job.get_output_ref("CHH_bed_dxlink"),
        "CpG_bb":  bedmethyl_job.get_output_ref("CpG_bb_dxlink"),
        "CHG_bb":  bedmethyl_job.get_output_ref("CHG_bb_dxlink"),
        "CHH_bb":  bedmethyl_job.get_output_ref("CHH_bb_dxlink"),

        "metadata": json.dumps(qc_metrics) 
        }
def main(reads, dme_ix, ncpus, splitsize):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    #dx_reads = [dxpy.DXFile(item) for item in reads]

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.



    # We first create the "scatter" job which will scatter some input
    # (replace with your own input as necessary).
    logger.info("* Start Scatter with %d files %sM read splits *" % (len(reads), splitsize))

    scatter_job = dxpy.new_dxjob(fn_input={
                                 'orig_reads': reads,
                                 'split_size': splitsize,
                                 },
                                 fn_name="scatter")

    # We will want to call "process" on each output of "scatter", so
    # we call the "map" entry point to do so.  We can also provide
    # here additional input that we want each "process" entry point to
    # receive, e.g. a GTable ID to which the "process" function should
    # add rows of data.

    reads_root = simplify_name() or strip_extensions(dxpy.describe(reads[0])['name'], STRIP_EXTENSIONS)

    map_input = {
        "array_of_scattered_input": scatter_job.get_output_ref("array_of_scattered_input"),
        "process_input": {
            "reads_root": reads_root,
            "ncpus": ncpus,
            "dme_ix": dme_ix
            }
        }
    logger.info("* Start Map with: %s *" % map_input)
    map_job = dxpy.new_dxjob(fn_input=map_input, fn_name="map")

    # Finally, we want the "postprocess" job to run after "map" is
    # done calling "process" on each of its inputs.  Note that a job
    # is marked as "done" only after all of its child jobs are also
    # marked "done".
    logger.info("* Waiting for map job to finish...")
    postprocess_input = {
        "bam_files": map_job.get_output_ref("bam_files"),
        "report_files": map_job.get_output_ref("report_files"),
        "bam_root": reads_root + '_techrep'
        }
    logger.info("* Start Post process with: %s *" % postprocess_input)
    postprocess_job = dxpy.new_dxjob(fn_input=postprocess_input,
                                     fn_name="postprocess",
                                     depends_on=[map_job])

    # If you would like to include any of the output fields from the
    # postprocess_job as the output of your app, you should return it
    # here using a job-based object reference.
    #
    # return { "app_output_field": postprocess_job.get_output_ref("final_output"), ...}
    #
    # Tip: you can include in your output at this point any open
    # objects (such as gtables) which will be closed by a job that
    # finishes later.  The system will check to make sure that the
    # output object is closed and will attempt to clone it out as
    # output into the parent container only after all subjobs have
    # finished.

    output = {}
    output["bam_techrep"] = dxpy.dxlink(postprocess_job.get_output_ref("bam_techrep"))
    output["bam_techrep_qc"] = dxpy.dxlink(postprocess_job.get_output_ref("bam_techrep_qc"))
    output["map_techrep"] = dxpy.dxlink(postprocess_job.get_output_ref("map_techrep"))
    output["reads"] = postprocess_job.get_output_ref("reads")
    output["metadata"] = postprocess_job.get_output_ref("metadata")

    return output
Example #40
def main(case_bams=None, normal_bams=None, snv_vcfs=None, cn_reference=None,
         baits=None, fasta=None, annotation=None,
         seq_method='hybrid', segment_method='cbs',
         haploid_x_reference=False, drop_low_coverage=False,
         exclude_access=None, antitarget_avg_size=None, target_avg_size=None,
         purity=None, ploidy=None, do_parallel=True):
    cnvkit("version")

    # Validate inputs
    # (from cnvlib.commands._cmd_batch)
    if cn_reference:
        bad_flags = [flag
                     for is_used, flag in (
                         (normal_bams is not None,  'normal_bams'),
                         (fasta,                    'fasta'),
                         (baits,                    'baits'),
                         (annotation,               'annotation'),
                         (exclude_access,           'exclude_access'),
                         (target_avg_size,          'target_avg_size'),
                         (antitarget_avg_size,      'antitarget_avg_size'),
                         )
                     if is_used]
        if bad_flags:
            raise dxpy.AppError(
                    "If 'cn_reference' is given, options to construct a new "
                    "reference (%s) should not be used:" % ", ".join(bad_flags))
    else:
        if not fasta:
            raise dxpy.AppError(
                    "Input 'fasta' must be given with the reference genome "
                    "sequence if an existing copy number reference profile "
                    "('cn_reference') is not given.")

        if seq_method in ('hybrid', 'amplicon') and not baits:
            raise dxpy.AppError(
                    "For the '%r' sequencing method, input 'baits' (at least) "
                    "must be given with the captured genomic regions if an "
                    "existing copy number reference profile ('cn_reference') "
                    "is not given." % baits)
    if case_bams:
        purities = validate_per_tumor(purity, len(case_bams), "purity values",
                                      lambda p: 0 < p <= 1)
        ploidies = validate_per_tumor(ploidy, len(case_bams), "ploidy values",
                                      lambda p: p > 0)
        snv_vcfs = validate_per_tumor(snv_vcfs, len(case_bams), "VCF files")
    else:
        purities = ploidies = None

    # If reference is not given, create one
    if not cn_reference:
        print("** About to call 'make_region_beds'")  # DBG
        targets, antitargets = make_region_beds(
                normal_bams, seq_method, fasta, baits, annotation,
                exclude_access, antitarget_avg_size, target_avg_size)
        print("** Finished calling 'make_region_beds'")  # DBG
        normal_cvgs = []
        if normal_bams:
            # 'coverage' of each normal bam in a subjob
            for nbam in normal_bams:
                print("** About to launch 'run_coverage'")  # DBG
                job_cvg = dxpy.new_dxjob(fn_name='run_coverage',
                        fn_input={
                            'bam': nbam,
                            'targets': targets,
                            'antitargets': antitargets,
                            'do_parallel': do_parallel,
                            })
                normal_cvgs.append(job_cvg.get_output_ref('coverages'))
                print("** Got output ref from 'run_coverage'")  # DBG
        print("** About to launch 'run_reference'")  # DBG
        job_ref = dxpy.new_dxjob(fn_name='run_reference',
                fn_input={'coverages': normal_cvgs,
                          'fasta': fasta,
                          'targets': targets,
                          'antitargets': (antitargets
                              if seq_method == 'hybrid' else None),
                          'haploid_x_reference': haploid_x_reference,
                          })
        cn_reference = job_ref.get_output_ref('cn_reference')
        print("** Got output ref from 'run_reference'")  # DBG
    output = {'cn_reference': cn_reference, 'copy_ratios': [],
              'copy_segments': [], 'call_segments': [], 'genemetrics': [],
              'cnv_beds': [], 'cnv_vcfs': [], 'scatters_png': [],
    }

    # Process each test/case/tumor individually using the given/built reference
    if case_bams:
        print("** About to process", len(case_bams), "'case_bams'")  # DBG
        for sample_bam, vcf, purity, ploidy in \
                zip(case_bams, snv_vcfs, purities, ploidies):
            print("** About to launch 'run_sample'")  # DBG
            job_sample = dxpy.new_dxjob(fn_name='run_sample',
                    fn_input={
                        'sample_bam': sample_bam,
                        'vcf': vcf,
                        'purity': purity,
                        'ploidy': ploidy,
                        'cn_reference': cn_reference,
                        'seq_method': seq_method,
                        'segment_method': segment_method,
                        'drop_low_coverage': drop_low_coverage,
                        'haploid_x_reference': haploid_x_reference,
                        'do_parallel': do_parallel,
                        })
            for field in ('copy_ratios', 'copy_segments', 'call_segments',
                          'genemetrics'):
                output[field].append(job_sample.get_output_ref(field))
            output['scatters_png'].append(job_sample.get_output_ref('scatter'))
            output['cnv_beds'].append(job_sample.get_output_ref('bed'))
            output['cnv_vcfs'].append(job_sample.get_output_ref('vcf'))
            print("** Got outputs from 'run_sample'")  # DBG

        # Consolidate multi-sample outputs
        print("** About to launch 'aggregate_outputs'")  # DBG
        job_agg = dxpy.new_dxjob(fn_name='aggregate_outputs',
                fn_input={'copy_ratios': output['copy_ratios'],
                          'copy_segments': output['copy_segments'],
                          'haploid_x_reference': haploid_x_reference})
        for field in ('seg', 'heatmap_pdf', 'metrics', 'sexes'):
            output[field] = job_agg.get_output_ref(field)
        print("** Got outputs from 'aggregate_outputs'")  # DBG

    print("** All done! Returning output:")
    from pprint import pprint
    pprint(output)
    return output
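
validate_per_tumor() is called above but not defined in this excerpt. A plausible sketch that matches the call sites (broadcast a scalar or check a per-tumor list, with an optional validity test); the exact behaviour is an assumption:

import dxpy

def validate_per_tumor(values, n_cases, label, is_valid=None):
    # Accept None, a single value, or one value per case BAM; optionally check
    # each non-None value with is_valid. Always returns a list of length n_cases.
    if values is None:
        return [None] * n_cases
    if not isinstance(values, (list, tuple)):
        values = [values] * n_cases
    if len(values) != n_cases:
        raise dxpy.AppError("Expected %d %s, got %d"
                            % (n_cases, label, len(values)))
    if is_valid is not None:
        for value in values:
            if value is not None and not is_valid(value):
                raise dxpy.AppError("Invalid value among %s: %r" % (label, value))
    return list(values)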
Example #41
def main(fastq_files,
         sample_name,
         output_project,
         output_folder,
         properties={},
         aligner=None,
         genome_fasta_file=None,
         fastq_files2=None,
         bam_file=None):
    """Run the various QC programs and output the report files that they
    produce."""

    output = {}
    json_outputs = []
    tools_used = []

    # Run fastqc
    fastqc_jobs = []
    fastqc_input = {
        "fastq_files": fastq_files,
        "properties": properties,
        "output_project": output_project,
        "output_folder": output_folder
    }
    if not fastq_files2:
        fastqc_input["output_name"] = sample_name + "_fastqc.zip"
    else:
        fastqc_input["output_name"] = sample_name + "_fastqc_left.zip"
    fastqc_jobs.append(dxpy.new_dxjob(fastqc_input, "run_fastqc"))

    if fastq_files2:
        fastqc_input2 = {
            "fastq_files": fastq_files2,
            "output_name": sample_name + "_fastqc_right.zip",
            "properties": properties,
            "output_project": output_project,
            "output_folder": output_folder
        }
        fastqc_jobs.append(dxpy.new_dxjob(fastqc_input2, "run_fastqc"))

    output["fastqc_reports"] = [
        job.get_output_ref("fastqc_report") for job in fastqc_jobs
    ]
    tools_used += [job.get_output_ref("tools_used") for job in fastqc_jobs]

    # These tools require a bam file.
    if (bam_file is not None) and (genome_fasta_file is not None):
        # Run CollectAlignmentSummaryMetrics
        casm_input = {
            "bam_file": bam_file,
            "genome_fasta_file": genome_fasta_file,
            "sample_name": sample_name,
            "properties": properties,
            "output_project": output_project,
            "output_folder": output_folder
        }
        casm_job = dxpy.new_dxjob(casm_input,
                                  "collect_alignment_summary_metrics")
        output["alignment_summary_metrics"] = casm_job.get_output_ref(
            "alignment_summary_metrics")
        json_outputs += [
            casm_job.get_output_ref("json_alignment_summary_metrics")
        ]
        tools_used += [casm_job.get_output_ref("tools_used")]

    if (bam_file is not None) and (aligner is not None):
        # Run Collect Uniqueness Metrics
        uniqueness_input = {
            "bam_file": bam_file,
            "aligner": aligner
            #"output_project": output_project,
            #"output_folder": output_folder
        }
        uniqueness_job = dxpy.new_dxjob(uniqueness_input,
                                        "collect_uniqueness_metrics")
        json_outputs += [
            uniqueness_job.get_output_ref("json_uniqueness_metrics")
        ]
        tools_used += [uniqueness_job.get_output_ref("tools_used")]

        # Run Calc Mismatch Per Cycle Stats
        mismatch_per_cycle_input = {
            "bam_file": bam_file,
            "aligner": aligner,
            "output_project": output_project,
            "output_folder": output_folder
        }
        mismatch_per_cycle_job = dxpy.new_dxjob(
            mismatch_per_cycle_input, 'calc_mismatch_per_cycle_stats')
        output['mismatch_metrics'] = mismatch_per_cycle_job.get_output_ref(
            'mismatch_per_cycle_stats')
        tools_used += [mismatch_per_cycle_job.get_output_ref('tools_used')]

    # If paired-end reads, run CollectInsertSizeMetrics
    if (bam_file is not None) and (fastq_files2
                                   is not None) and (genome_fasta_file
                                                     is not None):
        cism_input = {
            "bam_file": bam_file,
            "genome_fasta_file": genome_fasta_file,
            "sample_name": sample_name,
            "properties": properties,
            "output_project": output_project,
            "output_folder": output_folder
        }
        cism_job = dxpy.new_dxjob(cism_input, "collect_insert_size_metrics")

        output["insert_size_metrics"] = cism_job.get_output_ref(
            "insert_size_metrics")
        json_outputs += [cism_job.get_output_ref("json_insert_size_metrics")]
        tools_used += [cism_job.get_output_ref("tools_used")]

    produce_qc_report_input = {
        "individual_json_outputs": json_outputs,
        "sample_name": sample_name,
        "output_project": output_project,
        "output_folder": output_folder
    }
    produce_qc_report_job = dxpy.new_dxjob(produce_qc_report_input,
                                           "produce_qc_report")
    output['json_output_file'] = produce_qc_report_job.get_output_ref(
        "combined_json_file")

    tools_used_input = {
        "tools_used": tools_used,
        "output_project": output_project,
        "output_folder": output_folder
    }
    tools_used_job = dxpy.new_dxjob(tools_used_input,
                                    "create_tools_used_json_file")
    output['tools_used'] = tools_used_job.get_output_ref(
        'tools_used_json_file')

    print 'QC sample output: %s' % output
    return output
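
A sketch of the "produce_qc_report" entry point used above, assuming each element of individual_json_outputs resolves to a link to a small JSON fragment; the merged report name and upload destination follow the inputs passed in above.

import json
import dxpy

@dxpy.entry_point("produce_qc_report")
def produce_qc_report(individual_json_outputs, sample_name,
                      output_project, output_folder):
    # Merge the per-tool JSON fragments into one dict and upload it as
    # <sample_name>_qc_report.json into the requested project and folder.
    combined = {}
    for i, json_link in enumerate(individual_json_outputs):
        local_fn = "fragment_%d.json" % i
        dxpy.download_dxfile(json_link, local_fn)
        with open(local_fn) as fh:
            combined.update(json.load(fh))

    out_fn = "%s_qc_report.json" % sample_name
    with open(out_fn, "w") as fh:
        json.dump(combined, fh)
    combined_file = dxpy.upload_local_file(out_fn, project=output_project,
                                           folder=output_folder)
    return {"combined_json_file": dxpy.dxlink(combined_file)}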
def main(DX_APP_WIZARD_INPUT_SIGNATURE):
DX_APP_WIZARD_INITIALIZE_INPUTDX_APP_WIZARD_DOWNLOAD_ANY_FILES
    # First, create the output GTable that will contain your results.
    # NOTE: You must specify the columns and indices for a GTable when
    # you create it, and they are immutable thereafter.
    #
    # Note: If you are filtering a GTable or are otherwise happy with
    # using the same exact columns and indices as your input GTable,
    # you can easily initialize your new GTable as follows:
    #
    # DX_APP_WIZARD_||_OUTPUT = dxpy.new_dxgtable(init_from=DX_APP_WIZARD_||_INPUT)
    #
    # In the more general case, you may want to specify different
    # columns.  The following lines assume you would like to create a
    # GTable with a genomic range index, i.e. there is a string column
    # for chromosome names and two integer columns for low and high
    # coordinates.

    columns = [dxpy.DXGTable.make_column_desc("chr", "string"),
               dxpy.DXGTable.make_column_desc("lo", "int"),
               dxpy.DXGTable.make_column_desc("hi", "int"),
               dxpy.DXGTable.make_column_desc("somedata", "string")]
    DX_APP_WIZARD_||_OUTPUT = dxpy.new_dxgtable(columns=columns,
                                                          indices=[dxpy.DXGTable.genomic_range_index("chr", "lo", "hi")])

    # Split your input to be solved by the next stage of your app.
    # The following assumes you are splitting the input by giving
    # 100000 rows of a GenomicTable per subjob running the
    # "process" entry point.

    num_rows = DX_APP_WIZARD_||_INPUT.describe()["length"]

    row_chunk_size = 100000  # rows of the GTable handled per "process" subjob
    subjobs = []
    for i in range(num_rows / row_chunk_size + (0 if num_rows % row_chunk_size == 0 else 1)):
        subjob_input = { "input_gtable_id": DX_APP_WIZARD_||_INPUT.get_id(),
                         "start_row": row_chunk_size * i,
                         "end_row": min(row_chunk_size * (i + 1), num_rows),
                         "output_gtable_id": DX_APP_WIZARD_||_OUTPUT.get_id()}
        subjobs.append(dxpy.new_dxjob(subjob_input, "process"))

    # The next line creates the job that will perform the
    # "postprocess" step of your app.  It assumes that you do not need
    # to aggregate any output from your "process" stages (other than
    # closing the output GTable), but you can add the output of those
    # stages to the input of your "postprocess" stage easily by adding
    # the following value as a field in the "fn_input" dict and adding
    # the parameter to your "postprocess" entry point.
    #
    #   fn_input={"process_outputs": [subjob.get_output_ref("output") for subjob in subjobs], ...}
    #
    # With no other input other than the output GTable ID for the
    # "postprocess" stage, we will force it to run only after all the
    # "process" stages have finished running by providing the list of
    # their DXJob handlers to the "depends_on" field (it accepts
    # either dxpy handlers or string IDs in the list).

    postprocess_job = dxpy.new_dxjob(fn_input={ "output_gtable_id": DX_APP_WIZARD_||_OUTPUT.get_id() },
                                     fn_name="postprocess",
                                     depends_on=subjobs)

    # If you would like to include any of the output fields from the
    # postprocess_job as the output of your app, you should return it
    # here using a job-based object reference.  If the output field is
    # called "answer", you can pass that on here as follows:
    #
    # return {"app_output_field": postprocess_job.get_output_ref("answer"), ...}
    #
    # Tip: you can include in your output at this point any open
    # objects (such as GTables) which are closed by a job that
    # finishes later.  The system will check to make sure that the
    # output object is closed and will attempt to clone it out as
    # output into the parent container only after all subjobs have
    # finished.

    output = {}
Example #43
def main(mappings_bam, region_size, index_file=None):
    """The 'scatter' subjob of the applet

    The main function will perform logic to distribute our job
    across multiple workers (instances)

    Returns:
        output (dict): Contains key "count_file" with value DXLink to job output file.
    """
    print('Creating workspace directory to store downloaded files')
    os.mkdir(u'workspace')
    os.chdir(u'workspace')

    mappings_bam_h = dxpy.DXFile(mappings_bam)
    filename = mappings_bam_h.name
    dxpy.download_dxfile(mappings_bam_h.get_id(), filename)

    #
    # SECTION: Scatter
    # ------------------------------------------------------
    # Split regions into list of <region size> list
    #
    # Create index file if not provided by user.
    # In order to index bam file needs to be sorted already.
    #   Sort BAM if necessary.
    #   Upload dx file to pass to distributed jobs
    #
    regions = parseSAM_header_for_region(filename)
    split_regions = [regions[i:i + region_size]
                     for i in range(0, len(regions), region_size)]

    if not index_file:
        mappings_bam, index_file = create_index_file(filename, mappings_bam)

    #
    # SECTION: Processing
    # -----------------------------------------------------------------------
    # Run subjob for each distributed region.
    #
    # Note: inputs for subjobs are sent as a dictionary with key value pairs:
    #    key: "region_list"   value: [ [], [], ... ](region sections)
    #    key: "mappings_bam"   value: sorted bam
    #    key: "index_file"    value: bam bai index file
    # The dictionary keys must match the input of the subjob
    #
    # Collect outputs for downstream gather job using dxjob.get_output_ref()
    #
    # Note: Programmatically it's possible to intelligently split workload and
    #    create optimized instance types.  dxpy.new_dxjob takes the optional
    #    parameter: instance_type
    #
    print('creating subjobs')
    subjobs = [dxpy.new_dxjob(
               fn_input={"region_list": split,
                         "mappings_bam": mappings_bam,
                         "index_file": index_file},
               fn_name="samtoolscount_bam")
               for split in split_regions]

    fileDXLinks = [subjob.get_output_ref("readcount_fileDX")
                   for subjob in subjobs]

    #
    # SECTION: Gather (Post-processing)
    # -------------------------------------------------------------------------
    # Pass DNAnexus object references to post processing job to combine outputs
    #
    # Create dictionary to be returned as output for the job
    # Dictionary must contain keys matching outputs set in dxapp.json
    #
    print('combining outputs')
    postprocess_job = dxpy.new_dxjob(
        fn_input={"countDXlinks": fileDXLinks, "resultfn": filename},
        fn_name="combine_files")

    countDXLink = postprocess_job.get_output_ref("countDXLink")

    output = {}
    output["count_file"] = countDXLink

    return output
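
A sketch of the "samtoolscount_bam" entry point used above, assuming each element of region_list is a samtools region string taken from the BAM header; the output field name matches the reference collected in main(), while the counts file format is an assumption.

import subprocess
import dxpy

@dxpy.entry_point("samtoolscount_bam")
def samtoolscount_bam(region_list, mappings_bam, index_file):
    # Download the shared BAM and its index, count reads per region with
    # `samtools view -c`, and upload the per-region counts as a text file.
    bam_h = dxpy.DXFile(mappings_bam)
    bam_fn = bam_h.name
    dxpy.download_dxfile(bam_h.get_id(), bam_fn)
    dxpy.download_dxfile(index_file, bam_fn + ".bai")

    counts_fn = "region_counts.txt"
    with open(counts_fn, "w") as out_fh:
        for region in region_list:
            count = subprocess.check_output(
                ["samtools", "view", "-c", bam_fn, region]).decode().strip()
            out_fh.write("%s\t%s\n" % (region, count))

    return {"readcount_fileDX": dxpy.dxlink(dxpy.upload_local_file(counts_fn))}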
Example #44
def main(reads1, crop_length, reference_tar,
         bwa_version, bwa_aln_params, samtools_version, debug, reads2=None):

    # Main entry-point.  Parameter defaults assumed to come from dxapp.json.
    # reads1, reference_tar, reads2 are links to DNAnexus files or None

    if debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    # This spawns only one or two subjobs for single- or paired-end,
    # respectively.  It could also download the files, chunk the reads,
    # and spawn multiple subjobs.

    # Files are downloaded later by subjobs into their own filesystems
    # and uploaded to the project.

    # Initialize file handlers for input files.

    paired_end = reads2 is not None

    if crop_length == 'native':
        crop_subjob = None
        unmapped_reads = [reads1, reads2]
    else:
        crop_subjob_input = {
            "reads1_file": reads1,
            "reads2_file": reads2,
            "crop_length": crop_length,
            "debug": debug
        }
        logger.info("Crop job input: %s" % (crop_subjob_input))
        crop_subjob = dxpy.new_dxjob(crop_subjob_input, "crop")
        unmapped_reads = [crop_subjob.get_output_ref("cropped_reads1")]
        if paired_end:
            unmapped_reads.append(crop_subjob.get_output_ref("cropped_reads2"))
        else:
            unmapped_reads.append(None)

    unmapped_reads = [r for r in unmapped_reads if r]

    mapping_subjobs = []
    for reads in unmapped_reads:
        mapping_subjob_input = {
            "reads_file": reads,
            "reference_tar": reference_tar,
            "bwa_aln_params": bwa_aln_params,
            "bwa_version": bwa_version,
            "debug": debug
        }
        logger.info("Mapping job input: %s" % (mapping_subjob_input))
        if crop_subjob:
            mapping_subjobs.append(dxpy.new_dxjob(
                fn_input=mapping_subjob_input,
                fn_name="process",
                depends_on=[crop_subjob]))
        else:
            mapping_subjobs.append(dxpy.new_dxjob(
                fn_input=mapping_subjob_input,
                fn_name="process"))

    # Create the job that will perform the "postprocess" step.
    # depends_on=mapping_subjobs, so blocks on all mapping subjobs

    postprocess_job = dxpy.new_dxjob(
        fn_input={
            "indexed_reads": [
                subjob.get_output_ref("suffix_array_index")
                for subjob in mapping_subjobs],
            "unmapped_reads": unmapped_reads,
            "reference_tar": reference_tar,
            "bwa_version": bwa_version,
            "samtools_version": samtools_version,
            "debug": debug},
        fn_name="postprocess",
        depends_on=mapping_subjobs)

    mapped_reads = postprocess_job.get_output_ref("mapped_reads")
    mapping_statistics = postprocess_job.get_output_ref("mapping_statistics")
    n_mapped_reads = postprocess_job.get_output_ref("n_mapped_reads")

    output = {
        "mapped_reads": mapped_reads,
        "crop_length": crop_length,
        "mapping_statistics": mapping_statistics,
        "paired_end": paired_end,
        "n_mapped_reads": n_mapped_reads
    }
    logger.info("Exiting with output: %s" % (output))
    return output
Example #45
def main(record_link,
         worker_id,
         worker_project,
         fastqs,
         output_folder,
         mark_duplicates=False):

    output = {"bams": [], "bais": [], "tools_used": []}

    lane = FlowcellLane(record_link=record_link, fastqs=fastqs)

    fastq_files = [dxpy.DXFile(item) for item in fastqs]
    sample_dict = group_files_by_barcode(fastq_files)

    for barcode in sample_dict:
        print 'Processing sample: %s' % barcode
        read_dict = group_files_by_read(sample_dict[barcode])

        fastq_files2 = None

        if "1" in read_dict and "2" in read_dict:
            # Sample is paired; there should be no files without a 'read'
            # property of "1" or "2"
            fastq_files = [dxpy.dxlink(item) for item in read_dict["1"]]
            fastq_files2 = [dxpy.dxlink(item) for item in read_dict["2"]]
        else:
            fastq_files = [dxpy.dxlink(item) for item in read_dict["1"]]

        print("fastq_files: {}".format(fastq_files))
        print("fastq_files2: {}".format(fastq_files2))

        mapped_files_properties = {
            'barcode': barcode,
            'run_date': lane.run_date,
            'library_id': lane.library_id,
            'lane_id': lane.lane_id,
            'mapper': lane.mapper,
            'mapping_reference': lane.mapping_reference,
            'library_name': lane.library_name
        }
        print 'Initiating map sample job'
        sample_name = 'SCGPM_%s_%s_L%d_%s' % (
            lane.library_name, lane.flowcell_id, lane.lane_index, barcode)
        map_sample_job = dxpy.new_dxjob(fn_input={
            "project_id": lane.project_id,
            "output_folder": output_folder,
            "fastq_files": fastq_files,
            "fastq_files2": fastq_files2,
            "genome_fasta_file": lane.reference_genome_dxid,
            "genome_index_file": lane.reference_index_dxid,
            "mapper": lane.mapper,
            "sample_name": sample_name,
            "mark_duplicates": mark_duplicates,
            "applet_id": worker_id,
            "applet_project": worker_project,
            "properties": mapped_files_properties
        },
                                        fn_name="run_map_sample")
        output["bams"].append({"job": map_sample_job.get_id(), "field": "bam"})
        output["bais"].append({"job": map_sample_job.get_id(), "field": "bai"})
        output["tools_used"].append({
            "job": map_sample_job.get_id(),
            "field": "tools_used"
        })
    return output
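
group_files_by_barcode() and group_files_by_read() are used above but not shown. A minimal sketch of the latter, assuming each FASTQ file carries a 'read' property of "1" or "2" (the property name is inferred from the comments above):

def group_files_by_read(fastq_files):
    # Bucket DXFile handlers by their 'read' property; files without the
    # property are treated as read 1.
    read_dict = {}
    for fastq_file in fastq_files:
        read = fastq_file.get_properties().get("read", "1")
        read_dict.setdefault(read, []).append(fastq_file)
    return read_dict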
Example #46
def main(rep1_peaks, rep2_peaks, pooled_peaks):

    # Initialize the data object inputs on the platform into
    # dxpy.DXDataObject instances.

    rep1_peaks_file = dxpy.DXFile(rep1_peaks)
    rep2_peaks_file = dxpy.DXFile(rep2_peaks)

    rep1_peaks_filename = rep1_peaks_file.name
    rep2_peaks_filename = rep2_peaks_file.name

    # Download the file inputs to the local file system.

    dxpy.download_dxfile(rep1_peaks_file.get_id(), rep1_peaks_filename)
    dxpy.download_dxfile(rep2_peaks_file.get_id(), rep2_peaks_filename)

    # Find the pooler and pseudoreplicator applets
    # (assumed to be in the same project as this applet)
    pool_applet = dxpy.find_one_data_object(
        classname='applet', name='pool', zero_ok=False, more_ok=False, return_handler=True)

    pseudoreplicator_applet = dxpy.find_one_data_object(
        classname='applet', name='pseudoreplicator', zero_ok=False, more_ok=False, return_handler=True)

    # Dispatch parallel tasks.

    subjobs = []

    # True replicates

    

    # Pooled replicates

    pool_replicates_subjob = pool_applet.run({ "input1": rep1_peaks, "input2": rep2_peaks })
    subjobs.append(pool_replicates_subjob)

    # The following line creates the job that will perform the
    # "postprocess" step of your app.  We've given it an input field
    # that is a list of job-based object references created from the
    # "process" jobs we just created.  Assuming those jobs have an
    # output field called "output", these values will be passed to the
    # "postprocess" job.  Because these values are not ready until the
    # "process" jobs finish, the "postprocess" job WILL NOT RUN until
    # all job-based object references have been resolved (i.e. the
    # jobs they reference have finished running).
    #
    # If you do not plan to have the "process" jobs create output that
    # the "postprocess" job will require, then you can explicitly list
    # the dependencies to wait for those jobs to finish by setting the
    # "depends_on" field to the list of subjobs to wait for (it
    # accepts either dxpy handlers or string IDs in the list).  We've
    # included this parameter in the line below as well for
    # completeness, though it is unnecessary if you are providing
    # job-based object references in the input that refer to the same
    # set of jobs.

    postprocess_job = dxpy.new_dxjob(fn_input={ "process_outputs": [subjob.get_output_ref("pooled") for subjob in subjobs] },
                                     fn_name="postprocess",
                                     depends_on=subjobs)

    pooled_replicates = postprocess_job.get_output_ref("pooled")

    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system.  It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.

    subprocess.check_call('touch EM_fit_output',shell=True)
    subprocess.check_call('touch empirical_curves_output',shell=True)
    subprocess.check_call('touch EM_parameters_log',shell=True)
    subprocess.check_call('touch npeaks_pass',shell=True)
    subprocess.check_call('touch overlapped_peaks',shell=True)
    subprocess.check_call('touch IDR_output',shell=True)
    #subprocess.check_call('touch IDR_peaks',shell=True)

    EM_fit_output = dxpy.upload_local_file("EM_fit_output")
    empirical_curves_output = dxpy.upload_local_file("empirical_curves_output")
    EM_parameters_log = dxpy.upload_local_file("EM_parameters_log")
    npeaks_pass = dxpy.upload_local_file("npeaks_pass")
    overlapped_peaks = dxpy.upload_local_file("overlapped_peaks")
    IDR_output = dxpy.upload_local_file("IDR_output")
    #IDR_peaks = dxpy.upload_local_file("IDR_peaks")

    # If you would like to include any of the output fields from the
    # postprocess_job as the output of your app, you should return it
    # here using a job-based object reference.  If the output field in
    # the postprocess function is called "answer", you can pass that
    # on here as follows:
    #
    # return { "app_output_field": postprocess_job.get_output_ref("answer"), ...}
    #
    # Tip: you can include in your output at this point any open
    # objects (such as gtables) which will be closed by a job that
    # finishes later.  The system will check to make sure that the
    # output object is closed and will attempt to clone it out as
    # output into the parent container only after all subjobs have
    # finished.

    output = {}
    output["EM_fit_output"] = dxpy.dxlink(EM_fit_output)
    output["empirical_curves_output"] = dxpy.dxlink(empirical_curves_output)
    output["EM_parameters_log"] = dxpy.dxlink(EM_parameters_log)
    output["npeaks_pass"] = dxpy.dxlink(npeaks_pass)
    output["overlapped_peaks"] = dxpy.dxlink(overlapped_peaks)
    output["IDR_output"] = dxpy.dxlink(IDR_output)
    output["IDR_peaks"] = pooled_replicates

    logging.info("Exiting with output: %s", output)
    return output
Example #47
0
def main(mappings_bam, region_size, index_file=None):
    """The 'scatter' subjob of the applet

    The main function will perform logic to distribute our job
    across multiple workers (instances)

    Returns:
        output (dict): Contains key "count_file" with value DXLink to job output file.
    """
    print 'Creating workspace directory to store downloaded files'
    os.mkdir(u'workspace')
    os.chdir(u'workspace')

    mappings_bam_h = dxpy.DXFile(mappings_bam)
    filename = mappings_bam_h.name
    dxpy.download_dxfile(mappings_bam_h.get_id(), filename)

    #
    # SECTION: Scatter
    # ------------------------------------------------------
    # Split the regions into lists of <region size> regions each.
    #
    # Create an index file if one was not provided by the user.
    # The BAM must already be sorted before it can be indexed:
    #   Sort the BAM if necessary.
    #   Upload the dx file to pass to the distributed jobs.
    #
    regions = parseSAM_header_for_region(filename)
    split_regions = [regions[i:i + region_size]
                     for i in xrange(0, len(regions), region_size)]
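    # For illustration (assumed values): with region_size == 2 and
    # regions == ["chr1", "chr2", "chr3"], split_regions would be
    # [["chr1", "chr2"], ["chr3"]].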

    if not index_file:
        mappings_bam, index_file = create_index_file(filename, mappings_bam)

    #
    # SECTION: Processing
    # -----------------------------------------------------------------------
    # Run subjob for each distributed region.
    #
    # Note: inputs for subjobs are sent as a dictionary with key value pairs:
    #    key: "region_list"   value: [ [], [], ... ](region sections)
    #    key: "mappings_bam"   value: sorted bam
    #    key: "index_file"    value: bam bai index file
    # The dictionary keys must match the input of the subjob
    #
    # Collect outputs for downstream gather job using dxjob.get_output_ref()
    #
    # Note: Programmatically it's possible to split the workload intelligently
    #    and request an optimized instance type per subjob; dxpy.new_dxjob
    #    takes the optional parameter instance_type.
    #
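    # A minimal sketch (the instance type name below is illustrative only):
    #
    #   dxpy.new_dxjob(fn_input={...}, fn_name="samtoolscount_bam",
    #                  instance_type="mem1_ssd1_x4")
    #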
    print 'creating subjobs'
    subjobs = [dxpy.new_dxjob(
               fn_input={"region_list": split,
                         "mappings_bam": mappings_bam,
                         "index_file": index_file},
               fn_name="samtoolscount_bam")
               for split in split_regions]

    fileDXLinks = [subjob.get_output_ref("readcount_fileDX")
                   for subjob in subjobs]

    #
    # SECTION: Gather (Post-processing)
    # -------------------------------------------------------------------------
    # Pass DNAnexus object references to post processing job to combine outputs
    #
    # Create dictionary to be returned as output for the job
    # Dictionary must contain keys matching outputs set in dxapp.json
    #
    print 'combining outputs'
    postprocess_job = dxpy.new_dxjob(
        fn_input={"countDXlinks": fileDXLinks, "resultfn": filename},
        fn_name="combine_files")

    countDXLink = postprocess_job.get_output_ref("countDXLink")

    output = {}
    output["count_file"] = countDXLink

    return output
def main(bam_files, sampleId, padding, reference, loglevel, number_of_nodes,
    downsample, downsample_fraction, regions_file=None, indel_vcf=None,
    dbsnp=None, advanced_rtc_options=None, advanced_ir_options=None,
    advanced_br_options=None, advanced_pr_options=None):

    """This is a dx applet that runs on the DNAnexus platform. This will run
    GATK3 best practices pipeline using scatter gather. This is very useful for
    processing WGS datasets. This function is the controller of the pipeline,
    which will scatter data, process it and then gather it for final processing.

    :param bam_files:
    :param sampleId:
    :param padding:
    :param reference:
    :param loglevel:
    :param number_of_nodes:
    :param downsample:
    :param downsample_fraction:
    :param regions_file:
    :param indel_vcf:
    :param dbsnp:
    :param advanced_rtc_options:
    :param advanced_ir_options:
    :param advanced_br_options:
    :param advanced_pr_options:
    """

    logger.setLevel(loglevel)
    logger.info("GATK3 scatter gather controller. Number of nodes for scatter jobs: {0}".format(number_of_nodes))

    # Balance jobs based on the sizes of the input BAM files

    file_sizes = {}
    file_objects = {}
    for bam_file in bam_files:
        file_size = int(dxpy.DXFile(bam_file).describe()["size"])
        file_name = dxpy.DXFile(bam_file).describe()["name"]
        file_sizes[file_name] = file_size
        file_objects[file_name] = bam_file

    balanced_jobs_object = dx_scatter.distribute_files_by_size(
        file_sizes=file_sizes,
        dx_file_objects=file_objects,
        number_of_nodes=number_of_nodes)
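
    # Based on how it is consumed below, balanced_jobs_object is expected to map
    # a node/job name to the list of DX file objects assigned to that node.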

    # GATK indel realignment phase

    gatk_rtc_ir_jobs = []
    for job_name, file_objects in balanced_jobs_object.items():

        logger.info("Create GATK3 Realignment Node")
        gatk_rtc_ir_jobs.append(
            dxpy.new_dxjob(
                fn_input={
                    "bam_files": file_objects,
                    "reference": reference,
                    "regions_file": regions_file,
                    "padding": padding,
                    "indel_vcf": indel_vcf,
                    "sampleId": sampleId,
                    "advanced_rtc_options": advanced_rtc_options,
                    "advanced_ir_options": advanced_ir_options,
                    "downsample": downsample,
                    "downsample_fraction": downsample_fraction,
                    "loglevel": loglevel
                },
                fn_name="gatk_realignment"
            )
        )

    # GATK3 BaseRecalibrator phase

    # This will gather the input from all the GATK3 Realignment nodes

    logger.info("Gather all GATK3 Realignment Output")

    kwargs = {
        "output_downsample_bams": [job.get_output_ref("output_downsample_bams")
            for job in gatk_rtc_ir_jobs],
        "output_realigned_bams": [job.get_output_ref("output_realigned_bams")
            for job in gatk_rtc_ir_jobs]
    }

    gather_gatk_rtc_ir_jobs = dxpy.new_dxjob(
        fn_input=kwargs,
        fn_name="gather",
        depends_on=gatk_rtc_ir_jobs
    )

    # This will send all the realigned BAM files to the BaseRecalibrator node

    logger.info("Create GATK3 BaseRecalibrator Node")

    gatk_br_job = dxpy.new_dxjob(
        fn_input={
            "bam_files": gather_gatk_rtc_ir_jobs.get_output_ref("output_downsample_bams") if downsample else gather_gatk_rtc_ir_jobs.get_output_ref("output_realigned_bams"),
            "reference": reference,
            "regions_file": regions_file,
            "padding": padding,
            "indel_vcf": indel_vcf,
            "dbsnp": dbsnp,
            "advanced_br_options": advanced_br_options,
            "loglevel": loglevel
        },
        fn_name="gatk_base_recalibrator",
        depends_on=[gather_gatk_rtc_ir_jobs]
    )

    # GATK Apply BQSR

    gatk_apply_bqsr_jobs = []
    for gatk_rtc_ir_job in gatk_rtc_ir_jobs:

        logger.info("Create GATK3 Apply BQSR Node")
        gatk_apply_bqsr_jobs.append(
            dxpy.new_dxjob(
                fn_input={
                    "bam_files": gatk_rtc_ir_job.get_output_ref("output_realigned_bams"),
                    "BR_output": gatk_br_job.get_output_ref("output_bqsr"),
                    "reference": reference,
                    "regions_file": regions_file,
                    "padding": padding,
                    "dbsnp": dbsnp,
                    "sampleId": sampleId,
                    "advanced_pr_options": advanced_pr_options,
                    "loglevel": loglevel
                },
                fn_name="gatk_apply_bqsr",
                depends_on=gatk_rtc_ir_jobs + [gatk_br_job]
            )
        )

    # Gather all Apply BQSR output and finish the pipeline

    logger.info("Gather all GATK Apply BQSR calling job outputs")

    kwargs = {
        "output_recalibrated_bam": [job.get_output_ref("output_recalibrated_bam")
            for job in gatk_apply_bqsr_jobs],
        "output_recalibrated_cram": [job.get_output_ref("output_recalibrated_cram")
            for job in gatk_apply_bqsr_jobs]
    }

    gather_gatk_apply_bqsr_jobs = dxpy.new_dxjob(
        fn_input=kwargs,
        fn_name="gather",
        depends_on=gatk_apply_bqsr_jobs
    )

    output = {}
    output["output_recalibrated_bam"] = gather_gatk_apply_bqsr_jobs.get_output_ref("output_recalibrated_bam")
    output["output_recalibrated_cram"] = gather_gatk_apply_bqsr_jobs.get_output_ref("output_recalibrated_cram")
    return output
Example #49
0
def main(files):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    #files = [dxpy.DXFile(item) for item in files]

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.

    #for i, f in enumerate(files):
    #    dxpy.download_dxfile(f.get_id(), "files-" + str(i))

    # Split your work into parallel tasks: the following creates one
    # "process" subjob per input file.

    subjobs = []
    for file_obj in files:

        filename = dxpy.describe(file_obj)['name']
        encff = re.compile('ENCFF[0-9]{3}[A-Z]{3}')
        match = encff.match(filename)
        if match is None:
            print "Filename %s is not an ENCODE file" % filename
            exit(0)
        file_acc = match.group()

        file_meta = requests.get(SERVER+'/'+file_acc+'/?frame=embedded', \
            auth=(auth['AUTHID'],auth['AUTHPW']), headers=HEADERS).json()

        subjob_input = {
            "file_obj": file_obj,
            "file_meta": file_meta
        }
        subjobs.append(dxpy.new_dxjob(subjob_input, "process"))

    # The following line creates the job that will perform the
    # "postprocess" step of your app.  We've given it an input field
    # that is a list of job-based object references created from the
    # "process" jobs we just created.  Assuming those jobs have an
    # output field called "output", these values will be passed to the
    # "postprocess" job.  Because these values are not ready until the
    # "process" jobs finish, the "postprocess" job WILL NOT RUN until
    # all job-based object references have been resolved (i.e. the
    # jobs they reference have finished running).
    #
    # If you do not plan to have the "process" jobs create output that
    # the "postprocess" job will require, then you can explicitly list
    # the dependencies to wait for those jobs to finish by setting the
    # "depends_on" field to the list of subjobs to wait for (it
    # accepts either dxpy handlers or string IDs in the list).  We've
    # included this parameter in the line below as well for
    # completeness, though it is unnecessary if you are providing
    # job-based object references in the input that refer to the same
    # set of jobs.

    postprocess_job = dxpy.new_dxjob(fn_input={
                                    "report": [subjob.get_output_ref("report") for subjob in subjobs],
                                    "valid": [subjob.get_output_ref("validation") for subjob in subjobs]
                                    },
                                     fn_name="postprocess",
                                     depends_on=subjobs)

    # If you would like to include any of the output fields from the
    # postprocess_job as the output of your app, you should return it
    # here using a job-based object reference.  If the output field in
    # the postprocess function is called "answer", you can pass that
    # on here as follows:
    #
    #return { "FastQC_reports": [ dxpy.dxlink(item) for item in postprocess_job.get_output_ref("report") ]}
    #
    # Tip: you can include in your output at this point any open
    # objects (such as gtables) which will be closed by a job that
    # finishes later.  The system will check to make sure that the
    # output object is closed and will attempt to clone it out as
    # output into the parent container only after all subjobs have
    # finished.
    validate_reports = []
    validations = []
    validate_reports.append(postprocess_job.get_output_ref("report"))
    validations.append(postprocess_job.get_output_ref("validation"))
    output = {}
    print validate_reports
    print validations
#    output["FastQC_reports"] = [ dxpy.dxlink(item)  for item in FastQC_reports]
    output["validate_reports"] = validate_reports
    output["validate_errors"] = validations

    return output
Example #50
0
def main(files):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    # files = [dxpy.DXFile(item) for item in files]

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.

    # for i, f in enumerate(files):
    #    dxpy.download_dxfile(f.get_id(), "files-" + str(i))

    # Split your work into parallel tasks: the following creates one
    # "process" subjob per input FASTQ file.

    subjobs = []
    for fastq in files:
        subjob_input = {"fastq": fastq}
        subjobs.append(dxpy.new_dxjob(subjob_input, "process"))

    # The following line creates the job that will perform the
    # "postprocess" step of your app.  We've given it an input field
    # that is a list of job-based object references created from the
    # "process" jobs we just created.  Assuming those jobs have an
    # output field called "output", these values will be passed to the
    # "postprocess" job.  Because these values are not ready until the
    # "process" jobs finish, the "postprocess" job WILL NOT RUN until
    # all job-based object references have been resolved (i.e. the
    # jobs they reference have finished running).
    #
    # If you do not plan to have the "process" jobs create output that
    # the "postprocess" job will require, then you can explicitly list
    # the dependencies to wait for those jobs to finish by setting the
    # "depends_on" field to the list of subjobs to wait for (it
    # accepts either dxpy handlers or string IDs in the list).  We've
    # included this parameter in the line below as well for
    # completeness, though it is unnecessary if you are providing
    # job-based object references in the input that refer to the same
    # set of jobs.
    """
    postprocess_job = dxpy.new_dxjob(fn_input={
                "report": [subjob.get_output_ref("report") for subjob in subjobs],
                "summary": [subjob.get_output_ref("summary") for subjob in subjobs],
                "zips": [subjob.get_output_ref("zips") for subjob in subjobs],
                },
                fn_name="postprocess",
                depends_on=subjobs)
    """
    # If you would like to include any of the output fields from the
    # postprocess_job as the output of your app, you should return it
    # here using a job-based object reference.  If the output field in
    # the postprocess function is called "answer", you can pass that
    # on here as follows:
    #
    # return { "FastQC_reports": [ dxpy.dxlink(item) for item in postprocess_job.get_output_ref("report") ]}
    #
    # Tip: you can include in your output at this point any open
    # objects (such as gtables) which will be closed by a job that
    # finishes later.  The system will check to make sure that the
    # output object is closed and will attempt to clone it out as
    # output into the parent container only after all subjobs have
    # finished.

    output = {
        "reports": [subjob.get_output_ref("report") for subjob in subjobs],
        "summaries": [subjob.get_output_ref("summary") for subjob in subjobs],
        "zips": [subjob.get_output_ref("zip") for subjob in subjobs],
    }

    """
    for job in postprocess_job.get_output_ref("reports"):
        item = dxpy.dxlink(job)
        output['FastQC_reports'].append(item['report'])
        output['FastQC_zip'].append(item['zip'])
        output['FastQC_summary'].append(item['summary'])
    """

    #    output["FastQC_reports"] = [ dxpy.dxlink(item)  for item in FastQC_reports]
    #    output["FastQC_reports"] = FastQC_reports
    #    output["FastQC_zip"] = FastQC_zip
    #    output["FastQC_summary"] = FastQC_summary

    return output
Example #51
0
def main(**job_inputs):
    job_outputs = {}
    mappingsTable = dxpy.open_dxgtable(job_inputs["mappings"]["$dnanexus_link"])
    mappingsTableId = mappingsTable.get_id()

    # This controls the degree of parallelism
    chunks = int(mappingsTable.describe()["length"] / job_inputs["reads_per_job"]) + 1
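    # For example (assumed numbers): with 25,000,000 mappings and
    # reads_per_job == 10,000,000, chunks == int(25000000 / 10000000) + 1 == 3.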

    try:
        contigSetId = mappingsTable.get_details()["original_contigset"]["$dnanexus_link"]
        originalContigSet = mappingsTable.get_details()["original_contigset"]
    except (KeyError, TypeError):
        raise Exception("The original reference genome must be attached as a detail")

    # In the next major section of code, we construct a variants table. As regions of the genome are passed to each worker
    # and variants are called on them, the workers will add rows to this table concurrently.

    variants_schema = [
        {"name": "chr", "type": "string"},
        {"name": "lo", "type": "int32"},
        {"name": "hi", "type": "int32"},
        {"name": "ref", "type": "string"},
        {"name": "alt", "type": "string"},
        {"name": "qual", "type": "double"},
        {"name": "ids", "type": "string"},
    ]

    # The information in these tags is elevated into specific columns, so additional columns for these tags will not be created
    elevatedTags = ["format_GT", "format_DP", "format_AD"]

    # The info and format tags are extracted from the header printed by samtools
    # If additional code will add a tag to the output of the program, modify this header to include the tag.
    # TODO: Allow the table to be created by the first job that finishes to avoid this step.
    headerInfo = extractHeader("/tmp/header.txt", elevatedTags)
    description = {}
    samples = []

    indices = [dxpy.DXGTable.genomic_range_index("chr", "lo", "hi", "gri")]

    # The following section creates columns for the INFO tags found in the header
    for k, v in headerInfo["tags"]["info"].iteritems():
        variants_schema.append({"name": "info_" + k, "type": translateTagTypeToColumnType(v)})
        description[k] = {"name": k, "description": v["description"], "type": v["type"], "number": v["number"]}

    # For each sample, add the sample-specific columns to the schema, at present only one sample is supported
    numSamples = 1
    for i in range(numSamples):
        variants_schema.extend(
            [
                {"name": "genotype_" + str(i), "type": "string"},
                {"name": "phasing_" + str(i), "type": "string"},
                {"name": "type_" + str(i), "type": "string"},
                {"name": "variation_qual_" + str(i), "type": "double"},
                {"name": "genotype_qual_" + str(i), "type": "double"},
                {"name": "coverage_" + str(i), "type": "string"},
                {"name": "total_coverage_" + str(i), "type": "int32"},
            ]
        )
        indices.append(dxpy.DXGTable.lexicographic_index([["type_" + str(i), "ASC"]], "type_" + str(i)))
        samples.append("Sample_0")
        for k, v in headerInfo["tags"]["format"].iteritems():
            if "format_" + k not in elevatedTags:
                variants_schema.append({"name": "format_" + k + "_" + str(i), "type": translateTagTypeToColumnType(v)})

    # TODO: Add lexicographic indices when secondary indices are supported

    variants = dxpy.new_dxgtable(variants_schema, indices=[dxpy.DXGTable.genomic_range_index("chr", "lo", "hi", "gri")])
    tableId = variants.get_id()
    variants = dxpy.open_dxgtable(tableId)
    variants.add_types(["Variants", "gri"])

    details = {
        "samples": samples,
        "original_contigset": job_inputs["reference"],
        "original_mappings": job_inputs["mappings"],
        "formats": headerInfo["tags"]["format"],
        "infos": headerInfo["tags"]["info"],
    }
    # if headerInfo.get('filters') != {}:
    #  details['filters'] = headerInfo['filters']
    variants.set_details(details)

    if "output_name" in job_inputs:
        variants.rename(job_inputs["output_name"])
    else:
        variants.rename(mappingsTable.describe()["name"] + " variant calls by Samtools mpileup")

    # Split the genome into evenly sized regions
    genomeRegions = splitGenomeLengthLargePieces(originalContigSet, chunks)
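    # splitGenomeLengthLargePieces is expected (based on its use below) to return
    # roughly <chunks> region specifications; empty entries are skipped when
    # spawning map jobs.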

    # Generate the command line arguments needed to run samtools and bcftools
    samOptions = makeSamtoolsParameters(**job_inputs)
    bcfOptions = makeBcftoolsParameters(**job_inputs)

    # The rest of the main function contains the map-reduce functionality. For each genome chunk, an input spec is created
    # for a new child job, which specifies the region to call variants in and the shared table to write the results into.
    reduce_job_inputs = {}
    for i in range(len(genomeRegions)):
        if len(genomeRegions[i]) > 0:
            map_job_inputs = {
                "mappings_table_id": mappingsTableId,
                "original_contig_set": contigSetId,
                "interval": genomeRegions[i],
                "tableId": tableId,
                "compress_reference": job_inputs["compress_reference"],
                "compress_no_call": job_inputs["compress_no_call"],
                "infer_no_call": job_inputs["infer_no_call"],
                "sam_options": samOptions,
                "bcf_options": bcfOptions,
                "part_number": i,
            }
            # Run a "map" job for each chunk, passing in the inputspec from above and looking for a function entry point given as "map" (@dxpy.entry_point('map'))
            map_job = dxpy.new_dxjob(map_job_inputs, "map")
            reduce_job_inputs["mapJob" + str(i) + "TableId"] = {"job": map_job.get_id(), "field": "ok"}

    reduce_job_inputs["tableId"] = tableId

    # Run a "reduce" job, which only begins once all of the map jobs singal they have completed by sending 'ok':True
    # The reduce job closes the table. This step is explicitly needed because table closing must wait till the completion of the map jobs
    # By giving the reduce job the map jobs as input, the reduce job will wait to start.
    reduce_job = dxpy.new_dxjob(reduce_job_inputs, "reduce")
    job_outputs = {"variants": {"job": reduce_job.get_id(), "field": "variants"}}

    return job_outputs
Example #52
0
def main(reads1=None, reference_tar=None, bwa_aln_params=None, bwa_version=None, samtools_version=None, reads2=None, input_JSON=None, debug=False):

	# Main entry-point.  Parameter defaults assumed to come from dxapp.json.
	# reads1, reference_tar, reads2 are links to DNAnexus files or None

	if debug:
		logger.setLevel(logging.DEBUG)
	else:
		logger.setLevel(logging.INFO)

	# if there is input_JSON, it over-rides any explicit parameters

	if input_JSON:
		if 'reads1' in input_JSON:
			reads1 = input_JSON['reads1']
		if 'reads2' in input_JSON:
			reads2 = input_JSON['reads2']
		if 'reference_tar' in input_JSON:
			reference_tar = input_JSON['reference_tar']
		if 'bwa_aln_params' in input_JSON:
			bwa_aln_params = input_JSON['bwa_aln_params']
		if 'bwa_version' in input_JSON:
			bwa_version = input_JSON['bwa_version']
		if 'samtools_version' in input_JSON:
			samtools_version = input_JSON['samtools_version']

	if not reads1:
		logger.error('reads1 is required, explicitly or in input_JSON')
		raise Exception

	# This spawns only one or two subjobs for single- or paired-end,
	# respectively.  It could also download the files, chunk the reads,
	# and spawn multiple subjobs.

	# Files are downloaded later by subjobs into their own filesystems
	# and uploaded to the project.

	# Initialize file handlers for input files.

	paired_end = reads2 is not None
	unmapped_reads = [r for r in [reads1, reads2] if r]
	
	subjobs = []
	for reads in unmapped_reads:
		subjob_input = {"reads_file": reads,
						"reference_tar": reference_tar,
						"bwa_aln_params": bwa_aln_params,
						"bwa_version": bwa_version}
		print "Submitting:"
		print subjob_input
		subjobs.append(dxpy.new_dxjob(subjob_input, "process"))

	# Create the job that will perform the "postprocess" step.  depends_on=subjobs, so blocks on all subjobs

	postprocess_job = dxpy.new_dxjob(fn_input={ "indexed_reads": [subjob.get_output_ref("output") for subjob in subjobs],
												"unmapped_reads": unmapped_reads,
												"reference_tar": reference_tar,
												"bwa_version": bwa_version,
												"samtools_version": samtools_version },
									 fn_name="postprocess",
									 depends_on=subjobs)

	mapped_reads = postprocess_job.get_output_ref("mapped_reads")
	mapping_statistics = postprocess_job.get_output_ref("mapping_statistics")

	output = {
		"mapped_reads": mapped_reads,
		"mapping_statistics": mapping_statistics,
		"paired_end": paired_end
	}
	output.update({'output_JSON': output.copy()})

	print "Exiting with output: %s" %(output)
	return output
Example #53
0
def main(fastq_files,
         genome_fasta_file,
         genome_index_file,
         mapper,
         project_id,
         output_folder,
         mark_duplicates=False,
         fastq_files2=None,
         sample_name=None,
         properties=None):
    """Spawn subjobs to map each of the FASTQ files (and their pairs,
    if provided) and merge the BAM files into a single BAM file, which
    is output."""

    if fastq_files2 is not None:
        assert len(fastq_files2) == len(fastq_files), \
            "fastq_files2 contains %s elements; expected %s" % (len(fastq_files2), len(fastq_files))

    subjobs = []
    for i in xrange(len(fastq_files)):
        subjob_input = {
            "project_id": project_id,
            "output_folder": output_folder,
            "fastq_file": fastq_files[i],
            "genome_fasta_file": genome_fasta_file,
            "genome_index_file": genome_index_file,
            "mapper": mapper,
            "sample_name": sample_name,
            "mark_duplicates": mark_duplicates,
            "properties": properties
        }
        if fastq_files2 is not None:
            subjob_input["fastq_file2"] = fastq_files2[i]
        subjobs.append(dxpy.new_dxjob(subjob_input, "process"))

    if len(fastq_files) > 1:
        postprocess_input = {
            "project_id": project_id,
            "output_folder": output_folder,
            "bam_files": [subjob.get_output_ref("bam") for subjob in subjobs],
            "sample_name": sample_name,
            "properties": properties
        }
        postprocess_job = dxpy.new_dxjob(fn_input=postprocess_input,
                                         fn_name="postprocess",
                                         depends_on=subjobs)
        tools_used_input = {
            "project_id": project_id,
            "output_folder": output_folder,
            "tools_used": [
                job.get_output_ref("tools_used")
                for job in (subjobs + [postprocess_job])
            ]
        }
        tools_used_job = dxpy.new_dxjob(tools_used_input,
                                        "create_tools_used_json_file")
        return {
            "bam": postprocess_job.get_output_ref("bam"),
            "bai": postprocess_job.get_output_ref("bai"),
            "tools_used": tools_used_job.get_output_ref("tools_used_json_file")
        }
    else:
        tools_used_input = {
            "project_id": project_id,
            "output_folder": output_folder,
            "tools_used":
            [job.get_output_ref('tools_used') for job in subjobs]
        }
        tools_used_job = dxpy.new_dxjob(tools_used_input,
                                        "create_tools_used_json_file")
        return {
            "bam": subjobs[0].get_output_ref("bam"),
            "bai": subjobs[0].get_output_ref("bai"),
            "tools_used": tools_used_job.get_output_ref("tools_used_json_file")
        }
Example #54
0
def SplitBamForSubjobs(kwargs, bam_names, bam_config_fn=None):
    num_threads = kwargs["num_threads_per_instance"]
    print "\nSplitting bam for subjobs"
    
    # Assuming that all bam files have the same chromosomes (is this safe?)
    subprocess.check_output("samtools view -H {input_bam} > header.txt".format(input_bam=bam_names[0]),
                                    shell=True)
    with open('header.txt') as fh: 
        header = [line.rstrip('\n') for line in fh]
    print "Input header: "
    for line in header: 
        print line
    
    print "Save unmapped reads as bam files to merge into subjob files"
    unmapped = {}
    for bam in bam_names:
        # Strip the '.bam' suffix explicitly; rstrip('.bam') would remove any
        # trailing '.', 'b', 'a' or 'm' characters rather than the suffix itself.
        fn = (bam[:-len('.bam')] if bam.endswith('.bam') else bam) + '_unmapped'
        command = "samtools view -@ {n} -u -b -f 4 {bam} > {unmapped}".format(n=num_threads, bam=bam, unmapped=fn)
        print command
        subprocess.check_call(command, shell=True)
        unmapped[bam] = fn
    
    groups = SplitGenomeFromSam(header, kwargs["num_instances"])
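    # SplitGenomeFromSam is expected (based on its use below) to return lists of
    # region strings taken from the BAM header, one list per planned subjob; each
    # list is joined into the region arguments for "samtools view".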
    subjobs = []
    subjob_no = 0
    for group in groups:
        group = " ".join(group)
        subjob_bam_fn = [] 

        for bam in bam_names:
            start_time = time.time()
            print "\nMerging {bam} with unmapped reads for pindel subjobs".format(bam=bam)
            out_fn = (bam[:-len('.bam')] if bam.endswith('.bam') else bam) + '_' + str(subjob_no) + '.bam'
            
            command = "samtools view -@ {n} -bh {bam} {group} > tmp.bam".format(n=num_threads, bam=bam, group=group)
            subprocess.check_call(command, shell=True)
            
            split_command = "samtools merge -@ {n} {out} {unmapped} tmp.bam ".format(n=num_threads, out=out_fn, unmapped=unmapped[bam])
            print split_command
            subprocess.check_call(split_command, shell=True)
            
            print "Samtools view and merge ran in: {min} minutes".format(min=float((time.time()-start_time)/60))
            subjob_bam_fn.append(out_fn)

        subjob_kwargs = kwargs.copy()
        subjob_bam_fn, subjob_bam_idx_fn = IndexBams(bam_names=subjob_bam_fn)
      
        print "Uploading split bam files: " + str(subjob_bam_fn)
        subjob_bam_ids = [dxpy.dxlink(dxpy.upload_local_file(bam)) for bam in subjob_bam_fn]
        print "Uploading split bam index files: " + str(subjob_bam_idx_fn)
        subjob_bam_idx_ids = [dxpy.dxlink(dxpy.upload_local_file(idx)) for idx in subjob_bam_idx_fn]
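        # upload_local_file() returns a DXFile handler and dxlink() wraps it into
        # a {"$dnanexus_link": <file id>} mapping, so the lists above can be passed
        # directly as subjob inputs.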
        
        subjob_kwargs["mappings_files"] = subjob_bam_ids
        subjob_kwargs["bam_index_files"] = subjob_bam_idx_ids
        
        print "Updating bam config file for subjob"
        if bam_config_fn:
            new_config_fn = "subjob_config_" + str(subjob_no) + '.txt'
            with open(bam_config_fn, 'r') as config_fh, open(new_config_fn, 'w') as write_fh:
                for line in config_fh: 
                    line = line.split('\t')
                    bam_name = line[0]
                    out_fn = (bam_name[:-len('.bam')] if bam_name.endswith('.bam') else bam_name) + '_' + str(subjob_no) + '.bam'
                    write_fh.write(out_fn + '\t' + "\t".join(line[1:]) + '\n')      
            
            print "Uploading new config file: " + str(new_config_fn)
            subjob_kwargs["bam_config_file"] = dxpy.dxlink(dxpy.upload_local_file(new_config_fn))
    
        job = dxpy.new_dxjob(subjob_kwargs, "process")
        print "Started subjob #{n}: {job_id}".format(n=subjob_no, job_id=job.get_id())
        subjobs.append(job)
        subjob_no += 1
    
    return subjobs
Example #55
0
def main(bam_set, map_report_set, dme_ix, uncompress_bam=True):

    # tool_versions.py --applet $script_name --appver $script_ver
    props = {}
    if os.path.isfile("/usr/bin/tool_versions.py"):
        sw_versions = subprocess.check_output(["tool_versions.py", "--dxjson", "dnanexus-executable.json"])
        props["SW"] = sw_versions

    print "* Value of bam_set:        '" + str(bam_set) + "'"
    print "* Value of map_report_set: '" + str(map_report_set) + "'"
    print "* Value of dme_ix:         '" + str(dme_ix) + "'"
    print "* Value of uncompress_bam: '" + str(uncompress_bam) + "'"

    print "* Calling merge_extract_full()..."
    inp = {
        "bam_set": bam_set,
        "map_report_set": map_report_set,
        "dme_ix_dxlink": dme_ix,
        "uncompress_bam": uncompress_bam,
        "props": props,
    }
    extract_job = dxpy.new_dxjob(inp, "merge_extract_full")
    print "* Kicked off extract() and waiting..."
    extract_job.wait_on_done()  # Wait because we want the qc_metrics to pass to other jobs.
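    # wait_on_done() blocks until the subjob has finished, so its resolved
    # outputs can be read directly from describe()["output"] below instead of
    # passing job-based object references.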
    extract_out = extract_job.describe()["output"]
    target_root = extract_out["target_root"]
    qc_metrics = extract_out["qc_metrics"]

    print "* Calling bedmethyl()..."
    inp = {
        "cx_report_dxlink": extract_out["cx_report_dxlink"],
        "chrom_sizes_dxlink": extract_out["chrom_sizes_dxlink"],
        "target_root": target_root,
        "qc_metrics": qc_metrics,
        "props": props,
    }
    bedmethyl_job = dxpy.new_dxjob(inp, "bedmethyl_io")
    print "* Kicked off bedmethyl() but not waiting waiting..."

    print "* Calling signal()..."
    signal_out = signal_io(
        extract_out["bedgraph_gz_dxlink"], extract_out["chrom_sizes_dxlink"], target_root, qc_metrics, props
    )

    print "* Check storage..."
    run_cmd("ls -l")
    run_cmd("df -k .")

    print "* Finished."

    return {
        # from extract()
        # "bam_biorep":    extract_out['biorep_bam_dxlink'],
        "bam_biorep_qc": extract_out["biorep_bam_qc_dxlink"],
        "map_biorep": extract_out["biorep_map_dxlink"],
        "mbias_report": extract_out["mbias_report_dxlink"],
        # from signal()
        "signal": signal_out["bigWig_dxlink"],
        # from bedmethyl()
        "CpG_bed": bedmethyl_job.get_output_ref("CpG_bed_dxlink"),
        "CHG_bed": bedmethyl_job.get_output_ref("CHG_bed_dxlink"),
        "CHH_bed": bedmethyl_job.get_output_ref("CHH_bed_dxlink"),
        "CpG_bb": bedmethyl_job.get_output_ref("CpG_bb_dxlink"),
        "CHG_bb": bedmethyl_job.get_output_ref("CHG_bb_dxlink"),
        "CHH_bb": bedmethyl_job.get_output_ref("CHH_bb_dxlink"),
        "metadata": json.dumps(qc_metrics),
    }