def main():
    ################################################################################
    # Phase I: Check inputs and setup sub tasks 1-N to process group(s) based on
    #          applying the capturing group named "group_by" in group_by_regex.
    #          (and terminate if this is task 0)
    ################################################################################
    ref_input_pdh = gatk_helper.prepare_gatk_reference_collection(reference_coll=arvados.current_job()['script_parameters']['reference_collection'])
    job_input_pdh = arvados.current_job()['script_parameters']['inputs_collection']
    interval_lists_pdh = arvados.current_job()['script_parameters']['interval_lists_collection']
    interval_count = 1
    if "interval_count" in arvados.current_job()['script_parameters']:
        interval_count = arvados.current_job()['script_parameters']['interval_count']

    if arvados.current_task()['sequence'] == 0:
        # get candidates for task reuse
        task_key_params=['inputs', 'ref', 'name'] # N.B. inputs collection includes input vcfs and corresponding interval_list
        script="gatk-genotypegvcfs.py"
        oldest_git_commit_to_reuse='6ca726fc265f9e55765bf1fdf71b86285b8a0ff2'
        job_filters = [
            ['script', '=', script],
            ['repository', '=', arvados.current_job()['repository']],
            ['script_version', 'in git', oldest_git_commit_to_reuse],
            ['docker_image_locator', 'in docker', arvados.current_job()['docker_image_locator']],
        ]

        # retrieve a full set of all possible reusable tasks at sequence 1
        print "Retrieving all potentially reusable tasks"
        reusable_tasks = hgi_arvados.get_reusable_tasks(1, task_key_params, job_filters)
        print "Have %s tasks for potential reuse" % (len(reusable_tasks))

        def create_task_with_validated_reuse(sequence, params):
            return hgi_arvados.create_or_reuse_task(sequence, params, reusable_tasks, task_key_params, validate_task_output)

        # Setup sub tasks (and terminate if this is task 0)
        hgi_arvados.one_task_per_group_combined_inputs(ref_input_pdh, job_input_pdh, interval_lists_pdh,
                                                       group_by_regex,
                                                       if_sequence=0, and_end_task=True,
                                                       create_task_func=create_task_with_validated_reuse)

    # Get object representing the current task
    this_task = arvados.current_task()

    # We will never reach this point if we are in the 0th task sequence
    assert(this_task['sequence'] > 0)

    ################################################################################
    # Phase IIa: If we are a "reuse" task, just set our output and be done with it
    ################################################################################
    if 'reuse_job_task' in this_task['parameters']:
        print "This task's work was already done by JobTask %s" % this_task['parameters']['reuse_job_task']
        exit(0)

    ################################################################################
    # Phase IIb: Genotype gVCFs!
    ################################################################################
    ref_file = gatk_helper.mount_gatk_reference(ref_param="ref")
    gvcf_files = gatk_helper.mount_gatk_gvcf_inputs(inputs_param="inputs")
    out_dir = hgi_arvados.prepare_out_dir()
    interval_list_file = gatk_helper.mount_single_gatk_interval_list_input(interval_list_param="inputs")
    name = this_task['parameters'].get('name')
    if not name:
        name = "unknown"
    out_file = name + ".vcf.gz"

    # because of a GATK bug, name cannot contain the string '.bcf' anywhere within it or we will get BCF output
    out_file = out_file.replace(".bcf", "._cf")

    # GenotypeGVCFs!
    gatk_exit = gatk.genotype_gvcfs(ref_file, interval_list_file, gvcf_files, os.path.join(out_dir, out_file), cores="32", java_mem="200g")

    if gatk_exit != 0:
        print "WARNING: GATK exited with exit code %s (NOT WRITING OUTPUT)" % gatk_exit
        arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                         body={'success':False}
                                         ).execute()
    else:
        print "GATK exited successfully, writing output to keep"

        # Write a new collection as output
        out = arvados.CollectionWriter()

        # Write out_dir to keep
        out.write_directory_tree(out_dir)

        # Commit the output to Keep.
        output_locator = out.finish()

        if validate_task_output(output_locator):
            print "Task output validated, setting output to %s" % (output_locator)

            # Use the resulting locator as the output for this task.
            this_task.set_output(output_locator)
        else:
            print "ERROR: Failed to validate task output (%s)" % (output_locator)
            arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                             body={'success':False}
                                             ).execute()
def main():
    ################################################################################
    # Phase I: Check inputs and setup sub tasks 1-N to process group(s) based on
    #          applying the capturing group named "group_by" in group_by_regex.
    #          (and terminate if this is task 0)
    ################################################################################
    ref_input_pdh = gatk_helper.prepare_gatk_reference_collection(
        reference_coll=arvados.current_job()["script_parameters"]["reference_collection"]
    )
    job_input_pdh = arvados.current_job()["script_parameters"]["inputs_collection"]
    interval_lists_pdh = arvados.current_job()["script_parameters"]["interval_lists_collection"]
    interval_count = 1
    if "interval_count" in arvados.current_job()["script_parameters"]:
        interval_count = arvados.current_job()["script_parameters"]["interval_count"]

    # Setup sub tasks 1-N (and terminate if this is task 0)
    hgi_arvados.chunked_tasks_per_cram_file(
        ref_input_pdh,
        job_input_pdh,
        interval_lists_pdh,
        validate_task_output,
        if_sequence=0,
        and_end_task=True,
        reuse_tasks=False,
        oldest_git_commit_to_reuse="6ca726fc265f9e55765bf1fdf71b86285b8a0ff2",
        script="gatk-haplotypecaller-cram.py",
    )

    # Get object representing the current task
    this_task = arvados.current_task()

    # We will never reach this point if we are in the 0th task
    assert this_task["sequence"] != 0

    ################################################################################
    # Phase IIa: If we are a "reuse" task, just set our output and be done with it
    ################################################################################
    if "reuse_job_task" in this_task["parameters"]:
        print "This task's work was already done by JobTask %s" % this_task["parameters"]["reuse_job_task"]
        exit(0)

    ################################################################################
    # Phase IIb: Call Haplotypes!
    ################################################################################
    ref_file = gatk_helper.mount_gatk_reference(ref_param="ref")
    interval_list_file = gatk_helper.mount_single_gatk_interval_list_input(interval_list_param="chunk")
    cram_file = gatk_helper.mount_gatk_cram_input(input_param="input")
    cram_file_base, cram_file_ext = os.path.splitext(cram_file)
    out_dir = hgi_arvados.prepare_out_dir()
    out_filename = os.path.basename(cram_file_base) + "." + os.path.basename(interval_list_file) + ".vcf.gz"

    # because of a GATK bug, name cannot contain the string '.bcf' anywhere within it or we will get BCF output
    out_filename = out_filename.replace(".bcf", "._cf")

    # HaplotypeCaller!
    gatk_exit = gatk.haplotype_caller(ref_file, cram_file, interval_list_file, os.path.join(out_dir, out_filename))

    if gatk_exit != 0:
        print "ERROR: GATK exited with exit code %s (NOT WRITING OUTPUT)" % gatk_exit
        arvados.api().job_tasks().update(uuid=arvados.current_task()["uuid"], body={"success": False}).execute()
    else:
        print "GATK exited successfully, writing output to keep"

        # Write a new collection as output
        out = arvados.CollectionWriter()

        # Write out_dir to keep
        out.write_directory_tree(out_dir)

        # Commit the output to Keep.
        output_locator = out.finish()

        print "Task output written to keep, validating it"
        if validate_task_output(output_locator):
            print "Task output validated, setting output to %s" % (output_locator)

            # Use the resulting locator as the output for this task.
            this_task.set_output(output_locator)
        else:
            print "ERROR: Failed to validate task output (%s)" % (output_locator)
            arvados.api().job_tasks().update(uuid=arvados.current_task()["uuid"], body={"success": False}).execute()
def main():
    ################################################################################
    # Phase I: Check inputs and setup sub tasks 1-N to process group(s) based on
    #          applying the capturing group named "group_by" in group_by_regex.
    #          (and terminate if this is task 0)
    ################################################################################
    ref_input_pdh = gatk_helper.prepare_gatk_reference_collection(reference_coll=arvados.current_job()['script_parameters']['reference_collection'])
    job_input_pdh = arvados.current_job()['script_parameters']['inputs_collection']
    interval_lists_pdh = arvados.current_job()['script_parameters']['interval_lists_collection']
    interval_count = 1
    if "interval_count" in arvados.current_job()['script_parameters']:
        interval_count = arvados.current_job()['script_parameters']['interval_count']

    # Setup sub tasks 1-N (and terminate if this is task 0)
    hgi_arvados.one_task_per_group_and_per_n_gvcfs(ref_input_pdh, job_input_pdh, interval_lists_pdh,
                                                   group_by_regex, max_gvcfs_to_combine,
                                                   if_sequence=0, and_end_task=True)

    # Get object representing the current task
    this_task = arvados.current_task()

    # We will never reach this point if we are in the 0th task sequence
    assert(this_task['sequence'] > 0)

    ################################################################################
    # Phase II: Read interval_list and split into additional intervals
    ################################################################################
    hgi_arvados.one_task_per_interval(interval_count, validate_task_output,
                                      reuse_tasks=True,
                                      oldest_git_commit_to_reuse="1f6e1e0b8bb12c573dd253d7900ef55305d55aa1",
                                      if_sequence=1, and_end_task=True)

    # We will never reach this point if we are in the 1st task sequence
    assert(this_task['sequence'] > 1)

    ################################################################################
    # Phase IIIa: If we are a "reuse" task, just set our output and be done with it
    ################################################################################
    if 'reuse_job_task' in this_task['parameters']:
        print "This task's work was already done by JobTask %s" % this_task['parameters']['reuse_job_task']
        exit(0)

    ################################################################################
    # Phase IIIb: Combine gVCFs!
    ################################################################################
    ref_file = gatk_helper.mount_gatk_reference(ref_param="ref")
    gvcf_files = gatk_helper.mount_gatk_gvcf_inputs(inputs_param="inputs")
    out_dir = hgi_arvados.prepare_out_dir()
    name = this_task['parameters'].get('name')
    if not name:
        name = "unknown"
    interval_str = this_task['parameters'].get('interval')
    if not interval_str:
        interval_str = ""
    interval_strs = interval_str.split()
    intervals = []
    for interval in interval_strs:
        intervals.extend(["--intervals", interval])
    out_file = name + ".vcf.gz"
    if interval_count > 1:
        out_file = name + "." + '_'.join(interval_strs) + ".vcf.gz"
        if len(out_file) > 255:
            out_file = name + "." + '_'.join([interval_strs[0], interval_strs[-1]]) + ".vcf.gz"
            print "Output file name was too long with full interval list, shortened it to: %s" % out_file
        if len(out_file) > 255:
            raise errors.InvalidArgumentError("Output file name is too long, cannot continue: %s" % out_file)

    # because of a GATK bug, name cannot contain the string '.bcf' anywhere within it or we will get BCF output
    out_file = out_file.replace(".bcf", "._cf")

    # CombineGVCFs!
    extra_args = intervals
    extra_args.extend(["--breakBandsAtMultiplesOf", "1000000"])
    gatk_exit = gatk.combine_gvcfs(ref_file, gvcf_files, os.path.join(out_dir, out_file), extra_gatk_args=extra_args)

    if gatk_exit != 0:
        print "WARNING: GATK exited with exit code %s (NOT WRITING OUTPUT)" % gatk_exit
        arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                         body={'success':False}
                                         ).execute()
    else:
        print "GATK exited successfully, writing output to keep"

        # Write a new collection as output
        out = arvados.CollectionWriter()

        # Write out_dir to keep
        out.write_directory_tree(out_dir)

        # Commit the output to Keep.
        output_locator = out.finish()

        if validate_task_output(output_locator):
            print "Task output validated, setting output to %s" % (output_locator)

            # Use the resulting locator as the output for this task.
            this_task.set_output(output_locator)
        else:
            print "ERROR: Failed to validate task output (%s)" % (output_locator)
            arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                             body={'success':False}
                                             ).execute()
Пример #4
0
def main():
    ################################################################################
    # Phase I: Check inputs and setup sub tasks 1-N to process group(s) based on
    #          applying the capturing group named "group_by" in group_by_regex.
    #          (and terminate if this is task 0)
    ################################################################################
    ref_input_pdh = gatk_helper.prepare_gatk_reference_collection(
        reference_coll=arvados.current_job()['script_parameters']
        ['reference_collection'])
    job_input_pdh = arvados.current_job(
    )['script_parameters']['inputs_collection']
    interval_lists_pdh = arvados.current_job(
    )['script_parameters']['interval_lists_collection']
    interval_count = 1
    if "interval_count" in arvados.current_job()['script_parameters']:
        interval_count = arvados.current_job(
        )['script_parameters']['interval_count']

    # Setup sub tasks 1-N (and terminate if this is task 0)
    hgi_arvados.one_task_per_group_and_per_n_gvcfs(ref_input_pdh,
                                                   job_input_pdh,
                                                   interval_lists_pdh,
                                                   group_by_regex,
                                                   max_gvcfs_to_combine,
                                                   if_sequence=0,
                                                   and_end_task=True)

    # Get object representing the current task
    this_task = arvados.current_task()

    # We will never reach this point if we are in the 0th task sequence
    assert (this_task['sequence'] > 0)

    ################################################################################
    # Phase II: Read interval_list and split into additional intervals
    ################################################################################
    hgi_arvados.one_task_per_interval(
        interval_count,
        validate_task_output,
        reuse_tasks=True,
        oldest_git_commit_to_reuse="1f6e1e0b8bb12c573dd253d7900ef55305d55aa1",
        if_sequence=1,
        and_end_task=True)

    # We will never reach this point if we are in the 1st task sequence
    assert (this_task['sequence'] > 1)

    ################################################################################
    # Phase IIIa: If we are a "reuse" task, just set our output and be done with it
    ################################################################################
    if 'reuse_job_task' in this_task['parameters']:
        print "This task's work was already done by JobTask %s" % this_task[
            'parameters']['reuse_job_task']
        exit(0)

    ################################################################################
    # Phase IIIb: Combine gVCFs!
    ################################################################################
    ref_file = gatk_helper.mount_gatk_reference(ref_param="ref")
    gvcf_files = gatk_helper.mount_gatk_gvcf_inputs(inputs_param="inputs")
    out_dir = hgi_arvados.prepare_out_dir()
    name = this_task['parameters'].get('name')
    if not name:
        name = "unknown"
    interval_str = this_task['parameters'].get('interval')
    if not interval_str:
        interval_str = ""
    interval_strs = interval_str.split()
    intervals = []
    for interval in interval_strs:
        intervals.extend(["--intervals", interval])
    out_file = name + ".vcf.gz"
    if interval_count > 1:
        out_file = name + "." + '_'.join(interval_strs) + ".vcf.gz"
        if len(out_file) > 255:
            out_file = name + "." + '_'.join(
                [interval_strs[0], interval_strs[-1]]) + ".vcf.gz"
            print "Output file name was too long with full interval list, shortened it to: %s" % out_file
        if len(out_file) > 255:
            raise errors.InvalidArgumentError(
                "Output file name is too long, cannot continue: %s" % out_file)

    # because of a GATK bug, name cannot contain the string '.bcf' anywhere within it or we will get BCF output
    out_file = out_file.replace(".bcf", "._cf")

    # CombineGVCFs!
    extra_args = intervals
    extra_args.extend(["--breakBandsAtMultiplesOf", "1000000"])
    gatk_exit = gatk.combine_gvcfs(ref_file,
                                   gvcf_files,
                                   os.path.join(out_dir, out_file),
                                   extra_gatk_args=extra_args)

    if gatk_exit != 0:
        print "WARNING: GATK exited with exit code %s (NOT WRITING OUTPUT)" % gatk_exit
        arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                         body={
                                             'success': False
                                         }).execute()
    else:
        print "GATK exited successfully, writing output to keep"

        # Write a new collection as output
        out = arvados.CollectionWriter()

        # Write out_dir to keep
        out.write_directory_tree(out_dir)

        # Commit the output to Keep.
        output_locator = out.finish()

        if validate_task_output(output_locator):
            print "Task output validated, setting output to %s" % (
                output_locator)

            # Use the resulting locator as the output for this task.
            this_task.set_output(output_locator)
        else:
            print "ERROR: Failed to validate task output (%s)" % (
                output_locator)
            arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                             body={
                                                 'success': False
                                             }).execute()
Пример #5
0
def main():
    ################################################################################
    # Phase I: Check inputs and setup sub tasks 1-N to process group(s) based on
    #          applying the capturing group named "group_by" in group_by_regex.
    #          (and terminate if this is task 0)
    ################################################################################
    ref_input_pdh = gatk_helper.prepare_gatk_reference_collection(
        reference_coll=arvados.current_job()['script_parameters']
        ['reference_collection'])
    job_input_pdh = arvados.current_job(
    )['script_parameters']['inputs_collection']
    interval_lists_pdh = arvados.current_job(
    )['script_parameters']['interval_lists_collection']
    interval_count = 1
    if "interval_count" in arvados.current_job()['script_parameters']:
        interval_count = arvados.current_job(
        )['script_parameters']['interval_count']

    if arvados.current_task()['sequence'] == 0:
        # get candidates for task reuse
        task_key_params = [
            'inputs', 'ref', 'name'
        ]  # N.B. inputs collection includes input vcfs and corresponding interval_list
        script = "gatk-genotypegvcfs.py"
        oldest_git_commit_to_reuse = '6ca726fc265f9e55765bf1fdf71b86285b8a0ff2'
        job_filters = [
            ['script', '=', script],
            ['repository', '=',
             arvados.current_job()['repository']],
            ['script_version', 'in git', oldest_git_commit_to_reuse],
            [
                'docker_image_locator', 'in docker',
                arvados.current_job()['docker_image_locator']
            ],
        ]

        # retrieve a full set of all possible reusable tasks at sequence 1
        print "Retrieving all potentially reusable tasks"
        reusable_tasks = hgi_arvados.get_reusable_tasks(
            1, task_key_params, job_filters)
        print "Have %s tasks for potential reuse" % (len(reusable_tasks))

        def create_task_with_validated_reuse(sequence, params):
            return hgi_arvados.create_or_reuse_task(sequence, params,
                                                    reusable_tasks,
                                                    task_key_params,
                                                    validate_task_output)

        # Setup sub tasks (and terminate if this is task 0)
        hgi_arvados.one_task_per_group_combined_inputs(
            ref_input_pdh,
            job_input_pdh,
            interval_lists_pdh,
            group_by_regex,
            if_sequence=0,
            and_end_task=True,
            create_task_func=create_task_with_validated_reuse)

    # Get object representing the current task
    this_task = arvados.current_task()

    # We will never reach this point if we are in the 0th task sequence
    assert (this_task['sequence'] > 0)

    ################################################################################
    # Phase IIa: If we are a "reuse" task, just set our output and be done with it
    ################################################################################
    if 'reuse_job_task' in this_task['parameters']:
        print "This task's work was already done by JobTask %s" % this_task[
            'parameters']['reuse_job_task']
        exit(0)

    ################################################################################
    # Phase IIb: Genotype gVCFs!
    ################################################################################
    ref_file = gatk_helper.mount_gatk_reference(ref_param="ref")
    gvcf_files = gatk_helper.mount_gatk_gvcf_inputs(inputs_param="inputs")
    out_dir = hgi_arvados.prepare_out_dir()
    interval_list_file = gatk_helper.mount_single_gatk_interval_list_input(
        interval_list_param="inputs")
    name = this_task['parameters'].get('name')
    if not name:
        name = "unknown"
    out_file = name + ".vcf.gz"

    # because of a GATK bug, name cannot contain the string '.bcf' anywhere within it or we will get BCF output
    out_file = out_file.replace(".bcf", "._cf")

    # GenotypeGVCFs!
    gatk_exit = gatk.genotype_gvcfs(ref_file,
                                    interval_list_file,
                                    gvcf_files,
                                    os.path.join(out_dir, out_file),
                                    cores="4",
                                    java_mem="19g")

    if gatk_exit != 0:
        print "WARNING: GATK exited with exit code %s (NOT WRITING OUTPUT)" % gatk_exit
        arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                         body={
                                             'success': False
                                         }).execute()
    else:
        print "GATK exited successfully, writing output to keep"

        # Write a new collection as output
        out = arvados.CollectionWriter()

        # Write out_dir to keep
        out.write_directory_tree(out_dir)

        # Commit the output to Keep.
        output_locator = out.finish()

        if validate_task_output(output_locator):
            print "Task output validated, setting output to %s" % (
                output_locator)

            # Use the resulting locator as the output for this task.
            this_task.set_output(output_locator)
        else:
            print "ERROR: Failed to validate task output (%s)" % (
                output_locator)
            arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                             body={
                                                 'success': False
                                             }).execute()
def main():
    ################################################################################
    # Phase I: Check inputs and setup sub tasks 1-N to process group(s) based on
    #          applying the capturing group named "group_by" in group_by_regex.
    #          (and terminate if this is task 0)
    ################################################################################
    ref_input_pdh = gatk_helper.prepare_gatk_reference_collection(
        reference_coll=arvados.current_job()['script_parameters']
        ['reference_collection'])
    job_input_pdh = arvados.current_job(
    )['script_parameters']['inputs_collection']
    interval_lists_pdh = arvados.current_job(
    )['script_parameters']['interval_lists_collection']
    interval_count = 1
    if "interval_count" in arvados.current_job()['script_parameters']:
        interval_count = arvados.current_job(
        )['script_parameters']['interval_count']

    # Setup sub tasks 1-N (and terminate if this is task 0)
    hgi_arvados.chunked_tasks_per_cram_file(
        ref_input_pdh,
        job_input_pdh,
        interval_lists_pdh,
        validate_task_output,
        if_sequence=0,
        and_end_task=True,
        reuse_tasks=False,
        oldest_git_commit_to_reuse='6ca726fc265f9e55765bf1fdf71b86285b8a0ff2',
        script="gatk-haplotypecaller-cram.py")

    # Get object representing the current task
    this_task = arvados.current_task()

    # We will never reach this point if we are in the 0th task
    assert (this_task['sequence'] != 0)

    ################################################################################
    # Phase IIa: If we are a "reuse" task, just set our output and be done with it
    ################################################################################
    if 'reuse_job_task' in this_task['parameters']:
        print "This task's work was already done by JobTask %s" % this_task[
            'parameters']['reuse_job_task']
        exit(0)

    ################################################################################
    # Phase IIb: Call Haplotypes!
    ################################################################################
    ref_file = gatk_helper.mount_gatk_reference(ref_param="ref")
    interval_list_file = gatk_helper.mount_single_gatk_interval_list_input(
        interval_list_param="chunk")
    cram_file = gatk_helper.mount_gatk_cram_input(input_param="input")
    cram_file_base, cram_file_ext = os.path.splitext(cram_file)
    out_dir = hgi_arvados.prepare_out_dir()
    out_filename = os.path.basename(cram_file_base) + "." + os.path.basename(
        interval_list_file) + ".vcf.gz"

    # because of a GATK bug, name cannot contain the string '.bcf' anywhere within it or we will get BCF output
    out_filename = out_filename.replace(".bcf", "._cf")

    # HaplotypeCaller!
    gatk_exit = gatk.haplotype_caller(ref_file, cram_file, interval_list_file,
                                      os.path.join(out_dir, out_filename))

    if gatk_exit != 0:
        print "ERROR: GATK exited with exit code %s (NOT WRITING OUTPUT)" % gatk_exit
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={
                                             'success': False
                                         }).execute()
    else:
        print "GATK exited successfully, writing output to keep"

        # Write a new collection as output
        out = arvados.CollectionWriter()

        # Write out_dir to keep
        out.write_directory_tree(out_dir)

        # Commit the output to Keep.
        output_locator = out.finish()

        print "Task output written to keep, validating it"
        if validate_task_output(output_locator):
            print "Task output validated, setting output to %s" % (
                output_locator)

            # Use the resulting locator as the output for this task.
            this_task.set_output(output_locator)
        else:
            print "ERROR: Failed to validate task output (%s)" % (
                output_locator)
            arvados.api().job_tasks().update(
                uuid=arvados.current_task()['uuid'], body={
                    'success': False
                }).execute()