def setUp(self):
    super(FuseMagicTest, self).setUp()

    cw = arvados.CollectionWriter()

    cw.start_new_file('thing1.txt')
    cw.write("data 1")

    self.testcollection = cw.finish()
    self.api.collections().create(body={"manifest_text": cw.manifest_text()}).execute()
def write_foo_bar_baz(self):
    cw = arvados.CollectionWriter(self.api_client)
    self.assertEqual(cw.current_stream_name(), '.',
                     'current_stream_name() should be "." now')
    cw.set_current_file_name('foo.txt')
    cw.write('foo')
    self.assertEqual(cw.current_file_name(), 'foo.txt',
                     'current_file_name() should be foo.txt now')
    cw.start_new_file('bar.txt')
    cw.write('bar')
    cw.start_new_stream('baz')
    cw.write('baz')
    cw.set_current_file_name('baz.txt')
    return cw.finish()
def setUp(self, api=None):
    super(FuseMagicTest, self).setUp(api=api)

    cw = arvados.CollectionWriter()

    cw.start_new_file('thing1.txt')
    cw.write("data 1")

    self.testcollection = cw.finish()
    self.test_manifest = cw.manifest_text()
    coll = self.api.collections().create(
        body={"manifest_text": self.test_manifest}).execute()
    self.test_manifest_pdh = coll['portable_data_hash']
def setUp(self, api=None):
    super(FuseMagicTestPDHOnly, self).setUp(api=api)

    cw = arvados.CollectionWriter()

    cw.start_new_file('thing1.txt')
    cw.write("data 1")

    self.testcollection = cw.finish()
    self.test_manifest = cw.manifest_text()
    created = self.api.collections().create(
        body={"manifest_text": self.test_manifest}).execute()
    self.testcollectionuuid = str(created['uuid'])
def make_manifest(self,
                  bytes_per_block=1,
                  blocks_per_file=1,
                  files_per_stream=1,
                  streams=1):
    datablip = 'x' * bytes_per_block
    data_loc = tutil.str_keep_locator(datablip)
    with tutil.mock_keep_responses(data_loc, 200):
        coll = arvados.CollectionWriter()
        for si in range(0, streams):
            for fi in range(0, files_per_stream):
                with coll.open("stream{}/file{}.txt".format(si, fi)) as f:
                    for bi in range(0, blocks_per_file):
                        f.write(datablip)
        return coll.manifest_text()
def runTest(self):
    cw = arvados.CollectionWriter()
    self.assertEqual(cw.current_stream_name(), '.',
                     'current_stream_name() should be "." now')
    cw.set_current_file_name('foo.txt')
    cw.write('foo')
    self.assertEqual(cw.current_file_name(), 'foo.txt',
                     'current_file_name() should be foo.txt now')
    cw.start_new_file('bar.txt')
    cw.write('bar')
    cw.start_new_stream('baz')
    cw.write('baz')
    cw.set_current_file_name('baz.txt')
    hash = cw.finish()
    self.assertEqual(hash, '23ca013983d6239e98931cc779e68426+114',
                     'resulting manifest hash is not what I expected')
def setUp(self, api=None):
    super(FuseMagicTest, self).setUp(api=api)

    self.test_project = run_test_server.fixture('groups')['aproject']['uuid']
    self.non_project_group = run_test_server.fixture('groups')['public']['uuid']
    self.collection_in_test_project = run_test_server.fixture(
        'collections')['foo_collection_in_aproject']['name']

    cw = arvados.CollectionWriter()

    cw.start_new_file('thing1.txt')
    cw.write("data 1")

    self.testcollection = cw.finish()
    self.test_manifest = cw.manifest_text()
    coll = self.api.collections().create(
        body={"manifest_text": self.test_manifest}).execute()
    self.test_manifest_pdh = coll['portable_data_hash']
def runTest(self):
    cw = arvados.CollectionWriter()
    self.assertEqual(cw.current_stream_name(), '.',
                     'current_stream_name() should be "." now')
    cw.set_current_file_name('foo.txt')
    cw.write('foo')
    self.assertEqual(cw.current_file_name(), 'foo.txt',
                     'current_file_name() should be foo.txt now')
    cw.start_new_file('bar.txt')
    cw.write('bar')
    cw.start_new_stream('baz')
    cw.write('baz')
    cw.set_current_file_name('baz.txt')
    hash = cw.finish()
    self.assertEqual(
        hash, 'd6c3b8e571f1b81ebb150a45ed06c884+114',
        "resulting manifest hash was {0}, expecting d6c3b8e571f1b81ebb150a45ed06c884+114"
        .format(hash))
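# --- Added illustration (not from the original test suite) ---
# A minimal, hedged sketch of the CollectionWriter pattern exercised by the
# two runTest() examples above: name a stream and a file, write data, then
# call finish() to get the manifest locator. Assumes a reachable Arvados API
# server and Keep; the stream/file names below are made up for illustration.
import arvados

cw = arvados.CollectionWriter()
cw.start_new_stream('mydir')        # becomes a subdirectory in the collection
cw.start_new_file('hello.txt')
cw.write('hello world\n')
locator = cw.finish()               # manifest locator, e.g. "<md5>+<size>"
print cw.manifest_text()            # the manifest text that was committed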
def setUp(self):
    super(FuseMountTest, self).setUp()

    cw = arvados.CollectionWriter()

    cw.start_new_file('thing1.txt')
    cw.write("data 1")

    cw.start_new_file('thing2.txt')
    cw.write("data 2")

    cw.start_new_stream('dir1')
    cw.start_new_file('thing3.txt')
    cw.write("data 3")

    cw.start_new_file('thing4.txt')
    cw.write("data 4")

    cw.start_new_stream('dir2')
    cw.start_new_file('thing5.txt')
    cw.write("data 5")

    cw.start_new_file('thing6.txt')
    cw.write("data 6")

    cw.start_new_stream('dir2/dir3')
    cw.start_new_file('thing7.txt')
    cw.write("data 7")

    cw.start_new_file('thing8.txt')
    cw.write("data 8")

    cw.start_new_stream('edgecases')
    for f in ":/.../-/*/\x01\\/ ".split("/"):
        cw.start_new_file(f)
        cw.write('x')

    for f in ":/.../-/*/\x01\\/ ".split("/"):
        cw.start_new_stream('edgecases/dirs/' + f)
        cw.start_new_file('x/x')
        cw.write('x')

    self.testcollection = cw.finish()
    self.api.collections().create(body={"manifest_text": cw.manifest_text()}).execute()
def runTest(self):
    n_lines_in = 2**18
    data_in = "abc\n"
    for x in xrange(0, 18):
        data_in += data_in
    compressed_data_in = bz2.compress(data_in)

    cw = arvados.CollectionWriter()
    cw.start_new_file('test.bz2')
    cw.write(compressed_data_in)
    bz2_manifest = cw.manifest_text()

    cr = arvados.CollectionReader(bz2_manifest)

    got = 0
    for x in list(cr.all_files())[0].readlines():
        self.assertEqual(x, "abc\n",
                         "decompression returned wrong data: %s" % x)
        got += 1
    self.assertEqual(
        got, n_lines_in,
        "decompression returned %d lines instead of %d" % (got, n_lines_in))
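# --- Added illustration (not from the original test suite) ---
# A hedged round-trip sketch based on the bz2 test above: write a small file
# with CollectionWriter, then read it back through CollectionReader using only
# the manifest text. Assumes a reachable Keep service to store and fetch the
# data block; the file name and contents are placeholders.
import arvados

cw = arvados.CollectionWriter()
cw.start_new_file('greeting.txt')
cw.write("hello\n")
manifest = cw.manifest_text()       # flushes the data to Keep

cr = arvados.CollectionReader(manifest)
for line in list(cr.all_files())[0].readlines():
    print line.rstrip()             # prints "hello"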
def main(): this_job = arvados.current_job() # Setup sub tasks 1-N (and terminate if this is task 0) one_task_per_cram_file(if_sequence=0, and_end_task=True) # Get object representing the current task this_task = arvados.current_task() # We will never reach this point if we are in the 0th task assert (this_task['sequence'] != 0) # Get reference FASTA ref_file = None print "Mounting reference FASTA collection" ref_dir = arvados.get_task_param_mount('ref') for f in arvados.util.listdir_recursive(ref_dir): if re.search(r'\.fa$', f): ref_file = os.path.join(ref_dir, f) if ref_file is None: raise InvalidArgumentError( "No reference fasta found in reference collection.") # Ensure we can read the reference file if not os.access(ref_file, os.R_OK): raise FileAccessError("reference FASTA file not readable: %s" % ref_file) # TODO: could check readability of .fai and .dict as well? # Get genome chunk intervals file chunk_file = None print "Mounting chunk collection" chunk_dir = arvados.get_task_param_mount('chunk') for f in arvados.util.listdir_recursive(chunk_dir): if re.search(r'\.region_list.txt$', f): chunk_file = os.path.join(chunk_dir, f) if chunk_file is None: raise InvalidArgumentError( "No chunk intervals file found in chunk collection.") # Ensure we can read the chunk file if not os.access(chunk_file, os.R_OK): raise FileAccessError("Chunk intervals file not readable: %s" % chunk_file) # Get single CRAM file for this task input_dir = None print "Mounting task input collection" input_dir = arvados.get_task_param_mount('input') input_cram_files = [] for f in arvados.util.listdir_recursive(input_dir): if re.search(r'\.cram$', f): stream_name, input_file_name = os.path.split(f) input_cram_files += [os.path.join(input_dir, f)] if len(input_cram_files) != 1: raise InvalidArgumentError("Expected exactly one cram file per task.") # There is only one CRAM file cram_file = input_cram_files[0] # Ensure we can read the CRAM file if not os.access(cram_file, os.R_OK): raise FileAccessError("CRAM file not readable: %s" % cram_file) # Ensure we have corresponding CRAI index and can read it as well cram_file_base, cram_file_ext = os.path.splitext(cram_file) assert (cram_file_ext == ".cram") crai_file = cram_file_base + ".crai" if not os.access(crai_file, os.R_OK): crai_file = cram_file_base + ".cram.crai" if not os.access(crai_file, os.R_OK): raise FileAccessError( "No readable CRAM index file for CRAM file: %s" % cram_file) # Will write to out_dir, make sure it is empty out_dir = os.path.join(arvados.current_task().tmpdir, 'out') if os.path.exists(out_dir): old_out_dir = out_dir + ".old" print "Moving out_dir %s out of the way (to %s)" % (out_dir, old_out_dir) try: os.rename(out_dir, old_out_dir) except: raise try: os.mkdir(out_dir) os.chdir(out_dir) except: raise # out_file = os.path.join(out_dir, os.path.basename(cram_file_base) + "." 
+ os.path.basename(chunk_file) + ".g.vcf.gz") config_file = os.path.join(arvados.current_task().tmpdir, "mpileup.conf") lock_file = os.path.join(arvados.current_task().tmpdir, "run-bt-mpileup.lock") if not os.path.exists(RUNNER_CONFIG_TEMPLATE): raise FileAccessError("No runner configuration template at %s" % RUNNER_CONFIG_TEMPLATE) # generate config runner_config_text = jinja2.Environment(loader=jinja2.FileSystemLoader( "/")).get_template(RUNNER_CONFIG_TEMPLATE).render( fasta_reference=ref_file, input_cram=cram_file, regions=chunk_file) with open(config_file, "wb") as fh: fh.write(runner_config_text) # report configuration print "Generated runner config to %s:\n%s" % (config_file, runner_config_text) # Call run-bt-mpileup runner_p = subprocess.Popen([ "run-bt-mpileup", "+config", config_file, "+js", "mpm", "+loop", "5", "+lock", lock_file, "-o", out_dir ], stdin=None, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True, shell=False) while runner_p.poll() is None: line = runner_p.stdout.readline() # only print '#### unfinished' lines or things that are errors or warnings if re.search(r'\d+\s+unfinished', line) or re.search( r'(FATAL|ERROR|WARNING)', line, flags=re.IGNORECASE): print "RUNNER: %s" % line.rstrip() runner_exit = runner_p.wait() if runner_exit != 0: print "WARNING: runner exited with exit code %s" % runner_exit # clean up out_dir try: os.remove(os.path.join(out_dir, "run-bt-mpileup.lock")) os.remove(os.path.join(out_dir, "mpileup.conf")) os.remove(os.path.join(out_dir, "cleaned-job-outputs.tgz")) except: print "WARNING: could not remove some output files!" pass out_bcf = os.path.join( out_dir, os.path.basename(cram_file_base) + "." + os.path.basename(chunk_file) + ".bcf") try: os.rename(os.path.join(out_dir, "all.bcf"), out_bcf) os.rename(os.path.join(out_dir, "all.bcf.csi"), out_bcf + ".csi") os.rename(os.path.join(out_dir, "all.bcf.filt.vchk"), out_bcf + ".filt.vchk") os.rename(os.path.join(out_dir, "all.bcf.vchk"), out_bcf + ".vchk") except: print "WARNING: could not rename some output files!" pass # Write a new collection as output out = arvados.CollectionWriter() # Write out_dir to keep out.write_directory_tree(out_dir, stream_name) # Commit the output to Keep. output_locator = out.finish() # Use the resulting locator as the output for this task. this_task.set_output(output_locator)
def main(arguments=None): args = arvrun_parser.parse_args(arguments) if len(args.args) == 0: arvrun_parser.print_help() return starting_args = args.args reading_into = 2 # Parse the command arguments into 'slots'. # All words following '>' are output arguments and are collected into slots[0]. # All words following '<' are input arguments and are collected into slots[1]. # slots[2..] store the parameters of each command in the pipeline. # # e.g. arv-run foo arg1 arg2 '|' bar arg3 arg4 '<' input1 input2 input3 '>' output.txt # will be parsed into: # [['output.txt'], # ['input1', 'input2', 'input3'], # ['foo', 'arg1', 'arg2'], # ['bar', 'arg3', 'arg4']] slots = [[], [], []] for c in args.args: if c.startswith('>'): reading_into = 0 if len(c) > 1: slots[reading_into].append(c[1:]) elif c.startswith('<'): reading_into = 1 if len(c) > 1: slots[reading_into].append(c[1:]) elif c == '|': reading_into = len(slots) slots.append([]) else: slots[reading_into].append(c) if slots[0] and len(slots[0]) > 1: logger.error("Can only specify a single stdout file (run-command substitutions are permitted)") return if not args.dry_run: api = arvados.api('v1') if args.project_uuid: project = args.project_uuid else: project = determine_project(os.getcwd(), api.users().current().execute()["uuid"]) # Identify input files. Look at each parameter and test to see if there is # a file by that name. This uses 'patterns' to look for within # command line arguments, such as --foo=file.txt or -lfile.txt patterns = [re.compile("([^=]+=)(.*)"), re.compile("(-[A-Za-z])(.+)")] for j, command in enumerate(slots[1:]): for i, a in enumerate(command): if j > 0 and i == 0: # j == 0 is stdin, j > 0 is commands # always skip program executable (i == 0) in commands pass elif a.startswith('\\'): # if it starts with a \ then don't do any interpretation command[i] = a[1:] else: # See if it looks like a file command[i] = statfile('', a) # If a file named command[i] was found, it would now be an # ArvFile or UploadFile. If command[i] is a basestring, that # means it doesn't correspond exactly to a file, so do some # pattern matching. if isinstance(command[i], basestring): for p in patterns: m = p.match(a) if m: command[i] = statfile(m.group(1), m.group(2)) break n = True pathprefix = "/" files = [c for command in slots[1:] for c in command if isinstance(c, UploadFile)] if len(files) > 0: # Find the smallest path prefix that includes all the files that need to be uploaded. # This starts at the root and iteratively removes common parent directory prefixes # until all file pathes no longer have a common parent. 
while n: pathstep = None for c in files: if pathstep is None: sp = c.fn.split('/') if len(sp) < 2: # no parent directories left n = False break # path step takes next directory pathstep = sp[0] + "/" else: # check if pathstep is common prefix for all files if not c.fn.startswith(pathstep): n = False break if n: # pathstep is common parent directory for all files, so remove the prefix # from each path pathprefix += pathstep for c in files: c.fn = c.fn[len(pathstep):] orgdir = os.getcwd() os.chdir(pathprefix) print("Upload local files: \"%s\"" % '" "'.join([c.fn for c in files])) if args.dry_run: print("$(input) is %s" % pathprefix.rstrip('/')) pdh = "$(input)" else: files = sorted(files, key=lambda x: x.fn) collection = arvados.CollectionWriter(api, num_retries=args.retries) stream = None for f in files: sp = os.path.split(f.fn) if sp[0] != stream: stream = sp[0] collection.start_new_stream(stream) collection.write_file(f.fn, sp[1]) item = api.collections().create(body={"owner_uuid": project, "manifest_text": collection.manifest_text()}).execute() pdh = item["portable_data_hash"] print "Uploaded to %s" % item["uuid"] for c in files: c.fn = "$(file %s/%s)" % (pdh, c.fn) os.chdir(orgdir) for i in xrange(1, len(slots)): slots[i] = [("%s%s" % (c.prefix, c.fn)) if isinstance(c, ArvFile) else c for c in slots[i]] component = { "script": "run-command", "script_version": args.script_version, "repository": args.repository, "script_parameters": { }, "runtime_constraints": { "docker_image": args.docker_image } } task_foreach = [] group_parser = argparse.ArgumentParser() group_parser.add_argument('-b', '--batch-size', type=int) group_parser.add_argument('args', nargs=argparse.REMAINDER) for s in xrange(2, len(slots)): for i in xrange(0, len(slots[s])): if slots[s][i] == '--': inp = "input%i" % (s-2) groupargs = group_parser.parse_args(slots[2][i+1:]) if groupargs.batch_size: component["script_parameters"][inp] = {"value": {"batch":groupargs.args, "size":groupargs.batch_size}} slots[s] = slots[s][0:i] + [{"foreach": inp, "command": "$(%s)" % inp}] else: component["script_parameters"][inp] = groupargs.args slots[s] = slots[s][0:i] + ["$(%s)" % inp] task_foreach.append(inp) break if slots[s][i] == '\--': slots[s][i] = '--' if slots[0]: component["script_parameters"]["task.stdout"] = slots[0][0] if slots[1]: task_foreach.append("stdin") component["script_parameters"]["stdin"] = slots[1] component["script_parameters"]["task.stdin"] = "$(stdin)" if task_foreach: component["script_parameters"]["task.foreach"] = task_foreach component["script_parameters"]["command"] = slots[2:] if args.ignore_rcode: component["script_parameters"]["task.ignore_rcode"] = args.ignore_rcode pipeline = { "name": "arv-run " + " | ".join([s[0] for s in slots[2:]]), "description": "@" + " ".join(starting_args) + "@", "components": { "command": component }, "state": "RunningOnClient" if args.local else "RunningOnServer" } if args.dry_run: print(json.dumps(pipeline, indent=4)) else: pipeline["owner_uuid"] = project pi = api.pipeline_instances().create(body=pipeline, ensure_unique_name=True).execute() print "Running pipeline %s" % pi["uuid"] if args.local: subprocess.call(["arv-run-pipeline-instance", "--instance", pi["uuid"], "--run-jobs-here"] + (["--no-reuse"] if args.no_reuse else [])) elif not args.no_wait: ws.main(["--pipeline", pi["uuid"]]) pi = api.pipeline_instances().get(uuid=pi["uuid"]).execute() print "Pipeline is %s" % pi["state"] if "output_uuid" in pi["components"]["command"]: print "Output is %s" % 
pi["components"]["command"]["output_uuid"] else: print "No output"
def main(): signal(SIGINT, sigint_handler) signal(SIGTERM, sigterm_handler) this_job = arvados.current_job() skip_sq_sn_regex = this_job['script_parameters']['skip_sq_sn_regex'] genome_chunks = int(this_job['script_parameters']['genome_chunks']) if genome_chunks < 1: raise InvalidArgumentError("genome_chunks must be a positive integer") # Setup sub tasks 1-N (and terminate if this is task 0) one_task_per_cram_file(if_sequence=0, and_end_task=True, skip_sq_sn_regex=skip_sq_sn_regex, genome_chunks=genome_chunks) # Get object representing the current task this_task = arvados.current_task() # We will never reach this point if we are in the 0th task assert (this_task['sequence'] != 0) # Get reference FASTA ref_file = None print "Mounting reference FASTA collection" ref_dir = arvados.get_task_param_mount('ref') for f in arvados.util.listdir_recursive(ref_dir): if re.search(r'\.fa$', f): ref_file = os.path.join(ref_dir, f) if ref_file is None: raise InvalidArgumentError( "No reference fasta found in reference collection.") # Ensure we can read the reference fasta test_and_prime_input_file( ref_file, error_exception=FileAccessError("reference fasta not readable: %s" % ref_file)) # Ensure we have corresponding .fai, and that it is also readable ref_fai_file = ref_file + ".fai" test_and_prime_input_file( ref_fai_file, error_exception=FileAccessError( "reference fai index not readable: %s" % ref_fai_file)) # Get genome chunk intervals file chunk_file = None print "Mounting chunk collection" chunk_dir = arvados.get_task_param_mount('chunk') for f in arvados.util.listdir_recursive(chunk_dir): if re.search(r'\.region_list$', f): chunk_file = os.path.join(chunk_dir, f) if chunk_file is None: raise InvalidArgumentError( "No chunk intervals file found in chunk collection.") # Ensure we can read the chunk file test_and_prime_input_file( chunk_file, error_exception=FileAccessError( "Chunk intervals file not readable: %s" % chunk_file)) # Get single CRAM file for this task input_dir = None print "Mounting task input collection" input_dir = arvados.get_task_param_mount('input') input_cram_files = [] stream_name = "" for f in arvados.util.listdir_recursive(input_dir): if re.search(r'\.cram$', f): stream_name, input_file_name = os.path.split(f) input_cram_files += [os.path.join(input_dir, f)] if len(input_cram_files) != 1: raise InvalidArgumentError("Expected exactly one cram file per task.") # There is only one CRAM file cram_file = input_cram_files[0] # Ensure we can read the CRAM file test_and_prime_input_file(cram_file, error_exception=FileAccessError( "CRAM file not readable: %s" % cram_file)) # Ensure we have corresponding CRAI index and can read it as well cram_file_base, cram_file_ext = os.path.splitext(cram_file) assert (cram_file_ext == ".cram") crai_file = cram_file_base + ".crai" if not test_and_prime_input_file(crai_file, error_exception=None): crai_file = cram_file_base + ".cram.crai" if not test_and_prime_input_file(crai_file, error_exception=None): raise FileAccessError( "No readable CRAM index file for CRAM file: %s" % cram_file) # Will write to out_dir, make sure it is empty tmp_dir = arvados.current_task().tmpdir out_dir = os.path.join(tmp_dir, 'out') if os.path.exists(out_dir): old_out_dir = out_dir + ".old" print "Moving out_dir %s out of the way (to %s)" % (out_dir, old_out_dir) try: os.rename(out_dir, old_out_dir) except: raise try: os.mkdir(out_dir) os.chdir(out_dir) except: raise output_basename = os.path.basename( cram_file_base) + "." 
+ os.path.basename(chunk_file) out_file_tmp = os.path.join(tmp_dir, output_basename + ".g.vcf.tmp") penultimate_out_file = os.path.join( tmp_dir, output_basename + ".provheader.g.vcf.gz") final_out_file = os.path.join(out_dir, output_basename + ".g.vcf.gz") # bash_cmd_pipe = "samtools view -h -u -@ 1 -T %s %s | bcftools mpileup -t AD,INFO/AD -C50 -pm2 -F0.1 -d10000 --gvcf 1,2,3,4,5,10,15 -f %s -Ou - | bcftools view -Ou | bcftools norm -f %s -Ob -o %s" % (ref_file, cram_file, ref_file, ref_file, out_file) regions = [] print "Preparing region list from chunk file [%s]" % chunk_file with open(chunk_file, 'r') as f: for line in f.readlines(): (chr, start, end) = line.rstrip().split() region = "%s:%s-%s" % (chr, start, end) regions.append(region) total_region_count = len(regions) print "Preparing fifos for output from %s bcftools mpileup commands (one for each region) to bcftools concat" % total_region_count concat_noheader_fifos = dict() concat_headeronly_tmps = dict() current_region_num = 0 for region in regions: current_region_num += 1 concat_noheader_fifo = os.path.join( tmp_dir, output_basename + (".part_%s_of_%s.g.vcf" % (current_region_num, total_region_count))) try: os.mkfifo(concat_noheader_fifo, 0600) except: print "ERROR: could not mkfifo %s" % concat_noheader_fifo raise fifos_to_delete.append(concat_noheader_fifo) concat_noheader_fifos[region] = concat_noheader_fifo concat_headeronly_tmp = os.path.join( tmp_dir, output_basename + (".part_%s_of_%s.headeronly.g.vcf.gz" % (current_region_num, total_region_count))) concat_headeronly_tmps[region] = concat_headeronly_tmp region_concat_cmd = ["cat"] region_concat_cmd.extend( [concat_noheader_fifos[region] for region in regions]) # open file for output file out_file_tmp_f = open(out_file_tmp, 'wb') region_concat_p = run_child_cmd(region_concat_cmd, stdout=out_file_tmp_f, tag="bcftools concat (stderr)") current_region_num = 0 current_concat_noheader_fifo_f = None regions_to_process = list(regions) bcftools_mpileup_p = None bcftools_norm_p = None part_tee_p = None bcftools_view_headeronly_p = None bcftools_view_noheader_p = None while True: # at least one of the regional aggregation processes is still running watch_fds_and_print_output() if ((bcftools_mpileup_p is None) and (bcftools_norm_p is None) and (part_tee_p is None) and (bcftools_view_headeronly_p is None) and (bcftools_view_noheader_p is None)): # no per-region processes are running (they have finished or # have not yet started) if len(regions_to_process) > 0: # have more regions to run region = regions_to_process.pop(0) current_region_num += 1 region_label = "%s/%s [%s]" % (current_region_num, total_region_count, region) concat_noheader_fifo = concat_noheader_fifos[region] bcftools_view_noheader_input_fifo = os.path.join( tmp_dir, output_basename + (".part_%s_of_%s.noheader.g.bcf" % (current_region_num, total_region_count))) part_tee_cmd = [ "teepot", bcftools_view_noheader_input_fifo, "-" ] bcftools_view_noheader_cmd = [ "bcftools", "view", "-H", "-Ov", bcftools_view_noheader_input_fifo ] concat_headeronly_tmp = concat_headeronly_tmps[region] bcftools_view_headeronly_cmd = [ "bcftools", "view", "-h", "-Oz", "-o", concat_headeronly_tmp ] bcftools_norm_cmd = ["bcftools", "norm", "-f", ref_file, "-Ou"] bcftools_mpileup_cmd = [ "bcftools-gvcf", "mpileup", "-t", "AD,INFO/AD", "-C50", "-pm2", "-F0.1", "-d10000", "--gvcf", "1,2,3,4,5,10,15", "-f", ref_file, "-Ou", "-r", region, cram_file ] print "Creating 'bcftools mpileup | bcftools norm' pipe for region %s" % ( region_label) 
bcftools_norm_stdin_pipe_read, bcftools_norm_stdin_pipe_write = os.pipe( ) print "Creating 'bcftools norm | tee' pipe for region %s" % ( region_label) part_tee_stdin_pipe_read, part_tee_stdin_pipe_write = os.pipe() print "Creating 'tee | bcftools view -h' pipe for region %s" % ( region_label) bcftools_view_headeronly_stdin_pipe_read, bcftools_view_headeronly_stdin_pipe_write = os.pipe( ) print "Creating 'tee | bcftools view' named pipe [%s] for region %s" % ( bcftools_view_noheader_input_fifo, region_label) try: os.mkfifo(bcftools_view_noheader_input_fifo, 0600) except: print "ERROR: could not mkfifo %s" % bcftools_view_noheader_input_fifo raise fifos_to_delete.append(bcftools_view_noheader_input_fifo) print "Opening concat fifo %s for writing" % concat_noheader_fifo if current_concat_noheader_fifo_f is not None: #print "ERROR: current_concat_noheader_fifo_f was not closed properly" #raise Exception("current_concat_noheader_fifo_f was not closed properly") current_concat_noheader_fifo_f.close() current_concat_noheader_fifo_f = open(concat_noheader_fifo, 'wb') bcftools_mpileup_p = run_child_cmd( bcftools_mpileup_cmd, stdout=bcftools_norm_stdin_pipe_write, tag="bcftools mpileup %s" % (region_label)) bcftools_norm_p = run_child_cmd( bcftools_norm_cmd, stdin=bcftools_norm_stdin_pipe_read, stdout=part_tee_stdin_pipe_write, tag="bcftools norm %s" % (region_label)) part_tee_p = run_child_cmd( part_tee_cmd, stdin=part_tee_stdin_pipe_read, stdout=bcftools_view_headeronly_stdin_pipe_write, tag="tee %s" % (region_label)) bcftools_view_headeronly_p = run_child_cmd( bcftools_view_headeronly_cmd, stdin=bcftools_view_headeronly_stdin_pipe_read, tag="bcftools view -h %s" % (region_label)) bcftools_view_noheader_p = run_child_cmd( bcftools_view_noheader_cmd, stdout=current_concat_noheader_fifo_f, tag="bcftools view %s" % (region_label)) bcftools_mpileup_p = close_process_if_finished( bcftools_mpileup_p, "bcftools mpileup %s" % (region_label), close_fds=[bcftools_norm_stdin_pipe_write]) bcftools_norm_p = close_process_if_finished( bcftools_norm_p, "bcftools norm %s" % (region_label), close_fds=[ bcftools_norm_stdin_pipe_read, part_tee_stdin_pipe_write ]) part_tee_p = close_process_if_finished( part_tee_p, "tee %s" % (region_label), close_fds=[ part_tee_stdin_pipe_read, bcftools_view_headeronly_stdin_pipe_write ], ignore_error=True) bcftools_view_headeronly_p = close_process_if_finished( bcftools_view_headeronly_p, "bcftools view -h %s" % (region_label), close_fds=[bcftools_view_headeronly_stdin_pipe_read]) bcftools_view_noheader_p = close_process_if_finished( bcftools_view_noheader_p, "bcftools view %s" % (region_label), close_files=[current_concat_noheader_fifo_f]) region_concat_p = close_process_if_finished( region_concat_p, "bcftools concat", close_files=[out_file_tmp_f]) # end loop once all processes have finished if ((region_concat_p is None) and (bcftools_view_noheader_p is None) and (bcftools_view_headeronly_p is None) and (part_tee_p is None) and (bcftools_norm_p is None) and (bcftools_mpileup_p is None)): print "All region work has completed" break else: sleep(0.01) # continue to next loop iteration if len(child_pids) > 0: print "WARNING: some children are still alive: [%s]" % (child_pids) for pid in child_pids: print "Attempting to terminate %s forcefully" % (pid) try: os.kill(pid, SIGTERM) except Exception as e: print "Could not kill pid %s: %s" % (pid, e) for fifo in fifos_to_delete: try: os.remove(fifo) except: raise concat_headeronly_tmp_fofn = os.path.join(tmp_dir, output_basename + 
".fifos_fofn") tmp_files_to_delete = [] print "Preparing fofn for bcftools concat (headeronly): %s" % ( concat_headeronly_tmp_fofn) with open(concat_headeronly_tmp_fofn, 'w') as f: print "Checking files for regions: %s" % regions for concat_headeronly_tmp in [ concat_headeronly_tmps[region] for region in regions ]: if os.path.exists(concat_headeronly_tmp): print "Adding %s to fofn" % concat_headeronly_tmp f.write("%s\n" % concat_headeronly_tmp) tmp_files_to_delete.append(concat_headeronly_tmp) else: print "WARNING: no output file for %s (there was probably no data in the region)" % concat_headeronly_tmp final_headeronly_tmp = os.path.join(tmp_dir, output_basename + ".headeronly.g.vcf") final_headeronly_tmp_f = open(final_headeronly_tmp, 'wb') print "Creating 'bcftools concat | grep' pipe" grep_headeronly_stdin_pipe_read, grep_headeronly_stdin_pipe_write = os.pipe( ) grep_headeronly_cmd = [ "egrep", "-v", "^[#][#](bcftools|mpileup|reference)" ] grep_headeronly_p = run_child_cmd(grep_headeronly_cmd, stdin=grep_headeronly_stdin_pipe_read, stdout=final_headeronly_tmp_f, tag="grep (headeronly)") bcftools_concat_headeronly_cmd = [ "bcftools", "concat", "-Ov", "-f", concat_headeronly_tmp_fofn ] bcftools_concat_headeronly_p = run_child_cmd( bcftools_concat_headeronly_cmd, stdout=grep_headeronly_stdin_pipe_write, tag="bcftools concat (headeronly)") while True: watch_fds_and_print_output() bcftools_concat_headeronly_p = close_process_if_finished( bcftools_concat_headeronly_p, "bcftools concat (headeronly)", close_fds=[grep_headeronly_stdin_pipe_write]) grep_headeronly_p = close_process_if_finished( grep_headeronly_p, "grep (headeronly)", close_fds=[grep_headeronly_stdin_pipe_read], close_files=[final_headeronly_tmp_f]) if ((bcftools_concat_headeronly_p is None) and (grep_headeronly_p is None)): # none of the processes are still running, we're done! break else: sleep(0.01) # continue to next loop iteration if bcftools_concat_headeronly_p is not None: print "ERROR: failed to cleanly terminate bcftools concat (headeronly)" if grep_headeronly_p is not None: print "ERROR: failed to cleanly terminate grep (headeronly)" # check if there was any data output if os.stat(out_file_tmp)[6] == 0: # 0-byte data file, there is no point in concatenating and # reheader will reject the file, so we need to bgzip it ourselves print "Handling 0-byte data file - compressing headeronly vcf with bgzip to create [%s]" % ( final_out_file) final_out_file_f = open(final_out_file, 'wb') final_bgzip_cmd = ["bgzip", "-c", final_headeronly_tmp] final_bgzip_p = run_child_cmd(final_bgzip_cmd, tag="final bgzip", stdout=final_out_file_f) while True: watch_fds_and_print_output() final_bgzip_p = close_process_if_finished( final_bgzip_p, "final bgzip", close_files=[final_out_file_f]) if (final_bgzip_p is None): # none of the processes are still running, we're done! 
break else: sleep(0.01) # continue to next loop iteration if final_bgzip_p is not None: print "ERROR: failed to cleanly terminate final bgzip (header with no data)" else: # there is some data in the data file print "Creating final 'cat | bcftools view -Oz' pipe" final_bcftools_view_stdin_pipe_read, final_bcftools_view_stdin_pipe_write = os.pipe( ) print "Preparing penultimate output file [%s]" % (penultimate_out_file) final_bcftools_view_cmd = [ "bcftools", "view", "-Oz", "-o", penultimate_out_file ] final_concat_cmd = ["cat", final_headeronly_tmp, out_file_tmp] final_bcftools_view_p = run_child_cmd( final_bcftools_view_cmd, tag="final bcftools view -Oz", stdin=final_bcftools_view_stdin_pipe_read) final_concat_p = run_child_cmd( final_concat_cmd, tag="final cat (header+data)", stdout=final_bcftools_view_stdin_pipe_write) while True: watch_fds_and_print_output() final_bcftools_view_p = close_process_if_finished( final_bcftools_view_p, "final bcftools view -Oz", close_fds=[final_bcftools_view_stdin_pipe_read]) final_concat_p = close_process_if_finished( final_concat_p, "final cat (header+data)", close_fds=[final_bcftools_view_stdin_pipe_write]) if ((final_concat_p is None) and (final_bcftools_view_p is None)): # none of the processes are still running, we're done! break else: sleep(0.01) # continue to next loop iteration if final_bcftools_view_p is not None: print "ERROR: failed to cleanly terminate final bcftools view -Oz" if final_concat_p is not None: print "ERROR: failed to cleanly terminate final cat (header+data)" print "Reheadering penultimate output file into final out file [%s]" % ( final_out_file) final_bcftools_reheader_cmd = [ "bcftools", "reheader", "-h", final_headeronly_tmp, "-o", final_out_file, penultimate_out_file ] final_bcftools_reheader_p = run_child_cmd( final_bcftools_reheader_cmd, tag="final bcftools reheader") while True: watch_fds_and_print_output() final_bcftools_reheader_p = close_process_if_finished( final_bcftools_reheader_p, "final bcftools reheader") if (final_bcftools_reheader_p is None): # none of the processes are still running, we're done! break else: sleep(0.01) # continue to next loop iteration if final_bcftools_reheader_p is not None: print "ERROR: failed to cleanly terminate final bcftools reheader" os.remove(penultimate_out_file) print "Indexing final output file [%s]" % (final_out_file) bcftools_index_cmd = ["bcftools", "index", final_out_file] bcftools_index_p = run_child_cmd(bcftools_index_cmd, tag="bcftools index") while True: watch_fds_and_print_output() bcftools_index_p = close_process_if_finished(bcftools_index_p, "bcftools index") if (bcftools_index_p is None): break else: sleep(0.01) # continue to next loop iteration if bcftools_index_p is not None: print "ERROR: failed to cleanly terminate bcftools index" print "Complete, removing temporary files" os.remove(concat_headeronly_tmp_fofn) os.remove(out_file_tmp) os.remove(final_headeronly_tmp) for tmp_file in tmp_files_to_delete: os.remove(tmp_file) # Write a new collection as output out = arvados.CollectionWriter() # Write out_dir to keep print "Writing Keep Collection from [%s] to [%s]" % (out_dir, stream_name) out.write_directory_tree(out_dir, stream_name) # Commit the output to Keep. output_locator = out.finish() print "Task output locator [%s]" % output_locator # Use the resulting locator as the output for this task. this_task.set_output(output_locator) # Done! print "Task complete!"
of.write("directory structure:\n" + lsinfo) dfinfo = sp.check_output(["df", "-h"]) of.write("df:\n" + dfinfo) meminfo = sp.check_output(["free", "-hm"]) of.write("mem:\n" + meminfo) hostinfo = sp.check_output(["hostname"]) of.write("host: " + hostinfo) job = arv.current_job() task = arv.current_task() of = arv.CollectionWriter() of.set_current_file_name("info.log") whoinfo = sp.check_output(["whoami"]) of.write("user: "******"\n") pwdinfo = sp.check_output(["pwd"]) of.write("pwd: " + pwdinfo + "\n") lsinfo = sp.check_output(["ls", "-lahR"]) of.write("directory structure:\n" + lsinfo) dfinfo = sp.check_output(["df", "-h"]) of.write("df:\n" + dfinfo + "\n") meminfo = sp.check_output(["free", "-hm"])
help="List of keys to use for each TSV file", action="append") args = parser.parse_args() ds, ds_path = find_subjects_neuro_data_lite.install_dataset( args.dataset_name, args.output_dir) _ign, complete_subjects, _ign, _ign = find_subjects_behavior_data.get_data( args.behavior_files, args.behavior_keys) complete_subjects, datatypes = find_subjects_neuro_data_lite.get_type_neuro_data( args.output_dir, subjects=complete_subjects, add_unknown_subjects=False) if args.get_data: subject_pdh_uuid = [] subjects = sorted([s for s in complete_subjects]) for subject in subjects: out = arvados.CollectionWriter() subject_get_data_path = os.path.join(ds_path, subject) find_subjects_neuro_data_lite.get_dataset_data( ds, subject_get_data_path, parallelized=args.ncores) print("Writing %s to new collection" % (subject_get_data_path)) out.write_directory_tree(subject_get_data_path) out.finish() print("Written Collection: \n\tPDH: %s\n\n" % (out.portable_data_hash())) collection_body = { "collection": { "name": "%s MPI-Leipzig data" % (subject), "owner_uuid": args.project_uuid, "portable_data_hash": out.portable_data_hash(), "manifest_text": out.manifest_text() }
def uploadfiles(files, api, dry_run=False, num_retries=0, project=None,
                fnPattern="$(file %s/%s)", name=None):
    # Find the smallest path prefix that includes all the files that need to be uploaded.
    # This starts at the root and iteratively removes common parent directory prefixes
    # until all file paths no longer have a common parent.
    n = True
    pathprefix = "/"
    while n:
        pathstep = None
        for c in files:
            if pathstep is None:
                sp = c.fn.split('/')
                if len(sp) < 2:
                    # no parent directories left
                    n = False
                    break
                # path step takes next directory
                pathstep = sp[0] + "/"
            else:
                # check if pathstep is common prefix for all files
                if not c.fn.startswith(pathstep):
                    n = False
                    break
        if n:
            # pathstep is common parent directory for all files, so remove the prefix
            # from each path
            pathprefix += pathstep
            for c in files:
                c.fn = c.fn[len(pathstep):]

    orgdir = os.getcwd()
    os.chdir(pathprefix)

    logger.info("Upload local files: \"%s\"", '" "'.join([c.fn for c in files]))

    if dry_run:
        logger.info("$(input) is %s", pathprefix.rstrip('/'))
        pdh = "$(input)"
    else:
        files = sorted(files, key=lambda x: x.fn)
        collection = arvados.CollectionWriter(api, num_retries=num_retries)
        stream = None
        for f in files:
            sp = os.path.split(f.fn)
            if sp[0] != stream:
                stream = sp[0]
                collection.start_new_stream(stream)
            collection.write_file(f.fn, sp[1])

        exists = api.collections().list(filters=[
            ["owner_uuid", "=", project],
            ["portable_data_hash", "=", collection.portable_data_hash()],
            ["name", "=", name]
        ]).execute(num_retries=num_retries)
        if exists["items"]:
            item = exists["items"][0]
            logger.info("Using collection %s", item["uuid"])
        else:
            body = {
                "owner_uuid": project,
                "manifest_text": collection.manifest_text()
            }
            if name is not None:
                body["name"] = name
            item = api.collections().create(body=body, ensure_unique_name=True).execute()
            logger.info("Uploaded to %s", item["uuid"])

        pdh = item["portable_data_hash"]

    for c in files:
        c.keepref = "%s/%s" % (pdh, c.fn)
        c.fn = fnPattern % (pdh, c.fn)

    os.chdir(orgdir)
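# --- Added illustration (not from the original source) ---
# A hedged usage sketch for uploadfiles() above. Real callers pass UploadFile
# objects from the arv-run machinery; "LocalFile" here is a hypothetical
# stand-in providing only the .fn attribute the function reads and rewrites.
# The absolute paths and project UUID are placeholders.
import arvados

class LocalFile(object):                        # hypothetical stand-in class
    def __init__(self, fn):
        self.fn = fn                            # absolute path of a local file

api = arvados.api('v1')
files = [LocalFile('/home/user/data/a.txt'),    # placeholder paths
         LocalFile('/home/user/data/b.txt')]
uploadfiles(files, api, num_retries=3,
            project='zzzzz-j7d0g-0123456789abcde',   # placeholder project UUID
            name='uploadfiles example')
for f in files:
    # f.fn is rewritten to "$(file <pdh>/a.txt)" etc.;
    # f.keepref now holds "<pdh>/a.txt"
    print f.fn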
def main(): ################################################################################ # Phase I: Check inputs and setup sub tasks 1-N to process group(s) based on # applying the capturing group named "group_by" in group_by_regex. # (and terminate if this is task 0) ################################################################################ ref_input_pdh = gatk_helper.prepare_gatk_reference_collection( reference_coll=arvados.current_job()['script_parameters'] ['reference_collection']) job_input_pdh = arvados.current_job( )['script_parameters']['inputs_collection'] interval_lists_pdh = arvados.current_job( )['script_parameters']['interval_lists_collection'] interval_count = 1 if "interval_count" in arvados.current_job()['script_parameters']: interval_count = arvados.current_job( )['script_parameters']['interval_count'] # Setup sub tasks 1-N (and terminate if this is task 0) hgi_arvados.one_task_per_group_and_per_n_gvcfs(ref_input_pdh, job_input_pdh, interval_lists_pdh, group_by_regex, max_gvcfs_to_combine, if_sequence=0, and_end_task=True) # Get object representing the current task this_task = arvados.current_task() # We will never reach this point if we are in the 0th task sequence assert (this_task['sequence'] > 0) ################################################################################ # Phase II: Read interval_list and split into additional intervals ################################################################################ hgi_arvados.one_task_per_interval( interval_count, validate_task_output, reuse_tasks=True, oldest_git_commit_to_reuse="1f6e1e0b8bb12c573dd253d7900ef55305d55aa1", if_sequence=1, and_end_task=True) # We will never reach this point if we are in the 1st task sequence assert (this_task['sequence'] > 1) ################################################################################ # Phase IIIa: If we are a "reuse" task, just set our output and be done with it ################################################################################ if 'reuse_job_task' in this_task['parameters']: print "This task's work was already done by JobTask %s" % this_task[ 'parameters']['reuse_job_task'] exit(0) ################################################################################ # Phase IIIb: Combine gVCFs! ################################################################################ ref_file = gatk_helper.mount_gatk_reference(ref_param="ref") gvcf_files = gatk_helper.mount_gatk_gvcf_inputs(inputs_param="inputs") out_dir = hgi_arvados.prepare_out_dir() name = this_task['parameters'].get('name') if not name: name = "unknown" interval_str = this_task['parameters'].get('interval') if not interval_str: interval_str = "" interval_strs = interval_str.split() intervals = [] for interval in interval_strs: intervals.extend(["--intervals", interval]) out_file = name + ".vcf.gz" if interval_count > 1: out_file = name + "." + '_'.join(interval_strs) + ".vcf.gz" if len(out_file) > 255: out_file = name + "." + '_'.join( [interval_strs[0], interval_strs[-1]]) + ".vcf.gz" print "Output file name was too long with full interval list, shortened it to: %s" % out_file if len(out_file) > 255: raise errors.InvalidArgumentError( "Output file name is too long, cannot continue: %s" % out_file) # because of a GATK bug, name cannot contain the string '.bcf' anywhere within it or we will get BCF output out_file = out_file.replace(".bcf", "._cf") # CombineGVCFs! 
extra_args = intervals extra_args.extend(["--breakBandsAtMultiplesOf", "1000000"]) gatk_exit = gatk.combine_gvcfs(ref_file, gvcf_files, os.path.join(out_dir, out_file), extra_gatk_args=extra_args) if gatk_exit != 0: print "WARNING: GATK exited with exit code %s (NOT WRITING OUTPUT)" % gatk_exit arvados.api().job_tasks().update(uuid=this_task['uuid'], body={ 'success': False }).execute() else: print "GATK exited successfully, writing output to keep" # Write a new collection as output out = arvados.CollectionWriter() # Write out_dir to keep out.write_directory_tree(out_dir) # Commit the output to Keep. output_locator = out.finish() if validate_task_output(output_locator): print "Task output validated, setting output to %s" % ( output_locator) # Use the resulting locator as the output for this task. this_task.set_output(output_locator) else: print "ERROR: Failed to validate task output (%s)" % ( output_locator) arvados.api().job_tasks().update(uuid=this_task['uuid'], body={ 'success': False }).execute()
def main(): ################################################################################ # Phase I: Check inputs and setup sub tasks 1-N to process group(s) based on # applying the capturing group named "group_by" in group_by_regex. # (and terminate if this is task 0) ################################################################################ ref_input_pdh = gatk_helper.prepare_gatk_reference_collection( reference_coll=arvados.current_job()['script_parameters'] ['reference_collection']) job_input_pdh = arvados.current_job( )['script_parameters']['inputs_collection'] interval_lists_pdh = arvados.current_job( )['script_parameters']['interval_lists_collection'] interval_count = 1 if "interval_count" in arvados.current_job()['script_parameters']: interval_count = arvados.current_job( )['script_parameters']['interval_count'] if arvados.current_task()['sequence'] == 0: # get candidates for task reuse task_key_params = [ 'inputs', 'ref', 'name' ] # N.B. inputs collection includes input vcfs and corresponding interval_list script = "gatk-genotypegvcfs.py" oldest_git_commit_to_reuse = '6ca726fc265f9e55765bf1fdf71b86285b8a0ff2' job_filters = [ ['script', '=', script], ['repository', '=', arvados.current_job()['repository']], ['script_version', 'in git', oldest_git_commit_to_reuse], [ 'docker_image_locator', 'in docker', arvados.current_job()['docker_image_locator'] ], ] # retrieve a full set of all possible reusable tasks at sequence 1 print "Retrieving all potentially reusable tasks" reusable_tasks = hgi_arvados.get_reusable_tasks( 1, task_key_params, job_filters) print "Have %s tasks for potential reuse" % (len(reusable_tasks)) def create_task_with_validated_reuse(sequence, params): return hgi_arvados.create_or_reuse_task(sequence, params, reusable_tasks, task_key_params, validate_task_output) # Setup sub tasks (and terminate if this is task 0) hgi_arvados.one_task_per_group_combined_inputs( ref_input_pdh, job_input_pdh, interval_lists_pdh, group_by_regex, if_sequence=0, and_end_task=True, create_task_func=create_task_with_validated_reuse) # Get object representing the current task this_task = arvados.current_task() # We will never reach this point if we are in the 0th task sequence assert (this_task['sequence'] > 0) ################################################################################ # Phase IIa: If we are a "reuse" task, just set our output and be done with it ################################################################################ if 'reuse_job_task' in this_task['parameters']: print "This task's work was already done by JobTask %s" % this_task[ 'parameters']['reuse_job_task'] exit(0) ################################################################################ # Phase IIb: Genotype gVCFs! ################################################################################ ref_file = gatk_helper.mount_gatk_reference(ref_param="ref") gvcf_files = gatk_helper.mount_gatk_gvcf_inputs(inputs_param="inputs") out_dir = hgi_arvados.prepare_out_dir() interval_list_file = gatk_helper.mount_single_gatk_interval_list_input( interval_list_param="inputs") name = this_task['parameters'].get('name') if not name: name = "unknown" out_file = name + ".vcf.gz" # because of a GATK bug, name cannot contain the string '.bcf' anywhere within it or we will get BCF output out_file = out_file.replace(".bcf", "._cf") # GenotypeGVCFs! 
gatk_exit = gatk.genotype_gvcfs(ref_file, interval_list_file, gvcf_files, os.path.join(out_dir, out_file), cores="4", java_mem="19g") if gatk_exit != 0: print "WARNING: GATK exited with exit code %s (NOT WRITING OUTPUT)" % gatk_exit arvados.api().job_tasks().update(uuid=this_task['uuid'], body={ 'success': False }).execute() else: print "GATK exited successfully, writing output to keep" # Write a new collection as output out = arvados.CollectionWriter() # Write out_dir to keep out.write_directory_tree(out_dir) # Commit the output to Keep. output_locator = out.finish() if validate_task_output(output_locator): print "Task output validated, setting output to %s" % ( output_locator) # Use the resulting locator as the output for this task. this_task.set_output(output_locator) else: print "ERROR: Failed to validate task output (%s)" % ( output_locator) arvados.api().job_tasks().update(uuid=this_task['uuid'], body={ 'success': False }).execute()
    CURR_TILEVARS[tile_int].append(len(TILEVARS_TO_WRITE))
    ignore, phaseA, phaseB, HG19_GENOME_VARIANTS, GENOME_VARIANT_ID, var_to_append = retval
    HUMAN_SEQ_PHASEA.extend(phaseA)
    HUMAN_SEQ_PHASEB.extend(phaseB)
    TILEVARS_TO_WRITE.append(var_to_append)
    return True

########################################################################################################################
#Set-up files to write out to
GENOME_VARIANT_FILE = 'genomevariant.csv'
GENOME_TRANSLATION_FILE = 'genomevarianttranslation.csv'
NEW_TILEVARIANT_FILE = 'tilevariant.csv'

# Write a new collection as output
out = arvados.CollectionWriter(num_retries=NUM_RETRIES)

#Parallelize the job according to paths and paths only => use library collection as the main input!
arvados.job_setup.one_task_per_input_file(if_sequence=0, and_end_task=True, input_as_path=True)

# Get object representing the current task
this_task = arvados.current_task()

# Get the input file for the task
input_id, library_input_path = this_task['parameters']['input'].split('/', 1)

#open the input collection (containing library files, tilelocusannotation.csv, tilevariant.csv, and possibly genomevariant.csv)
library_input_collection = arvados.CollectionReader(input_id)

#only do work if we are given a library file as input!
if library_input_path.endswith('_library.csv'):
def main():
    ################################################################################
    # Phase I: Check inputs and setup sub tasks 1-N to process group(s) based on
    #          applying the capturing group named "group_by" in group_by_regex.
    #          (and terminate if this is task 0)
    ################################################################################
    ref_input_pdh = gatk_helper.prepare_gatk_reference_collection(
        reference_coll=arvados.current_job()['script_parameters']
        ['reference_collection'])
    job_input_pdh = arvados.current_job(
    )['script_parameters']['inputs_collection']
    interval_lists_pdh = arvados.current_job(
    )['script_parameters']['interval_lists_collection']
    interval_count = 1
    if "interval_count" in arvados.current_job()['script_parameters']:
        interval_count = arvados.current_job(
        )['script_parameters']['interval_count']

    # Setup sub tasks 1-N (and terminate if this is task 0)
    hgi_arvados.chunked_tasks_per_cram_file(
        ref_input_pdh,
        job_input_pdh,
        interval_lists_pdh,
        validate_task_output,
        if_sequence=0,
        and_end_task=True,
        reuse_tasks=False,
        oldest_git_commit_to_reuse='6ca726fc265f9e55765bf1fdf71b86285b8a0ff2',
        script="gatk-haplotypecaller-cram.py")

    # Get object representing the current task
    this_task = arvados.current_task()

    # We will never reach this point if we are in the 0th task
    assert (this_task['sequence'] != 0)

    ################################################################################
    # Phase IIa: If we are a "reuse" task, just set our output and be done with it
    ################################################################################
    if 'reuse_job_task' in this_task['parameters']:
        print "This task's work was already done by JobTask %s" % this_task[
            'parameters']['reuse_job_task']
        exit(0)

    ################################################################################
    # Phase IIb: Call Haplotypes!
    ################################################################################
    ref_file = gatk_helper.mount_gatk_reference(ref_param="ref")
    interval_list_file = gatk_helper.mount_single_gatk_interval_list_input(
        interval_list_param="chunk")
    cram_file = gatk_helper.mount_gatk_cram_input(input_param="input")
    cram_file_base, cram_file_ext = os.path.splitext(cram_file)
    out_dir = hgi_arvados.prepare_out_dir()
    out_filename = os.path.basename(cram_file_base) + "." + os.path.basename(
        interval_list_file) + ".vcf.gz"

    # because of a GATK bug, name cannot contain the string '.bcf' anywhere
    # within it or we will get BCF output
    out_filename = out_filename.replace(".bcf", "._cf")

    # HaplotypeCaller!
    gatk_exit = gatk.haplotype_caller(ref_file, cram_file, interval_list_file,
                                      os.path.join(out_dir, out_filename))

    if gatk_exit != 0:
        print "ERROR: GATK exited with exit code %s (NOT WRITING OUTPUT)" % gatk_exit
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={
                                             'success': False
                                         }).execute()
    else:
        print "GATK exited successfully, writing output to keep"

        # Write a new collection as output
        out = arvados.CollectionWriter()

        # Write out_dir to keep
        out.write_directory_tree(out_dir)

        # Commit the output to Keep.
        output_locator = out.finish()

        print "Task output written to keep, validating it"
        if validate_task_output(output_locator):
            print "Task output validated, setting output to %s" % (
                output_locator)

            # Use the resulting locator as the output for this task.
            this_task.set_output(output_locator)
        else:
            print "ERROR: Failed to validate task output (%s)" % (
                output_locator)
            arvados.api().job_tasks().update(
                uuid=arvados.current_task()['uuid'],
                body={
                    'success': False
                }).execute()
def test_write_named_file(self):
    cwriter = arvados.CollectionWriter(self.api_client)
    with self.make_test_file() as testfile:
        cwriter.write_file(testfile.name, 'foo')
        self.assertEqual(cwriter.manifest_text(),
                         ". 098f6bcd4621d373cade4e832627b4f6+4 0:4:foo\n")
def main():
    this_job = arvados.current_job()

    # Setup sub tasks 1-N (and terminate if this is task 0)
    one_task_per_cram_file(if_sequence=0, and_end_task=True)

    # Get object representing the current task
    this_task = arvados.current_task()

    # We will never reach this point if we are in the 0th task
    assert(this_task['sequence'] != 0)

    # Get reference FASTA
    ref_file = None
    print "Mounting reference FASTA collection"
    ref_dir = arvados.get_task_param_mount('ref')

    for f in arvados.util.listdir_recursive(ref_dir):
        if re.search(r'\.fa$', f):
            ref_file = os.path.join(ref_dir, f)
    if ref_file is None:
        raise InvalidArgumentError("No reference fasta found in reference collection.")
    # Ensure we can read the reference file
    if not os.access(ref_file, os.R_OK):
        raise FileAccessError("reference FASTA file not readable: %s" % ref_file)
    # TODO: could check readability of .fai and .dict as well?

    # Get genome chunk intervals file
    # chunk_file = None
    # print "Mounting chunk collection"
    # chunk_dir = arvados.get_task_param_mount('chunk')

    # for f in arvados.util.listdir_recursive(chunk_dir):
    #     if re.search(r'\.region_list.txt$', f):
    #         chunk_file = os.path.join(chunk_dir, f)
    # if chunk_file is None:
    #     raise InvalidArgumentError("No chunk intervals file found in chunk collection.")
    # # Ensure we can read the chunk file
    # if not os.access(chunk_file, os.R_OK):
    #     raise FileAccessError("Chunk intervals file not readable: %s" % chunk_file)

    # Get single CRAM file for this task
    input_dir = None
    print "Mounting task input collection"
    input_dir = arvados.get_task_param_mount('input')

    input_cram_files = []
    for f in arvados.util.listdir_recursive(input_dir):
        if re.search(r'\.cram$', f):
            stream_name, input_file_name = os.path.split(f)
            input_cram_files += [os.path.join(input_dir, f)]
    if len(input_cram_files) != 1:
        raise InvalidArgumentError("Expected exactly one cram file per task.")

    # There is only one CRAM file
    cram_file = input_cram_files[0]

    # Ensure we can read the CRAM file
    if not os.access(cram_file, os.R_OK):
        raise FileAccessError("CRAM file not readable: %s" % cram_file)

    # Ensure we have corresponding CRAI index and can read it as well
    cram_file_base, cram_file_ext = os.path.splitext(cram_file)
    assert(cram_file_ext == ".cram")
    crai_file = cram_file_base + ".crai"
    if not os.access(crai_file, os.R_OK):
        crai_file = cram_file_base + ".cram.crai"
        if not os.access(crai_file, os.R_OK):
            raise FileAccessError("No readable CRAM index file for CRAM file: %s" % cram_file)

    # Will write to out_dir, make sure it is empty
    out_dir = os.path.join(arvados.current_task().tmpdir, 'out')
    if os.path.exists(out_dir):
        old_out_dir = out_dir + ".old"
        print "Moving out_dir %s out of the way (to %s)" % (out_dir, old_out_dir)
        try:
            os.rename(out_dir, old_out_dir)
        except:
            raise
    try:
        os.mkdir(out_dir)
        os.chdir(out_dir)
    except:
        raise

    # out_file = os.path.join(out_dir, os.path.basename(cram_file_base) + "."
    #                         + os.path.basename(chunk_file) + ".g.bcf")
    out_file = os.path.join(out_dir, os.path.basename(cram_file_base) + ".g.bcf")

    bash_cmd_pipe = "samtools view -h -u -@ 1 -T %s %s | bcftools mpileup -t AD,INFO/AD -C50 -pm2 -F0.1 -d10000 --gvcf 1,2,3,4,5,10,15 -f %s -Ou - | bcftools view -Ou | bcftools norm -f %s -Ob -o %s" % (ref_file, cram_file, ref_file, ref_file, out_file)

    # Call bcftools
    runner_p = subprocess.Popen(bash_cmd_pipe,
                                stdin=None,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT,
                                close_fds=True,
                                shell=True)

    while runner_p.poll() is None:
        line = runner_p.stdout.readline()
        print "BCFTOOLS: %s" % line.rstrip()

    runner_exit = runner_p.wait()
    if runner_exit != 0:
        print "WARNING: runner exited with exit code %s" % runner_exit

    # Write a new collection as output
    out = arvados.CollectionWriter()

    # Write out_dir to keep
    out.write_directory_tree(out_dir, stream_name)

    # Commit the output to Keep.
    output_locator = out.finish()

    # Use the resulting locator as the output for this task.
    this_task.set_output(output_locator)
def foo_writer(self, **kwargs):
    api_client = self.api_client_mock()
    writer = arvados.CollectionWriter(api_client, **kwargs)
    writer.start_new_file('foo')
    writer.write('foo')
    return writer
def main():
    # Get object representing the current task
    this_task = arvados.current_task()

    sort_by_r = re.compile(sort_by_regex)

    ################################################################################
    # Concatenate VCFs in numerically sorted order of sort_by_regex
    ################################################################################
    vcf_files = gatk_helper.mount_gatk_gvcf_inputs(inputs_param="inputs")
    out_dir = hgi_arvados.prepare_out_dir()
    output_prefix = arvados.current_job()['script_parameters']['output_prefix']
    out_file = output_prefix + ".vcf.gz"

    # Concatenate VCFs
    bcftools_concat_exit = bcftools.concat(
        sorted(vcf_files,
               key=lambda fn: int(re.search(sort_by_r, fn).group('sort_by'))),
        os.path.join(out_dir, out_file))

    if bcftools_concat_exit != 0:
        print "WARNING: bcftools concat exited with exit code %s (NOT WRITING OUTPUT)" % bcftools_concat_exit
        arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                         body={
                                             'success': False
                                         }).execute()
    else:
        print "bcftools concat exited successfully, indexing"

        bcftools_index_exit = bcftools.index(os.path.join(out_dir, out_file))

        if bcftools_index_exit != 0:
            print "WARNING: bcftools index exited with exit code %s (NOT WRITING OUTPUT)" % bcftools_index_exit
            arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                             body={
                                                 'success': False
                                             }).execute()
        else:
            print "bcftools index exited successfully, writing output to keep"

            # Write a new collection as output
            out = arvados.CollectionWriter()

            # Write out_dir to keep
            out.write_directory_tree(out_dir)

            # Commit the output to Keep.
            output_locator = out.finish()

            if validate_task_output(output_locator):
                print "Task output validated, setting output to %s" % (
                    output_locator)

                # Use the resulting locator as the output for this task.
                this_task.set_output(output_locator)
            else:
                print "ERROR: Failed to validate task output (%s)" % (
                    output_locator)
                arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                                 body={
                                                     'success': False
                                                 }).execute()