def mount_gatk_gvcf_inputs(inputs_param="inputs"): # Get input gVCFs for this task print "Mounting task input collection" inputs_dir = "" if inputs_param in arvados.current_task()['parameters']: inputs_dir = arvados.get_task_param_mount(inputs_param) else: inputs_dir = arvados.get_job_param_mount(inputs_param) # Sanity check input gVCFs input_gvcf_files = [] for f in arvados.util.listdir_recursive(inputs_dir): if re.search(r'\.vcf\.gz$', f): input_gvcf_files.append(os.path.join(inputs_dir, f)) elif re.search(r'\.tbi$', f): pass elif re.search(r'\.interval_list$', f): pass else: print "WARNING: collection contains unexpected file %s" % f if len(input_gvcf_files) == 0: raise errors.InvalidArgumentError( "Expected one or more .vcf.gz files in collection (found 0 while recursively searching %s)" % inputs_dir) # Ensure we can read the gVCF files and that they each have an index for gvcf_file in input_gvcf_files: if not os.access(gvcf_file, os.R_OK): raise errors.FileAccessError("gVCF file not readable: %s" % gvcf_file) # Ensure we have corresponding .tbi index and can read it as well (gvcf_file_base, gvcf_file_ext) = os.path.splitext(gvcf_file) assert (gvcf_file_ext == ".gz") tbi_file = gvcf_file_base + ".gz.tbi" if not os.access(tbi_file, os.R_OK): tbi_file = gvcf_file_base + ".tbi" if not os.access(tbi_file, os.R_OK): raise errors.FileAccessError( "No readable gVCF index file for gVCF file: %s" % gvcf_file) return input_gvcf_files
def get_file_path(parameter,regex):
    """ Return the path to a file with (name) set in script parameters (parameter), using regex (regex):
    Basically to avoid:
        ref_collection_id = this_job['script_parameters']['reference_index']
        ref_collection = coll(ref_collection_id)
        for file in ref_collection:
            if not re.search('.*f(ast)?a(.gz)?$',file):
                continue
            ref_file = file
        ref_path = os.path.join(arvados.get_job_param_mount("reference_index"),ref_file)
    """
    # Resolve the collection UUID/PDH named by this job's script parameter.
    collection_id = arvados.current_job()['script_parameters'][parameter]
    collection_handle = Collection(collection_id)
    # Scan the collection for names matching `regex`; the last match wins.
    # NOTE(review): `file` shadows the Python 2 builtin of the same name.
    # NOTE(review): if nothing matches, `out_file` is never bound and the
    # join below raises NameError — confirm callers guarantee a match.
    for file in collection_handle:
        if not re.search(regex,file):
            continue
        out_file = file
    # Join the matched name onto the FUSE mount point of the same parameter.
    out_path = os.path.join(arvados.get_job_param_mount(parameter),out_file)
    return out_path
def spawn_new_task_per_bed_line(script_parameter, regex, if_sequence=0, and_end_task=True): """ Generalized form of one_task_per_pair_input_file from https://github.com/curoverse/arvados/blob/master/crunch_scripts/arvados_bwa.py Creates a new task if the file in the collection matches the regex """ if if_sequence != arvados.current_task()['sequence']: return job_input = arvados.current_job()['script_parameters'][script_parameter] input_collection = Collection(job_input) for name in input_collection: if not re.search(regex,name): continue name_path = os.path.join(arvados.get_job_param_mount(script_parameter),name) bed_lines = (line.split() for line in open(name_path, 'r')) # Start the biggest regions first def cmp_desc_region_size(a, b): return ((int(b[2]) - int(b[1])) - (int(a[2]) - int(a[1]))) for bed_line in sorted(bed_lines, cmp=cmp_desc_region_size): print bed_line new_task_attrs = { 'job_uuid': arvados.current_job()['uuid'], 'created_by_job_task_uuid': arvados.current_task()['uuid'], 'sequence': if_sequence + 1, 'parameters': { 'chrom': bed_line[0], 'start': bed_line[1], 'end': bed_line[2] } } arvados.api().job_tasks().create(body=new_task_attrs).execute() if and_end_task: arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'], body={'success':True} ).execute() exit()
import numpy as np ######################################################################################################################## # Read constants NUM_RETRIES = int(arvados.getjobparam('num-retries')) assert NUM_RETRIES > 0, "'num-retries' must be strictly positive" antigen_type = str(arvados.getjobparam('antigen-type')) ######################################################################################################################## #Set-up collection and logging file to write out to out = arvados.collection.Collection(num_retries=NUM_RETRIES) time_logging_fh = out.open('time_log.txt', 'w') ######################################################################################################################## # Load settings t0 = time.time() settings = imp.load_source('settings', arvados.get_job_param_mount('settings')) t1 = time.time() time_logging_fh.write('Loading settings %fs\n' % (t1 - t0)) ######################################################################################################################## #Get path lengths and path integers cr = arvados.CollectionReader(arvados.getjobparam('path-lengths'), num_retries=NUM_RETRIES) t0 = time.time() with cr.open("path_integers.npy", 'r') as f: path_integers = np.load(f) t1 = time.time() with cr.open("path_lengths.npy", 'r') as f: path_lengths = np.load(f) t2 = time.time() time_logging_fh.write('Loading path integers took %fs\n' % (t1 - t0)) time_logging_fh.write('Loading path lengths took %fs\n' % (t2 - t1))
# metadata: # batch: your-arbitrary-batch-name algorithm: aligner: bwa mark_duplicates: true recalibrate: false realign: false variantcaller: freebayes platform: illumina quality_format: Standard # for targetted projects, set the region # variant_regions: /path/to/your.bed ''') os.unlink("/usr/local/share/bcbio-nextgen/gemini_data") os.symlink(arvados.get_job_param_mount("gemini_data"), "/usr/local/share/bcbio-nextgen/gemini_data") os.chdir(arvados.current_task().tmpdir) rcode = subprocess.call([ "bcbio_nextgen.py", "--workflow", "template", "/tmp/crunch-job/freebayes-variant.yaml", "project1", subst.do_substitution(p, "$(file $(R1))"), subst.do_substitution(p, "$(file $(R2))") ]) os.chdir("project1/work") os.symlink("/usr/local/share/bcbio-nextgen/galaxy/tool-data", "tool-data")
        NUM_PHASES_TMP += 1
    elif not QUALITY:
        NUM_PHASES_TMP += 1
    NUM_CALLSETS += 1
t1 = time.time()
# Py2 integer division: truncation here is exactly what the assert below
# detects — it fails whenever phases do not divide evenly across callsets.
NUM_PHASES = NUM_PHASES_TMP/NUM_CALLSETS
assert float(NUM_PHASES) == NUM_PHASES_TMP/float(NUM_CALLSETS), "Unequal number of phases per callset"
time_logging_fh.write("Cursory reading of 'pythonic-tiling-callset-files' took %fs\n" % (t1-t0))

#Get callset phenotype files
#Unable to use collections due to csv/json read functions
t0 = time.time()
# NOTE(review): `== None` works but `is None` is the idiomatic comparison.
if arvados.getjobparam('callset-phenotypes') == None:
    phenotype_file_paths = []
else:
    phenotype_path = arvados.get_job_param_mount('callset-phenotypes')
    # The walk must visit exactly one (flat) directory; the assert enforces it,
    # and phenotype_file_paths is rebuilt from that single directory's files.
    for root, dirs, files in os.walk(phenotype_path):
        assert len(dirs) == 0, "Expects 'callset-phenotypes' to be a flat directory"
        phenotype_file_paths = [os.path.join(root, f) for f in files]
t1 = time.time()
time_logging_fh.write("Getting job param mount (and file paths) of 'callset-phenotypes' took %fs\n" % (t1-t0))

########################################################################################################################
# Build the population from the loaded paths, callset counts, and phenotypes
# (helper defined in the project's `fns` module; call continues past this view).
population, subjects, callset_names, size = fns.get_population(
    ACCEPTED_PATHS,
    path_integers,
    path_lengths,
    NUM_CALLSETS,
    NUM_PHASES,
    phenotype_file_paths,
    callset_collection_reader,
    CALLSET_NAME_REGEX,
# batch: your-arbitrary-batch-name algorithm: aligner: bwa mark_duplicates: true recalibrate: false realign: false variantcaller: freebayes platform: illumina quality_format: Standard # for targetted projects, set the region # variant_regions: /path/to/your.bed """ ) os.unlink("/usr/local/share/bcbio-nextgen/gemini_data") os.symlink(arvados.get_job_param_mount("gemini_data"), "/usr/local/share/bcbio-nextgen/gemini_data") os.chdir(arvados.current_task().tmpdir) rcode = subprocess.call( [ "bcbio_nextgen.py", "--workflow", "template", "/tmp/crunch-job/freebayes-variant.yaml", "project1", subst.do_substitution(p, "$(file $(R1))"), subst.do_substitution(p, "$(file $(R2))"), ] )
######################################################################################################################## # Read constants NUM_RETRIES = int(arvados.getjobparam('num-retries')) assert NUM_RETRIES > 0, "'num-retries' must be strictly positive" antigen_type = str(arvados.getjobparam('antigen-type')) ######################################################################################################################## #Set-up collection and logging file to write out to out = arvados.collection.Collection(num_retries=NUM_RETRIES) time_logging_fh = out.open('time_log.txt', 'w') info_fh = out.open('log.txt', 'w') ######################################################################################################################## # Load settings t0 = time.time() settings = imp.load_source('settings', arvados.get_job_param_mount('settings')) t1 = time.time() time_logging_fh.write('Loading settings %fs\n' %(t1-t0)) ######################################################################################################################## #Parallelize based on settings def one_task_per_classifier(num_classifiers_to_parameterize, if_sequence=0, and_end_task=True): if if_sequence != arvados.current_task()['sequence']: return api_client = arvados.api('v1') for i in range(num_classifiers_to_parameterize): new_task_attrs = { 'job_uuid': arvados.current_job()['uuid'], 'created_by_job_task_uuid': arvados.current_task()['uuid'], 'sequence': if_sequence + 1, 'parameters': { 'classifier_index':i,