def testTaskGenerate(self):
    """Build one task per example FASTA file and round-trip the task group.

    Every ``*.fasta`` file under ``../examples/simple_galaxy`` becomes the
    ``input_file`` of its own ``GalaxyWorkflowTask``.  Each element yielded
    by ``TaskGroup.to_dict()`` is passed through ``json.dumps``; the group is
    then stored to ``../test_tmp/nebula_tasks``, loaded into a second
    ``TaskGroup``, and both groups are asserted to have the same length.
    """
    fasta_pattern = get_abspath("../examples/simple_galaxy/*.fasta")
    targets = [TargetFile(fasta_path) for fasta_path in glob(fasta_pattern)]

    tasks = TaskGroup()
    for index, target in enumerate(targets):
        # A fresh workflow object is built for every task.
        workflow = GalaxyWorkflow(
            ga_file=get_abspath("../examples/simple_galaxy/SimpleWorkflow.ga"))
        tasks.append(GalaxyWorkflowTask(
            "workflow_%s" % (index), workflow, inputs={'input_file': target}))

    # json.dumps raises if an element cannot be serialized.
    for element in tasks.to_dict():
        json.dumps(element)

    with open(get_abspath("../test_tmp/nebula_tasks"), "w") as out_handle:
        tasks.store(out_handle)

    new_tasks = TaskGroup()
    with open(get_abspath("../test_tmp/nebula_tasks")) as in_handle:
        new_tasks.load(in_handle)

    self.assertEqual(len(tasks), len(new_tasks))
    for loaded_task in new_tasks:
        print(loaded_task)
def testNebulaLaunch(self):
    """Write a task file and a service file, then run ``bin/nebula`` on them.

    Two example documents are synced into a ``FileDocStore`` under
    ``../test_tmp/docstore``, a single ``GalaxyWorkflowTask`` using them as
    inputs is stored to ``../test_tmp/test.tasks``, the Galaxy service
    configuration is stored to ``../test_tmp/test.service``, and
    ``../bin/nebula run <service> <tasks>`` is executed with the repository
    root appended to ``PYTHONPATH``.  Finally every document returned by
    ``doc.filter()`` is printed as indented JSON.
    """
    uuid_1 = "c39ded10-6073-11e4-9803-0800200c9a66"
    uuid_2 = "26fd12a2-9096-4af2-a989-9e2f1cb692fe"
    workflow_inputs = {
        "input_file_1": Target(uuid_1),
        "input_file_2": Target(uuid_2),
    }
    workflow_params = {"tail_select": {"lineNum": 3}}

    doc = FileDocStore(file_path=get_abspath("../test_tmp/docstore"))
    logging.info("Adding files to object store")
    sync_doc_dir("examples/simple_galaxy/", doc, uuid_set=[uuid_1, uuid_2])

    logging.info("Creating Task")
    workflow = GalaxyWorkflow(
        ga_file=get_abspath("../examples/simple_galaxy/SimpleWorkflow.ga"))
    task = nebula.tasks.GalaxyWorkflowTask(
        "test_workflow",
        workflow,
        inputs=workflow_inputs,
        parameters=workflow_params)
    service = GalaxyService(
        docstore=doc,
        name="nosetest_galaxy",
        galaxy="bgruening/galaxy-stable:dev",
        port=20022)

    task_path = get_abspath("../test_tmp/test.tasks")
    service_path = get_abspath("../test_tmp/test.service")

    taskset = TaskGroup()
    taskset.append(task)
    with open(task_path, "w") as task_handle:
        taskset.store(task_handle)

    with open(service_path, "w") as service_handle:
        service_config = service.get_config()
        # The object returned by set_docstore_config is the one stored.
        service_config = service_config.set_docstore_config(
            cache_path=get_abspath("../test_tmp/cache"))
        service_config.store(service_handle)

    # The child process needs the repository root on its import path.
    env = dict(os.environ)
    repo_root = get_abspath("../")
    if 'PYTHONPATH' in env:
        env['PYTHONPATH'] = env['PYTHONPATH'] + ":" + repo_root
    else:
        env['PYTHONPATH'] = repo_root

    command = [get_abspath("../bin/nebula"), "run", service_path, task_path]
    subprocess.check_call(command, env=env)

    for stored_doc in doc.filter():
        print(json.dumps(stored_doc, indent=4))
def run_gen(args):
    """Write MC3 test-pipeline task files and, optionally, a service config.

    Steps, in order:

    1. Log in to Synapse and open the document store at ``args.out_base``.
    2. If ``args.ref_download`` is set, sync the reference files named in the
       data mapping from ``REFDATA_PROJECT``.
    3. Resolve every reference file name to a document id.
    4. Sync the ``testexomes`` directory (optionally restricted to the donors
       in ``args.sample``) into the document store.
    5. Pair tumour and normal documents by ``participant_id`` and build one
       ``GalaxyWorkflowTask`` per donor that has both.
    6. Write each task as JSON into ``<out_base>.tasks/<task_id>``.
    7. If ``args.create_service`` is set, write ``<out_base>.service``.

    Args:
        args: object exposing ``out_base``, ``ref_download``, ``sample``,
            ``create_service``, ``galaxy``, ``sudo``, ``tool_data``,
            ``tool_dir``, ``work_dir`` and ``scratch`` attributes
            (presumably an argparse namespace -- confirm against the caller).

    Raises:
        Exception: a reference file from the data mapping is not present in
            the document store.
        KeyError: a paired donor has no matching entry in ``fake_metadata``.
    """
    syn = synapseclient.Synapse()
    syn.login()

    docstore = from_url(args.out_base)

    data_mapping = {
        "db_snp": "dbsnp_132_b37.leftAligned.vcf",
        "centromere": "centromere_hg19.bed",
        "reference_genome": "Homo_sapiens_assembly19.fasta",
        "cosmic": "b37_cosmic_v54_120711.vcf"
    }

    if args.ref_download:
        syn_sync(syn, REFDATA_PROJECT, docstore, data_mapping.values())

    def _last_hit(**query):
        # Return the id of the last document matching *query*, or None.
        found = None
        for entry in docstore.filter(**query):
            found = entry[0]
        return found

    dm = {}
    for input_name, file_name in data_mapping.items():
        hit = _last_hit(name=file_name)
        if hit is None:
            raise Exception("%s not found" % (file_name))
        dm[input_name] = {"uuid": hit}

    exome_dir = os.path.join(os.path.dirname(__file__), "..", "testexomes")
    if args.sample is not None:
        sync_doc_dir(exome_dir, docstore,
                     filter=lambda x: x['donorId'] in args.sample)
    else:
        sync_doc_dir(exome_dir, docstore)

    # participant_id -> document id, one map per sample type.
    tumor_uuids = {}
    for doc_id, ent in docstore.filter(sampleType="tumour"):
        tumor_uuids[ent['participant_id']] = doc_id

    normal_uuids = {}
    for doc_id, ent in docstore.filter(sampleType="normal"):
        normal_uuids[ent['participant_id']] = doc_id

    mc3_workflow = GalaxyWorkflow(
        ga_file="workflows/Galaxy-Workflow-MC3_Pipeline_Test.ga")

    tasks = TaskGroup()
    for donor in tumor_uuids:
        # Only donors with both a tumour and a normal sample get a task.
        if donor not in normal_uuids:
            continue
        print("participant %s" % (donor,))

        # Reverse lookup: fake_metadata is keyed by donor name; the last
        # entry whose participant_id matches wins.
        donor_name = None
        for name, meta in fake_metadata.items():
            if meta['participant_id'] == donor:
                donor_name = name
        if donor_name is None:
            # Previously this surfaced later as an opaque ``KeyError: None``.
            raise KeyError(
                "no fake_metadata entry for participant %s" % (donor,))
        donor_meta = fake_metadata[donor_name]
        normal_meta = donor_meta['normal']
        tumour_meta = donor_meta['tumour']

        workflow_dm = dict(dm)
        workflow_dm['tumor_bam'] = {"uuid": tumor_uuids[donor]}
        workflow_dm['normal_bam'] = {"uuid": normal_uuids[donor]}

        task = GalaxyWorkflowTask(
            "workflow_%s" % (donor),
            mc3_workflow,
            inputs=workflow_dm,
            parameters={
                "reheader_config": {
                    "platform": "Illumina",
                    "center": "OHSU",
                    "reference_genome": "Homo_sapiens_assembly19.fasta",
                    "participant_uuid": donor_meta['participant_id'],
                    "disease_code": donor_meta['disease'],
                    "filedate": datetime.datetime.now().strftime("%Y%m%d"),
                    "normal_analysis_uuid": normal_meta['uuid'],
                    "normal_bam_name": normal_meta['file_name'],
                    "normal_aliquot_uuid": normal_meta['aliquot_id'],
                    "normal_aliquot_barcode": normal_meta['barcode'],
                    "tumor_analysis_uuid": tumour_meta['uuid'],
                    "tumor_bam_name": tumour_meta['file_name'],
                    "tumor_aliquot_uuid": tumour_meta['aliquot_id'],
                    "tumor_aliquot_barcode": tumour_meta['barcode'],
                }
            },
            tags=["donor:%s" % (donor)],
        )
        tasks.append(task)

    task_dir = "%s.tasks" % (args.out_base)
    if not os.path.exists(task_dir):
        os.mkdir(task_dir)

    for data in tasks:
        task_file = "%s.tasks/%s" % (args.out_base, data.task_id)
        with open(task_file, "w") as handle:
            handle.write(json.dumps(data.to_dict()))

    if args.create_service:
        # NOTE(review): "gatk_bqsr" and "gatk_indel" each appear twice in
        # smp (gatk_indel with 24 and 12); the list is passed through
        # unchanged -- confirm which value GalaxyService is meant to use.
        service = GalaxyService(
            docstore=docstore,
            galaxy=args.galaxy,
            sudo=args.sudo,
            tool_data=args.tool_data,
            tool_dir=args.tool_dir,
            work_dir=args.work_dir,
            smp=[
                ["gatk_bqsr", 12],
                ["gatk_indel", 24],
                ["MuSE", 8],
                ["pindel", 8],
                ["mutect", 8],
                ["delly", 4],
                ["gatk_bqsr", 12],
                ["gatk_indel", 12],
                ["bwa_mem", 12],
                ["radia", 8],
                ['radia_filter', 8]
            ])
        with open("%s.service" % (args.out_base), "w") as handle:
            s = service.get_config()
            if args.scratch:
                print("Using scratch %s" % (args.scratch,))
                s.set_docstore_config(cache_path=args.scratch, open_perms=True)
            s.store(handle)
def run_gen(args):
    """Write MC3 CGHub task files from a job list, plus a service config.

    Steps, in order:

    1. Log in to Synapse and open the document store at ``args.out_base``.
    2. If ``args.ref_download`` is set, sync the mapped reference files and
       the listed reference genomes from ``REFDATA_PROJECT``.
    3. Resolve every mapped reference file name to a document id.
    4. Read ``args.joblist`` as a tab-delimited table and build one
       ``GalaxyWorkflowTask`` per row.  Rows whose ``rna_analysis_id`` is not
       ``"NA"`` use the DNA+RNA workflow, all others the DNA-only workflow.
    5. Write each task as JSON into ``<out_base>.tasks/<task_id>``.
    6. Write the Galaxy service configuration to ``<out_base>.service``.

    Args:
        args: object exposing ``out_base``, ``ref_download``, ``joblist``,
            ``galaxy``, ``sudo``, ``tool_data``, ``tool_dir``, ``work_dir``
            and ``scratch`` attributes (presumably an argparse namespace --
            confirm against the caller).

    Raises:
        Exception: a mapped reference file, a row's reference assembly, or
            (for rows with RNA data) the RNA reference ``hg19_M_rCRS.fa`` is
            not present in the document store.
    """
    syn = synapseclient.Synapse()
    syn.login()

    docstore = from_url(args.out_base)

    data_mapping = {
        "db_snp": "dbsnp_132_b37.leftAligned.vcf",
        "centromere": "centromere_hg19.bed",
        "cosmic": "b37_cosmic_v54_120711.vcf"
    }

    ref_genomes = [
        "Homo_sapiens_assembly19.fasta",
        "GRCh37-lite.fa",
        "GRCh37-lite-+-HPV_Redux-build.fa",
        "GRCh37-lite_WUGSC_variant_1.fa.gz",
        "GRCh37-lite_WUGSC_variant_2.fa.gz",
        "hg19_M_rCRS.fa.gz"
    ]

    if args.ref_download:
        syn_sync(syn, REFDATA_PROJECT, docstore,
                 data_mapping.values() + ref_genomes)

    def _last_hit(name):
        # Return the id of the last document called *name*, or None.
        found = None
        for entry in docstore.filter(name=name):
            found = entry[0]
        return found

    def _cghub_download(analysis_id):
        # Download descriptor shared by every CGHub BAM parameter.
        return {
            "uuid": analysis_id,
            "gnos_endpoint": "cghub.ucsc.edu",
            "cred_file": "/tool_data/files/cghub.key"
        }

    dm = {}
    for input_name, file_name in data_mapping.items():
        hit = _last_hit(file_name)
        if hit is None:
            raise Exception("%s not found" % (file_name))
        dm[input_name] = {"uuid": hit}

    mc3_dna_workflow = GalaxyWorkflow(
        ga_file="workflows/Galaxy-Workflow-MC3_Pipeline_CGHub_DNA.ga")
    mc3_dnarna_workflow = GalaxyWorkflow(
        ga_file="workflows/Galaxy-Workflow-MC3_Pipeline_CGHub_DNA_RNA.ga")

    rna_ref_name = "hg19_M_rCRS.fa"
    rna_hit = _last_hit(rna_ref_name)

    tasks = TaskGroup()
    # Cache of reference name -> document id so each assembly is looked up
    # in the document store only once.
    assembly_hits = {}
    with open(args.joblist) as handle:
        reader = csv.DictReader(handle, delimiter="\t")
        for row in reader:
            if row['normal_assembly'] != row['tumor_assembly']:
                # Mismatches are reported but not fatal; the normal
                # sample's assembly is used below.
                print("Row Mispatch %s %s" % (row['normal_assembly'],
                                              row['tumor_assembly']))

            ref_name = row['normal_assembly']
            if ref_name in ref_rename:
                ref_name = ref_rename[ref_name]

            if ref_name in assembly_hits:
                hit = assembly_hits[ref_name]
            else:
                # A ".fa" document takes precedence over a ".fasta" one.
                hit = _last_hit(ref_name + ".fasta")
                fa_hit = _last_hit(ref_name + ".fa")
                if fa_hit is not None:
                    hit = fa_hit
                if hit is None:
                    raise Exception("%s not found" % (ref_name))
                assembly_hits[ref_name] = hit

            workflow_dm = dict(dm)
            workflow_dm['reference_genome'] = {"uuid": hit}

            params = {
                'tumor_bam': _cghub_download(row['tumor_analysis_id']),
                'normal_bam': _cghub_download(row['normal_analysis_id']),
                "reheader_config": {
                    "platform": "Illumina",
                    "center": "OHSU",
                    "reference_genome": ref_name,
                    "participant_uuid": row['participant_id'],
                    "disease_code": row['disease'],
                    "filedate": datetime.datetime.now().strftime("%Y%m%d"),
                    "normal_analysis_uuid": row['normal_analysis_id'],
                    "normal_bam_name": row['normal_filename'],
                    "normal_aliquot_uuid": row['normal_aliquot_id'],
                    "normal_aliquot_barcode": row['normal_barcode'],
                    "tumor_analysis_uuid": row['tumor_analysis_id'],
                    "tumor_bam_name": row['tumor_filename'],
                    "tumor_aliquot_uuid": row['tumor_aliquot_id'],
                    "tumor_aliquot_barcode": row['tumor_barcode'],
                }
            }

            if row['rna_analysis_id'] != "NA":
                if rna_hit is None:
                    # Fail like every other reference lookup instead of
                    # emitting a task whose RNA reference uuid is None.
                    raise Exception("%s not found" % (rna_ref_name))
                params['rna_tumor_bam'] = _cghub_download(
                    row['rna_analysis_id'])
                workflow_dm['rna_reference_genome'] = {"uuid": rna_hit}
                workflow = mc3_dnarna_workflow
            else:
                workflow = mc3_dna_workflow

            task = GalaxyWorkflowTask(
                "workflow_%s" % (row['job_id']),
                workflow,
                inputs=workflow_dm,
                parameters=params,
                tags=["donor:%s" % (row['participant_id'])],
            )
            tasks.append(task)

    task_dir = "%s.tasks" % (args.out_base)
    if not os.path.exists(task_dir):
        os.mkdir(task_dir)

    for data in tasks:
        task_file = "%s.tasks/%s" % (args.out_base, data.task_id)
        with open(task_file, "w") as handle:
            handle.write(json.dumps(data.to_dict()))

    # NOTE(review): "gatk_bqsr" and "gatk_indel" each appear twice in smp
    # (gatk_indel with 24 and 12); the list is passed through unchanged --
    # confirm which value GalaxyService is meant to use.
    service = GalaxyService(
        docstore=docstore,
        galaxy=args.galaxy,
        sudo=args.sudo,
        tool_data=args.tool_data,
        tool_dir=args.tool_dir,
        work_dir=args.work_dir,
        smp=[
            ["gatk_bqsr", 12],
            ["gatk_indel", 24],
            ["MuSE", 8],
            ["pindel", 8],
            ["mutect", 8],
            ["delly", 4],
            ["gatk_bqsr", 12],
            ["gatk_indel", 12],
            ["bwa_mem", 12],
            ["radia", 8],
            ['radia_filter', 8]
        ]
    )
    with open("%s.service" % (args.out_base), "w") as handle:
        s = service.get_config()
        if args.scratch:
            print("Using scratch %s" % (args.scratch,))
            s.set_docstore_config(cache_path=args.scratch, open_perms=True)
        s.store(handle)
def run_gen(args):
    """Write PCAWG CGHub task files and, optionally, a service config.

    Steps, in order:

    1. Log in to Synapse and open the document store at ``args.out_base``.
    2. If ``args.ref_download`` is set, copy (or gunzip) the needed reference
       entities from ``REFDATA_PROJECT`` into the document store.
    3. Resolve every mapped reference file name to a document id.
    4. Build one ``GalaxyWorkflowTask`` for every queue assignment whose
       ``state`` is a non-string NaN value.
    5. Write each task as JSON into ``<out_base>.tasks/<task_id>`` and remove
       any existing ``<task_id>.state`` file next to it.
    6. If ``args.create_service`` is set, write ``<out_base>.service``.

    Args:
        args: object exposing ``out_base``, ``ref_download``,
            ``create_service``, ``sudo``, ``work_dir`` and ``scratch``
            attributes (presumably an argparse namespace -- confirm against
            the caller).  The value passed in is used as-is; it is no longer
            replaced by a fresh ``parser.parse_args()`` call.

    Raises:
        Exception: a mapped reference file is not present in the document
            store.
    """
    syn = synapseclient.Synapse()
    syn.login()

    docstore = from_url(args.out_base)

    data_mapping = {
        "reference_genome": "genome.fa",
        "dbsnp": "dbsnp_132_b37.leftAligned.vcf",
        "cosmic": "b37_cosmic_v54_120711.vcf",
        "gold_indels": "Mills_and_1000G_gold_standard.indels.hg19.sites.fixed.vcf",
        "phase_one_indels": "1000G_phase1.indels.hg19.sites.fixed.vcf",
        "centromere": "centromere_hg19.bed"
    }

    if args.ref_download:
        # Download reference files from Synapse and populate the document
        # store.  Only entities whose name (with or without ".gz") is one of
        # the mapped file names are loaded.
        wanted_names = set(data_mapping.values())
        query = 'select * from entity where parentId=="%s"' % (REFDATA_PROJECT)
        for a in syn.chunkedQuery(query):
            entity_name = a['entity.name']
            print("found %s" % (entity_name,))
            if (entity_name not in wanted_names
                    and entity_name.replace(".gz", "") not in wanted_names):
                continue
            print("loading")
            ent = syn.get(a['entity.id'])
            uuid = ent.annotations['uuid'][0]
            t = Target(uuid=uuid)
            docstore.create(t)
            path = docstore.get_filename(t)
            name = ent.name
            if 'dataPrep' in ent.annotations:
                if ent.annotations['dataPrep'][0] == 'gunzip':
                    # Argument list plus explicit output handle: no shell is
                    # involved, so paths containing spaces or shell
                    # metacharacters are handled correctly.
                    with open(path, "wb") as unzipped:
                        subprocess.check_call(["gunzip", "-c", ent.path],
                                              stdout=unzipped)
                    name = name.replace(".gz", "")
                else:
                    # NOTE(review): nothing is written to `path` in this
                    # case, yet update_from_file still runs below.
                    print("Unknown DataPrep")
            else:
                shutil.copy(ent.path, path)
            docstore.update_from_file(t)
            docstore.put(uuid, {'name': name, 'uuid': uuid})

    dm = {}
    for input_name, file_name in data_mapping.items():
        # The last matching document wins.
        hit = None
        for entry in docstore.filter(name=file_name):
            hit = entry[0]
        if hit is None:
            raise Exception("%s not found" % (file_name))
        dm[input_name] = {"uuid": hit}

    workflow = GalaxyWorkflow(
        ga_file="workflows/Galaxy-Workflow-PCAWG_CGHUB.ga")
    tasks = TaskGroup()
    for ent in synqueue.listAssignments(syn, **config):
        state = ent['state']
        # Only assignments whose state is a non-string NaN get a task.
        if isinstance(state, basestring) or not isnan(state):
            continue

        meta = ent['meta']
        # The endpoint of the normal sample's repository is used for both
        # the normal and the tumour download.
        gnos_endpoint = urlparse(
            meta['Normal_WGS_alignment_GNOS_repos']).netloc
        task = GalaxyWorkflowTask(
            "workflow_%s" % (ent['id']),
            workflow,
            inputs=dm,
            parameters={
                'normal_bam_download': {
                    "uuid": meta['Normal_WGS_alignment_GNOS_analysis_ID'],
                    "gnos_endpoint": gnos_endpoint,
                    "cred_file": key_map[gnos_endpoint]
                },
                'tumor_bam_download': {
                    "uuid": meta['Tumour_WGS_alignment_GNOS_analysis_IDs'],
                    "gnos_endpoint": gnos_endpoint,
                    "cred_file": key_map[gnos_endpoint]
                },
                'broad_variant_pipeline': {
                    "broad_ref_dir": "/tool_data/files/refdata",
                    "sample_id": meta['Submitter_donor_ID']
                }
            },
            tags=["donor:%s" % (meta['Submitter_donor_ID'])])
        tasks.append(task)

    task_dir = "%s.tasks" % (args.out_base)
    if not os.path.exists(task_dir):
        os.mkdir(task_dir)

    for data in tasks:
        task_file = "%s.tasks/%s" % (args.out_base, data.task_id)
        with open(task_file, "w") as handle:
            handle.write(json.dumps(data.to_dict()))
        # Remove any existing state file for the rewritten task.
        state_file = "%s.tasks/%s.state" % (args.out_base, data.task_id)
        if os.path.exists(state_file):
            os.unlink(state_file)

    print("Tasks Created: %s" % (len(tasks)))

    if args.create_service:
        service = GalaxyService(
            docstore=docstore,
            galaxy="bgruening/galaxy-stable",
            sudo=args.sudo,
            tool_data=os.path.abspath("tool_data"),
            tool_dir=os.path.abspath("tools"),
            work_dir=args.work_dir,
            smp=[
                ["MuSE", 8],
                ["pindel", 8],
                ["muTect", 8],
                ["delly", 4],
                ["gatk_bqsr", 12],
                ["gatk_indel", 24],
                ["bwa_mem", 12],
                ["broad_variant_pipline", 24]
            ])
        with open("%s.service" % (args.out_base), "w") as handle:
            s = service.get_config()
            if args.scratch:
                print("Using scratch %s" % (args.scratch,))
                s.set_docstore_config(cache_path=args.scratch, open_perms=True)
            s.store(handle)
def run_gen(args):
    """Write GATK CGHub task files and, optionally, a service config.

    Steps, in order:

    1. Log in to Synapse; if ``args.alt_table`` is given, store it as
       ``config['table_id']``.
    2. Open the document store at ``args.out_base``.
    3. If ``args.ref_download`` is set, copy (or gunzip) every entity under
       ``REFDATA_PROJECT`` into the document store.
    4. Resolve every mapped reference file name to a document id.
    5. For every queue assignment, collect its BAM ids and its single
       reference assembly, and build a ``GalaxyWorkflowTask`` when it has
       exactly two or exactly three BAMs.
    6. Write each task as JSON into ``<out_base>.tasks/<task_id>``.
    7. If ``args.create_service`` is set, write ``<out_base>.service``.

    Args:
        args: object exposing ``alt_table``, ``out_base``, ``ref_download``,
            ``create_service``, ``tool_data``, ``tool_dir``, ``work_dir`` and
            ``scratch`` attributes (presumably an argparse namespace --
            confirm against the caller).

    Raises:
        Exception: a mapped reference file or an assignment's reference
            genome is not present in the document store.
        AssertionError: an assignment does not name exactly one reference
            assembly.
    """
    syn = synapseclient.Synapse()
    syn.login()

    if args.alt_table is not None:
        config['table_id'] = args.alt_table

    docstore = from_url(args.out_base)

    if args.ref_download:
        # Download reference files from Synapse and populate the document
        # store.
        query = 'select * from entity where parentId=="%s"' % (REFDATA_PROJECT)
        for a in syn.chunkedQuery(query):
            ent = syn.get(a['entity.id'])
            uuid = ent.annotations['uuid'][0]
            t = Target(uuid=uuid)
            docstore.create(t)
            path = docstore.get_filename(t)
            name = ent.name
            if 'dataPrep' in ent.annotations:
                if ent.annotations['dataPrep'][0] == 'gunzip':
                    # Argument list plus explicit output handle: no shell is
                    # involved, so paths containing spaces or shell
                    # metacharacters are handled correctly.
                    with open(path, "wb") as unzipped:
                        subprocess.check_call(["gunzip", "-c", ent.path],
                                              stdout=unzipped)
                    name = name.replace(".gz", "")
                else:
                    # NOTE(review): nothing is written to `path` in this
                    # case, yet update_from_file still runs below.
                    print("Unknown DataPrep")
            else:
                shutil.copy(ent.path, path)
            docstore.update_from_file(t)
            docstore.put(uuid, {'name': name, 'uuid': uuid})

    data_mapping = {
        "dbsnp": "dbsnp_132_b37.leftAligned.vcf",
        "cosmic": "b37_cosmic_v54_120711.vcf",
        "gold_indels": "Mills_and_1000G_gold_standard.indels.hg19.sites.fixed.vcf",
        "phase_one_indels": "1000G_phase1.indels.hg19.sites.fixed.vcf"
    }

    def _last_hit(name):
        # Return the id of the last document called *name*, or None.
        found = None
        for entry in docstore.filter(name=name):
            found = entry[0]
        return found

    dm = {}
    for input_name, file_name in data_mapping.items():
        hit = _last_hit(file_name)
        if hit is None:
            raise Exception("%s not found" % (file_name))
        dm[input_name] = {"uuid": hit}

    workflow_2 = GalaxyWorkflow(
        ga_file="workflows/Galaxy-Workflow-GATK_CGHub_2.ga")
    workflow_3 = GalaxyWorkflow(
        ga_file="workflows/Galaxy-Workflow-GATK_CGHub_3.ga")
    # Workflow to use for a given number of input BAMs.
    workflow_by_bam_count = {2: workflow_2, 3: workflow_3}

    ref_rename = {"HG19_Broad_variant": "Homo_sapiens_assembly19"}

    tasks = TaskGroup()
    for ent in synqueue.listAssignments(syn, **config):
        meta_items = ent['meta'].items()
        bam_set = list(
            value for key, value in meta_items
            if key.startswith("id_") and isinstance(value, basestring))
        ref_set = set(
            value for key, value in meta_items
            if key.startswith("ref_assembly_") and isinstance(value, basestring))

        # Explicit raise (same exception type as the former assert) so the
        # check is not stripped when Python runs with -O.
        if len(ref_set) != 1:
            raise AssertionError(
                "expected exactly one reference assembly for %s, found %s"
                % (ent['id'], sorted(ref_set)))
        ref_name = ref_set.pop()
        if ref_name in ref_rename:
            ref_name = ref_rename[ref_name]

        # A ".fa" document takes precedence over a ".fasta" one.
        hit = _last_hit(ref_name + ".fasta")
        fa_hit = _last_hit(ref_name + ".fa")
        if fa_hit is not None:
            hit = fa_hit
        if hit is None:
            raise Exception("%s not found" % (ref_name))

        workflow_dm = dict(dm)
        workflow_dm['reference_genome'] = {"uuid": hit}

        workflow = workflow_by_bam_count.get(len(bam_set))
        if workflow is None:
            # Assignments with any other number of BAMs produce no task.
            continue

        # INPUT_BAM_<n> / BQSR_<n> are numbered from 1 in bam_set order.
        parameters = {}
        tool_tags = {}
        for number, bam_id in enumerate(bam_set, 1):
            parameters['INPUT_BAM_%d' % (number)] = {
                "uuid": bam_id,
                "gnos_endpoint": "cghub.ucsc.edu",
                "cred_file": "/tool_data/files/cghub.key"
            }
            tool_tags["BQSR_%d" % (number)] = {
                "output_bam": ["original_bam:%s" % (bam_id)]
            }

        task = GalaxyWorkflowTask(
            "workflow_%s" % (ent['id']),
            workflow,
            inputs=workflow_dm,
            parameters=parameters,
            tags=["donor:%s" % (ent['meta']['participant_id'])],
            tool_tags=tool_tags)
        tasks.append(task)

    task_dir = "%s.tasks" % (args.out_base)
    if not os.path.exists(task_dir):
        os.mkdir(task_dir)

    for data in tasks:
        task_file = "%s.tasks/%s" % (args.out_base, data.task_id)
        with open(task_file, "w") as handle:
            handle.write(json.dumps(data.to_dict()))

    if args.create_service:
        service = GalaxyService(
            docstore=docstore,
            galaxy="bgruening/galaxy-stable",
            sudo=True,
            tool_data=args.tool_data,
            tool_dir=args.tool_dir,
            work_dir=args.work_dir,
            smp=[["gatk_bqsr", 12], ["gatk_indel", 24]])
        with open("%s.service" % (args.out_base), "w") as handle:
            s = service.get_config()
            if args.scratch:
                print("Using scratch %s" % (args.scratch,))
                s.set_docstore_config(cache_path=args.scratch, open_perms=True)
            s.store(handle)