def run_audit(docstore, sample_list):
    doc = FileDocStore(file_path=docstore)

    master_list = []
    with open(sample_list) as handle:
        for line in handle:
            master_list.append(line.rstrip())

    results = {}
    for id, entry in doc.filter(visible=True, data_type='galaxy.datatypes.tabular.Vcf'):
        if 'tags' in entry:
            sample = None
            for s in entry['tags']:
                tmp = s.split(":")
                if tmp[0] == 'sample':
                    sample = tmp[1]
            if doc.size(entry) > 0:
                results[sample] = results.get(sample, []) + [entry['name']]

    for sample, files in results.items():
        print "%s (%s) %s" % (sample, len(files), "\t".join(files))

    for sample in master_list:
        if sample not in results or len(results[sample]) < 3:
            print "missing (%s)" % (len(results.get(sample, []))), sample
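# run_audit above relies on datastore entries carrying tags of the form
# "sample:<sample_id>". A minimal sketch of that tag convention follows; the
# entry dict is a hypothetical example, not real datastore output:
#
#   entry = {"name": "broad-mutect.somatic.snv_mnv.vcf",
#            "tags": ["sample:DONOR-0001", "run:testing"]}
#   sample = None
#   for s in entry["tags"]:
#       tmp = s.split(":")
#       if tmp[0] == "sample":
#           sample = tmp[1]
#   # sample is now "DONOR-0001"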
def run_errors(docstore):
    doc = FileDocStore(file_path=docstore)
    for id, entry in doc.filter(visible=True):
        if entry.get('state', 'ok') in ['error']:
            print entry
def run_ls(docstore, size=False):
    doc = FileDocStore(file_path=docstore)
    for id, entry in doc.filter():
        #if doc.size(entry) > 0:
        if size:
            print id, entry.get('name', id), doc.size(entry)
        else:
            print id, entry.get('name', id)
def run_timing(docstore):
    doc = FileDocStore(file_path=docstore)
    for id, entry in doc.filter():
        if 'job' in entry and 'job_metrics' in entry['job']:
            timing = None
            for met in entry['job']['job_metrics']:
                if met['name'] == 'runtime_seconds':
                    timing = met['raw_value']
            if timing is not None:
                print id, entry["name"], timing
def testNebulaLaunch(self):
    input = {
        "input_file_1" : Target("c39ded10-6073-11e4-9803-0800200c9a66"),
        "input_file_2" : Target("26fd12a2-9096-4af2-a989-9e2f1cb692fe")
    }
    parameters = {
        "tail_select" : {
            "lineNum" : 3
        }
    }

    doc = FileDocStore(
        file_path=get_abspath("../test_tmp/docstore")
    )

    logging.info("Adding files to object store")
    sync_doc_dir("examples/simple_galaxy/", doc,
        uuid_set=["c39ded10-6073-11e4-9803-0800200c9a66", "26fd12a2-9096-4af2-a989-9e2f1cb692fe"]
    )

    logging.info("Creating Task")
    workflow = GalaxyWorkflow(ga_file=get_abspath("../examples/simple_galaxy/SimpleWorkflow.ga"))
    task = nebula.tasks.GalaxyWorkflowTask(
        "test_workflow", workflow,
        inputs=input,
        parameters=parameters
    )

    service = GalaxyService(
        docstore=doc,
        name="nosetest_galaxy",
        galaxy="bgruening/galaxy-stable:dev",
        port=20022
    )

    task_path = get_abspath("../test_tmp/test.tasks")
    service_path = get_abspath("../test_tmp/test.service")

    taskset = TaskGroup()
    taskset.append(task)
    with open(task_path, "w") as handle:
        taskset.store(handle)

    with open(service_path, "w") as handle:
        service.get_config().set_docstore_config(cache_path=get_abspath("../test_tmp/cache")).store(handle)

    env = dict(os.environ)
    if 'PYTHONPATH' in env:
        env['PYTHONPATH'] += ":" + get_abspath("../")
    else:
        env['PYTHONPATH'] = get_abspath("../")
    subprocess.check_call([get_abspath("../bin/nebula"), "run", service_path, task_path], env=env)

    for i in doc.filter():
        print json.dumps(i, indent=4)
def run_ls(docstore, size=False, extra=[]):
    doc = FileDocStore(file_path=docstore)
    for id, entry in doc.filter():
        #if doc.size(entry) > 0:
        # collect the requested extra fields for this entry
        extra_values = []
        for e in extra:
            extra_values.append(str(entry.get(e, "")))
        if size:
            print id, entry.get('name', id), doc.size(entry), " ".join(extra_values)
        else:
            print id, entry.get('name', id), " ".join(extra_values)
def run_errors(docstore):
    doc = FileDocStore(file_path=docstore)
    for id, entry in doc.filter():
        if entry.get('state', '') == 'error':
            print "Dataset", id, entry.get("tags", "")
            if 'provenance' in entry:
                print "tool:", entry['provenance']['tool_id']
                print "-=-=-=-=-=-=-"
            print entry['job']['stdout']
            print "-------------"
            print entry['job']['stderr']
            print "-=-=-=-=-=-=-"
def testNebulaLaunch(self): input = { "input_file_1": Target("c39ded10-6073-11e4-9803-0800200c9a66"), "input_file_2": Target("26fd12a2-9096-4af2-a989-9e2f1cb692fe") } parameters = {"tail_select": {"lineNum": 3}} doc = FileDocStore(file_path=get_abspath("../test_tmp/docstore")) logging.info("Adding files to object store") sync_doc_dir("examples/simple_galaxy/", doc, uuid_set=[ "c39ded10-6073-11e4-9803-0800200c9a66", "26fd12a2-9096-4af2-a989-9e2f1cb692fe" ]) logging.info("Creating Task") workflow = GalaxyWorkflow( ga_file=get_abspath("../examples/simple_galaxy/SimpleWorkflow.ga")) task = nebula.tasks.GalaxyWorkflowTask("test_workflow", workflow, inputs=input, parameters=parameters) service = GalaxyService(docstore=doc, name="nosetest_galaxy", galaxy="bgruening/galaxy-stable:dev", port=20022) task_path = get_abspath("../test_tmp/test.tasks") service_path = get_abspath("../test_tmp/test.service") taskset = TaskGroup() taskset.append(task) with open(task_path, "w") as handle: taskset.store(handle) with open(service_path, "w") as handle: service.get_config().set_docstore_config( cache_path=get_abspath("../test_tmp/cache")).store(handle) env = dict(os.environ) if 'PYTHONPATH' in env: env['PYTHONPATH'] += ":" + get_abspath("../") else: env['PYTHONPATH'] = get_abspath("../") subprocess.check_call( [get_abspath("../bin/nebula"), "run", service_path, task_path], env=env) for i in doc.filter(): print json.dumps(i, indent=4)
def testServiceDescription(self):
    store = FileDocStore("./test_tmp/docstore")
    service = nebula.service.GalaxyService(store)
    service_dict = service.to_dict()
    self.assertIn('service_type', service_dict)
    self.assertEqual('Galaxy', service_dict['service_type'])
    print service_dict
def testMesosLaunch(self):
    input_file_1 = Target("c39ded10-6073-11e4-9803-0800200c9a66")
    input_file_2 = Target("26fd12a2-9096-4af2-a989-9e2f1cb692fe")

    doc = FileDocStore(file_path="./test_tmp/docstore")

    logging.info("Adding files to object store")
    sync_doc_dir("examples/simple_galaxy/", doc, uuid_set=[
        "c39ded10-6073-11e4-9803-0800200c9a66",
        "26fd12a2-9096-4af2-a989-9e2f1cb692fe"
    ])

    task_1 = MD5Task(input_file_1)

    md5_service = nebula.service.md5_service.MD5Service(doc)

    sched = nebula.scheduler.Scheduler({})
    mesos = nebula.drms.mesos_runner.MesosDRMS(
        sched, {"mesos": "%s:%s" % (self.host_ip, CONFIG_PARENT_PORT)})
    mesos.start()

    mesos_md5_service = mesos.deploy_service(md5_service)

    job_1 = mesos_md5_service.submit(task_1)
    mesos_md5_service.wait([job_1])
    print job_1

    logging.info("Sleeping for 15")
    time.sleep(15)

    mesos.stop()
def testServiceGenerate(self):
    doc = FileDocStore(file_path=get_abspath("../test_tmp/docstore"))
    service = GalaxyService(docstore=doc,
                            name="nosetest_galaxy",
                            galaxy="bgruening/galaxy-stable",
                            port=20022)
    json.dumps(service.to_dict())
def testServiceStart(self):
    store = FileDocStore("./test_tmp/docstore")
    self.service = nebula.service.GalaxyService(store,
                                                name="nosetest_galaxy",
                                                force=True,
                                                port=20022)
    self.service.start()
    time.sleep(10)
    self.assertFalse(self.service.in_error())
def run_query(docstore, fields, size, filters):
    doc = FileDocStore(file_path=docstore)

    filter = {}
    for k in filters:
        tmp = k.split("=")
        filter[tmp[0]] = tmp[1]

    for id, entry in doc.filter(**filter):
        if fields is None or len(fields) == 0:
            line = entry
        else:
            line = dict((i, entry.get(i, "")) for i in fields)
        if size:
            size_value = doc.size(Target(uuid=entry['uuid']))
        else:
            size_value = ""
        print size_value, json.dumps(line)
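# Hypothetical invocation of run_query above (the docstore path and field
# names are illustrative only): each "key=value" string in filters is split
# on "=" and passed to doc.filter() as a keyword argument, so
#
#   run_query("./test_tmp/docstore", fields=["name", "state"],
#             size=False, filters=["state=ok"])
#
# would print one JSON line per entry whose 'state' field equals "ok",
# restricted to the 'name' and 'state' keys.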
def run_synapse(docstore, parent, workdir):
    doc = FileDocStore(file_path=docstore)

    syn = synapseclient.Synapse()
    syn.login()

    for id, entry in doc.filter(visible=True, data_type='galaxy.datatypes.tabular.Vcf'):
        if 'tags' in entry:
            sample = None
            for s in entry['tags']:
                tmp = s.split(":")
                if tmp[0] == 'sample':
                    sample = tmp[1]
            name = entry['name']
            name = re.sub(r'\.vcf$', '', name)
            file_name = sample + "." + name + ".snv_mnv.vcf"
            target = Target(uuid=entry['uuid'])
            if doc.size(target) > 0:
                src_file = doc.get_filename(target)
                dst_file = os.path.join(workdir, file_name)

                query = "select * from entity where parentId=='%s' and name=='%s'" % (parent, file_name + ".gz")
                r = syn.query(query)['results']
                if len(r) == 0:
                    #print r
                    print dst_file
                    shutil.copy(src_file, dst_file)
                    subprocess.check_call("bgzip %s" % (dst_file), shell=True)
                    f = synapseclient.File(dst_file + ".gz", parentId=parent, name=file_name + ".gz")
                    f.fileType = 'vcf'
                    f.pipeline = 'UCSC'
                    f.variant_type = "snv"
                    f = syn.store(f, executed="https://github.com/ucsccancer/pcawg_tools")
                else:
                    print "Skipping", file_name
def testToolTagging(self):
    doc = FileDocStore(file_path=get_abspath("../test_tmp/docstore"))
    sync_doc_dir(get_abspath("../examples/simple_galaxy/"), doc, uuid_set=[
        "c39ded10-6073-11e4-9803-0800200c9a66",
        "26fd12a2-9096-4af2-a989-9e2f1cb692fe"
    ])
    input_file_1 = Target(uuid="c39ded10-6073-11e4-9803-0800200c9a66")
    input_file_2 = Target(uuid="26fd12a2-9096-4af2-a989-9e2f1cb692fe")

    workflow = GalaxyWorkflow(
        ga_file=get_abspath("../examples/simple_galaxy/SimpleWorkflow.ga"))
    task_tag = nebula.tasks.GalaxyWorkflowTask(
        "workflow_ok", workflow,
        inputs={
            'input_file_1': input_file_1,
            'input_file_2': input_file_2
        },
        parameters={"tail_select": {"lineNum": 3}},
        tags=["run:testing"],
        tool_tags={
            "tail_select": {
                "out_file1": ["file:tail"]
            },
            "concat_out": {
                "out_file1": ["file:output"]
            }
        })

    print "Starting Service"
    service = GalaxyService(docstore=doc,
                            name="nosetest_galaxy",
                            galaxy="bgruening/galaxy-stable:dev",
                            force=True,
                            port=20022)
    service.start()
    self.service = service
    job = service.submit(task_tag)
    print "JOB", job.get_status()
    service.wait([job])
    self.assertIn(job.get_status(), ['ok'])
    self.assertFalse(service.in_error())
    print service.in_error()
def run_workflow(args):
    data_map = {}
    for meta_path in glob(os.path.join(args['lib_data'], "*.json")):
        data_path = re.sub(r'\.json$', "", meta_path)
        if os.path.exists(data_path):
            try:
                with open(meta_path) as handle:
                    meta = json.loads(handle.read())
                if 'uuid' in meta:
                    data_map[meta['uuid']] = data_path
            except Exception:
                pass

    d_url = urlparse(args['doc_store'])
    if d_url.scheme == '':
        doc = FileDocStore(file_path=d_url.path)
    else:
        raise Exception("Object Store type not supported: %s" % (d_url.scheme))

    #this side happens on the master node
    tasks = {}
    task_request = {}
    input_uuids = {}
    for i, input_file in enumerate(args['inputs']):
        with open(input_file) as handle:
            meta = json.loads(handle.read())

        inputs = {}
        for k, v in meta.get('ds_map').items():
            input_uuids[v['uuid']] = True
            t = Target(v['uuid'])
            if not doc.exists(t):
                if t.uuid not in data_map:
                    raise Exception("Can't find input data: %s" % (t.uuid))
                doc.update_from_file(t, data_map[t.uuid], create=True)
                doc.put(t.uuid, t.to_dict())
            inputs[k] = t
        params = meta.get("parameters", {})

        task_name = 'task_%s' % (i)
        if args['workflow'] is not None:
            task = GalaxyWorkflow(task_name, args['workflow'],
                                  inputs=inputs, parameters=params,
                                  tags=meta.get("tags", None),
                                  galaxy=args['galaxy'],
                                  tool_dir=args['tool_dir'],
                                  tool_data=args['tool_data'])
        else:
            with open(args['yaml_workflow']) as handle:
                yaml_text = handle.read()
            task = GalaxyWorkflow(task_name, yaml=yaml_text,
                                  inputs=inputs, parameters=params,
                                  tags=meta.get("tags", None),
                                  docker=args['galaxy'],
                                  tool_dir=args['tools'],
                                  tool_data=args['tool_data'])
        task_request[task_name] = meta
        task_data = task.get_task_data()
        tasks[task_name] = task_data

    #this side happens on the worker node
    service = ServiceFactory('galaxy',
                             objectstore=doc,
                             lib_data=[doc.file_path],
                             tool_dir=args['tool_dir'],
                             tool_data=args['tool_data'],
                             galaxy=args['galaxy'],
                             config_dir=args['config_dir'],
                             sudo=args['sudo'],
                             force=True,
                             tool_docker=True,
                             smp=args['smp'],
                             cpus=args['cpus'],
                             work_dir=args['work_dir'])
    service.start()

    task_job_ids = {}
    for task_name, task_data in tasks.items():
        task = TaskJob(task_data)
        i = service.submit(task)
        task_job_ids[task_name] = i

    sleep_time = 1
    while True:
        waiting = False
        for i in task_job_ids.values():
            status = service.status(i)
            logging.info("Status check %s %s" % (status, i))
            if status not in ['ok', 'error']:
                waiting = True
        if not waiting:
            break
        time.sleep(sleep_time)
        if sleep_time < 60:
            sleep_time += 1

    #move the output data into the datastore
    for task_name, i in task_job_ids.items():
        job = service.get_job(i)
        if job.error is None:
            for a in job.get_outputs():
                meta = service.get_meta(a)
                #if 'tags' in task_request[task_name]:
                #    meta["tags"] = task_request[task_name]["tags"]
                #print "meta!!!", json.dumps(meta, indent=4)
                doc.put(meta['uuid'], meta)
                if meta.get('visible', True):
                    if meta['state'] == "ok":
                        if meta['uuid'] not in input_uuids:
                            logging.info("Downloading: %s" % (meta['uuid']))
                            service.store_data(a, doc)
                        else:
                            logging.info("Skipping input file %s" % (a))
                    else:
                        logging.info("Skipping non-ok file: %s" % (meta['state']))
                else:
                    logging.info("Skipping Download %s (not visible)" % (a))

    logging.info("Done")
    if not args['hold']:
        service.stop()
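# run_workflow above reads one JSON metadata file per entry in args['inputs'].
# Based on the keys it accesses ('ds_map', 'parameters', 'tags'), a minimal
# input file might look like the following sketch; the UUID and step name are
# illustrative only:
#
#   {
#       "ds_map": {
#           "input_file_1": {"uuid": "c39ded10-6073-11e4-9803-0800200c9a66"}
#       },
#       "parameters": {"tail_select": {"lineNum": 3}},
#       "tags": ["run:testing"]
#   }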
def run_get(docstore, uuid, outpath):
    doc = FileDocStore(file_path=docstore)
    print doc.get_filename(Target(uuid=uuid))
def testRunSimple(self):
    input = {
        "input_file_1" : Target("c39ded10-6073-11e4-9803-0800200c9a66"),
        "input_file_2" : Target("26fd12a2-9096-4af2-a989-9e2f1cb692fe")
    }
    parameters = {
        "tail_select" : {
            "lineNum" : 3
        }
    }
    bad_parameters = dict(parameters)
    del bad_parameters['tail_select']

    doc = FileDocStore(file_path="./test_tmp/docstore")

    logging.info("Adding files to object store")
    sync_doc_dir("examples/simple_galaxy/", doc,
        uuid_set=["c39ded10-6073-11e4-9803-0800200c9a66", "26fd12a2-9096-4af2-a989-9e2f1cb692fe"]
    )

    logging.info("Creating Task")
    workflow = GalaxyWorkflow(ga_file="examples/simple_galaxy/SimpleWorkflow.ga")
    task = nebula.tasks.GalaxyWorkflowTask(
        "test_workflow", workflow,
        inputs=input,
        parameters=parameters
    )

    task_data = task.to_dict()
    #make sure the task data can be serialized
    task_data_str = json.dumps(task_data)

    service = GalaxyService(
        docstore=doc,
        name="nosetest_galaxy",
        galaxy="bgruening/galaxy-stable",
        force=True,
        port=20022
    )
    self.service = service

    #make sure the generated task is serializable
    new_task_data = json.loads(task_data_str)
    new_task = nebula.tasks.from_dict(new_task_data)

    logging.info("Starting Service")
    print "Starting service"
    service.start()
    self.assertFalse( service.in_error() )

    logging.info("Starting Tasks")
    job = service.submit(new_task)
    self.assertTrue( isinstance(job, TaskJob) )
    self.assertFalse( service.in_error() )

    #logging.info("Waiting")
    service.wait([job])
    self.assertIn(job.get_status(), ['ok'])

    bad_task = nebula.tasks.GalaxyWorkflowTask(
        "test_workflow_bad", workflow,
        inputs=input,
        parameters=bad_parameters
    )
    job = service.submit(bad_task)
    service.wait([job])
    self.assertIn(job.get_status(), ['error'])
    self.assertFalse( service.in_error() )
def run_copy(docstore, out_docstore):
    doc = FileDocStore(file_path=docstore)
    out_doc = FileDocStore(file_path=out_docstore)

    for id, entry in doc.filter():
        if out_doc.get(id) is None:
            print "copy", id
            out_doc.put(id, entry)
            if doc.exists(entry):
                src_path = doc.get_filename(entry)
                out_doc.create(entry)
                dst_path = out_doc.get_filename(entry)
                shutil.copy(src_path, dst_path)
                out_doc.update_from_file(entry)
        else:
            #print "skip", id, doc.size(entry), out_doc.size(entry)
            if doc.size(entry) != out_doc.size(entry):
                print "mismatch", id
def run_scan(docstore, workdir, keyfile, upload_url, manifest):
    doc = FileDocStore(file_path=docstore)

    file_map = {
        'broad' : {},
        'muse' : {}
    }

    wl_map = {}
    with open(manifest) as handle:
        reader = csv.DictReader(handle, delimiter="\t")
        for row in reader:
            wl_map[row['Donor_ID']] = row

    for id, entry in doc.filter(visible=True):
        if entry.get('extension', None) in ["vcf", "vcf_bgzip"]:
            if 'tags' in entry:
                sample = None
                for s in entry['tags']:
                    tmp = s.split(":")
                    if tmp[0] == 'sample':
                        sample = tmp[1]

                pipeline = None
                method = None
                call_type = None
                variant_type = None
                if entry['name'] in ['MUSE_1.0rc', 'MUSE_0.9.9.5']:
                    pipeline = "muse"
                    method = entry['name'].replace(".", "-")
                    variant_type = 'somatic'
                    call_type = 'snv_mnv'
                elif entry['name'].split(".")[0] in ['broad-dRanger', 'broad-dRanger_snowman', 'broad-snowman', 'broad-mutect']:
                    pipeline = "broad"
                    method = entry['name'].split(".")[0]
                    if 'somatic' in entry['name']:
                        variant_type = 'somatic'
                    elif 'germline' in entry['name']:
                        variant_type = 'germline'
                    else:
                        raise Exception("Unknown variant type")
                    if 'snv_mnv.vcf' in entry['name']:
                        call_type = 'snv_mnv'
                    elif 'sv.vcf' in entry['name']:
                        call_type = 'sv'
                    elif 'indel.vcf' in entry['name']:
                        call_type = 'indel'
                    else:
                        raise Exception("Unknown call type: %s" % (entry['name']))
                else:
                    raise Exception("Unknown pipeline %s" % (entry['name']))

                datestr = datetime.datetime.now().strftime("%Y%m%d")
                name = "%s.%s.%s.%s.%s" % (sample, method, datestr, variant_type, call_type)
                name = re.sub(r'\.vcf$', '', name)
                if entry['extension'] == 'vcf':
                    file_name = name + ".vcf"
                elif entry['extension'] == 'vcf_bgzip':
                    file_name = name + ".vcf.gz"
                print file_name

                target = Target(uuid=entry['uuid'])
                if doc.size(target) > 0:
                    src_file = doc.get_filename(target)
                    dst_file = os.path.join(workdir, file_name)
                    shutil.copy(src_file, dst_file)

                    if entry['extension'] == 'vcf':
                        subprocess.check_call("bgzip %s" % dst_file, shell=True)
                        dst_file = dst_file + ".gz"

                    subprocess.check_call("tabix -p vcf %s" % (dst_file), shell=True)
                    shutil.move("%s.tbi" % (dst_file), "%s.idx" % (dst_file))

                    subprocess.check_call("md5sum %s | awk '{print$1}' > %s.md5" % (dst_file, dst_file), shell=True)
                    subprocess.check_call("md5sum %s.idx | awk '{print$1}' > %s.idx.md5" % (dst_file, dst_file), shell=True)

                    if sample not in file_map[pipeline]:
                        file_map[pipeline][sample] = []
                    input_file = os.path.basename(dst_file)
                    file_map[pipeline][sample].append(input_file)

    for pipeline, samples in file_map.items():
        for sample, files in samples.items():
            with open(os.path.join(workdir, "%s.%s.sh" % (pipeline, sample)), "w") as handle:
                urls = [
                    "%scghub/metadata/analysisFull/%s" % (wl_map[sample]['Normal_GNOS_endpoint'], wl_map[sample]['Normal_Analysis_ID']),
                    "%scghub/metadata/analysisFull/%s" % (wl_map[sample]['Tumour_GNOS_endpoint'], wl_map[sample]['Tumour_Analysis_ID'])
                ]
                cmd_str = "perl /opt/vcf-uploader/gnos_upload_vcf.pl"
                cmd_str += " --metadata-urls %s" % (",".join(urls))
                cmd_str += " --vcfs %s " % (",".join(files))
                cmd_str += " --vcf-md5sum-files %s " % (",".join( ("%s.md5" % i for i in files) ))
                cmd_str += " --vcf-idxs %s" % (",".join( ("%s.idx" % i for i in files) ))
                cmd_str += " --vcf-idx-md5sum-files %s" % (",".join( ("%s.idx.md5" % i for i in files) ))
                cmd_str += " --outdir %s.%s.dir" % (pipeline, sample)
                cmd_str += " --key %s " % (keyfile)
                cmd_str += " --upload-url %s" % (upload_url)
                cmd_str += " --study-refname-override tcga_pancancer_vcf_test"
                handle.write("""#!/bin/bash
%s
""" % (cmd_str))
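# run_scan above indexes the manifest by 'Donor_ID' and later reads the GNOS
# endpoint and analysis-ID columns for each sample. A minimal tab-separated
# manifest consistent with those lookups might look like the sketch below;
# the donor ID, endpoint, and analysis-ID values are placeholders, not real
# data:
#
#   Donor_ID	Normal_GNOS_endpoint	Normal_Analysis_ID	Tumour_GNOS_endpoint	Tumour_Analysis_ID
#   DONOR-0001	https://example-gnos-repo/	<normal-analysis-uuid>	https://example-gnos-repo/	<tumour-analysis-uuid>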