def main(parser):
    args = parser.parse_args()
    if not args.names:
        assert args.dataName, 'Must pass dataName if not using -n option'
    # set up connection to service
    sal = ServiceAccessLayer(args.host, args.port)
    # get datastore
    dstore = sal.get_analysis_job_datastore(args.jobNumber)
    # loop through data
    for uuid, dsfile in dstore.files.items():
        if args.names:
            # print the attribute values
            print '\t'.join([fmt(getattr(dsfile, a)) for a in attrs])
        elif dsfile.name in args.dataName:
            # cp file to outdir
            ofile = '{o}{s}{n}'.format(o=args.outDir,
                                       s=os.path.sep,
                                       n=os.path.basename(dsfile.path))
            copy(dsfile.path, ofile)
            print '\t'.join([fmt('\'%s\'' % dsfile.name), '=>', ofile])
    return

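# Illustrative parser sketch for main() above. The option names mirror the
# attributes main() reads (host, port, jobNumber, names, dataName, outDir),
# but the exact flags and defaults are assumptions, not the original CLI.
def _example_datastore_parser():
    import argparse
    p = argparse.ArgumentParser(description='List or export job datastore files')
    p.add_argument('jobNumber', type=int)
    p.add_argument('--host', default='localhost')  # assumed default
    p.add_argument('--port', type=int, default=8081)  # assumed default
    p.add_argument('-n', '--names', action='store_true',
                   help='print datastore file attributes instead of copying')
    p.add_argument('--dataName', nargs='*',
                   help='datastore file name(s) to copy to outDir')
    p.add_argument('--outDir', default='.')
    return p
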
def main(parser):
    args = parser.parse_args()
    jobs = pd.read_csv(args.jobCsv)
    sal = ServiceAccessLayer(args.host, args.port)
    # get dicts of values for all jobs
    rpts = jobs.jobId.apply(sal.get_analysis_job_report_attrs).values
    # check for unfinished jobs, exit if any
    unfinished = [j for j in jobs.jobId
                  if sal.get_job_by_id(j).state not in FINISHED]
    if unfinished:
        for j in unfinished:
            print 'job %i still running' % j
        print 'Exiting'
        sys.exit()
    # put the reports together and index with (jobName, host, jobId, link)
    jobs['link'] = jobs[['host', 'jobId']].apply(LINKFMT, axis=1)
    columns = ['jobName', 'host', 'jobId', 'link']
    idx = pd.MultiIndex.from_arrays(jobs[columns].values.T, names=columns)
    collated = pd.DataFrame.from_records(rpts, index=idx).T
    for fmt, fnc in zip(['.csv', '.xls'],
                        [pd.DataFrame.to_csv, pd.DataFrame.to_excel]):
        ofile = '{d}/{name}{fmt}'.format(d=args.outDir, name=DEFAULTCSV,
                                         fmt=fmt)
        fnc(collated, ofile)  # float_format=FLOATFMT
        print 'Wrote results to %s' % ofile
    return None

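# The jobCsv read by main() above must provide at least jobId, host and
# jobName columns (link is derived via LINKFMT). A minimal illustrative
# input file (the rows are made-up values):
#
#   jobName,host,jobId
#   sample-A,smrtlink-host,1234
#   sample-B,smrtlink-host,1235
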
def run_import_fasta(host, port, fasta_path, name, organism, ploidy,
                     block=False):
    sal = ServiceAccessLayer(host, port)
    if block is True:
        sal.run_import_fasta(fasta_path, name, organism, ploidy)
    else:
        sal.import_fasta(fasta_path, name, organism, ploidy)
    return 0

def get_sal_and_status(host, port):
    """Get Sal or Raise if status isn't successful"""
    try:
        sal = ServiceAccessLayer(host, port)
        sal.get_status()
        return sal
    except RequestException as e:
        log.error("Failed to connect to {h}:{p}".format(h=host, p=port))
        raise

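# Illustrative use of get_sal_and_status() above; the host and port values
# are assumptions.
def _example_check_services():
    # raises RequestException if the services can't be reached
    sal = get_sal_and_status('smrtlink-host', 8081)
    print sal.to_summary()
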
def run_services_testkit_job(host, port, testkit_cfg,
                             xml_out="test-output.xml",
                             ignore_test_failures=False, time_out=1800,
                             sleep_time=2, import_only=False,
                             test_job_id=None):
    """
    Given a testkit.cfg and host/port parameters:

    1. convert the .cfg to a JSON file
    2. connect to the SMRTLink services and start the job, then block
       until it finishes
    3. run the standard test suite on the job output
    """
    sal = ServiceAccessLayer(host, port, sleep_time=sleep_time)
    if test_job_id is not None:
        engine_job = sal.get_job_by_id(test_job_id)
        return run_butler_tests_from_cfg(testkit_cfg=testkit_cfg,
                                         output_dir=engine_job.path,
                                         output_xml=xml_out,
                                         service_access_layer=sal,
                                         services_job_id=test_job_id)
    entrypoints = get_entrypoints(testkit_cfg)
    pipeline_id = pipeline_id_from_testkit_cfg(testkit_cfg)
    job_id = job_id_from_testkit_cfg(testkit_cfg)
    log.info("job_id = {j}".format(j=job_id))
    log.info("pipeline_id = {p}".format(p=pipeline_id))
    log.info("url = {h}:{p}".format(h=host, p=port))
    task_options, workflow_options = get_task_and_workflow_options(testkit_cfg)
    service_entrypoints = [ServiceEntryPoint.from_d(x)
                           for x in entrypoints_dicts(entrypoints)]
    for ep, dataset_xml in entrypoints.iteritems():
        log.info("Importing {x}".format(x=dataset_xml))
        sal.run_import_local_dataset(dataset_xml)
    if import_only:
        log.info("Skipping job execution")
        return 0
    log.info("starting analysis job...")
    # XXX note that workflow options are currently ignored
    engine_job = run_analysis_job(sal, job_id, pipeline_id,
                                  service_entrypoints, block=True,
                                  time_out=time_out,
                                  task_options=task_options)
    exit_code = run_butler_tests_from_cfg(testkit_cfg=testkit_cfg,
                                          output_dir=engine_job.path,
                                          output_xml=xml_out,
                                          service_access_layer=sal,
                                          services_job_id=engine_job.id)
    if ignore_test_failures and engine_job.was_successful():
        return 0
    return exit_code

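# Illustrative end-to-end testkit invocation; the cfg path and host/port
# values are assumptions.
def _example_testkit_run():
    return run_services_testkit_job('smrtlink-host', 8081,
                                    'path/to/testkit.cfg',
                                    xml_out='test-output.xml',
                                    time_out=1800)
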
def args_get_sal_summary(args):
    host = args.host
    port = args.port
    sal = ServiceAccessLayer(host, port)
    print sal.to_summary()
    return 0

def run_import_fasta(host, port, fasta_path, name, organism, ploidy,
                     block=False):
    sal = ServiceAccessLayer(host, port)
    log.info("importing ({s:.2f} MB) {f}".format(s=_get_size_mb(fasta_path),
                                                 f=fasta_path))
    if block is True:
        result = sal.run_import_fasta(fasta_path, name, organism, ploidy)
        log.info("Successfully imported {f}".format(f=fasta_path))
        log.info("result {r}".format(r=result))
    else:
        sal.import_fasta(fasta_path, name, organism, ploidy)
    return 0

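# Illustrative blocking FASTA import using run_import_fasta() above; the
# path and metadata values are assumptions.
def _example_import_fasta():
    return run_import_fasta('smrtlink-host', 8081, '/path/to/reference.fasta',
                            name='my-reference', organism='unknown',
                            ploidy='haploid', block=True)
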
def args_run_analysis_job(args):
    log.debug(args)
    with open(args.json_path, 'r') as f:
        d = json.loads(f.read())
    log.debug("Loaded \n" + pprint.pformat(d))
    job_name, pipeline_id, service_entry_points = load_analysis_job_json(d)
    sal = ServiceAccessLayer(args.host, args.port)
    # this should raise if there's a failure
    result = run_analysis_job(sal, job_name, pipeline_id,
                              service_entry_points, block=args.block)
    return 0

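# args_run_analysis_job() expects a JSON file whose schema is defined by
# load_analysis_job_json(); judging from the unpacked tuple it must encode a
# job name, a pipeline id and a list of service entry points. A guessed,
# purely illustrative shape (all key names are assumptions):
#
#   {
#     "name": "my-analysis-job",
#     "pipelineId": "pbsmrtpipe.pipelines.my_pipeline",
#     "entryPoints": [
#       {"entryId": "eid_subread",
#        "fileTypeId": "PacBio.DataSet.SubreadSet",
#        "datasetId": 42}
#     ]
#   }
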
def main(parser, options):
    '''options is a list of json task options added to the parser'''
    args = parser.parse_args()
    # must have at least one input
    assert (args.subreadSetID or args.subreadSetIdCsv), 'Must define -s or -S'
    # load settings from template
    presets = json.load(open(PRESETS_TEMPLATE))
    if args.subreadSetID:
        name = ''
        ssIdx = {name: int(args.subreadSetID)}
    else:
        ssIdx = parseSubreadsetIdCsv(args.subreadSetIdCsv)
    sal = ServiceAccessLayer(args.host, args.port)
    # prepare file to report jobs started
    columns = ['host', 'jobId', 'jobName', 'jobPath']
    csvfmt = ','.join(map('{{{}}}'.format, columns)) + '\n'
    csvFile = open('{d}/{f}'.format(d=args.outDir, f=JOBCSVNAME), 'w')
    # write header
    csvFile.write(','.join(columns) + '\n')
    print 'starting jobs for {i} subreadsets'.format(i=len(ssIdx))
    for name, ssId in ssIdx.items():
        # get the subreadset
        ss = sal.get_subreadset_by_id(ssId)
        # set the job name
        if args.jobName:
            jobName = args.jobName
        elif name:
            jobName = name + NAMEPOSTFIX
        else:
            jobName = ss['name'] + NAMEPOSTFIX
        presets['name'] = jobName
        # set entry subreadset
        setEntryPoint(presets, ssId)
        # set all options
        for opt in options:
            setTaskOption(presets, opt, getattr(args, opt))
        # write presets json
        job_pre = '{d}/{name}_presets.json'.format(
            d=os.path.abspath(args.outDir),
            name=cleanName(jobName.replace(' ', '_')))
        with open(job_pre, 'w') as oFile:
            json.dump(presets, oFile, indent=2)
        # start job
        print 'Starting job {name}, {time}'.format(
            name=jobName, time=time.asctime(time.localtime()))
        job = startJob(job_pre, host=args.host, port=args.port)
        jobSummary = job['JOB SUMMARY']
        csvFile.write(csvfmt.format(host=args.host,
                                    jobId=int(jobSummary['id']),
                                    jobName=jobSummary['name'],
                                    jobPath=jobSummary['path']))
        if args.wait:
            print 'waiting %i minutes' % args.wait
            time.sleep(60 * args.wait)
    csvFile.close()
    return None

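# The -S/--subreadSetIdCsv input is parsed by parseSubreadsetIdCsv() (not
# shown) into a {name: subreadSetId} mapping. An illustrative layout
# consistent with that mapping (header and rows are assumptions):
#
#   name,subreadSetId
#   sample-A,101
#   sample-B,102
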
def run_main(host, port, nprocesses, ntimes, profile_csv):
    # logging.basicConfig(level=logging.DEBUG, file=sys.stdout)
    profile_d = {}
    started_at = time.time()
    log.info(FUNCS.keys())
    sal = ServiceAccessLayer(host, port)
    status = sal.get_status()
    log.info("Status {}".format(status))
    profile_d['nprocesses'] = nprocesses
    profile_d["init_nsubreads"] = len(sal.get_subreadsets())
    profile_d['init_nreferences'] = len(sal.get_referencesets())
    profile_d['init_njobs'] = len(sal.get_analysis_jobs())
    chunksize = 6
    info = "{h}:{p} with ntimes:{n} with processors:{x}".format(
        h=host, p=port, n=ntimes, x=nprocesses)

    # FIXME. All paths are relative to smrtflow root
    def to_p(rpath):
        return os.path.join(os.getcwd(), rpath)

    # DataSet
    referenceset_path = to_p("test-data/smrtserver-testdata/ds-references/"
                             "mk-01/mk_name_01/referenceset.xml")
    subreadset_path = to_p("test-data/smrtserver-testdata/ds-subreads/"
                           "PacBioTestData/"
                           "m54006_160504_020705.tiny.subreadset.xml")
    # Run Design
    run_design_path = to_p("smrt-server-link/src/test/resources/runCreate2.xml")
    # Dev Diagnostic
    analysis_json = to_p("smrt-server-analysis/src/test/resources/"
                         "analysis-dev-diagnostic-stress-01.json")
    output_dir_prefix = to_p("test-output")
    if not os.path.exists(output_dir_prefix):
        os.mkdir(output_dir_prefix)
    # import referenceset with original UUID for the dev_diagnostic run
    _run_cmd("pbservice import-dataset --host={h} --port={p} {x}".format(
        h=host, p=port, x=referenceset_path))
    xs = _generate_data(host, port, [referenceset_path, subreadset_path],
                        analysis_json, run_design_path, output_dir_prefix,
                        ntimes)
    log.info("Starting {i}".format(i=info))
    p = multiprocessing.Pool(nprocesses)
    results = p.map(runner, xs, chunksize=chunksize)
    failed = [r for r in results if r.exit_code != 0]
    was_successful = len(failed) == 0
    for f in failed:
        log.error(f)
    log.debug("exiting {i}".format(i=info))
    if failed:
        log.error("Failed Results {r} of {x}".format(r=len(failed),
                                                     x=len(results)))
    run_time_sec = time.time() - started_at
    profile_d['nresults'] = len(results)
    profile_d['nfailed'] = len(failed)
    profile_d['was_successful'] = was_successful
    profile_d["final_nsubreads"] = len(sal.get_subreadsets())
    profile_d['final_nreferences'] = len(sal.get_referencesets())
    profile_d['final_njobs'] = len(sal.get_analysis_jobs())
    profile_d['run_time_sec'] = run_time_sec
    write_profile(profile_d, profile_csv)
    return 0 if was_successful else 1

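# Illustrative invocation of the stress-test run_main() above; the worker
# and iteration counts and the output CSV name are assumptions.
def _example_stress_run():
    return run_main('smrtlink-host', 8081, nprocesses=4, ntimes=10,
                    profile_csv='stress-profile.csv')
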
def get_sal_and_status(host, port):
    """Get Sal or Raise if status isn't successful"""
    sal = ServiceAccessLayer(host, port)
    sal.get_status()
    return sal

def run_import_local_datasets(host, port, xml_or_dir):
    sal = ServiceAccessLayer(host, port)
    file_func = functools.partial(import_local_dataset, sal)
    dir_func = functools.partial(import_datasets, sal)
    return run_file_or_dir(file_func, dir_func, xml_or_dir)

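# run_file_or_dir() dispatches on whether xml_or_dir is a single dataset XML
# or a directory to scan. Illustrative calls (paths are assumptions):
def _example_import_local():
    run_import_local_datasets('smrtlink-host', 8081,
                              '/data/subreads/my.subreadset.xml')
    return run_import_local_datasets('smrtlink-host', 8081, '/data/subreads')
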
def run_main(path, host, port, job_name, pipeline_id, referenceset_uuid,
             block=False, custom_options=None):
    """
    :param path: Path to the SubreadSet XML to be imported (if it's not
        already been imported)
    :param host: SL Host
    :param port: SL Port
    :param job_name: Job name
    :param pipeline_id: Pipeline Id (e.g., pbsmrtpipe.pipelines.my_pipeline)
    :param referenceset_uuid: UUID of the ReferenceSet. This *must* already
        be imported
    :param block: To block and poll for the analysis job to complete
    :param custom_options: Dictionary of task options for the provided
        Pipeline in the form {"pbalign.task_options.concordant": True}
    :type custom_options: dict | None
    :rtype: int
    """
    # look up the reference set UUID from pbservice CLI util or
    # http://smrtlink-beta:8081/secondary-analysis/datasets/references

    # TODO. 1. Import SubreadSet if it's not already imported
    # TODO. 2. Check and see if a Job with the SubreadSet UUID was already submitted
    # TODO. 3. Add option to force a new submission to override (2)
    # TODO. 4. Enable custom pipeline options json file at the CLI

    # sanity test
    sset = SubreadSet(path)
    log.info("Loaded SubreadSet {}".format(sset))
    sal = ServiceAccessLayer(host, port)
    # Sanity Check
    _ = sal.get_status()

    # Step 1. Import SubreadSet (and block) if it's not imported already
    service_sset = sal.get_subreadset_by_id(sset.uuid)
    # TODO. Add check to see if Job was successful
    if service_sset is None:
        log.info("Running Import-DataSet job with {}".format(path))
        sset_import_job = sal.run_import_dataset_subread(path)
        log.info("Import-DataSet job {}".format(sset_import_job))
    else:
        log.info("Found already imported SubreadSet {}".format(service_sset))

    # Step 2. Check and see if a previous analysis job has already been run.
    # Immediately exit if an analysis job is found
    analysis_job = get_job_by_subreadset_uuid_or_none(sal, sset.uuid)
    if analysis_job is not None:
        log.info("Found existing job {} for SubreadSet {}".format(
            analysis_job, sset))
        return 0

    # Step 3. Create a new Analysis job with custom task options (if provided)
    task_options = {} if custom_options is None else custom_options
    # Get the already Successfully imported DataSets
    service_sset_d = sal.get_dataset_by_uuid(sset.uuid)
    service_rset_d = sal.get_dataset_by_uuid(referenceset_uuid)
    f = sal.run_by_pipeline_template_id if block else sal.create_by_pipeline_template_id
    # The API takes the Int id of the DataSet
    epoints = (ServiceEntryPoint("eid_subread",
                                 FileTypes.DS_SUBREADS.file_type_id,
                                 service_sset_d['id']),
               ServiceEntryPoint("eid_ref_dataset",
                                 FileTypes.DS_REF.file_type_id,
                                 service_rset_d['id']))
    job = f(job_name, pipeline_id, epoints, task_options=task_options)
    log.info("Analysis Job {}".format(job))
    if block:
        exit_code = 0 if job.state == JobStates.SUCCESSFUL else 1
    else:
        # the job is in the created state
        exit_code = 0
    return exit_code

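# Illustrative call to the pipeline-submitting run_main() above; the paths,
# UUID and option value are assumptions. The task-option key format follows
# the docstring.
def _example_submit_pipeline():
    return run_main('/path/to/my.subreadset.xml', 'smrtlink-host', 8081,
                    job_name='my-job',
                    pipeline_id='pbsmrtpipe.pipelines.my_pipeline',
                    referenceset_uuid='11111111-2222-3333-4444-555555555555',
                    block=True,
                    custom_options={'pbalign.task_options.concordant': True})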