def testJsonSubmitjob(): print "Running testJsonSubmitjob " # write out a temporary file with our query/dependencies query = tempfile.NamedTemporaryFile(delete=False) name = query.name queryFile = "pig-" + str(time.time()) + ".q" query.write("fs -copyFromLocal %s %s; cmd = load '%s'; dump cmd;" % (queryFile, queryFile, queryFile)) query.close() # read it back in as base64 encoded binary query = open(name, "rb") contents = base64.b64encode(query.read()) print contents query.close() os.unlink(name) payload = ( """ { "jobInfo": { "jobName": "PIG-JOB-TEST", "description": "This is a test", "userName" : "genietest", "groupName" : "hadoop", "jobType": "pig", "configuration": "prod", "schedule": "adHoc", "cmdArgs": "-f """ + queryFile + '''", "attachments": { "data": "''' + contents + '''", "name": "''' + queryFile + """" } } } """ ) print payload print "\n" return jobs.submitJob(serviceUrl, payload)
def testXmlSubmitjob(): print "Running testXmlSubmitjob" payload = ''' <request> <jobInfo> <jobID>''' + jobID + '''</jobID> <jobName>HADOOP-FS-CLIENT-TEST</jobName> <userName>genietest</userName> <groupName>hadoop</groupName> <userAgent>laptop</userAgent> <jobType>hadoop</jobType> <schedule>adHoc</schedule> <cmdArgs>fs -ls /</cmdArgs> </jobInfo> </request> ''' print payload return jobs.submitJob(serviceUrl, payload, 'application/xml')
def testXmlSubmitjob(): print "Running testXmlSubmitjob" payload = ''' <request> <jobInfo> <jobID>''' + jobID + '''</jobID> <jobName>HADOOP-FS-CLIENT-TEST</jobName> <userName>genietest</userName> <groupName>hadoop</groupName> <userAgent>laptop</userAgent> <jobType>hadoop</jobType> <configuration>prod</configuration> <schedule>adHoc</schedule> <cmdArgs>fs -ls /</cmdArgs> </jobInfo> </request> ''' print payload return jobs.submitJob(serviceUrl, payload, 'application/xml')
def testJsonSubmitjob(): print "Running testJsonSubmitjob " payload = ''' { "jobInfo": { "jobName": "HADOOP-JOB-TEST", "userName" : "genietest", "groupName" : "hadoop", "userAgent" : "laptop", "jobType": "hadoop", "schedule": "adHoc", "cmdArgs":"jar hadoop-examples.jar sleep -m 1 -mt 1", "fileDependencies":"''' + GENIE_TEST_PREFIX + '''/hadoop-examples.jar" } } ''' print payload print "\n" return jobs.submitJob(serviceUrl, payload)
def testJsonSubmitjob(): print "Running testJsonSubmitjob " payload = ''' { "jobInfo": { "jobName": "HADOOP-JOB-TEST", "userName" : "blahgenieamsharma", "groupName" : "hadoop", "userAgent" : "laptop", "jobType": "hadoop", "schedule": "ADHOC", "clusterName": "h2query", "cmdArgs":"jar hadoop-examples.jar pi 50 10", "fileDependencies":"''' + GENIE_TEST_PREFIX + '''/hadoop-examples.jar" } } ''' print payload print "\n" return jobs.submitJob(serviceUrl, payload)
def testJsonSubmitjob(): print "Running testJsonSubmitjob " payload = ''' { "jobInfo": { "jobName": "HADOOP-JOB-TEST", "userName" : "genietest", "groupName" : "hadoop", "userAgent" : "laptop", "jobType": "hadoop", "schedule": "adHoc", "clusterName": "h2query", "cmdArgs":"jar hadoop-examples.jar sleep -m 1 -mt 1", "fileDependencies":"''' + GENIE_TEST_PREFIX + '''/hadoop-examples.jar" } } ''' print payload print "\n" return jobs.submitJob(serviceUrl, payload)
def hiveSubmitJob(): print "Running hiveSubmitJob" payload = ''' { "jobInfo": { "jobName": "HIVE-KILL-TEST", "userName" : "genietest", "groupName" : "hadoop", "userAgent" : "laptop", "jobType": "hive", "configuration": "prod", "schedule": "adHoc", "cmdArgs": "-f hive.q", "fileDependencies":"''' + GENIE_TEST_PREFIX + '''/hive.q" } } ''' print payload print "\n" return jobs.submitJob(serviceUrl, payload)
def testXmlSubmitjob(): print "Running testXmlSubmitjob" payload = ( """ <request> <jobInfo> <jobID>""" + jobID + """</jobID> <jobName>HADOOP-FS-CLIENT-TEST</jobName> <userName>genietest</userName> <groupName>hadoop</groupName> <userAgent>laptop</userAgent> <jobType>hadoop</jobType> <schedule>adHoc</schedule> <cmdArgs>fs -ls /</cmdArgs> </jobInfo> </request> """ ) print payload return jobs.submitJob(serviceUrl, payload, "application/xml")
def testJsonSubmitjob(): print "Running testJsonSubmitjob " payload = ''' { "jobInfo": { "jobName": "PIG-CLIENT-NO-UUID-JSON", "userName" : "genietest", "groupName" : "DSE", "userAgent" : "laptop", "jobType": "pig", "configuration": "prod", "schedule": "adHoc", "cmdArgs":"-f pig.q", "pigVersion": "0.11", "fileDependencies":"''' + GENIE_TEST_PREFIX + '''/pig.q" } } ''' print payload print "\n" return jobs.submitJob(serviceUrl, payload)
def testJsonSubmitjob(): print "Running testJsonSubmitjob " payload = ''' { "jobInfo": { "jobName": "PIG-JOB-TEST", "userName" : "amsharma", "groupName" : "hadoop", "userAgent" : "laptop", "jobType": "pig", "configuration": "prod", "schedule": "adHoc", "clusterName": "h2query", "cmdArgs":" -f pig2.q", "fileDependencies":"''' + GENIE_TEST_PREFIX + '''/pig2.q" } } ''' print payload print "\n" return jobs.submitJob(serviceUrl, payload)
def testJsonSubmitjob(): print "Running testJsonSubmitjob " payload = ''' { "jobInfo": { "jobName": "HIVE-JOB-TEST", "description": "This is a test", "userName" : "amsharma", "groupName" : "hadoop", "jobType": "hive", "configuration": "prod", "schedule": "ADHOC", "clusterName": "h24query", "cmdArgs": "-f hive.q", "fileDependencies":"''' + GENIE_TEST_PREFIX + '''/hive.q" } } ''' print payload print "\n" return jobs.submitJob(serviceUrl, payload)
def testJsonSubmitjob(): print "Running testJsonSubmitjob " payload = ''' { "jobInfo": { "jobName": "HIVE-JOB-TEST", "description": "This is a test", "userName" : "genietest", "groupName" : "hadoop", "jobType": "hive", "configuration": "prod", "schedule": "adHoc", "clusterName": "h2query", "cmdArgs": "-f hive.q", "fileDependencies":"''' + GENIE_TEST_PREFIX + '''/hive.q" } } ''' print payload print "\n" return jobs.submitJob(serviceUrl, payload)
def testJsonSubmitjob(): print "Running testJsonSubmitjob " # write out a temporary file with our query/dependencies query = tempfile.NamedTemporaryFile(delete=False) name = query.name queryFile = 'pig-' + str(time.time()) + '.q' query.write("fs -copyFromLocal %s %s; cmd = load '%s'; dump cmd;" % (queryFile, queryFile, queryFile)) query.close() # read it back in as base64 encoded binary query = open(name, "rb") contents = base64.b64encode(query.read()) print contents query.close() os.unlink(name) payload = ''' { "jobInfo": { "jobName": "PIG-JOB-TEST", "description": "This is a test", "userName" : "genietest", "groupName" : "hadoop", "jobType": "pig", "configuration": "prod", "schedule": "adHoc", "cmdArgs": "-f ''' + queryFile + '''", "attachments": { "data": "''' + contents + '''", "name": "''' + queryFile + '''" } } } ''' print payload print "\n" return jobs.submitJob(serviceUrl, payload)
def testJsonSubmitjob(): print "Running testJsonSubmitjob " payload = ''' { "jobInfo": { "jobName": "HIVE-VERSION-TEST", "description": "This is a test", "userName" : "genietest", "groupName" : "hadoop", "userAgent" : "laptop", "jobType": "hive", "configuration": "prod", "schedule": "adHoc", "hiveVersion": "0.8.1.7", "cmdArgs": "-f hive.q", "fileDependencies":"''' + GENIE_TEST_PREFIX + '''/hive.q" } } ''' print payload print "\n" return jobs.submitJob(serviceUrl, payload)
def testJsonSubmitjob(): print "Running testJsonSubmitjob " # write out a temporary file with our query/dependencies query = tempfile.NamedTemporaryFile(delete=False) name = query.name query.write("show tables;") query.close() # read it back in as base64 encoded binary query = open(name, "rb") contents = base64.b64encode(query.read()) print contents query.close() os.unlink(name) payload = ''' { "jobInfo": { "jobName": "HIVE-JOB-TEST", "description": "This is a test", "userName" : "genietest", "groupName" : "hadoop", "jobType": "hive", "configuration": "prod", "schedule": "adHoc", "cmdArgs": "-f hive.q", "attachments": { "data": "''' + contents + '''", "name": "hive.q" } } } ''' print payload print "\n" return jobs.submitJob(serviceUrl, payload)
def testJsonSubmitjob(): print "Running testJsonSubmitjob " payload = ''' { "jobInfo": { "jobName": "HIVE-VERSION-TEST", "description": "This is a test", "userName" : "genietest", "groupName" : "hadoop", "userAgent" : "laptop", "jobType": "hive", "configuration": "prod", "schedule": "adHoc", "hiveVersion": "0.8.1.7", "cmdArgs": "-f hive.q", "disableLogArchival": "true", "fileDependencies":"''' + GENIE_TEST_PREFIX + '''/hive.q" } } ''' print payload print "\n" return jobs.submitJob(serviceUrl, payload)
def customizeGenomePipeline(args):
    """Run the genome customization pipeline."""
    logs, scriptDir = jobs.baseDirs(args.logs, args.prefix,
                                    os.path.realpath(__file__))

    # lay out the intermediate, results, variants, and stats directories
    intermediateOutDir = os.path.join(args.intermediate, args.prefix)
    pathlib.Path(intermediateOutDir).mkdir(parents=True, exist_ok=True)
    intermediateOut = os.path.join(intermediateOutDir, args.prefix)

    resultsOutDir = os.path.join(args.results, args.prefix)
    pathlib.Path(resultsOutDir).mkdir(parents=True, exist_ok=True)
    resultsOut = os.path.join(resultsOutDir, args.new)

    variantsOutDir = os.path.join(resultsOutDir, "variants.sbatch")
    pathlib.Path(variantsOutDir).mkdir(parents=True, exist_ok=True)
    variantsOut = os.path.join(variantsOutDir, args.prefix)

    statsOutDir = os.path.join(resultsOutDir, "stats.sbatch")
    pathlib.Path(statsOutDir).mkdir(parents=True, exist_ok=True)
    statsOut = os.path.join(statsOutDir, args.prefix)

    prevJob = 0
    if args.bwa:
        # align the reads with BWA-MEM
        cmd = (f"sbatch "
               f"--output={intermediateOut}_bwa.sam "
               f"--error={logs}_bwa_align_err.log "
               f"--job-name={args.prefix}_bwa_align "
               + os.path.join(scriptDir, "bwa_mem.sbatch")
               + f" {args.genome} {args.reads1} {args.reads2}")
        prevJob = jobs.submitJob(cmd)

    prevJob = jobs.genericJob(prevJob, args.sort_bam, "sort_bam.sbatch", logs,
                              scriptDir, args.prefix, intermediateOut)

    if args.flagstat:
        # collect alignment metrics, chained after the previous step if any
        dependency = f"--dependency=afterany:{prevJob} " if prevJob else ""
        cmd = (f"sbatch {dependency}"
               f"--output={statsOut}_alignment_metrics.txt "
               f"--error={logs}_flagstats_err.log "
               f"--job-name={args.prefix}_flagstat "
               + os.path.join(scriptDir, "flagstat.sbatch")
               + f" {intermediateOut}")
        prevJob = jobs.submitJob(cmd)

    # remaining steps are chained through genericJob, each depending on the last
    prevJob = jobs.genericJob(prevJob, args.mark_duplicates,
                              "mark_duplicates.sbatch", logs, scriptDir,
                              args.prefix, intermediateOut)
    prevJob = jobs.genericJob(prevJob, args.base_recalibrate,
                              "base_recalibrator.sbatch", logs, scriptDir,
                              args.prefix, intermediateOut, args.genome,
                              args.vcf)
    prevJob = jobs.genericJob(prevJob, args.caller_haplotype,
                              "haplotype_caller.sbatch", logs, scriptDir,
                              args.prefix, variantsOut, args.genome)
    prevJob = jobs.genericJob(prevJob, args.select_snps, "select_snps.sbatch",
                              logs, scriptDir, args.prefix, variantsOut,
                              args.genome)
    prevJob = jobs.genericJob(prevJob, args.select_indels,
                              "select_indels.sbatch", logs, scriptDir,
                              args.prefix, variantsOut, args.genome)
    prevJob = jobs.genericJob(prevJob, args.filter_snps, "filter_snps.sbatch",
                              logs, scriptDir, args.prefix, variantsOut,
                              args.genome)
    prevJob = jobs.genericJob(prevJob, args.alternate_ref_make,
                              "make_alternate_ref.sbatch", logs, scriptDir,
                              args.prefix, args.genome, variantsOut,
                              resultsOut)

    # show the user's queue once everything has been submitted
    subprocess.run(["squeue", "-u", "maxh"])
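# The pipeline above relies on two helpers from the jobs module that are not
# shown: submitJob, which runs an sbatch command and returns the new Slurm
# job ID, and genericJob, which submits one optional step chained after the
# previous job. The sketch below is a hypothetical reconstruction inferred
# only from the call sites; the sbatch output parsing and the exact command
# layout are assumptions, not the actual implementation.
import os
import subprocess

def submitJob(cmd):
    """Run an sbatch command line and return the numeric Slurm job ID."""
    # sbatch reports e.g. "Submitted batch job 123456" on stdout
    result = subprocess.run(cmd, shell=True, capture_output=True,
                            text=True, check=True)
    return int(result.stdout.split()[-1])

def genericJob(prevJob, enabled, script, logs, scriptDir, prefix, out, *extra):
    """Submit one pipeline step, depending on prevJob when one exists."""
    if not enabled:
        return prevJob  # skipped steps pass the dependency chain through
    stepName = script.replace(".sbatch", "")
    dependency = f"--dependency=afterany:{prevJob} " if prevJob else ""
    cmd = (f"sbatch {dependency}"
           f"--error={logs}_{stepName}_err.log "
           f"--job-name={prefix}_{stepName} "
           + os.path.join(scriptDir, script)
           + " " + out
           + "".join(" " + str(a) for a in extra))
    return submitJob(cmd)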