def apply_vqsr_indel(job, shared_ids, input_args):
    """
    Apply variant quality score recalibration for indel variants.
    Writes vcf file to output directory

    :param job: Job instance
    :param shared_ids: dictionary of shared file promises
    :param input_args: dictionary of input arguments
    """
    work_dir = job.fileStore.getLocalTempDir()
    uuid = input_args['uuid']
    suffix = input_args['suffix']
    input_files = ['ref.fa', 'ref.fa.fai', 'ref.dict',
                   'unified.raw.BOTH.gatk.vcf',
                   'HAPINDEL.recal', 'HAPINDEL.tranches', 'HAPINDEL.plots']
    read_from_filestore_hc(job, work_dir, shared_ids, *input_files)
    output = '{}.HAPSNP.vqsr.INDEL{}.vcf'.format(uuid, suffix)
    command = ['-T', 'ApplyRecalibration',
               '-input', 'unified.raw.BOTH.gatk.vcf',
               '-o', output,
               '-R', 'ref.fa',
               '-nt', '1',
               '-ts_filter_level', '99.0',
               '-tranchesFile', 'HAPINDEL.tranches',
               '-recalFile', 'HAPINDEL.recal',
               '-mode', 'INDEL']
    docker_call(work_dir=work_dir,
                tool_parameters=command,
                tool='quay.io/ucsc_cgl/gatk',
                sudo=input_args['sudo'])
    upload_or_move_hc(work_dir, input_args, output)
def start(self): """ Start spark and hdfs master containers """ log.write("start masters\n") log.flush() if (os.uname()[0] == "Darwin"): machine = check_output(["docker-machine", "ls"]).split("\n")[1].split()[0] self.IP = check_output(["docker-machine", "ip", machine]).strip().rstrip() else: self.IP = check_output(["hostname", "-f",])[:-1] self.sparkContainerID = docker_call(no_rm = True, work_dir = os.getcwd(), tool = "quay.io/ucsc_cgl/apache-spark-master:1.5.2", docker_parameters = ["--net=host", "-d", "-v", "/mnt/ephemeral/:/ephemeral/:rw", "-e", "SPARK_MASTER_IP="+self.IP, "-e", "SPARK_LOCAL_DIRS=/ephemeral/spark/local", "-e", "SPARK_WORKER_DIR=/ephemeral/spark/work"], tool_parameters = [], sudo = self.sudo, check_output = True)[:-1] self.hdfsContainerID = docker_call(no_rm = True, work_dir = os.getcwd(), tool = "quay.io/ucsc_cgl/apache-hadoop-master:2.6.2", docker_parameters = ["--net=host", "-d"], tool_parameters = [self.IP], sudo = self.sudo, check_output = True)[:-1] return self.IP
def vqsr_snp(job, shared_ids, input_args):
    """
    Variant quality score recalibration for SNP variants.
    Calls next step in pipeline - apply SNP recalibration

    :param job: Job instance
    :param shared_ids: dictionary of shared file promises
    :param input_args: dictionary of input arguments
    """
    work_dir = job.fileStore.getLocalTempDir()
    input_files = ['ref.fa', 'ref.fa.fai', 'ref.dict', 'unified.raw.BOTH.gatk.vcf',
                   'hapmap.vcf', 'omni.vcf', 'dbsnp.vcf', 'phase.vcf']
    read_from_filestore_hc(job, work_dir, shared_ids, *input_files)
    outputs = ['HAPSNP.recal', 'HAPSNP.tranches', 'HAPSNP.plots']
    command = ['-T', 'VariantRecalibrator',
               '-R', 'ref.fa',
               '-input', 'unified.raw.BOTH.gatk.vcf',
               '-nt', input_args['cpu_count'],
               '-resource:hapmap,known=false,training=true,truth=true,prior=15.0', 'hapmap.vcf',
               '-resource:omni,known=false,training=true,truth=false,prior=12.0', 'omni.vcf',
               '-resource:dbsnp,known=true,training=false,truth=false,prior=2.0', 'dbsnp.vcf',
               '-resource:1000G,known=false,training=true,truth=false,prior=10.0', 'phase.vcf',
               '-an', 'QD', '-an', 'DP', '-an', 'FS', '-an', 'ReadPosRankSum',
               '-mode', 'SNP', '-minNumBad', '1000',
               '-recalFile', 'HAPSNP.recal',
               '-tranchesFile', 'HAPSNP.tranches',
               '-rscriptFile', 'HAPSNP.plots']
    docker_call(work_dir=work_dir,
                tool_parameters=command,
                tool='quay.io/ucsc_cgl/gatk',
                sudo=input_args['sudo'])
    shared_ids = write_to_filestore(job, work_dir, shared_ids, *outputs)
    job.addChildJobFn(apply_vqsr_snp, shared_ids, input_args)
def vqsr_indel(job, shared_ids, input_args):
    """
    Variant quality score recalibration for Indel variants.
    Calls next step in pipeline - apply indel recalibration

    :param job: Job instance
    :param shared_ids: dictionary of shared file promises
    :param input_args: dictionary of input arguments
    """
    work_dir = job.fileStore.getLocalTempDir()
    input_files = ['ref.fa', 'ref.fa.fai', 'ref.dict',
                   'unified.raw.BOTH.gatk.vcf', 'mills.vcf']
    read_from_filestore_hc(job, work_dir, shared_ids, *input_files)
    outputs = ['HAPINDEL.recal', 'HAPINDEL.tranches', 'HAPINDEL.plots']
    command = ['-T', 'VariantRecalibrator',
               '-R', 'ref.fa',
               '-input', 'unified.raw.BOTH.gatk.vcf',
               '-nt', input_args['cpu_count'],
               '-resource:mills,known=true,training=true,truth=true,prior=12.0', 'mills.vcf',
               '-an', 'DP', '-an', 'FS', '-an', 'ReadPosRankSum',
               '-mode', 'INDEL', '-minNumBad', '1000',
               '-recalFile', 'HAPINDEL.recal',
               '-tranchesFile', 'HAPINDEL.tranches',
               '-rscriptFile', 'HAPINDEL.plots',
               '--maxGaussians', '4']
    docker_call(work_dir=work_dir,
                tool_parameters=command,
                tool='quay.io/ucsc_cgl/gatk',
                sudo=input_args['sudo'])
    shared_ids = write_to_filestore(job, work_dir, shared_ids, *outputs)
    job.addChildJobFn(apply_vqsr_indel, shared_ids, input_args)
def call_conductor(masterIP, inputs, src, dst):
    """
    Invokes the conductor container.
    """
    docker_call(no_rm=True,
                work_dir=os.getcwd(),
                tool="quay.io/ucsc_cgl/conductor",
                docker_parameters=["--net=host"],
                tool_parameters=["--master", "spark://" + masterIP + ":" + SPARK_MASTER_PORT,
                                 "--conf", "spark.driver.memory=%sg" % inputs["driverMemory"],
                                 "--conf", "spark.executor.memory=%sg" % inputs["executorMemory"],
                                 "--", "-C", src, dst],
                sudo=inputs['sudo'])
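# The Spark/HDFS helpers in this module (call_conductor, call_adam, and the worker
# service below) reference the module-level constants SPARK_MASTER_PORT and
# HDFS_MASTER_PORT, which are defined elsewhere in the file. A minimal sketch of
# plausible definitions, assuming the stock Spark standalone and HDFS namenode
# defaults (assumed values, not taken from this module):
#
#     SPARK_MASTER_PORT = "7077"
#     HDFS_MASTER_PORT = "8020"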
def call_adam(masterIP, inputs, arguments):
    """
    Invokes the ADAM container, pointing Spark at the cluster master and HDFS namenode.
    """
    default_params = ["--master", ("spark://%s:%s" % (masterIP, SPARK_MASTER_PORT)),
                      "--conf", ("spark.driver.memory=%sg" % inputs["driverMemory"]),
                      "--conf", ("spark.executor.memory=%sg" % inputs["executorMemory"]),
                      "--conf", ("spark.hadoop.fs.default.name=hdfs://%s:%s" % (masterIP, HDFS_MASTER_PORT)),
                      "--"]
    docker_call(no_rm=True,
                work_dir=os.getcwd(),
                tool="quay.io/ucsc_cgl/adam:962-ehf--6e7085f8cac4b9a927dc9fb06b48007957256b80",
                docker_parameters=["--net=host"],
                tool_parameters=default_params + arguments,
                sudo=inputs['sudo'])
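# A minimal usage sketch (not part of the pipeline) showing how call_conductor and
# call_adam might be combined: stage a BAM from S3 into HDFS, then run an ADAM job
# on it. The bucket, HDFS paths, and the 'transform' subcommand are assumptions for
# illustration only.
def example_conductor_adam_usage(masterIP, inputs):
    """
    Illustrative sketch: copy a hypothetical BAM into HDFS with Conductor, then
    process it with ADAM on the same Spark/HDFS master.
    """
    hdfs_bam = "hdfs://%s:%s/sample.bam" % (masterIP, HDFS_MASTER_PORT)
    # Stage the input from a hypothetical S3 bucket into HDFS via Conductor.
    call_conductor(masterIP, inputs, "s3://example-bucket/sample.bam", hdfs_bam)
    # Run an ADAM command (assumed subcommand) against the staged BAM.
    call_adam(masterIP, inputs,
              ["transform", hdfs_bam,
               "hdfs://%s:%s/sample.adam" % (masterIP, HDFS_MASTER_PORT)])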
def haplotype_caller(job, shared_ids, input_args):
    """
    Uses GATK HaplotypeCaller to identify SNPs and Indels and writes a gVCF.
    Calls per-sample genotyper to genotype gVCF.

    :param job: Job instance
    :param shared_ids: dictionary of shared file promises
    :param input_args: dictionary of input arguments
    """
    work_dir = job.fileStore.getLocalTempDir()
    input_files = ['ref.fa', 'ref.fa.fai', 'ref.dict', 'toil.bam', 'toil.bam.bai']
    read_from_filestore_hc(job, work_dir, shared_ids, *input_files)
    output = '%s.raw.BOTH%s.gvcf' % (input_args['uuid'], input_args['suffix'])

    # Call GATK -- HaplotypeCaller
    command = ['-nct', input_args['cpu_count'],
               '-R', 'ref.fa',
               '-T', 'HaplotypeCaller',
               '--genotyping_mode', 'Discovery',
               '--emitRefConfidence', 'GVCF',
               '-I', 'toil.bam',
               '-o', output,
               '-variant_index_type', 'LINEAR',
               '-variant_index_parameter', '128000',
               '--annotation', 'QualByDepth',
               '--annotation', 'DepthPerSampleHC',
               '--annotation', 'FisherStrand',
               '--annotation', 'ReadPosRankSumTest']
    try:
        docker_call(work_dir=work_dir,
                    tool_parameters=command,
                    tool='quay.io/ucsc_cgl/gatk',
                    sudo=input_args['sudo'])
    except:
        sys.stderr.write("Running haplotype caller with %s in %s failed." % (
            " ".join(command), work_dir))
        raise

    # Update fileStore and spawn child job
    shared_ids[output] = job.fileStore.writeGlobalFile(os.path.join(work_dir, output))

    # upload gvcf
    upload_or_move_hc(work_dir, input_args, output)

    # call variants prior to vqsr
    job.addChildJobFn(genotype_gvcf, shared_ids, input_args)
def genotype_gvcf(job, shared_ids, input_args):
    """
    Genotypes the gVCF generated by the HaplotypeCaller.
    Calls variant quality score recalibration functions.

    :param job: Job instance
    :param shared_ids: dictionary of shared file promises
    :param input_args: dictionary of input arguments
    """
    work_dir = job.fileStore.getLocalTempDir()
    # The gVCF name must match the output written by haplotype_caller
    gvcf = '%s.raw.BOTH%s.gvcf' % (input_args['uuid'], input_args['suffix'])
    input_files = [gvcf, 'ref.fa', 'ref.fa.fai', 'ref.dict']
    read_from_filestore_hc(job, work_dir, shared_ids, *input_files)
    output = 'unified.raw.BOTH.gatk.vcf'

    # Call GATK -- GenotypeGVCFs
    command = ['-nt', input_args['cpu_count'],
               '-R', 'ref.fa',
               '-T', 'GenotypeGVCFs',
               '--variant', gvcf,
               '--out', output,
               '-stand_emit_conf', '10.0',
               '-stand_call_conf', '30.0']
    try:
        docker_call(work_dir=work_dir,
                    tool_parameters=command,
                    tool='quay.io/ucsc_cgl/gatk',
                    sudo=input_args['sudo'])
    except:
        sys.stderr.write("Running GenotypeGVCFs with %s in %s failed." % (
            " ".join(command), work_dir))
        raise

    # Update fileStore and spawn child jobs
    shared_ids[output] = job.fileStore.writeGlobalFile(os.path.join(work_dir, output))

    # run vqsr for SNPs and indels as parallel child jobs
    job.addChildJobFn(vqsr_snp, shared_ids, input_args)
    job.addChildJobFn(vqsr_indel, shared_ids, input_args)
def index(job, shared_ids, input_args):
    """
    Index sample bam using samtools, calls haplotypeCaller.

    :param job: Job instance
    :param shared_ids: dictionary of shared file promises
    :param input_args: dictionary of input arguments
    """
    work_dir = job.fileStore.getLocalTempDir()
    # Retrieve the bam into the working directory; the returned path is not needed
    return_input_paths(job, work_dir, shared_ids, 'toil.bam')
    output_path = os.path.join(work_dir, 'toil.bam.bai')

    # Call: index the bam with samtools
    parameters = ['index', 'toil.bam']
    docker_call(work_dir=work_dir,
                tool_parameters=parameters,
                tool='quay.io/ucsc_cgl/samtools',
                sudo=input_args['sudo'])

    # Update FileStore and call child
    shared_ids['toil.bam.bai'] = job.fileStore.writeGlobalFile(output_path)
    job.addChildJobFn(haplotype_caller, shared_ids, input_args)
def create_reference_dict_hc(job, shared_ids, input_args):
    """
    Uses Picardtools to create sequence dictionary for reference genome.
    Calls next step in pipeline - spawn batch jobs

    :param job: Job instance
    :param shared_ids: dictionary of shared file promises
    :param input_args: dictionary of input arguments
    """
    # Unpack convenience variables for job
    work_dir = job.fileStore.getLocalTempDir()
    # Retrieve the reference into the working directory; the returned path is not needed
    return_input_paths(job, work_dir, shared_ids, 'ref.fa')
    picard_output = os.path.join(work_dir, 'ref.dict')

    # Call: picardtools
    command = ['CreateSequenceDictionary', 'R=ref.fa', 'O=ref.dict']
    docker_call(work_dir=work_dir,
                tool_parameters=command,
                tool='quay.io/ucsc_cgl/picardtools',
                sudo=input_args['sudo'])

    # Update fileStore for output
    shared_ids['ref.dict'] = job.fileStore.writeGlobalFile(picard_output)
    job.addChildJobFn(spawn_batch_variant_calling, shared_ids, input_args)
def create_reference_index_hc(job, shared_ids, input_args):
    """
    Uses samtools to create reference index file in working directory,
    spawns next job in pipeline - create reference dictionary

    :param job: Job instance
    :param shared_ids: dictionary of shared file promises
    :param input_args: dictionary of input arguments
    """
    # Unpack convenience variables for job
    work_dir = job.fileStore.getLocalTempDir()
    # Retrieve the reference into the working directory; the returned path is not needed
    return_input_paths(job, work_dir, shared_ids, 'ref.fa')
    faidx_output = os.path.join(work_dir, 'ref.fa.fai')

    # Call: samtools faidx
    faidx_command = ['faidx', 'ref.fa']
    docker_call(work_dir=work_dir,
                tool_parameters=faidx_command,
                tool='quay.io/ucsc_cgl/samtools',
                sudo=input_args['sudo'])

    # Update fileStore for output
    shared_ids['ref.fa.fai'] = job.fileStore.writeGlobalFile(faidx_output)
    job.addChildJobFn(create_reference_dict_hc, shared_ids, input_args)
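# A minimal launch sketch (not part of the pipeline) showing one way the job chain
# beginning at create_reference_index_hc could be started with Toil's script API.
# The contents of shared_ids and input_args are assumptions; in the real pipeline
# they are populated by upstream download/configuration steps.
def example_pipeline_launch():
    """
    Illustrative sketch: wrap the first job in the haplotype-caller chain and hand it
    to Toil. Assumes shared_ids already holds file-store IDs for 'ref.fa', 'toil.bam',
    etc., which this sketch does not set up.
    """
    from toil.job import Job

    parser = Job.Runner.getDefaultArgumentParser()
    options = parser.parse_args()
    shared_ids = {}                    # hypothetical: file-store promises from earlier jobs
    input_args = {'uuid': 'sample1',   # hypothetical sample configuration
                  'suffix': '',
                  'cpu_count': '4',
                  'sudo': False}
    root = Job.wrapJobFn(create_reference_index_hc, shared_ids, input_args)
    Job.Runner.startToil(root, options)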
def start(self): """ Start spark and hdfs worker containers """ log.write("start workers\n") log.flush() self.sparkContainerID = docker_call(no_rm = True, work_dir = os.getcwd(), tool = "quay.io/ucsc_cgl/apache-spark-worker:1.5.2", docker_parameters = ["--net=host", "-d", "-v", "/mnt/ephemeral/:/ephemeral/:rw", "-e", "\"SPARK_MASTER_IP="+self.masterIP+":"+SPARK_MASTER_PORT+"\"", "-e", "SPARK_LOCAL_DIRS=/ephemeral/spark/local", "-e", "SPARK_WORKER_DIR=/ephemeral/spark/work"], tool_parameters = [self.masterIP+":"+SPARK_MASTER_PORT], sudo = self.sudo, check_output = True)[:-1] self.hdfsContainerID = docker_call(no_rm = True, work_dir = os.getcwd(), tool = "quay.io/ucsc_cgl/apache-hadoop-worker:2.6.2", docker_parameters = ["--net=host", "-d", "-v", "/mnt/ephemeral/:/ephemeral/:rw"], tool_parameters = [self.masterIP], sudo = self.sudo, check_output = True)[:-1] # fake do/while to check if HDFS is up hdfs_down = True retries = 0 while hdfs_down and (retries < 5): sys.stderr.write("Sleeping 30 seconds before checking HDFS startup.") time.sleep(30) clusterID = "" try: clusterID = check_output(["docker", "exec", self.hdfsContainerID, "grep", "clusterID", "-R", "/opt/apache-hadoop/logs"]) except: # grep returns a non-zero exit code if the pattern is not found # we expect to not find the pattern, so a non-zero code is OK pass if "Incompatible" in clusterID: sys.stderr.write("Hadoop Datanode failed to start with: %s" % clusterID) sys.stderr.write("Retrying container startup, retry #%d." % retries) retries += 1 sys.stderr.write("Removing ephemeral hdfs directory.") check_call(["docker", "exec", self.hdfsContainerID, "rm", "-rf", "/ephemeral/hdfs"]) sys.stderr.write("Killing container %s." % self.hdfsContainerID) check_call(["docker", "kill", self.hdfsContainerID]) # todo: this is copied code. clean up! sys.stderr.write("Restarting datanode.") self.hdfsContainerID = docker_call(no_rm = True, work_dir = os.getcwd(), tool = "quay.io/ucsc_cgl/apache-hadoop-worker:2.6.2", docker_parameters = ["--net=host", "-d", "-v", "/mnt/ephemeral/:/ephemeral/:rw"], tool_parameters = [self.masterIP], sudo = self.sudo, check_output = True)[:-1] else: sys.stderr.write("HDFS datanode started up OK!") hdfs_down = False if retries >= 5: raise RuntimeError("Failed %d times trying to start HDFS datanode." % retries) return