def test_make_read_pairs_per_sample_match_fwd_2match(self):
    # Write the mapping file fixture to a fresh temporary file and register
    # it so that it gets removed with the other test artifacts.
    handle, map_fp = mkstemp()
    close(handle)
    with open(map_fp, 'w') as map_file:
        map_file.write(MAPPING_FILE)
    self._clean_up_files.append(map_fp)

    # Two of the forward files start with "s2" and none starts with "s1";
    # no reverse files are supplied.
    forward_reads = [
        './folder/s3_S013_L001_R1.fastq.gz',
        './folder/s2_S011_L001_R1.fastq.gz',
        './folder/s2_S009_L001_R1.fastq.gz',
    ]
    reverse_reads = []

    # The matching step is expected to reject this input
    with self.assertRaises(ValueError):
        make_read_pairs_per_sample(forward_reads, reverse_reads, map_fp)
def test_make_read_pairs_per_sample_match_fwd_rev(self):
    # Write the mapping file fixture to a fresh temporary file and register
    # it so that it gets removed with the other test artifacts.
    handle, map_fp = mkstemp()
    close(handle)
    with open(map_fp, 'w') as map_file:
        map_file.write(MAPPING_FILE)
    self._clean_up_files.append(map_fp)

    # Inputs are deliberately listed s3, s2, s1 while the expected result
    # is ordered s1, s2, s3.
    forward_reads = [
        './folder/s3_S013_L001_R1.fastq.gz',
        './folder/s2_S011_L001_R1.fastq.gz',
        './folder/s1_S009_L001_R1.fastq.gz',
    ]
    reverse_reads = [
        './folder/s3_S013_L001_R2.fastq.gz',
        './folder/s2_S011_L001_R2.fastq.gz',
        './folder/s1_S009_L001_R2.fastq.gz',
    ]

    # 4-tuples: run prefix, sample name, forward fp, reverse fp
    exp = [
        ('s1', 'SKB8.640193',
         './folder/s1_S009_L001_R1.fastq.gz',
         './folder/s1_S009_L001_R2.fastq.gz'),
        ('s2', 'SKD8.640184',
         './folder/s2_S011_L001_R1.fastq.gz',
         './folder/s2_S011_L001_R2.fastq.gz'),
        ('s3', 'SKB7.640196',
         './folder/s3_S013_L001_R1.fastq.gz',
         './folder/s3_S013_L001_R2.fastq.gz'),
    ]

    obs = make_read_pairs_per_sample(forward_reads, reverse_reads, map_fp)
    self.assertEqual(obs, exp)
def generate_trim_commands(forward_seqs, reverse_seqs, map_file, out_dir,
                           parameters):
    """Builds one ``atropos trim`` command line per sample

    Parameters
    ----------
    forward_seqs : list of str
        The list of forward seqs filepaths
    reverse_seqs : list of str
        The list of reverse seqs filepaths
    map_file : str
        The path to the mapping file
    out_dir : str
        The job output directory
    parameters : dict
        The command's parameters, keyed by parameter name

    Returns
    -------
    cmds: list of str
        The QC_Trim commands, in the same order as `samples`
    samples: list of tup
        list of 4-tuples with run prefix, sample name, fwd read fp, rev read fp

    Notes
    -----
    A sample whose reverse filepath is ``None`` gets a single-end (``-se``)
    command; every other sample gets a paired-end (``-pe1``/``-pe2``)
    command. Output files are named after the run prefix, inside `out_dir`.
    """
    # Pair up filenames, samples and run prefixes
    samples = make_read_pairs_per_sample(forward_seqs, reverse_seqs, map_file)
    param_string = _format_params(parameters, ATROPOS_PARAMS)

    single_end = "atropos trim %s -o %s -se %s"
    paired_end = 'atropos trim %s -o %s -p %s -pe1 %s -pe2 %s'

    cmds = []
    for run_prefix, _, f_fp, r_fp in samples:
        fwd_out = join(out_dir, '%s.R1.fastq.gz' % run_prefix)
        if r_fp is not None:
            rev_out = join(out_dir, '%s.R2.fastq.gz' % run_prefix)
            cmd = paired_end % (param_string, fwd_out, rev_out, f_fp, r_fp)
        else:
            cmd = single_end % (param_string, fwd_out, f_fp)
        cmds.append(cmd)

    return cmds, samples
def generate_filter_commands(forward_seqs, reverse_seqs, map_file, out_dir,
                             temp_dir, parameters):
    """Builds one QC_Filter shell pipeline per sample

    Parameters
    ----------
    forward_seqs : list of str
        The list of forward seqs filepaths
    reverse_seqs : list of str
        The list of reverse seqs filepaths
    map_file : str
        The path to the mapping file
    out_dir : str
        The job output directory; receives the gzipped FASTQ files
    temp_dir : str
        Directory for the intermediate BAM and uncompressed FASTQ files
    parameters : dict
        The command's parameters, keyed by parameter name. Must contain
        'Number of threads'.

    Returns
    -------
    cmds: list of str
        The QC_Filter commands, in the same order as `samples`
    samples: list of tup
        list of 4-tuples with run prefix, sample name, fwd read fp, rev read fp

    Notes
    -----
    Each command chains bowtie2, samtools view, samtools sort, bedtools
    bamtofastq and two pigz calls. Intermediate and output files are named
    after the sample name. The reverse filepath is always interpolated after
    ``-2``; a ``None`` reverse filepath would therefore be rendered as the
    literal text ``None``.
    """
    # Pair up filenames, samples and run prefixes
    samples = make_read_pairs_per_sample(forward_seqs, reverse_seqs, map_file)
    param_string = _format_params(parameters, BOWTIE2_PARAMS)
    threads = parameters['Number of threads']

    template = (
        'bowtie2 {params} --very-sensitive -1 {fwd_ip} -2 {rev_ip}'
        ' | samtools view -f 12 -F 256 -b -o {bow_op}; '
        'samtools sort -T {sample_path} -@ {thrds} -n -o {sam_op} '
        '{sam_un_op}; '
        'bedtools bamtofastq -i {sam_op} -fq {bedtools_op_one} '
        '-fq2 {bedtools_op_two}; '
        'pigz -p {thrds} -c {bedtools_op_one} > {gz_op_one}; '
        'pigz -p {thrds} -c {bedtools_op_two} > {gz_op_two};')

    cmds = []
    for _, sample, f_fp, r_fp in samples:
        # samtools view writes this file and samtools sort reads it back
        unsorted_bam = join(temp_dir, '%s.unsorted.bam' % sample)
        sorted_bam = join(temp_dir, '%s.bam' % sample)
        sort_prefix = join(temp_dir, '%s' % sample)

        fastq_r1 = join(temp_dir, '%s.R1.trimmed.filtered.fastq' % sample)
        fastq_r2 = join(temp_dir, '%s.R2.trimmed.filtered.fastq' % sample)
        gz_r1 = join(out_dir, '%s.R1.trimmed.filtered.fastq.gz' % sample)
        gz_r2 = join(out_dir, '%s.R2.trimmed.filtered.fastq.gz' % sample)

        cmds.append(template.format(
            params=param_string,
            thrds=threads,
            fwd_ip=f_fp,
            rev_ip=r_fp,
            bow_op=unsorted_bam,
            sample_path=sort_prefix,
            sam_op=sorted_bam,
            sam_un_op=unsorted_bam,
            bedtools_op_one=fastq_r1,
            bedtools_op_two=fastq_r2,
            gz_op_one=gz_r1,
            gz_op_two=gz_r2))

    return cmds, samples
def shogun(qclient, job_id, parameters, out_dir):
    """Run Shogun with the given parameters

    Parameters
    ----------
    qclient : tgp.qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values to run Shogun. Must contain 'input' (the
        artifact id); that key is removed from the dict in place.
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    bool, list, str
        Whether the job succeeded, the list of ArtifactInfo produced
        (None on failure) and the error message ("" on success)
    """
    # Step 1 get the rest of the information needed to run Shogun
    qclient.update_job_step(job_id, "Step 1 of 5: Collecting information")
    artifact_id = parameters['input']
    # NOTE: this mutates the caller's dict; kept as-is for compatibility
    del parameters['input']

    # Get the artifact filepath information
    artifact_info = qclient.get("/qiita_db/artifacts/%s/" % artifact_id)
    fps = artifact_info['files']

    # Get the artifact metadata
    prep_info = qclient.get('/qiita_db/prep_template/%s/'
                            % artifact_info['prep_information'][0])
    qiime_map = prep_info['qiime-map']

    # Step 2 converting to fna
    qclient.update_job_step(job_id,
                            "Step 2 of 5: Converting to FNA for Shogun")

    # All intermediate files live in a temporary directory inside out_dir
    # that is removed when the block exits; only the BIOM goes to out_dir.
    with TemporaryDirectory(dir=out_dir, prefix='shogun_') as temp_dir:
        # Reverse reads are optional
        rs = fps['raw_reverse_seqs'] if 'raw_reverse_seqs' in fps else []
        samples = make_read_pairs_per_sample(fps['raw_forward_seqs'], rs,
                                             qiime_map)

        # Combining files
        comb_fp = generate_fna_file(temp_dir, samples)

        # Formatting parameters
        parameters = _format_params(parameters, SHOGUN_PARAMS)

        # Step 3 align
        align_cmd = generate_shogun_align_commands(comb_fp, temp_dir,
                                                   parameters)
        # {0} is filled here with the number of commands; the %d is left
        # for _run_commands (presumably the running command index).
        sys_msg = "Step 3 of 5: Aligning FNA with Shogun (%d/{0})".format(
            len(align_cmd))
        success, msg = _run_commands(qclient, job_id, align_cmd, sys_msg,
                                     'Shogun Align')
        if not success:
            return False, None, msg

        # Step 4 taxonomic profile
        assign_cmd, profile_fp = generate_shogun_assign_taxonomy_commands(
            temp_dir, parameters)
        # Fill the {0} placeholder the same way step 3 does, so the status
        # message does not contain a literal "{0}".
        sys_msg = ("Step 4 of 5: Taxonomic profile with Shogun "
                   "(%d/{0})".format(len(assign_cmd)))
        success, msg = _run_commands(qclient, job_id, assign_cmd, sys_msg,
                                     'Shogun taxonomy assignment')
        if not success:
            return False, None, msg

        # Step 5 BIOM conversion. Report the step message itself, not the
        # `msg` returned by the previous command run.
        sys_msg = "Step 5 of 5: Converting output to BIOM"
        qclient.update_job_step(job_id, sys_msg)
        output = run_shogun_to_biom(profile_fp, [None, None, None, True],
                                    out_dir, 'profile')

    ainfo = [
        ArtifactInfo('Shogun Alignment Profile', 'BIOM', [(output, 'biom')])
    ]

    return True, ainfo, ""
def shogun(qclient, job_id, parameters, out_dir):
    """Run the seven-step Shogun pipeline for one artifact

    Parameters
    ----------
    qclient : tgp.qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values for the run. Must contain 'input' (the
        artifact id); that key is removed from the dict in place.
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    bool, list, str
        Whether the job succeeded, the list of ArtifactInfo produced
        (None on failure) and the error message ("" on success)
    """
    # Step 1: gather artifact files and prep metadata
    qclient.update_job_step(job_id, "Step 1 of 7: Collecting information")
    artifact_id = parameters.pop('input')

    artifact_info = qclient.get("/qiita_db/artifacts/%s/" % artifact_id)
    fps = artifact_info['files']

    prep_id = artifact_info['prep_information'][0]
    prep_info = qclient.get('/qiita_db/prep_template/%s/' % prep_id)
    qiime_map = prep_info['qiime-map']

    # Step 2: converting to fna
    qclient.update_job_step(job_id,
                            "Step 2 of 7: Converting to FNA for Shogun")

    redist_levels = ['genus', 'species', 'strain']
    func_levels = ['species']
    # Suffixes of the functional profile files that get converted to BIOM
    func_suffixes = [
        "kegg.modules.coverage",
        "kegg.modules",
        "kegg.pathways.coverage",
        "kegg.pathways",
        "kegg",
        "normalized",
    ]

    # Intermediate files go to a temporary directory created inside out_dir
    with TemporaryDirectory(dir=out_dir, prefix='shogun_') as temp_dir:
        # Reverse reads are optional
        rs = fps.get('raw_reverse_seqs', [])
        samples = make_read_pairs_per_sample(fps['raw_forward_seqs'], rs,
                                             qiime_map)

        # Combining files
        comb_fp = generate_fna_file(temp_dir, samples)

        # Formatting parameters
        parameters = _format_params(parameters, SHOGUN_PARAMS)

        # Step 3: align
        step_msg = "Step 3 of 7: Aligning FNA with Shogun (%d/{0})"
        align_cmd = generate_shogun_align_commands(comb_fp, temp_dir,
                                                   parameters)
        success, msg = _run_commands(qclient, job_id, align_cmd, step_msg,
                                     'Shogun Align')
        if not success:
            return False, None, msg

        # Step 4: taxonomic profile
        step_msg = "Step 4 of 7: Taxonomic profile with Shogun (%d/{0})"
        assign_cmd, profile_fp = generate_shogun_assign_taxonomy_commands(
            temp_dir, parameters)
        success, msg = _run_commands(qclient, job_id, assign_cmd, step_msg,
                                     'Shogun taxonomy assignment')
        if not success:
            return False, None, msg

        # Step 5: redistribute the profile at each taxonomic level
        step_msg = "Step 5 of 7: Redistributed profile with Shogun (%d/{0})"
        redist_fps = []
        for level in redist_levels:
            redist_cmd, redist_out = generate_shogun_redist_commands(
                profile_fp, temp_dir, parameters, level)
            redist_fps.append(redist_out)
            success, msg = _run_commands(qclient, job_id, redist_cmd,
                                         step_msg, 'Shogun redistribute')
            if not success:
                return False, None, msg

        # Step 6: functional profile
        step_msg = "Step 6 of 7: Functional profile with Shogun (%d/{0})"
        func_fp = ''
        for level in func_levels:
            func_cmd, func_fp = generate_shogun_functional_commands(
                profile_fp, temp_dir, parameters, level)
            success, msg = _run_commands(qclient, job_id, func_cmd,
                                         step_msg, 'Shogun functional')
            if not success:
                return False, None, msg

        # Step 7: BIOM conversion
        step_msg = "Step 7 of 7: Converting results to BIOM (%d/{0})"
        func_biom_outputs = []
        redist_biom_outputs = []

        # Converting redistributed files to biom
        for redist_fp, level in zip(redist_fps, redist_levels):
            biom_cmd, biom_out = generate_biom_conversion_commands(
                redist_fp, out_dir, level, 'redist')
            success, msg = _run_commands(qclient, job_id, biom_cmd,
                                         step_msg,
                                         'Redistribute Biom conversion')
            if not success:
                return False, None, msg
            redist_biom_outputs.append(biom_out)

        # Converting functional files to biom
        for level in func_levels:
            for suffix in func_suffixes:
                biom_in_fp = join(func_fp,
                                  "profile.%s.%s.txt" % (level, suffix))
                biom_cmd, biom_out = generate_biom_conversion_commands(
                    biom_in_fp, out_dir, level, suffix)
                success, msg = _run_commands(qclient, job_id, biom_cmd,
                                             step_msg,
                                             ' Functional Biom conversion')
                if not success:
                    return False, None, msg
                func_biom_outputs.append(biom_out)

    ainfo = [
        ArtifactInfo('Functional Predictions', 'BIOM', func_biom_outputs),
        ArtifactInfo('Taxonomic Predictions', 'BIOM', redist_biom_outputs)
    ]

    return True, ainfo, ""
def shogun(qclient, job_id, parameters, out_dir):
    """Run Shogun with the given parameters

    Parameters
    ----------
    qclient : tgp.qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values to run Shogun. Must contain 'input' (the
        artifact id); that key is removed from the dict in place.
    out_dir : str
        The path to the job's output directory; all intermediate and final
        files are written here

    Returns
    -------
    bool, list, str
        Whether the job succeeded, the list of ArtifactInfo produced
        (None on failure) and the error message ("" on success)
    """
    # Step 1 get the rest of the information needed to run Shogun
    qclient.update_job_step(job_id, "Step 1 of 6: Collecting information")
    artifact_id = parameters['input']
    # NOTE: this mutates the caller's dict; kept as-is for compatibility
    del parameters['input']

    # Get the artifact filepath information
    artifact_info = qclient.get("/qiita_db/artifacts/%s/" % artifact_id)
    fps = artifact_info['files']

    # Get the artifact metadata
    prep_info = qclient.get('/qiita_db/prep_template/%s/'
                            % artifact_info['prep_information'][0])
    qiime_map = prep_info['qiime-map']

    # Step 2 converting to fna
    qclient.update_job_step(job_id,
                            "Step 2 of 6: Converting to FNA for Shogun")

    # Reverse reads are optional
    rs = fps['raw_reverse_seqs'] if 'raw_reverse_seqs' in fps else []
    samples = make_read_pairs_per_sample(fps['raw_forward_seqs'], rs,
                                         qiime_map)

    # Combining files
    comb_fp = generate_fna_file(out_dir, samples)

    # Formatting parameters
    parameters = _format_params(parameters, SHOGUN_PARAMS)

    # Step 3 align
    align_cmd = generate_shogun_align_commands(comb_fp, out_dir, parameters)
    # {0} is filled here with the number of commands; the %d is left for
    # _run_commands (presumably the running command index).
    sys_msg = "Step 3 of 6: Aligning FNA with Shogun (%d/{0})".format(
        len(align_cmd))
    success, msg = _run_commands(qclient, job_id, align_cmd, sys_msg,
                                 'Shogun Align')
    if not success:
        return False, None, msg

    # Step 4 taxonomic profile
    assign_cmd, profile_fp = generate_shogun_assign_taxonomy_commands(
        out_dir, parameters)
    # Fill the {0} placeholder the same way step 3 does, so the status
    # message does not contain a literal "{0}".
    sys_msg = "Step 4 of 6: Taxonomic profile with Shogun (%d/{0})".format(
        len(assign_cmd))
    success, msg = _run_commands(qclient, job_id, assign_cmd, sys_msg,
                                 'Shogun taxonomy assignment')
    if not success:
        return False, None, msg

    # Step 5 compress the alignment and convert the profile to BIOM.
    # Report the step message itself, not the `msg` returned by the
    # previous command run.
    sys_msg = "Step 5 of 6: Compressing and converting alignment to BIOM"
    qclient.update_job_step(job_id, sys_msg)
    alignment_fp = join(
        out_dir, 'alignment.%s.%s' % (parameters['aligner'],
                                      ALN2EXT[parameters['aligner']]))
    xz_cmd = 'xz -9 -T%s %s' % (parameters['threads'], alignment_fp)
    std_out, std_err, return_value = system_call(xz_cmd)
    if return_value != 0:
        error_msg = ("Error during %s:\nStd out: %s\nStd err: %s"
                     "\n\nCommand run was:\n%s"
                     % (sys_msg, std_out, std_err, xz_cmd))
        return False, None, error_msg
    output = run_shogun_to_biom(profile_fp, [None, None, None, True],
                                out_dir, 'profile')

    # The compressed alignment is attached as '<alignment_fp>.xz'
    ainfo = [
        ArtifactInfo('Shogun Alignment Profile', 'BIOM',
                     [(output, 'biom'), ('%s.xz' % alignment_fp, 'log')])
    ]

    # Step 6 redistribute profile
    sys_msg = "Step 6 of 6: Redistributed profile with Shogun (%d/{0})"
    levels = ['phylum', 'genus', 'species']
    redist_fps = []
    for level in levels:
        redist_cmd, output = generate_shogun_redist_commands(
            profile_fp, out_dir, parameters, level)
        redist_fps.append(output)
        # Each level has its own command list, so {0} is filled per level
        success, msg = _run_commands(qclient, job_id, redist_cmd,
                                     sys_msg.format(len(redist_cmd)),
                                     'Shogun redistribute')
        if not success:
            return False, None, msg

    # Converting redistributed files to biom, one artifact per level
    for redist_fp, level in zip(redist_fps, levels):
        biom_in = ["redist", None, '', True]
        output = run_shogun_to_biom(redist_fp, biom_in, out_dir, level,
                                    'redist')
        aname = 'Taxonomic Predictions - %s' % level
        ainfo.append(ArtifactInfo(aname, 'BIOM', [(output, 'biom')]))

    return True, ainfo, ""