def _summary_FASTA_preprocessed(artifact_type, filepaths, out_dir):
    """Generates the HTML summary for preprocessed FASTA artifacts

    Parameters
    ----------
    artifact_type : str
        The artifact type
    filepaths : dict of {str: list of str}
        The filepaths attached to the artifact, keyed by filepath type;
        only the 'preprocessed_fasta' entry is used
    out_dir : str
        The output folder

    Returns
    -------
    list of str or str
        The lines of the QUAST HTML report if QUAST succeeded, otherwise a
        single string describing the failure and the command that was run
    """
    files = filepaths.get('preprocessed_fasta')
    # QUAST writes report.html (plus supporting files) into <out_dir>/quast
    cmd = f"quast {' '.join(files)} -o {out_dir}/quast"
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        artifact_information = (
            "Std out: %s\nStd err: %s\n\nCommand run was:\n%s"
            % (std_out, std_err, cmd))
    else:
        with open(f'{out_dir}/quast/report.html', 'r') as f:
            artifact_information = f.readlines()

    return artifact_information
def _generate_alpha_vector_summary(files, metadata, out_dir):
    """Build the HTML summary for an alpha diversity vector artifact.

    Imports the vector into QIIME 2, runs alpha-group-significance against
    the sample metadata and exposes the resulting visualization as the
    artifact's HTML summary.

    Returns
    -------
    str, str
        The path to the generated index.html and to the support files dir
    """
    # The artifact carries exactly one plain text file: the alpha vector
    vector_fp = files['plain_text'][0]
    qza_fp = join(out_dir, 'alpha_vectors.qza')
    qzv_fp = join(out_dir, 'alpha_vectors.qzv')
    md_fp = join(out_dir, 'sample-metadata.tsv')

    # Import the vector as a SampleData[AlphaDiversity] QIIME 2 artifact
    import_cmd = ('qiime tools import --input-path %s --output-path %s '
                  '--type "SampleData[AlphaDiversity]"'
                  % (vector_fp, qza_fp))
    std_out, std_err, return_value = system_call(import_cmd)
    if return_value != 0:
        raise RuntimeError(
            "Error converting the alpha vectors file to Q2 artifact")

    # Dump the metadata mapping to disk so the qiime CLI can consume it
    pd.DataFrame.from_dict(metadata, orient='index').to_csv(
        md_fp, index_label='#SampleID', na_rep='', sep='\t',
        encoding='utf-8')

    # Run the alpha group significance visualizer
    sig_cmd = ('qiime diversity alpha-group-significance '
               '--i-alpha-diversity %s '
               '--m-metadata-file %s --o-visualization %s'
               % (qza_fp, md_fp, qzv_fp))
    std_out, std_err, return_value = system_call(sig_cmd)
    if return_value != 0:
        raise RuntimeError(
            "Error executing alpha-group-significance for the summary:\n%s"
            % std_err)

    # Unpack the Q2 visualization and point index.html at its entry page
    q2vis = Visualization.load(qzv_fp)
    html_dir = join(out_dir, 'support_files')
    html_fp = join(out_dir, 'index.html')
    q2vis.export_data(html_dir)
    index_name = basename(q2vis.get_index_paths()['html'])
    with open(html_fp, 'w') as f:
        f.write(Q2_INDEX % index_name)

    return html_fp, html_dir
def _run_commands(qclient, job_id, commands, msg): for i, cmd in enumerate(commands): qclient.update_job_step(job_id, msg % i) std_out, std_err, return_value = system_call(cmd) if return_value != 0: error_msg = ("Error running HUMANn2:\nStd out: %s\nStd err: %s" % (std_out, std_err)) return False, error_msg return True, ""
def _run_commands(qclient, job_id, commands, msg, cmd_name): for i, cmd in enumerate(commands): qclient.update_job_step(job_id, msg % i) std_out, std_err, return_value = system_call(cmd) if return_value != 0: error_msg = ("Error running %s:\nStd out: %s\nStd err: %s" "\n\nCommand run was:\n%s" % (cmd_name, std_out, std_err, cmd)) return False, error_msg return True, ""
def _generate_feature_data(files, metadata, out_dir):
    """Generate the HTML summary for a FeatureData[Taxonomy] artifact.

    The plain text taxonomy file is imported as a QIIME 2 artifact if a
    ready-made qza is not already attached, tabulated with
    ``qiime metadata tabulate``, and the resulting visualization is
    exported as the artifact's HTML summary.

    Returns
    -------
    str, str
        The path to the generated index.html and to the support files dir
    """
    # NOTE(review): the ``metadata`` argument is never used in this body
    # Magic number [0] -> there is only one plain text file and it is the
    # feature data
    fdt_fp = files['plain_text'][0]
    if 'qza' not in files:
        fdt_qza = join(out_dir, 'taxonomy.qza')
        # Import the plain text file as a FeatureData[Taxonomy] qiime2
        # artifact (the original comment mentioned SampleData[AlphaDiversity]
        # by copy-paste mistake)
        cmd = ('qiime tools import --input-path %s --output-path %s '
               '--type "FeatureData[Taxonomy]"' % (fdt_fp, fdt_qza))
        std_out, std_err, return_value = system_call(cmd)
        if return_value != 0:
            error_msg = ("Error converting the file to Q2 artifact")
            raise RuntimeError(error_msg)
    else:
        # a qza is already attached to the artifact; reuse it directly
        fdt_qza = files['qza'][0]

    fdt_qzv = join(out_dir, 'feature-data.qzv')
    cmd = ('qiime metadata tabulate --m-input-file %s --o-visualization %s'
           % (fdt_qza, fdt_qzv))
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        error_msg = "Error tabulating Q2 artifact"
        raise RuntimeError(error_msg)

    # Extract the Q2 visualization to use it as html_summary
    q2vis = Visualization.load(fdt_qzv)
    html_dir = join(out_dir, 'support_files')
    html_fp = join(out_dir, 'index.html')

    q2vis.export_data(html_dir)
    index_paths = q2vis.get_index_paths()
    index_name = basename(index_paths['html'])
    with open(html_fp, 'w') as f:
        # Q2_INDEX is an HTML template pointing at the exported entry page
        f.write(Q2_INDEX % index_name)

    return html_fp, html_dir
def _gzip_file(filepath, test=False): """gzip the given filepath if needed Parameters ---------- filepath : string The filepath to verify or compress test : bolean If True do not compress but change the filename, used for unit testing Returns ------- str the new gz filepath, None if error str the error, None if success """ error = None return_fp = filepath if test: return_fp = '%s.gz' % filepath else: is_gzip = False try: with gopen(filepath, 'rb') as f: f.read(1) is_gzip = True except (OSError, IOError): pass if not is_gzip: gz_cmd = 'pigz -p 5 -c {0} > {0}.gz'.format(filepath) std_out, std_err, return_value = system_call(gz_cmd) if return_value != 0 and not test: error = ("Std out: %s\nStd err: %s\n\nCommand run was:\n%s" % (std_out, std_err, gz_cmd)) else: # removing non gz file remove(filepath) return_fp = '%s.gz' % filepath return return_fp, error
def beta_group_significance(qclient, job_id, parameters, out_dir):
    """generate beta group significance calculations

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values for beta correlation
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    boolean, list, str
        The results of the job
    """
    out_dir = join(out_dir, 'beta_group_significance')
    if not exists(out_dir):
        mkdir(out_dir)

    qclient.update_job_step(job_id, "Step 1 of 3: Collecting information")

    # fetch the distance matrix and the analysis sample metadata from Qiita
    artifact_id = parameters['Distance matrix']
    artifact_info = qclient.get("/qiita_db/artifacts/%s/" % artifact_id)
    dm_fp = artifact_info['files']['plain_text'][0]
    dm_qza = join(out_dir, 'q2-distance.qza')
    analysis_id = artifact_info['analysis']
    metadata = qclient.get(
        "/qiita_db/analysis/%s/metadata/" % str(analysis_id))
    metadata_fp = join(out_dir, 'metadata.txt')
    pd.DataFrame.from_dict(metadata, orient='index').to_csv(
        metadata_fp, sep='\t')

    # translate the user-facing parameter values into qiime CLI values
    m_metadata_category = parameters['Metadata category']
    p_method = BETA_GROUP_SIG_METHODS[parameters['Method']]
    p_permutations = parameters['Number of permutations']
    p_pairwise = BETA_GROUP_SIG_TYPE[parameters['Comparison type']]
    o_visualization = join(out_dir, 'beta_group_significance.qzv')

    qclient.update_job_step(
        job_id, "Step 2 of 3: Converting Qiita artifacts to Q2 artifact")
    cmd = ('qiime tools import --input-path %s --output-path %s '
           '--type "DistanceMatrix"' % (dm_fp, dm_qza))
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        return False, None, ("Error converting distance matrix:\nStd out: "
                             "%s\nStd err: %s" % (std_out, std_err))

    qclient.update_job_step(
        job_id, "Step 3 of 3: Calculating beta group significance")
    # p_pairwise provides the final flag name ("--<p_pairwise>")
    cmd = ('qiime diversity beta-group-significance --i-distance-matrix %s '
           '--m-metadata-file %s --m-metadata-category %s --p-method %s '
           '--p-permutations %s --o-visualization %s --%s'
           % (dm_qza, metadata_fp, m_metadata_category, p_method,
              p_permutations, o_visualization, p_pairwise))
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        return False, None, ("Error in beta group significance\nStd out: "
                             "%s\nStd err: %s" % (std_out, std_err))

    ainfo = [ArtifactInfo('Beta group significance visualization',
                          'q2_visualization', [(o_visualization, 'qzv')])]

    return True, ainfo, ""
def test_system_call_error(self):
    """A nonexistent command yields empty stdout, a 'not found' error
    on stderr, and exit status 127."""
    stdout, stderr, status = system_call("IHopeThisCommandDoesNotExist")
    self.assertEqual(status, 127)
    self.assertEqual(stdout, "")
    self.assertIn("not found", stderr)
def deblur(qclient, job_id, parameters, out_dir):
    """Run deblur with the given parameters

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values to run deblur
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    boolean, list, str
        The results of the job

    Notes
    -----
    The code will check if the artifact has a preprocessed_demux element,
    if not it will use the preprocessed_fastq. We prefer to work with the
    preprocessed_demux as running time will be greatly improved
    """
    out_dir = join(out_dir, 'deblur_out')
    # Step 1 get the rest of the information need to run deblur
    qclient.update_job_step(job_id, "Step 1 of 4: Collecting information")
    artifact_id = parameters['Demultiplexed sequences']
    # removing input from parameters so it's not part of the final command
    del parameters['Demultiplexed sequences']

    # Get the artifact filepath information
    artifact_info = qclient.get("/qiita_db/artifacts/%s/" % artifact_id)
    fps = artifact_info['files']

    # Step 2 generating command deblur
    if 'preprocessed_demux' in fps:
        qclient.update_job_step(job_id, "Step 2 of 4: Generating per sample "
                                        "from demux (1/2)")
        if not exists(out_dir):
            mkdir(out_dir)
        split_out_dir = join(out_dir, 'split')
        if not exists(split_out_dir):
            mkdir(split_out_dir)

        # using the same number of parallel jobs as defined by the command
        n_jobs = int(parameters['Jobs to start'])
        # [0] cause there should be only 1 file
        to_per_sample_files(fps['preprocessed_demux'][0],
                            out_dir=split_out_dir, n_jobs=n_jobs)

        qclient.update_job_step(job_id, "Step 2 of 4: Generating per sample "
                                        "from demux (2/2)")
        out_dir = join(out_dir, 'deblured')
        cmd = generate_deblur_workflow_commands([split_out_dir],
                                                out_dir, parameters)
    else:
        qclient.update_job_step(job_id, "Step 2 of 4: Generating deblur "
                                        "command")
        cmd = generate_deblur_workflow_commands(fps['preprocessed_fastq'],
                                                out_dir, parameters)

    # Step 3 execute deblur
    qclient.update_job_step(job_id, "Step 3 of 4: Executing deblur job")
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        error_msg = ("Error running deblur:\nStd out: %s\nStd err: %s"
                     % (std_out, std_err))
        return False, None, error_msg

    # Generating artifact
    pb = partial(join, out_dir)

    # Generate the filepaths
    final_biom = pb('all.biom')
    final_seqs = pb('all.seqs.fa')
    final_biom_hit = pb('reference-hit.biom')
    final_seqs_hit = pb('reference-hit.seqs.fa')

    if not exists(final_biom_hit):
        # Create an empty table. We need to send something to Qiita that is
        # a valid BIOM, so we are going to create an empty table
        t = Table([], [], [])
        with biom_open(final_biom_hit, 'w') as f:
            t.to_hdf5(f, 'qp-deblur generated')

    if not exists(final_seqs_hit):
        # Same as before, create an empty sequence file so we can send it
        with open(final_seqs_hit, 'w') as f:
            f.write("")

    # Step 4, communicate with archive to check and generate placements
    qclient.update_job_step(job_id, "Step 4 of 4 (1/4): Retrieving "
                                    "observations information")
    features = list(load_table(final_biom_hit).ids(axis='observation'))

    # fp_phylogeny stays None unless an insertion tree is built below; the
    # reference-hit artifact is only emitted when a tree exists
    fp_phylogeny = None
    if features:
        # ask the archive which of our features already have placements
        observations = qclient.post(
            "/qiita_db/archive/observations/",
            data={'job_id': job_id, 'features': features})
        novel_fragments = list(set(features) - set(observations.keys()))

        qclient.update_job_step(job_id, "Step 4 of 4 (2/4): Generating %d new "
                                        "placements" % len(novel_fragments))

        # Once we support alternative reference phylogenies for SEPP in the
        # future, we need to translate the reference name here into
        # filepaths pointing to the correct reference alignment and
        # reference tree. If left 'None' the Greengenes 13.8 reference
        # shipped with the fragment-insertion conda package will be used.
        fp_reference_alignment = None
        fp_reference_phylogeny = None
        fp_reference_template = None
        fp_reference_rename = None
        if 'Reference phylogeny for SEPP' in parameters:
            if parameters['Reference phylogeny for SEPP'] == 'tiny':
                fp_reference_alignment = qp_deblur.get_data(join(
                    'sepp', 'reference_alignment_tiny.fasta'))
                fp_reference_phylogeny = qp_deblur.get_data(join(
                    'sepp', 'reference_phylogeny_tiny.nwk'))
                fp_reference_template = qp_deblur.get_data(join(
                    'sepp', 'tmpl_tiny_placement.json'))
                fp_reference_rename = qp_deblur.get_data(join(
                    'sepp', 'tmpl_tiny_rename-json.py'))
        try:
            new_placements = generate_sepp_placements(
                novel_fragments, out_dir, parameters['Threads per sample'],
                reference_alignment=fp_reference_alignment,
                reference_phylogeny=fp_reference_phylogeny)
        except ValueError as e:
            return False, None, str(e)

        qclient.update_job_step(job_id, "Step 4 of 4 (3/4): Archiving %d "
                                        "new placements"
                                        % len(novel_fragments))
        # values needs to be json strings as well
        for fragment in new_placements.keys():
            new_placements[fragment] = json.dumps(new_placements[fragment])

        # fragments that get rejected by a SEPP run don't show up in
        # the placement file, however being rejected is a valuable
        # information and should be stored in the archive as well.
        # Thus, we avoid re-computation for rejected fragments in the
        # future.
        for fragment in novel_fragments:
            if fragment not in new_placements:
                new_placements[fragment] = ""
        if len(new_placements.keys()) > 0:
            qclient.patch(url="/qiita_db/archive/observations/", op="add",
                          path=job_id, value=json.dumps(new_placements))

        # retrieve all fragments and create actual tree
        qclient.update_job_step(job_id, "Step 4 of 4 (4/4): Composing "
                                        "phylogenetic insertion tree")
        placements = qclient.post(
            "/qiita_db/archive/observations/",
            data={'job_id': job_id, 'features': features})
        # remove fragments that have been rejected by SEPP, i.e. whose
        # placement is the empty string and
        # convert all other placements from string to json
        placements = {frag: json.loads(placements[frag])
                      for frag, plc in placements.items()
                      if plc != ''}
        try:
            fp_phylogeny = generate_insertion_trees(
                placements, out_dir,
                reference_template=fp_reference_template,
                reference_rename=fp_reference_rename)
        except ValueError as e:
            return False, None, str(e)
    else:
        new_placements = None

    ainfo = [ArtifactInfo('deblur final table', 'BIOM',
                          [(final_biom, 'biom'),
                           (final_seqs, 'preprocessed_fasta')])]
    if fp_phylogeny is not None:
        ainfo.append(ArtifactInfo('deblur reference hit table', 'BIOM',
                                  [(final_biom_hit, 'biom'),
                                   (final_seqs_hit, 'preprocessed_fasta'),
                                   (fp_phylogeny, 'plain_text')],
                                  new_placements))

    return True, ainfo, ""
def _generate_template_rename(file_reference_phylogeny, file_reference_alignment, out_dir): """Produces placement template and rename script for reference phylogeny. Parameters ---------- file_reference_phylogeny : str A filepath to an alternative reference phylogeny for SEPP. file_reference_alignment : str A filepath to an alternative reference alignment for SEPP. out_dir : str The job output directory Returns ------- (str, str) : Filepaths of reference_template json file and reference_rename python script. Raises ------ ValueError If a) the given out_dir directory does not exist. b) the given reference phylogeny or alignment does not exist. c) the run-sepp.sh wrapper script fails for any reason. Notes ----- This function only needs to be called once per reference phylogeny/ alignment, i.e. if we update Greengenes or extend SEPP for Silva or other reference phylogenies. I am including this function for easier maintainance in the future. """ if not exists(out_dir): raise ValueError("Output directory '%s' does not exist!" % out_dir) if not exists(file_reference_phylogeny): raise ValueError("Reference phylogeny file '%s' does not exits!" % file_reference_phylogeny) if not exists(file_reference_alignment): raise ValueError("Reference alignment file '%s' does not exits!" 
% file_reference_alignment) # create a dummy sequence input file file_input = '%s/input.fasta' % out_dir with open(file_input, 'w') as f: f.write('>dummySeq\n') f.write('TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGATGGA' 'CAAGTCTGATGTGAAAGGCTGGGGCCCAACCCCGGGACTGCATTGGAAACTGCCCGTCTT' 'GAGTG\n') std_out, std_err, return_value = system_call( 'cd %s; run-sepp.sh %s dummy -x 1 -a %s -t %s' % (out_dir, file_input, file_reference_alignment, file_reference_phylogeny)) if return_value != 0: error_msg = ("Error running SEPP:\nStd out: %s\nStd err: %s" % (std_out, std_err)) raise ValueError(error_msg) # take resulting placement.json and turn it into the template by # clearing the list of placements file_template = '%s/tmpl_dummy_placement.json' % out_dir with open('%s/dummy_placement.json' % out_dir, 'r') as f: placements = json.loads(f.read()) placements['placements'] = [] with open(file_template, 'w') as fw: json.dump(placements, fw) # Another file produced by SEPP is xxx_rename-json.py, where xxx is the # name of the run, here "dummy". SEPP needs to escape node names before the # reference tree is given to guppy which can only handle a limited name # format. Thus, after guppy, the result needs to be back translated to # original names with the rename-json.py script that is generated by SEPP. return (file_template, '%s/dummy_rename-json.py' % out_dir)
def beta_diversity(qclient, job_id, parameters, out_dir):
    """generate beta diversity calculations

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values for beta diversity
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    boolean, list, str
        The results of the job
    """
    out_dir = join(out_dir, 'beta_diversity')
    if not exists(out_dir):
        mkdir(out_dir)

    qclient.update_job_step(job_id, "Step 1 of 4: Collecting information")
    artifact_id = parameters['BIOM table']
    metric = BETA_DIVERSITY_METRICS[parameters['Diversity metric']]
    tree = parameters['Phylogenetic tree']
    # the GUI sends the literal string 'None' when no tree was selected
    if tree == 'None':
        tree = None
    artifact_info = qclient.get("/qiita_db/artifacts/%s/" % artifact_id)
    biom_fpi = artifact_info['files']['biom'][0]
    biom_qza = join(out_dir, 'q2-biom.qza')
    num_jobs = parameters['Number of jobs']

    qclient.update_job_step(
        job_id, "Step 2 of 4: Converting Qiita artifacts to Q2 artifact")
    # converting biom; the type string is deliberately left unclosed so a
    # Properties annotation can be appended below when all sample totals
    # are equal
    cmd = ('qiime tools import --input-path %s --output-path %s '
           '--type "FeatureTable[Frequency]' % (biom_fpi, biom_qza))
    b = load_table(biom_fpi)
    counts = list(map(sum, b.iter_data()))
    if min(counts) == max(counts):
        cmd += " % Properties(['uniform-sampling'])\""
    else:
        cmd += '"'
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        error_msg = ("Error converting biom:\nStd out: %s\nStd err: %s"
                     % (std_out, std_err))
        return False, None, error_msg

    # converting tree
    if tree is not None:
        qza_tree = join(out_dir, 'tree.qza')
        cmd = ('qiime tools import --input-path %s --type Phylogeny[Rooted] '
               '--output-path %s' % (tree, qza_tree))
        tree = qza_tree
        std_out, std_err, return_value = system_call(cmd)
        if return_value != 0:
            error_msg = ("Error converting tree:\nStd out: %s\nStd err: %s"
                         % (std_out, std_err))
            return False, None, error_msg

    qclient.update_job_step(
        job_id, "Step 3 of 4: Calculating beta diversity: %s" % (metric))
    if tree is not None and metric in STATE_UNIFRAC_METRICS:
        su_metric = STATE_UNIFRAC_METRICS[metric]
        dtx_fp = join(out_dir, '%s.qza' % su_metric)
        cmd = ('qiime diversity beta-phylogenetic-alt --p-metric %s '
               '--i-table %s --i-phylogeny %s --o-distance-matrix %s '
               '--p-n-jobs %s' % (su_metric, biom_qza, tree, dtx_fp,
                                  num_jobs))
        if parameters['Adjust variance (phylogenetic only)']:
            cmd += ' --p-variance-adjusted'
        if parameters['Bypass tips (phylogenetic only)']:
            cmd += ' --p-bypass-tips'
        if su_metric == 'generalized_unifrac':
            # BUG FIX: the appended flag was missing its leading space,
            # which fused it onto the previous argument and broke the
            # generalized UniFrac command line
            cmd += ' --p-alpha %s' % parameters[
                'Alpha value (Generalized Unifrac only)']
    elif metric not in STATE_UNIFRAC_METRICS and tree is None:
        dtx_fp = join(out_dir, '%s.qza' % metric)
        cmd = ('qiime diversity beta --i-table %s --p-metric %s '
               '--o-distance-matrix %s --p-n-jobs %s'
               % (biom_qza, metric, dtx_fp, num_jobs))
    else:
        # NOTE(review): this branch also triggers for a non-phylogenetic
        # metric combined with a tree, where the message is misleading
        return False, None, ('Phylogenetic metric %s selected but no tree '
                             'exists' % metric)
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        error_msg = ("Error in beta div %s:\nStd out: %s\nStd err: %s"
                     % (metric, std_out, std_err))
        return False, None, error_msg

    qclient.update_job_step(
        job_id, "Step 4 of 4: Converting Q2 to Qiita artifacts")
    fdir = join(out_dir, 'dtx')
    ffp = join(fdir, 'distance-matrix.tsv')
    cmd = "qiime tools export --output-dir %s %s" % (fdir, dtx_fp)
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        error_msg = ("Error in Q2 -> Qiita conversion:\nStd out: "
                     "%s\nStd err: %s" % (std_out, std_err))
        return False, None, error_msg

    ainfo = [ArtifactInfo('Distance matrix', 'distance_matrix',
                          [(ffp, 'plain_text')])]

    return True, ainfo, ""
def shogun(qclient, job_id, parameters, out_dir):
    """Run Shogun with the given parameters

    Parameters
    ----------
    qclient : tgp.qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values to run Shogun
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    bool, list, str
        The results of the job
    """
    # Step 1 get the rest of the information needed to run Shogun
    qclient.update_job_step(job_id, "Step 1 of 6: Collecting information")
    artifact_id = parameters['input']
    # removing input from parameters so it's not part of the final command
    del parameters['input']

    # Get the artifact filepath information
    artifact_info = qclient.get("/qiita_db/artifacts/%s/" % artifact_id)
    fps = artifact_info['files']

    # Get the artifact metadata
    prep_info = qclient.get('/qiita_db/prep_template/%s/'
                            % artifact_info['prep_information'][0])
    qiime_map = prep_info['qiime-map']

    # Step 2 converting to fna
    qclient.update_job_step(job_id,
                            "Step 2 of 6: Converting to FNA for Shogun")
    rs = fps['raw_reverse_seqs'] if 'raw_reverse_seqs' in fps else []
    samples = make_read_pairs_per_sample(fps['raw_forward_seqs'], rs,
                                         qiime_map)

    # Combining files
    comb_fp = generate_fna_file(out_dir, samples)

    # Formatting parameters
    parameters = _format_params(parameters, SHOGUN_PARAMS)

    # Step 3 align
    align_cmd = generate_shogun_align_commands(comb_fp, out_dir, parameters)
    sys_msg = "Step 3 of 6: Aligning FNA with Shogun (%d/{0})".format(
        len(align_cmd))
    success, msg = _run_commands(qclient, job_id, align_cmd, sys_msg,
                                 'Shogun Align')
    if not success:
        return False, None, msg

    # Step 4 taxonomic profile
    # BUG FIX: the message total ({0}) was never filled in for this step;
    # format with the number of commands, as Step 3 does
    assign_cmd, profile_fp = generate_shogun_assign_taxonomy_commands(
        out_dir, parameters)
    sys_msg = "Step 4 of 6: Taxonomic profile with Shogun (%d/{0})".format(
        len(assign_cmd))
    success, msg = _run_commands(qclient, job_id, assign_cmd, sys_msg,
                                 'Shogun taxonomy assignment')
    if not success:
        return False, None, msg

    # Step 5 compress the alignment and convert the profile to BIOM
    sys_msg = "Step 5 of 6: Compressing and converting alignment to BIOM"
    # BUG FIX: this previously passed the stale ``msg`` (the empty string
    # left over from _run_commands) instead of the step message
    qclient.update_job_step(job_id, sys_msg)
    alignment_fp = join(
        out_dir,
        'alignment.%s.%s' % (parameters['aligner'],
                             ALN2EXT[parameters['aligner']]))
    xz_cmd = 'xz -9 -T%s %s' % (parameters['threads'], alignment_fp)
    std_out, std_err, return_value = system_call(xz_cmd)
    if return_value != 0:
        error_msg = ("Error during %s:\nStd out: %s\nStd err: %s"
                     "\n\nCommand run was:\n%s"
                     % (sys_msg, std_out, std_err, xz_cmd))
        return False, None, error_msg
    output = run_shogun_to_biom(profile_fp, [None, None, None, True],
                                out_dir, 'profile')

    ainfo = [ArtifactInfo('Shogun Alignment Profile', 'BIOM',
                          [(output, 'biom'),
                           ('%s.xz' % alignment_fp, 'log')])]

    # Step 6 redistribute profile (the original comment said "Step 5")
    levels = ['phylum', 'genus', 'species']
    redist_fps = []
    for level in levels:
        redist_cmd, output = generate_shogun_redist_commands(
            profile_fp, out_dir, parameters, level)
        redist_fps.append(output)
        sys_msg = ("Step 6 of 6: Redistributed profile with Shogun "
                   "(%d/{0})").format(len(redist_cmd))
        success, msg = _run_commands(qclient, job_id, redist_cmd, sys_msg,
                                     'Shogun redistribute')
        if not success:
            return False, None, msg

    # Converting redistributed files to biom
    for redist_fp, level in zip(redist_fps, levels):
        biom_in = ["redist", None, '', True]
        output = run_shogun_to_biom(redist_fp, biom_in, out_dir, level,
                                    'redist')
        aname = 'Taxonomic Predictions - %s' % level
        ainfo.append(ArtifactInfo(aname, 'BIOM', [(output, 'biom')]))

    return True, ainfo, ""
def _summary_not_demultiplexed(artifact_type, filepaths):
    """Generates the HTML summary for non Demultiplexed artifacts

    Parameters
    ----------
    artifact_type : str
        The artifact type
    filepaths : dict of {str: list of str}
        The filepaths attached to the artifact, keyed by filepath type

    Returns
    -------
    str
        An HTML table with the per-file name, md5, type and (when
        available) read count

    Raises
    ------
    ValueError
        If fqtools failed for any of the files
    """
    # loop over each of the fps/fps_type pairs
    artifact_information = []
    errors = []
    df = None
    for fps_type, fps in sorted(filepaths.items()):
        if fps_type in {'html_summary'}:
            continue

        # Step 2: generate HTML summary
        # md5, from http://stackoverflow.com/a/3431838
        for i, fp in enumerate(fps):
            fn = basename(fp)
            with open(fp, "rb") as f:
                hash_md5 = md5()
                for chunk in iter(lambda: f.read(4096), b""):
                    hash_md5.update(chunk)

            data = {'filename': fn, 'md5': hash_md5.hexdigest(),
                    'file_type': fps_type}
            if artifact_type not in FILEPATH_TYPE_NO_FQTOOLS:
                # check if the validate summary is present; if so we can
                # reuse the precomputed read counts instead of running
                # fqtools again
                if i == 0:
                    fdata = f'{dirname(fp)}/qtp-sequencing-validate-data.csv'
                    if exists(fdata):
                        df = pd.read_csv(fdata, index_col=None)
                if df is None:
                    cmd = f'fqtools count {fp}'
                    std_out, std_err, return_value = system_call(cmd)
                    if std_err or return_value != 0:
                        errors.append(f'{fn}: {std_err}')
                        reads = None
                    else:
                        reads = int(std_out)
                else:
                    reads = df[(df.filename == fn) &
                               (df.file_type == fps_type)]
                    # [0] there is only one value
                    reads = reads.reads.values[0]
                data['reads'] = reads
            artifact_information.append(data)

    if errors:
        # BUG FIX: previously joined with '' so multiple per-file errors
        # ran together on one line; separate them with newlines
        raise ValueError('Found errors: \n %s' % '\n'.join(errors))

    df = pd.DataFrame(artifact_information)
    order = ['file_type', 'reads'] if 'reads' in df.columns else ['file_type']
    df.sort_values(order, inplace=True)

    return df.to_html(index=False)
def pcoa(qclient, job_id, parameters, out_dir):
    """generate pcoa calculations

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values for pcoa
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    boolean, list, str
        The results of the job
    """
    out_dir = join(out_dir, 'pcoa')
    if not exists(out_dir):
        mkdir(out_dir)

    qclient.update_job_step(job_id, "Step 1 of 4: Collecting information")
    # locate the distance matrix attached to the input artifact
    artifact_info = qclient.get(
        "/qiita_db/artifacts/%s/" % parameters['Distance matrix'])
    dm_fp = artifact_info['files']['plain_text'][0]
    dm_qza = join(out_dir, 'q2-distance.qza')
    pcoa_qza = join(out_dir, 'q2-pcoa.qza')

    qclient.update_job_step(
        job_id, "Step 2 of 4: Converting Qiita artifacts to Q2 artifact")
    cmd = ('qiime tools import --input-path %s --output-path %s '
           '--type "DistanceMatrix"' % (dm_fp, dm_qza))
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        return False, None, ("Error converting distance matrix:\nStd out: "
                             "%s\nStd err: %s" % (std_out, std_err))

    qclient.update_job_step(
        job_id, "Step 3 of 4: Calculating pcoa")
    cmd = ('qiime diversity pcoa --i-distance-matrix %s --o-pcoa %s'
           % (dm_qza, pcoa_qza))
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        return False, None, ("Error in PCoA\nStd out: %s\nStd err: %s"
                             % (std_out, std_err))

    qclient.update_job_step(
        job_id, "Step 4 of 4: Converting Q2 to Qiita artifacts")
    # the export drops an ordination.txt file in the target directory
    fdir = join(out_dir, 'pcoa')
    ffp = join(fdir, 'ordination.txt')
    cmd = "qiime tools export --output-dir %s %s" % (fdir, pcoa_qza)
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        return False, None, ("Error in Q2 -> Qiita conversion:\nStd out: "
                             "%s\nStd err: %s" % (std_out, std_err))

    return True, [ArtifactInfo('Ordination results', 'ordination_results',
                               [(ffp, 'plain_text')])], ""
def test_system_call(self):
    """A trivial command returns its stdout, empty stderr and status 0."""
    stdout, stderr, status = system_call("pwd")
    self.assertEqual(status, 0)
    self.assertEqual(stderr, "")
    self.assertEqual(stdout, "%s\n" % getcwd())
def alpha_diversity(qclient, job_id, parameters, out_dir):
    """generate alpha diversity calculations

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values for alpha diversity
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    boolean, list, str
        The results of the job
    """
    out_dir = join(out_dir, 'alpha_diversity')
    if not exists(out_dir):
        mkdir(out_dir)

    qclient.update_job_step(job_id, "Step 1 of 4: Collecting information")
    artifact_id = parameters['BIOM table']
    metric = ALPHA_DIVERSITY_METRICS[parameters['Diversity metric']]
    tree = parameters['Phylogenetic tree']
    # the GUI sends the literal string 'None' when no tree was selected
    if tree == 'None':
        tree = None
    artifact_info = qclient.get("/qiita_db/artifacts/%s/" % artifact_id)
    biom_fpi = artifact_info['files']['biom'][0]
    biom_qza = join(out_dir, 'q2-biom.qza')

    qclient.update_job_step(
        job_id, "Step 2 of 4: Converting Qiita artifacts to Q2 artifact")
    # converting biom; the type string is deliberately left unclosed so a
    # Properties annotation can be appended below
    cmd = ('qiime tools import --input-path %s --output-path %s '
           '--type "FeatureTable[Frequency]' % (biom_fpi, biom_qza))
    b = load_table(biom_fpi)
    counts = list(map(sum, b.iter_data()))
    if min(counts) == max(counts):
        # all sample totals are equal -> flag the table as uniform-sampling
        cmd += " % Properties(['uniform-sampling'])\""
    else:
        cmd += '"'
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        error_msg = ("Error converting biom:\nStd out: %s\nStd err: %s"
                     % (std_out, std_err))
        return False, None, error_msg

    # converting tree
    if tree is not None:
        qza_tree = join(out_dir, 'tree.qza')
        cmd = ('qiime tools import --input-path %s --type Phylogeny[Rooted] '
               '--output-path %s' % (tree, qza_tree))
        tree = qza_tree
        std_out, std_err, return_value = system_call(cmd)
        if return_value != 0:
            error_msg = ("Error converting tree:\nStd out: %s\nStd err: %s"
                         % (std_out, std_err))
            return False, None, error_msg

    qclient.update_job_step(
        job_id, "Step 3 of 4: Calculating alpha diversity: %s" % (metric))
    alpha_fp = join(out_dir, '%s.qza' % metric)
    if tree is not None and metric in ALPHA_PHYLOGENETIC_METRICS:
        cmd = 'qiime diversity alpha-phylogenetic --i-phylogeny %s ' % tree
    elif metric not in ALPHA_PHYLOGENETIC_METRICS and tree is None:
        cmd = 'qiime diversity alpha '
    else:
        # NOTE(review): this branch also triggers for a non-phylogenetic
        # metric combined with a tree, in which case the message below is
        # misleading -- confirm whether that combination should be allowed
        return False, None, ('Phylogenetic metric %s selected but no tree '
                             'exists' % metric)
    cmd += '--i-table %s --p-metric %s --o-alpha-diversity %s' % (
        biom_qza, metric, alpha_fp)
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        error_msg = ("Error in alpha div %s:\nStd out: %s\nStd err: %s"
                     % (metric, std_out, std_err))
        return False, None, error_msg

    qclient.update_job_step(
        job_id, "Step 4 of 4: Converting Q2 to Qiita artifacts")
    # qiime export writes alpha-diversity.tsv into the target directory
    fdir = join(out_dir, 'alpha')
    ffp = join(fdir, 'alpha-diversity.tsv')
    cmd = "qiime tools export --output-dir %s %s" % (fdir, alpha_fp)
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        error_msg = ("Error in Q2 -> Qiita conversion:\nStd out: "
                     "%s\nStd err: %s" % (std_out, std_err))
        return False, None, error_msg

    ainfo = [ArtifactInfo('Alpha vectors', 'alpha_vector',
                          [(ffp, 'plain_text')])]

    return True, ainfo, ""
def deblur(qclient, job_id, parameters, out_dir):
    """Run deblur with the given parameters

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values to run deblur
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    boolean, list, str
        The results of the job

    Notes
    -----
    The code will check if the artifact has a preprocessed_demux element,
    if not it will use the preprocessed_fastq. We prefer to work with the
    preprocessed_demux as running time will be greatly improved
    """
    out_dir = join(out_dir, 'deblur_out')
    # Step 1 get the rest of the information need to run deblur
    qclient.update_job_step(job_id, "Step 1 of 3: Collecting information")
    artifact_id = parameters['seqs-fp']
    # removing input from parameters so it's not part of the final command
    del parameters['seqs-fp']

    # Get the artifact filepath information
    artifact_info = qclient.get("/qiita_db/artifacts/%s/" % artifact_id)
    fps = artifact_info['files']

    # Step 2 generating command deblur
    if 'preprocessed_demux' in fps:
        # the demux file must first be split into per-sample fastq files
        # before deblur can be run on it
        qclient.update_job_step(job_id, "Step 2 of 3: Generating per sample "
                                        "from demux (1/2)")
        if not exists(out_dir):
            mkdir(out_dir)
        split_out_dir = join(out_dir, 'split')
        if not exists(split_out_dir):
            mkdir(split_out_dir)

        # using the same number of parallel jobs as defined by the command
        n_jobs = parameters['jobs-to-start']
        # [0] cause there should be only 1 file
        to_per_sample_files(fps['preprocessed_demux'][0],
                            out_dir=split_out_dir, n_jobs=n_jobs)

        qclient.update_job_step(job_id, "Step 2 of 3: Generating per sample "
                                        "from demux (2/2)")
        # deblur results land below out_dir/deblured when starting from
        # the split per-sample files
        out_dir = join(out_dir, 'deblured')
        cmd = generate_deblur_workflow_commands([split_out_dir],
                                                out_dir, parameters)
    else:
        qclient.update_job_step(job_id, "Step 2 of 3: Generating deblur "
                                        "command")
        cmd = generate_deblur_workflow_commands(fps['preprocessed_fastq'],
                                                out_dir, parameters)

    # Step 3 execute deblur
    qclient.update_job_step(job_id, "Step 3 of 3: Executing deblur job")
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        error_msg = ("Error running deblur:\nStd out: %s\nStd err: %s"
                     % (std_out, std_err))
        return False, None, error_msg

    # Generating artifact
    pb = partial(join, out_dir)

    # Generate the filepaths
    final_biom = pb('final.biom')
    final_seqs = pb('final.seqs.fa')
    final_biom_16s = pb('final.only-16s.biom')
    final_seqs_na = pb('final.seqs.fa.no_artifacts')

    if not exists(final_biom_16s):
        # Create an empty table. We need to send something to Qiita that is
        # a valid BIOM, so we are going to create an empty table
        t = Table([], [], [])
        with biom_open(final_biom_16s, 'w') as f:
            t.to_hdf5(f, 'qp-deblur generated')

    if not exists(final_seqs_na):
        # Same as before, create an empty sequence file so we can send it
        with open(final_seqs_na, 'w') as f:
            f.write("")

    ainfo = [
        ArtifactInfo('deblur final table', 'BIOM',
                     [(final_biom, 'biom'),
                      (final_seqs, 'preprocessed_fasta')]),
        ArtifactInfo('deblur 16S only table', 'BIOM',
                     [(final_biom_16s, 'biom'),
                      (final_seqs_na, 'preprocessed_fasta')])
    ]

    return True, ainfo, ""
def alpha_correlation(qclient, job_id, parameters, out_dir):
    """generate alpha correlation calculations

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values for alpha correlation
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    boolean, list, str
        The results of the job
    """
    out_dir = join(out_dir, 'alpha_correlation')
    if not exists(out_dir):
        mkdir(out_dir)

    qclient.update_job_step(job_id, "Step 1 of 3: Collecting information")

    # locate the alpha vectors artifact and its analysis sample metadata
    artifact_id = parameters['Alpha vectors']
    info = qclient.get("/qiita_db/artifacts/%s/" % artifact_id)
    vectors_fp = info['files']['plain_text'][0]
    vectors_qza = join(out_dir, 'q2-alpha-diversity.qza')

    analysis_id = info['analysis']
    sample_info = qclient.get(
        "/qiita_db/analysis/%s/metadata/" % str(analysis_id))
    metadata_fp = join(out_dir, 'metadata.txt')
    pd.DataFrame.from_dict(sample_info, orient='index').to_csv(
        metadata_fp, sep='\t')

    method = ALPHA_CORRELATION_METHODS[parameters['Correlation method']]
    viz_fp = join(out_dir, 'alpha_correlation.qzv')

    qclient.update_job_step(
        job_id, "Step 2 of 3: Converting Qiita artifacts to Q2 artifact")
    cmd = ('qiime tools import --input-path %s --output-path %s '
           '--type "SampleData[AlphaDiversity]"' % (vectors_fp, vectors_qza))
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        return False, None, ("Error converting distance matrix:\nStd out: "
                             "%s\nStd err: %s" % (std_out, std_err))

    qclient.update_job_step(
        job_id, "Step 3 of 3: Calculating alpha correlation")
    cmd = ('qiime diversity alpha-correlation --i-alpha-diversity %s '
           '--m-metadata-file %s --p-method %s --o-visualization %s' % (
               vectors_qza, metadata_fp, method, viz_fp))
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        return False, None, ("Error in Alpha Correlation\nStd out: %s\n"
                             "Std err: %s" % (std_out, std_err))

    ainfo = [ArtifactInfo('Alpha correlation visualization',
                          'q2_visualization', [(viz_fp, 'qzv')])]
    return True, ainfo, ""
def generate_insertion_trees(placements, out_dir,
                             reference_template=None, reference_rename=None):
    """Generates phylogenetic trees by inserting placements into a reference

    Parameters
    ----------
    placements : dict of strings
        keys are the seqs, values are the new placements as JSON strings
    out_dir : str
        The job output directory
    reference_template : str, optional
        Filepath to the reference placement json file. This file can be
        produced via _generate_template_rename() and should be stored in the
        plugin package, because it can be re-used. If None, it falls back to
        the Greengenes 13.8 99% reference.
    reference_rename : str, optional
        Similar to reference_template, but a filepath to the generated python
        renaming script to undo the name escaping post guppy. If None, it
        falls back to the Greengenes 13.8 99% reference.

    Returns
    -------
    str
        The filepath of the phylogenetic insertion tree in Newick format.

    Raises
    ------
    ValueError
        If a) the given reference_template or reference_rename files do not
        exist b) or the guppy binary exits with non-zero return code c) or
        the given rename script exits with non-zero return code.
    """
    # test if reference file for rename script actually exists.
    file_ref_rename = qp_deblur.get_data(
        join('sepp', 'tmpl_gg13.8-99_rename-json.py'))
    if reference_rename is not None:
        file_ref_rename = reference_rename
    if not exists(file_ref_rename):
        # fixed typo in user-facing message: "exits" -> "exist"
        raise ValueError("Reference rename script '%s' does not exist!"
                         % file_ref_rename)

    # create a valid placement.json file as input for guppy
    file_ref_template = qp_deblur.get_data(
        join('sepp', 'tmpl_gg13.8-99_placement.json'))
    if reference_template is not None:
        file_ref_template = reference_template
    if not exists(file_ref_template):
        raise ValueError("Reference template '%s' does not exist!"
                         % file_ref_template)

    with open(file_ref_template, 'r') as f:
        plcmnts = json.loads(f.read())
    # merge the new placements into the reference placement set
    plcmnts['placements'].extend(
        [{'p': placement, 'nm': [[sequence, 1]]}
         for sequence, placement in placements.items()])

    file_placements = '%s/placements.json' % out_dir
    with open(file_placements, 'w') as f:
        json.dump(plcmnts, f)

    # execute guppy to convert the placements into a Newick tree
    file_tree_escaped = join(out_dir, 'insertion_tree.tre')
    std_out, std_err, return_value = system_call(
        'guppy tog %s -o %s' % (file_placements, file_tree_escaped))
    if return_value != 0:
        error_msg = ("Error running guppy:\nStd out: %s\nStd err: %s"
                     % (std_out, std_err))
        raise ValueError(error_msg)

    # execute node name re-labeling (to revert the escaping of names
    # necessary for guppy)
    file_tree = join(out_dir, 'insertion_tree.relabelled.tre')
    std_out, std_err, return_value = system_call(
        'cat %s | python %s > %s' % (file_tree_escaped,
                                     file_ref_rename,
                                     file_tree))
    if return_value != 0:
        error_msg = (("Error running %s:\n"
                      "Std out: %s\nStd err: %s")
                     % (file_ref_rename, std_out, std_err))
        raise ValueError(error_msg)

    # making sure that all branches in the generated tree have branch lengths
    tree = TreeNode.read(file_tree)
    for node in tree.preorder(include_self=False):
        if node.length is None:
            node.length = 0.0
    tree.write(file_tree)

    return file_tree
def taxa_barplot(qclient, job_id, parameters, out_dir):
    """Generate taxa barplot calculations

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values for taxa barplot
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    boolean, list, str
        The results of the job
    """
    out_dir = join(out_dir, 'taxa_barplot')
    if not exists(out_dir):
        mkdir(out_dir)

    qclient.update_job_step(job_id, "Step 1 of 4: Collecting information")
    artifact_id = int(parameters['BIOM table'])
    info = qclient.get("/qiita_db/artifacts/%d/" % artifact_id)

    # dump the analysis sample metadata so the Q2 commands below can use it
    sample_info = qclient.get(
        "/qiita_db/analysis/%s/metadata/" % str(info['analysis']))
    metadata_fp = join(out_dir, 'metadata.txt')
    pd.DataFrame.from_dict(sample_info, orient='index').to_csv(
        metadata_fp, sep='\t')

    biom_qza = join(out_dir, 'q2-biom.qza')
    taxonomy_txt = join(out_dir, 'q2-taxonomy.txt')
    taxonomy_qza = join(out_dir, 'q2-taxonomy.qza')
    taxa_plot_qzv = join(out_dir, 'taxa-barplot.qzv')

    # load the biom table so we can extract the per-feature taxonomies
    biom_fp = info['files']['biom'][0]
    table = load_table(biom_fp)
    with open(taxonomy_txt, 'w') as out_fh:
        out_fh.write('Feature ID\tTaxon\n')
        for feature in table.ids('observation'):
            feature_md = table.metadata(id=feature, axis='observation')
            if feature_md is None:
                return False, None, "biom table doesn't have taxonomy"
            out_fh.write("%s\t%s\n"
                         % (feature, '; '.join(feature_md['taxonomy'])))

    qclient.update_job_step(
        job_id,
        "Step 2 of 4: Converting Qiita artifacts to Q2 artifact: BIOM")
    # the FeatureTable type string is closed below; rarefied (uniform
    # count) tables get an extra Properties tag first
    cmd = ('qiime tools import --input-path %s --output-path %s '
           '--type "FeatureTable[Frequency]' % (biom_fp, biom_qza))
    sample_sums = [sum(v) for v in table.iter_data()]
    if min(sample_sums) == max(sample_sums):
        cmd += " % Properties(['uniform-sampling'])\""
    else:
        cmd += '"'
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        return False, None, ("Error converting biom:\nStd out: %s\n"
                             "Std err: %s" % (std_out, std_err))

    qclient.update_job_step(job_id, "Step 3 of 4: Converting Qiita artifacts "
                                    "to Q2 artifact: Taxonomy")
    cmd = ('qiime tools import --input-path %s --output-path %s '
           '--type "FeatureData[Taxonomy]"' % (taxonomy_txt, taxonomy_qza))
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        return False, None, ("Error converting taxonomy:\nStd out: %s\n"
                             "Std err: %s" % (std_out, std_err))

    qclient.update_job_step(job_id, "Step 4 of 4: Generating summary")
    cmd = ('qiime taxa barplot --i-table %s --i-taxonomy %s '
           '--m-metadata-file %s --o-visualization %s' % (
               biom_qza, taxonomy_qza, metadata_fp, taxa_plot_qzv))
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        return False, None, ("Error generating taxonomy summary:\nStd out: "
                             "%s\nStd err: %s" % (std_out, std_err))

    ainfo = [ArtifactInfo('Taxa summaries visualization', 'q2_visualization',
                          [(taxa_plot_qzv, 'qzv')])]
    return True, ainfo, ""
def generate_sepp_placements(seqs, out_dir, threads, reference_phylogeny=None,
                             reference_alignment=None):
    """Generates the sepp commands

    Parameters
    ----------
    seqs : list of str
        A list of seqs to generate placements
    out_dir : str
        The job output directory
    threads : int
        Number of CPU cores to use
    reference_phylogeny : str, optional
        A filepath to an alternative reference phylogeny for SEPP. If None,
        the default phylogeny is used, which is Greengenes 13.8 99% id.
    reference_alignment : str, optional
        A filepath to an alternative reference alignment for SEPP. If None,
        the default alignment is used, which is Greengenes 13.8 99% id.

    Returns
    -------
    dict of strings
        keys are the seqs, values are the new placements as JSON strings

    Raises
    ------
    ValueError
        If run-sepp.sh does not produce the expected file placements.json,
        which is an indicator that something failed.
    """
    # no input sequences -> nothing to place
    if not seqs:
        return {}

    # write every sequence as its own FASTA record; the sequence itself
    # doubles as its identifier
    file_input = "%s/input.fasta" % out_dir
    with open(file_input, 'w') as out_fasta:
        out_fasta.write(''.join(">%s\n%s\n" % (s, s) for s in seqs))

    run_name = 'qiita'
    param_phylogeny = '' if reference_phylogeny is None \
        else ' -t %s ' % reference_phylogeny
    param_alignment = '' if reference_alignment is None \
        else ' -a %s ' % reference_alignment

    # run-sepp.sh writes its output into the current working directory,
    # so hop into out_dir for the call and return to the stored cwd
    # afterwards for a clean state
    curr_pwd = environ['PWD']
    std_out, std_err, return_value = system_call(
        'cd %s && run-sepp.sh %s %s -x %s %s %s; cd %s'
        % (out_dir, file_input, run_name, threads, param_phylogeny,
           param_alignment, curr_pwd))

    file_placements = '%s/%s_placement.json' % (out_dir, run_name)
    if not exists(file_placements):
        # due to the wrapper style of run-sepp.sh the actual exit code is
        # never returned, so a missing result file is the only failure
        # signal. If the main SEPP program failed it reports information
        # in two log files, whose content we read and report.
        file_stderr = '%s/sepp-%s-err.log' % (out_dir, run_name)
        if exists(file_stderr):
            with open(file_stderr, 'r') as log:
                std_err = log.readlines()
        file_stdout = '%s/sepp-%s-out.log' % (out_dir, run_name)
        if exists(file_stdout):
            with open(file_stdout, 'r') as log:
                std_out = log.readlines()
        raise ValueError("Error running run-sepp.sh:\nStd out: %s\n"
                         "Std err: %s" % (std_out, std_err))

    # parse placements from the SEPP results file
    with open(file_placements, 'r') as fh:
        results = json.loads(fh.read())
    return {entry['nm'][0][0]: entry['p']
            for entry in results['placements']}
def filter_samples(qclient, job_id, parameters, out_dir):
    """Filter samples from a table

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values for filter samples
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    boolean, list, str
        The results of the job
    """
    out_dir = join(out_dir, 'filter_samples')
    if not exists(out_dir):
        mkdir(out_dir)

    qclient.update_job_step(job_id, "Step 1 of 4: Collecting information")
    artifact_id = int(parameters['BIOM table'])
    max_freq = int(parameters['Maximum feature frequency across samples'])
    max_feats = int(parameters['Maximum features per sample'])
    min_freq = int(parameters['Minimum feature frequency across samples'])
    min_feats = int(parameters['Minimum features per sample'])
    where_clause = parameters['SQLite WHERE-clause']

    info = qclient.get("/qiita_db/artifacts/%d/" % artifact_id)
    sample_info = qclient.get(
        "/qiita_db/analysis/%s/metadata/" % str(info['analysis']))
    metadata_fp = join(out_dir, 'metadata.txt')
    pd.DataFrame.from_dict(sample_info, orient='index').to_csv(
        metadata_fp, sep='\t')

    # [0]: a BIOM artifact carries exactly one biom file
    in_biom = info['files']['biom'][0]
    biom_qza = join(out_dir, 'biom.qza')

    qclient.update_job_step(
        job_id, "Step 2 of 4: Converting Qiita artifacts to Q2 artifact")
    # the FeatureTable type string stays open on purpose: rarefied
    # (uniform count) tables get an extra Properties tag before closing
    cmd = ('qiime tools import --input-path %s --output-path %s '
           '--type "FeatureTable[Frequency]' % (in_biom, biom_qza))
    table = load_table(in_biom)
    sample_sums = [sum(v) for v in table.iter_data()]
    if min(sample_sums) == max(sample_sums):
        cmd += " % Properties(['uniform-sampling'])\""
    else:
        cmd += '"'
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        return False, None, ("Error converting biom:\nStd out: %s\n"
                             "Std err: %s" % (std_out, std_err))

    qclient.update_job_step(job_id, "Step 3 of 4: Filtering")
    filtered_qza = join(out_dir, 'biom_filtered.qza')
    cmd = ('qiime feature-table filter-samples --m-metadata-file %s '
           '--o-filtered-table %s --p-max-frequency %d --p-max-features %d '
           '--p-min-frequency %d --p-min-features %d --i-table %s' % (
               metadata_fp, filtered_qza, max_freq, max_feats,
               min_freq, min_feats, biom_qza))
    if where_clause != '':
        cmd += ' --p-where "%s"' % where_clause
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        return False, None, ("Error in filtering samples in biom\nStd out: "
                             "%s\nStd err: %s" % (std_out, std_err))

    qclient.update_job_step(
        job_id, "Step 4 of 4: Converting Q2 to Qiita artifacts")
    export_dir = join(out_dir, 'filter_samples')
    exported_fp = join(export_dir, 'feature-table.biom')
    cmd = "qiime tools export --output-dir %s %s" % (export_dir, filtered_qza)
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        return False, None, ("Error in Q2 -> Qiita conversion:\nStd out: "
                             "%s\nStd err: %s" % (std_out, std_err))

    # Qiime2 drops the observation (taxonomy) metadata from the BIOM
    # table while filtering, so copy it back from the original table
    source = load_table(in_biom)
    filtered = load_table(exported_fp)
    obs_md = {i: source.metadata(i, axis='observation')
              for i in filtered.ids(axis='observation')}
    filtered.add_metadata(obs_md, axis='observation')
    result_fp = join(out_dir, 'filtered.biom')
    with biom_open(result_fp, 'w') as fh:
        filtered.to_hdf5(fh, "Qiita's Qiime2 plugin")

    ainfo = [ArtifactInfo('o-table', 'BIOM', [(result_fp, 'biom')])]
    return True, ainfo, ""
def spades_to_array(directory, output_dir, prefix_to_name, url,
                    job_id, params):
    """Build the Torque/PBS array-job scripts to run spades per sample.

    Parameters
    ----------
    directory : str
        Folder holding the per-sample input files (forward/reverse pairs)
    output_dir : str
        Folder where the file list and qsub scripts are written
    prefix_to_name : dict
        Maps each file run-prefix to its sample name; only the prefixes are
        used to find/label files, the names are currently unused
    url : str
        Qiita server URL, forwarded to the finish_qp_spades step
    job_id : str
        The job id, used to name the PBS jobs and log files
    params : dict
        Command parameters; reads "threads", "memory", "type", "k-mers"
        and "merging"

    Returns
    -------
    str, str
        Filepaths of the main array qsub script and the finish qsub script

    Raises
    ------
    ValueError
        If there are more than 1024 samples, if a prefix does not match
        exactly two files, or if reading a compressed file fails
    """
    environment = environ["ENVIRONMENT"]
    ppn = params["threads"]
    memory = params["memory"]

    # 1. create file list
    num_samples = len(prefix_to_name)
    if num_samples > 1024:
        # hard PBS array-size limit
        raise ValueError('This preparation has more than 1024 samples, '
                         'which is the limit; please split in multiple.')

    files = []
    for prefix, sample_name in prefix_to_name.items():
        fps = sorted(glob(join(directory, prefix + '*')))
        # this should never occur but better to confirm
        if len(fps) != 2:
            error_msg = f'Expected two files to match "{prefix}"'
            raise ValueError(error_msg)
        # one line per sample: forward \t reverse \t prefix
        files.append('\t'.join([fps[0], fps[1], prefix]))

    # 2. format main command; $OUTDIR/$SNAME are resolved by the qsub
    # script at run time, one sample per array task
    command = (f'spades.py --{params["type"]} -t {ppn} -m {memory} '
               f'-k {params["k-mers"]} -o $OUTDIR/$SNAME')
    if params['merging'].startswith('flash '):
        # get read length quickly; note that we are going to assume
        # that (1) the forward and reverse are the same length and (2)
        # all file pairs have the same length so only calculate once
        fp = glob(join(directory, list(prefix_to_name)[0] + '*'))[0]
        std_out, std_err, return_value = system_call(
            f'zcat -c {fp} | head -n 2')
        if return_value != 0:
            error_msg = (f"Error uncompressing: {fp}\n"
                         f"Std out: {std_out}\nStd err: {std_err}\n")
            raise ValueError(error_msg)
        read_length = len(std_out.split('\n')[1])
        # params['merging'] looks like 'flash <NN>%'; slice out NN
        percentage = int(params['merging'][6:-1]) / 100
        overlap = int(read_length * percentage)
        command = (
            # flash
            f'flash --threads {ppn} --max-overlap={overlap} '
            '--output-directory $OUTDIR '
            '--output-prefix="$SNAME" ${FWD} ${REV} '
            '--max-mismatch-density=0.1 > $OUTDIR/${SNAME}.flash.log 2>&1'
            ' && '
            # spades
            f'{command} '
            '--merge $OUTDIR/${SNAME}.extendedFrags.fastq '
            '-1 $OUTDIR/${SNAME}.notCombined_1.fastq '
            '-2 $OUTDIR/${SNAME}.notCombined_2.fastq')
    else:
        command = '%s -1 ${FWD} -2 ${REV}' % command

    # 3. create qsub for array submission; each array task pulls its
    # FWD/REV/SNAME triple from the matching line of files_to_process.txt
    mqsub = [
        '#!/bin/bash',
        '#PBS -M [email protected]',
        f'#PBS -N {job_id}',
        f'#PBS -l nodes=1:ppn={ppn}',
        f'#PBS -l walltime={WALLTIME}',
        f'#PBS -l mem={memory}g',
        f'#PBS -o {output_dir}/{job_id}' + '_${PBS_ARRAYID}.log',
        f'#PBS -e {output_dir}/{job_id}' + '_${PBS_ARRAYID}.err',
        f'#PBS -t 1-{num_samples}%{MAX_RUNNING}',
        '#PBS -l epilogue=/home/qiita/qiita-epilogue.sh',
        f'cd {output_dir}',
        f'{environment}',
        f'OUTDIR={output_dir}/',
        'date',
        'hostname',
        'echo ${PBS_JOBID} ${PBS_ARRAYID}',
        'offset=${PBS_ARRAYID}',
        'args=$(head -n $offset ${OUTDIR}/files_to_process.txt| tail -n 1)',
        "FWD=$(echo -e $args | awk '{ print $1 }')",
        "REV=$(echo -e $args | awk '{ print $2 }')",
        "SNAME=$(echo -e $args | awk '{ print $3 }')",
        f'{command}',
        'date'
    ]

    # 4. create qsub to finish job in Qiita
    fqsub = [
        '#!/bin/bash',
        '#PBS -M [email protected]',
        f'#PBS -N merge-{job_id}',
        '#PBS -l nodes=1:ppn=1',
        f'#PBS -l walltime={FINISH_WALLTIME}',
        f'#PBS -l mem={FINISH_MEMORY}',
        f'#PBS -o {output_dir}/finish-{job_id}.log',
        f'#PBS -e {output_dir}/finish-{job_id}.err',
        '#PBS -l epilogue=/home/qiita/qiita-epilogue.sh',
        f'cd {output_dir}',
        f'{environment}',
        'date',
        'hostname',
        'echo $PBS_JOBID',
        # NOTE: the two adjacent literals below are implicitly
        # concatenated into one element containing an embedded newline;
        # the rendered script is identical to two separate lines
        f'finish_qp_spades {url} {job_id} {output_dir}\n'
        "date"
    ]

    # write files
    with open(join(output_dir, 'files_to_process.txt'), 'w') as f:
        f.write('\n'.join(files))

    main_qsub_fp = join(output_dir, f'{job_id}.qsub')
    with open(main_qsub_fp, 'w') as job:
        job.write('\n'.join(mqsub))
        job.write('\n')

    finish_qsub_fp = join(output_dir, f'{job_id}.finish.qsub')
    with open(finish_qsub_fp, 'w') as job:
        job.write('\n'.join(fqsub))
        job.write('\n')

    return main_qsub_fp, finish_qsub_fp
def emperor(qclient, job_id, parameters, out_dir):
    """generate emperor plot calculations

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values for pcoa
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    boolean, list, str
        The results of the job
    """
    out_dir = join(out_dir, 'emperor')
    if not exists(out_dir):
        mkdir(out_dir)

    qclient.update_job_step(job_id, "Step 1 of 4: Collecting information")
    artifact_id = parameters['Ordination results']
    custom_axis = parameters['Custom axis']
    info = qclient.get("/qiita_db/artifacts/%s/" % artifact_id)
    ordination_fp = info['files']['plain_text'][0]

    # dump the analysis sample metadata for the emperor command below
    sample_info = qclient.get(
        "/qiita_db/analysis/%s/metadata/" % str(info['analysis']))
    metadata_fp = join(out_dir, 'metadata.txt')
    pd.DataFrame.from_dict(sample_info, orient='index').to_csv(
        metadata_fp, sep='\t')

    ordination_qza = join(out_dir, 'q2-pcoa.qza')
    viz_fp = join(out_dir, 'q2-emperor.qzv')

    qclient.update_job_step(
        job_id, "Step 2 of 4: Converting Qiita artifacts to Q2 artifact")
    cmd = ('qiime tools import --input-path %s --output-path %s '
           '--type "PCoAResults"' % (ordination_fp, ordination_qza))
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        return False, None, ("Error converting distance matrix:\nStd out: "
                             "%s\nStd err: %s" % (std_out, std_err))

    qclient.update_job_step(
        job_id, "Step 3 of 4: Generating Emperor plot")
    cmd = ('qiime emperor plot --i-pcoa %s --o-visualization %s '
           '--m-metadata-file %s' % (ordination_qza, viz_fp, metadata_fp))
    # only forward a custom axis when one was actually selected
    if custom_axis not in (None, 'None', ''):
        cmd += ' --p-custom-axis "%s"' % custom_axis
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        return False, None, ("Error in PCoA\nStd out: %s\nStd err: %s"
                             % (std_out, std_err))

    ainfo = [ArtifactInfo('Emperor visualization', 'q2_visualization',
                          [(viz_fp, 'qzv')])]
    return True, ainfo, ""
def _validate_multiple(qclient, job_id, prep_info, files, atype, test=False):
    """Validate and fix a new 'SFF', 'FASTQ', 'FASTA' or 'FASTA_Sanger' artifact

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    prep_info : dict of {str: dict of {str: str}}
        The prep information keyed by sample id
    files : dict of {str: list of str}
        The files to add to the new artifact, keyed by filepath type
    atype: str
        The type of the artifact
    test: boolean, optional
        If True this is being called by a test

    Returns
    -------
    boolean, list of qiita_client.ArtifactInfo, str
        The results of the job: success flag, artifact info list (or None
        on failure) and an error message (empty string on success)

    Raises
    ------
    ValueError
        If fqtools reports errors while counting sequences in any file
    """
    qclient.update_job_step(job_id, "Step 2: Validating '%s' files" % atype)
    req_fp_types, opt_fp_types = FILEPATH_TYPE_DICT[atype]
    all_fp_types = req_fp_types | opt_fp_types

    # Check if there is any filepath type that is not supported
    unsupported_fp_types = set(files) - all_fp_types
    if unsupported_fp_types:
        error_msg = ("Filepath type(s) %s not supported by artifact "
                     "type %s. Supported filepath types: %s"
                     % (', '.join(unsupported_fp_types), atype,
                        ', '.join(sorted(all_fp_types))))
        return False, None, error_msg

    # Check if the run_prefix column is present in the prep info
    offending = {}
    types_seen = set()
    if 'run_prefix' in prep_info[next(iter(prep_info))]:
        # We can potentially have more than one lane in the prep information
        # so check that the provided files are prefixed with the values in
        # the run_prefix column
        run_prefixes = set(v['run_prefix'] for k, v in prep_info.items())
        num_prefixes = len(run_prefixes)

        # Check those filepath types that are required
        for ftype, t_files in files.items():
            # SFF is an special case cause we can have multiple files with
            # the same prefix
            if num_prefixes != len(t_files) and atype != 'SFF':
                offending[ftype] = (
                    "The number of provided files (%d) doesn't match the "
                    "number of run prefix values in the prep info (%d): %s"
                    % (len(t_files), num_prefixes,
                       ', '.join(basename(f) for f in t_files)))
            else:
                # rps collects the matched prefixes, fps the files whose
                # basename matched no prefix at all
                rps = []
                fps = []
                for fp in t_files:
                    bn = basename(fp)
                    found = [rp for rp in run_prefixes if bn.startswith(rp)]
                    if found:
                        rps.extend(found)
                    else:
                        fps.append(bn)
                if fps:
                    offending[ftype] = (
                        "The provided files do not match the run prefix "
                        "values in the prep information: %s"
                        % ', '.join(fps))
                else:
                    # every file matched; also require every prefix to be
                    # covered by at least one file
                    rps = run_prefixes - set(rps)
                    if rps:
                        offending[ftype] = (
                            "The following run prefixes in the prep "
                            "information file do not match any file: %s"
                            % ', '.join(rps))

            types_seen.add(ftype)
    else:
        # If the run prefix column is not provided, we only allow a single
        # lane, so check that we have a single file for each provided
        # filepath type
        for ftype, t_files in files.items():
            if len(t_files) != 1:
                offending[ftype] = (
                    "Only one file per type is allowed. Please provide the "
                    "column 'run_prefix' if you need more than one file per "
                    "type: %s" % ', '.join(basename(fp) for fp in t_files))

            types_seen.add(ftype)

    # Check that all required filepath types where present
    missing = req_fp_types - types_seen
    if missing:
        error_msg = ("Missing required filepath type(s): %s"
                     % ', '.join(missing))
        return False, None, error_msg

    # Check if there was any offending file
    if offending:
        error_list = ["%s: %s" % (k, v) for k, v in offending.items()]
        error_msg = ("Error creating artifact. Offending files:\n%s"
                     % '\n'.join(error_list))
        return False, None, error_msg

    # Everything is ok
    filepaths = []
    for fps_type, fps in files.items():
        for fp in fps:
            # some filepath types must be stored gzipped
            if fps_type in MUST_GZ:
                fp, error_msg = _gzip_file(fp, test)
                if error_msg is not None:
                    return False, None, error_msg
            filepaths.append((fp, fps_type))

    # let's count sequences; this is basically the last check
    errors = []
    artifact_information = []
    if atype not in FILEPATH_TYPE_NO_FQTOOLS:
        for fp, fpt in filepaths:
            cmd = f'fqtools count {fp}'
            std_out, std_err, return_value = system_call(cmd)
            fn = basename(fp)
            if std_err or return_value != 0:
                errors.append(f'{fn}: {std_err}')
            else:
                reads = int(std_out)
                artifact_information.append({
                    'filename': fn, 'reads': reads, 'file_type': fpt})
        if errors:
            raise ValueError('Found errors: \n %s' % ''.join(errors))
        # write the per-file read counts next to the last processed file
        dname = dirname(fp)
        pd.DataFrame(artifact_information).to_csv(
            f'{dname}/qtp-sequencing-validate-data.csv', index=False)

    return True, [ArtifactInfo(None, atype, filepaths)], ""