import re
import string
import sys
from json import load
from os.path import join

from qp_deblur import get_data

# Lookup table used by relabel_newick(): keys are escaped node names, values
# are the names that should replace them in the tree.
with open(get_data(join('sepp', 'tmpl_gg13.8-99-revnamemap.json'))) as f:
    revnamemap = load(f)


def relabel_newick(newick_string):
    """Replace escaped node names in a Newick string by their mapped names

    Every token that starts with the prefix 'UQrYOlnDN' and runs up to the
    next Newick delimiter is looked up in the module level `revnamemap`.
    Tokens without an entry in `revnamemap` are left untouched.

    Parameters
    ----------
    newick_string : str
        The tree in Newick format, containing escaped node names.

    Returns
    -------
    str
        The Newick string with all known escaped names replaced. A
        replacement name that contains punctuation or whitespace is wrapped
        in single quotes; single quotes inside such a name are doubled, so
        that the quoted label stays well formed.
    """
    pattern = re.compile(r"(UQrYOlnDN[^(,:)<>]+)")
    # characters that force a label to be quoted
    invalidChars = set(string.punctuation).union(set(string.whitespace))

    def replace_func(m):
        escaped = m.group(1)
        if escaped not in revnamemap:
            # unknown token: keep the text exactly as it was matched
            return escaped
        repl = revnamemap[escaped]
        if any(char in invalidChars for char in repl):
            # a quote inside a quoted label must be written as two quotes,
            # otherwise the label would be terminated prematurely
            repl = "'%s'" % repl.replace("'", "''")
        return repl

    return pattern.sub(replace_func, newick_string)
def generate_insertion_trees(placements, out_dir,
                             reference_template=None, reference_rename=None):
    """Builds a phylogenetic insertion tree from placements and a reference

    Parameters
    ----------
    placements : dict of strings
        Maps each sequence to its placement. Every entry is appended to the
        placements of the reference template before guppy is executed.
    out_dir : str
        The job output directory. All intermediate files and the final tree
        are written here.
    reference_template : str, optional
        Filepath to the reference placement json file. This file can be
        produced via _generate_template_rename() and should be stored in the
        plugin package, because it can be re-used.
        If None, the Greengenes 13.8 99% reference is used.
    reference_rename : str, optional
        Like reference_template, but the filepath to the generated python
        renaming script that undoes the name escaping after guppy has run.
        If None, the Greengenes 13.8 99% reference is used.

    Returns
    -------
    str
        The filepath of the phylogenetic insertion tree in Newick format.

    Raises
    ------
    ValueError
        If a) the reference_template or reference_rename file does not exist
        b) or the guppy binary exits with a non-zero return code
        c) or the rename script exits with a non-zero return code.
    """
    # pick the rename script (packaged default unless overridden) and make
    # sure it is really there
    default_rename = qp_deblur.get_data(
        join('sepp', 'tmpl_gg13.8-99_rename-json.py'))
    rename_script = (default_rename if reference_rename is None
                     else reference_rename)
    if not exists(rename_script):
        raise ValueError("Reference rename script '%s' does not exits!" %
                         rename_script)

    # same procedure for the placement template
    default_template = qp_deblur.get_data(
        join('sepp', 'tmpl_gg13.8-99_placement.json'))
    template = (default_template if reference_template is None
                else reference_template)
    if not exists(template):
        raise ValueError("Reference template '%s' does not exits!" %
                         template)

    # compose a valid placements.json as input for guppy: the template plus
    # one additional entry per given sequence
    with open(template, 'r') as fh_template:
        guppy_input = json.load(fh_template)
    all_placements = guppy_input['placements']
    for sequence, placement in placements.items():
        all_placements.append({'p': placement, 'nm': [[sequence, 1]]})
    file_placements = '%s/placements.json' % out_dir
    with open(file_placements, 'w') as fh_placements:
        json.dump(guppy_input, fh_placements)

    # let guppy turn the placements into a tree
    escaped_tree = join(out_dir, 'insertion_tree.tre')
    guppy_cmd = 'guppy tog %s -o %s' % (file_placements, escaped_tree)
    std_out, std_err, return_value = system_call(guppy_cmd)
    if return_value != 0:
        raise ValueError("Error running guppy:\nStd out: %s\nStd err: %s"
                         % (std_out, std_err))

    # pipe the tree through the rename script to revert the escaping of
    # node names that was necessary for guppy
    file_tree = join(out_dir, 'insertion_tree.relabelled.tre')
    rename_cmd = 'cat %s | python %s > %s' % (
        escaped_tree, rename_script, file_tree)
    std_out, std_err, return_value = system_call(rename_cmd)
    if return_value != 0:
        raise ValueError("Error running %s:\nStd out: %s\nStd err: %s"
                         % (rename_script, std_out, std_err))

    # every branch of the final tree must carry a branch length; nodes
    # without one get a length of zero
    tree = TreeNode.read(file_tree)
    for branch in tree.preorder(include_self=False):
        if branch.length is None:
            branch.length = 0.0
    tree.write(file_tree)

    return file_tree
def deblur(qclient, job_id, parameters, out_dir):
    """Runs the deblur workflow for a job and places its reference hits

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values to run deblur. The 'Demultiplexed sequences'
        entry is removed from this dict.
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    boolean, list, str
        On success: True, the list of ArtifactInfo objects and an empty
        string. On failure: False, None and the error message.

    Notes
    -----
    The code will check if the artifact has a preprocessed_demux element, if
    not it will use the preprocessed_fastq. We prefer to work with the
    preprocessed_demux as running time will be greatly improved
    """
    archive_url = "/qiita_db/archive/observations/"
    out_dir = join(out_dir, 'deblur_out')

    # Step 1: collect the remaining information needed to run deblur
    qclient.update_job_step(job_id, "Step 1 of 4: Collecting information")
    # the input artifact must not end up in the final command, thus it is
    # taken out of the parameters
    artifact_id = parameters.pop('Demultiplexed sequences')
    fps = qclient.get("/qiita_db/artifacts/%s/" % artifact_id)['files']

    # Step 2: generate the deblur command
    if 'preprocessed_demux' not in fps:
        qclient.update_job_step(
            job_id, "Step 2 of 4: Generating deblur command")
        command = generate_deblur_workflow_commands(
            fps['preprocessed_fastq'], out_dir, parameters)
    else:
        qclient.update_job_step(
            job_id, "Step 2 of 4: Generating per sample from demux (1/2)")

        split_out_dir = join(out_dir, 'split')
        for directory in (out_dir, split_out_dir):
            if not exists(directory):
                mkdir(directory)

        # use as many parallel jobs as the command itself; [0] because there
        # should be only one demux file
        to_per_sample_files(fps['preprocessed_demux'][0],
                            out_dir=split_out_dir,
                            n_jobs=int(parameters['Jobs to start']))

        qclient.update_job_step(
            job_id, "Step 2 of 4: Generating per sample from demux (2/2)")
        out_dir = join(out_dir, 'deblured')
        command = generate_deblur_workflow_commands(
            [split_out_dir], out_dir, parameters)

    # Step 3: execute deblur
    qclient.update_job_step(job_id, "Step 3 of 4: Executing deblur job")
    std_out, std_err, return_value = system_call(command)
    if return_value != 0:
        return False, None, (
            "Error running deblur:\nStd out: %s\nStd err: %s"
            % (std_out, std_err))

    # filepaths of the files that make up the artifacts
    final_biom = join(out_dir, 'all.biom')
    final_seqs = join(out_dir, 'all.seqs.fa')
    final_biom_hit = join(out_dir, 'reference-hit.biom')
    final_seqs_hit = join(out_dir, 'reference-hit.seqs.fa')

    if not exists(final_biom_hit):
        # Qiita needs a valid BIOM file, so an empty table is written when
        # the reference-hit table is missing
        empty_table = Table([], [], [])
        with biom_open(final_biom_hit, 'w') as fh_biom:
            empty_table.to_hdf5(fh_biom, 'qp-deblur generated')

    if not exists(final_seqs_hit):
        # likewise, provide an empty sequence file
        with open(final_seqs_hit, 'w') as fh_seqs:
            fh_seqs.write("")

    # Step 4: talk to the archive to look up and generate placements
    qclient.update_job_step(
        job_id, "Step 4 of 4 (1/4): Retrieving observations information")
    features = list(load_table(final_biom_hit).ids(axis='observation'))

    fp_phylogeny = None
    new_placements = None
    if features:
        observations = qclient.post(
            archive_url, data={'job_id': job_id, 'features': features})
        novel_fragments = list(set(features) - set(observations.keys()))

        qclient.update_job_step(
            job_id, "Step 4 of 4 (2/4): Generating %d new placements"
            % len(novel_fragments))

        # Once alternative reference phylogenies for SEPP are supported, the
        # reference name has to be translated here into filepaths of the
        # matching reference alignment and reference tree. If left 'None'
        # the Greengenes 13.8 reference shipped with the fragment-insertion
        # conda package will be used.
        fp_reference_alignment = None
        fp_reference_phylogeny = None
        fp_reference_template = None
        fp_reference_rename = None
        if parameters.get('Reference phylogeny for SEPP') == 'tiny':
            fp_reference_alignment = qp_deblur.get_data(
                join('sepp', 'reference_alignment_tiny.fasta'))
            fp_reference_phylogeny = qp_deblur.get_data(
                join('sepp', 'reference_phylogeny_tiny.nwk'))
            fp_reference_template = qp_deblur.get_data(
                join('sepp', 'tmpl_tiny_placement.json'))
            fp_reference_rename = qp_deblur.get_data(
                join('sepp', 'tmpl_tiny_rename-json.py'))

        try:
            new_placements = generate_sepp_placements(
                novel_fragments, out_dir, parameters['Threads per sample'],
                reference_alignment=fp_reference_alignment,
                reference_phylogeny=fp_reference_phylogeny)
        except ValueError as err:
            return False, None, str(err)

        qclient.update_job_step(
            job_id, "Step 4 of 4 (3/4): Archiving %d new placements"
            % len(novel_fragments))
        # the archive expects the values to be json strings
        for fragment, placement in new_placements.items():
            new_placements[fragment] = json.dumps(placement)

        # Fragments rejected by SEPP are absent from the placement file.
        # The rejection itself is valuable information, so it is archived
        # as an empty string to avoid re-computation in the future.
        for fragment in novel_fragments:
            new_placements.setdefault(fragment, "")
        if new_placements:
            qclient.patch(url=archive_url, op="add", path=job_id,
                          value=json.dumps(new_placements))

        # retrieve all fragments and create the actual tree
        qclient.update_job_step(
            job_id,
            "Step 4 of 4 (4/4): Composing phylogenetic insertion tree")
        archived = qclient.post(
            archive_url, data={'job_id': job_id, 'features': features})
        # skip fragments rejected by SEPP (empty string) and decode all
        # other placements from their json strings
        placements = {fragment: json.loads(placement)
                      for fragment, placement in archived.items()
                      if placement != ''}
        try:
            fp_phylogeny = generate_insertion_trees(
                placements, out_dir,
                reference_template=fp_reference_template,
                reference_rename=fp_reference_rename)
        except ValueError as err:
            return False, None, str(err)

    ainfo = [ArtifactInfo('deblur final table', 'BIOM',
                          [(final_biom, 'biom'),
                           (final_seqs, 'preprocessed_fasta')])]
    if fp_phylogeny is not None:
        reference_hit_files = [(final_biom_hit, 'biom'),
                               (final_seqs_hit, 'preprocessed_fasta'),
                               (fp_phylogeny, 'plain_text')]
        ainfo.append(ArtifactInfo('deblur reference hit table', 'BIOM',
                                  reference_hit_files, new_placements))

    return True, ainfo, ""
import re
import string
import sys
from json import load
from os.path import join

from qp_deblur import get_data

# Lookup table used by relabel_newick(): keys are escaped node names, values
# are the names that should replace them in the tree.
with open(get_data(join('sepp', 'tmpl_tiny-revnamemap.json'))) as f:
    revnamemap = load(f)


def relabel_newick(newick_string):
    """Replace escaped node names in a Newick string by their mapped names

    Every token that starts with the prefix 'UQrYOlnDN' and runs up to the
    next Newick delimiter is looked up in the module level `revnamemap`.
    Tokens without an entry in `revnamemap` are left untouched.

    Parameters
    ----------
    newick_string : str
        The tree in Newick format, containing escaped node names.

    Returns
    -------
    str
        The Newick string with all known escaped names replaced. A
        replacement name that contains punctuation or whitespace is wrapped
        in single quotes; single quotes inside such a name are doubled, so
        that the quoted label stays well formed.
    """
    pattern = re.compile(r"(UQrYOlnDN[^(,:)<>]+)")
    # characters that force a label to be quoted
    invalidChars = set(string.punctuation).union(set(string.whitespace))

    def replace_func(m):
        escaped = m.group(1)
        if escaped not in revnamemap:
            # unknown token: keep the text exactly as it was matched
            return escaped
        repl = revnamemap[escaped]
        if any(char in invalidChars for char in repl):
            # a quote inside a quoted label must be written as two quotes,
            # otherwise the label would be terminated prematurely
            repl = "'%s'" % repl.replace("'", "''")
        return repl

    return pattern.sub(replace_func, newick_string)