def edit_taxonomies(i_datasets_folder: str, taxonomies: dict, force: bool,
                    prjct_nm: str, qiime_env: str, chmod: str, noloc: bool,
                    slurm: bool, run_params: dict, filt_raref: str,
                    jobs: bool, chunkit: int):
    job_folder = get_job_folder(i_datasets_folder, 'taxonomy')
    job_folder2 = get_job_folder(i_datasets_folder, 'taxonomy/chunks')
    main_written = 0
    to_chunk = []
    run_pbs = '%s/1_run_taxonomy_edit_%s%s.sh' % (job_folder, prjct_nm,
                                                  filt_raref)
    with open(run_pbs, 'w') as o:
        for dat, (_, qza, tsv) in taxonomies.items():
            if not isfile(tsv):
                continue
            written = 0
            out_pd = pd.read_csv(tsv, dtype=str, sep='\t')
            taxo = out_pd['Taxon'].tolist()
            taxo_edit = get_taxa_edit(taxo)
            if taxo != taxo_edit:
                out_pd['Taxon'] = taxo_edit
                out_pd.to_csv(tsv, index=False, sep='\t')
                cmd = run_import(tsv, qza, 'FeatureData[Taxonomy]')
                out_sh = '%s/run_taxonomy_edit_%s_%s%s.sh' % (
                    job_folder2, prjct_nm, dat, filt_raref)
                if slurm:
                    out_pbs = '%s.slm' % splitext(out_sh)[0]
                else:
                    out_pbs = '%s.pbs' % splitext(out_sh)[0]
                with open(out_sh, 'w') as cur_sh:
                    cur_sh.write('echo "%s"\n' % cmd)
                    cur_sh.write('%s\n\n' % cmd)
                    main_written += 1
                    written += 1
                if written:
                    to_chunk.append(out_sh)
                    if not chunkit:
                        run_xpbs(out_sh, out_pbs,
                                 '%s.tx.dt.%s%s' % (prjct_nm, dat, filt_raref),
                                 qiime_env, run_params["time"],
                                 run_params["n_nodes"], run_params["n_procs"],
                                 run_params["mem_num"], run_params["mem_dim"],
                                 chmod, written, 'single', o, noloc, slurm,
                                 jobs)
    if to_chunk and chunkit:
        simple_chunks(run_pbs, job_folder2, to_chunk, 'taxonomy_edit',
                      prjct_nm, run_params["time"], run_params["n_nodes"],
                      run_params["n_procs"], run_params["mem_num"],
                      run_params["mem_dim"], qiime_env, chmod, noloc,
                      slurm, jobs, chunkit, None)
    if main_written:
        print_message('# Edit features taxonomy to not contain "," characters',
                      'sh', run_pbs, jobs)

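# Illustrative sketch (not part of the original module): the kind of Taxon
# clean-up that edit_taxonomies() relies on. The real get_taxa_edit() helper
# is defined elsewhere in this package; the replacement rule below is only an
# assumption, to show why the edited column gets re-imported as
# FeatureData[Taxonomy].
def _example_taxon_comma_edit():
    """Sketch: strip "," characters from taxon strings with pandas."""
    taxa = pd.DataFrame({
        'Feature ID': ['f1', 'f2'],
        'Taxon': ['k__Bacteria; g__Prevotella,copri', 'k__Bacteria']})
    # hypothetical rule: drop commas so downstream parsers are not confused
    taxa['Taxon'] = taxa['Taxon'].str.replace(',', '', regex=False)
    return taxa
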
def run_qemistree(i_datasets_folder: str, datasets: dict, prjct_nm: str,
                  i_qemistree: str, taxonomies: dict, force: bool,
                  qiime_env: str, chmod: str, noloc: bool, slurm: bool,
                  run_params: dict, filt_raref: str, jobs: bool,
                  chunkit: int) -> None:
    """
    :param i_datasets_folder: Path to the folder containing the data/metadata subfolders.
    :param datasets: dataset -> [tsv table, meta table]
    :param prjct_nm: Short nick name for your project.
    :param i_qemistree: path to the qemistree folder (feature-data and tree).
    :param taxonomies: dataset -> [method, assignment qza]
    :param force: Force the re-writing of scripts for all commands.
    :param qiime_env: name of your qiime2 conda environment (e.g. qiime2-2019.10).
    :param chmod: whether to change permission of output files (default: 775).
    """
    job_folder = get_job_folder(i_datasets_folder, 'qemistree')
    job_folder2 = get_job_folder(i_datasets_folder, 'qemistree/chunks')
    written = 0
    to_chunk = []
    run_pbs = '%s/1_run_qemistree_%s%s.sh' % (job_folder, prjct_nm, filt_raref)
    with open(run_pbs, 'w') as o:
        for dat, tsv_meta_pds in datasets.items():
            feature_data = '%s/feature-data_%s.qza' % (i_qemistree, dat)
            qemistree = '%s/qemistree_%s.qza' % (i_qemistree, dat)
            if not isfile(feature_data) or not isfile(qemistree):
                continue
            out_sh = '%s/run_qemistree_%s_%s%s.sh' % (job_folder2, prjct_nm,
                                                      dat, filt_raref)
            if slurm:
                out_pbs = '%s.slm' % splitext(out_sh)[0]
            else:
                out_pbs = '%s.pbs' % splitext(out_sh)[0]
            odir = get_analysis_folder(i_datasets_folder, 'qemistree/%s' % dat)
            classyfire_qza = '%s/%s-classyfire.qza' % (odir, dat)
            classyfire_tsv = '%s.tsv' % splitext(classyfire_qza)[0]
            with open(out_sh, 'w') as cur_sh:
                if force or not isfile(classyfire_tsv):
                    write_qemistree(feature_data, classyfire_qza,
                                    classyfire_tsv, qemistree, cur_sh)
                    written += 1
                if isfile(classyfire_tsv):
                    odir = get_analysis_folder(i_datasets_folder,
                                               'taxonomy/%s' % dat)
                    out_rad = '%s/tax_%s' % (odir, dat)
                    tax_qza = '%s.qza' % out_rad
                    tax_tsv = '%s.tsv' % out_rad
                    classyfire_pd = pd.read_csv(classyfire_tsv, header=0,
                                                sep='\t')
                    # distinct handle name to avoid shadowing the run_pbs
                    # handle `o` opened above
                    with open(tax_tsv, 'w') as o_tax:
                        cols = ['id', 'kingdom', 'superclass', 'class',
                                'subclass', 'direct_parent']
                        o_tax.write('Feature ID\tTaxon\n')
                        for row in classyfire_pd[cols].values:
                            o_tax.write('%s\t%s\n' % (row[0],
                                                      '; '.join(row[1:])))
                    run_export(tax_tsv, tax_qza, 'FeatureData[Taxonomy]')
                    taxonomies[dat] = ['direct_parent', tax_qza]
                    written += 1
                else:
                    print('[Warning] Maybe run qemistree first and then '
                          're-run the pipeline to have the classyfire '
                          'taxonomy included in the barplots!')
            to_chunk.append(out_sh)
            if not chunkit:
                run_xpbs(out_sh, out_pbs,
                         '%s.qmstr.%s%s' % (prjct_nm, dat, filt_raref),
                         qiime_env, run_params["time"], run_params["n_nodes"],
                         run_params["n_procs"], run_params["mem_num"],
                         run_params["mem_dim"], chmod, written, 'single',
                         o, noloc, slurm, jobs)
    if to_chunk and chunkit:
        simple_chunks(run_pbs, job_folder2, to_chunk, 'qemistree', prjct_nm,
                      run_params["time"], run_params["n_nodes"],
                      run_params["n_procs"], run_params["mem_num"],
                      run_params["mem_dim"], qiime_env, chmod, noloc,
                      slurm, jobs, chunkit, None)
    if written:
        print_message('# Make qemistree classyfire classifications', 'sh',
                      run_pbs, jobs)

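# Sketch of the ClassyFire -> taxonomy conversion done in run_qemistree(),
# shown on a toy table (assumes only pandas; the column names mirror those
# used above).
def _example_classyfire_to_taxonomy(classyfire_pd: pd.DataFrame) -> pd.DataFrame:
    """Collapse ClassyFire levels into a 'Feature ID / Taxon' table."""
    cols = ['kingdom', 'superclass', 'class', 'subclass', 'direct_parent']
    # join the levels with '; ' per row, as the loop above does
    taxon = classyfire_pd[cols].astype(str).apply('; '.join, axis=1)
    return pd.DataFrame({'Feature ID': classyfire_pd['id'], 'Taxon': taxon})
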
def run_barplot(i_datasets_folder: str, datasets: dict, taxonomies: dict,
                force: bool, prjct_nm: str, qiime_env: str, chmod: str,
                noloc: bool, slurm: bool, run_params: dict, filt_raref: str,
                jobs: bool, chunkit: int) -> None:
    """Visualize taxonomy with an interactive bar plot.

    Parameters
    ----------
    i_datasets_folder : str
        Path to the folder containing the data/metadata subfolders
    datasets : dict
        Mapping dataset name -> [tsv file path, metadata file path]
    taxonomies : dict
        Mapping dataset name -> [classification method, tax_qza, tax_tsv]
    force : bool
        Force the re-writing of scripts for all commands
    prjct_nm : str
        Short nick name for your project
    qiime_env : str
        Name of a qiime2 conda environment
    chmod : str
        Whether to change permission of output files (default: 744)
    noloc : bool
    run_params : dict
    filt_raref : str
    jobs : bool
    chunkit : int
    """
    job_folder = get_job_folder(i_datasets_folder, 'barplot')
    job_folder2 = get_job_folder(i_datasets_folder, 'barplot/chunks')
    written = 0
    to_chunk = []
    run_pbs = '%s/1_run_barplot_%s%s.sh' % (job_folder, prjct_nm, filt_raref)
    with open(run_pbs, 'w') as o:
        for dat, tsv_meta_pds_ in datasets.items():
            out_sh = '%s/run_barplot_%s_%s%s.sh' % (job_folder2, prjct_nm,
                                                    dat, filt_raref)
            if slurm:
                out_pbs = '%s.slm' % splitext(out_sh)[0]
            else:
                out_pbs = '%s.pbs' % splitext(out_sh)[0]
            with open(out_sh, 'w') as cur_sh:
                for tsv_meta_pds in tsv_meta_pds_:
                    tsv, meta = tsv_meta_pds
                    if dat not in taxonomies:
                        continue
                    method, tax_qza, tax_tsv = taxonomies[dat]
                    if not method:
                        method = 'taxofromfile'
                    qza = '%s.qza' % splitext(tsv)[0]
                    odir = get_analysis_folder(i_datasets_folder,
                                               'barplot/%s' % dat)
                    out_qzv = '%s/bar_%s_%s.qzv' % (odir, dat, method)
                    if force or not isfile(out_qzv):
                        write_barplots(out_qzv, qza, meta, tax_qza, cur_sh)
                        written += 1
            to_chunk.append(out_sh)
            if not chunkit:
                run_xpbs(out_sh, out_pbs,
                         '%s.brplt.%s%s' % (prjct_nm, dat, filt_raref),
                         qiime_env, run_params["time"], run_params["n_nodes"],
                         run_params["n_procs"], run_params["mem_num"],
                         run_params["mem_dim"], chmod, written, 'single',
                         o, noloc, slurm, jobs)
    if to_chunk and chunkit:
        simple_chunks(run_pbs, job_folder2, to_chunk, 'barplot', prjct_nm,
                      run_params["time"], run_params["n_nodes"],
                      run_params["n_procs"], run_params["mem_num"],
                      run_params["mem_dim"], qiime_env, chmod, noloc,
                      slurm, jobs, chunkit, None)
    if written:
        print_message('# Make sample compositions barplots', 'sh', run_pbs,
                      jobs)

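# For context: write_barplots() writes a shell command into the per-dataset
# script opened above. A minimal sketch of such a command string, assuming the
# standard QIIME 2 "taxa" plugin CLI (this is an assumption, not the actual
# writer used by the package):
def _example_barplot_cmd(qza: str, tax_qza: str, meta: str,
                         out_qzv: str) -> str:
    """Sketch of a 'qiime taxa barplot' call string."""
    return ('qiime taxa barplot'
            ' --i-table %s'
            ' --i-taxonomy %s'
            ' --m-metadata-file %s'
            ' --o-visualization %s' % (qza, tax_qza, meta, out_qzv))
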
def run_taxonomy(method: str, i_datasets_folder: str, datasets: dict,
                 datasets_read: dict, datasets_phylo: dict,
                 datasets_features: dict, datasets_filt_map: dict,
                 i_classifier: str, taxonomies: dict, force: bool,
                 prjct_nm: str, qiime_env: str, chmod: str, noloc: bool,
                 slurm: bool, run_params: dict, filt_raref: str, jobs: bool,
                 chunkit: int) -> None:
    """
    Parameters
    ----------
    method
    i_datasets_folder : str
        Path to the folder containing the data/metadata subfolders.
    datasets : dict
        Mapping dataset name -> [data file path, metadata file path].
    datasets_read : dict
        Mapping dataset name -> [data table, metadata table]
    datasets_phylo : dict
        To be updated with ('tree_to_use', 'corrected_or_not') per dataset.
    datasets_features : dict
        Mapping dataset name -> list of feature names in the dataset
        tsv / biom file.
    datasets_filt_map : dict
    i_classifier : str
        Path to the taxonomic classifier.
    taxonomies : dict
        Mapping dataset name -> [method, assignment qza, assignment tsv]
    force : bool
        Force the re-writing of scripts for all commands.
    prjct_nm : str
        Short nick name for your project.
    qiime_env : str
        Name of your qiime2 conda environment (e.g. qiime2-2019.10).
    chmod : str
        Whether to change permission of output files (default: 744).
    noloc : bool
    run_params : dict
    filt_raref : str
    jobs : bool
    chunkit : int
    """
    job_folder = get_job_folder(i_datasets_folder, 'taxonomy')
    job_folder2 = get_job_folder(i_datasets_folder, 'taxonomy/chunks')
    amplicon_datasets = [dat for dat, (tree, correction)
                         in datasets_phylo.items() if tree == 'amplicon']
    wol_datasets = [dat for dat, (tree, correction)
                    in datasets_phylo.items() if tree == 'wol']
    main_written = 0
    to_chunk = []
    run_pbs = '%s/1_run_taxonomy_%s%s.sh' % (job_folder, prjct_nm, filt_raref)
    with open(run_pbs, 'w') as o:
        for dat, tsv_meta_pds_ in datasets_read.items():
            out_sh = '%s/run_taxonomy_%s_%s%s.sh' % (job_folder2, prjct_nm,
                                                     dat, filt_raref)
            if slurm:
                out_pbs = '%s.slm' % splitext(out_sh)[0]
            else:
                out_pbs = '%s.pbs' % splitext(out_sh)[0]
            if dat in datasets_filt_map:
                taxonomies[dat] = taxonomies[datasets_filt_map[dat]]
                continue
            written = 0
            with open(out_sh, 'w') as cur_sh:
                for idx, tsv_meta_pds in enumerate(tsv_meta_pds_):
                    if idx:
                        continue
                    tsv, meta = datasets[dat][idx]
                    if not isinstance(tsv_meta_pds[0], pd.DataFrame) and \
                            tsv_meta_pds[0] == 'raref':
                        if not isfile(tsv):
                            print('Must have run rarefaction to use it '
                                  'further...\nExiting')
                            sys.exit(0)
                        tsv_pd, meta_pd = get_raref_tab_meta_pds(meta, tsv)
                        datasets_read[dat][idx] = [tsv_pd, meta_pd]
                    else:
                        tsv_pd, meta_pd = tsv_meta_pds
                    odir = get_analysis_folder(i_datasets_folder,
                                               'taxonomy/%s' % dat)
                    out_rad = '%s/tax_%s' % (odir, dat)
                    if dat in amplicon_datasets:
                        out_qza = '%s_%s.qza' % (out_rad, method)
                        out_tsv = '%s.tsv' % splitext(out_qza)[0]
                        taxonomies[dat] = [method, out_qza, out_tsv]
                        if not i_classifier:
                            print('No classifier passed for 16S data'
                                  '\nExiting...')
                            continue
                        cmd = run_taxonomy_amplicon(
                            dat, i_datasets_folder, force, tsv_pd, out_qza,
                            out_tsv, i_classifier)
                    else:
                        out_qza = '%s.qza' % out_rad
                        out_tsv = '%s.tsv' % out_rad
                        if dat in wol_datasets:
                            cur_datasets_features = datasets_features[dat]
                            taxonomies[dat] = ['wol', out_qza, out_tsv]
                            cmd = run_taxonomy_wol(
                                force, tsv_pd, out_qza, out_tsv,
                                cur_datasets_features)
                        else:
                            if len([x for x in tsv_pd.index
                                    if str(x).isdigit()]) == tsv_pd.shape[0]:
                                continue
                            taxonomies[dat] = ['feat', out_qza, out_tsv]
                            cmd = run_taxonomy_others(
                                force, tsv_pd, out_qza, out_tsv)
                    if cmd:
                        cur_sh.write('echo "%s"\n' % cmd)
                        cur_sh.write('%s\n\n' % cmd)
                        main_written += 1
                        written += 1
            if written:
                to_chunk.append(out_sh)
                if not chunkit:
                    run_xpbs(out_sh, out_pbs,
                             '%s.tx.sklrn.%s%s' % (prjct_nm, dat, filt_raref),
                             qiime_env, run_params["time"],
                             run_params["n_nodes"], run_params["n_procs"],
                             run_params["mem_num"], run_params["mem_dim"],
                             chmod, written, 'single', o, noloc, slurm, jobs)
    if to_chunk and chunkit:
        simple_chunks(run_pbs, job_folder2, to_chunk, 'taxonomy', prjct_nm,
                      run_params["time"], run_params["n_nodes"],
                      run_params["n_procs"], run_params["mem_num"],
                      run_params["mem_dim"], qiime_env, chmod, noloc,
                      slurm, jobs, chunkit, None)
    if main_written:
        print_message('# Classify features using classify-sklearn', 'sh',
                      run_pbs, jobs)

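# The run_params dict passed to every writer in this module only needs the
# scheduler keys read by run_xpbs() / simple_chunks(). A minimal sketch
# (values are placeholders, not project defaults; units depend on the
# scheduler template):
_EXAMPLE_RUN_PARAMS = {
    'time': '4',       # walltime requested for the job
    'n_nodes': '1',    # number of nodes
    'n_procs': '4',    # processors per node
    'mem_num': '16',   # memory amount
    'mem_dim': 'gb',   # memory unit
}
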
def import_datasets(i_datasets_folder: str, datasets: dict,
                    datasets_phylo: dict, force: bool, prjct_nm: str,
                    qiime_env: str, chmod: str, noloc: bool,
                    run_params: dict, filt_raref: str, jobs: bool,
                    slurm: bool, chunkit: int) -> None:
    """Initial imports of the .tsv datasets into Qiime2 Artefacts.

    Parameters
    ----------
    i_datasets_folder : str
        Path to the folder containing the data/metadata subfolders
    datasets : dict
        Mapping dataset name -> [data file path, metadata file path]
    datasets_phylo : dict
        Mapping dataset name -> ('tree_to_use', 'corrected_or_not')
    force : bool
        Force the re-writing of scripts for all commands
    prjct_nm : str
        Nick name for the project
    qiime_env : str
        Name of the qiime2 conda environment where the analysis tools are
        installed
    chmod : str
    noloc : bool
    run_params : dict
    filt_raref : str
    jobs : bool
    chunkit : int
    """
    job_folder = get_job_folder(i_datasets_folder, 'import_tables')
    job_folder2 = get_job_folder(i_datasets_folder, 'import_tables/chunks')
    to_chunk = []
    main_written = 0
    run_pbs = '%s/0_run_import_%s%s.sh' % (job_folder, prjct_nm, filt_raref)
    with open(run_pbs, 'w') as o:
        for dat, tsv_meta_pds_ in datasets.items():
            written = 0
            out_sh = '%s/0_run_import_%s_%s%s.sh' % (job_folder2, prjct_nm,
                                                     dat, filt_raref)
            if slurm:
                out_pbs = '%s.slm' % splitext(out_sh)[0]
            else:
                out_pbs = '%s.pbs' % splitext(out_sh)[0]
            with open(out_sh, 'w') as cur_sh:
                for tsv_meta_pds in tsv_meta_pds_:  # REMOVE IF FIXED NOT KEPT
                    tsv, meta = tsv_meta_pds
                    qza = '%s.qza' % splitext(tsv)[0]
                    if datasets_phylo[dat][1]:
                        cmd = run_import(tsv, qza, 'FeatureTable[Frequency]')
                        cur_sh.write('echo "%s"\n' % cmd)
                        cur_sh.write('%s\n' % cmd)
                        written += 1
                    elif force or not isfile(qza):
                        cmd = run_import(tsv, qza, 'FeatureTable[Frequency]')
                        cur_sh.write('echo "%s"\n' % cmd)
                        cur_sh.write('%s\n' % cmd)
                        written += 1
            if written:
                main_written += 1
                to_chunk.append(out_sh)
                if not chunkit:
                    job_name = '%s.mprt.%s%s' % (prjct_nm, dat, filt_raref)
                    run_xpbs(out_sh, out_pbs, job_name, qiime_env,
                             run_params["time"], run_params["n_nodes"],
                             run_params["n_procs"], run_params["mem_num"],
                             run_params["mem_dim"], chmod, written,
                             'single', o, noloc, slurm, jobs)
    if to_chunk and chunkit:
        simple_chunks(run_pbs, job_folder2, to_chunk, 'imports', prjct_nm,
                      run_params["time"], run_params["n_nodes"],
                      run_params["n_procs"], run_params["mem_num"],
                      run_params["mem_dim"], qiime_env, chmod, noloc,
                      slurm, jobs, chunkit, None)
    if main_written:
        print_message('# Import tables to qiime2', 'sh', run_pbs, jobs)

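# Shapes of the two main inputs to import_datasets(), as the loop above
# consumes them: each dataset maps to a list of [tsv, meta] pairs, and
# datasets_phylo maps to a ('tree_to_use', 'corrected_or_not') pair whose
# second element is tested for truth. Paths and values below are placeholders.
def _example_import_inputs():
    """Sketch of the datasets / datasets_phylo mappings."""
    datasets = {
        'dataset1': [['data/tab_dataset1.tsv', 'metadata/meta_dataset1.tsv']]}
    datasets_phylo = {
        'dataset1': ('', 0)}
    return datasets, datasets_phylo
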
def run_rarefy(i_datasets_folder: str, datasets: dict, datasets_read: dict,
               datasets_phylo: dict, datasets_filt_map: dict,
               datasets_rarefs: dict, p_raref_depths: str, eval_rarefs: bool,
               force: bool, prjct_nm: str, qiime_env: str, chmod: str,
               noloc: bool, run_params: dict, filt_raref: str,
               filt_only: bool, jobs: bool, slurm: bool,
               chunkit: int) -> dict:
    """
    Run rarefy: Rarefy table.
    https://docs.qiime2.org/2019.10/plugins/available/feature-table/rarefy/

    :param i_datasets_folder: Path to the folder containing the data/metadata subfolders.
    :param datasets: dataset -> [tsv/biom path, meta path]
    :param datasets_read: dataset -> [tsv table, meta table]
    :param datasets_phylo: to be updated with ('tree_to_use', 'corrected_or_not') per dataset.
    :param force: Force the re-writing of scripts for all commands.
    :param prjct_nm: Nick name for your project.
    :param qiime_env: qiime2-xxxx.xx conda environment.
    :param chmod: whether to change permission of output files (default: 775).
    :return: rarefaction depths per dataset (used in evaluation mode).
    """
    evaluation = ''
    eval_depths = {}
    datasets_raref_depths, datasets_raref_evals = check_rarefy_need(
        i_datasets_folder, datasets_read, p_raref_depths)
    if eval_rarefs:
        evaluation = '_eval'
    set_filt_rarefy(datasets_raref_depths, datasets_filt_map)
    datasets_update = {}
    datasets_read_update = {}
    datasets_phylo_update = {}
    datasets_append = {}
    main_written = 0
    job_folder = get_job_folder(i_datasets_folder, 'rarefy%s' % evaluation)
    job_folder2 = get_job_folder(i_datasets_folder,
                                 'rarefy%s/chunks' % evaluation)
    to_chunk = []
    run_pbs = '%s/1_run_rarefy_%s%s%s.sh' % (job_folder, prjct_nm,
                                             evaluation, filt_raref)
    with open(run_pbs, 'w') as o:
        for dat, tsv_meta_pds_ in datasets.items():
            written = 0
            if dat not in datasets_raref_depths:
                continue
            if filt_only and dat not in datasets_filt_map:
                continue
            odir = get_analysis_folder(i_datasets_folder,
                                       'rarefy%s/%s' % (evaluation, dat))
            out_sh = '%s/run_rarefy_%s%s_%s.sh' % (job_folder2, prjct_nm,
                                                   evaluation, dat)
            if slurm:
                out_pbs = '%s.slm' % splitext(out_sh)[0]
            else:
                out_pbs = '%s.pbs' % splitext(out_sh)[0]
            with open(out_sh, 'w') as cur_sh:
                depths = datasets_raref_depths[dat][1]
                if eval_rarefs:
                    depths = datasets_raref_evals[dat]
                tsv_pd, meta_pd = datasets_read[dat][0]
                tsv_sums = tsv_pd.sum()
                for tsv_meta_pds in tsv_meta_pds_:
                    tsv, meta = tsv_meta_pds
                    for depth_ in depths:
                        depth = get_digit_depth(depth_, tsv_sums)
                        dat_raref = '%s_raref%s%s' % (dat, evaluation,
                                                      str(depth))
                        meta_out = '%s/meta_%s.tsv' % (odir, dat_raref)
                        remaining_samples = tsv_sums[
                            tsv_sums >= depth].index.tolist()
                        meta_raref_pd = meta_pd.loc[
                            meta_pd.sample_name.isin(remaining_samples), :]
                        meta_raref_pd.to_csv(meta_out, index=False, sep='\t')
                        qza = tsv.replace('.tsv', '.qza')
                        qza_out = '%s/tab_%s.qza' % (odir, dat_raref)
                        tsv_out = '%s.tsv' % splitext(qza_out)[0]
                        if force or not os.path.isfile(tsv_out):
                            cmd = write_rarefy(qza, qza_out, depth)
                            cur_sh.write('echo "%s"\n' % cmd)
                            cur_sh.write('%s\n\n' % cmd)
                            cmd = run_export(qza_out, tsv_out,
                                             'FeatureTable[Frequency]')
                            cur_sh.write('echo "%s"\n' % cmd)
                            cur_sh.write('%s\n\n' % cmd)
                            main_written += 1
                            written += 1
                        if eval_rarefs:
                            eval_depths.setdefault(dat, []).append(
                                '%s_%s' % (dat, str(depth)))
                            datasets_update['%s_%s' % (dat, str(depth))] = [
                                [tsv_out, meta_out]]
                            datasets_read_update[
                                '%s_%s' % (dat, str(depth))] = (
                                'raref', str(depth))
                            datasets_phylo_update[
                                '%s_%s' % (dat, str(depth))] = \
                                datasets_phylo[dat]
                        else:
                            datasets_append.setdefault(dat, []).append(
                                [tsv_out, meta_out])
                            if isfile(tsv_out) and isfile(meta_out):
                                tab_filt_pd = pd.read_csv(
                                    tsv_out, index_col=0, header=0, sep='\t')
                                with open(meta_out) as f:
                                    for line in f:
                                        break
                                meta_filt_pd = pd.read_csv(
                                    meta_out, header=0, sep='\t',
                                    dtype={line.split('\t')[0]: str},
                                    low_memory=False)
                                datasets_read[dat].append(
                                    [tab_filt_pd, meta_filt_pd])
                            else:
                                datasets_read[dat].append(
                                    ('raref', str(depth)))
                            datasets_rarefs.setdefault(dat, []).append(
                                '_raref%s%s' % (evaluation, str(depth)))
            to_chunk.append(out_sh)
            if not chunkit:
                run_xpbs(out_sh, out_pbs,
                         '%s.bt%s.%s%s' % (prjct_nm, evaluation, dat,
                                           filt_raref),
                         qiime_env, run_params["time"], run_params["n_nodes"],
                         run_params["n_procs"], run_params["mem_num"],
                         run_params["mem_dim"], chmod, written, 'single',
                         o, noloc, slurm, jobs)
    if to_chunk and chunkit:
        simple_chunks(run_pbs, job_folder2, to_chunk,
                      'rarefy%s' % evaluation, prjct_nm,
                      run_params["time"], run_params["n_nodes"],
                      run_params["n_procs"], run_params["mem_num"],
                      run_params["mem_dim"], qiime_env, chmod, noloc,
                      slurm, jobs, chunkit, None)
    if main_written:
        print_message('# Get rarefied datasets', 'sh', run_pbs, jobs)
    if eval_rarefs:
        datasets.update(datasets_update)
        datasets_read.update(datasets_read_update)
        datasets_phylo.update(datasets_phylo_update)
    else:
        for dat, fps in datasets_append.items():
            datasets[dat].extend(fps)
    return eval_depths

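# The rarefaction bookkeeping above keeps only samples whose total count is at
# least the chosen depth (tsv_sums >= depth). A toy illustration with pandas
# (feature x sample table, as in tsv_pd above); names are placeholders:
def _example_samples_kept_at_depth(depth: int = 1000):
    """Sketch: which samples survive a given rarefaction depth."""
    tsv_pd = pd.DataFrame({'sampleA': [600, 500], 'sampleB': [300, 200]},
                          index=['feat1', 'feat2'])
    tsv_sums = tsv_pd.sum()  # per-sample totals: sampleA=1100, sampleB=500
    return tsv_sums[tsv_sums >= depth].index.tolist()  # -> ['sampleA']
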
def run_volatility(i_datasets_folder: str, datasets: dict,
                   p_longi_column: str, datasets_rarefs: dict, force: bool,
                   prjct_nm: str, qiime_env: str, chmod: str, noloc: bool,
                   slurm: bool, run_params: dict, filt_raref: str,
                   jobs: bool, chunkit: int) -> None:
    """
    Run volatility: Generate interactive volatility plot.
    https://docs.qiime2.org/2019.10/plugins/available/longitudinal/volatility/

    :param i_datasets_folder: Path to the folder containing the data/metadata subfolders.
    :param datasets: list of datasets.
    :param p_longi_column: metadata column that is the time stratification.
    :param force: Force the re-writing of scripts for all commands.
    :param prjct_nm: Nick name for your project.
    :param qiime_env: qiime2-xxxx.xx conda environment.
    :param chmod: whether to change permission of output files (default: 775).
    """
    job_folder = get_job_folder(i_datasets_folder, 'longitudinal')
    job_folder2 = get_job_folder(i_datasets_folder, 'longitudinal/chunks')
    main_written = 0
    first_print = 0
    first_print2 = 0
    to_chunk = []
    run_pbs = '%s/5_run_volatility_%s%s.sh' % (job_folder, prjct_nm,
                                               filt_raref)
    with open(run_pbs, 'w') as o:
        for dat, tsv_meta_pds_ in datasets.items():
            written = 0
            out_sh = '%s/run_volatility_%s_%s%s.sh' % (job_folder2, prjct_nm,
                                                       dat, filt_raref)
            if slurm:
                out_pbs = '%s.slm' % splitext(out_sh)[0]
            else:
                out_pbs = '%s.pbs' % splitext(out_sh)[0]
            with open(out_sh, 'w') as cur_sh:
                for idx, tsv_meta_pds in enumerate(tsv_meta_pds_):
                    tsv, meta = tsv_meta_pds
                    cur_raref = datasets_rarefs[dat][idx]
                    meta_alphas = '%s_alphas.tsv' % splitext(meta)[0]
                    if not isfile(meta_alphas):
                        if not first_print:
                            print('\nWarning: First make sure you run the '
                                  'alpha -> alpha merge/export '
                                  '(2_run_merge_alphas.sh) before running '
                                  'volatility\n\t(if you need the alpha '
                                  'diversity as a response variable)!')
                            first_print += 1
                        continue
                    with open(meta) as f:
                        for line in f:
                            break
                    # guard against a missing match instead of indexing an
                    # empty list
                    time_points = [x for x in line.strip().split('\t')
                                   if p_longi_column in x]
                    if not time_points:
                        if not first_print2:
                            print('Variable %s not in metadata %s\n' % (
                                p_longi_column, meta_alphas))
                            first_print2 += 1
                        continue
                    time_point = time_points[0]
                    odir = get_analysis_folder(
                        i_datasets_folder,
                        'longitudinal/%s%s' % (dat, cur_raref))
                    out_fp = '%s/%s_volatility.qzv' % (odir, dat)
                    if force or not isfile(out_fp):
                        write_longitudinal_volatility(out_fp, meta_alphas,
                                                      time_point, cur_sh)
                        written += 1
                        main_written += 1
            to_chunk.append(out_sh)
            if not chunkit:
                run_xpbs(out_sh, out_pbs,
                         '%s.vltlt.%s%s' % (prjct_nm, dat, filt_raref),
                         qiime_env, run_params["time"], run_params["n_nodes"],
                         run_params["n_procs"], run_params["mem_num"],
                         run_params["mem_dim"], chmod, written, 'single',
                         o, noloc, slurm, jobs)
    if to_chunk and chunkit:
        simple_chunks(run_pbs, job_folder2, to_chunk, 'volatility', prjct_nm,
                      run_params["time"], run_params["n_nodes"],
                      run_params["n_procs"], run_params["mem_num"],
                      run_params["mem_dim"], qiime_env, chmod, noloc,
                      slurm, jobs, chunkit, None)
    if main_written:
        print_message('# Longitudinal change in alpha diversity indices',
                      'sh', run_pbs, jobs)

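# The volatility step only needs the name of the metadata column that matches
# p_longi_column; it is looked up in the header line of the metadata file, as
# sketched here on a plain header string (toy values):
def _example_find_time_column(header_line: str = 'sample_name\ttime_days\tgroup',
                              p_longi_column: str = 'time'):
    """Sketch: return the first header field containing p_longi_column."""
    matches = [x for x in header_line.strip().split('\t')
               if p_longi_column in x]
    return matches[0] if matches else None  # -> 'time_days'
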
def run_alpha_rarefaction(i_datasets_folder: str, datasets: dict,
                          datasets_rarefs: dict, datasets_phylo: dict,
                          trees: dict, force: bool, prjct_nm: str,
                          qiime_env: str, chmod: str, noloc: bool,
                          slurm: bool, As: tuple, run_params: dict,
                          filt_raref: str, jobs: bool, chunkit: int) -> None:
    """
    Run alpha-rarefaction: Alpha rarefaction curves.
    https://docs.qiime2.org/2019.10/plugins/available/diversity/alpha-rarefaction/
    """
    alpha_metrics = get_metrics('alpha_metrics', As)
    job_folder = get_job_folder(i_datasets_folder, 'alpha_rarefaction')
    job_folder2 = get_job_folder(i_datasets_folder,
                                 'alpha_rarefaction/chunks')
    main_written = 0
    run_pbs = '%s/4_run_alpha_rarefaction_%s%s.sh' % (job_folder, prjct_nm,
                                                      filt_raref)
    to_chunk = []
    with open(run_pbs, 'w') as o:
        for dat, tsv_meta_pds_ in datasets.items():
            written = 0
            out_sh = '%s/run_alpha_rarefaction_%s_%s%s.sh' % (
                job_folder2, prjct_nm, dat, filt_raref)
            if slurm:
                out_pbs = '%s.slm' % splitext(out_sh)[0]
            else:
                out_pbs = '%s.pbs' % splitext(out_sh)[0]
            with open(out_sh, 'w') as cur_sh:
                for idx, tsv_meta_pds in enumerate(tsv_meta_pds_):
                    tsv, meta = tsv_meta_pds
                    qza = '%s.qza' % splitext(tsv)[0]
                    cur_raref = datasets_rarefs[dat][idx]
                    odir = get_analysis_folder(
                        i_datasets_folder,
                        'alpha_rarefaction/%s%s' % (dat, cur_raref))
                    for metric in alpha_metrics:
                        out_fp = '%s/rarefcurve_%s%s_%s.qzv' % (
                            odir, dat, cur_raref, metric)
                        if force or not isfile(out_fp):
                            if write_diversity_alpha_rarefaction(
                                    out_fp, qza, metric, datasets_phylo,
                                    trees, dat, meta, cur_sh):
                                continue
                            written += 1
                            main_written += 1
            to_chunk.append(out_sh)
            if not chunkit:
                run_xpbs(out_sh, out_pbs,
                         '%s.lphrrf.%s%s' % (prjct_nm, dat, filt_raref),
                         qiime_env, run_params["time"], run_params["n_nodes"],
                         run_params["n_procs"], run_params["mem_num"],
                         run_params["mem_dim"], chmod, written, 'single',
                         o, noloc, slurm, jobs)
    if to_chunk and chunkit:
        simple_chunks(run_pbs, job_folder2, to_chunk, 'alpha_rarefaction',
                      prjct_nm, run_params["time"], run_params["n_nodes"],
                      run_params["n_procs"], run_params["mem_num"],
                      run_params["mem_dim"], qiime_env, chmod, noloc,
                      slurm, jobs, chunkit, None)
    if main_written:
        print_message('# Compute rarefaction curve on alpha diversity '
                      'indices', 'sh', run_pbs, jobs)

def run_correlations(i_datasets_folder: str, datasets: dict,
                     diversities: dict, datasets_rarefs: dict, force: bool,
                     prjct_nm: str, qiime_env: str, chmod: str, noloc: bool,
                     slurm: bool, run_params: dict, filt_raref: str,
                     jobs: bool, chunkit: int) -> None:
    """
    Run alpha-correlation: Alpha diversity correlation.
    https://docs.qiime2.org/2019.10/plugins/available/diversity/alpha-correlation/

    :param i_datasets_folder: Path to the folder containing the data/metadata subfolders.
    :param datasets: list of datasets.
    :param diversities: alpha diversity qiime2 Artifacts per dataset.
    :param force: Force the re-writing of scripts for all commands.
    :param prjct_nm: Nick name for your project.
    :param qiime_env: qiime2-xxxx.xx conda environment.
    :param chmod: whether to change permission of output files (default: 775).
    """
    job_folder = get_job_folder(i_datasets_folder, 'alpha_correlations')
    job_folder2 = get_job_folder(i_datasets_folder,
                                 'alpha_correlations/chunks')
    main_written = 0
    run_pbs = '%s/4_run_alpha_correlation_%s%s.sh' % (job_folder, prjct_nm,
                                                      filt_raref)
    to_chunk = []
    with open(run_pbs, 'w') as o:
        for dat, tsv_meta_pds_ in datasets.items():
            if dat not in diversities:
                continue
            written = 0
            out_sh = '%s/run_alpha_correlation_%s_%s%s.sh' % (
                job_folder2, prjct_nm, dat, filt_raref)
            if slurm:
                out_pbs = '%s.slm' % splitext(out_sh)[0]
            else:
                out_pbs = '%s.pbs' % splitext(out_sh)[0]
            with open(out_sh, 'w') as cur_sh:
                for idx, tsv_meta_pds in enumerate(tsv_meta_pds_):
                    tsv, meta = tsv_meta_pds
                    cur_raref = datasets_rarefs[dat][idx]
                    for method in ['spearman', 'pearson']:
                        for group, divs in diversities[dat][idx].items():
                            if group:
                                odir = get_analysis_folder(
                                    i_datasets_folder,
                                    'alpha_correlations/%s%s/%s' % (
                                        dat, cur_raref, group))
                            else:
                                odir = get_analysis_folder(
                                    i_datasets_folder,
                                    'alpha_correlations/%s%s' % (
                                        dat, cur_raref))
                            for qza in [x[0] for x in divs]:
                                out_fp = '%s/alpha_corr_%s' % (
                                    odir, basename(qza).replace(
                                        '.qza', '_%s.qzv' % method))
                                if force or not isfile(out_fp):
                                    write_diversity_alpha_correlation(
                                        out_fp, qza, method, meta, cur_sh)
                                    written += 1
                                    main_written += 1
            to_chunk.append(out_sh)
            if not chunkit:
                run_xpbs(out_sh, out_pbs,
                         '%s.lphcrr.%s%s' % (prjct_nm, dat, filt_raref),
                         qiime_env, run_params["time"], run_params["n_nodes"],
                         run_params["n_procs"], run_params["mem_num"],
                         run_params["mem_dim"], chmod, written, 'single',
                         o, noloc, slurm, jobs)
    if to_chunk and chunkit:
        simple_chunks(run_pbs, job_folder2, to_chunk, 'alpha_correlations',
                      prjct_nm, run_params["time"], run_params["n_nodes"],
                      run_params["n_procs"], run_params["mem_num"],
                      run_params["mem_dim"], qiime_env, chmod, noloc,
                      slurm, jobs, chunkit, None)
    if main_written:
        print_message('# Correlate numeric metadata variables with alpha '
                      'diversity indices', 'sh', run_pbs, jobs)

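# Naming convention used above for the correlation visualizations: the alpha
# diversity artifact name is reused with the method as a suffix. A small
# sketch (the file name is a placeholder):
def _example_alpha_corr_name(qza: str = 'tab_dataset1_shannon.qza',
                             method: str = 'spearman') -> str:
    """Sketch: 'alpha_corr_' + basename with a '_<method>.qzv' suffix."""
    return 'alpha_corr_%s' % basename(qza).replace('.qza', '_%s.qzv' % method)
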
def run_alpha(i_datasets_folder: str, datasets: dict, datasets_read: dict,
              datasets_phylo: dict, datasets_rarefs: dict,
              p_alpha_subsets: str, trees: dict, force: bool, prjct_nm: str,
              qiime_env: str, chmod: str, noloc: bool, slurm: bool,
              As: tuple, dropout: bool, run_params: dict, filt_raref: str,
              eval_depths: dict, jobs: bool, chunkit: int) -> dict:
    """
    Computes the alpha diversity vectors for each dataset.

    :param i_datasets_folder: Path to the folder containing the data/metadata subfolders.
    :param datasets: dataset -> [tsv, meta]
    :param datasets_read: dataset -> [tsv table, meta table]
    :param datasets_phylo: to be updated with ('tree_to_use', 'corrected_or_not') per dataset.
    :param p_alpha_subsets: Subsets for alpha diversity.
    :param trees: to be updated with the tree to use for a dataset's phylogenetic analyses.
    :param force: Force the re-writing of scripts for all commands.
    :param prjct_nm: Short nick name for your project.
    :param qiime_env: name of your qiime2 conda environment (e.g. qiime2-2019.10).
    :param chmod: whether to change permission of output files (default: 775).
    :return: dataset -> list of dicts (one per rarefaction), each mapping a
             subset name ('' for the full table) to [(alpha qza, metric), ...]
    """
    evaluation = ''
    if len(eval_depths):
        evaluation = '_eval'
    alpha_metrics = get_metrics('alpha_metrics', As)
    alpha_subsets = read_yaml_file(p_alpha_subsets)
    job_folder = get_job_folder(i_datasets_folder, 'alpha%s' % evaluation)
    job_folder2 = get_job_folder(i_datasets_folder,
                                 'alpha%s/chunks' % evaluation)
    diversities = {}
    run_pbs = '%s/1_run_alpha_%s%s%s.sh' % (job_folder, prjct_nm, evaluation,
                                            filt_raref)
    main_written = 0
    to_chunk = []
    with open(run_pbs, 'w') as o:
        for dat, tsv_meta_pds_ in datasets.items():
            written = 0
            diversities[dat] = []
            out_sh = '%s/run_alpha_%s%s_%s%s.sh' % (job_folder2, prjct_nm,
                                                    evaluation, dat,
                                                    filt_raref)
            if slurm:
                out_pbs = '%s.slm' % splitext(out_sh)[0]
            else:
                out_pbs = '%s.pbs' % splitext(out_sh)[0]
            with open(out_sh, 'w') as cur_sh:
                for idx, tsv_meta_pds in enumerate(tsv_meta_pds_):
                    tsv, meta = tsv_meta_pds
                    if not isinstance(datasets_read[dat][idx][0],
                                      pd.DataFrame) and \
                            datasets_read[dat][idx][0] == 'raref':
                        if not isfile(tsv):
                            print('Must have run rarefaction to use it '
                                  'further...\nExiting')
                            sys.exit(0)
                        tsv_pd, meta_pd = get_raref_tab_meta_pds(meta, tsv)
                        datasets_read[dat][idx] = [tsv_pd, meta_pd]
                    else:
                        tsv_pd, meta_pd = datasets_read[dat][idx]
                    cur_raref = datasets_rarefs[dat][idx]
                    qza = '%s.qza' % splitext(tsv)[0]
                    divs = {}
                    for metric in alpha_metrics:
                        odir = get_analysis_folder(
                            i_datasets_folder,
                            'alpha/%s%s' % (dat, cur_raref))
                        out_fp = '%s/%s_%s.qza' % (
                            odir, basename(splitext(qza)[0]), metric)
                        out_tsv = '%s.tsv' % splitext(out_fp)[0]
                        if force or not isfile(out_fp):
                            ret_continue = write_diversity_alpha(
                                out_fp, datasets_phylo, trees, dat, qza,
                                metric, cur_sh, qiime_env)
                            if ret_continue:
                                continue
                            cmd = run_export(out_fp, out_tsv, '')
                            cur_sh.write('echo "%s"\n' % cmd)
                            cur_sh.write('%s\n\n' % cmd)
                            written += 1
                            main_written += 1
                        divs.setdefault('', []).append((out_fp, metric))
                    if alpha_subsets and dat in alpha_subsets:
                        for subset, subset_regex in alpha_subsets[dat].items():
                            odir = get_analysis_folder(
                                i_datasets_folder,
                                'alpha/%s%s/%s' % (dat, cur_raref, subset))
                            if dropout:
                                qza_subset_ = '%s/%s_%s.qza' % (
                                    odir, basename(splitext(qza)[0]), subset)
                            else:
                                qza_subset_ = '%s/%s_%s_noDropout.qza' % (
                                    odir, basename(splitext(qza)[0]), subset)
                            feats_subset = '%s.meta' % splitext(qza_subset_)[0]
                            feats = get_subset(tsv_pd, subset_regex)
                            if not len(feats):
                                continue
                            subset_pd = pd.DataFrame({
                                'Feature ID': feats,
                                'Subset': [subset] * len(feats)})
                            subset_pd.to_csv(feats_subset, index=False,
                                             sep='\t')
                            write_filter_features(tsv_pd, feats, qza,
                                                  qza_subset_, feats_subset,
                                                  cur_sh, dropout)
                            for metric in alpha_metrics:
                                if metric in ['faith_pd'] and \
                                        datasets_phylo[dat][1] and \
                                        dat in trees:
                                    tree_in_qza = trees[dat][0]
                                    tree_in_tsv = '%s.tsv' % splitext(
                                        tree_in_qza)[0]
                                    if dropout:
                                        qza_subset = '%s/%s_%s.qza' % (
                                            odir,
                                            basename(
                                                splitext(tree_in_qza)[0]),
                                            subset)
                                    else:
                                        qza_subset = \
                                            '%s/%s_%s_noDropout.qza' % (
                                                odir,
                                                basename(splitext(
                                                    tree_in_qza)[0]),
                                                subset)
                                    write_filter_features(
                                        pd.read_csv(tree_in_tsv, header=0,
                                                    index_col=0, sep='\t'),
                                        feats, tree_in_qza, qza_subset,
                                        feats_subset, cur_sh, dropout)
                                else:
                                    qza_subset = qza_subset_
                                out_fp = '%s/%s__%s.qza' % (
                                    odir, basename(splitext(qza_subset)[0]),
                                    metric)
                                out_tsv = '%s.tsv' % splitext(out_fp)[0]
                                if force or not isfile(out_fp):
                                    ret_continue = write_diversity_alpha(
                                        out_fp, {dat: [1, 0]}, trees, dat,
                                        qza_subset, metric, cur_sh,
                                        qiime_env)
                                    if ret_continue:
                                        continue
                                    cmd = run_export(out_fp, out_tsv, '')
                                    cur_sh.write('echo "%s"\n' % cmd)
                                    cur_sh.write('%s\n\n' % cmd)
                                    written += 1
                                    main_written += 1
                                divs.setdefault(subset, []).append(
                                    (out_fp, metric))
                    diversities[dat].append(divs)
            to_chunk.append(out_sh)
            if not chunkit:
                run_xpbs(out_sh, out_pbs,
                         '%s.mg.lph%s.%s%s' % (prjct_nm, evaluation, dat,
                                               filt_raref),
                         qiime_env, run_params["time"], run_params["n_nodes"],
                         run_params["n_procs"], run_params["mem_num"],
                         run_params["mem_dim"], chmod, written, 'single',
                         o, noloc, slurm, jobs)
    if to_chunk and chunkit:
        simple_chunks(run_pbs, job_folder2, to_chunk, 'alpha', prjct_nm,
                      run_params["time"], run_params["n_nodes"],
                      run_params["n_procs"], run_params["mem_num"],
                      run_params["mem_dim"], qiime_env, chmod, noloc,
                      slurm, jobs, chunkit, None)
    if main_written:
        print_message('# Calculate alpha diversity indices', 'sh', run_pbs,
                      jobs)
    return diversities

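# Shape of the mapping returned by run_alpha() and consumed by
# run_correlations() / merge_meta_alpha(): one dict per (possibly rarefied)
# table, keyed by subset name ('' for the full table), holding (qza, metric)
# tuples. Paths and names below are placeholders:
_EXAMPLE_DIVERSITIES = {
    'dataset1': [
        {'': [('alpha/dataset1/tab_dataset1_shannon.qza', 'shannon')],
         'subset1': [('alpha/dataset1/subset1/tab_dataset1_subset1__shannon.qza',
                      'shannon')]},
    ],
}
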
def merge_meta_alpha(i_datasets_folder: str, datasets: dict,
                     datasets_rarefs: dict, diversities: dict, force: bool,
                     prjct_nm: str, qiime_env: str, chmod: str, noloc: bool,
                     slurm: bool, dropout: bool, run_params: dict,
                     filt_raref: str, eval_depths: dict, jobs: bool,
                     chunkit: int) -> dict:
    """
    Merges the alpha diversity vectors of each dataset into its metadata.

    :param i_datasets_folder: Path to the folder containing the data/metadata subfolders.
    :param datasets: list of datasets.
    :param datasets_rarefs: list of rarefied datasets.
    :param diversities: paths to [alpha_divs]
    :param force: Force the re-writing of scripts for all commands.
    :param prjct_nm: Short nick name for your project.
    :param qiime_env: name of your qiime2 conda environment (e.g. qiime2-2019.10).
    :param chmod: whether to change permission of output files (default: 775).
    :return: dataset -> exported merged-alpha tsv paths.
    """
    evaluation = ''
    if len(eval_depths):
        evaluation = '_eval'
    job_folder = get_job_folder(i_datasets_folder, 'tabulate%s' % evaluation)
    job_folder2 = get_job_folder(i_datasets_folder,
                                 'tabulate%s/chunks' % evaluation)
    to_export = {}
    to_chunk = []
    main_written = 0
    run_pbs = '%s/2_run_merge_alphas_%s%s%s.sh' % (job_folder, prjct_nm,
                                                   evaluation, filt_raref)
    with open(run_pbs, 'w') as o:
        for dat, group_divs_list in diversities.items():
            written = 0
            to_export[dat] = []
            out_sh = '%s/run_merge_alpha_%s%s_%s%s.sh' % (
                job_folder2, prjct_nm, evaluation, dat, filt_raref)
            if slurm:
                out_pbs = '%s.slm' % splitext(out_sh)[0]
            else:
                out_pbs = '%s.pbs' % splitext(out_sh)[0]
            with open(out_sh, 'w') as cur_sh:
                for idx, group_divs in enumerate(group_divs_list):
                    tsv, meta = datasets[dat][idx]
                    cur_raref = datasets_rarefs[dat][idx]
                    base = basename(splitext(tsv)[0]).lstrip('tab_')
                    to_export_groups = []
                    for group, divs in group_divs.items():
                        if group:
                            output_folder = get_analysis_folder(
                                i_datasets_folder,
                                'tabulate%s/%s%s/%s' % (
                                    evaluation, dat, cur_raref, group))
                        else:
                            output_folder = get_analysis_folder(
                                i_datasets_folder,
                                'tabulate%s/%s%s' % (
                                    evaluation, dat, cur_raref))
                        if dropout:
                            out_fp = '%s/%s_alphas__%s.qzv' % (
                                output_folder, base, group)
                        else:
                            out_fp = '%s/%s_alphas_noDropout__%s.qzv' % (
                                output_folder, base, group)
                        out_fp_tsv = '%s.tsv' % splitext(out_fp)[0]
                        if isfile(out_fp_tsv):
                            with open(out_fp_tsv) as f:
                                for line in f:
                                    indices = line.strip().split('\t')[1:]
                                    break
                            divs_alphas = [x[1] for x in divs]
                            if len(indices) < len(divs_alphas):
                                force = True
                        to_export_groups.append(out_fp_tsv)
                        if force or not isfile(out_fp):
                            write_metadata_tabulate(out_fp, divs, meta,
                                                    cur_sh)
                            cmd = run_export(out_fp, out_fp_tsv, '')
                            cur_sh.write('echo "%s"\n' % cmd)
                            cur_sh.write('%s\n\n' % cmd)
                            main_written += 1
                            written += 1
                    to_export[dat].append(to_export_groups)
            to_chunk.append(out_sh)
            if not chunkit:
                run_xpbs(out_sh, out_pbs,
                         '%s.mrg.lph%s.%s%s' % (prjct_nm, evaluation, dat,
                                                filt_raref),
                         qiime_env, run_params["time"], run_params["n_nodes"],
                         run_params["n_procs"], run_params["mem_num"],
                         run_params["mem_dim"], chmod, written, 'single',
                         o, noloc, slurm, jobs)
    if to_chunk and chunkit:
        simple_chunks(run_pbs, job_folder2, to_chunk, 'tabulate', prjct_nm,
                      run_params["time"], run_params["n_nodes"],
                      run_params["n_procs"], run_params["mem_num"],
                      run_params["mem_dim"], qiime_env, chmod, noloc,
                      slurm, jobs, chunkit, None)
    if main_written:
        print_message('# Merge and export alpha diversity indices', 'sh',
                      run_pbs, jobs)
    return to_export

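# Shape of the mapping returned by merge_meta_alpha(): per dataset, one list
# per (possibly rarefied) table, containing the exported merged-alpha TSV
# paths (one per subset group). Paths below are placeholders:
_EXAMPLE_TO_EXPORT = {
    'dataset1': [
        ['tabulate/dataset1/tab_dataset1_alphas__.tsv'],
    ],
}
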