def pipeline(ctx, aligner, input, database, output, level, function, capitalist, taxacut, threads, percent_id, ra):
    if not os.path.exists(output):
        os.makedirs(output)

    if not capitalist:
        # Do not run the BURST post-align step in capitalist mode
        # (fixed typo: 'taxacute' -> 'taxacut')
        ALIGNERS['burst'] = lambda database, threads=threads, shell=ctx.obj['shell']: BurstAligner(
            database, shell=ctx.obj['shell'], threads=threads, taxacut=taxacut,
            capitalist=False, percent_id=percent_id)

    redist_outs = []
    redist_levels = []
    if aligner == 'all':
        for align in ALIGNERS.values():
            aligner_cl = align(database, threads=threads, shell=ctx.obj['shell'],
                               percent_id=percent_id, taxacut=taxacut)
            aligner_cl.align(input, output)
            # Use '!=' rather than 'is not' for string comparison
            if level != 'off':
                redist_out = os.path.join(output, "taxatable.%s.%s.txt" % (aligner_cl._name, level))
                _redist_outs, _redist_levels = _redistribute(
                    database, level, redist_out, aligner_cl.outfile, relative_abundance=ra)
                redist_outs.extend(_redist_outs)
                redist_levels.extend(_redist_levels)
    else:
        aligner_cl = ALIGNERS[aligner](database, threads=threads, shell=ctx.obj['shell'],
                                       percent_id=percent_id, taxacut=taxacut)
        aligner_cl.align(input, output)
        logger.debug(level)
        if level != 'off':
            redist_out = os.path.join(output, "taxatable.%s.txt" % level)
            redist_outs, redist_levels = _redistribute(
                database, level, redist_out, aligner_cl.outfile, relative_abundance=ra)

    if function and level != 'off':
        _function(redist_outs, database, output, redist_levels, save_median_taxatable=True)

    if ra:
        _convert_files_to_relative_abundances(redist_outs)
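# Hypothetical invocation (illustrative only): assuming the Click entry point
# is named `shogun` and its option names mirror the parameter names above.
# The flag spellings are assumptions, not confirmed by this code.
#
#   shogun pipeline --aligner burst --input reads.fna --database /path/to/db \
#       --output results --level species --threads 8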
def __init__(self, database_dir, threads=1, post_align=True, shell=False, percent_id=.98, **kwargs):
    self.threads = threads
    self.shell = shell
    check, msg = self.check_database(database_dir)

    with open(os.path.join(database_dir, 'metadata.yaml')) as stream:
        self.data_files = yaml.load(stream, Loader=yaml.SafeLoader)

    if not check:
        raise Exception("Database %s is not formatted correctly: %s" % (database_dir, msg))

    self.database_dir = database_dir
    self.tax = os.path.join(database_dir, self.data_files['general']['taxonomy'])
    self.fasta = os.path.join(database_dir, self.data_files['general']['fasta'])
    self.outfile = None

    logger.debug("Initializing aligner %s" % self._name)
    self.post_align = post_align
    self.percent_id = percent_id
def _redistribute(database, level, outfile, redist_inf, relative_abundance=False):
    logger.debug("Beginning redistribution for file: %s" % redist_inf)
    data_files = _load_metadata(database)
    shear = os.path.join(database, data_files['general']['shear'])
    shear_df = parse_bayes(shear)

    output_files = []
    output_levels = []
    if level == 'all':
        for l in TAXA:
            df_output = redistribute_taxatable(redist_inf, shear_df, level=TAXAMAP[l])
            tmp_spl = outfile.split('.')
            tmp_path = '.'.join(tmp_spl[:-1] + [l] + [tmp_spl[-1]])
            df_output.to_csv(tmp_path, sep='\t', float_format="%d", na_rep=0, index_label="#OTU ID")
            output_files.append(tmp_path)
            output_levels.append(l)
    elif level == 'off':
        output_files = []
    else:
        df_output = redistribute_taxatable(redist_inf, shear_df, level=TAXAMAP[level])
        df_output.to_csv(outfile, sep='\t', float_format="%d", na_rep=0, index_label="#OTU ID")
        output_files.append(outfile)
        output_levels.append(level)
    return output_files, output_levels
def _post_align(self, sam_file: str) -> pd.DataFrame:
    logger.debug("Beginning post align with aligner %s" % self._name)
    align_gen = yield_alignments_from_sam_inf(sam_file)
    lca_map = build_lca_map(align_gen, self.tree)
    samples_lca_map = defaultdict(Counter)
    for key, value in valfilter(lambda x: x is not None, lca_map).items():
        # Query names are "<sample>_<read index>"; strip the index to recover the sample
        samples_lca_map['_'.join(key.split('_')[:-1])].update([value])
    df = pd.DataFrame(samples_lca_map, dtype=int)
    return df
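# Illustrative sketch (not part of the aligner classes): the post-align steps
# recover the sample name by dropping the trailing read index from each query
# name, mirroring '_'.join(key.split('_')[:-1]) above. `demo_sample_name` is a
# hypothetical helper for demonstration only.
def demo_sample_name(qname: str) -> str:
    # "sampleA_17" -> "sampleA"; underscores inside the sample name survive
    return '_'.join(qname.split('_')[:-1])

assert demo_sample_name("sampleA_17") == "sampleA"
assert demo_sample_name("gut_sample_3_204") == "gut_sample_3"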
def _post_align(self, sam_file: str, samples_iter: int = 50, confidence_threshold: float = 1.0, **kwargs) -> pd.DataFrame:
    logger.debug("Beginning post align with aligner %s" % self._name)
    df = build_lca_df(sam_file, self.tree,
                      confidence_threshold=confidence_threshold,
                      samples_iter=samples_iter)
    return df
def _load_metadata(database):
    metadata_file = os.path.join(database, 'metadata.yaml')
    if os.path.exists(metadata_file):
        with open(metadata_file, 'r') as stream:
            logger.debug("Attempting to load the database metadata file at %s" % os.path.abspath(metadata_file))
            data_files = yaml.load(stream, Loader=yaml.SafeLoader)
        return data_files
    else:
        logger.critical("Unable to load database at %s" % os.path.abspath(metadata_file))
        raise Exception("Unable to load database at %s" % os.path.abspath(metadata_file))
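# Illustrative sketch: a minimal metadata.yaml that satisfies every key the
# code above consumes ('general' -> 'taxonomy' and 'fasta' in __init__,
# 'shear' in _redistribute). The file names are hypothetical placeholders.
import yaml

_EXAMPLE_METADATA = """
general:
  taxonomy: db.tax
  fasta: db.fna
  shear: sheared_bayes.txt
"""
assert yaml.load(_EXAMPLE_METADATA, Loader=yaml.SafeLoader)['general']['fasta'] == 'db.fna'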
def _post_align(self, utree_out: str, **kwargs) -> pd.DataFrame:
    logger.debug("Beginning post align with aligner %s" % self._name)
    samples_lca_map = defaultdict(Counter)
    with open(utree_out) as utree_f:
        csv_utree = csv.reader(utree_f, delimiter='\t')
        # Columns: qname, lca, confidence, support
        for line in csv_utree:
            # TODO: filter on confidence/support
            taxonomy = split_utree_taxonomy(line[1])
            samples_lca_map['_'.join(line[0].split('_')[:-1])].update([taxonomy])
    df = pd.DataFrame(samples_lca_map, dtype=int)
    return df
def _post_align_taxonomy(self, outf):
    logger.debug("Beginning post align taxonomy style with aligner %s" % self._name)
    samples_lca_map = defaultdict(lambda: defaultdict(int))
    with open(outf) as utree_f:
        csv_embalm = csv.reader(utree_f, delimiter='\t')
        # Columns: qname, lca, confidence, support
        for line in csv_embalm:
            # TODO: filter on confidence/support
            if line[-1] is not None:
                samples_lca_map['_'.join(line[0].split('_')[:-1])][line[-1]] += 1
    # np.int was removed from NumPy; the builtin int is equivalent here
    df = pd.DataFrame(samples_lca_map, dtype=int).fillna(0).astype(int)
    return df
def _post_align_capitalist(self, outf):
    logger.debug("Beginning post align capitalist style with aligner %s" % self._name)
    # This alignment parsing assumes capitalist output
    samples_lca_map = defaultdict(lambda: defaultdict(int))
    with open(outf) as emb_inf:
        csv_embalm = csv.reader(emb_inf, delimiter='\t')
        # Columns: qname, lca, confidence, support
        for line in csv_embalm:
            # TODO: filter on confidence/support
            tax = self.tree(line[1])
            samples_lca_map['_'.join(line[0].split('_')[:-1])][tax] += 1
    # np.int was removed from NumPy; the builtin int is equivalent here
    df = pd.DataFrame(samples_lca_map, dtype=int).fillna(0).astype(int)
    return df
def _create_kegg_table(taxatable_df, row_names, column_names, kegg_table_csr):
    num_taxa_kegg, num_kegg_ids = kegg_table_csr.shape
    logger.debug("Kegg table for functional prediction shape %s" % str(kegg_table_csr.shape))
    num_taxa, num_samples = taxatable_df.shape
    logger.debug("Taxatable for functional prediction shape %s" % str(taxatable_df.shape))

    kegg_table = np.zeros((num_samples, num_kegg_ids), dtype=int)
    row_names_found = 0
    for i, row in taxatable_df.iterrows():
        if row.name in row_names:
            row_names_found += 1
            idx = row_names[row.name]
            # Accumulate this taxon's per-sample counts times its KEGG copy-number row
            kegg_table += np.outer(row, kegg_table_csr.getrow(idx).todense())

    overlap = float(row_names_found) / num_taxa
    if overlap < .5:
        logger.warning("Overlap of taxa and function %.2f" % overlap)
    else:
        logger.debug("Overlap of taxa and function %.2f" % overlap)
    logger.debug("Row names found in taxatable %d" % row_names_found)

    out_kegg_table_df = pd.DataFrame(kegg_table, index=taxatable_df.columns,
                                     columns=sorted(column_names, key=column_names.get),
                                     dtype=int).T
    # Filter out all-zero rows
    out_kegg_table_df = out_kegg_table_df[(out_kegg_table_df.T != 0).any()]
    return out_kegg_table_df
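# Illustrative sketch (toy data, not the real KEGG table): the accumulation in
# _create_kegg_table is a sum of outer products, one per taxon, of that taxon's
# per-sample counts with its KEGG copy-number row -- equivalent to the matrix
# product counts.T @ kegg_rows.
import numpy as np

counts = np.array([[10, 0],      # taxon A counts in samples s1, s2
                   [5, 20]])     # taxon B
kegg_rows = np.array([[1, 0, 2],    # KO copy numbers for taxon A
                      [0, 3, 1]])   # taxon B
kegg_table = np.zeros((2, 3), dtype=int)  # samples x KEGG IDs
for taxon_counts, kegg_row in zip(counts, kegg_rows):
    kegg_table += np.outer(taxon_counts, kegg_row)
assert (kegg_table == counts.T @ kegg_rows).all()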
def run_command(cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT):
    """
    Run a prepared command in a subprocess and log its output.

    :param cmd: Well-formed command to run, as a list of arguments.
    :param shell: Force subprocess to use a shell; not recommended.
    :param stdout: Destination for the subprocess's stdout.
    :param stderr: Destination for the subprocess's stderr.
    :return: Tuple of (return code, empty stdout string, empty stderr string).
    """
    try:
        cmd = [str(i) for i in cmd]
        if not stdout:
            stdout = open(os.devnull, 'w')
        if not stderr:
            stderr = open(os.devnull, 'w')
        logger.debug(" ".join(cmd))
        with elapsed_timer() as elapsed:
            with subprocess.Popen(
                " ".join(cmd) if shell else cmd,
                stdout=stdout,
                stderr=stderr,
                shell=shell,
                universal_newlines=True,
                bufsize=1,
                cwd=os.getcwd(),
            ) as proc:
                log_subprocess_output(proc.stdout)
        logger.debug("%.2f seconds" % elapsed())
        logger.debug("Subprocess finished.")
        # if proc.returncode != 0:
        #     raise AssertionError("exit code is non zero: %d\n%s" % (proc.returncode, " ".join(cmd)))
        return proc.returncode, "", ""
    except subprocess.CalledProcessError as e:
        raise AssertionError("Called Process Error: %s" % e)
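# Illustrative usage (assuming a configured module-level `logger`): run a
# simple command and check the exit code; stdout is streamed through
# log_subprocess_output at DEBUG level.
returncode, _, _ = run_command(["echo", "hello"])
assert returncode == 0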
def redistribute_taxatable(filename: str, counts_bayes: pd.DataFrame, level=8):
    df = pd.read_csv(filename, sep="\t", index_col=0)
    df = df[[type(_) == str for _ in df.index]]

    cb_index = tree()
    _ = [add_tree(cb_index, v) for v in counts_bayes.index]

    # Remove spaces in taxonomy for legacy reasons
    df.index = [_.replace(" ", "_") for _ in df.index]
    df['summary'] = [longest_path_tree(cb_index, v) for v in df.index]
    df = df.groupby('summary').sum()
    df['level'] = [_.count(';') + 1 if type(_) == str else 0 for _ in df.index]

    # Summarize up
    below_level = df['level'] >= level
    leaf_counts_df = df[below_level].copy()
    leaf_counts_df['taxa_name'] = [';'.join(v.split(';')[:level]) for v in df[below_level].index]
    leaf_counts_df = leaf_counts_df.groupby('taxa_name').sum()
    leaf_counts_df = leaf_counts_df.drop('level', axis=1)

    # Summarize bayes to level
    counts_bayes_sum = _summarize_bayes_at_level(counts_bayes, leaf_counts_df.index, level=level)

    # Summarize down
    for i, row in df[~below_level].sort_values('level', ascending=False).iterrows():
        # Get all leaves below this taxon
        tmp_name = row.name
        leaf_filter = _filter_leaves_for_tax(leaf_counts_df, tmp_name)
        num_leaves = np.sum(leaf_filter)
        if num_leaves == 0:
            if row.name == "":
                logger.debug("Conflict found for sequence at the kingdom level, skipping.")
                continue
            # Filter back row names until in counts_bayes
            blank = ['k__', 'p__', 'c__', 'o__', 'f__', 'g__', 's__', 't__']
            for j, part in enumerate(row.name.split(';')):
                blank[j] = part
            tmp_counts_bayes_row = _summarize_bayes_at_level(
                counts_bayes, row.name, level=row.name.count(';') + 1, drop_level=level)
            tmp_counts_bayes_row.name = ';'.join(blank[:level])
            row.name = tmp_counts_bayes_row.name
            # DataFrame.append was removed in pandas 2.0; concat a one-row frame instead
            leaf_counts_df = pd.concat([leaf_counts_df, row[:-1].to_frame().T])
            if tmp_counts_bayes_row.name not in counts_bayes_sum.index:
                counts_bayes_sum = pd.concat([counts_bayes_sum, tmp_counts_bayes_row.to_frame().T])
                counts_bayes_sum = counts_bayes_sum.fillna(0)
        elif num_leaves == 1:
            leaf_counts_df.loc[leaf_filter] += row.values[:-1]
        elif num_leaves > 1:
            tmp_level = row.name.count(';')
            tmp_leaves = leaf_counts_df[leaf_filter].sort_index()
            tmp_bayes = counts_bayes_sum.loc[tmp_leaves.index]
            # Series 1xn where n is the number of leaf nodes below the taxon
            prob_tax_given_level = (tmp_bayes.iloc[:, tmp_level] + 1) / (tmp_bayes['genome_length'] + 1)
            prob_tax_given_level = prob_tax_given_level / np.sum(prob_tax_given_level)
            # Series 1xn where n is the number of unique reads for a given taxon
            uniqueness_per_genome = tmp_bayes.iloc[:, level - 1] / tmp_bayes['genome_length']
            # Divide each observed count by its uniqueness
            counts_over_uniqueness = tmp_leaves.T / uniqueness_per_genome.values
            # Divide each uniqueness-scaled count by its sample's total
            prob_tax = counts_over_uniqueness.T / counts_over_uniqueness.sum(axis=1)
            # Redistribution parameters: taxa x samples, same shape as tmp_leaves,
            # with each column summing to 1
            redistribution_params = prob_tax.apply(
                lambda x: x * prob_tax_given_level.values, axis=0).apply(
                lambda x: x / x.sum(), axis=0)
            redistribution_numbers = (redistribution_params * row.values[:-1]).round()
            # Add the redistributed counts back to the leaves
            leaf_counts_df = leaf_counts_df.add(redistribution_numbers, fill_value=0)
    return leaf_counts_df
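# Illustrative sketch (toy numbers, not database-derived): the multi-leaf
# branch above splits an ancestor's counts among its leaves in proportion to
# each leaf's prior (prob_tax_given_level) times its per-sample evidence
# (prob_tax), renormalized so every sample column sums to 1.
import numpy as np
import pandas as pd

prob_tax = pd.DataFrame([[0.7, 0.2], [0.3, 0.8]],
                        index=['leaf1', 'leaf2'], columns=['s1', 's2'])
prob_tax_given_level = pd.Series([0.9, 0.1], index=['leaf1', 'leaf2'])
params = prob_tax.apply(lambda x: x * prob_tax_given_level.values, axis=0)
params = params.apply(lambda x: x / x.sum(), axis=0)
assert np.allclose(params.sum(axis=0), 1.0)
# 100 ancestor counts in s1 and 50 in s2, redistributed to the two leaves
redistributed = (params * np.array([100, 50])).round()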
def log_subprocess_output(pipe):
    for line in pipe:
        line = line.rstrip()
        if line and not line.startswith('Search Progress'):
            logger.debug(line)
def function_run_and_save(input, func_db, output, level, save_median_taxatable=True):
    prefix = ".".join(os.path.basename(input).split('.')[:-1])

    kegg_pathways_df = func_db['pathways']
    kegg_modules_df = func_db['modules']
    row_names = func_db['names']
    kegg_ids = func_db['kegg_ids']
    kegg_table_csr = func_db['csr']

    logger.debug("Level for summarization %d; summarizing KEGG table at that level with the median." % level)
    if level < 8:
        kegg_table_csr, row_names = summarize_at_level(kegg_table_csr, row_names, kegg_ids, level)
        logger.debug("Number of rows %d" % len(list(row_names.keys())))
        if TAXA[level - 1] not in prefix:
            prefix += "." + TAXA[level - 1]

    logger.info("Reading in taxatable for functional prediction at %s." % os.path.abspath(input))
    taxatable_df = pd.read_csv(input, sep="\t", index_col=0)
    logger.debug("Taxatable for functional prediction shape %s" % str(taxatable_df.shape))

    taxatable_df = taxatable_df[[type(_) == str for _ in taxatable_df.index]]
    taxatable_df['summary'] = [';'.join(_.split(';')[:level]).replace(' ', '_') for _ in taxatable_df.index]
    # Drop rows above the requested level
    taxatable_df = taxatable_df[[_.count(';') + 1 >= level for _ in taxatable_df['summary']]]
    taxatable_df = taxatable_df.groupby('summary').sum().fillna(0.)

    # Normalize each sample to the median depth
    taxatable_df = normalize_by_median_depth(taxatable_df)
    if save_median_taxatable:
        taxatable_df.to_csv(os.path.join(output, "%s.normalized.txt" % prefix),
                            sep='\t', float_format="%d", na_rep=0, index_label="#OTU ID")
    logger.debug("Taxatable summarized shape %s" % str(taxatable_df.shape))

    logger.info("Starting functional prediction.")
    out_kegg_table_df, out_kegg_modules_df, out_kegg_modules_coverage, out_kegg_pathways_df, out_kegg_pathways_coverage = _do_function(
        taxatable_df, row_names, kegg_ids, kegg_table_csr, kegg_modules_df, kegg_pathways_df)

    out_kegg_table_df.to_csv(os.path.join(output, "%s.kegg.txt" % prefix),
                             sep='\t', float_format="%d", na_rep=0, index_label="#KEGG ID")
    out_kegg_modules_df.to_csv(os.path.join(output, "%s.kegg.modules.txt" % prefix),
                               sep='\t', float_format="%d", na_rep=0, index_label="#MODULE ID")
    out_kegg_modules_coverage.to_csv(os.path.join(output, "%s.kegg.modules.coverage.txt" % prefix),
                                     sep='\t', float_format="%f", na_rep=0, index_label="#MODULE ID")
    out_kegg_pathways_df.to_csv(os.path.join(output, "%s.kegg.pathways.txt" % prefix),
                                sep='\t', float_format="%d", na_rep=0, index_label="#PATHWAY ID")
    out_kegg_pathways_coverage.to_csv(os.path.join(output, "%s.kegg.pathways.coverage.txt" % prefix),
                                      sep='\t', float_format="%f", na_rep=0, index_label="#PATHWAY ID")
def normalize_by_median_depth(df):
    logger.debug("Normalizing to median depth")
    depths = df.sum(axis=0)
    return df.div(depths / depths.median(), axis=1).round().astype(int)
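# Illustrative sketch (assuming a configured module-level `logger`): three
# samples with depths 10, 40, and 20 are all rescaled to the median depth, 20.
import pandas as pd

df = pd.DataFrame({'s1': [8, 2], 's2': [30, 10], 's3': [15, 5]})
out = normalize_by_median_depth(df)
assert out.sum(axis=0).tolist() == [20, 20, 20]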