Example #1
def pipeline(ctx, aligner, input, database, output, level, function, capitalist, taxacut, threads, percent_id, ra):
    if not os.path.exists(output):
        os.makedirs(output)

    if not capitalist:
        # Replace the BURST factory so post-align does not run in capitalist mode
        ALIGNERS['burst'] = lambda database, threads=threads, shell=ctx.obj['shell']: BurstAligner(
            database, shell=ctx.obj['shell'], threads=threads, taxacut=taxacut,
            capitalist=False, percent_id=percent_id)

    redist_outs = []
    redist_levels = []
    if aligner == 'all':
        for align in ALIGNERS.values():
            aligner_cl = align(database, threads=threads, shell=ctx.obj['shell'], percent_id=percent_id, taxacut=taxacut)
            aligner_cl.align(input, output)
            if level != 'off':
                redist_out = os.path.join(output, "taxatable.%s.%s.txt" % (aligner_cl._name, level))
                _redist_outs, _redist_levels = _redistribute(database, level, redist_out, aligner_cl.outfile, relative_abundance=ra)
                redist_outs.extend(_redist_outs)
                redist_levels.extend(_redist_levels)
    else:
        aligner_cl = ALIGNERS[aligner](database, threads=threads, shell=ctx.obj['shell'], percent_id=percent_id, taxacut=taxacut)
        aligner_cl.align(input, output)
        logger.debug(level)
        if level != 'off':
            redist_out = os.path.join(output, "taxatable.%s.txt" % (level))
            redist_outs, redist_levels = _redistribute(database, level, redist_out, aligner_cl.outfile, relative_abundance=ra)

    if function and level != 'off':
        _function(redist_outs, database, output, redist_levels, save_median_taxatable=True)

    if ra:
        _convert_files_to_relative_abundances(redist_outs)
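
The dispatch above assumes ALIGNERS is a registry of factory callables keyed by aligner name, which is what lets 'all' fan out over every backend. A minimal, self-contained sketch of that pattern (the classes and names below are illustrative stand-ins, not the module's real aligner classes):

class FakeBurst:
    def align(self, infile, outdir):
        print("burst: %s -> %s" % (infile, outdir))

class FakeBowtie:
    def align(self, infile, outdir):
        print("bowtie2: %s -> %s" % (infile, outdir))

ALIGNERS = {
    'burst': lambda database, **kwargs: FakeBurst(),
    'bowtie2': lambda database, **kwargs: FakeBowtie(),
}

choice = 'all'
factories = ALIGNERS.values() if choice == 'all' else [ALIGNERS[choice]]
for factory in factories:
    factory('refdb').align('reads.fna', 'out/')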
Example #2
    def __init__(self,
                 database_dir,
                 threads=1,
                 post_align=True,
                 shell=False,
                 percent_id=.98,
                 **kwargs):
        self.threads = threads
        self.shell = shell
        check, msg = self.check_database(database_dir)
        if not check:
            raise Exception("Database %s is not formatted correctly: %s" %
                            (database_dir, msg))

        with open(os.path.join(database_dir, 'metadata.yaml')) as stream:
            self.data_files = yaml.load(stream, Loader=yaml.SafeLoader)

        self.database_dir = database_dir

        self.tax = os.path.join(database_dir,
                                self.data_files['general']['taxonomy'])
        self.fasta = os.path.join(database_dir,
                                  self.data_files['general']['fasta'])
        self.outfile = None
        logger.debug("Initiate Logger %s" % self._name)
        self.post_align = post_align
        self.percent_id = percent_id
Example #3
def _redistribute(database, level, outfile, redist_inf, relative_abundance=False):
    logger.debug("Beginning redistribution for file: %s" % redist_inf)
    data_files = _load_metadata(database)

    shear = os.path.join(database, data_files['general']['shear'])

    shear_df = parse_bayes(shear)

    output_files = []
    output_levels = []

    if level == 'all':
        for l in TAXA:
            df_output = redistribute_taxatable(redist_inf, shear_df, level=TAXAMAP[l])
            tmp_spl = outfile.split('.')
            tmp_path = '.'.join(tmp_spl[:-1] + [l] + [tmp_spl[-1]])
            df_output.to_csv(tmp_path, sep='\t', float_format="%d", na_rep=0, index_label="#OTU ID")
            output_files.append(tmp_path)
            output_levels.append(l)
    elif level == 'off':
        output_files = []
    else:
        df_output = redistribute_taxatable(redist_inf, shear_df, level=TAXAMAP[level])
        df_output.to_csv(outfile, sep='\t', float_format="%d", na_rep=0, index_label="#OTU ID")
        output_files.append(outfile)
        output_levels.append(level)

    return output_files, output_levels
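
The per-level output paths in the 'all' branch are built by splicing the level name in front of the file extension. A quick illustration of that string manipulation (the file name is made up):

outfile = "results/taxatable.burst.txt"
for l in ('genus', 'species'):
    tmp_spl = outfile.split('.')
    print('.'.join(tmp_spl[:-1] + [l] + [tmp_spl[-1]]))
# results/taxatable.burst.genus.txt
# results/taxatable.burst.species.txt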
Example #4
    def _post_align(self, sam_file: str) -> pd.DataFrame:
        logger.debug("Beginning post align with aligner %s" % self._name)
        align_gen = yield_alignments_from_sam_inf(sam_file)
        lca_map = build_lca_map(align_gen, self.tree)
        samples_lca_map = defaultdict(Counter)
        for key, value in valfilter(lambda x: x is not None, lca_map).items():
            samples_lca_map['_'.join(key.split('_')[:-1])].update([value])

        df = pd.DataFrame(samples_lca_map, dtype=int)
        return df
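
The '_'.join(key.split('_')[:-1]) idiom strips the trailing read index from a query name to recover the sample it belongs to, and the Counter tallies LCA hits per sample. A toy run of the same aggregation (read names and taxa invented for illustration):

from collections import Counter, defaultdict

lca_map = {'sampleA_1': 'g__Escherichia', 'sampleA_2': 'g__Escherichia',
           'sampleB_1': 'g__Bacteroides', 'sampleB_2': None}
samples_lca_map = defaultdict(Counter)
for key, value in lca_map.items():
    if value is not None:  # mirrors the valfilter call above
        samples_lca_map['_'.join(key.split('_')[:-1])].update([value])
print(dict(samples_lca_map))
# {'sampleA': Counter({'g__Escherichia': 2}), 'sampleB': Counter({'g__Bacteroides': 1})}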
Example #5
    def _post_align(self,
                    sam_file: str,
                    samples_iter: int = 50,
                    confidence_threshold: float = 1.0,
                    **kwargs) -> pd.DataFrame:
        logger.debug("Beginning post align with aligner %s" % self._name)
        df = build_lca_df(sam_file,
                          self.tree,
                          confidence_threshold=confidence_threshold,
                          samples_iter=samples_iter)
        return df
Example #6
def _load_metadata(database):
    metadata_file = os.path.join(database, 'metadata.yaml')
    if os.path.exists(metadata_file):
        with open(metadata_file, 'r') as stream:
            logger.debug(
                "Attempting to load the database metadata file at %s" % (os.path.abspath(metadata_file)))
            data_files = yaml.load(stream, Loader=yaml.SafeLoader)
        return data_files
    else:
        logger.critical("Unable to load database at %s" % os.path.abspath(metadata_file))
        raise Exception("Unable to load database at %s" % os.path.abspath(metadata_file))
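
Both the constructor in Example #2 and _redistribute in Example #3 index into the loaded metadata as data_files['general'][...], so the YAML file is expected to look roughly like the sketch below (the file names are placeholders, not the real database contents):

import yaml

sample_metadata = """
general:
  taxonomy: taxa.tsv
  fasta: sequences.fna
  shear: shear_bayes.txt
"""
data_files = yaml.load(sample_metadata, Loader=yaml.SafeLoader)
print(data_files['general']['taxonomy'])  # taxa.tsv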
Example #7
    def _post_align(self, utree_out: str, **kwargs) -> pd.DataFrame:
        logger.debug("Beginning post align with aligner %s" % self._name)
        samples_lca_map = defaultdict(Counter)
        with open(utree_out) as utree_f:
            csv_utree = csv.reader(utree_f, delimiter='\t')
            # qname, lca, confidence, support
            for line in csv_utree:
                # TODO: confidence/support filter
                taxonomy = split_utree_taxonomy(line[1])
                samples_lca_map['_'.join(line[0].split('_')[:-1])].update(
                    [taxonomy])
        df = pd.DataFrame(samples_lca_map, dtype=int)
        return df
Example #8
    def _post_align_taxonomy(self, outf):
        logger.debug("Beginning post align taxonomy style with aligner %s" %
                     self._name)
        samples_lca_map = defaultdict(lambda: defaultdict(int))
        with open(outf) as utree_f:
            csv_embalm = csv.reader(utree_f, delimiter='\t')
            # qname, lca, confidence, support
            for line in csv_embalm:
                # csv fields are always strings, so test for empty rather than None
                if line[-1]:
                    # TODO: confidence/support filter
                    samples_lca_map['_'.join(
                        line[0].split('_')[:-1])][line[-1]] += 1

        # np.int was removed in NumPy 1.24; the builtin int works everywhere
        df = pd.DataFrame(samples_lca_map,
                          dtype=int).fillna(0).astype(int)
        return df
Example #9
    def _post_align_capitalist(self, outf):
        logger.debug("Beginning post align capitalist style with aligner %s" %
                     self._name)
        # This alignment parsing assumes capitalist output
        samples_lca_map = defaultdict(lambda: defaultdict(int))
        with open(outf) as emb_inf:
            csv_embalm = csv.reader(emb_inf, delimiter='\t')
            # qname, lca, confidence, support
            for line in csv_embalm:
                tax = self.tree(line[1])
                # TODO: confidence/support filter
                samples_lca_map['_'.join(line[0].split('_')[:-1])][tax] += 1

        # np.int was removed in NumPy 1.24; the builtin int works everywhere
        df = pd.DataFrame(samples_lca_map,
                          dtype=int).fillna(0).astype(int)
        return df
Example #10
def _create_kegg_table(taxatable_df, row_names, column_names, kegg_table_csr):
    num_taxa_kegg, num_kegg_ids = kegg_table_csr.shape
    logger.debug("Kegg table for functional prediction shape %s" %
                 (str(kegg_table_csr.shape)))
    num_taxa, num_samples = taxatable_df.shape
    logger.debug("Taxatable for functional prediction shape %s" %
                 (str(taxatable_df.shape)))

    kegg_table = np.zeros((num_samples, num_kegg_ids), dtype=int)
    row_names_found = 0

    for i, row in taxatable_df.iterrows():
        if row.name in row_names:
            row_names_found += 1
            idx = row_names[row.name]
            kegg_table += np.outer(row, kegg_table_csr.getrow(idx).todense())

    overlap = float(row_names_found) / num_taxa
    if overlap < .5:
        logger.warning("Overlap of taxa and function %.2f" % overlap)
    else:
        logger.debug("Overlap of taxa and function %.2f" % overlap)

    logger.debug("Row names found in taxatable %d" % row_names_found)

    out_kegg_table_df = pd.DataFrame(kegg_table,
                                     index=taxatable_df.columns,
                                     columns=sorted(column_names,
                                                    key=column_names.get),
                                     dtype=int).T
    # Filter out zeros
    out_kegg_table_df = out_kegg_table_df[(out_kegg_table_df.T != 0).any()]

    return out_kegg_table_df
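
The accumulation kegg_table += np.outer(row, ...) adds, for each taxon found in the KEGG index, its per-sample counts multiplied by that taxon's KEGG copy-number row, building a samples-by-KEGG-IDs matrix. A toy version with dense arrays (all numbers invented):

import numpy as np

taxon_counts = {'taxA': np.array([10, 0]),   # counts in 2 samples
                'taxB': np.array([5, 20])}
kegg_rows = {'taxA': np.array([1, 0, 2]),    # copy numbers for 3 KEGG IDs
             'taxB': np.array([0, 1, 1])}

kegg_table = np.zeros((2, 3), dtype=int)
for tax in taxon_counts:
    kegg_table += np.outer(taxon_counts[tax], kegg_rows[tax])
print(kegg_table)
# [[10  5 25]
#  [ 0 20 20]]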
Example #11
def run_command(cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT):
    """
    Run prepared behave command in shell and return its output.
    :param stderr:
    :param stdout:
    :param cmd: Well-formed behave command to run.
    :param shell: Force subprocess to use shell, not recommended
    :return:
    """

    try:
        cmd = [str(i) for i in cmd]

        if not stdout:
            stdout = open(os.devnull, 'w')
        if not stderr:
            stderr = open(os.devnull, 'w')

        logger.debug(" ".join(cmd))
        with elapsed_timer() as elapsed:
            with subprocess.Popen(
                " ".join(cmd) if shell else cmd,
                stdout=stdout,
                stderr=stderr,
                shell=shell,
                universal_newlines=True,
                bufsize=1,
                cwd=os.getcwd(),
            ) as proc:
                if proc.stdout is not None:
                    log_subprocess_output(proc.stdout)

        logger.debug("%.2f seconds" % elapsed())
        logger.debug("Subprocess finished.")

        #if proc.returncode != 0:
            #raise AssertionError("exit code is non zero: %d\n%s" % (proc.returncode, " ".join(cmd)))
        return proc.returncode, "", ""
    except subprocess.CalledProcessError as e:
        raise AssertionError("Called Process Error: %s" % e)
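
Because the wrapper coerces every argument to str before running, the command list can mix types. A minimal usage sketch, assuming the module's helpers (logger, elapsed_timer, log_subprocess_output) are in scope; the echo invocation is just an illustration:

returncode, _, _ = run_command(['echo', 'iteration', 42])
assert returncode == 0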
Example #12
def redistribute_taxatable(filename: str, counts_bayes: pd.DataFrame, level=8):
    df = pd.read_csv(filename, sep="\t", index_col=0)
    df = df[[isinstance(_, str) for _ in df.index]]

    cb_index = tree()
    _ = [add_tree(cb_index, v) for v in counts_bayes.index]

    # Remove spaces in taxonomy for legacy reasons
    df.index = [_.replace(" ", "_") for _ in df.index]

    df['summary'] = [longest_path_tree(cb_index, v) for v in df.index]
    df = df.groupby('summary').sum()

    df['level'] = [_.count(';') + 1 if isinstance(_, str) else 0 for _ in df.index]

    # summarize up
    below_level = df['level'] >= level
    leaf_counts_df = df[below_level].copy()
    leaf_counts_df['taxa_name'] = [
        ';'.join(v.split(';')[:level]) for v in df[below_level].index
    ]
    leaf_counts_df = leaf_counts_df.groupby('taxa_name').sum()
    leaf_counts_df = leaf_counts_df.drop('level', axis=1)

    # summarize bayes to level
    counts_bayes_sum = _summarize_bayes_at_level(counts_bayes,
                                                 leaf_counts_df.index,
                                                 level=level)

    # summarize down
    for i, row in df[~below_level].sort_values('level',
                                               ascending=False).iterrows():
        # Get all children of item
        tmp_name = row.name
        leave_filter = _filter_leaves_for_tax(leaf_counts_df, tmp_name)
        num_leaves = np.sum(leave_filter)
        # np.sum never returns None, so an empty match is simply a zero count
        if num_leaves == 0:
            if row.name == "":
                logger.debug(
                    "Conflict found for sequence at the kingdom level, skipping."
                )
                continue
            # Filter back row names until in counts_bayes
            blank = ['k__', 'p__', 'c__', 'o__', 'f__', 'g__', 's__', 't__']
            # Use a fresh loop variable; reusing i would shadow the outer loop
            for j, part in enumerate(row.name.split(';')):
                blank[j] = part
            tmp_counts_bayes_row = _summarize_bayes_at_level(
                counts_bayes,
                row.name,
                level=row.name.count(';') + 1,
                drop_level=level)
            tmp_counts_bayes_row.name = ';'.join(blank[:level])
            row.name = tmp_counts_bayes_row.name
            # DataFrame.append was removed in pandas 2.0; use pd.concat instead
            leaf_counts_df = pd.concat(
                [leaf_counts_df, row[:-1].to_frame().T])
            if tmp_counts_bayes_row.name not in counts_bayes_sum.index:
                counts_bayes_sum = pd.concat(
                    [counts_bayes_sum, tmp_counts_bayes_row.to_frame().T])
                counts_bayes_sum = counts_bayes_sum.fillna(0)
        elif num_leaves == 1:
            leaf_counts_df.loc[leave_filter] += row.values[:-1]
        elif num_leaves > 1:
            tmp_level = row.name.count(';')
            tmp_leaves = leaf_counts_df[leave_filter].sort_index()
            tmp_bayes = counts_bayes_sum.loc[tmp_leaves.index]
            # Series 1xn where n is the number of leaf nodes below tax
            prob_tax_given_level = (tmp_bayes.iloc[:, tmp_level] +
                                    1) / (tmp_bayes['genome_length'] + 1)
            prob_tax_given_level = prob_tax_given_level / np.sum(
                prob_tax_given_level)
            # Series 1xn where n is the number of unique reads for a given taxa
            uniqueness_per_genome = tmp_bayes.iloc[:, level - 1] / tmp_bayes[
                'genome_length']
            # Matrix divide each observed count by uniqueness
            counts_over_uniqueness = tmp_leaves.T / uniqueness_per_genome.values
            # Matrix divide each uniqueness count by sum of sample
            prob_tax = counts_over_uniqueness.T / counts_over_uniqueness.sum(
                axis=1)
            # Get the redistribution parameters
            # Should be taxa by samples same as the tmp_leaves
            # Each column should sum to 1
            redistribution_params = prob_tax.apply(
                lambda x: x * prob_tax_given_level.values,
                axis=0).apply(lambda x: x / x.sum(), axis=0)
            redistribution_numbers = (redistribution_params *
                                      row.values[:-1]).round()
            # Add the number back to the dataframe
            leaf_counts_df = leaf_counts_df.add(redistribution_numbers,
                                                fill_value=0)
    return leaf_counts_df
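
The heart of the multi-leaf branch is splitting a parent's counts across its candidate leaves in proportion to a per-leaf probability, with the normalized parameters summing to 1 per sample column. A toy version of just that split, outside the full pipeline (leaf names, probabilities, and counts invented):

import numpy as np
import pandas as pd

# 3 candidate leaves x 2 samples of observed leaf counts
tmp_leaves = pd.DataFrame({'s1': [8, 1, 1], 's2': [2, 2, 6]},
                          index=['leafA', 'leafB', 'leafC'])
prob_tax_given_level = np.array([0.5, 0.25, 0.25])

prob_tax = tmp_leaves / tmp_leaves.sum(axis=0)
redistribution_params = (prob_tax.T * prob_tax_given_level).T
redistribution_params /= redistribution_params.sum(axis=0)  # columns sum to 1

parent_counts = np.array([100, 50])  # counts stuck at the parent level
print((redistribution_params * parent_counts).round())
# splits roughly to [89, 6, 6] for s1 and [17, 8, 25] for s2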
Example #13
def log_subprocess_output(pipe):
    for line in pipe:
        line = line.rstrip()
        if line:
            if not line.startswith('Search Progress'):
                logger.debug(line)
Example #14
def function_run_and_save(input,
                          func_db,
                          output,
                          level,
                          save_median_taxatable=True):
    prefix = ".".join(os.path.basename(input).split('.')[:-1])

    kegg_pathways_df = func_db['pathways']
    kegg_modules_df = func_db['modules']
    row_names = func_db['names']
    kegg_ids = func_db['kegg_ids']
    kegg_table_csr = func_db['csr']

    logger.debug("Summarizing KEGG table at level %d using the median." %
                 level)
    if level < 8:
        kegg_table_csr, row_names = summarize_at_level(kegg_table_csr,
                                                       row_names, kegg_ids,
                                                       level)
    logger.debug("Number of rows %d" % len(list(row_names.keys())))

    if TAXA[level - 1] not in prefix:
        prefix += "." + TAXA[level - 1]

    logger.info("Reading in taxatable for functional prediction at %s." %
                os.path.abspath(input))
    taxatable_df = pd.read_csv(input, sep="\t", index_col=0)
    logger.debug("Taxatable for functional prediction shape %s" %
                 str(taxatable_df.shape))
    taxatable_df = taxatable_df[[
        isinstance(_, str) for _ in taxatable_df.index
    ]]

    taxatable_df['summary'] = [
        ';'.join(_.split(';')[:level]).replace(' ', '_')
        for _ in taxatable_df.index
    ]
    # Drop lineages shallower than the requested level
    taxatable_df = taxatable_df[[
        _.count(';') + 1 >= level for _ in taxatable_df['summary']
    ]]
    taxatable_df = taxatable_df.groupby('summary').sum().fillna(0.)

    # Normalizing for depth at median depth
    taxatable_df = normalize_by_median_depth(taxatable_df)
    if save_median_taxatable:
        taxatable_df.to_csv(os.path.join(output, "%s.normalized.txt" % prefix),
                            sep='\t',
                            float_format="%d",
                            na_rep=0,
                            index_label="#OTU ID")

    logger.debug("Taxatable summarized shape %s" % str(taxatable_df.shape))

    logger.info("Starting functional prediction.")
    out_kegg_table_df, out_kegg_modules_df, out_kegg_modules_coverage, out_kegg_pathways_df, out_kegg_pathways_coverage = _do_function(
        taxatable_df, row_names, kegg_ids, kegg_table_csr, kegg_modules_df,
        kegg_pathways_df)
    out_kegg_table_df.to_csv(os.path.join(output, "%s.kegg.txt" % prefix),
                             sep='\t',
                             float_format="%d",
                             na_rep=0,
                             index_label="#KEGG ID")
    out_kegg_modules_df.to_csv(os.path.join(output,
                                            "%s.kegg.modules.txt" % prefix),
                               sep='\t',
                               float_format="%d",
                               na_rep=0,
                               index_label="#MODULE ID")
    out_kegg_modules_coverage.to_csv(os.path.join(
        output, "%s.kegg.modules.coverage.txt" % prefix),
                                     sep='\t',
                                     float_format="%f",
                                     na_rep=0,
                                     index_label="#MODULE ID")
    out_kegg_pathways_df.to_csv(os.path.join(output,
                                             "%s.kegg.pathways.txt" % prefix),
                                sep='\t',
                                float_format="%d",
                                na_rep=0,
                                index_label="#PATHWAY ID")
    out_kegg_pathways_coverage.to_csv(os.path.join(
        output, "%s.kegg.pathways.coverage.txt" % prefix),
                                      sep='\t',
                                      float_format="%f",
                                      na_rep=0,
                                      index_label="#PATHWAY ID")
Example #15
def normalize_by_median_depth(df):
    logger.debug("Normalizing to median depth")
    return df.div(df.sum(axis=0).div(df.sum(axis=0).median()),
                  axis=1).round().astype(int)
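
After this normalization every sample column sums to (approximately, because of rounding) the median of the original column sums. A quick check on toy data, assuming the function above is in scope with a configured logger:

import pandas as pd

df = pd.DataFrame({'s1': [10, 10], 's2': [30, 10], 's3': [50, 50]})
# column sums are 20, 40, 100, so the median depth is 40
print(normalize_by_median_depth(df).sum(axis=0))
# every column now sums to 40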