def __init__(self, species_name, allowed_variants=None, clade_cutoff=None, close_pair_cutoff=1e-3):
        """
        Wrapper over DataHoarder to provide pileup specific functions

        :param species_name: species to load (forwarded to DataHoarder)
        :param allowed_variants: variant types to keep; defaults to ['4D'].
            (None default avoids the shared-mutable-default-argument pitfall;
            passing a list explicitly still works as before.)
        :param clade_cutoff: Only for B vulgatus to select the major clade
        :param close_pair_cutoff: For reducing overcounting of sharing
        """
        if allowed_variants is None:
            allowed_variants = ['4D']
        self.dh = parallel_utils.DataHoarder(species_name, mode='QP', allowed_variants=allowed_variants)
        self.good_chromo = self.dh.chromosomes[self.dh.general_mask]

        div_dir = os.path.join(config.analysis_directory, 'pairwise_divergence', 'between_hosts',
                               '%s.csv' % species_name)
        self.div_mat = np.loadtxt(div_dir, delimiter=',')

        # a cutoff of 1 effectively keeps all samples in one clade when no
        # explicit cutoff is supplied (divergences are << 1)
        clade_cutoff = clade_cutoff if clade_cutoff else 1
        # form first order clusters using clade divergence cutoff
        d = close_pair_utils.get_clusters_from_pairwise_matrix(self.div_mat, threshold=clade_cutoff)
        # BUG FIX: np.argmax over a lazy `map` object does not work on Python 3;
        # materialize the cluster sizes first. Cluster ids are assumed to be
        # 1-indexed in dict insertion order — TODO confirm against
        # get_clusters_from_pairwise_matrix.
        clade_cluster = 1 + np.argmax([len(v) for v in d.values()])  # keep the largest clade
        clade_samples = d[clade_cluster]

        single_subject_samples = self.dh.get_single_subject_idxs()
        self.good_samples = np.intersect1d(single_subject_samples, clade_samples)

        self.close_pair_cutoff = close_pair_cutoff
        # second-order clustering restricted to the retained samples, used to
        # de-duplicate sharing among very close pairs
        self.cluster_dict = close_pair_utils.get_clusters_from_pairwise_matrix(
            self.div_mat[self.good_samples, :][:, self.good_samples], threshold=close_pair_cutoff)
def compute_one_species_within_host(species_name):
    """Compute per-sample within-host divergence and save it as a csv.

    Skips the species entirely if the output file already exists.
    """
    save_path = os.path.join(config.analysis_directory, "pairwise_divergence",
                             "within_hosts", "%s.csv" % species_name)
    if os.path.exists(save_path):
        print('{} has already been processed'.format(species_name))
        return
    dh = parallel_utils.DataHoarder(species_name, mode="within")
    sample_count = dh.snp_arr.shape[1]
    divergences = np.zeros(sample_count)
    for sample_idx in range(sample_count):
        snp_vec, _ = dh.get_snp_vector(sample_idx)
        # divergence = fraction of covered sites carrying a snp
        divergences[sample_idx] = np.sum(snp_vec) / float(len(snp_vec))
    np.savetxt(save_path, divergences, delimiter=',')
def main():
    """Export fineSTRUCTURE haplotype files for every species with zarr data."""
    start_time = time.time()
    species_dir = os.path.join(config.analysis_directory, 'zarr_snps')
    for species_name in os.listdir(species_dir):
        # skip hidden entries such as .DS_Store
        if species_name.startswith('.'):
            continue
        elapsed_min = (time.time() - start_time) / 60
        print("Saving for {} at {} min".format(species_name, elapsed_min))

        dh = parallel_utils.DataHoarder(species_name)
        # NOTE: hard-coded local output location — only works on this machine
        path_to_file = os.path.join(
            '/Users/Device6/Documents/Research/bgoodlab/', 'fineSTRUCTURE',
            'microbiome', species_name)
        dh.save_haplotype_fs(path_to_file)
def compute_one_species(species_name, debug=False):
    """Compute the symmetric between-host divergence matrix and save as csv.

    :param species_name: species whose QP samples are compared pairwise
    :param debug: when True, only the first 10 samples are processed
    """
    save_path = os.path.join(config.analysis_directory, "pairwise_divergence",
                             "between_hosts", "%s.csv" % species_name)
    if os.path.exists(save_path):
        print('{} has already been processed'.format(species_name))
        return
    dh = parallel_utils.DataHoarder(species_name)
    n = 10 if debug else dh.snp_arr.shape[1]
    div_mat = np.zeros((n, n))
    # fill the upper triangle and mirror it so the matrix stays symmetric
    for row in range(n):
        for col in range(row + 1, n):
            snp_vec, _ = parallel_utils.get_two_QP_sample_snp_vector(
                dh.snp_arr, dh.covered_arr, (row, col))
            pair_div = np.sum(snp_vec) / float(len(snp_vec))
            div_mat[row, col] = pair_div
            div_mat[col, row] = pair_div
    np.savetxt(save_path, div_mat, delimiter=',')
            local_divs[genome_divs <= clade_cutoff], bins=bins)
        between_counts, _ = np.histogram(
            local_divs[genome_divs > clade_cutoff], bins=bins)
        divs = np.concatenate([divs, divs])
        counts = np.concatenate([within_counts, between_counts])
    else:
        counts, _ = np.histogram(local_divs, bins=bins)
    return divs, counts


# Build the empirical local-divergence histograms used by the HMM, one csv
# per species found under the zarr snp directory.
base_dir = 'zarr_snps'
for species_name in os.listdir(os.path.join(config.data_directory, base_dir)):
    # skip hidden entries such as .DS_Store
    if species_name.startswith('.'):
        continue
    print('Processing ' + species_name)
    # B. vulgatus has two well-separated clades, so its histogram is split
    # at a hand-picked divergence cutoff
    is_b_vulgatus = (species_name == 'Bacteroides_vulgatus_57955')
    separate_clades = is_b_vulgatus
    clade_cutoff = 0.03 if is_b_vulgatus else None
    dh = parallel_utils.DataHoarder(species_name, mode="QP")
    local_divs, genome_divs = sample_blocks(dh)
    divs, counts = get_empirical_div_dist(local_divs, genome_divs,
                                          num_bins=40,
                                          separate_clades=separate_clades,
                                          clade_cutoff=clade_cutoff)
    save_path = os.path.join(config.hmm_data_directory, species_name + '.csv')
    np.savetxt(save_path, np.vstack([divs, counts]))
# Example #6 (listing separator from the scraped source; original text was "예제 #6" followed by a stray "0")
def process_one_species(species_name, div_cutoff, block_size, debug=False):
    """
    Detect close-pair recombination transfers for one species with a HMM.

    :param species_name:
    :param div_cutoff: Hand annotated cutoff for first filtering of pairs
    :param block_size: Coarse-graining length scale for the genome
    :param debug: Flag to determine whether running the debug version
    :return: a DataFrame for first pass statistics, and a dict for second pass
        statistics (including clonal snps etc); None if too few pairs survive
        the divergence cutoff
    """
    dh = parallel_utils.DataHoarder(species_name, mode="QP")
    good_chromo = dh.chromosomes[
        dh.general_mask]  # will be used in contig-wise transfer computation

    div_dir = os.path.join(config.analysis_directory, 'pairwise_divergence',
                           'between_hosts', '%s.csv' % species_name)
    div_mat = np.loadtxt(div_dir, delimiter=',')
    pairs = close_pair_utils.find_close_pairs(div_cutoff, div_mat,
                                              dh.get_single_subject_idxs())
    logging.info("After divergence cutoff, {} has {} pairs".format(
        species_name, len(pairs)))
    if len(pairs) < 5:
        logging.info("Too few pairs, skipping")
        return None

    FIRST_PASS_BLOCK_SIZE = config.first_pass_block_size
    logging.info("Coarse-graining the genome into blocks of size {}".format(
        FIRST_PASS_BLOCK_SIZE))
    first_pass_stats = close_pair_utils.process_close_pairs_first_pass(
        dh, pairs, FIRST_PASS_BLOCK_SIZE)
    mean_total_blocks = first_pass_stats['num_total_blocks'].mean()

    # use num of snp block as an estimate for clonal fraction
    # throw away pairs with too many blocks covered
    snp_block_cutoff = (1 - CLONAL_FRAC_CUTOFF) * mean_total_blocks
    second_pass_stats = first_pass_stats[
        first_pass_stats['snp_blocks'] < snp_block_cutoff].copy()
    good_pairs = second_pass_stats['pair_idxs']
    if debug:
        # NOTE(review): clade_cutoff_bin is set only in debug mode — confirm
        # this gating is intended (comment says it is for B vulgatus clades)
        clade_cutoff_bin = config.empirical_histogram_bins  # for B vulgatus separate clade
    else:
        clade_cutoff_bin = None

    logging.info("After first pass, {} has {} pairs".format(
        species_name, len(good_pairs)))
    mean_genome_len = mean_total_blocks * FIRST_PASS_BLOCK_SIZE
    logging.info("Mean genome length is {} sites".format(mean_genome_len))

    logging.info("Using HMM to detect transfers")
    logging.info("Block size is {}".format(block_size))
    cphmm = init_hmm(species_name, mean_genome_len, block_size)

    # accumulators for the per-pair HMM results
    dat = dict()
    dat['starts'] = []
    dat['ends'] = []
    dat['clonal snps'] = []
    dat['pairs'] = list(good_pairs)
    processed_count = 0
    for pair in good_pairs:
        snp_vec, snp_mask = dh.get_snp_vector(pair)
        chromosomes = good_chromo[snp_mask]
        try:
            starts, ends, clonal_snp = \
                close_pair_utils.fit_and_count_transfers_all_chromosomes(
                    snp_vec,
                    chromosomes,
                    cphmm,
                    block_size,
                    clade_cutoff_bin=clade_cutoff_bin)
        except Exception:
            # BUG FIX: the old handler used a bare `except:` and re-raised
            # sys.exc_info()[0] — the exception *class*, not the instance —
            # which discarded the message and original traceback. A bare
            # `raise` preserves both; the offending pair is printed first.
            print(pair)
            print(traceback.format_exc())
            raise
        dat['starts'].append(starts)
        dat['ends'].append(ends)
        dat['clonal snps'].append(clonal_snp)

        processed_count += 1
        if processed_count % 100 == 0:
            logging.info("Finished %d out of %d pairs" %
                         (processed_count, len(good_pairs)))
    return first_pass_stats, dat
# Example #7 (listing separator from the scraped source; original text was "예제 #7" followed by a stray "0")
def plot_for_one_species(ax,
                         species_name,
                         num_to_plot,
                         normalization=True,
                         mode='QP'):
    """
    Plot survival curves of identical-run lengths for one species.

    Draws one reversed-cumulative histogram per randomly chosen sample
    (mode='within') or sample pair (mode='QP') onto *ax*.

    :param ax: matplotlib Axes to draw on
    :param species_name: species whose zarr snp data is loaded
    :param num_to_plot: number of samples/pairs to sample and plot
    :param normalization: if True, run lengths are scaled by divergence so
        curves are comparable across pairs with different divergences
    :param mode: 'QP' (between hosts) or 'within' (within host)
    """
    color_list = sns.color_palette(palette='colorblind')
    between_color = color_list[0]
    within_color = color_list[4]
    # mode can either be 'QP' or 'within'
    # loading data
    base_dir = os.path.join(config.data_directory, 'zarr_snps', species_name)
    if not os.path.exists(base_dir):
        print('No data found for {}'.format(species_name))
        return

    dh = parallel_utils.DataHoarder(species_name, mode)
    # filtering the arrays
    good_chromo = dh.chromosomes[
        dh.general_mask]  # will be used in contig-wise run computation

    # load same clade snp cutoff
    # TODO eventually want to use only divergence
    # BUG FIX: close the cutoff file instead of leaking the handle
    with open('./same_clade_snp_cutoffs.json', 'r') as cutoff_file:
        cutoffs = json.load(cutoff_file)
    if species_name not in cutoffs:
        lower_cutoff = 5
        upper_cutoff = 5e6
    else:
        lower_cutoff = cutoffs[species_name][
            0] or 0  # hacky way of assigning value to None
        upper_cutoff = cutoffs[species_name][1] or 5e6

    print("Finish loading for {}".format(species_name))

    ax.set_yscale('log')
    ax.set_xlabel('Normalized site counts')
    ax.set_ylabel('Survival Probability')
    if normalization:
        ax.set_xlim((0, 25))

    # prepare the list of sample pairs/samples to plot
    if mode == 'within':
        num_to_plot = min(num_to_plot, dh.snp_arr.shape[1])
        idxs = random.sample(range(dh.snp_arr.shape[1]), num_to_plot)
        # fade individual curves when many are drawn
        alpha = 0.1 if num_to_plot > 100 else 0.5
        color = within_color
    elif mode == 'QP':
        idxs = [random.sample(range(dh.snp_arr.shape[1]), 2)
                for _ in range(num_to_plot)]
        alpha = 0.1
        color = between_color
    else:
        raise ValueError("Mode has to be either QP or within")

    for idx in idxs:
        snp_vec, snp_mask = dh.get_snp_vector(idx)
        snp_count = np.sum(snp_vec)
        # skip pairs outside the same-clade snp-count window
        if (snp_count < lower_cutoff) or (snp_count > upper_cutoff):
            continue
        if normalization:
            div = snp_count / float(len(snp_vec))
        else:
            div = 1
        runs = parallel_utils.compute_runs_all_chromosomes(
            snp_vec, good_chromo[snp_mask])
        if len(runs) == 0:
            print("Divergence is %f, for a total of %d snps" %
                  (div, snp_count))
            continue
        # normalize by multiplying div
        data = runs * div
        plot_range = (0, max(data))
        # BUG FIX: the `normed` kwarg was removed in matplotlib 3.0;
        # `density=True` is the equivalent (available since matplotlib 2.1)
        _ = ax.hist(data,
                    range=plot_range,
                    density=True,
                    cumulative=-1,
                    bins=100,
                    histtype='step',
                    color=color,
                    alpha=alpha)
    # empty plot call just to register a legend entry in this mode's color
    ax.plot([], color=color, label=mode)
    return