def __init__(self, species_name, allowed_variants=None, clade_cutoff=None,
             close_pair_cutoff=1e-3):
    """
    Wrapper over DataHoarder to provide pileup specific functions

    :param species_name: name of the species to load (QP mode)
    :param allowed_variants: variant types to keep; defaults to ['4D']
    :param clade_cutoff: Only for B vulgatus to select the major clade
    :param close_pair_cutoff: For reducing overcounting of sharing
    """
    # BUG FIX: was a mutable default argument (allowed_variants=["4D"]);
    # use a None sentinel instead — behavior for callers is unchanged.
    if allowed_variants is None:
        allowed_variants = ["4D"]
    self.dh = parallel_utils.DataHoarder(species_name, mode='QP',
                                         allowed_variants=allowed_variants)
    self.good_chromo = self.dh.chromosomes[self.dh.general_mask]
    div_dir = os.path.join(config.analysis_directory, 'pairwise_divergence',
                           'between_hosts', '%s.csv' % species_name)
    self.div_mat = np.loadtxt(div_dir, delimiter=',')
    # a cutoff of 1 keeps everything in one clade when none is supplied
    clade_cutoff = clade_cutoff if clade_cutoff else 1
    # form first order clusters using clade divergence cutoff
    d = close_pair_utils.get_clusters_from_pairwise_matrix(
        self.div_mat, threshold=clade_cutoff)
    # BUG FIX: np.argmax(map(...)) is broken on Python 3 — map() returns an
    # iterator, which numpy wraps as a 0-d object array so argmax always
    # returns 0. Materialize the lengths first.
    clade_cluster = 1 + np.argmax([len(v) for v in d.values()])
    # keep the largest clade
    clade_samples = d[clade_cluster]
    single_subject_samples = self.dh.get_single_subject_idxs()
    self.good_samples = np.intersect1d(single_subject_samples, clade_samples)
    self.close_pair_cutoff = close_pair_cutoff
    # clusters of "close pairs" within the retained samples, used to reduce
    # overcounting of shared transfers
    self.cluster_dict = close_pair_utils.get_clusters_from_pairwise_matrix(
        self.div_mat[self.good_samples, :][:, self.good_samples],
        threshold=close_pair_cutoff)
def compute_one_species_within_host(species_name):
    """Compute per-sample within-host divergence for one species and save it.

    Writes a 1-D array (one divergence value per sample) to a CSV under the
    analysis directory; skips the species entirely if that file exists.
    """
    save_path = os.path.join(config.analysis_directory, "pairwise_divergence",
                             "within_hosts", "%s.csv" % species_name)
    if os.path.exists(save_path):
        print('{} has already been processed'.format(species_name))
        return
    hoarder = parallel_utils.DataHoarder(species_name, mode="within")
    sample_count = hoarder.snp_arr.shape[1]
    divergences = np.zeros(sample_count)
    for sample_idx in range(sample_count):
        snp_vec, _ = hoarder.get_snp_vector(sample_idx)
        # divergence = fraction of covered sites that are SNPs
        divergences[sample_idx] = np.sum(snp_vec) / float(len(snp_vec))
    np.savetxt(save_path, divergences, delimiter=',')
def main():
    """Export fineSTRUCTURE haplotype files for every species with zarr snp data."""
    start_time = time.time()
    species_root = os.path.join(config.analysis_directory, 'zarr_snps')
    for species_name in os.listdir(species_root):
        # skip hidden entries such as .DS_Store
        if species_name.startswith('.'):
            continue
        elapsed_min = (time.time() - start_time) / 60
        print("Saving for {} at {} min".format(species_name, elapsed_min))
        hoarder = parallel_utils.DataHoarder(species_name)
        # NOTE(review): hard-coded machine-local output path — confirm before reuse
        out_path = os.path.join('/Users/Device6/Documents/Research/bgoodlab/',
                                'fineSTRUCTURE', 'microbiome', species_name)
        hoarder.save_haplotype_fs(out_path)
def compute_one_species(species_name, debug=False):
    """Compute the symmetric between-host divergence matrix for one species.

    Saves the matrix as a CSV under the analysis directory and skips the
    species if that file already exists. With debug=True only the first
    10 samples are processed.
    """
    save_path = os.path.join(config.analysis_directory, "pairwise_divergence",
                             "between_hosts", "%s.csv" % species_name)
    if os.path.exists(save_path):
        print('{} has already been processed'.format(species_name))
        return
    hoarder = parallel_utils.DataHoarder(species_name)
    sample_count = 10 if debug else hoarder.snp_arr.shape[1]
    div_mat = np.zeros((sample_count, sample_count))
    # fill the upper triangle, mirroring each value to keep the matrix symmetric
    for row in range(sample_count):
        for col in range(row + 1, sample_count):
            snp_vec, _ = parallel_utils.get_two_QP_sample_snp_vector(
                hoarder.snp_arr, hoarder.covered_arr, (row, col))
            pair_div = np.sum(snp_vec) / float(len(snp_vec))
            div_mat[row, col] = pair_div
            div_mat[col, row] = pair_div
    np.savetxt(save_path, div_mat, delimiter=',')
local_divs[genome_divs <= clade_cutoff], bins=bins) between_counts, _ = np.histogram( local_divs[genome_divs > clade_cutoff], bins=bins) divs = np.concatenate([divs, divs]) counts = np.concatenate([within_counts, between_counts]) else: counts, _ = np.histogram(local_divs, bins=bins) return divs, counts base_dir = 'zarr_snps' for species_name in os.listdir(os.path.join(config.data_directory, base_dir)): if species_name.startswith('.'): continue print('Processing ' + species_name) if species_name == 'Bacteroides_vulgatus_57955': separate_clades = True clade_cutoff = 0.03 else: separate_clades = False clade_cutoff = None dh = parallel_utils.DataHoarder(species_name, mode="QP") local_divs, genome_divs = sample_blocks(dh) divs, counts = get_empirical_div_dist(local_divs, genome_divs, num_bins=40, separate_clades=separate_clades, clade_cutoff=clade_cutoff) save_path = os.path.join(config.hmm_data_directory, species_name + '.csv') np.savetxt(save_path, np.vstack([divs, counts]))
def process_one_species(species_name, div_cutoff, block_size, debug=False):
    """
    Run the close-pair transfer-detection pipeline for a single species.

    :param species_name:
    :param div_cutoff: Hand annotated cutoff for first filtering of pairs
    :param block_size: Coarse-graining length scale for the genome
    :param debug: Flag to determine whether running the debug version
    :return: a DataFrame for first pass statistics, and a dict for second
        pass statistics (including clonal snps etc); None if too few pairs
    """
    dh = parallel_utils.DataHoarder(species_name, mode="QP")
    good_chromo = dh.chromosomes[dh.general_mask]  # will be used in contig-wise transfer computation
    div_dir = os.path.join(config.analysis_directory, 'pairwise_divergence',
                           'between_hosts', '%s.csv' % species_name)
    div_mat = np.loadtxt(div_dir, delimiter=',')

    pairs = close_pair_utils.find_close_pairs(div_cutoff, div_mat,
                                              dh.get_single_subject_idxs())
    logging.info("After divergence cutoff, {} has {} pairs".format(
        species_name, len(pairs)))
    if len(pairs) < 5:
        logging.info("Too few pairs, skipping")
        return None

    FIRST_PASS_BLOCK_SIZE = config.first_pass_block_size
    logging.info("Coarse-graining the genome into blocks of size {}".format(
        FIRST_PASS_BLOCK_SIZE))
    first_pass_stats = close_pair_utils.process_close_pairs_first_pass(
        dh, pairs, FIRST_PASS_BLOCK_SIZE)

    mean_total_blocks = first_pass_stats['num_total_blocks'].mean()
    # use num of snp block as an estimate for clonal fraction
    # throw away pairs with too many blocks covered
    snp_block_cutoff = (1 - CLONAL_FRAC_CUTOFF) * mean_total_blocks
    second_pass_stats = first_pass_stats[
        first_pass_stats['snp_blocks'] < snp_block_cutoff].copy()
    good_pairs = second_pass_stats['pair_idxs']

    if debug:
        # good_pairs = good_pairs[:5]
        clade_cutoff_bin = config.empirical_histogram_bins  # for B vulgatus separate clade
    else:
        clade_cutoff_bin = None

    logging.info("After first pass, {} has {} pairs".format(
        species_name, len(good_pairs)))
    mean_genome_len = mean_total_blocks * FIRST_PASS_BLOCK_SIZE
    logging.info("Mean genome length is {} sites".format(mean_genome_len))

    logging.info("Using HMM to detect transfers")
    logging.info("Block size is {}".format(block_size))
    cphmm = init_hmm(species_name, mean_genome_len, block_size)

    dat = dict()
    dat['starts'] = []
    dat['ends'] = []
    # dat['T approxs'] = []
    dat['clonal snps'] = []
    dat['pairs'] = list(good_pairs)
    processed_count = 0
    for pair in good_pairs:
        snp_vec, snp_mask = dh.get_snp_vector(pair)
        chromosomes = good_chromo[snp_mask]
        try:
            # starts, ends, T_approx = close_pair_utils.fit_and_count_transfers_all_chromosomes(
            starts, ends, clonal_snp = close_pair_utils.fit_and_count_transfers_all_chromosomes(
                snp_vec, chromosomes, cphmm, block_size,
                clade_cutoff_bin=clade_cutoff_bin)
        except Exception:
            # BUG FIX: was a bare `except:` that re-raised sys.exc_info()[0] —
            # the exception *class*, not the instance — discarding the
            # original message and traceback. Print context, then re-raise
            # the original exception intact.
            print(pair)
            print(traceback.format_exc())
            raise
        dat['starts'].append(starts)
        dat['ends'].append(ends)
        # dat['T approxs'].append(T_approx)
        dat['clonal snps'].append(clonal_snp)
        processed_count += 1
        if processed_count % 100 == 0:
            logging.info("Finished %d out of %d pairs"
                         % (processed_count, len(good_pairs)))
    return first_pass_stats, dat
def plot_for_one_species(ax, species_name, num_to_plot, normalization=True,
                         mode='QP'):
    """Plot the survival distribution of identical-run lengths for one species.

    :param ax: matplotlib Axes to draw on
    :param species_name: species directory name under the zarr snp data
    :param num_to_plot: number of samples (within) or sample pairs (QP) to draw
    :param normalization: if True, scale run lengths by the pair's divergence
    :param mode: 'QP' (between-host pairs) or 'within' (within-host samples)
    """
    color_list = sns.color_palette(palette='colorblind')
    between_color = color_list[0]
    within_color = color_list[4]

    # loading data
    base_dir = os.path.join(config.data_directory, 'zarr_snps', species_name)
    if not os.path.exists(base_dir):
        print('No data found for {}'.format(species_name))
        return
    dh = parallel_utils.DataHoarder(species_name, mode)
    # filtering the arrays
    good_chromo = dh.chromosomes[dh.general_mask]  # will be used in contig-wise run computation

    # load same clade snp cutoff
    # TODO eventually want to use only divergence
    # BUG FIX: json.load(open(...)) leaked the file handle; use a context manager
    with open('./same_clade_snp_cutoffs.json', 'r') as cutoff_file:
        cutoffs = json.load(cutoff_file)
    if species_name not in cutoffs:
        lower_cutoff = 5
        upper_cutoff = 5e6
    else:
        lower_cutoff = cutoffs[species_name][0] or 0  # hacky way of assigning value to None
        upper_cutoff = cutoffs[species_name][1] or 5e6
    print("Finish loading for {}".format(species_name))

    ax.set_yscale('log')
    ax.set_xlabel('Normalized site counts')
    ax.set_ylabel('Survival Probability')
    if normalization:
        ax.set_xlim((0, 25))

    # prepare the list of sample pairs/samples to plot
    if mode == 'within':
        num_to_plot = min(num_to_plot, dh.snp_arr.shape[1])
        idxs = random.sample(range(dh.snp_arr.shape[1]), num_to_plot)
        # fade the curves more when many are drawn
        if num_to_plot > 100:
            alpha = 0.1
        else:
            alpha = 0.5
        color = within_color
    elif mode == 'QP':
        idxs = []
        for i in range(num_to_plot):
            idxs.append(random.sample(range(dh.snp_arr.shape[1]), 2))
        alpha = 0.1
        color = between_color
    else:
        raise ValueError("Mode has to be either QP or within")

    for idx in idxs:
        snp_vec, snp_mask = dh.get_snp_vector(idx)
        snp_count = np.sum(snp_vec)
        # skip pairs/samples outside the same-clade snp-count window
        if (snp_count < lower_cutoff) or (snp_count > upper_cutoff):
            continue
        if normalization:
            div = snp_count / float(len(snp_vec))
        else:
            div = 1
        runs = parallel_utils.compute_runs_all_chromosomes(
            snp_vec, good_chromo[snp_mask])
        if len(runs) == 0:
            print("Divergence is %f, for a total of %d snps" % (div, snp_count))
            continue
        # normalize by multiplying div
        data = runs * div
        plot_range = (0, max(data))
        # BUG FIX: `normed` was deprecated in matplotlib 2.1 and removed in
        # 3.6; `density=True` is the drop-in replacement.
        _ = ax.hist(data, range=plot_range, density=True, cumulative=-1,
                    bins=100, histtype='step', color=color, alpha=alpha)
    # single empty artist so the mode shows up once in the legend
    ax.plot([], color=color, label=mode)
    return