def _load_pickle(path):
    """Return the object unpickled from *path*, closing the file afterwards."""
    # NOTE: pickle is only safe on trusted input; these are expected to be
    # checkpoints written locally by the pipeline itself.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def _dump_pickle(obj, path):
    """Pickle *obj* to *path*, making sure the file is flushed and closed."""
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle)


def process_one_species(species_name):
    """Find SNP runs for every sample of one species and checkpoint them.

    Reads ``allele_counts_map.pickle``, ``found_samples.pickle`` and
    ``passed_sites_map.pickle`` from
    ``<analysis_directory>/within_hosts_checkpoints/<species_name>/``
    (the inline comment in the original says these come from
    parse_snps_to_pickle.py) and writes two pickles to the same directory:

    * ``all_runs_map.pickle``: sample index ->
      ``(runs, starts, ends, site_counts)``
    * ``snp_counts_map.pickle``: sample index -> total SNP count

    If ``all_runs_map.pickle`` already exists the species is reported as
    processed and nothing is recomputed.  Samples for which
    ``HGT_utils.find_single_host_relative_snps`` returns ``None`` are
    reported and left out of both maps.

    Args:
        species_name: name of the species directory to process.

    Returns:
        None.
    """
    t0 = time.time()
    data_dir = os.path.join(config.analysis_directory,
                            "within_hosts_checkpoints/")
    species_dir = os.path.join(data_dir, species_name)
    runs_path = os.path.join(species_dir, "all_runs_map.pickle")

    # Bail out before loading anything if the result is already on disk.
    if os.path.exists(runs_path):
        print('{} already processed'.format(species_name))
        return

    # load data first; computed from parse_snps_to_pickle.py
    all_genes = core_gene_utils.get_sorted_core_genes(species_name)
    _, sfs_map = parse_midas_data.parse_within_sample_sfs(
        species_name, allowed_variant_types=set(['4D']))
    allele_counts_map = _load_pickle(
        os.path.join(species_dir, "allele_counts_map.pickle"))
    found_samples = _load_pickle(
        os.path.join(species_dir, "found_samples.pickle"))
    passed_sites_map = _load_pickle(
        os.path.join(species_dir, "passed_sites_map.pickle"))
    print("Finish loading data for {} at {} min".format(
        species_name, (time.time() - t0)/60))

    counts_map = dict()
    runs_map = dict()
    for sample_idx, sample_id in enumerate(found_samples):
        gene_snp_map = HGT_utils.find_single_host_relative_snps(
            sample_idx, found_samples, allele_counts_map, sfs_map)
        if gene_snp_map is None:
            print("Sample {} has no clear peak".format(sample_id))
            continue

        all_gene_counts = HGT_utils.get_gene_snp_vector(
            gene_snp_map, all_genes)
        counts_map[sample_idx] = sum(all_gene_counts)
        runs, starts, ends = HGT_utils.find_runs(all_gene_counts)

        # Now count the number of passed sites for each run.
        # NOTE(review): sample_idx is passed twice on purpose in the original;
        # presumably the helper takes a pair of sample indices -- confirm.
        passed_site_vec = get_passed_site_vector(
            passed_sites_map, all_genes, sample_idx, sample_idx)
        # Each run covers the inclusive index range [start, end].
        site_counts = np.array([sum(passed_site_vec[start:end+1])
                                for (start, end) in zip(starts, ends)])

        # Keep the runs together with their boundaries and passed-site counts.
        runs_map[sample_idx] = (runs, starts, ends, site_counts)

    # save data
    _dump_pickle(runs_map, runs_path)
    _dump_pickle(counts_map,
                 os.path.join(species_dir, "snp_counts_map.pickle"))
    print("Finish saving data for {} at {} min".format(
        species_name, (time.time() - t0)/60))
def process_one_species(species_name):
    """Plot and save the major-allele SFS of every high-coverage sample.

    For each sample returned by
    ``diversity_utils.calculate_highcoverage_samples`` the binned 4D
    site-frequency spectrum (``folding='major'``) is drawn as a bar chart,
    the peaks reported by ``HGT_utils._find_sfs_peaks_and_cutoff`` are
    marked, and the figure is written to
    ``<analysis_directory>/allele_freq/<species_name>/<num_peaks>/<sample>.png``.

    If the species directory under ``allele_freq`` already exists, the
    species is reported as processed and nothing is plotted.

    Args:
        species_name: name of the species to plot.

    Returns:
        None.
    """
    species_dir = os.path.join(config.analysis_directory, 'allele_freq',
                               species_name)
    if os.path.exists(species_dir):
        print("{} already processed".format(species_name))
        return

    _, sfs_map = parse_midas_data.parse_within_sample_sfs(
        species_name, allowed_variant_types=set(['4D']))
    highcoverage_samples = list(
        diversity_utils.calculate_highcoverage_samples(species_name))

    for sample in highcoverage_samples:
        all_fs, all_pfs = sfs_utils.calculate_binned_sfs_from_sfs_map(
            sfs_map[sample], folding='major')
        # Bin width, taken from the first two bin positions.
        df = all_fs[1] - all_fs[0]

        # For peak finding, only use the polymorphic sites
        polymorphic = all_fs < 0.95
        pfs = all_pfs[polymorphic]
        fs = all_fs[polymorphic]

        # Find the max peak size
        _, between_sites, total_sites = \
            sfs_utils.calculate_polymorphism_rates_from_sfs_map(
                sfs_map[sample])
        # NOTE(review): if no bin falls in (0.2, 0.5) this divides by zero;
        # behaviour is kept as in the original -- confirm the expected range
        # of fs for folding='major'.
        between_line = between_sites*1.0 / \
            total_sites/((fs > 0.2) & (fs < 0.5)).sum()
        pmax = np.max([pfs[(fs > 0.1) & (fs < 0.95)].max(), between_line])
        peak_idx, cutoff = HGT_utils._find_sfs_peaks_and_cutoff(fs, pfs, pmax)
        num_peaks = len(peak_idx)

        # Now plot and save the figure.  The figure is closed in ``finally``
        # so a failure on one sample does not leak open figures.
        fig = plt.figure()
        try:
            ax = fig.gca()
            ax.set_xlim([0.50, 1.00])
            ax.set_ylim([0, pmax*3])
            ax.bar((all_fs-df/2), all_pfs, width=df)
            ax.plot(fs[peak_idx]-df/2, pfs[peak_idx],
                    'rx', label='peaks detected')
            ax.set_xlabel('Major allele freq')
            # Explicit None test, matching how cutoffs are checked elsewhere
            # in this file (a missing cutoff is reported as None).
            if cutoff is not None:
                ax.axvspan(min(fs), cutoff, alpha=0.1,
                           color='red', label='SNP sites')
            ax.legend()

            # Figures are grouped in sub-directories by number of peaks.
            path = os.path.join(species_dir, str(num_peaks))
            if not os.path.exists(path):
                os.makedirs(path)
            fig.savefig(os.path.join(path, sample + '.png'))
        finally:
            plt.close(fig)
def get_single_peak_sample_mask(species_name):
    """
    Compute a mask that keeps only samples suitable for within-host analysis.

    A sample needs to be 1) well covered (high coverage and not in the
    within-host blacklist) and 2) have a single clean peak: it must be listed
    in the ``allele_freq/<species_name>/1`` directory and
    ``HGT_utils.find_sfs_peaks_and_cutoff`` must return a cutoff that is not
    None for it.

    Args:
        species_name: name of the species to filter.

    Returns:
        A tuple ``(mask, sample_names, good_cutoffs)``:
        ``mask`` is a boolean array aligned with ``sample_names``;
        ``sample_names`` is whatever ``get_sample_names`` returned;
        ``good_cutoffs`` is a float array with one cutoff per True entry of
        ``mask`` (empty when the single-peak directory does not exist or no
        sample passes).
    """
    sample_names = get_sample_names(species_name)
    single_peak_dir = os.path.join(config.analysis_directory,
                                   'allele_freq', species_name, '1')
    if not os.path.exists(single_peak_dir):
        print("No single peak samples found for {}".format(species_name))
        mask = np.zeros(len(sample_names)).astype(bool)
        return mask, sample_names, np.array([])

    blacklist = set(HGT_utils.get_within_host_bad_samples(species_name))
    highcoverage_samples = set(
        diversity_utils.calculate_highcoverage_samples(species_name))

    # File names in the directory are "<sample>.<ext>"; skip hidden files.
    single_peak_samples = set([
        f.split('.')[0] for f in os.listdir(single_peak_dir)
        if not f.startswith('.')
    ])
    # Parenthesised to make the precedence explicit ("-" binds tighter
    # than "&"); the resulting set is the same either way.
    allowed_samples = single_peak_samples & (highcoverage_samples - blacklist)
    mask = np.isin(sample_names, list(allowed_samples))

    # filter samples with a clean single peak
    _, sfs_map = parse_midas_data.parse_within_sample_sfs(
        species_name, allowed_variant_types=set(['4D']))
    # NOTE(review): boolean indexing assumes sample_names is a numpy array.
    results = [
        HGT_utils.find_sfs_peaks_and_cutoff(sample, sfs_map)
        for sample in sample_names[mask]
    ]
    cutoffs = np.array([res[1] for res in results])
    # dtype=bool is required: with no candidate samples the list is empty and
    # numpy would otherwise build a float array, which cannot be used as an
    # index below.
    clean_peak_mask = np.array(
        [cutoff is not None for cutoff in cutoffs], dtype=bool)
    # Narrow the True entries of the mask down to the clean-peak samples.
    mask[mask] = clean_peak_mask
    good_cutoffs = cutoffs[clean_peak_mask]
    return mask, sample_names, good_cutoffs.astype(float)
temporal_change_directory, species_name) output_file = gzip.open(intermediate_filename, "w") # header! output_file.write(", ".join([ 'Species', 'Sample1', 'Sample2', 'Type', 'L', 'Perr', 'Change1', '...' ])) output_file.write("\n") for species_name in good_species_list: sample_coverage_map = parse_midas_data.parse_sample_coverage_map( species_name) sys.stderr.write("Loading SFSs for %s...\t" % species_name) samples, sfs_map = parse_midas_data.parse_within_sample_sfs( species_name, allowed_variant_types=set(['1D', '2D', '3D', '4D'])) sys.stderr.write("Done!\n") sys.stderr.write("Loading temporal samples...\n") # Only plot samples above a certain depth threshold that are involved in timecourse snp_samples = diversity_utils.calculate_temporal_samples(species_name) # On purpose looking at non-consecutive pairs too # (restriction to consecutive pairs is later) same_sample_idxs, same_subject_idxs, diff_subject_idxs = sample_utils.calculate_nonconsecutive_ordered_subject_pairs( sample_order_map, snp_samples) if len(same_subject_idxs[0]) < min_sample_size: sys.stderr.write("Not enough temporal samples!\n") continue
divergence_matrices[species_name] = snp_substitution_matrix between_divergences[species_name] = [] for i in xrange(0, divergence_matrices[species_name].shape[0]): for j in xrange(i + 1, divergence_matrices[species_name].shape[0]): if divergence_matrices[species_name][i, j] >= 0: between_divergences[species_name].append( divergence_matrices[species_name][i, j]) between_divergences[species_name] = numpy.array( between_divergences[species_name]) # Load SNP information for species_name sys.stderr.write("Loading SFSs for %s...\t" % species_name) sfs_samples, sfs_map = parse_midas_data.parse_within_sample_sfs( species_name) sys.stderr.write("Done!\n") highcoverage_samples = diversity_utils.calculate_highcoverage_samples( species_name) desired_samples = snp_samples within_polymorphisms[species_name] = [] for sample in desired_samples: within_sites, between_sites, total_sites = sfs_utils.calculate_polymorphism_rates_from_sfs_map( sfs_map[sample]) within_polymorphisms[species_name].append(within_sites * 1.0 / total_sites) species_names = [] sample_sizes = []
species_name) median_coverages = numpy.array([ stats_utils.calculate_nonzero_median_from_histogram( sample_coverage_histogram) for sample_coverage_histogram in sample_coverage_histograms ]) sample_coverage_map = { samples[i]: median_coverages[i] for i in xrange(0, len(samples)) } # Load pi information for species_name sys.stderr.write("Loading within-sample diversity for %s...\n" % species_name) samples, site_map = parse_midas_data.parse_within_sample_sfs( species_name, allowed_variant_types=set(['4D']), allowed_genes=core_genes, debug=debug) sys.stderr.write("Done!\n") median_coverages = numpy.array( [sample_coverage_map[samples[i]] for i in xrange(0, len(samples))]) print[len(site_map[i].keys()) for i in xrange(0, len(site_map))] sys.exit(0) # Only plot samples above a certain depth threshold that are "haploids" snp_samples = samples[(median_coverages >= min_coverage) * (pis <= 1e-03)] num_haploids = len(snp_samples)