def get_allele_frequency_trajectories(pname, samples, fragment, qual_min=30, VERBOSE=0): '''Scan the reads of all samples and write to a single file''' if VERBOSE >= 1: print 'Getting allele frequency trajectories:', pname, fragment from hivwholeseq.patients.filenames import get_initial_reference_filename, \ get_mapped_to_initial_filename, get_allele_frequency_trajectories_filename, \ get_allele_count_trajectories_filename from hivwholeseq.utils.one_site_statistics import get_allele_counts_insertions_from_file, \ get_allele_counts_insertions_from_file_unfiltered, \ filter_nus refseq = SeqIO.read(get_initial_reference_filename(pname, fragment), 'fasta') # Prepare output data structures cos_traj = np.zeros((len(samples), len(alpha), len(refseq)), int) nus_traj = np.zeros((len(samples), len(alpha), len(refseq))) for it, sample in enumerate(samples): if VERBOSE >= 2: print pname, it, sample input_filename = get_mapped_to_initial_filename(pname, sample, fragment, type='bam') (counts, inserts) = get_allele_counts_insertions_from_file_unfiltered( input_filename, len(refseq), qual_min=qual_min, VERBOSE=VERBOSE) # Take the total counts, blending in the read types cou = counts.sum(axis=0) cos_traj[it] = cou # Take the filtered frequencies, blending in the read types nu = filter_nus(counts) nus_traj[it] = nu #FIXME: test, etc. return (cos_traj, nus_traj)
summary=summary) continue # Get coverage and counts counts = np.load( get_allele_counts_filename(data_folder, adaID, fragment)) if len(counts.shape) == 2: import warnings warnings.warn( 'Counts not divided by read type: will normalize instead of filter!' ) nu_filtered = 1.0 * counts / counts.sum(axis=0) else: # Filter the minor frequencies by comparing the read types nu_filtered = filter_nus(counts) # Write output write_frequency_files(data_folder, adaID, fragment, nu_filtered, VERBOSE=VERBOSE) if summary: import matplotlib.pyplot as plt was_interactive = plt.isinteractive() plt.ioff() plot_SFS_folded(data_folder, adaID, fragment,
# Go through the requested samples one at a time
for samplename, sample in samples.iterrows():
    sample = SampleSeq(sample)
    adaID = sample.adapter

    # With an explicit fragment selection, keep only those the sample has
    # (sorted); without one, take all generic regions of the sample
    if fragments:
        fragments_sample = sorted(set(fragments) & set(sample.regions_generic))
    else:
        fragments_sample = sample.regions_generic

    for fragment in fragments_sample:

        if submit:
            # Delegate this (adaID, fragment) pair to a forked job and do
            # no local work for it
            fork_self(seq_run, adaID, fragment, VERBOSE=VERBOSE)

        else:
            # Count alleles and insertions, then store them
            counts, inserts = get_allele_counts(data_folder, adaID, fragment,
                                                VERBOSE=VERBOSE)
            write_counts_files(data_folder, adaID, fragment,
                               counts, inserts,
                               VERBOSE=VERBOSE)

            if summary:
                plot_coverage(data_folder, adaID, fragment, counts,
                              VERBOSE=VERBOSE, savefig=True)

            # Optionally derive and store the filtered frequencies too
            if write_frequencies:
                nu_filtered = filter_nus(counts)
                write_frequency_files(data_folder, adaID, fragment, nu_filtered,
                                      VERBOSE=VERBOSE)

                if summary:
                    plot_SFS_folded(data_folder, adaID, fragment, nu_filtered,
                                    VERBOSE=VERBOSE, savefig=True)
def plot_minor_allele_frequency(data_folder, adaID, fragments, VERBOSE=0, savefig=False):
    '''Plot raw and filtered minor allele frequencies, one panel per fragment.

    Each panel has a logarithmic y axis and shows:
      - one curve per read type: minor allele counts divided by coverage,
      - "Filtered": the second-largest filtered frequency at each position,
      - "Detection limit": the inverse of the coverage summed over its
        first axis.

    Parameters:
       data_folder, adaID: forwarded to the filename helpers to locate the
                           input arrays; adaID also appears in the title
       fragments: sequence of fragment names; the subplot layout is looked up
                  by len(fragments) in a table of 6 entries
       VERBOSE: accepted but not used in this function
       savefig: if true, save the figure to file and close it; otherwise
                turn interactive mode on and show the figure

    Returns:
       None
    '''
    from hivwholeseq.sequencing.filenames import get_minor_allele_frequency_figure_filename as gff
    import matplotlib
    matplotlib.rcParams.update({'axes.labelsize': 20,
                                'text.fontsize': 20,
                                'legend.fontsize': 8,
                                'xtick.labelsize': 16,
                                'ytick.labelsize': 16,
                                'text.usetex': False})
    from matplotlib import cm
    import matplotlib.pyplot as plt

    def second_largest_per_position(nu):
        '''Second-largest entry of nu for each index of its last axis.'''
        second = np.zeros(nu.shape[-1])
        for pos, nu_at_pos in enumerate(nu.T):
            second[pos] = np.sort(nu_at_pos)[-2]
        return second

    # (rows, columns) of the subplot grid, indexed by len(fragments) - 1
    layouts = [(1, 1), (1, 2), (1, 3), (2, 2), (1, 5), (2, 3)]

    # Per-fragment data, keyed by fragment name
    covs = {}
    nus_minor = {}
    alls_minor = {}
    nus_filtered = {}
    nus_minor_filtered = {}
    for frag in fragments:
        cov = np.load(get_coverage_filename(data_folder, adaID, frag))
        covs[frag] = cov

        allele_counts = np.load(get_allele_counts_filename(data_folder, adaID, frag))
        _, minor_counts, _ = get_minor_allele_counts(allele_counts, n_minor=2)

        # Last axis of minor_counts: index 0 is stored as the allele
        # identity, index 1 is the count used for the frequency
        nus_minor[frag] = 1.0 * minor_counts[:, :, 1] / (cov + 1e-6)
        alls_minor[frag] = minor_counts[:, :, 0]

        # Prefer the stored filtered frequencies, recompute if unavailable
        try:
            filtered = np.load(get_allele_frequencies_filename(data_folder, adaID, frag))
        except IOError:
            filtered = filter_nus(allele_counts, cov)

        nus_filtered[frag] = filtered
        nus_minor_filtered[frag] = second_largest_per_position(filtered)

    # Set up the figure
    n_rows, n_cols = layouts[len(fragments) - 1]
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(13, 8))
    axs = axs.ravel() if len(fragments) > 1 else [axs]
    fig.suptitle('adapterID '+adaID, fontsize=20)

    labss = {'read1 f': 'read1 fwd', 'read1 r': 'read1 rev',
             'read2 f': 'read2 fwd', 'read2 r': 'read2 rev'}

    for i, fragment in enumerate(fragments):
        ax = axs[i]
        ax.set_yscale('log')
        ax.set_title(fragment)
        if i in (0, 3):
            ax.set_ylabel(r'$\nu$')
        if i > 2:
            ax.set_xlabel('Position')

        # Raw frequencies, one curve per read type
        for js, nu_readtype in enumerate(nus_minor[fragment]):
            color = cm.jet(int(255.0 * js / len(read_types)))
            ax.plot(nu_readtype, lw=1.5, c=color, label=labss[read_types[js]])
            ax.scatter(np.arange(len(nu_readtype)), nu_readtype, lw=1.5,
                       color=color)

        # Filtered frequencies
        nu_filt = nus_minor_filtered[fragment]
        ax.plot(nu_filt, lw=1.5, c='k', alpha=0.5, label='Filtered')
        ax.scatter(np.arange(len(nu_filt)), nu_filt, lw=1.5, c='k', alpha=0.5)

        # Inverse of the summed coverage
        total_coverage = covs[fragment].sum(axis=0)
        ax.plot(1.0 / total_coverage, lw=1.2, c='r', label='Detection limit')

        # NOTE: the width comes from the last read-type curve drawn above
        ax.set_xlim(-100, len(nu_readtype) + 100)

    # pyplot-level calls: they act on the current axes / figure
    plt.grid()
    plt.legend(loc='upper right')
    plt.tight_layout(rect=(0, 0, 1, 0.95))

    if not savefig:
        plt.ion()
        plt.show()
        return

    # NOTE: fragment is whatever the plotting loop left behind, i.e. the
    # last fragment plotted
    fig.savefig(gff(data_folder, adaID, fragment))
    plt.close(fig)
def plot_minor_allele_frequency_filtered(data_folder, adaID, fragments, VERBOSE=0, savefig=False):
    '''Plot the filtered minor allele frequency, one panel per fragment.

    For each fragment the filtered allele frequencies are loaded from file
    or, if that fails with an IOError, recomputed from the allele counts.
    The second-largest frequency at each position is then drawn on a
    logarithmic y axis.

    Parameters:
       data_folder, adaID: forwarded to the filename helpers to locate the
                           input arrays; adaID also appears in the title
       fragments: sequence of fragment names; the subplot layout is looked up
                  by len(fragments) in a table of 6 entries
       VERBOSE: accepted but not used in this function
       savefig: if true, save the figure to file and close it; otherwise
                turn interactive mode on and show the figure

    Returns:
       None
    '''
    from hivwholeseq.sequencing.filenames import get_minor_allele_frequency_figure_filename as gff
    import matplotlib
    matplotlib.rcParams.update({'axes.labelsize': 20,
                                'text.fontsize': 20,
                                'legend.fontsize': 8,
                                'xtick.labelsize': 16,
                                'ytick.labelsize': 16,
                                'text.usetex': False})
    from matplotlib import cm
    import matplotlib.pyplot as plt

    def second_largest_per_position(nu):
        '''Second-largest entry of nu for each index of its last axis.'''
        second = np.zeros(nu.shape[-1])
        for pos, nu_at_pos in enumerate(nu.T):
            second[pos] = np.sort(nu_at_pos)[-2]
        return second

    # Per-fragment data, keyed by fragment name
    covs = {}
    nus_minor_filtered = {}
    for frag in fragments:
        # The coverage is loaded and stored but not drawn in this plot
        covs[frag] = np.load(get_coverage_filename(data_folder, adaID, frag))

        # Prefer the stored filtered frequencies, recompute if unavailable
        try:
            filtered = np.load(get_allele_frequencies_filename(data_folder, adaID, frag))
        except IOError:
            allele_counts = np.load(get_allele_counts_filename(data_folder, adaID, frag))
            filtered = filter_nus(allele_counts)

        nus_minor_filtered[frag] = second_largest_per_position(filtered)

    # (rows, columns) of the subplot grid, indexed by len(fragments) - 1
    layouts = [(1, 1), (1, 2), (1, 3), (2, 2), (1, 5), (2, 3)]
    n_rows, n_cols = layouts[len(fragments) - 1]
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(13, 8))
    axs = axs.ravel() if len(fragments) > 1 else [axs]
    fig.suptitle('adapterID '+adaID, fontsize=20)

    for i, fragment in enumerate(fragments):
        ax = axs[i]
        ax.set_yscale('log')
        ax.set_title(fragment)
        if i in (0, 3):
            ax.set_ylabel(r'$\nu$')
        if i > 2:
            ax.set_xlabel('Position')

        # Filtered frequencies, as a line with markers on top
        nu_filt = nus_minor_filtered[fragment]
        ax.plot(nu_filt, lw=1.5, c='k', alpha=0.5, label='Filtered')
        ax.scatter(np.arange(len(nu_filt)), nu_filt, lw=1.5, c='k', alpha=0.5)

        ax.set_xlim(-100, len(nu_filt) + 100)

    plt.tight_layout(rect=(0, 0, 1, 0.95))

    if not savefig:
        plt.ion()
        plt.show()
        return

    # NOTE: fragment is whatever the plotting loop left behind, i.e. the
    # last fragment plotted
    fig.savefig(gff(data_folder, adaID, fragment, only_filtered=True))
    plt.close(fig)