def plot_drug_resistance_mutation_trajectories(pcode):
    '''
    Auxiliary check for potential drug resistance evolution in one patient.

    Opens a new figure, loads the patient identified by pcode and, for every
    entry in drug_muts[...]['mutations'] of the drug classes NNRTI and NRTI
    (looked up in the RT amino acid trajectories) and PI (looked up in the PR
    trajectories), computes one minus the frequency of the first amino acid
    of the entry at position pos - 1. Curves whose maximum exceeds 0.1 are
    plotted against p.dsi and labelled '<drug class> <pos>'.

    Original author's note: in RNA sequences, only p10 has drug resistance
    mutations in the last two samples.
    '''
    plt.figure()
    patient = Patient.load(pcode)

    # protein -> drug classes whose mutations are located in that protein
    classes_by_protein = (('RT', ['NNRTI', 'NRTI']), ('PR', ['PI']))
    for protein, class_names in classes_by_protein:
        aft = patient.get_allele_frequency_trajectories(protein, type='aa')
        for class_name in class_names:
            for listed_aa, position, _ in drug_muts[class_name]['mutations']:
                # frequency of everything except the listed amino acid
                non_listed = 1 - aft[:, alphaal.index(listed_aa), position - 1]
                if max(non_listed) > 0.1:
                    plt.plot(patient.dsi, non_listed, '-o',
                             label=class_name + ' ' + str(position))

    plt.legend(loc=2, ncol=2)
def plot_drug_resistance_mutation_trajectories(pcode):
    """
    Auxiliary check for potential drug resistance evolution in one patient.

    A new figure is created and the patient given by pcode is loaded. For the
    RT protein (drug classes NNRTI and NRTI) and then the PR protein (drug
    class PI), each mutation entry of drug_muts is turned into a curve equal
    to one minus the frequency of the entry's first amino acid at position
    pos - 1. A curve is drawn against p.dsi only if its maximum is above 0.1;
    its legend label is "<drug class> <pos>".

    Original author's note: in RNA sequences, only p10 has drug resistance
    mutations in the last two samples.
    """

    def draw(protein, class_names):
        # Plot all qualifying mutation curves of the given drug classes.
        freqs = p.get_allele_frequency_trajectories(protein, type="aa")
        for name in class_names:
            for ref_aa, site, _alt in drug_muts[name]["mutations"]:
                departure = 1 - freqs[:, alphaal.index(ref_aa), site - 1]
                if max(departure) > 0.1:
                    plt.plot(p.dsi, departure, "-o", label=name + " " + str(site))

    plt.figure()
    p = Patient.load(pcode)
    draw("RT", ["NNRTI", "NRTI"])
    draw("PR", ["PI"])
    plt.legend(loc=2, ncol=2)
def fitness_cost_mutation(region, data, aa_mutation_rates, pos, target_aa, nbootstraps=0):
    '''
    Determine the fitness cost associated with a particular amino acid
    mutation such as K103N.

    This requires specification of the target amino acid and a specific
    calculation of the mutation rate into that amino acid, which in turn
    requires the ancestral codon present in each individual patient.

    Parameters
    ----------
    region : key used to index data['init_codon'] and data['af_by_pat']
    data : mapping with entries 'init_codon' and 'af_by_pat'; both are indexed
        by region and then by patient. data['init_codon'][region][pat] is
        indexed by position, data['af_by_pat'][region][pat] is an array
        indexed as [amino acid, position].
    aa_mutation_rates : mapping from (codon, amino acid) to a mutation rate
    pos : position of the mutation (same coordinate for codons and frequencies)
    target_aa : amino acid the mutation leads to; must be contained in alphaal
    nbootstraps : number of bootstrap resamplings over patients; 0 disables
        bootstrapping

    Returns
    -------
    If nbootstraps is 0 (or otherwise falsy): a single cost estimate, nan if no
    patient has a positive mutation rate into target_aa.
    Otherwise: a list with the 5th, 25th, 50th, 75th and 95th percentile of
    the bootstrap estimates (five nan if no replicate produced an estimate).
    '''
    def s(pats):
        # calculate nu/mu for each patient with patient specific mutation
        # rates; patients whose mutation rate is not positive are skipped
        nu_over_mu = []
        for pat in pats:
            mu = aa_mutation_rates[(codons[pat][pos], target_aa)]
            if mu > 0:
                nu_over_mu.append(minor_af_by_pat[pat] / mu)
        if not nu_over_mu:
            return np.nan
        # return the inverse, i.e. essentially the harmonic mean; the mean is
        # floored at 0.01 so the estimate never exceeds 100.
        # NOTE(review): a nan mean also falls back to the floor because
        # max(0.01, nan) evaluates to 0.01 -- confirm this is intended.
        return 1.0 / max(0.01, np.mean(nu_over_mu))

    target_ii = alphaal.index(target_aa)
    codons = data['init_codon'][region]
    # frequency of the target amino acid relative to the first 20 rows of the
    # frequency array, for every patient that has a codon at this position
    minor_af_by_pat = {pat: x[target_ii, pos].sum(axis=0) / x[:20, pos].sum(axis=0)
                       for pat, x in data['af_by_pat'][region].items()
                       if pos in codons[pat]}
    # materialise the keys: they are indexed by integer when resampling below
    all_patients = list(minor_af_by_pat)

    if not nbootstraps:
        return s(all_patients)

    n_pat = len(all_patients)
    s_bs = []
    if n_pat:  # nothing to resample from otherwise; result is all nan
        for _ in range(nbootstraps):
            resampled = [all_patients[pi]
                         for pi in np.random.randint(n_pat, size=n_pat)]
            tmp_s = s(resampled)
            if not np.isnan(tmp_s):
                s_bs.append(tmp_s)

    percentiles = [5, 25, 50, 75, 95]
    if s_bs:
        return [np.percentile(s_bs, perc) for perc in percentiles]
    return [np.nan for perc in percentiles]
def plot_drug_resistance_mutations(data, aa_mutation_rates, fname=None):
    """Plot the frequency of drug resistance mutations.

    Builds a two-panel figure for the drug resistance mutations listed in the
    module-level ``drug_muts`` for the classes PI, NRTI, NNRTI and INI in
    region "pol":

    * lower panel: strip plot of the per-patient frequency of each mutation
      (values below 0.8e-4 are set to 0), plus one marker per mutation at
      y=2e-5 labelled with the number of patients whose frequency is not
      above 1e-4;
    * upper panel: one bar per mutation labelled "cost", computed as
      (mutation rate into the target amino acids from the HXB2 reference
      codon) / (mean frequency across patients, nan counted as 0).

    Parameters
    ----------
    data : mapping with entries "init_codon" and "af_by_pat", each indexed by
        region and then by patient.
    aa_mutation_rates : mapping keyed by (codon, amino acid) tuples.
    fname : if not None, the figure is saved as fname + ".svg", ".pdf" and
        ".png"; otherwise interactive mode is switched on and the figure is
        shown.

    Mutations whose non-nan frequencies sum to 0 are not plotted; they are
    printed per drug class as "Monomorphic:".

    NOTE(review): the nested ``filter`` calls passed to ``np.sum``, the array
    built from ``.values()``, ``xrange`` and ``iteritems`` look like they rely
    on Python 2 semantics -- confirm before running on Python 3.
    """
    import matplotlib.patches as patches

    fs = 16  # base font size
    region = "pol"
    pcodes = data["init_codon"][region].keys()

    fig, axs = plt.subplots(2, 1, gridspec_kw={"height_ratios": [1, 6]})
    ax = axs[1]  # lower panel: frequencies
    drug_afs_items = []  # ((cons_aa, pos, target_aa), {patient: frequency}) of plotted mutations
    mut_types = []  # cumulative number of plotted mutations after each drug class
    drug_classes = ["PI", "NRTI", "NNRTI", "INI"]
    for prot in drug_classes:
        drug_afs = {}
        drug_mut_rates = {}  # filled below but not read again in this function
        offset = drug_muts[prot]["offset"]
        for cons_aa, pos, target_aa in drug_muts[prot]["mutations"]:
            codons = {pat: data["init_codon"][region][pat][pos + offset] for pat in pcodes}
            mut_rates = {pat: np.sum([aa_mutation_rates[(codons[pat], aa)] for aa in target_aa]) for pat in pcodes}
            # summed frequency of all target amino acids, normalised by the
            # sum over the first 20 rows of the patient's frequency array
            freqs = {
                pat: np.sum(
                    [
                        data["af_by_pat"][region][pat][alphaal.index(aa), pos + offset]
                        / data["af_by_pat"][region][pat][:20, pos + offset].sum()
                        for aa in target_aa
                    ]
                )
                for pat in pcodes
            }
            drug_afs[(cons_aa, pos, target_aa)] = freqs
            drug_mut_rates[(cons_aa, pos, target_aa)] = mut_rates
        # keep, ordered by position, the mutations whose non-nan frequencies sum to > 0
        drug_afs_items.extend(
            filter(
                lambda x: np.sum(filter(lambda y: ~np.isnan(y), x[1].values())) > 0,
                sorted(drug_afs.items(), key=lambda x: x[0][1]),
            )
        )
        mut_types.append(len(drug_afs_items))
        # make list of all mutations whose non-nan frequencies sum to 0
        mono_muts = [
            "".join(map(str, x[0]))
            for x in filter(
                lambda x: np.sum(filter(lambda y: ~np.isnan(y), x[1].values())) == 0,
                sorted(drug_afs.items(), key=lambda x: x[0][1]),
            )
        ]
        print("Monomorphic:", prot, mono_muts)

    plt.ylim([1.1e-5, 1e-1])
    # vertical separators between drug classes
    for mi in mut_types[:-1]:
        plt.plot([mi - 0.5, mi - 0.5], plt.ylim(), c=(0.3, 0.3, 0.3), lw=3, alpha=0.5)
    ax.axhline(4e-5, c=(0.3, 0.3, 0.3), lw=3, alpha=0.5)
    # drug class names, centred over the block of mutations of that class
    for ni, prot in enumerate(drug_classes):
        plt.text(0.5 * (mut_types[ni] + (mut_types[ni - 1] if ni else 0)) - 0.5, 0.12, prot, fontsize=16, ha="center")

    # alternating grey background stripes, one per mutation
    for mi in range(max(mut_types)):
        c = 0.5 + 0.2 * (mi % 2)
        ax.add_patch(
            patches.Rectangle(
                (mi - 0.5, plt.ylim()[0]), 1.0, plt.ylim()[1], color=(c, c, c), alpha=0.2  # (x,y), width, height
            )
        )

    # plt.xticks(np.arange(len(all_muts)), ["".join(map(str, x)) for x in all_muts], rotation=60)
    # one row per patient, one column per plotted mutation (column name e.g. cons_aa + pos + target_aa)
    afdr = pd.DataFrame(
        np.array([x[1].values() for x in drug_afs_items]).T, columns=["".join(map(str, x[0])) for x in drug_afs_items]
    )
    afdr[afdr < 0.8e-4] = 0
    sns.stripplot(data=afdr, jitter=0.4, alpha=0.8, size=12, lw=1, edgecolor="white")

    # Add the number of missing points at the bottom of the plot, and the cost
    # at the top
    # The first five rows of afdr only serve as a template with the right
    # columns: all five are renamed and overwritten below. This needs at
    # least five rows (patients) in afdr.
    dd = afdr.iloc[[0, 1, 2, 3, 4]].copy()
    dd.index = ["x", "freq", "size", "cost", "mr"]
    dd.loc["x"] = np.arange(dd.shape[1])
    dd.loc["freq"] = 2e-5
    dd.loc["n"] = afdr.shape[0] - (afdr > 1e-4).sum(axis=0)
    dd.loc["size"] = dd.loc["n"] ** (1.4) + 13
    dd.loc["cost"] = 1.0 / afdr.fillna(0).mean(axis=0)
    dd.loc["mr"] = 0

    # NOTE: the first 6 mutations are in PR, the rest in RT
    # NOTE(review): the thresholds 6 and 6 + 5 + 4 below are hard-coded and not
    # derived from mut_types; columns from index 15 on use the IN sequence.
    # Confirm they still match the number of plotted mutations per class.
    import re
    from Bio.Seq import translate

    reference = HIVreference(refname="HXB2", load_alignment=False)
    seq_PR = reference.annotation["PR"].extract(reference.seq)
    seq_RT = reference.annotation["RT"].extract(reference.seq)
    seq_IN = reference.annotation["IN"].extract(reference.seq)
    murate = load_mutation_rates()["mu"]
    for i, mut in enumerate(dd.T.index):
        mr = 0
        if i < 6:
            seq_tmp = seq_PR
        elif i < 6 + 5 + 4:
            seq_tmp = seq_RT
        else:
            seq_tmp = seq_IN
        # split the column name back into its parts, e.g. "K103N" -> "K", "103", "N"
        aa_from, pos, aas_to = re.sub("([A-Z])(\d+)([A-Z]+)", r"\1_\2_\3", mut).split("_")
        cod = str(seq_tmp.seq[(int(pos) - 1) * 3 : int(pos) * 3])
        # sum the rates of all single-nucleotide changes of the reference codon
        # that lead to one of the target amino acids; nothing is added if the
        # reference codon does not translate to aa_from
        for pos_cod in xrange(3):
            for nuc in ["A", "C", "G", "T"]:
                codmut = list(cod)
                codmut[pos_cod] = nuc
                codmut = "".join(codmut)
                if (codmut != cod) and (translate(cod) == aa_from) and (translate(codmut) in aas_to):
                    mr += murate[cod[pos_cod] + "->" + nuc]

        dd.loc["cost", mut] *= mr
        dd.loc["mr", mut] = mr

    # marker at the bottom of the plot carrying the count dd.loc["n"]
    for im, (mutname, s) in enumerate(dd.T.iterrows()):
        ax.scatter(
            s["x"],
            s["freq"],
            s=s["size"] ** 2,
            alpha=0.8,
            edgecolor="white",
            facecolor=sns.color_palette("husl", afdr.shape[1])[im],
            lw=2,
        )
        ax.text(s["x"], s["freq"], str(int(s["n"])), fontsize=fs, ha="center", va="center")

    plt.yscale("log")
    plt.xticks(rotation=50)
    plt.ylabel("minor variant frequency", fontsize=fs)
    plt.tick_params(labelsize=fs * 0.8)
    for tick in ax.xaxis.get_major_ticks():
        tick.label.set_horizontalalignment("right")

    # Fitness cost at the top
    ax1 = axs[0]
    ax1.set_xlim(*ax.get_xlim())
    ax1.set_xticks(ax.get_xticks() + 0.5)
    ax1.set_xticklabels([])
    ax1.set_ylim(1e-3, 1)
    ax1.set_yticks([1e-3, 1e-2, 1e-1, 1])
    ax1.yaxis.set_tick_params(labelsize=fs * 0.8)
    ax1.set_yscale("log")
    ax1.set_ylabel("cost", fontsize=fs)
    for im, (mut, y) in enumerate(dd.loc["cost"].iteritems()):
        ax1.bar(im - 0.5, y, 1, color=sns.color_palette("husl", afdr.shape[1])[im])

    plt.tight_layout()

    if fname is not None:
        for ext in ["svg", "pdf", "png"]:
            plt.savefig(fname + "." + ext)
    else:
        plt.ion()
        plt.show()
# For region 'pol': per drug class in drug_muts, collect for every listed
# mutation the per-patient summed frequency of the mutant amino acids and the
# per-patient summed mutation rate into them.
# NOTE(review): the original indentation of this fragment was lost; the
# nesting below is reconstructed from the control flow. In particular the
# placement of plt.figure() (once per drug class) is an assumption -- confirm.
if region=='pol':
    for prot in drug_muts:
        drug_afs = {}        # (cons_aa, pos, muts) -> array of frequencies, one per patient
        drug_mut_rates = {}  # (cons_aa, pos, muts) -> array of mutation rates, one per patient
        offset = drug_muts[prot]['offset']
        all_muts = drug_muts[prot]['mutations']
        for cons_aa, pos, muts in all_muts:
            tmp = []
            tmp_muts = []
            for pcode in combined_af_by_pat[region]:
                drug_af = 0
                mut_rate = 0
                af_vec = combined_af_by_pat[region][pcode][:,pos+offset]
                tot=af_vec.sum()
                init_codon = initial_codons_by_pat[region][pcode][pos+offset]
                # the most frequent state differs from the listed consensus amino acid
                if af_vec.argmax()!=alphaal.index(cons_aa):
                    print('Doesn')
                if tot:
                    # accumulate over all mutant amino acids listed for this position
                    for aa in muts:
                        print(prot, pcode, pos, cons_aa)
                        mut_rate += aminoacid_mutation_rate(init_codon, aa, nuc_muts, doublehit=True)
                        drug_af+=af_vec[alphaal.index(aa)]/tot
                    tmp.append(drug_af)
                    tmp_muts.append(mut_rate)
                else:
                    # frequencies sum to zero for this patient: mark as missing
                    tmp.append(np.nan)
                    tmp_muts.append(np.nan)
            drug_afs[(cons_aa,pos,muts)] = np.array(tmp)
            drug_mut_rates[(cons_aa,pos,muts)] = np.array(tmp_muts)
        plt.figure()
def plot_drug_resistance_mutations(data, aa_mutation_rates, fname=None):
    '''Plot the frequency of drug resistance mutations.

    Builds a two-panel figure for the drug resistance mutations listed in the
    module-level drug_muts for the classes PI, NRTI, NNRTI and INI in region
    'pol':

    * lower panel: strip plot of the per-patient frequency of each mutation
      (values below 0.8e-4 are set to 0), plus one marker per mutation at
      y=2e-5 labelled with the number of patients whose frequency is not
      above 1e-4;
    * upper panel: one bar per mutation labelled 'cost', computed as
      (mutation rate into the target amino acids from the HXB2 reference
      codon) / (mean frequency across patients, nan counted as 0).

    Parameters
    ----------
    data : mapping with entries 'init_codon' and 'af_by_pat', each indexed by
        region and then by patient.
    aa_mutation_rates : mapping keyed by (codon, amino acid) tuples.
    fname : if not None, the figure is saved as fname + '.svg', '.pdf' and
        '.png'; otherwise interactive mode is switched on and the figure is
        shown.

    Mutations whose non-nan frequencies sum to 0 are not plotted; they are
    printed per drug class as 'Monomorphic:'.

    NOTE(review): the nested filter calls passed to np.sum, the array built
    from .values(), xrange and iteritems look like they rely on Python 2
    semantics -- confirm before running on Python 3.
    '''
    import matplotlib.patches as patches

    fs = 16  # base font size
    region = 'pol'
    pcodes = data['init_codon'][region].keys()

    fig, axs = plt.subplots(2, 1, gridspec_kw={'height_ratios': [1, 6]})
    ax = axs[1]  # lower panel: frequencies
    drug_afs_items = []  # ((cons_aa, pos, target_aa), {patient: frequency}) of plotted mutations
    mut_types = []  # cumulative number of plotted mutations after each drug class
    drug_classes = ['PI', 'NRTI', 'NNRTI', 'INI']
    for prot in drug_classes:
        drug_afs = {}
        drug_mut_rates = {}  # filled below but not read again in this function
        offset = drug_muts[prot]['offset']
        for cons_aa, pos, target_aa in drug_muts[prot]['mutations']:
            codons = {
                pat: data['init_codon'][region][pat][pos + offset]
                for pat in pcodes
            }
            mut_rates = {
                pat: np.sum(
                    [aa_mutation_rates[(codons[pat], aa)] for aa in target_aa])
                for pat in pcodes
            }
            # summed frequency of all target amino acids, normalised by the
            # sum over the first 20 rows of the patient's frequency array
            freqs = {pat:np.sum([data['af_by_pat'][region][pat][alphaal.index(aa), pos+offset]\
                        /data['af_by_pat'][region][pat][:20,pos+offset].sum() for aa in target_aa])
                        for pat in pcodes}
            drug_afs[(cons_aa, pos, target_aa)] = freqs
            drug_mut_rates[(cons_aa, pos, target_aa)] = mut_rates
        # keep, ordered by position, the mutations whose non-nan frequencies sum to > 0
        drug_afs_items.extend(
            filter(
                lambda x: np.sum(filter(lambda y: ~np.isnan(y), x[1].values())) > 0,
                sorted(drug_afs.items(), key=lambda x: x[0][1])))
        mut_types.append(len(drug_afs_items))
        #make list of all mutations whose non-nan frequencies sum to 0
        mono_muts = [
            ''.join(map(str, x[0]))
            for x in filter(
                lambda x: np.sum(filter(lambda y: ~np.isnan(y), x[1].values())) == 0,
                sorted(drug_afs.items(), key=lambda x: x[0][1]))
        ]
        print('Monomorphic:', prot, mono_muts)

    plt.ylim([1.1e-5, 1e-1])
    # vertical separators between drug classes
    for mi in mut_types[:-1]:
        plt.plot([mi - 0.5, mi - 0.5], plt.ylim(), c=(.3, .3, .3), lw=3, alpha=0.5)
    ax.axhline(4e-5, c=(.3, .3, .3), lw=3, alpha=0.5)
    # drug class names, centred over the block of mutations of that class
    for ni, prot in enumerate(drug_classes):
        plt.text(0.5 * (mut_types[ni] + (mut_types[ni - 1] if ni else 0)) - 0.5, 0.12, prot, fontsize=16, ha='center')

    # alternating grey background stripes, one per mutation
    for mi in range(max(mut_types)):
        c = 0.5 + 0.2 * (mi % 2)
        ax.add_patch(
            patches.Rectangle(
                (mi - 0.5, plt.ylim()[0]),
                1.0,
                plt.ylim()[1],  #(x,y), width, height
                color=(c, c, c),
                alpha=0.2))

    #plt.xticks(np.arange(len(all_muts)), ["".join(map(str, x)) for x in all_muts], rotation=60)
    # one row per patient, one column per plotted mutation (column name e.g. cons_aa + pos + target_aa)
    afdr = pd.DataFrame(
        np.array([x[1].values() for x in drug_afs_items]).T,
        columns=["".join(map(str, x[0])) for x in drug_afs_items])
    afdr[afdr < 0.8e-4] = 0
    sns.stripplot(data=afdr, jitter=0.4, alpha=0.8, size=12, lw=1, edgecolor='white')

    # Add the number of missing points at the bottom of the plot, and the cost
    # at the top
    # The first five rows of afdr only serve as a template with the right
    # columns: all five are renamed and overwritten below. This needs at
    # least five rows (patients) in afdr.
    dd = afdr.iloc[[0, 1, 2, 3, 4]].copy()
    dd.index = ['x', 'freq', 'size', 'cost', 'mr']
    dd.loc['x'] = np.arange(dd.shape[1])
    dd.loc['freq'] = 2e-5
    dd.loc['n'] = afdr.shape[0] - (afdr > 1e-4).sum(axis=0)
    dd.loc['size'] = dd.loc['n']**(1.4) + 13
    dd.loc['cost'] = 1.0 / afdr.fillna(0).mean(axis=0)
    dd.loc['mr'] = 0

    # NOTE: the first 6 mutations are in PR, the rest in RT
    # NOTE(review): the thresholds 6 and 6 + 5 + 4 below are hard-coded and not
    # derived from mut_types; columns from index 15 on use the IN sequence.
    # Confirm they still match the number of plotted mutations per class.
    import re
    from Bio.Seq import translate
    reference = HIVreference(refname='HXB2', load_alignment=False)
    seq_PR = reference.annotation['PR'].extract(reference.seq)
    seq_RT = reference.annotation['RT'].extract(reference.seq)
    seq_IN = reference.annotation['IN'].extract(reference.seq)
    murate = load_mutation_rates()['mu']
    for i, mut in enumerate(dd.T.index):
        mr = 0
        if i < 6:
            seq_tmp = seq_PR
        elif i < 6 + 5 + 4:
            seq_tmp = seq_RT
        else:
            seq_tmp = seq_IN
        # split the column name back into its parts, e.g. 'K103N' -> 'K', '103', 'N'
        aa_from, pos, aas_to = re.sub('([A-Z])(\d+)([A-Z]+)', r'\1_\2_\3', mut).split('_')
        cod = str(seq_tmp.seq[(int(pos) - 1) * 3:int(pos) * 3])
        # sum the rates of all single-nucleotide changes of the reference codon
        # that lead to one of the target amino acids; nothing is added if the
        # reference codon does not translate to aa_from
        for pos_cod in xrange(3):
            for nuc in ['A', 'C', 'G', 'T']:
                codmut = list(cod)
                codmut[pos_cod] = nuc
                codmut = ''.join(codmut)
                if (codmut != cod) and (translate(cod) == aa_from) and (translate(codmut) in aas_to):
                    mr += murate[cod[pos_cod] + '->' + nuc]

        dd.loc['cost', mut] *= mr
        dd.loc['mr', mut] = mr

    # marker at the bottom of the plot carrying the count dd.loc['n']
    for im, (mutname, s) in enumerate(dd.T.iterrows()):
        ax.scatter(
            s['x'],
            s['freq'],
            s=s['size']**2,
            alpha=0.8,
            edgecolor='white',
            facecolor=sns.color_palette('husl', afdr.shape[1])[im],
            lw=2, )
        ax.text(s['x'], s['freq'], str(int(s['n'])), fontsize=fs, ha='center', va='center')

    plt.yscale('log')
    plt.xticks(rotation=50)
    plt.ylabel('minor variant frequency', fontsize=fs)
    plt.tick_params(labelsize=fs * 0.8)
    for tick in ax.xaxis.get_major_ticks():
        tick.label.set_horizontalalignment('right')

    # Fitness cost at the top
    ax1 = axs[0]
    ax1.set_xlim(*ax.get_xlim())
    ax1.set_xticks(ax.get_xticks() + 0.5)
    ax1.set_xticklabels([])
    ax1.set_ylim(1e-3, 1)
    ax1.set_yticks([1e-3, 1e-2, 1e-1, 1])
    ax1.yaxis.set_tick_params(labelsize=fs * 0.8)
    ax1.set_yscale('log')
    ax1.set_ylabel('cost', fontsize=fs)
    for im, (mut, y) in enumerate(dd.loc['cost'].iteritems()):
        ax1.bar(im - 0.5, y, 1, color=sns.color_palette('husl', afdr.shape[1])[im])

    plt.tight_layout()

    if fname is not None:
        for ext in ['svg', 'pdf', 'png']:
            plt.savefig(fname + '.' + ext)
    else:
        plt.ion()
        plt.show()
# For the drug class `prot` (bound outside this fragment): collect for every
# listed mutation the per-patient summed frequency of the mutant amino acids
# and the per-patient summed mutation rate into them.
# NOTE(review): the original indentation of this fragment was lost; the
# nesting below is reconstructed from the control flow -- confirm.
drug_afs = {}        # (cons_aa, pos, muts) -> array of frequencies, one per patient
drug_mut_rates = {}  # (cons_aa, pos, muts) -> array of mutation rates, one per patient
offset = drug_muts[prot]['offset']
all_muts = drug_muts[prot]['mutations']
for cons_aa, pos, muts in all_muts:
    tmp = []
    tmp_muts = []
    for pcode in combined_af_by_pat[region]:
        drug_af = 0
        mut_rate = 0
        af_vec = combined_af_by_pat[region][pcode][:, pos + offset]
        tot = af_vec.sum()
        init_codon = initial_codons_by_pat[region][pcode][
            pos + offset]
        # the most frequent state differs from the listed consensus amino acid
        if af_vec.argmax() != alphaal.index(cons_aa):
            print('Doesn')
        if tot:
            # accumulate over all mutant amino acids listed for this position
            for aa in muts:
                print(prot, pcode, pos, cons_aa)
                mut_rate += aminoacid_mutation_rate(
                    init_codon, aa, nuc_muts, doublehit=True)
                drug_af += af_vec[alphaal.index(aa)] / tot
            tmp.append(drug_af)
            tmp_muts.append(mut_rate)
        else:
            # frequencies sum to zero for this patient: mark as missing
            tmp.append(np.nan)
            tmp_muts.append(np.nan)
    drug_afs[(cons_aa, pos, muts)] = np.array(tmp)
    drug_mut_rates[(cons_aa, pos, muts)] = np.array(tmp_muts)