def load(virus):
    """Return the escape dataset and resource locations for one virus.

    Supported values of ``virus`` are ``'h1'``, ``'h3'``, ``'bg505'``,
    ``'sarscov2'`` and ``'cov2rbd'``.  The matching loader is imported from
    the ``escape`` module only once the option has been recognised, so an
    invalid option never triggers that import.

    Returns:
        A 5-tuple ``(seq, seqs_escape, train_fname, mut_fname, anchor_id)``:
        the two values produced by the loader, followed by the training
        FASTA path, the mutation FASTA path and the anchor record id
        associated with ``virus``.

    Raises:
        ValueError: if ``virus`` is not one of the supported options.
    """
    # virus -> (training FASTA, mutation FASTA, anchor record id).
    # 'sarscov2' and 'cov2rbd' share the same files and anchor; they differ
    # only in which loader supplies the escape data.
    resources = {
        'h1': (
            'target/flu/clusters/all_h1.fasta',
            'target/flu/mutation/mutations_h1.fa',
            'gb:LC333185|ncbiId:BBB04702.1|UniProtKB:-N/A-|'
            'Organism:Influenza',
        ),
        'h3': (
            'target/flu/clusters/all_h3.fasta',
            'target/flu/mutation/mutations_h3.fa',
            'Reference_Perth2009_HA_coding_sequence',
        ),
        'bg505': (
            'target/hiv/clusters/all_BG505.fasta',
            'target/hiv/mutation/mutations_hiv.fa',
            'A1.KE.-.BG505_W6M_ENV_C2.DQ208458',
        ),
        'sarscov2': (
            'target/cov/clusters/all_sarscov2.fasta',
            'target/cov/mutation/mutations_sarscov2.fa',
            'YP_009724390.1',
        ),
        'cov2rbd': (
            'target/cov/clusters/all_sarscov2.fasta',
            'target/cov/mutation/mutations_sarscov2.fa',
            'YP_009724390.1',
        ),
    }

    # Pick the loader; imports stay lazy so only the needed one is resolved.
    if virus == 'h1':
        from escape import load_doud2018 as load_dataset
    elif virus == 'h3':
        from escape import load_lee2019 as load_dataset
    elif virus == 'bg505':
        from escape import load_dingens2019 as load_dataset
    elif virus == 'sarscov2':
        from escape import load_baum2020 as load_dataset
    elif virus == 'cov2rbd':
        from escape import load_greaney2020 as load_dataset
    else:
        raise ValueError('invalid option {}'.format(virus))

    seq, seqs_escape = load_dataset()
    train_fname, mut_fname, anchor_id = resources[virus]
    return seq, seqs_escape, train_fname, mut_fname, anchor_id
def _escape_dataset_for_cache(cache_fname, cutoff=None, expr_cutoff=None):
    """Load the wild-type sequence and escape table implied by a cache name.

    The dataset is chosen by substring match on ``cache_fname``; the checks
    run in a fixed order and the first match wins.  Loaders are imported
    lazily from the ``escape`` module.

    Raises:
        ValueError: if ``cache_fname`` matches none of the known datasets.
    """
    if 'flu_h1' in cache_fname:
        from escape import load_doud2018 as load_dataset
    elif 'flu_h3' in cache_fname:
        from escape import load_lee2019 as load_dataset
    elif 'hiv' in cache_fname:
        from escape import load_dingens2019 as load_dataset
    elif '_cov_' in cache_fname:
        from escape import load_baum2020
        # This loader is always called without any cutoff.
        return load_baum2020()
    elif 'cov2rbd' in cache_fname:
        from escape import load_greaney2020
        # expr_cutoff is only honoured when cutoff is also given, and then
        # it takes precedence over the survival cutoff.
        if cutoff is None:
            return load_greaney2020()
        if expr_cutoff is not None:
            return load_greaney2020(expr_cutoff=expr_cutoff)
        return load_greaney2020(survival_cutoff=cutoff)
    else:
        raise ValueError('invalid option {}'.format(cache_fname))

    if cutoff is None:
        return load_dataset()
    return load_dataset(survival_cutoff=cutoff)


def _parse_escape_cache(cache_fname, wt_seq, seqs_escape):
    """Read the per-mutation cache and keep rows present in ``seqs_escape``.

    The file is tab-separated and its first line is discarded.  Columns used:
    0 = position (index into ``wt_seq``), 1 = wild-type residue,
    2 = mutant residue, 3 = probability, 4 = change.  Any further columns
    are ignored.

    Returns:
        ``(prob, change, escape_idx)`` as NumPy arrays of equal length;
        ``escape_idx`` is boolean and marks mutants with at least one
        ``'significant'`` entry in ``seqs_escape``.
    """
    skipped_residues = {'U', 'B', 'J', 'X', 'Z'}
    # Caches whose name contains 'rbd' are restricted to positions 330..530.
    rbd_only = 'rbd' in cache_fname

    prob, change, escape_idx = [], [], []
    with open(cache_fname) as f:
        f.readline()  # discard the first line
        for line in f:
            fields = line.rstrip().split('\t')
            pos = int(fields[0])
            if rbd_only and (pos < 330 or pos > 530):
                continue
            aa_wt, aa_mut = fields[1], fields[2]
            if aa_mut in skipped_residues:
                continue
            assert wt_seq[pos] == aa_wt, (
                'wild-type residue mismatch at position {}: cache has {!r}, '
                'sequence has {!r}'.format(pos, aa_wt, wt_seq[pos]))
            mut_seq = wt_seq[:pos] + aa_mut + wt_seq[pos + 1:]
            if mut_seq not in seqs_escape:
                continue
            prob.append(float(fields[3]))
            change.append(float(fields[4]))
            escape_idx.append(
                sum(m['significant'] for m in seqs_escape[mut_seq]) > 0)

    # Explicit bool dtype keeps the mask usable even when no rows were kept.
    return np.array(prob), np.array(change), np.array(escape_idx, dtype=bool)


def _escapes_in_top_n(escape_ranks, n_consider):
    """For each cutoff in ``n_consider`` count the ranks that are <= it."""
    return np.searchsorted(np.sort(escape_ranks), n_consider, side='right')


def cached_escape(cache_fname, beta, cutoff=None, expr_cutoff=None,
                  bind_cutoff=None, plot=True, namespace='semantics'):
    """Score cached single-mutation predictions against known escape mutants.

    Each retained mutation gets an acquisition score
    ``rankdata(change) + beta * rankdata(prob)``.  The function prints rank
    statistics of the escape mutants, the normalised area under the
    "escapes found in the top N" curve for the combined score, for change
    alone and for probability alone, and two Mann-Whitney U comparisons
    (log probability and log change, all positive-change mutations versus
    the escape subset).

    Args:
        cache_fname: path of the tab-separated cache file; its name also
            selects which escape dataset is loaded.
        beta: weight of the probability rank in the acquisition score.
        cutoff: if not None, forwarded to the dataset loader as
            ``survival_cutoff`` (not used for the ``'_cov_'`` dataset).
        expr_cutoff: only consulted for ``'cov2rbd'`` caches, and only when
            ``cutoff`` is not None; it is then forwarded as ``expr_cutoff``
            instead of the survival cutoff.
        bind_cutoff: accepted for interface compatibility; not used here.
        plot: when true, write figures into the ``figures`` directory.
        namespace: tag used in figure file names and in the printed summary.

    Returns:
        None.  Returns early, after any acquisition plots, when no escape
        mutant has a positive change value.

    Raises:
        ValueError: if ``cache_fname`` matches no known dataset.
    """
    wt_seq, seqs_escape = _escape_dataset_for_cache(
        cache_fname, cutoff, expr_cutoff)
    orig_prob, orig_change, escape_idx = _parse_escape_cache(
        cache_fname, wt_seq, seqs_escape)

    acquisition = ss.rankdata(orig_change) + (beta * ss.rankdata(orig_prob))

    # log10 is only defined for positive change, so the scatter plots and
    # the distribution tests work on that subset.
    pos_change_idx = orig_change > 0
    pos_change_escape_idx = np.logical_and(pos_change_idx, escape_idx)
    prob = orig_prob[pos_change_idx]
    change = orig_change[pos_change_idx]
    escape_prob = orig_prob[pos_change_escape_idx]
    escape_change = orig_change[pos_change_escape_idx]

    log_prob, log_change = np.log10(prob), np.log10(change)
    log_escape_prob = np.log10(escape_prob)
    log_escape_change = np.log10(escape_change)

    if plot:
        mkdir_p('figures')
        prob_label = (
            r'$ \log_{10}(\hat{p}(x_i | \mathbf{x}_{[N] ∖ \{i\} })) $')
        change_label = r'$ \log_{10}(\Delta \mathbf{\hat{z}}) $'
        acq_colors = acquisition[pos_change_idx]

        # Escape mutants highlighted on top of all positive-change mutants.
        plt.figure()
        plt.scatter(log_prob, log_change, c=acq_colors,
                    cmap='viridis', alpha=0.3)
        plt.scatter(log_escape_prob, log_escape_change, c='red',
                    alpha=0.5, marker='x')
        plt.xlabel(prob_label)
        plt.ylabel(change_label)
        plt.savefig('figures/{}_acquisition.png'.format(namespace), dpi=300)
        plt.close()

        # Same plot with an equally sized random sample highlighted instead.
        rand_idx = np.random.choice(len(prob), len(escape_prob))
        plt.figure()
        plt.scatter(log_prob, log_change, c=acq_colors,
                    cmap='viridis', alpha=0.3)
        plt.scatter(log_prob[rand_idx], log_change[rand_idx], c='red',
                    alpha=0.5, marker='x')
        plt.xlabel(prob_label)
        plt.ylabel(change_label)
        plt.savefig('figures/{}_acquisition_rand.png'.format(namespace),
                    dpi=300)
        plt.close()

    if len(escape_prob) == 0:
        print('No escape mutations found.')
        return

    # Ranks are taken over every retained mutation, so the denominators and
    # the top-N axis use the full count rather than the positive-change
    # subset; otherwise escapes ranked past the subset size are never
    # counted.
    n_total = len(acquisition)
    n_escape_total = escape_idx.sum()

    acq_rank = ss.rankdata(-acquisition)  # rank 1 = highest acquisition
    escape_rank_dist = acq_rank[escape_idx]

    print('Number of escape seqs: {} / {}'.format(
        len(escape_rank_dist), n_escape_total))
    print('Mean rank: {} / {}'.format(np.mean(escape_rank_dist), n_total))
    print('Median rank: {} / {}'.format(
        np.median(escape_rank_dist), n_total))
    print('Min rank: {} / {}'.format(np.min(escape_rank_dist), n_total))
    print('Max rank: {} / {}'.format(np.max(escape_rank_dist), n_total))
    print('Rank stdev: {} / {}'.format(np.std(escape_rank_dist), n_total))

    n_consider = np.arange(1, n_total + 1)

    n_escape = _escapes_in_top_n(escape_rank_dist, n_consider)
    # Normalising by (N * escapes found) puts a perfect ranking near 1 and
    # the diagonal at 0.5.
    norm = n_consider[-1] * n_escape.max()
    norm_auc = auc(n_consider, n_escape) / norm

    escape_rank_prob = ss.rankdata(-orig_prob)[escape_idx]
    n_escape_prob = _escapes_in_top_n(escape_rank_prob, n_consider)
    norm_auc_prob = auc(n_consider, n_escape_prob) / norm

    escape_rank_change = ss.rankdata(-orig_change)[escape_idx]
    n_escape_change = _escapes_in_top_n(escape_rank_change, n_consider)
    norm_auc_change = auc(n_consider, n_escape_change) / norm

    if plot:
        plt.figure()
        plt.plot(n_consider, n_escape)
        plt.plot(n_consider, n_escape_change, c='C0', linestyle='-.')
        plt.plot(n_consider, n_escape_prob, c='C0', linestyle=':')
        # Expected curve when mutations are picked at random.
        plt.plot(n_consider, n_consider * (n_escape_total / n_total),
                 c='gray', linestyle='--')
        plt.legend([
            r'$ \Delta \mathbf{\hat{z}} + ' +
            r'\beta \hat{p}(x_i | \mathbf{x}_{[N] ∖ \{i\} }) $,' +
            (' AUC = {:.3f}'.format(norm_auc)),
            r'$ \Delta \mathbf{\hat{z}} $ only,' +
            (' AUC = {:.3f}'.format(norm_auc_change)),
            r'$ \hat{p}(x_i | \mathbf{x}_{[N] ∖ \{i\} }) $ only,' +
            (' AUC = {:.3f}'.format(norm_auc_prob)),
            'Random guessing, AUC = 0.500'
        ])
        plt.xlabel('Top N')
        plt.ylabel('Number of escape mutations in top N')
        plt.savefig('figures/{}_consider_escape.png'.format(namespace),
                    dpi=300)
        plt.close()

    print('Escape semantics, beta = {} [{}]'.format(beta, namespace))

    norm_auc_p = compute_p(norm_auc, n_escape_total, n_total)

    print('AUC (CSCS): {}, P = {}'.format(norm_auc, norm_auc_p))
    print('AUC (semantic change only): {}'.format(norm_auc_change))
    print('AUC (grammaticality only): {}'.format(norm_auc_prob))

    print('{:.4g} (mean log prob), {:.4g} (mean log prob escape), '
          '{:.4g} (p-value)'.format(
              log_prob.mean(), log_escape_prob.mean(),
              ss.mannwhitneyu(log_prob, log_escape_prob,
                              alternative='two-sided')[1]))
    # Report log10 means so the numbers match their labels. The test is
    # rank-based, so the log transform should leave the p-value unchanged.
    print('{:.4g} (mean log change), {:.4g} (mean log change escape), '
          '{:.4g} (p-value)'.format(
              log_change.mean(), log_escape_change.mean(),
              ss.mannwhitneyu(log_change, log_escape_change,
                              alternative='two-sided')[1]))
raise ValueError('Model must be trained or loaded ' 'from checkpoint.') no_embed = {'hmm'} if args.model_name in no_embed: raise ValueError('Embeddings not available for models: {}'.format( ', '.join(no_embed))) analyze_embedding(args, model, seqs, vocabulary) if args.semantics: if args.checkpoint is None and not args.train: raise ValueError('Model must be trained or loaded ' 'from checkpoint.') from escape import load_baum2020, load_greaney2020 tprint('Baum et al. 2020...') seq_to_mutate, seqs_escape = load_baum2020() analyze_semantics( args, model, vocabulary, seq_to_mutate, seqs_escape, comb_batch=5000, prob_cutoff=0, beta=1., plot_acquisition=True, ) tprint('Greaney et al. 2020...') seq_to_mutate, seqs_escape = load_greaney2020() analyze_semantics( args,