import os
import random

import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as ss
from sklearn.metrics import roc_curve, precision_recall_curve

# Project-level helpers (get_batchscores, get_batchscore_dir, read_batchscores,
# DATASET_CONFIGS, Alignment, MockAlignment, Rate4siteEb, JsDivergence,
# precompute_tree_probs, weighted_choice, amino_acids, N_RUNS) are assumed to
# be defined or imported elsewhere in this module.


def compare_scores(dataset_name, *batchscore_ids):
    """ Compare scores from 2 scoring methods in a scatter plot. """
    if len(batchscore_ids) != 2:
        raise ValueError("Need 2 batchscore runs to compare")
    id1, id2 = batchscore_ids
    pos_pairs = []
    neg_pairs = []
    for alignment, scores_cols in get_batchscores(dataset_name, batchscore_ids):
        ts = alignment.testset
        scores_pairs = zip(*scores_cols)
        pos_pairs += (scores_pairs[i] for i in xrange(len(scores_pairs)) if ts[i])
        neg_pairs += (scores_pairs[i] for i in xrange(len(scores_pairs)) if not ts[i])
    plt.figure()
    # Subsample so the scatter stays legible; negatives vastly outnumber positives.
    x1, y1 = zip(*random.sample(neg_pairs, min(len(neg_pairs), 10000)))
    plt.scatter(x1, y1, color="red")
    x2, y2 = zip(*random.sample(pos_pairs, min(len(pos_pairs), 1000)))
    plt.scatter(x2, y2, color="green")
    plt.xlabel(id1)
    plt.ylabel(id2)
    plt.xlim([min(min(x1), min(x2)), max(max(x1), max(x2))])
    plt.ylim([min(min(y1), min(y2)), max(max(y1), max(y2))])
    plt.show()
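# Example invocation (the dataset name and scorer run ids below are
# hypothetical; any two batchscore run ids produced for the same dataset
# should work):
#
#   compare_scores('my_dataset', 'js_divergence-vanilla', 'R4S_EB-vanilla')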
def compute_variance(dataset_name, *batchscore_ids):
    """ Histogram the variance of the scores within each alignment for a
    single scorer. """
    assert len(batchscore_ids) == 1
    var_list = []
    for alignment, scores_cols in get_batchscores(dataset_name, batchscore_ids):
        # With one scorer, zip(*scores_cols) yields one 1-tuple per site and
        # np.var flattens it, i.e. this is the variance of the scorer's column.
        var_list.append(np.var(zip(*scores_cols)))
    print np.mean(var_list)
    plt.hist(var_list)
    plt.title('Per-sequence variances of rates r')
    plt.xlabel('Variance')
    plt.ylabel('Count')
    plt.show()
def compute_summary(dataset_name):
    """ Print summary statistics for a dataset. """
    n_sites = []
    n_positives = []
    n_seqs = []
    n_seqs_orig = []
    for alignment, _ in get_batchscores(dataset_name):
        n_sites.append(len(alignment.msa[0]))
        n_positives.append(alignment.testset.count(1))
        n_seqs.append(len(alignment.msa))
        n_seqs_orig.append(alignment.orig_num_sequences)
    # Use float arrays so the ratios are not truncated by integer division.
    n_positives = np.array(n_positives, dtype=float)
    n_sites_arr = np.array(n_sites, dtype=float)
    print "Avg # seqs per alignment: %d" % np.mean(n_seqs)
    print "Avg # seqs per alignment before filtering: %d" % np.mean(n_seqs_orig)
    print "Avg # sites per alignment: %d" % np.mean(n_sites)
    print "Avg %% positives per alignment: %f" % np.mean(n_positives / n_sites_arr)
    print "%% positives total: %f" % (np.sum(n_positives) / np.sum(n_sites_arr))
def pr_roc(dataset_name, *batchscore_ids):
    """ Draw PR and ROC curves for each scorer. """
    allscores_cols = [[] for i in batchscore_ids]
    test_scores = []
    # Just aggregate all scores across all data files. It isn't much memory
    # anyway. Sites whose testset value is None are unlabeled and are skipped.
    for alignment, scores_cols in get_batchscores(dataset_name, batchscore_ids):
        ts = alignment.testset
        for allscores_col, scores in zip(allscores_cols, scores_cols):
            allscores_col += [scores[i] for i in xrange(len(ts)) if ts[i] is not None]
        test_scores += [ts[i] for i in xrange(len(ts)) if ts[i] is not None]
    scorer_fprs = []
    scorer_tprs = []
    scorer_precisions = []
    scorer_recalls = []
    for allscores_col in allscores_cols:
        fprs, tprs, _ = roc_curve(test_scores, allscores_col, pos_label=1)
        scorer_fprs.append(fprs)
        scorer_tprs.append(tprs)
        precisions, recalls, _ = precision_recall_curve(test_scores, allscores_col,
                                                        pos_label=1)
        scorer_precisions.append(precisions)
        scorer_recalls.append(recalls)
    print_auc("PR", batchscore_ids, scorer_recalls, scorer_precisions)
    print_auc("ROC", batchscore_ids, scorer_fprs, scorer_tprs)
    plot_pr(dataset_name, scorer_precisions, scorer_recalls, batchscore_ids)
    plot_roc(dataset_name, scorer_fprs, scorer_tprs, batchscore_ids, .5)
    plt.show(block=False)
    print """\n* To plot another PR curve:
    plot_pr(dataset_name, scorer_precisions, scorer_recalls, batchscore_ids,
            legend='upper right')
    plt.show()"""
    print """\n* To plot another ROC curve:
    plot_roc(dataset_name, scorer_fprs, scorer_tprs, batchscore_ids, x_max=.5,
             legend='lower right')
    plt.show()"""
    print ""
    # Drop into an interactive shell so the curves can be re-plotted as above.
    import ipdb
    ipdb.set_trace()
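# print_auc, plot_pr and plot_roc are helpers defined elsewhere in this module.
# For reference, a minimal sketch of what print_auc is assumed to do
# (trapezoidal area under each scorer's curve via sklearn); the name and
# signature below simply mirror the calls in pr_roc:
def _print_auc_sketch(curve_name, batchscore_ids, scorer_xs, scorer_ys):
    from sklearn.metrics import auc
    for batchscore_id, xs, ys in zip(batchscore_ids, scorer_xs, scorer_ys):
        # auc() requires x to be monotonic; sklearn returns PR points in order
        # of decreasing recall, so sort by x first.
        order = np.argsort(xs)
        print "%s AUC (%s): %.4f" % (curve_name, batchscore_id,
                                     auc(np.asarray(xs)[order],
                                         np.asarray(ys)[order]))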
def hist_scores(dataset_name, *batchscore_ids, **kwargs):
    """ Histogram positive and negative scores for each scorer. Plot F1 for
    corresponding thresholds alongside. """
    noblock = kwargs.get('noblock', False)
    fit_gamma = kwargs.get('fit_gamma', False)
    batchscore_ids = list(batchscore_ids)
    N = len(batchscore_ids)
    pos_cols = [[] for i in xrange(N)]
    neg_cols = [[] for i in xrange(N)]
    # Just aggregate all scores across all data files. It isn't much memory anyway.
    for alignment, scores_cols in get_batchscores(dataset_name, batchscore_ids):
        ts = alignment.testset
        for pos_col, neg_col, scores_col in zip(pos_cols, neg_cols, scores_cols):
            pos_col += (scores_col[i] for i in xrange(len(scores_col)) if ts[i])
            neg_col += (scores_col[i] for i in xrange(len(scores_col)) if not ts[i])
    for i in xrange(len(pos_cols)):
        pos_col = pos_cols[i]
        if pos_col and isinstance(pos_col[0], tuple) and len(pos_col[0]) == 2:
            # We are analyzing r4s_func: each score is a 2-tuple, so split it
            # into two separate score columns.
            pos_cols[i], pos_col_new = zip(*pos_col)
            pos_cols.append(pos_col_new)
            neg_cols[i], neg_col_new = zip(*neg_cols[i])
            neg_cols.append(neg_col_new)
            batchscore_ids.append(batchscore_ids[i] + '-c')
    figs = []
    for pos_col, neg_col, batchscore_id in zip(pos_cols, neg_cols, batchscore_ids):
        pos_col = np.array(pos_col)
        neg_col = np.array(neg_col)
        # Compute summary statistics (tot as float to avoid integer division)
        tot = float(len(pos_col) + len(neg_col))
        pos_mean = np.mean(pos_col)
        pos_var = np.var(pos_col)
        neg_mean = np.mean(neg_col)
        neg_var = np.var(neg_col)
        print "%s:\n\tpos %f, neg %f\n\tpos mean %f, var %f\n\tneg mean %f, var %f" % (
            batchscore_id, len(pos_col)/tot, len(neg_col)/tot,
            pos_mean, pos_var, neg_mean, neg_var)
        # Compute plot statistics
        scores_min = np.min((np.min(pos_col), np.min(neg_col)))
        scores_max = np.max((np.max(pos_col), np.max(neg_col)))
        bins = np.linspace(scores_min, scores_max, 101)
        bin_width = bins[1] - bins[0]
        bin_lows = bins[:-1]
        bin_mids = bin_lows + bin_width / 2.
        # Plot counts
        plt.ion()
        fig = plt.figure()
        plt.xlabel('Score')
        pos_counts, _ = np.histogram(pos_col, bins)
        neg_counts, _ = np.histogram(neg_col, bins)
        ax = plt.gca()
        ax.bar(bin_lows, neg_counts, bin_width, color='r', alpha=0.5)  # could stack: bottom=pos_counts
        ax.bar(bin_lows, pos_counts, bin_width, color='g', alpha=0.8)
        ax.set_ylabel('Count')
        # Plot gamma fit if desired (method of moments: a = mean^2/var,
        # scale = var/mean), rescaled from a density to expected counts.
        if fit_gamma:
            x = np.abs(bin_mids)
            plt.plot(bin_mids, bin_width * len(pos_col) *
                     ss.gamma.pdf(x, a=abs(pos_mean)**2/pos_var,
                                  scale=pos_var/abs(pos_mean)), 'g')
            plt.plot(bin_mids, bin_width * len(neg_col) *
                     ss.gamma.pdf(x, a=abs(neg_mean)**2/neg_var,
                                  scale=neg_var/abs(neg_mean)), 'r')
        # Plot the F1 obtained by thresholding at each bin: everything to the
        # right of the threshold is called positive.
        pos_tot = float(sum(pos_counts))
        pos_right = 0.
        neg_right = 0.
        f1 = []
        for pos_count, neg_count in reversed(zip(pos_counts, neg_counts)):
            pos_right += pos_count
            neg_right += neg_count
            recall = pos_right / pos_tot
            precision = pos_right / (pos_right + neg_right)
            if precision or recall:
                f1.append(2 * precision * recall / (precision + recall))
            else:
                f1.append(0)
        f1 = np.array(list(reversed(f1)))
        ax2 = ax.twinx()
        ax2.plot(bin_mids, f1, '--')
        ax2.set_ylabel('F1')
        plt.title('Scores (%s, %s)' % (batchscore_id, dataset_name))
        figs.append(fig)
    if noblock:
        plt.show()
    else:
        plt.show(block=False)
        import ipdb
        ipdb.set_trace()
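# The gamma overlay in hist_scores uses a method-of-moments fit: for a gamma
# distribution mean = a*scale and var = a*scale**2, hence a = mean**2/var and
# scale = var/mean. A minimal self-contained sanity check of that
# parameterization (the function and parameter values here are illustrative
# only):
def _gamma_moment_fit_demo():
    import numpy as np
    import scipy.stats as ss
    data = ss.gamma.rvs(a=3.0, scale=2.0, size=100000)
    mean, var = np.mean(data), np.var(data)
    print "recovered a=%.2f, scale=%.2f (true a=3.0, scale=2.0)" % (
        mean**2 / var, var / mean)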
def r4s_ppc(dataset_name, **jsd_params):
    """ Posterior predictive check for rate4site: simulate replicate alignments
    from the fitted rates and compare JSD scores on the replicas to the
    observed JSD scores. """
    afs = list(get_batchscores(dataset_name, align_files_only=True))
    dc = DATASET_CONFIGS[dataset_name]
    r4s_name = 'R4S_EB-vanilla'
    r4s_dir = os.path.join(get_batchscore_dir(dataset_name), r4s_name)
    r4s = Rate4siteEb()
    jsd = JsDivergence(**jsd_params)

    # Choose random alignment/scores pair
    align_file = random.choice(afs)
    test_file = dc.get_test_file(align_file)
    r4s_file = dc.get_out_file(align_file, r4s_dir)
    alignment = Alignment(align_file, test_file=test_file,
                          parse_testset_fn=dc.parse_testset_fn)
    n_seqs = len(alignment.msa)
    n_sites = len(alignment.msa[0])

    fig = plt.figure()
    ax = plt.gca()
    inds = range(n_sites)

    rates = read_batchscores(r4s_file)
    tree = alignment.get_phylotree()
    root = tree.root
    names_map = dict((name, i) for i, name in enumerate(alignment.names))

    # Pre-compute probabilities for each branch, for every site (i.e., rate).
    # Each matrix is converted to cumulative form so that its rows can be
    # sampled from directly with weighted_choice.
    P_cached = precompute_tree_probs(tree, rates, r4s.sub_model)
    for r in P_cached:
        for node in P_cached[r]:
            # We treat root separately
            if node != root:
                cum_probs = np.cumsum(P_cached[r][node], axis=1)
                if not np.all(np.abs(cum_probs[:, -1] - 1) < 1e-4):
                    raise ValueError("Bad probability matrix")
                cum_probs[:, -1] = 1
                P_cached[r][node] = np.array(cum_probs)
    root_freqs = np.cumsum(r4s.sub_model.freqs)
    if not abs(root_freqs[-1] - 1) < 1e-4:
        raise ValueError("Bad probability matrix")
    root_freqs[-1] = 1

    # Repeat replication for N_RUNS
    jsd_rep_scores_all = []
    for n in xrange(N_RUNS):
        # For each site, generate amino acids for each sequence using that
        # site's rate: draw the root residue from the equilibrium frequencies,
        # then walk the tree, sampling each node's residue given its parent's.
        msa = [[] for i in xrange(n_seqs)]
        for i in xrange(n_sites):
            r = rates[i]
            aa_ind = weighted_choice(root_freqs)
            bfs = [(aa_ind, node) for node in root.clades]
            for aa_ind, node in bfs:
                aa_ind = weighted_choice(P_cached[r][node][aa_ind])
                if node.is_terminal():
                    msa[names_map[node.name]].append(amino_acids[aa_ind])
                else:
                    bfs += ((aa_ind, child) for child in node.clades)
        aln_rep = MockAlignment(alignment.names, msa, tree,
                                alignment.get_seq_weights)
        jsd_rep_scores = jsd.score(aln_rep)
        jsd_rep_scores_all.append(jsd_rep_scores)
        ax.scatter(inds, jsd_rep_scores, color='k', alpha=0.2)

    jsd_orig_scores = jsd.score(alignment)
    ts = alignment.testset
    pos_inds = [i for i in inds if ts[i]]
    pos_orig_scores = [jsd_orig_scores[i] for i in pos_inds]
    neg_inds = [i for i in inds if not ts[i]]
    neg_orig_scores = [jsd_orig_scores[i] for i in neg_inds]
    ax.scatter(pos_inds, pos_orig_scores, color='g')
    ax.scatter(neg_inds, neg_orig_scores, color='r')
    ax.set_ylabel('JSD score')

    # Z-score of each observed score against its replicate distribution.
    rep_scores_per_col = np.array(jsd_rep_scores_all).T
    means = np.mean(rep_scores_per_col, axis=1)
    stds = np.std(rep_scores_per_col, axis=1)
    zscores = (np.array(jsd_orig_scores) - means) / stds
    ax2 = ax.twinx()
    ax2.plot(inds, zscores, '--')
    ax2.set_ylabel('Deviations')
    plt.xlim(inds[0], inds[-1])
    plt.xlabel('Sites')

    plt.figure()
    plt.scatter(rates, zscores)
    plt.xlabel('Rates')
    plt.ylabel('Deviations')

    plt.show(block=False)
    import ipdb
    ipdb.set_trace()
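# weighted_choice is defined elsewhere in this module; r4s_ppc assumes it takes
# a cumulative probability vector (last entry 1.0) and returns the index of the
# sampled bin. A minimal sketch under that assumption:
def _weighted_choice_sketch(cum_probs):
    import bisect
    import random
    # First index whose cumulative probability strictly exceeds a uniform draw.
    return bisect.bisect_right(cum_probs, random.random())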