def pmid_26033813_analysis(drug: str):
    """Score the PMID26033813 tree predictor against the labels of one drug.

    Loads the newest ``labels_<drug>.pickle``, restricts labels and the
    module-level ``expr`` matrix to their shared samples, fits the tree
    returned by ``build_tree`` and predicts every shared sample.  The ROC
    data is pickled under ``data_path``; ROC and precision-recall figures
    are written under ``output_path``.

    :param drug: drug name used in the label file name, the output file
        names and (title-cased) the plot titles.
    """
    decision_tree = build_tree()

    label_dir = find_newest_data_path(
        f'compute_drug_features_labels_alpha_{args.alpha:.2f}')
    all_labels = pd.read_pickle(label_dir / f'labels_{drug}.pickle')

    # Only samples present in both the labels and the expression matrix.
    samples = sorted_intersection(all_labels.index, expr.index)
    labels = all_labels.loc[samples]
    sample_expr = expr.loc[samples, :]

    fit_tree(sample_expr, labels, decision_tree)

    # One prediction per shared sample, in the same order as `samples`.
    scores = []
    for sample_name in samples:
        scores.append(predict_sample(sample_name, sample_expr, decision_tree))
    predictions = pd.Series(scores, index=samples)

    roc = RocData.calculate(labels, predictions)
    roc.save(data_path / f'roc_data_{drug}.pickle')
    plot_roc(
        roc,
        f'PMID26033813 ROC: {drug.title()}',
        output_path / f'{drug}_roc.pdf',
    )

    precision_recall = PrData.calculate(labels, predictions)
    plot_pr(
        precision_recall,
        f'PMID26033813 Precision-Recall: {drug.title()}',
        output_path / f'{drug}_pr.pdf',
    )
def map_reads_to_genes(sam_path: Path) -> Tuple[pd.Series, pd.Series]:
    """Count SAM reads per gene and return normalized counts plus a summary.

    Gene intervals are loaded from ``trees.pickle`` in the newest
    ``build_tree`` data directory.  Every non-header line of the SAM file
    counts as one read; reads without the "unmapped" flag are looked up in
    the interval tree of their chromosome.

    :param sam_path: path of the SAM file to read.
    :return: ``(rpkm, summary_data)``.  ``rpkm`` is
        ``read_counts * 1000000 / (reads_total * gene_length)``;
        ``summary_data`` holds ``read_count``, ``reads_aligned``,
        ``mapped_to_genes`` and ``genes_with_reads``.
    :raises KeyError: if a read's chromosome has no interval tree, or an
        interval carries a gene id absent from ``intervals_by_gene``.
    """
    tree_path = find_newest_data_path('build_tree')
    # NOTE: pickle.load executes arbitrary code; the file is assumed to be
    # produced by this project's own build_tree step.
    with open(tree_path / 'trees.pickle', 'rb') as f:
        tree_data = pickle.load(f)
    trees = tree_data['trees']
    gene_length = tree_data['gene_length']
    intervals_by_gene = tree_data['intervals_by_gene']

    gene_names = sorted(intervals_by_gene)
    known_genes = set(gene_names)
    # Accumulate hits in a plain dict: a pandas `.loc[...] += 1` per read is
    # orders of magnitude slower than a dict update and gives the same counts.
    hits = {}

    reads_mapped_to_genes = 0
    reads_aligned = 0
    reads_total = 0

    print('Reading', sam_path)
    with open(sam_path) as f:
        # Read each line of the SAM file.
        for line in f:
            # Filter out the line that is not a read.
            if line.startswith('@'):
                continue
            reads_total += 1

            col = line.split('\t')
            flags = int(col[1])
            if flags & 0x4:
                # unmapped
                continue
            reads_aligned += 1

            chrom = col[2]
            start = int(col[3])
            # The span is start + sequence length; the CIGAR string is not
            # consulted.
            read_length = len(col[9])
            end = start + read_length

            # Get the gene id at a certain point if there is any.
            gene_ids = trees[chrom][start:end]
            # Reads shouldn't map to multiple genes, but it's still better to be
            # safe with this and not count reads multiple times if this happens
            if gene_ids:
                reads_mapped_to_genes += 1
            for gene_id in gene_ids:
                gene = gene_id.data
                if gene not in known_genes:
                    # Same failure mode as a label lookup on the count Series.
                    raise KeyError(gene)
                hits[gene] = hits.get(gene, 0) + 1

    read_counts = pd.Series(
        [hits.get(gene, 0) for gene in gene_names],
        index=gene_names,
        dtype='int64',
    )

    # NOTE(review): the 1e6 factor matches RPKM only if gene_length is in
    # kilobases -- confirm against build_tree.
    rpkm = (read_counts * 1000000) / (reads_total * gene_length)
    gene_count = (read_counts > 0).sum()
    summary_data = pd.Series({
        'read_count': reads_total,
        'reads_aligned': reads_aligned,
        'mapped_to_genes': reads_mapped_to_genes,
        'genes_with_reads': gene_count,
    })
    return rpkm, summary_data
def read_hugo_entrez_mapping() -> Dict[str, str]:
    """Build a gene-name -> Entrez ID dictionary.

    Sources, applied in this order (later entries overwrite earlier ones):

    1. the tab-separated file at ``HUGO_ENTREZ_MAPPING_PATH`` (approved
       symbol and every whitespace-separated token of ``Synonyms``),
    2. a short list of manual aliases,
    3. ``mapping.json`` from the newest ``query_mygene`` data directory.

    :return: dictionary from gene name to Entrez ID string.
    :raises KeyError: if a manual alias refers to a name that is not in the
        mapping read from the file.
    """
    print('Reading Hugo to Entrez mapping')
    name_to_entrez = {}
    seen_entrez_ids = set()

    with open(HUGO_ENTREZ_MAPPING_PATH) as f:
        for record in csv.DictReader(f, delimiter='\t'):
            entrez_id = record['Entrez Gene ID(supplied by NCBI)']
            seen_entrez_ids.add(entrez_id)
            name_to_entrez[record['Approved Symbol']] = entrez_id
            # NOTE(review): split() separates on whitespace only; if the
            # Synonyms column is comma-separated the keys keep a trailing
            # comma -- confirm against the input file.
            name_to_entrez.update(
                dict.fromkeys(record['Synonyms'].split(), entrez_id))

    # List of 2-tuples:
    #  [0] key in hugo_entrez_mapping
    #  [1] new key which will map to the same value
    manual_mapping_addition = [
        ('ADGRE5', 'CD97'),
    ]
    for existing_name, alias in manual_mapping_addition:
        name_to_entrez[alias] = name_to_entrez[existing_name]

    mygene_dir = find_newest_data_path('query_mygene')
    with open(mygene_dir / 'mapping.json') as f:
        extra_mapping = json.load(f)
    name_to_entrez.update(extra_mapping)

    # The Entrez ID count covers only IDs seen in the tab-separated file.
    summary = 'Read Hugo to Entrez mapping: {} gene names to {} Entrez IDs'.format(
        len(name_to_entrez),
        len(seen_entrez_ids),
    )
    print(summary)

    return name_to_entrez
def get_cluster_assignments() -> pd.Series:
    """Read cluster labels from the newest ``nbs_matlab`` output.

    Loads ``nbs_cluster.mat``, takes the flattened ``NBS_cc_label`` array as
    labels and the first element of each entry of ``baseSMData[0][0][3]``
    as patient IDs.

    :return: Series of cluster labels indexed by patient ID.
    :raises AssertionError: if the number of patient IDs and labels differ.
    """
    matlab_file = find_newest_data_path('nbs_matlab') / 'nbs_cluster.mat'
    print('Reading', matlab_file)
    contents = scipy.io.loadmat(str(matlab_file))

    cluster_labels = contents['NBS_cc_label'].flatten()

    # Field 3 of the nested record holds the patient IDs, each wrapped in
    # its own one-element container.
    mutation_record = contents['baseSMData'][0][0]
    patient_ids = []
    for cell in mutation_record[3].flatten():
        patient_ids.append(cell[0])

    assert len(patient_ids) == len(cluster_labels)
    return pd.Series(cluster_labels, index=patient_ids)
def ki67_analysis(drug: str):
    """Use expression of the module-level ``gene`` as a predictor for a drug.

    Loads the newest ``labels_<drug>.pickle``, restricts labels and the
    module-level ``expr`` matrix to their shared samples and uses the
    expression column ``gene`` directly as the prediction score.  The ROC
    data is pickled under ``data_path``; ROC and precision-recall figures
    are written under ``output_path``.

    :param drug: drug name used in the label file name, the output file
        names and (title-cased) the plot titles.
    """
    label_dir = find_newest_data_path(
        f'compute_drug_features_labels_alpha_{args.alpha:.2f}')
    all_labels = pd.read_pickle(label_dir / f'labels_{drug}.pickle')

    # Only samples present in both the labels and the expression matrix.
    samples = sorted_intersection(all_labels.index, expr.index)
    marker_expr = expr.loc[samples, gene]
    labels = all_labels.loc[samples]

    roc = RocData.calculate(labels, marker_expr)
    roc.save(data_path / f'roc_data_{drug}.pickle')
    plot_roc(
        roc,
        f'Ki67 ROC: {drug.title()}',
        output_path / f'{drug}_roc.pdf',
    )

    precision_recall = PrData.calculate(labels, marker_expr)
    plot_pr(
        precision_recall,
        f'Ki67 Precision-Recall: {drug.title()}',
        output_path / f'{drug}_pr.pdf',
    )
def pmid_26892682_analysis(drug: str):
    """Score the PMID26892682 linear predictor against the labels of one drug.

    Loads the newest ``labels_<drug>.pickle``, restricts labels and the
    module-level ``expr`` matrix to their shared samples and the columns in
    ``selected_genes``, multiplies the expression matrix by the module-level
    ``coefs`` vector and passes the result through ``expit``.  The ROC data
    is pickled under ``data_path``; ROC and precision-recall figures are
    written under ``output_path``.

    :param drug: drug name used in the label file name, the output file
        names and (title-cased) the plot titles.
    """
    feature_label_path = find_newest_data_path(
        f'compute_drug_features_labels_alpha_{args.alpha:.2f}')
    labels_all = pd.read_pickle(feature_label_path / f'labels_{drug}.pickle')

    # Only samples present in both the labels and the expression matrix.
    selected_samples = sorted_intersection(labels_all.index, expr.index)
    selected_expr = expr.loc[selected_samples, selected_genes]
    selected_labels = labels_all.loc[selected_samples]

    # `.values` instead of `.as_matrix()`: as_matrix() was removed in
    # pandas 1.0, while `.values` returns the same ndarray on every version.
    # NOTE(review): the product relies on `coefs` being ordered like
    # `selected_genes` -- confirm where both are defined.
    ln_p_over_1_minus_p = selected_expr.values @ coefs.values
    probs = expit(ln_p_over_1_minus_p)

    rd = RocData.calculate(selected_labels, probs)
    rd.save(data_path / f'roc_data_{drug}.pickle')
    plot_roc(rd, f'PMID26892682 ROC: {drug.title()}', output_path / f'{drug}_roc.pdf')

    pr = PrData.calculate(selected_labels, probs)
    plot_pr(pr, f'PMID26892682 Precision-Recall: {drug.title()}', output_path / f'{drug}_pr.pdf')
#!/usr/bin/env python3 from data_path_utils import ( create_data_path, find_newest_data_path, ) import pandas as pd from gene_mappings import read_entrez_hugo_mapping, read_hugo_entrez_mapping data_path = create_data_path('dump_muts_for_nbs') hugo_entrez_mapping = read_hugo_entrez_mapping() entrez_hugo_mapping = read_entrez_hugo_mapping() mut_path = find_newest_data_path('parse_tcga_mutations') muts_all = pd.read_pickle(mut_path / 'mutations.pickle') gene_sel = pd.Series( [ (gene in hugo_entrez_mapping and hugo_entrez_mapping[gene]) for gene in muts_all.columns ], index=muts_all.columns, ).astype(bool) muts = muts_all.loc[:, gene_sel] muts.columns = [hugo_entrez_mapping[gene] for gene in muts.columns] muts = muts.groupby(axis=1, level=-1).any().astype(int) gene_symbols = [entrez_hugo_mapping[gene] for gene in muts.columns] with open(data_path / 'gene_symbols.txt', 'w') as f: print('Gene', file=f)
sorted_intersection, ) p = ArgumentParser() p.add_argument('--alpha', type=float, default=DEFAULT_ALPHA) if '__file__' in globals(): args = p.parse_args() else: args = p.parse_args([]) script_label = 'snf_cluster_results' data_path = create_data_path(script_label) output_path = create_output_path(script_label) snf_path = find_newest_data_path('run_snf') snf_data = pd.read_csv(snf_path / 'clustering.csv', index_col=0) snf_data.columns = ['cluster'] cluster_assignments = snf_data.iloc[:, 0] clusters = sorted(set(cluster_assignments)) def get_cluster_permutation(permutation: List[int]) -> pd.Series: mapping = dict(zip(clusters, permutation)) relabeled = pd.Series( [mapping[value] for value in cluster_assignments], index=cluster_assignments.index, ) return relabeled
def main():
    """Build real and shuffled LBS-incident edge sets for every patient.

    For each patient with at least one LBS mutation this

    * draws ``shuffle_count`` random gene sets of the same size as the
      patient's LBS gene set, from the genes that are not LBS-mutated in
      that patient,
    * collects the edges (columns named ``<g1>_<g2>``) that touch a real LBS
      gene, and for every shuffle the edges that touch a shuffled gene,
    * plots a histogram of the per-patient number of LBS-incident edges, and
    * pickles the shuffled genes and both kinds of edge sets to
      ``shuffled_muts_edges_by_patient.pickle`` under the script data path.
    """
    script_label = 'prop_edge_lbs_shuffle'
    data_path = create_data_path(script_label)
    output_path = create_output_path(script_label)

    hem = read_hugo_entrez_mapping()

    lbs_mut_path = find_newest_data_path('intersect_muts_lbs')
    lbs_muts = pd.read_csv(lbs_mut_path / 'brca_lbs_muts.csv')

    prop_edge_path = find_newest_data_path(f'propagate_mutations_edges_alpha_{args.alpha:.2f}')
    with pd.HDFStore(prop_edge_path / 'data_propagated.hdf5') as store:
        mut_edge_prop = store['mutations']

    patients_with_lbs_muts = set(lbs_muts.patient)
    print('Patients with LBS mutations:', len(patients_with_lbs_muts))

    # Patient -> Entrez IDs of that patient's LBS-mutated genes.  Genes with
    # no entry in the Hugo -> Entrez mapping are reported and dropped.
    lbs_muts_by_patient = defaultdict(set)
    for i, row in lbs_muts.iterrows():
        if row.gene not in hem:
            print('Skipping gene', row.gene)
            continue
        lbs_muts_by_patient[row.patient].add(hem[row.gene])

    # Columns containing '_' are edges; all remaining columns are genes.
    all_edge_set = {i for i in mut_edge_prop.columns if '_' in i}
    all_edges = sorted(all_edge_set)
    all_gene_set = set(mut_edge_prop.columns) - all_edge_set

    shuffle_count = 100

    sorted_patients = sorted(patients_with_lbs_muts)
    patient_count = len(sorted_patients)

    lbs_edges_by_patient = pd.Series(0, index=sorted_patients)

    # Assign label of 1 for an edge if either node has a LBS mutation
    selected_edges_by_patient: Dict[str, Set[str]] = {}
    shuffled_edges_by_patient: Dict[str, List[Set[str]]] = {}

    shuffled_by_patient = {}
    for i, patient in enumerate(patients_with_lbs_muts, 1):
        print(f'Shuffling LBS mutations for patient {patient} ({i}/{patient_count})')
        muts = lbs_muts_by_patient[patient]
        mut_count = len(muts)

        # random.sample() needs a sequence: passing a set was deprecated in
        # Python 3.9 and raises TypeError since 3.11.  The candidate pool is
        # the same for every shuffle, so build (and sort) it once.
        other_genes = sorted(all_gene_set - muts)

        shuffles = []
        for j in range(shuffle_count):
            new_muts = sample(other_genes, mut_count)
            shuffles.append(new_muts)
        shuffled_by_patient[patient] = shuffles

    # TODO: parallelize this; it's too slow
    for i, patient in enumerate(patients_with_lbs_muts, 1):
        print(f'Computing selected/shuffled edges for patient {i}/{patient_count}')
        lbs_genes = lbs_muts_by_patient[patient]

        selected_edges: Set[str] = set()
        shuffled_edges: List[Set[str]] = [set() for _ in range(shuffle_count)]

        # The stored shuffles are lists; convert each to a set once so the
        # membership tests in the inner loop are O(1).
        shuffled_gene_sets = [
            set(shuffled_genes)
            for shuffled_genes in shuffled_by_patient[patient]
        ]

        edge_scores = mut_edge_prop.loc[patient, all_edges].copy().sort_values(ascending=False)
        for g1_g2 in edge_scores.index:
            g1, g2 = g1_g2.split('_')
            if g1 in lbs_genes or g2 in lbs_genes:
                selected_edges.add(g1_g2)

            # TODO: clean up iteration
            for j, shuffled_genes in enumerate(shuffled_gene_sets):
                if g1 in shuffled_genes or g2 in shuffled_genes:
                    shuffled_edges[j].add(g1_g2)

        lbs_edges_by_patient.loc[patient] = len(selected_edges)
        selected_edges_by_patient[patient] = selected_edges
        shuffled_edges_by_patient[patient] = shuffled_edges

    selected_edge_count = pd.Series(
        {patient: len(edges) for patient, edges in selected_edges_by_patient.items()}
    ).sort_index()

    with new_plot():
        selected_edge_count.plot.hist(bins=25)
        plt.xlabel('Number of LBS-incident edges')
        plt.ylabel('Patients')

        figure_path = output_path / 'lbs_edge_count.pdf'
        print('Saving LBS edge count histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    shuffled_data_path = data_path / 'shuffled_muts_edges_by_patient.pickle'
    print('Saving shuffled muts by patient to', shuffled_data_path)
    with open(shuffled_data_path, 'wb') as f:
        pickle.dump(
            {
                'shuffled_by_patient': shuffled_by_patient,
                'selected_edges_by_patient': selected_edges_by_patient,
                'shuffled_edges_by_patient': shuffled_edges_by_patient,
            },
            f,
        )
p = ArgumentParser() p.add_argument('--alpha', type=float, default=DEFAULT_ALPHA) if '__file__' in globals(): args = p.parse_args() else: args = p.parse_args([]) output_path = create_output_path('consolidated_roc_plot') fold_count = 5 # [0] are labels, [1] are data paths single_curve_input_paths = [ (r'Turnbull $\it{et}$ $\it{al.}$', find_newest_data_path('pmid_26033813_analysis')), (r'Reijm $\it{et}$ $\it{al.}$', find_newest_data_path('pmid_26892682_analysis')), (r'WExT Mutation Set Count', find_newest_data_path('wext_mut_sets')), (r'Network Based Stratification', find_newest_data_path('nbs_cluster_results')), (r'Similarity Network Fusion', find_newest_data_path('snf_cluster_results')), ] crossval_input_paths = [ ( 'Full, {clf}', find_newest_data_path( f'tcga_train_response_stratify_alpha_{args.alpha:.2f}'), ),
) p = ArgumentParser() p.add_argument('--alpha', type=float, default=DEFAULT_ALPHA) p.add_argument('--plot-pca-components', action='store_true') if '__file__' in globals(): args = p.parse_args() else: args = p.parse_args([]) entrez_hugo_mapping = read_entrez_hugo_mapping() output_label = f'compute_drug_features_labels_alpha_{args.alpha:.2f}' data_path = create_data_path(output_label) drug_response_dir = find_newest_data_path('drug_response_labels') tx_info_raw = pd.read_pickle(drug_response_dir / 'tx_info.pickle') network_path = find_newest_data_path('build_hippie_network') / 'network.pickle' print('Loading network from', network_path) with network_path.open('rb') as f: network = pickle.load(f) self_edge_count = 0 # HACK: remove self edges for node in network.nodes: if network.has_edge(node, node): network.remove_edge(node, node) self_edge_count += 1 print(f'Removed {self_edge_count} self-edges from network')
selected_cancer = 'brca' p = ArgumentParser() p.add_argument('--alpha', type=float, default=DEFAULT_ALPHA) if __name__ == '__main__': args = p.parse_args() else: args = p.parse_args([]) label = f'treatment_features_alpha_{args.alpha:.2f}' data_path = create_data_path(label) output_path = create_output_path(label) network_path = find_newest_data_path('build_hippie_network') with (network_path / 'network.pickle').open('rb') as f: network = pickle.load(f) nodes = sorted(network.nodes()) node_set = set(nodes) w_prime = normalize(network) def get_prop_vec(name, genes): s = pd.Series(0.0, index=nodes) gene_set = set(genes) genes_in_network = gene_set & node_set genes_not_in_network = gene_set - node_set print('Drug {}: {} genes in network, {} not'.format( name,
create_data_path, create_output_path, find_newest_data_path, ) import pandas as pd from gene_mappings import read_ensembl_entrez_mapping from utils import sorted_union p = ArgumentParser() p.add_argument('gdc_manifest', type=Path) args = p.parse_args() data_path = create_data_path('consolidate_mrna_expression') input_path = find_newest_data_path('query_cases_by_file') / 'raw_responses' gdc_manifest = pd.read_table(args.gdc_manifest) files_in_manifest = set(gdc_manifest.id) def get_submitter_ids(data: dict): for key, value in data.items(): if key == 'submitter_id': yield value if isinstance(value, dict): yield from get_submitter_ids(value) if isinstance(value, list): for sub_data in value: yield from get_submitter_ids(sub_data)
# Script setup for 'wext_mut_sets': create the output locations, parse the
# command line and load the gene sets selected from the WExT results.
script_label = 'wext_mut_sets'
data_path = create_data_path(script_label)
output_path = create_output_path(script_label)

p = ArgumentParser()
p.add_argument('--alpha', type=float, default=DEFAULT_ALPHA)
# Parse the real command line only when a __file__ global exists; otherwise
# fall back to the defaults (presumably for interactive sessions -- confirm).
if '__file__' in globals():
    args = p.parse_args()
else:
    args = p.parse_args([])

hugo_entrez_mapping = read_hugo_entrez_mapping()

# Manually created
input_path = find_newest_data_path('wext_results')
gene_set_data = pd.read_table(input_path / 'tcga-exclusive-sets-sampled-sets.tsv')
# Rename the first two columns; any further columns keep their names.
cols = list(gene_set_data.columns)
cols[:2] = ['gene_set', 'pvalue']
gene_set_data.columns = cols

# Keep only gene sets whose p-value is strictly below the cutoff.
cutoff = 0.002
selected_gene_set_strs = gene_set_data.loc[gene_set_data.pvalue < cutoff, 'gene_set']
# Each gene set is stored as one comma-separated string.
selected_gene_sets = [set(gene_set.split(',')) for gene_set in selected_gene_set_strs]
# Same sets as Entrez IDs; genes without a mapping are silently dropped.
entrez_gene_sets = [
    {hugo_entrez_mapping[gene] for gene in gene_set if gene in hugo_entrez_mapping}
    for gene_set in selected_gene_sets
]
from propagation import propagate, normalize

# Default for the -s/--subprocesses command-line option.
DEFAULT_SUBPROCESSES = 2

p = ArgumentParser()
p.add_argument('-s', '--subprocesses', type=int, default=DEFAULT_SUBPROCESSES)
p.add_argument('--alpha', type=float, default=DEFAULT_ALPHA)
# Parse the real command line only when run as a script; on import use the
# defaults so the module-level code below still has an `args`.
if __name__ == '__main__':
    args = p.parse_args()
else:
    args = p.parse_args([])

# The output directory label embeds alpha formatted to two decimals.
data_path = create_data_path(f'propagate_mutations_alpha_{args.alpha:.2f}')

with (find_newest_data_path('build_hippie_network') / 'network.pickle').open('rb') as f:
    network = pickle.load(f)
print('Loaded network')
w_prime = normalize(network)
node_set = set(network.nodes())
# Sorted node list gives a stable node order.
nodes = sorted(node_set)
node_count = len(nodes)

# Opened read-only ('r').
with pd.HDFStore(find_newest_data_path('parse_tcga_mutations') / 'mutations.hdf5', 'r') as store:
    mutations = store['muts']
print('Read mutations')

expr = pd.read_pickle(find_newest_data_path('parse_cosmic_diffexpr') / 'brca_expr.pickle')
print('Read log-fold expression with Hugo symbols')

# NOTE(review): presumably a threshold applied to `expr` further down --
# its use is not visible here.
cutoff = 2
plot_roc, sorted_intersection, ) p = ArgumentParser() p.add_argument('--alpha', type=float, default=DEFAULT_ALPHA) if '__file__' in globals(): args = p.parse_args() else: args = p.parse_args([]) script_label = 'ki67_analysis' data_path = create_data_path(script_label) output_path = create_output_path(script_label) expr_path = find_newest_data_path('parse_cosmic_diffexpr') expr = pd.read_pickle(expr_path / 'brca_expr.pickle') gene = 'MKI67' def ki67_analysis(drug: str): feature_label_path = find_newest_data_path( f'compute_drug_features_labels_alpha_{args.alpha:.2f}') labels_all = pd.read_pickle(feature_label_path / f'labels_{drug}.pickle') selected_samples = sorted_intersection(labels_all.index, expr.index) selected_expr = expr.loc[selected_samples, gene] selected_labels = labels_all.loc[selected_samples] rd = RocData.calculate(selected_labels, selected_expr)
from propagation import propagate, normalize

# Default for the -s/--subprocesses command-line option.
DEFAULT_SUBPROCESSES = 2

p = ArgumentParser()
p.add_argument('-s', '--subprocesses', type=int, default=DEFAULT_SUBPROCESSES)
p.add_argument('--alpha', type=float, default=DEFAULT_ALPHA)
# Parse the real command line only when run as a script; on import use the
# defaults so the module-level code below still has an `args`.
if __name__ == '__main__':
    args = p.parse_args()
else:
    args = p.parse_args([])

# The output directory label embeds alpha formatted to two decimals.
data_path = create_data_path(f'propagate_mutations_edges_alpha_{args.alpha:.2f}')

with (find_newest_data_path('build_hippie_network') / 'network.pickle').open('rb') as f:
    orig_network = pickle.load(f)
print('Loaded network')

self_edge_count = 0
# HACK: remove self edges
# Only edges are removed while iterating, never nodes.
for node in orig_network.nodes:
    if orig_network.has_edge(node, node):
        orig_network.remove_edge(node, node)
        self_edge_count += 1
print(f'Removed {self_edge_count} self-edges from original network')

# NOTE(review): insert_dummy_edge_nodes presumably adds one node per edge,
# named via join_string_keys -- confirm against its definition.
network = insert_dummy_edge_nodes(orig_network, edge_name_func=join_string_keys)

w_prime = normalize(network)
node_set = set(network.nodes())
'HIF1A', 'CYP17A1', 'HSD17B1', 'ARSC', 'NFKB1', 'HSD17B3', 'BLM', 'NR3C1', 'HSD11B2', ], 'femara': [ 'ARSC', 'CYP11B1', 'CYP11B2', 'CYP19A1', 'CYP26A1', ], } er_target_path = find_newest_data_path('parse_er_targets') / 'er_targets.txt' with er_target_path.open() as f: print('Assigning ER targets from', er_target_path) targets_raw['er_targets'] = set(line.strip() for line in f) targets = { drug: [entrez_id_mapping[target] for target in target_list] for drug, target_list in targets_raw.items() } drugs = sorted(targets)
def main():
    """Compare propagated edge scores with LBS-mutation-incident edges.

    Uses the propagated mutation/edge matrix and the real and shuffled edge
    sets pickled by the ``prop_edge_lbs_shuffle`` step, then runs these
    analyses, writing data under ``data_path`` and figures under
    ``output_path``:

    * per-patient NDCG of LBS edges ranked by propagated score, real vs.
      shuffled, with a Kolmogorov-Smirnov comparison,
    * per-patient ROC AUC and average precision, real vs. shuffled,
    * per-patient Spearman correlation P-values, real vs. shuffled,
    * one ROC computation over the flattened patient x edge matrix,
    * survival P-values of edges with vs. without LBS mutations,
    * Mann-Whitney comparisons of survival ``r_square`` between the top
      ranked edges and random selections of the remaining edges.
    """
    script_label = 'prop_edge_lbs_overlap'
    data_path = create_data_path(script_label)
    output_path = create_output_path(script_label)

    hem = read_hugo_entrez_mapping()

    lbs_mut_path = find_newest_data_path('intersect_muts_lbs')
    lbs_muts = pd.read_csv(lbs_mut_path / 'brca_lbs_muts.csv')

    prop_edge_path = find_newest_data_path(
        f'propagate_mutations_edges_alpha_{args.alpha:.2f}')
    with pd.HDFStore(prop_edge_path / 'data_propagated.hdf5') as store:
        mut_edge_prop = store['mutations']

    patients_with_lbs_muts = set(lbs_muts.patient)
    print('Patients with LBS mutations:', len(patients_with_lbs_muts))

    # Patient -> Entrez IDs of that patient's LBS-mutated genes.  Genes with
    # no entry in the Hugo -> Entrez mapping are reported and dropped.
    lbs_muts_by_patient = defaultdict(set)
    for i, row in lbs_muts.iterrows():
        if row.gene not in hem:
            print('Skipping gene', row.gene)
            continue
        lbs_muts_by_patient[row.patient].add(hem[row.gene])

    # Columns containing '_' are edges.
    all_edge_set = {i for i in mut_edge_prop.columns if '_' in i}
    all_edges = sorted(all_edge_set)
    edge_prop = mut_edge_prop.loc[:, all_edges]

    shuffle_count = 100

    sorted_patients = sorted(patients_with_lbs_muts)
    patient_count = len(sorted_patients)

    ndcg = pd.Series(0.0, index=sorted_patients)
    shuffled_ndcg = pd.DataFrame(0.0, index=sorted_patients, columns=range(shuffle_count))
    lbs_edges_by_patient = pd.Series(0, index=sorted_patients)

    print('Loading shuffled data')
    prop_lbs_shuffle_path = find_newest_data_path('prop_edge_lbs_shuffle')
    with open(prop_lbs_shuffle_path / 'shuffled_muts_edges_by_patient.pickle', 'rb') as f:
        d = pickle.load(f)
    shuffled_by_patient = d['shuffled_by_patient']
    selected_edges_by_patient = d['selected_edges_by_patient']
    shuffled_edges_by_patient = d['shuffled_edges_by_patient']

    ## NDCG analysis
    # For each patient, rank edges by propagated mutation scores, assign label of 1 if
    # either node connected to that edge has a LBS mutation
    for i, patient in enumerate(patients_with_lbs_muts, 1):
        print(f'Computing NDCG for patient {i}/{patient_count}')
        edge_scores = mut_edge_prop.loc[patient, all_edges].copy().sort_values(
            ascending=False)

        selected_edges = selected_edges_by_patient[patient]
        shuffled_edge_list = shuffled_edges_by_patient[patient]

        # Record the number of LBS-incident edges; previously this series was
        # stored below without ever being filled in (all zeros).
        lbs_edges_by_patient.loc[patient] = len(selected_edges)

        relevance = np.array([e in selected_edges for e in edge_scores.index]).astype(float)
        # Last element: the gain accumulated over the complete ranking.
        ndcg.loc[patient] = normalized_discounted_cumulative_gain(
            relevance)[-1]

        for j, shuffled_edges in enumerate(shuffled_edge_list):
            shuffled_relevance = np.array(
                [e in shuffled_edges for e in edge_scores.index]).astype(float)
            shuffled_ndcg.loc[patient, j] = normalized_discounted_cumulative_gain(
                shuffled_relevance)[-1]

    with pd.HDFStore(data_path / 'ndcg_data.hdf5') as store:
        store['ndcg'] = ndcg
        store['shuffled_ndcg'] = shuffled_ndcg
        store['lbs_edges_by_patient'] = lbs_edges_by_patient

    shuffled_ndcg_flat = shuffled_ndcg.unstack()
    #shuffled_ndcg_median = shuffled_ndcg.median(axis=1)

    with new_plot():
        ndcg.plot.hist(bins=hist_bin_count)
        plt.title('NDCG histogram')
        plt.xlabel(
            'Patient NDCG score: selection of LBS edges by propagated edge score'
        )

        figure_path = output_path / 'ndcg_hist.pdf'
        print('Saving NDCG histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    with new_plot():
        shuffled_ndcg_flat.plot.hist(bins=hist_bin_count)
        plt.title('NDCG histogram')
        plt.xlabel(
            'Patient NDCG score: selection of shuffled LBS edges by propagated edge score'
        )

        figure_path = output_path / 'shuffled_ndcg_hist.pdf'
        print('Saving NDCG histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    ndcg_ks = scipy.stats.ks_2samp(ndcg, shuffled_ndcg_flat)
    ndcg_ks_pvalue_str = to_matplotlib_sci_notation(ndcg_ks[1])

    with new_plot():
        ndcg.plot.hist(
            bins=hist_bin_count,
            alpha=0.8,
            label='Real NDCG',
            density=True,
        )
        shuffled_ndcg_flat.plot.hist(
            bins=hist_bin_count,
            alpha=0.8,
            label='Shuffled NDCG, across 100 permutations',
            density=True,
        )
        plt.xlabel(
            'Patient NDCG score: selection of LBS edges by propagated edge score'
        )
        plt.legend()
        plt.figtext(
            0.89,
            0.7,
            f'Kolmogorov-Smirnov $P = {ndcg_ks_pvalue_str}$',
            horizontalalignment='right',
        )

        figure_path = output_path / 'ndcg_both_hist.pdf'
        print('Saving NDCG histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')
    ## /NDCG analysis

    ## PR and ROC AUC analysis
    roc_auc = pd.Series(0.0, index=sorted_patients)
    average_pr_scores = pd.Series(0.0, index=sorted_patients)
    shuffled_roc_auc = pd.DataFrame(0.0, index=sorted_patients, columns=range(shuffle_count))
    shuffled_average_pr_scores = pd.DataFrame(0.0, index=sorted_patients, columns=range(shuffle_count))

    # Maps patient IDs to performance objects
    roc_data_objects = {}
    pr_data_objects = {}

    for i, patient in enumerate(patients_with_lbs_muts, 1):
        print(
            f'Computing classifier performance for patient {i}/{patient_count}'
        )
        selected_edges: Set[str] = selected_edges_by_patient[patient]
        edge_scores = mut_edge_prop.loc[patient, all_edges].copy()
        labels = np.array([e in selected_edges for e in edge_scores.index]).astype(float)

        rd = RocData.calculate(labels, edge_scores)
        roc_data_objects[patient] = rd
        roc_auc.loc[patient] = rd.auc

        pr = PrData.calculate(labels, edge_scores)
        pr_data_objects[patient] = pr
        average_pr_scores.loc[patient] = average_precision_score(
            labels, edge_scores)

        shuffled_edge_list: List[Set[str]] = shuffled_edges_by_patient[patient]

        for j, shuffled_edges in enumerate(shuffled_edge_list):
            shuffled_labels = np.array(
                [e in shuffled_edges for e in edge_scores.index]).astype(float)

            shuffled_rd = RocData.calculate(shuffled_labels, edge_scores)
            shuffled_roc_auc.loc[patient, j] = shuffled_rd.auc
            shuffled_average_pr_scores.loc[patient, j] = average_precision_score(
                shuffled_labels,
                edge_scores,
            )

    with pd.HDFStore(data_path / 'classifier_data.hdf5') as store:
        store['roc_auc'] = roc_auc
        store['average_pr'] = average_pr_scores
        store['shuffled_roc_auc'] = shuffled_roc_auc
        store['shuffled_average_pr'] = shuffled_average_pr_scores

    with new_plot():
        roc_auc.plot.hist(bins=hist_bin_count)
        plt.title('ROC AUC histogram')
        plt.xlabel(
            'Patient ROC AUC: selection of LBS edges by propagated edge score')

        figure_path = output_path / 'roc_auc_hist.pdf'
        print('Saving ROC AUC histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    #shuffled_roc_auc_median = shuffled_roc_auc.median(axis=1)
    shuffled_roc_auc_flat = shuffled_roc_auc.unstack()

    with new_plot():
        shuffled_roc_auc_flat.plot.hist(bins=hist_bin_count)
        plt.title('ROC AUC histogram')
        plt.xlabel(
            'Patient ROC AUC: selection of shuffled LBS edges by propagated edge score'
        )

        figure_path = output_path / 'shuffled_roc_auc_hist.pdf'
        print('Saving ROC AUC histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    roc_auc_ks = scipy.stats.ks_2samp(roc_auc, shuffled_roc_auc_flat)
    roc_auc_ks_pvalue_str = to_matplotlib_sci_notation(roc_auc_ks[1])

    with new_plot():
        roc_auc.plot.hist(
            bins=hist_bin_count,
            alpha=0.8,
            label='Real ROC AUC',
            density=True,
        )
        shuffled_roc_auc_flat.plot.hist(
            bins=50,
            alpha=0.8,
            label='Shuffled ROC AUC, across 100 permutations',
            density=True,
        )
        plt.xlabel(
            'Patient ROC AUC: selection of LBS edges by propagated edge score')
        plt.legend()
        plt.figtext(
            0.14,
            0.7,
            f'Kolmogorov-Smirnov $P = {roc_auc_ks_pvalue_str}$',
            horizontalalignment='left',
        )

        figure_path = output_path / 'roc_auc_both_hist.pdf'
        print('Saving ROC AUC histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    with new_plot():
        average_pr_scores.plot.hist(bins=hist_bin_count)
        plt.title('Average precision histogram')
        plt.xlabel(
            'Average precision: selection of LBS edges by propagated edge score'
        )

        figure_path = output_path / 'avg_prec_hist.pdf'
        print('Saving AP histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    shuffled_average_pr_median = shuffled_average_pr_scores.median(axis=1)

    with new_plot():
        shuffled_average_pr_median.plot.hist(bins=hist_bin_count)
        plt.title('Average precision histogram')
        plt.xlabel(
            'Average precision: selection of shuffled LBS edges by propagated edge score'
        )

        figure_path = output_path / 'shuffled_avg_prec_hist.pdf'
        print('Saving AP histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    top_n = 4
    rest_uniform = 6

    sorted_pr_scores = average_pr_scores.dropna().sort_values()
    usable_patient_count = sorted_pr_scores.shape[0]

    # The `top_n` best patients, plus `rest_uniform` patients spaced
    # uniformly over the rest
    patient_indexes = list(
        np.linspace(
            0,
            usable_patient_count - 1 - top_n,
            num=rest_uniform,
        ).astype(int))
    patient_indexes.extend(
        range(usable_patient_count - top_n, usable_patient_count))
    # Reversed so the highest-scoring patients come first in the legend.
    selected_patients = sorted_pr_scores.index[list(reversed(patient_indexes))]

    with new_plot():
        plt.figure(figsize=(10, 10))
        for patient in selected_patients:
            prd = pr_data_objects[patient]
            plt.plot(prd.rec, prd.prec, label=patient)

        plt.ylim([0.0, 1.05])
        plt.xlim([0.0, 1.0])
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        # gca() returns the axes that was just plotted on; a bare plt.axes()
        # may create a new, empty axes on current Matplotlib versions.
        plt.gca().set_aspect('equal', 'datalim')
        plt.legend()
        plt.title(
            f'Precision-recall: top {top_n} patients, uniform spacing of bottom {rest_uniform}'
        )

        figure_path = output_path / 'pr_selected.pdf'
        print('Saving selected PR curves to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')
    ## /PR and ROC AUC analysis

    ## Spearman correlation P-value analysis
    spearman_pvalues = pd.Series(0.0, index=sorted_patients)
    shuffled_spearman_pvalues = pd.DataFrame(0.0, index=sorted_patients, columns=range(shuffle_count))

    for i, patient in enumerate(patients_with_lbs_muts, 1):
        print(
            f'Computing Spearman correlation P-value for patient {i}/{patient_count}'
        )
        selected_edges: Set[str] = selected_edges_by_patient[patient]
        edge_scores = mut_edge_prop.loc[patient, all_edges].copy()
        labels = np.array([e in selected_edges for e in edge_scores.index]).astype(float)

        spearman_result = scipy.stats.spearmanr(edge_scores, labels)
        spearman_pvalue = spearman_result[1]
        spearman_pvalues.loc[patient] = spearman_pvalue

        shuffled_edge_list: List[Set[str]] = shuffled_edges_by_patient[patient]

        for j, shuffled_edges in enumerate(shuffled_edge_list):
            shuffled_labels = np.array(
                [e in shuffled_edges for e in edge_scores.index]).astype(float)

            shuffled_spearman_result = scipy.stats.spearmanr(
                edge_scores, shuffled_labels)
            shuffled_spearman_pvalue = shuffled_spearman_result[1]
            shuffled_spearman_pvalues.loc[patient, j] = shuffled_spearman_pvalue

    # The values computed above are saved as they are.  They used to be
    # overwritten here by a reload from a hard-coded, timestamped directory
    # of one earlier run, which discarded the computation and broke the
    # script wherever that directory did not exist.
    with pd.HDFStore(data_path / 'spearman_pvalues.hdf5') as store:
        store['spearman_pvalues'] = spearman_pvalues
        store['shuffled_spearman_pvalues'] = shuffled_spearman_pvalues

    # Plot on a -log10 scale, dropping NaN and infinite values.
    nl10_spearman_pvalues_all = -np.log10(spearman_pvalues)
    nl10_spearman_pvalues = nl10_spearman_pvalues_all.loc[
        ~(nl10_spearman_pvalues_all.isnull()) &
        ~(np.isinf(nl10_spearman_pvalues_all))]

    with new_plot():
        nl10_spearman_pvalues.plot.hist(bins=50)
        plt.title('Spearman $P$-value histogram')
        plt.xlabel(
            'Spearman $P$-values ($-\\log_{10}$): LBS edges vs. prop. edge score'
        )

        figure_path = output_path / 'spearman_pvalue_hist.pdf'
        print('Saving Spearman P-value histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    shuffled_spearman_pvalues_flat = shuffled_spearman_pvalues.unstack()
    nl10_shuffled_spearman_pvalues_flat_all = -np.log10(
        shuffled_spearman_pvalues_flat)
    nl10_shuffled_spearman_pvalues_flat = nl10_shuffled_spearman_pvalues_flat_all.loc[
        ~(nl10_shuffled_spearman_pvalues_flat_all.isnull()) &
        ~(np.isinf(nl10_shuffled_spearman_pvalues_flat_all))]

    with new_plot():
        nl10_shuffled_spearman_pvalues_flat.plot.hist(bins=50)
        plt.title('Spearman $P$-value histogram')
        plt.xlabel(
            'Spearman $P$-values ($-\\log_{10}$): shuffled LBS edges vs. prop. edge score'
        )

        figure_path = output_path / 'shuffled_spearman_pvalue_hist.pdf'
        print('Saving Spearman P-value histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    spearman_ks = scipy.stats.ks_2samp(spearman_pvalues, shuffled_spearman_pvalues_flat)
    spearman_ks_pvalue_str = to_matplotlib_sci_notation(spearman_ks[1])

    with new_plot():
        nl10_spearman_pvalues.plot.hist(
            bins=hist_bin_count,
            alpha=0.8,
            label='Real Spearman $P$-values',
            density=True,
        )
        nl10_shuffled_spearman_pvalues_flat.plot.hist(
            bins=hist_bin_count,
            alpha=0.8,
            label='Shuffled Spearman $P$-values, across 100 permutations',
            density=True,
        )
        plt.xlabel(
            'Spearman $P$-values ($-\\log_{10}$): LBS edges vs. prop. edge score'
        )
        plt.legend()
        plt.figtext(
            0.89,
            0.7,
            f'Kolmogorov-Smirnov $P = {spearman_ks_pvalue_str}$',
            horizontalalignment='right',
        )

        figure_path = output_path / 'spearman_pvalues_both_hist.pdf'
        print('Saving Spearman P-value histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')
    ## /Spearman correlation P-value analysis

    ## Overall ROC AUC
    print('Creating binary LBS edge matrix')
    lbs_edge_matrix = pd.DataFrame(0, index=edge_prop.index, columns=edge_prop.columns)
    for patient, edges in selected_edges_by_patient.items():
        lbs_edge_matrix.loc[patient, list(edges)] = 1

    lbs_matrix_path = data_path / 'lbs_edge_matrix.hdf5'
    print('Saving LBS edge matrix to', lbs_matrix_path)
    with pd.HDFStore(lbs_matrix_path) as store:
        store['lbs_edge_matrix'] = lbs_edge_matrix

    # Flatten both matrices and put the labels in the order of the sorted scores.
    sorted_flattened_edge_scores = edge_prop.unstack().sort_values(
        ascending=False)
    flattened_lbs_edges = lbs_edge_matrix.unstack()
    ordered_flattened_lbs_edges = flattened_lbs_edges.loc[
        sorted_flattened_edge_scores.index]

    flattened_rd = RocData.calculate(ordered_flattened_lbs_edges, sorted_flattened_edge_scores)
    flattened_rd_path = data_path / 'flattened_rd.pickle'
    print('Saving flattened vector RocData to', flattened_rd_path)
    with open(flattened_rd_path, 'wb') as f:
        pickle.dump(flattened_rd, f)
    ## /Overall ROC AUC

    ## Survival analysis
    edge_prop_survival_dir = find_newest_data_path('edge_prop_survival')
    survival_data = pd.read_csv(edge_prop_survival_dir / 'univariate_surv_results.csv', index_col=0)
    # Indexed by gene/edge, across all patients
    surv_edge_sel = [('_' in i) for i in survival_data.index]
    edge_survival_data = survival_data.loc[surv_edge_sel, :]

    lbs_mut_edge_matrix = pd.DataFrame(
        0.0,
        index=sorted(selected_edges_by_patient),
        columns=all_edges,
    )
    for patient, edges in selected_edges_by_patient.items():
        lbs_mut_edge_matrix.loc[patient, list(edges)] = 1

    # Binary vector: is this edge incident on a LBS mut in at least one patient?
    edges_with_lbs_muts = lbs_mut_edge_matrix.sum(axis=0).astype(bool)

    surv_pvalues_with_lbs = edge_survival_data.loc[edges_with_lbs_muts, 'pvalue']
    surv_pvalues_with_lbs.name = 'With LBS'
    surv_pvalues_without_lbs = edge_survival_data.loc[~edges_with_lbs_muts, 'pvalue']
    surv_pvalues_without_lbs.name = 'Without LBS'

    # NOTE(review): this test result is computed but not reported anywhere.
    ks_res = scipy.stats.ks_2samp(surv_pvalues_with_lbs, surv_pvalues_without_lbs)

    with new_plot():
        plot_cdf(surv_pvalues_with_lbs)
        plot_cdf(surv_pvalues_without_lbs)
        plt.legend()
        plt.ylabel('CDF')
        plt.xlabel('Univariate Cox Regression $P$-value')

        figure_path = output_path / 'surv_pvalue_cdfs.pdf'
        plt.savefig(figure_path, bbox_inches='tight')

    with new_plot():
        fig = plt.figure()
        surv_pvalues_with_lbs.plot.hist(bins=50, ax=plt.gca(), alpha=0.5)
        surv_pvalues_without_lbs.plot.hist(bins=50, ax=plt.gca(), alpha=0.5)
        # A lone positional string is taken as the label sequence, not as a
        # location, so the location has to be passed through `loc`.
        plt.legend(loc='upper left')
        plt.xlabel('Univariate Cox Regression $P$-value')

        figure_path = output_path / 'surv_pvalue_hist.pdf'
        plt.savefig(figure_path, bbox_inches='tight')
    ## /Survival analysis

    ## Permuted survival analysis
    # NOTE: despite the name, this holds the 'r_square' column.
    pvalues = edge_survival_data.loc[:, 'r_square']

    # Values of k: four fractions of the row count of edge_prop plus 15
    # log-spaced values between 10 and 1000.
    ks_manual = (np.array([0.1, 0.2, 0.25, 0.3]) * edge_prop.shape[0]).astype(int)
    ks_auto = np.logspace(1, 3, num=15).astype(int)
    ks = sorted(chain(ks_manual, ks_auto))

    edge_count = 1000

    # One LaTeX frame per k, including the figure saved for that k below.
    template = dedent('''
    \\begin{{frame}}[plain]
        \\begin{{center}}
            \\includegraphics[width=0.7\\textwidth]{{survival_rsquare_hist_k_{k}}}
        \\end{{center}}
    \\end{{frame}}
    ''')

    with open(data_path / 'figure_include.tex', 'w') as f:
        for k in ks:
            print(template.format(k=k), file=f)

    for k in ks:
        print('Computing edge ranking results for k =', k)
        edge_ranking = get_rank_k_edge_values(edge_prop, k)
        sorted_edge_scores = edge_ranking.sort_values(ascending=False)

        top_edges = sorted_edge_scores.iloc[:edge_count]
        top_edge_pvalues = pvalues.loc[top_edges.index]
        bottom_edges = sorted_edge_scores.iloc[edge_count:]

        permutation_count = 1000
        permutation_pvalues = pd.Series(0.0, index=range(permutation_count))

        for i in range(permutation_count):
            # 100 edges drawn (with replacement) from outside the top edges.
            edge_selection = np.random.choice(bottom_edges.index, size=100)
            selected_pvalues = pvalues.loc[edge_selection]
            comparison_result = scipy.stats.mannwhitneyu(
                top_edge_pvalues,
                selected_pvalues,
                alternative='greater',
            )
            permutation_pvalues.iloc[i] = comparison_result.pvalue

        nl10_permutation_pvalues = -np.log10(permutation_pvalues)

        with new_plot():
            plt.figure(figsize=(5, 5))
            nl10_permutation_pvalues.plot.hist(bins=50)
            title = (f'Survival $R^2$: top {edge_count} edges ($k = {k}$) vs. '
                     f'{permutation_count} random selections')
            plt.title(title)
            plt.xlabel('$- \\log_{10}$($P$-value) from Mann-Whitney $U$ test')

            # Reference lines at P = 0.05 (red) and P = 0.001 (black).
            nl10_0_05 = -np.log10(0.05)
            plt.axvline(x=nl10_0_05, color='#FF0000FF')
            nl10_0_001 = -np.log10(0.001)
            plt.axvline(x=nl10_0_001, color='#000000FF')

            figure_path = output_path / f'survival_rsquare_hist_k_{k}.pdf'
            print('Saving survival R^2 histogram to', figure_path)
            plt.savefig(figure_path, bbox_inches='tight')
# One row per drug: `raw_data` is transposed so its outer keys become rows.
drug_target_data_all = pd.DataFrame(raw_data).T
# Select those with protein targets
drug_target_data = drug_target_data_all.loc[drug_target_data_all.gene_symbols.notnull(), :]

dtd_path = data_path / 'drug_targets.pickle'
print('Saving drug target data matrix to', dtd_path)
drug_target_data.to_pickle(dtd_path)

# Synonym -> row names that list it (`synonyms` is a comma-separated string
# per row).
synonyms = defaultdict(list)
# `.items()` instead of `.iteritems()`: iteritems() was removed in
# pandas 2.0, while items() yields the same (index, value) pairs on every
# version.
for row_name, synonym_csv in drug_target_data.synonyms.items():
    for synonym in synonym_csv.split(','):
        synonyms[synonym].append(row_name)

# Number of rows sharing each synonym.
synonym_counts = pd.Series({k: len(v) for k, v in synonyms.items()})

with (find_newest_data_path('build_hippie_network') / 'network.pickle').open('rb') as f:
    network = pickle.load(f)
print('Loaded network')

nodes = sorted(network.nodes())
node_set = set(nodes)

w_prime = normalize(network)

# Union of every row's target genes (`gene_symbols` holds an iterable per row).
all_targets = set(chain.from_iterable(drug_target_data.gene_symbols))
both = node_set & all_targets

print('Nodes in network:', len(node_set))
print('Genes targeted by at least one drug:', len(all_targets))
print('Overlap between network and targets:', len(both))
def get_genes() -> Iterable[str]:
    """Return the transcription factor names found in the annotated hits.

    Reads ``hits_max_annotated_in_transmir.pickle`` from the newest
    ``tf_mirna_hits_both`` data directory and keeps, for every entry of its
    ``tf_name`` column, the part before the first ``'::'``.

    :return: set of the distinct name prefixes.
    """
    hits_path = (
        find_newest_data_path('tf_mirna_hits_both')
        / 'hits_max_annotated_in_transmir.pickle'
    )
    hits = pd.read_pickle(hits_path)
    return {tf_name.split('::')[0] for tf_name in hits.tf_name}
create_data_path, find_newest_data_path, ) import pandas as pd from scipy.stats import pearsonr from utils import consolidate_data_frames, sorted_intersection data_path = create_data_path('tcga_lincs_expr_features') drugs = [ 'arimidex', 'taxol', ] tcga_expr_path = find_newest_data_path( 'parse_cosmic_diffexpr') / 'brca_expr.pickle' print('Reading expression data from', tcga_expr_path) tcga_expr = pd.read_pickle(tcga_expr_path) lincs_expr = pd.read_csv( find_newest_data_path('gct_drug_subset') / 'subset.csv', header=None, index_col=0, ) lincs_expr.columns = drugs lincs_genes = set(lincs_expr.index) tcga_genes = set(tcga_expr.columns) lincs_benchmark_gene_data = pd.read_excel(DATA_PATH / 'Landmark_Genes_n978.xlsx')