decoder = Decoder() discriminator = Discriminator() m_VAE = nn.Sequential(encoder, decoder) m_GAN = nn.Sequential(decoder, discriminator) m_FULL = nn.Sequential(encoder, decoder, discriminator) csv_files = [] datasets = CANCER_TYPES for cur_ds in datasets: dataset = cur_ds constants.update_dirs(DATASET_NAME_u=dataset) data_normalizaton = "fpkm" gene_expression_file_name, phenotype_file_name, survival_file_name, mutation_file_name, mirna_file_name, pval_preprocessing_file_name = \ build_gdc_params(dataset=dataset, data_normalizaton=data_normalizaton) csv_files.append( os.path.join(constants.DATA_DIR, gene_expression_file_name)) trainset = CancerTypesDataset(csv_files=csv_files, labels=datasets) trainloader = torch.utils.data.DataLoader(trainset, batch_size=100, shuffle=True, num_workers=40, pin_memory=True) testset = trainset # CancerTypesDataset(csv_files=csv_files, labels=datasets) testloader = trainloader # torch.utils.data.DataLoader(trainset, batch_size=10, # shuffle=True, num_workers=10) criterion = nn.BCELoss()
def main(dataset="COMB"): constants.update_dirs(DATASET_NAME_u=dataset) data_normalizaton = "fpkm_bc" cur_json = "cancer_types" meta_groups = [json.load(file("../groups/{}.json".format(cur_json)))] filter_expression = json.load(file("../filters/{}.json".format(cur_json))) gene_expression_file_name, phenotype_file_name, survival_file_name, mutation_file_name, mirna_file_name, pval_preprocessing_file_name = build_gdc_params( dataset=dataset, data_normalizaton=data_normalizaton) tested_gene_expression, h_rows, h_cols, labels_assignment, survival_dataset = infra.load_integrated_ge_data( "dip_bg.txt", "dip_bg.txt", gene_expression_file_name, survival_file_name, phenotype_file_name, gene_filter_file_name=None, filter_expression=filter_expression, meta_groups=meta_groups, var_th_index=None) h_cols = [x.split('.')[0] for x in h_cols] pd.DataFrame(index=h_rows, columns=h_cols, data=tested_gene_expression).T.to_csv(os.path.join( constants.DATA_DIR, 'ge.tsv'), index_label="eid", sep="\t") var_th_index = None start_k = 2 end_k = 2 # algos = ["matisse", "keypathwayminer_INES_GREEDY", "netbox", "hotnet2", "bionet", "jactivemodules_greedy", # "jactivemodules_sa", "reactomefi"] algos = ["matisse"] run_dataset(dataset, score_method=constants.DEG_EDGER) gene_list_file_names = [] generate_plot = True clustering_algorithm = "correlation" for cur_algo in algos: algo_output = json.loads( file( os.path.join(constants.OUTPUT_DIR, "{}_client_output.txt".format( cur_algo))).read().split("\n")[1]) i = 0 algo_pvals = [] random_pvals = [] while True: algo_genes_flatted = [ x['eid'] for x in algo_output if i in x['modules'] ] if len(algo_genes_flatted) == 0 and i > 0: break if len(algo_genes_flatted) == 0: i += 1 continue gene_list_file_names.append( os.path.join(constants.LIST_DIR, cur_algo + ".txt")) file(gene_list_file_names[-1], 'w+').write("\n".join(algo_genes_flatted)) algo_pvals.append( find_clusters_and_survival( tested_gene_list_file_name=gene_list_file_names[-1], total_gene_list_file_name="protein_coding.txt", gene_expression_file_name=gene_expression_file_name, phenotype_file_name=phenotype_file_name, survival_file_name=survival_file_name, var_th_index=var_th_index, is_unsupervised=True, start_k=start_k, end_k=end_k, filter_expression=filter_expression, meta_groups=meta_groups, clustering_algorithm=clustering_algorithm, plot=generate_plot)) random_set_file_name = generate_random_set( random_size=len(algo_genes_flatted), meta_gene_set="dip_bg.txt") random_pvals.append( find_clusters_and_survival( tested_gene_list_file_name=random_set_file_name, total_gene_list_file_name="protein_coding.txt", gene_expression_file_name=gene_expression_file_name, phenotype_file_name=phenotype_file_name, survival_file_name=survival_file_name, var_th_index=var_th_index, is_unsupervised=True, start_k=start_k, end_k=end_k, filter_expression=filter_expression, meta_groups=meta_groups, clustering_algorithm=clustering_algorithm, plot=generate_plot)) i += 1 print " algo pvals" print algo_pvals print "# above TH: {}".format( len([x for x in algo_pvals if any(y < 0.001 for y in x)])) print " random pvals" print random_pvals print "# above TH: {}".format( len([x for x in random_pvals if any(y < 0.001 for y in x)])) print "# of modules better over random: {}/{}".format( len([ x for x1, x2 in zip(algo_pvals, random_pvals) if x1[0] < x2[0] ]), len(algo_pvals))
from utils.rfe import RANDOMIZED, REVERSED, NORMAL, RFE, print_to_excel from utils.param_builder import build_gdc_params import constants import json from utils.pca import plot_detailed_pca from utils.groups_generator import generate_random_set from infra import load_integrated_ge_data import numpy as np import matplotlib.pyplot as plt import os dataset = "BRCA" constants.update_dirs(DATASET_NAME_u=dataset) data_normalizaton = "counts_normalized_by_genes_standardization" gene_expression_file_name, phenotype_file_name, survival_file_name, mutation_file_name, mirna_file_name, pval_preprocessing_file_name = build_gdc_params( dataset=dataset, data_normalizaton=data_normalizaton) constants.PHENOTYPE_FORMAT = "TCGA" phenotype_file_name = 'BRCA_clinicalMatrix' cur_json = "brca_pam53" meta_groups = None filter_expression = None meta_groups = [json.load(file("../groups/{}.json".format(cur_json)))] filter_expression = json.load(file("../filters/{}.json".format(cur_json))) var_th_index = None gene_list_file_name = "dip_bg.txt" rounds = 1 rank_method = DISTANCE recursion_number_of_steps = 20 recursion_step_size = 50
def main(dataset="BRCA"): constants.update_dirs(DATASET_NAME_u=dataset) data_normalizaton = "counts_normalized_by_genes_standardization" cur_json = "brca_pam53" meta_groups = None filter_expression = None meta_groups = [json.load(file("../groups/{}.json".format(cur_json)))] filter_expression = json.load(file("../filters/{}.json".format(cur_json))) gene_expression_file_name, phenotype_file_name, survival_file_name, mutation_file_name, mirna_file_name, pval_preprocessing_file_name = build_gdc_params( dataset=dataset, data_normalizaton=data_normalizaton) phenotype_file_name = 'BRCA_clinicalMatrix' tested_gene_expression, h_rows, h_cols, labels_assignment, survival_dataset = infra.load_integrated_ge_data( "dip_bg.txt", "dip_bg.txt", gene_expression_file_name, survival_file_name, phenotype_file_name, gene_filter_file_name=None, filter_expression=filter_expression, meta_groups=meta_groups, var_th_index=None) file(os.path.join(constants.DATA_DIR, "classes.tsv"), 'w+').write('\t'.join([str(x) for x in labels_assignment[0]])) h_cols = [x.split('.')[0] for x in h_cols] df_data = pd.DataFrame(index=h_rows, columns=h_cols, data=tested_gene_expression).T df_data.to_csv(os.path.join(constants.DATA_DIR, 'ge.tsv'), index_label="eid", sep="\t") var_th_index = None start_k = 2 end_k = 2 # algos = ["matisse", "keypathwayminer_INES_GREEDY", "netbox", "hotnet2", "bionet", "jactivemodules_greedy", # "jactivemodules_sa", "reactomefi"] algos = ["netbox"] run_dataset(dataset, score_method=constants.DEG_EDGER) gene_list_file_names = [] generate_plot = True clustering_algorithm = "correlation" for cur_algo in algos: algo_output = json.loads( file( os.path.join(constants.OUTPUT_DIR, "{}_client_output.txt".format( cur_algo))).read().split("\n")[1]) i = 0 algo_pvals = [] random_pvals = [] df_mean = pd.DataFrame() all_algo_genes_flatted = [] gene_2_module = {} while True: algo_genes_flatted = [ x['eid'] for x in algo_output if i in x['modules'] ] for cur in algo_genes_flatted: gene_2_module[cur] = i all_algo_genes_flatted += algo_genes_flatted if len(algo_genes_flatted) == 0 and i > 0: break if len(algo_genes_flatted) < 4: i += 1 continue gene_list_file_names.append( os.path.join(constants.LIST_DIR, cur_algo + ".txt")) file(gene_list_file_names[-1], 'w+').write("\n".join(algo_genes_flatted)) df_mean = pd.concat((df_mean, df_data[df_data.index.isin( algo_genes_flatted)].mean().to_frame().T)) i += 1 all_genes_file_name = os.path.join(constants.LIST_DIR, "{}_all_genes.txt".format(cur_algo)) file(all_genes_file_name, 'w+').write('\n'.join(all_algo_genes_flatted)) mean_file_name = os.path.join(constants.DATA_DIR, "mean.tsv") df_mean.index = np.arange(df_mean.shape[0]) df_mean.to_csv(mean_file_name, sep="\t", index_label="eid") index_file_name = os.path.join(constants.LIST_DIR, "{}_indices.txt".format(cur_algo)) file(index_file_name, 'w+').write('\n'.join([str(x) for x in df_mean.index.values])) algo_pvals.append( find_clusters_and_survival( tested_gene_list_file_name="{}_indices.txt".format(cur_algo), total_gene_list_file_name="protein_coding.txt", gene_expression_file_name=mean_file_name, phenotype_file_name=phenotype_file_name, survival_file_name=survival_file_name, var_th_index=var_th_index, is_unsupervised=True, start_k=start_k, end_k=end_k, filter_expression=filter_expression, meta_groups=meta_groups, clustering_algorithm=clustering_algorithm, plot=generate_plot)) for cur in range(RAND_TIMES): random_set_file_name = generate_random_set( random_size=len(df_mean.index), meta_gene_set="{}_all_genes.txt".format(cur_algo)) random_pvals.append( find_clusters_and_survival( tested_gene_list_file_name=random_set_file_name, total_gene_list_file_name="dip_bg.txt", gene_expression_file_name=gene_expression_file_name, phenotype_file_name=phenotype_file_name, survival_file_name=survival_file_name, var_th_index=var_th_index, is_unsupervised=True, start_k=start_k, end_k=end_k, filter_expression=filter_expression, meta_groups=meta_groups, clustering_algorithm=clustering_algorithm, plot=generate_plot)) print " algo pvals" print algo_pvals print "# above TH: {}".format( len([x for x in algo_pvals if any(y < 0.001 for y in x)])) print " random pvals" print random_pvals print "# above TH: {}".format( len([x for x in random_pvals if any(y < 0.001 for y in x)])) print "# of modules better over random: {}/{}".format( len([ x for x1, x2 in zip(algo_pvals, random_pvals) if x1[0] < x2[0] ]), len(algo_pvals))