"seed": seed, "train_size": train_size, } if args.percentile != 50: gene_percentile = np.percentile(dataset.df[gene].values, args.percentile) dataset.labels = dataset.df[gene].where(dataset.df[gene] > gene_percentile).notnull().astype("int") else: dataset.labels = dataset.df[gene].where(dataset.df[gene] > 0).notnull().astype("int") dataset.labels = dataset.labels.values if type(dataset.labels) == pd.Series else dataset.labels # if labels are chosen such that only one class exists, skip the gene # otherwise this throws a cuda device-side assert which breaks all # subsequent experiments if np.unique(dataset.labels).shape[0] == 1: experiment['error'] = 'Expression distribution too narrow, skipping' results = record_result(results, experiment, filename) continue try: X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(dataset.df, dataset.labels, stratify=dataset.labels, train_size=train_size, test_size=test_size, random_state=seed) except ValueError: results = record_result(results, experiment, filename) continue if is_first_degree: if is_landmark: neighbors = landmark_genes
def doMLP():
    """Bayesian hyperparameter search (skopt GP) for an MLP baseline.

    Searches learning rate, channel width and depth for 10 trials, scoring
    each configuration by validation accuracy and recording every trial via
    record_result. Relies on enclosing-scope data splits (X_train/y_train,
    X_valid/y_valid, X_test/y_test), the CLI `args`, and the module-level
    `results` / `filename`.

    Returns:
        tuple: (test accuracy achieved by the best-validation config,
                best config dict) -- (0, None) if no trial ever improved.
    """
    skopt_args = collections.OrderedDict()
    skopt_args["lr"] = Integer(2, 5)         # searched as 10**(-lr)
    skopt_args["channels"] = Integer(4, 12)  # searched as 2**channels
    skopt_args["layers"] = Integer(1, 4)
    optimizer = skopt.Optimizer(dimensions=skopt_args.values(),
                                base_estimator="GP",
                                n_initial_points=3,
                                random_state=args.seed)
    print(skopt_args)

    best_valid_metric = 0
    test_for_best_valid_metric = 0
    best_config = None
    already_done = set()

    for i in range(10):
        suggestion = optimizer.ask()
        # Don't waste budget re-fitting a configuration already evaluated.
        if str(suggestion) in already_done:
            continue
        already_done.add(str(suggestion))

        sdict = dict(zip(skopt_args.keys(), suggestion))
        # Decode the integer search coordinates into real hyperparameters.
        sdict["lr"] = 10 ** float(-sdict["lr"])
        sdict["channels"] = 2 ** sdict["channels"]

        model = models.mlp.MLP(name="MLP",
                               num_layer=sdict["layers"],
                               channels=sdict["channels"],
                               lr=sdict["lr"],
                               num_epochs=100,
                               patience=50,
                               cuda=torch.cuda.is_available(),
                               metric=sklearn.metrics.accuracy_score,
                               verbose=False,
                               seed=args.seed)
        model.fit(X_train, y_train)

        y_valid_pred = model.predict(X_valid)
        valid_metric = sklearn.metrics.accuracy_score(
            y_valid, np.argmax(y_valid_pred, axis=1))
        # skopt minimizes, so feed back the negated accuracy.
        opt_results = optimizer.tell(suggestion, -valid_metric)
        print(opt_results)

        # Record metrics to write and plot; only evaluate on the test split
        # when the validation score improves.
        if best_valid_metric < valid_metric:
            best_valid_metric = valid_metric
            best_config = sdict
            y_test_pred = model.predict(X_test)
            test_metric = sklearn.metrics.accuracy_score(
                y_test, np.argmax(y_test_pred, axis=1))
            test_for_best_valid_metric = test_metric

        print(i, "This result:", valid_metric, sdict)
        experiment = {
            "model": model.name,
            "graph": "",
            "num_genes": len(list(X_train.columns)),
            "train_size": args.ntrain,
            "seed": args.seed,
            "acc": valid_metric,
            'lr': sdict["lr"],
            'channels': sdict["channels"],
            'embedding': 0,
            'num_layer': sdict["layers"],
            'prepool_extralayers': 0
        }
        global results
        results = record_result(results, experiment, filename)

    print("#Final Results", test_for_best_valid_metric, best_config)
    # BUGFIX: return test_for_best_valid_metric instead of test_metric --
    # test_metric is bound only inside the improvement branch and raised
    # NameError when no trial ever improved; the two are equal otherwise,
    # and this matches what the sibling doGGC functions return.
    return test_for_best_valid_metric, best_config
def doGGC():
    """Bayesian hyperparameter search (skopt GP) for a GAT gene-graph model.

    Runs up to 100 trials over lr / channels / depth / attention heads /
    pre-pool layers on an ontology gene graph, scoring each configuration
    by validation accuracy and recording every successful trial via
    record_result. Relies on enclosing-scope names: features, X_train,
    X_valid, y_train, y_valid, args, results, filename.

    Returns:
        tuple: (test_for_best_valid_metric, best_config) -- currently
        always (0, None) because test-split evaluation is disabled.
    """
    gene_graphs = [
        data.gene_graphs.OntologyGraph(neighbors=30,
                                       embeddings_name='dl2vec',
                                       randomize=False,
                                       gene_names=list(features.columns),
                                       relabel_genes=False),
    ]
    for graph in gene_graphs:
        adj = graph.adj()
        for dropout in [False]:
            import gc
            gc.collect()

            skopt_args = collections.OrderedDict()
            skopt_args["lr"] = Integer(3, 5)        # searched as 10**(-lr)
            skopt_args["channels"] = Integer(3, 6)  # searched as 2**channels
            skopt_args["num_layer"] = Integer(1, 3)
            skopt_args["gat_heads"] = Integer(1, 3)  # searched as 2**heads
            skopt_args["prepool_extralayers"] = Integer(0, 1)
            optimizer = skopt.Optimizer(dimensions=skopt_args.values(),
                                        base_estimator="GP",
                                        n_initial_points=4,
                                        random_state=args.seed)
            print(skopt_args)

            best_valid_metric = 0
            test_for_best_valid_metric = 0
            best_config = None
            already_done = set()

            for i in range(100):
                import gc
                gc.collect()
                suggestion = optimizer.ask()
                # Skip configurations the optimizer already proposed.
                if str(suggestion) in already_done:
                    continue
                already_done.add(str(suggestion))

                sdict = dict(zip(skopt_args.keys(), suggestion))
                sdict["lr"] = 10 ** float(-sdict["lr"])
                sdict["channels"] = 2 ** sdict["channels"]
                sdict["gat_heads"] = 2 ** sdict["gat_heads"]
                sdict["embedding"] = 2  # 2**sdict["embedding"]
                print(sdict)

                # BUGFIX: work on a copy -- the original aliased
                # graph.nx_graph and remove_nodes_from() permanently
                # mutated the shared graph object across trials.
                neighbors = graph.nx_graph.copy()
                # Restrict both the graph and the feature matrix to the
                # genes they have in common.
                intersection_nodes = np.intersect1d(X_train.columns,
                                                    neighbors.nodes)
                x_train = X_train[list(intersection_nodes)].copy()
                x_valid = X_valid[list(intersection_nodes)].copy()
                toremove = set(neighbors.nodes)
                toremove = toremove.difference(intersection_nodes)
                neighbors.remove_nodes_from(toremove)
                adj = sparse.csr_matrix(nx.to_numpy_matrix(neighbors))

                model = models.gnn.GCN(
                    name="GAT",
                    dropout=dropout,
                    gnn="GAT",
                    gat_heads=sdict["gat_heads"],
                    cuda=torch.cuda.is_available(),
                    num_layer=sdict["num_layer"],
                    prepool_extralayers=sdict["prepool_extralayers"],
                    channels=sdict["channels"],
                    # NOTE(review): embedding is deliberately fed channels,
                    # not sdict["embedding"], yet the experiment record below
                    # logs sdict["embedding"] (=2) -- confirm which is meant.
                    embedding=sdict["channels"],  # sdict["embedding"]
                    aggregation=None,
                    lr=sdict["lr"],
                    num_epochs=100,
                    patience=40,
                    verbose=True,
                    seed=args.seed)
                try:
                    model.fit(x_train, y_train, adj)
                    with torch.no_grad():
                        model.eval()
                        y_valid_pred = model.predict(x_valid)
                    valid_metric = sklearn.metrics.accuracy_score(
                        y_valid, np.argmax(y_valid_pred, axis=1))
                    # skopt minimizes, so feed back the negated accuracy.
                    opt_results = optimizer.tell(suggestion, -valid_metric)
                    experiment = {
                        "model": model.name,
                        "graph": graph.graph_name,
                        "num_genes": len(x_train.columns),
                        "train_size": args.ntrain,
                        "seed": args.seed,
                        "acc": valid_metric,
                        'lr': sdict["lr"],
                        'channels': sdict["channels"],
                        'embedding': sdict["embedding"],
                        'num_layer': sdict["num_layer"],
                        'prepool_extralayers': sdict["prepool_extralayers"]
                    }
                    print(i, "This result:", valid_metric, experiment)
                    global results
                    results = record_result(results, experiment, filename)
                except Exception as e:
                    # Best-effort sweep: log and continue so one bad config
                    # (e.g. CUDA OOM) doesn't kill the whole search.
                    # NOTE(review): on failure the suggestion is never
                    # tell()-ed back, so the optimizer may re-propose it.
                    print(e)
                    logging.error(logging.traceback.format_exc())
                # cleanup: release GPU memory before the next trial
                model.best_model = None
                del model
                torch.cuda.empty_cache()

    print("#Final Results", test_for_best_valid_metric, best_config)
    return test_for_best_valid_metric, best_config
def doGGC():
    """Bayesian hyperparameter search (skopt GP) for a hierarchical GCN.

    Sweeps two ontology graphs x gene-subset sizes x dropout settings,
    running 15 GP trials each over channels / depth / pre-pool layers with
    a fixed lr of 0.001. Each trial trains on a BFS-sampled neighborhood of
    'ESR1' restricted to the genes present in the feature matrix, and every
    successful trial is recorded via record_result. Relies on enclosing-scope
    names: features, X_train, X_valid, X_test, y_train, y_valid, args,
    results, filename.

    Returns:
        tuple: (test_for_best_valid_metric, best_config) -- currently
        always (0, None) because test-split evaluation is disabled.
    """
    gene_graphs = [
        OntologyGraph(neighbors=30, embeddings_name='dl2vec',
                      randomize=False, gene_names=list(features.columns),
                      relabel_genes=False),
        OntologyGraph(neighbors=30, embeddings_name='el',
                      randomize=False, gene_names=list(features.columns),
                      relabel_genes=False),
    ]
    for graph in gene_graphs:
        adj = graph.adj()
        for num_genes in [1000, 16000]:
            for dropout in [True, False]:
                import gc
                gc.collect()

                skopt_args = collections.OrderedDict()
                skopt_args["channels"] = Integer(4, 9)  # searched as 2**channels
                skopt_args["num_layer"] = Integer(0, 4)
                skopt_args["prepool_extralayers"] = Integer(0, 3)
                optimizer = skopt.Optimizer(dimensions=skopt_args.values(),
                                            base_estimator="GP",
                                            n_initial_points=4,
                                            random_state=args.seed)
                print(skopt_args)

                best_valid_metric = 0
                test_for_best_valid_metric = 0
                best_config = None
                already_done = set()

                for i in range(15):
                    import gc
                    gc.collect()
                    suggestion = optimizer.ask()
                    # Skip configurations the optimizer already proposed.
                    if str(suggestion) in already_done:
                        continue
                    already_done.add(str(suggestion))

                    sdict = dict(zip(skopt_args.keys(), suggestion))
                    sdict["lr"] = 0.001  # fixed; lr is not in the search space
                    sdict["channels"] = 2 ** sdict["channels"]
                    sdict["embedding"] = 2  # 2**sdict["embedding"]
                    print(sdict)

                    gene = 'ESR1'
                    # Oversample by 1.5x so enough nodes survive the
                    # intersection with the feature columns below.
                    # BUGFIX: pass an integer node budget; the original
                    # passed the float num_genes*1.5.
                    neighbors = graph.bfs_sample_neighbors(
                        gene, int(num_genes * 1.5))
                    intersection_nodes = np.intersect1d(features.columns,
                                                        neighbors.nodes)
                    x_train = X_train[list(intersection_nodes)[:num_genes]].copy()
                    x_valid = X_valid[list(intersection_nodes)[:num_genes]].copy()
                    # Held-out split; only consumed if test evaluation is
                    # re-enabled.
                    x_test = X_test[list(intersection_nodes)[:num_genes]].copy()
                    toremove = set(neighbors.nodes)
                    toremove = toremove.difference(intersection_nodes)
                    neighbors.remove_nodes_from(toremove)
                    # NOTE(review): nx.to_numpy_matrix is deprecated and
                    # removed in networkx 3.0; to_numpy_array is the modern
                    # equivalent -- confirm the pinned networkx version.
                    adj = sparse.csr_matrix(nx.to_numpy_matrix(neighbors))

                    model = models.gcn.GCN(
                        name="GCN_noemb" + ("_dropout" if dropout else ""),
                        dropout=dropout,
                        cuda=torch.cuda.is_available(),
                        num_layer=sdict["num_layer"],
                        prepool_extralayers=sdict["prepool_extralayers"],
                        channels=sdict["channels"],
                        embedding=sdict["embedding"],
                        aggregation="hierarchy",
                        lr=sdict["lr"],
                        num_epochs=100,
                        patience=30,
                        verbose=False,
                        seed=args.seed)
                    try:
                        model.fit(x_train, y_train, adj)
                        with torch.no_grad():
                            model.eval()
                            y_valid_pred = model.predict(x_valid)
                        valid_metric = sklearn.metrics.accuracy_score(
                            y_valid, np.argmax(y_valid_pred, axis=1))
                        # skopt minimizes; feed back the negated accuracy.
                        opt_results = optimizer.tell(suggestion, -valid_metric)
                        experiment = {
                            "model": model.name,
                            "graph": graph.graph_name,
                            "num_genes": num_genes,
                            "train_size": args.ntrain,
                            "seed": args.seed,
                            "acc": valid_metric,
                            'lr': sdict["lr"],
                            'channels': sdict["channels"],
                            'embedding': sdict["embedding"],
                            'num_layer': sdict["num_layer"],
                            'prepool_extralayers': sdict["prepool_extralayers"]
                        }
                        print(i, num_genes, "This result:", valid_metric,
                              experiment)
                        global results
                        results = record_result(results, experiment, filename)
                    except Exception as e:
                        # Best-effort sweep: log and continue so one bad
                        # config (e.g. CUDA OOM) doesn't kill the search.
                        print(e)
                        logging.error(logging.traceback.format_exc())
                    # cleanup: release GPU memory before the next trial
                    model.best_model = None
                    del model
                    torch.cuda.empty_cache()

    print("#Final Results", test_for_best_valid_metric, best_config)
    return test_for_best_valid_metric, best_config