def run_asd_experiment(method, lambda_term, **kwargs):
    print("begin method:", method, "with \\lambda", lambda_term,
          datetime.datetime.now(), "#" * 20)
    method = method.lower()

    all_real = []
    all_pred = []
    all_prob = []
    sparse_features_counts = []
    sparse_groups_counts = []
    sparse_groups_all = []

    num_epochs = kwargs.get("num_epochs", 5)

    # we will use cross entropy
    layers = [3000, 500, 2]
    criterion = torch.nn.CrossEntropyLoss()

    for cv_id, val_indices in enumerate(cross_validation_ids):
        num_val = len(val_indices)
        train_features = features.drop(val_indices).values
        train_labels = df.isASD.drop(val_indices).values.reshape(-1)
        # .iloc replaces the deprecated .ix indexer (removed in modern pandas)
        val_features = features.iloc[val_indices].values.reshape(num_val, -1)
        val_labels = np.array(df.isASD[val_indices]).reshape(-1)

        # Normalize features
        if normalize_features:
            scaler = StandardScaler().fit(train_features)
            train_features = scaler.transform(train_features)
            val_features = scaler.transform(val_features)

        print("CV:", cv_id, "Shape Verification:", str(datetime.datetime.now()))

        if method in ("lasso", "linear regression", "logistic regression", "group lasso"):
            val_rst, test_rst, n_sparse_features, n_sparse_groups, sparse_groups = \
                linear_experiment(train_features, train_labels,
                                  val_features, val_labels,
                                  None, None,  # nothing for testing
                                  feature_groups,
                                  lambda_term=lambda_term, model_to_use=method)

        if method in ("sgin", "sgin_sgd", "nn", "theory"):
            if method == "sgin":
                opt_method = "sbcgd"
                lam = lambda_term
            elif method == "sgin_sgd":
                opt_method = "sgd"
                lam = lambda_term
            elif method == "nn":
                opt_method = "sgd"
                lam = 0  # ignore and override the lambda for the standard NN method
            elif method == "theory":
                opt_method = "theory"
                lam = lambda_term

            val_rst, test_rst, n_sparse_features, n_sparse_groups, sparse_groups = \
                sgin_experiment(
                    train_features, train_labels, val_features, val_labels,
                    None, None,  # no testing set; we use cross validation here
                    feature_groups, cv_id=cv_id, criterion=criterion,
                    optmizer_method=opt_method,  # parameter name as spelled in sgin_experiment
                    lam=lam, layers=layers, num_epochs=num_epochs,
                    train_batch_size=100, verbose=False)

        real, pred, prob = val_rst
        all_real += real
        all_pred += pred
        all_prob += prob
        sparse_features_counts.append(n_sparse_features)
        sparse_groups_counts.append(n_sparse_groups)
        sparse_groups_all.append(sparse_groups)

        classification_metric(all_real, all_pred, all_prob)
        print("Final Sparsity: %d features from %d groups in this cross validation" %
              (n_sparse_features, n_sparse_groups))

    print("#" * 10, "SUMMARY for", method)
    print("avg sparse features: %.2f; avg sparse groups: %.2f" %
          (np.mean(sparse_features_counts), np.mean(sparse_groups_counts)))
    acc, f1, auc, cm, precision, recall, sensitivity, specificity, _ = classification_metric(
        all_real, all_pred, all_prob)

    # Cache result
    of = open("rst/et_asd_classification_rst.tsv", "a")  # with > 9000 features
    rst = [
        method, lambda_term,
        np.mean(sparse_features_counts), np.mean(sparse_groups_counts),
        np.std(sparse_features_counts), np.std(sparse_groups_counts),
        acc, f1, auc, cm, precision, recall, sensitivity, specificity,
        kwargs, sparse_groups_all
    ]
    rst = [str(_) for _ in rst]
    of.write("\t".join(rst).replace("\n", " ") + "\n")
    of.close()
    print("#" * 200)
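
# The method -> (optimizer, lambda) dispatch above reappears verbatim in the
# other run_*_experiment functions below. A minimal sketch of how that shared
# logic could be factored out; the helper name `resolve_optimizer` is
# illustrative, not part of the original pipeline:
def resolve_optimizer(method, lambda_term):
    """Map a method name to (optimizer_method, effective_lambda).

    "nn" forces lambda to 0, so SGIN trained with plain SGD degenerates to a
    standard neural network; every other method keeps the caller's lambda.
    """
    optimizers = {
        "sgin": "sbcgd",      # stochastic blockwise coordinate gradient descent
        "sgin_sgd": "sgd",
        "nn": "sgd",
        "theory": "theory",
    }
    if method not in optimizers:
        raise ValueError("unknown method: %r" % method)
    return optimizers[method], (0 if method == "nn" else lambda_term)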

def run_mnist_experiment(method, lambda_term, **kwargs):
    print("begin method:".upper(), method, "with \\lambda", lambda_term,
          datetime.datetime.now(), "#" * 20)
    method = method.lower()

    num_epochs = kwargs.get("num_epochs", 5)

    if kwargs.get("random_group_order"):
        sgin_model.RANDOM_GROUP_ORDER = True
    # Truthiness check: comparing the value to True would skip real momenta
    # such as 0.9.
    if kwargs.get("momentum_value"):
        sgin_model.MOMENTUM_VALUE = kwargs["momentum_value"]

    if "layers" in kwargs:
        layers = kwargs["layers"]
    else:
        # fail fast: without this, `layers` would be undefined below
        raise ValueError("'layers' must be provided for the MNIST experiment")

    learning_rate = kwargs.get("learning_rate", 0.1)
    batch_size = kwargs.get("batch_size", 100)

    criterion = torch.nn.CrossEntropyLoss()

    if method == "sgin":
        opt_method = "sbcgd"
        lam = lambda_term
    elif method == "sgin_sgd":
        opt_method = "sgd"
        lam = lambda_term
    elif method == "nn":
        # With lambda set to zero, sgin_sgd becomes a standard NN
        opt_method = "sgd"
        lam = 0
    elif method == "theory":
        opt_method = "theory"
        lam = lambda_term

    if method in ["sgin", "sgin_sgd", "nn", "theory"]:
        val_rst, test_rst, n_sparse_features, n_sparse_groups, sparse_groups = sgin_experiment(
            train_features, train_labels, val_features, val_labels,
            test_features, test_labels, group_definition,
            input_dim=len(group_definition), cv_id=-1, criterion=criterion,
            optmizer_method=opt_method, lam=lam, layers=layers,
            num_epochs=num_epochs, train_batch_size=batch_size,
            learning_rate=learning_rate, verbose=False)
        print("Final Sparsity: %d features from %d groups" %
              (n_sparse_features, n_sparse_groups))
        val_real, val_pred, val_prob = val_rst
        test_real, test_pred, test_prob = test_rst
        val_acc = np.sum(np.array(val_real) == np.array(val_pred)) / len(val_real)
        test_acc = np.sum(np.array(test_real) == np.array(test_pred)) / len(test_real)
        sparse_groups_found = sparse_groups  # keep the variable compatible with the linear branch
    elif method in ["lasso", "group lasso"]:
        sparse_groups_found = []
        val_rsts = []
        test_rsts = []
        ns_sparse_features = []
        ns_sparse_groups = []
        val_pred_matrix = []   # becomes a (10, n) matrix, where n = number of validation samples
        test_pred_matrix = []  # becomes a (10, n) matrix, where n = number of testing samples
        for label_name in range(10):
            binary_train_labels = train_labels == label_name
            binary_val_labels = val_labels == label_name
            binary_test_labels = test_labels == label_name
            val_rst, test_rst, n_sparse_features, n_sparse_groups, sparse_groups = linear_experiment(
                train_features, binary_train_labels,
                val_features, binary_val_labels,
                test_features, binary_test_labels,
                group_definition, lambda_term=lambda_term, model_to_use=method)
            # record the result of this one-vs-rest run
            val_rsts.append(val_rst)
            test_rsts.append(test_rst)
            ns_sparse_features.append(n_sparse_features)
            ns_sparse_groups.append(n_sparse_groups)
            sparse_groups_found.append(sparse_groups)
            val_pred_matrix.append(val_rst[2])    # probability
            test_pred_matrix.append(test_rst[2])  # probability
        val_final_pred = np.array(val_pred_matrix).argmax(axis=0)
        test_final_pred = np.array(test_pred_matrix).argmax(axis=0)

        sparse_groups_found = [set(_) for _ in sparse_groups_found]
        # final sparse groups: only groups that every one-vs-rest model zeroed out
        sparse_groups = set.intersection(*sparse_groups_found)

        val_acc = np.sum(np.array(val_labels) == np.array(val_final_pred)) / len(val_labels)
        test_acc = np.sum(np.array(test_labels) == np.array(test_final_pred)) / len(test_labels)

    print("!!! Validation Accuracy !!!!", val_acc, "!" * 20)
    print("!!! Testing Accuracy !!!!", test_acc, "!" * 20)

    num_features_in_sparse_groups = 0
    for g in sparse_groups:
        num_features_in_sparse_groups += np.sum(np.array(group_definition) == g)
    print("Total %d/%d (%.2f%%) features are in the sparse groups" %
          (num_features_in_sparse_groups, len(group_definition),
           num_features_in_sparse_groups / len(group_definition) * 100))
    print("The final sparsified groups are:", sparse_groups)

    # Cache result
    of = open("rst/rst_mnist.tsv", "a")
    # n_features_used_for_final_product makes the visualization easier and
    # keeps the output compatible with the older saved txt file format.
    kwargs["n_features_used_for_final_product"] = \
        train_features.shape[1] - num_features_in_sparse_groups
    rst = [
        method, lambda_term, n_sparse_features, n_sparse_groups,
        num_features_in_sparse_groups, args.representation,
        val_acc, test_acc, sparse_groups_found, str(kwargs), str(sparse_groups)
    ]
    rst = [str(_) for _ in rst]
    of.write("\t".join(rst).replace("\n", " ") + "\n")
    of.close()
    print("#" * 200)
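
# A self-contained toy sketch (not part of the experiment pipeline) of the
# one-vs-rest decoding used in run_mnist_experiment: each binary model
# contributes one row of probabilities, and argmax over axis 0 recovers the
# multi-class prediction. The numbers below are illustrative only.
def _demo_one_vs_rest_decoding():
    probs = np.array([
        [0.1, 0.8],  # P(class 0 | x) for two samples
        [0.7, 0.1],  # P(class 1 | x)
        [0.2, 0.1],  # P(class 2 | x)
    ])
    pred = probs.argmax(axis=0)  # highest-probability class per column
    assert pred.tolist() == [1, 0]
    return pred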

def run_rna_experiment(method, lambda_term, weighted_loss=False, **kwargs):
    print("begin method:".upper(), method, "with \\lambda", lambda_term,
          datetime.datetime.now(), "#" * 20)
    method = method.lower()

    num_epochs = kwargs.get("num_epochs", 5)
    layers = kwargs.get("layers", [30, 20, 10, 2])

    if weighted_loss:
        # Weight each class by the prevalence of the other class, so the
        # rarer class contributes more to the loss.
        pos_ratio = np.sum(train_labels_big == 1) / train_labels_big.shape[0]
        weights_sample = torch.Tensor([pos_ratio, 1 - pos_ratio])
        if sgin_model.USE_CUDA:
            weights_sample = weights_sample.cuda()
        criterion = torch.nn.CrossEntropyLoss(weight=weights_sample)
        train_features_ = train_features_big
        train_labels_ = train_labels_big
    else:
        criterion = torch.nn.CrossEntropyLoss()
        train_features_ = train_features
        train_labels_ = train_labels

    if method in ("lasso", "group lasso"):
        if method == "group lasso":
            print("Please use the GLLR model in the R code with the grplasso "
                  "package for the RNA splicing experiment")
        val_rst, test_rst, n_sparse_features, n_sparse_groups, sparse_groups = \
            linear_experiment(train_features, train_labels,
                              val_features, val_labels,
                              test_features, test_labels,
                              feature_groups,
                              lambda_term=lambda_term, model_to_use=method)

    if method in ("sgin", "sgin_sgd", "nn", "theory"):
        if method == "sgin":
            opt_method = "sbcgd"
            lam = lambda_term
        elif method == "sgin_sgd":
            opt_method = "sgd"
            lam = lambda_term
        elif method == "nn":
            opt_method = "sgd"
            lam = 0
        elif method == "theory":
            opt_method = "theory"
            lam = lambda_term

        val_rst, test_rst, n_sparse_features, n_sparse_groups, sparse_groups = \
            sgin_experiment(train_features_, train_labels_,
                            val_features, val_labels,
                            test_features, test_labels,
                            feature_groups, input_dim=None, cv_id=-1,
                            criterion=criterion, optmizer_method=opt_method,
                            lam=lam, layers=layers, num_epochs=num_epochs,
                            train_batch_size=100, verbose=False)
        print("Final Sparsity: %d features from %d groups" %
              (n_sparse_features, n_sparse_groups))

    val_rst = classification_metric(val_rst[0], val_rst[1], val_rst[2])
    test_rst = classification_metric(test_rst[0], test_rst[1], test_rst[2])

    # Cache result
    of = open("rst/rst_rna_exp.txt", "a")
    rst = [
        method, lambda_term, n_sparse_features, n_sparse_groups, 0, 0,
        *val_rst, *test_rst, str(kwargs), sparse_groups
    ]
    rst = [str(_) for _ in rst]
    of.write("\t".join(rst).replace("\n", " ") + "\n")
    of.close()
    print("#" * 200)
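
# A minimal sketch of the class weighting used in run_rna_experiment: each
# class is weighted by the prevalence of the *other* class, so the rarer
# class contributes more to the loss. The helper below is illustrative and
# not part of the original pipeline.
def _demo_weighted_criterion(labels):
    """Given 1-D numpy labels in {0, 1}, build a counterbalanced CrossEntropyLoss.

    With 10% positives the weights become [0.1, 0.9]: mistakes on the rare
    positive class cost 9x more than mistakes on the common negative class.
    """
    pos_ratio = np.sum(labels == 1) / labels.shape[0]
    weights = torch.Tensor([pos_ratio, 1 - pos_ratio])
    return torch.nn.CrossEntropyLoss(weight=weights)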

def run_linear_mnist_experiment(method, lambda_term, **kwargs):
    print("begin method:".upper(), method, "with \\lambda", lambda_term,
          datetime.datetime.now(), "#" * 20)
    method = method.lower()

    sparse_groups_found = []
    val_rsts = []
    test_rsts = []
    ns_sparse_features = []
    ns_sparse_groups = []
    val_pred_matrix = []   # becomes a (10, n) matrix, where n = number of validation samples
    test_pred_matrix = []  # becomes a (10, n) matrix, where n = number of testing samples
    for label_name in range(10):
        binary_train_labels = train_labels == label_name
        binary_val_labels = val_labels == label_name
        binary_test_labels = test_labels == label_name
        val_rst, test_rst, n_sparse_features, n_sparse_groups, sparse_groups = linear_experiment(
            train_features, binary_train_labels,
            val_features, binary_val_labels,
            test_features, binary_test_labels,
            group_definition, lambda_term=lambda_term, model_to_use=method)
        # record the result of this one-vs-rest run
        val_rsts.append(val_rst)
        test_rsts.append(test_rst)
        ns_sparse_features.append(n_sparse_features)
        ns_sparse_groups.append(n_sparse_groups)
        sparse_groups_found.append(sparse_groups)
        val_pred_matrix.append(val_rst[2])    # probability
        test_pred_matrix.append(test_rst[2])  # probability

    val_final_pred = np.array(val_pred_matrix).argmax(axis=0)
    test_final_pred = np.array(test_pred_matrix).argmax(axis=0)

    sparse_groups_found = [set(_) for _ in sparse_groups_found]
    # only groups zeroed out by every one-vs-rest model count as sparse overall
    final_sparse_group = set.intersection(*sparse_groups_found)

    val_acc = np.sum(np.array(val_labels) == np.array(val_final_pred)) / len(val_labels)
    test_acc = np.sum(np.array(test_labels) == np.array(test_final_pred)) / len(test_labels)
    print("!!! Validation Accuracy !!!!", val_acc, "!" * 20)
    print("!!! Testing Accuracy !!!!", test_acc, "!" * 20)
    print("Final Sparsity: %d groups" % len(final_sparse_group), final_sparse_group)

    # Cache result
    of = open("rst/linear_mnist_exp_rst.tsv", "a")
    rst = [
        method, lambda_term,
        np.mean(ns_sparse_features), np.mean(ns_sparse_groups), 0, 0,
        val_acc, test_acc, sparse_groups_found, str(kwargs), str(final_sparse_group)
    ]
    rst = [str(_) for _ in rst]
    of.write("\t".join(rst).replace("\n", " ") + "\n")
    of.close()
    print("#" * 200)
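
# Hypothetical usage sketch: sweep the regularization strength for both
# linear baselines. The lambda grid below is illustrative, not the one used
# in the paper.
#
#     for lam in [1e-4, 1e-3, 1e-2, 1e-1]:
#         run_linear_mnist_experiment("lasso", lam)
#         run_linear_mnist_experiment("group lasso", lam)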

def run_ados_experiment(method, lambda_term, permutation_test=False, **kwargs):
    print("begin method:", method, "with \\lambda", lambda_term,
          datetime.datetime.now(), "#" * 20)
    method = method.lower()

    all_real = []
    all_pred = []
    all_prob = []
    sparse_features_counts = []
    sparse_groups_counts = []
    sparse_groups_all = []

    num_epochs = kwargs.get("num_epochs", 5)
    train_batch_size = kwargs.get("train_batch_size", 1)
    lr = float(kwargs.get("lr", 0.01))
    print("Init Learning Rate:", lr)
    layers = kwargs.get("layers", [3000, 'R', 500, 'R', 1, 'S'])
    print("Layers:", layers, "Train_batch_size:", train_batch_size)

    criterion = torch.nn.MSELoss()

    for cv_id, val_indices in enumerate(cross_validation_ids):
        num_val = len(val_indices)
        train_features = features.drop(val_indices).values
        train_labels = df[label_col_name].drop(val_indices).values.reshape(-1)
        val_features = features.iloc[val_indices].values.reshape(num_val, -1)
        val_labels = np.array(df.loc[list(val_indices), label_col_name]).reshape(-1)

        # Normalize features
        if normalize_features:
            scaler = StandardScaler().fit(train_features)
            train_features = scaler.transform(train_features)
            val_features = scaler.transform(val_features)

        print("CV:", cv_id, "Shape Verification:", str(datetime.datetime.now()))

        if method in ("lasso", "linear regression", "logistic regression", "group lasso"):
            val_rst, test_rst, n_sparse_features, n_sparse_groups, sparse_groups = \
                linear_experiment(train_features, train_labels,
                                  val_features, val_labels,
                                  None, None,  # nothing for testing
                                  feature_groups,
                                  lambda_term=lambda_term, model_to_use=method)

        if method in ("sgin", "sgin_sgd", "nn", "theory"):
            if method == "sgin":
                opt_method = "sbcgd"
                lam = lambda_term
            elif method == "sgin_sgd":
                opt_method = "sgd"
                lam = lambda_term
            elif method == "nn":
                # With lambda set to zero, sgin_sgd becomes a standard NN
                opt_method = "sgd"
                lam = 0
            elif method == "theory":
                opt_method = "theory"
                lam = lambda_term

            val_rst, test_rst, n_sparse_features, n_sparse_groups, sparse_groups = \
                sgin_experiment(
                    train_features, train_labels, val_features, val_labels,
                    None, None,  # no testing set
                    feature_groups, cv_id=cv_id, criterion=criterion,
                    optmizer_method=opt_method, lam=lam, layers=layers,
                    num_epochs=num_epochs, train_batch_size=train_batch_size,
                    verbose=False, learning_rate=lr)

        real, pred, prob = val_rst
        all_real += real
        all_pred += pred
        all_prob += prob
        sparse_features_counts.append(n_sparse_features)
        sparse_groups_counts.append(n_sparse_groups)
        sparse_groups_all.append(sparse_groups)

        try:
            print("Curr Results:", scipy.stats.linregress(all_prob, all_real))
        except Exception:  # narrowed from a bare except
            pass  # not enough data yet to evaluate the correlation
        print("Final Sparsity: %d features from %d groups in this cross validation" %
              (n_sparse_features, n_sparse_groups))

    print("#" * 10, "SUMMARY for", method)
    print("avg sparse features: %.2f; avg sparse groups: %.2f" %
          (np.mean(sparse_features_counts), np.mean(sparse_groups_counts)))

    all_prob = np.array(all_prob)
    all_prob[np.isnan(all_prob)] = np.mean(df[label_col_name])
    mse = ((np.array(all_real) - np.array(all_prob)) ** 2).mean()
    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(all_prob, all_real)

    # Cache result
    of = open("rst/et_%s_rst.txt" % label_col_name, "a")  # with > 9000 features
    rst = [
        method, lambda_term,
        np.mean(sparse_features_counts),
        np.mean(sparse_groups_counts),
        np.std(sparse_features_counts), np.std(sparse_groups_counts),
        mse, slope, intercept, r_value, p_value, std_err,
        kwargs, sparse_groups_all
    ]
    rst = [str(_) for _ in rst]
    of.write("\t".join(rst).replace("\n", " ") + "\n")
    of.close()

    print(label_col_name, "Final Result:", "slope:", slope, "intercept:", intercept,
          "r_value:", r_value, "p_value:", p_value, "std_err:", std_err, "mse:", mse)
    print("#" * 200)
    return r_value
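
# A self-contained sketch of the regression-metric step at the end of
# run_ados_experiment: impute NaN predictions with a fallback value, then
# report the MSE and the linear fit of predictions against ground truth.
# The helper name `_score_regression` is illustrative, not part of the
# original pipeline.
def _score_regression(real, prob, fallback):
    prob = np.array(prob, dtype=float)       # copy so the caller's data is untouched
    prob[np.isnan(prob)] = fallback          # e.g. the mean of the training labels
    real = np.array(real, dtype=float)
    mse = ((real - prob) ** 2).mean()
    fit = scipy.stats.linregress(prob, real)
    return mse, fit.rvalue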