예제 #1
0
def run_asd_experiment(method, lambda_term, **kwargs):
    print("begin method:", method, "with \\lambda", lambda_term,
          datetime.datetime.now(), "#" * 20)
    method = method.lower()

    all_real = []
    all_pred = []
    all_prob = []

    sparse_features_counts = []
    sparse_groups_counts = []
    sparse_groups_all = []

    if "num_epochs" in kwargs:
        num_epochs = kwargs["num_epochs"]
    else:
        num_epochs = 5

    # we will use cross entropy
    layers = [3000, 500, 2]
    criterion = torch.nn.CrossEntropyLoss()

    for cv_id, val_indices in enumerate(cross_validation_ids):

        num_val = len(val_indices)
        train_features = features.drop(val_indices).values
        train_labels = df.isASD.drop(val_indices).values.reshape(-1)

        val_features = features.ix[val_indices].values.reshape(num_val, -1)
        val_labels = np.array(df.isASD[val_indices]).reshape(-1)

        # Normalize Features
        if normalize_features:
            scaler = StandardScaler().fit(train_features)
            train_features = scaler.transform(train_features)
            val_features = scaler.transform(val_features)

        print("CV:", cv_id, "Shape Verification:",
              str(datetime.datetime.now()))


        if method == "lasso" or method == "linear regression" or  \
                method == "logistic regression" or method == "group lasso":
            val_rst, test_rst, n_sparse_features, n_sparse_groups, sparse_groups =\
            linear_experiment(train_features, train_labels,
                              val_features, val_labels,
                              None, None, # nothing for testing
                              feature_groups,
                              lambda_term=lambda_term, model_to_use=method)

        if method == "sgin" or method == "sgin_sgd" or method == "nn" or method == "theory":
            if method == "sgin":
                opt_method = "sbcgd"
                lam = lambda_term
            elif method == "sgin_sgd":
                opt_method = "sgd"
                lam = lambda_term
            elif method == "nn":
                opt_method = "sgd"
                lam = 0  # ignore and override the lambda for standard NN method
            elif method == "theory":
                opt_method = "theory"
                lam = lambda_term

            val_rst, test_rst, n_sparse_features, n_sparse_groups, sparse_groups =\
                    sgin_experiment(
                            train_features, train_labels,
                            val_features, val_labels,
                            None, None, # no testing set. We use cross validation here
                            feature_groups, cv_id=cv_id, criterion=criterion,
                            optmizer_method=opt_method, lam=lam, layers=layers,
                            num_epochs=num_epochs, train_batch_size=100,
                            verbose=False)

        real, pred, prob = val_rst
        all_real += real
        all_pred += pred
        all_prob += prob
        sparse_features_counts.append(n_sparse_features)
        sparse_groups_counts.append(n_sparse_groups)
        sparse_groups_all.append(sparse_groups)

        classification_metric(all_real, all_pred, all_prob)

        print(
            "Final Sparsity %d features from %d groups in this Cross Validation:"
            % (n_sparse_features, n_sparse_groups))

    print("#" * 10, "SUMMARY for", method)
    print("avg sparse features: %.2f; avg sparse groups: %.2f" %
          (np.mean(sparse_features_counts), np.mean(sparse_groups_counts)))

    acc, f1, auc, cm, precision, recall, sensitivity, specificity, _ = classification_metric(
        all_real, all_pred, all_prob)

    # Cache Result
    of = open("rst/et_asd_classification_rst.tsv", "a")  # with > 9000 features
    rst = [
        method, lambda_term,
        np.mean(sparse_features_counts),
        np.mean(sparse_groups_counts),
        np.std(sparse_features_counts),
        np.std(sparse_groups_counts), acc, f1, auc, cm, precision, recall,
        sensitivity, specificity, kwargs, sparse_groups_all
    ]
    rst = [str(_) for _ in rst]
    of.write("\t".join(rst).replace("\n", " ") + "\n")
    of.close()
    print("#" * 200)
예제 #2
0
def run_mnist_experiment(method, lambda_term, **kwargs):
    print("begin method:".upper(), method, "with \\lambda", lambda_term,
          datetime.datetime.now(), "#" * 20)
    method = method.lower()

    if "num_epochs" in kwargs:
        num_epochs = kwargs["num_epochs"]
    else:
        num_epochs = 5

    if "random_group_order" in kwargs and kwargs["random_group_order"] == True:
        sgin_model.RANDOM_GROUP_ORDER = True

    if "momentum_value" in kwargs and kwargs["momentum_value"] == True:
        sgin_model.MOMENTUM_VALUE = kwargs["momentum_value"]

    if "layers" in kwargs:
        layers = kwargs["layers"]

    if "learning_rate" in kwargs:
        learning_rate = kwargs["learning_rate"]
    else:
        learning_rate = 0.1

    if "batch_size" in kwargs:
        batch_size = kwargs["batch_size"]
    else:
        batch_size = 100

    criterion = torch.nn.CrossEntropyLoss()

    if method == "sgin":
        opt_method = "sbcgd"
        lam = lambda_term
    elif method == "sgin_sgd":
        opt_method = "sgd"
        lam = lambda_term
    elif method == "nn":
        # We can set lambda to zero, so sgin_sgd becomes normal NN
        opt_method = "sgd"
        lam = 0
    elif method == "theory":
        opt_method = "theory"
        lam = lambda_term

    if method in ["sgin", "sgin_sgd", "nn", "theory"]:
        val_rst, test_rst, n_sparse_features, n_sparse_groups, sparse_groups = sgin_experiment(
            train_features,
            train_labels,
            val_features,
            val_labels,
            test_features,
            test_labels,
            group_definition,
            input_dim=len(group_definition),
            cv_id=-1,
            criterion=criterion,
            optmizer_method=opt_method,
            lam=lam,
            layers=layers,
            num_epochs=num_epochs,
            train_batch_size=batch_size,
            learning_rate=learning_rate,
            verbose=False)

        print(
            "Final Sparsity %d features from %d groups in this Cross Validation:"
            % (n_sparse_features, n_sparse_groups))

        val_real, val_pred, val_prob = val_rst
        test_real, test_pred, test_prob = test_rst

        val_acc = np.sum(
            np.array(val_real) == np.array(val_pred)) / len(val_real)
        test_acc = np.sum(
            np.array(test_real) == np.array(test_pred)) / len(test_real)
        sparse_groups_found = sparse_groups  # set this variable to be compatible with linear method

    elif method in ["lasso", "group lasso"]:
        sparse_groups_found = []
        val_rsts = []
        test_rsts = []
        ns_sparse_features = []
        ns_sparse_groups = []

        val_pred_matrix = [
        ]  # it will become (n, 10), matrix where n = number of testing data
        test_pred_matrix = []
        for label_name in range(10):
            binary_train_labels = train_labels == label_name
            binary_val_labels = val_labels == label_name
            binary_test_labels = test_labels == label_name

            val_rst, test_rst, n_sparse_features, n_sparse_groups, sparse_groups = linear_experiment(
                train_features,
                binary_train_labels,
                val_features,
                binary_val_labels,
                test_features,
                binary_test_labels,
                group_definition,
                lambda_term=lambda_term,
                model_to_use=method)

            # record result for this 1-vs-rest run
            val_rsts.append(val_rst)
            test_rsts.append(test_rst)
            ns_sparse_features.append(n_sparse_features)
            ns_sparse_groups.append(n_sparse_groups)
            sparse_groups_found.append(sparse_groups)

            val_pred_matrix.append(val_rst[2])  # probability
            test_pred_matrix.append(test_rst[2])  # probability

        val_final_pred = np.array(val_pred_matrix).argmax(axis=0)
        test_final_pred = np.array(test_pred_matrix).argmax(axis=0)

        sparse_groups_found = [set(_) for _ in sparse_groups_found]

        # final sparse groups
        sparse_groups = set.intersection(*sparse_groups_found)

        val_acc = np.sum(
            np.array(val_labels) == np.array(val_final_pred)) / len(val_labels)
        test_acc = np.sum(np.array(test_labels) == np.array(
            test_final_pred)) / len(test_labels)

    print("!!! Validation Accuracy !!!!", val_acc, "!" * 20)
    print("!!! Testing Accuracy !!!!", test_acc, "!" * 20)

    num_features_in_sparse_groups = 0
    for g in sparse_groups:
        num_features_in_sparse_groups += np.sum(
            np.array(group_definition) == g)

    print("Total %d/%d (%.2f%%) features are in the sparse group" %
          (num_features_in_sparse_groups, len(group_definition),
           num_features_in_sparse_groups / len(group_definition) * 100))

    print("The final sparsified groups are:", sparse_groups)

    # Cache Result
    of = open("rst/rst_mnist.tsv", "a")
    # this n_features_used_for_final_product will make the visualization eaiser
    # and it is compatible to the older version of the saved txt file format.
    kwargs["n_features_used_for_final_product"] = train_features.shape[
        1] - num_features_in_sparse_groups
    rst = [
        method, lambda_term, n_sparse_features, n_sparse_groups,
        num_features_in_sparse_groups, args.representation, val_acc, test_acc,
        sparse_groups_found,
        str(kwargs),
        str(sparse_groups)
    ]

    rst = [str(_) for _ in rst]
    of.write("\t".join(rst).replace("\n", " ") + "\n")
    of.close()
    print("#" * 200)
예제 #3
0
def run_rna_experiment(method, lambda_term, weighted_loss=False, **kwargs):
    print("begin method:".upper(), method, "with \\lambda", lambda_term,
          datetime.datetime.now(), "#" * 20)

    method = method.lower()

    if "num_epochs" in kwargs:
        num_epochs = kwargs["num_epochs"]
    else:
        num_epochs = 5

    if "layers" in kwargs:
        layers = kwargs["layers"]
    else:
        layers = [30, 20, 10, 2]

    if weighted_loss:
        pos_ratio = np.sum(train_labels_big == 1) / train_labels_big.shape[0]
        weights_sample = torch.Tensor([pos_ratio, 1 - pos_ratio])
        if sgin_model.USE_CUDA:
            weights_sample = weights_sample.cuda()
        criterion = torch.nn.CrossEntropyLoss(weight=weights_sample)

        train_features_ = train_features_big
        train_labels_ = train_labels_big
    else:
        criterion = torch.nn.CrossEntropyLoss()
        train_features_ = train_features
        train_labels_ = train_labels

    if method == "lasso" or method == "group lasso":
        if method == "group lasso":
            print(
                "Please use GLLR model in R code with grplasso package for the RNA Splicing Experiment"
            )

        val_rst, test_rst, n_sparse_features, n_sparse_groups, sparse_groups =\
        linear_experiment(train_features, train_labels, val_features, val_labels,\
                          test_features, test_labels, \
                          feature_groups, lambda_term=lambda_term, model_to_use=method)

    if method == "sgin" or method == "sgin_sgd" or method == "nn" or method == "theory":
        if method == "sgin":
            opt_method = "sbcgd"
            lam = lambda_term
        elif method == "sgin_sgd":
            opt_method = "sgd"
            lam = lambda_term
        elif method == "nn":
            opt_method = "sgd"
            lam = 0
        elif method == "theory":
            opt_method = method
            lam = lambda_term


        val_rst, test_rst, n_sparse_features, n_sparse_groups, sparse_groups =\
            sgin_experiment(train_features_, train_labels_,
                            val_features, val_labels,
                            test_features, test_labels,
                            feature_groups, input_dim=None, cv_id=-1, criterion=criterion,
                            optmizer_method=opt_method, lam=lam, layers=layers,
                            num_epochs=num_epochs, train_batch_size=100,
                            verbose=False)

    print(
        "Final Sparsity %d features from %d groups in this Cross Validation:" %
        (n_sparse_features, n_sparse_groups))

    val_rst = classification_metric(val_rst[0], val_rst[1], val_rst[2])
    test_rst = classification_metric(test_rst[0], test_rst[1], test_rst[2])

    # Cache Result
    of = open("rst/rst_rna_exp.txt", "a")
    rst = [
        method, lambda_term, n_sparse_features, n_sparse_groups, 0, 0,
        *val_rst, *test_rst,
        str(kwargs), sparse_groups
    ]

    rst = [str(_) for _ in rst]
    of.write("\t".join(rst).replace("\n", " ") + "\n")
    of.close()
    print("#" * 200)
예제 #4
0
def run_linear_mnist_experiment(method, lambda_term, **kwargs):
    print("begin method:".upper(), method, "with \\lambda", lambda_term,
          datetime.datetime.now(), "#" * 20)
    method = method.lower()

    sparse_groups_found = []
    val_rsts = []
    test_rsts = []
    ns_sparse_features = []
    ns_sparse_groups = []

    val_pred_matrix = [
    ]  # it will become (n, 10), matrix where n = number of testing data
    test_pred_matrix = []
    for label_name in range(10):
        binary_train_labels = train_labels == label_name
        binary_val_labels = val_labels == label_name
        binary_test_labels = test_labels == label_name

        val_rst, test_rst, n_sparse_features, n_sparse_groups, sparse_groups = linear_experiment(
            train_features,
            binary_train_labels,
            val_features,
            binary_val_labels,
            test_features,
            binary_test_labels,
            group_definition,
            lambda_term=lambda_term,
            model_to_use=method)

        # record result for this 1-vs-rest run
        val_rsts.append(val_rst)
        test_rsts.append(test_rst)
        ns_sparse_features.append(n_sparse_features)
        ns_sparse_groups.append(n_sparse_groups)
        sparse_groups_found.append(sparse_groups)

        val_pred_matrix.append(val_rst[2])  # probability
        test_pred_matrix.append(test_rst[2])  # probability

    val_final_pred = np.array(val_pred_matrix).argmax(axis=0)
    test_final_pred = np.array(test_pred_matrix).argmax(axis=0)

    sparse_groups_found = [set(_) for _ in sparse_groups_found]

    final_sparse_group = set.intersection(*sparse_groups_found)

    val_acc = np.sum(
        np.array(val_labels) == np.array(val_final_pred)) / len(val_labels)
    test_acc = np.sum(
        np.array(test_labels) == np.array(test_final_pred)) / len(test_labels)

    print("!!! Validation Accuracy !!!!", val_acc, "!" * 20)
    print("!!! Testing Accuracy !!!!", test_acc, "!" * 20)
    print("Final Sparsity %d groups" % (len(final_sparse_group)),
          final_sparse_group)

    # Cache Result
    of = open("rst/linear_mnist_exp_rst.tsv", "a")
    rst = [
        method, lambda_term,
        np.mean(ns_sparse_features),
        np.mean(ns_sparse_groups), 0, 0, val_acc, test_acc,
        sparse_groups_found,
        str(kwargs),
        str(final_sparse_group)
    ]

    rst = [str(_) for _ in rst]
    of.write("\t".join(rst).replace("\n", " ") + "\n")
    of.close()
    print("#" * 200)
예제 #5
0
def run_ados_experiment(method, lambda_term, permutation_test=False, **kwargs):
    print("begin method:", method, "with \\lambda", lambda_term,
          datetime.datetime.now(), "#" * 20)
    method = method.lower()

    all_real = []
    all_pred = []
    all_prob = []

    sparse_features_counts = []
    sparse_groups_counts = []
    sparse_groups_all = []

    if "num_epochs" in kwargs:
        num_epochs = kwargs["num_epochs"]
    else:
        num_epochs = 5

    if "train_batch_size" in kwargs:
        train_batch_size = kwargs["train_batch_size"]
    else:
        train_batch_size = 1

    if "lr" in kwargs:
        lr = float(kwargs["lr"])
    else:
        lr = 0.01
    print("Init Learning Rate:", lr)

    if "layers" in kwargs:
        layers = kwargs["layers"]
    else:
        layers = [3000, 'R', 500, 'R', 1, 'S']

    print("Layers:", layers, "Train_batch_size:", train_batch_size)

    criterion = torch.nn.MSELoss()

    for cv_id, val_indices in enumerate(cross_validation_ids):

        num_val = len(val_indices)
        train_features = features.drop(val_indices).values
        train_labels = df[label_col_name].drop(val_indices).values.reshape(-1)

        val_features = features.iloc[val_indices].values.reshape(num_val, -1)
        val_labels = np.array(df.loc[list(val_indices),
                                     label_col_name]).reshape(-1)

        # Normalize Features
        if normalize_features:
            scaler = StandardScaler().fit(train_features)
            train_features = scaler.transform(train_features)
            val_features = scaler.transform(val_features)

        print("CV:", cv_id, "Shape Verification:",
              str(datetime.datetime.now()))

        if method == "lasso" or method == "linear regression" or  \
                method == "logistic regression" or method == "group lasso":
            val_rst, test_rst, n_sparse_features, n_sparse_groups, sparse_groups =\
            linear_experiment(train_features, train_labels,
                              val_features, val_labels,
                              None, None, # nothing for testing
                              feature_groups,
                              lambda_term=lambda_term, model_to_use=method)

        if method == "sgin" or method == "sgin_sgd" or method == "nn" or method == "theory":
            if method == "sgin":
                opt_method = "sbcgd"
                lam = lambda_term
            elif method == "sgin_sgd":
                opt_method = "sgd"
                lam = lambda_term
            elif method == "nn":
                # We can set lambda to zero, so sgin_sgd becomes normal NN
                opt_method = "sgd"
                lam = 0
            elif method == "theory":
                opt_method = "theory"
                lam = lambda_term


            val_rst, test_rst, n_sparse_features, n_sparse_groups, sparse_groups =\
                sgin_experiment(
                        train_features, train_labels,
                        val_features, val_labels,
                        None, None, # no testing set
                        feature_groups, cv_id=cv_id, criterion=criterion,
                        optmizer_method=opt_method, lam=lam, layers=layers,
                        num_epochs=num_epochs, train_batch_size=train_batch_size,
                        verbose=False, learning_rate=lr
                )

        real, pred, prob = val_rst
        all_real += real
        all_pred += pred
        all_prob += prob
        sparse_features_counts.append(n_sparse_features)
        sparse_groups_counts.append(n_sparse_groups)
        sparse_groups_all.append(sparse_groups)

        try:
            print("Curr Results:", scipy.stats.linregress(all_prob, all_real))
        except:
            pass  # Not enough data to evaluate correlation

        print(
            "Final Sparsity %d features from %d groups in this Cross Validation:"
            % (n_sparse_features, n_sparse_groups))

    print("#" * 10, "SUMMARY for", method)
    print("avg sparse features: %.2f; avg sparse groups: %.2f" %
          (np.mean(sparse_features_counts), np.mean(sparse_groups_counts)))

    all_prob = np.array(all_prob)
    all_prob[np.isnan(all_prob)] = np.mean(df[label_col_name])

    mse = ((np.array(all_real) - np.array(all_prob))**2).mean()
    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
        all_prob, all_real)

    # Cache Result
    of = open("rst/et_%s_rst.txt" % label_col_name,
              "a")  # with > 9000 features
    rst = [
        method, lambda_term,
        np.mean(sparse_features_counts),
        np.mean(sparse_groups_counts),
        np.std(sparse_features_counts),
        np.std(sparse_groups_counts), mse, slope, intercept, r_value, p_value,
        std_err, kwargs, sparse_groups_all
    ]
    rst = [str(_) for _ in rst]
    of.write("\t".join(rst).replace("\n", " ") + "\n")
    of.close()
    print(label_col_name, "Final Result:", "slope:", slope, "intercept:",
          intercept, "r_value:", r_value, "p_value:", p_value, "std_err:",
          std_err, "mse:", mse)
    print("#" * 200)

    return r_value