Example #1
def prepare_data_for_training(file_dir):
    dataframe = pd.read_csv(file_dir)

    # replacing the labels
    dataframe['feature_class'] = dataframe['feature_class'].replace({'ASD': 1.0, 'TD': -1.0})

    # splitting data into train and test
    x_train, x_test, y_train, y_test = Utils.train_test_split(dataframe=dataframe, test_size=0.2)

    # convert the dataframes to numpy arrays and return
    return x_train.to_numpy(), x_test.to_numpy(), y_train.to_numpy(), y_test.to_numpy()
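
These snippets all rely on "import pandas as pd" and on a project-specific Utils module that the page never shows. As a reading aid, here is a minimal sketch of what Utils.train_test_split might look like, inferred from the call sites; the signature and the assumption that 'feature_class' is the label column are guesses, not confirmed by the source:

from sklearn.model_selection import train_test_split as sk_split


def train_test_split(dataframe, test_size):
    # assumed: 'feature_class' is the label column, everything else is a feature
    x = dataframe.drop(labels='feature_class', axis=1)
    y = dataframe['feature_class']
    # delegate shuffling and splitting to scikit-learn
    return sk_split(x, y, test_size=test_size)
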
Example #2
def process_training_data(fl_dir, min_max_scalar=None):
    df = pd.read_csv(fl_dir)

    # replace labels
    df['feature_class'] = df['feature_class'].replace({'ASD': 1.0, 'TD': -1.0})

    x_train, x_test, y_train, y_test = Utils.train_test_split(df, 0.2)

    # if min_max_scalar is not None:
    #     x_train = Utils.normalize_dataset(x_train, min_max_scalar)
    #     x_test = Utils.normalize_dataset(x_test, min_max_scalar)

    return x_train, x_test, y_train, y_test
Example #3
def process_training_data(fl_dir):
    df = pd.read_csv(fl_dir)

    # finding min max scalar
    # min_max_scalar = Utils.calculate_min_max_scalar(pd.read_csv(fl_dir))

    # replace labels
    df['feature_class'] = df['feature_class'].replace({'ASD': 1.0, 'TD': -1.0})

    X_train, X_test, y_train, y_test = Utils.train_test_split(df, 0.2)

    # X_train = Utils.normalize_dataset(X_train, min_max_scalar)
    # X_test = Utils.normalize_dataset(X_test, min_max_scalar)

    X_train = X_train.to_numpy()
    X_test = X_test.to_numpy()
    y_train = y_train.to_numpy()
    y_test = y_test.to_numpy()

    return X_train, X_test, y_train, y_test
Example #4
def process_training_data(fl_dir, min_max_scalar):
    dataframe = pd.read_csv(fl_dir)

    # calculates the min and max value of each column in the dataframe
    # min_max_scalar = Utils.calculate_min_max_scalar(dataset=dataframe)

    # replacing the labels
    dataframe['feature_class'] = dataframe['feature_class'].replace({'ASD': 1.0, 'TD': -1.0})

    # splitting data into train and test
    x_train, x_test, y_train, y_test = Utils.train_test_split(
        dataframe=dataframe, test_size=0.2)

    # normalizing train and test data
    # x_train = Utils.normalize_dataset(df=x_train, min_max=min_max_scalar)
    # x_test = Utils.normalize_dataset(df=x_test, min_max=min_max_scalar)
    #
    # # insert intercept col 'b' in (W * Xi + b)
    # x_train.insert(loc=len(x_train.columns), column='intercept', value=1)
    # x_test.insert(loc=len(x_test.columns), column='intercept', value=1)

    return x_train.to_numpy(), x_test.to_numpy(), y_train.to_numpy(), y_test.to_numpy()
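
The commented-out lines in Example #4 point at two more Utils helpers. A rough sketch of per-column min-max scaling, assuming the "scalar" is a dict mapping each column to its (min, max) on the full dataset; the names and shapes are inferred from the call sites, not confirmed by the source:

def calculate_min_max_scalar(dataset):
    # record each column's (min, max) so train and test are rescaled consistently
    return {col: (dataset[col].min(), dataset[col].max()) for col in dataset.columns}


def normalize_dataset(df, min_max):
    df = df.copy()
    for col, (lo, hi) in min_max.items():
        if col in df.columns and hi != lo:
            # map the column into the [0, 1] range
            df[col] = (df[col] - lo) / (hi - lo)
    return df

The commented intercept column is the usual bias trick: appending a constant-1 feature lets the weight vector absorb b, so the decision function W * Xi + b reduces to a single dot product.
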
Example #5
    def is_leaf_node(self):
        # a node is a leaf when it stores a prediction value
        return self.value is not None


if __name__ == "__main__":
    df = pd.read_csv("D:/TrainingDataset_YEAR_PROJECT/TrainingSet.csv")

    # replace labels
    df['feature_class'] = df['feature_class'].replace({'ASD': 1.0, 'TD': -1.0})

    X_train, X_test, y_train, y_test = Utils.train_test_split(df, 0.2)

    X_train = X_train.values
    X_test = X_test.values
    y_train = y_train.values
    y_test = y_test.values

    decision_tree = DecisionTree(max_depth=10)
    decision_tree.train_decision_tree(X_train, y_train)

    y_pred = decision_tree.predict(X_test)

    accuracy_score = Utils.calculate_accuracy_score(y_test, y_pred)
    print("Accuracy: ", accuracy_score)

Example #6
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report
# Utils is a project-specific helper module; see the sketch under Example #1


def train_svm_model(fl_dir):
    #  X_train, X_test, Y_train, Y_test = SupportVectorMachine.proc_CSV_data(fl_dir)
    dataframe = pd.read_csv(fl_dir)
    dataframe["feature_class"] = dataframe["feature_class"].replace({"ASD": 1, "TD": 0})
    # shuffle the rows before splitting
    dataframe = dataframe.sample(frac=1)

    # DATA NORMALIZATION
    # min_max_scalar = Utils.calculate_min_max_scalar(dataframe)
    # dataframe = Utils.normalize_dataset(dataframe, min_max_scalar)
    #
    # X = dataframe.drop(labels="feature_class", axis=1)
    # Y = dataframe['feature_class']
    #
    # #  feature selection BEGIN
    # print("Shape before feature selection", X.shape)
    #
    # # L1-based feature selection
    # lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, Y)
    # model = SelectFromModel(lsvc, prefit=True)
    # X = model.transform(X)
    #
    # # Univariate feature selection
    # X = SelectKBest(chi2, k=6).fit_transform(X, Y)

    # Tree-based feature selection
    # clf = ExtraTreesClassifier(n_estimators=50)
    # clf = clf.fit(X, Y)
    # clf.feature_importances_
    # model = SelectFromModel(clf, prefit=True)
    # X = model.transform(X)

    # print("Shape after feature seleciton", X.shape)
    #  feature selection END

    #  train test split
    # X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20)
    X_train, X_test, Y_train, Y_test = Utils.train_test_split(
        dataframe=dataframe, test_size=0.20)

    #  SVM

    # scalar = MinMaxScaler()
    # X_train = pd.DataFrame(scalar.fit_transform(X_train.values))
    # X_test = pd.DataFrame(scalar.transform(X_test.values))
    #
    # ns_probs = [0 for _ in range(len(Y_test))]
    # svm_model_linear = SVC(kernel='rbf', probability=True).fit(X_train, Y_train)  # RBF kernel
    #
    # # load the saved model
    # # load_model = joblib.load(filename=saved_mdl_path)
    #
    # svm_prediction = svm_model_linear.predict(X_test)
    #
    # accuracy = svm_model_linear.score(X_test, Y_test)
    # print(accuracy)  # debug
    #
    # # creating a confusion matrix
    # cm = confusion_matrix(Y_test, svm_prediction)
    #
    # print(cm)
    # print(classification_report(Y_test, svm_prediction))
    #
    # # predict probabilities
    # lr_probs = svm_model_linear.predict_proba(X_test)
    # # keep probabilities for the positive outcome only
    # lr_probs = lr_probs[:, 1]
    # # calculate scores
    # ns_auc = roc_auc_score(Y_test, ns_probs)
    # lr_auc = roc_auc_score(Y_test, lr_probs)
    # # summarize scores
    # print('No Skill: ROC AUC=%.3f' % (ns_auc))
    # print('SVM: ROC AUC=%.3f' % (lr_auc))
    # # calculate roc curves
    # ns_fpr, ns_tpr, _ = roc_curve(Y_test, ns_probs)
    # lr_fpr, lr_tpr, _ = roc_curve(Y_test, lr_probs)
    # # plot the roc curve for the model
    # pyplot.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
    # pyplot.plot(lr_fpr, lr_tpr, marker='.', label='SVM')
    # # axis labels
    # pyplot.xlabel('False Positive Rate')
    # pyplot.ylabel('True Positive Rate')
    #
    # # save the model
    # # saved_mdl_path = 'normal_leaf_model.sav'
    # # joblib.dump(svm_model_linear, saved_mdl_path)
    #
    # # show the legend
    # pyplot.legend()
    # # show the plot
    # pyplot.show()

    # RANDOM FOREST CLASSIFIER
    # scalar = MinMaxScaler()
    # X_train = pd.DataFrame(scalar.fit_transform(X_train.values))
    # X_test = pd.DataFrame(scalar.transform(X_test.values))
    #
    # ns_probs = [0 for _ in range(len(Y_test))]
    # regressor = RandomForestClassifier(n_estimators=18, max_depth=10).fit(X_train, Y_train)
    # y_pred = regressor.predict(X_test)
    #
    # print(confusion_matrix(Y_test, y_pred))
    # print(classification_report(Y_test, y_pred))
    # # print(Y_test)
    # # print(y_pred)
    # print(SupportVectorMachine.calc_accuracy_score(Y_test, y_pred))
    #
    # accuracy = regressor.score(X_test, Y_test)
    # print(accuracy)  # debug
    #
    # print(accuracy_score(Y_test, y_pred.round(), normalize=False))

    # NN..............................................................
    # mlp = MLPClassifier(hidden_layer_sizes=9, activation='relu', max_iter=400, solver='adam').fit(X_train, Y_train)
    #
    # predictions = mlp.predict(X_test)
    # print(confusion_matrix(Y_test, predictions))
    # print(classification_report(Y_test, predictions))
    # print(Utils.calculate_accuracy_score(Y_test, predictions))

    # KNN
    # scalar = MinMaxScaler()
    # X_train = pd.DataFrame(scalar.fit_transform(X_train.values))
    # X_test = pd.DataFrame(scalar.transform(X_test.values))

    model = KNeighborsClassifier(n_neighbors=2).fit(X_train, Y_train)
    y_pred = model.predict(X_test)

    accuracy = model.score(X_test, Y_test)
    print(accuracy)  # debug

    print(confusion_matrix(Y_test, y_pred))
    print(classification_report(Y_test, y_pred))
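
Because k-NN is distance-based, the MinMaxScaler step commented out above usually matters in practice. One way to keep train/test scaling consistent without leaking test statistics is a scikit-learn Pipeline; fit_scaled_knn below is a hypothetical helper sketching that alternative, not part of the original code:

from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler


def fit_scaled_knn(X_train, Y_train, X_test, Y_test, k=2):
    # the scaler is fit on the training data only, then reused on the test data
    knn = make_pipeline(MinMaxScaler(), KNeighborsClassifier(n_neighbors=k))
    knn.fit(X_train, Y_train)
    return knn, knn.score(X_test, Y_test)
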