Example #1
def get_target_values(data, instruction, yLabel):
    # Resolve the instruction to its closest-matching column and pull it out
    target = get_similar_column(get_value_instruction(instruction), data)
    X = data[target]
    del data[target]

    # Labels: the column that best matches yLabel
    Y = data[get_similar_column(get_value_instruction(yLabel), data)]
    return X, Y
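
A minimal usage sketch (hypothetical column names and instruction text; assumes the get_similar_column helper resolves "age" and "income" as written):

import pandas as pd

df = pd.DataFrame({"age": [21, 35, 48], "income": [30000, 52000, 87000]})
X, Y = get_target_values(df, "predict age", "income")
# X holds the 'age' series and Y the 'income' series; df is mutated in
# place and no longer contains the 'age' column.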
Example #2
def dimensionality_KPCA(instruction, dataset, target="", y=""):
    global currLog
    global counter

    dataReader = DataReader("./data/" + get_last_file()[0])

    if target == "":
        # No target supplied: read the most recently saved dataset and
        # derive the labels from it
        data = dataReader.data_generator()
        data.fillna(0, inplace=True)
        remove = get_similar_column(get_value_instruction(instruction), data)

        y = data[remove]
        del data[remove]
        le = preprocessing.LabelEncoder()
        y = le.fit_transform(y)

    # Kernel PCA with as many components as input columns, i.e. a nonlinear
    # re-embedding of the feature space rather than a reduction
    kpca = KernelPCA(n_components=len(dataset.columns), kernel="rbf")
    data_modified = kpca.fit_transform(dataset)

    X_train, X_test, y_train, y_test = train_test_split(dataset,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=49)
    X_train_mod, X_test_mod, y_train_mod, y_test_mod = train_test_split(
        data_modified, y, test_size=0.2, random_state=49)

    clf = tree.DecisionTreeClassifier()
    clf.fit(X_train, y_train)

    clf_mod = tree.DecisionTreeClassifier()
    clf_mod.fit(X_train_mod, y_train_mod)
    # Grid-search tree depth and split criterion on the transformed data
    acc = []
    acc.append(accuracy_score(clf_mod.predict(X_test_mod), y_test_mod))
    for i, j in product(range(3, 10), ["entropy", "gini"]):
        model = tree.DecisionTreeClassifier(criterion=j, max_depth=i)
        model = model.fit(X_train_mod, y_train_mod)
        acc.append(accuracy_score(model.predict(X_test_mod), y_test_mod))
    del i, j
    data_modified = pd.DataFrame(data_modified)
    # Reattach labels in the original row order; concatenating the shuffled
    # y_train/y_test splits here would misalign them with the rows
    data_modified[target] = y
    # data_modified.to_csv("./data/housingPCA.csv")

    return data_modified, accuracy_score(
        clf.predict(X_test),
        y_test), max(acc), (len(dataset.columns) - len(data_modified.columns))

    # Note: this helper sits after the return statement above, so it is
    # unreachable as written
    def booster(dataset, obj):
        # obj in ["reg:linear", "multi:softmax"]

        X_train, X_test, y_train, y_test = train_test_split(dataset,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=49)
        clf = XGBClassifier(objective=obj,
                            learning_rate=0.1,
                            silent=1,
                            alpha=10)
        clf.fit(X_train, y_train)
        # Evaluate on this function's own test split, not the enclosing
        # function's *_mod split
        return accuracy_score(clf.predict(X_test), y_test)
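
A hedged usage sketch (features_df and labels are hypothetical stand-ins for a preprocessed feature frame and an encoded label array; passing target skips the label-derivation branch):

reduced_df, base_acc, best_acc, cols_removed = dimensionality_KPCA(
    "predict species", features_df, target="species", y=labels)
print(base_acc, best_acc)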
Example #3
def dimensionality_RF(instruction, dataset, target="", y="", n_features=10):
    # n_features is currently unused; subset sizes come from the loop below
    global currLog
    global counter

    dataReader = DataReader("./data/" + get_last_file()[0])

    if target == "":
        data = dataReader.data_generator()
        data.fillna(0, inplace=True)
        remove = get_similar_column(get_value_instruction(instruction), data)
        data = structured_preprocesser(data)

        y = data[remove]
        del data[remove]
        le = preprocessing.LabelEncoder()
        y = le.fit_transform(y)

    X_train, X_test, y_train, y_test = train_test_split(dataset,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=49)
    first_classifier = tree.DecisionTreeClassifier()
    first_classifier.fit(X_train, y_train)

    first_classifier_acc = accuracy_score(first_classifier.predict(X_test),
                                          y_test)

    # Baseline entry first; each loop iteration appends one candidate subset
    accuracy_scores = [first_classifier_acc]
    columns = [[]]
    datas = [dataset]

    # Sweep forest depth (i) and the number of top features kept (x)
    for i, x in product(range(3, 10), range(4, len(dataset.columns))):
        feature_model = RandomForestRegressor(random_state=1, max_depth=i)
        feature_model.fit(X_train, y_train)

        # Keep the x most important features according to the forest
        importances = feature_model.feature_importances_
        indices = np.argsort(importances)[-x:]
        columns.append(dataset.columns[indices])

        X_temp_train = X_train[dataset.columns[indices]]
        X_temp_test = X_test[dataset.columns[indices]]

        val = pd.DataFrame(np.r_[X_temp_train, X_temp_test])
        val[target] = np.r_[y_train, y_test]
        datas.append(val)

        vr = tree.DecisionTreeClassifier()
        vr.fit(X_temp_train, y_train)

        accuracy_scores.append(accuracy_score(vr.predict(X_temp_test), y_test))

    the_index = accuracy_scores.index(max(accuracy_scores))

    return datas[the_index], accuracy_scores[0], max(accuracy_scores), list(
        columns[the_index])
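
A hedged usage sketch (same hypothetical features_df and labels as above):

best_df, base_acc, best_acc, kept_cols = dimensionality_RF(
    "predict species", features_df, target="species", y=labels)
print(base_acc, "->", best_acc, "| columns kept:", kept_cols)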
Example #4
    def neural_network_query(self,
                             instruction,
                             mca_threshold=None,
                             drop=None,
                             preprocess=True,
                             test_size=0.2,
                             random_state=49,
                             epochs=50,
                             generate_plots=True,
                             callback_mode='min',
                             maximizer="val_loss",
                             save_model=False,
                             save_path=os.getcwd()):

        data = pd.read_csv(self.dataset)

        if preprocess:

            remove = get_similar_column(get_value_instruction(instruction),
                                        data)
            # An object-typed target means classification; otherwise fall
            # through to regression (drop is only forwarded on that path)
            if data[remove].dtype.name == 'object':
                callback_mode = 'max'
                maximizer = "val_accuracy"
                self.classification_query_ann(instruction,
                                              mca_threshold=mca_threshold,
                                              preprocess=preprocess,
                                              test_size=test_size,
                                              random_state=random_state,
                                              epochs=epochs,
                                              generate_plots=generate_plots,
                                              callback_mode=callback_mode,
                                              maximizer=maximizer,
                                              save_model=save_model,
                                              save_path=save_path)
            else:
                self.regression_query_ann(instruction,
                                          mca_threshold=mca_threshold,
                                          preprocess=preprocess,
                                          test_size=test_size,
                                          random_state=random_state,
                                          epochs=epochs,
                                          generate_plots=generate_plots,
                                          callback_mode=callback_mode,
                                          maximizer=maximizer,
                                          drop=drop,
                                          save_model=save_model,
                                          save_path=save_path)
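
A hedged usage sketch (assumes this method lives on libra's client class; the CSV path and instruction are illustrative):

newClient = client("housing.csv")
newClient.neural_network_query("predict median house value", epochs=10)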
Example #5
def initial_preprocesser(data, instruction, preprocess, mca_threshold):
    # Scans for object columns just in case we have a datetime column that
    # isn't detected
    object_columns = [
        col for col, col_type in data.dtypes.items()
        if col_type == 'object'
    ]

    # Handles dates without timestamps
    for col in object_columns:
        try:
            data[col] = pd.to_datetime(data[col], infer_datetime_format=True)
        except ValueError:
            pass

    # get target column
    target = get_similar_column(get_value_instruction(instruction), data)
    y = data[target]

    # remove rows where target is NaN
    data = data[y.notna()]
    y = y[y.notna()]

    del data[target]
    X_train, X_test, y_train, y_test = train_test_split(data,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=49)

    # Bundle the feature splits into a dict keyed by split name
    data = {'train': X_train, 'test': X_test}
    # preprocess the dataset
    full_pipeline = None
    if preprocess:
        data, full_pipeline = structured_preprocesser(data, mca_threshold)
    else:
        # data is a dict of splits at this point, so fill each one directly
        for split in data.values():
            split.fillna(0, inplace=True)

    y = {'train': y_train, 'test': y_test}

    return data, y, target, full_pipeline
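
A hedged usage sketch (the CSV path, instruction, and threshold value are illustrative):

raw = pd.read_csv("./data/housing.csv")
data, y, target, pipeline = initial_preprocesser(
    raw, "predict median house value", preprocess=True, mca_threshold=0.2)
print(target, data['train'].shape, y['train'].shape)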
Example #6
def instruction_identifier(params):
    # Resolve the instruction to its closest-matching column, stash that
    # column as the label, and drop it from the feature frame in place
    remove = get_similar_column(get_value_instruction(params['instruction']),
                                params['data'])
    params['y'] = params['data'][remove]
    del params['data'][remove]
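
A minimal usage sketch (the instruction text and DataFrame df are hypothetical):

params = {'instruction': 'predict price', 'data': df}
instruction_identifier(params)
# params['y'] now holds the label series; the matched column has been
# deleted from params['data'].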
Example #7
def dimensionality_reduc(instruction,
                         dataset,
                         arr=["RF", "PCA", "KPCA", "ICA"],
                         inplace=False):
    global currLog
    global counter

    dataReader = DataReader(dataset)

    logger("loading dataset...")
    data = dataReader.data_generator()
    data.fillna(0, inplace=True)

    logger("getting most similar column from instruction...")
    target = get_similar_column(get_value_instruction(instruction), data)

    y = data[target]
    del data[target]
    le = preprocessing.LabelEncoder()
    y = le.fit_transform(y)

    data = structured_preprocesser(data)

    perms = []
    overall_storage = []
    finals = []

    logger("generating dimensionality permutations...")
    for i in range(1, len(arr) + 1):
        for elem in list(permutations(arr, i)):
            perms.append(elem)

    logger("running each possible permutation...")
    logger("realigning tensors...")
    for path in perms:
        currSet = data
        for element in path:
            if element == "RF":
                data_mod, beg_acc, final_acc, col_removed = dimensionality_RF(
                    instruction, currSet, target, y)
            elif element == "PCA":
                data_mod, beg_acc, final_acc, col_removed = dimensionality_PCA(
                    instruction, currSet, target, y)
            elif element == "KPCA":
                data_mod, beg_acc, final_acc, col_removed = dimensionality_KPCA(
                    instruction, currSet, target, y)
            elif element == "ICA":
                data_mod, beg_acc, final_acc, col_removed = dimensionality_ICA(
                    instruction, currSet, target, y)
            overall_storage.append(
                list([data_mod, beg_acc, final_acc, col_removed]))
            # Feed this technique's output into the next step of the chain
            currSet = data_mod
        # Keep only the final result of each permutation
        finals.append(overall_storage[-1])

    logger("Fetching Best Accuracies...")
    accs = []
    print("")
    print("Baseline Accuracy: " + str(finals[0][1]))
    print("----------------------------")
    for i, element in product(range(len(finals)), finals):
        print("Permutation --> " + str(perms[i]) + " | Final Accuracy --> " +
              str(element[2]))
        if finals[0][1] < element[2]:
            accs.append(
                list([
                    "Permutation --> " + str(perms[i]) +
                    " | Final Accuracy --> " + str(element[2])
                ]))
    print("")
    print("Best Accuracies")
    print("----------------------------")
    for element in accs:
        print(element)

    if inplace:
        # Overwrite the source file with the preprocessed feature frame
        # (the target column was removed above)
        data.to_csv(dataset)
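
A hedged usage sketch (the CSV path and instruction are illustrative; inplace=False leaves the source file untouched):

dimensionality_reduc("predict median house value",
                     "./data/housing.csv",
                     arr=["RF", "PCA"],
                     inplace=False)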