Example #1
def get_ner(self, instruction):
    """
    function to identify name entities
    :param instruction: Used to get target column
    :return: dictionary object with detected name-entities
    """
    data = DataReader(self.dataset)
    data = data.data_generator()

    target = get_similar_column(get_value_instruction(instruction), data)
    logger("->", "Target Column Found: {}".format(target))

    # Remove stopwords if any from the detection column
    data['combined_text_for_ner'] = data[target].apply(
        lambda x: ' '.join([word for word in x.split() if word not in stopwords.words()]))

    logger("Detecting Name Entities from : {} data files".format(data.shape[0] + 1))

    # Named entity recognition pipeline, default model selection
    with NoStdStreams():
        hugging_face_ner_detector = pipeline('ner', grouped_entities=True, framework='tf')
        data['ner'] = data['combined_text_for_ner'].apply(lambda x: hugging_face_ner_detector(x))
    logger("NER detection status complete")
    logger("Storing information in client object under key 'named_entity_recognition'")

    self.models["named_entity_recognition"] = {
        "model": hugging_face_ner_detector.model,
        "tokenizer": hugging_face_ner_detector.tokenizer,
        'name_entities': data['ner'].to_dict()}

    logger("Output: ", data['ner'].to_dict())
    clearLog()
    return self.models["named_entity_recognition"]
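A hypothetical usage sketch for the method above; "client" stands in for whatever object exposes get_ner() and wraps a dataset with a free-text column, and the instruction string is a placeholder.

# Hypothetical: `client` exposes get_ner() and wraps a dataset with a text column.
result = client.get_ner("detect entities in the review text")

# 'name_entities' maps each row index to the grouped entities the pipeline detected.
for row_index, entities in result["name_entities"].items():
    for ent in entities:
        print(row_index, ent["entity_group"], ent["word"], round(float(ent["score"]), 3))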
Example #2
def decision_tree(instruction,
                  dataset=None,
                  preprocess=True,
                  ca_threshold=None,
                  text=None,
                  test_size=0.2,
                  drop=None):
    logger("Reading in dataset....")

    dataReader = DataReader(dataset)
    data = dataReader.data_generator()

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    data, y, remove, full_pipeline = initial_preprocesser(
        data, instruction, preprocess, ca_threshold, text)
    logger("->", "Target Column Found: {}".format(remove))

    X_train = data['train']
    y_train = y['train']
    X_test = data['test']
    y_test = y['test']

    # classification_column = get_similar_column(getLabelwithInstruction(instruction), data)

    # Needed to make a custom label encoder due to train test split changes
    # Can still be inverse transformed, just a bit of extra work
    y_vals = np.unique(pd.concat([y['train'], y['test']], axis=0))
    label_mappings = {}
    for i in range(len(y_vals)):
        label_mappings[y_vals[i]] = i

    # Custom label encoder due to train test split
    y_train = y_train.apply(lambda x: label_mappings[x]).values
    y_test = y_test.apply(lambda x: label_mappings[x]).values
    num_classes = len(np.unique(y))

    # fitting and storing
    logger("Fitting Decision Tree...")

    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(X_train, y_train)

    score = accuracy_score(clf.predict(X_test), y_test)
    logger("->", "Score found on testing set: {}".format(score))
    logger("Stored model under 'decision_tree' key")

    clearLog()

    return {
        'id': generate_id(),
        "model": clf,
        "target": remove,
        "accuracy_score": score,
        "preprocesser": full_pipeline,
        "interpeter": label_mappings,
        "cross_val_score": cross_val_score(clf, X_train, y_train, cv=3)
    }
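A hypothetical follow-up showing how the returned dictionary might be used to score new rows. The instruction, CSV name, and new_rows DataFrame are placeholders, and it assumes the stored preprocesser exposes a scikit-learn style transform().

# Placeholders: instruction text, CSV path, and new_rows (a DataFrame shaped like the
# training data minus the target column).
result = decision_tree("predict the species column", dataset="iris.csv")

# label_mappings maps original labels to integers, so invert it to decode predictions.
inverse_mappings = {v: k for k, v in result["interpreter"].items()}
encoded = result["model"].predict(result["preprocesser"].transform(new_rows))
decoded = [inverse_mappings[p] for p in encoded]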
Example #3
def dimensionality_RF(instruction, dataset, target="", y="", n_features=10):

    global counter

    dataReader = DataReader(dataset)

    if target == "":
        data = dataReader.data_generator()
        data.fillna(0, inplace=True)
        remove = get_similar_column(get_value_instruction(instruction), data)
        data, y, target, full_pipeline = initial_preprocessor(
            data, instruction, True, 0.2, [], 0.2, random_state=49)

        le = preprocessing.LabelEncoder()
        X_train = data['train']
        y_train = y['train']
        X_test = data['test']
        y_test = y['test']

        y_train = le.fit_transform(y_train)
        y_test = le.transform(y_test)

    first_classifier = tree.DecisionTreeClassifier()
    first_classifier.fit(X_train, y_train)

    first_classifier_acc = accuracy_score(
        first_classifier.predict(X_test), y_test)

    accuracy_scores = [first_classifier_acc]
    columns = []
    datas = []
    datas.append(dataset)
    columns.append([])

    for i, x in product(range(3, 10), range(4, len(X_train.columns))):
        feature_model = RandomForestRegressor(random_state=1, max_depth=i)
        feature_model.fit(X_train, y_train)

        importances = feature_model.feature_importances_
        indices = np.argsort(importances)[-x:]
        columns.append(X_train.columns[indices])

        X_temp_train = X_train[X_train.columns[indices]]
        X_temp_test = X_test[X_train.columns[indices]]

        val = pd.DataFrame(np.r_[X_temp_train, X_temp_test])
        val[target] = np.r_[y_train, y_test]
        datas.append(val)

        vr = tree.DecisionTreeClassifier()
        vr.fit(X_temp_train, y_train)

        accuracy_scores.append(accuracy_score(vr.predict(X_temp_test), y_test))

    the_index = accuracy_scores.index(max(accuracy_scores))
    print(accuracy_scores)
    return datas[the_index], accuracy_scores[0], max(
        accuracy_scores), list(columns[the_index])
def dimensionality_KPCA(instruction, dataset, target="", y=""):
    global currLog
    global counter

    dataReader = DataReader("./data/" + get_last_file()[0])

    if target == "":
        data = dataReader.data_generator()
        data.fillna(0, inplace=True)
        remove = get_similar_column(get_value_instruction(instruction), data)

        y = data[remove]
        del data[remove]
        le = preprocessing.LabelEncoder()
        y = le.fit_transform(y)

    kpca = KernelPCA(n_components=len(dataset.columns), kernel="rbf")
    data_modified = kpca.fit_transform(dataset)

    X_train, X_test, y_train, y_test = train_test_split(
        dataset, y, test_size=0.2, random_state=49)
    X_train_mod, X_test_mod, y_train_mod, y_test_mod = train_test_split(
        data_modified, y, test_size=0.2, random_state=49)

    clf = tree.DecisionTreeClassifier()
    clf.fit(X_train, y_train)

    clf_mod = tree.DecisionTreeClassifier()
    clf_mod.fit(X_train_mod, y_train_mod)
    acc = []
    acc.append(accuracy_score(
        clf_mod.predict(X_test_mod), y_test_mod))
    for i, j in product(range(3, 10), ["entropy", "gini"]):
        model = tree.DecisionTreeClassifier(criterion=j, max_depth=i)
        model = model.fit(X_train_mod, y_train_mod)
        acc.append(accuracy_score(model.predict(X_test_mod), y_test))
    del i, j
    data_modified = pd.DataFrame(data_modified)
    data_modified[target] = np.r_[y_train, y_test]
    # data_modified.to_csv("./data/housingPCA.csv")

    return data_modified, accuracy_score(
        clf.predict(X_test), y_test), max(acc), (len(
            dataset.columns) - len(data_modified.columns))

    # Note: defined after the return above, so this helper is never reached.
    def booster(dataset, obj):
        # obj = ["reg:linear", "multi:softmax"]

        X_train, X_test, y_train, y_test = train_test_split(
            dataset, y, test_size=0.2, random_state=49)
        clf = XGBClassifier(
            objective=obj,
            learning_rate=0.1,
            silent=1,
            alpha=10)
        clf.fit(X_train, y_train)
        return accuracy_score(clf.predict(X_test), y_test)
def dimensionality_KPCA(instruction, dataset, target="", y=""):
    '''
    function to reduce dimensionality in the dataset via kernel principal component analysis
    :param instruction: command sent to client instance in written query.
    :param dataset: data instantiated in client instance passed to the algorithm
    :param target: column name of response variable/feature
    :param y: dictionary of train/test data values associated with response variable/feature
    '''

    pca = KernelPCA(kernel='rbf')

    dataReader = DataReader(dataset)
    dataset = dataReader.data_generator()

    data, y, target, full_pipeline = initial_preprocesser(dataset,
                                                          instruction,
                                                          True,
                                                          0.2, [],
                                                          0.2,
                                                          random_state=49)

    X_train = data['train']
    X_test = data['test']

    y_train = y['train']
    y_test = y['test']

    X_train_mod = pca.fit_transform(X_train)
    X_test_mod = pca.transform(X_test)

    clf = tree.DecisionTreeClassifier()
    clf_mod = tree.DecisionTreeClassifier()

    clf.fit(X_train, y_train)
    clf_mod.fit(X_train_mod, y_train)

    acc = []
    acc.append(accuracy_score(clf_mod.predict(X_test_mod), y_test))

    for i, j in product(range(3, 10), ["entropy", "gini"]):
        model = tree.DecisionTreeClassifier(criterion=j, max_depth=i)
        model = model.fit(X_train_mod, y_train)
        acc.append(accuracy_score(model.predict(X_test_mod), y_test))
    del i, j

    data_modified = pd.concat(
        [pd.DataFrame(X_train_mod),
         pd.DataFrame(X_test_mod)], axis=0)

    y_combined = np.r_[y_train, y_test]
    data_modified[target] = y_combined
    # data_modified.to_csv("./data/housingPCA.csv")

    return data_modified, accuracy_score(
        clf.predict(X_test),
        y_test), max(acc), (len(dataset.columns) - len(data_modified.columns))
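The essential transformation above, as a standalone sketch on synthetic data (independent of the helpers used in these examples): fit KernelPCA on the training split only, then reuse that mapping on the test split before comparing classifier accuracy.

from sklearn import tree
from sklearn.datasets import make_classification
from sklearn.decomposition import KernelPCA
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=300, n_features=12, random_state=49)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=49)

kpca = KernelPCA(kernel="rbf")
X_train_mod = kpca.fit_transform(X_train)   # fit only on the training data
X_test_mod = kpca.transform(X_test)         # apply the same mapping to the test data

clf = tree.DecisionTreeClassifier().fit(X_train_mod, y_train)
print(accuracy_score(y_test, clf.predict(X_test_mod)))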
def dimensionality_RF(instruction, dataset, target="", y="", n_features=10):
    global currLog
    global counter

    dataReader = DataReader("./data/" + get_last_file()[0])

    if target == "":
        data = dataReader.data_generator()
        data.fillna(0, inplace=True)
        remove = get_similar_column(get_value_instruction(instruction), data)
        data = structured_preprocesser(data)

        y = data[remove]
        del data[remove]
        le = preprocessing.LabelEncoder()
        y = le.fit_transform(y)

    X_train, X_test, y_train, y_test = train_test_split(
        dataset, y, test_size=0.2, random_state=49)
    first_classifier = tree.DecisionTreeClassifier()
    first_classifier.fit(X_train, y_train)

    first_classifier_acc = accuracy_score(
        first_classifier.predict(X_test), y_test)

    accuracy_scores = [first_classifier_acc]
    columns = []
    datas = []
    datas.append(dataset)
    columns.append([])

    for i, x in product(range(3, 10), range(4, len(dataset.columns))):
        feature_model = RandomForestRegressor(random_state=1, max_depth=i)
        feature_model.fit(X_train, y_train)

        importances = feature_model.feature_importances_
        indices = np.argsort(importances)[-x:]
        columns.append(dataset.columns[indices])

        X_temp_train = X_train[dataset.columns[indices]]
        X_temp_test = X_test[dataset.columns[indices]]

        val = pd.DataFrame(np.r_[X_temp_train, X_temp_test])
        val[target] = np.r_[y_train, y_test]
        datas.append(val)

        vr = tree.DecisionTreeClassifier()
        vr.fit(X_temp_train, y_train)

        accuracy_scores.append(accuracy_score(vr.predict(X_temp_test), y_test))

    the_index = accuracy_scores.index(max(accuracy_scores))

    return datas[the_index], accuracy_scores[0], max(
        accuracy_scores), list(columns[the_index])
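The feature-ranking step both loops above rely on, as a standalone sketch on synthetic data: fit a random forest, rank columns by feature_importances_, and keep the k most important ones (the regressor mirrors the code above; a classifier would work the same way).

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestRegressor

X, y = make_classification(n_samples=200, n_features=8, random_state=1)
X = pd.DataFrame(X, columns=[f"f{i}" for i in range(8)])

forest = RandomForestRegressor(random_state=1, max_depth=5).fit(X, y)
top_k = 4
indices = np.argsort(forest.feature_importances_)[-top_k:]   # k most important columns
print(list(X.columns[indices]))
X_reduced = X[X.columns[indices]]                            # reduced feature matrix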
Example #7
def nearest_neighbors(instruction=None,
                      dataset=None,
                      ca_threshold=None,
                      preprocess=True,
                      drop=None,
                      min_neighbors=3,
                      max_neighbors=10,
                      text=[]):
    logger("Reading in dataset....")
    # Reads in dataset
    # data = pd.read_csv(self.dataset)
    dataReader = DataReader(dataset)
    data = dataReader.data_generator()
    if drop is not None:
        data.drop(drop, axis=1, inplace=True)
    data, y, remove, full_pipeline = initial_preprocesser(
        data, instruction, preprocess, ca_threshold, text)
    logger("->", "Target Column Found: {}".format(remove))
    X_train = data['train']
    y_train = y['train']
    X_test = data['test']
    y_test = y['test']
    # classification_column = get_similar_column(getLabelwithInstruction(instruction), data)
    num_classes = len(np.unique(y))
    # encodes the label dataset into 0's and 1's
    y_vals = np.unique(pd.concat([y['train'], y['test']], axis=0))
    label_mappings = {}
    for i in range(len(y_vals)):
        label_mappings[y_vals[i]] = i
    y_train = y_train.apply(lambda x: label_mappings[x]).values
    y_test = y_test.apply(lambda x: label_mappings[x]).values
    models = []
    scores = []
    logger("Fitting Nearest Neighbor...")
    logger("Identifying optimal number of neighbors...")
    # Tries all neighbor possibilities, based on either defaults or user
    # specified values
    for x in range(min_neighbors, max_neighbors):
        knn = KNeighborsClassifier(n_neighbors=x)
        knn.fit(X_train, y_train)
        models.append(knn)
        scores.append(accuracy_score(knn.predict(X_test), y_test))
    logger("Stored model under 'nearest_neighbors' key")
    knn = models[scores.index(max(scores))]
    return {
        'id': generate_id(),
        "model": knn,
        "accuracy_score": scores.index(min(scores)),
        "preprocesser": full_pipeline,
        "interpreter": label_mappings,
        "target": remove,
        "cross_val_score": cross_val_score(knn, X_train, y_train, cv=3)
    }
def dimensionality_PCA(instruction, dataset, ca_threshold=None):

    global counter

    pca = PCA(0.92)

    dataReader = DataReader(dataset)
    dataset = dataReader.data_generator()

    data, y, target, full_pipeline = initial_preprocesser(dataset,
                                                          instruction,
                                                          True,
                                                          0.2, [],
                                                          0.2,
                                                          random_state=49)

    X_train = data['train']
    X_test = data['test']

    y_train = y['train']
    y_test = y['test']

    X_train_mod = pca.fit_transform(X_train)
    X_test_mod = pca.transform(X_test)

    clf = tree.DecisionTreeClassifier()
    clf_mod = tree.DecisionTreeClassifier()

    clf.fit(X_train, y_train)
    clf_mod.fit(X_train_mod, y_train)

    acc = []
    acc.append(accuracy_score(clf_mod.predict(X_test_mod), y_test))

    for i, j in product(range(3, 10), ["entropy", "gini"]):
        model = tree.DecisionTreeClassifier(criterion=j, max_depth=i)
        model = model.fit(X_train_mod, y_train)
        acc.append(accuracy_score(model.predict(X_test_mod), y_test))
    del i, j

    data_modified = pd.concat(
        [pd.DataFrame(X_train_mod),
         pd.DataFrame(X_test_mod)], axis=0)

    y_combined = np.r_[y_train, y_test]
    data_modified[target] = y_combined
    # data_modified.to_csv("./data/housingPCA.csv")

    return data_modified, accuracy_score(
        clf.predict(X_test),
        y_test), max(acc), (len(dataset.columns) - len(data_modified.columns))
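A quick illustration of what PCA(0.92) above means: when n_components is a float between 0 and 1, scikit-learn keeps just enough components to explain that fraction of the variance. The wine dataset here is only a stand-in.

from sklearn.datasets import load_wine
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

X = StandardScaler().fit_transform(load_wine().data)
pca = PCA(0.92).fit(X)
print(pca.n_components_)                     # number of components actually kept
print(pca.explained_variance_ratio_.sum())   # at least 0.92 by construction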
def dimensionality_KPCA(instruction, dataset, target="", y=""):
    '''
    function to reduce dimensionality in the dataset via kernel principal component analysis
    :param instruction: command sent to client instance in written query.
    :param dataset: data instantiated in client instance passed to the algorithm
    :param target: column name of response variable/feature
    :param y: dictionary of train/test data values associated with response variable/feature
    '''
    
    global counter

    dataReader = DataReader("./data/" + get_last_file()[0])

    if target == "":
        data = dataReader.data_generator()
        data.fillna(0, inplace=True)
        remove = get_similar_column(get_value_instruction(instruction), data)

        y = data[remove]
        del data[remove]
        le = preprocessing.LabelEncoder()
        y = le.fit_transform(y)

    kpca = KernelPCA(n_components=len(dataset.columns), kernel="rbf")
    data_modified = kpca.fit_transform(dataset)

    X_train, X_test, y_train, y_test = train_test_split(
        dataset, y, test_size=0.2, random_state=49)
    X_train_mod, X_test_mod, y_train_mod, y_test_mod = train_test_split(
        data_modified, y, test_size=0.2, random_state=49)

    clf = tree.DecisionTreeClassifier()
    clf.fit(X_train, y_train)

    clf_mod = tree.DecisionTreeClassifier()
    clf_mod.fit(X_train_mod, y_train_mod)
    acc = []
    acc.append(accuracy_score(
        clf_mod.predict(X_test_mod), y_test_mod))
    for i, j in product(range(3, 10), ["entropy", "gini"]):
        model = tree.DecisionTreeClassifier(criterion=j, max_depth=i)
        model = model.fit(X_train_mod, y_train_mod)
        acc.append(accuracy_score(model.predict(X_test_mod), y_test))
    del i, j
    data_modified = pd.DataFrame(data_modified)
    data_modified[target] = np.r_[y_train, y_test]
    # data_modified.to_csv("./data/housingPCA.csv")

    return data_modified, accuracy_score(
        clf.predict(X_test), y_test), max(acc), (len(
            dataset.columns) - len(data_modified.columns))
Example #10
def train_svm(instruction,
              dataset=None,
              test_size=0.2,
              kernel='linear',
              text=[],
              preprocess=True,
              ca_threshold=None,
              drop=None,
              cross_val_size=0.3,
              degree=3,
              gamma='scale',
              coef0=0.0,
              max_iter=-1,
              random_state=49):
    '''
    function to train a support vector machine classification algorithm
    :param many params: used to hyperparametrize the function.
    :return a dictionary object with all of the information for the algorithm.
    '''

    logger("Reading in dataset")

    dataReader = DataReader(dataset)
    data = dataReader.data_generator()

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    logger("Preprocessing data")
    data, y, target, full_pipeline = initial_preprocesser(
        data,
        instruction,
        preprocess,
        ca_threshold,
        text,
        test_size=test_size,
        random_state=random_state)
    logger("->", "Target column found: {}".format(target))

    X_train = data['train']
    y_train = y['train']
    X_test = data['test']
    y_test = y['test']

    # classification_column = get_similar_column(getLabelwithInstruction(instruction), data)
    num_classes = len(np.unique(y))

    # Needed to make a custom label encoder due to train test split changes
    # Can still be inverse transformed, just a bit of extra work
    y_vals = np.unique(pd.concat([y['train'], y['test']], axis=0))
    label_mappings = sklearn.preprocessing.LabelEncoder()
    label_mappings.fit(y_vals)

    y_train = label_mappings.transform(y_train)
    y_test = label_mappings.transform(y_test)

    # Fitting to SVM and storing in the model dictionary
    logger("Fitting Support Vector Machine")
    clf = svm.SVC(kernel=kernel,
                  degree=degree,
                  gamma=gamma,
                  coef0=coef0,
                  max_iter=max_iter)
    clf.fit(X_train, y_train)

    score = accuracy_score(clf.predict(X_test), y_test)

    logger("->", "Accuracy found on testing set: {}".format(score))

    logger('->', "Stored model under 'svm' key")
    clearLog()
    return {
        'id': generate_id(),
        "model": clf,
        'num_classes': num_classes,
        "accuracy": {
            'cross_val_score': cross_val_score(clf, X_train, y_train),
            'accuracy_score': score
        },
        "target": target,
        "preprocesser": full_pipeline,
        "interpreter": label_mappings,
        'test_data': {
            'X': X_test,
            'y': y_test
        }
    }
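A hypothetical follow-up: evaluate the stored model on the held-out split that train_svm returns under 'test_data', and decode a few predictions with the LabelEncoder interpreter. The instruction and CSV name are placeholders.

from sklearn.metrics import confusion_matrix

svm_result = train_svm("predict the churn column", dataset="telco.csv")  # placeholders
preds = svm_result["model"].predict(svm_result["test_data"]["X"])
print(confusion_matrix(svm_result["test_data"]["y"], preds))
print(svm_result["interpreter"].inverse_transform(preds[:5]))  # back to original labels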
Example #11
def tune_helper(model_to_tune=None,
                dataset=None,
                models=None,
                max_layers=10,
                min_layers=2,
                min_dense=32,
                max_dense=512,
                executions_per_trial=3,
                max_trials=1,
                activation='relu',
                loss='categorical_crossentropy',
                metrics='accuracy',
                seed=42,
                objective='val_accuracy',
                directory='my_dir',
                epochs=10,
                step=32,
                verbose=0,
                test_size=0.2):
    logger("Getting target model for tuning...")

    # checks to see which requested model is in the self.models

    # processing for regression feed forward NN
    if model_to_tune == 'regression_ANN':
        logger("Tuning model hyperparameters...")
        dataReader = DataReader(dataset)
        data = dataReader.data_generator()
        target = models['regression_ANN']['target']
        target_column = data[models['regression_ANN']['target']]
        data = models['regression_ANN']['preprocesser'].transform(
            data.drop(target, axis=1))
        returned_model, returned_pms, history = tuneReg(
            data,
            target_column,
            max_layers=max_layers,
            min_layers=min_layers,
            min_dense=min_dense,
            max_dense=max_dense,
            executions_per_trial=executions_per_trial,
            max_trials=max_trials,
            epochs=epochs,
            activation=activation,
            step=step,
            verbose=verbose,
            test_size=test_size)
        models['regression_ANN'] = {
            'model': returned_model,
            'hyperparameters': returned_pms,
            'losses': {
                'training_loss': history.history['loss'],
                'val_loss': history.history['val_loss']
            }
        }

        # processing for classification feed forward NN
    elif model_to_tune == "classification_ANN":
        logger("Tuning model hyperparameters...")
        dataReader = DataReader(dataset)
        data = dataReader.data_generator()
        target = models['classification_ANN']['target']
        target_column = data[models['classification_ANN']['target']]
        data = models['classification_ANN']['preprocesser'].transform(
            data.drop(target, axis=1))
        returned_model, returned_pms, history = tuneClass(
            data,
            target_column,
            models['classification_ANN']['num_classes'],
            max_layers=max_layers,
            min_layers=min_layers,
            min_dense=min_dense,
            max_dense=max_dense,
            executions_per_trial=executions_per_trial,
            max_trials=max_trials,
            activation=activation,
            loss=loss,
            metrics=metrics,
            epochs=epochs,
            step=step,
            verbose=verbose,
            test_size=test_size)
        models['classification_ANN'] = {
            'model': returned_model,
            'hyperparameters': returned_pms,
            'losses': {
                'training_loss': history.history['loss'],
                'val_loss': history.history['val_loss']
            }
        }
        # processing for convolutional NN
    elif model_to_tune == "convolutional_NN":
        logger("Tuning model hyperparameters...")
        X_train, X_test, height, width, num_classes = get_image_data(dataset)
        model, returned_pms, history = tuneCNN(
            X_train,
            X_test,
            height,
            width,
            num_classes,
            executions_per_trial=executions_per_trial,
            max_trials=max_trials,
            seed=seed,
            objective=objective,
            directory=directory,
            epochs=epochs,
            verbose=verbose,
            test_size=test_size)
        models["convolutional_NN"]["model"] = model
        models["convolutional_NN"]["hyperparametes"] = returned_pms,
        models["convolutional_NN"]["losses"] = {
            'training_loss': history.history['loss'],
            'val_loss': history.history['val_loss']
        }

    return models
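A hypothetical usage sketch: tune_helper takes the dictionary of previously trained models (here one containing a 'regression_ANN' entry) together with the original dataset, and writes the tuned model back into that dictionary. The path and trained_models variable are placeholders, and the tuned model is assumed to be a Keras model.

models = tune_helper(model_to_tune="regression_ANN",
                     dataset="housing.csv",       # placeholder path
                     models=trained_models,       # produced by an earlier regression query
                     max_trials=3,
                     epochs=5)
models["regression_ANN"]["model"].summary()       # assumed to be a Keras model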
Example #12
def dimensionality_ICA(instruction, dataset, target="", y=""):

    global counter

    dataReader = DataReader(dataset)

    if target == "":
        data = dataReader.data_generator()
        data.fillna(0, inplace=True)
        remove = get_similar_column(get_value_instruction(instruction), data)

        data, y, target, full_pipeline = initial_preprocessor(
            data, instruction, True, 0.2, [], 0.2, random_state=49)

        X_train = data['train']
        X_test = data['test']

        y_train = y['train']
        y_test = y['test']


    ica = FastICA(n_components=len(X_train.columns))
    X_train_mod = ica.fit_transform(X_train)
    # reuse the mapping fitted on the training split instead of refitting on the test split
    X_test_mod = ica.transform(X_test)


    clf = tree.DecisionTreeClassifier()
    clf.fit(X_train, y_train)

    clf_mod = tree.DecisionTreeClassifier()
    clf_mod.fit(X_train_mod, y_train)
    acc = []
    sets = []
    acc.append(accuracy_score(
        clf_mod.predict(X_test_mod), y_test))

    frame = pd.concat([pd.DataFrame(X_train_mod), pd.DataFrame(X_test_mod)], axis=0)
    frame[target] = np.r_[y_train, y_test]
    sets.append(frame)

    for i in range(2, len(X_train.columns)):
        ica = FastICA(n_components=i)
        X_train_mod = ica.fit_transform(X_train)
        X_test_mod = ica.transform(X_test)

        frame = pd.concat([pd.DataFrame(X_train_mod), pd.DataFrame(X_test_mod)], axis=0)
        frame[target] = np.r_[y_train, y_test]
        sets.append(frame)

        clf_mod = tree.DecisionTreeClassifier()
        clf_mod.fit(X_train_mod, y_train)

        acc.append(accuracy_score(
            clf_mod.predict(X_test_mod), y_test))

    del i

    data_modified = sets[acc.index(max(acc))]
    score = max(acc)


    return data_modified, score, ((len(
            X_train.columns) + 1) - len(data_modified.columns))
Example #13
def classification_ann(instruction,
                       callback=False,
                       dataset=None,
                       text=[],
                       ca_threshold=None,
                       preprocess=True,
                       callback_mode='min',
                       drop=None,
                       random_state=49,
                       test_size=0.2,
                       epochs=50,
                       generate_plots=True,
                       maximizer="val_accuracy",
                       save_model=False,
                       save_path=os.getcwd(),
                       add_layer={}):
    '''
    Body of the classification function used that is called in the neural network query
    if the data is categorical.
    :param many parameters: used to preprocess, tune, plot generation, and parameterizing the neural network trained.
    :return dictionary that holds all the information for the finished model.
    '''

    if dataset is None:
        dataReader = DataReader(get_file())
    else:
        dataReader = DataReader(dataset)
    logger("Reading in dataset")
    data = dataReader.data_generator()

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    data, y, remove, full_pipeline = initial_preprocessor(
        data,
        instruction,
        preprocess,
        ca_threshold,
        text,
        test_size=test_size,
        random_state=random_state)
    logger("->", "Target column found: {}".format(remove))

    # Needed to make a custom label encoder due to train test split changes
    # Can still be inverse transformed, just a bit of extra work
    y = pd.concat([y['train'], y['test']], axis=0)

    num_classes = len(np.unique(y))

    if num_classes < 2:
        raise Exception("Number of classes must be greater than or equal to 2")

    X_train = data['train']
    X_test = data['test']

    if num_classes >= 2:
        # ANN needs target one hot encoded for classification
        one_hotencoder = OneHotEncoder()
        y = pd.DataFrame(one_hotencoder.fit_transform(
            np.reshape(y.values, (-1, 1))).toarray(),
                         columns=one_hotencoder.get_feature_names())

    y_train = y.iloc[:len(X_train)]
    y_test = y.iloc[len(X_train):]

    models = []
    losses = []
    accuracies = []
    model_data = []

    logger("Establishing callback function")

    # early stopping callback
    es = EarlyStopping(monitor=maximizer, mode='max', verbose=0, patience=5)

    callback_value = None
    if callback is not False:
        callback_value = [es]

    i = 0
    model = get_keras_model_class(data, i, num_classes, add_layer)
    logger("Training initial model")

    history = model.fit(X_train,
                        y_train,
                        callbacks=callback_value,
                        epochs=epochs,
                        validation_data=(X_test, y_test),
                        verbose=0)

    model_data.append(model)
    models.append(history)
    col_name = [[
        "Initial number of layers ", "| Training Accuracy ", "| Test Accuracy "
    ]]
    col_width = max(len(word) for row in col_name for word in row) + 2
    for row in col_name:
        print((" " * 2 * counter) + "| " +
              ("".join(word.ljust(col_width) for word in row)) + " |")
    values = []
    values.append(str(len(model.layers)))
    values.append("| " + str(history.history['accuracy'][
        len(history.history['val_accuracy']) - 1]))
    values.append("| " + str(history.history['val_accuracy'][
        len(history.history['val_accuracy']) - 1]))
    datax = []
    datax.append(values)
    for row in datax:
        print((" " * 2 * counter) + "| " +
              ("".join(word.ljust(col_width) for word in row)) + " |")
    # print((" " * 2 * counter)+ tabulate(datax, headers=col_name, tablefmt='orgtbl'))
    losses.append(history.history[maximizer][len(history.history[maximizer]) -
                                             1])
    accuracies.append(
        history.history['val_accuracy'][len(history.history['val_accuracy']) -
                                        1])
    # keeps running model and fit functions until the validation loss stops
    # decreasing

    logger("Testing number of layers")
    col_name = [[
        "Current number of layers", "| Training Accuracy", "| Test Accuracy"
    ]]
    col_width = max(len(word) for row in col_name for word in row) + 2

    for row in col_name:
        print((" " * 2 * counter) + "| " +
              ("".join(word.ljust(col_width) for word in row)) + " |")
    datax = []
    # while all(x < y for x, y in zip(accuracies, accuracies[1:])):
    while (len(accuracies) <= 2 or
           accuracies[len(accuracies) - 1] > accuracies[len(accuracies) - 2]):
        model = get_keras_model_class(data, i, num_classes, add_layer)
        history = model.fit(X_train,
                            y_train,
                            callbacks=callback_value,
                            epochs=epochs,
                            validation_data=(X_test, y_test),
                            verbose=0)

        values = []
        datax = []
        values.append(str(len(model.layers)))
        values.append("| " + str(history.history['accuracy'][
            len(history.history['accuracy']) - 1]))
        values.append("| " + str(history.history['val_accuracy'][
            len(history.history['val_accuracy']) - 1]))
        datax.append(values)
        for row in datax:
            print((" " * 2 * counter) + "| " +
                  ("".join(word.ljust(col_width) for word in row)) + " |")
        del values, datax
        losses.append(
            history.history[maximizer][len(history.history[maximizer]) - 1])
        accuracies.append(history.history['val_accuracy'][
            len(history.history['val_accuracy']) - 1])
        models.append(history)
        model_data.append(model)

        i += 1
    # print((" " * 2 * counter)+ tabulate(datax, headers=col_name, tablefmt='orgtbl'))
    # del values, datax

    final_model = model_data[accuracies.index(max(accuracies))]
    final_hist = models[accuracies.index(max(accuracies))]

    print("")
    logger('->',
           "Best number of layers found: " + str(len(final_model.layers)))
    logger(
        '->', "Training Accuracy: " + str(final_hist.history['accuracy'][
            len(final_hist.history['val_accuracy']) - 1]))
    logger(
        '->', "Test Accuracy: " + str(final_hist.history['val_accuracy'][
            len(final_hist.history['val_accuracy']) - 1]))

    # generates appropriate classification plots by feeding all information
    plots = {}
    if generate_plots:
        plots = generate_classification_plots(models[len(models) - 1])

    if save_model:
        save(final_model, save_model, save_path)

    print("")
    logger("Stored model under 'classification_ANN' key")
    clearLog()

    K.clear_session()

    # stores the values and plots into the object dictionary
    return {
        'id': generate_id(),
        "model": final_model,
        'num_classes': num_classes,
        "plots": plots,
        "target": remove,
        "preprocessor": full_pipeline,
        "interpreter": one_hotencoder,
        'test_data': {
            'X': X_test,
            'y': y_test
        },
        'losses': {
            'training_loss': final_hist.history['loss'],
            'val_loss': final_hist.history['val_loss']
        },
        'accuracy': {
            'training_accuracy': final_hist.history['accuracy'],
            'validation_accuracy': final_hist.history['val_accuracy']
        }
    }
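A hypothetical usage sketch: turn the network's class probabilities back into the original labels via the stored OneHotEncoder. The instruction and CSV name are placeholders.

result = classification_ann("predict the label column", dataset="reviews.csv")

probs = result["model"].predict(result["test_data"]["X"])
class_names = result["interpreter"].categories_[0][probs.argmax(axis=1)]
print(class_names[:10])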
Example #14
def train_xgboost(instruction,
                  dataset=None,
                  learning_rate=0.1,
                  n_estimators=1000,
                  ca_threshold=None,
                  max_depth=6,
                  min_child_weight=1,
                  gamma=0,
                  subsample=0.8,
                  colsample_bytree=0.8,
                  objective='binary:logistic',
                  random_state=27,
                  test_size=0.2,
                  text=[],
                  preprocess=True,
                  verbosity=0,
                  drop=None):
    '''
    function to train an XGBoost algorithm
    :param many params: used to hyperparametrize the function.
    :return a dictionary object with all of the information for the algorithm.
    '''

    logger("Reading in dataset")

    dataReader = DataReader(dataset)
    data = dataReader.data_generator()

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    logger("Preprocessing data")
    data, y, target, full_pipeline = initial_preprocesser(
        data,
        instruction,
        preprocess,
        ca_threshold,
        text,
        test_size=test_size,
        random_state=random_state)
    logger("->", "Target column found: {}".format(target))

    X_train = data['train']
    y_train = y['train']
    X_test = data['test']
    y_test = y['test']

    # classification_column = get_similar_column(getLabelwithInstruction(instruction), data)
    num_classes = len(np.unique(y))

    if num_classes > 2:
        objective = 'multi:softmax'

    # Needed to make a custom label encoder due to train test split changes
    # Can still be inverse transformed, just a bit of extra work
    y_vals = np.unique(pd.concat([y['train'], y['test']], axis=0))
    label_mappings = sklearn.preprocessing.LabelEncoder()
    label_mappings.fit(y_vals)

    y_train = label_mappings.transform(y_train)
    y_test = label_mappings.transform(y_test)

    # Fitting XGBoost and storing in the model dictionary
    logger("Fitting XGBoost")
    clf = XGBClassifier(learning_rate=learning_rate,
                        n_estimators=n_estimators,
                        max_depth=max_depth,
                        min_child_weight=min_child_weight,
                        gamma=gamma,
                        subsample=subsample,
                        colsample_bytree=colsample_bytree,
                        objective=objective,
                        verbosity=verbosity,
                        random_state=random_state)
    clf.fit(X_train, y_train)

    score = accuracy_score(clf.predict(X_test), y_test)

    logger("->", "Accuracy found on testing set: {}".format(score))

    logger('->', "Stored model under 'xgboost' key")
    clearLog()

    return {
        'id': generate_id(),
        "model": clf,
        "target": target,
        'num_classes': num_classes,
        "accuracy": {
            'cross_val_score': cross_val_score(
                clf,
                X_train,
                y_train,
            ),
            'accuracy_score': score
        },
        "accuracy_score": score,
        "preprocesser": full_pipeline,
        "interpreter": label_mappings,
        'test_data': {
            'X': X_test,
            'y': y_test
        }
    }
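A hypothetical sketch of reading the two accuracy figures train_xgboost returns: the single held-out score and scikit-learn's cross_val_score array (5-fold by default). The instruction and CSV name are placeholders.

xgb_result = train_xgboost("predict the outcome column", dataset="patients.csv")
print("held-out accuracy:", xgb_result["accuracy"]["accuracy_score"])
print("mean CV accuracy:", xgb_result["accuracy"]["cross_val_score"].mean())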
Example #15
def summarization_query(self,
                        instruction,
                        preprocess=True,
                        label_column=None,
                        drop=None,
                        epochs=10,
                        batch_size=32,
                        learning_rate=1e-4,
                        max_text_length=512,
                        max_summary_length=150,
                        test_size=0.2,
                        random_state=49,
                        gpu=False,
                        generate_plots=True,
                        save_model=False,
                        save_path=os.getcwd()):
    '''
    function to apply algorithm for text summarization 
    :param many params: used to hyperparametrize the function.
    :return a dictionary object with all of the information for the algorithm.
    '''

    if test_size < 0:
        raise Exception("Test size must be a float between 0 and 1")

    if test_size >= 1:
        raise Exception(
            "Test size must be a float between 0 and 1 (a test size greater than or equal to 1 results in no training "
            "data)")

    if max_text_length < 2 or max_summary_length < 2:
        raise Exception("Text and summary must be at least of length 2")

    if epochs < 1:
        raise Exception(
            "Epoch number is less than 1 (model will not be trained)")

    if batch_size < 1:
        raise Exception("Batch size must be equal to or greater than 1")

    if max_text_length < 1:
        raise Exception("Max text length must be equal to or greater than 1")

    if max_summary_length < 1:
        raise Exception(
            "Max summary length must be equal to or greater than 1")

    if save_model:
        if not os.path.exists(save_path):
            raise Exception("Save path does not exists")

    if test_size == 0:
        testing = False
    else:
        testing = True

    if gpu:
        device = "cuda"
    else:
        device = "cpu"

    data = DataReader(self.dataset)
    data = data.data_generator()

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    if preprocess:
        data.fillna(0, inplace=True)

    logger("Preprocessing data...")

    if label_column is None:
        label = "summary"
    else:
        label = label_column

    X, Y, target = get_target_values(data, instruction, label)
    df = pd.DataFrame({'text': Y, 'ctext': X})
    logger("->", "Target Column Found: {}".format(target))

    torch.manual_seed(random_state)
    np.random.seed(random_state)

    tokenizer = T5Tokenizer.from_pretrained("t5-small")

    train_size = 1 - test_size
    train_dataset = df.sample(frac=train_size,
                              random_state=random_state).reset_index(drop=True)

    logger("Establishing dataset walkers")
    training_set = CustomDataset(train_dataset, tokenizer, max_text_length,
                                 max_summary_length)

    if testing:
        val_dataset = df.drop(train_dataset.index).reset_index(drop=True)

        val_set = CustomDataset(val_dataset, tokenizer, max_text_length,
                                max_summary_length)

        val_params = {
            'batch_size': batch_size,
            'shuffle': False,
            'num_workers': 0
        }
        val_loader = DataLoader(val_set, **val_params)
    else:
        val_loader = None

    train_params = {
        'batch_size': batch_size,
        'shuffle': True,
        'num_workers': 0
    }

    training_loader = DataLoader(training_set, **train_params)
    # used small model
    model = T5ForConditionalGeneration.from_pretrained("t5-small")
    model = model.to(device)

    optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

    logger('Fine-Tuning the model on your dataset...')
    total_loss_train = []
    total_loss_val = []
    for epoch in range(epochs):
        loss_train, loss_val = train(epoch,
                                     tokenizer,
                                     model,
                                     device,
                                     training_loader,
                                     val_loader,
                                     optimizer,
                                     testing=testing)
        total_loss_train.append(loss_train)
        total_loss_val.append(loss_val)

    logger("->", "Final training loss: {}".format(loss_train))
    if testing:
        logger("->", "Final validation loss: {}".format(loss_val))
    else:
        logger("->",
               "Final validation loss: {}".format("0, No validation done"))

    plots = {}
    if generate_plots:
        logger("Generating plots")
        plots.update({
            "loss":
            libra.plotting.nonkeras_generate_plots.plot_loss(
                total_loss_train, total_loss_val)
        })

    if save_model:
        logger("Saving model")
        path = os.path.join(save_path, "DocSummarization.pth")
        torch.save(model, path)
        logger("->", "Saved model to disk as DocSummarization.pth")

    logger(
        "Storing information in client object under key 'doc_summarization'")

    self.models["doc_summarization"] = {
        "model": model,
        "max_text_length": max_text_length,
        "max_sum_length": max_summary_length,
        "plots": plots,
        'losses': {
            'training_loss': loss_train,
            'val_loss': loss_val
        }
    }
    clearLog()
    return self.models["doc_summarization"]
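A hypothetical inference sketch with the fine-tuned model stored under 'doc_summarization'. It assumes a prior summarization_query call on a client object, CPU execution, and that inputs were prefixed the same way during fine-tuning; the article text is a placeholder.

from transformers import T5Tokenizer

stored = client.models["doc_summarization"]
tokenizer = T5Tokenizer.from_pretrained("t5-small")

inputs = tokenizer("summarize: " + article_text,       # placeholder text
                   return_tensors="pt",
                   max_length=stored["max_text_length"],
                   truncation=True)
summary_ids = stored["model"].generate(inputs["input_ids"],
                                       max_length=stored["max_sum_length"])
print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))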
def dimensionality_reduc(
        instruction,
        dataset,
        arr=[
            "RF",
            "PCA",
            "KPCA",
            "ICA"],
        inplace=False):
    global currLog
    global counter

    dataReader = DataReader(dataset)

    logger("loading dataset...")
    data = dataReader.data_generator()
    data.fillna(0, inplace=True)

    logger("getting most similar column from instruction...")
    target = get_similar_column(get_value_instruction(instruction), data)

    y = data[target]
    del data[target]
    le = preprocessing.LabelEncoder()
    y = le.fit_transform(y)

    data = structured_preprocesser(data)

    perms = []
    overall_storage = []
    finals = []

    logger("generating dimensionality permutations...")
    for i in range(1, len(arr) + 1):
        for elem in list(permutations(arr, i)):
            perms.append(elem)

    logger("running each possible permutation...")
    logger("realigning tensors...")
    for path in perms:
        currSet = data
        for element in path:
            if element == "RF":
                data_mod, beg_acc, final_acc, col_removed = dimensionality_RF(
                    instruction, currSet, target, y)
            elif element == "PCA":
                data_mod, beg_acc, final_acc, col_removed = dimensionality_PCA(
                    instruction, currSet, target, y)
            elif element == "KPCA":
                data_mod, beg_acc, final_acc, col_removed = dimensionality_KPCA(
                    instruction, currSet, target, y)
            elif element == "ICA":
                data_mod, beg_acc, final_acc, col_removed = dimensionality_ICA(
                    instruction, currSet, target, y)
            overall_storage.append(
                list([data_mod, beg_acc, final_acc, col_removed]))
            currSet = data_mod
        finals.append(overall_storage[len(overall_storage) - 1])

    logger("Fetching Best Accuracies...")
    accs = []
    print("")
    print("Baseline Accuracy: " + str(finals[0][1]))
    print("----------------------------")
    for i, element in enumerate(finals):
        print("Permutation --> " +
              str(perms[i]) +
              " | Final Accuracy --> " +
              str(element[2]))
        if finals[0][1] < element[2]:
            accs.append(list(["Permutation --> " +
                              str(perms[i]) +
                              " | Final Accuracy --> " +
                              str(element[2])]))
    print("")
    print("Best Accuracies")
    print("----------------------------")
    for element in accs:
        print(element)

    if inplace:
        data.to_csv(dataset)
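An illustration of the search space dimensionality_reduc iterates over: every ordering of every non-empty subset of the requested techniques.

from itertools import permutations

arr = ["RF", "PCA", "KPCA", "ICA"]
perms = [p for i in range(1, len(arr) + 1) for p in permutations(arr, i)]
print(len(perms))   # 64 paths for the four techniques
print(perms[:5])    # ('RF',), ('PCA',), ('KPCA',), ('ICA',), ('RF', 'PCA')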
Example #17
def nearest_neighbors(instruction=None,
                      dataset=None,
                      ca_threshold=None,
                      preprocess=True,
                      drop=None,
                      min_neighbors=3,
                      max_neighbors=10,
                      leaf_size=30,
                      p=2,
                      test_size=0.2,
                      random_state=49,
                      algorithm='auto',
                      text=[]):
    '''
    function to train a nearest neighbor algorithm
    :param many params: used to hyperparametrize the function.
    :return a dictionary object with all of the information for the algorithm.
    '''

    logger("Reading in dataset")
    # Reads in dataset
    # data = pd.read_csv(self.dataset)
    dataReader = DataReader(dataset)
    data = dataReader.data_generator()
    if drop is not None:
        data.drop(drop, axis=1, inplace=True)
    logger("Preprocessing data")
    data, y, remove, full_pipeline = initial_preprocesser(
        data,
        instruction,
        preprocess,
        ca_threshold,
        text,
        test_size=test_size,
        random_state=random_state)
    logger("->", "Target column found: {}".format(remove))
    X_train = data['train']
    y_train = y['train']
    X_test = data['test']
    y_test = y['test']
    # classification_column = get_similar_column(getLabelwithInstruction(instruction), data)
    num_classes = len(np.unique(y))
    # encodes the label dataset into 0's and 1's
    y_vals = np.unique(pd.concat([y['train'], y['test']], axis=0))
    label_mappings = sklearn.preprocessing.LabelEncoder()
    label_mappings.fit(y_vals)

    y_train = label_mappings.transform(y_train)
    y_test = label_mappings.transform(y_test)
    logger("Labels being mapped to appropriate classes")
    models = []
    scores = []
    logger("Fitting nearest neighbors model")
    logger("Identifying optimal number of neighbors")
    # Tries all neighbor possibilities, based on either defaults or user
    # specified values
    num_neighbors = []
    for x in range(min_neighbors, max_neighbors):
        knn = KNeighborsClassifier(n_neighbors=x,
                                   leaf_size=leaf_size,
                                   p=p,
                                   algorithm=algorithm)
        knn.fit(X_train, y_train)
        models.append(knn)
        scores.append(accuracy_score(knn.predict(X_test), y_test))
        num_neighbors.append(x)

    logger(
        "->", "Optimal number of neighbors found: {}".format(
            num_neighbors[scores.index(max(scores))]))
    logger(
        "->", "Accuracy found on testing set: {}".format(scores[scores.index(
            max(scores))]))
    logger("Stored model under 'nearest_neighbors' key")
    knn = models[scores.index(max(scores))]
    clearLog()
    return {
        'id': generate_id(),
        "model": knn,
        'num_classes': num_classes,
        "accuracy": {
            'accuracy_score': scores[scores.index(max(scores))],
            'cross_val_score': cross_val_score(knn, X_train, y_train, cv=3)
        },
        "preprocesser": full_pipeline,
        "interpreter": label_mappings,
        'test_data': {
            'X': X_test,
            'y': y_test
        },
        "target": remove
    }
Example #18
def classification_ann(instruction,
                       dataset=None,
                       text=None,
                       ca_threshold=None,
                       preprocess=True,
                       callback_mode='min',
                       drop=None,
                       random_state=49,
                       test_size=0.2,
                       epochs=50,
                       generate_plots=True,
                       maximizer="val_loss",
                       save_model=True,
                       save_path=os.getcwd()):

    global currLog
    logger("Reading in dataset...")

    dataReader = DataReader(dataset)
    data = dataReader.data_generator()

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    data, y, remove, full_pipeline = initial_preprocesser(
        data, instruction, preprocess, ca_threshold, text)
    logger("->", "Target Column Found: {}".format(remove))

    # Needed to make a custom label encoder due to train test split changes
    # Can still be inverse transformed, just a bit of extra work
    y = pd.concat([y['train'], y['test']], axis=0)

    num_classes = len(np.unique(y))

    X_train = data['train']
    X_test = data['test']

    # ANN needs target one hot encoded for classification
    one_hot_encoder = OneHotEncoder()

    y = pd.DataFrame(one_hot_encoder.fit_transform(
        np.reshape(y.values, (-1, 1))).toarray(),
                     columns=one_hot_encoder.get_feature_names())

    y_train = y.iloc[:len(X_train)]
    y_test = y.iloc[len(X_train):]

    models = []
    losses = []
    accuracies = []
    model_data = []

    logger("Establishing callback function...")

    # early stopping callback
    es = EarlyStopping(monitor=maximizer, mode='min', verbose=0, patience=5)

    i = 0
    model = get_keras_model_class(data, i, num_classes)
    logger("Training initial model...")
    history = model.fit(X_train,
                        y_train,
                        epochs=epochs,
                        validation_data=(X_test, y_test),
                        callbacks=[es],
                        verbose=0)

    model_data.append(model)
    models.append(history)
    col_name = [[
        "Initial number of layers ", "| Training Loss ", "| Test Loss "
    ]]
    col_width = max(len(word) for row in col_name for word in row) + 2
    for row in col_name:
        print((" " * 2 * counter) + "| " +
              ("".join(word.ljust(col_width) for word in row)) + " |")
    values = []
    values.append(str(len(model.layers)))
    values.append(
        "| " +
        str(history.history['loss'][len(history.history['val_loss']) - 1]))
    values.append(
        "| " +
        str(history.history['val_loss'][len(history.history['val_loss']) - 1]))
    datax = []
    datax.append(values)
    for row in datax:
        print((" " * 2 * counter) + "| " +
              ("".join(word.ljust(col_width) for word in row)) + " |")
    #print((" " * 2 * counter)+ tabulate(datax, headers=col_name, tablefmt='orgtbl'))
    losses.append(history.history[maximizer][len(history.history[maximizer]) -
                                             1])
    # keeps running model and fit functions until the validation loss stops
    # decreasing

    logger("Testing number of layers...")
    col_name = [["Current number of layers", "| Training Loss", "| Test Loss"]]
    col_width = max(len(word) for row in col_name for word in row) + 2

    for row in col_name:
        print((" " * 2 * counter) + "| " +
              ("".join(word.ljust(col_width) for word in row)) + " |")
    datax = []
    while (all(x > y for x, y in zip(losses, losses[1:]))):
        model = get_keras_model_class(data, i, num_classes)
        history = model.fit(X_train,
                            y_train,
                            epochs=epochs,
                            validation_data=(X_test, y_test),
                            callbacks=[es],
                            verbose=0)

        values = []
        datax = []
        values.append(str(len(model.layers)))
        values.append(
            "| " +
            str(history.history['loss'][len(history.history['val_loss']) - 1]))
        values.append("| " + str(history.history['val_loss'][
            len(history.history['val_loss']) - 1]))
        datax.append(values)
        for row in datax:
            print((" " * 2 * counter) + "| " +
                  ("".join(word.ljust(col_width) for word in row)) + " |")
        losses.append(
            history.history[maximizer][len(history.history[maximizer]) - 1])
        accuracies.append(history.history['val_accuracy'][
            len(history.history['val_accuracy']) - 1])
        # track each candidate model so the best one can be recovered below
        models.append(history)
        model_data.append(model)
        i += 1
    #print((" " * 2 * counter)+ tabulate(datax, headers=col_name, tablefmt='orgtbl'))
    #del values, datax
    final_model = model_data[losses.index(min(losses))]
    final_hist = models[losses.index(min(losses))]
    print("")
    logger('->',
           "Best number of layers found: " + str(len(final_model.layers)))
    logger(
        '->', "Training Accuracy: " + str(final_hist.history['accuracy'][
            len(final_hist.history['val_accuracy']) - 1]))
    logger(
        '->', "Test Accuracy: " + str(final_hist.history['val_accuracy'][
            len(final_hist.history['val_accuracy']) - 1]))

    # generates appropriate classification plots by feeding all information
    plots = generate_classification_plots(models[len(models) - 1], data, y,
                                          model, X_test, y_test)

    if save_model:
        save(final_model, save_model)

    print("")
    logger("Stored model under 'classification_ANN' key")

    # stores the values and plots into the object dictionary
    return {
        'id': generate_id(),
        "model": final_model,
        'num_classes': num_classes,
        "plots": plots,
        "target": remove,
        "preprocesser": full_pipeline,
        "interpreter": one_hot_encoder,
        'losses': {
            'training_loss': final_hist.history['loss'],
            'val_loss': final_hist.history['val_loss']
        },
        'accuracy': {
            'training_accuracy': final_hist.history['accuracy'],
            'validation_accuracy': final_hist.history['val_accuracy']
        }
    }
Example #19
def decision_tree(instruction,
                  dataset=None,
                  preprocess=True,
                  ca_threshold=None,
                  text=[],
                  test_size=0.2,
                  drop=None,
                  criterion='gini',
                  splitter='best',
                  max_depth=None,
                  min_samples_split=2,
                  min_samples_leaf=1,
                  min_weight_fraction_leaf=0.0,
                  max_leaf_nodes=None,
                  min_impurity_decrease=0.0,
                  ccp_alpha=0.0):
    '''
    function to train a decision tree algorithm.
    :param many params: used to hyperparametrize the function.
    :return a dictionary object with all of the information for the algorithm.
    '''
    logger("Reading in dataset")

    dataReader = DataReader(dataset)
    data = dataReader.data_generator()
    logger("Preprocessing data")
    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    data, y, remove, full_pipeline = initial_preprocesser(
        data, instruction, preprocess, ca_threshold, text)
    logger("->", "Target column found: {}".format(remove))

    X_train = data['train']
    y_train = y['train']
    X_test = data['test']
    y_test = y['test']

    # classification_column = get_similar_column(getLabelwithInstruction(instruction), data)

    # Needed to make a custom label encoder due to train test split changes
    # Can still be inverse transformed, just a bit of extra work
    y_vals = np.unique(pd.concat([y['train'], y['test']], axis=0))
    label_mappings = sklearn.preprocessing.LabelEncoder()
    label_mappings.fit(y_vals)

    y_train = label_mappings.transform(y_train)
    y_test = label_mappings.transform(y_test)

    logger("Labels being mapped to appropriate classes")
    num_classes = len(np.unique(y))

    # fitting and storing
    logger("Fitting Decision Tree")

    clf = tree.DecisionTreeClassifier(
        criterion=criterion,
        splitter=splitter,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        ccp_alpha=ccp_alpha)
    clf = clf.fit(X_train, y_train)

    score = accuracy_score(clf.predict(X_test), y_test)
    logger("->", "Score found on testing set: {}".format(score))
    logger("Stored model under 'decision_tree' key")
    clearLog()

    return {
        'id': generate_id(),
        "model": clf,
        "target": remove,
        'num_classes': num_classes,
        "accuracy": {
            'cross_val_score': cross_val_score(clf, X_train, y_train, cv=3),
            'accuracy_score': score
        },
        "accuracy_score": score,
        "preprocesser": full_pipeline,
        "interpreter": label_mappings,
        'test_data': {
            'X': X_test,
            'y': y_test
        }
    }
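
The interpreter stored above is the fitted sklearn LabelEncoder, so encoded predictions can be mapped back to the original class names. Below is a minimal usage sketch, assuming the function and its dependencies are importable; the instruction string and CSV path are hypothetical placeholders.

# Hypothetical call: the dataset path and instruction are placeholders.
results = decision_tree("predict survival", dataset="titanic.csv", max_depth=5)

clf = results["model"]              # fitted sklearn DecisionTreeClassifier
encoder = results["interpreter"]    # LabelEncoder fitted on the target column

# Predict on the held-out split stored in the returned dictionary, then map the
# encoded integers back to the original class labels.
encoded_preds = clf.predict(results["test_data"]["X"])
print(encoder.inverse_transform(encoded_preds))
print("held-out accuracy:", results["accuracy"]["accuracy_score"])
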
Example #20
0
def regression_ann(instruction,
                   ca_threshold=None,
                   text=None,
                   dataset=None,
                   drop=None,
                   preprocess=True,
                   test_size=0.2,
                   random_state=49,
                   epochs=50,
                   generate_plots=True,
                   callback_mode='min',
                   maximizer="val_loss",
                   save_model=True,
                   save_path=os.getcwd()):

    global currLog
    logger("reading in dataset...")

    dataReader = DataReader(dataset)
    data = dataReader.data_generator()
    # data = pd.read_csv(self.dataset)

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    data, y, target, full_pipeline = initial_preprocesser(
        data, instruction, preprocess, ca_threshold, text)
    logger("->", "Target Column Found: {}".format(target))

    X_train = data['train']
    X_test = data['test']

    # Target scaling
    target_scaler = StandardScaler()

    y_train = target_scaler.fit_transform(np.array(y['train']).reshape(-1, 1))
    y_test = target_scaler.transform(np.array(y['test']).reshape(-1, 1))

    logger("Establishing callback function...")

    models = []
    losses = []
    model_data = []

    # callback function to store lowest loss value
    es = EarlyStopping(monitor=maximizer,
                       mode=callback_mode,
                       verbose=0,
                       patience=5)

    i = 0

    # get the first 3 layer model
    model = get_keras_model_reg(data, i)

    logger("Training initial model...")
    history = model.fit(X_train,
                        y_train,
                        epochs=epochs,
                        validation_data=(X_test, y_test),
                        callbacks=[es],
                        verbose=0)
    models.append(history)
    model_data.append(model)

    col_name = [[
        "Initial number of layers ", "| Training Loss ", "| Test Loss "
    ]]
    col_width = max(len(word) for row in col_name for word in row) + 2
    for row in col_name:
        print((" " * 2 * counter) + "| " +
              ("".join(word.ljust(col_width) for word in row)) + " |")
    values = []
    values.append(str(len(model.layers)))
    values.append(
        "| " +
        str(history.history['loss'][len(history.history['val_loss']) - 1]))
    values.append(
        "| " +
        str(history.history['val_loss'][len(history.history['val_loss']) - 1]))
    datax = []
    datax.append(values)
    for row in datax:
        print((" " * 2 * counter) + "| " +
              ("".join(word.ljust(col_width) for word in row)) + " |")

    losses.append(history.history[maximizer][len(history.history[maximizer]) -
                                             1])

    # keeps running model and fit functions until the validation loss stops
    # decreasing
    logger("Testing number of layers...")
    print(currLog)
    col_name = [["Current number of layers", "| Training Loss", "| Test Loss"]]
    col_width = max(len(word) for row in col_name for word in row) + 2
    for row in col_name:
        print((" " * 2 * counter) + "| " +
              ("".join(word.ljust(col_width) for word in row)) + " |")
    datax = []
    while (all(x > y for x, y in zip(losses, losses[1:]))):
        model = get_keras_model_reg(data, i)
        history = model.fit(X_train,
                            y_train,
                            epochs=epochs,
                            validation_data=(X_test, y_test),
                            verbose=0)
        model_data.append(model)
        models.append(history)

        values = []
        datax = []
        values.append(str(len(model.layers)))
        values.append(
            "| " +
            str(history.history['loss'][len(history.history['val_loss']) - 1]))
        values.append("| " + str(history.history['val_loss'][
            len(history.history['val_loss']) - 1]))
        datax.append(values)
        for row in datax:
            print((" " * 2 * counter) + "| " +
                  ("".join(word.ljust(col_width) for word in row)) + " |")
        del values, datax
        losses.append(
            history.history[maximizer][len(history.history[maximizer]) - 1])
        i += 1
    #print((" " * 2 * counter)+ tabulate(datax, headers=col_name, tablefmt='orgtbl'))
    final_model = model_data[losses.index(min(losses))]
    final_hist = models[losses.index(min(losses))]
    print("")
    logger('->',
           "Best number of layers found: " + str(len(final_model.layers)))

    logger(
        '->', "Training Loss: " +
        str(final_hist.history['loss'][len(final_hist.history['val_loss']) -
                                       1]))
    logger(
        '->', "Test Loss: " +
        str(final_hist.history['val_loss'][len(final_hist.history['val_loss'])
                                           - 1]))

    # calls function to generate plots in plot generation
    plots = {}
    if generate_plots:
        init_plots, plot_names = generate_regression_plots(
            models[len(models) - 1], data, y)
        for x in range(len(plot_names)):
            plots[str(plot_names[x])] = init_plots[x]

    if save_model:
        save(final_model, save_model, save_path)
    # stores values in the client object models dictionary field
    print("")
    logger("Stored model under 'regression_ANN' key")
    return {
        'id': generate_id(),
        'model': final_model,
        "target": target,
        "plots": plots,
        "preprocesser": full_pipeline,
        "interpreter": target_scaler,
        'losses': {
            'training_loss': final_hist.history['loss'],
            'val_loss': final_hist.history['val_loss']
        }
    }
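
Because the target is standardized with a StandardScaler before training, predictions from the stored Keras model come back on the scaled axis. A hedged inference sketch follows; the file names are placeholders, and the new rows are assumed to contain the same feature columns (without the target) used during training.

# Hypothetical usage of the dictionary returned above.
reg = regression_ann("predict price", dataset="housing.csv")

new_rows = pd.read_csv("new_housing_rows.csv")       # placeholder file, no target column
X_new = reg["preprocesser"].transform(new_rows)       # reuse the fitted pipeline
scaled_preds = reg["model"].predict(X_new)

# Undo the StandardScaler that was fit on y during training.
preds = reg["interpreter"].inverse_transform(scaled_preds)
print(preds[:5])
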
Example #21
0
def k_means_clustering(dataset=None,
                       scatters=[],
                       clusters=None,
                       preprocess=True,
                       generate_plots=True,
                       drop=None,
                       base_clusters=1,
                       verbose=0,
                       n_init=10,
                       max_iter=300,
                       random_state=42,
                       text=[]):
    '''
    function to train a k means clustering algorithm
    :param many params: used to hyperparametrize the function.
    :return a dictionary object with all of the information for the algorithm.
    '''

    logger("Reading in dataset")

    dataReader = DataReader(dataset)
    data = dataReader.data_generator()

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    dataPandas = data.copy()

    full_pipeline = None
    if preprocess:
        logger("Preprocessing data")
        data, full_pipeline = clustering_preprocessor(data)
        data = np.array(data)

    modelStorage = []
    inertiaStor = []

    # processes dataset and runs KMeans algorithm on one cluster as
    # baseline
    if clusters is None:
        i = base_clusters
        logger("Creating unsupervised clustering task")
        kmeans = KMeans(n_clusters=i,
                        random_state=random_state,
                        verbose=verbose,
                        n_init=n_init,
                        max_iter=max_iter).fit(data)
        modelStorage.append(kmeans)
        # stores SSE values in an array for later comparison
        inertiaStor.append(kmeans.inertia_)

        logger("Identifying best centroid count and optimizing accuracy")

        col_name = [["Number of clusters   ", "| Inertia  "]]
        col_width = max(len(word) for row in col_name for word in row) + 2
        printtable(col_name, col_width)
        values = []
        values.append(str(i))
        values.append("| " + str(inertiaStor[i - base_clusters]))
        datax = []
        datax.append(values)
        printtable(datax, col_width)

        i += 1

        # continues to increase cluster size until SSE values don't decrease by
        # 1000 - this value was decided based on precedence
        while (all(earlier >= later
                   for earlier, later in zip(inertiaStor, inertiaStor[1:]))):
            kmeans = KMeans(n_clusters=i,
                            random_state=random_state,
                            verbose=verbose,
                            n_init=n_init,
                            max_iter=max_iter).fit(data)
            modelStorage.append(kmeans)
            inertiaStor.append(kmeans.inertia_)

            values = []
            values.append(str(i))
            values.append("| " + str(inertiaStor[i - base_clusters]))
            datax = []
            datax.append(values)
            printtable(datax, col_width)

            # minimize inertia up to 10000
            i += 1

            # checks to see if it should continue to run; need to improve this
            # algorithm
            if i > 3 and inertiaStor[len(inertiaStor) -
                                     2] - 1000 <= inertiaStor[len(inertiaStor)
                                                              - 1]:
                print()
                break

        # generates the clustering plots appropriately
        logger("->", "Optimal number of clusters found: {}".format(i))
        logger("->",
               "Final inertia of {}".format(inertiaStor[len(inertiaStor) - 1]))
    else:
        kmeans = KMeans(n_clusters=clusters,
                        random_state=random_state,
                        verbose=verbose,
                        n_init=n_init,
                        max_iter=max_iter).fit(data)

    plots = {}
    if generate_plots:
        if clusters is None:
            logger("Generating plots and storing in model")
            init_plots, plot_names, elbow = generate_clustering_plots(
                modelStorage[len(modelStorage) - 1], dataPandas, data,
                scatters, inertiaStor, base_clusters)
            for x in range(len(plot_names)):
                plots[str(plot_names[x])] = init_plots[x]
            plots['elbow'] = elbow

    logger("Stored model under 'k_means_clustering' key")
    clearLog()
    # stores plots and information in the dictionary client model
    return {
        'id': generate_id(),
        "model": (modelStorage[len(modelStorage) - 1]
                  if clusters is None else kmeans),
        "preprocesser": full_pipeline,
        "plots": plots
    }
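
The returned dictionary keeps both the final KMeans model and the clustering preprocessing pipeline, so new rows can be assigned to clusters with the same transformations. A short hedged sketch; the file names are placeholders.

clustering = k_means_clustering(dataset="customers.csv")     # placeholder file

new_rows = pd.read_csv("new_customers.csv")                   # placeholder file
X_new = clustering["preprocesser"].transform(new_rows)        # same pipeline as training
labels = clustering["model"].predict(np.array(X_new))         # cluster index per row
print(labels)
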
Example #22
0
def tune_helper(model_to_tune=None,
                dataset=None,
                models=None,
                max_layers=10,
                min_layers=2,
                min_dense=32,
                max_dense=512,
                executions_per_trial=3,
                max_trials=1,
                activation='relu',
                loss='categorical_crossentropy',
                metrics='accuracy',
                seed=42,
                objective='val_accuracy',
                generate_plots=True,
                directory='my_dir',
                epochs=10,
                step=32,
                patience=1,
                verbose=0,
                test_size=0.2):
    '''
    Helper function that calls the appropriate tuning function
    :param model_to_tune: key of the trained model (in the models dictionary) whose hyperparameters should be tuned
    :return the updated models dictionary
    '''
    print("")
    logger("Getting target model for tuning...")

    # checks to see which requested model is in the self.models

    # processing for regression feed forward NN
    if model_to_tune == 'regression_ANN':
        logger("Reading in data")
        logger("Tuning model hyperparameters...")
        dataReader = DataReader(dataset)
        data = dataReader.data_generator()
        target = models['regression_ANN']['target']
        target_column = data[models['regression_ANN']['target']]
        data = models['regression_ANN']['preprocesser'].transform(
            data.drop(target, axis=1))
        returned_model, returned_pms, history, X_test, y_test = tuneReg(
            data.values,
            target_column.values,
            max_layers=max_layers,
            min_layers=min_layers,
            min_dense=min_dense,
            max_dense=max_dense,
            executions_per_trial=executions_per_trial,
            max_trials=max_trials,
            epochs=epochs,
            activation=activation,
            step=step,
            directory=directory,
            verbose=verbose,
            test_size=test_size)
        plots = {}
        logger("->",
               'Best Hyperparameters Found: {}'.format(returned_pms.values))
        if generate_plots:
            logger("Generating updated plots")
            init_plots, plot_names = generate_regression_plots(
                history, data, target_column)
            for x in range(len(plot_names)):
                plots[str(plot_names[x])] = init_plots[x]

        models['regression_ANN'] = {
            'id': models['regression_ANN']['id'],
            'model': returned_model,
            'target': target,
            "plots": plots,
            'preprocesser': models['regression_ANN']['preprocesser'],
            'interpreter': models['regression_ANN']['interpreter'],
            'test_data': {
                'X': X_test,
                'y': y_test
            },
            'hyperparameters': returned_pms.values,
            'losses': {
                'training_loss': history.history['loss'],
                'val_loss': history.history['val_loss']
            }
        }
        logger("Re-stored model under 'regression_ANN' key")

        # processing for classification feed forward NN
    elif model_to_tune == "classification_ANN":
        logger("Reading in data")
        logger("Tuning model hyperparameters...")
        dataReader = DataReader(dataset)
        data = dataReader.data_generator()
        target = models['classification_ANN']['target']
        target_column = data[models['classification_ANN']['target']]
        data = models['classification_ANN']['preprocesser'].transform(
            data.drop(target, axis=1))
        returned_model, returned_pms, history, X_test, y_test = tuneClass(
            data,
            target_column,
            models['classification_ANN']['num_classes'],
            max_layers=max_layers,
            min_layers=min_layers,
            min_dense=min_dense,
            max_dense=max_dense,
            executions_per_trial=executions_per_trial,
            max_trials=max_trials,
            activation=activation,
            loss=loss,
            directory=directory,
            metrics=metrics,
            epochs=epochs,
            step=step,
            verbose=verbose,
            test_size=test_size)
        plots = {}
        logger("->",
               'Best Hyperparameters Found: {}'.format(returned_pms.values))
        if generate_plots:
            logger("Generating updated plots")
            plots = generate_classification_plots(history, data, target_column,
                                                  returned_model, X_test,
                                                  y_test)

        logger("Re-stored model under 'classification_ANN' key")
        models['classification_ANN'] = {
            'id': models['classification_ANN']['id'],
            'model': returned_model,
            'hyperparameters': returned_pms.values,
            'plots': plots,
            'preprocesser': models['classification_ANN']['preprocesser'],
            'interpreter': models['classification_ANN']['interpreter'],
            'test_data': {
                'X': X_test,
                'y': y_test
            },
            'target': target,
            'losses': {
                'training_loss': history.history['loss'],
                'val_loss': history.history['val_loss']
            },
            'accuracy': {
                'training_accuracy': history.history['accuracy'],
                'validation_accuracy': history.history['val_accuracy']
            }
        }

    elif model_to_tune == "convolutional_NN":
        logger("Tuning model hyperparameters...")
        X_train, X_test, height, width, num_classes = get_image_data(models)
        logger('Located image data')
        model, returned_pms, history = tuneCNN(
            X_train,
            X_test,
            height,
            width,
            num_classes,
            executions_per_trial=executions_per_trial,
            max_trials=max_trials,
            seed=seed,
            objective=objective,
            directory=directory,
            patience=patience,
            epochs=epochs,
            verbose=verbose,
            test_size=test_size)
        logger("->", "Optimal image size identified: {}".format(
            (height, width, 3)))
        logger('Packaging HyperModel')
        logger("->",
               'Best Hyperparameters Found: {}'.format(returned_pms.values))
        logger("Re-stored model under 'convolutional_NN' key")

        models['convolutional_NN'] = {
            'id': models['convolutional_NN']['id'],
            'data_type': models['convolutional_NN']['data_type'],
            'data_path': models['convolutional_NN']['data_path'],
            'data': {
                'train': X_train,
                'test': X_test
            },
            'shape': models['convolutional_NN']['shape'],
            'model': model,
            'num_classes': models['convolutional_NN']['num_classes'],
            'data_sizes': models['convolutional_NN']['data_sizes'],
            'losses': {
                'training_loss': history.history['loss'],
                'val_loss': history.history['val_loss']
            },
            'accuracy': {
                'training_accuracy': history.history['accuracy'],
                'validation_accuracy': history.history['val_accuracy']
            }
        }
    clearLog()
    return models
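
tune_helper expects the same models dictionary that the training functions populate, keyed by model name. A hedged sketch of re-tuning a previously trained regression_ANN entry; the dataset path and search budgets are placeholders.

# Train once, then hand the resulting entry back to the tuner.
models = {"regression_ANN": regression_ann("predict price", dataset="housing.csv")}

models = tune_helper(model_to_tune="regression_ANN",
                     dataset="housing.csv",     # same placeholder file
                     models=models,
                     max_trials=3,
                     epochs=5)
print(models["regression_ANN"]["hyperparameters"])
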
Example #23
0
def regression_ann(instruction,
                   callback=False,
                   ca_threshold=None,
                   text=[],
                   dataset=None,
                   drop=None,
                   preprocess=True,
                   test_size=0.2,
                   random_state=49,
                   epochs=50,
                   generate_plots=True,
                   callback_mode='min',
                   maximizer="val_loss",
                   save_model=False,
                   save_path=os.getcwd(),
                   add_layer={}):
    '''
    Body of the regression function used that is called in the neural network query
    if the data is numerical.
    :param many parameters: used for preprocessing, tuning, plot generation, and parameterizing the neural network trained.
    :return dictionary that holds all the information for the finished model.
    '''

    if dataset is None:
        dataReader = DataReader(get_file())
    else:
        dataReader = DataReader(dataset)
    logger("Reading in dataset")
    data = dataReader.data_generator()
    # data = pd.read_csv(self.dataset)

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)
    data, y, target, full_pipeline = initial_preprocessor(
        data,
        instruction,
        preprocess,
        ca_threshold,
        text,
        test_size=test_size,
        random_state=random_state)
    logger("->", "Target column found: {}".format(target))

    X_train = data['train']
    X_test = data['test']

    # Target scaling
    target_scaler = StandardScaler()

    y_train = target_scaler.fit_transform(np.array(y['train']).reshape(-1, 1))
    y_test = target_scaler.transform(np.array(y['test']).reshape(-1, 1))

    logger("Establishing callback function")

    models = []
    losses = []
    model_data = []

    # callback function to store lowest loss value
    es = EarlyStopping(monitor=maximizer,
                       mode=callback_mode,
                       verbose=0,
                       patience=5)

    callback_value = None
    if callback is not False:
        callback_value = [es]

    i = 0

    # add_layer format: {<object> : list of indices}
    # get the first 3 layer model
    model = get_keras_model_reg(data, i, add_layer)

    logger("Training initial model")
    history = model.fit(X_train,
                        y_train,
                        epochs=epochs,
                        validation_data=(X_test, y_test),
                        callbacks=callback_value,
                        verbose=0)
    models.append(history)
    model_data.append(model)

    col_name = [[
        "Initial number of layers ", "| Training Loss ", "| Test Loss "
    ]]
    col_width = max(len(word) for row in col_name for word in row) + 2
    for row in col_name:
        print((" " * 2 * counter) + "| " +
              ("".join(word.ljust(col_width) for word in row)) + " |")
    values = []
    values.append(str(len(model.layers)))
    values.append(
        "| " +
        str(history.history['loss'][len(history.history['val_loss']) - 1]))
    values.append(
        "| " +
        str(history.history['val_loss'][len(history.history['val_loss']) - 1]))
    datax = []
    datax.append(values)
    for row in datax:
        print((" " * 2 * counter) + "| " +
              ("".join(word.ljust(col_width) for word in row)) + " |")

    losses.append(history.history[maximizer][len(history.history[maximizer]) -
                                             1])

    # keeps running model and fit functions until the validation loss stops
    # decreasing
    logger("Testing number of layers")
    col_name = [["Current number of layers", "| Training Loss", "| Test Loss"]]
    col_width = max(len(word) for row in col_name for word in row) + 2
    for row in col_name:
        print((" " * 2 * counter) + "| " +
              ("".join(word.ljust(col_width) for word in row)) + " |")
    datax = []
    # while all(x > y for x, y in zip(losses, losses[1:])):
    while (len(losses) <= 2
           or losses[len(losses) - 1] < losses[len(losses) - 2]):
        model = get_keras_model_reg(data, i, add_layer)
        history = model.fit(X_train,
                            y_train,
                            callbacks=callback_value,
                            epochs=epochs,
                            validation_data=(X_test, y_test),
                            verbose=0)
        model_data.append(model)
        models.append(history)

        values = []
        datax = []
        values.append(str(len(model.layers)))
        values.append(
            "| " +
            str(history.history['loss'][len(history.history['val_loss']) - 1]))
        values.append("| " + str(history.history['val_loss'][
            len(history.history['val_loss']) - 1]))
        datax.append(values)
        for row in datax:
            print((" " * 2 * counter) + "| " +
                  ("".join(word.ljust(col_width) for word in row)) + " |")
        del values, datax
        losses.append(
            history.history[maximizer][len(history.history[maximizer]) - 1])
        i += 1
    # print((" " * 2 * counter)+ tabulate(datax, headers=col_name, tablefmt='orgtbl'))
    final_model = model_data[losses.index(min(losses))]
    final_hist = models[losses.index(min(losses))]
    print("")
    logger('->',
           "Best number of layers found: " + str(len(final_model.layers)))

    logger(
        '->', "Training Loss: " +
        str(final_hist.history['loss'][len(final_hist.history['val_loss']) -
                                       1]))
    logger(
        '->', "Test Loss: " +
        str(final_hist.history['val_loss'][len(final_hist.history['val_loss'])
                                           - 1]))

    # calls function to generate plots in plot generation
    plots = {}
    if generate_plots:
        init_plots, plot_names = generate_regression_plots(
            models[len(models) - 1], data, y)
        for x in range(len(plot_names)):
            plots[str(plot_names[x])] = init_plots[x]

    if save_model:
        save(final_model, save_model, save_path)
    # stores values in the client object models dictionary field
    print("")
    logger("Stored model under 'regression_ANN' key")
    clearLog()

    K.clear_session()

    return {
        'id': generate_id(),
        'model': final_model,
        "target": target,
        "num_classes": 1,
        "plots": plots,
        "preprocessor": full_pipeline,
        "interpreter": target_scaler,
        'test_data': {
            'X': X_test,
            'y': y_test
        },
        'losses': {
            'training_loss': final_hist.history['loss'],
            'val_loss': final_hist.history['val_loss']
        }
    }
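
This later revision adds a callback switch, an add_layer hook, and a stored held-out split. A hedged call sketch; the instruction, file path, and save directory are placeholders.

reg = regression_ann("predict price",
                     dataset="housing.csv",       # placeholder file
                     callback=True,                # enables the EarlyStopping callback above
                     epochs=30,
                     save_model=True,
                     save_path="./saved_models/")  # placeholder directory

X_test, y_test = reg["test_data"]["X"], reg["test_data"]["y"]
print("held-out loss:", reg["model"].evaluate(X_test, y_test, verbose=0))
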
Example #24
0
def image_caption_query(self,
                        instruction,
                        label_column=None,
                        drop=None,
                        epochs=10,
                        preprocess=True,
                        random_state=49,
                        test_size=0.2,
                        top_k=5000,
                        batch_size=32,
                        buffer_size=1000,
                        embedding_dim=256,
                        units=512,
                        gpu=False,
                        generate_plots=True,
                        save_model_decoder=False,
                        save_path_decoder=os.getcwd(),
                        save_model_encoder=False,
                        save_path_encoder=os.getcwd()):
    '''
    function to apply predictive algorithm for image_caption generation
    :param many params: used to hyperparametrize the function.
    :return a dictionary object with all of the information for the algorithm.
    '''

    if test_size < 0:
        raise Exception("Test size must be a float between 0 and 1")

    if test_size >= 1:
        raise Exception(
            "Test size must be a float between 0 and 1 (a test size greater than or equal to 1 results in no training "
            "data)")

    if top_k < 1:
        raise Exception("Top_k value must be equal to or greater than 1")

    if batch_size < 1:
        raise Exception("Batch size must be equal to or greater than 1")

    if buffer_size < 1:
        raise Exception("Buffer size must be equal to or greater than 1")

    if embedding_dim < 1:
        raise Exception(
            "Embedding dimension must be equal to or greater than 1")

    if units < 1:
        raise Exception("Units must be equal to or greater than 1")

    if epochs < 1:
        raise Exception(
            "Epoch number is less than 1 (model will not be trained)")

    if save_model_decoder:
        if not os.path.exists(save_path_decoder):
            raise Exception("Decoder save path does not exists")

    if save_model_encoder:
        if not os.path.exists(save_path_encoder):
            raise Exception("Encoder save path does not exists")

    if test_size == 0:
        testing = False
    else:
        testing = True

    if gpu:
        if tf.test.gpu_device_name():
            print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
        else:
            raise Exception("Please install GPU version of Tensorflow")

        device = '/device:GPU:0'
    else:
        device = '/device:CPU:0'

    np.random.seed(random_state)
    tf.random.set_seed(random_state)

    data = DataReader(self.dataset)
    df = data.data_generator()

    if preprocess:
        df.fillna(0, inplace=True)
    if drop is not None:
        df.drop(drop, axis=1, inplace=True)

    logger("Preprocessing data")

    train_captions = []
    img_name_vector = []

    if label_column is None:
        label = instruction
    else:
        label = label_column

    x = get_path_column(df)
    y = get_similar_column(get_value_instruction(label), df)
    logger("->", "Target Column Found: {}".format(y))

    for row in df.iterrows():
        if preprocess:
            caption = '<start> ' + row[1][y] + ' <end>'
        else:
            caption = row[1][y]
        image_id = row[1][x]
        image_path = image_id

        img_name_vector.append(image_path)
        train_captions.append(caption)

    image_model = tf.keras.applications.InceptionV3(include_top=False,
                                                    weights='imagenet')
    new_input = image_model.input
    hidden_layer = image_model.layers[-1].output
    logger("Extracting features from model")
    image_features_extract_model = tf.keras.Model(new_input, hidden_layer)

    image_dataset = tf.data.Dataset.from_tensor_slices(
        sorted(set(img_name_vector)))
    image_dataset = image_dataset.map(
        load_image, num_parallel_calls=tf.data.experimental.AUTOTUNE).batch(16)

    for img, path in image_dataset:
        batch_features = image_features_extract_model(img)
        batch_features = tf.reshape(
            batch_features,
            (batch_features.shape[0], -1, batch_features.shape[3]))

        for bf, p in zip(batch_features, path):
            path_of_feature = p.numpy().decode("utf-8")
            np.save(path_of_feature, bf.numpy())
    logger("->", "Tokenizing top {} words".format(top_k))
    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=top_k,
        oov_token="<unk>",
        filters='!"#$%&()*+.,-/:;=?@[\]^_`{|}~ ')
    tokenizer.fit_on_texts(train_captions)
    tokenizer.word_index['<pad>'] = 0
    tokenizer.index_word[0] = '<pad>'
    train_seqs = tokenizer.texts_to_sequences(train_captions)
    cap_vector = tf.keras.preprocessing.sequence.pad_sequences(train_seqs,
                                                               padding='post')

    vocab_size = top_k + 1
    # num_steps = len(img_name_vector) // batch_size

    if testing:
        img_name_train, img_name_val, cap_train, cap_val = train_test_split(
            img_name_vector, cap_vector, test_size=test_size, random_state=0)
    else:
        img_name_train = img_name_vector
        cap_train = cap_vector

    dataset = tf.data.Dataset.from_tensor_slices((img_name_train, cap_train))

    dataset = dataset.map(lambda item1, item2: tf.numpy_function(
        map_func, [item1, item2], [tf.float32, tf.int32]),
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)

    # Shuffle and batch
    logger("Shuffling dataset")
    dataset = dataset.shuffle(buffer_size).batch(batch_size)
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

    if testing:
        dataset_val = tf.data.Dataset.from_tensor_slices(
            (img_name_val, cap_val))

        dataset_val = dataset_val.map(
            lambda item1, item2: tf.numpy_function(map_func, [item1, item2],
                                                   [tf.float32, tf.int32]),
            num_parallel_calls=tf.data.experimental.AUTOTUNE)

        # Shuffle and batch
        dataset_val = dataset_val.shuffle(buffer_size).batch(batch_size)
        dataset_val = dataset_val.prefetch(
            buffer_size=tf.data.experimental.AUTOTUNE)

    logger("Establishing encoder decoder framework")
    encoder = CNN_Encoder(embedding_dim)
    decoder = RNN_Decoder(embedding_dim, units, vocab_size)

    optimizer = tf.keras.optimizers.Adam()
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')

    def loss_function(real, pred):
        mask = tf.math.logical_not(tf.math.equal(real, 0))
        loss_ = loss_object(real, pred)

        mask = tf.cast(mask, dtype=loss_.dtype)
        loss_ *= mask

        return tf.reduce_mean(loss_)

    @tf.function
    def train_step(img_tensor, target):
        with tf.device(device):
            loss = 0

            # initializing the hidden state for each batch
            # because the captions are not related from image to image
            hidden = decoder.reset_state(batch_size=target.shape[0])

            dec_input = tf.expand_dims([tokenizer.word_index['<start>']] *
                                       target.shape[0], 1)

            with tf.GradientTape() as tape:
                features = encoder(img_tensor)

                for i in range(1, target.shape[1]):
                    # passing the features through the decoder
                    predictions, hidden, _ = decoder(dec_input, features,
                                                     hidden)

                    loss += loss_function(target[:, i], predictions)

                    # using teacher forcing
                    dec_input = tf.expand_dims(target[:, i], 1)

            total_loss = (loss / int(target.shape[1]))

            trainable_variables = encoder.trainable_variables + decoder.trainable_variables

            gradients = tape.gradient(loss, trainable_variables)

            optimizer.apply_gradients(zip(gradients, trainable_variables))

            return loss, total_loss

    @tf.function
    def val_step(img_tensor, target):
        with tf.device(device):
            loss = 0

            # initializing the hidden state for each batch
            # because the captions are not related from image to image
            hidden = decoder.reset_state(batch_size=target.shape[0])

            dec_input = tf.expand_dims([tokenizer.word_index['<start>']] *
                                       target.shape[0], 1)

            with tf.GradientTape() as tape:
                features = encoder(img_tensor)

                for i in range(1, target.shape[1]):
                    # passing the features through the decoder
                    predictions, hidden, _ = decoder(dec_input, features,
                                                     hidden)

                    loss += loss_function(target[:, i], predictions)

                    # using teacher forcing
                    dec_input = tf.expand_dims(target[:, i], 1)

            total_loss = (loss / int(target.shape[1]))
            return total_loss

    logger("Training model...")
    with tf.device(device):
        loss_plot_train = []
        loss_plot_val = []
        for epoch in range(epochs):
            total_loss = 0
            total_loss_val = 0

            for (batch, (img_tensor, target)) in enumerate(dataset):
                batch_loss, t_loss = train_step(img_tensor, target)
                total_loss += t_loss

            loss_plot_train.append(total_loss.numpy())

            if testing:
                for (batch, (img_tensor, target)) in enumerate(dataset_val):
                    t_loss = val_step(img_tensor, target)
                    total_loss_val += t_loss

                loss_plot_val.append(total_loss_val.numpy())

    dir_name = os.path.dirname(img_name_vector[0])
    files = os.listdir(dir_name)

    for item in files:
        if item.endswith(".npy"):
            os.remove(os.path.join(dir_name, item))

    plots = {}
    if generate_plots:
        logger("Generating plots")
        plots.update({
            "loss":
            libra.plotting.nonkeras_generate_plots.plot_loss(
                loss_plot_train, loss_plot_val)
        })

    logger("->", "Final training loss: {}".format(str(total_loss.numpy())))
    total_loss = total_loss.numpy()
    if testing:
        total_loss_val = total_loss_val.numpy()
        total_loss_val_str = str(total_loss_val)
    else:
        total_loss_val = 0
        total_loss_val_str = str("0, No validation done")

    logger("->", "Final validation loss: {}".format(total_loss_val_str))

    if save_model_decoder:
        logger("Saving decoder checkpoint...")
        decoder.save_weights(save_path_decoder + "decoderImgCap.ckpt")

    if save_model_encoder:
        logger("Saving encoder checkpoint...")
        encoder.save_weights(save_path_encoder + "encoderImgCap.ckpt")

    logger("Storing information in client object under key 'image_caption'")

    self.models["image_caption"] = {
        "decoder": decoder,
        "encoder": encoder,
        "tokenizer": tokenizer,
        "feature_extraction": image_features_extract_model,
        "plots": plots,
        'losses': {
            'Training loss': total_loss,
            'Validation loss': total_loss_val
        }
    }
    clearLog()
    return self.models["image_caption"]
Example #25
0
def dimensionality_RF(instruction, dataset, target="", y="", n_features=10):
    '''
    function to reduce dimensionality in dataset via random forest method
    :param instruction: command sent to client instance in written query.
    :param dataset: data instantiated in client instance passed to the algorithm
    :param target: column name of response variable/feature
    :param y: dictionary of train/test data values associated with response variable/feature
    :param n_features: maximum number of features to choose to analyze/select
    '''
    
    global counter

    dataReader = DataReader("./data/" + get_last_file()[0])

    if target == "":
        data = dataReader.data_generator()
        data.fillna(0, inplace=True)
        remove = get_similar_column(get_value_instruction(instruction), data)
        data = structured_preprocesser(data)

        y = data[remove]
        del data[remove]
        le = preprocessing.LabelEncoder()
        y = le.fit_transform(y)

    X_train, X_test, y_train, y_test = train_test_split(
        dataset, y, test_size=0.2, random_state=49)
    first_classifier = tree.DecisionTreeClassifier()
    first_classifier.fit(X_train, y_train)

    first_classifier_acc = accuracy_score(
        first_classifier.predict(X_test), y_test)

    accuracy_scores = [first_classifier_acc]
    columns = []
    datas = []
    datas.append(dataset)
    columns.append([])

    for i, x in product(range(3, 10), range(4, len(dataset.columns))):
        feature_model = RandomForestRegressor(random_state=1, max_depth=i)
        feature_model.fit(X_train, y_train)

        importances = feature_model.feature_importances_
        indices = np.argsort(importances)[-x:]
        columns.append(dataset.columns[indices])

        X_temp_train = X_train[dataset.columns[indices]]
        X_temp_test = X_test[dataset.columns[indices]]

        val = pd.DataFrame(np.r_[X_temp_train, X_temp_test])
        val[target] = np.r_[y_train, y_test]
        datas.append(val)

        vr = tree.DecisionTreeClassifier()
        vr.fit(X_temp_train, y_train)

        accuracy_scores.append(accuracy_score(vr.predict(X_temp_test), y_test))

    the_index = accuracy_scores.index(max(accuracy_scores))

    return datas[the_index], accuracy_scores[0], max(
        accuracy_scores), list(columns[the_index])
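
Unlike the query functions, dimensionality_RF returns a plain four-tuple. A hedged sketch of the call pattern used by dimensionality_reduc below; the instruction, preprocessed frame, and encoded target are placeholders.

# preprocessed_df and encoded_target are hypothetical, already-prepared inputs.
reduced_df, baseline_acc, best_acc, kept_columns = dimensionality_RF(
    "predict survival", preprocessed_df, target="Survived", y=encoded_target)
print("baseline accuracy:", baseline_acc, "best accuracy:", best_acc)
print("columns kept:", kept_columns)
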
Example #26
0
def k_means_clustering(dataset=None,
                       preprocess=True,
                       generate_plots=True,
                       drop=None,
                       base_clusters=1):
    logger("Reading dataset...")
    # loads dataset and replaces n/a with zero
    # data = pd.read_csv(self.dataset)

    dataReader = DataReader(dataset)
    data = dataReader.data_generator()

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    dataPandas = data.copy()

    full_pipeline = None
    if preprocess:
        logger("Preprocessing data...")
        data, full_pipeline = clustering_preprocessor(data)
        data = np.array(data)

    modelStorage = []
    inertiaStor = []

    # processes dataset and runs KMeans algorithm on one cluster as
    # baseline
    i = base_clusters
    logger("Creating unsupervised clustering task...")
    kmeans = KMeans(n_clusters=i, random_state=0).fit(data)
    modelStorage.append(kmeans)

    # stores SSE values in an array for later comparison
    inertiaStor.append(kmeans.inertia_)
    i += 1

    logger("Identifying best centroid count and optimizing accuracy")
    # continues to increase cluster size until SSE values don't decrease by
    # 1000 - this value was decided based on precedence
    while (all(earlier >= later
               for earlier, later in zip(inertiaStor, inertiaStor[1:]))):
        kmeans = KMeans(n_clusters=i, random_state=0).fit(data)
        modelStorage.append(kmeans)
        inertiaStor.append(kmeans.inertia_)
        # minimize inertia up to 10000
        i += 1

        # checks to see if it should continue to run; need to improve this
        # algorithm
        if i > 3 and inertiaStor[len(inertiaStor) - 2] - 1000 <= inertiaStor[
                len(inertiaStor) - 1]:
            break
    # generates the clustering plots appropriately
    logger("->", "Optimal number of clusters found: {}".format(i))

    plots = {}
    if generate_plots:
        logger("Generating plots and storing in model")
        init_plots, plot_names = generate_clustering_plots(
            modelStorage[len(modelStorage) - 1], dataPandas, data)

        for x in range(len(plot_names)):
            plots[str(plot_names[x])] = init_plots[x]

    logger("Stored model under 'k_means_clustering' key")

    # stores plots and information in the dictionary client model
    return {
        'id': generate_id(),
        "model": modelStorage[len(modelStorage) - 1],
        "preprocesser": full_pipeline,
        "plots": plots
    }
    clearLog()
Example #27
0
def summarization_query(self,
                        instruction,
                        preprocess=True,
                        label_column=None,
                        drop=None,
                        epochs=5,
                        batch_size=32,
                        learning_rate=3e-5,
                        max_text_length=512,
                        gpu=False,
                        test_size=0.2,
                        random_state=49,
                        generate_plots=True,
                        save_model=False,
                        save_path=os.getcwd()):
    '''
    function to apply algorithm for text summarization
    :param many params: used to hyperparametrize the function.
    :return a dictionary object with all of the information for the algorithm.
    '''

    if test_size < 0:
        raise Exception("Test size must be a float between 0 and 1")

    if test_size >= 1:
        raise Exception(
            "Test size must be a float between 0 and 1 (a test size greater than or equal to 1 results in no training "
            "data)")

    if max_text_length < 2:
        raise Exception("Text and summary must be at least of length 2")

    if epochs < 1:
        raise Exception(
            "Epoch number is less than 1 (model will not be trained)")

    if batch_size < 1:
        raise Exception("Batch size must be equal to or greater than 1")

    if max_text_length < 1:
        raise Exception("Max text length must be equal to or greater than 1")

    if save_model:
        if not os.path.exists(save_path):
            raise Exception("Save path does not exist")

    if test_size == 0:
        testing = False
    else:
        testing = True

    if gpu:
        if tf.test.gpu_device_name():
            print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
        else:
            raise Exception("Please install GPU version of Tensorflow")

        device = '/device:GPU:0'
    else:
        device = '/device:CPU:0'

    tf.random.set_seed(random_state)
    np.random.seed(random_state)

    data = DataReader(self.dataset)
    data = data.data_generator()

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    if preprocess:
        data.fillna(0, inplace=True)

    logger("Preprocessing data...")

    if label_column is None:
        label = "summary"
    else:
        label = label_column

    tokenizer = T5Tokenizer.from_pretrained("t5-small")
    # Find target columns
    X, Y, target = get_target_values(data, instruction, label)
    logger("->", "Target Column Found: {}".format(target))
    logger("Establishing dataset walkers")

    # Clean up text
    if preprocess:
        logger("Preprocessing data")
        X = add_prefix(lemmatize_text(text_clean_up(X.array)), "summarize: ")
        Y = add_prefix(lemmatize_text(text_clean_up(Y.array)), "summarize: ")

    # tokenize text/summaries
    X = tokenize_for_input_ids(X, tokenizer, max_text_length)
    Y = tokenize_for_input_ids(Y, tokenizer, max_text_length)

    logger('Fine-Tuning the model on your dataset...')

    # Suppress unnecessary output
    with NoStdStreams():
        model = TFT5ForConditionalGeneration.from_pretrained(
            "t5-small", output_loading_info=False)

    if testing:
        X_train, X_test, y_train, y_test = train_test_split(
            X, Y, test_size=test_size, random_state=random_state)
        test_dataset = tf.data.Dataset.from_tensor_slices(
            (X_test, y_test)).shuffle(10000).batch(batch_size)
    else:
        X_train = X
        y_train = Y
    train_dataset = tf.data.Dataset.from_tensor_slices(
        (X_train, y_train)).shuffle(10000).batch(batch_size)

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    total_training_loss = []
    total_validation_loss = []

    # Training Loop
    with tf.device(device):
        for epoch in range(epochs):
            total_loss = 0
            total_loss_val = 0
            for data, truth in train_dataset:
                with tf.GradientTape() as tape:
                    out = model(inputs=data, decoder_input_ids=data)
                    loss_value = loss(truth, out[0])
                    total_loss += loss_value
                    grads = tape.gradient(loss_value, model.trainable_weights)
                    optimizer.apply_gradients(
                        zip(grads, model.trainable_weights))

            total_training_loss.append(total_loss)

            # Validation Loop
            if testing:
                for data, truth in test_dataset:
                    logits = model(inputs=data,
                                   decoder_input_ids=data,
                                   training=False)
                    val_loss = loss(truth, logits[0])
                    total_loss_val += val_loss

                total_validation_loss.append(total_loss_val)

    logger(
        "->", "Final training loss: {}".format(
            str(total_training_loss[len(total_training_loss) - 1].numpy())))

    if testing:
        total_loss_val_str = str(
            total_validation_loss[len(total_validation_loss) - 1].numpy())
    else:
        total_loss_val = [0]
        total_loss_val_str = str("0, No validation done")

    logger("->", "Final validation loss: {}".format(total_loss_val_str))

    if testing:
        losses = {
            "Training loss":
            total_training_loss[len(total_training_loss) - 1].numpy(),
            "Validation loss":
            total_validation_loss[len(total_validation_loss) - 1].numpy()
        }
    else:
        losses = {
            "Training loss":
            total_training_loss[len(total_training_loss) - 1].numpy()
        }

    plots = None
    if generate_plots:
        logger("Generating plots")
        plots = {
            "loss":
            libra.plotting.nonkeras_generate_plots.plot_loss(
                total_training_loss, total_validation_loss)
        }

    if save_model:
        logger("Saving model")
        model.save_weights(save_path + "summarization_checkpoint.ckpt")

    logger("Storing information in client object under key 'summarization'")

    self.models["summarization"] = {
        "model": model,
        "max_text_length": max_text_length,
        "plots": plots,
        "tokenizer": tokenizer,
        'losses': losses
    }

    clearLog()
    return self.models["summarization"]
Example #28
0
def dimensionality_reduc(
        instruction,
        dataset,
        arr=[
            "RF",
            "PCA",
            "KPCA",
            "ICA"],
        inplace=False):
    '''
    function to perform dimensionality reduction on the dataset (retrieve only 
    features with most relevance from multidimensional space of the dataset)
    :param instruction: command sent to client instance in written query
    :param dataset: data instantiated in client instance passed to the algorithm
    :param arr: list of dimensionality reduction techniques to choose from
    :param inplace: option to keep features that were deemed as not important
    intact in the dataset
    '''
    
    global counter
    
    dataReader = DataReader(dataset)

    logger("loading dataset...")
    data = dataReader.data_generator()
    data.fillna(0, inplace=True)

    logger("getting most similar column from instruction...")
    target = get_similar_column(get_value_instruction(instruction), data)

    y = data[target]
    del data[target]
    le = preprocessing.LabelEncoder()
    y = le.fit_transform(y)

    data = structured_preprocesser(data)

    perms = []
    overall_storage = []
    finals = []

    logger("generating dimensionality permutations...")
    for i in range(1, len(arr) + 1):
        for elem in list(permutations(arr, i)):
            perms.append(elem)

    logger("running each possible permutation...")
    logger("realigning tensors...")
    for path in perms:
        currSet = data
        for element in path:
            if element == "RF":
                data_mod, beg_acc, final_acc, col_removed = dimensionality_RF(
                    instruction, currSet, target, y)
            elif element == "PCA":
                data_mod, beg_acc, final_acc, col_removed = dimensionality_PCA(
                    instruction, currSet, target, y)
            elif element == "KPCA":
                data_mod, beg_acc, final_acc, col_removed = dimensionality_KPCA(
                    instruction, currSet, target, y)
            elif element == "ICA":
                data_mod, beg_acc, final_acc, col_removed = dimensionality_ICA(
                    instruction, currSet, target, y)
            overall_storage.append(
                list([data_mod, beg_acc, final_acc, col_removed]))
            currSet = data_mod
        finals.append(overall_storage[len(overall_storage) - 1])

    logger("Fetching Best Accuracies...")
    accs = []
    logger("->", "Baseline Accuracy: " + str(finals[0][1]))
    # print("----------------------------")
    col_name = [["Permutation ", "| Final Accuracy "]]
    printtable(col_name, max(len(word)
                             for row in col_name for word in row) + 5)
    for i, element in enumerate(finals):
        values = []
        values.append(str(perms[i]))
        values.append("| " + str(element[2]))
        datax = []
        datax.append(values)
        printtable(datax, max(len(word)
                              for row in col_name for word in row) + 5)
        del values, datax
        if finals[0][1] < element[2]:
            accs.append(list([str(perms[i]),
                              "| " + str(element[2])]))
    print("")
    logger("->", " Best Accuracies")
    # print("----------------------------")
    col_name = [["Permutation ", "| Final Accuracy "]]
    printtable(col_name, max(len(word)
                             for row in col_name for word in row) + 5)
    printtable(accs, col_width)

    if inplace:
        data.to_csv(dataset)
Example #29
0
def text_classification_query(self,
                              instruction,
                              drop=None,
                              preprocess=True,
                              label_column=None,
                              test_size=0.2,
                              random_state=49,
                              learning_rate=1e-2,
                              epochs=20,
                              monitor="val_loss",
                              batch_size=32,
                              max_text_length=200,
                              max_features=20000,
                              generate_plots=True,
                              save_model=False,
                              save_path=os.getcwd()):
    """
    function to apply text_classification algorithm for sentiment analysis
    :param many params: used to hyperparametrize the function.
    :return a dictionary object with all of the information for the algorithm.
    """

    if test_size < 0:
        raise Exception("Test size must be a float between 0 and 1")

    if test_size >= 1:
        raise Exception(
            "Test size must be a float between 0 and 1 (a test size greater than or equal to 1 results in no training "
            "data)")

    if epochs < 1:
        raise Exception(
            "Epoch number is less than 1 (model will not be trained)")

    if batch_size < 1:
        raise Exception("Batch size must be equal to or greater than 1")

    if max_text_length < 1:
        raise Exception("Max text length must be equal to or greater than 1")

    if save_model:
        if not os.path.exists(save_path):
            raise Exception("Save path does not exists")

    if test_size == 0:
        testing = False
    else:
        testing = True

    data = DataReader(self.dataset)
    data = data.data_generator()

    if preprocess:
        data.fillna(0, inplace=True)

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    if label_column is None:
        label = "label"
    else:
        label = label_column

    X, Y, target = get_target_values(data, instruction, label)
    Y = np.array(Y)
    classes = np.unique(Y)

    logger("->", "Target Column Found: {}".format(target))

    vocab = {}
    if preprocess:
        logger("Preprocessing data")
        X = lemmatize_text(text_clean_up(X.array))
        vocab = X
        X = encode_text(X, X)

    X = np.array(X)

    model = get_keras_text_class(max_features, len(classes), learning_rate)
    logger("Building Keras LSTM model dynamically")

    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state)

    X_train = sequence.pad_sequences(X_train, maxlen=max_text_length)
    X_test = sequence.pad_sequences(X_test, maxlen=max_text_length)

    y_vals = np.unique(np.append(y_train, y_test))
    label_mappings = {}
    for i in range(len(y_vals)):
        label_mappings[y_vals[i]] = i
    map_func = np.vectorize(lambda x: label_mappings[x])
    y_train = map_func(y_train)
    y_test = map_func(y_test)

    logger("Training initial model")

    # early stopping callback
    es = EarlyStopping(monitor=monitor, mode='auto', verbose=0, patience=5)

    history = model.fit(X_train,
                        y_train,
                        validation_data=(X_test, y_test),
                        batch_size=batch_size,
                        epochs=epochs,
                        callbacks=[es],
                        verbose=0)

    logger(
        "->", "Final training loss: {}".format(
            history.history["loss"][len(history.history["loss"]) - 1]))
    if testing:
        logger(
            "->", "Final validation loss: {}".format(
                history.history["val_loss"][len(history.history["val_loss"]) -
                                            1]))
        logger(
            "->", "Final validation accuracy: {}".format(
                history.history["val_accuracy"][
                    len(history.history["val_accuracy"]) - 1]))
        losses = {
            'training_loss': history.history['loss'],
            'val_loss': history.history['val_loss']
        }
        accuracy = {
            'training_accuracy': history.history['accuracy'],
            'validation_accuracy': history.history['val_accuracy']
        }
    else:
        logger("->",
               "Final validation loss: {}".format("0, No validation done"))
        losses = {'training_loss': history.history['loss']}
        accuracy = {'training_accuracy': history.history['accuracy']}

    plots = {}
    if generate_plots:
        # generates appropriate classification plots by feeding all
        # information
        logger("Generating plots")
        plots = generate_classification_plots(history, X, Y, model, X_test,
                                              y_test)

    if save_model:
        save(model, save_model, save_path=save_path)

    logger(
        "Storing information in client object under key 'text_classification'")
    # storing values the model dictionary

    self.models["text_classification"] = {
        "model": model,
        "classes": classes,
        "plots": plots,
        "target": Y,
        "vocabulary": vocab,
        "interpreter": label_mappings,
        "max_text_length": max_text_length,
        'test_data': {
            'X': X_test,
            'y': y_test
        },
        'losses': losses,
        'accuracy': accuracy
    }
    clearLog()
    return self.models["text_classification"]
Example #30
0
def train_svm(instruction,
              dataset=None,
              test_size=0.2,
              kernel='linear',
              text=None,
              preprocess=True,
              ca_threshold=None,
              drop=None,
              cross_val_size=0.3):

    logger("Reading in dataset....")
    # reads dataset and fills n/a values with zeroes
    #data = pd.read_csv(self.dataset)

    dataReader = DataReader(dataset)
    data = dataReader.data_generator()

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    data, y, target, full_pipeline = initial_preprocesser(
        data, instruction, preprocess, ca_threshold, text)
    logger("->", "Target Column Found: {}".format(target))

    X_train = data['train']
    y_train = y['train']
    X_test = data['test']
    y_test = y['test']

    # classification_column = get_similar_column(getLabelwithInstruction(instruction), data)
    num_classes = len(np.unique(pd.concat([y['train'], y['test']], axis=0)))

    # Needed to make a custom label encoder due to train test split changes
    # Can still be inverse transformed, just a bit of extra work
    y_vals = np.unique(pd.concat([y['train'], y['test']], axis=0))
    label_mappings = {}
    for i in range(len(y_vals)):
        label_mappings[y_vals[i]] = i

    y_train = y_train.apply(lambda x: label_mappings[x]).values
    y_test = y_test.apply(lambda x: label_mappings[x]).values

    # Fitting to SVM and storing in the model dictionary
    logger("Fitting Support Vector Machine...")
    clf = svm.SVC(kernel=kernel)
    clf.fit(X_train, y_train)

    score = accuracy_score(clf.predict(X_test), y_test)

    logger("->", "Accuracy found on testing set: {}".format(score))

    logger('->', "Stored model under 'svm' key")
    return {
        'id': generate_id(),
        "model": clf,
        "accuracy_score": accuracy_score(clf.predict(X_test), y_test),
        "target": target,
        "preprocesser": full_pipeline,
        "interpreter": label_mappings,
        "cross_val_score": cross_val_score(clf, X_train, y_train)
    }
    clearLog()