Example #1
def get_target_values(data, instruction, yLabel):
    # Get the target column
    target = get_similar_column(get_value_instruction(instruction), data)
    X = data[target]
    del data[target]
    #labels
    Y = data[get_similar_column(get_value_instruction(yLabel), data)]
    return X, Y
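For reference, the column split that get_target_values performs can be reproduced with plain pandas; the DataFrame and column names below are invented for illustration.

import pandas as pd

# Hypothetical frame; "price" stands in for whatever get_similar_column() resolves to.
df = pd.DataFrame({"price": [1.0, 2.0, 3.0], "rooms": [2, 3, 4]})

target = "price"
X = df.pop(target)   # removes the column and returns it, like X = data[target]; del data[target]
print(X.tolist(), df.columns.tolist())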
Example #2
def instruction_identifier(params):
    remove = get_similar_column(
        get_value_instruction(
            params['instruction']),
        params['data'])
    params['y'] = params['data'][remove]
    del params['data'][remove]
Example #3
def get_ner(self, instruction):
    """
    function to identify named entities
    :param instruction: used to get the target column
    :return: dictionary object with detected named entities
    """
    data = DataReader(self.dataset)
    data = data.data_generator()

    target = get_similar_column(get_value_instruction(instruction), data)
    logger("->", "Target Column Found: {}".format(target))

    # Remove stopwords if any from the detection column
    data['combined_text_for_ner'] = data[target].apply(
        lambda x: ' '.join([word for word in x.split() if word not in stopwords.words()]))

    logger("Detecting Name Entities from : {} data files".format(data.shape[0] + 1))

    # Named entity recognition pipeline, default model selection
    with NoStdStreams():
        hugging_face_ner_detector = pipeline('ner', grouped_entities=True, framework='tf')
        data['ner'] = data['combined_text_for_ner'].apply(lambda x: hugging_face_ner_detector(x))
    logger("NER detection status complete")
    logger("Storing information in client object under key 'named_entity_recognition'")

    self.models["named_entity_recognition"] = {
        "model": hugging_face_ner_detector.model,
        "tokenizer": hugging_face_ner_detector.tokenizer,
        'name_entities': data['ner'].to_dict()}

    logger("Output: ", data['ner'].to_dict())
    clearLog()
    return self.models["named_entity_recognition"]
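A minimal standalone sketch of the same Hugging Face call; note that newer transformers releases replace grouped_entities=True with aggregation_strategy="simple", and the default model is downloaded on first use.

from transformers import pipeline

# grouped_entities merges word-piece tokens back into whole entities
ner = pipeline('ner', grouped_entities=True)
print(ner("Angela Merkel visited Paris last week."))
# e.g. [{'entity_group': 'PER', 'word': 'Angela Merkel', ...}, {'entity_group': 'LOC', 'word': 'Paris', ...}]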
Example #4
def dimensionality_RF(instruction, dataset, target="", y="", n_features=10):

    global counter

    dataReader = DataReader(dataset)

    if target == "":
        data = dataReader.data_generator()
        data.fillna(0, inplace=True)
        remove = get_similar_column(get_value_instruction(instruction), data)
        data, y, target, full_pipeline = initial_preprocessor(
            data, instruction, True, 0.2, [], 0.2, random_state=49)

        le = preprocessing.LabelEncoder()
        X_train = data['train']
        y_train = y['train']
        X_test = data['test']
        y_test = y['test']

        y_train = le.fit_transform(y_train)
        y_test = le.transform(y_test)  # reuse the encoding fitted on the training labels

    first_classifier = tree.DecisionTreeClassifier()
    first_classifier.fit(X_train, y_train)

    first_classifier_acc = accuracy_score(
        first_classifier.predict(X_test), y_test)

    accuracy_scores = [first_classifier_acc]
    columns = []
    datas = []
    datas.append(dataset)
    columns.append([])

    for i, x in product(range(3, 10), range(4, len(X_train.columns))):
        feature_model = RandomForestRegressor(random_state=1, max_depth=i)
        feature_model.fit(X_train, y_train)

        importances = feature_model.feature_importances_
        indices = np.argsort(importances)[-x:]
        columns.append(X_train.columns[indices])

        X_temp_train = X_train[X_train.columns[indices]]
        X_temp_test = X_test[X_train.columns[indices]]

        val = pd.DataFrame(np.r_[X_temp_train, X_temp_test])
        val[target] = np.r_[y_train, y_test]
        datas.append(val)

        vr = tree.DecisionTreeClassifier()
        vr.fit(X_temp_train, y_train)

        accuracy_scores.append(accuracy_score(vr.predict(X_temp_test), y_test))

    the_index = accuracy_scores.index(max(accuracy_scores))
    print(accuracy_scores)
    return datas[the_index], accuracy_scores[0], max(
        accuracy_scores), list(columns[the_index])
def dimensionality_KPCA(instruction, dataset, target="", y=""):
    global currLog
    global counter

    dataReader = DataReader("./data/" + get_last_file()[0])

    if target == "":
        data = dataReader.data_generator()
        data.fillna(0, inplace=True)
        remove = get_similar_column(get_value_instruction(instruction), data)

        y = data[remove]
        del data[remove]
        le = preprocessing.LabelEncoder()
        y = le.fit_transform(y)

    kpca = KernelPCA(n_components=len(dataset.columns), kernel="rbf")
    data_modified = kpca.fit_transform(dataset)

    X_train, X_test, y_train, y_test = train_test_split(
        dataset, y, test_size=0.2, random_state=49)
    X_train_mod, X_test_mod, y_train_mod, y_test_mod = train_test_split(
        data_modified, y, test_size=0.2, random_state=49)

    clf = tree.DecisionTreeClassifier()
    clf.fit(X_train, y_train)

    clf_mod = tree.DecisionTreeClassifier()
    clf_mod.fit(X_train_mod, y_train_mod)
    acc = []
    acc.append(accuracy_score(
        clf_mod.predict(X_test_mod), y_test_mod))
    for i, j in product(range(3, 10), ["entropy", "gini"]):
        model = tree.DecisionTreeClassifier(criterion=j, max_depth=i)
        model = model.fit(X_train_mod, y_train_mod)
        acc.append(accuracy_score(model.predict(X_test_mod), y_test))
    del i, j
    data_modified = pd.DataFrame(data_modified)
    data_modified[target] = np.r_[y_train, y_test]
    # data_modified.to_csv("./data/housingPCA.csv")

    return data_modified, accuracy_score(
        clf.predict(X_test), y_test), max(acc), (len(
            dataset.columns) - len(data_modified.columns))

    def booster(dataset, obj):
        #obj=["reg:linear","multi:softmax "]

        X_train, X_test, y_train, y_test = train_test_split(
            dataset, y, test_size=0.2, random_state=49)
        clf = XGBClassifier(
            objective=obj,
            learning_rate=0.1,
            silent=1,
            alpha=10)
        clf.fit(X_train, y_train)
        return accuracy_score(clf.predict(X_test_mod), y_test_mod)
def dimensionality_RF(instruction, dataset, target="", y="", n_features=10):
    global currLog
    global counter

    dataReader = DataReader("./data/" + get_last_file()[0])

    if target == "":
        data = dataReader.data_generator()
        data.fillna(0, inplace=True)
        remove = get_similar_column(get_value_instruction(instruction), data)
        data = structured_preprocesser(data)

        y = data[remove]
        del data[remove]
        le = preprocessing.LabelEncoder()
        y = le.fit_transform(y)

    X_train, X_test, y_train, y_test = train_test_split(
        dataset, y, test_size=0.2, random_state=49)
    first_classifier = tree.DecisionTreeClassifier()
    first_classifier.fit(X_train, y_train)

    first_classifier_acc = accuracy_score(
        first_classifier.predict(X_test), y_test)

    accuracy_scores = [first_classifier_acc]
    columns = []
    datas = []
    datas.append(dataset)
    columns.append([])

    for i, x in product(range(3, 10), range(4, len(dataset.columns))):
        feature_model = RandomForestRegressor(random_state=1, max_depth=i)
        feature_model.fit(X_train, y_train)

        importances = feature_model.feature_importances_
        indices = np.argsort(importances)[-x:]
        columns.append(dataset.columns[indices])

        X_temp_train = X_train[dataset.columns[indices]]
        X_temp_test = X_test[dataset.columns[indices]]

        val = pd.DataFrame(np.r_[X_temp_train, X_temp_test])
        val[target] = np.r_[y_train, y_test]
        datas.append(val)

        vr = tree.DecisionTreeClassifier()
        vr.fit(X_temp_train, y_train)

        accuracy_scores.append(accuracy_score(vr.predict(X_temp_test), y_test))

    the_index = accuracy_scores.index(max(accuracy_scores))

    return datas[the_index], accuracy_scores[0], max(
        accuracy_scores), list(columns[the_index])
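The core of the feature-selection loop above, ranking columns by random-forest importances and keeping the top x, can be tried in isolation; the synthetic data below is only for illustration.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestRegressor

X, y = make_classification(n_samples=200, n_features=8, random_state=1)

feature_model = RandomForestRegressor(random_state=1, max_depth=5)
feature_model.fit(X, y)

x = 4  # number of features to keep
indices = np.argsort(feature_model.feature_importances_)[-x:]
X_reduced = X[:, indices]
print("selected feature indices:", indices)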
Example #7
def initial_preprocesser(data,
                         instruction,
                         preprocess,
                         ca_threshold,
                         text,
                         test_size=0.2,
                         random_state=49):
    # Scans for object columns just in case we have a datetime column that
    # isn't detected

    if test_size < 0 or test_size > 1:
        raise Exception(
            'Test size cannot be {}, it should be a proportion between 0 and 1'
            .format(test_size))

    object_columns = [
        col for col, col_type in data.dtypes.items()
        if col_type == 'object'
    ]

    # Handles dates without timestamps
    for col in object_columns:
        try:
            data[col] = pd.to_datetime(data[col], infer_datetime_format=True)
        except ValueError:
            pass

    # get target column
    target = get_similar_column(get_value_instruction(instruction), data)
    y = data[target]

    # remove rows where target is NaN
    data = data[y.notna()]
    y = y[y.notna()]

    del data[target]
    X_train, X_test, y_train, y_test = train_test_split(
        data, y, test_size=test_size, random_state=random_state)

    data = {
        'train': pd.concat([X_train], axis=1),
        'test': pd.concat([X_test], axis=1)
    }
    # preprocess the dataset
    full_pipeline = None
    if preprocess:
        data, full_pipeline = structured_preprocesser(data, ca_threshold, text)
    else:
        data.fillna(0, inplace=True)

    y = {'train': y_train, 'test': y_test}

    return data, y, target, full_pipeline
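The date-coercion step above can be exercised on its own; in recent pandas, infer_datetime_format is deprecated because format inference is already the default.

import pandas as pd

df = pd.DataFrame({"joined": ["2020-01-01", "2020-02-15"], "city": ["Oslo", "Lima"]})

for col in df.columns[df.dtypes == "object"]:
    try:
        df[col] = pd.to_datetime(df[col])
    except (ValueError, TypeError):
        pass  # leave genuinely non-date text columns untouched

print(df.dtypes)  # "joined" becomes datetime64[ns], "city" stays object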
def dimensionality_KPCA(instruction, dataset, target="", y=""):
    '''
    function to reduce dimensionality in dataset via kernal principal component analysis
    :param instruction: command sent to client instance in written query.
    :param dataset: data instantiated in client instance passed to the algorithm
    :param target: column name of response variable/feature
    :param y: dictionary of train/test data values associated with response variable/feature
    '''
    
    global counter

    dataReader = DataReader("./data/" + get_last_file()[0])

    if target == "":
        data = dataReader.data_generator()
        data.fillna(0, inplace=True)
        remove = get_similar_column(get_value_instruction(instruction), data)

        y = data[remove]
        del data[remove]
        le = preprocessing.LabelEncoder()
        y = le.fit_transform(y)

    kpca = KernelPCA(n_components=len(dataset.columns), kernel="rbf")
    data_modified = kpca.fit_transform(dataset)

    X_train, X_test, y_train, y_test = train_test_split(
        dataset, y, test_size=0.2, random_state=49)
    X_train_mod, X_test_mod, y_train_mod, y_test_mod = train_test_split(
        data_modified, y, test_size=0.2, random_state=49)

    clf = tree.DecisionTreeClassifier()
    clf.fit(X_train, y_train)

    clf_mod = tree.DecisionTreeClassifier()
    clf_mod.fit(X_train_mod, y_train_mod)
    acc = []
    acc.append(accuracy_score(
        clf_mod.predict(X_test_mod), y_test_mod))
    for i, j in product(range(3, 10), ["entropy", "gini"]):
        model = tree.DecisionTreeClassifier(criterion=j, max_depth=i)
        model = model.fit(X_train_mod, y_train_mod)
        acc.append(accuracy_score(model.predict(X_test_mod), y_test))
    del i, j
    data_modified = pd.DataFrame(data_modified)
    data_modified[target] = np.r_[y_train, y_test]
    # data_modified.to_csv("./data/housingPCA.csv")

    return data_modified, accuracy_score(
        clf.predict(X_test), y_test), max(acc), (len(
            dataset.columns) - len(data_modified.columns))
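In isolation, the kernel-PCA transform used above looks like this (toy data, RBF kernel as in the function).

import numpy as np
from sklearn.decomposition import KernelPCA

rng = np.random.default_rng(49)
X = rng.normal(size=(100, 5))

kpca = KernelPCA(n_components=5, kernel="rbf")
X_kpca = kpca.fit_transform(X)  # same number of components, nonlinear features
print(X_kpca.shape)             # (100, 5)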
Example #9
    def neural_network_query(self,
                             instruction,
                             text=[],
                             ca_threshold=None,
                             drop=None,
                             preprocess=True,
                             test_size=0.2,
                             random_state=49,
                             epochs=50,
                             generate_plots=True,
                             callback_mode='min',
                             maximizer="val_loss",
                             save_model=False,
                             save_path=os.getcwd()):

        data = pd.read_csv(self.dataset)

        if preprocess:

            remove = get_similar_column(get_value_instruction(instruction),
                                        data)
            if data[remove].dtype.name == 'object':
                callback_mode = 'max'
                maximizer = "val_accuracy"
                self.classification_query_ann(instruction,
                                              text=text,
                                              ca_threshold=ca_threshold,
                                              preprocess=preprocess,
                                              test_size=test_size,
                                              random_state=random_state,
                                              epochs=epochs,
                                              generate_plots=generate_plots,
                                              callback_mode=callback_mode,
                                              maximizer=maximizer,
                                              save_model=save_model,
                                              save_path=save_path)
            else:
                self.regression_query_ann(instruction,
                                          text=text,
                                          ca_threshold=ca_threshold,
                                          preprocess=preprocess,
                                          test_size=test_size,
                                          random_state=random_state,
                                          epochs=epochs,
                                          generate_plots=generate_plots,
                                          callback_mode=callback_mode,
                                          maximizer=maximizer,
                                          drop=drop,
                                          save_model=save_model,
                                          save_path=save_path)
Example #10
def csv_preprocessing(csv_file, data_path, instruction, image_column,
                      training_ratio, height, width):

    df = pd.read_csv(csv_file)
    if instruction is None:
        raise BaseException(
            "Instruction was not given for csv file to be processed.")

    label = get_similar_column(get_value_instruction(instruction), df)
    avoid_directories = ["proc_training_set", "proc_testing_set"]
    data_paths = [
        data_path + "/" + d for d in os.listdir(data_path)
        if os.path.isdir(data_path + "/" + d) and d not in avoid_directories
    ]

    file_extensions = ["jpg", "jpeg", "png", "gif"]
    need_file_extension = False
    path_included = False

    count = 0
    while image_column is None:
        if count > 20:
            raise BaseException(
                f"Could not locate column containing image information.")
        count += 1
        random_row = df.sample()
        for column, value in random_row.iloc[0].items():
            if isinstance(value, str):
                if os.path.exists(data_path + "/" +
                                  (value if value[0] != "/" else value[1:])):
                    path_included = True
                    image_column = column
                    break
                # add file extension if not included
                if value.split(".")[-1] in file_extensions:
                    file = [value]
                else:
                    file = []
                    for extension in file_extensions:
                        file.append(value + "." + extension)

                # look through all data_paths for file
                for path in data_paths:
                    for file_option in file:
                        if os.path.exists(path + "/" + file_option):
                            if file_option.split(".")[-1] in file_extensions:
                                need_file_extension = True
                            image_column = column
                            break
            if image_column is not None:
                break

    else:
        if os.path.exists(data_path + "/" + df.iloc[0][image_column]):
            path_included = True
        elif df.iloc[0][image_column].split(".")[-1] not in file_extensions:
            need_file_extension = True

    # reset the index so positional lookups into image_list stay aligned after dropping rows
    df = df[[image_column, label]].dropna().reset_index(drop=True)

    heights = []
    widths = []
    classifications = df[label].value_counts()
    if len(classifications) < 2:
        raise BaseException(
            f"{csv_file} contains {len(classifications)} classes. Need at least two classification labels."
        )
    for key, value in classifications.items():
        if value < 2:
            raise BaseException(
                f"Class: {key} contans {value} images. Need at least two images in this class."
            )

    image_list = []

    # get the median heights and widths
    for index, row in df.iterrows():
        if path_included:
            p = data_path + "/" + \
                (row[image_column][1:] if row[image_column][0] == "/" else row[image_column])
            img = cv2.imread(p)
        else:
            for path in data_paths:
                if need_file_extension:
                    for extension in file_extensions:
                        p = path + "/" + row[image_column] + "." + extension
                        img = cv2.imread(p)
                        if img is not None:
                            break
                else:
                    p = path + "/" + row[image_column]
                    img = cv2.imread(p)
                if img is not None:
                    break
        if img is None:
            raise BaseException(
                f"{row[image_column]} could not be found in any directories.")
        image_list.append(img)
        heights.append(img.shape[0])
        widths.append(img.shape[1])

    height1, width1 = calculate_medians(heights, widths)
    if height is None:
        height = height1
    if width is None:
        width = width1

    # create training and testing folders
    create_folder(data_path, "proc_training_set")
    create_folder(data_path, "proc_testing_set")
    # create classification folders
    for classification in classifications.keys():
        create_folder(data_path + "/proc_training_set", classification)
        create_folder(data_path + "/proc_testing_set", classification)

    data_size = [0, 0]
    class_count = dict.fromkeys(classifications.keys(), 0)

    # save images into correct folder
    for index, row in df.iterrows():
        # resize images
        img = process_color_channel(image_list[index], height, width)
        p = "proc_" + (os.path.basename(row[image_column])
                       if path_included else row[image_column])
        if need_file_extension:
            p += ".jpg"
        if class_count[row[label]] / \
                classifications[row[label]] < training_ratio:
            data_size[0] += 1
            class_count[row[label]] += 1
            save_image(data_path + "/proc_training_set", img, p, row[label])
        else:
            data_size[1] += 1
            class_count[row[label]] += 1
            save_image(data_path + "/proc_testing_set", img, p, row[label])

    return {
        "num_categories": len(classifications),
        "height": height,
        "width": width,
        "train_size": data_size[0],
        "test_size": data_size[1]
    }
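The resizing logic reduces to collecting image shapes, taking the medians, and resizing everything to that size; a rough standalone equivalent with OpenCV, using stand-in arrays instead of files read from disk.

import cv2
import numpy as np

# stand-in images; in practice these come from cv2.imread(path)
imgs = [np.zeros((h, w, 3), dtype=np.uint8) for h, w in [(120, 80), (100, 90), (140, 70)]]

heights = [im.shape[0] for im in imgs]
widths = [im.shape[1] for im in imgs]
height, width = int(np.median(heights)), int(np.median(widths))  # 120, 80

# cv2.resize takes (width, height), i.e. the reverse of img.shape
resized = [cv2.resize(im, (width, height)) for im in imgs]
print(height, width, resized[0].shape)  # 120 80 (120, 80, 3)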
Example #11
def image_caption_query(self,
                        instruction,
                        label_column=None,
                        drop=None,
                        epochs=10,
                        preprocess=True,
                        random_state=49,
                        test_size=0.2,
                        top_k=5000,
                        batch_size=32,
                        buffer_size=1000,
                        embedding_dim=256,
                        units=512,
                        gpu=False,
                        generate_plots=True,
                        save_model_decoder=False,
                        save_path_decoder=os.getcwd(),
                        save_model_encoder=False,
                        save_path_encoder=os.getcwd()):
    '''
    function to apply a predictive algorithm for image caption generation
    :param many params: hyperparameters passed through to the underlying algorithm
    :return: a dictionary object with all of the information for the algorithm
    '''

    if test_size < 0:
        raise Exception("Test size must be a float between 0 and 1")

    if test_size >= 1:
        raise Exception(
            "Test size must be a float between 0 and 1 (a test size greater than or equal to 1 results in no training "
            "data)")

    if top_k < 1:
        raise Exception("Top_k value must be equal to or greater than 1")

    if batch_size < 1:
        raise Exception("Batch size must be equal to or greater than 1")

    if buffer_size < 1:
        raise Exception("Buffer size must be equal to or greater than 1")

    if embedding_dim < 1:
        raise Exception(
            "Embedding dimension must be equal to or greater than 1")

    if units < 1:
        raise Exception("Units must be equal to or greater than 1")

    if epochs < 1:
        raise Exception(
            "Epoch number is less than 1 (model will not be trained)")

    if save_model_decoder:
        if not os.path.exists(save_path_decoder):
            raise Exception("Decoder save path does not exists")

    if save_model_encoder:
        if not os.path.exists(save_path_encoder):
            raise Exception("Encoder save path does not exists")

    if test_size == 0:
        testing = False
    else:
        testing = True

    if gpu:
        if tf.test.gpu_device_name():
            print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
        else:
            raise Exception("Please install GPU version of Tensorflow")

        device = '/device:GPU:0'
    else:
        device = '/device:CPU:0'

    np.random.seed(random_state)
    tf.random.set_seed(random_state)

    data = DataReader(self.dataset)
    df = data.data_generator()

    if preprocess:
        df.fillna(0, inplace=True)
    if drop is not None:
        df.drop(drop, axis=1, inplace=True)

    logger("Preprocessing data")

    train_captions = []
    img_name_vector = []

    if label_column is None:
        label = instruction
    else:
        label = label_column

    x = get_path_column(df)
    y = get_similar_column(get_value_instruction(label), df)
    logger("->", "Target Column Found: {}".format(y))

    for row in df.iterrows():
        if preprocess:
            caption = '<start> ' + row[1][y] + ' <end>'
        else:
            caption = row[1][y]
        image_id = row[1][x]
        image_path = image_id

        img_name_vector.append(image_path)
        train_captions.append(caption)

    image_model = tf.keras.applications.InceptionV3(include_top=False,
                                                    weights='imagenet')
    new_input = image_model.input
    hidden_layer = image_model.layers[-1].output
    logger("Extracting features from model")
    image_features_extract_model = tf.keras.Model(new_input, hidden_layer)

    image_dataset = tf.data.Dataset.from_tensor_slices(
        sorted(set(img_name_vector)))
    image_dataset = image_dataset.map(
        load_image, num_parallel_calls=tf.data.experimental.AUTOTUNE).batch(16)

    for img, path in image_dataset:
        batch_features = image_features_extract_model(img)
        batch_features = tf.reshape(
            batch_features,
            (batch_features.shape[0], -1, batch_features.shape[3]))

        for bf, p in zip(batch_features, path):
            path_of_feature = p.numpy().decode("utf-8")
            np.save(path_of_feature, bf.numpy())
    logger("->", "Tokenizing top {} words".format(top_k))
    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=top_k,
        oov_token="<unk>",
        filters='!"#$%&()*+.,-/:;=?@[\]^_`{|}~ ')
    tokenizer.fit_on_texts(train_captions)
    tokenizer.word_index['<pad>'] = 0
    tokenizer.index_word[0] = '<pad>'
    train_seqs = tokenizer.texts_to_sequences(train_captions)
    cap_vector = tf.keras.preprocessing.sequence.pad_sequences(train_seqs,
                                                               padding='post')

    vocab_size = top_k + 1
    # num_steps = len(img_name_vector) // batch_size

    if testing:
        img_name_train, img_name_val, cap_train, cap_val = train_test_split(
            img_name_vector, cap_vector, test_size=test_size, random_state=0)
    else:
        img_name_train = img_name_vector
        cap_train = cap_vector

    dataset = tf.data.Dataset.from_tensor_slices((img_name_train, cap_train))

    dataset = dataset.map(lambda item1, item2: tf.numpy_function(
        map_func, [item1, item2], [tf.float32, tf.int32]),
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)

    # Shuffle and batch
    logger("Shuffling dataset")
    dataset = dataset.shuffle(buffer_size).batch(batch_size)
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

    if testing:
        dataset_val = tf.data.Dataset.from_tensor_slices(
            (img_name_val, cap_val))

        dataset_val = dataset_val.map(
            lambda item1, item2: tf.numpy_function(map_func, [item1, item2],
                                                   [tf.float32, tf.int32]),
            num_parallel_calls=tf.data.experimental.AUTOTUNE)

        # Shuffle and batch
        dataset_val = dataset_val.shuffle(buffer_size).batch(batch_size)
        dataset_val = dataset_val.prefetch(
            buffer_size=tf.data.experimental.AUTOTUNE)

    logger("Establishing encoder decoder framework")
    encoder = CNN_Encoder(embedding_dim)
    decoder = RNN_Decoder(embedding_dim, units, vocab_size)

    optimizer = tf.keras.optimizers.Adam()
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')

    def loss_function(real, pred):
        mask = tf.math.logical_not(tf.math.equal(real, 0))
        loss_ = loss_object(real, pred)

        mask = tf.cast(mask, dtype=loss_.dtype)
        loss_ *= mask

        return tf.reduce_mean(loss_)

    @tf.function
    def train_step(img_tensor, target):
        with tf.device(device):
            loss = 0

            # initializing the hidden state for each batch
            # because the captions are not related from image to image
            hidden = decoder.reset_state(batch_size=target.shape[0])

            dec_input = tf.expand_dims([tokenizer.word_index['<start>']] *
                                       target.shape[0], 1)

            with tf.GradientTape() as tape:
                features = encoder(img_tensor)

                for i in range(1, target.shape[1]):
                    # passing the features through the decoder
                    predictions, hidden, _ = decoder(dec_input, features,
                                                     hidden)

                    loss += loss_function(target[:, i], predictions)

                    # using teacher forcing
                    dec_input = tf.expand_dims(target[:, i], 1)

            total_loss = (loss / int(target.shape[1]))

            trainable_variables = encoder.trainable_variables + decoder.trainable_variables

            gradients = tape.gradient(loss, trainable_variables)

            optimizer.apply_gradients(zip(gradients, trainable_variables))

            return loss, total_loss

    @tf.function
    def val_step(img_tensor, target):
        with tf.device(device):
            loss = 0

            # initializing the hidden state for each batch
            # because the captions are not related from image to image
            hidden = decoder.reset_state(batch_size=target.shape[0])

            dec_input = tf.expand_dims([tokenizer.word_index['<start>']] *
                                       target.shape[0], 1)

            with tf.GradientTape() as tape:
                features = encoder(img_tensor)

                for i in range(1, target.shape[1]):
                    # passing the features through the decoder
                    predictions, hidden, _ = decoder(dec_input, features,
                                                     hidden)

                    loss += loss_function(target[:, i], predictions)

                    # using teacher forcing
                    dec_input = tf.expand_dims(target[:, i], 1)

            total_loss = (loss / int(target.shape[1]))
            return total_loss

    logger("Training model...")
    with tf.device(device):
        loss_plot_train = []
        loss_plot_val = []
        for epoch in range(epochs):
            total_loss = 0
            total_loss_val = 0

            for (batch, (img_tensor, target)) in enumerate(dataset):
                batch_loss, t_loss = train_step(img_tensor, target)
                total_loss += t_loss

            loss_plot_train.append(total_loss.numpy())

            if testing:
                for (batch, (img_tensor, target)) in enumerate(dataset_val):
                    t_loss = val_step(img_tensor, target)
                    total_loss_val += t_loss

                loss_plot_val.append(total_loss_val.numpy())

    dir_name = os.path.dirname(img_name_vector[0])
    files = os.listdir(dir_name)

    for item in files:
        if item.endswith(".npy"):
            os.remove(os.path.join(dir_name, item))

    plots = {}
    if generate_plots:
        logger("Generating plots")
        plots.update({
            "loss":
            libra.plotting.nonkeras_generate_plots.plot_loss(
                loss_plot_train, loss_plot_val)
        })

    logger("->", "Final training loss: {}".format(str(total_loss.numpy())))
    total_loss = total_loss.numpy()
    if testing:
        total_loss_val = total_loss_val.numpy()
        total_loss_val_str = str(total_loss_val)
    else:
        total_loss_val = 0
        total_loss_val_str = str("0, No validation done")

    logger("->", "Final validation loss: {}".format(total_loss_val_str))

    if save_model_decoder:
        logger("Saving decoder checkpoint...")
        decoder.save_weights(save_path_decoder + "decoderImgCap.ckpt")

    if save_model_encoder:
        logger("Saving encoder checkpoint...")
        encoder.save_weights(save_path_encoder + "encoderImgCap.ckpt")

    logger("Storing information in client object under key 'image_caption'")

    self.models["image_caption"] = {
        "decoder": decoder,
        "encoder": encoder,
        "tokenizer": tokenizer,
        "feature_extraction": image_features_extract_model,
        "plots": plots,
        'losses': {
            'Training loss': total_loss,
            'Validation loss': total_loss_val
        }
    }
    clearLog()
    return self.models["image_caption"]
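The loss_function above masks padded time steps out of the cross-entropy before averaging; the same idea on dummy tensors.

import tensorflow as tf

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

real = tf.constant([[3, 5, 0, 0]])       # 0 is the <pad> token id
pred = tf.random.normal((1, 4, 10))      # (batch, time, vocab) logits

mask = tf.cast(tf.math.not_equal(real, 0), tf.float32)
per_token = loss_object(real, pred)      # shape (batch, time)
print(tf.reduce_mean(per_token * mask).numpy())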
Example #12
def dimensionality_ICA(instruction, dataset, target="", y=""):

    global counter

    dataReader = DataReader(dataset)

    if target == "":
        data = dataReader.data_generator()
        data.fillna(0, inplace=True)
        remove = get_similar_column(get_value_instruction(instruction), data)

        data, y, target, full_pipeline = initial_preprocessor(
            data, instruction, True, 0.2, [], 0.2, random_state=49)

        X_train = data['train']
        X_test = data['test']

        y_train = y['train']
        y_test = y['test']


    pca = FastICA(n_components=len(X_train.columns))
    X_train_mod = pca.fit_transform(X_train)
    X_test_mod = pca.transform(X_test)  # reuse the components fitted on the training split


    clf = tree.DecisionTreeClassifier()
    clf.fit(X_train, y_train)

    clf_mod = tree.DecisionTreeClassifier()
    clf_mod.fit(X_train_mod, y_train)
    acc = []
    sets = []
    acc.append(accuracy_score(
        clf_mod.predict(X_test_mod), y_test))

    frame = pd.concat([pd.DataFrame(X_train_mod), pd.DataFrame(X_test_mod)], ignore_index=True)
    frame[target] = np.r_[y_train, y_test]
    sets.append(frame)

    for i in range(2, len(X_train.columns)):
        pca = FastICA(n_components=i)
        X_train_mod = pca.fit_transform(X_train)
        X_test_mod = pca.transform(X_test)

        frame = pd.concat([pd.DataFrame(X_train_mod), pd.DataFrame(X_test_mod)], ignore_index=True)
        frame[target] = np.r_[y_train, y_test]
        sets.append(frame)

        clf_mod = tree.DecisionTreeClassifier()
        clf_mod.fit(X_train_mod, y_train)

        acc.append(accuracy_score(
            clf_mod.predict(X_test_mod), y_test))

    del i

    data_modified = sets[acc.index(max(acc))]
    score = max(acc)


    return data_modified, score, ((len(
            X_train.columns) + 1) - len(data_modified.columns))
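A minimal FastICA round trip on synthetic data, transforming the held-out split with the components fitted on the training split.

import numpy as np
from sklearn.decomposition import FastICA
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(49)
X = rng.normal(size=(200, 6))
y = rng.integers(0, 2, size=200)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=49)

ica = FastICA(n_components=3, random_state=1)
X_train_ica = ica.fit_transform(X_train)  # fit on training data only
X_test_ica = ica.transform(X_test)        # reuse the fitted components
print(X_train_ica.shape, X_test_ica.shape)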
Example #13
    def neural_network_query(self,
                             instruction,
                             callback=False,
                             text=[],
                             ca_threshold=None,
                             drop=None,
                             preprocess=True,
                             test_size=0.2,
                             random_state=49,
                             epochs=50,
                             generate_plots=True,
                             callback_mode='min',
                             maximizer="val_loss",
                             save_model=False,
                             save_path=os.getcwd(),
                             add_layer={}):
        '''
        Detects whether the problem is regression or classification and calls the corresponding query.
        :param hyperparameters: all of these are hyperparameters passed through to the algorithm
        :return: a model, plots, and accuracy information, all stored in the self.models dictionary
        '''

        data = pd.read_csv(self.dataset)

        if preprocess:

            remove = get_similar_column(get_value_instruction(instruction),
                                        data)

            if len(data) < 50:
                raise Exception(
                    "Only datasets larger then 50 rows are supported for neural networks"
                )
            if len(data[remove].value_counts()) <= 50:
                callback_mode = 'max'
                maximizer = "val_accuracy"
                self.classification_query_ann(instruction,
                                              text=text,
                                              callback=callback,
                                              ca_threshold=ca_threshold,
                                              preprocess=preprocess,
                                              test_size=test_size,
                                              random_state=random_state,
                                              epochs=epochs,
                                              generate_plots=generate_plots,
                                              callback_mode=callback_mode,
                                              maximizer=maximizer,
                                              save_model=save_model,
                                              save_path=save_path,
                                              add_layer=add_layer)
            else:
                self.regression_query_ann(instruction,
                                          callback=callback,
                                          text=text,
                                          ca_threshold=ca_threshold,
                                          preprocess=preprocess,
                                          test_size=test_size,
                                          random_state=random_state,
                                          epochs=epochs,
                                          generate_plots=generate_plots,
                                          callback_mode=callback_mode,
                                          maximizer=maximizer,
                                          drop=drop,
                                          save_model=save_model,
                                          save_path=save_path,
                                          add_layer=add_layer)
        clearLog()
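The dispatch rule above is a plain cardinality heuristic; stripped of the client plumbing it amounts to the following (the toy frame and target name are made up).

import pandas as pd

df = pd.DataFrame({"label": ["cat", "dog", "cat", "dog"], "value": [1.2, 3.4, 5.6, 7.8]})
target = "label"  # column the instruction would resolve to

if df[target].nunique() <= 50:
    task = "classification"  # few distinct target values -> classification ANN
else:
    task = "regression"      # many distinct target values -> regression ANN
print(task)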
Example #14
def dimensionality_reduc(
        instruction,
        dataset,
        arr=[
            "RF",
            "PCA",
            "KPCA",
            "ICA"],
        inplace=False):
    '''
    function to perform dimensionality reduction on the dataset (keep only the
    features with the most relevance in the dataset's multidimensional space)
    :param instruction: command sent to client instance in written query
    :param dataset: data instantiated in client instance passed to the algorithm
    :param arr: list of dimensionality-reduction techniques to choose from
    :param inplace: whether to overwrite the original dataset file with the
    transformed data
    '''
    
    global counter
    
    dataReader = DataReader(dataset)

    logger("loading dataset...")
    data = dataReader.data_generator()
    data.fillna(0, inplace=True)

    logger("getting most similar column from instruction...")
    target = get_similar_column(get_value_instruction(instruction), data)

    y = data[target]
    del data[target]
    le = preprocessing.LabelEncoder()
    y = le.fit_transform(y)

    data = structured_preprocesser(data)

    perms = []
    overall_storage = []
    finals = []

    logger("generating dimensionality permutations...")
    for i in range(1, len(arr) + 1):
        for elem in list(permutations(arr, i)):
            perms.append(elem)

    logger("running each possible permutation...")
    logger("realigning tensors...")
    for path in perms:
        currSet = data
        for element in path:
            if element == "RF":
                data_mod, beg_acc, final_acc, col_removed = dimensionality_RF(
                    instruction, currSet, target, y)
            elif element == "PCA":
                data_mod, beg_acc, final_acc, col_removed = dimensionality_PCA(
                    instruction, currSet, target, y)
            elif element == "KPCA":
                data_mod, beg_acc, final_acc, col_removed = dimensionality_KPCA(
                    instruction, currSet, target, y)
            elif element == "ICA":
                data_mod, beg_acc, final_acc, col_removed = dimensionality_ICA(
                    instruction, currSet, target, y)
            overall_storage.append(
                list([data_mod, beg_acc, final_acc, col_removed]))
            currSet = data_mod
        finals.append(overall_storage[len(overall_storage) - 1])

    logger("Fetching Best Accuracies...")
    accs = []
    logger("->", "Baseline Accuracy: " + str(finals[0][1]))
    # print("----------------------------")
    col_name = [["Permutation ", "| Final Accuracy "]]
    printtable(col_name, max(len(word)
                             for row in col_name for word in row) + 5)
    for i, element in enumerate(finals):
        values = []
        values.append(str(perms[i]))
        values.append("| " + str(element[2]))
        datax = []
        datax.append(values)
        printtable(datax, max(len(word)
                              for row in col_name for word in row) + 5)
        del values, datax
        if finals[0][1] < element[2]:
            accs.append(list([str(perms[i]),
                              "| " + str(element[2])]))
    print("")
    logger("->", " Best Accuracies")
    # print("----------------------------")
    col_name = [["Permutation ", "| Final Accuracy "]]
    printtable(col_name, max(len(word)
                             for row in col_name for word in row) + 5)
    printtable(accs, max(len(word)
                         for row in col_name for word in row) + 5)

    if inplace:
        data.to_csv(dataset)
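The permutation generation is plain itertools; for the default four techniques it yields 64 orderings, from single techniques up to chains of all four.

from itertools import permutations

arr = ["RF", "PCA", "KPCA", "ICA"]
perms = [p for i in range(1, len(arr) + 1) for p in permutations(arr, i)]
print(len(perms))   # 64
print(perms[:3])    # ('RF',), ('PCA',), ('KPCA',)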
Example #15
def dimensionality_RF(instruction, dataset, target="", y="", n_features=10):
    '''
    function to reduce dimensionality in dataset via random forest method
    :param instruction: command sent to client instance in written query.
    :param dataset: data instantiated in client instance passed to the algorithm
    :param target: column name of response variable/feature
    :param y: dictionary of train/test data values associated with response variable/feature
    :param n_features: maximum number of features to choose to analyze/select
    '''
    
    global counter

    dataReader = DataReader("./data/" + get_last_file()[0])

    if target == "":
        data = dataReader.data_generator()
        data.fillna(0, inplace=True)
        remove = get_similar_column(get_value_instruction(instruction), data)
        data = structured_preprocesser(data)

        y = data[remove]
        del data[remove]
        le = preprocessing.LabelEncoder()
        y = le.fit_transform(y)

    X_train, X_test, y_train, y_test = train_test_split(
        dataset, y, test_size=0.2, random_state=49)
    first_classifier = tree.DecisionTreeClassifier()
    first_classifier.fit(X_train, y_train)

    first_classifier_acc = accuracy_score(
        first_classifier.predict(X_test), y_test)

    accuracy_scores = [first_classifier_acc]
    columns = []
    datas = []
    datas.append(dataset)
    columns.append([])

    for i, x in product(range(3, 10), range(4, len(dataset.columns))):
        feature_model = RandomForestRegressor(random_state=1, max_depth=i)
        feature_model.fit(X_train, y_train)

        importances = feature_model.feature_importances_
        indices = np.argsort(importances)[-x:]
        columns.append(dataset.columns[indices])

        X_temp_train = X_train[dataset.columns[indices]]
        X_temp_test = X_test[dataset.columns[indices]]

        val = pd.DataFrame(np.r_[X_temp_train, X_temp_test])
        val[target] = np.r_[y_train, y_test]
        datas.append(val)

        vr = tree.DecisionTreeClassifier()
        vr.fit(X_temp_train, y_train)

        accuracy_scores.append(accuracy_score(vr.predict(X_temp_test), y_test))

    the_index = accuracy_scores.index(max(accuracy_scores))

    return datas[the_index], accuracy_scores[0], max(
        accuracy_scores), list(columns[the_index])
def dimensionality_reduc(
        instruction,
        dataset,
        arr=[
            "RF",
            "PCA",
            "KPCA",
            "ICA"],
        inplace=False):
    global currLog
    global counter

    dataReader = DataReader(dataset)

    logger("loading dataset...")
    data = dataReader.data_generator()
    data.fillna(0, inplace=True)

    logger("getting most similar column from instruction...")
    target = get_similar_column(get_value_instruction(instruction), data)

    y = data[target]
    del data[target]
    le = preprocessing.LabelEncoder()
    y = le.fit_transform(y)

    data = structured_preprocesser(data)

    perms = []
    overall_storage = []
    finals = []

    logger("generating dimensionality permutations...")
    for i in range(1, len(arr) + 1):
        for elem in list(permutations(arr, i)):
            perms.append(elem)

    logger("running each possible permutation...")
    logger("realigning tensors...")
    for path in perms:
        currSet = data
        for element in path:
            if element == "RF":
                data_mod, beg_acc, final_acc, col_removed = dimensionality_RF(
                    instruction, currSet, target, y)
            elif element == "PCA":
                data_mod, beg_acc, final_acc, col_removed = dimensionality_PCA(
                    instruction, currSet, target, y)
            elif element == "KPCA":
                data_mod, beg_acc, final_acc, col_removed = dimensionality_KPCA(
                    instruction, currSet, target, y)
            elif element == "ICA":
                data_mod, beg_acc, final_acc, col_removed = dimensionality_ICA(
                    instruction, currSet, target, y)
            overall_storage.append(
                list([data_mod, beg_acc, final_acc, col_removed]))
            currSet = data_mod
        finals.append(overall_storage[len(overall_storage) - 1])

    logger("Fetching Best Accuracies...")
    accs = []
    print("")
    print("Baseline Accuracy: " + str(finals[0][1]))
    print("----------------------------")
    for i, element in enumerate(finals):
        print("Permutation --> " +
              str(perms[i]) +
              " | Final Accuracy --> " +
              str(element[2]))
        if finals[0][1] < element[2]:
            accs.append(list(["Permutation --> " +
                              str(perms[i]) +
                              " | Final Accuracy --> " +
                              str(element[2])]))
    print("")
    print("Best Accuracies")
    print("----------------------------")
    for element in accs:
        print(element)

    if inplace:
        data.to_csv(dataset)