Example No. 1
def fit_dnn_holdout(inputs: list, y_labels,
                    final_model: Model, outfolder: str,
                    task: str, model_descriptor: str, split_row=None):
    encoder = LabelBinarizer()
    y_label_int = encoder.fit_transform(y_labels)
    y_label_lookup = dict()
    for index, l in zip(y_label_int.argmax(1), y_labels):
        y_label_lookup[index] = l

    # merge the inputs so the same split applies across all feature inputs
    X_merge = numpy.concatenate(inputs, axis=1)

    X_train = X_merge[0:split_row]
    X_test = X_merge[split_row:]
    y_train = y_label_int[0:split_row]
    y_test = y_label_int[split_row:]

    model_file = os.path.join(outfolder, "ann-%s.m" % task)

    #nfold_predictions = dict()

    print("\ttraining...")
    final_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    final_model.fit(X_train,
                    y_train, epochs=dmc.DNN_EPOCHES, batch_size=dmc.DNN_BATCH_SIZE)
    print("\ttesting...")
    prediction_prob = final_model.predict(X_test)

    # evaluate the model
    #
    predictions = prediction_prob.argmax(axis=-1)
    util.save_scores(predictions, y_test.argmax(1), "dnn", task, model_descriptor, 3,
                     outfolder)
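
For context, a minimal usage sketch of fit_dnn_holdout, assuming a toy single-input Keras model whose input width equals the merged feature width (all data, paths, and names below are hypothetical, not from the source repo):

import numpy
from keras.layers import Dense, Input
from keras.models import Model

text_feats = numpy.random.rand(10, 5)   # toy text features (width 5)
meta_feats = numpy.random.rand(10, 3)   # toy meta features (width 3)
labels = ["a"] * 4 + ["b"] * 3 + ["c"] * 3

# the function concatenates the inputs, so the model takes the merged width (8)
inp = Input(shape=(8, ))
out = Dense(3, activation="softmax")(Dense(16, activation="relu")(inp))
toy_model = Model(inputs=inp, outputs=out)

# first 8 rows train, last 2 test
fit_dnn_holdout([text_feats, meta_feats], labels, toy_model,
                outfolder="/tmp/out", task="demo",
                model_descriptor="dense", split_row=8)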
Example No. 2
    def eval_holdout(self, model, model_name, test_X, test_y):
        print("start holdout evaluation stage:", len(test_X))
        predictions = model.predict(test_X)
        classifier_util.save_scores(predictions, test_y, model_name,
                                    self.task_name, self.identifier, 3,
                                    self.outfolder)

        print("complete!")
Example No. 3
def learn_discriminative(cpus,
                         task,
                         model_name,
                         X_train,
                         y_train,
                         identifier,
                         outfolder,
                         nfold=None,
                         feature_reduction=None):
    classifier = None
    model_file = None

    if (model_name == "knn"):
        print("== KNN ...")
        cls = KNeighborsClassifier(n_neighbors=1)
        if feature_reduction is not None:
            fr = create_feature_reduction_alg(feature_reduction,
                                              len(X_train[0]))
            print("\t using " + str(fr[1]))
            pipe = Pipeline([(fr[0], fr[1]), ('knn', cls)])
            classifier = pipe
        else:
            classifier = cls
        model_file = os.path.join(outfolder, "knn_classifier-%s.m" % task)
    if (model_name == "rf"):
        print("== Random Forest ...")
        cls = RandomForestClassifier(n_estimators=20, n_jobs=cpus)
        # rfc_tuning_params = {"max_depth": [3, 5, None],
        #                      "max_features": [1, 3, 5, 7, 10],
        #                      "min_samples_split": [2, 5, 10],
        #                      "min_samples_leaf": [1, 3, 10],
        #                      "bootstrap": [True, False],
        #                      "criterion": ["gini", "entropy"]}
        if feature_reduction is not None:
            fr = create_feature_reduction_alg(feature_reduction,
                                              len(X_train[0]))
            print("\t using " + str(fr[1]))
            pipe = Pipeline([(fr[0], fr[1]), ('rf', cls)])
            classifier = pipe
        else:
            classifier = cls
        model_file = os.path.join(outfolder,
                                  "random-forest_classifier-%s.m" % task)
    if (model_name == "svm_l"):
        print("== SVM, kernel=linear ...")
        cls = svm.LinearSVC(class_weight='balanced',
                            C=0.01,
                            penalty='l2',
                            loss='squared_hinge',
                            multi_class='ovr')
        if feature_reduction is not None:
            fr = create_feature_reduction_alg(feature_reduction,
                                              len(X_train[0]))
            print("\t using " + str(fr[1]))
            pipe = Pipeline([(fr[0], fr[1]), ('svm_l', cls)])
            classifier = pipe
        else:
            classifier = cls
        model_file = os.path.join(outfolder,
                                  "liblinear-svm-linear-%s.m" % task)

    if (model_name == "svm-rbf"):
        # tuned_parameters = [{'gamma': np.logspace(-9, 3, 3), 'probability': [True], 'C': np.logspace(-2, 10, 3)},
        #                     {'C': [1e-1, 1e-3, 1e-5, 0.2, 0.5, 1, 1.2, 1.3, 1.5, 1.6, 1.7, 1.8, 2]}]
        print("== SVM, kernel=rbf ...")
        cls = svm.SVC()
        if feature_reduction is not None:
            fr = create_feature_reduction_alg(feature_reduction,
                                              len(X_train[0]))
            pipe = Pipeline([(fr[0], fr[1]), ('svm-rbf', cls)])
            print("\t using " + str(fr[1]))
            classifier = pipe
        else:
            classifier = cls
        #model_file = os.path.join(outfolder, "liblinear-svm-rbf-%s.m" % task)

    if nfold is not None:
        nfold_predictions = cross_val_predict(classifier,
                                              X_train,
                                              y_train,
                                              cv=nfold)
        #util.save_classifier_model(classifier, model_file)
        util.save_scores(nfold_predictions, y_train, model_name, task,
                         identifier, 3, outfolder)
    else:
        classifier.fit(X_train, y_train)
        #util.save_classifier_model(classifier, model_file)
    return classifier
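
A hedged usage sketch of learn_discriminative on toy data (paths and values below are illustrative, not from the source repo):

import numpy

X = numpy.random.rand(40, 10)       # 40 instances, 10 features
y = numpy.array(["a", "b"] * 20)    # alternating binary labels

# 5-fold cross-validated random forest; scores land in the output folder
rf = learn_discriminative(cpus=2, task="demo", model_name="rf",
                          X_train=X, y_train=y, identifier="example",
                          outfolder="/tmp/out", nfold=5)

# without nfold, the classifier is simply fitted and returned
svm_l = learn_discriminative(2, "demo", "svm_l", X, y, "example", "/tmp/out")
print(svm_l.predict(X[:3]))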
Example No. 4
def learn_dnn(nfold,
              task,
              embedding_model_file,
              text_data,
              X_train_metafeature,
              y_train,
              model_descriptor,
              outfolder,
              prediction_targets,
              text_data_extra_for_embedding_vocab=None,
              embedding_trainable=False,
              embedding_mask_zero=False):
    print("\t== Perform ANN ...")  # create model

    #process text data, index vocabulary, pad each text sentence/paragraph to a fixed length
    M = dmc.extract_vocab_and_2D_input(
        text_data,
        1,
        sentence_length=dmc.DNN_MAX_SENTENCE_LENGTH,
        tweets_extra=text_data_extra_for_embedding_vocab)
    X_train_textfeature = M[0]

    #load pre-trained word embedding model
    gensimFormat = ".gensim" in embedding_model_file
    if gensimFormat:
        pretrained_embedding_models = gensim.models.KeyedVectors.load(
            embedding_model_file, mmap='r')
    else:
        pretrained_embedding_models = gensim.models.KeyedVectors. \
            load_word2vec_format(embedding_model_file, binary=True)

    # build the embedding weight matrix by looking up each vocabulary word in
    # the pre-trained embedding model
    pretrained_word_matrix = dmc.build_pretrained_embedding_matrix(
        M[1], pretrained_embedding_models, dmc.DNN_EMBEDDING_DIM, 2)

    encoder = LabelBinarizer()
    y_train_int = encoder.fit_transform(y_train)
    #y_train_int = to_categorical(numpy.asarray(y_train))

    # now assemble the model based on the descriptor
    model_first_input = Input(
        shape=(dmc.DNN_MAX_SENTENCE_LENGTH, ))  #model input

    # create the embedding layer that maps each input word-index sequence to
    # its embedding representation; it is worth reading this method and trying
    # a few different options to see the difference
    embedding_input = dmc.create_embedding_input(
        sentence_inputs_2D=model_first_input,
        max_sentence_length=dmc.DNN_MAX_SENTENCE_LENGTH,
        word_vocab_size=len(M[1]),
        word_embedding_dim=dmc.DNN_EMBEDDING_DIM,
        word_embedding_weights=pretrained_word_matrix,
        word_embedding_trainable=embedding_trainable,
        word_embedding_mask_zero=embedding_mask_zero)

    # this parses 'model_descriptor' and builds the text-feature submodel on
    # top of the embedding layer
    model_text = dmc.create_submodel_text(input_layer=embedding_input,
                                          model_descriptor=model_descriptor)

    # else:
    #     model_sent_input_2D = Input(shape=(dmc.DNN_MAX_SENTENCE_LENGTH,))  # model input
    #     model_first_input = Input(shape=(dmc.DNN_MAX_DOC_LENGTH, dmc.DNN_MAX_SENTENCE_LENGTH), dtype='int32')
    #
    #     model_text = dmc.create_submodel_textfeature(
    #         # this parses 'model_descriptor' and takes the text-based features as input to the model
    #         sentence_inputs_2D=model_sent_input_2D,
    #         # it is useful to see the details of this method and try a few different options to see difference
    #         max_sentence_length=dmc.DNN_MAX_SENTENCE_LENGTH,
    #         word_vocab_size=len(M[1]),
    #         word_embedding_dim=dmc.DNN_EMBEDDING_DIM,
    #         word_embedding_weights=pretrained_word_matrix,
    #         model_option=model_descriptor,
    #         doc_inputs_3D=model_first_input,
    #         word_embedding_trainable=embedding_trainable,
    #         word_embedding_mask_zero=embedding_mask_zero
    #     )

    if X_train_metafeature is not None:  #if we also want to use other features together with text-based, concatenate them as-is
        model_metafeature_inputs = Input(shape=(len(X_train_metafeature[0]), ))
        model_metafeature = \
            dmc.create_submodel_metafeature(model_metafeature_inputs, 20)
        merge = concatenate([model_text, model_metafeature])
        final = Dense(prediction_targets, activation="softmax")(merge)
        model = Model(inputs=[model_first_input, model_metafeature_inputs],
                      outputs=final)
        X_merge = numpy.concatenate([X_train_textfeature, X_train_metafeature],
                                    axis=1)
    else:
        print("--- using text features ---")
        final = Dense(prediction_targets, activation="softmax")(model_text)
        model = Model(inputs=model_first_input, outputs=final)
        X_merge = X_train_textfeature

    #this prints the model architecture diagram to a file, so you can check that it looks right
    #plot_model(model, to_file="model.png")
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    #model.compile(loss='categorical_crossentropy',
    #              optimizer='rmsprop',
    #              metrics=['acc'])

    model_file = os.path.join(outfolder, "ann-%s.m" % task)

    # perform n-fold validation manually (we can't use scikit-learn's wrapper
    # because the model was built with the Keras functional API)
    if nfold is not None:
        kfold = StratifiedKFold(n_splits=nfold,
                                shuffle=True,
                                random_state=RANDOM_STATE)
        splits = list(enumerate(kfold.split(X_merge, y_train_int.argmax(1))))

        nfold_predictions = dict()
        for k in range(0, len(splits)):
            # NOTE: the same compiled model is re-fit on every fold here, so
            # weights carry over between folds (cf. fit_dnn below, which
            # clones the model per fold)
            X_train_index = splits[k][1][0]
            X_test_index = splits[k][1][1]

            X_train_merge_ = X_merge[X_train_index]
            X_test_merge_ = X_merge[X_test_index]
            y_train_ = y_train_int[X_train_index]

            X_train_text_feature = X_train_merge_[:, 0:len(X_train_textfeature[0])]
            X_train_meta_feature = X_train_merge_[:, len(X_train_textfeature[0]):]

            # y_test = y_train[X_test_index]
            X_test_text_feature = X_test_merge_[:, 0:len(X_train_textfeature[0])]
            X_test_meta_feature = X_test_merge_[:, len(X_train_textfeature[0]):]

            if X_train_metafeature is not None:
                model.fit([X_train_text_feature, X_train_meta_feature],
                          y_train_,
                          epochs=dmc.DNN_EPOCHES,
                          batch_size=dmc.DNN_BATCH_SIZE)
                prediction_prob = model.predict(
                    [X_test_text_feature, X_test_meta_feature])

            else:
                model.fit(X_train_text_feature,
                          y_train_,
                          epochs=dmc.DNN_EPOCHES,
                          batch_size=dmc.DNN_BATCH_SIZE)
                prediction_prob = model.predict(X_test_text_feature)
            # evaluate the model
            #
            predictions = prediction_prob.argmax(axis=-1)

            for i, l in zip(X_test_index, predictions):
                nfold_predictions[i] = l

            # self.save_classifier_model(best_estimator, ann_model_file)

        indexes = sorted(list(nfold_predictions.keys()))
        predicted_labels = []
        for i in indexes:
            predicted_labels.append(nfold_predictions[i])
        util.save_scores(predicted_labels, y_train_int.argmax(1), "dnn", task,
                         model_descriptor, 2, outfolder)
    else:
        if X_train_metafeature is not None:
            model.fit([X_train_textfeature, X_train_metafeature],
                      y_train_int,
                      epochs=dmc.DNN_EPOCHES,
                      batch_size=dmc.DNN_BATCH_SIZE,
                      verbose=2)
        else:
            model.fit(X_train_textfeature,
                      y_train_int,
                      epochs=dmc.DNN_EPOCHES,
                      batch_size=dmc.DNN_BATCH_SIZE,
                      verbose=2)

        # serialize model to YAML
        model_yaml = model.to_yaml()
        with open(model_file + ".yaml", "w") as yaml_file:
            yaml_file.write(model_yaml)
        # serialize weights to HDF5
        model.save_weights(model_file + ".h5")
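
Because the holdout branch persists the architecture as YAML and the weights as HDF5, the model can be restored later. A minimal sketch using the matching (pre-2.6) Keras API, assuming model_file follows the "ann-%s.m" pattern above:

from keras.models import model_from_yaml

# rebuild the architecture from YAML, then load the trained weights
with open(model_file + ".yaml") as yaml_file:
    restored = model_from_yaml(yaml_file.read())
restored.load_weights(model_file + ".h5")
restored.compile(loss='categorical_crossentropy', optimizer='adam',
                 metrics=['accuracy'])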
Example No. 5
def learn_generative(cpus,
                     task,
                     model_name,
                     X_train,
                     y_train,
                     identifier,
                     outfolder,
                     nfold=None,
                     feature_reduction=None):
    classifier = None
    model_file = None
    if (model_name == "nb"):
        print("== Naive Bayes ...")
        cls = MultinomialNB()
        if feature_reduction is not None:
            fr = create_feature_reduction_alg(feature_reduction,
                                              len(X_train[0]))
            print("\t using " + str(fr[1]))
            pipe = Pipeline([(fr[0], fr[1]), ('nb', cls)])
            classifier = pipe
        else:
            classifier = cls
        model_file = os.path.join(outfolder, "nb-classifier-%s.m" % task)

    if (model_name == "sgd"):
        print("== SGD ...")
        # "loss": ["log", "modified_huber", "squared_hinge", 'squared_loss'],
        #               "penalty": ['l2', 'l1'],
        #               "alpha": [0.0001, 0.001, 0.01, 0.03, 0.05, 0.1],
        #               "n_iter": [1000],
        #               "learning_rate": ["optimal"]}
        cls = SGDClassifier(loss='log', penalty='l2', n_jobs=cpus)
        if feature_reduction is not None:
            fr = create_feature_reduction_alg(feature_reduction,
                                              len(X_train[0]))
            print("\t using " + str(fr[1]))
            pipe = Pipeline([(fr[0], fr[1]), ('sgd', cls)])
            classifier = pipe
        else:
            classifier = cls
        model_file = os.path.join(outfolder, "sgd-classifier-%s.m" % task)
    if (model_name == "lr"):
        print("== Stochastic Logistic Regression ...")

        cls = LogisticRegression(random_state=111)
        if feature_reduction is not None:
            fr = create_feature_reduction_alg(feature_reduction,
                                              len(X_train[0]))
            print("\t using " + str(fr[1]))
            pipe = Pipeline([(fr[0], fr[1]), ('lr', cls)])
            classifier = pipe
        else:
            classifier = cls
        model_file = os.path.join(outfolder, "stochasticLR-%s.m" % task)

    if nfold is not None:
        print(y_train.shape)
        nfold_predictions = cross_val_predict(classifier,
                                              X_train,
                                              y_train,
                                              cv=nfold)
        util.save_scores(nfold_predictions, y_train, model_name, task,
                         identifier, 2, outfolder)
    else:
        classifier.fit(X_train, y_train)
        #util.save_classifier_model(classifier, model_file)
    return classifier
Example No. 6
def fit_fasttext_holdout(df: DataFrame, split_at_row: int, class_col: int,
                         outfolder: str, task: str, text_norm_option: int,
                         text_input_info: dict, embedding_file: str):

    encoder = LabelBinarizer()
    # despite the DataFrame type hint, df is indexed here as a 2-D numpy array
    y = df[:, class_col]
    print("\ttotal y rows=" + str(len(y)) + " with unique values=" +
          str(len(set(y))))
    print("\tencoding y labels..." + str(datetime.datetime.now()))

    if len(set(y)) > 2:
        y_int = encoder.fit_transform(y)
    else:
        # binary case: hard-coded one-hot encoding with 'CG' as the first class
        y_int = np.array([[1, 0] if l.strip() == 'CG' else [0, 1] for l in y])

    y_label_lookup = dict()
    y_label_lookup_inverse = dict()
    for index, l in zip(y_int.argmax(1), y):
        y_label_lookup[index] = l
        y_label_lookup_inverse[l] = index
        # print(l+","+str(index))

    X = []
    text_length = 0
    index = 0
    for row in df:
        text = ""
        for b in range(len(text_input_info)):
            info = text_input_info[b]
            t = concate_text(row, info["text_col"])
            t = nlp.normalize(t)
            text_length += int(info["text_length"])
            text += t + " "
        words = nlp.tokenize(text, text_norm_option)
        text = " ".join(words).strip()
        X.append([text])
        index += 1
    X = numpy.asarray(X, dtype=str)

    # hold-out split: rows before split_at_row are training data, the rest are test data

    X_train_ = X[0:split_at_row]
    y_train_ = y[0:split_at_row]
    X_test_ = X[split_at_row:]
    y_test_ = y[split_at_row:]

    # prepare fasttext data
    fasttext_train = outfolder + "/fasttext_train.tsv"
    with open(fasttext_train, mode='w') as outfile:
        csvwriter = csv.writer(outfile,
                               delimiter='\t',
                               quotechar='"',
                               quoting=csv.QUOTE_MINIMAL)
        for i in range(len(X_train_)):
            label = y_train_[i]
            text = X_train_[i][0]
            csvwriter.writerow(["__label__" + label.replace(" ", "|"), text])

        # fasttext_test = outfolder + "/fasttext_test.tsv"
        # with open(fasttext_test, mode='w') as outfile:
        #     csvwriter = csv.writer(outfile, delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        #     for i in range(len(X_test_)):
        #         label = y_test_[i]
        #         text = X_test_[i][0]
        #         csvwriter.writerow(["__label__" + label, text])

        # -dim 300 -minn 4 -maxn 10 -wordNgrams 3 -neg 10 -loss ns -epoch 3000 -thread 30
    if embedding_file is not None and embedding_file.lower() != 'none':
        model = fasttext.train_supervised(input=fasttext_train,
                                          minn=4,
                                          maxn=10,
                                          wordNgrams=3,
                                          neg=10,
                                          loss='ns',
                                          epoch=3000,
                                          thread=30,
                                          dim=dmc.DNN_EMBEDDING_DIM,
                                          pretrainedVectors=embedding_file)
    else:
        model = fasttext.train_supervised(input=fasttext_train,
                                          minn=4,
                                          maxn=10,
                                          wordNgrams=3,
                                          neg=10,
                                          loss='ns',
                                          epoch=3000,
                                          thread=30,
                                          dim=dmc.DNN_EMBEDDING_DIM)
    # evaluate the model

    X_test_as_list = []
    for row in X_test_:
        X_test_as_list.append(row[0])
    predictions = model.predict(X_test_as_list)[0]

    predicted_labels = []
    for i in predictions:
        label = i[0]
        l = label[9:]  # strip the "__label__" prefix (9 characters)
        l = l.replace("|", " ")  # undo the space-to-'|' escaping used when writing the training file
        predicted_labels.append(y_label_lookup_inverse[l])

    util.save_scores(predicted_labels, y_int[split_at_row:, :].argmax(1),
                     "dnn", task, "_fasttext_", 3, outfolder)
Example No. 7
def fit_fasttext(df: DataFrame, nfold: int, class_col: int, outfolder: str,
                 task: str, text_norm_option: int, text_input_info: dict,
                 embedding_file: str):
    print("\t running fasttext using embedding file=" + str(embedding_file))
    encoder = LabelBinarizer()
    y = df[:, class_col]

    y_int = encoder.fit_transform(y)
    y_label_lookup = dict()
    y_label_lookup_inverse = dict()
    for index, l in zip(y_int.argmax(1), y):
        y_label_lookup[index] = l
        y_label_lookup_inverse[l] = index
        # print(l+","+str(index))

    X = []
    text_length = 0
    index = 0
    for row in df:
        text = ""
        for b in range(len(text_input_info)):
            info = text_input_info[b]
            text += concate_text(row, info["text_col"]) + " "
            text_length += int(info["text_length"])
        text = nlp.normalize(text)
        words = nlp.tokenize(text, text_norm_option)
        text = " ".join(words).strip()
        X.append([text])
        index += 1
    X = numpy.asarray(X, dtype=str)

    # perform n-fold validation manually (fastText has its own training loop,
    # so scikit-learn's cross-validation wrapper doesn't apply)
    kfold = StratifiedKFold(n_splits=nfold,
                            shuffle=True,
                            random_state=cl.RANDOM_STATE)
    splits = list(enumerate(kfold.split(X, y_int.argmax(1))))

    nfold_predictions = dict()
    for k in range(0, len(splits)):
        print("\tnfold=" + str(k))

        # Fit the model
        X_train_index = splits[k][1][0]
        X_test_index = splits[k][1][1]

        X_train_ = X[X_train_index]
        y_train_ = y[X_train_index]
        X_test_ = X[X_test_index]
        y_test_ = y[X_test_index]

        # prepare fasttext data
        fasttext_train = outfolder + "/fasttext_train.tsv"
        with open(fasttext_train, mode='w') as outfile:
            csvwriter = csv.writer(outfile,
                                   delimiter='\t',
                                   quotechar='"',
                                   quoting=csv.QUOTE_MINIMAL)
            for i in range(len(X_train_)):
                label = y_train_[i]
                text = X_train_[i][0]
                csvwriter.writerow(
                    ["__label__" + label.replace(" ", "|"), text])

        # fasttext_test = outfolder + "/fasttext_test.tsv"
        # with open(fasttext_test, mode='w') as outfile:
        #     csvwriter = csv.writer(outfile, delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        #     for i in range(len(X_test_)):
        #         label = y_test_[i]
        #         text = X_test_[i][0]
        #         csvwriter.writerow(["__label__" + label, text])

        # -dim 300 -minn 4 -maxn 10 -wordNgrams 3 -neg 10 -loss ns -epoch 3000 -thread 30
        if embedding_file is not None:
            model = fasttext.train_supervised(input=fasttext_train,
                                              minn=4,
                                              maxn=10,
                                              wordNgrams=3,
                                              neg=10,
                                              loss='ns',
                                              epoch=3000,
                                              thread=30,
                                              dim=dmc.DNN_EMBEDDING_DIM,
                                              pretrainedVectors=embedding_file)
        else:
            model = fasttext.train_supervised(input=fasttext_train,
                                              minn=4,
                                              maxn=10,
                                              wordNgrams=3,
                                              neg=10,
                                              loss='ns',
                                              epoch=3000,
                                              thread=30,
                                              dim=dmc.DNN_EMBEDDING_DIM)

        # evaluate the model
        X_test_as_list = []
        for row in X_test_:
            X_test_as_list.append(row[0])
        predictions = model.predict(X_test_as_list)[0]

        for i in range(len(X_test_index)):
            index = X_test_index[i]
            label = predictions[i][0]
            l = label[9:]
            l = l.replace("|", " ")
            nfold_predictions[index] = y_label_lookup_inverse[l]

    indexes = sorted(list(nfold_predictions.keys()))
    predicted_labels = []
    for i in indexes:
        predicted_labels.append(nfold_predictions[i])

    util.save_scores(predicted_labels, y_int.argmax(1), "dnn", task,
                     "_fasttext_", 3, outfolder)
Example No. 8
def fit_dnn_holdout(df: DataFrame,
                    split_at_row: int,
                    class_col: int,
                    final_model: Model,
                    outfolder: str,
                    task: str,
                    model_descriptor: str,
                    text_norm_option: int,
                    text_input_info: dict,
                    embedding_model,
                    embedding_model_format,
                    word_weights: list = None):
    encoder = LabelBinarizer()
    y = df[:, class_col]
    print("\ttotal y rows=" + str(len(y)) + " with unique values=" +
          str(len(set(y))))
    print("\tencoding y labels..." + str(datetime.datetime.now()))

    if len(set(y)) > 2:
        y_int = encoder.fit_transform(y)
    else:
        y_int = np.array([[1, 0] if l.strip() == 'CG' else [0, 1] for l in y])

    print("\tcreating y labels dictionary..." + str(datetime.datetime.now()))
    y_label_lookup = dict()
    y_label_lookup_inverse = dict()
    for index, l in zip(y_int.argmax(1), y):
        y_label_lookup[index] = l
        y_label_lookup_inverse[l] = index

    model_file = os.path.join(outfolder, "ann-%s.m" % task)

    print("\tspliting to train/test..." + str(datetime.datetime.now()))
    df_train = df[0:split_at_row]
    df_test = df[split_at_row:]

    # nfold_predictions = dict()

    print("\ttraining..." + str(datetime.datetime.now()))
    final_model.compile(loss='categorical_crossentropy',
                        optimizer='adam',
                        metrics=['accuracy'])

    training_generator = data_generator(
        df=df_train,
        class_col=class_col,
        classes=y_label_lookup_inverse,
        batch_size=dmc.DNN_BATCH_SIZE,
        text_norm_option=text_norm_option,
        embedding_model=embedding_model,
        text_input_info=text_input_info,
        embedding_format=embedding_model_format,
        word_weights=word_weights)

    training_steps_per_epoch = round(len(df_train) / dmc.DNN_BATCH_SIZE)

    final_model.fit_generator(training_generator,
                              steps_per_epoch=training_steps_per_epoch,
                              epochs=dmc.DNN_EPOCHES)

    print("\ttesting...")
    test_generator = data_generator(df=df_test,
                                    class_col=class_col,
                                    classes=y_label_lookup_inverse,
                                    batch_size=len(df_test),
                                    text_norm_option=text_norm_option,
                                    embedding_model=embedding_model,
                                    text_input_info=text_input_info,
                                    embedding_format=embedding_model_format,
                                    shuffle=False,
                                    word_weights=word_weights)
    prediction_prob = final_model.predict_generator(test_generator, steps=1)

    # evaluate the model
    #
    predictions = prediction_prob.argmax(axis=-1)

    # evaluate the model
    util.save_scores(predictions, y_int[split_at_row:, :].argmax(1), "dnn",
                     task, model_descriptor, 3, outfolder)
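
data_generator is project-internal and not shown in these examples. Purely as an assumption-labelled sketch, a generator compatible with fit_generator/predict_generator yields (X_batch, y_batch) tuples indefinitely, roughly like:

import numpy

def toy_data_generator(X, y, batch_size, shuffle=True):
    # hypothetical stand-in for the project's data_generator: yields
    # (features, one-hot labels) batches forever, as fit_generator expects
    n = len(X)
    while True:
        order = numpy.random.permutation(n) if shuffle else numpy.arange(n)
        for start in range(0, n, batch_size):
            idx = order[start:start + batch_size]
            yield X[idx], y[idx]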
Example No. 9
def fit_dnn(df: DataFrame,
            nfold: int,
            class_col: int,
            final_model: Model,
            outfolder: str,
            task: str,
            model_descriptor: str,
            text_norm_option: int,
            text_input_info: dict,
            embedding_model,
            embedding_model_format,
            word_weights: list = None):
    encoder = LabelBinarizer()
    y = df[:, class_col]

    y_int = encoder.fit_transform(y)
    y_label_lookup = dict()
    y_label_lookup_inverse = dict()
    for index, l in zip(y_int.argmax(1), y):
        y_label_lookup[index] = l
        y_label_lookup_inverse[l] = index

    model_file = os.path.join(outfolder, "ann-%s.m" % task)
    model_copies = []
    for i in range(nfold):
        model_copy = clone_model(final_model)
        model_copy.set_weights(final_model.get_weights())
        model_copies.append(model_copy)

    kfold = StratifiedKFold(n_splits=nfold,
                            shuffle=True,
                            random_state=cl.RANDOM_STATE)
    splits = list(enumerate(kfold.split(df, y_int.argmax(1))))

    nfold_predictions = dict()
    for k in range(0, len(splits)):
        print("\tnfold=" + str(k))
        nfold_model = model_copies[k]
        nfold_model.compile(loss='categorical_crossentropy',
                            optimizer='adam',
                            metrics=['accuracy'])

        # Fit the model
        X_train_index = splits[k][1][0]
        X_test_index = splits[k][1][1]

        X_train_merge_ = df[X_train_index]
        X_test_merge_ = df[X_test_index]
        y_train_ = y_int[X_train_index]
        y_test_ = y_int[X_test_index]

        # df, batch_size, text_norm_option, classes, ft_model, text_col_info:list
        training_generator = data_generator(
            df=X_train_merge_,
            class_col=class_col,
            classes=y_label_lookup_inverse,
            batch_size=dmc.DNN_BATCH_SIZE,
            text_norm_option=text_norm_option,
            embedding_model=embedding_model,
            text_input_info=text_input_info,
            embedding_format=embedding_model_format,
            word_weights=word_weights)

        training_steps_per_epoch = round(
            len(X_train_merge_) / dmc.DNN_BATCH_SIZE)

        nfold_model.fit_generator(training_generator,
                                  steps_per_epoch=training_steps_per_epoch,
                                  epochs=dmc.DNN_EPOCHES)

        test_generator = data_generator(
            df=X_test_merge_,
            class_col=class_col,
            classes=y_label_lookup_inverse,
            batch_size=len(X_test_merge_),
            text_norm_option=text_norm_option,
            embedding_model=embedding_model,
            text_input_info=text_input_info,
            embedding_format=embedding_model_format,
            shuffle=False,
            word_weights=word_weights)
        prediction_prob = nfold_model.predict_generator(test_generator,
                                                        steps=1)

        # evaluate the model
        #
        predictions = prediction_prob.argmax(axis=-1)

        for i, l in zip(X_test_index, predictions):
            nfold_predictions[i] = l

        del nfold_model

    indexes = sorted(list(nfold_predictions.keys()))
    predicted_labels = []
    for i in indexes:
        predicted_labels.append(nfold_predictions[i])
    util.save_scores(predicted_labels, y_int.argmax(1), "dnn", task,
                     model_descriptor, 3, outfolder)
Example No. 10
def fit_dnn(inputs: list, nfold: int, y_train,
            final_model: Model, outfolder: str,
            task: str, model_descriptor: str):
    encoder = LabelBinarizer()
    y_train_int = encoder.fit_transform(y_train)
    y_train_label_lookup = dict()
    for index, l in zip(y_train_int.argmax(1), y_train):
        y_train_label_lookup[index] = l

    # merge the inputs so the same split applies across all feature inputs
    X_merge = numpy.concatenate(inputs, axis=1)

    model_file = os.path.join(outfolder, "ann-%s.m" % task)

    # perform n-fold validation manually (we can't use scikit-learn's wrapper
    # because the model was built with the Keras functional API)
    if nfold is not None:
        # clone the untrained model once per fold so each fold starts from the
        # same initial weights (range(nfold) would fail if nfold were None)
        model_copies = []
        for i in range(nfold):
            model_copy = clone_model(final_model)
            model_copy.set_weights(final_model.get_weights())
            model_copies.append(model_copy)

        kfold = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=cl.RANDOM_STATE)
        splits = list(enumerate(kfold.split(X_merge, y_train_int.argmax(1))))

        nfold_predictions = dict()
        for k in range(0, len(splits)):
            print("\tnfold=" + str(k))
            nfold_model = model_copies[k]
            nfold_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

            # Fit the model
            X_train_index = splits[k][1][0]
            X_test_index = splits[k][1][1]

            X_train_merge_ = X_merge[X_train_index]
            X_test_merge_ = X_merge[X_test_index]
            y_train_ = y_train_int[X_train_index]
            y_test_ = y_train_int[X_test_index]

            separate_training_feature_inputs = []  # to contain features for training set coming from different input branches
            separate_testing_feature_inputs = []
            index_start = 0
            for feature_input in inputs:
                length = len(feature_input[0])
                index_end = index_start + length
                slice_train = X_train_merge_[:, index_start:index_end]
                slice_test = X_test_merge_[:, index_start:index_end]
                separate_training_feature_inputs.append(slice_train)
                separate_testing_feature_inputs.append(slice_test)
                index_start = index_end

            nfold_model.fit(separate_training_feature_inputs,
                            y_train_, epochs=dmc.DNN_EPOCHES, batch_size=dmc.DNN_BATCH_SIZE)
            prediction_prob = nfold_model.predict(separate_testing_feature_inputs)

            # evaluate the model
            #
            predictions = prediction_prob.argmax(axis=-1)

            for i, l in zip(X_test_index, predictions):
                nfold_predictions[i] = l

            del nfold_model

        indexes = sorted(list(nfold_predictions.keys()))
        predicted_labels = []
        for i in indexes:
            predicted_labels.append(nfold_predictions[i])
        util.save_scores(predicted_labels, y_train_int.argmax(1), "dnn", task, model_descriptor, 3,
                         outfolder)
    else:
        final_model.fit(inputs,
                        y_train_int, epochs=dmc.DNN_EPOCHES, batch_size=dmc.DNN_BATCH_SIZE, verbose=2)

        # serialize model to YAML
        model_yaml = final_model.to_yaml()
        with open(model_file + ".yaml", "w") as yaml_file:
            yaml_file.write(model_yaml)
        # serialize weights to HDF5
        final_model.save_weights(model_file + ".h5")
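
The per-branch slicing in the n-fold loop above can be sanity-checked on a toy case; a minimal sketch of recovering the original feature blocks from the merged matrix:

import numpy

a = numpy.arange(12).reshape(3, 4)   # branch 1: width 4
b = numpy.arange(6).reshape(3, 2)    # branch 2: width 2
merged = numpy.concatenate([a, b], axis=1)

slices, start = [], 0
for block in [a, b]:
    end = start + len(block[0])
    slices.append(merged[:, start:end])
    start = end

assert (slices[0] == a).all() and (slices[1] == b).all()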