Example No. 1
def get_sets(self):
    """Return one balanced list of documents per fold."""
    documents_by_class = Dataset.get_documents_by_class(
        self.dataset.documents)
    sets = []
    for i in range(self.folds):
        fold = []
        for c in documents_by_class:
            # Fold i takes a contiguous block of floor(len / folds) documents
            # from every class, so all folds stay balanced across classes.
            n = floor(len(documents_by_class[c]) / self.folds)
            for j in range(n):
                fold.append(documents_by_class[c][i * n + j])
        sets.append(fold)
    return sets
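For reference, the same fold construction written as a standalone helper (the name `balanced_folds` is illustrative, not part of the project). Note that the last `len(docs) % folds` documents of each class never end up in any fold.
from math import floor

def balanced_folds(documents_by_class, folds):
    """Illustrative sketch of the loop above as a free function."""
    sets = []
    for i in range(folds):
        fold = []
        for docs in documents_by_class.values():
            n = floor(len(docs) / folds)
            # Fold i gets the slice [i*n, (i+1)*n) of this class; the
            # remainder documents are left out of every fold.
            fold.extend(docs[i * n:(i + 1) * n])
        sets.append(fold)
    return sets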
Example No. 2
def test_get_documents_by_class(self):
    documents = [
        ClassDocument("A positive document", "+"),
        ClassDocument("Another positive document", "+"),
        ClassDocument("Yet one more positive document", "+"),
        ClassDocument("This time a negative one", "-"),
        ClassDocument("Still negative", "-")
    ]
    documents_by_class = Dataset.get_documents_by_class(documents)
    expected_documents_by_class = {
        "+": [documents[0], documents[1], documents[2]],
        "-": [documents[3], documents[4]]
    }
    self.assertEqual(documents_by_class, expected_documents_by_class)
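The test pins down the expected behaviour of `Dataset.get_documents_by_class`: group documents by class label while preserving input order. A minimal standalone sketch under that assumption (the `label` attribute name on `ClassDocument` is a guess; the real class may store it differently):
from collections import defaultdict

def get_documents_by_class(documents):
    """Group documents by class label, preserving input order."""
    documents_by_class = defaultdict(list)
    for document in documents:
        # `document.label` is an assumed attribute name for the class symbol
        # ("+" or "-") passed as the second ClassDocument argument.
        documents_by_class[document.label].append(document)
    return dict(documents_by_class)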
Example No. 3
def test_get_split_documents(self):
    documents = [
        ClassDocument("pos1", "+"),
        ClassDocument("pos2", "+"),
        ClassDocument("pos3", "+"),
        ClassDocument("pos4", "+"),
        ClassDocument("pos5", "+"),
        ClassDocument("neg1", "-"),
        ClassDocument("neg2", "-"),
        ClassDocument("neg3", "-"),
        ClassDocument("neg4", "-"),
        ClassDocument("neg5", "-"),
    ]
    split_documents = Dataset.get_split_documents(documents, 0.8)
    # Expected result is a (train, test) tuple, split per class at the 0.8 ratio.
    expected_split_documents = ([
        documents[0], documents[1], documents[2], documents[3],
        documents[5], documents[6], documents[7], documents[8]
    ], [documents[4], documents[9]])
    self.assertEqual(split_documents, expected_split_documents)
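Read together with the previous test, this fixes the contract of `Dataset.get_split_documents`: split each class separately, sending the first `ratio` share to the training list and the rest to the test list. A minimal sketch (again assuming the grouping helper sketched above):
from math import floor

def get_split_documents(documents, ratio):
    """Per-class split: first `ratio` share of each class to train, rest to test."""
    documents_by_class = get_documents_by_class(documents)
    train, test = [], []
    for docs in documents_by_class.values():
        cut = floor(len(docs) * ratio)
        train.extend(docs[:cut])
        test.extend(docs[cut:])
    return train, test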
Example No. 4
def train_nb(filename,
             label_col='label',
             text_col='text',
             validation_split=0.3):
    dataset = Dataset(filename, label_col=label_col, text_col=text_col)
    dataset.load()
    dataset.preprocess_texts(stemming=True)

    data = dataset.cleaned_data.copy()
    train = pd.DataFrame(columns=['label', 'text'])
    validation = pd.DataFrame(columns=['label', 'text'])
    for label in data.label.unique():
        label_data = data[data.label == label]
        train_data, validation_data = train_test_split(
            label_data, test_size=validation_split)
        train = pd.concat([train, train_data])
        validation = pd.concat([validation, validation_data])

    text_clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', MultinomialNB())])
    tuned_parameters = {
        'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
        'tfidf__use_idf': (True, False),
        'tfidf__norm': ('l1', 'l2'),
        'clf__alpha': [1, 1e-1, 1e-2]
    }

    x_train = train.text
    y_train = train.label
    x_validation = validation.text
    y_validation = validation.label

    print('Running Multinomial Naive Bayes...')
    start = time()
    model = GridSearchCV(text_clf, tuned_parameters, n_jobs=4, cv=10)
    model.fit(x_train, y_train)
    print('Finished in: {} mins'.format(round((time() - start) / 60, 2)))

    print('Testing Model...')
    results = model.predict(x_validation)
    print(classification_report(y_validation, results, digits=4))

    filepath = Path('models/emotion_recognition/nb_model.pickle').resolve()

    with filepath.open('wb') as file:
        pickle.dump(model, file)
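A hypothetical invocation and a quick way to reuse the pickled `GridSearchCV` object afterwards. The data file path below is an assumption, and since the training texts were stemmed during preprocessing, unstemmed input may score slightly differently.
import pickle
from pathlib import Path

# Hypothetical dataset path; default column names as in the signature above.
train_nb('data/emotions.csv', label_col='label', text_col='text')

# Reload the fitted pipeline and classify raw strings; the CountVectorizer
# step handles tokenisation of the input texts.
with Path('models/emotion_recognition/nb_model.pickle').open('rb') as file:
    model = pickle.load(file)
print(model.best_params_)
print(model.predict(['I am so happy today']))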
Example No. 5
def train(model_type,
          dataset_path,
          tokenizer_path,
          save_dir,
          glove_embeddings,
          word2vec_embeddings,
          token_index_path,
          ngram_range,
          max_length,
          label_col='label',
          text_col='text',
          validation_split=0.3,
          embedding_dim=100,
          input_length=100,
          learning_rate=1e-3,
          epochs=10,
          batch_size=32):
    dataset = Dataset(dataset_path, label_col=label_col, text_col=text_col)
    dataset.load()
    dataset.preprocess_texts(stemming=True)

    tokenizer_file = Path(tokenizer_path).resolve()
    with tokenizer_file.open('rb') as file:
        tokenizer = pickle.load(file)

    data = dataset.cleaned_data.copy()
    train = pd.DataFrame(columns=['label', 'text'])
    validation = pd.DataFrame(columns=['label', 'text'])
    for label in data.label.unique():
        label_data = data[data.label == label]
        train_data, validation_data = train_test_split(
            label_data, test_size=validation_split)
        train = pd.concat([train, train_data])
        validation = pd.concat([validation, validation_data])

    embedding_layer = None
    if glove_embeddings:
        # Build a word -> vector lookup from the GloVe text file.
        embeddings_index = {}
        with Path(os.getenv("GLOVE_EMBEDDINGS")).open() as f:
            for line in f:
                values = line.split()
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                embeddings_index[word] = coefs

        embedding_dim = int(os.getenv("EMBEDDING_DIM"))
        embedding_matrix = np.zeros(
            (len(tokenizer.word_index) + 1, embedding_dim))
        for word, i in tokenizer.word_index.items():
            embedding_vector = embeddings_index.get(word)
            # Words missing from the GloVe index keep their all-zero row.
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector

        embedding_layer = Embedding(len(tokenizer.word_index) + 1,
                                    embedding_dim,
                                    weights=[embedding_matrix],
                                    input_length=input_length,
                                    trainable=True)

    if word2vec_embeddings:
        embedding = KeyedVectors.load_word2vec_format(
            os.getenv("WORD2VEC_EMBEDDINGS"))
        embedding_matrix = np.zeros(
            (len(tokenizer.word_index) + 1, embedding.vector_size))
        for word, i in tokenizer.word_index.items():
            try:
                embedding_matrix[i] = embedding.get_vector(word)
            except KeyError:
                # Word not in the word2vec vocabulary; keep the all-zero row.
                pass

        embedding_layer = Embedding(input_dim=len(tokenizer.word_index) + 1,
                                    output_dim=embedding.vector_size,
                                    weights=[embedding_matrix],
                                    input_length=input_length,
                                    trainable=False,
                                    input_shape=(input_length, ))

    input_dim = min(tokenizer.num_words, len(tokenizer.word_index) + 1)

    train_sequences = [text.split() for text in train.text]
    validation_sequences = [text.split() for text in validation.text]
    list_tokenized_train = tokenizer.texts_to_sequences(train_sequences)
    list_tokenized_validation = tokenizer.texts_to_sequences(
        validation_sequences)

    if model_type == 'fasttext':
        with Path(token_index_path).resolve().open('rb') as file:
            token_index = pickle.load(file)
        list_tokenized_train = add_ngram(train_sequences, token_index,
                                         ngram_range)
        list_tokenized_validation = add_ngram(validation_sequences,
                                              token_index, ngram_range)
        input_dim = max_length

    model = NLP_MODEL[model_type](input_length,
                                  input_dim,
                                  embedding_layer,
                                  embedding_dim=embedding_dim)
    optimizer = Adam(learning_rate)
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    print(model.summary())

    x_train = pad_sequences(list_tokenized_train, maxlen=input_length)
    x_validation = pad_sequences(list_tokenized_validation,
                                 maxlen=input_length)

    # Map label 4 to 1 so the targets are binary (0/1) for binary_crossentropy.
    y_train = train.label.replace(4, 1)
    y_validation = validation.label.replace(4, 1)

    model_name = model_type + '_' + str(embedding_dim) + '_' + str(
        input_length)
    checkpoint_path = os.path.join(
        save_dir, 'checkpoints', model_name + '_{epoch:02d}-{val_acc:.4f}.h5')
    log_dir = os.path.join(save_dir, 'logs', model_name)

    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    model.fit(x_train,
              y=y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=(x_validation, y_validation),
              callbacks=[
                  checkpoints(checkpoint_path),
                  tensorboard(log_dir, batch_size),
                  early_stopping(10),
                  reduce_lr(5)
              ])

    model_file = Path(save_dir, model_name + '.h5').resolve()
    model.save_weights(model_file.as_posix())
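A hypothetical call for the fastText variant. Every path and hyper-parameter below is an assumption; when `glove_embeddings` or `word2vec_embeddings` is enabled, the GLOVE_EMBEDDINGS / EMBEDDING_DIM or WORD2VEC_EMBEDDINGS environment variables must point at the corresponding files.
train(model_type='fasttext',
      dataset_path='data/tweets.csv',
      tokenizer_path='models/tokenizer.pickle',
      save_dir='models/emotion_recognition',
      glove_embeddings=False,
      word2vec_embeddings=False,
      token_index_path='models/token_index.pickle',
      ngram_range=2,
      max_length=20000,
      input_length=100,
      epochs=10,
      batch_size=32)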
Example No. 6
# cross_validator.py <folds> <dataset> <classifier> <options>

if __name__ == '__main__':
    print(" ")
    usage = "cross_validator.py <folds> <dataset> <classifier> [<options>]\n"
    arguments = sys.argv
    if len(arguments) >= 4:
        folds = 0
        try:
            folds = int(arguments[1])
        except ValueError:
            print("Invalid format for number of folds (should be integer).")
            print("\n  >  " + usage)
            sys.exit()
        dataset = Dataset.get_dataset(arguments[2])
        if dataset is not None:
            cv = None
            if arguments[3] == "naive_bayes":
                cv = NaiveBayesCrossValidator(dataset, folds)
            elif arguments[3] == "knn":
                if len(arguments) == 5:
                    k = 0
                    try:
                        k = int(arguments[4])
                    except ValueError:
                        print("Invalid format for number of neighbours (should be integer).")
                        print(
                            "\n  >  cross_validator.py <folds> <dataset> <classifier> <neighbours>\n"
                        )
                        sys.exit()