Пример #1
0
def main():
    """Run the whole experiment, from resource download to the accuracy report.

    Steps, in the order they are executed:

    1. Download the NLTK ``stopwords`` resource, call ``download_extract`` on
       the NILC driver (GloVe, 300 dimensions), then preprocess the G1 news
       file and save its sentences.
    2. Preprocess the MCI data set, which yields the samples and their targets.
    3. Read ``data/processed_g1_final.txt`` line by line, format both corpora
       through the autoencoder, then build, fit and save the autoencoder.
    4. Encode the training inputs with ``encoder`` and ``noise_encoder``;
       outputs of the former are labelled 0, outputs of the latter 1.
    5. Fit a logistic regression on those encodings and print its accuracy on
       the encoded MCI samples, scored against the MCI targets.
    """
    # Resource download and corpus preprocessing
    nltk.download('stopwords')
    embeddings = NILCDriver(emb_type='glove', subtype='cbow', dimension=300)
    embeddings.download_extract()
    news = NewsG1Driver(file_name='g1_final.txt')
    news.preprocess()
    news.save_sentences()

    # MCI data: samples feed the test set, targets are the reference labels
    mci_driver = MCIDriver(file_name='data_set_cohmetrix_cn_trh_1.0.json')
    mci_samples, mci_labels = mci_driver.preprocess()

    # One list entry per line of the processed news file (newlines kept)
    with open('data/processed_g1_final.txt') as corpus_file:
        sentences = list(corpus_file)

    # Autoencoder set-up and training
    embedding_path = embeddings.file_name + '.txt'
    autoencoder = Autoencoder(
        encoding_dim=500,
        embedding_dim=300,
        embedding_file=embedding_path,
    )
    train_inputs, train_targets, vocabulary, target_shape = autoencoder.data_formatting(sentences)

    # Only the first element of the formatted MCI data is needed
    test_inputs, _, _, _ = autoencoder.data_formatting(mci_samples)

    autoencoder.build(vocabulary, target_shape)
    autoencoder.fit(train_inputs, train_targets, 10, 128)
    autoencoder.save_model_single_file()

    # Classifier training set: encoder outputs -> 0, noise-encoder outputs -> 1
    clean_codes = autoencoder.encoder.predict(train_inputs)
    noise_codes = autoencoder.noise_encoder.predict(train_inputs)
    classifier_inputs = np.concatenate([clean_codes, noise_codes])
    classifier_labels = [0] * len(clean_codes) + [1] * len(noise_codes)

    # Classifier test set: encoded MCI samples
    test_codes = autoencoder.encoder.predict(test_inputs)

    classifier = LogisticRegression()
    classifier.fit(classifier_inputs, classifier_labels)

    predictions = classifier.predict(test_codes)
    accuracy = accuracy_score(mci_labels, predictions)
    print("Accuracy: %0.2f" % accuracy)