예제 #1
0
def main():
    """Train a CNN classifier on the Amazon review data and print test metrics.

    Steps, in order: load and preprocess the dataset, hold out 20% of it
    for testing, build a vocabulary from the training texts, convert both
    splits to padded id sequences, load fastText vectors and pass them
    through ``filter_embeddings`` with the vocabulary's word index, train
    with early stopping and checkpointing, then reload the model saved at
    ``model_path`` and print binary precision, recall and F1 for the
    held-out split.
    """
    # Hyperparameters and output path.
    num_words = 40000
    num_label = 2
    maxlen = 300
    batch_size = 128
    epochs = 100
    model_path = "cnn_model.h5"

    texts, labels = load_dataset(
        "data/amazon_reviews_multilingual_JP_v1_00.tsv")
    texts = preprocess_dataset(texts)
    x_train, x_test, y_train, y_test = train_test_split(
        texts, labels, test_size=0.2, random_state=42)

    # The vocabulary is built from the training split only.
    vocab = build_vocabulary(x_train, num_words)

    def to_padded_ids(split):
        # Texts -> id sequences -> arrays of length ``maxlen``.
        sequences = vocab.texts_to_sequences(split)
        return pad_sequences(sequences, maxlen=maxlen, truncating="post")

    x_train = to_padded_ids(x_train)
    x_test = to_padded_ids(x_test)

    embeddings = filter_embeddings(
        load_fasttext("data/cc.ja.300.vec.gz"),
        vocab.word_index,
        num_words,
    )

    model = CNNModel(num_words, num_label, embeddings=embeddings).build()
    model.compile(
        optimizer="adam",
        loss="sparse_categorical_crossentropy",
        metrics=["acc"],
    )

    # Stop early (patience of 3) and checkpoint only the best model.
    callbacks = [
        EarlyStopping(patience=3),
        ModelCheckpoint(model_path, save_best_only=True),
    ]
    model.fit(
        x=x_train,
        y=y_train,
        batch_size=batch_size,
        epochs=epochs,
        validation_split=0.2,
        callbacks=callbacks,
        shuffle=True,
    )

    # Evaluate the checkpointed model rather than the in-memory one.
    best_model = load_model(model_path)
    api = InferenceAPI(best_model, vocab, preprocess_dataset)
    y_pred = api.predict_from_sequence(x_test)

    reports = (
        ("precision: {:.4f}", precision_score),
        ("recall: {:.4f}", recall_score),
        ("f1: {:.4f}", f1_score),
    )
    for template, scorer in reports:
        print(template.format(scorer(y_test, y_pred, average="binary")))
예제 #2
0
def train():
    """Train the CNN model on the labelled tweets stored in ``data/df_tweets``.

    Reads the tweet table, preprocesses the ``text`` column, drops rows with
    missing values and rows flagged ``Irrelevant``, and uses the remaining
    four label columns as the target. The vocabulary built from the
    training split is pickled to ``model/tokenizer.pickle`` before the
    model is trained with early stopping and checkpointing.

    NOTE(review): ``num_words``, ``maxlen``, ``num_label``, ``model_path``,
    ``batch_size`` and ``epochs`` are not assigned in this function --
    presumably module-level settings; confirm against the rest of the file.
    """
    tweets = pd.read_csv("data/df_tweets", index_col=0)
    tweets["text"] = preprocess_dataset(tweets["text"])
    tweets = tweets.dropna(how="any")

    # Discard every row whose "Irrelevant" flag is set.
    irrelevant_rows = tweets.index[tweets["Irrelevant"] == 1]
    tweets = tweets.drop(irrelevant_rows)

    texts = tweets["text"]
    # "Irrelevant" is not a target column: those rows were removed above.
    labels = np.asarray(tweets[["posi_and_nega", "posi", "nega", "neutral"]])

    x_train, x_test, y_train, y_test = train_test_split(
        texts, labels, test_size=0.2, random_state=42)

    # Persist the vocabulary built from the training split.
    vocab = build_vocabulary(x_train, num_words)
    with open("model/tokenizer.pickle", "wb") as fh:
        pickle.dump(vocab, fh, protocol=pickle.HIGHEST_PROTOCOL)

    # Encode both splits as padded id sequences. The test split is prepared
    # here but is not used in the visible part of this function.
    x_train, x_test = (
        pad_sequences(
            vocab.texts_to_sequences(split),
            maxlen=maxlen,
            truncating="post",
        )
        for split in (x_train, x_test)
    )

    embeddings = filter_embeddings(
        KeyedVectors.load("model/word2vec.model", mmap="r"),
        vocab.word_index,
        num_words,
    )

    model = CNNModel(num_words, num_label, embeddings=embeddings).build()
    model.compile(
        optimizer="adam",
        loss="binary_crossentropy",
        metrics=["acc"],
    )

    # Stop early (patience of 3) and checkpoint only the best model.
    callbacks = [
        EarlyStopping(patience=3),
        ModelCheckpoint(model_path, save_best_only=True),
    ]
    model.fit(
        x=x_train,
        y=y_train,
        batch_size=batch_size,
        epochs=epochs,
        validation_split=0.5,
        callbacks=callbacks,
        shuffle=True,
    )