def main():
    # Set hyper-parameters.
    args = sys.argv
    batch_size = 128
    epochs = 100
    maxlen = 300
    model_path = 'models/cnn_model.h5'
    num_words = 40000
    num_label = 2

    # Data loading.
    x, y = load_dataset('data/amazon_reviews_multilingual_JP_v1_00.tsv')

    # Pre-processing.
    x = preprocess_dataset(x)
    x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                        test_size=0.2,
                                                        random_state=42)
    vocab = build_vocabulary(x_train, num_words)
    x_train = vocab.texts_to_sequences(x_train)
    x_test = vocab.texts_to_sequences(x_test)
    x_train = pad_sequences(x_train, maxlen=maxlen, truncating='post')
    x_test = pad_sequences(x_test, maxlen=maxlen, truncating='post')

    # Load pretrained embeddings only when 't' is passed on the command line.
    # sys.argv[0] is the script name, so the flag is argv[1].
    emb_flg = args[1] if len(args) > 1 else 'f'
    if emb_flg == 't':
        wv = load_fasttext('../chap08/models/cc.ja.300.vec.gz')
        wv = filter_embeddings(wv, vocab.word_index, num_words)
    else:
        wv = None

    # Build the model.
    model = CNNModel(num_words, num_label, embeddings=wv).build()
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['acc'])

    # Preparing callbacks.
    callbacks = [
        EarlyStopping(patience=3),
        ModelCheckpoint(model_path, save_best_only=True)
    ]

    # Train the model.
    model.fit(x=x_train,
              y=y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.2,
              callbacks=callbacks,
              shuffle=True)

    # Inference and evaluation.
    model = load_model(model_path)
    api = InferenceAPI(model, vocab, preprocess_dataset)
    y_pred = api.predict_from_sequences(x_test)
    print('precision: {:.4f}'.format(precision_score(y_test, y_pred, average='binary')))
    print('recall   : {:.4f}'.format(recall_score(y_test, y_pred, average='binary')))
    print('f1       : {:.4f}'.format(f1_score(y_test, y_pred, average='binary')))

def main():
    # Set hyper-parameters.
    batch_size = 128
    epochs = 100
    maxlen = 300
    model_path = "cnn_model.h5"
    num_words = 40000
    num_label = 2

    # Data loading.
    x, y = load_dataset("data/amazon_reviews_multilingual_JP_v1_00.tsv")

    # Pre-processing.
    x = preprocess_dataset(x)
    x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                        test_size=0.2,
                                                        random_state=42)
    vocab = build_vocabulary(x_train, num_words)
    x_train = vocab.texts_to_sequences(x_train)
    x_test = vocab.texts_to_sequences(x_test)
    x_train = pad_sequences(x_train, maxlen=maxlen, truncating="post")
    x_test = pad_sequences(x_test, maxlen=maxlen, truncating="post")

    # Prepare pretrained word embeddings.
    wv = load_fasttext("data/cc.ja.300.vec.gz")
    wv = filter_embeddings(wv, vocab.word_index, num_words)

    # Build the model.
    model = CNNModel(num_words, num_label, embeddings=wv).build()
    model.compile(optimizer="adam",
                  loss="sparse_categorical_crossentropy",
                  metrics=["acc"])

    # Preparing callbacks.
    callbacks = [
        EarlyStopping(patience=3),
        ModelCheckpoint(model_path, save_best_only=True)
    ]

    # Train the model.
    model.fit(x=x_train,
              y=y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.2,
              callbacks=callbacks,
              shuffle=True)

    # Inference and evaluation.
    model = load_model(model_path)
    api = InferenceAPI(model, vocab, preprocess_dataset)
    y_pred = api.predict_from_sequences(x_test)
    print("precision: {:.4f}".format(precision_score(y_test, y_pred, average="binary")))
    print("recall: {:.4f}".format(recall_score(y_test, y_pred, average="binary")))
    print("f1: {:.4f}".format(f1_score(y_test, y_pred, average="binary")))

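# The two scripts above (and several below) rely on a filter_embeddings helper
# to turn the loaded fastText vectors into an embedding matrix. The following is
# a minimal sketch of such a helper, assuming the gensim KeyedVectors interface
# for wv and a Keras Tokenizer-style word_index; it is not the book's exact
# implementation.
import numpy as np


def filter_embeddings(wv, word_index, num_words, dim=300):
    """Build a (num_words, dim) embedding matrix restricted to the model vocabulary."""
    embedding_matrix = np.zeros((num_words, dim))
    for word, index in word_index.items():
        if index >= num_words:
            continue          # skip words outside the truncated vocabulary
        if word in wv:
            embedding_matrix[index] = wv[word]   # rows stay zero for OOV words
    return embedding_matrix
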
def main():
    # Set hyper-parameters.
    batch_size = 32
    epochs = 100
    model_path = 'models/model_{}.h5'
    num_words = 15000

    # Data loading.
    x, y = load_dataset('./data/ja.wikipedia.conll')

    # Pre-processing.
    x = preprocess_dataset(x)
    x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                        test_size=0.2,
                                                        random_state=42)
    source_vocab = Vocab(num_words=num_words, oov_token='<UNK>').fit(x_train)
    target_vocab = Vocab(lower=False).fit(y_train)
    x_train = create_dataset(x_train, source_vocab)
    y_train = create_dataset(y_train, target_vocab)

    # Build models.
    models = [
        UnidirectionalModel(num_words, target_vocab.size).build(),
        BidirectionalModel(num_words, target_vocab.size).build(),
    ]

    for i, model in enumerate(models):
        model.compile(optimizer='adam',
                      loss='sparse_categorical_crossentropy')

        # Preparing callbacks.
        callbacks = [
            EarlyStopping(patience=3),
            ModelCheckpoint(model_path.format(i), save_best_only=True)
        ]

        # Train the model.
        model.fit(x=x_train,
                  y=y_train,
                  batch_size=batch_size,
                  epochs=epochs,
                  validation_split=0.1,
                  callbacks=callbacks,
                  shuffle=True)

        # Inference.
        model = load_model(model_path.format(i))
        api = InferenceAPI(model, source_vocab, target_vocab)
        y_pred = api.predict_from_sequences(x_test)
        print(classification_report(y_test, y_pred, digits=4))

def main():
    # Set hyper-parameters.
    batch_size = 32
    epochs = 100
    # model_path = 'models/unidirectional_model.h5'
    model_path = 'models/bidirectional_model.h5'
    num_words = 15000

    # Data loading.
    x, y = load_dataset('./data/ja.wikipedia.conll')

    # Pre-processing.
    x = preprocess_dataset(x)
    x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                        test_size=0.2,
                                                        random_state=42)
    source_vocab = Vocab(num_words=num_words, oov_token='<UNK>').fit(x_train)
    target_vocab = Vocab(lower=False).fit(y_train)
    x_train = create_dataset(x_train, source_vocab)
    y_train = create_dataset(y_train, target_vocab)

    # Build the model.
    # model = UnidirectionalModel(num_words, target_vocab.size).build()
    model = BidirectionalModel(num_words, target_vocab.size).build()
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy')

    # Preparing callbacks.
    callbacks = [
        EarlyStopping(patience=3),
        ModelCheckpoint(model_path, save_best_only=True)
    ]

    # Train the model.
    model.fit(x=x_train,
              y=y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.1,
              callbacks=callbacks,
              shuffle=True)

    # Inference and evaluation.
    model = load_model(model_path)
    api = InferenceAPI(model, source_vocab, target_vocab)
    y_pred = api.predict_from_sequences(x_test)
    print(classification_report(y_test, y_pred, digits=4))

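# For reference, the following is a minimal sketch of what BidirectionalModel.build()
# might return for this sequence-labeling setup: an Embedding layer, a
# Bidirectional LSTM, and a per-timestep softmax. The class name matches the
# call above, but the layer sizes and internals are assumptions.
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, TimeDistributed, Dense
from tensorflow.keras.models import Model


class BidirectionalModel:
    def __init__(self, input_dim, output_dim, emb_dim=100, hid_dim=100):
        self.input_dim = input_dim    # vocabulary size
        self.output_dim = output_dim  # number of labels
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim

    def build(self):
        words = Input(shape=(None,), dtype='int32')
        x = Embedding(self.input_dim, self.emb_dim, mask_zero=True)(words)
        x = Bidirectional(LSTM(self.hid_dim, return_sequences=True))(x)
        preds = TimeDistributed(Dense(self.output_dim, activation='softmax'))(x)
        return Model(inputs=words, outputs=preds)
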
def main():
    # Set hyper-parameters.
    emb_dim = 50
    epochs = 2
    model_path = 'model.h5'
    negative_samples = 1
    num_words = 10000
    window_size = 1

    # Load the corpus.
    text = load_data(filepath='../chap04/data/ja.text8')

    # Build the vocabulary.
    vocab = build_vocablary(text, num_words)

    # Create the dataset.
    x, y = create_dataset(text, vocab, num_words, window_size, negative_samples)

    # Build the model.
    model = EmbeddingModel(num_words, emb_dim)
    model = model.build()
    model.compile(optimizer='adam', loss='binary_crossentropy')

    # Preparing callbacks.
    callbacks = [
        EarlyStopping(patience=1),
        ModelCheckpoint(model_path, save_best_only=True)
    ]

    # Train the model.
    model.fit(x=x,
              y=y,
              batch_size=128,
              epochs=epochs,
              validation_split=0.2,
              callbacks=callbacks)

    # Inference.
    model = load_model(model_path)
    api = InferenceAPI(model, vocab)
    pprint(api.most_similar(word='日本'))

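# A minimal sketch of how create_dataset could generate the (target, context)
# pairs and binary labels consumed by the negative-sampling model above, based
# on Keras' skipgrams utility. The text8-style input (one long whitespace-joined
# string), the Tokenizer-style vocab interface, and the two-input model layout
# are assumptions.
import numpy as np
from tensorflow.keras.preprocessing.sequence import skipgrams


def create_dataset(text, vocab, num_words, window_size, negative_samples):
    sequence = vocab.texts_to_sequences([text])[0]           # corpus as token ids
    pairs, labels = skipgrams(sequence,
                              vocabulary_size=num_words,
                              window_size=window_size,
                              negative_samples=negative_samples)
    pairs = np.array(pairs)
    # x: target-word ids and context-word ids; y: 1 for observed pairs, 0 for negatives.
    x = [pairs[:, 0], pairs[:, 1]]
    y = np.array(labels, dtype=np.int32)
    return x, y
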
def main():
    # Set hyper-parameters.
    batch_size = 32
    epochs = 100
    model_path = 'atmodel.h5'
    enc_arch = 'encoder.json'
    dec_arch = 'decoder.json'
    data_path = '../data/w16to19hukusimaconv.txt'
    num_words = 7000
    num_data = 4367

    # Data loading.
    en_texts, ja_texts = load_dataset(data_path)
    en_texts, ja_texts = en_texts[:num_data], ja_texts[:num_data]

    # Preprocessing.
    # ja_texts = preprocess_ja(ja_texts)
    ja_texts = preprocess_dataset(ja_texts)
    en_texts = preprocess_dataset(en_texts)
    x_train, x_test, y_train, y_test = train_test_split(en_texts, ja_texts,
                                                        test_size=0.2,
                                                        random_state=42)
    en_vocab = build_vocabulary(x_train, num_words)
    ja_vocab = build_vocabulary(y_train, num_words)
    print(x_train[:3])
    print(y_train[:3])
    x_train, y_train = create_dataset(x_train, y_train, en_vocab, ja_vocab)
    print(en_vocab.word_index)
    print(ja_vocab.word_index)

    # Build a simple model.
    encoder = Encoder(num_words)
    decoder = Decoder(num_words)
    # Build an attention model.
    # encoder = Encoder(num_words, return_sequences=True)
    # decoder = AttentionDecoder(num_words)
    seq2seq = Seq2seq(encoder, decoder)
    model = seq2seq.build()
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

    # Train the model.
    callbacks = [
        EarlyStopping(patience=10),
        ModelCheckpoint(model_path, save_best_only=True, save_weights_only=True)
    ]
    # Training is disabled here; the previously saved weights in model_path are reused.
    # model.fit(x=x_train,
    #           y=y_train,
    #           batch_size=batch_size,
    #           epochs=epochs,
    #           callbacks=callbacks,
    #           validation_split=0.1)
    encoder.save_as_json(enc_arch)
    decoder.save_as_json(dec_arch)

    # Inference.
    encoder = Encoder.load(enc_arch, model_path)
    decoder = Decoder.load(dec_arch, model_path)
    api = InferenceAPI(encoder, decoder, en_vocab, ja_vocab)
    # api = InferenceAPIforAttention(encoder, decoder, en_vocab, ja_vocab)
    texts = sorted(set(en_texts[:50]), key=len)
    # Overridden by the fixed demo utterances below.
    texts = ["お聞きしたいと思います", "さっき の 答弁 全く 納得 できません",
             "全く 納得 い き ません", "ありがとうございました", "おはようございます",
             "よろしいでしょうか", "是非 よろしくお願いいたします",
             "もう少し 具体的に 教えて いただける と 助 か る んですけれども",
             "ちょっと 待 って", "質問 主 意 書 では 当然 混 同 は しておりません",
             "正 式 な 要求 でいい んですか", "時間ですので まとめて ください",
             "ちょっと 静粛に お願いします", "よろしいですか", "静粛に お願いします",
             "答弁 を まとめて ください", "時間 ですから", "驚 き の答弁 ですね",
             "それは いつ ごろ でしょうか", "そのとおり です"]
    for text in texts:
        decoded = api.predict(text=text)
        print('入力: {}'.format(text))
        print('応答: {}'.format(decoded))

    # Evaluation.
    y_test = [y.split(' ')[1:-1] for y in y_test]
    bleu_score = evaluate_bleu(x_test, y_test, api)
    print('BLEU: {}'.format(bleu_score))

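# The save_as_json / load pattern above stores the encoder and decoder
# architectures separately while the weights live in the shared checkpoint.
# A minimal sketch of that pattern with plain Keras calls is shown below; the
# helper names and the by-name weight loading are assumptions, not the actual
# Encoder/Decoder implementation.
from tensorflow.keras.models import model_from_json


def save_as_json(model, json_path):
    # Persist only the architecture; weights are saved by ModelCheckpoint.
    with open(json_path, 'w') as f:
        f.write(model.to_json())


def load_architecture_and_weights(json_path, weights_path):
    # Rebuild the architecture, then pull matching weights out of the checkpoint.
    with open(json_path) as f:
        model = model_from_json(f.read())
    model.load_weights(weights_path, by_name=True)
    return model
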
def main():
    # Set hyper-parameters.
    batch_size = 128
    epochs = 100
    maxlen = 300
    # model_path = 'models/rnn_model.h5'
    # model_path = 'models/lstm_model.h5'
    # model_path = 'models/CNN_model.h5'
    model_path = 'models/lstm_iniemb_model.h5'
    num_words = 4000
    num_label = 2

    # Data loading.
    x, y = load_dataset('data/amazon_reviews_multilingual_JP_v1_00.tsv')

    # Pre-processing.
    x = preprocess_dataset(x)
    x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                        test_size=0.2,
                                                        random_state=42)
    vocab = build_vocabulary(x_train, num_words)
    x_train = vocab.texts_to_sequences(x_train)
    x_test = vocab.texts_to_sequences(x_test)
    x_train = pad_sequences(x_train, maxlen=maxlen, truncating='post')
    x_test = pad_sequences(x_test, maxlen=maxlen, truncating='post')

    # Prepare pretrained word embeddings.
    wv = load_fasttext('data/cc.ja.300.vec')
    wv = filter_embeddings(wv, vocab.word_index, num_words)

    # Build the model.
    # model = RNNModel(num_words, num_label, embeddings=None).build()
    model = LSTMModel(num_words, num_label, embeddings=wv).build()
    # model = CNNModel(num_words, num_label, embeddings=None).build()
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['acc'])

    # Preparing callbacks.
    callbacks = [
        EarlyStopping(patience=3),
        ModelCheckpoint(model_path, save_best_only=True)
    ]

    # Train the model.
    model.fit(x=x_train,
              y=y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.2,
              callbacks=callbacks,
              shuffle=True)

    # Inference and evaluation.
    model = load_model(model_path)
    api = InferenceAPI(model, vocab, preprocess_dataset)
    y_pred = api.predict_from_sequences(x_test)
    print('precision: {:.4f}'.format(precision_score(y_test, y_pred, average='binary')))
    print('recall: {:.4f}'.format(recall_score(y_test, y_pred, average='binary')))
    print('f1: {:.4f}'.format(f1_score(y_test, y_pred, average='binary')))

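# A minimal sketch of what LSTMModel.build() might look like when a pretrained
# matrix is passed via embeddings, as in the script above: the fastText vectors
# initialize the Embedding layer. The class name and constructor arguments match
# the calls above, but layer sizes and internals are assumptions.
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.models import Model


class LSTMModel:
    def __init__(self, input_dim, output_dim, embeddings=None,
                 emb_dim=300, hid_dim=100, trainable=True):
        self.input_dim = input_dim      # vocabulary size
        self.output_dim = output_dim    # number of labels
        self.embeddings = embeddings
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.trainable = trainable

    def build(self):
        words = Input(shape=(None,), dtype='int32')
        if self.embeddings is None:
            x = Embedding(self.input_dim, self.emb_dim, mask_zero=True)(words)
        else:
            # Initialize the embedding layer with the filtered fastText matrix.
            x = Embedding(self.input_dim, self.embeddings.shape[1],
                          weights=[self.embeddings],
                          trainable=self.trainable,
                          mask_zero=True)(words)
        x = LSTM(self.hid_dim)(x)
        preds = Dense(self.output_dim, activation='softmax')(x)
        return Model(inputs=words, outputs=preds)
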
def main():
    startime = time.time()
    os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

    # Set hyper-parameters.
    batch_size = 32
    epochs = 100
    model_path = 'models/model.h5'
    enc_arch = 'models/encoder.json'
    dec_arch = 'models/decoder.json'
    data_path = 'data/jpn.txt'
    num_words = 10000
    num_data = 20000

    # Data loading.
    print(return_time(startime), "1. Loading data ...")
    en_texts, ja_texts = load_dataset(data_path)
    en_texts, ja_texts = en_texts[:num_data], ja_texts[:num_data]

    # Preprocessing.
    print(return_time(startime), "2. Preprocessing dataset ...")
    ja_texts = preprocess_ja(ja_texts)
    ja_texts = preprocess_dataset(ja_texts)
    en_texts = preprocess_dataset(en_texts)
    x_train, x_test, y_train, y_test = train_test_split(en_texts, ja_texts,
                                                        test_size=0.2,
                                                        random_state=42)
    en_vocab = build_vocabulary(x_train, num_words)
    ja_vocab = build_vocabulary(y_train, num_words)
    x_train, y_train = create_dataset(x_train, y_train, en_vocab, ja_vocab)

    # Build a simple model.
    print(return_time(startime), "3. Build model ...")
    encoder = Encoder(num_words)
    decoder = Decoder(num_words)
    # Build an attention model.
    # encoder = Encoder(num_words, return_sequences=True)
    # decoder = AttentionDecoder(num_words)
    seq2seq = Seq2seq(encoder, decoder)
    model = seq2seq.build()
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

    # Train the model.
    print(return_time(startime), "4. Start training ...")
    callbacks = [
        EarlyStopping(patience=3),
        ModelCheckpoint(model_path, save_best_only=True, save_weights_only=True)
    ]
    model.fit(x=x_train,
              y=y_train,
              batch_size=batch_size,
              epochs=epochs,
              callbacks=callbacks,
              validation_split=0.1)
    encoder.save_as_json(enc_arch)
    decoder.save_as_json(dec_arch)

    # Inference.
    print(return_time(startime), "5. Evaluation")
    print("***********************************")
    encoder = Encoder.load(enc_arch, model_path)
    decoder = Decoder.load(dec_arch, model_path)
    api = InferenceAPI(encoder, decoder, en_vocab, ja_vocab)
    # api = InferenceAPIforAttention(encoder, decoder, en_vocab, ja_vocab)
    texts = sorted(set(en_texts[:50]), key=len)
    for text in texts:
        decoded = api.predict(text=text)
        print('English : {}'.format(text))
        print('Japanese: {}'.format(decoded))
        print()

    # Calculate the BLEU score.
    print(return_time(startime), "6. Calculating BLEU score ...")
    y_test = [y.split(' ')[1:-1] for y in y_test]
    bleu_score = evaluate_bleu(x_test, y_test, api)
    print('BLEU: {}'.format(bleu_score))

    print(return_time(startime), "7. Finished!")

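# The return_time helper used for the progress messages above is, judging from
# its usage, a formatter for elapsed wall-clock time. A minimal sketch under
# that assumption:
import time


def return_time(startime):
    """Return the elapsed time since startime as a short, aligned string."""
    elapsed = time.time() - startime
    return '[{:>7.1f}s]'.format(elapsed)
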
def main():
    # Set hyper-parameters.
    batch_size = 32
    epochs = 100
    model_path = 'models/simple_model.h5'
    enc_arch = 'models/encoder.json'
    dec_arch = 'models/decoder.json'
    data_path = 'data/jpn.txt'
    num_words = 10000
    num_data = 20000

    # Data loading.
    en_texts, ja_texts = load_dataset(data_path)
    en_texts, ja_texts = en_texts[:num_data], ja_texts[:num_data]

    # Pre-processing.
    ja_texts = preprocess_ja(ja_texts)
    ja_texts = preprocess_dataset(ja_texts)
    en_texts = preprocess_dataset(en_texts)
    x_train, x_test, y_train, y_test = train_test_split(en_texts, ja_texts,
                                                        test_size=0.2,
                                                        random_state=42)
    en_vocab = build_vocabulary(x_train, num_words)
    ja_vocab = build_vocabulary(y_train, num_words)
    x_train, y_train = create_dataset(x_train, y_train, en_vocab, ja_vocab)

    # Build the model.
    encoder = Encoder(num_words)
    decoder = Decoder(num_words)
    seq2seq = Seq2seq(encoder, decoder)
    model = seq2seq.build()
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

    # Preparing callbacks.
    callbacks = [
        EarlyStopping(patience=3),
        ModelCheckpoint(model_path, save_best_only=True, save_weights_only=True)
    ]

    # Train the model.
    model.fit(x=x_train,
              y=y_train,
              batch_size=batch_size,
              epochs=epochs,
              callbacks=callbacks,
              validation_split=0.1)
    encoder.save_as_json(enc_arch)
    decoder.save_as_json(dec_arch)

    # Inference.
    encoder = Encoder.load(enc_arch, model_path)
    decoder = Decoder.load(dec_arch, model_path)
    api = InferenceAPI(encoder, decoder, en_vocab, ja_vocab)
    texts = sorted(set(en_texts[:50]), key=len)
    for text in texts:
        decoded = api.predict(text=text)
        print('English : {}'.format(text))
        print('Japanese: {}'.format(decoded))

    # Evaluation.
    y_test = [y.split(' ')[1:-1] for y in y_test]
    bleu_score = evaluate_bleu(x_test, y_test, api)
    print('BLEU: {}'.format(bleu_score))

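# The seq2seq scripts above all end with evaluate_bleu(x_test, y_test, api).
# A minimal sketch of how such a helper could be written with NLTK's corpus_bleu,
# assuming api.predict returns a whitespace-tokenized string and that each test
# sentence has a single reference (as prepared by the y.split(' ')[1:-1] step):
from nltk.translate.bleu_score import corpus_bleu


def evaluate_bleu(x_test, y_test, api):
    """Translate every test sentence and score the corpus-level BLEU."""
    hypotheses = [api.predict(text=text).split(' ') for text in x_test]
    references = [[ref] for ref in y_test]   # one reference list per sentence
    return corpus_bleu(references, hypotheses)
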
# Load the corpus.
text = load_data(filepath='data/ja.text8')

# Build the vocabulary.
vocab = build_vocabulary(text, num_words)

# Create the dataset.
x, y = create_dataset(text, vocab, num_words, window_size, negative_samples)

# Build the model.
model = EmbeddingModel(num_words, emb_dim)
model = model.build()
model.compile(optimizer='adam', loss='binary_crossentropy')

# Preparing callbacks.
callbacks = [
    EarlyStopping(patience=1),
    ModelCheckpoint(model_path, save_best_only=True)
]

# Train the model.
model.fit(x=x,
          y=y,
          batch_size=128,
          epochs=epochs,
          validation_split=0.2,
          callbacks=callbacks)

# Inference.
model = load_model(model_path)
api = InferenceAPI(model, vocab)
pprint(api.most_similar(word='日本'))

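# A minimal sketch of the most_similar call used above: the InferenceAPI is
# assumed to read the learned weights of the target-word Embedding layer and
# rank the vocabulary by cosine similarity. Picking the first Embedding layer
# and the Tokenizer-style word_index are assumptions.
import numpy as np
from tensorflow.keras.layers import Embedding


class InferenceAPI:
    def __init__(self, model, vocab):
        self.model = model
        self.vocab = vocab

    def most_similar(self, word, topn=10):
        # (num_words, emb_dim) matrix from the first Embedding layer.
        emb = next(l for l in self.model.layers
                   if isinstance(l, Embedding)).get_weights()[0]
        idx = self.vocab.word_index[word]
        vec = emb[idx]
        sims = emb @ vec / (np.linalg.norm(emb, axis=1) * np.linalg.norm(vec) + 1e-10)
        ranked = np.argsort(-sims)
        index_word = {i: w for w, i in self.vocab.word_index.items()}
        return [(index_word.get(i, '<UNK>'), float(sims[i]))
                for i in ranked if i != idx][:topn]
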
def main(args):
    print(args)
    startime = time.time()
    os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

    # Set hyper-parameters.
    batch_size = 128
    epochs = 100
    maxlen = 300
    model_path = 'models/model_{}.h5'
    num_words = 40000
    num_label = 2

    # Data loading.
    print(return_time(startime), "1. Loading data ...")
    x, y = load_dataset('data/amazon_reviews_multilingual_JP_v1_00.tsv')

    # Pre-processing.
    print(return_time(startime), "2. Preprocessing dataset ...")
    x = preprocess_dataset(x)
    x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                        test_size=0.2,
                                                        random_state=42)
    vocab = build_vocabulary(x_train, num_words)
    x_train = vocab.texts_to_sequences(x_train)
    x_test = vocab.texts_to_sequences(x_test)
    x_train = pad_sequences(x_train, maxlen=maxlen, truncating='post')
    x_test = pad_sequences(x_test, maxlen=maxlen, truncating='post')

    # Preparing word embedding.
    if args.loadwv:
        print(return_time(startime), "3. Loading word embedding ...")
        wv_path = 'data/wv_{0}_{1}.npy'.format(maxlen, num_words)
        if os.path.exists(wv_path):
            wv = np.load(wv_path)
            print(return_time(startime), "Loaded word embedding successfully!")
        else:
            print(return_time(startime), "Word embedding file doesn't exist")
            exit()
    else:
        print(return_time(startime), "3. Preparing word embedding ...")
        wv = load_fasttext('data/cc.ja.300.vec.gz')
        wv = filter_embeddings(wv, vocab.word_index, num_words)

    # Saving word embedding.
    if args.savewv:
        wv_path = 'data/wv_{0}_{1}.npy'.format(maxlen, num_words)
        np.save(wv_path, wv)
        print(return_time(startime), "Saved word embedding successfully!", wv_path)

    # Build models.
    models = [
        RNNModel(num_words, num_label, embeddings=None).build(),
        LSTMModel(num_words, num_label, embeddings=None).build(),
        CNNModel(num_words, num_label, embeddings=None).build(),
        RNNModel(num_words, num_label, embeddings=wv).build(),
        LSTMModel(num_words, num_label, embeddings=wv).build(),
        CNNModel(num_words, num_label, embeddings=wv).build(),
        CNNModel(num_words, num_label, embeddings=wv, trainable=False).build()
    ]
    model_names = [
        "RNN-None", "LSTM-None", "CNN-None",
        "RNN-wv", "LSTM-wv", "CNN-wv", "CNN-wv-notrain"
    ]

    print(return_time(startime), "4. Start training ...")
    for i, model in enumerate(models):
        print("***********************************")
        print(return_time(startime), "Model:", model_names[i])
        model.compile(optimizer='adam',
                      loss='sparse_categorical_crossentropy',
                      metrics=['acc'])

        # Preparing callbacks.
        callbacks = [
            EarlyStopping(patience=3),
            ModelCheckpoint(model_path.format(model_names[i]), save_best_only=True)
        ]

        # Train the model.
        model.fit(x=x_train,
                  y=y_train,
                  batch_size=batch_size,
                  epochs=epochs,
                  validation_split=0.2,
                  callbacks=callbacks,
                  shuffle=True)

        # Inference.
        model = load_model(model_path.format(model_names[i]))
        api = InferenceAPI(model, vocab, preprocess_dataset)
        y_pred = api.predict_from_sequences(x_test)
        print('precision: {:.4f}'.format(precision_score(y_test, y_pred, average='binary')))
        print('recall   : {:.4f}'.format(recall_score(y_test, y_pred, average='binary')))
        print('f1       : {:.4f}'.format(f1_score(y_test, y_pred, average='binary')))

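# main(args) above consults the args.loadwv and args.savewv flags. A minimal
# sketch of an entry point that would provide them with argparse; the flag names
# follow the attributes used above, everything else is an assumption.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Compare RNN/LSTM/CNN sentiment models.')
    parser.add_argument('--loadwv', action='store_true',
                        help='load a precomputed embedding matrix from data/')
    parser.add_argument('--savewv', action='store_true',
                        help='save the filtered embedding matrix for later runs')
    main(parser.parse_args())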