def main():
    # Hyperparameters
    args = sys.argv
    batch_size = 128
    epochs = 100
    maxlen = 300
    model_path = 'models/cnn_model.h5'
    num_words = 40000
    num_label = 2

    # Load and preprocess the dataset
    x, y = load_dataset('data/amazon_reviews_multilingual_JP_v1_00.tsv')
    x = preprocess_dataset(x)
    x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                        test_size=0.2,
                                                        random_state=42)

    # Build the vocabulary and convert texts to padded id sequences
    vocab = build_vocabulary(x_train, num_words)
    x_train = vocab.texts_to_sequences(x_train)
    x_test = vocab.texts_to_sequences(x_test)
    x_train = pad_sequences(x_train, maxlen=maxlen, truncating='post')
    x_test = pad_sequences(x_test, maxlen=maxlen, truncating='post')

    # Optionally load pretrained fastText vectors
    # (sys.argv[0] is the script name, so the flag is the first real argument)
    emb_flg = args[1]
    if emb_flg == 't':
        wv = load_fasttext('../chap08/models/cc.ja.300.vec.gz')
        wv = filter_embeddings(wv, vocab.word_index, num_words)
    else:
        wv = None

    # Build and compile the model
    model = CNNModel(num_words, num_label, embeddings=wv).build()
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['acc'])

    # Callbacks
    callbacks = [
        EarlyStopping(patience=3),
        ModelCheckpoint(model_path, save_best_only=True)
    ]

    # Train the model
    model.fit(x=x_train,
              y=y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.2,
              callbacks=callbacks,
              shuffle=True)

    # Evaluate the best checkpoint on the test set
    model = load_model(model_path)
    api = InferenceAPI(model, vocab, preprocess_dataset)
    y_pred = api.predict_from_sequences(x_test)
    print('precision: {:.4f}'.format(
        precision_score(y_test, y_pred, average='binary')))
    print('recall   : {:.4f}'.format(
        recall_score(y_test, y_pred, average='binary')))
    print('f1       : {:.4f}'.format(
        f1_score(y_test, y_pred, average='binary')))
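# The script above only relies on `build_vocabulary` returning an object that
# exposes `texts_to_sequences` and `word_index`. A minimal sketch of such a
# helper, assuming it simply wraps the Keras Tokenizer; the actual
# implementation in the accompanying repository may differ.
from tensorflow.keras.preprocessing.text import Tokenizer

def build_vocabulary(texts, num_words=None, oov_token='<UNK>'):
    # num_words caps the vocabulary at the most frequent tokens; index 0 is
    # reserved by Keras for padding, and unseen words map to the OOV token.
    tokenizer = Tokenizer(num_words=num_words, oov_token=oov_token)
    tokenizer.fit_on_texts(texts)
    return tokenizer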
def main():
    # Hyperparameters
    batch_size = 128
    epochs = 100
    maxlen = 300
    model_path = "cnn_model.h5"
    num_words = 40000
    num_label = 2

    # Load and preprocess the dataset
    x, y = load_dataset("data/amazon_reviews_multilingual_JP_v1_00.tsv")
    x = preprocess_dataset(x)
    x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                        test_size=0.2,
                                                        random_state=42)

    # Build the vocabulary and convert texts to padded id sequences
    vocab = build_vocabulary(x_train, num_words)
    x_train = vocab.texts_to_sequences(x_train)
    x_test = vocab.texts_to_sequences(x_test)
    x_train = pad_sequences(x_train, maxlen=maxlen, truncating="post")
    x_test = pad_sequences(x_test, maxlen=maxlen, truncating="post")

    # Pretrained fastText vectors, restricted to the model vocabulary
    wv = load_fasttext("data/cc.ja.300.vec.gz")
    wv = filter_embeddings(wv, vocab.word_index, num_words)

    # Build and compile the model
    model = CNNModel(num_words, num_label, embeddings=wv).build()
    model.compile(optimizer="adam",
                  loss="sparse_categorical_crossentropy",
                  metrics=["acc"])

    # Callbacks
    callbacks = [
        EarlyStopping(patience=3),
        ModelCheckpoint(model_path, save_best_only=True)
    ]

    # Train the model
    model.fit(x=x_train,
              y=y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.2,
              callbacks=callbacks,
              shuffle=True)

    # Evaluate the best checkpoint on the test set
    model = load_model(model_path)
    api = InferenceAPI(model, vocab, preprocess_dataset)
    y_pred = api.predict_from_sequences(x_test)
    print("precision: {:.4f}".format(
        precision_score(y_test, y_pred, average="binary")))
    print("recall: {:.4f}".format(
        recall_score(y_test, y_pred, average="binary")))
    print("f1: {:.4f}".format(f1_score(y_test, y_pred, average="binary")))
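# `filter_embeddings` is expected to turn the loaded fastText vectors into a
# (num_words x dim) matrix aligned with the Tokenizer's word_index, which is the
# shape the model's embedding layer needs. A plausible sketch, assuming `wv`
# behaves like a gensim KeyedVectors object (supports `in` and item lookup);
# the repository's actual helper may differ.
import numpy as np

def filter_embeddings(wv, word_index, num_words, dim=300):
    # Rows default to zero vectors for padding and out-of-vocabulary words.
    embedding_matrix = np.zeros((num_words, dim))
    for word, idx in word_index.items():
        if idx >= num_words:
            continue
        if word in wv:
            embedding_matrix[idx] = wv[word]
    return embedding_matrix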
def test_load_fasttext():
    preprocessor = SpacyPreprocessor([PAD, UNK, "my", "name", "is", "peter"])
    emb = load_fasttext("test-data/test-fasttext.vec", preprocessor)
    word_ids, _ = preprocessor.preprocess_tokenized([
        ["hi", "there", "what's", "your", "name"],
        ["my", "name", "is", "peter"],
    ])
    embedded = emb(word_ids)
    # OOVs
    assert embedded[0, :4].equal(
        emb(torch.tensor(preprocessor.unk_id)).unsqueeze(0).expand(4, -1))
    # name
    assert embedded[0, 4].equal(emb(torch.tensor(3)))
    # my name is peter
    assert embedded[1, :4].equal(emb(torch.tensor([2, 3, 4, 5])))
    # pad, should be zero
    assert embedded[1, 4].equal(torch.zeros(300))
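# The test above expects `load_fasttext` to return a torch embedding module whose
# rows follow the preprocessor's vocabulary, with a zero PAD row and 300-d vectors.
# A minimal sketch consistent with that contract; the preprocessor attributes used
# here (`word_to_id_dict`, `pad_id`) are assumptions for illustration only.
import torch
import torch.nn as nn

def load_fasttext_sketch(fname, preprocessor, emb_dim=300):
    vocab = preprocessor.word_to_id_dict  # assumed word -> id mapping
    weights = torch.zeros(len(vocab), emb_dim)
    with open(fname, encoding="utf-8") as f:
        next(f)  # skip the "<num_words> <dim>" header line of a .vec file
        for line in f:
            word, *vec = line.rstrip().split(" ")
            if word in vocab:
                weights[vocab[word]] = torch.tensor([float(v) for v in vec])
    # PAD stays a zero vector; OOV tokens are mapped to unk_id by the preprocessor
    return nn.Embedding.from_pretrained(weights, freeze=True,
                                        padding_idx=preprocessor.pad_id)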
def main():
    # Set hyperparameters
    batch_size = 128
    epochs = 100
    maxlen = 300
    # model_path = 'models/rnn_model.h5'
    # model_path = 'models/lstm_model.h5'
    # model_path = 'models/CNN_model.h5'
    model_path = 'models/lstm_iniemb_model.h5'
    num_words = 4000
    num_label = 2

    # Load the dataset
    x, y = load_dataset('data/amazon_reviews_multilingual_JP_v1_00.tsv')

    # Preprocess the dataset
    x = preprocess_dataset(x)
    x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                        test_size=0.2,
                                                        random_state=42)
    vocab = build_vocabulary(x_train, num_words)
    x_train = vocab.texts_to_sequences(x_train)
    x_test = vocab.texts_to_sequences(x_test)
    x_train = pad_sequences(x_train, maxlen=maxlen, truncating='post')
    x_test = pad_sequences(x_test, maxlen=maxlen, truncating='post')

    # Pretrained word embeddings
    wv = load_fasttext('data/cc.ja.300.vec')
    wv = filter_embeddings(wv, vocab.word_index, num_words)

    # Build the model
    # model = RNNModel(num_words, num_label, embeddings=None).build()
    model = LSTMModel(num_words, num_label, embeddings=wv).build()
    # model = CNNModel(num_words, num_label, embeddings=None).build()
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['acc'])

    # Prepare callbacks
    callbacks = [
        EarlyStopping(patience=3),
        ModelCheckpoint(model_path, save_best_only=True)
    ]

    # Train the model
    model.fit(x=x_train,
              y=y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.2,
              callbacks=callbacks,
              shuffle=True)

    # Predict and evaluate with the best checkpoint
    model = load_model(model_path)
    api = InferenceAPI(model, vocab, preprocess_dataset)
    y_pred = api.predict_from_sequences(x_test)
    print('precision: {:.4f}'.format(
        precision_score(y_test, y_pred, average='binary')))
    print('recall: {:.4f}'.format(
        recall_score(y_test, y_pred, average='binary')))
    print('f1: {:.4f}'.format(f1_score(y_test, y_pred, average='binary')))
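# For reference, `LSTMModel(num_words, num_label, embeddings=wv).build()` is
# expected to return a Keras model of roughly this shape: an Embedding layer
# (optionally initialized with the filtered fastText matrix), an LSTM encoder,
# and a softmax output over the labels. A hedged sketch with illustrative layer
# sizes, not the repository's exact architecture.
from tensorflow.keras.layers import Dense, Embedding, Input, LSTM
from tensorflow.keras.models import Model

def build_lstm_model(num_words, num_label, embeddings=None, emb_dim=300):
    inputs = Input(shape=(None,), dtype='int32')
    if embeddings is None:
        x = Embedding(num_words, emb_dim, mask_zero=True)(inputs)
    else:
        # initialize the embedding layer with the pretrained matrix
        x = Embedding(num_words, embeddings.shape[1], mask_zero=True,
                      weights=[embeddings])(inputs)
    x = LSTM(128)(x)
    outputs = Dense(num_label, activation='softmax')(x)
    return Model(inputs, outputs)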
def main():
    # 4.1 mkdirs
    if not os.path.exists(config.submit):
        os.makedirs(config.submit)
    if not os.path.exists(config.weights + config.model_name + os.sep + 'fold_' + str(config.fold)):
        os.makedirs(config.weights + config.model_name + os.sep + 'fold_' + str(config.fold))
    if not os.path.exists(config.best_models):
        os.mkdir(config.best_models)
    if not os.path.exists(config.logs):
        os.mkdir(config.logs)
    if not os.path.exists(config.best_models + config.model_name):
        os.mkdir(config.best_models + config.model_name)
    if not os.path.exists(config.best_models + config.model_name + os.sep + 'fold_' + str(config.fold)):
        os.mkdir(config.best_models + config.model_name + os.sep + 'fold_' + str(config.fold))

    tqdm.pandas()
    start_time = time.time()
    train_X, test_X, train_y, word_index = utils.load_and_prec(config)

    print("Start embedding matrix............")
    embedding_matrix_1 = utils.load_glove(word_index, config.embedding_dir, config.max_features)
    embedding_matrix_2 = utils.load_para(word_index, config.embedding_dir, config.max_features)
    embedding_matrix_3 = utils.load_fasttext(word_index, config.embedding_dir, config.max_features)
    total_time = (time.time() - start_time) / 60
    print("Took {:.2f} minutes".format(total_time))

    if config.embed_method == "mean":
        embedding_matrix = np.mean([embedding_matrix_1, embedding_matrix_2, embedding_matrix_3], axis=0)
    elif config.embed_method == "concat":
        embedding_matrix = np.concatenate((embedding_matrix_1, embedding_matrix_2, embedding_matrix_3), axis=1)
    print(np.shape(embedding_matrix))

    # del embedding_matrix_1, embedding_matrix_2
    # del embedding_matrix_1

    # -------------------------------------------------------
    # training
    # -------------------------------------------------------
    train_preds = np.zeros((len(train_X)))
    test_preds = np.zeros((len(test_X)))

    x_test_cuda = torch.tensor(test_X, dtype=torch.long).cuda()
    test_dataset = torch.utils.data.TensorDataset(x_test_cuda)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=config.batch_size, shuffle=False)

    splits = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED).split(train_X, train_y))
    sigmoid = nn.Sigmoid()
    loss_fn = torch.nn.BCEWithLogitsLoss(reduction="mean")

    # k-fold
    for fold, (train_idx, valid_idx) in enumerate(splits):
        print(f'Fold {fold + 1}')

        # tflogger
        tflogger = utils.TFLogger(os.path.join(
            '../results', 'TFlogs',
            config.model_name + "_fold{0}_{1}".format(config.fold, fold)))
        # initialize the early_stopping object
        early_stopping = utils.EarlyStopping(patience=7, verbose=True)

        x_train_fold = torch.tensor(train_X[train_idx], dtype=torch.long).cuda()
        y_train_fold = torch.tensor(train_y[train_idx, np.newaxis], dtype=torch.float32).cuda()
        x_val_fold = torch.tensor(train_X[valid_idx], dtype=torch.long).cuda()
        y_val_fold = torch.tensor(train_y[valid_idx, np.newaxis], dtype=torch.float32).cuda()

        if config.model == "baseline_bidir_LSTM_GRU":
            model = baseline_bidir_LSTM_GRU.NeuralNet(config, embedding_matrix)
        elif config.model == "baseline_pytorch":
            model = baseline_pytorch.NeuralNet(config, embedding_matrix)
        elif config.model == "baseline_lstm_gru_attention":
            model = baseline_lstm_gru_attention.NeuralNet(config, embedding_matrix)
        elif config.model == "baseline_lstm_lstm_attention":
            model = baseline_lstm_lstm_attention.NeuralNet(config, embedding_matrix)
        model.cuda()

        optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)
        # scheduler
        scheduler = lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)

        train_dataset = torch.utils.data.TensorDataset(x_train_fold, y_train_fold)
        valid_dataset = torch.utils.data.TensorDataset(x_val_fold, y_val_fold)

        train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
        valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=config.batch_size, shuffle=False)

        valid_loss = np.inf
        # initialize best loss
        best_loss = np.inf
        start_time = timer()

        for epoch in range(config.epochs):
            scheduler.step(epoch)

            # train
            lr = utils.get_learning_rate(optimizer)
            train_loss = train(train_loader=train_loader, model=model, loss_fn=loss_fn,
                               optimizer=optimizer, epoch=epoch,
                               valid_loss=valid_loss, start=start_time)

            # validate
            valid_loss, valid_output = evaluate(val_loader=valid_loader, model=model,
                                                loss_fn=loss_fn, epoch=epoch,
                                                train_loss=train_loss, start_time=start_time)
            test_preds_fold = np.zeros(len(test_X))

            # check results
            is_best_loss = valid_loss < best_loss
            if is_best_loss:
                best_epoch = epoch
                best_train_loss = train_loss
            # update best loss
            best_loss = min(valid_loss, best_loss)

            # save NeuralNet
            utils.save_checkpoint({
                "epoch": epoch,
                "model_name": config.model_name,
                "state_dict": model.state_dict(),
                "optimizer": optimizer.state_dict(),
                "fold": config.fold,
                "kfold": config.fold,
            }, is_best_loss, config.fold, fold, config)

            # print logs
            print('\r', end='', flush=True)
            message = '%s %5.1f %6.1f %.2E | %0.3f | %0.3f | %s' % (
                "best", best_epoch, best_epoch, Decimal(lr),
                best_train_loss, best_loss,
                utils.time_to_str((timer() - start_time), 'min'))
            log.write(message)
            log.write("\n")
            time.sleep(0.01)

            # ================================================================== #
            #                        Tensorboard Logging                         #
            # ================================================================== #
            # 1. Log scalar values (scalar summary)
            info = {'Train_loss': train_loss,
                    'Valid_loss': valid_loss,
                    'Learning_rate': lr}
            for tag, value in info.items():
                tflogger.scalar_summary(tag, value, epoch)

            # 2. Log values and gradients of the parameters (histogram summary)
            for tag, value in model.named_parameters():
                tag = tag.replace('.', '/')
                tflogger.histo_summary(tag, value.data.cpu().numpy(), epoch)
                if value.grad is not None:
                    tflogger.histo_summary(tag + '/grad', value.grad.data.cpu().numpy(), epoch)
            # -------------------------------------
            # end tflogger

            # ================================================================== #
            #                           Early stopping                           #
            # ================================================================== #
            # early_stopping needs the validation loss to check if it has decreased,
            # and if it has, it will make a checkpoint of the current NeuralNet
            early_stopping(valid_loss, model)
            if early_stopping.early_stop:
                print("Early stopping")
                break
        # end looping all epochs

        train_preds[valid_idx] = sigmoid(valid_output).cpu().data.numpy()[:, 0]

        # test
        checkpoint_path = os.path.join("{0}{1}/fold_{2}/fold_{3}_model_best_loss.pth.tar".format(
            config.best_models, config.model_name, str(config.fold), fold))
        best_model = torch.load(checkpoint_path)
        print("Test on epoch:", best_model['epoch'])
        model.load_state_dict(best_model["state_dict"])
        test_preds_fold = test(test_loader=test_loader, model=model)
        test_preds += test_preds_fold / len(splits)
    # end k-fold

    search_result = threshold_search(train_y, train_preds)
    print(search_result)
    log.write("Threshold:{0}, f1:{1}".format(search_result['threshold'], search_result['f1']))

    sub = pd.read_csv('../input/sample_submission.csv')
    sub.prediction = test_preds > search_result['threshold']
    sub.to_csv("submission_{0}.csv".format(config.model_name), index=False)
    print('Test successful!')
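# `threshold_search` is used above but not defined here. A common implementation
# for this kind of pipeline scans candidate thresholds over the out-of-fold
# predictions and keeps the one with the best F1; the usage above only requires a
# dict with 'threshold' and 'f1' keys. A minimal sketch under that assumption:
import numpy as np
from sklearn.metrics import f1_score

def threshold_search(y_true, y_proba):
    best = {'threshold': 0.5, 'f1': 0.0}
    for threshold in np.arange(0.1, 0.501, 0.01):
        score = f1_score(y_true, (y_proba > threshold).astype(int))
        if score > best['f1']:
            best = {'threshold': float(threshold), 'f1': float(score)}
    return best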
def main(args):
    print("loading data and labels from dataset")
    train = pd.read_csv(args.train_dir)
    ch_train = pd.read_csv(args.chtrain_dir)
    x_train = train["comment_text"]
    x_chtrain = ch_train["comment_text"]
    target_cols = [
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
    ]

    x = []
    x_ch = []
    for line in x_train:
        if len(line) > 0:
            x.append(utils.review_to_wordlist(line.strip()))
    print("loaded %d comments from dataset" % len(x))
    for line in x_chtrain:
        if len(line) > 0:
            x_ch.append(utils.review_to_wordlist_char(line.strip()))
    print("loaded %d comments from dataset" % len(x_ch))

    y = train[target_cols].values
    index2word, word2index = utils.load_vocab(args.vocab_dir)
    index2char, char2index = utils.load_char(args.char_dir)
    x_vector = utils.vectorize(x, word2index, verbose=False)
    x_vector = np.array(x_vector)
    char_vector = utils.vectorize_char(x_ch, char2index, verbose=False)
    char_vector = np.array(char_vector)
    print(char_vector[0])

    save_dir = os.path.join(args.save_dir, args.model_type)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    if args.model_type in ["cnn", "cnnfe", "chcnn", "chcnn2"]:
        max_step = args.max_step_cnn
        max_size = args.max_size_cnn
        nb_epochs = args.nb_epochs_cnn
    elif args.model_type in [
            "rnn", "rnnfe", "rnnfe2", "chrnn", "chrnnfe", "rcnn"
    ]:
        max_step = args.max_step_rnn
        max_size = args.max_size_rnn
        nb_epochs = args.nb_epochs_rnn

    ex_features = add_features("../data/train.csv")
    nfolds = args.nfolds
    skf = KFold(n_splits=nfolds, shuffle=True, random_state=2018)
    test_prob = []
    stack_logits = np.zeros((len(x_vector), len(target_cols)))

    for (f, (train_index, test_index)) in enumerate(skf.split(x_vector)):
        x_train, x_eval = x_vector[train_index], x_vector[test_index]
        char_train, char_eval = char_vector[train_index], char_vector[test_index]
        y_train, y_eval = y[train_index], y[test_index]

        with tf.Graph().as_default():
            config_proto = utils.get_config_proto()
            sess = tf.Session(config=config_proto)
            if args.model_type == "cnn":
                model = TextCNN(args, "TextCNN")
            elif args.model_type == "cnnfe":
                model = TextCNNFE(args, "TextCNNFE")
            elif args.model_type == "rnn":
                model = TextRNN(args, "TextRNN")
            elif args.model_type == "rnnfe":
                model = TextRNNFE(args, "TextRNNFE")
            elif args.model_type == "rcnn":
                model = TextRCNN(args, "TextRCNN")
            elif args.model_type == "attention":
                model = RNNWithAttention(args, "Attention")
            elif args.model_type == "chrnn":
                model = TextRNNChar(args, "TextRNNChar")
            elif args.model_type == "chcnn":
                model = TextCNNChar(args, "TextCNNChar")
            elif args.model_type == "chcnn2":
                model = TextCNNChar(args, "TextCNNChar2")
            elif args.model_type == "rnnfe2":
                model = TextRNNFE2(args, "TextCNNCharFE2")
            elif args.model_type == "chrnnfe":
                model = TextRNNCharFE(args, "TextCNNCharFE")
            else:
                raise ValueError("Unknown model_type %s" % args.model_type)

            sess.run(tf.global_variables_initializer())

            if args.use_ft:
                pretrain_dir = args.ft_dir
                print("use FastText word vector")
                embedding = utils.load_fasttext(pretrain_dir, index2word)
            if not args.use_ft:
                pretrain_dir = args.glove_dir
                print("use Glove word vector")
                embedding = utils.load_glove(pretrain_dir, index2word)
            sess.run(model.embedding_init, {model.embedding_placeholder: embedding})

            for line in model.tvars:
                print(line)

            print("training %s model for toxic comments classification" % args.model_type)
            print("%d fold start training" % f)
            for epoch in range(1, nb_epochs + 1):
                print("epoch %d start with lr %f" % (
                    epoch, model.learning_rate.eval(session=sess)), "\n", "- " * 50)
                loss, total_comments = 0.0, 0
                if args.model_type in ["cnn", "rnn", "rcnn"]:
                    train_batch = utils.get_batches(x_train, y_train, args.batch_size, args.max_len)
                    valid_batch = utils.get_batches(x_eval, y_eval, max_size, args.max_len, False)
                elif args.model_type in ["chrnn", "chcnn", "chcnn2"]:
                    train_batch = utils.get_batches_with_char(
                        x_train, char_train, y_train, args.batch_size, args.max_len)
                    valid_batch = utils.get_batches_with_char(
                        x_eval, char_eval, y_eval, max_size, args.max_len, False)
                elif args.model_type in ["rnnfe", "cnnfe", "rnnfe2"]:
                    train_batch = utils.get_batches_with_fe(
                        x_train, y_train, ex_features, args.batch_size, args.max_len)
                    valid_batch = utils.get_batches_with_fe(
                        x_eval, y_eval, ex_features, max_size, args.max_len, False)
                elif args.model_type in ["chrnnfe"]:
                    train_batch = utils.get_batches_with_charfe(
                        x_train, char_train, y_train, ex_features, args.batch_size, args.max_len)
                    valid_batch = utils.get_batches_with_charfe(
                        x_eval, char_eval, y_eval, ex_features, max_size, args.max_len, False)

                epoch_start_time = time.time()
                step_start_time = epoch_start_time
                for idx, batch in enumerate(train_batch):
                    if args.model_type in ["cnn", "rnn", "rcnn"]:
                        comments, comments_length, labels = batch
                        _, loss_t, global_step, batch_size = model.train(
                            sess, comments, comments_length, labels)
                    elif args.model_type in ["chrnn", "chcnn", "chcnn2"]:
                        comments, comments_length, chs, labels = batch
                        _, loss_t, global_step, batch_size = model.train(
                            sess, comments, comments_length, chs, labels)
                    elif args.model_type in ["rnnfe", "cnnfe", "rnnfe2"]:
                        comments, comments_length, exs, labels = batch
                        _, loss_t, global_step, batch_size = model.train(
                            sess, comments, comments_length, labels, exs)
                    elif args.model_type in ["chrnnfe"]:
                        comments, comments_length, chs, exs, labels = batch
                        _, loss_t, global_step, batch_size = model.train(
                            sess, comments, comments_length, chs, labels, exs)

                    loss += loss_t * batch_size
                    total_comments += batch_size
                    if global_step % 200 == 0:
                        print("epoch %d step %d loss %f time %.2fs" % (
                            epoch, global_step, loss_t, time.time() - step_start_time))
                    if global_step % 200 == 0:
                        _ = run_valid(valid_batch, model, sess, args.model_type)
                        # model.saver.save(sess, os.path.join(save_dir, "model.ckpt"), global_step=global_step)
                    step_start_time = time.time()

                epoch_time = time.time() - epoch_start_time
                sess.run(model.learning_rate_decay_op)
                print("%.2f seconds in this epoch with train loss %f" % (
                    epoch_time, loss / total_comments))

            test_prob.append(run_test(args, model, sess))
            stack_logits[test_index] = run_valid(valid_batch, model, sess, args.model_type)

    preds = np.zeros((test_prob[0].shape[0], len(target_cols)))
    for prob in test_prob:
        preds += prob
        print(prob[0])
    preds /= len(test_prob)
    print(len(test_prob))
    write_predict(stack_logits, args.model_type)
    write_results(preds, args.model_type)
def __init__(
    self,
    hidden_dim: int = 8,
    word_emb_dim: int = 300,
    node_emb_dim: int = 12,
    relation_emb_dim: int = 10,
    text_encoder_num_blocks: int = 1,
    text_encoder_num_conv_layers: int = 3,
    text_encoder_kernel_size: int = 5,
    text_encoder_num_heads: int = 1,
    graph_encoder_num_cov_layers: int = 4,
    graph_encoder_num_bases: int = 3,
    text_decoder_num_blocks: int = 1,
    text_decoder_num_heads: int = 1,
    learning_rate: float = 5e-4,
    sample_k_gen_obs: int = 5,
    max_decode_len: int = 200,
    steps_for_lr_warmup: int = 10000,
    pretrained_word_embedding_path: Optional[str] = None,
    word_vocab_path: Optional[str] = None,
    node_vocab_path: Optional[str] = None,
    relation_vocab_path: Optional[str] = None,
    **kwargs,
) -> None:
    super().__init__()
    self.save_hyperparameters(
        "hidden_dim",
        "word_emb_dim",
        "node_emb_dim",
        "relation_emb_dim",
        "text_encoder_num_blocks",
        "text_encoder_num_conv_layers",
        "text_encoder_kernel_size",
        "text_encoder_num_heads",
        "graph_encoder_num_cov_layers",
        "graph_encoder_num_bases",
        "text_decoder_num_blocks",
        "text_decoder_num_heads",
        "learning_rate",
        "sample_k_gen_obs",
        "max_decode_len",
        "steps_for_lr_warmup",
    )

    # initialize word (preprocessor), node and relation stuff
    (
        node_name_word_ids,
        node_name_mask,
        rel_name_word_ids,
        rel_name_mask,
    ) = self.init_word_node_rel(
        word_vocab_path=to_absolute_path(word_vocab_path)
        if word_vocab_path is not None else None,
        node_vocab_path=to_absolute_path(node_vocab_path)
        if node_vocab_path is not None else None,
        relation_vocab_path=to_absolute_path(relation_vocab_path)
        if relation_vocab_path is not None else None,
    )

    # load pretrained word embedding and freeze it
    if pretrained_word_embedding_path is not None:
        pretrained_word_embedding = load_fasttext(
            to_absolute_path(pretrained_word_embedding_path), self.preprocessor)
    else:
        pretrained_word_embedding = nn.Embedding(self.num_words, word_emb_dim)
    pretrained_word_embedding.weight.requires_grad = False

    # graph updater
    self.graph_updater = GraphUpdater(
        self.hparams.hidden_dim,  # type: ignore
        self.hparams.word_emb_dim,  # type: ignore
        len(self.node_vocab),
        self.hparams.node_emb_dim,  # type: ignore
        len(self.relation_vocab),
        self.hparams.relation_emb_dim,  # type: ignore
        self.hparams.text_encoder_num_blocks,  # type: ignore
        self.hparams.text_encoder_num_conv_layers,  # type: ignore
        self.hparams.text_encoder_kernel_size,  # type: ignore
        self.hparams.text_encoder_num_heads,  # type: ignore
        self.hparams.graph_encoder_num_cov_layers,  # type: ignore
        self.hparams.graph_encoder_num_bases,  # type: ignore
        pretrained_word_embedding,
        node_name_word_ids,
        node_name_mask,
        rel_name_word_ids,
        rel_name_mask,
    )
    self.graph_updater.pretraining = True

    # text decoder
    self.text_decoder = TextDecoder(
        text_decoder_num_blocks, hidden_dim, text_decoder_num_heads)
    self.target_word_prj = nn.Linear(hidden_dim, self.num_words, bias=False)
    self.ce_loss = nn.CrossEntropyLoss(
        ignore_index=self.preprocessor.pad_id, reduction="none")
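# Why `ignore_index=pad_id` together with `reduction="none"`: the per-token losses
# can then be averaged over non-pad target tokens only. A hedged usage sketch; the
# tensor names and shapes below are illustrative, not this module's actual API.
import torch

def masked_decoder_loss(ce_loss, logits, target_word_ids, target_mask):
    # logits: (batch, seq_len, num_words); target_word_ids: (batch, seq_len)
    # target_mask: (batch, seq_len), 1.0 for real tokens and 0.0 for padding
    per_token = ce_loss(logits.flatten(0, 1), target_word_ids.flatten())
    per_token = per_token.view_as(target_word_ids).float()
    # average over non-pad positions of each sequence, then over the batch
    return (per_token * target_mask).sum(dim=1).div(target_mask.sum(dim=1)).mean()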