def make_fold(train_df, test_df, save_model_class, save_model_deep):
    text_util = TextUtils()

    # Preprocess and tokenize.
    train_spa_sent_1_df = train_df['spa_sent_1'].tolist()
    train_spa_sent_2_df = train_df['spa_sent_2'].tolist()
    test_spa_sent_1_df = test_df['spa_sent_1'].tolist()
    test_spa_sent_2_df = test_df['spa_sent_2'].tolist()

    train_spa_tokens_1 = text_util.tokenize(sentences=train_spa_sent_1_df,
                                            language=text_util.spanish)
    train_spa_tokens_2 = text_util.tokenize(sentences=train_spa_sent_2_df,
                                            language=text_util.spanish)
    test_spa_tokens_1 = text_util.tokenize(sentences=test_spa_sent_1_df,
                                           language=text_util.spanish)
    test_spa_tokens_2 = text_util.tokenize(sentences=test_spa_sent_2_df,
                                           language=text_util.spanish)

    # Build the vocabulary (using only the training dataset).
    train_spa_tokens = train_spa_tokens_1 + train_spa_tokens_2
    train_label_df = train_df['label'].tolist()
    (spa_id2word, spa_word2id), spa_E_by_id = text_util.create_word_vocab(
        lst_tokens=train_spa_tokens,
        word_dim=300,
        fasttext_path='./data/new/pretrained/mine.wiki.es.vec')
    (id2label, label2id) = text_util.create_label_vocab(labels=train_label_df)

    # Build the datasets (i.e. convert tokens and labels to their corresponding ids).
    train_dataset = text_util.create_dataset(lst_tokens_1=train_spa_tokens_1,
                                             lst_tokens_2=train_spa_tokens_2,
                                             labels=train_label_df,
                                             label2id=label2id,
                                             word2id_1=spa_word2id,
                                             word2id_2=spa_word2id)
    test_dataset = text_util.create_dataset(lst_tokens_1=test_spa_tokens_1,
                                            lst_tokens_2=test_spa_tokens_2,
                                            labels=test_df['label'].tolist(),
                                            label2id=label2id,
                                            word2id_1=spa_word2id,
                                            word2id_2=spa_word2id)

    # Create batches.
    train_batches = text_util.create_batch(dataset=train_dataset, batch_size=batch_size)
    test_batches = text_util.create_batch(dataset=test_dataset, batch_size=batch_size)

    # Train on this fold and return its score.
    train_score = train(train_batchs=train_batches,
                        test_batchs=test_batches,
                        n_epoch=n_epoch,
                        init_lr=init_lr,
                        init_keep_prob=init_keep_prob,
                        init_word_emb=spa_E_by_id,
                        text_util=text_util,
                        save_model_class=save_model_class,
                        save_model_deep=save_model_deep,
                        word2id=spa_word2id,
                        label2id=label2id)
    return train_score
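# A minimal, hypothetical driver sketching how make_fold could be used for
# k-fold cross-validation. The run_cross_validation name, the model-path
# pattern, and sklearn's KFold are illustrative assumptions, not part of this
# codebase; only the column names and the tab-separated format come from the
# evaluation code below.
def run_cross_validation(data_file_path, n_splits=5):
    import pandas as pd
    from sklearn.model_selection import KFold

    data_df = pd.read_csv(data_file_path, sep='\t', header=None,
                          names=['spa_sent_1', 'spa_sent_2', 'label'])
    fold_scores = []
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    for fold, (train_idx, test_idx) in enumerate(kf.split(data_df)):
        score = make_fold(train_df=data_df.iloc[train_idx],
                          test_df=data_df.iloc[test_idx],
                          save_model_class='./models/fold_%d_class' % fold,
                          save_model_deep='./models/fold_%d_deep' % fold)
        fold_scores.append(score)
    # Average the per-fold scores returned by train().
    return sum(fold_scores) / len(fold_scores)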
""" data_df = pd.read_csv(data_file_path, sep='\t', header=None, names=data_file_headers) """ Processing (tokenize) """ eval_spa_tokens_1 = text_util.tokenize( sentences=data_df['spa_sent_1'].tolist(), language=text_util.spanish) eval_spa_tokens_2 = text_util.tokenize( sentences=data_df['spa_sent_2'].tolist(), language=text_util.spanish) dataset = text_util.create_dataset(lst_tokens_1=eval_spa_tokens_1, lst_tokens_2=eval_spa_tokens_2, labels=[0] * data_df.shape[0], word2id_1=model.word2id, word2id_2=model.word2id, label2id=model.label2id) """ Create batches """ batch_size = 32 eval_batch_ids = [(s, min(s + batch_size, len(dataset))) for s in range(0, len(dataset), batch_size)] """ Get scores """ results = [] for batch in eval_batch_ids: #batch_run(self, batch_input, text_util, mode, init_lr = None, init_keep_prob=None, metric=f1_score): scores = model.batch_run(batch_input=dataset[batch[0]:batch[1]],