Example #1
def make_fold(train_df, test_df, save_model_class, save_model_deep):
    text_util = TextUtils()

    # preprocess and tokenize the Spanish sentence pairs
    train_spa_sents_1 = train_df['spa_sent_1'].tolist()
    train_spa_sents_2 = train_df['spa_sent_2'].tolist()

    test_spa_sents_1 = test_df['spa_sent_1'].tolist()
    test_spa_sents_2 = test_df['spa_sent_2'].tolist()

    train_spa_tokens_1 = text_util.tokenize(sentences=train_spa_sents_1,
                                            language=text_util.spanish)
    train_spa_tokens_2 = text_util.tokenize(sentences=train_spa_sents_2,
                                            language=text_util.spanish)
    test_spa_tokens_1 = text_util.tokenize(sentences=test_spa_sents_1,
                                           language=text_util.spanish)
    test_spa_tokens_2 = text_util.tokenize(sentences=test_spa_sents_2,
                                           language=text_util.spanish)

    # build the vocabulary (using only the training dataset)
    train_spa_tokens = train_spa_tokens_1 + train_spa_tokens_2
    train_label_df = train_df['label'].tolist()

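    # word vocabulary plus an embedding matrix indexed by word id, initialized
    # from the pretrained 300-dimensional fastText vectors at fasttext_path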
    (spa_id2word, spa_word2id), spa_E_by_id = text_util.create_word_vocab(
        lst_tokens=train_spa_tokens,
        word_dim=300,
        fasttext_path='./data/new/pretrained/mine.wiki.es.vec')
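    # label vocabulary: maps each distinct training label to an integer id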
    (id2label, label2id) = text_util.create_label_vocab(labels=train_label_df)

    # build the datasets (i.e., convert each token and label to its corresponding id)
    train_dataset = text_util.create_dataset(lst_tokens_1=train_spa_tokens_1,
                                             lst_tokens_2=train_spa_tokens_2,
                                             labels=train_label_df,
                                             label2id=label2id,
                                             word2id_1=spa_word2id,
                                             word2id_2=spa_word2id)

    test_dataset = text_util.create_dataset(lst_tokens_1=test_spa_tokens_1,
                                            lst_tokens_2=test_spa_tokens_2,
                                            labels=test_df['label'].tolist(),
                                            label2id=label2id,
                                            word2id_1=spa_word2id,
                                            word2id_2=spa_word2id)

    # create batches
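    # batch_size (and n_epoch, init_lr, init_keep_prob below) is not defined in
    # this function; presumably these are module-level hyperparameters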
    train_batches = text_util.create_batch(dataset=train_dataset,
                                           batch_size=batch_size)
    test_batches = text_util.create_batch(dataset=test_dataset,
                                          batch_size=batch_size)

    # training
    train_score = train(train_batchs=train_batches,
                        test_batchs=test_batches,
                        n_epoch=n_epoch,
                        init_lr=init_lr,
                        init_keep_prob=init_keep_prob,
                        init_word_emb=spa_E_by_id,
                        text_util=text_util,
                        save_model_class=save_model_class,
                        save_model_deep=save_model_deep,
                        word2id=spa_word2id,
                        label2id=label2id)

    return train_score
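
For context, here is a minimal sketch of how make_fold might be driven in a cross-validation loop. The KFold split, the input path, and the checkpoint paths are assumptions, not part of the original source; the input file is expected to provide the spa_sent_1, spa_sent_2 and label columns used above.

import pandas as pd
from sklearn.model_selection import KFold

# hypothetical driver: the path and checkpoint names are illustrative only
data_df = pd.read_csv('./data/new/train.tsv', sep='\t')

cv = KFold(n_splits=5, shuffle=True, random_state=42)
fold_scores = []
for i, (train_idx, test_idx) in enumerate(cv.split(data_df)):
    score = make_fold(train_df=data_df.iloc[train_idx],
                      test_df=data_df.iloc[test_idx],
                      save_model_class='./models/fold_%d.class' % i,
                      save_model_deep='./models/fold_%d.ckpt' % i)
    fold_scores.append(score)

print('mean score across folds:', sum(fold_scores) / len(fold_scores))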
Example #2
    """
    data_df = pd.read_csv(data_file_path,
                          sep='\t',
                          header=None,
                          names=data_file_headers)
    """
    Processing (tokenize)
    """
    eval_spa_tokens_1 = text_util.tokenize(
        sentences=data_df['spa_sent_1'].tolist(), language=text_util.spanish)
    eval_spa_tokens_2 = text_util.tokenize(
        sentences=data_df['spa_sent_2'].tolist(), language=text_util.spanish)

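    # create_dataset expects labels, but gold labels are not needed for
    # scoring, so placeholder zeros are supplied for every row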
    dataset = text_util.create_dataset(lst_tokens_1=eval_spa_tokens_1,
                                       lst_tokens_2=eval_spa_tokens_2,
                                       labels=[0] * data_df.shape[0],
                                       word2id_1=model.word2id,
                                       word2id_2=model.word2id,
                                       label2id=model.label2id)
    """
    Create batches
    """
    batch_size = 32
    eval_batch_ids = [(s, min(s + batch_size, len(dataset)))
                      for s in range(0, len(dataset), batch_size)]
    """
    Get scores
    """
    results = []
    for batch in eval_batch_ids:
        # signature: batch_run(self, batch_input, text_util, mode,
        #                      init_lr=None, init_keep_prob=None, metric=f1_score)
        scores = model.batch_run(batch_input=dataset[batch[0]:batch[1]],