Example #1
 def __init__(self, samples, parser: ReversibleField = None, parse=True):
     super().__init__('self-bleu')
     if parse:
         samples = parser.reverse(samples)
     ref_tokens = [parser.tokenize(r) for r in samples]
     from fast_bleu import SelfBLEU as FSBLEU
     w = {i: np.ones(i) / i for i in range(2, 6)}
     self.bleu = FSBLEU(ref_tokens, w)
     print('self-bleu instance created!')
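A minimal usage sketch, assuming fast_bleu's documented get_score() API; the toy token lists below are illustrative and not from the original project.

# Illustrative only: querying a fast_bleu SelfBLEU instance built as above.
import numpy as np
from fast_bleu import SelfBLEU

tokenized = [['the', 'cat', 'sat'], ['the', 'dog', 'ran'], ['a', 'cat', 'slept']]
weights = {i: np.ones(i) / i for i in range(2, 6)}      # uniform weights for n = 2..5
scores = SelfBLEU(tokenized, weights).get_score()       # dict: n -> per-sample Self-BLEU
print({n: float(np.mean(s)) for n, s in scores.items()})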
Example #2
 def __init__(self, references, parser: ReversibleField = None, parse=True):
     super().__init__('bleu')
     if parse:
         references = parser.reverse(references)
     ref_tokens = [parser.tokenize(r) for r in references]
     self.parser = parser
     from fast_bleu import BLEU as FBLEU
     w = {i: np.ones(i) / i for i in range(2, 6)}
     self.bleu = FBLEU(ref_tokens, w)
     print('bleu instance created!')
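A hedged sketch of a companion scoring method such a wrapper might expose; it is not part of the original listing and relies only on fast_bleu's documented get_score().

 def get_score(self, samples, parse=True):
     # Hypothetical companion method (not in the original listing): score generated
     # samples against the stored references via fast_bleu's get_score().
     if parse:
         samples = self.parser.reverse(samples)
     hyp_tokens = [self.parser.tokenize(s) for s in samples]
     scores = self.bleu.get_score(hyp_tokens)   # dict: n -> per-hypothesis BLEU
     return {n: float(np.mean(s)) for n, s in scores.items()}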
Example #3
def preprocess_real_dataset(dataset_name):
    train_filename, valid_filename, test_filename =\
        "{}_train.txt".format(dataset_name),\
        "{}_valid.txt".format(dataset_name),\
        "{}_test.txt".format(dataset_name)

    import random
    random.seed(42)
    print(train_filename, valid_filename, test_filename)

    TEXT = ReversibleField(
        tokenize="revtok",
        tokenizer_language="en",
        init_token='<sos>',
        eos_token='<eos>',
        pad_token='<pad>',
        use_vocab=True,
        lower=True,
        batch_first=True,
        # fix_length=MAX_LENGTH
    )

    trn = LanguageModelingDataset(path=DATASET_PATH + train_filename,
                                  newline_eos=False,
                                  text_field=TEXT)

    vld = LanguageModelingDataset(path=DATASET_PATH + valid_filename,
                                  newline_eos=False,
                                  text_field=TEXT)

    tst = LanguageModelingDataset(path=DATASET_PATH + test_filename,
                                  newline_eos=False,
                                  text_field=TEXT)

    TEXT.build_vocab(trn, vld, tst)

    TEXT.max_length = max(max([len(t.text)
                               for t in trn]), max([len(t.text) for t in vld]),
                          max([len(t.text) for t in tst])) + 1

    dump(obj=TEXT,
         file_name=dataset_name + "_vocab.pkl",
         parent_path=DATASET_PATH)

    print(
        'vocab size: {}\ntrain size: {}\nvalid size: {}\ntest size: {}\nmax length: {}'
        .format(len(TEXT.vocab), len(trn), len(vld), len(tst),
                TEXT.max_length))
    return trn, vld, tst, TEXT
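A brief usage sketch, assuming the legacy torchtext API used above; the dataset name and token ids are placeholders.

# Illustrative usage (placeholder dataset name; reverse() needs revtok installed).
import torch

trn, vld, tst, TEXT = preprocess_real_dataset('coco')
print(len(TEXT.vocab), TEXT.max_length)

# ReversibleField.reverse maps a (batch, length) tensor of token ids back to strings.
ids = torch.tensor([[TEXT.vocab.stoi[w] for w in ['<sos>', 'a', 'dog', '<eos>']]])
print(TEXT.reverse(ids))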
Example #4
 def __init__(self, samples, parser: ReversibleField = None, parse=True):
     super().__init__('NgramProp')
     if parse:
         samples = parser.reverse(samples)
     ref_tokens = [parser.tokenize(r) for r in samples]
     scores = []
     lens = []
     for sample in ref_tokens:
         if len(sample) == 0:
             continue
         sample_score = float(len(Counter(sample))) / float(len(sample))
         scores.append(sample_score)
         lens.append(len(sample))
     self.score = float(np.mean(scores))
     self.len = float(np.mean(lens))
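For clarity, the per-sample score above is simply the number of distinct tokens divided by the sample length (a distinct-1-gram diversity measure). A tiny worked example:

from collections import Counter

sample = ['the', 'cat', 'and', 'the', 'dog']
score = len(Counter(sample)) / len(sample)   # 4 distinct tokens / 5 tokens = 0.8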
Example #5
    def __init__(self,
                 samples,
                 min_n=2,
                 max_n=5,
                 parser: ReversibleField = None,
                 parse=True):
        super().__init__()
        from fast_bleu import SelfBLEU as FSBLEU

        assert max_n >= min_n
        assert min_n >= 1

        if parse:
            samples = parser.reverse(samples)
        ref_tokens = [parser.tokenize(r) for r in samples]
        w = {i: np.ones(i) / i for i in range(min_n, max_n + 1)}
        self.selfbleu = FSBLEU(ref_tokens, w, verbose=True)
        print('LOG: SelfBLEU init done!')
Example #6
 def __init__(self,
              references,
              min_n=2,
              max_n=5,
              parser: ReversibleField = None,
              parse=True):
     super().__init__('jaccard')
     print('multiset distances init up to {}!'.format(max_n))
     if parse:
         references = parser.reverse(references)
     references = [parser.tokenize(r) for r in references]
     self.references = references
     self.max_n = max_n
     self.min_n = min_n
     self.parser = parser
     assert self.max_n >= self.min_n
     assert self.min_n >= 1
     self.ref_ngrams = self._get_ngrams(references)
     print('jaccard instance created!')
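The repository's _get_ngrams helper is not shown in this listing. As a generic illustration only (not the original implementation), a multiset Jaccard similarity over bigrams can be computed like this:

# Generic sketch, not the repository's implementation.
from collections import Counter

def bigrams(tokens):
    return Counter(zip(tokens, tokens[1:]))

ref = bigrams(['the', 'cat', 'sat', 'on', 'the', 'mat'])
hyp = bigrams(['the', 'cat', 'sat', 'on', 'the', 'rug'])
intersection = sum((ref & hyp).values())   # min count of each shared bigram
union = sum((ref | hyp).values())          # max count of each bigram
print(intersection / union)                # 4 / 6 ≈ 0.667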
Example #7
    def __init__(self,
                 ref_samples,
                 hyp_samples,
                 min_n=2,
                 max_n=5,
                 parser: ReversibleField = None,
                 parse=True):
        super().__init__()
        from fast_bleu import BLEU as FBLEU

        assert max_n >= min_n
        assert min_n >= 1

        if parse:
            ref_samples = parser.reverse(ref_samples)
            hyp_samples = parser.reverse(hyp_samples)
        self.ref_tokens = [parser.tokenize(r) for r in ref_samples]
        self.hyp_tokens = [parser.tokenize(r) for r in hyp_samples]
        self.parser = parser
        w = {i: np.ones(i) / i for i in range(min_n, max_n + 1)}
        self.bleu = FBLEU(self.hyp_tokens, w, verbose=True)
        print('LOG: ReverseBLEU init done!')
Example #8
def construct_field(
    field_type,
    batch_first=True,
    input_lower=True,
    lemmatized=False,
    input_include_lengths=True,
    input_fix_length=None,
):
    """ Construct TorchText field.

        Note: the `input_<x>` fields are specifically parameters for
              the `input_text` field type.
    """
    if field_type == 'input_text':
        if lemmatized:
            tokenizer = tokenize_fct_lemmatize
        else:
            tokenizer = "spacy"
        return ReversibleField(sequential=True,
                               use_vocab=True,
                               init_token=Constants.START_TOKEN,
                               eos_token=Constants.END_TOKEN,
                               lower=input_lower,
                               tokenize=tokenizer,
                               batch_first=batch_first,
                               pad_token=Constants.PAD_TOKEN,
                               unk_token=Constants.UNK_TOKEN,
                               include_lengths=input_include_lengths,
                               fix_length=input_fix_length,
                               preprocessing=gen_text_preprocessor())
    elif field_type == 'numeric_label':
        return LabelField(
            use_vocab=False,
            batch_first=batch_first,
        )
    elif field_type == 'bool_label':
        return LabelField(use_vocab=False,
                          batch_first=batch_first,
                          preprocessing=lambda x: (x == 'True'))
    else:
        raise Exception('Invalid Field Type')
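A short usage sketch; Constants and gen_text_preprocessor are assumed to come from the surrounding repository, and the field names below are placeholders.

# Illustrative usage (placeholder field names and path).
TEXT_FIELD = construct_field('input_text', input_fix_length=100)
LABEL_FIELD = construct_field('numeric_label')
fields = [('text', TEXT_FIELD), ('label', LABEL_FIELD)]
# e.g. TabularDataset(path='data/train.tsv', format='tsv', fields=fields)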
Example #9
def preprocess_oracle_dataset():
    from metrics.oracle.oracle_lstm import Oracle_LSTM

    dataset_name = 'oracle'
    train_filename, valid_filename, test_filename = "{}_train".format(dataset_name),\
        "{}_valid".format(dataset_name),\
        "{}_test".format(dataset_name)

    oracle = Oracle_LSTM(num_emb=5000,
                         batch_size=128,
                         emb_dim=3200,
                         hidden_dim=32,
                         sequence_length=20)

    N = 60 * 10**3
    N1 = int(N * 2 / 3)
    N2 = int(N * 1 / 6)

    samples = oracle.generate(N)
    samples = map(lambda xx: list(map(str, xx)), samples)
    samples = list(map(lambda x: " ".join(x), samples))

    train_samples = samples[:N1]
    valid_samples = samples[N1:N1 + N2]
    test_samples = samples[-N2:]

    write_text(train_samples, train_filename)
    write_text(valid_samples, valid_filename)
    write_text(test_samples, test_filename)

    import random
    random.seed(42)
    print(train_filename, valid_filename, test_filename)

    TEXT = ReversibleField(
        init_token='<sos>',
        use_vocab=True,
        lower=False,
        batch_first=True,
    )

    trn = LanguageModelingDataset(path=DATASET_PATH + train_filename + '.txt',
                                  newline_eos=False,
                                  text_field=TEXT)

    vld = LanguageModelingDataset(path=DATASET_PATH + valid_filename + '.txt',
                                  newline_eos=False,
                                  text_field=TEXT)

    tst = LanguageModelingDataset(path=DATASET_PATH + test_filename + '.txt',
                                  newline_eos=False,
                                  text_field=TEXT)

    TEXT.build_vocab(trn, vld, tst)

    TEXT.max_length = max(max([len(t.text) for t in trn]),
                          max([len(t.text) for t in vld]),
                          max([len(t.text) for t in tst]))

    dump(obj=TEXT,
         file_name=dataset_name + "_vocab.pkl",
         parent_path=DATASET_PATH)

    print(
        'vocab size: {}\ntrain size: {}\nvalid size: {}\ntest size: {}\nmax length: {}'
        .format(len(TEXT.vocab), len(trn), len(vld), len(tst),
                TEXT.max_length))
    return trn, vld, tst, TEXT
Example #10
seed = 2019
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)


def light_tokenize(sequence: str):
    return sequence.split()


def action_tokenize(sequence: str):
    return [sequence]


TEXT = Field(sequential=True, tokenize=light_tokenize, eos_token=END_OF_INPUT_TOK, pad_token=None)
ACTION = ReversibleField(
    sequential=True, tokenize=action_tokenize, is_target=True, unk_token=None, pad_token=None)

Fields = [('text', TEXT), ('action', ACTION)]


class TDPTool(Tool):
    def get_dataset(self, path: str, fields=Fields, separator=' ||| '):
        logger.info('loading dataset from {}'.format(path))
        tdp_dataset = TransitionDataset(path, fields=fields, separator=separator)
        logger.info('successfully loaded dataset')
        return tdp_dataset

    def get_vocab(self, *dataset):
        logger.info('building word vocab...')
        TEXT.build_vocab(*dataset, specials=[ROOT_TOK, NULL_STACK_TOK])
        logger.info('successfully built word vocab')
Example #11
from torchtext.data import Dataset, Field, BucketIterator, ReversibleField
from torchtext.vocab import Vectors
from torchtext.datasets import SequenceTaggingDataset
from utils.log import logger
from config import DEVICE, DEFAULT_CONFIG


def light_tokenize(sequence: str):
    return [sequence]


# Regular field
TEXT = Field(sequential=True, tokenize=light_tokenize, include_lengths=True)
# Reversible field: can map word ids back to the original words
TAG = ReversibleField(sequential=True,
                      tokenize=light_tokenize,
                      is_target=True,
                      unk_token=None)
Fields = [('text', TEXT), ('tag', TAG)]


class TOOL(object):

    # @staticmethod
    def get_dataset(self, path: str, fields=Fields, separator='\t'):
        logger.info('loading dataset from {}'.format(path))
        st_dataset = SequenceTaggingDataset(path,
                                            fields=fields,
                                            separator=separator)
        logger.info('successfully loaded dataset')
        return st_dataset
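A minimal usage sketch with a placeholder path; build_vocab and BucketIterator are the standard legacy-torchtext calls already imported above, and DEVICE comes from the config import.

# Illustrative usage (placeholder path).
tool = TOOL()
train_set = tool.get_dataset('data/train.tsv')
TEXT.build_vocab(train_set)
TAG.build_vocab(train_set)
train_iter = BucketIterator(train_set, batch_size=32, device=DEVICE,
                            sort_key=lambda ex: len(ex.text))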
Example #12
                                 repeat=False,
                                 sort=True,
                                 sort_key=lambda x: len(x.data))

    dataset_iter_val = data.Iterator(validation_set,
                                     batch_size=1,
                                     device=device,
                                     train=True,
                                     shuffle=True,
                                     repeat=False,
                                     sort=False)
else:

    FIELD = ReversibleField(batch_first=False,
                            init_token='<init>',
                            eos_token='<eos>',
                            lower=True,
                            include_lengths=True)

    split_cnn = CNN.splits(fields=FIELD)
    split_dm = DailyMail.splits(fields=FIELD)

    for scnn, sdm in zip(split_cnn, split_dm):
        scnn.examples.extend(sdm)
    split = split_cnn

    FIELD.build_vocab(split[0].src, vectors="glove.6B." + str(glove_dim) + "d")
    vocab = copy.deepcopy(FIELD.vocab)

    dataset_iter, dataset_iter_val, dataset_iter_test = BucketIterator.splits(
        split,
Example #13
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score

from ...base.tool import Tool
from ...utils.log import logger
from .config import DEVICE, DEFAULT_CONFIG

seed = 2019
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)


def light_tokenize(sequence: str):
    return [x for x in sequence.strip()]


TEXT = ReversibleField(sequential=True, tokenize=light_tokenize)


class LMTool(Tool):
    def tokenize(self, sequence: str):
        return [x for x in sequence.strip()]

    def get_dataset(self, path: str, field=TEXT, newline_eos=False):
        logger.info('loading dataset from {}'.format(path))
        lm_dataset = LanguageModelingDataset(path, text_field=field, newline_eos=newline_eos)
        logger.info('successfully loaded dataset')
        return lm_dataset

    def get_vocab(self, *dataset):
        logger.info('building word vocab...')
        TEXT.build_vocab(*dataset)
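A brief usage sketch with a placeholder corpus path, reusing only the calls defined above.

# Illustrative usage (placeholder path).
tool = LMTool()
corpus = tool.get_dataset('data/corpus.txt')
tool.get_vocab(corpus)
print(len(TEXT.vocab))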