def __init__(self, samples, parser: ReversibleField = None, parse=True):
    super().__init__('self-bleu')
    # Detokenize generated samples back to raw text, then re-tokenize them
    # so fast_bleu receives lists of tokens.
    if parse:
        samples = parser.reverse(samples)
    ref_tokens = [parser.tokenize(r) for r in samples]
    from fast_bleu import SelfBLEU as FSBLEU
    # Uniform n-gram weights for orders 2..5.
    w = {i: np.ones(i) / i for i in range(2, 6)}
    self.bleu = FSBLEU(ref_tokens, w)
    print('self-bleu instance created!')
def __init__(self, references, parser: ReversibleField = None, parse=True):
    super().__init__('bleu')
    if parse:
        references = parser.reverse(references)
    ref_tokens = [parser.tokenize(r) for r in references]
    self.parser = parser
    from fast_bleu import BLEU as FBLEU
    # Uniform n-gram weights for orders 2..5.
    w = {i: np.ones(i) / i for i in range(2, 6)}
    self.bleu = FBLEU(ref_tokens, w)
    print('bleu instance created!')
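# Usage sketch (not from the original files; an assumption about how the wrapped
# fast_bleu objects above are queried): BLEU.get_score / SelfBLEU.get_score return
# a dict keyed by the weight keys (the n-gram orders 2..5 here), each mapping to a
# list of per-sentence scores. The toy token lists are illustrative only.
import numpy as np
from fast_bleu import BLEU, SelfBLEU

refs = [['the', 'cat', 'sat', 'on', 'the', 'mat'],
        ['a', 'dog', 'slept', 'on', 'the', 'rug']]
hyps = [['the', 'cat', 'sat', 'on', 'a', 'mat']]
weights = {i: np.ones(i) / i for i in range(2, 6)}

print(BLEU(refs, weights).get_score(hyps))   # {2: [...], 3: [...], 4: [...], 5: [...]}
print(SelfBLEU(refs, weights).get_score())   # per-reference self-BLEU scores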
def preprocess_real_dataset(dataset_name):
    """Load the <dataset>_{train,valid,test}.txt splits, build a reversible
    vocabulary, and pickle the field next to the data."""
    train_filename, valid_filename, test_filename = \
        "{}_train.txt".format(dataset_name), \
        "{}_valid.txt".format(dataset_name), \
        "{}_test.txt".format(dataset_name)
    import random
    random.seed(42)
    print(train_filename, valid_filename, test_filename)
    TEXT = ReversibleField(
        tokenize="revtok",
        tokenizer_language="en",
        init_token='<sos>',
        eos_token='<eos>',
        pad_token='<pad>',
        use_vocab=True,
        lower=True,
        batch_first=True,
        # fix_length=MAX_LENGTH
    )
    trn = LanguageModelingDataset(path=DATASET_PATH + train_filename,
                                  newline_eos=False, text_field=TEXT)
    vld = LanguageModelingDataset(path=DATASET_PATH + valid_filename,
                                  newline_eos=False, text_field=TEXT)
    tst = LanguageModelingDataset(path=DATASET_PATH + test_filename,
                                  newline_eos=False, text_field=TEXT)
    TEXT.build_vocab(trn, vld, tst)
    # Longest example across all splits, plus one token of headroom.
    TEXT.max_length = max(max(len(t.text) for t in trn),
                          max(len(t.text) for t in vld),
                          max(len(t.text) for t in tst)) + 1
    dump(obj=TEXT, file_name=dataset_name + "_vocab.pkl",
         parent_path=DATASET_PATH)
    print('vocab size: {}\ntrain size: {}\nvalid size: {}\ntest size: {}\nmax length: {}'
          .format(len(TEXT.vocab), len(trn), len(vld), len(tst), TEXT.max_length))
    return trn, vld, tst, TEXT
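# Usage sketch (assumption, not part of the original module): the field returned
# by preprocess_real_dataset is reversible, so numericalized batches can be mapped
# back to detokenized text. The dataset name 'image_coco' is only an example.
trn, vld, tst, TEXT = preprocess_real_dataset('image_coco')
batch = TEXT.process([trn[0].text[:10]])   # list of tokens -> padded LongTensor
print(TEXT.reverse(batch))                 # LongTensor -> detokenized strings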
def __init__(self, samples, parser: ReversibleField = None, parse=True):
    super().__init__('NgramProp')
    if parse:
        samples = parser.reverse(samples)
    ref_tokens = [parser.tokenize(r) for r in samples]
    scores = []
    lens = []
    for sample in ref_tokens:
        if len(sample) == 0:
            continue
        # Fraction of distinct tokens in this sample.
        sample_score = float(len(Counter(sample))) / float(len(sample))
        scores.append(sample_score)
        lens.append(len(sample))
    self.score = float(np.mean(scores))
    self.len = float(np.mean(lens))
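# Worked example (illustration, not from the original file): the per-sample score
# above is the fraction of distinct tokens, so ['to', 'be', 'or', 'not', 'to', 'be']
# has 4 distinct tokens out of 6; self.score and self.len then average these ratios
# and lengths over all non-empty samples.
from collections import Counter
tokens = ['to', 'be', 'or', 'not', 'to', 'be']
print(len(Counter(tokens)) / len(tokens))  # 0.666...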
def __init__(self, samples, min_n=2, max_n=5, parser: ReversibleField = None, parse=True):
    super().__init__()
    from fast_bleu import SelfBLEU as FSBLEU
    assert max_n >= min_n
    assert min_n >= 1
    if parse:
        samples = parser.reverse(samples)
    ref_tokens = [parser.tokenize(r) for r in samples]
    w = {i: np.ones(i) / i for i in range(min_n, max_n + 1)}
    self.selfbleu = FSBLEU(ref_tokens, w, verbose=True)
    print('LOG: SelfBLEU init done!')
def __init__(self, references, min_n=2, max_n=5, parser: ReversibleField = None, parse=True):
    super().__init__('jaccard')
    print('multiset distances init up to {}!'.format(max_n))
    if parse:
        references = parser.reverse(references)
    references = [parser.tokenize(r) for r in references]
    self.references = references
    self.max_n = max_n
    self.min_n = min_n
    self.parser = parser
    assert self.max_n >= self.min_n
    assert self.min_n >= 1
    # Pre-compute the reference n-gram multisets once; hypotheses are later
    # compared against these when the distance is evaluated.
    self.ref_ngrams = self._get_ngrams(references)
    print('jaccard instance created!')
def __init__(self, ref_samples, hyp_samples, min_n=2, max_n=5,
             parser: ReversibleField = None, parse=True):
    super().__init__()
    from fast_bleu import BLEU as FBLEU
    assert max_n >= min_n
    assert min_n >= 1
    if parse:
        ref_samples = parser.reverse(ref_samples)
        hyp_samples = parser.reverse(hyp_samples)
    self.ref_tokens = [parser.tokenize(r) for r in ref_samples]
    self.hyp_tokens = [parser.tokenize(r) for r in hyp_samples]
    self.parser = parser
    w = {i: np.ones(i) / i for i in range(min_n, max_n + 1)}
    # "Reverse" BLEU: the generated samples act as the reference set, so the
    # real references are scored against them.
    self.bleu = FBLEU(self.hyp_tokens, w, verbose=True)
    print('LOG: ReverseBLEU init done!')
def construct_field(
        field_type,
        batch_first=True,
        input_lower=True,
        lemmatized=False,
        input_include_lengths=True,
        input_fix_length=None,
):
    """
    Construct a TorchText field.

    Note: the `input_<x>` arguments are parameters for the `input_text`
    field type only.
    """
    if field_type == 'input_text':
        if lemmatized:
            tokenizer = tokenize_fct_lemmatize
        else:
            tokenizer = "spacy"
        return ReversibleField(sequential=True,
                               use_vocab=True,
                               init_token=Constants.START_TOKEN,
                               eos_token=Constants.END_TOKEN,
                               lower=input_lower,
                               tokenize=tokenizer,
                               batch_first=batch_first,
                               pad_token=Constants.PAD_TOKEN,
                               unk_token=Constants.UNK_TOKEN,
                               include_lengths=input_include_lengths,
                               fix_length=input_fix_length,
                               preprocessing=gen_text_preprocessor())
    elif field_type == 'numeric_label':
        return LabelField(
            use_vocab=False,
            batch_first=batch_first,
        )
    elif field_type == 'bool_label':
        return LabelField(use_vocab=False,
                          batch_first=batch_first,
                          preprocessing=lambda x: (x == 'True'))
    else:
        raise Exception('Invalid Field Type')
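# Usage sketch (assumption): pairing the fields from construct_field with a legacy
# torchtext TabularDataset. The file name and column layout are illustrative;
# Constants and gen_text_preprocessor come from the surrounding project.
from torchtext.data import TabularDataset

text_field = construct_field('input_text', input_fix_length=128)
label_field = construct_field('numeric_label')
train = TabularDataset(path='train.tsv', format='tsv',
                       fields=[('text', text_field), ('label', label_field)])
text_field.build_vocab(train, max_size=30000)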
def preprocess_oracle_dataset():
    """Sample synthetic data from the oracle LSTM, write train/valid/test
    splits to disk, and build the vocabulary field."""
    from metrics.oracle.oracle_lstm import Oracle_LSTM
    dataset_name = 'oracle'
    train_filename, valid_filename, test_filename = \
        "{}_train".format(dataset_name), \
        "{}_valid".format(dataset_name), \
        "{}_test".format(dataset_name)
    oracle = Oracle_LSTM(num_emb=5000, batch_size=128, emb_dim=3200,
                         hidden_dim=32, sequence_length=20)
    # 60k samples: 2/3 train, 1/6 valid, 1/6 test.
    N = 60 * 10**3
    N1 = int(N * 2 / 3)
    N2 = int(N * 1 / 6)
    samples = oracle.generate(N)
    samples = map(lambda xx: list(map(str, xx)), samples)
    samples = list(map(lambda x: " ".join(x), samples))
    train_samples = samples[:N1]
    valid_samples = samples[N1:N1 + N2]
    test_samples = samples[-N2:]
    write_text(train_samples, train_filename)
    write_text(valid_samples, valid_filename)
    write_text(test_samples, test_filename)
    import random
    random.seed(42)
    print(train_filename, valid_filename, test_filename)
    TEXT = ReversibleField(
        init_token='<sos>',
        use_vocab=True,
        lower=False,
        batch_first=True,
    )
    trn = LanguageModelingDataset(path=DATASET_PATH + train_filename + '.txt',
                                  newline_eos=False, text_field=TEXT)
    vld = LanguageModelingDataset(path=DATASET_PATH + valid_filename + '.txt',
                                  newline_eos=False, text_field=TEXT)
    tst = LanguageModelingDataset(path=DATASET_PATH + test_filename + '.txt',
                                  newline_eos=False, text_field=TEXT)
    TEXT.build_vocab(trn, vld, tst)
    TEXT.max_length = max(max(len(t.text) for t in trn),
                          max(len(t.text) for t in vld),
                          max(len(t.text) for t in tst))
    dump(obj=TEXT, file_name=dataset_name + "_vocab.pkl",
         parent_path=DATASET_PATH)
    print('vocab size: {}\ntrain size: {}\nvalid size: {}\ntest size: {}\nmax length: {}'
          .format(len(TEXT.vocab), len(trn), len(vld), len(tst), TEXT.max_length))
    return trn, vld, tst, TEXT
seed = 2019
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)


def light_tokenize(sequence: str):
    return sequence.split()


def action_tokenize(sequence: str):
    return [sequence]


TEXT = Field(sequential=True, tokenize=light_tokenize,
             eos_token=END_OF_INPUT_TOK, pad_token=None)
ACTION = ReversibleField(sequential=True, tokenize=action_tokenize,
                         is_target=True, unk_token=None, pad_token=None)
Fields = [('text', TEXT), ('action', ACTION)]


class TDPTool(Tool):
    def get_dataset(self, path: str, fields=Fields, separator=' ||| '):
        logger.info('loading dataset from {}'.format(path))
        tdp_dataset = TransitionDataset(path, fields=fields, separator=separator)
        logger.info('successfully loaded dataset')
        return tdp_dataset

    def get_vocab(self, *dataset):
        logger.info('building word vocab...')
        TEXT.build_vocab(*dataset, specials=[ROOT_TOK, NULL_STACK_TOK])
        logger.info('successfully built word vocab')
from torchtext.data import Dataset, Field, BucketIterator, ReversibleField
from torchtext.vocab import Vectors
from torchtext.datasets import SequenceTaggingDataset
from utils.log import logger
from config import DEVICE, DEFAULT_CONFIG


def light_tokenize(sequence: str):
    return [sequence]


# Plain field
TEXT = Field(sequential=True, tokenize=light_tokenize, include_lengths=True)
# Reversible field: word ids can be mapped back to the original words
TAG = ReversibleField(sequential=True, tokenize=light_tokenize,
                      is_target=True, unk_token=None)
Fields = [('text', TEXT), ('tag', TAG)]


class TOOL(object):
    # @staticmethod
    def get_dataset(self, path: str, fields=Fields, separator='\t'):
        logger.info('loading dataset from {}'.format(path))
        st_dataset = SequenceTaggingDataset(path, fields=fields, separator=separator)
        logger.info('successfully loaded dataset')
        return st_dataset
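# Usage sketch (assumption): typical flow for the tagging TOOL above. The
# CoNLL-style file path and batch size are illustrative; DEVICE comes from the
# project config imported above.
tool = TOOL()
train_set = tool.get_dataset('data/train.tsv')
TEXT.build_vocab(train_set)
TAG.build_vocab(train_set)
train_iter = BucketIterator(train_set, batch_size=32, device=DEVICE,
                            sort_key=lambda x: len(x.text), shuffle=True)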
                                     repeat=False, sort=True,
                                     sort_key=lambda x: len(x.data))
        dataset_iter_val = data.Iterator(validation_set, batch_size=1,
                                         device=device, train=True,
                                         shuffle=True, repeat=False,
                                         sort=False)
    else:
        FIELD = ReversibleField(batch_first=False, init_token='<init>',
                                eos_token='<eos>', lower=True,
                                include_lengths=True)
        split_cnn = CNN.splits(fields=FIELD)
        split_dm = DailyMail.splits(fields=FIELD)
        for scnn, sdm in zip(split_cnn, split_dm):
            scnn.examples.extend(sdm)
        split = split_cnn
        FIELD.build_vocab(split[0].src,
                          vectors="glove.6B." + str(glove_dim) + "d")
        vocab = copy.deepcopy(FIELD.vocab)
        dataset_iter, dataset_iter_val, dataset_iter_test = BucketIterator.splits(
            split,
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score

from ...base.tool import Tool
from ...utils.log import logger
from .config import DEVICE, DEFAULT_CONFIG

seed = 2019
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)


def light_tokenize(sequence: str):
    return [x for x in sequence.strip()]


TEXT = ReversibleField(sequential=True, tokenize=light_tokenize)


class LMTool(Tool):
    def tokenize(self, sequence: str):
        return [x for x in sequence.strip()]

    def get_dataset(self, path: str, field=TEXT, newline_eos=False):
        logger.info('loading dataset from {}'.format(path))
        lm_dataset = LanguageModelingDataset(path, text_field=field,
                                             newline_eos=newline_eos)
        logger.info('successfully loaded dataset')
        return lm_dataset

    def get_vocab(self, *dataset):
        logger.info('building word vocab...')
        TEXT.build_vocab(*dataset)
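# Usage sketch (assumption, including that LMTool takes no constructor arguments):
# building a character-level LM corpus with the tool above; the corpus path is
# illustrative.
tool = LMTool()
corpus = tool.get_dataset('data/lm_corpus.txt')
tool.get_vocab(corpus)
print('vocab size:', len(TEXT.vocab))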