from torchnlp.text_encoders import IdentityEncoder
from torchnlp.text_encoders import UNKNOWN_TOKEN


def test_identity_encoder_unknown():
    sample = [
        'people/deceased_person/place_of_death',
        'symbols/name_source/namesakes'
    ]
    encoder = IdentityEncoder(sample)
    input_ = 'symbols/namesake/named_after'
    output = encoder.encode(input_)
    # A label absent from the sample vocabulary decodes to the unknown token.
    assert len(output) == 1
    assert encoder.decode(output) == UNKNOWN_TOKEN


def test_identity_encoder_known():
    input_ = 'symbols/namesake/named_after'
    sample = [
        'people/deceased_person/place_of_death',
        'symbols/name_source/namesakes'
    ]
    sample.append(input_)
    encoder = IdentityEncoder(sample)
    output = encoder.encode(input_)
    # A label present in the sample vocabulary round-trips through encode/decode.
    assert len(output) == 1
    assert encoder.decode(output) == input_


def test_identity_encoder_sequence():
    input_ = [
        'symbols/namesake/named_after',
        'people/deceased_person/place_of_death'
    ]
    sample = [
        'people/deceased_person/place_of_death',
        'symbols/name_source/namesakes'
    ]
    encoder = IdentityEncoder(sample)
    output = encoder.encode(input_)
    # Encoding a list yields one id per label; unseen labels decode to UNKNOWN_TOKEN.
    assert len(output) == 2
    assert encoder.decode(output) == [
        UNKNOWN_TOKEN, 'people/deceased_person/place_of_death'
    ]
def __init__(self, data_path=None):
    super().__init__()
    save_path = path.join(kPrepDataDir, 'labels.pt')
    if path.exists(save_path):
        # Reuse the cached label list if it has already been prepared.
        with open(save_path, 'rb') as f:
            labels = pickle.load(f)
        self.encoder = IdentityEncoder(labels)
        return
    # Otherwise build the label list from the raw TSV file: the label is the
    # last tab-separated field on each line, with the trailing newline stripped.
    labels = []
    with open(data_path, 'r') as f:
        for line in f:
            label = line.split('\t')[-1][:-1]
            labels.append(label)
    self.encoder = IdentityEncoder(labels)
    # Cache the vocabulary for subsequent runs.
    with open(save_path, 'wb') as f:
        pickle.dump(self.vocab(), f)
def __init__(self, is_char=False):
    self.mapped_data = dict()
    if not is_char:
        # Word-level Penn Treebank splits.
        self.train, self.valid, self.test = ptb(train=True, dev=True, test=True)
    else:
        # Character-level Penn Treebank splits.
        self.train, self.valid, self.test = ptb(
            train=True, dev=True, test=True,
            train_filename="ptb.char.train.txt",
            dev_filename="ptb.char.valid.txt",
            test_filename="ptb.char.test.txt")
    self._map_data(self.train + self.valid + self.test)
    # Encode the data: build the vocabulary over all splits, then encode each split.
    encoder = IdentityEncoder(self.train + self.valid + self.test)
    self.train = torch.LongTensor(encoder.encode(self.train))
    self.valid = torch.LongTensor(encoder.encode(self.valid))
    self.test = torch.LongTensor(encoder.encode(self.test))
    self.ntoken = encoder.vocab_size
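# NOTE (sketch, not part of the snippet above): after encoding, each split is a flat
# LongTensor of token ids. A common next step in PTB language-model pipelines is to
# reshape each split into (sequence_length, batch_size) columns for BPTT training;
# a minimal version of that helper, using only plain PyTorch, could look like this.
import torch


def batchify(data, batch_size):
    # Trim off tokens that do not divide evenly, then lay the token stream out
    # column-wise so consecutive BPTT slices stay contiguous in memory.
    nbatch = data.size(0) // batch_size
    data = data.narrow(0, 0, nbatch * batch_size)
    return data.view(batch_size, -1).t().contiguous()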
def model_load(fn):
    global model, criterion, optimizer
    with open(fn, 'rb') as f:
        model, criterion, optimizer = torch.load(f)


from torchnlp import datasets
from torchnlp.text_encoders import IdentityEncoder
from torchnlp.samplers import BPTTBatchSampler

print('Producing dataset...')
train, val, test = getattr(datasets, args.data)(train=True, dev=True, test=True)

# Build one vocabulary over all splits, then encode each split.
encoder = IdentityEncoder(train + val + test)

train_data = encoder.encode(train)
val_data = encoder.encode(val)
test_data = encoder.encode(test)

eval_batch_size = 10
test_batch_size = 1

# One BPTT sampler per split for the source sequence and one for the target sequence.
train_source_sampler, val_source_sampler, test_source_sampler = tuple([
    BPTTBatchSampler(d, args.bptt, args.batch_size, True, 'source')
    for d in (train, val, test)
])

train_target_sampler, val_target_sampler, test_target_sampler = tuple([
    BPTTBatchSampler(d, args.bptt, args.batch_size, True, 'target')
    for d in (train, val, test)
])
# load dataset
train, dev, test = snli_dataset(train=True, dev=True, test=True)

# Preprocess
for row in datasets_iterator(train, dev, test):
    row['premise'] = row['premise'].lower()
    row['hypothesis'] = row['hypothesis'].lower()

# Make Encoders
sentence_corpus = [row['premise'] for row in datasets_iterator(train, dev, test)]
sentence_corpus += [row['hypothesis'] for row in datasets_iterator(train, dev, test)]
sentence_encoder = WhitespaceEncoder(sentence_corpus)
label_corpus = [row['label'] for row in datasets_iterator(train, dev, test)]
label_encoder = IdentityEncoder(label_corpus)

# Encode
for row in datasets_iterator(train, dev, test):
    row['premise'] = sentence_encoder.encode(row['premise'])
    row['hypothesis'] = sentence_encoder.encode(row['hypothesis'])
    row['label'] = label_encoder.encode(row['label'])

config = args
config.n_embed = sentence_encoder.vocab_size
config.d_out = label_encoder.vocab_size
config.n_cells = config.n_layers

# double the number of cells for bidirectional networks
if config.birnn:
    config.n_cells *= 2
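# NOTE (sketch, not part of the snippet above): each encoded premise/hypothesis is a
# 1-D LongTensor of word ids with its own length, so rows must be padded before they
# can be stacked into a batch. One way to do that with plain PyTorch:
import torch
from torch.nn.utils.rnn import pad_sequence


def pad_premises(rows, padding_value=0):
    # rows: iterable of dicts whose 'premise' field has already been encoded.
    premises = [row['premise'] for row in rows]
    lengths = torch.tensor([len(p) for p in premises])
    # (batch_size, max_len) tensor; shorter sequences are padded with padding_value.
    batch = pad_sequence(premises, batch_first=True, padding_value=padding_value)
    return batch, lengths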
def load_data(data_type, preprocessing=False, fine_grained=False, verbose=False, text_length=5000,
              encode=True, load_SLE=False):
    if data_type == 'imdb':
        train_data, test_data = imdb_dataset(preprocessing=preprocessing, verbose=verbose,
                                             text_length=text_length)
    elif data_type == 'newsgroups':
        train_data, test_data = newsgroups_dataset(preprocessing=preprocessing, verbose=verbose,
                                                   text_length=text_length)
    elif data_type == 'reuters':
        train_data, test_data = reuters_dataset(preprocessing=preprocessing, fine_grained=fine_grained,
                                                verbose=verbose, text_length=text_length)
    elif data_type == 'webkb':
        train_data, test_data = webkb_dataset(preprocessing=preprocessing, verbose=verbose,
                                              text_length=text_length)
    elif data_type == 'cade':
        train_data, test_data = cade_dataset(preprocessing=preprocessing, verbose=verbose,
                                             text_length=text_length)
    elif data_type == 'dbpedia':
        train_data, test_data = dbpedia_dataset(preprocessing=preprocessing, verbose=verbose,
                                                text_length=text_length)
    elif data_type == 'agnews':
        train_data, test_data = agnews_dataset(preprocessing=preprocessing, verbose=verbose,
                                               text_length=text_length)
    elif data_type == 'yahoo':
        train_data, test_data = yahoo_dataset(preprocessing=preprocessing, verbose=verbose,
                                              text_length=text_length)
    elif data_type == 'sogou':
        train_data, test_data = sogou_dataset(preprocessing=preprocessing, verbose=verbose,
                                              text_length=text_length)
    elif data_type == 'yelp':
        train_data, test_data = yelp_dataset(preprocessing=preprocessing, fine_grained=fine_grained,
                                             verbose=verbose, text_length=text_length)
    elif data_type == 'amazon':
        train_data, test_data = amazon_dataset(preprocessing=preprocessing, fine_grained=fine_grained,
                                               verbose=verbose, text_length=text_length)
    elif data_type == 'custom':
        test_data = custom_dataset(preprocessing=preprocessing, fine_grained=fine_grained,
                                   verbose=verbose, text_length=text_length)
        sentence_encoder = pickle.load(open('epochs/sentence_encoder', 'rb'))
        label_encoder = pickle.load(open('epochs/label_encoder', 'rb'))
        for row in datasets_iterator(test_data):
            row['text'] = sentence_encoder.encode(' '.join(row['text']))
            row['label'] = label_encoder.encode(row['label'])
        return sentence_encoder.vocab_size, label_encoder.vocab_size, test_data
    else:
        raise ValueError('{} data type not supported.'.format(data_type))

    if encode:
        if load_SLE:
            sentence_encoder = pickle.load(open('epochs/sentence_encoder', 'rb'))
            label_encoder = pickle.load(open('epochs/label_encoder', 'rb'))
        else:
            sentence_corpus = [row['text'] for row in datasets_iterator(train_data, )]
            label_corpus = [row['label'] for row in datasets_iterator(train_data, )]
            sentence_encoder = WhitespaceEncoder(sentence_corpus,
                                                 reserved_tokens=[PADDING_TOKEN, UNKNOWN_TOKEN])
            label_encoder = IdentityEncoder(label_corpus, reserved_tokens=[])
            with open('epochs/sentence_encoder', 'wb') as f:
                pickle.dump(sentence_encoder, f)
            with open('epochs/label_encoder', 'wb') as f:
                pickle.dump(label_encoder, f)

        # Encode
        for row in datasets_iterator(train_data, test_data):
            row['text'] = sentence_encoder.encode(row['text'])
            row['label'] = label_encoder.encode(row['label'])
        return sentence_encoder.vocab_size, label_encoder.vocab_size, train_data, test_data
    else:
        return train_data, test_data
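# NOTE (sketch, not part of the function above): a typical call, using only parameters the
# signature already exposes. The returned vocabulary sizes can be fed to an embedding layer
# and output layer; the splits keep their encoded 'text'/'label' fields.
vocab_size, num_class, train_data, test_data = load_data('imdb', preprocessing=True, encode=True)
print(vocab_size, num_class, len(train_data), len(test_data))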
def encoder():
    sample = [
        'people/deceased_person/place_of_death',
        'symbols/name_source/namesakes'
    ]
    return IdentityEncoder(sample)
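# NOTE (sketch, not part of the snippet above): this helper reads like a pytest fixture.
# Assuming it is decorated with @pytest.fixture in the original test module, a test can
# take `encoder` as an argument and pytest will supply the constructed instance:
def test_encode_known_label(encoder):
    output = encoder.encode('people/deceased_person/place_of_death')
    assert encoder.decode(output) == 'people/deceased_person/place_of_death'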