from torchnlp.text_encoders import IdentityEncoder, UNKNOWN_TOKEN


def test_identity_encoder_unknown():
    # A label absent from the sample should encode to the unknown token.
    sample = [
        'people/deceased_person/place_of_death',
        'symbols/name_source/namesakes'
    ]
    encoder = IdentityEncoder(sample)
    input_ = 'symbols/namesake/named_after'
    output = encoder.encode(input_)
    assert len(output) == 1
    assert encoder.decode(output) == UNKNOWN_TOKEN
def test_identity_encoder_known():
    input_ = 'symbols/namesake/named_after'
    sample = [
        'people/deceased_person/place_of_death',
        'symbols/name_source/namesakes'
    ]
    sample.append(input_)
    encoder = IdentityEncoder(sample)
    output = encoder.encode(input_)
    assert len(output) == 1
    assert encoder.decode(output) == input_
def __init__(self, is_char=False):
    self.mapped_data = dict()
    if not is_char:
        self.train, self.valid, self.test = ptb(train=True, dev=True, test=True)
    else:
        self.train, self.valid, self.test = ptb(
            train=True, dev=True, test=True,
            train_filename="ptb.char.train.txt",
            dev_filename="ptb.char.valid.txt",
            test_filename="ptb.char.test.txt")
    self._map_data(self.train + self.valid + self.test)
    # Encode the data: build one vocabulary over all splits, then convert each
    # split to a LongTensor of token indices.
    encoder = IdentityEncoder(self.train + self.valid + self.test)
    self.train = torch.LongTensor(encoder.encode(self.train))
    self.valid = torch.LongTensor(encoder.encode(self.valid))
    self.test = torch.LongTensor(encoder.encode(self.test))
    self.ntoken = encoder.vocab_size
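# Hypothetical usage sketch for the constructor above, assuming it belongs to a
# corpus wrapper class (named `Corpus` here only for illustration): after
# construction, the splits are LongTensors of token indices and `ntoken` holds
# the IdentityEncoder vocabulary size.
corpus = Corpus(is_char=False)   # word-level PTB; is_char=True switches to the character files
print(corpus.ntoken)             # vocabulary size
print(corpus.train.size(0), corpus.valid.size(0), corpus.test.size(0))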
def test_identity_encoder_sequence():
    input_ = [
        'symbols/namesake/named_after',
        'people/deceased_person/place_of_death'
    ]
    sample = [
        'people/deceased_person/place_of_death',
        'symbols/name_source/namesakes'
    ]
    encoder = IdentityEncoder(sample)
    output = encoder.encode(input_)
    assert len(output) == 2
    assert encoder.decode(output) == [
        UNKNOWN_TOKEN, 'people/deceased_person/place_of_death'
    ]
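# Minimal usage sketch (not part of the test suite) tying the tests above together:
# labels seen when building the IdentityEncoder round-trip through encode/decode,
# while unseen labels come back as UNKNOWN_TOKEN.
from torchnlp.text_encoders import IdentityEncoder, UNKNOWN_TOKEN

labels = ['people/deceased_person/place_of_death', 'symbols/name_source/namesakes']
encoder = IdentityEncoder(labels)

print(encoder.vocab_size)  # reserved tokens plus the two labels
print(encoder.decode(encoder.encode('symbols/name_source/namesakes')))                 # round-trips
print(encoder.decode(encoder.encode('symbols/namesake/named_after')) == UNKNOWN_TOKEN)  # True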
# Load a previously saved checkpoint: model, loss criterion and optimizer state.
with open(fn, 'rb') as f:
    model, criterion, optimizer = torch.load(f)

from torchnlp import datasets
from torchnlp.text_encoders import IdentityEncoder
from torchnlp.samplers import BPTTBatchSampler

print('Producing dataset...')
train, val, test = getattr(datasets, args.data)(train=True, dev=True, test=True)

# Build one vocabulary over all splits, then encode each split.
encoder = IdentityEncoder(train + val + test)
train_data = encoder.encode(train)
val_data = encoder.encode(val)
test_data = encoder.encode(test)

eval_batch_size = 10
test_batch_size = 1

# BPTT batch samplers over source and target positions for each split.
train_source_sampler, val_source_sampler, test_source_sampler = tuple([
    BPTTBatchSampler(d, args.bptt, args.batch_size, True, 'source')
    for d in (train, val, test)
])
train_target_sampler, val_target_sampler, test_target_sampler = tuple([
    BPTTBatchSampler(d, args.bptt, args.batch_size, True, 'target')
    for d in (train, val, test)
])
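# Usage sketch for the samplers above. It assumes (this is not shown in the script)
# that BPTTBatchSampler yields, per batch, one slice into the flat encoded corpus
# per sequence in the batch, so minibatches can be assembled like this:
import torch

for source_batch, target_batch in zip(train_source_sampler, train_target_sampler):
    source = torch.stack([train_data[s] for s in source_batch]).t_().contiguous()
    targets = torch.stack([train_data[s] for s in target_batch]).t_().contiguous().view(-1)
    # source: [bptt, batch_size] token ids; targets: the same positions shifted by one, flattened
    break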
# Preprocess: lowercase the premise and hypothesis of every row.
for row in datasets_iterator(train, dev, test):
    row['premise'] = row['premise'].lower()
    row['hypothesis'] = row['hypothesis'].lower()

# Make Encoders
sentence_corpus = [row['premise'] for row in datasets_iterator(train, dev, test)]
sentence_corpus += [row['hypothesis'] for row in datasets_iterator(train, dev, test)]
sentence_encoder = WhitespaceEncoder(sentence_corpus)

label_corpus = [row['label'] for row in datasets_iterator(train, dev, test)]
label_encoder = IdentityEncoder(label_corpus)

# Encode
for row in datasets_iterator(train, dev, test):
    row['premise'] = sentence_encoder.encode(row['premise'])
    row['hypothesis'] = sentence_encoder.encode(row['hypothesis'])
    row['label'] = label_encoder.encode(row['label'])

config = args
config.n_embed = sentence_encoder.vocab_size
config.d_out = label_encoder.vocab_size
config.n_cells = config.n_layers

# double the number of cells for bidirectional networks
if config.birnn:
    config.n_cells *= 2

if args.resume_snapshot:
    model = torch.load(
        args.resume_snapshot,
        map_location=lambda storage, location: storage.cuda(args.gpu))
else:
    model = SNLIClassifier(config)
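# Round-trip sketch: because labels go through an IdentityEncoder, the original
# label string can be recovered from any encoded row (the row and example value
# below are for illustration only).
some_row = next(datasets_iterator(dev))
print(label_encoder.decode(some_row['label']))  # e.g. 'entailment'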
def load_data(data_type, preprocessing=False, fine_grained=False, verbose=False,
              text_length=5000, encode=True, load_SLE=False):
    if data_type == 'imdb':
        train_data, test_data = imdb_dataset(preprocessing=preprocessing, verbose=verbose,
                                             text_length=text_length)
    elif data_type == 'newsgroups':
        train_data, test_data = newsgroups_dataset(preprocessing=preprocessing, verbose=verbose,
                                                   text_length=text_length)
    elif data_type == 'reuters':
        train_data, test_data = reuters_dataset(preprocessing=preprocessing, fine_grained=fine_grained,
                                                verbose=verbose, text_length=text_length)
    elif data_type == 'webkb':
        train_data, test_data = webkb_dataset(preprocessing=preprocessing, verbose=verbose,
                                              text_length=text_length)
    elif data_type == 'cade':
        train_data, test_data = cade_dataset(preprocessing=preprocessing, verbose=verbose,
                                             text_length=text_length)
    elif data_type == 'dbpedia':
        train_data, test_data = dbpedia_dataset(preprocessing=preprocessing, verbose=verbose,
                                                text_length=text_length)
    elif data_type == 'agnews':
        train_data, test_data = agnews_dataset(preprocessing=preprocessing, verbose=verbose,
                                               text_length=text_length)
    elif data_type == 'yahoo':
        train_data, test_data = yahoo_dataset(preprocessing=preprocessing, verbose=verbose,
                                              text_length=text_length)
    elif data_type == 'sogou':
        train_data, test_data = sogou_dataset(preprocessing=preprocessing, verbose=verbose,
                                              text_length=text_length)
    elif data_type == 'yelp':
        train_data, test_data = yelp_dataset(preprocessing=preprocessing, fine_grained=fine_grained,
                                             verbose=verbose, text_length=text_length)
    elif data_type == 'amazon':
        train_data, test_data = amazon_dataset(preprocessing=preprocessing, fine_grained=fine_grained,
                                               verbose=verbose, text_length=text_length)
    elif data_type == 'custom':
        test_data = custom_dataset(preprocessing=preprocessing, fine_grained=fine_grained,
                                   verbose=verbose, text_length=text_length)
        sentence_encoder = pickle.load(open('epochs/sentence_encoder', 'rb'))
        label_encoder = pickle.load(open('epochs/label_encoder', 'rb'))
        for row in datasets_iterator(test_data):
            row['text'] = sentence_encoder.encode(' '.join(row['text']))
            row['label'] = label_encoder.encode(row['label'])
        return sentence_encoder.vocab_size, label_encoder.vocab_size, test_data
    else:
        raise ValueError('{} data type not supported.'.format(data_type))

    if encode:
        if load_SLE:
            sentence_encoder = pickle.load(open('epochs/sentence_encoder', 'rb'))
            label_encoder = pickle.load(open('epochs/label_encoder', 'rb'))
        else:
            sentence_corpus = [row['text'] for row in datasets_iterator(train_data)]
            label_corpus = [row['label'] for row in datasets_iterator(train_data)]
            sentence_encoder = WhitespaceEncoder(sentence_corpus,
                                                 reserved_tokens=[PADDING_TOKEN, UNKNOWN_TOKEN])
            label_encoder = IdentityEncoder(label_corpus, reserved_tokens=[])
            with open('epochs/sentence_encoder', 'wb') as f:
                pickle.dump(sentence_encoder, f)
            with open('epochs/label_encoder', 'wb') as f:
                pickle.dump(label_encoder, f)

        # Encode
        for row in datasets_iterator(train_data, test_data):
            row['text'] = sentence_encoder.encode(row['text'])
            row['label'] = label_encoder.encode(row['label'])
        return sentence_encoder.vocab_size, label_encoder.vocab_size, train_data, test_data
    else:
        return train_data, test_data
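# Example call (illustrative argument values): build and cache the encoders from
# the IMDB training split, returning vocabulary sizes plus the encoded datasets.
vocab_size, num_class, train_data, test_data = load_data(
    'imdb', preprocessing=True, text_length=5000)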