def test_vocabulary_getitem():
    counter = nlp.data.utils.Counter(
        ['a', 'b', 'b', 'c', 'c', 'c', 'some_word$'])

    vocab = nlp.Vocab(counter, max_size=None, min_freq=1,
                      unknown_token='<unk>', bos_token=None, eos_token=None,
                      reserved_tokens=None)

    i1 = vocab['c']
    assert i1 == 2
    assert vocab.to_indices('c') == 2

    i2 = vocab[['c']]
    assert i2 == [2]
    assert vocab.to_indices(['c']) == [2]

    i3 = vocab[['<unk>', 'non-exist']]
    assert i3 == [0, 0]
    assert vocab.to_indices(['<unk>', 'non-exist']) == [0, 0]

    i4 = vocab[['a', 'non-exist', 'a', 'b']]
    assert i4 == [4, 0, 4, 3]
    assert vocab.to_indices(['a', 'non-exist', 'a', 'b']) == [4, 0, 4, 3]

    no_unk_vocab = nlp.Vocab(counter, max_size=None, min_freq=1,
                             unknown_token=None, bos_token=None,
                             eos_token=None, reserved_tokens=None)

    assert no_unk_vocab['c'] == 1
    assert no_unk_vocab.to_indices('c') == 1
    assert no_unk_vocab[['c']] == [1]
    assert no_unk_vocab.to_indices(['c']) == [1]

    for words in [['<unk>', 'non-exist'], ['a', 'non-exist', 'a', 'b']]:
        with pytest.raises(KeyError):
            no_unk_vocab.to_indices(words)
def test_vocab_serialization():
    # Preserving unknown_token behaviour
    vocab = nlp.Vocab(unknown_token=None)
    with pytest.raises(KeyError):
        vocab['hello']
    loaded_vocab = nlp.Vocab.from_json(vocab.to_json())
    with pytest.raises(KeyError):
        loaded_vocab['hello']

    vocab = nlp.Vocab(unknown_token='abc')
    vocab['hello']
    loaded_vocab = nlp.Vocab.from_json(vocab.to_json())
    loaded_vocab['hello']
def _build_vocab(data_name, train_dataset, test_dataset, dev_dataset,
                 model_name):
    all_token = []
    max_len = 0
    for dataset in (train_dataset, dev_dataset, test_dataset):
        for line in dataset:
            line = _clean_str(line[0], data_name).split()
            max_len = max_len if max_len > len(line) else len(line)
            all_token.extend(line)
    vocab = nlp.Vocab(nlp.data.count_tokens(all_token))
    if model_name == 'rand':
        emb = nlp.embedding.TokenEmbedding()
        emb[emb.unknown_token] = nd.zeros(300)
        vocab.set_embedding(emb)
    else:
        vocab.set_embedding(
            nlp.embedding.create('Word2Vec',
                                 source='GoogleNews-vectors-negative300'))
    for word in vocab.embedding._idx_to_token:
        if (vocab.embedding[word] == nd.zeros(300)).sum() == 300:
            vocab.embedding[word] = nd.random.uniform(0, 0.05, 300)
    vocab.embedding['<unk>'] = nd.random.uniform(0, 0.05, 300)
    vocab.embedding['<pad>'] = nd.zeros(300)
    vocab.embedding['<bos>'] = nd.zeros(300)
    vocab.embedding['<eos>'] = nd.zeros(300)
    print('maximum length (in tokens): ', max_len)
    return vocab, max_len
def test_bptt_batchify_padding_token():
    vocab = nlp.Vocab(nlp.data.utils.Counter(['a', 'b', 'c']),
                      padding_token=None)
    seq_len = 35
    batch_size = 80

    # Padding token must always be specified for StreamBPTTBatchify
    with pytest.raises(ValueError):
        nlp.data.batchify.StreamBPTTBatchify(vocab, seq_len, batch_size,
                                             last_batch='discard')
    with pytest.raises(ValueError):
        nlp.data.batchify.StreamBPTTBatchify(vocab, seq_len, batch_size,
                                             last_batch='keep')

    # Padding token must be specified for last_batch='keep' for CorpusBPTTBatchify
    with pytest.raises(ValueError):
        nlp.data.batchify.CorpusBPTTBatchify(vocab, seq_len, batch_size,
                                             last_batch='keep')
    nlp.data.batchify.CorpusBPTTBatchify(vocab, seq_len, batch_size,
                                         last_batch='discard')
def build_vocabulary(embeddings, tr_df, val_df=None, tst_df=None):
    """
    Inputs: arrays representing the training, and optionally validation and
        test data (transductive case)
    Outputs: vocabulary (Tokenized text as in-place modification of input
        arrays or returned as new arrays)
    """
    all_tokens = []

    # Append the other datasets if they are not None
    datasets = [tr_df]
    if val_df is not None:
        datasets.append(val_df)
    if tst_df is not None:
        datasets.append(tst_df)

    # For each dataset, tokenize every tweet and add its tokens to the list
    for dataset in datasets:
        for text_instance in dataset['text'].values:
            tokens = word_tokenize(text_instance)
            all_tokens.extend(tokens)

    # Count the tokens and create a vocab object
    counter = nlp.data.count_tokens(all_tokens)
    vocab = nlp.Vocab(counter)

    # Attach the selected embeddings to the vocabulary
    vocab.set_embedding(nlp.embedding.create('glove', source=embeddings))
    return vocab
def get_train_data(args):
    """Helper function to get training data."""
    with print_time('load training dataset'):
        dataset = nlp.data.Text8(segment='train')

    with print_time('count tokens'):
        counter = nlp.data.count_tokens(itertools.chain.from_iterable(dataset))

    vocab = nlp.Vocab(counter, unknown_token=None, padding_token=None,
                      bos_token=None, eos_token=None, min_freq=5)

    idx_to_counts = np.array([counter[w] for w in vocab.idx_to_token])
    negatives_weights = idx_to_counts**0.75
    negatives_sampler = nlp.data.UnigramCandidateSampler(
        weights=mx.nd.array(negatives_weights))

    # Skip "unknown" tokens
    with print_time('code dataset'):
        coded_dataset = [[vocab[token] for token in sentence if token in vocab]
                         for sentence in dataset]
        coded_dataset = [sentence for sentence in coded_dataset if len(sentence)]

    with print_time('prune frequent words from sentences'):
        f = idx_to_counts / np.sum(idx_to_counts)
        idx_to_pdiscard = 1 - np.sqrt(args.frequent_token_subsampling / f)
        prune_sentences_ = functools.partial(prune_sentences,
                                             idx_to_pdiscard=idx_to_pdiscard)
        coded_dataset = list(map(prune_sentences_, coded_dataset))

    if args.ngram_buckets:  # Fasttext model
        with print_time('prepare subwords'):
            subword_function = nlp.vocab.create_subword_function(
                'NGramHashes', ngrams=args.ngrams,
                num_subwords=args.ngram_buckets)

            # Store subword indices for all words in vocabulary
            idx_to_subwordidxs = list(subword_function(vocab.idx_to_token))
            get_subwords_masks = get_subwords_masks_factory(idx_to_subwordidxs)
            max_subwordidxs_len = max(len(s) for s in idx_to_subwordidxs)
            if max_subwordidxs_len > 500:
                warnings.warn(
                    'The word with largest number of subwords '
                    'has {} subwords, suggesting there are '
                    'some noisy words in your vocabulary. '
                    'You should filter out very long words '
                    'to avoid memory issues.'.format(max_subwordidxs_len))

        return (coded_dataset, negatives_sampler, vocab, subword_function,
                get_subwords_masks)
    else:
        return coded_dataset, negatives_sampler, vocab
def build_vocab(hparams, types="fasttext", source="wiki.simple", min_freq=10):
    lyrics_train = load_lyrics('train')
    lyrics_valid = load_lyrics('valid')
    lyrics_test = load_lyrics('test')

    # Extract tokens from each sentence
    total_vocab = lyrics_train + lyrics_valid + lyrics_test
    list_of_tokens = []
    for i in total_vocab:
        list_of_tokens.append(preprocessing(i))

    token_counter = Counter(itertools.chain.from_iterable(list_of_tokens))
    tmp_vocab = nlp.Vocab(counter=token_counter, min_freq=min_freq,
                          bos_token=None, eos_token=None)

    # Attach the SISG (fastText) embedding to the vocab
    ptr_embedding = nlp.embedding.create(types, source=source)
    tmp_vocab.set_embedding(ptr_embedding)
    array = tmp_vocab.embedding.idx_to_vec.asnumpy()

    vocab = Vocab(
        tmp_vocab.idx_to_token,
        padding_token="<pad>",
        unknown_token="<unk>",
        bos_token=None,
        eos_token=None,
    )
    vocab.embedding = array

    # Save the vocab
    with open(hparams.dataset_path + "/vocab.pkl", mode="wb") as io:
        pickle.dump(vocab, io)
def corpus_process():
    # TODO: try to replace it with a torchtext vocab?
    print(gluonnlp.embedding.list_sources('glove'))
    glove = gluonnlp.embedding.create('glove', source='glove.6B.50d')
    vocab = gluonnlp.Vocab(gluonnlp.data.Counter(glove.idx_to_token))
    vocab.set_embedding(glove)
    # print(vocab['<pad>', '<unk>'])
    # print(vocab.idx_to_token[3])
    embeddings = vocab.embedding.idx_to_vec

    # We use imdb5k first
    data_train = pd.read_csv(os.path.join(args.data_path, 'imdb5k_train.csv'))
    data_test = pd.read_csv(os.path.join(args.data_path, 'imdb5k_test.csv'))
    data_train.replace(to_replace='neg', value=0, inplace=True)
    data_train.replace(to_replace='pos', value=1, inplace=True)
    data_test.replace(to_replace='neg', value=0, inplace=True)
    data_test.replace(to_replace='pos', value=1, inplace=True)

    X_train, y_train = get_token_id(data_train['text'], vocab), np.asarray(data_train['label'])
    X_test, y_test = get_token_id(data_test['text'], vocab), np.asarray(data_test['label'])
    # print(len(y_train))

    X_train_new, y_train_new = X_train[:4000], y_train[:4000]
    X_valid, y_valid = X_train[4000:], y_train[4000:]
    # print(X_train_new, len(y_train_new), len(y_valid))

    train = (X_train, y_train)
    train_new = (X_train_new, y_train_new)
    valid = (X_valid, y_valid)
    test = (X_test, y_test)

    pickle.dump(train, open(args.data_path + '/train.pkl', 'wb'))
    pickle.dump(train_new, open(args.data_path + '/train_new.pkl', 'wb'))
    pickle.dump(valid, open(args.data_path + '/valid.pkl', 'wb'))
    pickle.dump(test, open(args.data_path + '/test.pkl', 'wb'))
    pickle.dump(embeddings, open(args.data_path + '/embedding_matrix', 'wb'))
def test_corpus_bptt_batchify(batch_size, seq_len, wikitext2_test_and_counter):
    data, counter = wikitext2_test_and_counter
    vocab = nlp.Vocab(counter)

    # unsupported last_batch
    with pytest.raises(ValueError):
        bptt_keep = nlp.data.batchify.CorpusBPTTBatchify(
            vocab, seq_len, batch_size, last_batch='unsupported')

    # last_batch='keep'
    bptt_keep = nlp.data.batchify.CorpusBPTTBatchify(
        vocab, seq_len, batch_size, last_batch='keep')
    X, Y = zip(*(bptt_keep(data)))
    X, Y = mx.nd.concat(*X, dim=0), mx.nd.concat(*Y, dim=0)
    coded = mx.nd.concat(X, Y[-1].expand_dims(0),
                         dim=0).T.reshape(-1).asnumpy().tolist()
    assert vocab[list(data)] == coded[:len(data)]
    assert all(pad == vocab[vocab.padding_token] for pad in coded[len(data):])

    # last_batch='discard'
    bptt_discard = nlp.data.batchify.CorpusBPTTBatchify(
        vocab, seq_len, batch_size, last_batch='discard')
    X, Y = zip(*(bptt_discard(data)))
    X, Y = mx.nd.concat(*X, dim=0), mx.nd.concat(*Y, dim=0)
    coded = mx.nd.concat(X, Y[-1].expand_dims(0),
                         dim=0).T.reshape(-1).asnumpy().tolist()
    assert len(data) - len(coded) < batch_size * seq_len
def build_vocab(self):
    """Build the vocabulary and attach pretrained fastText embeddings."""
    embedding_list = []
    for source_name in ['wiki.ko', 'cc.ko.300']:
        tmp_vocab = nlp.Vocab(counter=Counter(self.sp.tokens),
                              unknown_token='<unk>',
                              padding_token='<pad>',
                              min_freq=1,
                              bos_token=None,
                              eos_token=None,
                              token_to_idx={'<unk>': 1})
        embedding = nlp.embedding.create('fasttext', source=source_name)
        tmp_vocab.set_embedding(embedding)
        array = tmp_vocab.embedding.idx_to_vec.asnumpy()
        array[1] = array.mean(axis=0)
        embedding_list.append(array)

        OOV = int(((array == 0.).sum(axis=1) == array.shape[1]).sum())
        print(f"The number of OOV tokens is {OOV} out of {array.shape[0]}")
        self.index.update({"OOV": OOV})

    self.vocab.embedding = embedding_list
    self.index.update({'token2idx': self.vocab.token_to_idx})
    self.index.update(
        {'idx2token': {v: k for k, v in self.vocab.token_to_idx.items()}})
def get_vocabulary_embeddings(examples):
    glove_6b100d = nlp.embedding.create('glove', source='glove.6B.100d')
    vocab = nlp.Vocab(nlp.data.Counter(glove_6b100d.idx_to_token))
    vocab.set_embedding(glove_6b100d)

    prompt_text = ' '.join([
        ' '.join(
            [' '.join(preprocess_glove(turn[0])) for turn in conversation])
        for conversation in examples
    ])
    respon_text = ' '.join([
        ' '.join(
            [' '.join(preprocess_glove(turn[1])) for turn in conversation])
        for conversation in examples
    ])

    # Join with a space so the last prompt token and first response token
    # are not fused into one token.
    tokens = (prompt_text + ' ' + respon_text).split(' ')
    vocabulary = sorted(list(set(tokens)))
    print('Total vocabulary {}'.format(len(vocabulary)))
    for token in vocabulary:
        if token not in vocab:
            print(token)
    vocabulary = [token for token in vocabulary if token in vocab]
    print('Embeddable vocabulary {}'.format(len(vocabulary)))
def get_train_data(args):
    """Helper function to get training data."""
    with print_time('load training dataset'):
        dataset = nlp.data.Text8(segment='train')

    with print_time('count tokens'):
        counter = nlp.data.count_tokens(itertools.chain.from_iterable(dataset))

    vocab = nlp.Vocab(counter, unknown_token=None, padding_token=None,
                      bos_token=None, eos_token=None, min_freq=5)

    idx_to_counts = mx.nd.array([counter[w] for w in vocab.idx_to_token])
    negatives_weights = idx_to_counts**0.75
    negatives_sampler = nlp.data.UnigramCandidateSampler(
        weights=negatives_weights)

    # Skip "unknown" tokens
    with print_time('code dataset'):
        coded_dataset = [[vocab[token] for token in sentence if token in vocab]
                         for sentence in dataset]

    with print_time('prune frequent words from sentences'):
        frequent_tokens_subsampling_constant = 1e-3
        f = idx_to_counts / mx.nd.sum(idx_to_counts)
        idx_to_pdiscard = (
            mx.nd.sqrt(frequent_tokens_subsampling_constant / f) +
            frequent_tokens_subsampling_constant / f).asnumpy()
        prune_sentences_ = functools.partial(prune_sentences,
                                             idx_to_pdiscard=idx_to_pdiscard)
        coded_dataset = list(map(prune_sentences_, coded_dataset))

    with print_time('prepare subwords'):
        subword_function = nlp.vocab.create_subword_function(
            'NGramHashes', ngrams=args.ngrams, num_subwords=args.ngram_buckets)

        # Precompute an idx to subwordidxs mapping to support fast lookup
        idx_to_subwordidxs = list(subword_function(vocab.idx_to_token))
        max_subwordidxs_len = max(len(s) for s in idx_to_subwordidxs)

        # Pad to max_subwordidxs_len + 1 so each row contains at least one -1
        # element which can be found by np.argmax below.
        idx_to_subwordidxs = np.stack(
            [np.pad(b.asnumpy(), (0, max_subwordidxs_len - len(b) + 1),
                    constant_values=-1, mode='constant')
             for b in idx_to_subwordidxs]).astype(np.float32)
        idx_to_subwordidxs = mx.nd.array(idx_to_subwordidxs)

        logging.info(
            'Using %s to obtain subwords. '
            'The word with largest number of subwords '
            'has %s subwords.', subword_function, max_subwordidxs_len)

    return (coded_dataset, negatives_sampler, vocab, subword_function,
            idx_to_subwordidxs)
def __init__(self, text_vocab, sentences, seq_len, is_cased, tag_list,
             tag_vocab=None):
    self.text_vocab = text_vocab
    self.seq_len = seq_len
    self.tag_list = tag_list

    self.ernie_tokenizer = nlp.data.BERTTokenizer(self.text_vocab,
                                                  lower=not is_cased)

    predicted_sentences = [] if sentences is None else load_segment(
        sentences, self.ernie_tokenizer)

    if tag_vocab is None:
        logging.info('Indexing tags...')
        tag_counter = nlp.data.count_tokens([tag for tag in self.tag_list])
        self.tag_vocab = nlp.Vocab(tag_counter, padding_token=NULL_TAG,
                                   bos_token=None, eos_token=None,
                                   unknown_token=None)
    else:
        self.tag_vocab = tag_vocab
    self.null_tag_index = self.tag_vocab[NULL_TAG]

    self.predict_inputs = [
        self._encode_as_input(sentence) for sentence in predicted_sentences
    ]
    logging.info('tag_vocab: %s', self.tag_vocab)
def preprocess_dataset_stream(stream, logging, min_freq=5, max_vocab_size=None):
    counter = None
    i = 0
    for data in iter(stream):
        i += 1
        counter = nlp.data.count_tokens(itertools.chain.from_iterable(data),
                                        counter=counter)
        if i % 100 == 0:
            logging.info("{} Files pre-processed".format(i))

    counter = trim_counter_large_tokens(counter, 20)
    vocab = nlp.Vocab(counter, unknown_token=None, padding_token=None,
                      bos_token=None, eos_token=None, min_freq=min_freq,
                      max_size=max_vocab_size)
    idx_to_counts = [counter[w] for w in vocab.idx_to_token]

    def code(sentence):
        return [vocab[token] for token in sentence if token in vocab]

    def code_corpus(corpus):
        return corpus.transform(code)

    stream = stream.transform(code_corpus)
    return stream, vocab, idx_to_counts
def preprocess_dataset_stream(stream, logging, min_freq=5, max_vocab_size=None,
                              pre_embedding=None):
    if pre_embedding:
        counter = nlp.data.Counter(pre_embedding.idx_to_token)
        # Increase counts so these terms aren't filtered out of the vocabulary
        for i in range(int(math.log2(min_freq * 2))):
            counter = counter + counter
    else:
        counter = None

    i = 0
    for data in iter(stream):
        i += 1
        counter = nlp.data.count_tokens(itertools.chain.from_iterable(data),
                                        counter=counter)
        if i % 100 == 0:
            logging.info("{} Files pre-processed".format(i))

    counter = trim_counter_large_tokens(counter, 30)
    vocab = nlp.Vocab(counter, unknown_token=None, padding_token=None,
                      bos_token=None, eos_token=None, min_freq=min_freq,
                      max_size=max_vocab_size)
    idx_to_counts = [counter[w] for w in vocab.idx_to_token]

    def code(sentence):
        return [vocab[token] for token in sentence if token in vocab]

    def code_corpus(corpus):
        return corpus.transform(code)

    stream = stream.transform(code_corpus)
    return stream, vocab, idx_to_counts
def make_vocab(self):
    # train path
    train_path = self.data_path + '/snli_1.0_train.txt'
    # Load the tab-separated training data, keeping the sentence columns
    tr = pd.read_csv(train_path, sep='\t').loc[:, ['sentence1', 'sentence2']]

    # Define Mecab tokenizer (unused; whitespace splitting is applied instead)
    # tokenizer = MeCab()

    # Split each sentence column on whitespace and collect the token lists
    tokenized = tr['sentence1'].apply(
        lambda elm: str(elm).split()).tolist()
    tokenized += tr['sentence2'].apply(
        lambda elm: str(elm).split()).tolist()

    # Count the occurrences of each token
    counter = nlp.data.count_tokens(
        itertools.chain.from_iterable(tokenized))
    # Keep tokens that appear at least 10 times in the vocab
    vocab = nlp.Vocab(counter=counter, min_freq=10, bos_token=None,
                      eos_token=None)

    nlp.embedding.list_sources()
    # Load pretrained GloVe embeddings (glove.6B.300d)
    embedding = nlp.embedding.create('Glove', source='glove.6B.300d')
    # Attach the embeddings to the vocab
    vocab.set_embedding(embedding)

    # Save vocab.pkl
    with open(self.data_path + '/vocab.pkl', mode='wb') as io:
        pickle.dump(vocab, io)
def get_vocab(self):
    if self.vocab is not None:
        return self.vocab
    else:
        tok_to_idx = self.vectorizer.vocabulary_
        cv_vocab = {v: 1 for v in tok_to_idx}
        cur_idx = len(tok_to_idx)
        if self.additional_feature_keys:
            if isinstance(self.additional_feature_keys, list):
                for f in self.additional_feature_keys:
                    cv_vocab[f] = 1
                    tok_to_idx[f] = cur_idx
                    cur_idx += 1
            else:
                # Assume it's a dictionary
                for k in self.additional_feature_keys:
                    for v in self.additional_feature_keys[k]:
                        cv_vocab[k + ':' + v] = 1
                        tok_to_idx[k + ':' + v] = cur_idx
                        cur_idx += 1
        vocab = nlp.Vocab(cv_vocab, token_to_idx=tok_to_idx,
                          unknown_token=None, eos_token=None, bos_token=None,
                          padding_token=None)
        self.vocab = vocab
        return vocab
def test_text_models():
    val = nlp.data.WikiText2(segment='val', root='tests/data/wikitext-2')
    val_freq = get_frequencies(val)
    vocab = nlp.Vocab(val_freq)
    text_models = [
        'standard_lstm_lm_200', 'standard_lstm_lm_650',
        'standard_lstm_lm_1500', 'awd_lstm_lm_1150', 'awd_lstm_lm_600'
    ]
    pretrained_to_test = {
        'standard_lstm_lm_1500': 'wikitext-2',
        'standard_lstm_lm_650': 'wikitext-2',
        'standard_lstm_lm_200': 'wikitext-2',
        'awd_lstm_lm_1150': 'wikitext-2',
        'awd_lstm_lm_600': 'wikitext-2'
    }

    for model_name in text_models:
        eprint('testing forward for %s' % model_name)
        pretrained_dataset = pretrained_to_test.get(model_name)
        model, _ = get_text_model(model_name, vocab=vocab,
                                  dataset_name=pretrained_dataset,
                                  pretrained=pretrained_dataset is not None,
                                  root='tests/data/model/')
        print(model)
        if not pretrained_dataset:
            model.collect_params().initialize()
        output, state = model(mx.nd.arange(330).reshape(33, 10))
        output.wait_to_read()
def test_corpus_batchify(batch_size):
    data = nlp.data.WikiText2(segment='test',
                              root=os.path.join('tests', 'data', 'wikitext-2'))
    vocab = nlp.Vocab(nlp.data.utils.Counter(data))
    batchify = nlp.data.batchify.CorpusBatchify(vocab, batch_size)
    batches = batchify(data)
    assert batches[:].shape == (len(data) // batch_size, batch_size)
def make_vocab(self):
    tr = pd.read_csv(self._train_path,
                     sep='\t').loc[:, ['sentence1', 'sentence2']]
    tokenized = tr['sentence1'].apply(
        lambda elm: str(elm).split()).tolist()
    tokenized += tr['sentence2'].apply(
        lambda elm: str(elm).split()).tolist()

    # Count the occurrences of each token
    counter = nlp.data.count_tokens(
        itertools.chain.from_iterable(tokenized))
    # Keep tokens that appear at least 10 times in the vocab
    vocab = nlp.Vocab(counter=counter, min_freq=10, bos_token=None,
                      eos_token=None)

    nlp.embedding.list_sources()
    # Load pretrained word2vec embeddings (GoogleNews-vectors-negative300)
    embedding = nlp.embedding.create(
        'word2vec', source='GoogleNews-vectors-negative300')
    # Attach the embeddings to the vocab
    vocab.set_embedding(embedding)

    # Save vocab.pkl
    with open(Path.cwd() / 'data_in' / 'vocab.pkl', mode='wb') as io:
        pickle.dump(vocab, io)
def build_vocabulary(train_array, test_array):
    """
    Inputs: arrays representing the training and test data
    Outputs: vocabulary (Tokenized text as in-place modification of input
        arrays or returned as new arrays)
    """
    # List of all tokens in the dataset.
    all_tokens = []
    # Keep track of all types of labels.
    all_labels = set()
    for array in (train_array, test_array):
        for i, instance in enumerate(array):
            sent, label_string = instance
            tokens = [START_TOKEN, *sent.lower().split(' '), STOP_TOKEN]
            labels = label_string.split(',')
            # In-place modification of array.
            array[i] = (tokens, labels)
            # Update running count of all tokens and all label types.
            all_tokens.extend(tokens)
            all_labels.update(labels)
    counter = nlp.data.count_tokens(all_tokens)
    vocab = nlp.Vocab(counter)
    return vocab, all_labels
def test_vocabulary_to_tokens():
    counter = nlp.data.utils.Counter(
        ['a', 'b', 'b', 'c', 'c', 'c', 'some_word$'])
    vocab = nlp.Vocab(counter, max_size=None, min_freq=1,
                      unknown_token='<unknown>', bos_token=None,
                      eos_token=None, reserved_tokens=None)

    i1 = vocab.to_tokens(2)
    assert i1 == 'c'
    i2 = vocab.to_tokens([2])
    assert i2 == ['c']
    i3 = vocab.to_tokens([0, 0])
    assert i3 == ['<unknown>', '<unknown>']
    i4 = vocab.to_tokens([4, 0, 4, 3])
    assert i4 == ['a', '<unknown>', 'a', 'b']

    for indices in [6, [6, 7]]:
        with pytest.raises(ValueError):
            vocab.to_tokens(indices)
def _build_vocab(data_name, train_dataset, test_dataset):
    all_token = []
    max_len = 0
    for i, line in enumerate(train_dataset):
        train_dataset[i][0] = _clean_str(line[0], data_name)
        line = train_dataset[i][0].split()
        max_len = max_len if max_len > len(line) else len(line)
        all_token.extend(line)
    for i, line in enumerate(test_dataset):
        test_dataset[i][0] = _clean_str(line[0], data_name)
        line = test_dataset[i][0].split()
        max_len = max_len if max_len > len(line) else len(line)
        all_token.extend(line)
    vocab = nlp.Vocab(nlp.data.count_tokens(all_token))
    vocab.set_embedding(
        nlp.embedding.create('Word2Vec',
                             source='GoogleNews-vectors-negative300'))
    for word in vocab.embedding._idx_to_token:
        if (vocab.embedding[word] == nd.zeros(300)).sum() == 300:
            vocab.embedding[word] = nd.random.normal(-1.0, 1.0, 300)
    vocab.embedding['<unk>'] = nd.zeros(300)
    vocab.embedding['<pad>'] = nd.zeros(300)
    vocab.embedding['<bos>'] = nd.zeros(300)
    vocab.embedding['<eos>'] = nd.zeros(300)
    print('maximum length (in tokens): ', max_len)
    return vocab, max_len
def gluonnlp_main():
    """
    To be applied later.
    :return:
    """
    import gluonnlp as nlp
    cwd = Path.cwd()
    full_path = cwd / 'data_in/Chatbot_data-master/ChatbotData.csv'
    tr_input, val_input, tr_label, val_label = load_data(data_path=full_path)
    total_input = tr_input + val_input

    mecab_tokenizer = Mecab()
    # Extract morphemes from the sentences
    _list_of_tokens = [
        mecab_tokenizer.morphs(input_item) for input_item in total_input
    ]
    list_of_tokens = []
    for _ in _list_of_tokens:
        list_of_tokens += _

    # Build the vocab (list_of_tokens is already flat, so count it directly)
    counter = nlp.data.count_tokens(list_of_tokens)
    vocab = nlp.Vocab(counter=counter, min_freq=5, bos_token=None,
                      eos_token=None)
def __init__(self, dataset_token, embedding):
    self.dataset_token = dataset_token
    self.seqs = [sample[0] + sample[1] for sample in dataset_token]
    self.counter = nlp.data.count_tokens(
        list(itertools.chain.from_iterable(self.seqs)))
    self.vocab = nlp.Vocab(self.counter, max_size=40000)
    self.vocab.set_embedding(nlp.embedding.GloVe(source=embedding))
def test_wikitext2():
    batch_size = 80
    seq_len = 35

    train = nlp.data.WikiText2(
        segment='train', root=os.path.join('tests', 'data', 'wikitext-2'))
    val = nlp.data.WikiText2(
        segment='val', root=os.path.join('tests', 'data', 'wikitext-2'))
    test = nlp.data.WikiText2(
        segment='test', root=os.path.join('tests', 'data', 'wikitext-2'))
    train_freq, val_freq, test_freq = [
        nlp.data.utils.Counter(x) for x in [train[0], val[0], test[0]]
    ]
    assert len(train[0]) == 2075677, len(train[0])
    assert len(train_freq) == 33278, len(train_freq)
    assert len(val[0]) == 216347, len(val[0])
    assert len(val_freq) == 13777, len(val_freq)
    assert len(test[0]) == 244102, len(test[0])
    assert len(test_freq) == 14143, len(test_freq)
    assert test_freq['English'] == 32, test_freq['English']

    vocab = nlp.Vocab(train_freq)
    serialized_vocab = vocab.to_json()
    assert len(serialized_vocab) == 962190, len(serialized_vocab)
    assert json.loads(serialized_vocab)['idx_to_token'] == vocab._idx_to_token

    train_data = train.bptt_batchify(vocab, seq_len, batch_size,
                                     last_batch='discard')
    assert len(train_data) == 741, len(train_data)
    for i, (data, target) in enumerate(train_data):
        mx.test_utils.assert_almost_equal(data[1:].asnumpy(),
                                          target[:-1].asnumpy())
        assert data.shape == target.shape == (seq_len, batch_size)

    train_data = train.bptt_batchify(vocab, seq_len, batch_size,
                                     last_batch='keep')
    assert len(train_data) == 742, len(train_data)
    assert train_data[-1][0].shape[0] <= seq_len
    for i, (data, target) in enumerate(train_data):
        mx.test_utils.assert_almost_equal(data[1:].asnumpy(),
                                          target[:-1].asnumpy())
        assert data.shape == target.shape

    train = nlp.data.WikiText2(
        segment='train', skip_empty=False,
        root=os.path.join('tests', 'data', 'wikitext-2'))
    val = nlp.data.WikiText2(
        segment='val', skip_empty=False,
        root=os.path.join('tests', 'data', 'wikitext-2'))
    test = nlp.data.WikiText2(
        segment='test', skip_empty=False,
        root=os.path.join('tests', 'data', 'wikitext-2'))
    train_freq, val_freq, test_freq = [
        nlp.data.utils.Counter(x) for x in [train[0], val[0], test[0]]
    ]
    assert len(train[0]) == 2088628, len(train[0])
    assert len(train_freq) == 33278, len(train_freq)
    assert len(val[0]) == 217646, len(val[0])
    assert len(val_freq) == 13777, len(val_freq)
    assert len(test[0]) == 245569, len(test[0])
    assert len(test_freq) == 14143, len(test_freq)
    assert test_freq['English'] == 32, test_freq['English']

    batched_data = train.batchify(vocab, batch_size)
    assert batched_data.shape == (26107, batch_size)
def test_bptt_batchify(batch_size, seq_len):
    data = nlp.data.WikiText2(segment='test',
                              root=os.path.join('tests', 'data', 'wikitext-2'))
    vocab = nlp.Vocab(nlp.data.utils.Counter(data[0]))

    # unsupported last_batch
    with pytest.raises(ValueError):
        data.bptt_batchify(vocab, seq_len, batch_size,
                           last_batch='unsupported')

    # last_batch='keep'
    X, Y = zip(
        *(data.bptt_batchify(vocab, seq_len, batch_size, last_batch='keep')))
    X, Y = mx.nd.concat(*X, dim=0), mx.nd.concat(*Y, dim=0)
    coded = mx.nd.concat(X, Y[-1].expand_dims(0),
                         dim=0).T.reshape(-1).asnumpy().tolist()
    assert vocab[data[0]] == coded[:len(data[0])]
    assert all(pad == vocab[vocab.padding_token]
               for pad in coded[len(data[0]):])

    # last_batch='discard'
    X, Y = zip(*(
        data.bptt_batchify(vocab, seq_len, batch_size, last_batch='discard')))
    X, Y = mx.nd.concat(*X, dim=0), mx.nd.concat(*Y, dim=0)
    coded = mx.nd.concat(X, Y[-1].expand_dims(0),
                         dim=0).T.reshape(-1).asnumpy().tolist()
    assert len(data[0]) - len(coded) < batch_size * seq_len
def get_train_data(args):
    """Helper function to get training data."""
    counter = dict()
    with io.open(args.vocab, 'r', encoding='utf-8') as f:
        for line in f:
            token, count = line.split('\t')
            counter[token] = int(count)
    vocab = nlp.Vocab(counter, unknown_token=None, padding_token=None,
                      bos_token=None, eos_token=None, min_freq=1)

    npz = np.load(args.cooccurrences)
    row, col, counts = npz['row'], npz['col'], npz['data']

    rank_dtype = 'int32'
    if row.max() >= np.iinfo(np.int32).max:
        rank_dtype = 'int64'
        # MXNet has no support for uint32, so we must fall back to int64
        logging.info('More words than could be counted using int32. '
                     'Using int64 to represent word indices.')
    row = mx.nd.array(row, dtype=rank_dtype)
    col = mx.nd.array(col, dtype=rank_dtype)
    # row is always used as 'source' and col as 'context' word. Therefore
    # duplicate the entries.
    assert row.shape == col.shape
    row = mx.nd.concatenate([row, col])
    col = mx.nd.concatenate([col, row[:len(row) // 2]])
    counts = mx.nd.array(counts, dtype='float32')
    counts = mx.nd.concatenate([counts, counts])
    return vocab, row, col, counts
def make_vocab(self):
    jamo_list = sorted(
        set(self.chosung_list + self.jungsung_list + self.jongsung_list))
    counter = nlp.data.count_tokens(jamo_list)
    vocab = nlp.Vocab(counter=counter, bos_token=None, eos_token=None)
    with open(self.data_path + '/' + 'vocab_char.pkl', mode='wb') as io:
        pickle.dump(vocab, io)
def build_vocab(self, dataset, reserved_tokens=None):
    # get_input(ex): id_, ..., label
    sentences = itertools.chain.from_iterable(
        [self.get_input(ex)[1:-1] for ex in dataset])
    tokens = [self.tokenizer.tokenize(s) for s in sentences]
    counter = nlp.data.count_tokens(
        list(itertools.chain.from_iterable(tokens)))
    vocab = nlp.Vocab(counter, bos_token=None, eos_token=None,
                      reserved_tokens=reserved_tokens)
    logger.info('built vocabulary of size {}'.format(len(vocab)))
    return vocab