def tokenize(self, texts):
    if self.verbose:
        print("Tokenizing {} documents...".format(len(texts)))
    tokens = get_texts(texts)
    # save the token count per document to compute the padding length later
    texts_length = [len(t) for t in tokens]
    try:
        self.vocab
        if self.verbose:
            print("Using existing vocabulary")
    except AttributeError:
        if self.verbose:
            print("Building Vocab...")
        self.vocab = Vocab.create(tokens, max_vocab=self.max_vocab, min_freq=self.min_freq)
    # numericalize the tokenized documents (not the raw texts)
    texts_numz = [self.vocab.numericalize(t) for t in tokens]
    sorted_texts_length = sorted(texts_length)
    self.maxlen = int(np.quantile(sorted_texts_length, q=self.q))
    # self.maxlen = sorted_texts_length[int(self.q * len(sorted_texts_length))]
    if self.verbose:
        print("Padding documents...")
    padded_texts = [
        pad_sequences(t, self.maxlen, pad_first=self.pad_first, pad_idx=self.pad_idx)
        for t in texts_numz
    ]
    return np.stack(padded_texts, axis=0)
def test_simple_vocab():
    input_vocab = Vocab(['a', f'b{cpe}', 'c', 'd', f'e{cpe}'])

    actual_vocab, actual_first_non_term_index = _create_term_vocab(input_vocab)

    expected_itos = [f'b{cpe}', f'e{cpe}', 'a', 'c', 'd']
    assert expected_itos == actual_vocab.itos
    assert 2 == actual_first_non_term_index
def test_only_terminals():
    input_vocab = Vocab([f'b{cpe}', f'e{cpe}'])

    actual_vocab, actual_first_non_term_index = _create_term_vocab(input_vocab)

    expected_itos = [f'b{cpe}', f'e{cpe}']
    assert expected_itos == actual_vocab.itos
    assert 2 == actual_first_non_term_index
def test_only_non_terminals():
    input_vocab = Vocab(['a', 'c', 'd'])

    actual_vocab, actual_first_non_term_index = _create_term_vocab(input_vocab)

    expected_itos = ['a', 'c', 'd']
    assert expected_itos == actual_vocab.itos
    assert 0 == actual_first_non_term_index
def test_empty_vocab():
    input_vocab = Vocab([])

    actual_vocab, actual_first_non_term_index = _create_term_vocab(input_vocab)

    expected_itos = []
    assert expected_itos == actual_vocab.itos
    assert 0 == actual_first_non_term_index
def prepare_text(df: pd.DataFrame, text_col: str, max_vocab: int, min_freq: int,
                 maxlen: int, word_vectors_path: str):
    texts = df[text_col].tolist()
    # texts = [t.lower() for t in texts]
    tokens = get_texts_gensim(texts)
    vocab = Vocab.create(tokens, max_vocab=max_vocab, min_freq=min_freq)
    sequences = [vocab.numericalize(t) for t in tokens]
    padded_seq = np.array([pad_sequences(s, maxlen=maxlen) for s in sequences])
    print("Our vocabulary contains {} words".format(len(vocab.stoi)))
    embedding_matrix = build_embeddings_matrix(vocab, word_vectors_path)
    return padded_seq, vocab, embedding_matrix
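# A minimal, hedged usage sketch for prepare_text above (not from the original source).
# The DataFrame, the column name, and the word-vectors path are made-up placeholders;
# pandas is assumed to be imported as pd, and get_texts_gensim / build_embeddings_matrix
# are the helpers the function above already relies on.
toy_df = pd.DataFrame({"review": ["a tiny first document", "a tiny second document"]})
padded_seq, vocab, embedding_matrix = prepare_text(
    toy_df,
    text_col="review",
    max_vocab=10000,
    min_freq=1,
    maxlen=10,
    word_vectors_path="path/to/word_vectors.vec",  # placeholder path
)
print(padded_seq.shape, len(vocab.itos), embedding_matrix.shape)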
def tokenize(self, texts): if self.verbose: print("Running sentence tokenizer for {} documents...".format( len(texts))) texts_sents = self._sentencizer(texts) # from nested to flat list. For speed purposes all_sents = [s for sents in texts_sents for s in sents] # saving the lengths of the documents: 1) for padding purposes and 2) to # compute consecutive ranges so we can "fold" the list again texts_length = [0] + [len(s) for s in texts_sents] range_idx = [ sum(texts_length[:i + 1]) for i in range(len(texts_length)) ] if self.verbose: print("Tokenizing {} sentences...".format(len(all_sents))) sents_tokens = get_texts(all_sents) # saving the lengths of sentences for padding purposes sents_length = [len(s) for s in sents_tokens] try: self.vocab if self.verbose: print("Using existing vocabulary") except AttributeError: if self.verbose: print("Building Vocab...") self.vocab = Vocab.create(sents_tokens, max_vocab=self.max_vocab, min_freq=self.min_freq) # 'numericalize' each sentence sents_numz = [self.vocab.numericalize(s) for s in sents_tokens] # group the sentences again into documents texts_numz = [ sents_numz[range_idx[i]:range_idx[i + 1]] for i in range(len(range_idx[:-1])) ] # compute max lengths for padding purposes self.maxlen_sent = int(np.quantile(sents_length, q=self.q)) self.maxlen_doc = int(np.quantile(texts_length[1:], q=self.q)) if self.verbose: print("Padding sentences and documents...") padded_texts = [ pad_nested_sequences( r, self.maxlen_sent, self.maxlen_doc, pad_sent_first=self.pad_sent_first, pad_doc_first=self.pad_doc_first, pad_idx=self.pad_idx, ) for r in texts_numz ] return np.stack(padded_texts, axis=0)
def check_data(data_bunch: DataBunch, vocab: Vocab, verbose: bool, allow_unks: bool) -> None:
    first_batch = data_bunch.one_batch()[0]
    if not allow_unks and not contains_no_value(first_batch, UNKNOWN_TOKEN_INDEX):
        raise ValueError(
            f"Unknown token found: {[vocab.textify(seq) for seq in first_batch]}"
        )
    if verbose:
        logger.info(f'Displaying the first batch:\n{first_batch}')
        token_seqs = [vocab.textify(seq) for seq in first_batch]
        logger.info(pformat(token_seqs))
def evaluate_lm(data_path, model_dir, tokenizer_lang="xx", evaluate_custom_perplexity=False):
    """
    Evaluate metrics of a trained language model on any dataset of texts from a CSV file.

    Args:
        data_path (str): Path to a CSV file with texts in the first column.
        model_dir (str): Directory with a trained language model.
        tokenizer_lang (str): Language setting for the tokenizer.
        evaluate_custom_perplexity (bool): Perplexity is estimated as e^(avg. loss), but the
            average loss changes slightly with batch size. To get perplexity computed in a
            slower but controlled fashion, set `evaluate_custom_perplexity` to True. The
            discrepancy between perplexity and custom perplexity is empirically around 1%.
    """
    model_dir = Path(model_dir)
    with open(model_dir / "lm_itos.pkl", "rb") as f:
        itos = pickle.load(f)
    data_df = pd.read_csv(data_path, header=None)
    data = TextLMDataBunch.from_df("", data_df, data_df, text_cols=0,
                                   tokenizer=Tokenizer(lang=tokenizer_lang),
                                   vocab=Vocab(itos))

    with open(model_dir / "model_hparams.json", "r") as model_hparams_file:
        model_hparams = json.load(model_hparams_file)
    learner = lm_learner(data, AWD_LSTM, model_dir, pretrained=True, config=model_hparams)

    loss, acc = learner.validate()
    print("Loss: {}, Perplexity: {}, Accuracy: {}".format(loss, exp(loss), acc))
    if evaluate_custom_perplexity:
        print("Custom perplexity: {}, Fraction OOV: {}, OOV perplexity contribution: {}"
              .format(*evaluate_perplexity(learner, data.valid_ds.x)))
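# A hedged usage sketch for evaluate_lm above (not from the original source). The paths
# are placeholders: the CSV is expected to hold texts in its first column, and the model
# directory is expected to contain lm_itos.pkl, model_hparams.json, and the trained
# weights, which is what the function above reads.
evaluate_lm(
    data_path="data/valid_texts.csv",       # placeholder CSV path
    model_dir="models/my_language_model",   # placeholder model directory
    tokenizer_lang="en",
    evaluate_custom_perplexity=False,
)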
def __init__(self, path: str, after_epoch: Optional[int] = None,
             force_use_cpu: bool = False, load_only_description: bool = False,
             device: Optional[int] = None):
    if not os.path.exists(path):
        raise FileNotFoundError(f'Path does not exist: {path}')

    self._force_use_cpu = force_use_cpu
    self._id = os.path.basename(path)

    path_to_config_file = os.path.join(path, CONFIG_FILE_NAME)
    path_to_metrics_file = os.path.join(path, METRICS_FILE_NAME)
    path_to_tags_file = os.path.join(path, TAGS_FILE_NAME)

    self._metrics = None
    self._config = None
    self._tags = []
    self._context: List[str] = []

    try:
        self._config: LMTrainingConfig = load_config_or_metrics_from_file(
            path_to_config_file, LMTrainingConfig)
    except FileNotFoundError:
        logger.warning(f'Config file not found: {path_to_config_file}')
    try:
        self._metrics: LMTrainingMetrics = load_config_or_metrics_from_file(
            path_to_metrics_file, LMTrainingMetrics)
    except FileNotFoundError:
        logger.warning(f'File with metrics not found: {path_to_metrics_file}')

    if os.path.exists(path_to_tags_file):
        value = read_value_from_file(path_to_tags_file, value_type=str)
        if value != '':
            self._tags = value.split(',')

    self.prep_function = self._config.prep_function
    self._load_only_description = load_only_description
    if not load_only_description:
        # We might want to load only the description, without the actual weights, to save
        # time when loading multiple models just to choose which one to work with.
        self._original_vocab = Vocab.load(os.path.join(path, VOCAB_FILE_NAME))
        term_vocab, self._first_nonterm_token = _create_term_vocab(self._original_vocab)
        self._model, self._vocab = self._load_model(path, after_epoch, term_vocab, device=device)
        to_test_mode(self._model)
        self._initial_snapshot = take_hidden_state_snapshot(self._model)

        # last_predicted_token_tensor is a rank-2 tensor!
        self._last_predicted_token_tensor = torch.tensor(
            [self._vocab.numericalize([self.STARTING_TOKEN])],
            device=get_device(self._force_use_cpu))
def test_simple(open_mock):
    # given
    file_mock1 = file_mock_with_lines(['1', 'My Class'])
    file_mock2 = file_mock_with_lines(['1', 'hi'])
    open_mock.side_effect = [file_mock1, file_mock2]

    numericalizer = Numericalizer(Vocab(['`unk', '`pad', '1', 'My', 'Class', 'hi']), n_cpus=1)
    text_list = TextList([Path('/path/to/some/file1'), Path('/path/to/some/file2')])

    # when
    numericalizer.process(text_list)

    # then
    expected = np.array([np.array([2, 3, 4]), np.array([2, 5])])
    assert all_trues(np.equal(expected, text_list.items, dtype=np.object))
def build_vocab(docs: t.List[str], max_vocab: int = 10000, min_freq: int = 5) -> Vocab:
    return Vocab.create(docs, max_vocab=max_vocab, min_freq=min_freq)
def log_vocab(self, vocab: Vocab) -> None:
    logger.info(f"Vocab size: {len(vocab.itos)}")
    vocab.save(os.path.join(self.path_to_trained_model, VOCAB_FILE_NAME))
    if self.comet_experiment:
        self.comet_experiment.log_parameter("vocabulary", len(vocab.itos))
import sys

import dataprep.api.corpus as api
from dataprep.api.corpus import PreprocessedCorpus
from fastai.text import Vocab, TextList, NumericalizeProcessor, OpenFileProcessor

if __name__ == '__main__':
    prep_corpus: PreprocessedCorpus = api.bpe('/home/hlib/dev/yahtzee', '10k', calc_vocab=True)
    vocab = Vocab(list(prep_corpus.load_vocab().keys()))
    text_list = TextList.from_folder(prep_corpus.path_to_prep_dataset,
                                     vocab=vocab,
                                     extensions=['.prep'],
                                     processor=[OpenFileProcessor(),
                                                NumericalizeProcessor(vocab=vocab,
                                                                      max_vocab=sys.maxsize,
                                                                      min_freq=0)]) \
        .split_by_rand_pct() \
        .label_for_lm() \
        .databunch()
    print(text_list)
def _create_term_vocab(vocab: Vocab) -> Tuple[Vocab, int]:
    terminal_token_indices = {i for i, k in enumerate(vocab.itos) if is_terminal_subtoken(k)}
    term_vocab_list = [vocab.itos[i] for i in terminal_token_indices]
    non_term_vocab_list = [vocab.itos[i] for i in range(len(vocab.itos))
                           if i not in terminal_token_indices]
    term_vocab = Vocab(term_vocab_list + non_term_vocab_list)
    return term_vocab, len(term_vocab_list)
def create_vocab_for_lm(prep_corpus: PreprocessedCorpus) -> Vocab:
    return Vocab(['`unk', '`pad']
                 + list(map(lambda x: to_literal_str(x), prep_corpus.load_vocab().keys())))
def numericalize(doc: str, vocab: Vocab) -> t.List[int]:
    return vocab.numericalize(doc)
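# A hedged usage sketch chaining build_vocab and numericalize above (not from the
# original source). The token lists are made up; despite the `List[str]`/`str`
# annotations, fastai's Vocab.create and Vocab.numericalize operate on sequences of
# tokens, so already-tokenized documents are assumed here.
train_docs = [["the", "cat", "sat"], ["the", "dog", "ran"]]
vocab = build_vocab(train_docs, max_vocab=100, min_freq=1)
ids = numericalize(["the", "cat", "ran"], vocab)
print(ids)  # indices from vocab.stoi; tokens unseen at build time map to the unknown index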