Example #1
    def tokenize(self, texts):
        if self.verbose:
            print("Tokenizing {} documents...".format(len(texts)))
        tokens = get_texts(texts)
        texts_length = [len(t) for t in tokens]
        try:
            self.vocab
            if self.verbose:
                print("Using existing vocabulary")
        except AttributeError:
            if self.verbose:
                print("Building Vocab...")
            self.vocab = Vocab.create(tokens,
                                      max_vocab=self.max_vocab,
                                      min_freq=self.min_freq)
        texts_numz = [self.vocab.numericalize(t) for t in tokens]
        sorted_texts_length = sorted(texts_length)
        self.maxlen = int(np.quantile(sorted_texts_length, q=self.q))
        # self.maxlen = sorted_texts_length[int(self.q * len(sorted_texts_length))]
        if self.verbose:
            print("Padding documents...")
        padded_texts = [
            pad_sequences(t,
                          self.maxlen,
                          pad_first=self.pad_first,
                          pad_idx=self.pad_idx) for t in texts_numz
        ]
        return np.stack(padded_texts, axis=0)
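The quantile-based maxlen above avoids padding every document out to the length of the longest one. A minimal sketch of that choice with plain NumPy (the lengths and q=0.75 below are made up for illustration):

import numpy as np

# hypothetical document lengths; one outlier of 120 tokens
texts_length = [3, 5, 7, 9, 120]
maxlen = int(np.quantile(sorted(texts_length), q=0.75))
print(maxlen)  # 9 -> only the outlier document will be truncated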
Example #2
def test_simple_vocab():
    input_vocab = Vocab(['a', f'b{cpe}', 'c', 'd', f'e{cpe}'])
    actual_vocab, actual_first_non_term_index = _create_term_vocab(input_vocab)

    expected_itos = [f'b{cpe}', f'e{cpe}', 'a', 'c', 'd']

    assert expected_itos == actual_vocab.itos
    assert 2 == actual_first_non_term_index
Example #3
def test_only_terminals():
    input_vocab = Vocab([f'b{cpe}', f'e{cpe}'])
    actual_vocab, actual_first_non_term_index = _create_term_vocab(input_vocab)

    expected_itos = [f'b{cpe}', f'e{cpe}']

    assert expected_itos == actual_vocab.itos
    assert 2 == actual_first_non_term_index
Example #4
def test_only_non_terminals():
    input_vocab = Vocab(['a', 'c', 'd'])
    actual_vocab, actual_first_non_term_index = _create_term_vocab(input_vocab)

    expected_itos = ['a', 'c', 'd']

    assert expected_itos == actual_vocab.itos
    assert 0 == actual_first_non_term_index
Example #5
def test_empty_vocab():
    input_vocab = Vocab([])
    actual_vocab, actual_first_non_term_index = _create_term_vocab(input_vocab)

    expected_itos = []

    assert expected_itos == actual_vocab.itos
    assert 0 == actual_first_non_term_index
Example #6
def prepare_text(df: pd.DataFrame, text_col: str, max_vocab: int,
                 min_freq: int, maxlen: int, word_vectors_path: str):
    texts = df[text_col].tolist()
    # texts = [t.lower() for t in texts]
    tokens = get_texts_gensim(texts)
    vocab = Vocab.create(tokens, max_vocab=max_vocab, min_freq=min_freq)
    sequences = [vocab.numericalize(t) for t in tokens]
    padded_seq = np.array([pad_sequences(s, maxlen=maxlen) for s in sequences])
    print("Our vocabulary contains {} words".format(len(vocab.stoi)))
    embedding_matrix = build_embeddings_matrix(vocab, word_vectors_path)
    return padded_seq, vocab, embedding_matrix
Example #7
    def tokenize(self, texts):
        if self.verbose:
            print("Running sentence tokenizer for {} documents...".format(
                len(texts)))
        texts_sents = self._sentencizer(texts)
        # from nested to flat list. For speed purposes
        all_sents = [s for sents in texts_sents for s in sents]
        #  saving the lengths of the documents: 1) for padding purposes and 2) to
        #  compute consecutive ranges so we can "fold" the list again
        texts_length = [0] + [len(s) for s in texts_sents]
        range_idx = [
            sum(texts_length[:i + 1]) for i in range(len(texts_length))
        ]
        if self.verbose:
            print("Tokenizing {} sentences...".format(len(all_sents)))
        sents_tokens = get_texts(all_sents)
        #  saving the lengths of sentences for padding purposes
        sents_length = [len(s) for s in sents_tokens]
        try:
            self.vocab
            if self.verbose:
                print("Using existing vocabulary")
        except AttributeError:
            if self.verbose:
                print("Building Vocab...")
            self.vocab = Vocab.create(sents_tokens,
                                      max_vocab=self.max_vocab,
                                      min_freq=self.min_freq)
        # 'numericalize' each sentence
        sents_numz = [self.vocab.numericalize(s) for s in sents_tokens]
        # group the sentences again into documents
        texts_numz = [
            sents_numz[range_idx[i]:range_idx[i + 1]]
            for i in range(len(range_idx) - 1)
        ]
        # compute max lengths for padding purposes
        self.maxlen_sent = int(np.quantile(sents_length, q=self.q))
        self.maxlen_doc = int(np.quantile(texts_length[1:], q=self.q))

        if self.verbose:
            print("Padding sentences and documents...")
        padded_texts = [
            pad_nested_sequences(
                r,
                self.maxlen_sent,
                self.maxlen_doc,
                pad_sent_first=self.pad_sent_first,
                pad_doc_first=self.pad_doc_first,
                pad_idx=self.pad_idx,
            ) for r in texts_numz
        ]
        return np.stack(padded_texts, axis=0)
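The flatten/refold bookkeeping in the method above is easy to get wrong, so here is a small self-contained illustration of the cumulative-range trick (the sentences are hypothetical):

# hypothetical illustration of the flatten / refold trick used above
texts_sents = [['s1', 's2'], ['s3'], ['s4', 's5', 's6']]         # sentences per document

all_sents = [s for sents in texts_sents for s in sents]          # flat list, processed in one go
lengths = [0] + [len(sents) for sents in texts_sents]
range_idx = [sum(lengths[:i + 1]) for i in range(len(lengths))]  # [0, 2, 3, 6]

# fold the flat list back into documents
refolded = [all_sents[range_idx[i]:range_idx[i + 1]] for i in range(len(range_idx) - 1)]
assert refolded == texts_sents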
Example #8
def check_data(data_bunch: DataBunch, vocab: Vocab, verbose: bool,
               allow_unks: bool) -> None:
    first_batch = data_bunch.one_batch()[0]

    if not allow_unks and not contains_no_value(first_batch,
                                                UNKNOWN_TOKEN_INDEX):
        raise ValueError(
            f"Unknown is found : {[vocab.textify(seq) for seq in first_batch]}"
        )
    if verbose:
        logger.info(f'Displaying the first batch:\n{first_batch}')
        token_seqs = [vocab.textify(seq) for seq in first_batch]
        logger.info(pformat(token_seqs))
Example #9
def evaluate_lm(data_path,
                model_dir,
                tokenizer_lang="xx",
                evaluate_custom_perplexity=False):
    """
    Evaluate metrics of a trained language model on any dataset of texts from a CSV file.

    Args:
        data_path (str): Path to a CSV file with texts in the first column.
        model_dir (str): Directory with a trained language model.
        tokenizer_lang (str): Language setting for the tokenizer.
        evaluate_custom_perplexity (bool): Perplexity is estimated as e^(avg. loss), but the
            average loss changes slightly with batch size. To get perplexity computed in a
            slower but more controlled fashion, set `evaluate_custom_perplexity` to True. The
            discrepancy between the two is empirically around 1%.
    """
    model_dir = Path(model_dir)
    with open(model_dir / "lm_itos.pkl", "rb") as f:
        itos = pickle.load(f)

    data_df = pd.read_csv(data_path, header=None)
    data = TextLMDataBunch.from_df("",
                                   data_df,
                                   data_df,
                                   text_cols=0,
                                   tokenizer=Tokenizer(lang=tokenizer_lang),
                                   vocab=Vocab(itos))

    with open(model_dir / "model_hparams.json", "r") as model_hparams_file:
        model_hparams = json.load(model_hparams_file)
    learner = lm_learner(data,
                         AWD_LSTM,
                         model_dir,
                         pretrained=True,
                         config=model_hparams)

    loss, acc = learner.validate()
    print("Loss: {}, Perplexity: {}, Accuracy: {}".format(
        loss, exp(loss), acc))
    if evaluate_custom_perplexity:
        print(
            "Custom perplexity: {}, Fraction OOV: {}, OOV perplexity contribution: {}"
            .format(*evaluate_perplexity(learner, data.valid_ds.x)))
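As the docstring notes, perplexity here is just the exponential of the average validation loss; a quick sanity check with a made-up loss value:

from math import exp

avg_loss = 4.2              # hypothetical average cross-entropy loss
perplexity = exp(avg_loss)  # ~66.7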
Example #10
    def __init__(self, path: str, after_epoch: Optional[int] = None,
                 force_use_cpu: bool = False, load_only_description: bool = False, device: Optional[int] = None):
        if not os.path.exists(path):
            raise FileNotFoundError(f'Path does not exist: {path}')
        self._force_use_cpu = force_use_cpu
        self._id = os.path.basename(path)
        path_to_config_file = os.path.join(path, CONFIG_FILE_NAME)
        path_to_metrics_file = os.path.join(path, METRICS_FILE_NAME)
        path_to_tags_file = os.path.join(path, TAGS_FILE_NAME)
        self._metrics = None
        self._config = None
        self._tags = []
        self._context: List[str] = []
        try:
            self._config: LMTrainingConfig = load_config_or_metrics_from_file(path_to_config_file, LMTrainingConfig)
        except FileNotFoundError:
            logger.warning(f'Config file not found: {path_to_config_file}')
        try:
            self._metrics: LMTrainingMetrics = load_config_or_metrics_from_file(path_to_metrics_file, LMTrainingMetrics)
        except FileNotFoundError:
            logger.warning(f'File with metrics not found: {path_to_metrics_file}')
        if os.path.exists(path_to_tags_file):
            value = read_value_from_file(path_to_tags_file, value_type=str)
            if value != '':
                self._tags = value.split(',')
        self.prep_function = self._config.prep_function

        self._load_only_description = load_only_description
        if not load_only_description:
            # we might want to load only the description, without the actual weights, to save
            # time when loading multiple models just to pick one of them to work with

            self._original_vocab = Vocab.load(os.path.join(path, VOCAB_FILE_NAME))
            term_vocab, self._first_nonterm_token = _create_term_vocab(self._original_vocab)
            self._model, self._vocab = self._load_model(path, after_epoch, term_vocab, device=device)
            to_test_mode(self._model)
            self._initial_snapshot = take_hidden_state_snapshot(self._model)

            # last_predicted_token_tensor is a rank-2 tensor!
            self._last_predicted_token_tensor = torch.tensor([self._vocab.numericalize([self.STARTING_TOKEN])],
                                                             device=get_device(self._force_use_cpu))
Example #11
def test_simple(open_mock):
    # Given
    file_mock1 = file_mock_with_lines(['1', 'My Class'])
    file_mock2 = file_mock_with_lines(['1', 'hi'])

    open_mock.side_effect = [file_mock1, file_mock2]

    numericalizer = Numericalizer(Vocab(
        ['`unk', '`pad', '1', 'My', 'Class', 'hi']),
                                  n_cpus=1)

    text_list = TextList(
        [Path('/path/to/some/file1'),
         Path('/path/to/some/file2')])

    # when
    numericalizer.process(text_list)

    # then
    expected = np.array([np.array([2, 3, 4]), np.array([2, 5])])
    assert all_trues(np.equal(expected, text_list.items, dtype=np.object))
Example #12
def build_vocab(docs: t.List[str], max_vocab: int = 10000, min_freq: int = 5) -> Vocab:
    return Vocab.create(docs, max_vocab=max_vocab, min_freq=min_freq)
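A short usage sketch for build_vocab, assuming the documents are passed in already tokenized (fastai's Vocab.create counts tokens across the nested lists); the toy corpus is invented:

docs = [['hello', 'world'], ['hello', 'there'], ['hello', 'world', 'again']]
vocab = build_vocab(docs, max_vocab=100, min_freq=1)

ids = vocab.numericalize(['hello', 'world'])   # indices into vocab.itos
print(vocab.textify(ids))                      # 'hello world'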
Example #13
    def log_vocab(self, vocab: Vocab) -> None:
        logger.info(f"Vocab size: {len(vocab.itos)}")
        vocab.save(os.path.join(self.path_to_trained_model, VOCAB_FILE_NAME))
        if self.comet_experiment:
            self.comet_experiment.log_parameter("vocabulary", len(vocab.itos))
Example #14
import sys

import dataprep.api.corpus as api
from dataprep.api.corpus import PreprocessedCorpus
from fastai.text import Vocab, TextList, NumericalizeProcessor, OpenFileProcessor

if __name__ == '__main__':
    prep_corpus: PreprocessedCorpus = api.bpe('/home/hlib/dev/yahtzee',
                                              '10k',
                                              calc_vocab=True)
    vocab = Vocab(list(prep_corpus.load_vocab().keys()))
    text_list = TextList.from_folder(prep_corpus.path_to_prep_dataset, vocab=vocab, extensions=['.prep'],
                                  processor=[OpenFileProcessor(),
                                             NumericalizeProcessor(vocab=vocab, max_vocab=sys.maxsize, min_freq=0)])\
        .split_by_rand_pct()\
        .label_for_lm()\
        .databunch()
    print(text_list)
Example #15
def _create_term_vocab(vocab: Vocab) -> Tuple[Vocab, int]:
    terminal_token_indices = {i for i, k in enumerate(vocab.itos) if is_terminal_subtoken(k)}
    term_vocab_list = [vocab.itos[i] for i in terminal_token_indices]
    non_term_vocab_list = [vocab.itos[i] for i in range(len(vocab.itos)) if i not in terminal_token_indices]
    term_vocab = Vocab(term_vocab_list + non_term_vocab_list)
    return term_vocab, len(term_vocab_list)
Example #16
def create_vocab_for_lm(prep_corpus: PreprocessedCorpus) -> Vocab:
    return Vocab(
        ['`unk', '`pad'] +
        list(map(lambda x: to_literal_str(x),
                 prep_corpus.load_vocab().keys())))
Example #17
def numericalize(doc: str, vocab: Vocab) -> t.List[int]:
    return vocab.numericalize(doc)