Example #1
@classmethod
def from_df(cls, path: PathOrStr, train_df: DataFrame, valid_df: DataFrame, test_df: Optional[DataFrame] = None,
            tokenizer: Tokenizer = None, vocab: Vocab = None, classes: Collection[str] = None,
            text_cols: IntsOrStrs = 1, label_cols: IntsOrStrs = 0, label_delim: str = None,
            **kwargs) -> DataBunch:
    "Create a `TextDataBunch` from DataFrames."
    # Route processor-specific kwargs to the custom BERT processor.
    p_kwargs, kwargs = split_kwargs_by_func(kwargs, get_bert_processor)
    # Use our custom processors while taking tokenizer and vocab as kwargs.
    processor = get_bert_processor(tokenizer=tokenizer, vocab=vocab, **p_kwargs)
    if classes is None and is_listy(label_cols) and len(label_cols) > 1:
        classes = label_cols
    src = ItemLists(path, TextList.from_df(train_df, path, cols=text_cols, processor=processor),
                    TextList.from_df(valid_df, path, cols=text_cols, processor=processor))
    src = src.label_for_lm() if cls == TextLMDataBunch else src.label_from_df(cols=label_cols, classes=classes)
    if test_df is not None:
        src.add_test(TextList.from_df(test_df, path, cols=text_cols))
    return src.databunch(**kwargs)
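A minimal usage sketch for this override, assuming it is defined on a hypothetical BertTextDataBunch subclass and that a BERT tokenizer and vocab (bert_tokenizer, bert_vocab) have been wrapped for fastai elsewhere:

import pandas as pd

# Toy frames: label in one column, text in another.
train_df = pd.DataFrame({"label": [0, 1], "text": ["great film", "dull film"]})
valid_df = pd.DataFrame({"label": [1], "text": ["watchable"]})

# BertTextDataBunch is a hypothetical subclass carrying the classmethod above;
# bert_tokenizer and bert_vocab are assumed to be built elsewhere.
data = BertTextDataBunch.from_df(".", train_df, valid_df,
                                 tokenizer=bert_tokenizer, vocab=bert_vocab,
                                 text_cols="text", label_cols="label", bs=16)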
Example #2
def main(models_path: Path, test_data_json: Path, debug: bool):
    """Evaluates a language model against a test data set."""

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=UserWarning)

        print(f"Loading test data from {test_data_json}...")
        rows = []
        with jsonlines.open(test_data_json) as reader:
            for obj in reader.iter(type=dict, skip_invalid=True):
                rows.append(obj)
        df = pd.DataFrame(rows)
        # Build a language-model databunch over the title and content columns.
        test_databunch = (TextList.from_df(df, path=models_path, cols=["title", "content"])
                          .split_none()
                          .label_for_lm()
                          .databunch(bs=4))

        learner = measure(
            "model loading",
            lambda: from_model(models_path, model_name="model_large_finetuned"),
            debug,
        )

        print(learner.validate(dl=test_databunch.train_dl))
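The measure helper is not shown in this example; a plausible sketch of such a timing wrapper (name and behavior assumed from the call site) is:

import time
from typing import Callable, TypeVar

T = TypeVar("T")

def measure(label: str, fn: Callable[[], T], debug: bool) -> T:
    """Run fn and, when debug is on, report how long it took."""
    start = time.perf_counter()
    result = fn()
    if debug:
        print(f"{label} took {time.perf_counter() - start:.1f}s")
    return result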
Example #3
def train(bs):
    path = Path("./")
    data_lm = load_data(path, 'data_lm.pkl', bs=bs)
    print("data_lm loaded")

    data_clas = (TextList.from_csv(path, 'classifier.csv', cols=["summary", "description"], vocab=data_lm.vocab)
                 .split_from_df(col=3)
                 .label_from_df(cols=0)
                 .databunch(bs=bs))

    print("data_clas loaded")
    data_clas.show_batch()  # preview a batch (renders properly only in a notebook environment)

    data_clas.save('data_clas.pkl')

    learn = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.6)
    learn.load_encoder('fine_tuned_enc')

    lr_estimate = 1.0e-2

    learn.fit_one_cycle(1, lr_estimate, moms=(0.8, 0.7))

    learn.save('first')
    losses_fig = learn.recorder.plot_losses(return_fig=True)
    losses_fig.savefig("losses_001.jpg", dpi=600)
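The hard-coded lr_estimate above is usually read off a learning-rate sweep first; in fastai v1 that is a short sketch like:

# Sweep learning rates and plot loss vs. lr; pick a value on the downward
# slope, roughly an order of magnitude before the loss minimum.
learn.lr_find()
learn.recorder.plot()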
Example #4
def create_databunch(path_to_prep_dataset: str,
                     file_paths: Sequence[Path],
                     vocab: Vocab,
                     bs: int,
                     bptt: int,
                     device: str,
                     only_validation_files: bool = False,
                     allow_unks: bool = False,
                     verbose: bool = True) -> DataBunch:
    if verbose:
        logger.info(f'Getting preprocessed corpus from {path_to_prep_dataset}')
    numericalizer = Numericalizer(vocab,
                                  allow_unks=allow_unks,
                                  large_databunch=verbose)
    text_list = TextList(file_paths,
                         path=path_to_prep_dataset,
                         processor=numericalizer)

    if verbose:
        logger.info("Splitting into training/validation sets")
    if only_validation_files:
        split_list = text_list.split_by_valid_func(lambda f: True)
    else:
        split_list = text_list.split_by_folder()

    if verbose:
        logger.info("Labeling for langmodeling")
    labelled_list = split_list.label_for_lm()

    if verbose:
        cpu_memory_used_mb = get_cpu_memory_used_mb()
        logger.debug(f"Cpu memory used: {cpu_memory_used_mb} MB")

    if verbose:
        logger.info("Creating data bunches")
    data_bunched = labelled_list.databunch(bs=bs, bptt=bptt, device=device)
    return data_bunched
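A usage sketch, assuming a corpus laid out in train/valid subfolders (as split_by_folder expects) and a Vocab built elsewhere:

from pathlib import Path

corpus = Path('/data/prep-corpus')        # hypothetical corpus location
files = sorted(corpus.glob('**/*.prep'))  # hypothetical file extension

# `vocab` is assumed to be a fastai Vocab constructed beforehand.
data = create_databunch(str(corpus), files, vocab, bs=32, bptt=70, device='cuda')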
Example #5
    def __init__(self, path):
        texts = pd.read_csv(path + '/jokes_extended_vk_anekdot_preproc.csv',
                            index_col=0)
        texts.dropna(inplace=True)
        data = (TextList.from_df(texts,
                                 processor=[TokenizeProcessor(tokenizer=Tokenizer(lang="xx")),
                                            NumericalizeProcessor(min_freq=2, max_vocab=60000)])
                .split_by_rand_pct(.1)
                .label_for_lm()
                .databunch(bs=64))

        self.learn = language_model_learner(data=data,
                                            arch=AWD_LSTM,
                                            pretrained=None)
        self.learn.load_pretrained(path + '/ulmfit/bestmodel_tune.pth',
                                   path + '/ulmfit/bestmodel_tune_itos.pkl')
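Once loaded, the learner can sample text the usual fastai v1 way; a sketch, with the enclosing class name assumed:

gen = JokeModel('/data/jokes')  # hypothetical name for the class defining __init__ above
# predict() on a LanguageLearner continues the prompt for n_words tokens.
print(gen.learn.predict("xxbos", n_words=50, temperature=0.8))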
Example #6
def test_simple(open_mock):
    # Given
    file_mock1 = file_mock_with_lines(['1', 'My Class'])
    file_mock2 = file_mock_with_lines(['1', 'hi'])

    open_mock.side_effect = [file_mock1, file_mock2]

    numericalizer = Numericalizer(
        Vocab(['`unk', '`pad', '1', 'My', 'Class', 'hi']), n_cpus=1)

    text_list = TextList(
        [Path('/path/to/some/file1'),
         Path('/path/to/some/file2')])

    # When
    numericalizer.process(text_list)

    # Then
    expected = np.array([np.array([2, 3, 4]), np.array([2, 5])], dtype=object)
    assert all_trues(np.equal(expected, text_list.items, dtype=object))
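file_mock_with_lines and all_trues are local test helpers not shown here; a sketch of what they might look like, assuming the processor opens each path and iterates over its lines:

import numpy as np
from unittest.mock import MagicMock

def file_mock_with_lines(lines):
    """A file-like mock whose context manager yields the given lines."""
    mock = MagicMock()
    mock.__enter__.return_value = lines
    return mock

def all_trues(arr):
    """True iff every element of a (possibly ragged) comparison holds."""
    return all(np.all(x) for x in arr)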
Example #7
import sys

import dataprep.api.corpus as api
from dataprep.api.corpus import PreprocessedCorpus
from fastai.text import Vocab, TextList, NumericalizeProcessor, OpenFileProcessor

if __name__ == '__main__':
    prep_corpus: PreprocessedCorpus = api.bpe('/home/hlib/dev/yahtzee',
                                              '10k',
                                              calc_vocab=True)
    vocab = Vocab(list(prep_corpus.load_vocab().keys()))
    databunch = (TextList.from_folder(prep_corpus.path_to_prep_dataset, vocab=vocab, extensions=['.prep'],
                                      processor=[OpenFileProcessor(),
                                                 NumericalizeProcessor(vocab=vocab, max_vocab=sys.maxsize, min_freq=0)])
                 .split_by_rand_pct()
                 .label_for_lm()
                 .databunch())
    print(databunch)
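From here, training a language model on the resulting databunch follows the standard fastai v1 recipe; a minimal sketch:

from fastai.text import language_model_learner, AWD_LSTM

learn = language_model_learner(databunch, AWD_LSTM, pretrained=False)
learn.fit_one_cycle(1, 1e-2)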