from typing import Collection, Optional

from pandas import DataFrame
from fastai.text import *  # fastai v1: DataBunch, ItemLists, TextList, Tokenizer, Vocab, is_listy, ...


# Intended as a `@classmethod` on a `TextDataBunch` subclass; `get_bert_processor`
# is a project-local helper that wraps the BERT tokenizer and vocab in fastai
# processors.
def from_df(cls, path: PathOrStr, train_df: DataFrame, valid_df: DataFrame,
            test_df: Optional[DataFrame] = None, tokenizer: Tokenizer = None,
            vocab: Vocab = None, classes: Collection[str] = None,
            text_cols: IntsOrStrs = 1, label_cols: IntsOrStrs = 0,
            label_delim: str = None, **kwargs) -> DataBunch:
    "Create a `TextDataBunch` from DataFrames."
    # Route the kwargs meant for `get_bert_processor`; the rest go to `databunch`.
    p_kwargs, kwargs = split_kwargs_by_func(kwargs, get_bert_processor)
    # Use our custom processors while taking tokenizer and vocab as kwargs.
    processor = get_bert_processor(tokenizer=tokenizer, vocab=vocab, **p_kwargs)
    if classes is None and is_listy(label_cols) and len(label_cols) > 1:
        classes = label_cols
    src = ItemLists(path,
                    TextList.from_df(train_df, path, cols=text_cols, processor=processor),
                    TextList.from_df(valid_df, path, cols=text_cols, processor=processor))
    src = (src.label_for_lm() if cls == TextLMDataBunch
           else src.label_from_df(cols=label_cols, classes=classes))
    if test_df is not None:
        src.add_test(TextList.from_df(test_df, path, cols=text_cols))
    return src.databunch(**kwargs)
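# A minimal usage sketch, assuming the method above is attached to a
# hypothetical `BertDataBunch(TextDataBunch)` subclass and that `bert_tok` /
# `bert_vocab` wrap a pretrained BERT tokenizer and vocabulary:
#
#   databunch = BertDataBunch.from_df('.', train_df, valid_df,
#                                     tokenizer=bert_tok, vocab=bert_vocab,
#                                     text_cols='text', label_cols='label',
#                                     bs=32)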
import warnings
from pathlib import Path

import jsonlines
import pandas as pd
from fastai.text import TextList


def main(models_path: Path, test_data_json: Path, debug: bool):
    """Evaluates a language model against a test data set."""
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=UserWarning)
        print(f"Loading test data from {test_data_json}...")
        rows = []
        with jsonlines.open(test_data_json) as reader:
            for obj in reader.iter(type=dict, skip_invalid=True):
                rows.append(obj)
        df = pd.DataFrame(rows)
        test_databunch = (TextList
                          .from_df(df, path=models_path, cols=["title", "content"])
                          .split_none()
                          .label_for_lm()
                          .databunch(bs=4))
        # `measure` (a timing wrapper) and `from_model` (learner loading) are
        # project-local helpers.
        learner = measure(
            "model loading",
            lambda: from_model(models_path, model_name="model_large_finetuned"),
            debug,
        )
        # `split_none()` puts every row in the train split, so the whole test
        # set is evaluated via `train_dl`.
        print(learner.validate(dl=test_databunch.train_dl))
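# Usage sketch (paths are placeholders, not the project's real layout):
#
#   main(Path('/path/to/models'), Path('/path/to/test_data.jsonl'), debug=False)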
from pathlib import Path

from fastai.text import AWD_LSTM, TextList, load_data, text_classifier_learner


def train(bs):
    path = Path("./")
    data_lm = load_data(path, 'data_lm.pkl', bs=bs)
    print("data_lm loaded")
    data_clas = (TextList.from_csv(path, 'classifier.csv',
                                   cols=["summary", "description"],
                                   vocab=data_lm.vocab)
                 .split_from_df(col=3)
                 .label_from_df(cols=0)
                 .databunch(bs=bs))
    print("data_clas loaded")
    data_clas.show_batch()  # designed for notebooks; may render nothing in a plain script
    data_clas.save('data_clas.pkl')
    learn = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.6)
    learn.load_encoder('fine_tuned_enc')
    lr_estimate = 1.0e-2
    learn.fit_one_cycle(1, lr_estimate, moms=(0.8, 0.7))
    learn.save('first')
    losses_fig = learn.recorder.plot_losses(return_fig=True)
    losses_fig.savefig("losses_001.jpg", dpi=600)
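# Usage sketch: the saved artifacts can be restored later with fastai v1's
# standard calls (assuming the same working directory):
#
#   train(bs=32)
#   data_clas = load_data(Path('./'), 'data_clas.pkl', bs=32)
#   learn = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.6)
#   learn.load('first')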
import logging
from pathlib import Path
from typing import Sequence

from fastai.basic_data import DataBunch
from fastai.text import TextList, Vocab

logger = logging.getLogger(__name__)


def create_databunch(path_to_prep_dataset: str, file_paths: Sequence[Path],
                     vocab: Vocab, bs: int, bptt: int, device: str,
                     only_validation_files: bool = False,
                     allow_unks: bool = False,
                     verbose: bool = True) -> DataBunch:
    if verbose:
        logger.info(f'Getting preprocessed corpus from {path_to_prep_dataset}')
    # `Numericalizer` is a project-local processor that maps tokens to ids.
    numericalizer = Numericalizer(vocab, allow_unks=allow_unks,
                                  large_databunch=verbose)
    text_list = TextList(file_paths, path=path_to_prep_dataset,
                         processor=numericalizer)

    if verbose:
        logger.info("Splitting into training/validation sets")
    if only_validation_files:
        # Route every file to the validation set.
        split_list = text_list.split_by_valid_func(lambda f: True)
    else:
        split_list = text_list.split_by_folder()

    if verbose:
        logger.info("Labeling for language modeling")
    labelled_list = split_list.label_for_lm()

    if verbose:
        # `get_cpu_memory_used_mb` is a project-local helper.
        cpu_memory_used_mb = get_cpu_memory_used_mb()
        logger.debug(f"Cpu memory used: {cpu_memory_used_mb} MB")

    if verbose:
        logger.info("Creating data bunches")
    data_bunched = labelled_list.databunch(bs=bs, bptt=bptt, device=device)
    return data_bunched
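# A minimal usage sketch, assuming the '.prep' files live in train/valid
# subfolders (which `split_by_folder` expects) and `vocab` was built from the
# same preprocessed corpus:
#
#   bunch = create_databunch('/path/to/prep/dataset', file_paths, vocab,
#                            bs=32, bptt=70, device='cuda')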
import pandas as pd
from fastai.text import (AWD_LSTM, NumericalizeProcessor, TextList,
                         TokenizeProcessor, Tokenizer, language_model_learner)


# Constructor of the joke-generator class; `lang="xx"` selects spaCy's
# multi-language tokenizer (the corpus is Russian).
def __init__(self, path):
    texts = pd.read_csv(path + '/jokes_extended_vk_anekdot_preproc.csv',
                        index_col=0)
    texts.dropna(inplace=True)
    data = (TextList
            .from_df(texts, processor=[TokenizeProcessor(tokenizer=Tokenizer(lang="xx")),
                                       NumericalizeProcessor(min_freq=2, max_vocab=60000)])
            .split_by_rand_pct(.1)
            .label_for_lm()
            .databunch(bs=64))
    # `pretrained=None` skips fastai's downloaded weights; the fine-tuned
    # weights and matching itos mapping are loaded explicitly below.
    self.learn = language_model_learner(data=data, arch=AWD_LSTM, pretrained=None)
    self.learn.load_pretrained(path + '/ulmfit/bestmodel_tune.pth',
                               path + '/ulmfit/bestmodel_tune_itos.pkl')
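# A hedged usage sketch: fastai v1's `LanguageLearner.predict` samples text
# from the loaded model, so an instance of this (here hypothetically named)
# class could generate jokes roughly as:
#
#   generator = JokeGenerator('/path/to/data')
#   print(generator.learn.predict('анекдот', n_words=50, temperature=0.8))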
from pathlib import Path

import numpy as np
from fastai.text import TextList, Vocab


# `open_mock` is injected by a `mock.patch` decorator on `open` (not shown);
# `file_mock_with_lines`, `Numericalizer` and `all_trues` are project-local helpers.
def test_simple(open_mock):
    # given
    file_mock1 = file_mock_with_lines(['1', 'My Class'])
    file_mock2 = file_mock_with_lines(['1', 'hi'])
    open_mock.side_effect = [file_mock1, file_mock2]
    numericalizer = Numericalizer(Vocab(['`unk', '`pad', '1', 'My', 'Class', 'hi']),
                                  n_cpus=1)
    text_list = TextList([Path('/path/to/some/file1'),
                          Path('/path/to/some/file2')])

    # when
    numericalizer.process(text_list)

    # then: ids follow the vocab order above ('1'->2, 'My'->3, 'Class'->4, 'hi'->5)
    expected = np.array([np.array([2, 3, 4]), np.array([2, 5])])
    assert all_trues(np.equal(expected, text_list.items, dtype=np.object))
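# A sketch of the `file_mock_with_lines` helper assumed above, under the
# assumption that `Numericalizer` opens each file as a context manager and
# iterates over its lines (this helper is a guess, not the project's code):
from unittest import mock


def file_mock_with_lines(lines):
    file_mock = mock.MagicMock()
    # `with open(...) as f:` yields this iterator of newline-terminated lines.
    file_mock.__enter__.return_value = iter(line + '\n' for line in lines)
    return file_mock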
import sys

import dataprep.api.corpus as api
from dataprep.api.corpus import PreprocessedCorpus
from fastai.text import NumericalizeProcessor, OpenFileProcessor, TextList, Vocab

if __name__ == '__main__':
    # Apply BPE (the '10k' code set) to the corpus and compute its vocabulary.
    prep_corpus: PreprocessedCorpus = api.bpe('/home/hlib/dev/yahtzee', '10k',
                                              calc_vocab=True)
    vocab = Vocab(list(prep_corpus.load_vocab().keys()))
    # Note: the chain ends in `.databunch()`, so this is a DataBunch, not a TextList.
    text_list = (TextList
                 .from_folder(prep_corpus.path_to_prep_dataset, vocab=vocab,
                              extensions=['.prep'],
                              processor=[OpenFileProcessor(),
                                         NumericalizeProcessor(vocab=vocab,
                                                               max_vocab=sys.maxsize,
                                                               min_freq=0)])
                 .split_by_rand_pct()
                 .label_for_lm()
                 .databunch())
    print(text_list)