def test_readers():
    """A custom registered corpora reader ("myreader.v1") should supply
    train/dev/extra corpora that drive initialize/update/evaluate on a
    tok2vec + textcat pipeline, and extra corpora keys should resolve too."""
    config_string = """
    [training]

    [corpora]
    @readers = "myreader.v1"

    [nlp]
    lang = "en"
    pipeline = ["tok2vec", "textcat"]

    [components]

    [components.tok2vec]
    factory = "tok2vec"

    [components.textcat]
    factory = "textcat"
    """

    @registry.readers.register("myreader.v1")
    def myreader() -> Dict[str, Callable[[Language, str], Iterable[Example]]]:
        annots = {"cats": {"POS": 1.0, "NEG": 0.0}}

        def reader(nlp: Language):
            # Plain string literal: the original f-string had no placeholders (F541).
            doc = nlp.make_doc("This is an example")
            return [Example.from_dict(doc, annots)]

        # "extra"/"something" are deliberately beyond the standard train/dev pair.
        return {"train": reader, "dev": reader, "extra": reader, "something": reader}

    config = Config().from_str(config_string)
    nlp = load_model_from_config(config, auto_fill=True)
    T = registry.resolve(nlp.config.interpolate()["training"], schema=ConfigSchemaTraining)
    dot_names = [T["train_corpus"], T["dev_corpus"]]
    train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names)
    assert isinstance(train_corpus, Callable)
    optimizer = T["optimizer"]
    # simulate a training loop
    nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
    for example in train_corpus(nlp):
        nlp.update([example], sgd=optimizer)
    scores = nlp.evaluate(list(dev_corpus(nlp)))
    assert scores["cats_macro_auc"] == 0.0
    # ensure the pipeline runs
    doc = nlp("Quick test")
    assert doc.cats
    # Non-standard corpora keys (e.g. "extra") must also resolve to callables.
    corpora = {"corpora": nlp.config.interpolate()["corpora"]}
    extra_corpus = registry.resolve(corpora)["corpora"]["extra"]
    assert isinstance(extra_corpus, Callable)
def __init__(
    self,
    config: Config,
    *,
    rank: int = 0,
    num_workers: int = 1,
    use_gpu: int = 0,
    ray=None,
):
    """Initialize a distributed training worker.

    config: The training config; copied before being filled by init_nlp.
    rank: This worker's index among the workers.
    num_workers: Total number of workers participating.
    use_gpu: GPU id to use; values >= 0 enable GPU allocation below.
    ray: Optional ray module (or test double). If None, the real `ray`
        package is imported lazily here.
    """
    if ray is None:
        # Avoid importing ray in the module. This allows a test-ray to
        # be passed in, and speeds up the CLI.
        import ray  # type: ignore
    self.ray = ray
    self.rank = rank
    self.num_workers = num_workers
    # NOTE(review): _resolve_gpu is defined elsewhere in the class — presumably
    # maps `use_gpu` to a concrete device id for this worker; confirm there.
    self.gpu_id = self._resolve_gpu(use_gpu)
    # Build the pipeline from a copy of the config, then re-read the filled,
    # interpolated config back off the resulting nlp object.
    self.nlp = init_nlp(Config(config), use_gpu=self.gpu_id)
    config = self.nlp.config.interpolate()
    self.T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
    dot_names = [self.T["train_corpus"], self.T["dev_corpus"]]
    self.train_corpus, self.dev_corpus = resolve_dot_names(config, dot_names)
    self.before_to_disk = create_before_to_disk_callback(self.T["before_to_disk"])
    # Switch the memory allocator only when a GPU was requested and the
    # config names an allocator; must happen after the nlp is built.
    allocator = self.T["gpu_allocator"]
    if use_gpu >= 0 and allocator:
        set_gpu_allocator(allocator)
    # Evaluation is wired up later; start with a no-op callback.
    self._evaluation_callback = lambda: {}
    self._results = []
    self._has_evaluation_callback = False
    self.thread = None
    self.proxy = None
    # Counters for gradient bookkeeping across workers.
    self.n_grads_used = 0
    self.n_grads_discarded = 0
def test_cat_readers(reader, additional_config):
    """Parametrized over registered dataset readers: each reader must yield
    train/dev examples with both a positive and a negative cats label, and
    the resulting pipeline must train, evaluate, and run."""
    nlp_config_string = """
    [training]
    seed = 0

    [training.score_weights]
    cats_macro_auc = 1.0

    [corpora]
    @readers = "PLACEHOLDER"

    [nlp]
    lang = "en"
    pipeline = ["tok2vec", "textcat_multilabel"]

    [components]

    [components.tok2vec]
    factory = "tok2vec"

    [components.textcat_multilabel]
    factory = "textcat_multilabel"
    """
    config = Config().from_str(nlp_config_string)
    fix_random_seed(config["training"]["seed"])
    # Swap the placeholder for the reader under test and apply its settings.
    config["corpora"]["@readers"] = reader
    config["corpora"].update(additional_config)
    nlp = load_model_from_config(config, auto_fill=True)
    T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
    dot_names = [T["train_corpus"], T["dev_corpus"]]
    train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names)
    optimizer = T["optimizer"]
    # simulate a training loop
    nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
    for example in train_corpus(nlp):
        assert example.y.cats
        # this shouldn't fail if each training example has at least one positive label
        # (sorted() accepts the set directly; the wrapping list() was redundant)
        assert sorted(set(example.y.cats.values())) == [0.0, 1.0]
        nlp.update([example], sgd=optimizer)
    # simulate performance benchmark on dev corpus
    dev_examples = list(dev_corpus(nlp))
    for example in dev_examples:
        # this shouldn't fail if each dev example has at least one positive label
        assert sorted(set(example.y.cats.values())) == [0.0, 1.0]
    scores = nlp.evaluate(dev_examples)
    assert scores["cats_score"]
    # ensure the pipeline runs
    doc = nlp("Quick test")
    assert doc.cats
def test_create_nlp_from_pretraining_config():
    """The default pretraining config, merged onto the base config string,
    must validate against the pretraining schema without errors."""
    base = Config().from_str(pretrain_config_string)
    defaults = load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
    merged = base.merge(defaults)
    registry.resolve(merged["pretraining"], schema=ConfigSchemaPretrain)