from typing import Callable, Dict, Iterable

from thinc.api import Config
from spacy.language import Language
from spacy.schemas import ConfigSchemaTraining
from spacy.training import Example
from spacy.util import load_model_from_config, registry, resolve_dot_names


def test_readers():
    config_string = """
    [training]

    [corpora]
    @readers = "myreader.v1"

    [nlp]
    lang = "en"
    pipeline = ["tok2vec", "textcat"]

    [components]

    [components.tok2vec]
    factory = "tok2vec"

    [components.textcat]
    factory = "textcat"
    """

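    # Register the reader under the name referenced by the [corpora] block above.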
    @registry.readers.register("myreader.v1")
    def myreader() -> Dict[str, Callable[[Language], Iterable[Example]]]:
        annots = {"cats": {"POS": 1.0, "NEG": 0.0}}

        def reader(nlp: Language):
            doc = nlp.make_doc("This is an example")
            return [Example.from_dict(doc, annots)]

        return {
            "train": reader,
            "dev": reader,
            "extra": reader,
            "something": reader
        }

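    # Parse the config and construct the pipeline, letting auto_fill supply defaults.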
    config = Config().from_str(config_string)
    nlp = load_model_from_config(config, auto_fill=True)
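    # Resolve the interpolated [training] section against its schema to build
    # concrete objects such as the optimizer.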
    T = registry.resolve(nlp.config.interpolate()["training"],
                         schema=ConfigSchemaTraining)
    dot_names = [T["train_corpus"], T["dev_corpus"]]
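    # Dot-paths like "corpora.train" resolve to the reader callables registered above.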
    train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names)
    assert isinstance(train_corpus, Callable)
    optimizer = T["optimizer"]
    # simulate a training loop
    nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
    for example in train_corpus(nlp):
        nlp.update([example], sgd=optimizer)
    scores = nlp.evaluate(list(dev_corpus(nlp)))
    assert scores["cats_macro_auc"] == 0.0
    # ensure the pipeline runs
    doc = nlp("Quick test")
    assert doc.cats
    corpora = {"corpora": nlp.config.interpolate()["corpora"]}
    extra_corpus = registry.resolve(corpora)["corpora"]["extra"]
    assert isinstance(extra_corpus, Callable)
Example #2
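    # __init__ of a Ray-based distributed training worker; only this method is shown.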
    def __init__(
        self,
        config: Config,
        *,
        rank: int = 0,
        num_workers: int = 1,
        use_gpu: int = 0,
        ray=None,
    ):
        if ray is None:
            # Avoid importing ray in the module. This allows a test-ray to
            # be passed in, and speeds up the CLI.
            import ray  # type: ignore

        self.ray = ray
        self.rank = rank
        self.num_workers = num_workers
        self.gpu_id = self._resolve_gpu(use_gpu)
        self.nlp = init_nlp(Config(config), use_gpu=self.gpu_id)
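        # Work from the filled config of the initialized pipeline, with all
        # variables interpolated.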
        config = self.nlp.config.interpolate()
        self.T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
        dot_names = [self.T["train_corpus"], self.T["dev_corpus"]]
        self.train_corpus, self.dev_corpus = resolve_dot_names(config, dot_names)
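        # before_to_disk is a user hook applied to the pipeline before each
        # checkpoint is saved.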
        self.before_to_disk = create_before_to_disk_callback(self.T["before_to_disk"])
        allocator = self.T["gpu_allocator"]
        if use_gpu >= 0 and allocator:
            set_gpu_allocator(allocator)
        self._evaluation_callback = lambda: {}
        self._results = []
        self._has_evaluation_callback = False
        self.thread = None
        self.proxy = None
        self.n_grads_used = 0
        self.n_grads_discarded = 0
Example #3
from thinc.api import Config, fix_random_seed
from spacy.schemas import ConfigSchemaTraining
from spacy.util import load_model_from_config, registry, resolve_dot_names


# `reader` and `additional_config` are supplied by a pytest parametrization
# that is not shown in this snippet.
def test_cat_readers(reader, additional_config):
    nlp_config_string = """
    [training]
    seed = 0

    [training.score_weights]
    cats_macro_auc = 1.0

    [corpora]
    @readers = "PLACEHOLDER"

    [nlp]
    lang = "en"
    pipeline = ["tok2vec", "textcat_multilabel"]

    [components]

    [components.tok2vec]
    factory = "tok2vec"

    [components.textcat_multilabel]
    factory = "textcat_multilabel"
    """
    config = Config().from_str(nlp_config_string)
    fix_random_seed(config["training"]["seed"])
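    # Swap the placeholder for the parametrized reader and merge its settings
    # into [corpora].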
    config["corpora"]["@readers"] = reader
    config["corpora"].update(additional_config)
    nlp = load_model_from_config(config, auto_fill=True)
    T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
    dot_names = [T["train_corpus"], T["dev_corpus"]]
    train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names)
    optimizer = T["optimizer"]
    # simulate a training loop
    nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
    for example in train_corpus(nlp):
        assert example.y.cats
        # every training example should carry both a positive (1.0) and a
        # negative (0.0) category label
        assert sorted(set(example.y.cats.values())) == [0.0, 1.0]
        nlp.update([example], sgd=optimizer)
    # simulate performance benchmark on dev corpus
    dev_examples = list(dev_corpus(nlp))
    for example in dev_examples:
        # likewise, every dev example should carry both label values
        assert sorted(set(example.y.cats.values())) == [0.0, 1.0]
    scores = nlp.evaluate(dev_examples)
    assert scores["cats_score"]
    # ensure the pipeline runs
    doc = nlp("Quick test")
    assert doc.cats
Example #4
from thinc.api import Config
from spacy.language import DEFAULT_CONFIG_PRETRAIN_PATH
from spacy.schemas import ConfigSchemaPretrain
from spacy.util import load_config, registry


def test_create_nlp_from_pretraining_config():
    """Test that the default pretraining config validates properly"""
    config = Config().from_str(pretrain_config_string)
    pretrain_config = load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
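    # Layer the default [pretraining] block on top of the base config before
    # validating it against its schema.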
    filled = config.merge(pretrain_config)
    registry.resolve(filled["pretraining"], schema=ConfigSchemaPretrain)
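
A minimal standalone sketch of the pattern most of these examples exercise:
register a corpora reader and resolve it from a config. The name
"demo_reader.v1" and the reader body are illustrative placeholders, not part
of spaCy.

from typing import Callable, Dict, Iterable

from thinc.api import Config
from spacy.language import Language
from spacy.training import Example
from spacy.util import registry


@registry.readers.register("demo_reader.v1")
def demo_reader() -> Dict[str, Callable[[Language], Iterable[Example]]]:
    # A corpus reader takes the pipeline and yields annotated Examples.
    def read(nlp: Language) -> Iterable[Example]:
        doc = nlp.make_doc("An illustrative document")
        yield Example.from_dict(doc, {"cats": {"POS": 1.0, "NEG": 0.0}})

    return {"train": read, "dev": read}


config = Config().from_str("""
[corpora]
@readers = "demo_reader.v1"
""")
# Resolving the block calls demo_reader() and returns its dict of reader callables.
train_reader = registry.resolve(config)["corpora"]["train"]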