예제 #1
0
def init_tok2vec(nlp: "Language", pretrain_config: Dict[str, Any],
                 init_config: Dict[str, Any]) -> bool:
    """Load pretrained tok2vec weights (cf. CLI command 'pretrain') into the
    component named in the pretraining settings.

    nlp (Language): The pipeline holding the component that receives the weights.
    pretrain_config (Dict[str, Any]): The pretraining settings. The
        "objective", "component" and "layer" entries are read.
    init_config (Dict[str, Any]): The initialize settings. The "init_tok2vec"
        and "vectors" entries are read.
    RETURNS (bool): True if weights were loaded into the model, False if no
        weights were available to load.
    RAISES (ConfigValidationError): If the objective type is "vectors" but no
        vectors are set, if the weights file does not exist, or if the
        pretraining component is null.
    """
    weights_path = ensure_path(init_config["init_tok2vec"])
    if weights_path is None:
        # No weights file configured, so there is nothing to load.
        return False

    # The "vectors" setting is only consulted for the "vectors" objective.
    uses_vectors_objective = pretrain_config["objective"].get("type") == "vectors"
    if uses_vectors_objective and not init_config["vectors"]:
        err = 'need initialize.vectors if pretraining.objective.type is "vectors"'
        raise ConfigValidationError(
            config=nlp.config,
            errors=[{"loc": ["initialize"], "msg": err}],
        )
    if not weights_path.exists():
        err = f"can't find pretrained tok2vec: {weights_path}"
        raise ConfigValidationError(
            config=nlp.config,
            errors=[{"loc": ["initialize", "init_tok2vec"], "msg": err}],
        )

    with weights_path.open("rb") as file_:
        weights_data = file_.read()
    if weights_data is None:
        return False

    component_name = pretrain_config["component"]
    if component_name is None:
        desc = (
            f"To use pretrained tok2vec weights, [pretraining.component] "
            f"needs to specify the component that should load them.")
        err = "component can't be null"
        raise ConfigValidationError(
            config=nlp.config["pretraining"],
            errors=[{"loc": ["pretraining", "component"], "msg": err}],
            desc=desc,
        )

    # Start from the component's model and optionally narrow down to the
    # referenced sub-layer before deserializing the weights into it.
    target = nlp.get_pipe(component_name).model
    layer_ref = pretrain_config["layer"]
    if layer_ref:
        target = target.get_ref(layer_ref)
    target.from_bytes(weights_data)
    return True
예제 #2
0
def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
    """Build an nlp object from a config and initialize it for training.

    config (Config): The config to build from. Its [training] section must
        contain the keys "seed" and "gpu_allocator".
    use_gpu (int): The GPU allocator from the config is only set if this
        value is >= 0.
    RETURNS (Language): The initialized nlp object.
    RAISES (ValueError): If "seed" or "gpu_allocator" is missing from the
        [training] section.
    RAISES (ConfigValidationError): If the resolved train_corpus or dev_corpus
        setting is not a string.
    """
    raw_config = config
    interpolated = raw_config.interpolate()
    training_section = interpolated["training"]
    required_keys = (
        ("seed", "[training] seed"),
        ("gpu_allocator", "[training] gpu_allocator"),
    )
    for key, label in required_keys:
        if key not in training_section:
            raise ValueError(Errors.E1015.format(value=label))

    seed = training_section["seed"]
    if seed is not None:
        fix_random_seed(seed)
    allocator = training_section["gpu_allocator"]
    if use_gpu >= 0 and allocator:
        set_gpu_allocator(allocator)

    # Sourced components are read from the interpolated config, while the
    # nlp object itself is built from the raw config with auto-fill enabled.
    sourced = get_sourced_components(interpolated)
    nlp = load_model_from_config(raw_config, auto_fill=True)
    logger.info("Set up nlp object from config")

    # From here on, work with the filled config stored on the nlp object and
    # resolve the training section against its schema.
    filled = nlp.config.interpolate()
    T = registry.resolve(filled["training"], schema=ConfigSchemaTraining)
    dot_names = [T["train_corpus"], T["dev_corpus"]]
    corpus_fields = (
        ("train_corpus", "training.train_corpus"),
        ("dev_corpus", "training.dev_corpus"),
    )
    for key, field in corpus_fields:
        value = T[key]
        if not isinstance(value, str):
            raise ConfigValidationError(
                desc=Errors.E897.format(field=field, type=type(value))
            )
    train_corpus, dev_corpus = resolve_dot_names(filled, dot_names)
    optimizer = T["optimizer"]

    # Frozen components are not updated during training; sourced components
    # that are not frozen need resume_training instead of initialization.
    frozen_components = T["frozen_components"]
    resume_components = [
        pipe_name for pipe_name in sourced if pipe_name not in frozen_components
    ]
    logger.info(f"Pipeline: {nlp.pipe_names}")
    if resume_components:
        with nlp.select_pipes(enable=resume_components):
            logger.info(f"Resuming training for: {resume_components}")
            nlp.resume_training(sgd=optimizer)

    # Listeners have to be linked before the remaining components are
    # initialized.
    nlp._link_components()
    skipped = [*frozen_components, *resume_components]
    with nlp.select_pipes(disable=skipped):
        nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
        logger.info(f"Initialized pipeline components: {nlp.pipe_names}")

    # Warn when a component with listeners (e.g. tok2vec/transformer) and one
    # of its listeners are not frozen consistently.
    for name, proc in nlp.pipeline:
        if not getattr(proc, "listening_components", None):
            continue
        for listener in proc.listening_components:
            # Listener is frozen, but the component it listens to is not.
            if listener in frozen_components and name not in frozen_components:
                logger.warning(Warnings.W087.format(name=name, listener=listener))
            # Checked independently: the listened-to component is frozen,
            # but the listener is not.
            if listener not in frozen_components and name in frozen_components:
                logger.warning(Warnings.W086.format(name=name, listener=listener))
    return nlp
예제 #3
0
def validate_init_settings(
        func: Callable,
        settings: Dict[str, Any],
        *,
        section: Optional[str] = None,
        name: str = "",
        exclude: Iterable[str] = ("get_examples", "nlp"),
) -> Dict[str, Any]:
    """Check initialization settings against the arguments expected by the
    signature of the given function and return the validated settings. An
    argument model is created from the function (minus the excluded
    parameters), instantiated with the settings and converted back to a dict.

    func (Callable): The initialize method of a given component etc.
    settings (Dict[str, Any]): The settings from the respective [initialize] block.
    section (str): Initialize section, used in the error message only.
    name (str): Name of the block in the section, passed on as the error's parent.
    exclude (Iterable[str]): Parameter names to exclude from the schema.
    RETURNS (Dict[str, Any]): The validated settings.
    RAISES (ConfigValidationError): If the settings fail validation against
        the argument model.
    """
    arg_model = get_arg_model(func, exclude=exclude, name="InitArgModel")
    try:
        validated = arg_model(**settings).dict()
    except ValidationError as e:
        # Report the error against [initialize] or [initialize.<section>].
        block = f"initialize.{section}" if section else "initialize"
        raise ConfigValidationError(
            title=f"Error validating initialization settings in [{block}]",
            errors=e.errors(),
            config=settings,
            parent=name,
        ) from None
    return validated
예제 #4
0
def load_vectors_into_model(nlp: "Language",
                            name: Union[str, Path],
                            *,
                            add_strings: bool = True) -> None:
    """Load word vectors from an installed model or path into a model instance.

    nlp (Language): The pipeline whose vocab receives the vectors.
    name (Union[str, Path]): Name or path handed to load_model to obtain the
        pipeline that provides the vectors.
    add_strings (bool): Whether to also add the strings for the vector keys
        that the loaded pipeline's string store contains.
    RAISES (ConfigValidationError): If loading the vectors pipeline fails
        with a config validation error.
    """
    try:
        vectors_nlp = load_model(name)
    except ConfigValidationError as e:
        raise ConfigValidationError.from_error(
            e,
            title=f"Config validation error for vectors {name}",
            desc=(
                "This typically means that there's a problem in the config.cfg included "
                "with the packaged vectors. Make sure that the vectors package you're "
                "loading is compatible with the current version of spaCy."),
        ) from None

    source_vocab = vectors_nlp.vocab
    if len(source_vocab.vectors.keys()) == 0:
        logger.warning(Warnings.W112.format(name=name))

    target_vocab = nlp.vocab
    target_vocab.vectors = source_vocab.vectors
    # Point every lexeme at its vector row, or at OOV_RANK if it has none.
    for lexeme in target_vocab:
        lexeme.rank = target_vocab.vectors.key2row.get(lexeme.orth, OOV_RANK)
    if not add_strings:
        return

    # Copy over the strings for the vector keys, since e.g. a similarity
    # query might expect them to be available.
    source_strings = source_vocab.strings
    for key in target_vocab.vectors.key2row:
        if key in source_strings:
            target_vocab.strings.add(source_strings[key])
예제 #5
0
def load_vectors_into_model(nlp: "Language",
                            name: Union[str, Path],
                            *,
                            add_strings: bool = True) -> None:
    """Load word vectors from an installed model or path into a model instance.

    nlp (Language): The pipeline whose vocab is passed to load_model and whose
        lexeme ranks are updated afterwards.
    name (Union[str, Path]): Name or path handed to load_model.
    add_strings (bool): If False, "strings" is excluded from loading in
        addition to "lookups".
    RAISES (ConfigValidationError): If loading the vectors pipeline fails
        with a config validation error.
    """
    # Loading with the same vocab automatically adds the vectors to the
    # current nlp object. Lookups are always excluded so they are not modified.
    exclude = ["lookups"] if add_strings else ["lookups", "strings"]
    try:
        vectors_nlp = load_model(name, vocab=nlp.vocab, exclude=exclude)
    except ConfigValidationError as e:
        raise ConfigValidationError.from_error(
            e,
            title=f"Config validation error for vectors {name}",
            desc=(
                "This typically means that there's a problem in the config.cfg included "
                "with the packaged vectors. Make sure that the vectors package you're "
                "loading is compatible with the current version of spaCy."),
        ) from None

    # Emptiness is judged by the number of keys for non-floret vectors and by
    # the number of rows (first shape dimension) for floret vectors.
    loaded = vectors_nlp.vocab.vectors
    no_keys = len(loaded.keys()) == 0 and loaded.mode != VectorsMode.floret
    if no_keys or (loaded.shape[0] == 0
                   and loaded.mode == VectorsMode.floret):
        logger.warning(Warnings.W112.format(name=name))

    # Point every lexeme at its vector row, or at OOV_RANK if it has none.
    vocab = nlp.vocab
    for lexeme in vocab:
        lexeme.rank = vocab.vectors.key2row.get(
            lexeme.orth, OOV_RANK)  # type: ignore[attr-defined]
예제 #6
0
def init_tok2vec(nlp: "Language", pretrain_config: Dict[str, Any],
                 init_config: Dict[str, Any]) -> bool:
    """Load pretrained tok2vec weights (cf. CLI command 'pretrain') into the
    layer that get_tok2vec_ref returns for the pretraining settings.

    nlp (Language): The pipeline holding the layer that receives the weights.
    pretrain_config (Dict[str, Any]): The pretraining settings, passed on to
        get_tok2vec_ref.
    init_config (Dict[str, Any]): The initialize settings. The "init_tok2vec"
        entry is read.
    RETURNS (bool): True if weights were loaded into the layer, False if no
        weights were available to load.
    RAISES (ConfigValidationError): If the configured weights file does not
        exist.
    """
    weights_path = ensure_path(init_config["init_tok2vec"])
    if weights_path is None:
        # No weights file configured, so there is nothing to load.
        return False
    if not weights_path.exists():
        err = f"can't find pretrained tok2vec: {weights_path}"
        raise ConfigValidationError(
            config=nlp.config,
            errors=[{"loc": ["initialize", "init_tok2vec"], "msg": err}],
        )

    with weights_path.open("rb") as file_:
        weights_data = file_.read()
    if weights_data is None:
        return False

    target = get_tok2vec_ref(nlp, pretrain_config)
    target.from_bytes(weights_data)
    logger.info(f"Loaded pretrained weights from {weights_path}")
    return True