Пример #1
0
def test_dense_features_with_spacy_sm(en_core_web_sm):
    """Dense features stay absent when the small model provides no vectors."""
    message = {"text": "hello world"}
    SpacyTokenizer(en_core_web_sm).tokenize(message)

    featurizer = Featurizer(use_dense_features=True)
    assert featurizer.get_dense_features(message) is None
Пример #2
0
def test_vectors_with_spacy_sm(en_core_web_sm):
    """No per-token vectors are attached when using the small spaCy model."""
    message = {"text": "hello world"}
    SpacyTokenizer(en_core_web_sm).tokenize(message)

    for position in range(3):
        assert message["tokens"][position].get("vector") is None
Пример #3
0
def test_dense_features_with_spacy_md_and_flag_disabled(en_core_web_md):
    """Even with vectors available, a disabled flag yields no dense features."""
    message = {"text": "hello world"}
    SpacyTokenizer(en_core_web_md).tokenize(message)

    featurizer = Featurizer(use_dense_features=False)
    assert featurizer.get_dense_features(message) is None
Пример #4
0
def test_cls(en_core_web_sm):
    """Tokenizer emits the word tokens followed by a trailing ``__CLS__`` token."""
    message = {"text": "hello world"}
    SpacyTokenizer(en_core_web_sm).tokenize(message)

    tokens = message["tokens"]
    assert len(tokens) == 3
    assert [tok.text for tok in tokens] == ["hello", "world", "__CLS__"]
Пример #5
0
def test_vectors_with_spacy_md(en_core_web_md):
    """Medium model attaches 300-dim vectors to word tokens but not to CLS."""
    message = {"text": "hello world"}
    SpacyTokenizer(en_core_web_md).tokenize(message)

    tokens = message["tokens"]
    for word_token in tokens[:2]:
        assert word_token.get("vector").shape == (300, )
    # CLS vector will be computed later
    assert tokens[2].get("vector") is None
Пример #6
0
def test_dense_features_with_spacy_md(en_core_web_md):
    """With vectors available and the flag enabled, dense features appear."""
    message = {"text": "hello world"}
    SpacyTokenizer(en_core_web_md).tokenize(message)

    dense = Featurizer(use_dense_features=True).get_dense_features(message)

    assert len(dense) == 3
    assert len(dense[0]["text_dense_features"]) == 300
Пример #7
0
def gold_example_to_crf_tokens(
    example: Dict,
    tokenizer: Optional[Tokenizer] = None,
    use_dense_features: bool = False,
    bilou: bool = True,
) -> List[CRFToken]:
    """Translate training example to CRF feature space.

    Args:
        example (dict): example dict. must have either "doc", "tokens" or "text" field.
        tokenizer (Tokenizer): tokenizer.
        use_dense_features (bool): use dense features.
        bilou (bool): apply BILOU tags to example.

    Returns:
        List[CRFToken], CRF example.
    """
    if not example:
        return []

    active_tokenizer = tokenizer or SpacyTokenizer()
    featurizer = Featurizer(use_dense_features=use_dense_features)

    if "tokens" in example:
        # Already tokenized upstream — only attach dense vectors when they
        # were requested and the tokenizer is able to supply them.
        if use_dense_features and isinstance(active_tokenizer, SpacyTokenizer):
            for tok in example["tokens"]:
                vec = active_tokenizer.get_vector(tok)
                if vec is not None:
                    tok.set("vector", vec)
    elif "text" in example:
        # No tokens yet: run the tokenizer (SpacyTokenizer by default).
        active_tokenizer.tokenize(example, attribute="text")
    else:
        raise ValueError(f"Bad example: {example}. "
                         f"Attribute ``text`` or ``tokens`` is missing.")

    # By default, JSON examples don't have a tagging schema like "BILOU".
    # If they do, like in CoNLL datasets, we strip them after alignment.
    entities = featurizer.apply_bilou_schema(example)
    if not bilou:
        remove_bilou_prefixes(entities)
    return featurizer(example, entities)
Пример #8
0
def main(in_file, model_file=None, config_file=None, spacy_model=None):
    """Evaluate a trained CRF entity tagger on a dev dataset.

    Args:
        in_file: path to the dev dataset file.
        model_file: path to a pickled CRF model; defaults to "model.pkl".
        config_file: optional JSON file with component configuration.
        spacy_model: optional spaCy model name; a blank English pipeline
            is used when omitted.
    """
    # NOTE: the original docstring said "Train CRF entity tagger" — a
    # copy-paste from the training script; this function only evaluates.
    if config_file:
        msg.info(f"Loading config: {config_file}")
        component_config = srsly.read_json(config_file)
    else:
        component_config = None

    model_file = model_file or "model.pkl"
    msg.info("Loading model from file", model_file)
    crf_extractor = CRFExtractor(
        component_config=component_config).from_disk(model_file)
    msg.good("Successfully loaded CRF tagger", crf_extractor)

    msg.info("Loading dev dataset from file", in_file)
    dev_examples = read_file(in_file)
    msg.good(f"Successfully loaded {len(dev_examples)} dev examples.")

    if spacy_model is not None:
        nlp = spacy.load(spacy_model)
        msg.info(f"Using spaCy model: {spacy_model}")
    else:
        nlp = spacy.blank("en")
        # Plain string: the former f-string had no placeholders (F541).
        msg.info("Using spaCy blank: 'en'")

    tokenizer = SpacyTokenizer(nlp=nlp)
    # Dense features must match what the model was trained with.
    use_dense_features = crf_extractor.use_dense_features()
    dev_crf_examples = [
        gold_example_to_crf_tokens(ex,
                                   tokenizer=tokenizer,
                                   use_dense_features=use_dense_features)
        for ex in dev_examples
    ]

    f1_score, classification_report = crf_extractor.eval(dev_crf_examples)
    msg.warn(f"f1 score: {f1_score}")
    print(classification_report)
Пример #9
0
 def __init__(self,
              nlp: Language,
              crf_extractor: Optional[CRFExtractor] = None):
     """Initialize the component.

     Args:
         nlp (Language): spaCy language pipeline.
         crf_extractor (Optional[CRFExtractor]): CRF extractor instance;
             presumably may also be set later (e.g. loaded from disk) —
             TODO confirm against the enclosing class.
     """
     self.nlp = nlp
     self.crf_extractor = crf_extractor
     # Tokenizer reuses the same pipeline so tokenization stays consistent.
     self.spacy_tokenizer = SpacyTokenizer(nlp)
Пример #10
0
class CRFEntityExtractor(object):
    """spaCy v2.0 pipeline component that sets entity annotations
    based on CRF (Conditional Random Field) estimator.


    See ```CRFExtractor``` for CRF implementation details.
    """

    name = "crf_ner"

    def __init__(self,
                 nlp: Language,
                 crf_extractor: Optional[CRFExtractor] = None):
        """Initialize the component.

        Args:
            nlp (Language): spaCy language pipeline.
            crf_extractor (Optional[CRFExtractor]): CRF extractor; may be
                loaded later via ``from_disk``.
        """
        self.nlp = nlp
        self.crf_extractor = crf_extractor
        self.spacy_tokenizer = SpacyTokenizer(nlp)

    def __call__(self, doc: Doc):
        """Apply the pipeline component on a Doc object and modify it if matches
        are found. Return the Doc, so it can be processed by the next component
        in the pipeline, if available.

        References:
            - ``https://spacy.io/usage/processing-pipelines#component-example2``.

        Args:
            doc (Doc): spaCy document.

        Returns:
            doc

        Raises:
            RuntimeError: if no CRF extractor has been set or loaded.
        """
        if not self.crf_extractor:
            raise RuntimeError("`CRFEntityExtractor` was not initialized. "
                               "Did you call `.from_disk()` method ?")

        example = {"doc": doc, "text": doc.text}
        self.spacy_tokenizer.tokenize(example, attribute="doc")

        spans = []
        for entity_dict in self.crf_extractor.process(example):
            span = doc.char_span(entity_dict["start"],
                                 entity_dict["end"],
                                 label=entity_dict["entity"])
            # BUGFIX: ``Doc.char_span`` returns None when the character
            # offsets do not align with token boundaries; appending None
            # to ``doc.ents`` raises at runtime, so skip those predictions.
            if span is not None:
                spans.append(span)

        doc.ents = list(doc.ents) + spans
        for span in spans:
            # Iterate over all spans and merge them into one token. This is done
            # after setting the entities – otherwise, it would cause mismatched
            # indices!
            span.merge()

        return doc

    def from_disk(self, path: Union[Path, str]) -> "CRFEntityExtractor":
        """Load crf extractor from disk.

        Args:
            path: path to directory.

        Returns:
            Component
        """
        if not isinstance(path, Path):
            path = Path(path)

        self.crf_extractor = CRFExtractor().from_disk(path)
        return self
Пример #11
0
def main(
    in_file,
    out_dir=None,
    model_file=None,
    config_file=None,
    spacy_model=None,
    fine_tune=False,
):
    """Train CRF entity tagger.

    Args:
        in_file: path to the training dataset file.
        out_dir: directory for the saved model; defaults to the cwd.
        model_file: optional existing model to continue from.
        config_file: optional JSON file with component configuration.
        spacy_model: optional spaCy model name; blank English if omitted.
        fine_tune: when True, run a randomized hyper-parameter search first.
    """
    if config_file:
        msg.info("Loading config from disk")
        component_config = srsly.read_json(config_file)
        msg.good("Successfully loaded config from file.", config_file)
    else:
        component_config = None

    crf_extractor = CRFExtractor(component_config=component_config)

    if model_file is not None:
        # Plain string: the former f-string had no placeholders (F541).
        msg.info("Loading model from disk.")
        crf_extractor = crf_extractor.from_disk(model_file)
        msg.good("Successfully loaded model from file.", model_file)

    msg.info("Loading training examples.")
    train_examples = read_file(in_file)
    msg.good(
        f"Successfully loaded {len(train_examples)} training examples from file.",
        in_file)

    if spacy_model is not None:
        nlp = spacy.load(spacy_model)
        msg.info(f"Using spaCy model: {spacy_model}")
    else:
        nlp = spacy.blank("en")
        # Plain string: the former f-string had no placeholders (F541).
        msg.info("Using spaCy blank: 'en'")

    tokenizer = SpacyTokenizer(nlp=nlp)
    use_dense_features = crf_extractor.use_dense_features()
    train_crf = [
        gold_example_to_crf_tokens(ex,
                                   tokenizer=tokenizer,
                                   use_dense_features=use_dense_features)
        for ex in train_examples
    ]

    if fine_tune:
        msg.info("Fine-tuning hyper params.")
        rs = crf_extractor.fine_tune(train_crf,
                                     cv=5,
                                     n_iter=30,
                                     random_state=42)
        msg.good("Setting fine-tuned hyper params:", rs.best_params_)
        crf_extractor.component_config.update(rs.best_params_)

    msg.info("Training entity tagger with CRF.")
    crf_extractor.train(train_crf)

    model_path = pathlib.Path(out_dir or ".").resolve() / "model.pkl"
    msg.info("Saving model to disk")
    # BUGFIX: ``parents=True`` so a nested out_dir is created instead of
    # raising FileNotFoundError.
    model_path.parent.mkdir(parents=True, exist_ok=True)
    crf_extractor.to_disk(model_path)
    try:
        display_path = str(model_path.relative_to(os.getcwd()))
    except ValueError:
        # BUGFIX: out_dir may lie outside the cwd, in which case
        # ``relative_to`` raises ValueError; fall back to the full path.
        display_path = str(model_path)
    msg.good("Successfully saved model to file.", display_path)