def test_dense_features_with_spacy_sm(en_core_web_sm):
    message = {"text": "hello world"}
    tokenizer = SpacyTokenizer(en_core_web_sm)
    tokenizer.tokenize(message)

    featurizer = Featurizer(use_dense_features=True)
    dense_features = featurizer.get_dense_features(message)
    assert dense_features is None

def test_vectors_with_spacy_sm(en_core_web_sm):
    message = {"text": "hello world"}
    tokenizer = SpacyTokenizer(en_core_web_sm)
    tokenizer.tokenize(message)

    assert message["tokens"][0].get("vector") is None
    assert message["tokens"][1].get("vector") is None
    assert message["tokens"][2].get("vector") is None

def test_dense_features_with_spacy_md_and_flag_disabled(en_core_web_md):
    message = {"text": "hello world"}
    tokenizer = SpacyTokenizer(en_core_web_md)
    tokenizer.tokenize(message)

    featurizer = Featurizer(use_dense_features=False)
    dense_features = featurizer.get_dense_features(message)
    assert dense_features is None

def test_cls(en_core_web_sm):
    message = {"text": "hello world"}
    tokenizer = SpacyTokenizer(en_core_web_sm)
    tokenizer.tokenize(message)

    assert len(message["tokens"]) == 3
    assert message["tokens"][0].text == "hello"
    assert message["tokens"][1].text == "world"
    assert message["tokens"][2].text == "__CLS__"

def test_vectors_with_spacy_md(en_core_web_md):
    message = {"text": "hello world"}
    tokenizer = SpacyTokenizer(en_core_web_md)
    tokenizer.tokenize(message)

    assert message["tokens"][0].get("vector").shape == (300,)
    assert message["tokens"][1].get("vector").shape == (300,)
    # The __CLS__ token vector will be computed later.
    assert message["tokens"][2].get("vector") is None

def test_dense_features_with_spacy_md(en_core_web_md):
    message = {"text": "hello world"}
    tokenizer = SpacyTokenizer(en_core_web_md)
    tokenizer.tokenize(message)

    featurizer = Featurizer(use_dense_features=True)
    dense_features = featurizer.get_dense_features(message)
    assert len(dense_features) == 3
    assert len(dense_features[0]["text_dense_features"]) == 300

def gold_example_to_crf_tokens(
    example: Dict,
    tokenizer: Optional[Tokenizer] = None,
    use_dense_features: bool = False,
    bilou: bool = True,
) -> List[CRFToken]:
    """Translate a training example to CRF feature space.

    Args:
        example (dict): example dict. Must have a "doc", "tokens" or "text" field.
        tokenizer (Tokenizer): tokenizer.
        use_dense_features (bool): use dense features.
        bilou (bool): apply BILOU tags to the example.

    Returns:
        List[CRFToken], CRF example.
    """
    if not example:
        return []

    tokenizer = tokenizer or SpacyTokenizer()
    featurizer = Featurizer(use_dense_features=use_dense_features)

    if "tokens" in example:
        # Already tokenized by a 3rd party; nothing to do
        # except adding dense features (when needed).
        if use_dense_features and isinstance(tokenizer, SpacyTokenizer):
            for token in example["tokens"]:
                vector = tokenizer.get_vector(token)
                if vector is not None:
                    token.set("vector", vector)
    elif "text" in example:
        # Call a tokenizer to tokenize the message. Default is SpacyTokenizer.
        tokenizer.tokenize(example, attribute="text")
    else:
        raise ValueError(f"Bad example: {example}. "
                         f"Attribute ``text`` or ``tokens`` is missing.")

    # By default, JSON examples don't have a tagging schema like "BILOU".
    # If they do, as in CoNLL datasets, we strip the prefixes after alignment.
    entities = featurizer.apply_bilou_schema(example)
    if not bilou:
        remove_bilou_prefixes(entities)

    return featurizer(example, entities)

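# Hedged usage sketch for ``gold_example_to_crf_tokens`` (not part of the
# library). The shape of the example dict below — an "entities" list with
# "start"/"end"/"entity" keys — is an assumption about the gold-annotation
# format, inferred from the docstring above and from ``CRFEntityExtractor``,
# which reads those same keys from ``CRFExtractor.process()``.
def _example_gold_to_crf_usage():
    example = {
        "text": "show me flights to boston",
        "entities": [{"start": 19, "end": 25, "entity": "city"}],
    }
    # Tokenizes with the default SpacyTokenizer, aligns BILOU tags to the
    # tokens, and returns a List[CRFToken] suitable for CRFExtractor.train().
    return gold_example_to_crf_tokens(example, bilou=True)
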
def main(in_file, model_file=None, config_file=None, spacy_model=None):
    """Evaluate CRF entity tagger on a dev dataset."""
    if config_file:
        msg.info(f"Loading config: {config_file}")
        component_config = srsly.read_json(config_file)
    else:
        component_config = None

    model_file = model_file or "model.pkl"
    msg.info("Loading model from file", model_file)
    crf_extractor = CRFExtractor(
        component_config=component_config).from_disk(model_file)
    msg.good("Successfully loaded CRF tagger", crf_extractor)

    msg.info("Loading dev dataset from file", in_file)
    dev_examples = read_file(in_file)
    msg.good(f"Successfully loaded {len(dev_examples)} dev examples.")

    if spacy_model is not None:
        nlp = spacy.load(spacy_model)
        msg.info(f"Using spaCy model: {spacy_model}")
    else:
        nlp = spacy.blank("en")
        msg.info("Using spaCy blank: 'en'")

    tokenizer = SpacyTokenizer(nlp=nlp)
    use_dense_features = crf_extractor.use_dense_features()
    dev_crf_examples = [
        gold_example_to_crf_tokens(ex,
                                   tokenizer=tokenizer,
                                   use_dense_features=use_dense_features)
        for ex in dev_examples
    ]

    f1_score, classification_report = crf_extractor.eval(dev_crf_examples)
    msg.warn(f"f1 score: {f1_score}")
    print(classification_report)

class CRFEntityExtractor(object):
    """spaCy v2.0 pipeline component that sets entity annotations
    based on a CRF (Conditional Random Field) estimator.

    See ``CRFExtractor`` for CRF implementation details.
    """

    name = "crf_ner"

    def __init__(self, nlp: Language, crf_extractor: Optional[CRFExtractor] = None):
        self.nlp = nlp
        self.crf_extractor = crf_extractor
        self.spacy_tokenizer = SpacyTokenizer(nlp)

    def __call__(self, doc: Doc):
        """Apply the pipeline component to a Doc object and modify it if
        matches are found. Return the Doc, so it can be processed by the
        next component in the pipeline, if available.

        References:
            - ``https://spacy.io/usage/processing-pipelines#component-example2``

        Args:
            doc (Doc): spaCy document.

        Returns:
            doc
        """
        if not self.crf_extractor:
            raise RuntimeError("`CRFEntityExtractor` was not initialized. "
                               "Did you call the `.from_disk()` method?")

        example = {"doc": doc, "text": doc.text}
        self.spacy_tokenizer.tokenize(example, attribute="doc")

        spans = [
            doc.char_span(entity_dict["start"],
                          entity_dict["end"],
                          label=entity_dict["entity"])
            for entity_dict in self.crf_extractor.process(example)
        ]
        doc.ents = list(doc.ents) + spans

        for span in spans:
            # Iterate over all spans and merge them into one token. This is done
            # after setting the entities – otherwise, it would cause mismatched
            # indices!
            span.merge()

        return doc

    def from_disk(self, path: Union[Path, str]) -> "CRFEntityExtractor":
        """Load the CRF extractor from disk.

        Args:
            path: path to directory.

        Returns:
            Component
        """
        if not isinstance(path, Path):
            path = Path(path)

        self.crf_extractor = CRFExtractor().from_disk(path)
        return self

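# Hedged usage sketch (not part of the module): wiring ``CRFEntityExtractor``
# into a spaCy pipeline. The "model.pkl" path is a placeholder, and the
# ``nlp.add_pipe(component)`` call uses the spaCy v2 API, consistent with the
# v2-style ``span.merge()`` used in the component above.
def _example_pipeline_usage():
    import spacy

    nlp = spacy.load("en_core_web_sm")
    crf_ner = CRFEntityExtractor(nlp).from_disk("model.pkl")  # placeholder path
    nlp.add_pipe(crf_ner, last=True)

    doc = nlp("show me flights to boston")
    return [(ent.text, ent.label_) for ent in doc.ents]
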
def main(
    in_file,
    out_dir=None,
    model_file=None,
    config_file=None,
    spacy_model=None,
    fine_tune=False,
):
    """Train CRF entity tagger."""
    if config_file:
        msg.info("Loading config from disk")
        component_config = srsly.read_json(config_file)
        msg.good("Successfully loaded config from file.", config_file)
    else:
        component_config = None

    crf_extractor = CRFExtractor(component_config=component_config)
    if model_file is not None:
        msg.info("Loading model from disk.")
        crf_extractor = crf_extractor.from_disk(model_file)
        msg.good("Successfully loaded model from file.", model_file)

    msg.info("Loading training examples.")
    train_examples = read_file(in_file)
    msg.good(
        f"Successfully loaded {len(train_examples)} training examples from file.",
        in_file)

    if spacy_model is not None:
        nlp = spacy.load(spacy_model)
        msg.info(f"Using spaCy model: {spacy_model}")
    else:
        nlp = spacy.blank("en")
        msg.info("Using spaCy blank: 'en'")

    tokenizer = SpacyTokenizer(nlp=nlp)
    use_dense_features = crf_extractor.use_dense_features()
    train_crf = [
        gold_example_to_crf_tokens(ex,
                                   tokenizer=tokenizer,
                                   use_dense_features=use_dense_features)
        for ex in train_examples
    ]

    if fine_tune:
        msg.info("Fine-tuning hyper params.")
        rs = crf_extractor.fine_tune(train_crf, cv=5, n_iter=30, random_state=42)
        msg.good("Setting fine-tuned hyper params:", rs.best_params_)
        crf_extractor.component_config.update(rs.best_params_)

    msg.info("Training entity tagger with CRF.")
    crf_extractor.train(train_crf)

    model_path = pathlib.Path(out_dir or ".").resolve() / "model.pkl"
    msg.info("Saving model to disk")
    model_path.parent.mkdir(exist_ok=True)
    crf_extractor.to_disk(model_path)
    msg.good("Successfully saved model to file.",
             str(model_path.relative_to(os.getcwd())))

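# Hedged usage sketch (not part of the script): invoking the training ``main``
# directly from Python. File names are placeholders; in the repo, ``main`` may
# instead be exposed through a CLI wrapper.
def _example_train_invocation():
    main(
        in_file="train.json",          # placeholder training file for read_file()
        out_dir=".",                   # "model.pkl" is written here
        spacy_model="en_core_web_md",  # a model with vectors, in case the
                                       # config enables dense features
        fine_tune=False,               # set True to run the randomized search
    )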