Example #1
 def regex(self, value):
     if value is not None:
         if isinstance(value, (list, dict)):
             self._ruler = EntityRuler(self.model)
             self._ruler.add_patterns(value)
         else:
             self._ruler = EntityRuler(self.model).from_disk(value)
         self.model.add_pipe(self._ruler, before='ner')
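For context, a minimal standalone sketch of the two branches this setter switches between (spaCy v2-style API; "Acme Corp" and "patterns.jsonl" are placeholder values):

import spacy
from spacy.pipeline import EntityRuler

nlp = spacy.load("en_core_web_sm")

# inline patterns: the list/dict branch
ruler = EntityRuler(nlp)
ruler.add_patterns([{"label": "ORG", "pattern": "Acme Corp"}])

# patterns stored on disk (JSONL, one pattern per line): the else branch
# ruler = EntityRuler(nlp).from_disk("patterns.jsonl")

nlp.add_pipe(ruler, before="ner")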
Example #2
def init_entity_ruler(nlp, patterns, label):
    string_store = nlp.vocab.strings
    if label not in string_store:
        string_store.add(label)

    ruler = EntityRuler(nlp)
    ruler.add_patterns(prepare_patterns(patterns, label))
    return ruler
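One plausible shape for the prepare_patterns helper used above (hypothetical; the original helper is not shown in this example):

def prepare_patterns(patterns, label):
    """Wrap raw phrase strings in the dict format EntityRuler expects."""
    return [{"label": label, "pattern": pattern} for pattern in patterns]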
Example #3
def main():
    # load_data
    test1_filepath = 'data/test1.csv'
    test2_filepath = 'data/test2.csv'
    test3_filepath = 'data/test3.csv'

    test1 = pd.read_csv(test1_filepath, index_col=0)
    test2 = pd.read_csv(test2_filepath, index_col=0)
    test3 = pd.read_csv(test3_filepath, index_col=0)
    print('loaded data')

    # remove unnecessary characters
    clean_texts_1 = [replace_newline(t.lower()) for t in test1['text']]
    clean_texts_2 = [replace_newline(t.lower()) for t in test2['text']]
    clean_texts_3 = [replace_newline(t.lower()) for t in test3['text']]

    # apply tokenizer, tagger, parser and ner to text
    nlp = spacy.load('en_core_web_lg')
    ruler = EntityRuler(nlp)
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)
    print('entity ruler added')

    test1_docs = [doc for doc in nlp.pipe(clean_texts_1, batch_size=10)]
    test2_docs = [doc for doc in nlp.pipe(clean_texts_2, batch_size=10)]
    test3_docs = [doc for doc in nlp.pipe(clean_texts_3, batch_size=10)]

    test1 = multilabel_binarizer(
        df2multilabels(df_predictions(test1_docs, clean_texts_1)))
    # test1 = np.insert(test1, 6, 0, axis=1)
    test1 = pd.DataFrame(test1,
                         columns=[
                             'Australia', 'Brazil', 'China', 'France', 'India',
                             'Japan', 'Korea', 'Spain', 'UK', 'US'
                         ])
    test1.to_csv('data/test1_country_predictions.csv')
    print('test1 predictions saved')

    test2 = multilabel_binarizer(
        df2multilabels(df_predictions(test2_docs, clean_texts_2)))
    test2 = pd.DataFrame(test2,
                         columns=[
                             'Australia', 'Brazil', 'China', 'France', 'India',
                             'Japan', 'Korea', 'Spain', 'UK', 'US'
                         ])
    test2.to_csv('data/test2_country_predictions.csv')
    print('test2 predictions saved')

    test3 = multilabel_binarizer(
        df2multilabels(df_predictions(test3_docs, clean_texts_3)))
    # pad a zero column at position 5 ('Japan'), presumably for a label absent from test3's predictions
    test3 = np.insert(test3, 5, 0, axis=1)
    test3 = pd.DataFrame(test3,
                         columns=[
                             'Australia', 'Brazil', 'China', 'France', 'India',
                             'Japan', 'Korea', 'Spain', 'UK', 'US'
                         ])
    test3.to_csv('data/test3_country_predictions.csv')
    print('test3 predictions saved')
Example #4
File: nlp.py Project: 18F/FAC-Distiller
def setup():
    nlp = spacy.load("en_core_web_sm")  # or 'en'
    ruler = EntityRuler(nlp, overwrite_ents=True)
    sentencizer = nlp.create_pipe("sentencizer")
    ruler.add_patterns(patterns)
    nlp.add_pipe(sentencizer, first=True)
    # this second first=True call prepends again, so expand_audit_numbers runs before the sentencizer
    nlp.add_pipe(expand_audit_numbers, first=True)
    nlp.add_pipe(ruler)
    return nlp
Example #5
def test_entity_ruler_existing_bytes_old_format_safe(patterns, en_vocab):
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
    bytes_old_style = srsly.msgpack_dumps(ruler.patterns)
    new_ruler = EntityRuler(nlp)
    new_ruler = new_ruler.from_bytes(bytes_old_style)
    assert len(new_ruler) == len(ruler)
    assert new_ruler.patterns == ruler.patterns
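    # the old byte format stores only the patterns, so cfg such as overwrite falls back to the default on load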
    assert new_ruler.overwrite is not ruler.overwrite
Example #6
 def add_entity_ruler(self, patterns):
     """
     Add an entity ruler to the nlp pipeline.
     Official docs: https://spacy.io/api/entityruler
     :param patterns: list or list of lists of token/phrase based patterns
     """
     ruler = EntityRuler(self._nlp)
     ruler.add_patterns(patterns)
     self._nlp.add_pipe(ruler)
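The "token/phrase based patterns" mentioned in the docstring come in the two shapes the EntityRuler accepts; a small illustration:

patterns = [
    # phrase pattern: an exact string, matched after tokenization
    {"label": "ORG", "pattern": "Apple"},
    # token pattern: a list of per-token attribute dicts
    {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}]},
]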
Example #7
 def add_ruler(entity_name, entity_arr):
     ruler = EntityRuler(nlp, overwrite_ents=True)
     for d in entity_arr:
         ruler.add_patterns([{
             "label": str(entity_name),
             "pattern": str(d)
         }])
     ruler.name = str(entity_name)
     print('adding ent ', str(entity_name))
     return ruler
Example #8
    def __init__(self, nlp, name: str, patterns: Sequence[dict]):
        """
        Initialise the PatternMatcher.

        Args:
            nlp : Spacy model
            patterns (Sequence[dict]): for the EntityRuler. See https://spacy.io/usage/rule-based-matching#entityruler
        """
        self.ruler = EntityRuler(nlp)
        self.ruler.add_patterns(patterns)
Example #9
class SpatialRecognizer(object):

    name = "spatial_recognizer"

    def __init__(self, nlp, label="SPATIAL_RE", **kwargs):
        self.label = nlp.vocab.strings[label]
        self.nlp = nlp
        self.ruler = EntityRuler(nlp, overwrite_ents=False, validate=True)
        self.is_debug = kwargs.get("is_debug")
        try:
            self.ruler = self.ruler.from_disk("../_data/spatial-re-patterns.jsonl")
            self._add_ruler_to_pipeline(nlp, self.ruler)
        except ValueError as ve:
            if self.is_debug:
                logger.error(f"{ve}: Ensure patterns file is added.")
            self._add_patterns()
        if self.is_debug:
            logger.debug(f"Pipeline -> {nlp.pipe_names}")

    def __call__(self, doc):
        return doc

    def construct_patterns(self):
        """Load patterns ad hoc if loading the .jsonl file fails"""
        obj_patterns = [
            {"label": "SPATIAL_RE",
             "pattern": [{"LOWER": "that"}, {"TEXT": {"IN": ["is", "are"]}}, {"LOWER": "behind"}, {"LOWER": "the"}]},
            {"label": "SPATIAL_RE", "pattern": [{"LOWER": "behind"}, {"LOWER": "the"}]},
            {"label": "SPATIAL_RE",
             "pattern": [{"LOWER": "that"}, {"TEXT": {"IN": ["is", "are"]}}, {"LOWER": "in"}, {"LOWER": "front"}]},
            {"label": "SPATIAL_RE",
             "pattern": [{"LOWER": "that"}, {"TEXT": {"IN": ["is", "are"]}}, {"TEXT": {"IN": ["in", "on"]}},
                         {"LOWER": "the"}, {"TEXT": {"IN": ["left", "right"]}}]},
            {"label": "SPATIAL_RE", "pattern": [{"LOWER": "to"}, {"LOWER": "the"}, {"TEXT": {"IN": ["right", "left"]}},
                                                {"TEXT": {"IN": ["", "of"]}}]},
            {"label": "SPATIAL_RE", "pattern": [{"TEXT": {"IN": ["above", "below"]}}, {"LOWER": "the"}]},
            {"label": "SPATIAL_RE", "pattern": [{"TEXT": {"IN": ["left", "right", "front", "behind", "above", "below"] }}, ]}
        ]

        return obj_patterns

    def _add_patterns(self):
        patterns = self.construct_patterns()
        #other_pipes = [p for p in self.nlp.pipe_names if p != "tagger"] # exclude tagger
        other_pipes = self.nlp.pipe_names
        with self.nlp.disable_pipes(*other_pipes):
            self.ruler.add_patterns(patterns)
        #self.ruler.add_patterns(patterns)
        self._add_ruler_to_pipeline(self.nlp, self.ruler)

    def _add_ruler_to_pipeline(self, nlp, ruler, name="spatial_entity_ruler"):
        if nlp.has_pipe(name):
            nlp.replace_pipe(name, self.ruler)
        else:
            nlp.add_pipe(self.ruler, name=name, last=True)
Example #10
def extract_entities(text, output=None):
    # avoid a shared mutable default argument
    if output is None:
        output = {}
    nlp = English()
    ruler = EntityRuler(nlp)
    ruler.add_patterns(patterns.patterns + patterns.section_patterns)
    nlp.add_pipe(ruler)
    doc = nlp(text)

    logger.debug('TOKENS: {}'.format('\n'.join(['{} {}'.format(i, t.text) for i, t in enumerate(doc)])))
    logger.debug('ENTITIES: {}'.format('\n'.join(['{}:{}'.format(ent.label_, ent.text) for ent in doc.ents])))

    return update_output(doc, output)
Example #11
def test_entity_ruler_serialize_bytes(nlp, patterns):
    ruler = EntityRuler(nlp, patterns=patterns)
    assert len(ruler) == len(patterns)
    assert len(ruler.labels) == 3
    ruler_bytes = ruler.to_bytes()
    new_ruler = EntityRuler(nlp)
    assert len(new_ruler) == 0
    assert len(new_ruler.labels) == 0
    new_ruler = new_ruler.from_bytes(ruler_bytes)
    assert len(new_ruler) == len(patterns)
    assert len(new_ruler.labels) == 3
Example #12
def test_entity_ruler_from_disk_old_format_safe(patterns, en_vocab):
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
    with make_tempdir() as tmpdir:
        out_file = tmpdir / "entity_ruler"
        srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns)
        new_ruler = EntityRuler(nlp).from_disk(out_file)
        for pattern in ruler.patterns:
            assert pattern in new_ruler.patterns
        assert len(new_ruler) == len(ruler)
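        # a plain .jsonl file carries only the patterns, so overwrite reverts to its default when loaded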
        assert new_ruler.overwrite is not ruler.overwrite
Example #13
    def __init__(self, nlp: Language, data_path: Path = Path("data")):
        self.nlp = nlp
        self.data_path = data_path
        self.skills = self._get_skills()

        patterns = self._build_patterns(self.skills)
        extra_patterns = self._get_extra_skill_patterns()
        ruler = EntityRuler(nlp, overwrite_ents=True)
        ruler.add_patterns(itertools.chain(patterns, extra_patterns))
        if not self.nlp.has_pipe("skills_ruler"):
            self.nlp.add_pipe(ruler, name="skills_ruler")
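One plausible shape for the _build_patterns helper called above (hypothetical; the real implementation is not shown in this snippet):

    def _build_patterns(self, skills):
        # one phrase pattern per known skill (hypothetical sketch)
        return [{"label": "SKILL", "pattern": skill} for skill in skills]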
Example #14
    def add_ruler(self, patterns: Union[List[RulePattern], RulePattern], before: str = "ner") -> 'ConvenientSpacy':
        if isinstance(patterns, RulePattern):
            patterns = [patterns]

        ruler = EntityRuler(self.nlp)
        for pattern in patterns:
            ruler.add_patterns(pattern.asdict)
        self.nlp.add_pipe(ruler, before=before)

        return self
Example #15
def test_entity_ruler_init(nlp, patterns):
    ruler = EntityRuler(nlp, patterns=patterns)
    assert len(ruler) == len(patterns)
    assert len(ruler.labels) == 4
    assert "HELLO" in ruler
    assert "BYE" in ruler
    ruler = nlp.add_pipe("entity_ruler")
    ruler.add_patterns(patterns)
    doc = nlp("hello world bye bye")
    assert len(doc.ents) == 2
    assert doc.ents[0].label_ == "HELLO"
    assert doc.ents[1].label_ == "BYE"
Example #16
def alsoknownas(sentence):

    from spacy.pipeline import EntityRuler
    nlp = spacy.load('en_core_web_sm', disable=['ner'])
    rulerAlKnAs = EntityRuler(nlp, overwrite_ents=True)

    answer = sentence
    answer = answer.translate(str.maketrans('', '', string.punctuation))

    aka_patterns = ['known as', 'nicknamed', 'known mononymously as',
                    'known professionally as']

    str1 = ""
    str2 = ""
    str3 = ""
    str4 = ""
    label = ""
    for aka in aka_patterns:
        if aka in answer:
            a = answer.split(aka, 1)[-1]
            name = a.split()[0]
            surname1 = a.split()[1]
            surname2 = a.split()[2]
            surname3 = a.split()[3]
            str1 = name
            str2 = name + " " + surname1
            str3 = name + " " + surname1 + " " + surname2
            str4 = name + " " + surname1 + " " + surname2 + " " + surname3

    tokens = nltk.word_tokenize(str4)
    pos = nltk.pos_tag(tokens)

    if (pos[0][1] in {"NNP", "NN"} and pos[1][1] not in {"NNP", "NN"}):
        label = str1
    if (pos[0][1] in {"NNP", "NN"} and pos[1][1] in {"NNP", "NN"}
            and pos[2][1] not in {"NNP", "NN"}):
        label = str2
    if (pos[0][1] in {"NNP", "NN"} and pos[1][1] in {"NNP", "NN"}
            and pos[2][1] in {"NNP", "NN"} and pos[3][1] not in {"NNP", "NN"}):
        label = str3
    if (pos[0][1] in {"NNP", "NN"} and pos[1][1] in {"NNP", "NN"}
            and pos[2][1] in {"NNP", "NN"} and pos[3][1] in {"NNP", "NN"}):
        label = str4

    for aka in aka_patterns:
        rulerAlKnAs.add_patterns([{"label": label, "pattern": aka}])
    rulerAlKnAs.name = 'rulerAlKnAs'
    nlp.add_pipe(rulerAlKnAs)
    doc = nlp(answer)
    for ent in doc.ents:
        return ent.label_
Example #17
def test_entity_ruler_multiprocessing(nlp, n_process):
    ruler = EntityRuler(nlp)
    texts = ["I enjoy eating Pizza Hut pizza."]

    patterns = [{"label": "FASTFOOD", "pattern": "Pizza Hut", "id": "1234"}]

    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)

    for doc in nlp.pipe(texts, n_process=n_process):
        for ent in doc.ents:
            assert ent.ent_id_ == "1234"
Example #18
def add_hard_coded_entities(nlp, filename):
    with open(filename) as neaF:
        named_entity_additions = json.load(neaF)
    ner_additions = EntityRuler(nlp, overwrite_ents=True, phrase_matcher_attr="LOWER")
    for key, value in named_entity_additions.items():
        patterns = [{"label": key, "pattern": token} for token in value]  # create the pattern dictionary
        ner_additions.add_patterns(patterns)
    base = ['tagger', "parser", "ner"]
    for pipe in list(set(nlp.pipe_names) - set(base)):
        nlp.remove_pipe(pipe)
    nlp.add_pipe(ner_additions, after="ner")
    return nlp
Example #19
 def add_name_rules_to_nlp(self, parsed_appointments):
     ruler = EntityRuler(self.instance)
     patterns = []
     for appointment in parsed_appointments:
         pattern = {}
         visitor = appointment.visitor
         pattern['label'] = "PER"
         pattern['pattern'] = []
         for part in visitor:
             pattern['pattern'].append({"lower": part})
         patterns.append(pattern)
     ruler.add_patterns(patterns)
     self.instance.add_pipe(ruler)
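For a visitor such as ["jane", "doe"], the loop above produces a pattern of this shape (illustrative):

{"label": "PER", "pattern": [{"lower": "jane"}, {"lower": "doe"}]}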
Example #20
    def register_recognizer(self, recognizer_cls: Type[EntityRulerRecognizer]):
        recognizer = recognizer_cls()

        recognizer_name = recognizer_cls.__name__
        ruler = EntityRuler(self.nlp)
        self.nlp.add_pipe(ruler, recognizer_name)
        rules = [{"label": recognizer.TAG, "pattern": pattern} for pattern in recognizer.patterns]
        ruler.add_patterns(rules)
        self.nlp.add_pipe(
            set_spacy_entity_extension_attributes(recognizer.SCORE, recognizer_name),
            name="label_" + recognizer_name,
            after=recognizer_name,
        )
Example #21
def extract_sections(text, output=None):
    # avoid a shared mutable default argument
    if output is None:
        output = {}
    nlp = English()
    ruler = EntityRuler(nlp)
    ruler.add_patterns(patterns.section_patterns)
    nlp.add_pipe(ruler)
    nlp.add_pipe(expand_sections)
    doc = nlp(text)

    logger.debug('SESSION TOKENS: {}'.format(
        '\n'.join(['{} {}'.format(i, t.text) for i, t in enumerate(doc)])))
    logger.debug('SESSION ENTITIES: {}'.format(
        '\n'.join(['{}:{}:{}:{}'.format(ent.label_, ent.text.strip(), ent.start, ent.end) for ent in doc.ents])))

    return update_output(doc, output)
Example #22
    def train_entity_ruler(self) -> None:
        """Initializes spaCy's EntityRuler and sets self.entity_model.

        In addition, updates the EntityRuler with the available training data.
        """

        nlp = spacy.load("en_core_web_sm")

        ruler = EntityRuler(nlp)
        patterns = self.generate_entity_train_data()
        ruler.add_patterns(patterns)
        nlp.add_pipe(ruler)

        self.entity_model = nlp
Example #24
def my_nlp(model):
    nlp = spacy.load(model)
    # `reg` and `patterns` are module-level globals defined elsewhere in the project
    list_infixes_defaults = list(nlp.Defaults.infixes)
    if reg in list_infixes_defaults:
        list_infixes_defaults.remove(reg)
    # modify the tokenizer's infix patterns so e.g. "12+34" splits into separate tokens
    infixes = list_infixes_defaults + [r"(?<=[0-9])[\+\*^](?=[0-9-])"]
    infix_re = compile_infix_regex(infixes)

    nlp.tokenizer.infix_finditer = infix_re.finditer
    ruler = EntityRuler(nlp)
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler, before='ner')
    return nlp
Example #25
    def fijar_pipes(self):
        """Fija componentes adicionales del Pipeline de procesamiento."""
        if not self.lang.has_pipe("cumplimiento"):
            self.lang.add_pipe(self.cumplimiento, name="cumplimiento", last=True)

        if self.grupos:
            if not self.lang.has_pipe("presencia"):
                self.lang.add_pipe(self.presencia, name="presencia", last=True)

        if self.entes:
            if not self.lang.has_pipe("entes"):
                ruler = EntityRuler(self.lang, phrase_matcher_attr="LOWER")
                ruler.add_patterns(self.crear_patrones())

                self.lang.add_pipe(ruler, name="entes", before="ner")
Example #26
def pattern_matcher(nlp, name: str, patterns: Sequence[dict]):
    """
    Create an EntityRuler object loaded with a list of patterns.

    Args:
        nlp : Spacy model
        patterns (Sequence[dict]): for the EntityRuler. See https://spacy.io/usage/rule-based-matching#entityruler

    Returns:
        Spacy EntityRuler component
    """
    ruler = EntityRuler(nlp)
    ruler.add_patterns(patterns)

    return ruler
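A brief usage sketch (assuming a spaCy v2-style add_pipe; the "Acme" pattern and component name are placeholders):

import spacy

nlp = spacy.load("en_core_web_sm")
ruler = pattern_matcher(nlp, "acme_matcher", [{"label": "ORG", "pattern": "Acme"}])
nlp.add_pipe(ruler, before="ner")
doc = nlp("We partnered with Acme last year.")
print([(ent.text, ent.label_) for ent in doc.ents])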
Example #27
def test_entity_ruler_in_pipeline_from_issue(patterns, en_vocab):
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, overwrite_ents=True)

    ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
    nlp.add_pipe(ruler)
    with make_tempdir() as tmpdir:
        nlp.to_disk(tmpdir)
        ruler = nlp.get_pipe("entity_ruler")
        assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
        assert ruler.overwrite is True
        nlp2 = load(tmpdir)
        new_ruler = nlp2.get_pipe("entity_ruler")
        assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
        assert new_ruler.overwrite is True
Example #28
def start():
    print("Adding entity rules please wait...")
    file_terms = files.get_file_terms()
    dir_terms = files.get_dir_terms()
    ruler = EntityRuler(nlp, overwrite_ents=True)
    # list of words reserved for commands
    reserved_words = ["find", "copy", "open", "move"]
    _patterns = get_file_patterns(file_terms, reserved_words)
    _patterns.extend(get_dir_patterns(dir_terms, reserved_words))
    ruler.add_patterns(_patterns)
    nlp.add_pipe(ruler)
    main()
Example #29
def test_entity_ruler_overlapping_spans(nlp):
    ruler = EntityRuler(nlp)
    patterns = [
        {
            "label": "FOOBAR",
            "pattern": "foo bar"
        },
        {
            "label": "BARBAZ",
            "pattern": "bar baz"
        },
    ]
    ruler.add_patterns(patterns)
    doc = ruler(nlp.make_doc("foo bar baz"))
    assert len(doc.ents) == 1
    assert doc.ents[0].label_ == "FOOBAR"
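The single surviving entity is no accident: when matches overlap, the EntityRuler prefers the span covering more tokens and, on ties, the one occurring first, the same tie-break spacy.util.filter_spans applies. A quick illustration of that rule (illustrative sketch):

import spacy
from spacy.util import filter_spans

nlp = spacy.blank("en")
doc = nlp("foo bar baz")
spans = [doc[0:2], doc[1:3]]  # "foo bar" and "bar baz" overlap on "bar"
print(filter_spans(spans))    # [foo bar] -- equal lengths, so the earlier span wins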
Example #30
def run_cd2h(input_dir, output_dir, single_file_name):
    """
        Process a set of input files per CD2H and the 2014 I2b2 challange and output results that can be evaluated

        :param input_dir:
        :param output_dir:
        :return:
        """
    # Set up NLP
    nlp = spacy.load("en_core_web_md")

    # This breaks up words on dash slash and periods ( from end of sentences ) for better parsing
    infix_re = re.compile(r'''[-/,.]''')
    nlp.tokenizer = custom_tokenizer(nlp, infix_re)

    ruler = EntityRuler(nlp, overwrite_ents=True,
                        validate=True).from_disk("./spacy_patterns.jsonl")
    nlp.add_pipe(ruler)
    # Dir of XML to process
    if single_file_name:
        entry = Path(f"{output_dir}/{ single_file_name}")
        token_list = process_xml_file(entry, output_dir, nlp)
        for token in token_list:
            print("TOKEN " + token)
        create_output_cd2h(output_dir, entry, token_list)
    else:
        entries = os.scandir(input_dir)
        for entry in entries:
            print(f"Processing input file : {entry.path} {entry.name}")
            token_list = process_xml_file(entry, output_dir, nlp)
            for token in token_list:
                print("TOKEN " + token)
            create_output_cd2h(output_dir, entry, token_list)

    sys.exit(0)
Example #31
def test_issue7():
    nlp = spacy.load("en_core_web_sm")
    negex = Negex(nlp)
    nlp.add_pipe(negex, last=True)
    ruler = EntityRuler(nlp)
    patterns = [{"label": "SOFTWARE", "pattern": "spacy"}]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)
    doc = nlp("fgfgdghgdh")