class TechCompanyRecognizer(object):
    """Example of a spaCy v2.0 pipeline component that sets entity annotations
    based on list of single or multiple-word company names. Companies are
    labelled as ORG and their spans are merged into one token. Additionally,
    ._.has_tech_org and ._.is_tech_org is set on the Doc/Span and Token
    respectively."""
    name = 'tech_companies'  # component name, will show up in the pipeline

    def __init__(self, nlp, companies=tuple(), label='ORG'):
        """Initialise the pipeline component. The shared nlp instance is used
        to initialise the matcher with the shared vocab, get the label ID and
        generate Doc objects as phrase match patterns.
        """
        self.label = nlp.vocab.strings[label]  # get entity label ID

        # Set up the PhraseMatcher – it can now take Doc objects as patterns,
        # so even if the list of companies is long, it's very efficient
        patterns = [nlp(org) for org in companies]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add('TECH_ORGS', None, *patterns)

        # Register attribute on the Token. We'll be overwriting this based on
        # the matches, so we're only setting a default value, not a getter.
        Token.set_extension('is_tech_org', default=False)

        # Register attributes on Doc and Span via a getter that checks if one of
        # the contained tokens is set to is_tech_org == True.
        Doc.set_extension('has_tech_org', getter=self.has_tech_org)
        Span.set_extension('has_tech_org', getter=self.has_tech_org)

    def __call__(self, doc):
        """Apply the pipeline component on a Doc object and modify it if matches
        are found. Return the Doc, so it can be processed by the next component
        in the pipeline, if available.
        """
        matches = self.matcher(doc)
        spans = []  # keep the spans for later so we can merge them afterwards
        for _, start, end in matches:
            # Generate Span representing the entity & set label
            entity = Span(doc, start, end, label=self.label)
            spans.append(entity)
            # Set custom attribute on each token of the entity
            for token in entity:
                token._.set('is_tech_org', True)
            # Overwrite doc.ents and add entity – be careful not to replace!
            doc.ents = list(doc.ents) + [entity]
        for span in spans:
            # Iterate over all spans and merge them into one token. This is done
            # after setting the entities – otherwise, it would cause mismatched
            # indices!
            span.merge()
        return doc  # don't forget to return the Doc!

    def has_tech_org(self, tokens):
        """Getter for Doc and Span attributes. Returns True if one of the tokens
        is a tech org. Since the getter is only called when we access the
        attribute, we can refer to the Token's 'is_tech_org' attribute here,
        which is already set in the processing step."""
        return any([t._.get('is_tech_org') for t in tokens])
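
# A minimal usage sketch, assuming spaCy v2.x (the class uses the v2 matcher.add API and
# span.merge()) and the PhraseMatcher/Doc/Span/Token imports the class relies on. The
# company names are hypothetical placeholders.
from spacy.lang.en import English

nlp = English()  # blank pipeline, so doc.ents starts empty and nothing conflicts
component = TechCompanyRecognizer(nlp, companies=['Alphabet Inc.', 'Google', 'Netflix'])
nlp.add_pipe(component, last=True)
doc = nlp("Alphabet Inc. is the company behind Google.")
print(doc._.has_tech_org)                             # True
print([(ent.text, ent.label_) for ent in doc.ents])   # merged ORG spans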
Example #2
def test_issue3248_1():
    """Test that the PhraseMatcher correctly reports its number of rules, not
    total number of patterns."""
    nlp = English()
    matcher = PhraseMatcher(nlp.vocab)
    matcher.add("TEST1", None, nlp("a"), nlp("b"), nlp("c"))
    matcher.add("TEST2", None, nlp("d"))
    assert len(matcher) == 2
Example #3
def test_issue3248_2():
    """Test that the PhraseMatcher can be pickled correctly."""
    nlp = English()
    matcher = PhraseMatcher(nlp.vocab)
    matcher.add("TEST1", None, nlp("a"), nlp("b"), nlp("c"))
    matcher.add("TEST2", None, nlp("d"))
    data = pickle.dumps(matcher)
    new_matcher = pickle.loads(data)
    assert len(new_matcher) == len(matcher)
Example #4
def get_matches(tokenizer, phrases, texts, max_length=6):
    matcher = PhraseMatcher(tokenizer.vocab, max_length=max_length)
    matcher.add('Phrase', None, *phrases)
    for text in texts:
        doc = tokenizer(text)
        for w in doc:
            _ = doc.vocab[w.text]
        matches = matcher(doc)
        for ent_id, start, end in matches:
            yield (ent_id, doc[start:end].text)
Example #5
def test_issue3331(en_vocab):
    """Test that duplicate patterns for different rules result in multiple
    matches, one per rule.
    """
    matcher = PhraseMatcher(en_vocab)
    matcher.add("A", None, Doc(en_vocab, words=["Barack", "Obama"]))
    matcher.add("B", None, Doc(en_vocab, words=["Barack", "Obama"]))
    doc = Doc(en_vocab, words=["Barack", "Obama", "lifts", "America"])
    matches = matcher(doc)
    assert len(matches) == 2
    match_ids = [en_vocab.strings[matches[0][0]], en_vocab.strings[matches[1][0]]]
    assert sorted(match_ids) == ["A", "B"]
class AdversativeConnectivesTagger:
    '''
    This tagger finds all adversative connectives in a document. It must run after the 'Tagger' pipeline component.
    '''
    name = 'adversative connective tagger'

    def __init__(self, nlp, language: str='es') -> None:
        '''
        This constructor will initialize the object that tags adversative connectives.

        Parameters:
        nlp: The Spacy model to use this tagger with.
        language: The language that this pipeline will be used in.

        Returns:
        None.
        '''
        if language not in ACCEPTED_LANGUAGES:
            raise ValueError(f'Language {language} is not supported yet')

        self._language = language
        self._matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
        self._connectives = []
        if language == 'es': # Adversative connectives for spanish
            self._connectives = ['pero', 'sino', 'no obstante', 'sino que', 'sin embargo', 'pero sí', 'aunque', 'menos', 'solo', 'excepto', 'salvo', 'más que', 'en cambio', 'ahora bien', 'más bien']
        else: # Support for future languages
            pass

        for con in self._connectives:
            self._matcher.add(con, None, nlp(con))
        

    def __call__(self, doc: Doc) -> Doc:
        '''
        This method will find all adversative connectives and store them in an iterable.

        Parameters:
        doc(Doc): A Spacy document.
        '''
        matches = self._matcher(doc)
        adversative_connectives_spans = [doc[start:end] for _, start, end in matches]

        # Save the adversative connectives found
        doc._.adversative_connectives_span_indices = [
            {'start': span.start, 'end': span.end, 'label': span.label}
            for span in filter_spans(adversative_connectives_spans)
        ]
        
        return doc
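
# A minimal usage sketch, assuming spaCy v2.x, an installed es_core_news_sm model,
# that ACCEPTED_LANGUAGES contains 'es', that filter_spans is spacy.util.filter_spans,
# and that the custom Doc extension used in __call__ is registered as shown here.
import spacy
from spacy.tokens import Doc

Doc.set_extension('adversative_connectives_span_indices', default=[], force=True)
nlp = spacy.load('es_core_news_sm')
tagger = AdversativeConnectivesTagger(nlp, language='es')
nlp.add_pipe(tagger, after='tagger')  # the docstring requires it to run after the tagger
doc = nlp('Quería salir, pero estaba cansado; sin embargo, lo intentó.')
print(doc._.adversative_connectives_span_indices)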
Example #7
    def _match_no_category(
        self,
        list_of_tags: List[AnyStr],
        list_of_keywords: List[AnyStr],
    ) -> None:
        """Tokenize keywords for every language. Instantiate a PhraseMatcher with the associated tags."""
        for language in self.tokenizer.spacy_nlp_dict:
            patterns = self._tokenize_keywords(language, list_of_tags,
                                               list_of_keywords)
            self.tokenizer.spacy_nlp_dict[language].remove_pipe("sentencizer")
            matcher = PhraseMatcher(
                self.tokenizer.spacy_nlp_dict[language].vocab,
                attr=get_phrase_matcher_attr(self.lemmatization),
            )
            matcher.add("PatternList", patterns)
            self._matcher_dict[language] = matcher
Example #8
def matcherR():
    nlp = spacy.load('en')
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    else:
        ner = nlp.get_pipe('ner')
    label = 'GEOLOC'
    matcher = PhraseMatcher(nlp.vocab)
    with open('onlylocations.csv', 'r') as text:
        for line in text:
            matcher.add(label, None, nlp(line.strip()))  # strip the newline before tokenizing
    one = nlp('Chennai has been affected severely because of the tsunami')
    matches = matcher(one)
    print(matches)
Example #9
def string_to_nlp(s):
    matcher = PhraseMatcher(nlp.vocab)
    states = [
        'Andhra Pradesh', 'Arunachal Pradesh', 'Assam', 'Bihar',
        'Chhattisgarh', 'Goa', 'Gujarat', 'Haryana', 'Himachal Pradesh',
        'Jharkhand', 'Karnataka', 'Kerala', 'Madhya Pradesh', 'Maharashtra',
        'Manipur', 'Meghalaya', 'Mizoram', 'Nagaland', 'Odisha', 'Punjab',
        'Rajasthan', 'Sikkim', 'Tamil Nadu', 'Telangana', 'Tripura',
        'Uttar Pradesh', 'Uttarakhand', 'West Bengal', 'Andaman and Nicobar',
        'Chandigarh', 'Dadra and Nagar Haveli', 'Daman and Diu', 'Delhi',
        'Jammu and Kashmir', 'Ladakh', 'Lakshadweep', 'Puducherry'
    ]
    # Only run nlp.make_doc to speed things up
    patterns = [nlp.make_doc(text) for text in states]
    matcher.add("Indian State", None, *patterns)
    return nlp(s)
Example #10
def lexical_dangerous(nlp_doc):
    strArr = []
    terms = [
        "all", "each", "every", "any", "few", "little", "many", "much",
        "several", "some", "a lot"
    ]
    lexical_dangerous_plural = [nlp(text) for text in terms]
    doc = nlp(str(nlp_doc))
    phrase_matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
    phrase_matcher.add("lexicalplural", None, *lexical_dangerous_plural)
    matches = phrase_matcher(doc)

    for match_id, start, end in matches:
        span = doc[start:end]
        strArr.append(span.text)
    return strArr
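
# A minimal usage sketch, assuming spaCy v2.x (the add call above uses the v2 API) and
# that the module-level `nlp` these helpers rely on is defined as shown here.
import spacy

nlp = spacy.load('en_core_web_sm')
requirement = nlp('The system shall log all events and notify several operators.')
print(lexical_dangerous(requirement))   # e.g. ['all', 'several']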
Example #11
File: text.py Project: tanghyd/capstone
def create_phrase_matcher(phrases, nlp=None, label="TRIGGER"):
    if nlp == None:
        nlp = load_spacy_model(output_type='doc',
                               tokenizer_only=True,
                               verbose=False)

    # Create phrase matcher
    from spacy.matcher import PhraseMatcher

    # creates a phrase matcher using nlp model's vocabulary, matching on the LOWER attribute
    matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
    patterns = list(nlp.tokenizer.pipe(phrases))
    matcher.add(label, None,
                *patterns)  # If pattern matches, return match label as label

    return matcher
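
# A minimal usage sketch, assuming spaCy v2.x: passing an explicit nlp object sidesteps
# the project-specific load_spacy_model helper, and the phrases below are made up.
# Matching is case-insensitive because the matcher uses attr='LOWER'.
import spacy

nlp = spacy.load('en_core_web_sm')
matcher = create_phrase_matcher(['gas leak', 'pipeline rupture'], nlp=nlp, label='TRIGGER')
doc = nlp('The report describes a Gas Leak near the compressor station.')
print([(nlp.vocab.strings[match_id], doc[start:end].text)
       for match_id, start, end in matcher(doc)])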
Example #12
def is_continuance_word(nlp_doc):
    strArr = []
    terms = [
        "below:", "as follows:", "following:", "listed:", "in particular:",
        "support:", " and ", ":"
    ]
    continuance = [nlp(text) for text in terms]
    doc = nlp(str(nlp_doc))
    phrase_matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
    phrase_matcher.add("continuance", None, *continuance)
    matches = phrase_matcher(doc)

    for match_id, start, end in matches:
        span = doc[start:end]
        strArr.append(span.text)
    return strArr
    def setUp(self):

        # Keywords
        nlp = spacy.load("en_core_web_sm")
        keywords = ['hello world', 'Sparty', 'A. thaliana', 'protein 5']
        patterns = [nlp.make_doc(keyword) for keyword in keywords]

        # Text
        txt = ('Hello world, my name is Sparty. My research is about '
               'A. thaliana protein 5. Hello.')
        self.doc = nlp(txt)

        # Matcher
        matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
        matcher.add("Keywords", patterns)
        self.matches = matcher(self.doc)
Example #14
    def return_people(self):

        nlp = spacy.load("en_core_web_sm")

        doc = nlp(self.text)

        people = ["Doctor Kimble", "Postman", "Deli Man", "Plumber"]
        people_patterns = list(nlp.pipe(people))
        peopleMatcher = PhraseMatcher(nlp.vocab)
        peopleMatcher.add("PEOPLE", [*people_patterns])

        for match_id, start, end in peopleMatcher(doc):

            peopleSpan = Span(doc, start, end, label="PEOPLE")

            return peopleSpan
Example #15
class EntityMatcher(object):
    name = 'entity_matcher'

    def __init__(self, nlp, terms, label):
        patterns = [nlp(term) for term in terms]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add(label, None, *patterns)

    def __call__(self, doc):
        matches = self.matcher(doc)
        spans = []
        for label, start, end in matches:
            span = Span(doc, start, end, label=label)
            spans.append(span)
        doc.ents = spans
        return doc
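
# A minimal usage sketch, assuming spaCy v2.x (the add call above uses the v2 API); the
# terms and text are made up. Note that this component replaces doc.ents entirely with
# the matched spans.
from spacy.lang.en import English

nlp = English()
nlp.add_pipe(EntityMatcher(nlp, ['machine learning', 'neural network'], 'TECH'))
doc = nlp('She applied machine learning to train a neural network.')
print([(ent.text, ent.label_) for ent in doc.ents])   # [('machine learning', 'TECH'), ('neural network', 'TECH')]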
Example #16
def find_associated(word, docs):
    """Returns a list of words that are associated with a word in sentences belonging to texts of a corpus"""
    associated = []
    matcher = PhraseMatcher(nlp.vocab)
    pattern = nlp(word)
    matcher.add('TerminologyList', None, pattern)  # pass the whole Doc as the pattern, not its unpacked tokens

    for iD in docs:
        texts = nlp(docs[iD])
        for sent in texts.sents:
            sent = sent.as_doc()
            matches = matcher(sent)
            if len(matches) > 0 and sent[matches[0][1]+1].head.text not in associated:
                associated.append(sent[matches[0][1]+1].head.text)
    return associated
Example #17
def sentence_Incompletes(nlp_doc):
    strArr = []
    terms = [
        "TBD", "TBS", "TBE", "TBC", "TBR", "not defined", "not determined",
        "but not limited to", "as a minimum"
    ]
    incompletes = [nlp(text) for text in terms]
    doc = nlp(str(nlp_doc))
    phrase_matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
    phrase_matcher.add("Incompletes", None, *incompletes)
    matches = phrase_matcher(doc)

    for match_id, start, end in matches:
        span = doc[start:end]
        strArr.append(span.text)
    return strArr
Example #18
    def return_rooms(self):

        nlp = spacy.load("en_core_web_sm")

        doc = nlp(self.text)

        rooms = ["kitchen", "bedroom", "bathroom", "hallway", "living room"]
        room_patterns = list(nlp.pipe(rooms))
        roomMatcher = PhraseMatcher(nlp.vocab)
        roomMatcher.add("ROOM", [*room_patterns])

        for match_id, start, end in roomMatcher(doc):
            # Create a Span with the label for "GPE"
            roomSpan = Span(doc, start, end, label="ROOM")

            return roomSpan
Example #19
def main():
    nlp = spacy.load('en_core_web_sm')
    matcher = PhraseMatcher(nlp.vocab)

    pattern1 = nlp('Golden Retriever')
    pattern2 = nlp('Chicken')
    matcher.add('DOG', None, pattern1)
    matcher.add('FOWL', None, pattern2)
    doc = nlp(
        "I have a Golden Chicken and a Golden Retrievable Fish and a Golden Retriever"
    )

    # Iterate over the matches
    for match_id, start, end in matcher(doc):
        # Get the matched span
        span = doc[start:end]
        print('Matched span:', span.text)
Example #20
def return_people():

    people = ["Doctor Kimble", "Postman", "Deli Man", "Plumber"]
    people_patterns = list(nlp.pipe(people))
    peopleMatcher = PhraseMatcher(nlp.vocab)
    peopleMatcher.add("PEOPLE", None, *people_patterns)

    for match_id, start, end in peopleMatcher(doc):
        # Create a Span with the label for "GPE"
        peopleSpan = Span(doc, start, end, label="PEOPLE")
        #peopleSpan = Span(doc, start, end, label="PEOPLE")
        #objectSpan = Span(doc, start, end, label="OBJECTS")
        # Overwrite the doc.ents and add the span
        # doc.ents = list(doc.ents) + [roomSpan] + [peopleSpan] + [objectSpan]

        # Print the text of the span root's head token and the span text
        print(peopleSpan.text)
Example #21
def test_span_v_doc_in_phrasematcher(en_vocab):
    """Ensure that PhraseMatcher only returns matches in input Span and not in entire Doc"""
    doc = Doc(en_vocab,
              words=[
                  "I", "like", "Spans", "and", "Docs", "in", "my", "input",
                  ",", "Spans", "and", "Docs", "in", "my", "matchers", ",",
                  "and", "Spans", "and", "Docs", "everywhere", "."
              ])
    span = doc[9:15]  # second clause
    pattern = Doc(en_vocab, words=["Spans", "and", "Docs"])
    matcher = PhraseMatcher(en_vocab)
    matcher.add("SPACY", [pattern])
    matches_doc = matcher(doc)
    matches_span = matcher(span)
    assert len(matches_doc) == 3
    assert len(matches_span) == 1
Example #22
def return_objects():

    objects = ["can of coke", "coke can", "wallet", "purse", "bottle of water"]
    object_patterns = list(nlp.pipe(objects))
    objectMatcher = PhraseMatcher(nlp.vocab)
    objectMatcher.add("OBJECTS", None, *object_patterns)

    for match_id, start, end in objectMatcher(doc):
        # Create a Span with the label for "GPE"
        objectSpan = Span(doc, start, end, label="OBJECTS")
        #peopleSpan = Span(doc, start, end, label="PEOPLE")
        #objectSpan = Span(doc, start, end, label="OBJECTS")
        # Overwrite the doc.ents and add the span
        # doc.ents = list(doc.ents) + [roomSpan] + [peopleSpan] + [objectSpan]

        # Print the text of the span root's head token and the span text
        print(objectSpan.text)
Example #23
def dfprep(json_in, save_df, inputfile):
    if inputfile == 1:
        with open("input.txt", "r") as f:
            para = ast.literal_eval(f.read())
        json_in = para['json_in']
        save_df = para['save_df']
    with mlflow.start_run() as mlrun:
        print(subprocess.getoutput("python -m spacy download en_core_web_sm"))
        artpd = pd.read_json(json_in,
                             orient='index',
                             convert_dates=False,
                             convert_axes=False)
        artpda = artpd[artpd.abstract.notnull()].copy()
        artpda = artpda[artpda.title.notnull()]
        #        artpda.index = pd.Series(artpda.index).apply(lambda x: x[0:8])
        artpdak = artpda[artpda.keywords.str.len() > 0].copy()
        dataf = pd.DataFrame(
            index=artpdak.index,
            columns=['SRC', 'TRG', 'keywords', 'Extracted', 'abskey'])
        dataf.loc[:, 'SRC'] = artpdak.title + ' ' + artpdak.abstract
        dataf.loc[:, 'keywords'] = artpdak.keywords
        svoc = spacy.load("en_core_web_sm")
        matcher = PhraseMatcher(svoc.vocab, attr="LOWER")
        for pmid in dataf.index:
            t0 = dataf.loc[pmid]
            patterns = [svoc.make_doc(str(name)) for name in t0.keywords]
            matcher.add("Names", None, *patterns)
            doc = svoc(t0.SRC)
            t1 = ['O'] * (len(doc))
            matched = []
            matn = 0
            for _, start, end in matcher(doc):
                t1[start] = 'B'
                t1[start + 1:end] = 'I' * (end - start - 1)
                if str(doc[start:end]).lower() not in matched:
                    matn = matn + 1
                    matched.append(str(doc[start:end]).lower())
            abskw = []
            for x in t0.keywords:
                if x.lower() not in matched:
                    abskw.append(x)
            dataf.loc[pmid, 'TRG'] = ' '.join([t for t in t1])
            dataf.loc[pmid, 'Extracted'] = matn
            dataf.loc[pmid, 'abskey'] = abskw
            matcher.remove("Names")
        dataf.to_pickle(save_df)
Example #24
def search_for_keyword(keyword, doc_obj, nlp):
    phrase_matcher = PhraseMatcher(nlp.vocab)
    li = list(keyword.split(" "))
    patterns = [nlp.make_doc(text) for text in li]
    phrase_matcher.add("TerminologyList", patterns)

    matched_items = phrase_matcher(doc_obj)
    matched_text = []
    for match_id, start, end in matched_items:
        text = nlp.vocab.strings[match_id]
        span = doc_obj[start:end]
        matched_text.append(span.sent.text)
    print("Found ", len(matched_text), " matches")
    for txt in matched_text:
        print("Found Match ........")
        print(txt)
        print("\n")
Example #25
class Singletons:
    __instance = None
    cached_signals = phrase_matcher = nlp = None

    @staticmethod
    def get_instance():
        """ Static access method. """
        if Singletons.__instance is None:
            logger.info("Calling Singletone private constructor")
            Singletons()
        return Singletons.__instance

    def __init__(self):
        if Singletons.__instance is not None:
            raise Exception("This class is a singleton!")
        else:
            logger.info("Creating empty cache for signals")
            self.cached_signals = {}
            logger.info("Making the question PhraseMatcher")
            self.nlp = spacy.load("en_core_web_sm")
            self.phrase_matcher = PhraseMatcher(self.nlp.vocab, attr='LOWER')
            question_terms = [
                "who", "whom", "whose", "what", "when", "where", "why",
                "which", "how"
            ]
            patterns = [self.nlp(text) for text in question_terms]
            self.phrase_matcher.add("question", None, *patterns)
            Singletons.__instance = self

    def get_cached_signals(self):
        """
        This method gets the cached signal dict
        @return: signal dict format {"prod_id":[list of signal object]}
        """
        return self.cached_signals

    def set_cached_signals(self, product_id, signals):
        if str(product_id) not in self.cached_signals.keys():
            self.cached_signals[str(product_id)] = signals
        return self.cached_signals

    def get_phrase_matcher(self, doc):
        return self.phrase_matcher(doc)

    def return_nlp(self, sentence):
        return self.nlp(sentence)
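
# A minimal usage sketch, assuming spaCy v2.x (the add call above uses the v2 API), an
# installed en_core_web_sm model, and the module-level `logger` the class refers to.
singleton = Singletons.get_instance()
doc = singleton.return_nlp('Where is the station and how do I get there?')
print(singleton.get_phrase_matcher(doc))   # question-word matches as (match_id, start, end) tuples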
Example #26
    def get_gender(self) -> str:
        """
        Based on sentences where the Respondent/Applicant keywords are found,
        count the instances of gendered pronouns. This approach assumes the
        sentence refers to subject in shorthand by using gendered pronouns
        as opposed to keywords multiple times in sentence. This function
        returns a final result to be packaged into json.
        Returns:
        str: "Male", "Female" or "Unknown"
        """

        # Search terms formatted
        phrases = [
            "respondent", "respondents", "applicant", 'filed an application'
        ]
        patterns = [nlp(text) for text in phrases]

        # Gender constants
        male_prons = ['he', "he's", 'his', 'himself']
        female_prons = ['she', "she's", 'her', 'herself']

        # Variables for analysis storage
        male_found = []
        female_found = []

        # PhraseMatcher setup, add tag (RESP) and pass in patterns
        phrase_matcher = PhraseMatcher(nlp.vocab, attr='LEMMA')
        phrase_matcher.add("RESP", None, *patterns)

        # Sentences containing a 'RESP' match are checked for gendered pronouns and
        # added to the respective list. (The match_id is always 'RESP' here, so the
        # pronoun check has to look at the sentence tokens themselves.)
        for sent in self.doc.sents:
            if phrase_matcher(nlp(sent.text)):
                sent_tokens = {token.lower_ for token in sent}
                if sent_tokens & set(male_prons):
                    male_found.append(sent.text)
                if sent_tokens & set(female_prons):
                    female_found.append(sent.text)

        # Make `set()` of list to eliminate duplicates and compare lengths
        if len(set(female_found)) > len(set(male_found)):
            return "Female"
        elif len(set(male_found)) > len(set(female_found)):
            return "Male"
        else:
            return "Unknown"
Example #27
    def return_objects(self):

        nlp = spacy.load("en_core_web_sm")

        doc = nlp(self.text)

        objectJson = open(self.objectFilePath, "r")
        objects = json.loads(objectJson.read())
        object_patterns = list(nlp.pipe(objects))
        objectMatcher = PhraseMatcher(nlp.vocab)
        objectMatcher.add("OBJECTS", [*object_patterns])

        for match_id, start, end in objectMatcher(doc):

            objectSpan = Span(doc, start, end, label="OBJECTS")

            return objectSpan
Example #28
class EntityMatcher(object):
    name = 'entity_matcher'

    def __init__(self, nlp, terms_list, labels_list):
        # Build one shared matcher and register a rule per (label, terms) pair.
        # Re-creating the matcher inside the loop would keep only the last rule.
        self.matcher = PhraseMatcher(nlp.vocab)
        for label, terms in zip(labels_list, terms_list):
            patterns = [nlp(text) for text in terms]
            self.matcher.add(label, None, *patterns)

    def __call__(self, doc):
        matches = self.matcher(doc)
        for match_id, start, end in matches:
            span = Span(doc, start, end, label=match_id)
            doc.ents = list(doc.ents) + [span]
        return doc
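
# A minimal usage sketch, assuming spaCy v2.x (the add call above uses the v2 API); the
# terms and labels are made up. One rule is registered per label, and matched spans are
# appended to doc.ents.
from spacy.lang.en import English

nlp = English()
nlp.add_pipe(EntityMatcher(nlp,
                           terms_list=[['acetaminophen', 'ibuprofen'], ['headache', 'fever']],
                           labels_list=['DRUG', 'SYMPTOM']))
doc = nlp('She took ibuprofen for a headache.')
print([(ent.text, ent.label_) for ent in doc.ents])   # [('ibuprofen', 'DRUG'), ('headache', 'SYMPTOM')]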
Example #29
File: extract.py Project: jjm0022/Blitz
    def patternGenerator(self, doc, pattern_path):
        """
        Returns the starting and ending character index for a phrase match along with the phrase
        """
        phrase = namedtuple('Phrase', ['text', 'start', 'end'])
        phrases = self.readPatterns(self._nlp.tokenizer, pattern_path)
        matcher = PhraseMatcher(self._nlp.tokenizer.vocab, max_length=6)
        matcher.add("Phrase", None, *phrases)
        for w in doc:
            _ = doc.vocab[w.text]
        matches = matcher(doc)
        for ent_id, start, end in matches:
            yield phrase(
                text=doc[start:end].text,
                start=doc[start:end].start_char,
                end=doc[start:end].end_char,
            )
Example #30
def test_span_in_phrasematcher(en_vocab):
    """Ensure that PhraseMatcher accepts Span and Doc as input"""
    # fmt: off
    words = [
        "I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "and",
        "nothing", "else", "."
    ]
    # fmt: on
    doc = Doc(en_vocab, words=words)
    span = doc[:8]
    pattern = Doc(en_vocab, words=["Spans", "and", "Docs"])
    matcher = PhraseMatcher(en_vocab)
    matcher.add("SPACY", [pattern])
    matches_doc = matcher(doc)
    matches_span = matcher(span)
    assert len(matches_doc) == 1
    assert len(matches_span) == 1
Example #31
    def doccano_snippet_training_out(self, docs, terms, path):
        patterns = [self.nlp.make_doc(text) for text in terms]
        matcher = PhraseMatcher(self.nlp.vocab)
        matcher.add("TerminologyList", None, *patterns)

        snippets = []
        for doc in docs:
            match_start_spans = []
            matches = matcher(doc)
            for match_id, start, end in matches:
                span = doc[start:end]
                match_start_spans.append(start)
                snippets.append(json.dumps({"text": span.sent.text}))

        with open(path, 'w') as fh:
            for snippet in snippets:
                fh.write(snippet+"\n")
Example #32
def test_phrase_matcher_bool_attrs(en_vocab):
    words1 = ["Hello", "world", "!"]
    words2 = ["No", "problem", ",", "he", "said", "."]
    pattern = Doc(en_vocab, words=words1)
    matcher = PhraseMatcher(en_vocab, attr="IS_PUNCT")
    matcher.add("TEST", [pattern])
    doc = Doc(en_vocab, words=words2)
    matches = matcher(doc)
    assert len(matches) == 2
    match_id1, start1, end1 = matches[0]
    match_id2, start2, end2 = matches[1]
    assert match_id1 == en_vocab.strings["TEST"]
    assert match_id2 == en_vocab.strings["TEST"]
    assert start1 == 0
    assert end1 == 3
    assert start2 == 3
    assert end2 == 6
Example #33
class JeevesSkills:
    def __init__(self, nlp, name, label="SKILL"):
        r = requests.get("https://restcountries.eu/rest/v2/all")
        r.raise_for_status()  # make sure requests raises an error if it fails
        countries = r.json()
        # Convert API response to dict keyed by country name for easy lookup
        self.countries = {c["name"]: c for c in countries}
        self.label = label
        # Set up the PhraseMatcher with Doc patterns for each country name
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add("SKILLS",
                         [nlp.make_doc(c) for c in self.countries.keys()])
        # Register attributes on the Span. We'll be overwriting this based on
        # the matches, so we're only setting a default value, not a getter.
        Span.set_extension("jenkins_skill", default=None, force=True)
        #Span.set_extension("country_capital", default=None)
        #Span.set_extension("country_latlng", default=None)
        #Span.set_extension("country_flag", default=None)
        # Register attribute on Doc via a getter that checks if the Doc
        # contains a country entity
        Doc.set_extension("jenkins_has_skill",
                          getter=self.jenkins_has_skill,
                          force=True)

    def __call__(self, doc):
        spans = []  # keep the spans for later so we can merge them afterwards
        for _, start, end in self.matcher(doc):
            # Generate Span representing the entity & set label
            entity = Span(doc, start, end, label=self.label)
            # Set custom attributes on entity. Can be extended with other data
            # returned by the API, like currencies, country code, calling code etc.
            entity._.set("jenkins_skill", True)
            #entity._.set("country_capital", self.countries[entity.text]["capital"])
            #entity._.set("country_latlng", self.countries[entity.text]["latlng"])
            #entity._.set("country_flag", self.countries[entity.text]["flag"])
            spans.append(entity)
        # Overwrite doc.ents and add entity – be careful not to replace!
        doc.ents = list(doc.ents) + spans
        return doc  # don't forget to return the Doc!

    def jenkins_has_skill(self, doc):
        """Getter for Doc attributes. Since the getter is only called
        when we access the attribute, we can refer to the Span's 'jenkins_skill'
        attribute here, which is already set in the processing step."""
        return any([entity._.get("jenkins_skill") for entity in doc.ents])
Example #34
class SpacyAffirmationDetector(DialogActDetector):
    def __init__(self):
        self._nlp = NLP.get('en')

        self._phrase_matcher_negative = PhraseMatcher(self._nlp.vocab,
                                                      attr="LOWER")
        negated_affirmation_phrase_patterns = [
            self._nlp.make_doc(text) for text in negated_affirmation_phrases
        ]
        self._phrase_matcher_negative.add("NegatedAffirmationPhrases", None,
                                          *negated_affirmation_phrase_patterns)

        self._phrase_matcher = PhraseMatcher(self._nlp.vocab, attr="LOWER")
        affirmation_phrase_patterns = [
            self._nlp.make_doc(text) for text in affirmation_phrases
        ]
        self._phrase_matcher.add("AffirmationPhrases", None,
                                 *affirmation_phrase_patterns)

        self._token_matcher = Matcher(self._nlp.vocab)
        self._token_matcher.add("Affirmation", None,
                                *affirmation_token_patterns)

    def detect(self, text, language='en'):
        doc = self._nlp(text)

        negated_phrase_matches = self._phrase_matcher_negative(doc)
        if negated_phrase_matches:
            (match_id, start, end) = negated_phrase_matches[0]
            span = doc[start:end]
            return False, span.text

        phrase_matches = self._phrase_matcher(doc)
        if phrase_matches:
            (match_id, start, end) = phrase_matches[0]
            span = doc[start:end]
            return True, span.text

        token_matches = self._token_matcher(doc)
        if token_matches:
            (match_id, start, end) = token_matches[0]
            span = doc[start:end]
            return True, span.text

        return False, None
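
# A minimal usage sketch, assuming the module-level NLP registry and the
# affirmation_phrases / negated_affirmation_phrases / affirmation_token_patterns lists
# that the constructor refers to are defined elsewhere in the project.
detector = SpacyAffirmationDetector()
print(detector.detect("Yes, that works for me."))   # e.g. (True, 'Yes') if 'yes' is among the affirmation phrases
print(detector.detect("No, definitely not."))       # e.g. (False, ...) for a negated affirmation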
Example #35
def test_issue4002(en_vocab):
    """Test that the PhraseMatcher can match on overwritten NORM attributes."""
    matcher = PhraseMatcher(en_vocab, attr="NORM")
    pattern1 = Doc(en_vocab, words=["c", "d"])
    assert [t.norm_ for t in pattern1] == ["c", "d"]
    matcher.add("TEST", [pattern1])
    doc = Doc(en_vocab, words=["a", "b", "c", "d"])
    assert [t.norm_ for t in doc] == ["a", "b", "c", "d"]
    matches = matcher(doc)
    assert len(matches) == 1
    matcher = PhraseMatcher(en_vocab, attr="NORM")
    pattern2 = Doc(en_vocab, words=["1", "2"])
    pattern2[0].norm_ = "c"
    pattern2[1].norm_ = "d"
    assert [t.norm_ for t in pattern2] == ["c", "d"]
    matcher.add("TEST", [pattern2])
    matches = matcher(doc)
    assert len(matches) == 1
class RESTCountriesComponent(object):
    """spaCy v2.0 pipeline component that requests all countries via
    the REST Countries API, merges country names into one token, assigns entity
    labels and sets attributes on country tokens.
    """
    name = 'rest_countries' # component name, will show up in the pipeline

    def __init__(self, nlp, label='GPE'):
        """Initialise the pipeline component. The shared nlp instance is used
        to initialise the matcher with the shared vocab, get the label ID and
        generate Doc objects as phrase match patterns.
        """
        # Make request once on initialisation and store the data
        r = requests.get('https://restcountries.eu/rest/v2/all')
        r.raise_for_status()  # make sure requests raises an error if it fails
        countries = r.json()

        # Convert API response to dict keyed by country name for easy lookup
        # This could also be extended using the alternative and foreign language
        # names provided by the API
        self.countries = {c['name']: c for c in countries}
        self.label = nlp.vocab.strings[label]  # get entity label ID

        # Set up the PhraseMatcher with Doc patterns for each country name
        patterns = [nlp(c) for c in self.countries.keys()]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add('COUNTRIES', None, *patterns)

        # Register attributes on the Token. We'll be overwriting these based on
        # the matches, so we're only setting a default value, not a getter.
        Token.set_extension('is_country', default=False)
        Token.set_extension('country_capital', default=None)
        Token.set_extension('country_latlng', default=None)
        Token.set_extension('country_flag', default=None)

        # Register attributes on Doc and Span via a getter that checks if one of
        # the contained tokens is set to is_country == True.
        Doc.set_extension('has_country', getter=self.has_country)
        Span.set_extension('has_country', getter=self.has_country)


    def __call__(self, doc):
        """Apply the pipeline component on a Doc object and modify it if matches
        are found. Return the Doc, so it can be processed by the next component
        in the pipeline, if available.
        """
        matches = self.matcher(doc)
        spans = []  # keep the spans for later so we can merge them afterwards
        for _, start, end in matches:
            # Generate Span representing the entity & set label
            entity = Span(doc, start, end, label=self.label)
            spans.append(entity)
            # Set custom attribute on each token of the entity
            # Can be extended with other data returned by the API, like
            # currencies, country code, flag, calling code etc.
            for token in entity:
                token._.set('is_country', True)
                token._.set('country_capital', self.countries[entity.text]['capital'])
                token._.set('country_latlng', self.countries[entity.text]['latlng'])
                token._.set('country_flag', self.countries[entity.text]['flag'])
            # Overwrite doc.ents and add entity – be careful not to replace!
            doc.ents = list(doc.ents) + [entity]
        for span in spans:
            # Iterate over all spans and merge them into one token. This is done
            # after setting the entities – otherwise, it would cause mismatched
            # indices!
            span.merge()
        return doc  # don't forget to return the Doc!

    def has_country(self, tokens):
        """Getter for Doc and Span attributes. Returns True if one of the tokens
        is a country. Since the getter is only called when we access the
        attribute, we can refer to the Token's 'is_country' attribute here,
        which is already set in the processing step."""
        return any([t._.get('is_country') for t in tokens])
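
# A minimal usage sketch, assuming spaCy v2.x (the component uses the v2 matcher.add API
# and span.merge()) and that the restcountries.eu endpoint is still reachable.
from spacy.lang.en import English

nlp = English()
component = RESTCountriesComponent(nlp)   # initialising the component fetches the country data
nlp.add_pipe(component)
doc = nlp('Some text about Colombia and the Czech Republic')
print('Pipeline:', nlp.pipe_names)
print('Doc has country:', doc._.has_country)
for token in doc:
    if token._.is_country:
        print(token.text, token._.country_capital, token._.country_latlng, token._.country_flag)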