Example #1
def test_operator_combos(en_vocab):
    cases = [
        ("aaab", "a a a b", True),
        ("aaab", "a+ b", True),
        ("aaab", "a+ a+ b", True),
        ("aaab", "a+ a+ a b", True),
        ("aaab", "a+ a+ a+ b", True),
        ("aaab", "a+ a a b", True),
        ("aaab", "a+ a a", True),
        ("aaab", "a+", True),
        ("aaa", "a+ b", False),
        ("aaa", "a+ a+ b", False),
        ("aaa", "a+ a+ a+ b", False),
        ("aaa", "a+ a b", False),
        ("aaa", "a+ a a b", False),
        ("aaab", "a+ a a", True),
        ("aaab", "a+", True),
        ("aaab", "a+ a b", True),
    ]
    for string, pattern_str, result in cases:
        matcher = Matcher(en_vocab)
        doc = Doc(matcher.vocab, words=list(string))
        pattern = []
        for part in pattern_str.split():
            if part.endswith("+"):
                pattern.append({"ORTH": part[0], "OP": "+"})
            else:
                pattern.append({"ORTH": part})
        matcher.add("PATTERN", None, pattern)
        matches = matcher(doc)
        if result:
            assert matches, (string, pattern_str)
        else:
            assert not matches, (string, pattern_str)
Example #2
File: ud_train.py  Project: spacy-io/spaCy
def write_conllu(docs, file_):
    merger = Matcher(docs[0].vocab)
    merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
    for i, doc in enumerate(docs):
        matches = merger(doc)
        spans = [doc[start : end + 1] for _, start, end in matches]
        with doc.retokenize() as retokenizer:
            for span in spans:
                retokenizer.merge(span)
        file_.write("# newdoc id = {i}\n".format(i=i))
        for j, sent in enumerate(doc.sents):
            file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
            file_.write("# text = {text}\n".format(text=sent.text))
            for k, token in enumerate(sent):
                if token.head.i > sent[-1].i or token.head.i < sent[0].i:
                    for word in doc[sent[0].i - 10 : sent[0].i]:
                        print(word.i, word.head.i, word.text, word.dep_)
                    for word in sent:
                        print(word.i, word.head.i, word.text, word.dep_)
                    for word in doc[sent[-1].i : sent[-1].i + 10]:
                        print(word.i, word.head.i, word.text, word.dep_)
                    raise ValueError(
                        "Invalid parse: head outside sentence (%s)" % token.text
                    )
                file_.write(token._.get_conllu_lines(k) + "\n")
            file_.write("\n")
Example #3
def test_issue615(en_tokenizer):
    def merge_phrases(matcher, doc, i, matches):
        """Merge a phrase. We have to be careful here because we'll change the
        token indices. To avoid problems, merge all the phrases once we're called
        on the last match."""
        if i != len(matches) - 1:
            return None
        spans = [Span(doc, start, end, label=label) for label, start, end in matches]
        with doc.retokenize() as retokenizer:
            for span in spans:
                tag = "NNP" if span.label_ else span.root.tag_
                attrs = {"tag": tag, "lemma": span.text}
                retokenizer.merge(span, attrs=attrs)
                doc.ents = doc.ents + (span,)

    text = "The golf club is broken"
    pattern = [{"ORTH": "golf"}, {"ORTH": "club"}]
    label = "Sport_Equipment"
    doc = en_tokenizer(text)
    matcher = Matcher(doc.vocab)
    matcher.add(label, merge_phrases, pattern)
    matcher(doc)
    entities = list(doc.ents)
    assert entities != []
    assert entities[0].label != 0
Example #4
def test_matcher_match_zero_plus(matcher):
    words = 'He said , " some words " ...'.split()
    pattern = [{"ORTH": '"'}, {"OP": "*", "IS_PUNCT": False}, {"ORTH": '"'}]
    matcher = Matcher(matcher.vocab)
    matcher.add("Quote", None, pattern)
    doc = Doc(matcher.vocab, words=words)
    assert len(matcher(doc)) == 1
Example #5
def write_conllu(docs, file_):
    merger = Matcher(docs[0].vocab)
    merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
    for i, doc in enumerate(docs):
        matches = merger(doc)
        spans = [doc[start : end + 1] for _, start, end in matches]
        with doc.retokenize() as retokenizer:
            for span in spans:
                retokenizer.merge(span)
        # TODO: This shouldn't be necessary? Should be handled in merge
        for word in doc:
            if word.i == word.head.i:
                word.dep_ = "ROOT"
        file_.write("# newdoc id = {i}\n".format(i=i))
        for j, sent in enumerate(doc.sents):
            file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
            file_.write("# text = {text}\n".format(text=sent.text))
            for k, token in enumerate(sent):
                file_.write(_get_token_conllu(token, k, len(sent)) + "\n")
            file_.write("\n")
            for word in sent:
                if word.head.i == word.i and word.dep_ == "ROOT":
                    break
            else:
                print("Rootless sentence!")
                print(sent)
                print(i)
                for w in sent:
                    print(w.i, w.text, w.head.text, w.head.i, w.dep_)
                raise ValueError
Example #6
class RussianTokenizer(object):
    name = 'russian_tokenizer'

    def __init__(self, nlp, merge_patterns=None, terminal_patterns=None):
        self.matcher = Matcher(nlp.vocab)
        self.token_merge = nlp.vocab.strings['pattern']
        self.sentence_terminal = nlp.vocab.strings['sentence_terminal']
        if merge_patterns:
            self.matcher.add(self.token_merge, None, *merge_patterns)
        if terminal_patterns:
            self.matcher.add(self.sentence_terminal, None, *terminal_patterns)

    def __call__(self, doc):
        spans = []
        for id, start, end in self.matcher(doc):
            if id == self.token_merge:
                spans.append(doc[start:end])
            elif id == self.sentence_terminal:
                # remove all sentence start marks from span that match pattern
                for token in doc[start:end]:
                    if token.sent_start:
                        token.sent_start = False
        if spans:
            for span in spans:
                span.merge()
        return doc
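For context, a hedged usage sketch for the pipeline component above (the blank pipeline, the merge pattern, and the sample sentence are illustrative assumptions, not taken from the source):

import spacy

# assumption: a blank Russian pipeline; spaCy's Russian language class may require the pymorphy2 package
nlp = spacy.blank('ru')
merge_patterns = [[{'ORTH': 'потому'}, {'ORTH': 'что'}]]  # illustrative merge pattern only
russian_tokenizer = RussianTokenizer(nlp, merge_patterns=merge_patterns)
nlp.add_pipe(russian_tokenizer, name='russian_tokenizer', last=True)
doc = nlp('Это работает, потому что паттерн совпал.')
print([token.text for token in doc])  # 'потому что' should come out merged as one token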
Example #7
def test_issue3555(en_vocab):
    """Test that custom extensions with default None don't break matcher."""
    Token.set_extension("issue3555", default=None)
    matcher = Matcher(en_vocab)
    pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}]
    matcher.add("TEST", None, pattern)
    doc = Doc(en_vocab, words=["have", "apple"])
    matcher(doc)
Example #8
def test_issue1883():
    matcher = Matcher(Vocab())
    matcher.add("pat1", None, [{"orth": "hello"}])
    doc = Doc(matcher.vocab, words=["hello"])
    assert len(matcher(doc)) == 1
    new_matcher = copy.deepcopy(matcher)
    new_doc = Doc(new_matcher.vocab, words=["hello"])
    assert len(new_matcher(new_doc)) == 1
Example #9
def test_matcher_operator_shadow(en_vocab):
    matcher = Matcher(en_vocab)
    doc = Doc(matcher.vocab, words=["a", "b", "c"])
    pattern = [{"ORTH": "a"}, {"IS_ALPHA": True, "OP": "+"}, {"ORTH": "c"}]
    matcher.add("A.C", None, pattern)
    matches = matcher(doc)
    assert len(matches) == 1
    assert matches[0][1:] == (0, 3)
Example #10
def test_issue_1971_2(en_vocab):
    matcher = Matcher(en_vocab)
    pattern1 = [{"ORTH": "EUR", "LOWER": {"IN": ["eur"]}}, {"LIKE_NUM": True}]
    pattern2 = [{"LIKE_NUM": True}, {"ORTH": "EUR"}]  # {"IN": ["EUR"]}}]
    doc = Doc(en_vocab, words=["EUR", "10", "is", "10", "EUR"])
    matcher.add("TEST1", None, pattern1, pattern2)
    matches = matcher(doc)
    assert len(matches) == 2
Example #11
def test_match_consuming(doc, text, pattern, re_pattern):
    """Test that matcher.__call__ consumes tokens on a match similar to
    re.findall."""
    matcher = Matcher(doc.vocab)
    matcher.add(re_pattern, None, pattern)
    matches = matcher(doc)
    re_matches = [m.span() for m in re.finditer(re_pattern, text)]
    assert len(matches) == len(re_matches)
Example #12
def test_greedy_matching(doc, text, pattern, re_pattern):
    """Test that the greedy matching behavior of the * op is consistant with
    other re implementations."""
    matcher = Matcher(doc.vocab)
    matcher.add(re_pattern, None, pattern)
    matches = matcher(doc)
    re_matches = [m.span() for m in re.finditer(re_pattern, text)]
    for match, re_match in zip(matches, re_matches):
        assert match[1:] == re_match
Example #13
def test_issue1945():
    """Test regression in Matcher introduced in v2.0.6."""
    matcher = Matcher(Vocab())
    matcher.add("MWE", None, [{"orth": "a"}, {"orth": "a"}])
    doc = Doc(matcher.vocab, words=["a", "a", "a"])
    matches = matcher(doc)  # we should see two overlapping matches here
    assert len(matches) == 2
    assert matches[0][1:] == (0, 2)
    assert matches[1][1:] == (1, 3)
Example #14
def test_matcher_compare_length(en_vocab):
    matcher = Matcher(en_vocab)
    pattern = [{"LENGTH": {">=": 2}}]
    matcher.add("LENGTH_COMPARE", None, pattern)
    doc = Doc(en_vocab, words=["a", "aa", "aaa"])
    matches = matcher(doc)
    assert len(matches) == 2
    doc = Doc(en_vocab, words=["a"])
    matches = matcher(doc)
    assert len(matches) == 0
Example #15
def test_matcher_regex(en_vocab):
    matcher = Matcher(en_vocab)
    pattern = [{"ORTH": {"REGEX": r"(?:a|an)"}}]
    matcher.add("A_OR_AN", None, pattern)
    doc = Doc(en_vocab, words=["an", "a", "hi"])
    matches = matcher(doc)
    assert len(matches) == 2
    doc = Doc(en_vocab, words=["bye"])
    matches = matcher(doc)
    assert len(matches) == 0
Example #16
def test_matcher_set_value_operator(en_vocab):
    matcher = Matcher(en_vocab)
    pattern = [{"ORTH": {"IN": ["a", "the"]}, "OP": "?"}, {"ORTH": "house"}]
    matcher.add("DET_HOUSE", None, pattern)
    doc = Doc(en_vocab, words=["In", "a", "house"])
    matches = matcher(doc)
    assert len(matches) == 2
    doc = Doc(en_vocab, words=["my", "house"])
    matches = matcher(doc)
    assert len(matches) == 1
Example #17
def test_matcher_set_value(en_vocab):
    matcher = Matcher(en_vocab)
    pattern = [{"ORTH": {"IN": ["an", "a"]}}]
    matcher.add("A_OR_AN", None, pattern)
    doc = Doc(en_vocab, words=["an", "a", "apple"])
    matches = matcher(doc)
    assert len(matches) == 2
    doc = Doc(en_vocab, words=["aardvark"])
    matches = matcher(doc)
    assert len(matches) == 0
Example #18
def test_matcher_any_token_operator(en_vocab):
    """Test that patterns with "any token" {} work with operators."""
    matcher = Matcher(en_vocab)
    matcher.add("TEST", None, [{"ORTH": "test"}, {"OP": "*"}])
    doc = Doc(en_vocab, words=["test", "hello", "world"])
    matches = [doc[start:end].text for _, start, end in matcher(doc)]
    assert len(matches) == 3
    assert matches[0] == "test"
    assert matches[1] == "test hello"
    assert matches[2] == "test hello world"
Example #19
def test_matcher_regex_shape(en_vocab):
    matcher = Matcher(en_vocab)
    pattern = [{"SHAPE": {"REGEX": r"^[^x]+$"}}]
    matcher.add("NON_ALPHA", None, pattern)
    doc = Doc(en_vocab, words=["99", "problems", "!"])
    matches = matcher(doc)
    assert len(matches) == 2
    doc = Doc(en_vocab, words=["bye"])
    matches = matcher(doc)
    assert len(matches) == 0
Example #20
def matcher(en_vocab):
    rules = {
        "JS": [[{"ORTH": "JavaScript"}]],
        "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]],
        "Java": [[{"LOWER": "java"}]],
    }
    matcher = Matcher(en_vocab)
    for key, patterns in rules.items():
        matcher.add(key, None, *patterns)
    return matcher
Example #21
def test_issue_1971_3(en_vocab):
    """Test that pattern matches correctly for multiple extension attributes."""
    Token.set_extension("a", default=1, force=True)
    Token.set_extension("b", default=2, force=True)
    doc = Doc(en_vocab, words=["hello", "world"])
    matcher = Matcher(en_vocab)
    matcher.add("A", None, [{"_": {"a": 1}}])
    matcher.add("B", None, [{"_": {"b": 2}}])
    matches = sorted((en_vocab.strings[m_id], s, e) for m_id, s, e in matcher(doc))
    assert len(matches) == 4
    assert matches == sorted([("A", 0, 1), ("A", 1, 2), ("B", 0, 1), ("B", 1, 2)])
Example #22
def test_issue1450(string, start, end):
    """Test matcher works when patterns end with * operator."""
    pattern = [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}]
    matcher = Matcher(Vocab())
    matcher.add("TSTEND", None, pattern)
    doc = Doc(Vocab(), words=string.split())
    matches = matcher(doc)
    if start is None or end is None:
        assert matches == []
    assert matches[-1][1] == start
    assert matches[-1][2] == end
Example #23
def test_issue850_basic():
    """Test Matcher matches with '*' operator and Boolean flag"""
    vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()})
    matcher = Matcher(vocab)
    pattern = [{"LOWER": "bob"}, {"OP": "*", "LOWER": "and"}, {"LOWER": "frank"}]
    matcher.add("FarAway", None, pattern)
    doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"])
    match = matcher(doc)
    assert len(match) == 1
    ent_id, start, end = match[0]
    assert start == 0
    assert end == 4
Example #24
def test_matcher_end_zero_plus(en_vocab):
    """Test matcher works when patterns end with * operator. (issue 1450)"""
    matcher = Matcher(en_vocab)
    pattern = [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}]
    matcher.add("TSTEND", None, pattern)
    nlp = lambda string: Doc(matcher.vocab, words=string.split())
    assert len(matcher(nlp("a"))) == 1
    assert len(matcher(nlp("a b"))) == 2
    assert len(matcher(nlp("a c"))) == 1
    assert len(matcher(nlp("a b c"))) == 2
    assert len(matcher(nlp("a b b c"))) == 3
    assert len(matcher(nlp("a b b"))) == 3
Example #25
def test_matcher_from_api_docs(en_vocab):
    matcher = Matcher(en_vocab)
    pattern = [{"ORTH": "test"}]
    assert len(matcher) == 0
    matcher.add("Rule", None, pattern)
    assert len(matcher) == 1
    matcher.remove("Rule")
    assert "Rule" not in matcher
    matcher.add("Rule", None, pattern)
    assert "Rule" in matcher
    on_match, patterns = matcher.get("Rule")
    assert len(patterns[0])
Example #26
def test_matcher_sets_return_correct_tokens(en_vocab):
    matcher = Matcher(en_vocab)
    patterns = [
        [{'LOWER': {'IN': ["zero"]}}],
        [{'LOWER': {'IN': ["one"]}}],
        [{'LOWER': {'IN': ["two"]}}],
    ]
    matcher.add('TEST', None, *patterns)
    doc = Doc(en_vocab, words="zero one two three".split())
    matches = matcher(doc)
    texts = [Span(doc, s, e, label=L).text for L, s, e in matches]
    assert texts == ['zero', 'one', 'two']
Example #27
def test_issue590(en_vocab):
    """Test overlapping matches"""
    doc = Doc(en_vocab, words=["n", "=", "1", ";", "a", ":", "5", "%"])
    matcher = Matcher(en_vocab)
    matcher.add(
        "ab",
        None,
        [{"IS_ALPHA": True}, {"ORTH": ":"}, {"LIKE_NUM": True}, {"ORTH": "%"}],
    )
    matcher.add("ab", None, [{"IS_ALPHA": True}, {"ORTH": "="}, {"LIKE_NUM": True}])
    matches = matcher(doc)
    assert len(matches) == 2
Example #28
def test_matcher_empty_dict(en_vocab):
    """Test matcher allows empty token specs, meaning match on any token."""
    matcher = Matcher(en_vocab)
    doc = Doc(matcher.vocab, words=["a", "b", "c"])
    matcher.add("A.C", None, [{"ORTH": "a"}, {}, {"ORTH": "c"}])
    matches = matcher(doc)
    assert len(matches) == 1
    assert matches[0][1:] == (0, 3)
    matcher = Matcher(en_vocab)
    matcher.add("A.", None, [{"ORTH": "a"}, {}])
    matches = matcher(doc)
    assert matches[0][1:] == (0, 2)
Example #29
def test_matcher_extension_attribute(en_vocab):
    matcher = Matcher(en_vocab)
    get_is_fruit = lambda token: token.text in ("apple", "banana")
    Token.set_extension("is_fruit", getter=get_is_fruit, force=True)
    pattern = [{"ORTH": "an"}, {"_": {"is_fruit": True}}]
    matcher.add("HAVING_FRUIT", None, pattern)
    doc = Doc(en_vocab, words=["an", "apple"])
    matches = matcher(doc)
    assert len(matches) == 1
    doc = Doc(en_vocab, words=["an", "aardvark"])
    matches = matcher(doc)
    assert len(matches) == 0
Example #30
def test_matcher_extension_set_membership(en_vocab):
    matcher = Matcher(en_vocab)
    get_reversed = lambda token: "".join(reversed(token.text))
    Token.set_extension("reversed", getter=get_reversed, force=True)
    pattern = [{"_": {"reversed": {"IN": ["eyb", "ih"]}}}]
    matcher.add("REVERSED", None, pattern)
    doc = Doc(en_vocab, words=["hi", "bye", "hello"])
    matches = matcher(doc)
    assert len(matches) == 2
    doc = Doc(en_vocab, words=["aardvark"])
    matches = matcher(doc)
    assert len(matches) == 0
Example #31
def rules(obj, qus, question):
    matcher = Matcher(nlp.vocab)
    matcher.add("rule description", None, [{"Lemma": "about"}], [{"Lemma": "detail"}])
    matcher.add("rule student_course", None, [{"Lemma": "take"}])
    matcher.add("rule topic_course", None, [{"Lemma": "cover"}])
    matcher.add("rule topic_student", None, [{"Lemma": "familiar"}, {"Lemma": "with"}])
    matcher.add("rule student_topic", None, [{"Lemma": "know"}])
    doc = nlp(qus)

    # doc=nlp(question)
    matcher_ques = Matcher(nlp.vocab)
    matcher_ques.add("question rule1", None, [{"POS": "PROPN"}, {"POS": "NUM"}])
    matcher_ques.add("question rule2", None, [{"POS": "PROPN"}, {"POS": "PROPN"},{"POS": "PROPN"}],[{"POS": "PROPN"}, {"POS": "PROPN"}])
    matcher_ques.add("question rule3", None, [{"Lemma": "course"}, {"Lemma": "cover"}])

    doc2 = nlp(question)

    for match_id, start, end in matcher(doc):
        string_id = nlp.vocab.strings[match_id]  # Get string representation
        span = doc[start:end]  # The matched span
        if (string_id == "rule topic_student"):
            return ["q4", obj[0]]

        for match_id2, start2, end2 in matcher_ques(doc2):
            span2 = doc2[start2:end2]
            string_id2 = doc.vocab.strings[match_id2]
            if (string_id == "rule description" and string_id2 == "question rule1"):
                return ["q1", span2.text]
            elif (string_id == "rule student_course" and string_id2 == "question rule2"):
                return ["q2", span2.text]
            elif (string_id == "rule topic_course" and string_id2 == "question rule3"):
                return ["q3", question.split("cover",1)[1]]
            elif (string_id == "rule student_topic" and string_id2 == "question rule2"):
                return ["q5", span2.text]
Example #32
def collect_sents(matcher, doc, i, matches):
    match_id, start, end = matches[i]  # indices of matched term
    span = doc[start:end]              # extract matched term

    print('span: {} | start_ind:{:5} | end_ind:{:5} | id:{}'.format(
        span, start, end, match_id))

# set a pattern of text to collect
# find all mentions of the word fees
pattern = [{'LOWER':'fees'}] # LOWER converts words to lowercase before matching

# instantiate matcher
matcher = Matcher(nlp.vocab)

# add pattern to the matcher (one matcher can look for many unique patterns)
# provide a pattern name, function to apply to matches, pattern to identify
matcher.add('fee', collect_sents, pattern)

# pass the doc to the matcher to run the collect_sents function
matcher(doc)
# change the function to print the sentence of the matched term (span)

def collect_sents(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    span = doc[start:end]
    print('SPAN: {}'.format(span))

    # span.sent provides the sentence that contains the span
    print('SENT: {}'.format(span.sent))
    print()

# update the pattern to look for any noun preceding the term 'fees' (a sketch follows)
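A minimal sketch of that updated pattern, reusing the collect_sents callback above (the POS value and the rule name 'noun_fee' are illustrative assumptions):

# hypothetical follow-up: match a noun immediately before 'fees'
# note: the POS attribute only matches if the doc was produced by a pipeline with a tagger
pattern = [{'POS': 'NOUN'}, {'LOWER': 'fees'}]

matcher = Matcher(nlp.vocab)
matcher.add('noun_fee', collect_sents, pattern)
matcher(doc)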
Example #33
import spacy
from spacy.matcher import Matcher
nlp = spacy.load("en")
matcher = Matcher(nlp.vocab)
pattern = [{"DEP": "nsubj"}, {"DEP": "aux"}, {"DEP": "ROOT"}]
matcher.add("NsubjAuxRoot", None, pattern)
doc = nlp(u"We can overtake them.")
matches = matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end]
    print("Span: ", span.text)
    print("The positions in the doc are: ", start, "-", end)
Example #34
class CrazyTokenizer(object):
    """
    Tokenizer with Reddit- and Twitter-specific options

    Parameters
    ----------
    lowercase : bool, optional
        If True, lowercase all tokens. Defaults to True.

    keepcaps: bool, optional
        If True, keep ALL CAPS WORDS uppercased. Defaults to False.

    normalize: int or bool, optional
        If not False, perform normalization of repeated characters
        ("awesoooooome" -> "awesooome"). The value of the parameter
        determines the number of occurrences to keep. Defaults to 3.

    ignore_quotes: bool, optional
        If True, ignore tokens contained within double quotes.
        Defaults to False.

    ignore_reddit_quotes: bool, optional
        If True, remove quotes from the Reddit comments. Defaults to False.

    ignore_stopwords: str, list, or boolean, optional
        Whether to ignore stopwords

        - str: language to get a list of stopwords for from NLTK package
        - list: list of stopwords to remove
        - True: use built-in list of the english stop words
        - False: keep all tokens

        Defaults to False

    stem: {False, 'stem', 'lemm'}, optional
        Whether to perform word stemming

        - False: do not perform word stemming
        - 'stem': use PorterStemmer from NLTK package
        - 'lemm': use WordNetLemmatizer from NLTK package

    remove_punct: bool, optional
        If True, remove punctuation tokens. Defaults to True.

    remove_breaks: bool, optional
        If True, remove linebreak tokens. Defaults to True.

    decontract: bool, optional
        If True, attempt to expand certain contractions. Defaults to False.
        Example: "'ll" -> " will"

    numbers, subreddits, reddit_usernames, emails:
    False or str, optional
        Replacement of the different types of tokens

        - False: leaves these tokens intact
        - str: replacement token
        - '': removes all occurrences of these tokens

    twitter_handles: False, 'realname' or str, optional
        Processing of twitter handles

        - False: do nothing
        - str: replacement token
        - 'realname': replace with the real screen name of Twitter account
        - 'split': split handles using the Viterbi algorithm

        Example: "#vladimirputinisthebest" -> "vladimir putin is the best"

    hashtags: False or str, optional
        Processing of hashtags

        - False: do nothing
        - str: replacement token
        - 'split': split hashtags using the Viterbi algorithm

    urls: False or str, optional
        Replacement of parsed URLs

        - False: leave URL intact
        - str: replacement token
        - dict: replace all URLs stored in keys with the corresponding values
        - '': removes all occurrences of these tokens
        - 'domain': extract domain ("http://cnn.com" -> "cnn")
        - 'domain_unwrap_fast': extract domain after unwrapping links
        for a list of URL shorteners (goo.gl, t.co, bit.ly, tinyurl.com)
        - 'domain_unwrap': extract domain after unwrapping all links
        - 'title': extract and tokenize title of each link after unwrapping it

        Defaults to False.

    extra_patterns: None or list of tuples, optional
        Replacement of any user-supplied extra patterns.
        Tuples must have the following form: (name, re_pattern, replacement_token):

        - name (str): name of the pattern
        - re_pattern (_sre.SRE_Pattern): compiled re pattern
        - replacement_token (str): replacement token

        Defaults to None

    keep_untokenized: None or list, optional
        List of expressions to keep untokenized

        Example: ["New York", "Los Angeles", "San Francisco"]

    whitespaces_to_underscores: boolean, optional
        If True, replace all whitespace characters with
        underscores in the final tokens. Defaults to True.

    remove_nonunicode: boolean, optional
        If True, remove all non-unicode characters. Defaults to False.

    pos_emojis, neg_emojis, neutral_emojis: None, True, or list, optional
        Replace positive, negative, and neutral emojis with the special tokens

        - None: do not perform replacement
        - True: perform replacement of the default lists of emojis
        - list: list of emojis to replace

    print_url_warnings: bool, optional
        If True, print URL-related warnings. Defaults to False.

    latin_chars_fix: bool, optional
        Try applying this fix if you have a lot of \\xe2\\x80\\x99-like
        or U+1F601-like strings in your data. Defaults to False.

    ngrams: int, optional
        Add ngrams of tokens after tokenizing
    """
    def __init__(self,
                 lowercase=True,
                 keepcaps=False,
                 normalize=3,
                 ignore_quotes=False,
                 ignore_reddit_quotes=False,
                 ignore_stopwords=False,
                 stem=False,
                 remove_punct=True,
                 remove_breaks=True,
                 decontract=False,
                 twitter_handles=False,
                 urls=False,
                 hashtags=False,
                 numbers=False,
                 subreddits=False,
                 reddit_usernames=False,
                 emails=False,
                 extra_patterns=None,
                 keep_untokenized=None,
                 whitespaces_to_underscores=True,
                 remove_nonunicode=False,
                 pos_emojis=None,
                 neg_emojis=None,
                 neutral_emojis=None,
                 print_url_warnings=False,
                 latin_chars_fix=False,
                 ngrams=1):
        self.params = locals()

        self._nlp = English()
        self._merging_matcher = Matcher(self._nlp.vocab)
        self._matcher = Matcher(self._nlp.vocab)

        self._replacements = {}
        self._domains = {}
        self._realnames = {}
        self._stopwords = None

        alpha_digits_flag = self._nlp.vocab.add_flag(alpha_digits_check)
        hashtag_flag = self._nlp.vocab.add_flag(hashtag_check)
        twitter_handle_flag = self._nlp.vocab.add_flag(twitter_handle_check)

        self._merging_matcher.add('HASHTAG', None, [{
            'ORTH': '#'
        }, {
            'IS_ASCII': True
        }])
        self._merging_matcher.add('SUBREDDIT', None, [{
            'ORTH': '/r'
        }, {
            'ORTH': '/'
        }, {
            alpha_digits_flag: True
        }], [{
            'ORTH': 'r'
        }, {
            'ORTH': '/'
        }, {
            alpha_digits_flag: True
        }])
        self._merging_matcher.add('REDDIT_USERNAME', None,
                                  [{
                                      'ORTH': '/u'
                                  }, {
                                      'ORTH': '/'
                                  }, {
                                      alpha_digits_flag: True
                                  }], [{
                                      'ORTH': 'u'
                                  }, {
                                      'ORTH': '/'
                                  }, {
                                      alpha_digits_flag: True
                                  }])

        if isinstance(ignore_stopwords, str) and ('nltk' in sys.modules):
            try:
                self._stopwords = stopwords.words(ignore_stopwords)
            except OSError:
                raise ValueError('Language {} was not found by NLTK'.format(
                    ignore_stopwords))
        elif ignore_stopwords is True:
            self._matcher.add('STOPWORDS', self._remove_token, [{
                'IS_STOP': True
            }])
        elif isinstance(ignore_stopwords, list):
            self._stopwords = [word.lower() for word in ignore_stopwords]
        elif ignore_stopwords is not False:
            raise TypeError(
                'Type {} is not supported by ignore_stopwords parameter or NLTK is not installed'
                .format(type(ignore_stopwords)))

        if lowercase and (not keepcaps):
            self._matcher.add('LOWERCASE', self._lowercase, [{
                'IS_LOWER': False
            }])
        elif lowercase and keepcaps:
            self._matcher.add('LOWERCASE', self._lowercase, [{
                'IS_LOWER': False,
                'IS_UPPER': False
            }])

        if remove_punct:
            self._matcher.add('PUNCTUATION', self._remove_token,
                              [{
                                  'IS_PUNCT': True
                              }])

        if remove_breaks:

            def break_check(text):
                return bool(BREAKS_RE.fullmatch(text))

            break_flag = self._nlp.vocab.add_flag(break_check)
            self._matcher.add('BREAK', self._remove_token, [{
                break_flag: True
            }])

        if normalize:

            def normalize_check(text):
                return bool(NORMALIZE_RE.search(text))

            normalize_flag = self._nlp.vocab.add_flag(normalize_check)
            self._matcher.add('NORMALIZE', self._normalize,
                              [{
                                  normalize_flag: True
                              }])

        if numbers is not False:
            self._matcher.add('NUMBER', self._replace_token, [{
                'LIKE_NUM': True
            }])
            self._replacements['NUMBER'] = numbers

        if urls is not False:
            if urls in [
                    'domain', 'domain_unwrap_fast', 'domain_unwrap', 'title'
            ]:
                self._urls = urls
                self._matcher.add('URL', self._process_url, [{
                    'LIKE_URL': True
                }])
            elif isinstance(urls, dict):
                self._domains = urls
                self._urls = 'domain_unwrap_fast'
                self._matcher.add('URL', self._process_url, [{
                    'LIKE_URL': True
                }])
            else:
                self._matcher.add('URL', self._replace_token, [{
                    'LIKE_URL': True
                }])
                self._replacements['URL'] = urls

        if emails is not False:
            self._matcher.add('EMAIL', self._replace_token, [{
                'LIKE_EMAIL': True
            }])
            self._replacements['EMAIL'] = emails

        if reddit_usernames is not False:

            def reddit_username_check(text):
                return bool(REDDITORS_RE.fullmatch(text))

            reddit_username_flag = self._nlp.vocab.add_flag(
                reddit_username_check)
            self._matcher.add('REDDIT_USERNAME', self._replace_token,
                              [{
                                  reddit_username_flag: True
                              }])
            self._replacements['REDDIT_USERNAME'] = reddit_usernames

        if subreddits is not False:

            def subreddit_check(text):
                return bool(SUBREDDITS_RE.fullmatch(text))

            subreddit_flag = self._nlp.vocab.add_flag(subreddit_check)
            self._matcher.add('SUBREDDIT', self._replace_token,
                              [{
                                  subreddit_flag: True
                              }])
            self._replacements['SUBREDDIT'] = subreddits

        if twitter_handles is not False:
            self._matcher.add('TWITTER_HANDLE', self._handles_postprocess,
                              [{
                                  twitter_handle_flag: True
                              }])

        if hashtags is not False:
            self._matcher.add('HASHTAG', self._hashtag_postprocess,
                              [{
                                  hashtag_flag: True
                              }])

        if hashtags == 'split' or twitter_handles == 'split':
            file = os.path.join(DATA_PATH, 'wordsfreq_wiki2.txt')
            with open(file) as f:
                self._words = f.read().split()
            self._wordcost = dict((k, log((i + 1) * log(len(self._words))))
                                  for i, k in enumerate(self._words))
            self._maxword = max(len(x) for x in self._words)

        if twitter_handles == 'realname':
            with open(os.path.join(DATA_PATH, 'realnames.json')) as f:
                self._realnames = json.load(f)

        if ignore_quotes:
            self._merging_matcher.add('QUOTE', None, [{
                'ORTH': '"'
            }, {
                'OP': '*',
                'IS_ASCII': True
            }, {
                'ORTH': '"'
            }])

            def doublequote_check(text):
                return bool(QUOTES_RE.fullmatch(text))

            doublequote_flag = self._nlp.vocab.add_flag(doublequote_check)
            self._matcher.add('DOUBLE_QUOTES', self._remove_token,
                              [{
                                  doublequote_flag: True
                              }])

        if self._stopwords:

            def stopword_check(text):
                return bool(text.lower() in self._stopwords)

            stopword_flag = self._nlp.vocab.add_flag(stopword_check)
            self._matcher.add('STOPWORD', self._remove_token,
                              [{
                                  stopword_flag: True
                              }])

        if keep_untokenized is not None:
            if not isinstance(keep_untokenized, list):
                raise ValueError(
                    "keep_untokenized has to be either None or a list")
            for i, phrase in enumerate(keep_untokenized):
                phrase_tokens = phrase.split(' ')
                rule = []
                for token in phrase_tokens:
                    rule.append({'LOWER': token.lower()})
                self._merging_matcher.add('RULE_' + str(i), None, rule)

        if pos_emojis:
            if not isinstance(pos_emojis, list):
                pos_emojis = POS_EMOJIS
            pos_patterns = [[{'ORTH': emoji}] for emoji in pos_emojis]
            self._matcher.add('HAPPY', self._replace_token, *pos_patterns)
            self._replacements['HAPPY'] = 'POS_EMOJI'

        if neg_emojis:
            if not isinstance(neg_emojis, list):
                neg_emojis = NEG_EMOJIS
            neg_patterns = [[{'ORTH': emoji}] for emoji in neg_emojis]
            self._matcher.add('SAD', self._replace_token, *neg_patterns)
            self._replacements['SAD'] = 'NEG_EMOJI'

        if neutral_emojis:
            if not isinstance(neutral_emojis, list):
                neutral_emojis = NEUTRAL_EMOJIS
            neutral_patterns = [[{'ORTH': emoji}] for emoji in neutral_emojis]
            self._matcher.add('NEUTRAL', self._replace_token,
                              *neutral_patterns)
            self._replacements['NEUTRAL'] = 'NEUTRAL_EMOJI'

        if isinstance(extra_patterns, list):
            self._flags = {}
            for name, re_pattern, replacement_token in extra_patterns:

                def flag(text, re_pattern=re_pattern):
                    # bind re_pattern as a default argument to avoid Python's
                    # late-binding closure issue inside this loop
                    return bool(re_pattern.match(text))

                self._flags[name] = self._nlp.vocab.add_flag(flag)
                self._matcher.add(name, self._replace_token,
                                  [{
                                      self._flags[name]: True
                                  }])
                self._replacements[name] = replacement_token

        if stem and ('nltk' in sys.modules):
            if stem == 'stem':
                self._stemmer = PorterStemmer()
            elif stem == 'lemm':
                self._stemmer = WordNetLemmatizer()
            else:
                raise ValueError(
                    'Stemming method {} is not supported'.format(stem))
            self._matcher.add('WORD_TO_STEM', self._stem_word,
                              [{
                                  'IS_ALPHA': True
                              }])

        retokenize_flag = self._nlp.vocab.add_flag(retokenize_check)
        self._matcher.add('RETOKENIZE', self._retokenize,
                          [{
                              retokenize_flag: True,
                              'IS_PUNCT': False,
                              'LIKE_URL': False,
                              'LIKE_EMAIL': False,
                              'LIKE_NUM': False,
                              hashtag_flag: False,
                              twitter_handle_flag: False
                          }])

        self._nlp.add_pipe(self._merge_doc, name='merge_doc', last=True)
        self._nlp.add_pipe(self._match_doc, name='match_doc', last=True)
        self._nlp.add_pipe(self._postproc_doc, name='postproc_doc', last=True)

    @staticmethod
    def _lowercase(__, doc, i, matches):
        # Lowercase tokens
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            tok._.transformed_text = tok._.transformed_text.lower()

    def _stem_word(self, __, doc, i, matches):
        # Stem tokens
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            if self.params['stem'] == 'stem':
                tok._.transformed_text = self._stemmer.stem(
                    tok._.transformed_text)
            elif self.params['stem'] == 'lemm':
                tok._.transformed_text = self._stemmer.lemmatize(
                    tok._.transformed_text)

    def _normalize(self, __, doc, i, matches):
        # Normalize repeating symbols
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            tok._.transformed_text = NORMALIZE_RE.sub(
                r"\1" * self.params['normalize'], tok._.transformed_text)

    def _process_url(self, __, doc, i, matches):
        # Process found URLs
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            found_urls = URLS_RE.findall(tok.text)
            if found_urls:
                if found_urls[0] in self._domains:
                    tok._.transformed_text = self._domains[found_urls[0]]
                elif self._urls == 'domain':
                    tok._.transformed_text = tldextract.extract(
                        found_urls[0]).domain
                elif self._urls != 'title':
                    if self._urls == 'domain_unwrap':
                        domain = unshorten_url(
                            found_urls[0], None,
                            self.params['print_url_warnings'])
                    else:
                        domain = unshorten_url(
                            found_urls[0], URL_SHORTENERS,
                            self.params['print_url_warnings'])
                    self._domains[found_urls[0]] = domain
                    tok._.transformed_text = domain
                elif self._urls == 'title':
                    domain = unshorten_url(found_urls[0], URL_SHORTENERS)
                    if domain != 'twitter':
                        title = get_url_title(
                            found_urls[0], self.params['print_url_warnings'])
                        title = self.tokenize(URLS_RE.sub('', title))
                    else:
                        title = ''
                    tok._.transformed_text = title
                    self._domains[found_urls[0]] = title

    def _replace_token(self, __, doc, i, matches):
        # Replace tokens with something else
        match_id, start, end = matches[i]
        span = doc[start:end]
        replacement_token = self._replacements[doc.vocab.strings[match_id]]
        for tok in span:
            tok._.transformed_text = replacement_token

    @staticmethod
    def _remove_token(__, doc, i, matches):
        # Remove tokens
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            tok._.transformed_text = ''

    def _retokenize(self, __, doc, i, matches):
        # Retokenize
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            text = tok.text
            text = re.sub(r'([#@])', r' \1', text)
            text = re.sub(r'\s{2,}', ' ', text).strip()
            tok._.transformed_text = self.tokenize(text)

    def _infer_spaces(self, text):
        # Infer location of spaces in hashtags
        text = text.lower()
        text = re.sub(r'[^\w\s]', '', text)

        def best_match(i):
            # Find the best match for the first i characters
            # assuming costs has been built for the first (i-1) characters
            candidates = enumerate(reversed(cost[max(0, i - self._maxword):i]))
            return min(
                (c + self._wordcost.get(text[i - k - 1:i], 9e999), k + 1)
                for k, c in candidates)

        cost = [0]
        for i in range(1, len(text) + 1):
            cur_cost, k = best_match(i)
            cost.append(cur_cost)

        out = []
        i = len(text)
        while i > 0:
            cur_cost, k = best_match(i)
            assert cur_cost == cost[i]
            out.append(text[i - k:i])
            i -= k

        return list(reversed(out))

    def _handles_postprocess(self, __, doc, i, matches):
        # Process twitter handles
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            if self.params['twitter_handles'] == 'realname':
                if tok.text in self._realnames:
                    tok._.transformed_text = self._realnames[tok.text]
                else:
                    handle = get_twitter_realname(tok.text)
                    realname = self.tokenize(TWITTER_HANDLES_RE.sub(
                        '', handle))
                    tok._.transformed_text = realname
                    self._realnames[tok.text] = realname
            elif self.params['twitter_handles'] == 'split':
                poss = self._infer_spaces(tok._.transformed_text[1:])
                if poss:
                    tok._.transformed_text = poss
            else:
                tok._.transformed_text = self.params['twitter_handles']

    def _hashtag_postprocess(self, __, doc, i, matches):
        # Process hashtags
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            if self.params['hashtags'] == 'split':
                poss = self._infer_spaces(tok._.transformed_text[1:])
                if poss:
                    tok._.transformed_text = poss
            else:
                tok._.transformed_text = self.params['hashtags']

    @staticmethod
    def _decontract(text):
        # Expand contractions
        for contraction, decontraction in DECONTRACTIONS.items():
            text = re.sub(contraction, decontraction, text)
        return text

    def _preprocess_text(self, text):
        # Do some preprocessing
        text = re.sub("’", "'", text)
        if self.params['remove_nonunicode']:
            try:
                text = text.encode('utf-8').decode('unicode-escape')
                text = ''.join(filter(lambda x: x in string.printable,
                                      text)).strip()
            except UnicodeDecodeError:
                warnings.warn(
                    'UnicodeDecodeError while trying to remove non-unicode characters'
                )
        if self.params['decontract']:
            text = self._decontract(text)
        text = html.unescape(text)

        if self.params['latin_chars_fix']:
            if EMOJIS_UTF_RE.findall(text):
                text = EMOJIS_UTF_NOSPACE_RE.sub(r' \1', text)
                for utf_code, emoji in EMOJIS_UTF.items():
                    text = EMOJIS_UTF_PATS[utf_code].sub(emoji, text)

            if EMOJIS_UNICODE_RE.findall(text):
                text = EMOJIS_UNICODE_NOSPACE_RE.sub(r'\1 \2', text)
                for utf_code, emoji in EMOJIS_UNICODE.items():
                    text = EMOJIS_UNICODE_PATS[utf_code].sub(emoji, text)

            if LATIN_CHARS_RE.findall(text):
                for _hex, _char in LATIN_CHARS.items():
                    text = LATIN_CHARS_PATS[_hex].sub(_char, text)

        if self.params['ignore_reddit_quotes']:
            text = REDDIT_QUOTES_RE.sub(' ', text)  # sub(replacement, string): strip Reddit quotes

        text = text.replace('.@', '. @')
        text = re.sub(r'([*;,!?\(\)\[\]])', r' \1', text)
        text = re.sub(r'\s{2,}', ' ', text)

        return text.strip()

    def _merge_doc(self, doc):
        # Perform merging for certain types of tokens
        matches = self._merging_matcher(doc)
        spans = []
        for __, start, end in matches:
            spans.append(doc[start:end])
        for span in spans:
            span.merge()
        for tok in doc:
            tok._.transformed_text = tok.text

        return doc

    def _match_doc(self, doc):
        # Perform all additional processing
        self._matcher(doc)
        return doc

    def _postproc_doc(self, doc):
        # Perform postprocessing
        doc._.tokens = []
        for tok in doc:
            if isinstance(tok._.transformed_text, list):
                doc._.tokens.extend(tok._.transformed_text)
            elif tok._.transformed_text.strip() != '':
                if self.params['whitespaces_to_underscores']:
                    tok._.transformed_text = "_".join(
                        tok._.transformed_text.split())
                doc._.tokens.append(tok._.transformed_text.strip())
        return doc

    def tokenize(self, text):
        """
        Tokenize document

        Parameters
        ----------
        text : str
            Document to tokenize

        Returns
        -------
        list
            List of tokens

        Examples
        --------
        >>> from redditscore.tokenizer import CrazyTokenizer
        >>> tokenizer = CrazyTokenizer(hashtags='split')
        >>> tokenizer.tokenize("#makeamericagreatagain")
        ["make", "america", "great", "again"]
        """
        if not isinstance(text, str):
            warnings.warn('Document {} is not a string'.format(text))
            return []
        text = self._preprocess_text(text)
        doc = self._nlp(text)
        tokens = doc._.tokens
        if self.params['ngrams'] > 1:
            if self.params['whitespaces_to_underscores']:
                tokens = word_ngrams(tokens, (1, self.params['ngrams']),
                                     separator='_')
            else:
                tokens = word_ngrams(tokens, (1, self.params['ngrams']))
        return tokens
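A brief usage sketch for the class above (the constructor options and the sample string are illustrative assumptions based on the docstring, not taken from the source):

from redditscore.tokenizer import CrazyTokenizer  # assumed package path, as in the docstring example

# options picked purely for illustration; see the class docstring for the full list
tokenizer = CrazyTokenizer(lowercase=True, urls='domain', numbers='NUMBER_TOKEN')
print(tokenizer.tokenize('Check out http://cnn.com, it is 10 times better!'))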
Example #35
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

doc = nlp(
    "i downloaded Fortnite on my laptop and can't open the game at all. Help? "
    "so when I was downloading Minecraft, I got the Windows version where it "
    "is the '.zip' folder and I used the default program to unpack it... do "
    "I also need to download Winzip?"
)

# Write a pattern that matches a form of "download" plus proper noun
pattern = [{"LEMMA": "download"}, {"POS": "PROPN"}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("DOWNLOAD_THINGS_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)
Example #36
class PatternMatcher:
    def __init__(self):
        self.count = {
            "0": 0,
            "1": 0,
            "2": 0,
            "3": 0,
            "4": 0,
            "5": 0,
            "6": 0,
            "7": 0,
            "8": 0,
            "9": 0,
            "10": 0
        }
        self.compa_sent_count = 0

        self.nlp = spacy.load('en')
        self.matcher = Matcher(self.nlp.vocab)
        # self.matcher.add(0,
        #             None,
        #             [{'ORTH': 'JJR'}, {'ORTH': 'CIN'}, {'ORTH': 'TECH'}],
        #             [{'ORTH': 'JJR'}, {}, {'ORTH': 'CIN'}, {'ORTH': 'TECH'}],
        #             [{'ORTH': 'JJR'}, {'ORTH': 'CIN'}, {}, {'ORTH': 'TECH'}],
        #             [{'ORTH': 'JJR'}, {}, {'ORTH': 'CIN'}, {}, {'ORTH': 'TECH'}])
        # self.matcher.add(1,
        #             None,
        #             [{'ORTH': 'RB'}, {'ORTH': 'JJ'}, {'ORTH': 'CIN'}, {'ORTH': 'TECH'}],
        #             [{'ORTH': 'RB'}, {'ORTH': 'JJ'}, {}, {'ORTH': 'CIN'}, {'ORTH': 'TECH'}])
        # self.matcher.add(8,
        #             None,
        #             [{'ORTH': 'RBR'}, {'ORTH': 'JJ'}, {'ORTH': 'CIN'}, {'ORTH': 'TECH'}],
        #             [{'ORTH': 'RBR'}, {'ORTH': 'JJ'}, {}, {'ORTH': 'CIN'}, {'ORTH': 'TECH'}])
        # self.matcher.add(2,
        #             None,
        #             [{'ORTH': 'CV'}, {'ORTH': 'CIN'}, {'ORTH': 'TECH'}],
        #             [{'ORTH': 'CV'}, {}, {'ORTH': 'CIN'}, {'ORTH': 'TECH'}])
        # self.matcher.add(3,
        #             None,
        #             [{'ORTH': 'CV'}, {'ORTH': 'VBG'}, {'ORTH': 'TECH'}])
        # self.matcher.add(4,
        #             None,
        #             [{'ORTH': 'CV'}, {'ORTH': 'TECH'}])
        self.matcher.add(2, None, [{
            'ORTH': 'VB'
        }, {
            'ORTH': 'VBN'
        }, {
            'ORTH': 'TECH'
        }], [{
            'ORTH': 'VB'
        }, {
            'ORTH': 'VBN'
        }, {}, {
            'ORTH': 'TECH'
        }])
        # self.matcher.add(6,
        #             None,
        #             [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {'ORTH': 'JJS'}],
        #             [{'ORTH': 'TECH'}, {}, {'ORTH': 'VBZ'}, {'ORTH': 'JJS'}],
        #             [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {}, {'ORTH': 'JJS'}],
        #             [{'ORTH': 'TECH'}, {}, {'ORTH': 'VBZ'}, {}, {'ORTH': 'JJS'}])
        # self.matcher.add(10,
        #             None,
        #             [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {'ORTH': 'RBR'}],
        #             [{'ORTH': 'TECH'}, {}, {'ORTH': 'VBZ'}, {'ORTH': 'RBR'}],
        #             [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {}, {'ORTH': 'RBR'}],
        #             [{'ORTH': 'TECH'}, {}, {'ORTH': 'VBZ'}, {}, {'ORTH': 'RBR'}])
        self.matcher.add(0, None, [{
            'ORTH': 'TECH'
        }, {
            'ORTH': 'VBZ'
        }, {
            'ORTH': 'JJR'
        }], [{
            'ORTH': 'TECH'
        }, {}, {
            'ORTH': 'VBZ'
        }, {
            'ORTH': 'JJR'
        }], [{
            'ORTH': 'TECH'
        }, {
            'ORTH': 'VBZ'
        }, {}, {
            'ORTH': 'JJR'
        }], [{
            'ORTH': 'TECH'
        }, {}, {
            'ORTH': 'VBZ'
        }, {}, {
            'ORTH': 'JJR'
        }])
        self.matcher.add(1, None, [{
            'ORTH': 'TECH'
        }, {
            'ORTH': 'VBZ'
        }, {
            'ORTH': 'JJ'
        }], [{
            'ORTH': 'TECH'
        }, {}, {
            'ORTH': 'VBZ'
        }, {
            'ORTH': 'JJ'
        }], [{
            'ORTH': 'TECH'
        }, {
            'ORTH': 'VBZ'
        }, {}, {
            'ORTH': 'JJ'
        }], [{
            'ORTH': 'TECH'
        }, {}, {
            'ORTH': 'VBZ'
        }, {}, {
            'ORTH': 'JJ'
        }])
        # self.matcher.add(9,
        #             None,
        #             [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {'ORTH': 'RBS'}],
        #             [{'ORTH': 'TECH'}, {}, {'ORTH': 'VBZ'}, {'ORTH': 'RBS'}],
        #             [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {}, {'ORTH': 'RBS'}],
        #             [{'ORTH': 'TECH'}, {}, {'ORTH': 'VBZ'}, {}, {'ORTH': 'RBS'}])

    def add_pos_tag(self, words, tech_pair):
        tagged_words = CoreNLPParser(url='http://localhost:9000',
                                     tagtype='pos').tag(words)
        # print tagged_words
        tag_list = []
        for (word, tag) in tagged_words:
            if word in tech_pair.split("\t"):
                tag_list.append("TECH")
            else:
                tag_list.append(tag)
        return tag_list

    def match_pattern(self, pre, words, post, current_id, tech_pair):
        tag_list = self.add_pos_tag(words, tech_pair)
        patterns = self.matcher(self.nlp(u'{}'.format(" ".join(tag_list))))
        if patterns != []:
            self.compa_sent_count += 1
            print("yes")
            out_file = open(
                os.path.join(os.pardir, "outnew", "pattern", "sentences.txt"),
                "a")
            out_file.write("{}\n".format(current_id))
            out_file.write("{}\n".format(current_id))
            out_file.write("{}\nPattern(s): ".format(tech_pair))
            out_file.write(" ".join(words))
            out_file.write("\n")
            out_file.close()
            data = open(
                os.path.join(os.pardir, "outnew", "pattern", "output.txt"),
                "a")
            data.write("{}\n".format(current_id))
            data.write("{}\nPattern(s): ".format(tech_pair))
            for pattern in patterns:
                self.count[str(pattern[0])] += 1
                data.write(str(pattern[0]) + "\t")
                # data_file = open(os.path.join(os.pardir, "out", "tech_v2", "{}.txt".format(pattern[0])), "a")
            data.write("\n")
            data.write(" ".join(words))
            data.write("\n\n\n")
            data.close()
Example #37
matcher = Matcher(nlp.vocab)
text = Active  # assumed: 'Active' is a string variable, defined elsewhere, holding the text to analyze
doc = nlp(text)
sents = list(doc.sents)
print("Number of Sentences = ", len(sents))
for sent in doc.sents:
    print(sent)
    for i in sent:
        # print(token.dep_,token.tag_, end = " ")

        print(
            f' {i.text:{10}} {i.pos_:{8}} {i.tag_:{6}} {i.dep_:{10}} {spacy.explain(i.tag_)}'
        )
    print(" ")
    print(
        "-----------------------------------------------------------------------------------------"
    )
    print(" ")
passive_rule = [{
    'DEP': 'nsubjpass'
}, {
    'DEP': 'aux',
    'OP': '*'
}, {
    'DEP': 'auxpass'
}, {
    'TAG': 'VBN'
}]
matcher.add('Passive', None, passive_rule)
matches = matcher(doc)
print(len(matches))
Example #38
class hearst_patterns(object):
    
    """ Hearst Patterns is a class object used to detects hypernym relations to hyponyms in a text
    
    input: raw text
    returns: list of dict object with each entry all the hypernym-hyponym pairs of a text
    entry format: ["predicate" : [(hyponym, hypernym), (hyponym, hypernym), ..]]
    
    """
    
    import spacy
    
    def __init__(self, nlp, extended=False, predicatematch = "basic"):
        
       
#     Included in each entry is the original regex pattern now adapted as a spaCy matcher pattern.
#     Many of these patterns share the same format; the next iteration of the code should
#     include an automatic pattern generator for them.
            
#     These patterns need checking and cleaning up for testing.
            
#     Format for the dict entry of each pattern
#     {
#      "label" : predicate, 
#      "pattern" : spaCy pattern, 
#      "posn" : first/last depending on whether the hypernym appears before its hyponym
#     }
      
        # make the patterns easier to read
        # as lexical understanding develops, consider adding attributes to distinguish between hypernyms and hyponyms
        self.nlp = nlp
        
        options = ["bronze", "silver", "gold"]
        if predicatematch not in options:
            entry = ""
            while entry not in ["1", "2", "3"]: 
                entry = input(f"1. {options[0]}, 2. {options[1]}, 3. {options[2]}")
            self.predicatematch = options[int(entry) -1]
        else:
            self.predicatematch = predicatematch
        
        hypernym = {"POS" : {"IN": ["NOUN", "PROPN"]}} 
        hyponym = {"POS" : {"IN": ["NOUN", "PROPN"]}}
        punct = {"IS_PUNCT": True, "OP": "?"}

        self.patterns = [

        {"label" : "such_as", "pattern" : [
#                 '(NP_\\w+ (, )?such as (NP_\\w+ ?(, )?(and |or )?)+)',
#                 'first'
             hypernym, punct, {"LEMMA": "such"}, {"LEMMA": "as"}, hyponym
        ], "posn" : "first"},

        {"label" : "know_as", "pattern" : [
#                 '(NP_\\w+ (, )?know as (NP_\\w+ ?(, )?(and |or )?)+)', # added for this experiment
#                 'first'
             hypernym, punct, {"LEMMA": "know"}, {"LEMMA": "as"}, hyponym
        ], "posn" : "first"},

        {"label" : "such", "pattern" : [
#                 '(such NP_\\w+ (, )?as (NP_\\w+ ?(, )?(and |or )?)+)',
#                 'first'
             {"LEMMA": "such"}, hypernym, punct, {"LEMMA": "as"}, hyponym
        ], "posn" : "first"},

        {"label" : "include", "pattern" : [
#                 '(NP_\\w+ (, )?include (NP_\\w+ ?(, )?(and |or )?)+)',
#                 'first'
             hypernym, punct, {"LEMMA" : "include"}, hyponym
        ], "posn" : "first"},

        {"label" : "especially", "pattern" : [ ## problem - especially is merged as a modifier in to a noun phrase
#                 '(NP_\\w+ (, )?especially (NP_\\w+ ?(, )?(and |or )?)+)',
#                 'first'
             hypernym, punct, {"LEMMA" : "especially"}, hyponym
        ], "posn" : "first"},

        {"label" : "other", "pattern" : [
#             problem: the noun_chunk 'others' clashes with this rule, creating a zero-length chunk when the predicate is removed
#                 '((NP_\\w+ ?(, )?)+(and |or )?other NP_\\w+)',
#                 'last'
             hyponym, punct, {"LEMMA" : {"IN" : ["and", "or"]}}, {"LEMMA" : "other"}, hypernym
#             e.g. "Bruises, lacerations, or other injuries were not prevalent."
        ], "posn" : "last"},

        ]

        if extended:
            self.patterns.extend([

            {"label" : "which_may_include", "pattern" : [
#                     '(NP_\\w+ (, )?which may include (NP_\\w+ '
#                     '?(, )?(and |or )?)+)',
#                     'first'
                hypernym, punct, {"LEMMA" : "which"}, {"LEMMA" : "may"}, {"LEMMA" : "include"}, hyponym
            ], "posn" : "first"},

            {"label" : "which_be_similar_to", "pattern" : [
#                     '(NP_\\w+ (, )?which be similar to (NP_\\w+ ? '
#                     '(, )?(and |or )?)+)',
#                     'first'
                hypernym, punct, {"LEMMA" : "which"}, {"LEMMA" : "be"}, {"LEMMA" : "similar"}, {"LEMMA" : "to"}, hyponym
            ], "posn" : "first"},

            {"label" : "example_of_this_be", "pattern" : [
#                     '(NP_\\w+ (, )?example of this be (NP_\\w+ ? '
#                     '(, )?(and |or )?)+)',
#                     'first'
                hypernym, punct, {"LEMMA" : "example"}, {"LEMMA" : "of"}, {"LEMMA" : "this"}, {"LEMMA" : "be"}, hyponym
            ], "posn" : "first"},

            {"label" : ",type", "pattern" : [
#                     '(NP_\\w+ (, )?type (NP_\\w+ ? (, )?(and |or )?)+)',
#                     'first'
                hypernym, punct, {"LEMMA" : "type"}, punct, hyponym
            ], "posn" : "first"},

            {"label" : "mainly", "pattern" : [
#                     '(NP_\\w+ (, )?mainly (NP_\\w+ ? (, )?(and |or )?)+)',
#                     'first'
                hypernym, punct, {"LEMMA" : "mainly"}, hyponym
            ], "posn" : "first"},

            {"label" : "mostly", "pattern" : [
#                     '(NP_\\w+ (, )?mostly (NP_\\w+ ? (, )?(and |or )?)+)',
#                     'first'
                hypernym, punct, {"LEMMA" : "mostly"}, hyponym
            ], "posn" : "first"},

            {"label" : "notably", "pattern" : [
#                     '(NP_\\w+ (, )?notably (NP_\\w+ ? (, )?(and |or )?)+)',
#                     'first'
                hypernym, punct, {"LEMMA" : "notably"}, hyponym
            ], "posn" : "first"},

            {"label" : "particularly", "pattern" : [
#                     '(NP_\\w+ (, )?particularly (NP_\\w+ ? '
#                     '(, )?(and |or )?)+)',
#                     'first'
                hypernym, punct, {"LEMMA" : "particularly"}, hyponym
            ], "posn" : "first"},

            {"label" : "principally", "pattern" : [
#                     '(NP_\\w+ (, )?principally (NP_\\w+ ? (, )?(and |or )?)+)', - fuses in a noun phrase
#                     'first'
                hypernym, punct, {"LEMMA" : "principally"}, hyponym
            ], "posn" : "first"},

            {"label" : "in_particular", "pattern" : [
#                     '(NP_\\w+ (, )?in particular (NP_\\w+ ? '
#                     '(, )?(and |or )?)+)',
#                     'first'
                hypernym, punct, {"LEMMA" : "in"}, {"LEMMA" : "particular"}, hyponym
            ], "posn" : "first"},

            {"label" : "except", "pattern" : [
#                     '(NP_\\w+ (, )?except (NP_\\w+ ? (, )?(and |or )?)+)',
#                     'first'
                hypernym, punct, {"LEMMA" : "except"}, hyponym
            ], "posn" : "first"},

            {"label" : "other_than", "pattern" : [
#                     '(NP_\\w+ (, )?other than (NP_\\w+ ? (, )?(and |or )?)+)',
#                     'first'
                hypernym, punct, {"LEMMA" : "other"}, {"LEMMA" : "than"}, hyponym
            ], "posn" : "first"},

            {"label" : "eg", "pattern" : [
#                     '(NP_\\w+ (, )?e.g. (, )?(NP_\\w+ ? (, )?(and |or )?)+)',
#                     'first'
                hypernym, punct, {"LEMMA" : {"IN" : ["e.g.", "eg"]}}, hyponym 
            ], "posn" : "first"},

#                 {"label" : "eg-ie", "pattern" : [ 
# #                     '(NP_\\w+ \\( (e.g.|i.e.) (, )?(NP_\\w+ ? (, )?(and |or )?)+' - need to understand this pattern better
# #                     '(\\. )?\\))',
# #                     'first'
#                     hypernym, punct, {"LEMMA" : {IN : ["e.g.", "i.e.", "eg", "ie"]}}, {"LEMMA" : "than"}, hyponym
#                 ]},

            {"label" : "ie", "pattern" : [
#                     '(NP_\\w+ (, )?i.e. (, )?(NP_\\w+ ? (, )?(and |or )?)+)',
#                     'first'
                hypernym, punct, {"LEMMA" : {"IN" : ["i.e.", "ie"]}}, hyponym 
            ], "posn" : "first"},

            {"label" : "for_example", "pattern" : [
#                     '(NP_\\w+ (, )?for example (, )?'
#                     '(NP_\\w+ ?(, )?(and |or )?)+)',
#                     'first'
                hypernym, punct, {"LEMMA" : "for"}, {"LEMMA" : "example"}, punct, hyponym
            ], "posn" : "first"},

            {"label" : "example_of_be", "pattern" : [
#                     'example of (NP_\\w+ (, )?be (NP_\\w+ ? '
#                     '(, )?(and |or )?)+)',
#                     'first'
                {"LEMMA" : "example"}, {"LEMMA" : "of"}, hypernym, punct, {"LEMMA" : "be"}, hyponym
            ], "posn" : "first"},

            {"label" : "like", "pattern" : [
#                     '(NP_\\w+ (, )?like (NP_\\w+ ? (, )?(and |or )?)+)',
#                     'first'
                hypernym, punct, {"LEMMA" : "like"}, hyponym,
            ], "posn" : "first"},

            # repeat of such_as pattern in primary patterns???
#                     'such (NP_\\w+ (, )?as (NP_\\w+ ? (, )?(and |or )?)+)',
#                     'first'

                {"label" : "whether", "pattern" : [
#                     '(NP_\\w+ (, )?whether (NP_\\w+ ? (, )?(and |or )?)+)',
#                     'first'
                hypernym, punct, {"LEMMA" : "whether"}, hyponym
            ], "posn" : "first"},

            {"label" : "compare_to", "pattern" : [
#                     '(NP_\\w+ (, )?compare to (NP_\\w+ ? (, )?(and |or )?)+)',
#                     'first'
                hypernym, punct, {"LEMMA" : "compare"}, {"LEMMA" : "to"}, hyponym 
            ], "posn" : "first"},

            {"label" : "among_-PRON-", "pattern" : [
#                     '(NP_\\w+ (, )?among -PRON- (NP_\\w+ ? '
#                     '(, )?(and |or )?)+)',
#                     'first'
                hypernym, punct, {"LEMMA" : "among"}, {"LEMMA" : "-PRON-"}, hyponym
            ], "posn" : "first"},

            {"label" : "for_instance", "pattern" : [
#                     '(NP_\\w+ (, )? (NP_\\w+ ? (, )?(and |or )?)+ '
#                     'for instance)',
#                     'first'
                hypernym, punct, hyponym, {"LEMMA" : "for"}, {"LEMMA" : "instance"}
            ], "posn" : "first"},

            {"label" : "and-or_any_other", "pattern" : [
#                     '((NP_\\w+ ?(, )?)+(and |or )?any other NP_\\w+)',
#                     'last'
                hyponym, punct, {"DEP": "cc"}, {"LEMMA" : "any"}, {"LEMMA" : "other"}, hypernym,
            ], "posn" : "last"},

            {"label" : "some_other", "pattern" : [
#                     '((NP_\\w+ ?(, )?)+(and |or )?some other NP_\\w+)',
#                     'last'
                hyponym, punct, {"DEP": "cc", "OP" : "?"}, {"LEMMA" : "some"}, {"LEMMA" : "other"}, hypernym,
            ], "posn" : "last"},

            {"label" : "be_a", "pattern" : [
#                     '((NP_\\w+ ?(, )?)+(and |or )?be a NP_\\w+)',
#                     'last'
                hyponym, punct, {"LEMMA" : "be"}, {"LEMMA" : "a"}, hypernym,
            ], "posn" : "last"},

            {"label" : "like_other", "pattern" : [
#                     '((NP_\\w+ ?(, )?)+(and |or )?like other NP_\\w+)',
#                     'last'
                hyponym, punct, {"LEMMA" : "like"}, {"LEMMA" : "other"}, hypernym,
            ], "posn" : "last"},

             {"label" : "one_of_the", "pattern" : [
#                     '((NP_\\w+ ?(, )?)+(and |or )?one of the NP_\\w+)',
#                     'last'
                hyponym, punct, {"LEMMA" : "one"}, {"LEMMA" : "of"}, {"LEMMA" : "the"}, hypernym,
            ], "posn" : "last"},

            {"label" : "one_of_these", "pattern" : [
#                     '((NP_\\w+ ?(, )?)+(and |or )?one of these NP_\\w+)',
#                     'last'
            hyponym, punct, {"LEMMA" : "one"}, {"LEMMA" : "of"}, {"LEMMA" : "these"}, hypernym,
            ], "posn" : "last"},

            {"label" : "one_of_those", "pattern" : [
#                     '((NP_\\w+ ?(, )?)+(and |or )?one of those NP_\\w+)',
#                     'last'
            hyponym, punct, {"DEP": "cc", "OP" : "?"}, {"LEMMA" : "one"}, {"LEMMA" : "of"}, {"LEMMA" : "those"}, hypernym,
            ], "posn" : "last"},

            {"label" : "be_example_of", "pattern" : [
#                     '((NP_\\w+ ?(, )?)+(and |or )?be example of NP_\\w+)', added optional "an" to spaCy pattern for singular vs. plural
#                     'last'
                hyponym, punct, {"LEMMA" : "be"}, {"LEMMA" : "an", "OP" : "?"}, {"LEMMA" : "example"}, {"LEMMA" : "of"}, hypernym
            ], "posn" : "last"},

            {"label" : "which_be_call", "pattern" : [
#                     '((NP_\\w+ ?(, )?)+(and |or )?which be call NP_\\w+)',
#                     'last'
                hyponym, punct, {"LEMMA" : "which"}, {"LEMMA" : "be"}, {"LEMMA" : "call"}, hypernym
            ], "posn" : "last"},
#               
            {"label" : "which_be_name", "pattern" : [
#                     '((NP_\\w+ ?(, )?)+(and |or )?which be name NP_\\w+)',
#                     'last'
                hyponym, punct, {"LEMMA" : "which"}, {"LEMMA" : "be"}, {"LEMMA" : "name"}, hypernym
            ], "posn" : "last"},

            {"label" : "a_kind_of", "pattern" : [
#                     '((NP_\\w+ ?(, )?)+(and|or)? a kind of NP_\\w+)',
#                     'last'
                hyponym, punct, {"LEMMA" : "a"}, {"LEMMA" : "kind"}, {"LEMMA" : "of"}, hypernym
            ], "posn" : "last"},

#                     '((NP_\\w+ ?(, )?)+(and|or)? kind of NP_\\w+)', - combined with above
#                     'last'

            {"label" : "form_of", "pattern" : [
#                     '((NP_\\w+ ?(, )?)+(and|or)? form of NP_\\w+)',
#                     'last'
                hyponym, punct, {"LEMMA" : "a", "OP" : "?"}, {"LEMMA" : "form"}, {"LEMMA" : "of"}, hypernym
            ], "posn" : "last"},

            {"label" : "which_look_like", "pattern" : [
#                     '((NP_\\w+ ?(, )?)+(and |or )?which look like NP_\\w+)',
#                     'last'
                hyponym, punct, {"LEMMA" : "which"}, {"LEMMA" : "look"}, {"LEMMA" : "like"}, hyponym
            ], "posn" : "last"},

            {"label" : "which_sound_like", "pattern" : [
#                     '((NP_\\w+ ?(, )?)+(and |or )?which sound like NP_\\w+)',
#                     'last'
                hyponym, punct, {"LEMMA" : "which"}, {"LEMMA" : "sound"}, {"LEMMA" : "like"}, hypernym
            ], "posn" : "last"},

            {"label" : "type", "pattern" : [
#                     '((NP_\\w+ ?(, )?)+(and |or )? NP_\\w+ type)',
#                     'last'
                hyponym, punct, {"LEMMA" : "type"}, hypernym
            ], "posn" : "last"},

            {"label" : "compare_with", "pattern" : [
#                     '(compare (NP_\\w+ ?(, )?)+(and |or )?with NP_\\w+)',
#                     'last'
                {"LEMMA" : "compare"}, hyponym, punct, {"LEMMA" : "with"}, hypernym
            ], "posn" : "last"},

#             {"label" : "as", "pattern" : [
# #                     '((NP_\\w+ ?(, )?)+(and |or )?as NP_\\w+)',
# #                     'last'
#                 hyponym, punct, {"LEMMA" : "as"}, hypernym
#             ], "posn" : "last"},

            {"label" : "sort_of", "pattern" : [
#                     '((NP_\\w+ ?(, )?)+(and|or)? sort of NP_\\w+)',
#                     'last'
                hyponym, punct, {"LEMMA" : "sort"}, {"LEMMA" : "of"}, hypernym
            ], "posn" : "last"},

            ])

        ## initiate matcher
        from spacy.matcher import Matcher
        self.matcher = Matcher(self.nlp.vocab, validate = True)
        
        # added "some" to original list
        self.predicate_list = [
            'able', 'available', 'brief', 'certain',
            'different', 'due', 'enough', 'especially', 'few', 'fifth',
            'former', 'his', 'howbeit', 'immediate', 'important', 'inc',
            'its', 'last', 'latter', 'least', 'less', 'likely', 'little',
            'many', 'ml', 'more', 'most', 'much', 'my', 'necessary',
            'new', 'next', 'non', 'old', 'other', 'our', 'ours', 'own',
            'particular', 'past', 'possible', 'present', 'proud', 'recent',
            'same', 'several', 'significant', 'similar', 'some', 'such', 'sup', 'sure'
        ]

        self.predicates = []
        self.first = []
        self.last = []

        # add patterns to matcher
        for pattern in self.patterns:
            self.matcher.add(pattern["label"], None, pattern["pattern"])

            # gather list of predicate terms for the noun_chunk deconfliction
            self.predicates.append(pattern["label"].split('_'))

            # gather list of predicates where the hypernym appears first
            if pattern["posn"] == "first":
                self.first.append(pattern["label"])

            # gather list of predicates where the hypernym appears last
            if pattern["posn"] == "last":
                self.last.append(pattern["label"])
                
    def isPredicateMatch_bronze(self, noun_chunk, predicates):

        """
        Bronze option to remove predicate phrases from noun_chunks using a predefined list of modifiers

        input: the chunk to be checked, list of predicate lemmas
        returns: the chunk with predicate phrases removed.

        """
        counter = 0
        while noun_chunk[counter].lemma_ in predicates:
            counter += 1

        # remove empty spans, e.g. the noun_chunk 'others' becomes a zero-length span
        if len(noun_chunk[counter:]) == 0:
            counter = 0

        return noun_chunk[counter:]
    
    def isPredicateMatch_silver(self, noun_chunk):
        
        """
        Silver option to remove predicate phrases from noun_chunks using stop word list

        input: the chunk to be checked, list of predicate phrases
        returns: the chnunk with predicate phrases removed.

        """
        counter = 0
        
        while not noun_chunk[0].is_stop and noun_chunk[counter].is_stop:
            counter += 1
                
#         #remove empty spans, eg the noun_chunk 'others' becomes a zero length span
#         if len(chunk[counter:]) == 0:
#             counter = 0
        #print(noun_chunk, "becomes: ", noun_chunk[counter:])        
        return noun_chunk[counter:]

    def isPredicateMatch_gold(self, noun_chunk, predicates):
        
        """
        Gold option to remove predicate phrases from noun_chunks using pattern labels.

        input: the chunk to be checked, list of predicate phrases
        returns: the chnunk with predicate phrases removed.

        """

        def match(empty, count, noun_chunk, predicates):
            # empty: check whether the predicates list is empty
            # count < len(predicates[0]): checks whether count has reached the final token of the predicate
            # noun_chunk[count].lemma_ == predicates[0][count]: check whether the chunk token equals the predicate token

            
            while not empty and count < len(predicates[0]) and noun_chunk[count].lemma_ == predicates[0][count]:
                count += 1
                
            #remove empty spans, eg the noun_chunk 'others' becomes a zero length span
            if len(noun_chunk[count:]) == 0:
                count = 0

            return empty, count
    
        def isMatch(noun_chunk, predicates):

            empty, counter = match(predicates == [], 0, noun_chunk, predicates)
            if empty or counter == len(predicates[0]):
                #print(chunk, "becomes: ", chunk[counter:])
                return noun_chunk[counter:]
            else:
                return isMatch(noun_chunk, predicates[1:])

        return isMatch(noun_chunk, predicates)
    
    
    def find_hyponyms(self, doc):
        
        """
        this is the main function of the class object
        
        follows logic of:
        1. checks whether text has been parsed
        2. pre-processing for noun_chunks
        3. generate matches
        4. create list of dict object containing match results
        """
        
        # if isinstance(text, spacy.tokens.doc.Doc):
        #     doc = text
        # else:
        #     doc = self.nlp(text) # initiate doc 
            
        
        ## Pre-processing
        # there are some predicate terms, such as "particularly", "especially" and "some other", which are
        # merged with the noun phrase. Such terms are part of the pattern and become part of the
        # merged noun chunk; consequently, they are not detected by the matcher.
        # This pre-processing therefore walks through the noun_chunks of a doc object to remove those
        # predicate terms from each noun_chunk and merges the result.
        
        with doc.retokenize() as retokenizer:

            for chunk in doc.noun_chunks:

                attrs = {"tag": chunk.root.tag, "dep": chunk.root.dep}

                if self.predicatematch == "bronze":
                    retokenizer.merge(self.isPredicateMatch_bronze(chunk, self.predicate_list), attrs = attrs)
                elif self.predicatematch == "silver":
                    retokenizer.merge(self.isPredicateMatch_silver(chunk), attrs = attrs)
                elif self.predicatematch == "gold":
                    retokenizer.merge(self.isPredicateMatch_gold(chunk, self.predicates), attrs = attrs)
    
        ## Main Body
        #Find matches in doc
        matches = self.matcher(doc)
        
        pairs = []  # set up list to hold the (hyponym, hypernym, predicate) pairs
        
        # If none are found then return None
        if not matches:
            return pairs

        for match_id, start, end in matches:
            predicate = self.nlp.vocab.strings[match_id]
            
            # if the predicate is in the list where the hypernym is last, else hypernym is first
            if predicate in self.last: 
                hypernym = doc[end - 1]
                hyponym = doc[start]
            else:
                # an inelegant way to deal with the "such_NOUN_as" pattern, since the first token is not the hypernym
                if doc[start].lemma_ == "such":
                    start += 1
                hypernym = doc[start]
                hyponym = doc[end - 1]

            # create a list of dictionary objects with the format:
            # {
            # "predicate" : " predicate term based from pattern name,
            # "pairs" : [(hypernym, hyponym)] + [hyponym conjuncts (tokens linked by and | or)]
            # "sent" : sentence in which the pairs originate
            # }
            
#             pairs.append(dict({"predicate" : predicate, 
#                                "pairs" : [(hypernym, hyponym)] + [(hypernym, token) for token in hyponym.conjuncts if token != hypernym],
#                                "sent" : (hyponym.sent.text).strip()}))

            pairs.append((hyponym.lemma_, hypernym.lemma_, predicate))
            for token in hyponym.conjuncts:
                if token != hypernym and token is not None:
                    pairs.append((token.lemma_, hypernym.lemma_, predicate))

        return pairs
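# A minimal usage sketch for the hearst_patterns class above (assumptions: a
# spaCy 2.x install with en_core_web_sm available, since the class uses the
# 3-argument Matcher.add signature; the sentence below is just an example).
import spacy

nlp = spacy.load("en_core_web_sm")
hp = hearst_patterns(nlp, extended=True, predicatematch="gold")
doc = nlp("He likes to play team sports such as football and hockey.")
# find_hyponyms returns a list of (hyponym, hypernym, predicate) lemma tuples
print(hp.find_hyponyms(doc))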
예제 #39
0
class AbbreviationDetector:
    """
    Detects abbreviations using the algorithm in "A simple algorithm for identifying
    abbreviation definitions in biomedical text.", (Schwartz & Hearst, 2003).

    This class sets the `._.abbreviations` attribute on spaCy Doc.

    The abbreviations attribute is a `List[Span]` where each Span has the `Span._.long_form`
    attribute set to the long form definition of the abbreviation.

    Note that this class does not replace the spans, or merge them.
    """

    def __init__(self, nlp) -> None:
        Doc.set_extension("abbreviations", default=[], force=True)
        Span.set_extension("long_form", default=None, force=True)

        self.matcher = Matcher(nlp.vocab)
        self.matcher.add(
            "parenthesis", None, [{"ORTH": "("}, {"OP": "+"}, {"ORTH": ")"}]
        )
        abbreviation_regex = r'(([A-Z0-9-]+){2,})|(([A-Z0-9-]\.){2,})'
        acronym_rule = [{'TAG': 'NNP', 'TEXT': {'REGEX': abbreviation_regex}}]
        self.matcher.add(
            "only_abbreviation", None, acronym_rule
        )
        self.global_matcher = Matcher(nlp.vocab)
        self.nlp = nlp

    def find(self, span: Span, doc: Doc) -> Tuple[Span, Set[Span]]:
        """
        Functional version of calling the matcher for a single span.
        This method is helpful if you already have an abbreviation which
        you want to find a definition for.
        """
        dummy_matches = [(-1, int(span.start), int(span.end))]
        filtered = filter_matches(dummy_matches, doc)
        abbreviations = self.find_matches_for(filtered, doc)

        if not abbreviations:
            return span, set()
        else:
            return abbreviations[0]

    def __call__(self, doc: Doc) -> Doc:
        matches = self.matcher(doc)
        matches_all_shorts = [x for x in matches if self.nlp.vocab.strings[x[0]] == 'only_abbreviation']
        matches_no_brackets = [(x[0], x[1] + 1, x[2] - 1) for x in matches if self.nlp.vocab.strings[x[0]] == 'parenthesis']
        all_shorts = [doc[m[1]:m[2]] for m in matches_all_shorts]
        filtered = filter_matches(matches_no_brackets, doc)
        occurences = self.find_matches_for(filtered, doc)

        # Modified here to detect all abbreviations, with or without a definition
        for (long_form, short_forms) in occurences:
            for short in short_forms:
                short._.long_form = long_form
                doc._.abbreviations.append(short)
        
        for short in all_shorts:
            if short not in doc._.abbreviations:
                doc._.abbreviations.append(short)

        return doc

    def find_matches_for(
        self, filtered: List[Tuple[Span, Span]], doc: Doc
    ) -> List[Tuple[Span, Set[Span]]]:
        rules = {}
        all_occurences: Dict[Span, Set[Span]] = defaultdict(set)
        already_seen_long: Set[str] = set()
        already_seen_short: Set[str] = set()
        for (long_candidate, short_candidate) in filtered:
            short, long = find_abbreviation(long_candidate, short_candidate)
            # We need the long and short form definitions to be unique, because we need
            # to store them so we can look them up later. This is a bit of a
            # pathological case also, as it would mean an abbreviation had been
            # defined twice in a document. There's not much we can do about this,
            # but at least the case which is discarded will be picked up below by
            # the global matcher. So it's likely that things will work out ok most of the time.
            new_long = long.string not in already_seen_long if long else False
            new_short = short.string not in already_seen_short
            if long is not None and new_long and new_short:
                already_seen_long.add(long.string)
                already_seen_short.add(short.string)
                all_occurences[long].add(short)
                rules[long.string] = long
                # Add a rule to a matcher to find exactly this substring.
                self.global_matcher.add(
                    long.string, None, [{"ORTH": x.text} for x in short]
                )
        to_remove = set()
        global_matches = self.global_matcher(doc)
        for match, start, end in global_matches:
            string_key = self.global_matcher.vocab.strings[match]
            to_remove.add(string_key)
            all_occurences[rules[string_key]].add(doc[start:end])
        for key in to_remove:
            # Clean up the global matcher.
            self.global_matcher.remove(key)

        return list((k, v) for k, v in all_occurences.items())
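# A minimal usage sketch for the AbbreviationDetector above (assumptions: a
# spaCy 2.x pipeline such as en_core_web_sm, and the filter_matches /
# find_abbreviation helpers from the surrounding module, as in scispacy).
import spacy

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe(AbbreviationDetector(nlp), last=True)

doc = nlp("Spinal and bulbar muscular atrophy (SBMA) is an inherited disease. SBMA is rare.")
for short in doc._.abbreviations:
    print(short.text, "->", short._.long_form)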
예제 #40
0
def test_matcher_valid_callback(en_vocab):
    """Test that on_match can only be None or callable."""
    matcher = Matcher(en_vocab)
    with pytest.raises(ValueError):
        matcher.add("TEST", [[{"TEXT": "test"}]], on_match=[])
    matcher(Doc(en_vocab, words=["test"]))
예제 #41
0
def test_matcher_no_zero_length(en_vocab):
    doc = Doc(en_vocab, words=["a", "b"], tags=["A", "B"])
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"TAG": "C", "OP": "?"}]])
    assert len(matcher(doc)) == 0
예제 #42
0
def test_matcher_basic_check(en_vocab):
    matcher = Matcher(en_vocab)
    # Potential mistake: pass in pattern instead of list of patterns
    pattern = [{"TEXT": "hello"}, {"TEXT": "world"}]
    with pytest.raises(ValueError):
        matcher.add("TEST", pattern)
예제 #43
0
def test_matcher_intersect_value_operator(en_vocab):
    matcher = Matcher(en_vocab)
    pattern = [{"MORPH": {"INTERSECTS": ["Feat=Val", "Feat2=Val2", "Feat3=Val3"]}}]
    matcher.add("M", [pattern])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    assert len(matcher(doc)) == 0
    doc[0].set_morph("Feat=Val")
    assert len(matcher(doc)) == 1
    doc[0].set_morph("Feat=Val|Feat2=Val2")
    assert len(matcher(doc)) == 1
    doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3")
    assert len(matcher(doc)) == 1
    doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4")
    assert len(matcher(doc)) == 1

    # INTERSECTS with a single value is the same as IN
    matcher = Matcher(en_vocab)
    pattern = [{"TAG": {"INTERSECTS": ["A", "B"]}}]
    matcher.add("M", [pattern])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    doc[0].tag_ = "A"
    assert len(matcher(doc)) == 1

    # INTERSECTS with an empty pattern list matches nothing
    matcher = Matcher(en_vocab)
    pattern = [{"TAG": {"INTERSECTS": []}}]
    matcher.add("M", [pattern])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    doc[0].tag_ = "A"
    assert len(matcher(doc)) == 0

    # INTERSECTS with a list value
    Token.set_extension("ext", default=[])
    matcher = Matcher(en_vocab)
    pattern = [{"_": {"ext": {"INTERSECTS": ["A", "C"]}}}]
    matcher.add("M", [pattern])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    doc[0]._.ext = ["A", "B"]
    assert len(matcher(doc)) == 1

    # INTERSECTS with an empty pattern list matches nothing
    matcher = Matcher(en_vocab)
    pattern = [{"_": {"ext": {"INTERSECTS": []}}}]
    matcher.add("M", [pattern])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    doc[0]._.ext = ["A", "B"]
    assert len(matcher(doc)) == 0

    # INTERSECTS with an empty value matches nothing
    matcher = Matcher(en_vocab)
    pattern = [{"_": {"ext": {"INTERSECTS": ["A", "B"]}}}]
    matcher.add("M", [pattern])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    doc[0]._.ext = []
    assert len(matcher(doc)) == 0
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("fr_core_news_sm")
matcher = Matcher(nlp.vocab)

doc = nlp(
    "L'appli se distingue par une interface magnifique, la recherche "
    "intelligente, la labellisation automatique et les réponses "
    "vocales fluides."
)

# Write a pattern for a noun followed by one or two adjectives
pattern = [{"POS": "NOUN"}, {"POS": "ADJ"}, {"POS": "ADJ", "OP": "?"}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("ADJ_NOUN_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)
def feature_extraction(df, ft_model, nlp):
    # Extracting all the single nouns in the corpus
    all_nouns = []

    for review in df['spacyObj']:
        for token in review:
            if token.pos_ == "NOUN":
                all_nouns.append(token.text)

    all_nouns = pd.Series(all_nouns)
    # Finding unique nouns along with their counts sorted in descending order
    unique_nouns = all_nouns.value_counts()

    noun_phrases = []

    # Pattern to match, i.e. two nouns occurring together
    patterns = [[{'TAG': 'NN'}, {'TAG': 'NN'}]]

    matcher = Matcher(nlp.vocab)
    matcher.add('NounPhrases', patterns)

    for review in df['spacyObj']:
        matches = matcher(review)

        for match_id, start, end in matches:
            noun_phrases.append(review[start:end].text)

    noun_phrases = pd.Series(noun_phrases)
    unique_noun_phrases = noun_phrases.value_counts()

    # Remove nouns with single or double character
    for noun in unique_nouns.index:
        # if noun length is less than 3 or if nouns contain any numbers, it is considered invalid
        if len(noun) < 3 or re.match(r".*[0-9].*", noun) is not None:
            del unique_nouns[noun]

    # Extracting Top Features

    top2 = len(unique_nouns) * 0.05  # considering top 5% of features
    top2 = int(top2)

    top_features = unique_nouns[0:top2]

    # this will contain all the final features
    features_bucket = OrderedDict()

    top_features_list = list(top_features.keys())
    top_features_set = set(top_features.keys())
    unique_noun_phrases_set = set(unique_noun_phrases.keys())

    # Applying association rule mining to group nouns occurring together
    for feature1 in top_features_list:
        for feature2 in top_features_list:
            feature_phrase = feature1 + ' ' + feature2

            if feature1 in top_features_set and feature2 in top_features_set and feature_phrase in unique_noun_phrases_set:
                # If the condition is true, we have identified a noun phrase which is a combination of two nouns
                # in the top_features. So one of the nouns can be eliminated from top features.

                # Ex. if "battery life" is found, then "life" can be eliminated from top features as it is not a feature
                # by itself. It is just part of the feature "battery life".

                # Now we need to find out if the frequency of the lesser occurring noun (in our ex., the word "life") matches
                # the frequency of the noun phrase (in our ex., "battery life") within a certain confidence.
                # If it does, then we can be sure that the lesser occurring noun occurs only in that particular noun_phrase,
                # i.e. in our ex. "life" occurs primarily in the phrase "battery life"

                lesser_occurring_noun = ""
                often_occurring_noun = ""
                if unique_nouns[feature1] < unique_nouns[feature2]:
                    lesser_occurring_noun = feature1
                    often_occurring_noun = feature2
                else:
                    lesser_occurring_noun = feature2
                    often_occurring_noun = feature1

                # assuming a confidence threshold of 40%
                # i.e. in the 'battery life' example, out of the total times that 'life' is seen, 'battery' is seen next to it at least 40% of the time.

                if unique_noun_phrases[feature_phrase] / unique_nouns[
                        lesser_occurring_noun] > 0.4:
                    try:
                        if often_occurring_noun not in features_bucket:
                            features_bucket[often_occurring_noun] = []
                        features_bucket[often_occurring_noun].append(
                            lesser_occurring_noun)
                        top_features_set.remove(lesser_occurring_noun)
                        # print(lesser_occurring_noun)
                    except BaseException as error:
                        print(error)
                        continue

    main_features = list(features_bucket.keys())
    top_features_to_add = set(top_features_list[:20])

    # here we are manually adding the top 20 nouns as features which were previously not
    # added by the association rule mining step above.
    # But before adding, we are checking if any similar nouns exist among the 20 nouns.
    # Ex. If 'display' and 'screen' occur in the top 20, we must add only the more commonly occurring
    # one of the two and remove the other.

    # Here we are only eliminating the nouns that are similar to existing ones in features_bucket.
    for feature1 in top_features_list[:20]:
        for feature2 in main_features:
            if feature1 not in features_bucket and feature1 in top_features_set:
                similarity = cosine_similarity(
                    ft_model.get_word_vector(feature1).reshape(1, -1),
                    ft_model.get_word_vector(feature2).reshape(1, -1))
                if similarity[0][0] > 0.64:
                    top_features_to_add.discard(feature1)

            else:
                top_features_to_add.discard(feature1)

    top_features_to_add_list = list(top_features_to_add)

    # Here we are eliminating nouns that are similar to one another in the top_features_to_add
    for feature1 in top_features_to_add_list:
        for feature2 in top_features_to_add_list:
            if feature1 in top_features_to_add and feature2 in top_features_to_add:
                similarity = cosine_similarity(
                    ft_model.get_word_vector(feature1).reshape(1, -1),
                    ft_model.get_word_vector(feature2).reshape(1, -1))
                if similarity[0][0] < 0.99 and similarity[0][0] > 0.64:
                    feature_to_remove = min(
                        (unique_nouns[feature1], feature1),
                        (unique_nouns[feature2], feature2))[1]
                    top_features_to_add.remove(feature_to_remove)

    for feature in top_features_to_add:
        features_bucket[feature] = []

    for main_noun in features_bucket.keys():
        top_features_set.remove(main_noun)

    # Here we are going through the top 5% of the nouns that we were originally considering and checking
    # if any of them are similar to the ones already present in features_bucket.
    top_features_copy = list(top_features_set)
    main_features = features_bucket.keys()

    for feature2 in top_features_copy:
        best_similarity = 0
        most_matching_main_feature = ""

        for feature1 in main_features:
            if feature2 in top_features_set:
                similarity = cosine_similarity(
                    ft_model.get_word_vector(feature1).reshape(1, -1),
                    ft_model.get_word_vector(feature2).reshape(1, -1))
                if similarity[0][0] <= 0.99 and similarity[0][0] > 0.62:
                    if similarity[0][0] > best_similarity:
                        best_similarity = similarity[0][0]
                        most_matching_main_feature = feature1

        if best_similarity != 0 and most_matching_main_feature != "":
            features_bucket[most_matching_main_feature].append(feature2)
            top_features_set.remove(feature2)

    # We finally sort the features in descending order based on how often they occur.
    final_features = list(features_bucket.items())

    final_features_with_counts = []
    for feature in final_features:
        count = unique_nouns[feature[0]]
        final_features_with_counts.append((feature, count))

    final_features_with_counts.sort(key=lambda x: x[1], reverse=True)

    final_features = OrderedDict()
    for feature, count in final_features_with_counts:
        final_features[feature[0]] = feature[1]

    return final_features
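# A minimal sketch of the 40% confidence rule used above, on hypothetical toy
# counts (the numbers below are made up for illustration): if "battery life"
# accounts for at least 40% of all occurrences of "life", then "life" is folded
# into the "battery" feature instead of being kept as a feature of its own.
import pandas as pd

unique_nouns_toy = pd.Series({"battery": 120, "life": 50, "screen": 80})
unique_noun_phrases_toy = pd.Series({"battery life": 30})

phrase = "battery life"
lesser = "life" if unique_nouns_toy["life"] < unique_nouns_toy["battery"] else "battery"
confidence = unique_noun_phrases_toy[phrase] / unique_nouns_toy[lesser]  # 30 / 50 = 0.6
if confidence > 0.4:
    print(f"'{lesser}' is treated as part of the feature phrase '{phrase}'")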
예제 #46
0
pattern = [{'POS':'NOUN'},
           {'LOWER':'such'},
           {'LOWER':'as'},
           {'POS':'PROPN'}]

# Extract the pattern from the text
# Create a Matcher object.  Pass in the vocab for the loaded spaCy model
# (the vocab stores the vocabulary and other data shared across a language)
matcher = Matcher(nlp.vocab)

# See https://stackoverflow.com/questions/66164156/problem-with-using-spacy-matcher-matcher-matcher-add-method
# Add the pattern to the Matcher object
# (matcher.add now only takes 2 positional arguments in spaCy 3 - a string ID
# for the pattern, and a list of patterns, which must be passed as a list even
# if there is only 1 pattern)
matcher.add("matching_1", [pattern])

# Apply the matcher to the SpaCy document
matches = matcher(doc)

# The matcher returns a list of three-element tuples, in which each tuple is :
# (match_id, start, end).  match_id is the hash value of the ID of the matcher
# ("matching_1" in this case).  start and end represent the token positions of
# where the identified match starts and ends (So the first token is 0, second
# token is 1 etc).  We'll get a tuple returned for every match identified.
# Here, we know we only have one match, so we can just refer to matches[0],
# which refers to the first (and only) tuple.  We can then specify the "span"
# (the matched text) using the start and end token positions stored in 
# the second and third elements of the tuple (so matches[0][1] and 
# matches[0][2] respectively)
span = doc[matches[0][1]:matches[0][2]]
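# A short follow-up sketch (assuming the nlp, doc, matcher and span objects
# above): the hash in each match_id can be turned back into the string ID it
# was registered under ("matching_1") via the vocab's StringStore.
print(span.text)
for match_id, start, end in matches:
    print(nlp.vocab.strings[match_id], doc[start:end].text)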
예제 #47
0
import spacy

nlp = spacy.load('en_core_web_sm')
from spacy.matcher import Matcher

###Token matching for rule based matching example###

matcher = Matcher(nlp.vocab)

pattern1 = [{'LOWER':'solarpower'}]
pattern2 = [{'LOWER':'solar'},{'IS_PUNCT':True},{'LOWER':'power'}]
pattern3 = [{'LOWER':'solar'},{'LOWER':'power'}]

matcher.add('SolarPower', None, pattern1,pattern2,pattern3)

doc = nlp(u"The Solar Power industry continues to grow a solarpower increases.  Solar-Power is amazing.")
found_matches=matcher(doc)

print(found_matches)
#Results:
#8656102463236116519 SolarPower 1 3 Solar Power
#8656102463236116519 SolarPower 8 9 solarpower
#8656102463236116519 SolarPower 12 15 Solar-Power

for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]
    span = doc[start:end]
    print(match_id, string_id, start, end, span.text)

#####phrase matcher#####
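# The snippet ends before the phrase matcher part; a minimal sketch of what a
# PhraseMatcher example might look like (the term list and label below are made
# up, and the 3-argument add() call matches the spaCy 2 style used above):
from spacy.matcher import PhraseMatcher

phrase_matcher = PhraseMatcher(nlp.vocab)
phrase_patterns = [nlp(text) for text in ['Solar Power', 'solarpower']]
phrase_matcher.add('SolarTerms', None, *phrase_patterns)

phrase_matches = phrase_matcher(doc)
for match_id, start, end in phrase_matches:
    print(nlp.vocab.strings[match_id], doc[start:end].text)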
예제 #48
0
class UnitComponent(BaseComponent):
    """
    A pipeline component that tags units
    Begins by first tagging all mass, volume, time, and form units.
    """

    #TODO Split into files mass_annotator_component, volume_annotator_component
    #TODO time_annotator_component, etc.

    name = "unit_annotator"
    dependencies = []

    def __init__(self, nlp):
        self.nlp = nlp
        Token.set_extension('feature_is_mass_unit', default=False)
        nlp.entity.add_label('mass_unit')

        Token.set_extension('feature_is_volume_unit', default=False)
        nlp.entity.add_label('volume_unit')

        Token.set_extension('feature_is_time_unit', default=False)
        nlp.entity.add_label('time_unit')

        Token.set_extension('feature_is_route_type', default=False)
        nlp.entity.add_label('route_type')

        Token.set_extension('feature_is_form_unit', default=False)
        nlp.entity.add_label('form_unit')

        Token.set_extension('feature_is_frequency_indicator', default=False)
        nlp.entity.add_label('frequency_indicator')

        Token.set_extension('feature_is_measurement_unit', default=False)
        nlp.entity.add_label('measurement_unit')

        Token.set_extension('feature_is_measurement', default=False)
        nlp.entity.add_label('measurement')

        Token.set_extension('feature_is_duration_pattern', default=False)
        nlp.entity.add_label('duration_pattern')

        self.mass_matcher = Matcher(nlp.vocab)
        self.volume_matcher = Matcher(nlp.vocab)
        self.time_matcher = Matcher(nlp.vocab)
        self.route_matcher = Matcher(nlp.vocab)
        self.form_matcher = Matcher(nlp.vocab)
        self.unit_of_measurement_matcher = Matcher(nlp.vocab)
        self.measurement_matcher = Matcher(nlp.vocab)
        self.frequency_matcher = Matcher(nlp.vocab)
        self.duration_matcher = Matcher(nlp.vocab)

        self.mass_matcher.add('UNIT_OF_MASS', None, [{
            'LOWER': 'mcg'
        }], [{
            'LOWER': 'microgram'
        }], [{
            'LOWER': 'micrograms'
        }], [{
            'ORTH': 'mg'
        }], [{
            'LOWER': 'milligram'
        }], [{
            'LOWER': 'g'
        }], [{
            'LOWER': 'kg'
        }], [{
            'ORTH': 'mEq'
        }])

        self.volume_matcher.add('UNIT_OF_VOLUME', None, [{
            'LOWER': 'ml'
        }], [{
            'ORTH': 'dL'
        }], [{
            'LOWER': 'cc'
        }], [{
            'ORTH': 'L'
        }])

        self.time_matcher.add('UNIT_OF_TIME', None, [{
            'LOWER': 'sec'
        }], [{
            'LOWER': 'second'
        }], [{
            'LOWER': 'seconds'
        }], [{
            'LOWER': 'min'
        }], [{
            'LOWER': 'minute'
        }], [{
            'LOWER': 'minutes'
        }], [{
            'LOWER': 'hr'
        }], [{
            'LOWER': 'hour'
        }], [{
            'LOWER': 'day'
        }], [{
            'LOWER': 'days'
        }], [{
            'LOWER': 'week'
        }], [{
            'LOWER': 'weeks'
        }], [{
            'LOWER': 'month'
        }], [{
            'LOWER': 'months'
        }], [{
            'LOWER': 'year'
        }], [{
            'LOWER': 'years'
        }], [{
            'LOWER': 'yrs'
        }])

        self.frequency_matcher.add('FREQUENCY_MATCHER', None, [{
            'LOWER': 'bid'
        }], [{
            'LOWER': 'prn'
        }], [{
            'LOWER': 'qid'
        }], [{
            'LOWER': 'tid'
        }], [{
            'LOWER': 'qd'
        }], [{
            'LOWER': 'daily'
        }], [{
            'LOWER': 'hs'
        }], [{
            'LOWER': 'as'
        }, {
            'LOWER': 'needed'
        }], [{
            'LOWER': 'once'
        }, {
            'LOWER': 'a'
        }, {
            'LOWER': 'day'
        }], [{
            'LOWER': 'twice'
        }, {
            'LOWER': 'a'
        }, {
            'LOWER': 'day'
        }])

        self.form_matcher.add('UNIT_OF_FORM', None, [{
            'ORTH': 'dose'
        }], [{
            'ORTH': 'doses'
        }], [{
            'LEMMA': 'pill'
        }], [{
            'LEMMA': 'tablet'
        }], [{
            'LEMMA': 'unit'
        }], [{
            'LEMMA': 'u'
        }], [{
            'LEMMA': 'patch'
        }], [{
            'LEMMA': 'unit'
        }], [{
            'ORTH': 'lotion'
        }], [{
            'ORTH': 'powder'
        }], [{
            'ORTH': 'amps'
        }], [{
            'LOWER': 'actuation'
        }], [{
            'LEMMA': 'suspension'
        }], [{
            'LEMMA': 'syringe'
        }], [{
            'LEMMA': 'puff'
        }], [{
            'LEMMA': 'liquid'
        }], [{
            'LEMMA': 'aerosol'
        }], [{
            'LEMMA': 'cap'
        }])

        self.route_matcher.add('TYPE_OF_ROUTE', None, [{
            'LOWER': 'iv'
        }], [{
            'ORTH': 'intravenous'
        }], [{
            'LOWER': 'po'
        }], [{
            'ORTH': 'gtt'
        }], [{
            'LOWER': 'drip'
        }], [{
            'LOWER': 'inhalation'
        }], [{
            'LOWER': 'by'
        }, {
            'LOWER': 'mouth'
        }], [{
            'LOWER': 'topical'
        }], [{
            'LOWER': 'subcutaneous'
        }], [{
            'LOWER': 'ophthalmic'
        }], [{
            'LEMMA': 'injection'
        }], [{
            'LOWER': 'mucous'
        }, {
            'LOWER': 'membrane'
        }], [{
            'LOWER': 'oral'
        }], [{
            'LOWER': 'nebs'
        }], [{
            'LOWER': 'transdermal'
        }], [{
            'LOWER': 'nasal'
        }])

        self.unit_of_measurement_matcher.add('UNIT_OF_MEASUREMENT', None,
                                             [{
                                                 'ENT_TYPE': 'mass_unit'
                                             }, {
                                                 'ORTH': '/'
                                             }, {
                                                 'ENT_TYPE': 'volume_unit'
                                             }], [{
                                                 'ENT_TYPE': 'volume_unit'
                                             }, {
                                                 'ORTH': '/'
                                             }, {
                                                 'ENT_TYPE': 'time_unit'
                                             }], [{
                                                 'ENT_TYPE': 'form_unit'
                                             }, {
                                                 'ORTH': '/'
                                             }, {
                                                 'ENT_TYPE': 'volume_unit'
                                             }])
        self.measurement_matcher.add('MEASUREMENT', None, [{
            'LIKE_NUM': True
        }, {
            'ORTH': '%'
        }], [{
            'LIKE_NUM': True
        }, {
            'ENT_TYPE': 'measurement_unit'
        }], [{
            'LIKE_NUM': True
        }, {
            'ENT_TYPE': 'mass_unit'
        }], [{
            'LIKE_NUM': True
        }, {
            'ENT_TYPE': 'volume_unit'
        }], [{
            'LIKE_NUM': True
        }, {
            'ENT_TYPE': 'form_unit'
        }], [{
            'LIKE_NUM': True
        }, {
            'LOWER': 'x'
        }, {
            'ENT_TYPE': 'form_unit'
        }])

        self.duration_matcher.add('DURATION', None, [{
            'POS': 'PREP'
        }, {
            'LIKE_NUM': True
        }, {
            'ENT_TYPE': 'time_unit'
        }], [{
            'LIKE_NUM': True
        }, {
            'ENT_TYPE': 'time_unit'
        }], [{
            'LOWER': 'in'
        }, {
            'LIKE_NUM': True
        }, {
            'ENT_TYPE': 'time_unit'
        }], [{
            'LOWER': 'prn'
        }])

    def __call__(self, doc):
        logging.debug("Called UnitAnnotator Component")
        nlp = self.nlp

        with doc.retokenize() as retokenizer:
            #match and tag mass units
            matches = self.mass_matcher(doc)
            for match_id, start, end in matches:
                span = Span(doc,
                            start,
                            end,
                            label=nlp.vocab.strings['mass_unit'])
                if span is None:
                    raise BaseException("Span is none")
                for token in span:
                    token._.feature_is_mass_unit = True
                try:
                    if len(span) > 1:
                        retokenizer.merge(span)
                except ValueError:
                    pass
                doc.ents = list(doc.ents) + [span]

        with doc.retokenize() as retokenizer:
            #match and tag volume units
            matches = self.volume_matcher(doc)
            for match_id, start, end in matches:
                span = Span(doc,
                            start,
                            end,
                            label=nlp.vocab.strings['volume_unit'])
                for token in span:
                    token._.feature_is_volume_unit = True
                try:
                    if len(span) > 1:
                        retokenizer.merge(span)
                except ValueError:
                    pass
                doc.ents = list(doc.ents) + [span]

        with doc.retokenize() as retokenizer:
            # match and tag time units
            matches = self.time_matcher(doc)
            for match_id, start, end in matches:
                span = Span(doc,
                            start,
                            end,
                            label=nlp.vocab.strings['time_unit'])
                for token in span:
                    token._.feature_is_time_unit = True
                if len(span) > 1:
                    retokenizer.merge(span)
                doc.ents = list(doc.ents) + [span]

        with doc.retokenize() as retokenizer:
            # durations
            matches = self.duration_matcher(doc)
            for match_id, start, end in matches:
                span = Span(doc,
                            start,
                            end,
                            label=nlp.vocab.strings['duration_pattern'])
                for token in span:
                    token._.feature_is_duration_pattern = True
                try:
                    if len(span) > 1:
                        retokenizer.merge(span)
                except ValueError:
                    pass

                doc.ents = list(doc.ents) + [span]

        with doc.retokenize() as retokenizer:

            # match and frequency indicators
            matches = self.frequency_matcher(doc)
            for match_id, start, end in matches:
                span = Span(doc,
                            start,
                            end,
                            label=nlp.vocab.strings['frequency_indicator'])
                for token in span:
                    token._.feature_is_frequency_indicator = True
                try:
                    if len(span) > 1:
                        retokenizer.merge(span)
                except ValueError:
                    pass
                doc.ents = list(doc.ents) + [span]

        with doc.retokenize() as retokenizer:
            #match and tag form units
            matches = self.form_matcher(doc)
            spans = []
            for match_id, start, end in matches:
                span = Span(doc,
                            start,
                            end,
                            label=nlp.vocab.strings['form_unit'])
                for token in span:
                    token._.feature_is_form_unit = True
                try:
                    if len(span) > 1:
                        retokenizer.merge(span)
                except ValueError:
                    pass
                doc.ents = list(doc.ents) + [span]

        with doc.retokenize() as retokenizer:
            # match and tag route types
            matches = self.route_matcher(doc)
            for match_id, start, end in matches:
                span = Span(doc,
                            start,
                            end,
                            label=nlp.vocab.strings['route_type'])
                for token in span:
                    token._.feature_is_route_type = True
                try:
                    if len(span) > 1:
                        retokenizer.merge(span)
                except ValueError:
                    pass
                doc.ents = list(doc.ents) + [span]

        with doc.retokenize() as retokenizer:
            # match units of measurement (x/y, etc.)
            matches = self.unit_of_measurement_matcher(doc)
            for match_id, start, end in matches:
                span = Span(doc,
                            start,
                            end,
                            label=nlp.vocab.strings['measurement_unit'])
                for token in span:
                    token._.feature_is_measurement_unit = True
                try:
                    if len(span) > 1:
                        retokenizer.merge(span)
                except ValueError:
                    pass
                doc.ents = list(doc.ents) + [span]

        with doc.retokenize() as retokenizer:

            # units of measure, numbers, and percentages all together
            matches = self.measurement_matcher(doc)
            for match_id, start, end in matches:
                span = Span(doc,
                            start,
                            end,
                            label=nlp.vocab.strings['measurement'])
                for token in span:
                    token._.feature_is_measurement = True
                try:
                    if len(span) > 1:
                        retokenizer.merge(span)
                except ValueError:
                    pass
                doc.ents = list(doc.ents) + [span]

        return doc
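# The __call__ method above repeats one idiom per unit type: run a matcher,
# wrap each match in a labelled Span, merge multi-token spans, and append the
# span to doc.ents. A stripped-down sketch of that idiom (assuming spaCy 2 and
# the Span import used above; the helper name and parameters are illustrative):
def tag_unit_matches(doc, unit_matcher, nlp, label, flag_extension):
    with doc.retokenize() as retokenizer:
        for match_id, start, end in unit_matcher(doc):
            span = Span(doc, start, end, label=nlp.vocab.strings[label])
            for token in span:
                # set the custom boolean flag, e.g. 'feature_is_mass_unit'
                token._.set(flag_extension, True)
            try:
                # overlapping merges raise ValueError and are skipped, as above
                if len(span) > 1:
                    retokenizer.merge(span)
            except ValueError:
                pass
            doc.ents = list(doc.ents) + [span]
    return doc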
예제 #49
0
class PatternMatcher:
    def __init__(self):
        self.count = {
            "0": 0,
            "1": 0,
            "2": 0,
            "3": 0,
            "4": 0,
            "5": 0,
            "6": 0,
            "7": 0,
            "8": 0,
            "9": 0,
            "10": 0
        }
        self.compa_sent_count = 0

        self.nlp = spacy.load("en")
        self.matcher = Matcher(self.nlp.vocab)
        # self.matcher.add(6,
        #             None,
        #             [{'ORTH': 'JJR'}, {'ORTH': 'CIN'}, {'ORTH': 'TECH'}],
        #             [{'ORTH': 'JJR'}, {}, {'ORTH': 'CIN'}, {'ORTH': 'TECH'}],
        #             [{'ORTH': 'JJR'}, {'ORTH': 'CIN'}, {}, {'ORTH': 'TECH'}],
        #             [{'ORTH': 'JJR'}, {}, {'ORTH': 'CIN'}, {}, {'ORTH': 'TECH'}])
        # self.matcher.add(7,
        #             None,
        #             [{'ORTH': 'RB'}, {'ORTH': 'JJ'}, {'ORTH': 'CIN'}, {'ORTH': 'TECH'}],
        #             [{'ORTH': 'RB'}, {'ORTH': 'JJ'}, {}, {'ORTH': 'CIN'}, {'ORTH': 'TECH'}])
        # self.matcher.add(8,
        #             None,
        #             [{'ORTH': 'RBR'}, {'ORTH': 'JJ'}, {'ORTH': 'CIN'}, {'ORTH': 'TECH'}],
        #             [{'ORTH': 'RBR'}, {'ORTH': 'JJ'}, {}, {'ORTH': 'CIN'}, {'ORTH': 'TECH'}])
        #
        #
        # self.matcher.add(4,
        #                  None,
        #                  [{'ORTH': 'NN'}, {'ORTH': 'IN'}, {'ORTH': 'TECH'}, {'ORTH': 'VBZ'},  {}, {'ORTH': 'RB'}],
        #                  [{'ORTH': 'NN'}, {'ORTH': 'IN'}, {'ORTH': 'TECH'}, {}, {'ORTH': 'VBZ'}],
        #                  [{'ORTH': 'NN'}, {'ORTH': 'IN'}, {'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {'ORTH': 'RB'}],
        #                  [{'ORTH': 'NN'}, {'ORTH': 'IN'}, {}, {'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {'ORTH': 'RB'}],
        #                  [{'ORTH': 'NN'}, {'ORTH': 'IN'}, {}, {'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {}, {'ORTH': 'RB'}],
        #
        #
        #                  )
        #
        self.matcher.add(
            5,
            None,
            [{
                'ORTH': 'TECH'
            }, {
                'ORTH': 'VBP'
            }, {
                'ORTH': 'NN'
            }],
            [{
                'ORTH': 'TECH'
            }, {}, {
                'ORTH': 'VBP'
            }, {
                'ORTH': 'NN'
            }],
            [{
                'ORTH': 'TECH'
            }, {
                'ORTH': 'VBP'
            }, {}, {
                'ORTH': 'NN'
            }],
            [{
                'ORTH': 'TECH'
            }, {}, {
                'ORTH': 'VBP'
            }, {}, {
                'ORTH': 'NN'
            }],
            [{
                'ORTH': 'TECH'
            }, {
                'ORTH': 'VBZ'
            }, {
                'ORTH': 'NN'
            }],
            [{
                'ORTH': 'TECH'
            }, {}, {
                'ORTH': 'VBZ'
            }, {
                'ORTH': 'NN'
            }],
            [{
                'ORTH': 'TECH'
            }, {
                'ORTH': 'VBZ'
            }, {}, {
                'ORTH': 'NN'
            }],
            [{
                'ORTH': 'TECH'
            }, {}, {
                'ORTH': 'VBZ'
            }, {}, {
                'ORTH': 'NN'
            }],
        )
        self.matcher.add(
            1,
            None,
            [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {'ORTH': 'JJ'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VBZ'}, {'ORTH': 'JJ'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {}, {'ORTH': 'JJ'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {}, {'ORTH': 'JJ'}],
            # [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {}, {}, {}, {'ORTH': 'JJ'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VBZ'}, {}, {'ORTH': 'JJ'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VBD'}, {'ORTH': 'JJ'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VBD'}, {'ORTH': 'JJ'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VBD'}, {}, {'ORTH': 'JJ'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VBD'}, {}, {'ORTH': 'JJ'}],
            # [{'ORTH': 'TECH'}, {'ORTH': 'VBD'}, {}, {}, {}, {'ORTH': 'JJ'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VBD'}, {}, {'ORTH': 'JJ'}])
        self.matcher.add(
            3,
            None,
            [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {'ORTH': 'RB'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VBZ'}, {'ORTH': 'RB'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {}, {'ORTH': 'RB'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {}, {'ORTH': 'RB'}],
            # [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {}, {}, {}, {'ORTH': 'RB'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VBZ'}, {}, {'ORTH': 'RB'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VBD'}, {'ORTH': 'RB'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VBD'}, {'ORTH': 'RB'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VBD'}, {}, {'ORTH': 'RB'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VBD'}, {}, {'ORTH': 'RB'}],
            # [{'ORTH': 'TECH'}, {'ORTH': 'VBD'}, {}, {}, {}, {'ORTH': 'RB'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VBD'}, {}, {'ORTH': 'RB'}])

    def add_pos_tag(self, words, tech_pair):
        if len(words) == 0:
            return []
        words = words.split()
        tagged_words = CoreNLPParser(url='http://localhost:9000',
                                     tagtype='pos').tag(words)
        if len(words) != len(tagged_words):
            tagged_words = pos_tag(words)
        tag_list = []
        for (word, tag) in tagged_words:
            if tag == "IN" and word in cin:
                tag_list.append("CIN")
            elif tag[:2] == "VB" and word in cv:
                tag_list.append("CV")
            elif word == tech_pair.split()[0] or word == tech_pair.split()[1]:
                tag_list.append("TECH")
            else:
                tag_list.append(tag)
        return tag_list

    def match_pattern(self, pre, words, post, current_id, tech_pair):
        pre_rm = pre
        words_rm = words
        post_rm = post
        for w in remove_word:
            pre_rm = pre_rm.replace(w, ' ')
            words_rm = words_rm.replace(w, ' ')
            post_rm = post_rm.replace(w, ' ')

        tag_list = self.add_pos_tag(words_rm, tech_pair)
        pre_tag_list = self.add_pos_tag(pre_rm, tech_pair)
        post_tag_list = self.add_pos_tag(post_rm, tech_pair)
        words_patterns = []
        pre_patterns = []
        post_patterns = []
        if len(tag_list) > 0:
            words_patterns = self.matcher(
                self.nlp(u'{}'.format(" ".join(tag_list))))
        if len(pre_tag_list) > 0:
            pre_patterns = self.matcher(
                self.nlp(u'{}'.format(" ".join(pre_tag_list))))
        if len(post_tag_list) > 0:
            post_patterns = self.matcher(
                self.nlp(u'{}'.format(" ".join(post_tag_list))))

        patterns = pre_patterns + words_patterns + post_patterns
        if words_patterns != [] or post_patterns != []:
            pre_ss = sid.polarity_scores("{}".format(pre))
            words_ss = sid.polarity_scores("{}".format(words))
            post_ss = sid.polarity_scores("{}".format(post))
            if ('TECH' in pre_tag_list and 'TECH' in tag_list) and (
                (pre_ss['compound'] >= 0.05 and words_ss['compound'] <= -0.05)
                    or (words_ss['compound'] >= 0.05
                        and pre_ss['compound'] <= -0.05)) and (
                            (tech_pair[0] in pre and tech_pair[1] in words) or
                            (tech_pair[0] in words and tech_pair[1] in pre)):
                self.compa_sent_count += 1
                data = open(
                    os.path.join(os.pardir, "outnew", "pattern_v4",
                                 "test_output_{}.txt".format(os.getpid())),
                    "a")
                data.write("{}\n".format(current_id))
                data.write("{}\nPattern(s): ".format(tech_pair))
                for pattern in patterns:
                    self.count[str(pattern[0])] += 1
                    data.write(str(pattern[0]) + "\t")
                data.write("\n")
                data.write("{}\n".format(pre))
                data.write("{}\n".format(words))
                data.write("\n\n\n")
                data.close()
            if ('TECH' in post_tag_list and 'TECH' in tag_list) and (
                    (post_ss['compound'] >= 0.05 and words_ss['compound'] <= -0.05) or
                    (words_ss['compound'] >= 0.05 and post_ss['compound'] <= -0.05)) and \
                    ((tech_pair[0] in post and tech_pair[1] in words) or (tech_pair[0] in words and tech_pair[1] in post)):
                self.compa_sent_count += 1
                data = open(
                    os.path.join(os.pardir, "outnew", "pattern_v4",
                                 "test_output_{}.txt".format(os.getpid())),
                    "a")
                data.write("{}\n".format(current_id))
                data.write("{}\nPattern(s): ".format(tech_pair))
                for pattern in patterns:
                    self.count[str(pattern[0])] += 1
                    data.write(str(pattern[0]) + "\t")
                data.write("\n")
                data.write("{}\n".format(words))
                data.write("{}\n".format(post))
                data.write("\n\n\n")
                data.close()
예제 #50
0
import json
from spacy.matcher import Matcher
from spacy.lang.es import Spanish

with open("exercises/es/adidas.json", encoding="utf8") as f:
    TEXTS = json.loads(f.read())

nlp = Spanish()
matcher = Matcher(nlp.vocab)

# Two tokens whose lowercase forms match "adidas" and "zx"
pattern1 = [{"LOWER": "adidas"}, {"LOWER": "zx"}]

# Token whose lowercase form matches "adidas", plus a digit
pattern2 = [{"LOWER": "adidas"}, {"IS_DIGIT": True}]

# Add the patterns to the matcher and check the result
matcher.add("ROPA", None, pattern1, pattern2)
for doc in nlp.pipe(TEXTS):
    print([doc[start:end] for match_id, start, end in matcher(doc)])
예제 #51
0
import json
from spacy.matcher import Matcher
from spacy.lang.en import English

with open("exercises/en/iphone.json", encoding="utf8") as f:
    TEXTS = json.loads(f.read())

nlp = English()
matcher = Matcher(nlp.vocab)

# Two tokens whose lowercase forms match "iphone" and "x"
pattern1 = [{"LOWER": "iphone"}, {"LOWER": "x"}]

# Token whose lowercase form matches "iphone" and a digit
pattern2 = [{"LOWER": "iphone"}, {"IS_DIGIT": True}]

# Add patterns to the matcher and check the result
matcher.add("GADGET", None, pattern1, pattern2)
for doc in nlp.pipe(TEXTS):
    print([doc[start:end] for match_id, start, end in matcher(doc)])
import numpy as np
import pandas as pd
from spacy.matcher import Matcher
from spacy.tokens import Token


def custom_tokenizer_to_df(nlp, doc):
    # Initialize the Matcher with a vocab
    matcher = Matcher(nlp.vocab)

    ###############################################################
    # Add pattern for a valid hashtag, i.e. '#' plus any alphabetic token
    matcher.add("HASHTAG", None, [{"ORTH": "#"}, {"IS_ALPHA": True}])

    # Register token extension for hashtag
    Token.set_extension("is_hashtag", default=False, force=True)

    # Run the matcher over the doc
    matches = matcher(doc)

    # Find hashtag and merge, assign hashtag label
    hashtags = []
    for match_id, start, end in matches:
        if doc.vocab.strings[match_id] == "HASHTAG":
            hashtags.append(doc[start:end])
    with doc.retokenize() as retokenizer:
        for span in hashtags:
            retokenizer.merge(span)
            for token in span:
                token._.is_hashtag = True
    ##############################################################

    ##############################################################
    # Find number and merge, assign number label
    # Add patterns for long numbers, i.e. digits separated by ',' or '.'
    matcher.add("LONG_NUMBER", None,
                [{"IS_DIGIT": True}, {"ORTH": ','}, {"IS_DIGIT": True}])
    matcher.add("LONG_NUMBER", None,
                [{"IS_DIGIT": True}, {"ORTH": '.'}, {"IS_DIGIT": True}])

    # Register token extension for long numbers
    Token.set_extension("is_long_number", default=False, force=True)

    # Run the matcher over the doc
    matches = matcher(doc)

    long_number = []
    for match_id, start, end in matches:
        if doc.vocab.strings[match_id] == "LONG_NUMBER":
            long_number.append(doc[start:end])
    with doc.retokenize() as retokenizer:
        for span in long_number:
            retokenizer.merge(span)
            for token in span:
                token._.is_long_number = True
    ##############################################################

    for i, token in enumerate(doc):
        if token._.is_hashtag:
            token.tag_ = 'Hashtag'
        if token.like_url:
            token.tag_ = 'URL'
        if token.like_email:
            token.tag_ = 'Email'
        if token.is_stop:
            token.tag_ = 'Stop Word'
        if token.like_num:
            token.tag_ = 'Number'
        if token._.is_long_number:
            token.tag_ = 'Number'
        if token.is_punct:
            token.tag_ = 'Punctuation'

    # Write the tokens to data frame
    df = pd.DataFrame()
    df['Token'] = [token.text for token in doc]
    df['POS'] = [token.pos_ for token in doc]
    df['NE'] = [token.ent_iob_ for token in doc]
    df['Lemma'] = [token.lemma_ for token in doc]
    df['Tag'] = [token.tag_ for token in doc]
    df['Language'] = np.nan
    df['Candidate'] = True
    df['Anglicism'] = np.nan
    return df
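# Editorial usage sketch (not part of the original function); the model name
# and sample text below are assumptions:
# import spacy
# nlp = spacy.load("en_core_web_sm")
# doc = nlp("Loving the new #spacy release!")
# df = custom_tokenizer_to_df(nlp, doc)
# print(df.head())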
import spacy
from spacy.matcher import Matcher

text = "New iPhone X release date leaked as Apple reveals pre-orders by mistake"

# load a model
nlp = spacy.load("en_core_web_sm")
# initialize matcher
matcher = Matcher(nlp.vocab)

# create a pattern matching two tokens: "iPhone" and "X"
pattern = [{"ORTH": "iPhone"}, {"ORTH": "X"}]

# add pattern to the matcher
matcher.add("IPHONE_X_PATTERN", None, pattern)

doc = nlp(text)

# use the matcher on the doc
matches = matcher(doc)

print("matches: ", [doc[start:end].text for match_id, start, end in matches])

# Writing match patterns

## mentions of full iOS versions - iOS 7, iOS 11, etc
doc = nlp(
    "After making the iOS update you won't notice a radical system-wide redesign: nothing like the aesthetic upheaval we got with iOS 7. Most of iOS 11's furniture remains the same as in iOS 10. But you will discover some tweaks once you delve a little deeper."
)

pattern = [{"TEXT": "iOS"}, {"IS_DIGIT": True}]
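# A possible continuation (not in the original snippet): register the pattern
# under a new key and print the matched "iOS <version>" spans.
matcher.add("IOS_VERSION_PATTERN", None, pattern)
print("matches: ", [doc[start:end].text for match_id, start, end in matcher(doc)])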
예제 #54
0
class TextSum(object):
    def __init__(self):
        self.nlp = None
        self.matcher = None
        self.matched_sents = []
        self.phrases_w_bacteria = None
        self.phrases_w_food = None
        self.phrases_w_pregnancy = None
        self.phrases_w_diseases = None
        self.sent_bounds = []
        self.unit_vector = []
        self.phrase_id = 0
        self.text_for_summary = ""

    def load_dictionary(self, name='en_core_sci_lg'):
        """Load SciSpacy dictionary

        Args:
            name (str, optional): Name of the vocabulary. 
                Defaults to 'en_core_sci_lg'.

        Returns:
            spacy.lang: Spacy vocabulary 
        """
        print('\n\nLoading dictionary {}...'.format(name))
        self.nlp = spacy.load(name)
        self.matcher = Matcher(self.nlp.vocab)
        self.phrase_matcher = PhraseMatcher(self.nlp.vocab, attr="LOWER")
        print('Dictionary loaded successfully.')
        return self.nlp

    def _get_lemmas(self, doc):
        """Get lemmatization results for document.

        Args:
            doc (spacy.doc): Parsed spacy document.

        Returns:
            list: list of all document lemmatized tokens.
        """
        result = []
        for token in doc:
            if (token.is_alpha and not (token.is_space or token.is_punct
                                        or token.is_stop or token.like_num)):
                if len(str(token.lemma_)) > 1:
                    result.append(token.lemma_)
        return result

    def get_text(self, text_data, remove_references=True):
        """Tokenize and lemmatize the input text.

        Args:
            text_data (str): Document text body
            remove_references (bool, optional): Remove references from 
            document body. Defaults to True.

        Returns:
            spacy.doc: Processed text by Spacy.
        """
        if not self.nlp:
            print('Vocabulary is not loaded.')
            return None
        print('\n\nProcessing the document...')
        if remove_references:  # Remove article references
            splits = re.split("references", text_data, flags=re.IGNORECASE)
            if splits:
                if (len(splits) > 0):
                    text_data = ' '.join(splits[:-1])
            # remove numbers in square bracket:
            text_data = re.sub(r"\[.*?\]", "", text_data)
        doc = self.nlp(text_data)  # Process document with Spacy
        print('Finished processing the document.')
        return doc

    def add_pipe(self, pipe):
        """Add Spacy pipes

        Args:
            pipe (str): pipe name
        """
        print('Loading Spacy pipe: {}'.format(pipe))
        pipe = pipe.lower()
        if pipe == 'abbreviation':  # Abbreviation extraction
            abbreviation_pipe = AbbreviationDetector(self.nlp)
            self.nlp.add_pipe(abbreviation_pipe)
        elif pipe == 'entitylinker':  # Entity linker
            linker = UmlsEntityLinker(resolve_abbreviations=True)
            self.nlp.add_pipe(linker)
        elif pipe == 'segmenter':  # Rule Segmenter
            self.nlp.add_pipe(combined_rule_sentence_segmenter, first=True)
        elif pipe == 'tokenizer':  # Tokenizer
            self.nlp.tokenizer = combined_rule_tokenizer(self.nlp)
        elif pipe == 'textrank':  # Textrank
            tr = pytextrank.TextRank()
            self.nlp.add_pipe(tr.PipelineComponent, name='textrank', last=True)
        print('Pipe loaded.')

    def get_text_rank_summary(self, doc, limit_sentences=20, verbose=True):
        """Get extractive summary from textrank.

        Args:
            doc (spacy.doc): Parsed spacy document.
            limit_sentences (int, optional): Length of summary. 
                Defaults to 20.
            verbose (bool, optional): Whether or not print results. 
                Defaults to True.
        
        Returns:
            list: Summary
        """
        result = doc._.textrank.summary(limit_sentences=limit_sentences)
        res = ''

        for sent in result:
            res += '{} '.format(sent)
            if verbose:
                print(sent)
        return res

    def create_patterns(self, unique_keys, phrase_pattern=False):
        pattern = []
        if phrase_pattern:
            for item in unique_keys:
                pattern.append(self.nlp.make_doc(item.lower()))
        else:
            for item in unique_keys:
                pattern.append([{"LOWER": item.lower()}])
        return pattern
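        # Editorial example (not in the original code): with phrase_pattern=True
        # the items become Doc objects for the PhraseMatcher, otherwise token
        # patterns for the Matcher, e.g.
        #   self.create_patterns(["Gut Bacteria"], phrase_pattern=True)
        #       -> [self.nlp.make_doc("gut bacteria")]
        #   self.create_patterns(["bacteria"]) -> [[{"LOWER": "bacteria"}]]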

    def collect_sents(self, matcher, doc, i, matches):
        """This function collects sentences matched with phrases or tokens.
        This function is originally from Spacy website.

        Args:
            matcher (spacy.matcher.Matcher): Spacy matcher instance.
            doc (spacy.doc): Spacy document.
            i (int): Iteration index.
            matches (list): List of matches.

        Returns:
            list: List of matches.
        """
        matched_sents = self.matched_sents
        match_id, start, end = matches[i]
        span = doc[start:end]  # Matched span
        sent = span.sent  # Sentence containing matched span
        match_ents = [{
            "start": span.start_char - sent.start_char,
            "end": span.end_char - sent.start_char,
            "label": "MATCH",
        }]
        matched_sents.append({"text": sent.text, "ents": match_ents})
        self.matched_sents = matched_sents
        return matched_sents

    def match_token_patterns(self, doc, pattern=[]):
        """Match a list of token patterns with document body.

        Args:
            doc (spacy.doc): Spacy document.
            pattern (list, optional): List of patterns. Defaults to [].

        Returns:
            list: List of matched phrases.
        """
        self.matched_sents = []
        self.matcher.add("PDFTokens", self.collect_sents,
                         *pattern)  # add pattern
        matches = self.matcher(doc)
        return matches

    def match_phrase_patterns(self, doc, pattern=[]):
        """Match a list of phrases patterns with document body.

        Args:
            doc (spacy.doc): Spacy document.
            pattern (list, optional): List of patterns. Defaults to [].

        Returns:
            list: List of matched phrases.
        """
        self.matched_sents = []
        self.phrase_matcher.add("PDFPhrases", self.collect_sents,
                                *pattern)  # add pattern
        matches = self.phrase_matcher(doc)
        return matches

    def get_matched_sents(self):
        """A list of matched sentences from pattern matching.

        Returns:
            list: List of matched sentences.
        """
        return self.matched_sents
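# Editorial usage sketch (not part of the original class); the model name, file
# name and keyword list are assumptions, and the scispacy model must be installed:
# ts = TextSum()
# ts.load_dictionary("en_core_sci_sm")
# doc = ts.get_text(open("paper.txt").read())
# ts.match_token_patterns(doc, ts.create_patterns(["bacteria", "probiotic"]))
# print(ts.get_matched_sents())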
예제 #55
0
import spacy
from spacy.matcher import Matcher
nlp = spacy.load('en_core_web_lg')
matcher = Matcher(nlp.vocab)
cornerofpattern =  [
    {'LOWER': 'corner'},
    {'LOWER': 'of'}
]
matcher.add('CORNER_OF_PATTERN', None, cornerofpattern)
text = 'meet you are the Corner of chapin and 14th st.  See you there!'
doc = nlp(text)
matches = matcher(doc)
for match_id, start, end in matches:
    matched_span = doc[start:end]
    print(matched_span.text)

import spacy

# Load English tokenizer, tagger, parser, NER and word vectors
nlp = spacy.load("en_core_web_lg")

# Process whole documents
text = ("Meet us at 1417 Chapin St NW")
doc = nlp(text)

# Analyze syntax
print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])
print("Verbs:", [token.lemma_ for token in doc if token.pos_ == "VERB"])

# Find named entities, phrases and concepts
for entity in doc.ents:
    print(entity.text, entity.label_)
예제 #56
0
def task2(sentence,timestamp):
  # loading spacy model
  nlp = spacy.load("en_core_web_sm")
  import en_core_web_sm
  nlp = en_core_web_sm.load()

  print(sentence)
  if (isAlredyPresent(sentence) == False):
    processedTweets.append(sentence)
    call(["aplay", "Air.wav"])
    doc = nlp(sentence)
 #   print(sutime.SUTime(sentence))
    #  print([(X.text, X.label_) for X in doc.ents])

    # Tokenization
    tokens = []
    tokens = nltk.word_tokenize(sentence);
    #print("Tokens: ", tokens)
    #  tweetFile = open("stanford-ner-2018-10-16/tweet.txt", 'w')

    nlp = spacy.load("en_core_web_sm")
    # Matcher class object
    matcher = Matcher(nlp.vocab)
    matcher.add("matching", None, [{'POS': 'PROPN'}, {'LOWER': {'IN': ['ave', 'avenue', 'st', 'street',
                                                                       'rd', 'road', 'dr', 'drive', 'pkwy', 'parkway',
                                                                       'bend', 'bnd', 'boulevard', 'blvd', 'court',
                                                                       'ct',
                                                                       'expressway', 'expy', 'freeway', 'fwy',
                                                                       'highway', 'hwy', 'junction', 'jct', 'lane',
                                                                       'ln', 'loop', 'motorway', 'mtwy',
                                                                       'parkway', 'pkwy', 'point', 'pt', 'ramp',
                                                                       'turnpike', 'tpke', 'tunnel', 'tunl',
                                                                       'underpass']}}])

    matches = matcher(doc)
    span = ""
    for match_id, start, end in matches:
        span = doc[start:end]
    # print(span)

    st = StanfordNERTagger('stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz',
                           "stanford-ner-2018-10-16/stanford-ner.jar", encoding='utf-8')
    classifiedText = st.tag(tokens)
    location = ""
    #print(classifiedText)
    i = 0
    locationMatches = []
    for eachOut in classifiedText:
        if "LOCATION" in eachOut[1]:
            locationMatches.append(eachOut[0])
    # print(locationMatches)
    span = str(span)
    #print(span)
    # Lemmatization without POS tags
    lems = []
    lemmatizer = WordNetLemmatizer()
    pos_sen = nltk.pos_tag(tokens);
    #print("\n POS Tags: \n", pos_sen);

    pos_wn = [(s[0], penn_to_wn(s[1])) for s in pos_sen]
    # print("\n POS Tags for wordnet: \n", pos_wn)

    lems_pos = []
    for w in pos_wn:
        if (w[1]):
            lems_pos.append(lemmatizer.lemmatize(w[0], pos=w[1]))
        else:
            lems_pos.append(lemmatizer.lemmatize(w[0]))
    # print("\n Lemmatization by taking into account the pos tags: \n")
    # print(lems_pos)

    if("on" in tokens):
        try:
            x = tokens.index("on")
            x+=1
            while pos_sen[x][1]=="NNP":
                if pos_sen[x][0] not in locationMatches:
                    locationMatches.append(pos_sen[x][0])
                x+=1
            if(pos_sen[x][1]=="CD" and pos_sen[x+1][1]=="NNP" and pos_sen[x+1][0]!="AM" and pos_sen[x+1][0]!="am" and pos_sen[x+1][0]!="pm" and pos_sen[x+1][0]!="PM" ):
                if pos_sen[x][0] not in locationMatches:
                    locationMatches.append(pos_sen[x][0])

                if pos_sen[x+1][0] not in locationMatches:
                    locationMatches.append(pos_sen[x+1][0])

                x+=1
                x+=1
                while pos_sen[x][1] == "NNP":
                    if pos_sen[x][0] not in locationMatches:
                        locationMatches.append(pos_sen[x][0])
                    x += 1



        except:
            pass

    if ("at" in tokens):
        try:
            x = tokens.index("at")
            x += 1
            while pos_sen[x][1] == "NNP":
                if pos_sen[x][0] not in locationMatches:
                    locationMatches.append(pos_sen[x][0])
                x+=1
            if (pos_sen[x][1] == "CD" and pos_sen[x + 1][1] == "NNP" and pos_sen[x+1][0]!="AM" and pos_sen[x+1][0]!="am" and pos_sen[x+1][0]!="pm" and pos_sen[x+1][0]!="PM" ):
                if pos_sen[x][0] not in locationMatches:
                    locationMatches.append(pos_sen[x][0])

                if pos_sen[x + 1][0] not in locationMatches:
                    locationMatches.append(pos_sen[x + 1][0])

                x += 1
                x += 1
                while pos_sen[x][1] == "NNP":
                    if pos_sen[x][0] not in locationMatches:
                        locationMatches.append(pos_sen[x][0])
                    x += 1

        except:

            pass

    if ("AT" in tokens):
        try:
            x = tokens.index("AT")
            x += 1
            while pos_sen[x][1] == "NNP":
                if pos_sen[x][0] not in locationMatches:
                    locationMatches.append(pos_sen[x][0])
                x+=1
            if (pos_sen[x][1] == "CD" and pos_sen[x + 1][1] == "NNP" and pos_sen[x+1][0]!="AM" and pos_sen[x+1][0]!="am" and pos_sen[x+1][0]!="pm" and pos_sen[x+1][0]!="PM" ):
                if pos_sen[x][0] not in locationMatches:
                    locationMatches.append(pos_sen[x][0])

                if pos_sen[x + 1][0] not in locationMatches:
                    locationMatches.append(pos_sen[x + 1][0])

                x += 1
                x += 1
                while pos_sen[x][1] == "NNP":
                    if pos_sen[x][0] not in locationMatches:
                        locationMatches.append(pos_sen[x][0])
                    x += 1

        except:
            pass
    #print(locationMatches)
    removal=[]
    if (len(locationMatches) > 0 and len(span) > 0):
        for eachMatch in locationMatches:
            #print(len(locationMatches))
            try:
                #print(span.find(eachMatch))
                if span.find(eachMatch) != -1:
                    removal.append(eachMatch)
            except:
                print("Exception Distinct")

        for removeItem in removal:
            locationMatches.remove(removeItem)

    location= (span + " " + " ".join(locationMatches)).strip()


    #Extracting Time using Regular Expression:
    re6 = r"(24:00|2[0-3]:[0-5][0-9]|[0-1][0-9]:[0-5][0-9]:[0-5][0-9])([\s]*[AaPp][Mm])"
    re2 = r"(24:00|2[0-3]:[0-5][0-9]|[0-1][0-9]:[0-5][0-9]:[0-5][0-9])"
    re3 = r"24:00|2[0-3]:[0-5][0-9]|[0-1][0-9]:[0-5][0-9]([\s]*[AaPp][Mm])"
    re4 = r"24:00|2[0-3]:[0-5][0-9]|[0-1][0-9]:[0-5][0-9]"
    re5 = r"([0-9][0-9]?:[0-5][0-9]|[0-1][0-9]:[0-5][0-9])([\s]*[AaPp]*[Mm]*)"
    re1 = r"([0-9][0-9]*:[0-5][0-9]:[0-5][0-9])([\s]*[AaPp]*[Mm]*)"
    re7 = r"([0-9][0-9]*:[0-5][0-9])"

    try:
        time=(re.compile("(%s|%s|%s|%s|%s|%s|%s)" % (re1, re2, re3, re4, re5, re6, re7)).findall(sentence))[0][0]
        time=str(time)
        if(len(time.strip())>0):
            print("Time: "+str(time))
            timestamp=time
    except BaseException as e:
        print("Time : "+timestamp)


    severity= severity_classifier.severity_finder(sentence)
    severityStr=""
    for eachKeyword in severity:
        severityStr+=str(eachKeyword)+" "
    print("Severity: "+severityStr)

    if (len(location) > 0):
        print("Location: " + location)
        e2 = {"predictedClassLabel": "Accidental", "tweet": sentence, "timestamp": timestamp, "location":location,"severity":severityStr}
    else:
        e2 = {"predictedClassLabel": "Accidental", "tweet": sentence, "timestamp": timestamp,"severity":severityStr}
    res2 = es.index(index=indexName2, doc_type=typeName2, body=e2)
예제 #57
0
def run_prdualrank(T_0, unranked_patterns, unranked_phrases, file):
    global final_patterns, final_keywords, pattern_to_score_map, keyword_to_score_map, ngram_prob_map, phrase_seg_score, removed_phrases, wiki_ir_cache, error_count, total_ngram_counts
    phrase2id = {}
    for i in range(len(unranked_phrases)):
        phrase2id[unranked_phrases[i]] = i

    id2phrase = {}
    for i in range(len(unranked_phrases)):
        id2phrase[i] = unranked_phrases[i]

    id2pattern = {}
    for i in range(len(unranked_patterns)):
        id2pattern[i] = unranked_patterns[i]

    seedIdwConfidence = {}
    for key, val in phrase2id.items():
        if key in T_0:
            seedIdwConfidence[val] = 0.0

    id2patterns = defaultdict(set)
    pattern2ids = defaultdict(set)

    context_matrix = np.zeros((len(unranked_phrases), len(unranked_patterns)))
    # find c (t, p)
    with open(file, 'r') as f:
        file_chunk = partition(f)
        matcher = Matcher(nlp.vocab)
        for t in file_chunk:
            doc = nlp(t)
            for i in range(len(unranked_patterns)):
                offset = 0
                for pattern_dict in unranked_patterns[i]:
                    if 'POS' in pattern_dict:
                        break
                    offset += 1
                matcher.add("extraction", None, unranked_patterns[i])
                matches = matcher(doc)
                for match_id, start, end in matches:
                    span = doc[start+offset:end].text
                    j = unranked_phrases.index(span) if span in unranked_phrases else -1
                    if j == -1:
                        continue
                    context_matrix[j, i] += 1
                    id2patterns[j].add(i)
                    pattern2ids[i].add(j)
                matcher.remove("extraction")


    id2sup = {}
    for i in range(len(unranked_phrases)):
        id2sup[i] = 0

    pattern2sup = {}
    for i in range(len(unranked_patterns)):
        pattern2sup[i] = 0

    for id in id2patterns.keys():
        sum = 0
        for col in range(len(unranked_patterns)):
            sum += context_matrix[id, col]
        id2sup[id] = sum

    for pattern in pattern2ids.keys():
        sum = 0
        for row in range(len(unranked_phrases)):
            sum += context_matrix[row, pattern]
        pattern2sup[pattern] = sum

    l1, l2, l3, l4, m1, m2, m3, m4 = prDualRank(seedIdwConfidence, [], id2patterns, pattern2ids, {},
             {}, {}, {}, id2phrase, context_matrix.tolist(), id2sup, pattern2sup,
             FLAGS_VERBOSE=False, FLAGS_DEBUG=False)

    return l1, l2, l3, l4, m1, m2, m3, m4
예제 #58
0
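# The original snippet assumes `nlp` and `doc` already exist; a minimal setup
# sketch (the model name and sample text are assumptions, not part of the
# original) could be:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
doc = nlp("You can reach us at 415-555-1234 or (415) 555 1234.")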
result = [(w.text, w.pos_) for w in doc]


# Phone number extraction
phone_matcher = Matcher(nlp.vocab)

# input could be a list of patterns [pattern1, pattern2, ...]
pattern1 = [
    {"SHAPE": "ddd"}, {"ORTH": "-"}, {"SHAPE": "ddd"}, {"ORTH": "-"}, {"SHAPE": "dddd"}
]
pattern2 = [
    {"ORTH": "("}, {"SHAPE": "ddd"}, {"ORTH": ")"}, {"SHAPE": "ddd"}, {"ORTH": "-"},
    {"SHAPE": "dddd"}
]
pattern3 = [
    {"ORTH": "("}, {"SHAPE": "ddd"}, {"ORTH": ")"}, {"SHAPE": "ddd"}, {"SHAPE": "dddd"}
]
patterns = [
    pattern1, pattern2, pattern3
]

phone_matcher.add("PHONE_NUMBER", patterns)
matches = phone_matcher(doc)

phone_numbers = []
for match_id, start, end in matches:
    span = doc[start:end]
    phone_numbers.append(span.text)

print(phone_numbers)
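# Editorial note: the call above uses the spaCy v3 Matcher.add signature
# (a list of patterns, no callback). Under spaCy v2, which most other examples
# on this page use, the equivalent call would be:
# phone_matcher.add("PHONE_NUMBER", None, pattern1, pattern2, pattern3)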
예제 #59
0
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("ja_core_news_sm")
matcher = Matcher(nlp.vocab)

doc = nlp("私には年の離れた小さい弟がいます。彼は、甘い卵焼きが好きです")

# Write a pattern for an adjective plus one or two nouns
pattern = [{"POS": "ADJ"}, {"POS": "NOUN"}, {"POS": "NOUN", "OP": "?"}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("ADJ_NOUN_PATTERN", [pattern])
matches = matcher(doc)
print("Total matches found:", len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)
class OldPatternMatcher:

    cv = {
        "beat", "beats", "prefer", "prefers", "recommend", "recommends",
        "defeat", "defeats", "kill", "kills", "lead", "leads", "obliterate",
        "obliterates", "outclass", "outclasses", "outdo", "outdoes",
        "outperform", "outperforms", "outplay", "outplays", "overtake",
        "overtakes", "smack", "smacks", "subdue", "subdues", "surpass",
        "surpasses", "trump", "trumps", "win", "wins", "blow", "blows",
        "decimate", "decimates", "destroy", "destroys", "buy", "buys",
        "choose", "chooses", "favor", "favors", "grab", "grabs", "pick",
        "picks", "purchase", "purchases", "select", "selects", "race", "races",
        "compete", "competes", "match", "matches", "compare", "compares",
        "lose", "loses", "suck", "sucks"
    }
    cin = {
        "than", "over", "beyond", "upon", "as", "against", "out", "behind",
        "under", "between", "after", "unlike", "with", "by", "opposite"
    }

    def __init__(self):
        self.count = {
            "0": 0,
            "1": 0,
            "2": 0,
            "3": 0,
            "4": 0,
            "5": 0,
            "6": 0,
            "7": 0,
            "8": 0,
            "9": 0,
            "10": 0
        }
        self.compa_sent_count = 0

        self.matcher = Matcher(nlp.vocab)
        self.matcher.add(
            0, None,
            [{'ORTH': 'JJR'}, {'ORTH': 'CIN'}, {'ORTH': 'TECH'}],
            [{'ORTH': 'JJR'}, {}, {'ORTH': 'CIN'}, {'ORTH': 'TECH'}],
            [{'ORTH': 'JJR'}, {'ORTH': 'CIN'}, {}, {'ORTH': 'TECH'}],
            [{'ORTH': 'JJR'}, {}, {'ORTH': 'CIN'}, {}, {'ORTH': 'TECH'}],
            [{'ORTH': 'JJ'}, {'ORTH': 'CIN'}, {'ORTH': 'TECH'}],
            [{'ORTH': 'JJ'}, {}, {'ORTH': 'CIN'}, {'ORTH': 'TECH'}],
            [{'ORTH': 'JJ'}, {'ORTH': 'CIN'}, {}, {'ORTH': 'TECH'}],
            [{'ORTH': 'JJ'}, {}, {'ORTH': 'CIN'}, {}, {'ORTH': 'TECH'}])
        self.matcher.add(
            1,
            None,
            [{'ORTH': 'VB'}, {'ORTH': 'TECH'}, {'ORTH': 'TO'}, {'ORTH': 'VB'}],
            [{'ORTH': 'VB'}, {'ORTH': 'TECH'}, {}, {'ORTH': 'TO'}, {'ORTH': 'VB'}],
        )
        self.matcher.add(
            8, None,
            [{'ORTH': 'RBR'}, {'ORTH': 'JJ'}, {'ORTH': 'CIN'}, {'ORTH': 'TECH'}],
            [{'ORTH': 'RBR'}, {'ORTH': 'JJ'}, {}, {'ORTH': 'CIN'}, {'ORTH': 'TECH'}])
        self.matcher.add(
            2, None,
            [{'ORTH': 'CV'}, {'ORTH': 'CIN'}, {'ORTH': 'TECH'}],
            [{'ORTH': 'CV'}, {}, {'ORTH': 'CIN'}, {'ORTH': 'TECH'}])
        self.matcher.add(
            3, None,
            [{'ORTH': 'CV'}, {'ORTH': 'VBG'}, {'ORTH': 'TECH'}])
        self.matcher.add(
            5,
            None,
            [{'ORTH': 'TECH'}, {'ORTH': 'VB'}, {'ORTH': 'NN'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VB'}, {'ORTH': 'NN'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VB'}, {}, {'ORTH': 'NN'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VB'}, {}, {'ORTH': 'NN'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VB'}, {'ORTH': 'NN'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VB'}, {'ORTH': 'NN'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VB'}, {}, {'ORTH': 'NN'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VB'}, {}, {'ORTH': 'NN'}],
        )

        # self.matcher.add(6,
        #             None,
        #             [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {'ORTH': 'JJS'}],
        #             [{'ORTH': 'TECH'}, {}, {'ORTH': 'VBZ'}, {'ORTH': 'JJS'}],
        #             [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {}, {'ORTH': 'JJS'}],
        #             [{'ORTH': 'TECH'}, {}, {'ORTH': 'VBZ'}, {}, {'ORTH': 'JJS'}])
        self.matcher.add(
            7, None,
            [{'ORTH': 'TECH'}, {'ORTH': 'VB'}, {'ORTH': 'JJR'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VB'}, {'ORTH': 'JJR'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VB'}, {}, {'ORTH': 'JJR'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VB'}, {}, {'ORTH': 'JJR'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {}, {}, {'ORTH': 'JJR'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VB'}, {}, {'ORTH': 'JJR'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VB'}, {'ORTH': 'JJR'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VB'}, {'ORTH': 'JJR'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VB'}, {}, {'ORTH': 'JJR'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VB'}, {}, {'ORTH': 'JJR'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VB'}, {}, {}, {}, {'ORTH': 'JJR'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VB'}, {}, {'ORTH': 'JJR'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'JJR'}])
        self.matcher.add(
            10, None,
            [{'ORTH': 'TECH'}, {'ORTH': 'VB'}, {'ORTH': 'RBR'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VB'}, {'ORTH': 'RBR'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VB'}, {}, {'ORTH': 'RBR'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VB'}, {}, {'ORTH': 'RBR'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VB'}, {}, {}, {}, {'ORTH': 'RBR'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VB'}, {}, {'ORTH': 'RBR'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VB'}, {'ORTH': 'RBR'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VB'}, {'ORTH': 'RBR'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VB'}, {}, {'ORTH': 'RBR'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VB'}, {}, {'ORTH': 'RBR'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VB'}, {}, {}, {'ORTH': 'RBR'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VB'}, {}, {'ORTH': 'RBR'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'RBR'}])
        # self.matcher.add(9,
        #             None,
        #             [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {'ORTH': 'RBS'}],
        #             [{'ORTH': 'TECH'}, {}, {'ORTH': 'VBZ'}, {'ORTH': 'RBS'}],
        #             [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {}, {'ORTH': 'RBS'}],
        #             [{'ORTH': 'TECH'}, {}, {'ORTH': 'VBZ'}, {}, {'ORTH': 'RBS'}])

        self.matcher.add(
            11,
            None,
            [{'ORTH': 'TECH'}, {'ORTH': 'NP'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'NP'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'NP'}],
        )

    def add_pos_tag(self, words, tech_pair):

        tagged_words = nltk.pos_tag(words.split())
        # print(words)
        # print (tagged_words)
        tag_list = []
        for (word, tag) in tagged_words:
            if tag == "IN" and word in self.cin:
                tag_list.append("CIN")
            elif word == tech_pair.split()[0] or word == tech_pair.split()[1]:
                tag_list.append("TECH")
            elif word in np:
                tag_list.append("NP")
            elif tag[:2] == "VB" and word in cv:
                tag_list.append("CV")
            elif tag[:2] == "VB":
                tag_list.append("VB")
            elif tag[:2] == "RB":
                tag_list.append("RBR")

            else:
                tag_list.append(tag)

        return tag_list