Exemplo n.º 1
0
    def get_cause(self):
        """Return the sentences of ``self.news_story`` that contain a causal cue.

        The story is decoded from UTF-8 and parsed, each sentence is then
        re-parsed on its own and run through a lemma-based ``Matcher``. A
        sentence is kept when the matcher reports at least one hit.

        Returns:
            The text of all matching sentences concatenated in document
            order with no separator inserted between them; an empty string
            when no sentence matches.
        """
        # Cue phrases, listed once each (a repeated entry would only register
        # the same pattern twice). The strings are compared against token
        # lemmas, so inflected entries such as "fell" or "coming" presumably
        # only match if the lemmatizer leaves them unchanged -- TODO confirm.
        causal_phrases = [
            "consequently", "as a result", "therefore", "as a consequence",
            "for these reason", "thus", "due", "for all these reasons",
            "because of", "because", "since", "cause", "occur", "accord",
            "after", "off", "all of a sudden",
            "coming from the opposite direction", "fell", "hit"
        ]

        cause_matcher = Matcher(nlp.vocab)
        for phrase in causal_phrases:
            # One token spec per word. Each spec is matched against a single
            # token, so a lone {LEMMA: "as a result"} spec could never be
            # satisfied and every multi-word cue was silently dead.
            token_specs = [{LEMMA: part} for part in phrase.split()]
            cause_matcher.add_pattern("Causual sentence", token_specs)

        doc = nlp(unicode(self.news_story.decode('utf8')))

        causal_sentences = []
        for sent in doc.sents:
            # Each sentence is parsed again in isolation before matching.
            new_sent = nlp(unicode(str(sent).decode('utf8')))
            if cause_matcher(new_sent):
                causal_sentences.append(str(sent))

        return "".join(causal_sentences)
Exemplo n.º 2
0
def test_get_entity_via_match(en_vocab):
    """Entity attributes can be looked up through the id reported in a match.

    Registers an entity with attributes, checks that nothing matches until
    a pattern is added, then verifies the shape of the single match and
    that its entity id resolves back to the registered attributes.
    """
    entity_key = u'TestEntity'

    def make_doc():
        # A fresh two-token Doc for every matcher call.
        return Doc(en_vocab, words=[u'Test', u'Entity'])

    matcher = Matcher(en_vocab)
    matcher.add_entity(entity_key, attrs={u'Hello': u'World'})

    # The entity alone carries no patterns, so nothing can match yet.
    assert matcher.n_patterns == 0
    assert matcher(make_doc()) == []

    matcher.add_pattern(entity_key, [{ORTH: u'Test'}, {ORTH: u'Entity'}])
    assert matcher.n_patterns == 1

    found = matcher(make_doc())
    assert len(found) == 1
    only_match = found[0]
    # A match is a 4-tuple: (entity id, label, start, end).
    assert len(only_match) == 4
    ent_id, label, start, end = only_match
    assert ent_id == matcher.vocab.strings[entity_key]
    assert label == 0
    assert start == 0
    assert end == 2

    assert matcher.get_entity(ent_id) == {u'Hello': u'World'}
    def __init__(self, nlp):
        """Build the subject/object matcher and pass it to the parent class.

        Two entities, 'object' and 'subject', are registered with the same
        token pattern but with different acceptors and labels.

        Args:
            nlp: Pipeline object; only ``nlp.vocab`` is read here.
        """
        matcher = Matcher(nlp.vocab)

        # Shared pattern: one non-numeric token with ENT_IOB 3, any number of
        # tokens with ENT_IOB 1, then an optional number-like token.
        # NOTE(review): 3 / 1 are presumably the "begin" / "inside" IOB
        # codes -- confirm against the spaCy version in use.
        first_token = {a.LIKE_NUM: False, a.ENT_IOB: 3}
        continuation = {'OP': '*', a.ENT_IOB: 1}
        optional_number = {'OP': '?', a.LIKE_NUM: True}
        iob_pattern = [first_token, continuation, optional_number]

        # (entity name, arguments for make_intersect_ar, pattern label).
        # The entity name is what associates matches with patterns.
        rules = (
            ('object', (2, 2), Part.OBJ),
            ('subject', (), Part.SUBJ),
        )
        for entity_name, acceptor_args, label in rules:
            acceptor = self.make_intersect_ar(*acceptor_args)
            matcher.add_entity(entity_name, acceptor=acceptor)
            matcher.add_pattern(entity_name, iob_pattern, label=label)

        # NOTE: rules for 'version' and 'date' entities were only ever
        # sketched in comments here and are not registered.

        self.entity_rules = ['subject', 'object']
        super().__init__(matcher)
Exemplo n.º 4
0
def load_age_matcher(nlp):
    """Build a ``Matcher`` with an "Age" entity for age mentions.

    Text shapes the patterns are written for:
        Age : 22 years
        age : 22 yrs
        Age 22-40
        22 yrs
        23yrs
        22-40 years
        About me 22

    Side effect: every lexeme in ``nlp.vocab`` whose lowercase form is
    'years', 'yrs' or 'year' gets the ``FLAG63`` flag set.

    Args:
        nlp: Pipeline object providing ``vocab``.

    Returns:
        The configured ``Matcher``; the "Age" entity uses ``get_age`` as its
        acceptor.
    """
    matcher = Matcher(nlp.vocab)

    # Custom lexeme flag marking the "years" words.
    is_year = FLAG63
    year_ids = {
        nlp.vocab.strings[word.lower()] for word in ['years', 'yrs', 'year']
    }
    for lexeme in nlp.vocab:
        if lexeme.lower in year_ids:
            lexeme.set_flag(is_year, True)

    def two_digits():
        # Fresh spec for a two-character, all-digit token.
        return {IS_DIGIT: True, LENGTH: 2}

    age_patterns = [
        # "age : 22" / "age 22"
        [{LOWER: "age"}, {IS_PUNCT: True}, two_digits()],
        [{LOWER: "age"}, two_digits()],
        # "age : 22-40" / "age 22-40"
        [{LOWER: "age"}, {IS_PUNCT: True}, two_digits(), {IS_PUNCT: True},
         two_digits()],
        [{LOWER: "age"}, two_digits(), {IS_PUNCT: True}, two_digits()],
        # "22 yrs"
        [two_digits(), {is_year: True}],
        # single five-character token ending in "yrs", e.g. "23yrs"
        [{SUFFIX: "yrs", LENGTH: 5}],
        # "22-40 years", with an optional token between the two numbers
        [two_digits(), {IS_PUNCT: True, 'OP': '?'}, two_digits(),
         {is_year: True}],
        [two_digits(), {IS_ASCII: True, 'OP': '?'}, two_digits(),
         {is_year: True}],
        # "about me 22" / "about 22"
        [{LOWER: 'about'}, {LOWER: 'me', 'OP': '?'}, {IS_DIGIT: True}],
    ]

    matcher.add_entity("Age", acceptor=get_age)
    for token_specs in age_patterns:
        matcher.add_pattern("Age", token_specs)

    return matcher
Exemplo n.º 5
0
def load_date_matcher(nlp):
    """Build a ``Matcher`` whose 'DATE' patterns cover several date layouts.

    Side effects on ``nlp.vocab``: the month, ordinal, delimiter and digit
    word lists are passed to ``add_to_vocab``, and four custom lexeme flags
    (``FLAG62`` month, ``FLAG61`` ordinal, ``FLAG60`` date delimiter,
    ``FLAG59`` date digit) are set on matching lexemes. Every lexeme whose
    ``is_digit`` is true is also flagged as a date digit.

    Args:
        nlp: Pipeline object providing ``vocab``.

    Returns:
        The configured ``Matcher``. Each pattern carries a numeric label
        identifying the layout it was written for.
    """
    matcher = Matcher(nlp.vocab)

    # presumably ensures these strings have lexemes in the vocab; the helper
    # is defined elsewhere -- verify.
    for words in (months_dict.keys(), ordinals, date_delimiters, date_digits):
        add_to_vocab(nlp, words)

    is_month = FLAG62
    is_ordinal = FLAG61
    is_date_delimiter = FLAG60
    is_date_digit = FLAG59

    def lowercase_ids(words):
        # String-store ids of the lowercased words.
        return {nlp.vocab.strings[word.lower()] for word in words}

    flag_targets = [
        (is_month, lowercase_ids(months_dict.keys())),
        (is_ordinal, lowercase_ids(ordinals)),
        (is_date_delimiter, lowercase_ids(date_delimiters)),
        (is_date_digit, lowercase_ids(date_digits)),
    ]

    for lexeme in nlp.vocab:
        for flag, target_ids in flag_targets:
            if lexeme.lower in target_ids:
                lexeme.set_flag(flag, True)
        # Any all-digit lexeme also counts as a date digit.
        if lexeme.is_digit:
            lexeme.set_flag(is_date_digit, True)

    # (label, token specs), registered in exactly this order.
    labelled_patterns = [
        # March 25, 2017 / March 25th, 2017 / March 25th 2017 / March 25 2017
        (1, [{is_month: True},
             {is_date_digit: True},
             {is_ordinal: True, 'OP': '?'},
             {ORTH: ',', 'OP': '?'},
             {IS_DIGIT: True, LENGTH: 4}]),
        # 25 March, 2017 / 25th March, 2017 / 25th March 2017 / 25 March 2017
        (2, [{is_date_digit: True},
             {is_date_delimiter: True, 'OP': '?'},
             {is_month: True},
             {is_ordinal: True, 'OP': '?'},
             {ORTH: ',', 'OP': '?'},
             {IS_DIGIT: True, LENGTH: 4}]),
        # 25/05/2016
        (3, [{is_date_digit: True},
             {is_date_delimiter: True, 'OP': '+'},
             {is_month: True},
             {is_date_delimiter: True, 'OP': '+'},
             {IS_DIGIT: True, LENGTH: 4}]),
        # 05/25/2016
        (4, [{is_month: True},
             {is_date_delimiter: True, 'OP': '+'},
             {is_date_digit: True},
             {is_date_delimiter: True, 'OP': '+'},
             {IS_DIGIT: True, LENGTH: 4}]),
        # Diciembre, 2009
        (9, [{is_month: True, is_date_digit: False},
             {ORTH: ','},
             {IS_DIGIT: True, LENGTH: 4}]),
        # December 2009
        (9, [{is_month: True, is_date_digit: False},
             {IS_DIGIT: True, LENGTH: 4}]),
        # 2013-12-04
        (10, [{IS_DIGIT: True, LENGTH: 4},
              {is_date_delimiter: True, 'OP': '+'},
              {is_month: True},
              {is_date_delimiter: True, 'OP': '+'},
              {is_date_digit: True}]),
        # 9 days ago
        (12, [{IS_DIGIT: True}, {POS: 'NOUN'}, {LOWER: 'ago'}]),
        # 1 Jul / 1. Jul -- day, optional ordinal and/or delimiter, month
        (13, [{is_date_digit: True},
              {is_ordinal: True},
              {is_date_delimiter: True},
              {is_month: True, is_date_digit: False}]),
        (13, [{is_date_digit: True},
              {is_ordinal: True},
              {is_month: True, is_date_digit: False}]),
        (13, [{is_date_digit: True},
              {is_date_delimiter: True},
              {is_month: True, is_date_digit: False}]),
        (13, [{is_date_digit: True},
              {is_month: True, is_date_digit: False}]),
        # Jul 2nd -- month, optional delimiter, day, optional ordinal
        (15, [{is_month: True, is_date_digit: False},
              {is_date_delimiter: True},
              {is_date_digit: True},
              {is_ordinal: True}]),
        (15, [{is_month: True, is_date_digit: False},
              {is_date_delimiter: True},
              {is_date_digit: True}]),
        (15, [{is_month: True, is_date_digit: False},
              {is_date_digit: True},
              {is_ordinal: True}]),
        (15, [{is_month: True, is_date_digit: False},
              {is_date_digit: True}]),
    ]

    for label, token_specs in labelled_patterns:
        matcher.add_pattern('DATE', token_specs, label=label)

    return matcher
Exemplo n.º 6
0
def load_risky_activities_matcher(nlp):
    """Build a ``Matcher`` with four numbered entities for risky-activity text.

    Two word lists are turned into custom lexeme flags through the
    ``set_flag`` helper (``FLAG40`` for the activity words, ``FLAG41`` for
    the provider words); the patterns combine those flags with lemma,
    dependency and digit checks.

    Args:
        nlp: Pipeline object providing ``vocab``.

    Returns:
        The configured ``Matcher`` with entities keyed 1 to 4.
    """
    matcher = Matcher(nlp.vocab)

    risky_activities = [
        'bareback', 'uncovered', 'bbbjtcim', 'bbbj', 'bbbjtc', 'bbbjtcws',
        'bbbjwf', 'bbfs', 'anal', 'greek', 'rca', 'swallow', 'cim', 'choke',
        'bdsm', 'bondage', 'g******g', 'hardcore'
    ]
    provider = [
        'girl', 'girls', 'model', 'models', 'staff', 'staffs', 'latina',
        'latinas', 'talent', 'talents', 'supermodel', 'supermodels',
        'princess', 'princesses'
    ]

    # NOTE(review): FLAG40 / FLAG41 are fixed flag slots; another matcher
    # using the same slots on the same vocab would presumably collide --
    # confirm how these loaders are combined.
    is_risky_activities = FLAG40
    is_provider = FLAG41
    set_flag(nlp, risky_activities, is_risky_activities)
    set_flag(nlp, provider, is_provider)

    # (entity key, patterns), registered in exactly this order.
    rule_sets = [
        (1, [
            [{is_risky_activities: True}],
        ]),
        (2, [
            [{LEMMA: "hardcore"}, {LEMMA: "sex"}],
            [{LEMMA: "hardcore"}, {LEMMA: "service"}],
        ]),
        (3, [
            [{LEMMA: "hardcore"}],
        ]),
        (4, [
            [{LEMMA: "greek"}, {IS_DIGIT: True}],
            [{LEMMA: "greek"}, {is_provider: True}],
            [{LEMMA: "if", DEP: "mark"},
             {IS_ALPHA: True, DEP: "ROOT"},
             {is_risky_activities: True}],
            [{is_risky_activities: True}, {LEMMA: "sorry"}],
        ]),
    ]

    for entity_key, patterns in rule_sets:
        matcher.add_entity(entity_key)
        for token_specs in patterns:
            matcher.add_pattern(entity_key, token_specs)

    return matcher
Exemplo n.º 7
0
def load_movement_matcher(nlp):
    """Build a ``Matcher`` for phrases about arriving, leaving or being new.

    Patterns are grouped under four numbered entities:
        1 -- positive patterns
        2 -- strong positive patterns
        3 -- negative patterns
        4 -- strong negative patterns

    Side effects on ``nlp.vocab``: the place and girl word lists are passed
    to ``add_to_vocab``, and three custom lexeme flags are set: ``FLAG18``
    for place words, ``FLAG19`` for girl words and ``FLAG20`` for lexemes
    whose ``prefix_`` is upper-case.

    Args:
        nlp: Pipeline object providing ``vocab``.

    Returns:
        The configured ``Matcher``.
    """
    matcher = Matcher(nlp.vocab)

    place = ['area', 'place', 'city', 'town']
    girl = [
        'gal', 'girl', 's**t', 'cutie', 'hottie', 'lady', 'teen', 'teenager',
        'chick', 'staff', 'gf', 'she'
    ]

    add_to_vocab(nlp, place)
    add_to_vocab(nlp, girl)

    is_place = FLAG18
    is_girl = FLAG19
    upper_start = FLAG20

    for lexeme in nlp.vocab:
        lowered = lexeme.lower_
        if lowered in place:
            lexeme.set_flag(is_place, True)
        if lowered in girl:
            lexeme.set_flag(is_girl, True)
        if lexeme.prefix_.isupper():
            lexeme.set_flag(upper_start, True)

    def lemmas(*words):
        # One {LEMMA: word} token spec per word, in order.
        return [{LEMMA: word} for word in words]

    # Entity 1: positive patterns.
    positive = [
        lemmas("last", "night", "in", "town"),
        [{LEMMA: "leave"}, {IS_ASCII: True, ENT_TYPE: "DATE"}],
        [{LEMMA: "leave"}, {DEP: "partmod"}],
        [{LEMMA: "leave"}, {DEP: "quantmod"}],
        [{LEMMA: "leave"}, {IS_ASCII: True, ENT_TYPE: "TIME"}],
        [{LEMMA: "leave"}, {LEMMA: "in"}, {IS_ASCII: True, ENT_TYPE: "DATE"}],
        lemmas("leave", "town"),
        lemmas("out", "of", "town"),
        [{LOWER: "outta"}, {LEMMA: "town"}],
        lemmas("lastnight", "in", "town"),
        lemmas("back", "in", "town"),
        lemmas("just", "in", "town"),
        lemmas("day", "in", "town"),
        lemmas("in", "town", "tonight"),
        lemmas("in", "town", "through"),
        lemmas("in", "town", "until"),
        lemmas("in", "town", "for", "one", "night"),
        [{LEMMA: "in"}, {LEMMA: "town"}, {LEMMA: "for"}, {IS_DIGIT: True},
         {LEMMA: "night"}],
        [{LEMMA: "town"}, {LEMMA: "stay", DEP: "nmod"}],
        [{LEMMA: "town"}, {IS_ASCII: True}, {LEMMA: "stay", DEP: "nmod"}],
        lemmas("new", "girl", "in", "town"),
        lemmas("recent", "move"),
        lemmas("recently", "move"),
        lemmas("relocate"),
        [{LEMMA: "new", DEP: "amod"}, {LEMMA: "city"},
         {LEMMA: "to", DEP: "dep"}],
        [{LEMMA: "new", DEP: "amod"}, {IS_ASCII: True}, {LEMMA: "city"},
         {IS_ASCII: True}, {LEMMA: "to", DEP: "dep"}],
        lemmas("new", "to", "area"),
        [{LEMMA: "new"}, {LEMMA: "to"}, {upper_start: True}],
        lemmas("first", "visit", "to"),
        [{LEMMA: "i", DEP: "nsubj"}, {LEMMA: "arrive"}],
        [{LEMMA: "girl", DEP: "nsubj"}, {LEMMA: "arrive"}, {DEP: "partmod"}],
        [{LEMMA: "girl", DEP: "nsubj"}, {IS_ASCII: True}, {LEMMA: "arrive"},
         {IS_ASCII: True}, {DEP: "partmod"}],
        [{LEMMA: "girl", DEP: "nsubj"}, {LEMMA: "arrive"}, {DEP: "quantmod"}],
        [{LEMMA: "girl", DEP: "nsubj"}, {IS_ASCII: True}, {LEMMA: "arrive"},
         {IS_ASCII: True}, {DEP: "quantmod"}],
        lemmas("just", "arrive"),
        [{LEMMA: "on"}, {LEMMA: "my"}, {LEMMA: "way"}, {LEMMA: "to"},
         {TAG: "NNP"}],
        [{LEMMA: "on"}, {LEMMA: "my"}, {LEMMA: "way"}, {LEMMA: "to"},
         {TAG: "NN"}],
        lemmas("on", "the", "way"),
        lemmas("just", "get", "here"),
        lemmas("get", "here", "today"),
        lemmas("get", "here", "yesterday"),
        lemmas("get", "here", "last", "night"),
        [{LEMMA: "i", DEP: "nsubj"}, {IS_ASCII: True}, {LEMMA: "visit"},
         {IS_ASCII: True}, {is_place: True, DEP: "dobj"}],
        [{LEMMA: "i", DEP: "nsubj"}, {LEMMA: "visit"},
         {is_place: True, DEP: "dobj"}],
    ]

    # Entity 2: strong positive patterns.
    strong_positive = [
        [{LEMMA: "new"}, {IS_ASCII: True}, {LEMMA: "in"}, {is_place: True}],
        [{LEMMA: "new"}, {IS_ASCII: True}, {IS_ASCII: True}, {LEMMA: "in"},
         {is_place: True}],
        lemmas("im", "new", "in", "town"),
        [{LEMMA: "new"}, {LEMMA: "in"}, {is_place: True}],
        [{LEMMA: "new"}, {LEMMA: "to"}, {is_place: True}],
        [{LEMMA: "new"}, {is_girl: True}, {LEMMA: "in"}, {LEMMA: "town"}],
        [{LEMMA: "new"}, {LEMMA: "to"}, {upper_start: True},
         {LEMMA: "area"}],
    ]

    # Entity 3: negative patterns.
    negative = [
        lemmas("new"),
        lemmas("girl", "in", "town"),
        lemmas("grand", "new"),
        lemmas("new", "at"),
        lemmas("new", "to", "business"),
        lemmas("new", "to", "industry"),
        lemmas("new", "to", "scenario"),
        [{LEMMA: "dream", DEP: "nsubj"}, {LEMMA: "arrive"}],
        [{LEMMA: "fantasy", DEP: "nsubj"}, {LEMMA: "arrive"}],
        [{LEMMA: "you", DEP: "nsubj"}, {LEMMA: "arrive"}],
        lemmas("area", "only"),
        [{upper_start: True}, {LEMMA: "area"}],
        [{LEMMA: "you", DEP: "nsubj"}, {LEMMA: "leave"}],
        [{LEMMA: "it", DEP: "dobj"}, {LEMMA: "leave"},
         {IS_ASCII: True, DEP: "nmod", TAG: "TO"}],
        [{LEMMA: "that", DEP: "dobj"}, {LEMMA: "leave"},
         {IS_ASCII: True, DEP: "nmod", TAG: "TO"}],
        lemmas("best", "move"),
        lemmas("next", "move"),
        [{LEMMA: "arrive"}, {IS_ASCII: True}, {IS_ASCII: True, DEP: "xcomp"}],
        [{LEMMA: "arrive"}, {IS_ASCII: True, DEP: "xcomp"}],
        [{LEMMA: "visit"}, {LEMMA: "sister", DEP: "dobj"}],
        [{LEMMA: "visit"}, {IS_ASCII: True}, {LEMMA: "sister", DEP: "dobj"}],
        [{LEMMA: "visit"}, {LEMMA: "family", DEP: "dobj"}],
        [{LEMMA: "visit"}, {IS_ASCII: True}, {LEMMA: "family", DEP: "dobj"}],
        [{LEMMA: "we", DEP: "poss"}, {LEMMA: "visit"}],
    ]

    # Direct objects of "leave" that are registered both adjacent to the
    # verb and with one arbitrary ASCII token in between.
    leave_objects = (
        "message", "msg", "txt", "text", "impression", "voicemail", "smile"
    )

    # Entity 4: strong negative patterns.
    strong_negative = [
        lemmas("town", "girl"),
        lemmas("on", "the", "town"),
        lemmas("near", "town"),
        lemmas("down", "town"),
        lemmas("town", "hall"),
        lemmas("best", "in", "town"),
        [{LEMMA: "best"}, {IS_ASCII: True}, {LEMMA: "in"}, {LEMMA: "town"}],
        [{LEMMA: "best"}, {IS_ASCII: True}, {IS_ASCII: True}, {LEMMA: "in"},
         {LEMMA: "town"}],
        [{LEMMA: "best"}, {LEMMA: "in"}, {IS_ASCII: True}, {LEMMA: "town"}],
        [{LEMMA: "best"}, {IS_ASCII: True}, {LEMMA: "in"}, {IS_ASCII: True},
         {LEMMA: "town"}],
        [{LEMMA: "best"}, {IS_ASCII: True}, {IS_ASCII: True}, {LEMMA: "in"},
         {IS_ASCII: True}, {LEMMA: "town"}],
        lemmas("not", "new", "in", "town"),
        lemmas("not", "new", "to", "town"),
        lemmas("not", "leave", "town"),
        [{LEMMA: "i", DEP: "nsubj"}, {LEMMA: "leave"},
         {LEMMA: "you", DEP: "dobj"}],
        lemmas("new", "but"),
        [{LEMMA: "new"}, {LEMMA: "backpage", DEP: "nmod", TAG: "TO"}],
        [{LEMMA: "new"}, {IS_ASCII: True},
         {LEMMA: "backpage", DEP: "nmod", TAG: "TO"}],
        [{LEMMA: "new"}, {LEMMA: "bp", DEP: "nmod", TAG: "TO"}],
        [{LEMMA: "new"}, {IS_ASCII: True},
         {LEMMA: "bp", DEP: "nmod", TAG: "TO"}],
    ]
    # DS
    strong_negative += [
        [{LEMMA: "leave"}, {LEMMA: noun, DEP: "dobj"}]
        for noun in leave_objects
    ]
    strong_negative += [
        [{LEMMA: "leave"}, {IS_ASCII: True}, {LEMMA: noun, DEP: "dobj"}]
        for noun in leave_objects
    ]
    strong_negative += [
        lemmas("leave", "satisfied"),
        [{LEMMA: "leave"}, {LEMMA: "memory", DEP: "dobj"}],
        [{LEMMA: "leave"}, {IS_ASCII: True}, {LEMMA: "memory", DEP: "dobj"}],
        lemmas("leave", "you"),
        lemmas("leave", "u"),
        lemmas("leave", "with"),
        lemmas("leave", "a", "gentleman"),
        lemmas("or", "leave"),
        lemmas("or", "i", "leave"),
        lemmas("move", "on"),
        lemmas("i", "move", "like"),
        lemmas("arrive", "on", "time"),
        lemmas("can", "move"),
        # NOTE: "new but" is registered a second time here (it is already in
        # the list above); kept so the set of registered patterns is
        # unchanged.
        lemmas("new", "but"),
        [{LEMMA: "on"}, {LEMMA: "my"}, {LEMMA: "way"}, {LEMMA: "to"},
         {TAG: "PRP"}],
        lemmas("u", "get", "here"),
        lemmas("you", "get", "here"),
        lemmas("go", "to", "town"),
        lemmas("new", "management"),
    ]

    rule_sets = [
        (1, positive),
        (2, strong_positive),
        (3, negative),
        (4, strong_negative),
    ]
    for entity_key, patterns in rule_sets:
        matcher.add_entity(entity_key)
        for token_specs in patterns:
            matcher.add_pattern(entity_key, token_specs)

    return matcher
Exemplo n.º 8
0
def load_multi_girl_matcher(nlp):
    """Build a ``Matcher`` with two numbered entities (1 and 4).

    Four word lists are turned into custom lexeme flags through the
    ``set_flag`` helper: ``FLAG30`` (multiples such as 'two' or '2'),
    ``FLAG31`` (girl words), ``FLAG33`` (show words) and ``FLAG34``
    (spellings of "and"). Entity 1 holds patterns built around a multiple
    or words like "duo"; entity 4 holds patterns around a single girl word.

    Args:
        nlp: Pipeline object providing ``vocab``.

    Returns:
        The configured ``Matcher``.
    """
    matcher = Matcher(nlp.vocab)

    # Number words plus the digit strings '2' .. '10'.
    multi_num = [
        'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten',
        'double', 'triple'
    ] + [str(x) for x in range(2, 11)]
    girl = [
        'gal', 'girl', 's**t', 'cutie', 'hottie', 'lady', 'teen', 'teenager',
        'chick', 'staff', 'gf', 'she'
    ]
    show = ['show', 'special', 'session', 'fantasy']
    dict_and = ['and', 'an', 'n', '&']

    is_multi_num = FLAG30
    is_girl = FLAG31
    is_show = FLAG33
    is_and = FLAG34
    for words, flag in ((multi_num, is_multi_num), (girl, is_girl),
                        (show, is_show), (dict_and, is_and)):
        set_flag(nlp, words, flag)

    # (entity key, patterns), registered in exactly this order.
    rule_sets = [
        (1, [
            [{is_multi_num: True}, {is_girl: True, TAG: "NNS"}],
            [{is_multi_num: True}, {is_girl: True, TAG: "NNPS"}],
            [{LOWER: "duo"}],
            [{LOWER: "2"}, {ORTH: "-"}, {LOWER: "for"}, {ORTH: "-"},
             {LOWER: "1"}],
            [{LEMMA: "double"}, {ORTH: "-"}, {LEMMA: "session"}],
            [{LEMMA: "three"}, {ORTH: "-"}, {LEMMA: "way"}],
            [{is_multi_num: True}, {LOWER: "for"}, {ORTH: "1"}],
            [{is_multi_num: True}, {LOWER: "for"}, {ORTH: "one"}],
            [{LEMMA: "double"}, {is_show: True}],
            [{is_multi_num: True}, {LOWER: "way"}],
        ]),
        (4, [
            [{LOWER: "a"}, {is_girl: True}],
            [{LOWER: "how"}, {is_girl: True}],
            [{LOWER: "for"}, {is_girl: True}],
            [{IS_ALPHA: True, DEP: "nmod"}, {is_girl: True}],
            [{is_girl: True}, {is_and: True}, {LEMMA: "gentleman"}],
            [{is_girl: True}, {ORTH: "&"}, {ORTH: "&"},
             {LEMMA: "gentleman"}],
            [{is_girl: True}, {is_and: True}, {LEMMA: "guy"}],
            [{is_girl: True}, {ORTH: "&"}, {ORTH: "&"}, {LEMMA: "guy"}],
            [{LEMMA: "gentleman"}, {is_and: True}, {is_girl: True}],
            [{LEMMA: "gentleman"}, {ORTH: "&"}, {ORTH: "&"},
             {is_girl: True}],
            [{LEMMA: "guy"}, {is_and: True}, {is_girl: True}],
            [{LEMMA: "guy"}, {ORTH: "&"}, {ORTH: "&"}, {is_girl: True}],
            [{LOWER: "she"}],
        ]),
    ]

    for entity_key, patterns in rule_sets:
        matcher.add_entity(entity_key)
        for token_specs in patterns:
            matcher.add_pattern(entity_key, token_specs)

    return matcher
Exemplo n.º 9
0
def load_credit_card_matcher(nlp):
    """Build the rule-based matcher for credit-card / payment mentions.

    Payment words are flagged with FLAG40 and visa qualifiers (as in
    "student visa") with FLAG41 through ``set_flag``.  Patterns are then
    registered, in a fixed order, under four integer entity ids:

    * ``1`` -- payment names joined by "/", ",", "&" or "and", the
      "m/card" spelling, "american express", "union pay", "credit card".
    * ``2`` -- "accept ..." / "payment: ..." followed by a payment name.
    * ``3`` -- "at <payment>" and "visa versa".
    * ``4`` -- negated credit-card phrases and visa phrases such as
      "student visa" or "apply for visa" (presumably the non-payment
      readings -- confirm against the code consuming the entity ids).

    :param nlp: spaCy pipeline whose ``vocab`` backs the matcher.
    :return: the populated ``Matcher``.
    """
    matcher = Matcher(nlp.vocab)

    payment = [
        'visa', 'mastercard', 'masterc', 'mc', 'mcard', 'cash', 'csh',
        'discover', 'amex', 'interac', 'jcb'
    ]
    visa_type = [
        'us', 'american', 'canadian', 'student', 'online', 'transit', 'need',
        'make', 'f1', 'temp', 'temporary', 'permanent', 'visitor', 'visit',
        'visiting'
    ]

    is_payment = FLAG40
    is_visa_type = FLAG41
    set_flag(nlp, payment, is_payment)
    set_flag(nlp, visa_type, is_visa_type)

    # Factories for the recurring token sequences.  Each call returns fresh
    # dicts so that no token spec object is shared between patterns.
    def pay():
        return {is_payment: True}

    def m_card():
        return [{LEMMA: "m"}, {ORTH: "/"}, {LEMMA: "card"}]

    def amex():
        return [{LOWER: "american"}, {LOWER: "express"}]

    def credit_card():
        return [{LEMMA: "credit"}, {LEMMA: "card"}]

    card_mentions = [
        [pay(), {ORTH: "/"}, pay()],
        [pay(), {ORTH: ","}, pay()],
        [pay(), {LEMMA: "and"}, pay()],
        [pay(), {ORTH: "/"}] + amex(),
        [pay(), {ORTH: ","}] + amex(),
        [pay(), {ORTH: "&"}] + amex(),
        [pay(), {LEMMA: "and"}] + amex(),
        m_card() + [{ORTH: "/"}] + m_card(),
        m_card() + [{ORTH: ","}] + m_card(),
        m_card() + [{LEMMA: "and"}] + m_card(),
        m_card() + [{ORTH: "/"}] + amex(),
        m_card() + [{ORTH: ","}] + amex(),
        m_card() + [{ORTH: "&"}] + amex(),
        m_card() + [{LEMMA: "and"}] + amex(),
        amex(),
        [{LEMMA: "diners"}, {LEMMA: "club"}, {LEMMA: "internacional"}],
        [{LOWER: "union"}, {LOWER: "pay"}],
        credit_card(),
        [{LEMMA: "creditcard"}],
    ]

    acceptance_mentions = [
        [{LEMMA: "accept"}, {LEMMA: "card"}],
        [{LEMMA: "accept"}, pay()],
        [{LEMMA: "accept"}] + m_card(),
        [{LEMMA: "accept"}, {ORTH: ":"}, pay()],
        [{LEMMA: "payment"}, {ORTH: ":"}, pay()],
        [{LEMMA: "accept"}, {ORTH: ":"}] + m_card(),
        [{LEMMA: "payment"}, {ORTH: ":"}] + m_card(),
    ]

    at_mentions = [
        [{LEMMA: "at"}, pay()],
        [{LEMMA: "at"}] + m_card(),
        [{LEMMA: "visa"}, {LEMMA: "versa"}],
    ]

    negated_or_visa_mentions = [
        [{DEP: "neg"}] + credit_card(),
        credit_card() + [{DEP: "neg"}],
        [{DEP: "neg"}, {LEMMA: "creditcard"}],
        [{LEMMA: "no"}] + credit_card(),
        [{LEMMA: "no"}, {LEMMA: "creditcard"}],
        [{LEMMA: "not"}, {IS_ASCII: True}] + credit_card(),
        [{LEMMA: "not"}, {IS_ASCII: True}, {LEMMA: "creditcard"}],
        [{is_visa_type: True}, {LEMMA: "visa"}],
        [{LEMMA: "visa"}, {LEMMA: "student"}],
        [{LEMMA: "rent"}, {LEMMA: "and"}, {LEMMA: "visa"}],
        [{LEMMA: "rent"}, {LEMMA: "and"}, {LEMMA: "credit"}],
        [{LEMMA: "visa"}, {LEMMA: "and"}, {LEMMA: "rent"}],
        [{LEMMA: "card"}, {LEMMA: "and"}, {LEMMA: "rent"}],
        [{LEMMA: "apply", DEP: "ROOT"}, {LEMMA: "for", DEP: "prep"},
         {LEMMA: "visa"}],
        [{LEMMA: "apply", DEP: "ROOT"}, {LEMMA: "visa"}],
        [{LEMMA: "arrival", DEP: "ROOT"}, {LEMMA: "visa"}],
    ]

    pattern_groups = [
        (1, card_mentions),
        (2, acceptance_mentions),
        (3, at_mentions),
        (4, negated_or_visa_mentions),
    ]
    # Each entity is declared first, then its patterns are added in order.
    for entity_id, patterns in pattern_groups:
        matcher.add_entity(entity_id)
        for pattern in patterns:
            matcher.add_pattern(entity_id, pattern)

    return matcher
Exemplo n.º 10
0
def load_outcall_matcher(nlp):
    """Build the rule-based matcher for "outcall" mentions.

    Lexemes whose lower-cased form is "-", "&" or one of the location words
    are flagged with FLAG23, FLAG24 and FLAG25 respectively.  Patterns are
    registered under four integer entity ids:

    * ``1`` -- the outcall spellings and travel phrases ("visit you",
      "come to you", "will travel", ...).
    * ``2`` -- "<outcall> only" and "your <location>" with an ``amod`` or
      ``poss`` dependency.
    * ``3`` -- a bare location word or "place".
    * ``4`` -- negated outcall spellings, "my <location>", "if ... have ..."
      constructions and a few fixed phrases.

    :param nlp: spaCy pipeline whose ``vocab`` backs the matcher.
    :return: the populated ``Matcher``.
    """
    matcher = Matcher(nlp.vocab)

    location = ['location', 'place', 'studio', 'apartment', 'home', 'house', 'hotel']
    add_to_vocab(nlp, location)

    location_ids = {nlp.vocab.strings[word.lower()] for word in location}
    hyphen_id = nlp.vocab.strings['-']
    ampersand_id = nlp.vocab.strings['&']

    is_hyphen = FLAG23
    is_ampersand = FLAG24
    is_location = FLAG25

    # (flag, ids of the lower-cased forms that receive the flag)
    flag_table = [
        (is_hyphen, {hyphen_id}),
        (is_ampersand, {ampersand_id}),
        (is_location, location_ids),
    ]
    for lexeme in nlp.vocab:
        lowered = lexeme.lower
        for flag, ids in flag_table:
            if lowered in ids:
                lexeme.set_flag(flag, True)

    # Factories return fresh token specs on every call.
    def outcall_spellings():
        return [
            [{LEMMA: "outcall"}],
            [{LEMMA: "out"}, {LEMMA: "call"}],
            [{LEMMA: "out"}, {is_hyphen: True}, {LEMMA: "call"}],
        ]

    def loc():
        return {is_location: True}

    def neg():
        return {IS_ASCII: True, DEP: "neg"}

    def dobj():
        return {IS_ASCII: True, DEP: "dobj"}

    def if_mark():
        return {LEMMA: "if", DEP: "mark"}

    offers = outcall_spellings() + [
        [{LEMMA: "your"}, loc()],
        [{LEMMA: "out"}, {LEMMA: "and"}, {LEMMA: "in"}, {LEMMA: "call"}],
        [{LEMMA: "out"}, {is_ampersand: True}, {LEMMA: "in"}, {LEMMA: "call"}],
        [{LEMMA: "visit"}, {LEMMA: "you"}],
        [{LEMMA: "mind"}, {LEMMA: "travel"}],
        [{LEMMA: "anywhere"}, {LEMMA: "and"}, {LEMMA: "everywhere"}],
        [{LEMMA: "prefer"}, {LEMMA: "residence"}],
        [{LEMMA: "prefer"}, {LEMMA: "hotel"}],
        [{LEMMA: "come"}, {LEMMA: "to"}, {LEMMA: "you"}],
        [{LEMMA: "will"}, {LEMMA: "travel"}],
    ]

    strong_offers = [
        spelling + [{LEMMA: "only"}] for spelling in outcall_spellings()
    ] + [
        [{LEMMA: "your", DEP: "amod"}, loc()],
        [{LEMMA: "your", DEP: "amod"}, {IS_ASCII: True}, loc()],
        [{LEMMA: "your", DEP: "poss"}, loc()],
        [{LEMMA: "your", DEP: "poss"}, {IS_ASCII: True}, loc()],
    ]

    bare_locations = [
        [loc()],
        [{LEMMA: "place"}],
        [{LEMMA: "be"}, {LEMMA: "place"}],
        # NOTE(review): "is" normally lemmatizes to "be", so this pattern
        # may never fire -- confirm.
        [{LEMMA: "is"}, {LEMMA: "place"}],
    ]

    exclusions = [
        # NOTE(review): "wives" is a surface form; load_incall_matcher uses
        # the lemma "wife" for the same phrase -- confirm which is intended.
        [{LEMMA: "house"}, {LEMMA: "wives"}],
        [if_mark(), {LEMMA: "have"}, dobj()],
        [if_mark(), {IS_ASCII: True}, {LEMMA: "have"}, {IS_ASCII: True}, dobj()],
        [if_mark(), {LEMMA: "have"}, {IS_ASCII: True}, dobj()],
        [{LEMMA: "my", DEP: "poss"}, loc()],
        [{LEMMA: "my", DEP: "poss"}, {IS_ASCII: True}, loc()],
    ]
    exclusions += [[{LEMMA: "no"}] + s for s in outcall_spellings()]
    exclusions += [[neg()] + s for s in outcall_spellings()]
    exclusions += [[neg(), {IS_ASCII: True}] + s for s in outcall_spellings()]
    exclusions += [
        [{LEMMA: "visit"}, {LEMMA: "your"}, {LEMMA: "city"}],
        [{IS_ASCII: True}, {LEMMA: "miss"}, {LEMMA: "out"}],
        # Second registration of "no out call", kept as in the original so
        # the matcher receives exactly the same sequence of patterns.
        [{LEMMA: "no"}, {LEMMA: "out"}, {LEMMA: "call"}],
        [{IS_ASCII: True, DEP: "dep"}, {LEMMA: "no"}],
    ]

    pattern_groups = [
        (1, offers),
        (2, strong_offers),
        (3, bare_locations),
        (4, exclusions),
    ]
    for entity_id, patterns in pattern_groups:
        matcher.add_entity(entity_id)
        for pattern in patterns:
            matcher.add_pattern(entity_id, pattern)

    return matcher
Exemplo n.º 11
0
def load_address_matcher(nlp):
    """Build the rule-based matcher for street addresses.

    The word lists ``street``, ``street_name`` and ``separator`` are read
    from the enclosing module.  Three lexeme flags are set:

    * FLAG58 -- lower-cased form is one of the ``street`` words.
    * FLAG57 -- lower-cased form is one of the ``separator`` words.
    * FLAG56 -- lexeme is alphabetic, number-like, or one of the
      ``street_name`` words.

    All patterns are registered under the key ``'ADDRESS'``.

    :param nlp: spaCy pipeline whose ``vocab`` backs the matcher.
    :return: the populated ``Matcher``.
    """
    matcher = Matcher(nlp.vocab)

    add_to_vocab(nlp, street)
    add_to_vocab(nlp, street_name)

    def lowered_ids(words):
        # String-store ids of the lower-cased words.
        return {nlp.vocab.strings[word.lower()] for word in words}

    is_street = FLAG58
    street_ids = lowered_ids(street)

    is_separator = FLAG57
    separator_ids = lowered_ids(separator)

    is_street_name = FLAG56
    street_name_ids = lowered_ids(street_name)

    for lexeme in nlp.vocab:
        lowered = lexeme.lower
        if lowered in street_ids:
            lexeme.set_flag(is_street, True)
        if lowered in separator_ids:
            lexeme.set_flag(is_separator, True)
        # Any one of the three conditions is enough for the street-name flag.
        if lexeme.is_alpha or lexeme.like_num or lowered in street_name_ids:
            lexeme.set_flag(is_street_name, True)

    name_rules = [{is_street_name: True}, {IS_ALPHA: True}]

    for name_rule in name_rules:
        # House number of 1-5 characters, then 0, 1 or 2 alphabetic filler
        # tokens, then the street name token and the street-type word.
        for digits in range(1, 6):
            for filler_count in range(3):
                matcher.add_pattern(
                    'ADDRESS',
                    [{LIKE_NUM: True, LENGTH: digits}]
                    + [{IS_ALPHA: True} for _ in range(filler_count)]
                    + [name_rule, {is_street: True}])

        # Number-less form: just "<name> <street>".
        matcher.add_pattern('ADDRESS', [name_rule, {is_street: True}])

    # Two addresses joined by a separator token, with and without numbers.
    rule_pairs = [(first, second)
                  for first in name_rules
                  for second in name_rules]
    for first_rule, second_rule in rule_pairs:
        matcher.add_pattern('ADDRESS', [
            {LIKE_NUM: True}, first_rule, {is_street: True},
            {is_separator: True},
            {LIKE_NUM: True}, second_rule, {is_street: True},
        ])
        matcher.add_pattern('ADDRESS', [
            first_rule, {is_street: True},
            {is_separator: True},
            second_rule, {is_street: True},
        ])

    return matcher
Exemplo n.º 12
0
def load_hotel_matcher(nlp):
    """Build the rule-based matcher for hotel / motel / inn mentions.

    Hotel words are flagged with FLAG29 and "and"-like connectors with
    FLAG30 through ``set_flag``.  Entity ``1`` holds the single-token hotel
    pattern; entity ``4`` holds "inn ..." phrases ("inn & out", "inn call",
    "come inn", ...) and negated hotel mentions.

    NOTE(review): FLAG29 / FLAG30 are also used with other word lists by
    load_derogatory_mentions_matcher and load_agency_matcher; confirm the
    matchers are never built on the same vocab at the same time.

    :param nlp: spaCy pipeline whose ``vocab`` backs the matcher.
    :return: the populated ``Matcher``.
    """
    matcher = Matcher(nlp.vocab)

    hotel_words = ['hotel', 'motel', 'inn', 'hotels', 'motels', 'inns']
    and_words = ['and', 'n', 'an', 'nd', '&', '/']

    is_hotel = FLAG29
    is_and = FLAG30
    set_flag(nlp, hotel_words, is_hotel)
    set_flag(nlp, and_words, is_and)

    pattern_groups = [
        (1, [
            [{is_hotel: True}],
        ]),
        (4, [
            [{LEMMA: "inn"}, {is_and: True}, {LOWER: "out"}],
            [{LEMMA: "inn"}, {is_and: True}, {LOWER: "outcall"}],
            [{LEMMA: "inn"}, {LOWER: "call"}],
            [{LEMMA: "inn"}, {is_and: True}, {LOWER: "outcalls"}],
            [{LEMMA: "inn"}, {LOWER: "calls"}],
            [{LEMMA: "inn"}, {IS_DIGIT: True}],
            [{LEMMA: "come"}, {LEMMA: "inn"}],
            [{LEMMA: "inn"}, {LEMMA: "town"}],
            [{LOWER: "no"}, {is_hotel: True}],
            [{DEP: "neg"}, {is_hotel: True}],
            [{DEP: "neg"}, {IS_ALPHA: True}, {is_hotel: True}],
        ]),
    ]
    for entity_id, patterns in pattern_groups:
        matcher.add_entity(entity_id)
        for pattern in patterns:
            matcher.add_pattern(entity_id, pattern)

    return matcher
Exemplo n.º 13
0
def load_incall_matcher(nlp):
    """Build the rule-based matcher for "incall" mentions.

    Lexemes are flagged by their lower-cased form: "-" (FLAG23), "&"
    (FLAG24), location words (FLAG25), privacy words (FLAG26) and
    cleanliness words (FLAG27).  Patterns are registered under four integer
    entity ids:

    * ``1`` -- the incall spellings, "in and/& out call", "visit i".
    * ``2`` -- "<incall> only", "<private|clean> <location>" and
      "my <location>".
    * ``3`` -- "location" / "place" on their own.
    * ``4`` -- negated incall spellings, "your <location>", and
      "not/if ... have ..." constructions.

    :param nlp: spaCy pipeline whose ``vocab`` backs the matcher.
    :return: the populated ``Matcher``.
    """
    matcher = Matcher(nlp.vocab)

    location = ['location', 'place', 'studio', 'apartment', 'home', 'house']
    private = ['private', 'discreet', 'discrete']
    clean = ['clean', 'nice', 'lovely']

    for words in (location, private, clean):
        add_to_vocab(nlp, words)

    strings = nlp.vocab.strings
    location_ids = {strings[word.lower()] for word in location}
    private_ids = {strings[word.lower()] for word in private}
    clean_ids = {strings[word.lower()] for word in clean}
    hyphen_id = strings['-']
    ampersand_id = strings['&']

    is_hyphen = FLAG23
    is_ampersand = FLAG24
    is_location = FLAG25
    is_private = FLAG26
    is_clean = FLAG27

    # (flag, ids of the lower-cased forms that receive the flag)
    flag_table = [
        (is_hyphen, {hyphen_id}),
        (is_ampersand, {ampersand_id}),
        (is_location, location_ids),
        (is_private, private_ids),
        (is_clean, clean_ids),
    ]
    for lexeme in nlp.vocab:
        lowered = lexeme.lower
        for flag, ids in flag_table:
            if lowered in ids:
                lexeme.set_flag(flag, True)

    # Factories return fresh token specs on every call.
    def incall_spellings():
        return [
            [{LEMMA: "incall"}],
            [{LEMMA: "in"}, {LEMMA: "call"}],
            [{LEMMA: "in"}, {is_hyphen: True}, {LEMMA: "call"}],
        ]

    def loc():
        return {is_location: True}

    def neg():
        return {IS_ASCII: True, DEP: "neg"}

    def dobj():
        return {IS_ASCII: True, DEP: "dobj"}

    def if_mark():
        return {LEMMA: "if", DEP: "mark"}

    offers = incall_spellings() + [
        [{LEMMA: "in"}, {LEMMA: "and"}, {LEMMA: "out"}, {LEMMA: "call"}],
        [{LEMMA: "in"}, {is_ampersand: True}, {LEMMA: "out"}, {LEMMA: "call"}],
        [{LEMMA: "visit"}, {LEMMA: "i"}],
    ]

    strong_offers = [
        spelling + [{LEMMA: "only"}] for spelling in incall_spellings()
    ] + [
        [{is_private: True, DEP: "amod"}, loc()],
        [{is_private: True, DEP: "amod"}, {IS_ASCII: True}, loc()],
        [{is_clean: True}, loc()],
        [{LEMMA: "my", DEP: "poss"}, loc()],
        [{LEMMA: "my", DEP: "poss"}, {IS_ASCII: True}, loc()],
    ]

    bare_locations = [
        [{LEMMA: "location"}],
        [{LEMMA: "place"}],
        [{LEMMA: "be"}, {LEMMA: "place"}],
        # NOTE(review): "is" normally lemmatizes to "be", so this pattern
        # may never fire -- confirm.
        [{LEMMA: "is"}, {LEMMA: "place"}],
    ]

    exclusions = [
        [{LEMMA: "house"}, {LEMMA: "wife"}],
        [{LOWER: "your", DEP: "poss"}, loc()],
        [{LOWER: "your", DEP: "poss"}, {IS_ASCII: True}, loc()],
    ]
    exclusions += [[{LEMMA: "no"}] + s for s in incall_spellings()]
    exclusions += [[neg()] + s for s in incall_spellings()]
    exclusions += [[neg(), {IS_ASCII: True}] + s for s in incall_spellings()]
    exclusions += [
        [neg(), {LEMMA: "have"}, dobj()],
        [neg(), {LEMMA: "have"}, {IS_ASCII: True}, dobj()],
        [if_mark(), {LEMMA: "have"}, dobj()],
        [if_mark(), {IS_ASCII: True}, {LEMMA: "have"}, {IS_ASCII: True}, dobj()],
        [if_mark(), {LEMMA: "have"}, {IS_ASCII: True}, dobj()],
    ]

    pattern_groups = [
        (1, offers),
        (2, strong_offers),
        (3, bare_locations),
        (4, exclusions),
    ]
    for entity_id, patterns in pattern_groups:
        matcher.add_entity(entity_id)
        for pattern in patterns:
            matcher.add_pattern(entity_id, pattern)

    return matcher
Exemplo n.º 14
0
def load_derogatory_mentions_matcher(nlp):
    """Build the rule-based matcher for derogatory mentions.

    The derogatory word list is flagged with FLAG29 and the possessives
    "your" / "ur" with FLAG30 through ``set_flag``.  Patterns are registered
    under entity ids ``1``, ``3`` and ``4``; several patterns are registered
    more than once, and that repetition is reproduced here so the matcher
    receives exactly the same sequence of patterns.

    NOTE(review): FLAG29 / FLAG30 are also used with other word lists by
    load_hotel_matcher and load_agency_matcher; confirm the matchers are
    never built on the same vocab at the same time.

    :param nlp: spaCy pipeline whose ``vocab`` backs the matcher.
    :return: the populated ``Matcher``.
    """
    matcher = Matcher(nlp.vocab)

    slur_words = ['w***e', 'bitch', 'c**t', 'psycho', 's**t']
    possessives = ['your', 'ur']

    is_slur = FLAG29
    is_your = FLAG30
    set_flag(nlp, slur_words, is_slur)
    set_flag(nlp, possessives, is_your)

    # Factories return fresh token specs on every call.
    def slur():
        return {is_slur: True}

    def i_subject():
        return {LOWER: "i", DEP: "nsubj"}

    def piece_of():
        return [{LEMMA: "piece"}, {LOWER: "of"}, {LEMMA: "shit"}]

    def negation_gap_targets():
        # negation + one alphabetic token + target phrase
        return [
            [{DEP: "neg"}, {IS_ALPHA: True}, slur()],
            [{DEP: "neg"}, {IS_ALPHA: True}, {LEMMA: "slave"}],
            [{DEP: "neg"}, {IS_ALPHA: True}, {LEMMA: "expose"}, {LOWER: "i"}],
            [{DEP: "neg"}, {IS_ALPHA: True}, {LEMMA: "expose"}, {LOWER: "me"}],
            [{DEP: "neg"}, {IS_ALPHA: True}] + piece_of(),
        ]

    mentions = [[slur()]]
    mentions += [
        [{LEMMA: verb}, {LOWER: pronoun}]
        for verb in ("expose", "violate", "f**k")
        for pronoun in ("i", "me")
    ]
    mentions += [
        [i_subject(), {IS_ALPHA: True}, {LEMMA: "violate", DEP: "xcomp"}],
        piece_of(),
        [{LOWER: "hardcore"}],
        [{is_your: True}, slur()],
        [{is_your: True}, {LEMMA: "slave"}],
        [{is_your: True, DEP: "poss"}, slur()],
        [{is_your: True}, {LEMMA: "slave"}],  # repeated registration
        [i_subject(), slur()],
        [i_subject(), {IS_ALPHA: True}, {is_slur: True, DEP: "xcomp"}],
    ]

    comparisons = [
        [{LOWER: "like"}, slur()],
        [{LEMMA: "bitch", POS: "VERB"}],
    ]

    exclusions = [
        [{LOWER: "to"}, slur()],
        [{LOWER: "nor"}, slur()],
        [{DEP: "neg"}, slur()],
        [{DEP: "neg"}, {LEMMA: "slave"}],
        [{DEP: "neg"}, {LEMMA: "expose"}, {LOWER: "i"}],
        [{DEP: "neg"}, {LEMMA: "expose"}, {LOWER: "me"}],
        [{DEP: "neg"}] + piece_of(),
        [{DEP: "neg"}, {is_your: True}, slur()],
    ]
    exclusions += negation_gap_targets()
    exclusions += [
        [{DEP: "neg"}, {IS_ALPHA: True}, {is_your: True}, slur()],
    ]
    exclusions += negation_gap_targets()  # repeated registration
    exclusions += [
        [{LEMMA: "girl"}, {LOWER: "next"}, {LEMMA: "door"}],
        [{LOWER: "with"}, {LOWER: "my"}, {LEMMA: "girl"}],
        [{LOWER: "no"}, slur()],
        [{DEP: "neg"}, {LOWER: "like"}, slur()],
        [{LEMMA: "look"}, {LEMMA: "slave", DEP: "prep"}],
        [{LOWER: "you"}, slur()],
        [{LOWER: "you"}, {POS: "ADJ"}, slur()],
    ]

    pattern_groups = [
        (1, mentions),
        (3, comparisons),
        (4, exclusions),
    ]
    for entity_id, patterns in pattern_groups:
        matcher.add_entity(entity_id)
        for pattern in patterns:
            matcher.add_pattern(entity_id, pattern)

    return matcher
Exemplo n.º 15
0
def load_social_media_matcher(nlp):
    """Build the rule-based matcher for social-media handles.

    Platform names are flagged with FLAG54 and the separators ":", "-" and
    "@" with FLAG55.  All patterns live under the single entity key
    ``"social_media"`` and are told apart by their ``label`` (1, 2, 4, 5, 6).

    :param nlp: spaCy pipeline whose ``vocab`` backs the matcher.
    :return: the populated ``Matcher``.
    """
    platforms = ['twitter', 'facebook', 'instagram', 'wechat', 'line', 'snapchat']
    separators = [':', '-', '@']
    add_to_vocab(nlp, platforms)
    add_to_vocab(nlp, separators)

    is_separator = FLAG55
    is_social_media = FLAG54
    platform_ids = {nlp.vocab.strings[name.lower()] for name in platforms}
    separator_ids = {nlp.vocab.strings[sep.lower()] for sep in separators}

    for lexeme in nlp.vocab:
        lowered = lexeme.lower
        if lowered in platform_ids:
            lexeme.set_flag(is_social_media, True)
        if lowered in separator_ids:
            lexeme.set_flag(is_separator, True)

    matcher = Matcher(nlp.vocab)
    matcher.add_entity("social_media")

    labelled_patterns = [
        # <platform> <sep> [<sep>] <handle>
        (1, [
            {is_social_media: True},
            {is_separator: True},
            {is_separator: True, 'OP': '?'},
            {IS_ASCII: True},
        ]),
        # <platform> me [<sep>] <noun>
        (2, [
            {is_social_media: True},
            {LOWER: "me", TAG: "PRP"},
            {is_separator: True, "OP": '?'},
            {IS_ASCII: True, TAG: 'NN'},
        ]),
        # Label 3 is unused: its pattern (<platform> me [<sep>] [<non-ASCII
        # token>]) was disabled in the original code.
        # <platform> id [is] [<sep>] <handle>
        (4, [
            {is_social_media: True},
            {LOWER: 'id'},
            {LOWER: 'is', 'OP': '?'},
            {is_separator: True, 'OP': '?'},
            {IS_ASCII: True},
        ]),
        # <platform> <noun> [is] to <verb> me
        (5, [
            {is_social_media: True},
            {TAG: 'NN'},
            {LOWER: 'is', 'OP': '?'},
            {LOWER: 'to'},
            {TAG: 'VB'},
            {LOWER: 'me'},
        ]),
        # add <pronoun> on <platform> <noun>
        (6, [
            {LOWER: 'add'},
            {TAG: 'PRP'},
            {LOWER: 'on'},
            {is_social_media: True},
            {TAG: 'NN'},
        ]),
    ]
    for label, pattern in labelled_patterns:
        matcher.add_pattern("social_media", pattern, label=label)

    return matcher
Exemplo n.º 16
0
def load_agency_matcher(nlp):
    """Build the rule-based matcher for "agency" mentions.

    The agency spellings are flagged with FLAG29 through ``set_flag``.
    Entity ``1`` matches any flagged token on its own; entity ``3`` matches
    a flagged token preceded by specific context ("or", "le",
    "law enforcement", "no", a negation, "ad", "not a", "tire of").

    NOTE(review): FLAG29 is also used with other word lists by
    load_hotel_matcher and load_derogatory_mentions_matcher; confirm the
    matchers are never built on the same vocab at the same time.

    :param nlp: spaCy pipeline whose ``vocab`` backs the matcher.
    :return: the populated ``Matcher``.
    """
    matcher = Matcher(nlp.vocab)

    agency_words = ['agency', 'agncy', 'agenc', 'agencies']

    is_agency = FLAG29
    set_flag(nlp, agency_words, is_agency)

    # Token sequences that may precede the agency word under entity 3.
    contexts = [
        [{LOWER: "or"}],
        [{LOWER: "le"}],
        [{LOWER: "law"}, {LOWER: "enforcement"}],
        [{LOWER: "no"}],
        [{DEP: "neg"}],
        [{LOWER: "ad"}],
        [{LOWER: "not"}, {LOWER: "a"}],
        [{LOWER: "tire"}, {LOWER: "of"}],
    ]

    matcher.add_entity(1)
    matcher.add_pattern(1, [{is_agency: True}])

    matcher.add_entity(3)
    for context in contexts:
        matcher.add_pattern(3, context + [{is_agency: True}])

    return matcher
def main():
    """Print one normalized term per line for the text read from stdin.

    The text is whitespace-normalized and parsed with the spaCy ``'en'``
    model.  For every token position the function prints, lower-cased:

    * the lemma of a registered two-word technical phrase starting there
      (the phrase's tokens are then skipped), otherwise
    * the lemma of the token if it is alphabetic, not a stop word and
      outside any named entity, and
    * the lemma of the named entity starting there, unless its type is one
      of the numeric / temporal types listed in ``ignored_entity_types``.

    Fixes relative to the previous implementation:

    * ``pattern_recognition`` was registered with the misspelling
      "parttern" and ``machine_translation`` with the tokens of
      "machine vision"; both now use the intended words.
    * Entities are looked up by start index instead of popped from a
      queue, which got out of step as soon as an ignored entity type
      (e.g. DATE) occurred before a printed one.
    * Phrase matches are indexed by start position, so a match that lies
      inside an already-consumed phrase can no longer block every later
      match.
    * The first token of a matched phrase is no longer also printed as a
      single token.
    """
    nlp = spacy.load('en')

    phrases = (
        ("deep_learning", ("deep", "learning")),
        ("artificial_intelligence", ("artificial", "intelligence")),
        ("machine_learning", ("machine", "learning")),
        ("reinforcement_learning", ("reinforcement", "learning")),
        ("pattern_recognition", ("pattern", "recognition")),
        ("computer_vision", ("computer", "vision")),
        ("machine_vision", ("machine", "vision")),
        ("machine_translation", ("machine", "translation")),
    )
    matcher = Matcher(nlp.vocab)
    for key, words in phrases:
        matcher.add_pattern(key, [{LOWER: word} for word in words])

    ignored_entity_types = (
        'DATE', 'TIME', 'MONEY', 'PERCENT', 'QUANTITY', 'ORDINAL', 'CARDINAL',
    )

    text = re.sub(r'\s+', ' ', sys.stdin.read())
    doc = nlp(text)

    # Index entities and phrase matches by the token position they start at,
    # so the token loop never depends on consuming them in lock-step.
    entity_at = {entity.start: entity for entity in doc.ents}
    phrase_end_at = {}
    for _, _, start, end in matcher(doc):
        # Keep the longest match when several start at the same token.
        if end > phrase_end_at.get(start, start):
            phrase_end_at[start] = end

    skip_until = -1
    for i, token in enumerate(doc):
        if i < skip_until:
            continue

        phrase_end = phrase_end_at.get(i)
        if phrase_end is not None:
            print(doc[i:phrase_end].lemma_.lower())
            skip_until = phrase_end
            continue

        if token.is_alpha and not token.is_stop and token.ent_iob_ == 'O':
            print(token.lemma_.lower())

        if (token.ent_iob_ == 'B'
                and token.ent_type_ not in ignored_entity_types):
            entity = entity_at.get(i)
            if entity is not None:
                print(entity.lemma_.lower())
Exemplo n.º 18
0
def load_date_matcher(nlp):
    """Create a Matcher holding the 'DATE' patterns for common date layouts.

    The month names, ordinals and date delimiters are first added to the
    vocabulary, then every lexeme whose lower-cased form is one of those
    words is tagged with a dedicated flag (FLAG62 month, FLAG61 ordinal,
    FLAG60 delimiter).  The patterns below are expressed in terms of these
    flags; each one is registered under the key 'DATE' with a numeric label
    identifying the layout.

    :param nlp: the loaded spaCy pipeline whose vocab is flagged and matched.
    :return: the configured Matcher.
    """
    matcher = Matcher(nlp.vocab)

    # Make sure every helper word is known to the vocabulary.
    for words in (months_dict.keys(), ordinals, date_delimiters):
        add_to_vocab(nlp, words)

    is_month = FLAG62
    is_ordinal = FLAG61
    is_date_delimiter = FLAG60

    # Tag the lexemes, one full vocabulary pass per flag.
    flag_sources = (
        (is_month, months_dict.keys()),
        (is_ordinal, ordinals),
        (is_date_delimiter, date_delimiters),
    )
    for flag, words in flag_sources:
        wanted_ids = {nlp.vocab.strings[word.lower()] for word in words}
        for lexeme in nlp.vocab:
            if lexeme.lower in wanted_ids:
                lexeme.set_flag(flag, True)

    # --- small builders, each returning a fresh token specification -------

    def token(attr, value, op=None):
        spec = {attr: value}
        if op is not None:
            spec['OP'] = op
        return spec

    def month():
        return {is_month: True}

    def non_digit_month():
        return {is_month: True, IS_DIGIT: False}

    def digits(length):
        return {IS_DIGIT: True, LENGTH: length}

    def ordinal(op=None):
        return token(is_ordinal, True, op)

    def delimiter(op=None):
        return token(is_date_delimiter, True, op)

    def comma(op=None):
        return token(ORTH, ',', op)

    # (label, token specs), registered in exactly this order.
    rules = [
        # March 25, 2017 / March 25th, 2017 / March 25th 2017 / March 25 2017
        (1, [month(), digits(1), ordinal('?'), comma('?'), digits(4)]),
        (2, [month(), digits(2), ordinal('?'), comma('?'), digits(4)]),

        # 25 March, 2017 / 25th March, 2017 / 25th March 2017 / 25 March 2017
        # NOTE(review): the optional ordinal sits after the month token here,
        # while the examples put it after the day -- confirm this is intended.
        (3, [digits(1), delimiter('?'), month(), ordinal('?'), comma('?'),
             digits(4)]),
        (4, [digits(2), delimiter('?'), month(), ordinal('?'), comma('?'),
             digits(4)]),

        # 25/05/2016
        (5, [digits(1), delimiter('+'), month(), delimiter('+'), digits(4)]),
        (6, [digits(2), delimiter('+'), month(), delimiter('+'), digits(4)]),

        # 05/25/2016
        (7, [month(), delimiter('+'), digits(1), delimiter('+'), digits(4)]),
        (8, [month(), delimiter('+'), digits(2), delimiter('+'), digits(4)]),

        # Diciembre, 2009 / December 2009
        (9, [non_digit_month(), comma(), digits(4)]),
        (9, [non_digit_month(), digits(4)]),

        # 2013-12-04
        (10, [digits(4), delimiter('+'), month(), delimiter('+'), digits(2)]),
        (11, [digits(4), delimiter('+'), month(), delimiter('+'), digits(1)]),

        # 9 days ago
        (12, [{IS_DIGIT: True}, {POS: 'NOUN'}, {LOWER: 'ago'}]),

        # 1 Jul / 1. Jul  (two-digit day)
        (13, [digits(2), ordinal(), delimiter(), non_digit_month()]),
        (13, [digits(2), ordinal(), non_digit_month()]),
        (13, [digits(2), delimiter(), non_digit_month()]),
        (13, [digits(2), non_digit_month()]),

        # same layouts with a one-digit day
        (14, [digits(1), ordinal(), delimiter(), non_digit_month()]),
        (14, [digits(1), ordinal(), non_digit_month()]),
        (14, [digits(1), delimiter(), non_digit_month()]),
        (14, [digits(1), non_digit_month()]),

        # Jul 2nd  (two-digit day)
        (15, [non_digit_month(), delimiter(), digits(2), ordinal()]),
        (15, [non_digit_month(), delimiter(), digits(2)]),
        (15, [non_digit_month(), digits(2), ordinal()]),
        (15, [non_digit_month(), digits(2)]),

        # same layouts with a one-digit day
        (16, [non_digit_month(), delimiter(), digits(1), ordinal()]),
        (16, [non_digit_month(), delimiter(), digits(1)]),
        (16, [non_digit_month(), digits(1), ordinal()]),
        (16, [non_digit_month(), digits(1)]),
    ]
    for label, token_specs in rules:
        matcher.add_pattern('DATE', token_specs, label=label)

    return matcher
Exemplo n.º 19
0
def load_webcam_matcher(nlp):
    """Create a Matcher for webcam-related wording.

    Two custom lexeme flags are set through ``set_flag``: FLAG29 for
    cam/video-chat words and FLAG30 for provider nouns ("girl", "model",
    ...).  Patterns are registered under two entity ids:

    * ``1`` -- wording that refers to cam or live-video shows;
    * ``4`` -- negated or otherwise qualifying wording around the same
      vocabulary ("no cam", "not ... video", "my name is cam", ...).
      NOTE(review): presumably used by callers to cancel entity-1 hits --
      confirm against the code consuming this matcher.

    :param nlp: the loaded spaCy pipeline whose vocab is flagged and matched.
    :return: the configured Matcher.
    """
    matcher = Matcher(nlp.vocab)

    cam = ['cam', 'skype', 'facetime', 'webcam', 'mfc', 'iml']
    provider = [
        'girls', 'girl', 'models', 'model', 'staffs', 'staff', 'latinas',
        'latina', 'talent', 'supermodels', 'supermodel', 'princesses',
        'princess'
    ]

    is_cam = FLAG29
    is_provider = FLAG30
    set_flag(nlp, cam, is_cam)
    set_flag(nlp, provider, is_provider)

    cam_patterns = [
        [{is_cam: True}],
        [{LOWER: "live"}, {LEMMA: "show"}],
        [{LEMMA: "video"}, {ORTH: "@"}],
        [{LOWER: "free", DEP: "amod"}, {LEMMA: "video"}],
        [{LOWER: "porno"}, {is_provider: True}],
        [{LEMMA: "add"}, {LOWER: "i"}],
        [{LOWER: "chaturbate"}],
    ]
    # "see i/me on <link>" for http, https and "www." links.  The original
    # listed "see me on http" twice and never covered "see me on https".
    link_starts = (
        [{LOWER: "http"}],
        [{LOWER: "https"}],
        [{LOWER: "www"}, {ORTH: "."}],
    )
    for link_start in link_starts:
        for pronoun in ("i", "me"):
            cam_patterns.append(
                [{LEMMA: "see"}, {LOWER: pronoun}, {LOWER: "on"}]
                + link_start)

    matcher.add_entity(1)
    for pattern in cam_patterns:
        matcher.add_pattern(1, pattern)

    qualifier_patterns = [
        [{LOWER: "i"}, {LOWER: "cam"}],
        [{LOWER: "cam"}, {LOWER: "to"}],
        [{LOWER: "you"}, {LOWER: "cam"}],
        # A negation token directly followed by cam/show vocabulary.
        [{DEP: "neg"}, {is_cam: True}],
        [{DEP: "neg"}, {LEMMA: "camshow"}],
        [{DEP: "neg"}, {LEMMA: "liveshow"}],
        [{DEP: "neg"}, {LEMMA: "skypeshow"}],
        [{DEP: "neg"}, {LEMMA: "livshow"}],
        [{DEP: "neg"}, {LOWER: "paypal"}, {LEMMA: "show"}],
        [{DEP: "neg"}, {LEMMA: "show"}],
        [{DEP: "neg"}, {LEMMA: "chaturbate"}],
        [{DEP: "neg"}, {LEMMA: "live"}],
        [{DEP: "neg"}, {LEMMA: "video"}],
        [{DEP: "neg"}, {LEMMA: "porno"}],
        [{DEP: "neg"}, {LEMMA: "girl"}, {LEMMA: "cam"}],
        [{DEP: "neg"}, {LEMMA: "girl"}, {LEMMA: "webcam"}],
    ]
    # Cam word or "video" as subject/conjunct, one alphabetic token, then an
    # alphabetic negation token.
    for dep in ("nsubj", "conj"):
        for head in ({is_cam: True, DEP: dep}, {LEMMA: "video", DEP: dep}):
            qualifier_patterns.append(
                [head, {IS_ALPHA: True}, {IS_ALPHA: True, DEP: "neg"}])
    qualifier_patterns.extend([
        [{LOWER: "no"}, {is_cam: True}],
        [{LOWER: "no"}, {LEMMA: "live"}],
        [{LOWER: "no"}, {LEMMA: "video"}],
        [{LOWER: "no"}, {LEMMA: "free"}],
        [{LOWER: "no"}, {LEMMA: "porno"}],
        [{LOWER: "no"}, {LEMMA: "chaturbate"}],
        [{LOWER: "no"}, {LEMMA: "camshow"}],
        [{LOWER: "no"}, {LEMMA: "liveshow"}],
        [{LOWER: "no"}, {LEMMA: "skypeshow"}],
        [{LOWER: "no"}, {LOWER: "paypal"}, {LEMMA: "show"}],
        # "cam" following a form of "be" or a possessive introduction.
        [{LOWER: "it"}, {LEMMA: "be"}, {LEMMA: "cam"}],
        [{LOWER: "its"}, {LEMMA: "cam"}],
        [{LOWER: "im"}, {LEMMA: "cam"}],
        [{LOWER: "i"}, {LEMMA: "be"}, {LEMMA: "cam"}],
        [{LOWER: "my"}, {LOWER: "name"}, {LEMMA: "be"}, {LEMMA: "cam"}],
        [{LEMMA: "cam"}, {LOWER: "here"}],
    ])

    matcher.add_entity(4)
    for pattern in qualifier_patterns:
        matcher.add_pattern(4, pattern)

    return matcher
Exemplo n.º 20
0
def load_like_email_matcher(nlp):
    """Return a Matcher with one single-token pattern, registered under
    key 1, that requires the token's LIKE_EMAIL attribute to be true.

    :param nlp: the loaded spaCy pipeline supplying the vocab.
    :return: the configured Matcher.
    """
    email_matcher = Matcher(nlp.vocab)
    email_token = {LIKE_EMAIL: True}
    email_matcher.add_pattern(1, [email_token])
    return email_matcher