Пример #1
0
def load_hotel_matcher(nlp):
    """Create the hotel ``Matcher`` on top of ``nlp.vocab``.

    Two entity ids are registered:

    * ``1`` -- a single token carrying the hotel flag.
    * ``4`` -- "inn ..." phrases (inn <and> out/outcall(s), inn call(s),
      inn <digits>, come inn, inn town) and hotel words preceded by "no" or
      by a token whose dependency label is ``neg``.

    NOTE(review): ``FLAG29``/``FLAG30`` are also used by other loaders in
    this file (``load_agency_matcher``, ``load_webcam_matcher``); presumably
    the flags collide when those loaders share one vocab -- confirm.

    :param nlp: pipeline object; ``nlp.vocab`` backs the matcher and ``nlp``
        itself is handed to ``set_flag``.
    :return: the configured ``Matcher``.
    """
    matcher = Matcher(nlp.vocab)

    hotel_flag, and_flag = FLAG29, FLAG30
    set_flag(nlp, ['hotel', 'motel', 'inn', 'hotels', 'motels', 'inns'],
             hotel_flag)
    set_flag(nlp, ['and', 'n', 'an', 'nd', '&', '/'], and_flag)

    rules = (
        (1, (
            [{hotel_flag: True}],
        )),
        (4, (
            [{LEMMA: "inn"}, {and_flag: True}, {LOWER: "out"}],
            [{LEMMA: "inn"}, {and_flag: True}, {LOWER: "outcall"}],
            [{LEMMA: "inn"}, {LOWER: "call"}],
            [{LEMMA: "inn"}, {and_flag: True}, {LOWER: "outcalls"}],
            [{LEMMA: "inn"}, {LOWER: "calls"}],
            [{LEMMA: "inn"}, {IS_DIGIT: True}],
            [{LEMMA: "come"}, {LEMMA: "inn"}],
            [{LEMMA: "inn"}, {LEMMA: "town"}],
            [{LOWER: "no"}, {hotel_flag: True}],
            [{DEP: "neg"}, {hotel_flag: True}],
            [{DEP: "neg"}, {IS_ALPHA: True}, {hotel_flag: True}],
        )),
    )
    # Each entity is registered first and then receives its patterns, in
    # table order.
    for entity_id, patterns in rules:
        matcher.add_entity(entity_id)
        for token_specs in patterns:
            matcher.add_pattern(entity_id, token_specs)

    return matcher
Пример #2
0
def load_agency_matcher(nlp):
    """Create the agency ``Matcher`` on top of ``nlp.vocab``.

    Entity ``1`` matches any single token carrying the agency flag.
    Entity ``3`` matches an agency token preceded by one of the fixed
    contexts listed below ("or", "le", "law enforcement", "no", a ``neg``
    dependency, "ad", "not a", "tire of").

    NOTE(review): ``FLAG29`` is also used by ``load_hotel_matcher`` and
    ``load_webcam_matcher`` in this file; presumably the flags collide when
    the loaders share one vocab -- confirm.

    :param nlp: pipeline object; ``nlp.vocab`` backs the matcher and ``nlp``
        itself is handed to ``set_flag``.
    :return: the configured ``Matcher``.
    """
    matcher = Matcher(nlp.vocab)

    agency_flag = FLAG29
    set_flag(nlp, ['agency', 'agncy', 'agenc', 'agencies'], agency_flag)

    matcher.add_entity(1)
    matcher.add_pattern(1, [{agency_flag: True}])

    matcher.add_entity(3)
    # Token specs that must appear immediately before the agency token.
    leading_contexts = (
        [{LOWER: "or"}],
        [{LOWER: "le"}],
        [{LOWER: "law"}, {LOWER: "enforcement"}],
        [{LOWER: "no"}],
        [{DEP: "neg"}],
        [{LOWER: "ad"}],
        [{LOWER: "not"}, {LOWER: "a"}],
        [{LOWER: "tire"}, {LOWER: "of"}],
    )
    for context in leading_contexts:
        matcher.add_pattern(3, context + [{agency_flag: True}])

    return matcher
Пример #3
0
def test_get_entity_attrs(en_vocab):
    """``get_entity`` returns ``{}`` without attrs and the given attrs otherwise."""
    matcher = Matcher(en_vocab)
    plain_name, attr_name = 'TestEntity', 'TestEntity2'

    matcher.add_entity(plain_name)
    assert matcher.get_entity(plain_name) == {}

    matcher.add_entity(attr_name, attrs={'Hello': 'World'})
    stored = matcher.get_entity(attr_name)
    assert stored == {'Hello': 'World'}

    # Adding the second entity must leave the first one's attrs empty.
    assert matcher.get_entity(plain_name) == {}
Пример #4
0
def load_age_matcher(nlp):
    """
    Build the "Age" matcher (acceptor: ``get_age``).

    Inputs the patterns are written for:
    Age : 22 years
    age : 22 yrs
    Age 22-40
    22 yrs
    23yrs
    22-40 years
    About me 22

    :param nlp: pipeline object; its vocab is scanned to flag "years" words
        and then backs the returned matcher.
    :return: the configured ``Matcher``.
    """

    matcher = Matcher(nlp.vocab)

    # Custom lexeme flag: set on every vocab entry whose lowercase id is one
    # of the "years" words.
    year_flag = FLAG63
    year_ids = set()
    for word in ['years', 'yrs', 'year']:
        year_ids.add(nlp.vocab.strings[word.lower()])
    for lex in nlp.vocab:
        if lex.lower in year_ids:
            lex.set_flag(year_flag, True)

    def two_digits():
        # Fresh spec for a two-character, all-digit token.
        return {IS_DIGIT: True, LENGTH: 2}

    def age_word():
        return {LOWER: "age"}

    def punct():
        return {IS_PUNCT: True}

    def year_word():
        return {year_flag: True}

    # Entity type: Age
    matcher.add_entity("Age", acceptor=get_age)

    age_patterns = (
        # "age : 22" / "age 22"
        [age_word(), punct(), two_digits()],
        [age_word(), two_digits()],
        # "age : 22-40" / "age 22-40"
        [age_word(), punct(), two_digits(), punct(), two_digits()],
        [age_word(), two_digits(), punct(), two_digits()],
        # "22 yrs"
        [two_digits(), year_word()],
        # single five-character token ending in "yrs", e.g. "23yrs"
        [{SUFFIX: "yrs", LENGTH: 5}],
        # "22-40 years" with an optional punctuation / ASCII token in between
        [two_digits(), {IS_PUNCT: True, 'OP': '?'}, two_digits(), year_word()],
        [two_digits(), {IS_ASCII: True, 'OP': '?'}, two_digits(), year_word()],
        # "about me 22" ("me" optional)
        [{LOWER: 'about'}, {LOWER: 'me', 'OP': '?'}, {IS_DIGIT: True}],
    )
    for token_specs in age_patterns:
        matcher.add_pattern("Age", token_specs)

    return matcher
Пример #5
0
def test_get_entity_via_match(en_vocab):
    """The entity id found in a match tuple can be used to fetch the entity's attrs."""
    def make_doc():
        # A fresh two-token document for every matcher call.
        return Doc(en_vocab, words=[u'Test', u'Entity'])

    matcher = Matcher(en_vocab)
    matcher.add_entity('TestEntity', attrs={u'Hello': u'World'})

    # No patterns yet: nothing can match.
    assert matcher.n_patterns == 0
    assert matcher(make_doc()) == []

    matcher.add_pattern(u'TestEntity', [{ORTH: u'Test'}, {ORTH: u'Entity'}])
    assert matcher.n_patterns == 1

    found = matcher(make_doc())
    assert len(found) == 1
    only_match = found[0]
    assert len(only_match) == 4

    ent_id, label, start, end = only_match
    assert ent_id == matcher.vocab.strings[u'TestEntity']
    assert label == 0
    assert start == 0
    assert end == 2

    assert matcher.get_entity(ent_id) == {u'Hello': u'World'}
    def __init__(self, nlp):
        """Build the matcher for the 'object' and 'subject' rules and pass it to the base initializer.

        Both rules share one token pattern; they differ in the acceptor
        produced by ``self.make_intersect_ar`` and in the ``Part`` label
        attached to their pattern.

        :param nlp: pipeline object; only ``nlp.vocab`` is used here.
        """
        matcher = Matcher(nlp.vocab)

        # spaCy documents ENT_IOB as 3 = token begins an entity, 1 = token is
        # inside one. Shape: a non-number-like entity start, any number of
        # inside tokens, then an optional number-like token.
        iob_pattern = [
            {a.LIKE_NUM: False, a.ENT_IOB: 3},
            {'OP': '*', a.ENT_IOB: 1},
            {'OP': '?', a.LIKE_NUM: True},
        ]

        # (rule name, make_intersect_ar arguments, label). The rule name is
        # what associates matches with their patterns.
        for rule_name, acceptor_args, part in (
                ('object', (2, 2), Part.OBJ),
                ('subject', (), Part.SUBJ),
        ):
            matcher.add_entity(
                rule_name, acceptor=self.make_intersect_ar(*acceptor_args))
            matcher.add_pattern(rule_name, iob_pattern, label=part)

        # Disabled experiments, kept for reference:
        #
        # conjugation_pattern = iob_pattern + [{a.POS: 'CONJ'}]
        #
        # entity_name = 'version'
        # ver_pattern1 = [{a.LEMMA: 'version'}, {a.LIKE_NUM: True}]
        # matcher.add_entity(entity_name, acceptor=self.make_intersect_ar(1,1))
        # matcher.add_pattern(entity_name, ver_pattern1, label=Part.SUBJ)
        # matcher.add_pattern(entity_name, ver_pattern1, label=Part.OBJ)
        #
        # entity_name = 'location'
        # entity_name = 'date'
        # matcher.add_entity(entity_name, acceptor=self.make_inclusion_ar(1,1))
        # matcher.add_pattern(entity_name, [{a.ENT_TYPE: 'DATE'}], label=Part.OBJ)

        self.entity_rules = ['subject', 'object']
        super().__init__(matcher)
Пример #7
0
def load_risky_activities_matcher(nlp):
    """Create the risky-activities ``Matcher`` on top of ``nlp.vocab``.

    Registered entity ids and their patterns:

    * ``1`` -- any single token carrying the risky-activity flag.
    * ``2`` -- "hardcore sex" / "hardcore service" (by lemma).
    * ``3`` -- the lemma "hardcore" on its own.
    * ``4`` -- "greek" followed by digits or a provider word, an
      "if <ROOT word> <risky word>" clause, and "<risky word> sorry".

    :param nlp: pipeline object; ``nlp.vocab`` backs the matcher and ``nlp``
        itself is handed to ``set_flag``.
    :return: the configured ``Matcher``.
    """
    matcher = Matcher(nlp.vocab)

    risky_words = [
        'bareback', 'uncovered', 'bbbjtcim', 'bbbj', 'bbbjtc', 'bbbjtcws',
        'bbbjwf', 'bbfs', 'anal', 'greek', 'rca', 'swallow', 'cim', 'choke',
        'bdsm', 'bondage', 'g******g', 'hardcore'
    ]
    provider_words = [
        'girl', 'girls', 'model', 'models', 'staff', 'staffs', 'latina',
        'latinas', 'talent', 'talents', 'supermodel', 'supermodels',
        'princess', 'princesses'
    ]

    risky_flag, provider_flag = FLAG40, FLAG41
    set_flag(nlp, risky_words, risky_flag)
    set_flag(nlp, provider_words, provider_flag)

    rules = (
        (1, (
            [{risky_flag: True}],
        )),
        (2, (
            [{LEMMA: "hardcore"}, {LEMMA: "sex"}],
            [{LEMMA: "hardcore"}, {LEMMA: "service"}],
        )),
        (3, (
            [{LEMMA: "hardcore"}],
        )),
        (4, (
            [{LEMMA: "greek"}, {IS_DIGIT: True}],
            [{LEMMA: "greek"}, {provider_flag: True}],
            [
                {LEMMA: "if", DEP: "mark"},
                {IS_ALPHA: True, DEP: "ROOT"},
                {risky_flag: True},
            ],
            [{risky_flag: True}, {LEMMA: "sorry"}],
        )),
    )
    # Register each entity, then its patterns, in table order.
    for entity_id, patterns in rules:
        matcher.add_entity(entity_id)
        for token_specs in patterns:
            matcher.add_pattern(entity_id, token_specs)

    return matcher
Пример #8
0
def load_webcam_matcher(nlp):
    """Create the webcam ``Matcher`` on top of ``nlp.vocab``.

    Entity ``1`` collects cam mentions: a cam-flagged token, "live show",
    "video @", "free video", "porno <provider>", "add i", "chaturbate" and
    "see i/me on http|https|www.".

    Entity ``4`` collects the contexts listed in the second half of the
    function: "i cam" / "cam to" / "you cam", cam-related words preceded by
    a ``neg`` dependency or by "no", "<cam|video> <word> <negation>", and
    phrases where "cam" follows "it is" / "its" / "im" / "i am" /
    "my name is" or precedes "here".

    Fix over the previous revision: the "see ... on ..." group registered
    "see me on http" twice and never registered "see me on https"; the
    group is now generated from one loop so every prefix gets both the "i"
    and the "me" variant exactly once.

    NOTE(review): ``FLAG29``/``FLAG30`` are also used by other loaders in
    this file (``load_hotel_matcher``, ``load_agency_matcher``,
    ``load_multi_girl_matcher``); presumably the flags collide when the
    loaders share one vocab -- confirm.

    :param nlp: pipeline object; ``nlp.vocab`` backs the matcher and ``nlp``
        itself is handed to ``set_flag``.
    :return: the configured ``Matcher``.
    """
    matcher = Matcher(nlp.vocab)

    cam = ['cam', 'skype', 'facetime', 'webcam', 'mfc', 'iml']
    provider = [
        'girls', 'girl', 'models', 'model', 'staffs', 'staff', 'latinas',
        'latina', 'talent', 'supermodels', 'supermodel', 'princesses',
        'princess'
    ]

    is_cam = FLAG29
    is_provider = FLAG30
    set_flag(nlp, cam, is_cam)
    set_flag(nlp, provider, is_provider)

    # Factories return fresh spec objects on every call so that no dict is
    # shared between two registered patterns.
    def lemmas(*words):
        return [{LEMMA: word} for word in words]

    def lowers(*words):
        return [{LOWER: word} for word in words]

    def neg():
        return [{DEP: "neg"}]

    matcher.add_entity(1)
    for token_specs in (
            [{is_cam: True}],
            lowers("live") + lemmas("show"),
            lemmas("video") + [{ORTH: "@"}],
            [{LOWER: "free", DEP: "amod"}] + lemmas("video"),
            lowers("porno") + [{is_provider: True}],
            lemmas("add") + lowers("i"),
            lowers("chaturbate"),
    ):
        matcher.add_pattern(1, token_specs)

    # "see i|me on http", "... https", "... www ."
    for url_prefix in ("http", "https", "www"):
        for pronoun in ("i", "me"):
            token_specs = lemmas("see") + lowers(pronoun, "on", url_prefix)
            if url_prefix == "www":
                # "www" must be followed by a literal dot token.
                token_specs.append({ORTH: "."})
            matcher.add_pattern(1, token_specs)

    matcher.add_entity(4)
    entity4 = [
        lowers("i", "cam"),
        lowers("cam", "to"),
        lowers("you", "cam"),
        neg() + [{is_cam: True}],
    ]
    # <negation> + cam-related word(s)
    entity4 += [
        neg() + lemmas(word)
        for word in ("camshow", "liveshow", "skypeshow", "livshow")
    ]
    entity4.append(neg() + lowers("paypal") + lemmas("show"))
    entity4 += [
        neg() + lemmas(word)
        for word in ("show", "chaturbate", "live", "video", "porno")
    ]
    entity4 += [neg() + lemmas("girl", word) for word in ("cam", "webcam")]
    # <cam|video as nsubj/conj> <alpha word> <alpha negation>
    for dep in ("nsubj", "conj"):
        for head in ({is_cam: True, DEP: dep}, {LEMMA: "video", DEP: dep}):
            entity4.append(
                [head, {IS_ALPHA: True}, {IS_ALPHA: True, DEP: "neg"}])
    # "no" + cam-related word(s)
    entity4.append(lowers("no") + [{is_cam: True}])
    entity4 += [
        lowers("no") + lemmas(word)
        for word in ("live", "video", "free", "porno", "chaturbate",
                     "camshow", "liveshow", "skypeshow")
    ]
    entity4.append(lowers("no", "paypal") + lemmas("show"))
    # "cam" following "it is" / "its" / "im" / "i am" / "my name is",
    # or preceding "here"
    entity4 += [
        lowers("it") + lemmas("be", "cam"),
        lowers("its") + lemmas("cam"),
        lowers("im") + lemmas("cam"),
        lowers("i") + lemmas("be", "cam"),
        lowers("my", "name") + lemmas("be", "cam"),
        lemmas("cam") + lowers("here"),
    ]
    for token_specs in entity4:
        matcher.add_pattern(4, token_specs)

    return matcher
Пример #9
0
def load_multi_girl_matcher(nlp):
    """Create the multi-girl ``Matcher`` on top of ``nlp.vocab``.

    Entity ``1`` holds "<number> <plural girl word>", "duo", "2-for-1",
    "double-session", "three-way", "<number> for 1/one",
    "double <show word>" and "<number> way".

    Entity ``4`` holds a girl word after "a" / "how" / "for" / an ``nmod``
    word, girl words joined with "gentleman" or "guy" (either order, joined
    by an and-word or by two "&" tokens), and the word "she".

    NOTE(review): ``FLAG30`` is also used by ``load_hotel_matcher`` and
    ``load_webcam_matcher`` in this file; presumably the flags collide when
    the loaders share one vocab -- confirm.

    :param nlp: pipeline object; ``nlp.vocab`` backs the matcher and ``nlp``
        itself is handed to ``set_flag``.
    :return: the configured ``Matcher``.
    """
    matcher = Matcher(nlp.vocab)

    number_words = [
        'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten',
        'double', 'triple'
    ] + list(map(str, range(2, 11)))
    girl_words = [
        'gal', 'girl', 's**t', 'cutie', 'hottie', 'lady', 'teen', 'teenager',
        'chick', 'staff', 'gf', 'she'
    ]
    show_words = ['show', 'special', 'session', 'fantasy']
    and_words = ['and', 'an', 'n', '&']

    number_flag = FLAG30
    girl_flag = FLAG31
    show_flag = FLAG33
    and_flag = FLAG34
    set_flag(nlp, number_words, number_flag)
    set_flag(nlp, girl_words, girl_flag)
    set_flag(nlp, show_words, show_flag)
    set_flag(nlp, and_words, and_flag)

    # Factories return fresh spec lists so no dict is shared across patterns.
    def number():
        return [{number_flag: True}]

    def girl():
        return [{girl_flag: True}]

    def and_word():
        return [{and_flag: True}]

    def double_amp():
        return [{ORTH: "&"}, {ORTH: "&"}]

    matcher.add_entity(1)
    entity1 = (
        number() + [{girl_flag: True, TAG: "NNS"}],
        number() + [{girl_flag: True, TAG: "NNPS"}],
        [{LOWER: "duo"}],
        [{LOWER: "2"}, {ORTH: "-"}, {LOWER: "for"}, {ORTH: "-"}, {LOWER: "1"}],
        [{LEMMA: "double"}, {ORTH: "-"}, {LEMMA: "session"}],
        [{LEMMA: "three"}, {ORTH: "-"}, {LEMMA: "way"}],
        number() + [{LOWER: "for"}, {ORTH: "1"}],
        number() + [{LOWER: "for"}, {ORTH: "one"}],
        [{LEMMA: "double"}, {show_flag: True}],
        number() + [{LOWER: "way"}],
    )
    for token_specs in entity1:
        matcher.add_pattern(1, token_specs)

    matcher.add_entity(4)
    entity4 = [
        [{LOWER: "a"}] + girl(),
        [{LOWER: "how"}] + girl(),
        [{LOWER: "for"}] + girl(),
        [{IS_ALPHA: True, DEP: "nmod"}] + girl(),
    ]
    men = ("gentleman", "guy")
    # "<girl> and <man>" / "<girl> & & <man>"
    for man in men:
        entity4.append(girl() + and_word() + [{LEMMA: man}])
        entity4.append(girl() + double_amp() + [{LEMMA: man}])
    # "<man> and <girl>" / "<man> & & <girl>"
    for man in men:
        entity4.append([{LEMMA: man}] + and_word() + girl())
        entity4.append([{LEMMA: man}] + double_amp() + girl())
    entity4.append([{LOWER: "she"}])
    for token_specs in entity4:
        matcher.add_pattern(4, token_specs)

    return matcher
Пример #10
0
def load_social_media_matcher(nlp):
    """Create the "social_media" ``Matcher`` on top of ``nlp.vocab``.

    The platform names and separator characters are first passed to
    ``add_to_vocab``; every vocab entry whose lowercase id equals one of
    them is then flagged. All patterns belong to the single entity
    "social_media" and are told apart by their numeric ``label``.

    :param nlp: pipeline object; its vocab is scanned and flagged, then used
        to build the matcher.
    :return: the configured ``Matcher``.
    """

    platforms = ['twitter', 'facebook', 'instagram', 'wechat', 'line', 'snapchat']
    separator_chars = [':', '-', '@']
    add_to_vocab(nlp, platforms)
    add_to_vocab(nlp, separator_chars)

    separator_flag = FLAG55
    platform_flag = FLAG54
    platform_ids = set(nlp.vocab.strings[name.lower()] for name in platforms)
    separator_ids = set(nlp.vocab.strings[char.lower()] for char in separator_chars)

    # Two independent checks: an entry may receive either flag.
    for lex in nlp.vocab:
        if lex.lower in platform_ids:
            lex.set_flag(platform_flag, True)
        if lex.lower in separator_ids:
            lex.set_flag(separator_flag, True)

    matcher = Matcher(nlp.vocab)
    matcher.add_entity("social_media")

    labelled_patterns = (
        # <platform> <sep> [<sep>] <ascii token>
        (1, [
            {platform_flag: True},
            {separator_flag: True},
            {separator_flag: True, 'OP': '?'},
            {IS_ASCII: True},
        ]),
        # <platform> me [<sep>] <ascii noun>
        (2, [
            {platform_flag: True},
            {LOWER: "me", TAG: "PRP"},
            {separator_flag: True, "OP": '?'},
            {IS_ASCII: True, TAG: 'NN'},
        ]),
        # Label 3 is disabled:
        #   {platform_flag: True},
        #   {LOWER: "me", TAG: "PRP"},
        #   {separator_flag: True, "OP": '?'},
        #   {IS_ASCII: False, "OP": "?"},
        # <platform> id [is] [<sep>] <ascii token>
        (4, [
            {platform_flag: True},
            {LOWER: 'id'},
            {LOWER: 'is', 'OP': '?'},
            {separator_flag: True, 'OP': '?'},
            {IS_ASCII: True},
        ]),
        # <platform> <noun> [is] to <verb> me
        (5, [
            {platform_flag: True},
            {TAG: 'NN'},
            {LOWER: 'is', 'OP': '?'},
            {LOWER: 'to'},
            {TAG: 'VB'},
            {LOWER: 'me'},
        ]),
        # add <pronoun> on <platform> <noun>
        (6, [
            {LOWER: 'add'},
            {TAG: 'PRP'},
            {LOWER: 'on'},
            {platform_flag: True},
            {TAG: 'NN'},
        ]),
    )
    for label, token_specs in labelled_patterns:
        matcher.add_pattern("social_media", token_specs, label=label)

    return matcher
Пример #11
0
def test_add_empty_entity(en_vocab):
    """An entity without patterns adds no patterns and yields no matches."""
    matcher = Matcher(en_vocab)
    matcher.add_entity('TestEntity')
    assert matcher.n_patterns == 0

    doc = Doc(en_vocab, words=[u'Test', u'Entity'])
    found = matcher(doc)
    assert found == []
Пример #12
0
def load_credit_card_matcher(nlp):
    """Create the credit-card ``Matcher`` on top of ``nlp.vocab``.

    Registered entity ids and their patterns:

    * ``1`` -- two payment methods joined by "/", "," or "and"; a payment
      method joined to "american express" by "/", ",", "&" or "and" (the
      same with the "m / card" spelling in place of a payment word); plus
      "american express", "diners club internacional", "union pay",
      "credit card" and "creditcard".
    * ``2`` -- "accept" / "accept :" / "payment :" followed by a card or
      payment method.
    * ``3`` -- "at <payment method>", "at m / card" and "visa versa".
    * ``4`` -- negated "credit card" / "creditcard" phrases, visa phrases
      ("<visa type> visa", "visa student", "apply (for) visa",
      "arrival visa") and "rent and ..." combinations.

    NOTE(review): "internacional" is reproduced exactly as found; confirm
    whether "international" was intended.

    :param nlp: pipeline object; ``nlp.vocab`` backs the matcher and ``nlp``
        itself is handed to ``set_flag``.
    :return: the configured ``Matcher``.
    """
    matcher = Matcher(nlp.vocab)

    payment_words = [
        'visa', 'mastercard', 'masterc', 'mc', 'mcard', 'cash', 'csh',
        'discover', 'amex', 'interac', 'jcb'
    ]
    visa_type_words = [
        'us', 'american', 'canadian', 'student', 'online', 'transit', 'need',
        'make', 'f1', 'temp', 'temporary', 'permanent', 'visitor', 'visit',
        'visiting'
    ]

    payment_flag, visa_type_flag = FLAG40, FLAG41
    set_flag(nlp, payment_words, payment_flag)
    set_flag(nlp, visa_type_words, visa_type_flag)

    # Factories: every call returns fresh spec objects, so each registered
    # pattern gets its own list of its own dicts.
    def lemmas(*words):
        return [{LEMMA: word} for word in words]

    def lowers(*words):
        return [{LOWER: word} for word in words]

    def orth(symbol):
        return [{ORTH: symbol}]

    def pay():
        return [{payment_flag: True}]

    def m_card():
        # The "m / card" spelling.
        return lemmas("m") + orth("/") + lemmas("card")

    def amex():
        return lowers("american", "express")

    def register(entity_id, patterns):
        for token_specs in patterns:
            matcher.add_pattern(entity_id, token_specs)

    matcher.add_entity(1)
    for method in (pay, m_card):
        # <method> <separator> <method>
        for separator in (orth("/"), orth(","), lemmas("and")):
            matcher.add_pattern(1, method() + separator + method())
        # <method> <separator> american express
        for separator in (orth("/"), orth(","), orth("&"), lemmas("and")):
            matcher.add_pattern(1, method() + separator + amex())
    register(1, (
        amex(),
        lemmas("diners", "club", "internacional"),
        lowers("union", "pay"),
        lemmas("credit", "card"),
        lemmas("creditcard"),
    ))

    matcher.add_entity(2)
    register(2, (
        lemmas("accept", "card"),
        lemmas("accept") + pay(),
        lemmas("accept") + m_card(),
        lemmas("accept") + orth(":") + pay(),
        lemmas("payment") + orth(":") + pay(),
        lemmas("accept") + orth(":") + m_card(),
        lemmas("payment") + orth(":") + m_card(),
    ))

    matcher.add_entity(3)
    register(3, (
        lemmas("at") + pay(),
        lemmas("at") + m_card(),
        lemmas("visa", "versa"),
    ))

    matcher.add_entity(4)
    register(4, (
        [{DEP: "neg"}] + lemmas("credit", "card"),
        lemmas("credit", "card") + [{DEP: "neg"}],
        [{DEP: "neg"}] + lemmas("creditcard"),
        lemmas("no", "credit", "card"),
        lemmas("no", "creditcard"),
        lemmas("not") + [{IS_ASCII: True}] + lemmas("credit", "card"),
        lemmas("not") + [{IS_ASCII: True}] + lemmas("creditcard"),
        [{visa_type_flag: True}] + lemmas("visa"),
        lemmas("visa", "student"),
        lemmas("rent", "and", "visa"),
        lemmas("rent", "and", "credit"),
        lemmas("visa", "and", "rent"),
        lemmas("card", "and", "rent"),
        [{LEMMA: "apply", DEP: "ROOT"}, {LEMMA: "for", DEP: "prep"}]
        + lemmas("visa"),
        [{LEMMA: "apply", DEP: "ROOT"}] + lemmas("visa"),
        [{LEMMA: "arrival", DEP: "ROOT"}] + lemmas("visa"),
    ))

    return matcher
Пример #13
0
def load_outcall_matcher(nlp):
    """Create the outcall ``Matcher`` on top of ``nlp.vocab``.

    The location words are passed to ``add_to_vocab``; every vocab entry
    whose lowercase id equals "-", "&" or a location word is then flagged.

    Registered entity ids and their patterns:

    * ``1`` -- "outcall" / "out call" / "out-call", "your <location>",
      "out and|& in call" and travel phrases ("visit you", "will travel",
      ...).
    * ``2`` -- the outcall spellings followed by "only", and "your"
      (``amod`` or ``poss``) followed by a location, optionally with one
      ASCII token in between.
    * ``3`` -- a bare location word, "place", "be place", "is place".
    * ``4`` -- "house wives", "if ... have <dobj>" clauses,
      "my <location>", outcall spellings preceded by "no" or a negation,
      "visit your city", "<token> miss out" and "<dep token> no".

    Fix over the previous revision: the pattern "no out call" was
    registered twice for entity ``4``; it is now registered once.

    NOTE(review): the lemma of "is" is normally "be", so the "is place"
    pattern may never match -- confirm against the lemmatizer in use.

    :param nlp: pipeline object; its vocab is scanned and flagged, then used
        to build the matcher.
    :return: the configured ``Matcher``.
    """
    matcher = Matcher(nlp.vocab)

    location = ['location', 'place', 'studio', 'apartment', 'home', 'house', 'hotel']

    add_to_vocab(nlp, location)

    location_ids = {nlp.vocab.strings[s.lower()] for s in location}
    hyphen_id = nlp.vocab.strings['-']
    ampersand_id = nlp.vocab.strings['&']

    is_hyphen = FLAG23
    is_ampersand = FLAG24
    is_location = FLAG25

    for lexeme in nlp.vocab:
        if lexeme.lower == hyphen_id:
            lexeme.set_flag(is_hyphen, True)
        if lexeme.lower == ampersand_id:
            lexeme.set_flag(is_ampersand, True)
        if lexeme.lower in location_ids:
            lexeme.set_flag(is_location, True)

    # Factories return fresh spec objects on every call so that no dict is
    # shared between two registered patterns.
    def lemmas(*words):
        return [{LEMMA: word} for word in words]

    def any_ascii():
        return [{IS_ASCII: True}]

    def negation():
        return [{IS_ASCII: True, DEP: "neg"}]

    def place():
        return [{is_location: True}]

    def outcall_forms():
        # The three spellings: "outcall", "out call", "out-call".
        return (
            lemmas("outcall"),
            lemmas("out", "call"),
            lemmas("out") + [{is_hyphen: True}] + lemmas("call"),
        )

    def register(entity_id, patterns):
        for token_specs in patterns:
            matcher.add_pattern(entity_id, token_specs)

    matcher.add_entity(1)
    register(1, outcall_forms())
    register(1, (
        lemmas("your") + place(),
        lemmas("out", "and", "in", "call"),
        lemmas("out") + [{is_ampersand: True}] + lemmas("in", "call"),
        lemmas("visit", "you"),
        lemmas("mind", "travel"),
        lemmas("anywhere", "and", "everywhere"),
        lemmas("prefer", "residence"),
        lemmas("prefer", "hotel"),
        lemmas("come", "to", "you"),
        lemmas("will", "travel"),
    ))

    matcher.add_entity(2)
    register(2, [form + lemmas("only") for form in outcall_forms()])
    for dep in ("amod", "poss"):
        matcher.add_pattern(2, [{LEMMA: "your", DEP: dep}] + place())
        matcher.add_pattern(
            2, [{LEMMA: "your", DEP: dep}] + any_ascii() + place())

    matcher.add_entity(3)
    register(3, (
        place(),
        lemmas("place"),
        lemmas("be", "place"),
        lemmas("is", "place"),
    ))

    matcher.add_entity(4)
    register(4, (
        lemmas("house", "wives"),
        [{LEMMA: "if", DEP: "mark"}] + lemmas("have")
        + [{IS_ASCII: True, DEP: "dobj"}],
        [{LEMMA: "if", DEP: "mark"}] + any_ascii() + lemmas("have")
        + any_ascii() + [{IS_ASCII: True, DEP: "dobj"}],
        [{LEMMA: "if", DEP: "mark"}] + lemmas("have") + any_ascii()
        + [{IS_ASCII: True, DEP: "dobj"}],
        [{LEMMA: "my", DEP: "poss"}] + place(),
        [{LEMMA: "my", DEP: "poss"}] + any_ascii() + place(),
    ))
    # "no <outcall>", "<neg> <outcall>", "<neg> <token> <outcall>"
    register(4, [lemmas("no") + form for form in outcall_forms()])
    register(4, [negation() + form for form in outcall_forms()])
    register(4, [negation() + any_ascii() + form for form in outcall_forms()])
    register(4, (
        lemmas("visit", "your", "city"),
        any_ascii() + lemmas("miss", "out"),
        [{IS_ASCII: True, DEP: "dep"}] + lemmas("no"),
    ))

    return matcher
Пример #14
0
def load_incall_matcher(nlp):
    """Build the Matcher that holds the in-call phrase patterns.

    The location / private / clean word lists are passed to
    ``add_to_vocab``, lexemes whose lowercase id matches one of those words
    (or ``-`` / ``&``) are marked with custom flags FLAG23-FLAG27, and the
    token patterns are then registered under entity ids 1 to 4.

    NOTE(review): this function does not say what ids 1-4 stand for.
    Presumably they follow the positive / strong positive / negative /
    strong negative scheme named in load_movement_matcher's comments --
    confirm with the caller.

    :param nlp: pipeline object; only ``nlp.vocab`` is used here.
    :return: the populated ``Matcher``.
    """
    matcher = Matcher(nlp.vocab)

    location = ['location', 'place', 'studio', 'apartment', 'home', 'house']
    private = ['private', 'discreet', 'discrete']
    clean = ['clean', 'nice', 'lovely']

    for word_list in (location, private, clean):
        add_to_vocab(nlp, word_list)

    # Resolve every word of interest to its string-store id, once, up front.
    strings = nlp.vocab.strings
    location_ids = {strings[word.lower()] for word in location}
    private_ids = {strings[word.lower()] for word in private}
    clean_ids = {strings[word.lower()] for word in clean}
    hyphen_id = strings['-']
    ampersand_id = strings['&']

    is_hyphen = FLAG23
    is_ampersand = FLAG24
    is_location = FLAG25
    is_private = FLAG26
    is_clean = FLAG27

    # (flag, ids of the lowercase forms that should carry that flag)
    flag_table = (
        (is_hyphen, {hyphen_id}),
        (is_ampersand, {ampersand_id}),
        (is_location, location_ids),
        (is_private, private_ids),
        (is_clean, clean_ids),
    )
    for lexeme in nlp.vocab:
        lowered = lexeme.lower
        for flag, wanted_ids in flag_table:
            if lowered in wanted_ids:
                lexeme.set_flag(flag, True)

    def register(entity_id, patterns):
        # Declare the entity, then add its patterns in the listed order.
        matcher.add_entity(entity_id)
        for token_specs in patterns:
            matcher.add_pattern(entity_id, token_specs)

    register(1, [
        [{LEMMA: "incall"}],
        [{LEMMA: "in"}, {LEMMA: "call"}],
        [{LEMMA: "in"}, {is_hyphen: True}, {LEMMA: "call"}],
        [{LEMMA: "in"}, {LEMMA: "and"}, {LEMMA: "out"}, {LEMMA: "call"}],
        [{LEMMA: "in"}, {is_ampersand: True}, {LEMMA: "out"}, {LEMMA: "call"}],
        [{LEMMA: "visit"}, {LEMMA: "i"}],
    ])

    register(2, [
        [{LEMMA: "incall"}, {LEMMA: "only"}],
        [{LEMMA: "in"}, {LEMMA: "call"}, {LEMMA: "only"}],
        [{LEMMA: "in"}, {is_hyphen: True}, {LEMMA: "call"}, {LEMMA: "only"}],
        [{is_private: True, DEP: "amod"}, {is_location: True}],
        [{is_private: True, DEP: "amod"}, {IS_ASCII: True},
         {is_location: True}],
        [{is_clean: True}, {is_location: True}],
        [{LEMMA: "my", DEP: "poss"}, {is_location: True}],
        [{LEMMA: "my", DEP: "poss"}, {IS_ASCII: True}, {is_location: True}],
    ])

    register(3, [
        [{LEMMA: "location"}],
        [{LEMMA: "place"}],
        [{LEMMA: "be"}, {LEMMA: "place"}],
        [{LEMMA: "is"}, {LEMMA: "place"}],
    ])

    register(4, [
        [{LEMMA: "house"}, {LEMMA: "wife"}],
        [{LOWER: "your", DEP: "poss"}, {is_location: True}],
        [{LOWER: "your", DEP: "poss"}, {IS_ASCII: True}, {is_location: True}],
        # "no incall" in its one-, two- and hyphenated-token spellings.
        [{LEMMA: "no"}, {LEMMA: "incall"}],
        [{LEMMA: "no"}, {LEMMA: "in"}, {LEMMA: "call"}],
        [{LEMMA: "no"}, {LEMMA: "in"}, {is_hyphen: True}, {LEMMA: "call"}],
        # A token with dependency "neg" directly before the in-call spellings.
        [{IS_ASCII: True, DEP: "neg"}, {LEMMA: "incall"}],
        [{IS_ASCII: True, DEP: "neg"}, {LEMMA: "in"}, {LEMMA: "call"}],
        [{IS_ASCII: True, DEP: "neg"}, {LEMMA: "in"}, {is_hyphen: True},
         {LEMMA: "call"}],
        # Same, with one arbitrary ASCII token in between.
        [{IS_ASCII: True, DEP: "neg"}, {IS_ASCII: True}, {LEMMA: "incall"}],
        [{IS_ASCII: True, DEP: "neg"}, {IS_ASCII: True}, {LEMMA: "in"},
         {LEMMA: "call"}],
        [{IS_ASCII: True, DEP: "neg"}, {IS_ASCII: True}, {LEMMA: "in"},
         {is_hyphen: True}, {LEMMA: "call"}],
        # "neg" + "have" + direct object, optionally with a filler token.
        [{IS_ASCII: True, DEP: "neg"}, {LEMMA: "have"},
         {IS_ASCII: True, DEP: "dobj"}],
        [{IS_ASCII: True, DEP: "neg"}, {LEMMA: "have"}, {IS_ASCII: True},
         {IS_ASCII: True, DEP: "dobj"}],
        # "if ... have ... <dobj>" variants.
        [{LEMMA: "if", DEP: "mark"}, {LEMMA: "have"},
         {IS_ASCII: True, DEP: "dobj"}],
        [{LEMMA: "if", DEP: "mark"}, {IS_ASCII: True}, {LEMMA: "have"},
         {IS_ASCII: True}, {IS_ASCII: True, DEP: "dobj"}],
        [{LEMMA: "if", DEP: "mark"}, {LEMMA: "have"}, {IS_ASCII: True},
         {IS_ASCII: True, DEP: "dobj"}],
    ])

    return matcher
Пример #15
0
def load_derogatory_mentions_matcher(nlp):
    """Build the Matcher that holds the derogatory-mention patterns.

    Two word lists are handed to ``set_flag`` together with FLAG29 / FLAG30
    (the helper is defined elsewhere; presumably it flags the matching
    lexemes -- confirm), then token patterns are registered under entity
    ids 1, 3 and 4. No entity 2 is declared here.

    NOTE(review): several literals contain asterisks ('w***e', "f**k", ...).
    They look like censoring applied to the source text and are reproduced
    unchanged -- confirm against the upstream repository.

    NOTE(review): a number of patterns are registered twice. The
    repetition is kept so the set of registered patterns is unchanged.

    :param nlp: pipeline object; ``nlp.vocab`` is used here and ``nlp`` is
        forwarded to ``set_flag``.
    :return: the populated ``Matcher``.
    """
    matcher = Matcher(nlp.vocab)

    bitch = ['w***e', 'bitch', 'c**t', 'psycho', 's**t']
    your = ['your', 'ur']

    is_bitch = FLAG29
    is_your = FLAG30
    for words, flag in ((bitch, is_bitch), (your, is_your)):
        set_flag(nlp, words, flag)

    def register(entity_id, patterns):
        # Declare the entity, then add its patterns in the listed order.
        matcher.add_entity(entity_id)
        for token_specs in patterns:
            matcher.add_pattern(entity_id, token_specs)

    def neg_gap_block():
        # "neg" token, one alphabetic filler token, then the target phrase.
        # Built fresh on each call so no two registrations share objects.
        return [
            [{DEP: "neg"}, {IS_ALPHA: True}, {is_bitch: True}],
            [{DEP: "neg"}, {IS_ALPHA: True}, {LEMMA: "slave"}],
            [{DEP: "neg"}, {IS_ALPHA: True}, {LEMMA: "expose"}, {LOWER: "i"}],
            [{DEP: "neg"}, {IS_ALPHA: True}, {LEMMA: "expose"}, {LOWER: "me"}],
            [{DEP: "neg"}, {IS_ALPHA: True}, {LEMMA: "piece"}, {LOWER: "of"},
             {LEMMA: "shit"}],
        ]

    register(1, [
        [{is_bitch: True}],
        [{LEMMA: "expose"}, {LOWER: "i"}],
        [{LEMMA: "expose"}, {LOWER: "me"}],
        [{LEMMA: "violate"}, {LOWER: "i"}],
        [{LEMMA: "violate"}, {LOWER: "me"}],
        [{LEMMA: "f**k"}, {LOWER: "i"}],
        [{LEMMA: "f**k"}, {LOWER: "me"}],
        [{LOWER: "i", DEP: "nsubj"}, {IS_ALPHA: True},
         {LEMMA: "violate", DEP: "xcomp"}],
        [{LEMMA: "piece"}, {LOWER: "of"}, {LEMMA: "shit"}],
        [{LOWER: "hardcore"}],
        [{is_your: True}, {is_bitch: True}],
        [{is_your: True}, {LEMMA: "slave"}],
        [{is_your: True, DEP: "poss"}, {is_bitch: True}],
        # Repeats the "<your> slave" pattern two entries above.
        [{is_your: True}, {LEMMA: "slave"}],
        [{LOWER: "i", DEP: "nsubj"}, {is_bitch: True}],
        [{LOWER: "i", DEP: "nsubj"}, {IS_ALPHA: True},
         {is_bitch: True, DEP: "xcomp"}],
    ])

    register(3, [
        [{LOWER: "like"}, {is_bitch: True}],
        [{LEMMA: "bitch", POS: "VERB"}],
    ])

    entity_4_head = [
        [{LOWER: "to"}, {is_bitch: True}],
        [{LOWER: "nor"}, {is_bitch: True}],
        [{DEP: "neg"}, {is_bitch: True}],
        [{DEP: "neg"}, {LEMMA: "slave"}],
        [{DEP: "neg"}, {LEMMA: "expose"}, {LOWER: "i"}],
        [{DEP: "neg"}, {LEMMA: "expose"}, {LOWER: "me"}],
        [{DEP: "neg"}, {LEMMA: "piece"}, {LOWER: "of"}, {LEMMA: "shit"}],
        [{DEP: "neg"}, {is_your: True}, {is_bitch: True}],
    ]
    entity_4_middle = [
        [{DEP: "neg"}, {IS_ALPHA: True}, {is_your: True}, {is_bitch: True}],
    ]
    entity_4_tail = [
        [{LEMMA: "girl"}, {LOWER: "next"}, {LEMMA: "door"}],
        [{LOWER: "with"}, {LOWER: "my"}, {LEMMA: "girl"}],
        [{LOWER: "no"}, {is_bitch: True}],
        [{DEP: "neg"}, {LOWER: "like"}, {is_bitch: True}],
        [{LEMMA: "look"}, {LEMMA: "slave", DEP: "prep"}],
        [{LOWER: "you"}, {is_bitch: True}],
        [{LOWER: "you"}, {POS: "ADJ"}, {is_bitch: True}],
    ]
    # The gap block is registered twice, once on each side of the
    # "<neg> <alpha> <your> <flagged word>" pattern.
    register(
        4,
        entity_4_head
        + neg_gap_block()
        + entity_4_middle
        + neg_gap_block()
        + entity_4_tail,
    )

    return matcher
Пример #16
0
def load_movement_matcher(nlp):
    """Build the Matcher that holds the movement / relocation patterns.

    The place and girl word lists are passed to ``add_to_vocab``; every
    lexeme in the vocab is then flagged with FLAG18 (lowercase text in the
    place list), FLAG19 (lowercase text in the girl list) and FLAG20
    (``prefix_`` is uppercase). Patterns are registered under four ids:

        1 -- positive
        2 -- strong positive
        3 -- negative
        4 -- strong negative

    NOTE(review): the 's**t' literal contains asterisks that look like
    censoring applied to the source text; it is reproduced unchanged --
    confirm against the upstream repository.

    :param nlp: pipeline object; ``nlp.vocab`` is used here and ``nlp`` is
        forwarded to ``add_to_vocab``.
    :return: the populated ``Matcher``.
    """
    matcher = Matcher(nlp.vocab)

    place = ['area', 'place', 'city', 'town']
    girl = [
        'gal', 'girl', 's**t', 'cutie', 'hottie', 'lady', 'teen', 'teenager',
        'chick', 'staff', 'gf', 'she'
    ]

    for word_list in (place, girl):
        add_to_vocab(nlp, word_list)

    is_place = FLAG18
    is_girl = FLAG19
    upper_start = FLAG20

    word_flags = ((is_place, place), (is_girl, girl))
    for lexeme in nlp.vocab:
        lowered = lexeme.lower_
        for flag, words in word_flags:
            if lowered in words:
                lexeme.set_flag(flag, True)
        if lexeme.prefix_.isupper():
            lexeme.set_flag(upper_start, True)

    def register(entity_id, patterns):
        # Declare the entity, then add its patterns in the listed order.
        matcher.add_entity(entity_id)
        for token_specs in patterns:
            matcher.add_pattern(entity_id, token_specs)

    # Positive Matcher Patterns
    register(1, [
        [{LEMMA: "last"}, {LEMMA: "night"}, {LEMMA: "in"}, {LEMMA: "town"}],
        [{LEMMA: "leave"}, {IS_ASCII: True, ENT_TYPE: "DATE"}],
        [{LEMMA: "leave"}, {DEP: "partmod"}],
        [{LEMMA: "leave"}, {DEP: "quantmod"}],
        [{LEMMA: "leave"}, {IS_ASCII: True, ENT_TYPE: "TIME"}],
        [{LEMMA: "leave"}, {LEMMA: "in"}, {IS_ASCII: True, ENT_TYPE: "DATE"}],
        [{LEMMA: "leave"}, {LEMMA: "town"}],
        [{LEMMA: "out"}, {LEMMA: "of"}, {LEMMA: "town"}],
        [{LOWER: "outta"}, {LEMMA: "town"}],
        [{LEMMA: "lastnight"}, {LEMMA: "in"}, {LEMMA: "town"}],
        [{LEMMA: "back"}, {LEMMA: "in"}, {LEMMA: "town"}],
        [{LEMMA: "just"}, {LEMMA: "in"}, {LEMMA: "town"}],
        [{LEMMA: "day"}, {LEMMA: "in"}, {LEMMA: "town"}],
        [{LEMMA: "in"}, {LEMMA: "town"}, {LEMMA: "tonight"}],
        [{LEMMA: "in"}, {LEMMA: "town"}, {LEMMA: "through"}],
        [{LEMMA: "in"}, {LEMMA: "town"}, {LEMMA: "until"}],
        [{LEMMA: "in"}, {LEMMA: "town"}, {LEMMA: "for"}, {LEMMA: "one"},
         {LEMMA: "night"}],
        [{LEMMA: "in"}, {LEMMA: "town"}, {LEMMA: "for"}, {IS_DIGIT: True},
         {LEMMA: "night"}],
        [{LEMMA: "town"}, {LEMMA: "stay", DEP: "nmod"}],
        [{LEMMA: "town"}, {IS_ASCII: True}, {LEMMA: "stay", DEP: "nmod"}],
        [{LEMMA: "new"}, {LEMMA: "girl"}, {LEMMA: "in"}, {LEMMA: "town"}],
        [{LEMMA: "recent"}, {LEMMA: "move"}],
        [{LEMMA: "recently"}, {LEMMA: "move"}],
        [{LEMMA: "relocate"}],
        [{LEMMA: "new", DEP: "amod"}, {LEMMA: "city"},
         {LEMMA: "to", DEP: "dep"}],
        [{LEMMA: "new", DEP: "amod"}, {IS_ASCII: True}, {LEMMA: "city"},
         {IS_ASCII: True}, {LEMMA: "to", DEP: "dep"}],
        [{LEMMA: "new"}, {LEMMA: "to"}, {LEMMA: "area"}],
        [{LEMMA: "new"}, {LEMMA: "to"}, {upper_start: True}],
        [{LEMMA: "first"}, {LEMMA: "visit"}, {LEMMA: "to"}],
        [{LEMMA: "i", DEP: "nsubj"}, {LEMMA: "arrive"}],
        [{LEMMA: "girl", DEP: "nsubj"}, {LEMMA: "arrive"}, {DEP: "partmod"}],
        [{LEMMA: "girl", DEP: "nsubj"}, {IS_ASCII: True}, {LEMMA: "arrive"},
         {IS_ASCII: True}, {DEP: "partmod"}],
        [{LEMMA: "girl", DEP: "nsubj"}, {LEMMA: "arrive"}, {DEP: "quantmod"}],
        [{LEMMA: "girl", DEP: "nsubj"}, {IS_ASCII: True}, {LEMMA: "arrive"},
         {IS_ASCII: True}, {DEP: "quantmod"}],
        [{LEMMA: "just"}, {LEMMA: "arrive"}],
        [{LEMMA: "on"}, {LEMMA: "my"}, {LEMMA: "way"}, {LEMMA: "to"},
         {TAG: "NNP"}],
        [{LEMMA: "on"}, {LEMMA: "my"}, {LEMMA: "way"}, {LEMMA: "to"},
         {TAG: "NN"}],
        [{LEMMA: "on"}, {LEMMA: "the"}, {LEMMA: "way"}],
        [{LEMMA: "just"}, {LEMMA: "get"}, {LEMMA: "here"}],
        [{LEMMA: "get"}, {LEMMA: "here"}, {LEMMA: "today"}],
        [{LEMMA: "get"}, {LEMMA: "here"}, {LEMMA: "yesterday"}],
        [{LEMMA: "get"}, {LEMMA: "here"}, {LEMMA: "last"}, {LEMMA: "night"}],
        [{LEMMA: "i", DEP: "nsubj"}, {IS_ASCII: True}, {LEMMA: "visit"},
         {IS_ASCII: True}, {is_place: True, DEP: "dobj"}],
        [{LEMMA: "i", DEP: "nsubj"}, {LEMMA: "visit"},
         {is_place: True, DEP: "dobj"}],
    ])

    # Strong Positive Matcher Patterns
    register(2, [
        [{LEMMA: "new"}, {IS_ASCII: True}, {LEMMA: "in"}, {is_place: True}],
        [{LEMMA: "new"}, {IS_ASCII: True}, {IS_ASCII: True}, {LEMMA: "in"},
         {is_place: True}],
        [{LEMMA: "im"}, {LEMMA: "new"}, {LEMMA: "in"}, {LEMMA: "town"}],
        [{LEMMA: "new"}, {LEMMA: "in"}, {is_place: True}],
        [{LEMMA: "new"}, {LEMMA: "to"}, {is_place: True}],
        [{LEMMA: "new"}, {is_girl: True}, {LEMMA: "in"}, {LEMMA: "town"}],
        [{LEMMA: "new"}, {LEMMA: "to"}, {upper_start: True}, {LEMMA: "area"}],
    ])

    # Negative Matcher Patterns
    register(3, [
        [{LEMMA: "new"}],
        [{LEMMA: "girl"}, {LEMMA: "in"}, {LEMMA: "town"}],
        [{LEMMA: "grand"}, {LEMMA: "new"}],
        [{LEMMA: "new"}, {LEMMA: "at"}],
        [{LEMMA: "new"}, {LEMMA: "to"}, {LEMMA: "business"}],
        [{LEMMA: "new"}, {LEMMA: "to"}, {LEMMA: "industry"}],
        [{LEMMA: "new"}, {LEMMA: "to"}, {LEMMA: "scenario"}],
        [{LEMMA: "dream", DEP: "nsubj"}, {LEMMA: "arrive"}],
        [{LEMMA: "fantasy", DEP: "nsubj"}, {LEMMA: "arrive"}],
        [{LEMMA: "you", DEP: "nsubj"}, {LEMMA: "arrive"}],
        [{LEMMA: "area"}, {LEMMA: "only"}],
        [{upper_start: True}, {LEMMA: "area"}],
        [{LEMMA: "you", DEP: "nsubj"}, {LEMMA: "leave"}],
        [{LEMMA: "it", DEP: "dobj"}, {LEMMA: "leave"},
         {IS_ASCII: True, DEP: "nmod", TAG: "TO"}],
        [{LEMMA: "that", DEP: "dobj"}, {LEMMA: "leave"},
         {IS_ASCII: True, DEP: "nmod", TAG: "TO"}],
        [{LEMMA: "best"}, {LEMMA: "move"}],
        [{LEMMA: "next"}, {LEMMA: "move"}],
        [{LEMMA: "arrive"}, {IS_ASCII: True}, {IS_ASCII: True, DEP: "xcomp"}],
        [{LEMMA: "arrive"}, {IS_ASCII: True, DEP: "xcomp"}],
        [{LEMMA: "visit"}, {LEMMA: "sister", DEP: "dobj"}],
        [{LEMMA: "visit"}, {IS_ASCII: True}, {LEMMA: "sister", DEP: "dobj"}],
        [{LEMMA: "visit"}, {LEMMA: "family", DEP: "dobj"}],
        [{LEMMA: "visit"}, {IS_ASCII: True}, {LEMMA: "family", DEP: "dobj"}],
        [{LEMMA: "we", DEP: "poss"}, {LEMMA: "visit"}],
    ])

    # Strong Negative Matcher Patterns
    strong_negative_head = [
        [{LEMMA: "town"}, {LEMMA: "girl"}],
        [{LEMMA: "on"}, {LEMMA: "the"}, {LEMMA: "town"}],
        [{LEMMA: "near"}, {LEMMA: "town"}],
        [{LEMMA: "down"}, {LEMMA: "town"}],
        [{LEMMA: "town"}, {LEMMA: "hall"}],
        # "best ... in ... town" with up to two fillers before "in" and
        # at most one after it.
        [{LEMMA: "best"}, {LEMMA: "in"}, {LEMMA: "town"}],
        [{LEMMA: "best"}, {IS_ASCII: True}, {LEMMA: "in"}, {LEMMA: "town"}],
        [{LEMMA: "best"}, {IS_ASCII: True}, {IS_ASCII: True}, {LEMMA: "in"},
         {LEMMA: "town"}],
        [{LEMMA: "best"}, {LEMMA: "in"}, {IS_ASCII: True}, {LEMMA: "town"}],
        [{LEMMA: "best"}, {IS_ASCII: True}, {LEMMA: "in"}, {IS_ASCII: True},
         {LEMMA: "town"}],
        [{LEMMA: "best"}, {IS_ASCII: True}, {IS_ASCII: True}, {LEMMA: "in"},
         {IS_ASCII: True}, {LEMMA: "town"}],
        [{LEMMA: "not"}, {LEMMA: "new"}, {LEMMA: "in"}, {LEMMA: "town"}],
        [{LEMMA: "not"}, {LEMMA: "new"}, {LEMMA: "to"}, {LEMMA: "town"}],
        [{LEMMA: "not"}, {LEMMA: "leave"}, {LEMMA: "town"}],
        [{LEMMA: "i", DEP: "nsubj"}, {LEMMA: "leave"},
         {LEMMA: "you", DEP: "dobj"}],
        [{LEMMA: "new"}, {LEMMA: "but"}],
        [{LEMMA: "new"}, {LEMMA: "backpage", DEP: "nmod", TAG: "TO"}],
        [{LEMMA: "new"}, {IS_ASCII: True},
         {LEMMA: "backpage", DEP: "nmod", TAG: "TO"}],
        [{LEMMA: "new"}, {LEMMA: "bp", DEP: "nmod", TAG: "TO"}],
        [{LEMMA: "new"}, {IS_ASCII: True},
         {LEMMA: "bp", DEP: "nmod", TAG: "TO"}],
    ]

    # DS (marker carried over from the original; its meaning is not stated)
    # "leave <object>": all nouns adjacent to the verb first, then all
    # nouns again with one filler token in between.
    leave_objects = ("message", "msg", "txt", "text", "impression",
                     "voicemail", "smile")
    leave_adjacent = [
        [{LEMMA: "leave"}, {LEMMA: noun, DEP: "dobj"}]
        for noun in leave_objects
    ]
    leave_gapped = [
        [{LEMMA: "leave"}, {IS_ASCII: True}, {LEMMA: noun, DEP: "dobj"}]
        for noun in leave_objects
    ]

    strong_negative_tail = [
        [{LEMMA: "leave"}, {LEMMA: "satisfied"}],
        [{LEMMA: "leave"}, {LEMMA: "memory", DEP: "dobj"}],
        [{LEMMA: "leave"}, {IS_ASCII: True}, {LEMMA: "memory", DEP: "dobj"}],
        [{LEMMA: "leave"}, {LEMMA: "you"}],
        [{LEMMA: "leave"}, {LEMMA: "u"}],
        [{LEMMA: "leave"}, {LEMMA: "with"}],
        [{LEMMA: "leave"}, {LEMMA: "a"}, {LEMMA: "gentleman"}],
        [{LEMMA: "or"}, {LEMMA: "leave"}],
        [{LEMMA: "or"}, {LEMMA: "i"}, {LEMMA: "leave"}],
        [{LEMMA: "move"}, {LEMMA: "on"}],
        [{LEMMA: "i"}, {LEMMA: "move"}, {LEMMA: "like"}],
        [{LEMMA: "arrive"}, {LEMMA: "on"}, {LEMMA: "time"}],
        [{LEMMA: "can"}, {LEMMA: "move"}],
        # Second registration of "new but"; it also appears in the head list.
        [{LEMMA: "new"}, {LEMMA: "but"}],
        [{LEMMA: "on"}, {LEMMA: "my"}, {LEMMA: "way"}, {LEMMA: "to"},
         {TAG: "PRP"}],
        [{LEMMA: "u"}, {LEMMA: "get"}, {LEMMA: "here"}],
        [{LEMMA: "you"}, {LEMMA: "get"}, {LEMMA: "here"}],
        [{LEMMA: "go"}, {LEMMA: "to"}, {LEMMA: "town"}],
        [{LEMMA: "new"}, {LEMMA: "management"}],
    ]

    register(
        4,
        strong_negative_head
        + leave_adjacent
        + leave_gapped
        + strong_negative_tail,
    )

    return matcher