    def __init__(self):
        self.count = {
            "0": 0,
            "1": 0,
            "2": 0,
            "3": 0,
            "4": 0,
            "5": 0,
            "6": 0,
            "7": 0,
            "8": 0,
            "9": 0,
            "10": 0
        }
        self.compa_sent_count = 0

        self.matcher = Matcher(nlp.vocab)
        # ORTH patterns match the literal token text ('JJR', 'CIN', 'TECH', ...);
        # an empty dict {} is a single-token wildcard.
        self.matcher.add(
            0, None,
            [{'ORTH': 'JJR'}, {'ORTH': 'CIN'}, {'ORTH': 'TECH'}],
            [{'ORTH': 'JJR'}, {}, {'ORTH': 'CIN'}, {'ORTH': 'TECH'}],
            [{'ORTH': 'JJR'}, {'ORTH': 'CIN'}, {}, {'ORTH': 'TECH'}],
            [{'ORTH': 'JJR'}, {}, {'ORTH': 'CIN'}, {}, {'ORTH': 'TECH'}],
            [{'ORTH': 'JJ'}, {'ORTH': 'CIN'}, {'ORTH': 'TECH'}],
            [{'ORTH': 'JJ'}, {}, {'ORTH': 'CIN'}, {'ORTH': 'TECH'}],
            [{'ORTH': 'JJ'}, {'ORTH': 'CIN'}, {}, {'ORTH': 'TECH'}],
            [{'ORTH': 'JJ'}, {}, {'ORTH': 'CIN'}, {}, {'ORTH': 'TECH'}])
        self.matcher.add(
            1, None,
            [{'ORTH': 'VB'}, {'ORTH': 'TECH'}, {'ORTH': 'TO'}, {'ORTH': 'VB'}],
            [{'ORTH': 'VB'}, {'ORTH': 'TECH'}, {}, {'ORTH': 'TO'}, {'ORTH': 'VB'}])
        self.matcher.add(
            8, None,
            [{'ORTH': 'RBR'}, {'ORTH': 'JJ'}, {'ORTH': 'CIN'}, {'ORTH': 'TECH'}],
            [{'ORTH': 'RBR'}, {'ORTH': 'JJ'}, {}, {'ORTH': 'CIN'}, {'ORTH': 'TECH'}])
        self.matcher.add(
            2, None,
            [{'ORTH': 'CV'}, {'ORTH': 'CIN'}, {'ORTH': 'TECH'}],
            [{'ORTH': 'CV'}, {}, {'ORTH': 'CIN'}, {'ORTH': 'TECH'}])
        self.matcher.add(
            3, None,
            [{'ORTH': 'CV'}, {'ORTH': 'VBG'}, {'ORTH': 'TECH'}])
        self.matcher.add(
            5, None,
            [{'ORTH': 'TECH'}, {'ORTH': 'VB'}, {'ORTH': 'NN'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VB'}, {'ORTH': 'NN'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VB'}, {}, {'ORTH': 'NN'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VB'}, {}, {'ORTH': 'NN'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VB'}, {'ORTH': 'NN'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VB'}, {'ORTH': 'NN'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VB'}, {}, {'ORTH': 'NN'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VB'}, {}, {'ORTH': 'NN'}])

        # self.matcher.add(6,
        #             None,
        #             [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {'ORTH': 'JJS'}],
        #             [{'ORTH': 'TECH'}, {}, {'ORTH': 'VBZ'}, {'ORTH': 'JJS'}],
        #             [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {}, {'ORTH': 'JJS'}],
        #             [{'ORTH': 'TECH'}, {}, {'ORTH': 'VBZ'}, {}, {'ORTH': 'JJS'}])
        self.matcher.add(
            7, None,
            [{'ORTH': 'TECH'}, {'ORTH': 'VB'}, {'ORTH': 'JJR'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VB'}, {'ORTH': 'JJR'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VB'}, {}, {'ORTH': 'JJR'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VB'}, {}, {'ORTH': 'JJR'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {}, {}, {'ORTH': 'JJR'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VB'}, {}, {'ORTH': 'JJR'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VB'}, {'ORTH': 'JJR'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VB'}, {'ORTH': 'JJR'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VB'}, {}, {'ORTH': 'JJR'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VB'}, {}, {'ORTH': 'JJR'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VB'}, {}, {}, {}, {'ORTH': 'JJR'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VB'}, {}, {'ORTH': 'JJR'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'JJR'}])
        self.matcher.add(
            10, None,
            [{'ORTH': 'TECH'}, {'ORTH': 'VB'}, {'ORTH': 'RBR'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VB'}, {'ORTH': 'RBR'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VB'}, {}, {'ORTH': 'RBR'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VB'}, {}, {'ORTH': 'RBR'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VB'}, {}, {}, {}, {'ORTH': 'RBR'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VB'}, {}, {'ORTH': 'RBR'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VB'}, {'ORTH': 'RBR'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VB'}, {'ORTH': 'RBR'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VB'}, {}, {'ORTH': 'RBR'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VB'}, {}, {'ORTH': 'RBR'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VB'}, {}, {}, {'ORTH': 'RBR'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VB'}, {}, {'ORTH': 'RBR'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'RBR'}])
        # self.matcher.add(9,
        #             None,
        #             [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {'ORTH': 'RBS'}],
        #             [{'ORTH': 'TECH'}, {}, {'ORTH': 'VBZ'}, {'ORTH': 'RBS'}],
        #             [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {}, {'ORTH': 'RBS'}],
        #             [{'ORTH': 'TECH'}, {}, {'ORTH': 'VBZ'}, {}, {'ORTH': 'RBS'}])

        self.matcher.add(
            11, None,
            [{'ORTH': 'TECH'}, {'ORTH': 'NP'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'NP'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'NP'}])
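The patterns above match the literal token strings 'JJR', 'CIN', 'TECH', and so on, which only works if each sentence has first been rewritten as a sequence of such tag placeholders before it reaches the matcher. A minimal, self-contained sketch of that idea (the preprocessing, the pattern name, and the sample sentence are illustrative assumptions, not part of the original listing; it uses the current spaCy v3 Matcher.add signature):

import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
# comparative adjective ... comparison marker ... technology placeholder
matcher.add("JJR_CIN_TECH", [[{"ORTH": "JJR"}, {}, {"ORTH": "CIN"}, {"ORTH": "TECH"}]])

tag_doc = nlp("JJR NN CIN TECH")  # stands in for e.g. "faster language than Python"
print(matcher(tag_doc))           # one match spanning the four tag tokens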
Example #2
def get_search_result():
    req = request.get_json()
    query = req.get('Query')
    process_query = query  # query after removing all matched patterns
    doc = nlp(query)
    phraseMatcher = PhraseMatcher(nlp.vocab, attr='LOWER')
    tokenMatcher = Matcher(nlp.vocab)

    GE_terms = [
        "lok sabha", "ls", "ge", "general election", "general elections",
        "national"
    ]
    GE_patterns = list(nlp.tokenizer.pipe(GE_terms))
    phraseMatcher.add("GE_PATTERN", None, *GE_patterns)

    AE_terms = [
        "ae", "vidhan sabha", "state election", "state elections",
        "assembly election", "assembly elections"
    ]
    AE_patterns = list(nlp.tokenizer.pipe(AE_terms))
    phraseMatcher.add("AE_PATTERN", None, *AE_patterns)

    state_patterns = [nlp.make_doc(key) for key in stateNamesDict]
    phraseMatcher.add("STATE_PATTERN", None, *state_patterns)

    matches = phraseMatcher(doc)
    electionType = ""
    stateName = "Lok_Sabha"
    party = []
    years = []

    for i in range(len(matches)):
        string_id = nlp.vocab.strings[matches[i][0]]
        if string_id == "GE_PATTERN":
            electionType = "GE"
        elif string_id == "AE_PATTERN":
            electionType = "AE"
        elif string_id == "STATE_PATTERN":
            start, end = matches[i][1], matches[i][2]
            span = doc[start:end]
            stateName = stateNamesDict.get(span.text.lower())

        if i < len(matches) - 1 and (matches[i][1] != matches[i + 1][1]):
            start, end = matches[i][1], matches[i][2]
            span = doc[start:end]
            process_query = re.sub(span.text, '', process_query)

    tokenMatcher.add("YEAR_PATTERN", None, [{
        "TEXT": {
            "REGEX": "[1-9][0-9][0-9][0-9]"
        }
    }])
    matches2 = tokenMatcher(doc)
    for match_id, start, end in matches2:
        span = doc[start:end]
        years.append(span.text)
        process_query = re.sub(span.text, '', process_query)

    new_doc = nlp(process_query)
    with open('ChartsMapsCodes.json') as codes_json:
        codes_data = json.load(codes_json)
    similar_modules = {}

    for code in codes_data:
        similar_modules[code['modulename']] = new_doc.similarity(
            nlp(code['title']))

    sorted_modules = sorted(similar_modules.items(),
                            key=operator.itemgetter(1),
                            reverse=True)
    module = ""
    full_party_names = {}
    party_options_modules = [
        "cvoteShareChart", "seatShareChart", "tvoteShareChart",
        "strikeRateChart"
    ]

    for i in range(len(sorted_modules)):
        module_name = sorted_modules[i][0]
        if module_name in party_options_modules:
            module = module_name
            break

    connection = connectdb(db_config)
    if connection.is_connected():
        cursor = connection.cursor()
        cursor.execute("show tables")
        tables = cursor.fetchall()
        db_tables = []
        for (table, ) in tables:
            db_tables.append(table)
        tableName = module_to_table(module)
        if tableName in db_tables:
            cursor = connection.cursor(prepared=True)
            query_input = list()
            get_table = "Select distinct Party from " + tableName
            get_count = "Select count(distinct Party) as count from " + tableName
            get_full_names = "Select distinct Party,Expanded_Party_Name from " + tableName
            # query_input.append(tableName)
            get_election = " where Election_Type = %s"
            if electionType == "":
                query_input.append("GE")
            else:
                query_input.append(electionType)
            get_state = ""
            if stateName is not None:
                get_state = " and State_Name = %s"
                query_input.append(stateName)

            party_names_query = get_full_names + get_election + get_state + " and position <10"
            cursor.execute(party_names_query, tuple(query_input))
            party_names = cursor.fetchall()

            print(query_input)
            for (name, full_name) in party_names:
                # print(name)
                # print(full_name)
                full_party_names.update({name: full_name})

    party_patterns = []
    for key, value in full_party_names.items():
        print(key, value)
        if key is not None:
            party_patterns.append(nlp.make_doc(key))
        if value is not None:
            party_patterns.append(nlp.make_doc(value))

    partyMatcher = PhraseMatcher(nlp.vocab, attr='LOWER')
    partyMatcher.add("PARTY_PATTERN", None, *party_patterns)
    party_matches = partyMatcher(new_doc)

    for match_id, start, end in party_matches:
        span = new_doc[start:end]
        party_match = span.text.upper()
        for key, value in full_party_names.items():
            if party_match == key or party_match == value:
                party.append(key)

    results = {}
    results["electionType"] = electionType
    results["stateName"] = stateName
    results["year"] = years
    results["similarModules"] = sorted_modules
    results["party"] = party

    return jsonify({'results': results})
Example #3
class NLP():
    nlp = spacy.load('en_core_web_sm')
    nlp.add_pipe(Readability(), last=True)
    matcher = Matcher(nlp.vocab)
    def __init__(self, text):
        self.doc = self.nlp(text)
        self.blob = TextBlob(self.doc.text)
        self.readability = self.readability_indexes()
        self.word_tokens = self.tokenize_words(self.doc)
        
        self.sents = list(self.doc.sents)
        self.polysyllables = self.get_polysyllables(self.word_tokens[1])
        self.nominalized_words = self.get_nominalized(self.word_tokens[1])
        self.pos = self.get_pos(self.doc)
        self.prepositional_phrases = self.get_pps(self.doc)
        self.passive_phrases = self.get_passive_phrases(self.doc)
        self.get_pronouns(self.doc)
        self.get_weak_verbs(self.doc)
        self.sentence_count = len(self.sents)
        self.statistics()
        self.word_count = len(self.word_tokens[1])
        self.get_freq_dist()
        #self.lexicon_count = len(self.lexicon)
        self.get_entities()
    def readability_indexes(self):
        readability_scores = {}
        readability_scores['ari'] = self.doc._.automated_readability_index
        readability_scores['coleman_liau_index'] = self.doc._.coleman_liau_index
        readability_scores['dale_chall'] = self.doc._.dale_chall
        readability_scores['flesch_kincaid_grade'] = self.doc._.flesch_kincaid_grade_level
        readability_scores['flesch_kincaid_re'] = self.doc._.flesch_kincaid_reading_ease
        readability_scores['forcast'] = self.doc._.forcast
        readability_scores['smog'] = self.doc._.smog
        return readability_scores
    
    def tokenize_words(self, document):
        spacy_word_tokens = [t.text for t in document]
        no_punct_word_tokens = []
        for w in spacy_word_tokens:
            for p in punctuation:
                w = w.replace(p, "").replace("\n", "")
            no_punct_word_tokens.append(w.lower())
        # drop tokens that were reduced to empty strings
        no_punct_word_tokens = [w for w in no_punct_word_tokens if w]
        return (spacy_word_tokens, no_punct_word_tokens)
    def get_polysyllables(self, some_list):
        polysyllables = []
        for w in some_list: 
            if syllables.estimate(w) > 3: 
                polysyllables.append(w)
        return polysyllables
    # def get_polysyllables2(self, doc):
    #     phoney = BigPhoney()
    #     self.total_syllables = phoney.count_syllables(self.doc.text)
    #     self.polys = []
    #     for token in doc:
    #         if phoney.count_syllables(token.text) > 3:
    #             self.polys.append(token.text)
    #         else:
    #             pass
    def get_nominalized(self, words):
        nominalized_words = {}
        nominalized_words['-tion words'] = []

        for word in words:
            if word.endswith("tion"):
                nominalized_words['-tion words'].append(word)
            else:
                pass
        return nominalized_words
    def get_pos(self, nlp_doc):
        parts_of_speech = {}
        parts_of_speech['gerunds'] = []
        parts_of_speech['adjectives'] = []
        parts_of_speech['adverbs'] = []
        parts_of_speech['prepositions'] = []
        for token in nlp_doc:
            if token.tag_ == "VBG":
                parts_of_speech['gerunds'].append(token.text)
            elif token.pos_ == "ADJ":
                parts_of_speech['adjectives'].append(token.text)
            elif token.pos_ == "ADV":
                parts_of_speech['adverbs'].append(token.text)
            
            else:
                pass
        return parts_of_speech

    def get_pps(self, doc):
        # Function to get prepositional phrases from a parsed document.
        pps = []
        for token in doc:
            if token.pos_ == 'ADP':
                pp = ' '.join([tok.orth_ for tok in token.subtree])
                pps.append(pp)
        return pps

    def get_passive_phrases(self, doc):
        self.passive_sents = []
        passive_phrases = []
        passive_rule = [{'DEP': 'nsubjpass'},
                        {'DEP': 'aux', 'OP': '*'},
                        {'DEP': 'auxpass'},
                        {'TAG': 'VBN'}]
        self.matcher.add('passive', None, passive_rule)
        sents = list(doc.sents)
        matches = self.matcher(doc)
        for match_id, start, end in matches:
            string_id = doc.vocab.strings[match_id]
            span = doc[start:end]
            passive_phrases.append(span.text)
        for s in self.sents:
            for p in passive_phrases:
                if p in s.text:
                    self.passive_sents.append(s.text)
        #return passive_phrases
    def get_weak_verbs(self, doc):
        self.weak_verbs = {}
        self.weak_verbs['to be'] = []
        self.weak_verbs['auxiliary'] = []
        for token in doc:
            if token.lemma_ == "be":
                self.weak_verbs['to be'].append(token.text)
            elif token.pos_ == 'AUX':
                self.weak_verbs['auxiliary'].append(token.text)
            else:
                pass
    def get_pronouns(self, doc):
        self.personal_pronouns = {}
        self.personal_pronouns['first person pronouns'] = []
        self.personal_pronouns['second person pronouns'] = []
        self.pronouns = []
        for token in doc:
            if token.tag_ == 'PRP' or token.tag_ == "PRP$":
                if token.text.lower() in ['i', 'me', 'mine', 'my', 'myself']:
                    self.personal_pronouns['first person pronouns'].append(token.text)
                elif token.text.lower() in ['you', 'your', 'yours', 'yourself']:
                    self.personal_pronouns['second person pronouns'].append(token.text)
                else:
                    pass
            elif token.pos_ == "PRON":
                self.pronouns.append(token.text.lower())
            else:
                pass
    def statistics(self):
        self.statistics = {}
        self.statistics['per sentence'] = {} # rate per sentence
        self.statistics['per sentence'].update({'preposition rate':len(self.prepositional_phrases)/self.sentence_count})
        self.statistics['per sentence'].update({'be rate':len(self.weak_verbs['to be'])/self.sentence_count})   
        self.statistics['per sentence'].update({'passive rate':len(self.passive_sents)/self.sentence_count})
        self.statistics['percent of sentences'] = {}
        self.statistics['percent of sentences'].update({'prepositions':self.statistics['per sentence']['preposition rate'] * 100})
        self.statistics['percent of sentences'].update({'to be':self.statistics['per sentence']['be rate'] * 100})
        self.statistics['percent of sentences'].update({'passives':self.statistics['per sentence']['passive rate'] * 100})
        self.statistics['ratios'] = {}
        self.statistics['ratios'].update({'adverbs to adjectives':len(self.pos['adverbs'])/len(self.pos['adjectives'])})
    
    def get_freq_dist(self):
        words = [token.text for token in self.doc if token.is_stop != True and token.is_punct != True and token.text.isalpha() == True]
        nouns = [token.text for token in self.doc if token.is_stop != True and token.is_punct != True and token.pos_ == "NOUN" and token.text.isalpha() == True]     
        verbs = [token.text for token in self.doc if token.is_stop != True and token.is_punct != True and token.pos_ == "VERB" and token.text.isalpha() == True]
        
        word_freq = Counter(words)
        noun_freq = Counter(nouns)
        verb_freq = Counter(verbs)
        self.common_words = word_freq.most_common(10)
        self.common_nouns = noun_freq.most_common(10)
        self.common_verbs = verb_freq.most_common(10)
    def get_entities(self):
        self.entities = {}
        for ent in self.doc.ents:
            self.entities[ent.text] = ent.label_
Example #4
    def __init__(self, vocab, boundary_protection_rules=[]):
        self.matcher = Matcher(vocab)
        for rule in boundary_protection_rules:
            self.matcher.add(rule['label'], None, rule['pattern'])
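A short sketch of the rule shape this constructor appears to expect, inferred only from the keys it reads ('label' and 'pattern'); the example rules are hypothetical, and the loop mirrors the __init__ body above using the current spaCy Matcher.add signature:

from spacy.lang.en import English
from spacy.matcher import Matcher

nlp = English()
# hypothetical rules; each dict mirrors the keys read in __init__ above
boundary_protection_rules = [
    {'label': 'ABBREV_EG', 'pattern': [{'LOWER': 'e.g'}, {'ORTH': '.'}]},
    {'label': 'ABBREV_NO', 'pattern': [{'LOWER': 'no'}, {'ORTH': '.'}, {'IS_DIGIT': True}]},
]
matcher = Matcher(nlp.vocab)
for rule in boundary_protection_rules:
    matcher.add(rule['label'], [rule['pattern']])
print(len(matcher))  # 2 rules registered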
Example #5
# -*- coding: utf-8  -*-
# !/usr/bin/python

__author__ = "biavarone"

from spacy.matcher import Matcher
from utils import nlp

interactions_matcher = Matcher(nlp.vocab, validate=True)

# alone
alone1 = [{'LEMMA': 'on'}, {'LOWER': 'my'}, {'LEMMA': 'own'}]
alone2 = [{'LEMMA': 'by'}, {'LOWER': 'myself'}]
alone3 = [{'LEMMA': 'alone', 'POS': {'IN': ['ADV', 'ADJ']}}]

interactions_matcher.add('alone', None, alone1, alone2, alone3)

# animal
animal1 = [{
    'LEMMA': {
        'IN': [
            'animal', 'cat', 'cub', 'dog', 'kitten', 'kitty', 'pet', 'pup',
            'puppy'
        ]
    },
    'POS': 'NOUN'
}]
animal2 = [{'LOWER': {'IN': ['doggie', 'doggo', 'doggy']}, 'POS': 'NOUN'}]

interactions_matcher.add('animal', None, animal1, animal2)
Example #6
from src.utils.LoopTimer import LoopTimer

path_to_db = "/media/norpheo/mySQL/db/ssorc"

nlp_model = "en_wa_v2"
path_to_annotations = os.path.join(path_to_db, "annotations_version",
                                   nlp_model)
pandas_path = os.path.join(path_to_db, "pandas")
path_to_ner = os.path.join(path_to_db, "NER")

threshold = 3

print("Loading NLP Model and Vocab")
nlp = spacy.load(os.path.join(path_to_db, "models", nlp_model))
vocab = nlp.vocab.from_disk(os.path.join(path_to_annotations, "spacy.vocab"))
matcher = Matcher(vocab)

mla = set()
with open(os.path.join(path_to_ner, "ml_algos_noacronyms.txt"), "r") as handle:
    for line in handle:
        mla.add(line.replace("\n", ""))

for ml_algo in mla:
    ml_doc = nlp(ml_algo)
    pattern = [{"LOWER": token.lower_} for token in ml_doc]
    pattern_name = "".join([entity["LOWER"] for entity in pattern]).lower()
    matcher.add(pattern_name, None, pattern)

infoDF = pd.read_pickle(os.path.join(path_to_annotations, 'info_db.pandas'))
targ = len(infoDF)
TRAIN_DATA = list()
Example #7
def matches(
    doc: Doc,
    patterns: Union[str, List[str], List[Dict[str, str]], List[List[Dict[str, str]]]],
    *,
    on_match: Callable = None,
) -> Iterable[Span]:
    """
    Extract ``Span`` s from a ``Doc`` matching one or more patterns
    of per-token attr:value pairs, with optional quantity qualifiers.

    Args:
        doc
        patterns:
            One or multiple patterns to match against ``doc``
            using a :class:`spacy.matcher.Matcher`.

            If List[dict] or List[List[dict]], each pattern is specified
            as attr: value pairs per token, with optional quantity qualifiers:

            * ``[{"POS": "NOUN"}]`` matches singular or plural nouns,
              like "friend" or "enemies"
            * ``[{"POS": "PREP"}, {"POS": "DET", "OP": "?"}, {"POS": "ADJ", "OP": "?"}, {"POS": "NOUN", "OP": "+"}]``
              matches prepositional phrases, like "in the future" or "from the distant past"
            * ``[{"IS_DIGIT": True}, {"TAG": "NNS"}]`` matches numbered plural nouns,
              like "60 seconds" or "2 beers"
            * ``[{"POS": "PROPN", "OP": "+"}, {}]`` matches proper nouns and
              whatever word follows them, like "Burton DeWilde yaaasss"

            If str or List[str], each pattern is specified as one or more
            per-token patterns separated by whitespace where attribute, value,
            and optional quantity qualifiers are delimited by colons. Note that
            boolean and integer values have special syntax --- "bool(val)" and
            "int(val)", respectively --- and that wildcard tokens still need
            a colon between the (empty) attribute and value strings.

            * ``"POS:NOUN"`` matches singular or plural nouns
            * ``"POS:PREP POS:DET:? POS:ADJ:? POS:NOUN:+"`` matches prepositional phrases
            * ``"IS_DIGIT:bool(True) TAG:NNS"`` matches numbered plural nouns
            * ``"POS:PROPN:+ :"`` matches proper nouns and whatever word follows them

            Also note that these pattern strings don't support spaCy v2.1's
            "extended" pattern syntax; if you need such complex patterns, it's
            probably better to use a List[dict] or List[List[dict]], anyway.

        on_match: Callback function to act on matches.
            Takes the arguments ``matcher``, ``doc``, ``i`` and ``matches``.

    Yields:
        Next matching ``Span`` in ``doc``, in order of appearance

    Raises:
        TypeError
        ValueError

    See Also:
        * https://spacy.io/usage/rule-based-matching
        * https://spacy.io/api/matcher
    """  # noqa: E501
    if isinstance(patterns, str):
        patterns = [_make_pattern_from_string(patterns)]
    elif isinstance(patterns, (list, tuple)):
        if all(isinstance(item, str) for item in patterns):
            patterns = [_make_pattern_from_string(pattern) for pattern in patterns]
        elif all(isinstance(item, dict) for item in patterns):
            patterns = [patterns]
        elif all(isinstance(item, (list, tuple)) for item in patterns):
            pass  # already in the right format!
        else:
            raise TypeError(
                errors.type_invalid_msg(
                    "patterns",
                    type(patterns),
                    Union[
                        str, List[str], List[Dict[str, str]], List[List[Dict[str, str]]]
                    ],
                )
            )
    else:
        raise TypeError(
            errors.type_invalid_msg(
                "patterns",
                type(patterns),
                Union[str, List[str], List[Dict[str, str]], List[List[Dict[str, str]]]],
            )
        )
    matcher = Matcher(doc.vocab)
    matcher.add("match", patterns, on_match=on_match)
    for _, start, end in matcher(doc):
        yield doc[start:end]
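A minimal usage sketch for the function above (the pipeline name and sample sentence are illustrative; it assumes this module, including the _make_pattern_from_string helper it calls, is importable and that en_core_web_sm is installed):

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("We waited 60 seconds and then 2 beers arrived.")

# dict form: a digit token followed by a plural noun, e.g. "60 seconds"
for span in matches(doc, [{"IS_DIGIT": True}, {"TAG": "NNS"}]):
    print(span.text)

# equivalent string form, as described in the docstring
for span in matches(doc, "IS_DIGIT:bool(True) TAG:NNS"):
    print(span.text)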
Example #8
    def __init__(self, nlp):
        self.nlp = nlp
        Token.set_extension('feature_is_mass_unit', default=False)
        nlp.entity.add_label('mass_unit')

        Token.set_extension('feature_is_volume_unit', default=False)
        nlp.entity.add_label('volume_unit')

        Token.set_extension('feature_is_time_unit', default=False)
        nlp.entity.add_label('time_unit')

        Token.set_extension('feature_is_route_type', default=False)
        nlp.entity.add_label('route_type')

        Token.set_extension('feature_is_form_unit', default=False)
        nlp.entity.add_label('form_unit')

        Token.set_extension('feature_is_frequency_indicator', default=False)
        nlp.entity.add_label('frequency_indicator')

        Token.set_extension('feature_is_measurement_unit', default=False)
        nlp.entity.add_label('measurement_unit')

        Token.set_extension('feature_is_measurement', default=False)
        nlp.entity.add_label('measurement')

        Token.set_extension('feature_is_duration_pattern', default=False)
        nlp.entity.add_label('duration_pattern')

        self.mass_matcher = Matcher(nlp.vocab)
        self.volume_matcher = Matcher(nlp.vocab)
        self.time_matcher = Matcher(nlp.vocab)
        self.route_matcher = Matcher(nlp.vocab)
        self.form_matcher = Matcher(nlp.vocab)
        self.unit_of_measurement_matcher = Matcher(nlp.vocab)
        self.measurement_matcher = Matcher(nlp.vocab)
        self.frequency_matcher = Matcher(nlp.vocab)
        self.duration_matcher = Matcher(nlp.vocab)

        self.mass_matcher.add('UNIT_OF_MASS', None,
                              [{'LOWER': 'mcg'}], [{'LOWER': 'microgram'}],
                              [{'LOWER': 'micrograms'}], [{'ORTH': 'mg'}],
                              [{'LOWER': 'milligram'}], [{'LOWER': 'g'}],
                              [{'LOWER': 'kg'}], [{'ORTH': 'mEq'}])

        self.volume_matcher.add('UNIT_OF_VOLUME', None,
                                [{'LOWER': 'ml'}], [{'ORTH': 'dL'}],
                                [{'LOWER': 'cc'}], [{'ORTH': 'L'}])

        self.time_matcher.add('UNIT_OF_TIME', None,
                              [{'LOWER': 'sec'}], [{'LOWER': 'second'}],
                              [{'LOWER': 'seconds'}], [{'LOWER': 'min'}],
                              [{'LOWER': 'minute'}], [{'LOWER': 'minutes'}],
                              [{'LOWER': 'hr'}], [{'LOWER': 'hour'}],
                              [{'LOWER': 'day'}], [{'LOWER': 'days'}],
                              [{'LOWER': 'week'}], [{'LOWER': 'weeks'}],
                              [{'LOWER': 'month'}], [{'LOWER': 'months'}],
                              [{'LOWER': 'year'}], [{'LOWER': 'years'}],
                              [{'LOWER': 'yrs'}])

        self.frequency_matcher.add('FREQUENCY_MATCHER', None,
                                   [{'LOWER': 'bid'}], [{'LOWER': 'prn'}],
                                   [{'LOWER': 'qid'}], [{'LOWER': 'tid'}],
                                   [{'LOWER': 'qd'}], [{'LOWER': 'daily'}],
                                   [{'LOWER': 'hs'}],
                                   [{'LOWER': 'as'}, {'LOWER': 'needed'}],
                                   [{'LOWER': 'once'}, {'LOWER': 'a'}, {'LOWER': 'day'}],
                                   [{'LOWER': 'twice'}, {'LOWER': 'a'}, {'LOWER': 'day'}])

        self.form_matcher.add('UNIT_OF_FORM', None,
                              [{'ORTH': 'dose'}], [{'ORTH': 'doses'}],
                              [{'LEMMA': 'pill'}], [{'LEMMA': 'tablet'}],
                              [{'LEMMA': 'unit'}], [{'LEMMA': 'u'}],
                              [{'LEMMA': 'patch'}], [{'LEMMA': 'unit'}],
                              [{'ORTH': 'lotion'}], [{'ORTH': 'powder'}],
                              [{'ORTH': 'amps'}], [{'LOWER': 'actuation'}],
                              [{'LEMMA': 'suspension'}], [{'LEMMA': 'syringe'}],
                              [{'LEMMA': 'puff'}], [{'LEMMA': 'liquid'}],
                              [{'LEMMA': 'aerosol'}], [{'LEMMA': 'cap'}])

        # LOWER patterns compare against the lowercased token text, so values are given in lowercase
        self.route_matcher.add('TYPE_OF_ROUTE', None,
                               [{'LOWER': 'iv'}], [{'ORTH': 'intravenous'}],
                               [{'LOWER': 'po'}], [{'ORTH': 'gtt'}],
                               [{'LOWER': 'drip'}], [{'LOWER': 'inhalation'}],
                               [{'LOWER': 'by'}, {'LOWER': 'mouth'}],
                               [{'LOWER': 'topical'}], [{'LOWER': 'subcutaneous'}],
                               [{'LOWER': 'ophthalmic'}], [{'LEMMA': 'injection'}],
                               [{'LOWER': 'mucous'}, {'LOWER': 'membrane'}],
                               [{'LOWER': 'oral'}], [{'LOWER': 'nebs'}],
                               [{'LOWER': 'transdermal'}], [{'LOWER': 'nasal'}])

        self.unit_of_measurement_matcher.add(
            'UNIT_OF_MEASUREMENT', None,
            [{'ENT_TYPE': 'mass_unit'}, {'ORTH': '/'}, {'ENT_TYPE': 'volume_unit'}],
            [{'ENT_TYPE': 'volume_unit'}, {'ORTH': '/'}, {'ENT_TYPE': 'time_unit'}],
            [{'ENT_TYPE': 'form_unit'}, {'ORTH': '/'}, {'ENT_TYPE': 'volume_unit'}])

        self.measurement_matcher.add(
            'MEASUREMENT', None,
            [{'LIKE_NUM': True}, {'ORTH': '%'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'measurement_unit'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'mass_unit'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'volume_unit'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'form_unit'}],
            [{'LIKE_NUM': True}, {'LOWER': 'x'}, {'ENT_TYPE': 'form_unit'}])

        self.duration_matcher.add(
            'DURATION', None,
            [{'POS': 'PREP'}, {'LIKE_NUM': True}, {'ENT_TYPE': 'time_unit'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'time_unit'}],
            [{'LOWER': 'in'}, {'LIKE_NUM': True}, {'ENT_TYPE': 'time_unit'}],
            [{'LOWER': 'prn'}])
Example #9
def custom_tokenizer_to_df(nlp, doc):
    # Initialize the Matcher with a vocab
    matcher = Matcher(nlp.vocab)

    ###############################################################
    # Add pattern for valid hashtag, i.e. '#' plus any ASCII token
    matcher.add("HASHTAG", None, [{"ORTH": "#"}, {"IS_ALPHA": True}])

    # Register token extension for hashtag
    Token.set_extension("is_hashtag", default=False, force=True)

    # Fit in text in matcher
    matches = matcher(doc)

    # Find hashtag and merge, assign hashtag label
    hashtags = []
    for match_id, start, end in matches:
        if doc.vocab.strings[match_id] == "HASHTAG":
            hashtags.append(doc[start:end])
    with doc.retokenize() as retokenizer:
        for span in hashtags:
            retokenizer.merge(span)
            for token in span:
                token._.is_hashtag = True
    ##############################################################

    ##############################################################
    # Find numbers and merge, assign number label
    # Add patterns for long numbers, i.e. digits separated by ',' or '.'
    matcher.add("LONG_NUMBER", None,
                [{"IS_DIGIT": True}, {"ORTH": ','}, {"IS_DIGIT": True}])
    matcher.add("LONG_NUMBER", None,
                [{"IS_DIGIT": True}, {"ORTH": '.'}, {"IS_DIGIT": True}])

    # Register token extension for long number
    Token.set_extension("is_long_number", default=False, force=True)

    # Fit in text in matcher
    matches = matcher(doc)

    long_number = []
    for match_id, start, end in matches:
        if doc.vocab.strings[match_id] == "LONG_NUMBER":
            long_number.append(doc[start:end])
    with doc.retokenize() as retokenizer:
        for span in long_number:
            retokenizer.merge(span)
            for token in span:
                token._.is_long_number = True
    ##############################################################

    for token in doc:
        if token._.is_hashtag:
            token.tag_ = 'Hashtag'
        if token.like_url:
            token.tag_ = 'URL'
        if token.like_email:
            token.tag_ = 'Email'
        if token.is_stop:
            token.tag_ = 'Stop Word'
        if token.like_num:
            token.tag_ = 'Number'
        if token._.is_long_number:
            token.tag_ = 'Number'
        if token.is_punct:
            token.tag_ = 'Punctuation'

    # Write the tokens to data frame
    df = pd.DataFrame()
    df['Token'] = [token.text for token in doc]
    df['POS'] = [token.pos_ for token in doc]
    df['NE'] = [token.ent_iob_ for token in doc]
    df['Lemma'] = [token.lemma_ for token in doc]
    df['Tag'] = [token.tag_ for token in doc]
    df['Language'] = np.nan
    df['Candidate'] = True
    df['Anglicism'] = np.nan
    return df
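A minimal usage sketch for the function above (the sample text is illustrative; it assumes pandas, numpy, spacy.tokens.Token and spacy.matcher.Matcher are imported at module level as in the surrounding code, and that en_core_web_sm is installed):

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Check out #spacy: 1,200 users signed up at https://example.com today!")
df = custom_tokenizer_to_df(nlp, doc)
print(df[['Token', 'Tag']])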
Example #10
    ])

    infix_re = compile_infix_regex(infixes)

    return Tokenizer(nlp.vocab,
                     prefix_search=nlp.tokenizer.prefix_search,
                     suffix_search=nlp.tokenizer.suffix_search,
                     infix_finditer=infix_re.finditer,
                     token_match=nlp.tokenizer.token_match,
                     rules=nlp.Defaults.tokenizer_exceptions)


nlp = spacy.load("en_core_web_sm")
nlp.tokenizer = custom_tokenizer(nlp)

MATCHER = Matcher(nlp.vocab)
RELATION_PATTERN = [{'DEP': 'ROOT'},
                    {'DEP': 'prep', 'OP': "?"},
                    {'DEP': 'agent', 'OP': "?"},
                    {'POS': 'ADJ', 'OP': "?"}]


def extract_noun_chunks(doc, dep_tag):
Example #11
    def __init__(self):
        self.count = {
            "0": 0,
            "1": 0,
            "2": 0,
            "3": 0,
            "4": 0,
            "5": 0,
            "6": 0,
            "7": 0,
            "8": 0,
            "9": 0,
            "10": 0
        }
        self.compa_sent_count = 0

        self.nlp = spacy.load("en")
        self.matcher = Matcher(self.nlp.vocab)
        # self.matcher.add(6,
        #             None,
        #             [{'ORTH': 'JJR'}, {'ORTH': 'CIN'}, {'ORTH': 'TECH'}],
        #             [{'ORTH': 'JJR'}, {}, {'ORTH': 'CIN'}, {'ORTH': 'TECH'}],
        #             [{'ORTH': 'JJR'}, {'ORTH': 'CIN'}, {}, {'ORTH': 'TECH'}],
        #             [{'ORTH': 'JJR'}, {}, {'ORTH': 'CIN'}, {}, {'ORTH': 'TECH'}])
        # self.matcher.add(7,
        #             None,
        #             [{'ORTH': 'RB'}, {'ORTH': 'JJ'}, {'ORTH': 'CIN'}, {'ORTH': 'TECH'}],
        #             [{'ORTH': 'RB'}, {'ORTH': 'JJ'}, {}, {'ORTH': 'CIN'}, {'ORTH': 'TECH'}])
        # self.matcher.add(8,
        #             None,
        #             [{'ORTH': 'RBR'}, {'ORTH': 'JJ'}, {'ORTH': 'CIN'}, {'ORTH': 'TECH'}],
        #             [{'ORTH': 'RBR'}, {'ORTH': 'JJ'}, {}, {'ORTH': 'CIN'}, {'ORTH': 'TECH'}])
        #
        #
        # self.matcher.add(4,
        #                  None,
        #                  [{'ORTH': 'NN'}, {'ORTH': 'IN'}, {'ORTH': 'TECH'}, {'ORTH': 'VBZ'},  {}, {'ORTH': 'RB'}],
        #                  [{'ORTH': 'NN'}, {'ORTH': 'IN'}, {'ORTH': 'TECH'}, {}, {'ORTH': 'VBZ'}],
        #                  [{'ORTH': 'NN'}, {'ORTH': 'IN'}, {'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {'ORTH': 'RB'}],
        #                  [{'ORTH': 'NN'}, {'ORTH': 'IN'}, {}, {'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {'ORTH': 'RB'}],
        #                  [{'ORTH': 'NN'}, {'ORTH': 'IN'}, {}, {'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {}, {'ORTH': 'RB'}],
        #
        #
        #                  )
        #
        self.matcher.add(
            5, None,
            [{'ORTH': 'TECH'}, {'ORTH': 'VBP'}, {'ORTH': 'NN'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VBP'}, {'ORTH': 'NN'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VBP'}, {}, {'ORTH': 'NN'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VBP'}, {}, {'ORTH': 'NN'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {'ORTH': 'NN'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VBZ'}, {'ORTH': 'NN'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {}, {'ORTH': 'NN'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VBZ'}, {}, {'ORTH': 'NN'}])
        self.matcher.add(
            1, None,
            [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {'ORTH': 'JJ'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VBZ'}, {'ORTH': 'JJ'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {}, {'ORTH': 'JJ'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {}, {'ORTH': 'JJ'}],
            # [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {}, {}, {}, {'ORTH': 'JJ'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VBZ'}, {}, {'ORTH': 'JJ'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VBD'}, {'ORTH': 'JJ'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VBD'}, {'ORTH': 'JJ'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VBD'}, {}, {'ORTH': 'JJ'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VBD'}, {}, {'ORTH': 'JJ'}],
            # [{'ORTH': 'TECH'}, {'ORTH': 'VBD'}, {}, {}, {}, {'ORTH': 'JJ'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VBD'}, {}, {'ORTH': 'JJ'}])
        self.matcher.add(
            3, None,
            [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {'ORTH': 'RB'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VBZ'}, {'ORTH': 'RB'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {}, {'ORTH': 'RB'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {}, {'ORTH': 'RB'}],
            # [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {}, {}, {}, {'ORTH': 'RB'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VBZ'}, {}, {'ORTH': 'RB'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VBD'}, {'ORTH': 'RB'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VBD'}, {'ORTH': 'RB'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VBD'}, {}, {'ORTH': 'RB'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VBD'}, {}, {'ORTH': 'RB'}],
            # [{'ORTH': 'TECH'}, {'ORTH': 'VBD'}, {}, {}, {}, {'ORTH': 'RB'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VBD'}, {}, {'ORTH': 'RB'}])
Example #12
def feature_extraction(df, ft_model, nlp):
    # Extracting all the single nouns in the corpus
    all_nouns = []

    for review in df['spacyObj']:
        for token in review:
            if token.pos_ == "NOUN":
                all_nouns.append(token.text)

    all_nouns = pd.Series(all_nouns)
    # Finding unique nouns along with their counts sorted in descending order
    unique_nouns = all_nouns.value_counts()

    noun_phrases = []

    # Pattern to match, i.e. two nouns occurring together
    patterns = [[{'TAG': 'NN'}, {'TAG': 'NN'}]]

    matcher = Matcher(nlp.vocab)
    matcher.add('NounPhrases', patterns)

    for review in df['spacyObj']:
        matches = matcher(review)

        for match_id, start, end in matches:
            noun_phrases.append(review[start:end].text)

    noun_phrases = pd.Series(noun_phrases)
    unique_noun_phrases = noun_phrases.value_counts()

    # Remove nouns with single or double character
    for noun in unique_nouns.index:
        # if noun length is less than 3 or if nouns contain any numbers, it is considered invalid
        if len(noun) < 3 or re.match(r".*[0-9].*", noun) is not None:
            del unique_nouns[noun]

    # Extracting Top Features

    top2 = len(unique_nouns) * 0.05  # considering top 5% of features
    top2 = int(top2)

    top_features = unique_nouns[0:top2]

    # this will contain all the final features
    features_bucket = OrderedDict()

    top_features_list = list(top_features.keys())
    top_features_set = set(top_features.keys())
    unique_noun_phrases_set = set(unique_noun_phrases.keys())

    # Applying association rule mining to group nouns occurring together
    for feature1 in top_features_list:
        for feature2 in top_features_list:
            feature_phrase = feature1 + ' ' + feature2

            if feature1 in top_features_set and feature2 in top_features_set and feature_phrase in unique_noun_phrases_set:
                # If the condition is true, we have identified a noun phrase which is a combination of two nouns
                # in the top_features, so one of the nouns can be eliminated from top features.

                # Ex. if "battery life" is found, then "life" can be eliminated from top features as it is not a feature
                # by itself; it is just part of the feature "battery life".

                # Now we need to find out whether the frequency of the lesser occurring noun (in our ex., the word "life")
                # matches the frequency of the noun phrase (in our ex., "battery life") within a certain confidence.
                # If it does, we can be sure that the lesser occurring noun occurs mostly in that particular noun phrase,
                # i.e. in our ex. "life" occurs primarily in the phrase "battery life".

                lesser_occurring_noun = ""
                often_occurring_noun = ""
                if unique_nouns[feature1] < unique_nouns[feature2]:
                    lesser_occurring_noun = feature1
                    often_occurring_noun = feature2
                else:
                    lesser_occurring_noun = feature2
                    often_occurring_noun = feature1

                # assuming a confidence threshold of 40%,
                # i.e. in the 'battery life' example, out of all the times that 'life' is seen, 'battery' is seen next to it at least 40% of the time.
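                # (illustrative, hypothetical counts) if 'battery life' occurs 45 times and
                # 'life' occurs 100 times in total, then 45/100 = 0.45 > 0.4, so 'life' is
                # folded under 'battery' instead of being kept as a feature of its own.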

                if unique_noun_phrases[feature_phrase] / unique_nouns[
                        lesser_occurring_noun] > 0.4:
                    try:
                        if often_occurring_noun not in features_bucket:
                            features_bucket[often_occurring_noun] = []
                        features_bucket[often_occurring_noun].append(
                            lesser_occurring_noun)
                        top_features_set.remove(lesser_occurring_noun)
                        # print(lesser_occurring_noun)
                    except BaseException as error:
                        print(error)
                        continue

    main_features = list(features_bucket.keys())
    top_features_to_add = set(top_features_list[:20])

    # Here we manually add the top 20 nouns as features if they were not already
    # added by the association rule mining step above.
    # But before adding, we check whether any similar nouns exist among the 20 nouns.
    # Ex. if 'display' and 'screen' both occur in the top 20, we must add only the more commonly
    # occurring of the two and remove the other.

    # Here we are only eliminating the nouns that are similar to existing ones in features_bucket.
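    # (illustrative, hypothetical values) if 'screen' is already a key in features_bucket
    # and 'display' appears in the top 20 with cosine similarity 0.70 > 0.64 to it,
    # 'display' is dropped from the set of features still to be added.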
    for feature1 in top_features_list[:20]:
        for feature2 in main_features:
            if feature1 not in features_bucket and feature1 in top_features_set:
                similarity = cosine_similarity(
                    ft_model.get_word_vector(feature1).reshape(1, -1),
                    ft_model.get_word_vector(feature2).reshape(1, -1))
                if similarity[0][0] > 0.64:
                    top_features_to_add.discard(feature1)

            else:
                top_features_to_add.discard(feature1)

    top_features_to_add_list = list(top_features_to_add)

    # Here we are eliminating nouns that are similar to one another in the top_features_to_add
    for feature1 in top_features_to_add_list:
        for feature2 in top_features_to_add_list:
            if feature1 in top_features_to_add and feature2 in top_features_to_add:
                similarity = cosine_similarity(
                    ft_model.get_word_vector(feature1).reshape(1, -1),
                    ft_model.get_word_vector(feature2).reshape(1, -1))
                if similarity[0][0] < 0.99 and similarity[0][0] > 0.64:
                    feature_to_remove = min(
                        (unique_nouns[feature1], feature1),
                        (unique_nouns[feature2], feature2))[1]
                    top_features_to_add.remove(feature_to_remove)

    for feature in top_features_to_add:
        features_bucket[feature] = []

    for main_noun in features_bucket.keys():
        top_features_set.remove(main_noun)

    # Here we are going through the top 5% of the nouns that we were originally considering and checking
    # whether any of them are similar to the ones already present in features_bucket.
    top_features_copy = list(top_features_set)
    main_features = features_bucket.keys()

    for feature2 in top_features_copy:
        best_similarity = 0
        most_matching_main_feature = ""

        for feature1 in main_features:
            if feature2 in top_features_set:
                similarity = cosine_similarity(
                    ft_model.get_word_vector(feature1).reshape(1, -1),
                    ft_model.get_word_vector(feature2).reshape(1, -1))
                if similarity[0][0] <= 0.99 and similarity[0][0] > 0.62:
                    if similarity[0][0] > best_similarity:
                        best_similarity = similarity[0][0]
                        most_matching_main_feature = feature1

        if best_similarity != 0 and most_matching_main_feature != "":
            features_bucket[most_matching_main_feature].append(feature2)
            top_features_set.remove(feature2)

    # We finally sort the features in descending order based on how often they occur.
    final_features = list(features_bucket.items())

    final_features_with_counts = []
    for feature in final_features:
        count = unique_nouns[feature[0]]
        final_features_with_counts.append((feature, count))

    final_features_with_counts.sort(key=lambda x: x[1], reverse=True)

    final_features = OrderedDict()
    for feature, count in final_features_with_counts:
        final_features[feature[0]] = feature[1]

    return final_features
Example #13
    :param i: is the index of the text matches
    :param matches: matches found in the text
    """

    match_id, start, end = matches[i]  # indices of matched term
    span = doc[start:end]              # extract matched term

    print('span: {} | start_ind:{:5} | end_ind:{:5} | id:{}'.format(
        span, start, end, match_id))

# set a pattern of text to collect
# find all mentions of the word fees
pattern = [{'LOWER': 'fees'}]  # LOWER converts words to lowercase before matching

# instantiate matcher
matcher = Matcher(nlp.vocab)

# add pattern to the matcher (one matcher can look for many unique patterns)
# provide a pattern name, a function to apply to matches, and the pattern to identify
matcher.add('fee', collect_sents, pattern)

# pass the doc to the matcher to run the collect_sents function
matcher(doc)
# change the function to print the sentence of the matched term (span)

def collect_sents(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    span = doc[start:end]
    print('SPAN: {}'.format(span))

    # span.sent provides the sentence that contains the span
Example #14
def task2(sentence,timestamp):
  # loading spacy model
  nlp = spacy.load("en_core_web_sm")
  import en_core_web_sm
  nlp = en_core_web_sm.load()

  print(sentence)
  if (isAlredyPresent(sentence) == False):
    processedTweets.append(sentence)
    call(["aplay", "Air.wav"])
    doc = nlp(sentence)
 #   print(sutime.SUTime(sentence))
    #  print([(X.text, X.label_) for X in doc.ents])

    # Tokenization
    tokens = nltk.word_tokenize(sentence)
    #print("Tokens: ", tokens)
    #  tweetFile = open("stanford-ner-2018-10-16/tweet.txt", 'w')

    nlp = spacy.load("en_core_web_sm")
    # Matcher class object
    matcher = Matcher(nlp.vocab)
    matcher.add("matching", None, [{'POS': 'PROPN'}, {'LOWER': {'IN': ['ave', 'avenue', 'st', 'street',
                                                                       'rd', 'road', 'dr', 'drive', 'pkwy', 'parkway',
                                                                       'bend', 'bnd', 'boulevard', 'blvd', 'court',
                                                                       'ct',
                                                                       'expressway', 'expy', 'freeway', 'fwy',
                                                                       'highway', 'hwy', 'junction', 'jct', 'lane',
                                                                       'ln', 'loop', 'motorway', 'mtwy',
                                                                       'parkway', 'pkwy', 'point', 'pt', 'ramp',
                                                                       'turnpike', 'tpke', 'tunnel', 'tunl',
                                                                       'underpass']}}])

    matches = matcher(doc)
    span = ""
    for match_id, start, end in matches:
        span = doc[start:end]
    # print(span)

    st = StanfordNERTagger('stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz',
                           "stanford-ner-2018-10-16/stanford-ner.jar", encoding='utf-8')
    classifiedText = st.tag(tokens)
    location = ""
    #print(classifiedText)
    i = 0
    locationMatches = []
    for eachOut in classifiedText:
        if "LOCATION" in eachOut[1]:
            locationMatches.append(eachOut[0])
    # print(locationMatches)
    span = str(span)
    #print(span)
    # Lemmatization without POS tags
    lems = []
    lemmatizer = WordNetLemmatizer()
    pos_sen = nltk.pos_tag(tokens)
    #print("\n POS Tags: \n", pos_sen)

    pos_wn = [(s[0], penn_to_wn(s[1])) for s in pos_sen]
    # print("\n POS Tags for wordnet: \n", pos_wn)

    lems_pos = []
    for w in pos_wn:
        if (w[1]):
            lems_pos.append(lemmatizer.lemmatize(w[0], pos=w[1]))
        else:
            lems_pos.append(lemmatizer.lemmatize(w[0]))
    # print("\n Lemmatization by taking into account the pos tags: \n")
    # print(lems_pos)

    # the same proper-noun walk is applied after "on", "at", and "AT"
    for marker in ("on", "at", "AT"):
        if marker not in tokens:
            continue
        try:
            x = tokens.index(marker)
            x += 1
            while pos_sen[x][1] == "NNP":
                if pos_sen[x][0] not in locationMatches:
                    locationMatches.append(pos_sen[x][0])
                x += 1
            if (pos_sen[x][1] == "CD" and pos_sen[x + 1][1] == "NNP"
                    and pos_sen[x + 1][0] not in ("AM", "am", "pm", "PM")):
                if pos_sen[x][0] not in locationMatches:
                    locationMatches.append(pos_sen[x][0])
                if pos_sen[x + 1][0] not in locationMatches:
                    locationMatches.append(pos_sen[x + 1][0])
                x += 2
                while pos_sen[x][1] == "NNP":
                    if pos_sen[x][0] not in locationMatches:
                        locationMatches.append(pos_sen[x][0])
                    x += 1
        except:
            pass
    #print(locationMatches)
    removal=[]
    if (len(locationMatches) > 0 and len(span) > 0):
        for eachMatch in locationMatches:
            #print(len(locationMatches))
            try:
                #print(span.find(eachMatch))
                if span.find(eachMatch) != -1:
                    removal.append(eachMatch)
            except:
                print("Exception Distinct")

        for removeItem in removal:
            locationMatches.remove(removeItem)

    location = (span + " " + " ".join(locationMatches)).strip()


    #Extracting Time using Regular Expression:
    re6 = r"(24:00|2[0-3]:[0-5][0-9]|[0-1][0-9]:[0-5][0-9]:[0-5][0-9])([\s]*[AaPp][Mm])"
    re2 = r"(24:00|2[0-3]:[0-5][0-9]|[0-1][0-9]:[0-5][0-9]:[0-5][0-9])"
    re3 = r"24:00|2[0-3]:[0-5][0-9]|[0-1][0-9]:[0-5][0-9]([\s]*[AaPp][Mm])"
    re4 = r"24:00|2[0-3]:[0-5][0-9]|[0-1][0-9]:[0-5][0-9]"
    re5 = r"([0-9][0-9]?:[0-5][0-9]|[0-1][0-9]:[0-5][0-9])([\s]*[AaPp]*[Mm]*)"
    re1 = r"([0-9][0-9]*:[0-5][0-9]:[0-5][0-9])([\s]*[AaPp]*[Mm]*)"
    re7 = r"([0-9][0-9]*:[0-5][0-9])"

    try:
        time=(re.compile("(%s|%s|%s|%s|%s|%s|%s)" % (re1, re2, re3, re4, re5, re6, re7)).findall(sentence))[0][0]
        time=str(time)
        if(len(time.strip())>0):
            print("Time: "+str(time))
            timestamp=time
    except BaseException as e:
        print("Time : "+timestamp)


    severity= severity_classifier.severity_finder(sentence)
    severityStr=""
    for eachKeyword in severity:
        severityStr+=str(eachKeyword)+" "
    print("Severity: "+severityStr)

    if (len(location) > 0):
        print("Location: " + location)
        e2 = {"predictedClassLabel": "Accidental", "tweet": sentence, "timestamp": timestamp, "location":location,"severity":severityStr}
    else:
        e2 = {"predictedClassLabel": "Accidental", "tweet": sentence, "timestamp": timestamp,"severity":severityStr}
    res2 = es.index(index=indexName2, doc_type=typeName2, body=e2)
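
For reference, a minimal standalone sketch of the street-suffix pattern used inside task2 (same spaCy 2.x add() signature; en_core_web_sm, the shortened suffix list, and the example sentence are illustrative assumptions, and whether a match fires depends on the tagger):

import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
# proper noun followed by a street-type word, as in the function above
matcher.add("matching", None, [{'POS': 'PROPN'},
                               {'LOWER': {'IN': ['ave', 'avenue', 'st', 'street', 'rd', 'road']}}])
doc = nlp("Two cars collided on Madison Avenue near the park.")
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)  # expected to print something like "Madison Avenue"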
def test_matcher_no_zero_length(en_vocab):
    doc = Doc(en_vocab, words=["a", "b"], tags=["A", "B"])
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"TAG": "C", "OP": "?"}]])
    assert len(matcher(doc)) == 0
Example #16
def ground_mentioned_concepts(nlp, matcher, s, ans=None):

    s = s.lower()
    doc = nlp(s)
    matches = matcher(doc)

    mentioned_concepts = set()
    span_to_concepts = {}

    # print('ground step 0')
    if ans is not None:
        ans_matcher = Matcher(nlp.vocab)
        ans_words = nlp(ans)
        # print(ans_words)
        ans_matcher.add(ans, None, [{
            'TEXT': token.text.lower()
        } for token in ans_words])

        ans_match = ans_matcher(doc)
        ans_mentions = set()
        for _, ans_start, ans_end in ans_match:
            ans_mentions.add((ans_start, ans_end))

    # print('ground step 1')
    for match_id, start, end in matches:
        if ans is not None:
            if (start, end) in ans_mentions:
                continue

        span = doc[start:end].text  # the matched span

        # a word that appears in answer is not considered as a mention in the question
        # if len(set(span.split(" ")).intersection(set(ans.split(" ")))) > 0:
        #     continue
        original_concept = nlp.vocab.strings[match_id]
        original_concept_set = set()
        original_concept_set.add(original_concept)

        # print("span", span)
        # print("concept", original_concept)
        # print("Matched '" + span + "' to the rule '" + string_id)

        # why do you lemmatize a mention whose len == 1?

        if len(original_concept.split("_")) == 1:
            # tag = doc[start].tag_
            # if tag in ['VBN', 'VBG']:

            original_concept_set.update(
                lemmatize(nlp, nlp.vocab.strings[match_id]))

        if span not in span_to_concepts:
            span_to_concepts[span] = set()

        span_to_concepts[span].update(original_concept_set)

    # print('ground step 2')
    for span, concepts in span_to_concepts.items():
        concepts_sorted = list(concepts)
        # print("span:")
        # print(span)
        # print("concept_sorted:")
        # print(concepts_sorted)
        concepts_sorted.sort(key=len)

        # mentioned_concepts.update(concepts_sorted[0:2])

        shortest = concepts_sorted[0:3]

        for c in shortest:
            if c in blacklist:
                continue

            # a set with one string like: set("like_apples")
            lcs = lemmatize(nlp, c)
            intersect = lcs.intersection(shortest)
            if len(intersect) > 0:
                mentioned_concepts.add(list(intersect)[0])
            else:
                mentioned_concepts.add(c)

        # if a mention exactly matches with a concept

        exact_match = set([
            concept for concept in concepts_sorted
            if concept.replace("_", " ").lower() == span.lower()
        ])
        # print("exact match:")
        # print(exact_match)
        # print('assert len exact match')
        assert len(exact_match) < 2
        mentioned_concepts.update(exact_match)

    return mentioned_concepts
Example #17
def test_invalid_greediness(doc, text):
    matcher = Matcher(doc.vocab)
    with pytest.raises(ValueError):
        matcher.add("RULE", [pattern1], greedy="GREEDY")
Example #18
def load_date_matcher(nlp):

    # Create matcher object with list of rules and return
    matcher = Matcher(nlp.vocab)

    # Add to vocab
    add_to_vocab(nlp, months_dict.keys())
    add_to_vocab(nlp, ordinals)
    add_to_vocab(nlp, date_delimiters)

    # Create flag for MONTH
    is_month = FLAG62
    target_ids = {nlp.vocab.strings[s.lower()] for s in months_dict.keys()}
    for lexeme in nlp.vocab:
        if lexeme.lower in target_ids:
            lexeme.set_flag(is_month, True)

    # Create flag for ORDINALS
    is_ordinal = FLAG61
    target_ids = {nlp.vocab.strings[s.lower()] for s in ordinals}
    for lexeme in nlp.vocab:
        if lexeme.lower in target_ids:
            lexeme.set_flag(is_ordinal, True)

    # Create flag for DATE_DELIMITER
    is_date_delimiter = FLAG60
    target_ids = {nlp.vocab.strings[s.lower()] for s in date_delimiters}
    for lexeme in nlp.vocab:
        if lexeme.lower in target_ids:
            lexeme.set_flag(is_date_delimiter, True)

    # print('December', nlp.vocab.__contains__('December'))
    # print('Diciembre', nlp.vocab.__contains__('diciembre'))

    # print('December', nlp.vocab['december'].check_flag(is_month))
    # print('Diciembre', nlp.vocab['diciembre'].check_flag(is_month))

    # Add rules

    # March 25, 2017
    # March 25th, 2017
    # March 25th 2017
    # March 25 2017
    matcher.add_pattern('DATE', [{
        is_month: True
    }, {
        IS_DIGIT: True,
        LENGTH: 1
    }, {
        is_ordinal: True,
        'OP': '?'
    }, {
        ORTH: ',',
        'OP': '?'
    }, {
        IS_DIGIT: True,
        LENGTH: 4
    }],
                        label=1)
    matcher.add_pattern('DATE', [{
        is_month: True
    }, {
        IS_DIGIT: True,
        LENGTH: 2
    }, {
        is_ordinal: True,
        'OP': '?'
    }, {
        ORTH: ',',
        'OP': '?'
    }, {
        IS_DIGIT: True,
        LENGTH: 4
    }],
                        label=2)

    # 25 March, 2017
    # 25th March, 2017
    # 25th March 2017
    # 25 March 2017
    matcher.add_pattern('DATE', [{
        IS_DIGIT: True,
        LENGTH: 1
    }, {
        is_date_delimiter: True,
        'OP': '?'
    }, {
        is_month: True
    }, {
        is_ordinal: True,
        'OP': '?'
    }, {
        ORTH: ',',
        'OP': '?'
    }, {
        IS_DIGIT: True,
        LENGTH: 4
    }],
                        label=3)
    matcher.add_pattern('DATE', [{
        IS_DIGIT: True,
        LENGTH: 2
    }, {
        is_date_delimiter: True,
        'OP': '?'
    }, {
        is_month: True
    }, {
        is_ordinal: True,
        'OP': '?'
    }, {
        ORTH: ',',
        'OP': '?'
    }, {
        IS_DIGIT: True,
        LENGTH: 4
    }],
                        label=4)

    # 25/05/2016
    matcher.add_pattern('DATE', [{
        IS_DIGIT: True,
        LENGTH: 1
    }, {
        is_date_delimiter: True,
        'OP': '+'
    }, {
        is_month: True
    }, {
        is_date_delimiter: True,
        'OP': '+'
    }, {
        IS_DIGIT: True,
        LENGTH: 4
    }],
                        label=5)
    matcher.add_pattern('DATE', [{
        IS_DIGIT: True,
        LENGTH: 2
    }, {
        is_date_delimiter: True,
        'OP': '+'
    }, {
        is_month: True
    }, {
        is_date_delimiter: True,
        'OP': '+'
    }, {
        IS_DIGIT: True,
        LENGTH: 4
    }],
                        label=6)

    # 05/25/2016
    matcher.add_pattern('DATE', [{
        is_month: True
    }, {
        is_date_delimiter: True,
        'OP': '+'
    }, {
        IS_DIGIT: True,
        LENGTH: 1
    }, {
        is_date_delimiter: True,
        'OP': '+'
    }, {
        IS_DIGIT: True,
        LENGTH: 4
    }],
                        label=7)
    matcher.add_pattern('DATE', [{
        is_month: True
    }, {
        is_date_delimiter: True,
        'OP': '+'
    }, {
        IS_DIGIT: True,
        LENGTH: 2
    }, {
        is_date_delimiter: True,
        'OP': '+'
    }, {
        IS_DIGIT: True,
        LENGTH: 4
    }],
                        label=8)

    # Diciembre, 2009
    # December 2009
    matcher.add_pattern('DATE', [{
        is_month: True,
        IS_DIGIT: False
    }, {
        ORTH: ','
    }, {
        IS_DIGIT: True,
        LENGTH: 4
    }],
                        label=9)
    matcher.add_pattern('DATE', [{
        is_month: True,
        IS_DIGIT: False
    }, {
        IS_DIGIT: True,
        LENGTH: 4
    }],
                        label=9)

    # 2013-12-04
    matcher.add_pattern('DATE', [{
        IS_DIGIT: True,
        LENGTH: 4
    }, {
        is_date_delimiter: True,
        'OP': '+'
    }, {
        is_month: True
    }, {
        is_date_delimiter: True,
        'OP': '+'
    }, {
        IS_DIGIT: True,
        LENGTH: 2
    }],
                        label=10)
    matcher.add_pattern('DATE', [{
        IS_DIGIT: True,
        LENGTH: 4
    }, {
        is_date_delimiter: True,
        'OP': '+'
    }, {
        is_month: True
    }, {
        is_date_delimiter: True,
        'OP': '+'
    }, {
        IS_DIGIT: True,
        LENGTH: 1
    }],
                        label=11)

    # 9 days ago
    matcher.add_pattern('DATE', [{
        IS_DIGIT: True
    }, {
        POS: 'NOUN'
    }, {
        LOWER: 'ago'
    }],
                        label=12)

    # 1 Jul
    # 1. Jul
    matcher.add_pattern('DATE', [{
        IS_DIGIT: True,
        LENGTH: 2
    }, {
        is_ordinal: True
    }, {
        is_date_delimiter: True
    }, {
        is_month: True,
        IS_DIGIT: False
    }],
                        label=13)
    matcher.add_pattern('DATE', [{
        IS_DIGIT: True,
        LENGTH: 2
    }, {
        is_ordinal: True
    }, {
        is_month: True,
        IS_DIGIT: False
    }],
                        label=13)
    matcher.add_pattern('DATE', [{
        IS_DIGIT: True,
        LENGTH: 2
    }, {
        is_date_delimiter: True
    }, {
        is_month: True,
        IS_DIGIT: False
    }],
                        label=13)
    matcher.add_pattern('DATE', [{
        IS_DIGIT: True,
        LENGTH: 2
    }, {
        is_month: True,
        IS_DIGIT: False
    }],
                        label=13)

    matcher.add_pattern('DATE', [{
        IS_DIGIT: True,
        LENGTH: 1
    }, {
        is_ordinal: True
    }, {
        is_date_delimiter: True
    }, {
        is_month: True,
        IS_DIGIT: False
    }],
                        label=14)
    matcher.add_pattern('DATE', [{
        IS_DIGIT: True,
        LENGTH: 1
    }, {
        is_ordinal: True
    }, {
        is_month: True,
        IS_DIGIT: False
    }],
                        label=14)
    matcher.add_pattern('DATE', [{
        IS_DIGIT: True,
        LENGTH: 1
    }, {
        is_date_delimiter: True
    }, {
        is_month: True,
        IS_DIGIT: False
    }],
                        label=14)
    matcher.add_pattern('DATE', [{
        IS_DIGIT: True,
        LENGTH: 1
    }, {
        is_month: True,
        IS_DIGIT: False
    }],
                        label=14)

    # Jul 2nd
    matcher.add_pattern('DATE', [{
        is_month: True,
        IS_DIGIT: False
    }, {
        is_date_delimiter: True
    }, {
        IS_DIGIT: True,
        LENGTH: 2
    }, {
        is_ordinal: True
    }],
                        label=15)
    matcher.add_pattern('DATE', [{
        is_month: True,
        IS_DIGIT: False
    }, {
        is_date_delimiter: True
    }, {
        IS_DIGIT: True,
        LENGTH: 2
    }],
                        label=15)
    matcher.add_pattern('DATE', [{
        is_month: True,
        IS_DIGIT: False
    }, {
        IS_DIGIT: True,
        LENGTH: 2
    }, {
        is_ordinal: True
    }],
                        label=15)
    matcher.add_pattern('DATE', [{
        is_month: True,
        IS_DIGIT: False
    }, {
        IS_DIGIT: True,
        LENGTH: 2
    }],
                        label=15)

    matcher.add_pattern('DATE', [{
        is_month: True,
        IS_DIGIT: False
    }, {
        is_date_delimiter: True
    }, {
        IS_DIGIT: True,
        LENGTH: 1
    }, {
        is_ordinal: True
    }],
                        label=16)
    matcher.add_pattern('DATE', [{
        is_month: True,
        IS_DIGIT: False
    }, {
        is_date_delimiter: True
    }, {
        IS_DIGIT: True,
        LENGTH: 1
    }],
                        label=16)
    matcher.add_pattern('DATE', [{
        is_month: True,
        IS_DIGIT: False
    }, {
        IS_DIGIT: True,
        LENGTH: 1
    }, {
        is_ordinal: True
    }],
                        label=16)
    matcher.add_pattern('DATE', [{
        is_month: True,
        IS_DIGIT: False
    }, {
        IS_DIGIT: True,
        LENGTH: 1
    }],
                        label=16)

    return matcher
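
The add_pattern() calls and the FLAG60-FLAG62 lexeme flags above belong to the long-deprecated spaCy 1.x matcher API. As a hedged sketch only (the month list is an illustrative stand-in for months_dict, and ordinal/delimiter handling is omitted), the first rule family ("March 25, 2017" / "March 25 2017") might look like this with the current Matcher API:

from spacy.matcher import Matcher

MONTHS = ["january", "february", "march", "april", "may", "june",
          "july", "august", "september", "october", "november", "december"]

def load_date_matcher_v3(nlp):
    matcher = Matcher(nlp.vocab)
    matcher.add("DATE", [[
        {"LOWER": {"IN": MONTHS}},                # month name
        {"IS_DIGIT": True, "LENGTH": {"<=": 2}},  # day: one or two digits
        {"ORTH": ",", "OP": "?"},                 # optional comma
        {"IS_DIGIT": True, "LENGTH": 4},          # four-digit year
    ]])
    return matcher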
Example #19
def make_matcher(vocab, max_length):
    abstract_patterns = []
    for length in range(1, max_length+1):
        abstract_patterns.append([{tag: True} for tag in get_bilou(length)])
    return Matcher(vocab, {'Candidate': ('CAND', {}, abstract_patterns)})
Example #20
def test_issue588(en_vocab):
    matcher = Matcher(en_vocab)
    with pytest.raises(ValueError):
        matcher.add("TEST", [[]])
Example #21
edu = {}
# Extract education degree
for idx, text in enumerate(nlp_text):
    text_unigrams = text.split()
    text_bigrams = [tup[0] + tup[1] for tup in list(ngrams(text_unigrams, 2))]
    all_grams = text_unigrams + text_bigrams

    for tok in all_grams:
        # Replace special symbols and lowercase
        re_tok = re.sub(SYMBOLS_ext, '', tok.lower().strip())
        print(re_tok)
        if re_tok in EDUCATION and re_tok not in STOPWORDS:
            edu[tok] = text + nlp_text[idx + 1]

matcher = Matcher(cv_obj.nlp.vocab)

nlp_text = cv_obj.doc

# First name and Last name are always Proper Nouns
pattern = [{'POS': 'PROPN'}, {'POS': 'PROPN'}]

matcher.add('NAME', None, pattern)

matches = matcher(nlp_text)

for match_id, start, end in matches:
    span = nlp_text[start:end]
    print(span.text)

# test
Example #22
def identify_GROSS_TONNAGE_in_text(text):
    nlp = English()
    doc = nlp(text)
    matcher = Matcher(nlp.vocab)

    #
    # START - spaCy patterns
    #

    # GROSS_TONNAGE
    patterns = [[{
        "LOWER": {
            "IN": ["bruttotonnasje"]
        }
    }, {
        "LOWER": {
            "IN": ["opp", "ned"]
        }
    }, {
        "LOWER": {
            "IN": ["til"]
        }
    }, {
        'IS_DIGIT': True
    }],
                [{
                    "LOWER": {
                        "IN": ["bruttotonnasje"]
                    }
                }, {
                    "LOWER": {
                        "IN": ["under", "over"]
                    }
                }, {
                    'IS_DIGIT': True
                }],
                [{
                    "LOWER": {
                        "IN": ["bruttotonnasje"]
                    }
                }, {
                    'IS_DIGIT': True
                }, {
                    "LOWER": {
                        "IN": ["eller"]
                    }
                }, {
                    "LOWER": {
                        "IN": ["mer", "mindre"]
                    }
                }]]

    matcher.add("GROSS_TONNAGE", patterns)

    #
    # END - spaCy patterns
    #

    result = []

    for match_id, token_start, token_end in matcher(doc):

        match_id_as_string = nlp.vocab.strings[match_id]
        final_token_start = token_start
        final_token_end = token_end

        spacy_pattern_detection = doc[token_start:token_end]
        spacy_pattern_detection_as_lower_text = spacy_pattern_detection.text.lower()

        #
        # convert token_span to char_span.
        # char_span is needed to display correctly with displacy.render().
        #
        span = doc[final_token_start:final_token_end]
        span_char_start = span[0].idx
        span_char_end = span[-1].idx + len(span[-1].text)

        # return result
        identified_entity = {
            'start': span_char_start,
            'end': span_char_end,
            'label': match_id_as_string
        }
        result.append(identified_entity)

    return result
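
Side note: a spaCy Span already carries character offsets, so the manual idx arithmetic in the loop above could in principle be written more directly. A sketch reusing the names from the loop (illustrative, not the original code):

span = doc[final_token_start:final_token_end]
identified_entity = {
    'start': span.start_char,  # equals span[0].idx
    'end': span.end_char,      # equals span[-1].idx + len(span[-1].text)
    'label': match_id_as_string
}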
Example #23
def coronaAnalysis(sha, abstract, count, textcount):
    #doc = nlp(text)
    textcount = 0

    cleantext = [
        t.text for t in abstract if not t.is_stop and t.ent_type_ != 'GPE'
    ]  # remove stop words. Exclude Geographic location

    # convert list to nlp doc
    cleandoc = Doc(nlp.vocab, words=cleantext)

    matcher = Matcher(nlp.vocab)

    #print("Search for ", pattern22)
    #matcher.add("medicalcare", None, pattern2, pattern3, pattern4, pattern5)
    #matcher.add("medicalcare", None, pattern2)
    #matcher.add("medicalcare", None, pattern5)
    #matcher.add("medicalcare", None, pattern6)
    matcher.add("medicalcare", None, pattern21)
    matches = matcher(cleandoc)

    #print(matches)

    for match_id, start, end in matches:

        moveleft = 0
        moveright = 0

        leftwords = []
        rightwords = []

        string_id = nlp.vocab.strings[match_id]  # Get string representation
        span = cleandoc[start:end]  # The matched span
        #print("Span :", span, '\n')
        print(start, end, span.text)
        #print("Len clean Doc :", len(cleandoc))
        #print("Moveleft ", moveleft)
        #print(" Doc Lenght ", len(cleandoc))
        #print(cleandoc[start-1])
        # walk left from the match until a "." or the start of the doc
        while (start - moveleft >= 0
               and str(cleandoc[start - moveleft]) != "."):
            #print("Prev Word :", cleandoc[start - moveleft])
            leftwords.append(cleandoc[start - moveleft])
            moveleft = moveleft + 1
        leftwords.reverse()
        #print("Left Words :", leftwords)
        #print("Moveright ", moveright)
        #print(" Doc Lenght ", len(cleandoc))

        while ((len(cleandoc) > end + moveright)
               and (str(cleandoc[end + moveright]) != ".")):

            #print("Next Word :", cleandoc[end + moveright])
            moveright = moveright + 1
            #print("movement :", moveright)
            #print("MOVE RIGHT count :", moveright)
            #print("End", end)
            #print("Abstract Length : ", len(abstract))
            #print("Clean Doc Size :", len(cleandoc))

            #print("Sum :", end + moveright)

            if len(cleandoc) == end + moveright:
                break
            rightwords.append(cleandoc[end + moveright])

        #rightwords.reverse()
        #print("Right Words :", rightwords)
        combinedList = leftwords + rightwords
        sentence = ' '.join(map(str, combinedList))
        sentence = sentence.replace(".", "")
        #print("Combined Words ", combinedList, 'SHA ', sha, 'Keyword ', span.text)
        print("Sentence ", sentence, 'SHA ', sha, 'Keyword ', span.text)

        medical_care.append([sha, span.text, sentence])

        #print(start, end, span.text, span.label)
        #print(doc)
        #print(cleandoc)
        #text_list.append([sha, cleandoc])
        #word_dict[span.text] = {}  # create dictionary for keyword
        #word_dict[span.text][cleandoc[start - 1]] = -1
        textcount += 1
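
An alternative sketch for recovering the sentence around each match, assuming nlp is a full pipeline (or at least has a sentencizer) so that sentence boundaries are set; this would replace the manual left/right token walk above and is illustrative only:

clean_doc2 = nlp(" ".join(cleantext))      # re-parse so Doc.sents is populated
for match_id, start, end in matcher(clean_doc2):
    span = clean_doc2[start:end]
    sentence = span.sent.text              # the sentence containing the keyword
    medical_care.append([sha, span.text, sentence])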
Example #24
    def __init__(self):
        self.count = {
            "0": 0,
            "1": 0,
            "2": 0,
            "3": 0,
            "4": 0,
            "5": 0,
            "6": 0,
            "7": 0,
            "8": 0,
            "9": 0,
            "10": 0
        }
        self.compa_sent_count = 0

        self.nlp = spacy.load("en_core_web_sm")
        self.matcher = Matcher(self.nlp.vocab)
        self.matcher.add(0, None, [{
            'ORTH': 'JJR'
        }, {
            'ORTH': 'CIN'
        }, {
            'ORTH': 'TECH'
        }], [{
            'ORTH': 'JJR'
        }, {}, {
            'ORTH': 'CIN'
        }, {
            'ORTH': 'TECH'
        }], [{
            'ORTH': 'JJR'
        }, {
            'ORTH': 'CIN'
        }, {}, {
            'ORTH': 'TECH'
        }], [{
            'ORTH': 'JJR'
        }, {}, {
            'ORTH': 'CIN'
        }, {}, {
            'ORTH': 'TECH'
        }])

        self.matcher.add(8, None, [{
            'ORTH': 'RBR'
        }, {
            'ORTH': 'JJ'
        }, {
            'ORTH': 'CIN'
        }, {
            'ORTH': 'TECH'
        }], [{
            'ORTH': 'RBR'
        }, {
            'ORTH': 'JJ'
        }, {}, {
            'ORTH': 'CIN'
        }, {
            'ORTH': 'TECH'
        }])
        self.matcher.add(2, None, [{
            'ORTH': 'CV'
        }, {
            'ORTH': 'CIN'
        }, {
            'ORTH': 'TECH'
        }], [{
            'ORTH': 'CV'
        }, {}, {
            'ORTH': 'CIN'
        }, {
            'ORTH': 'TECH'
        }])
        self.matcher.add(3, None, [{
            'ORTH': 'CV'
        }, {
            'ORTH': 'VBG'
        }, {
            'ORTH': 'TECH'
        }])

        # self.matcher.add(6,
        #             None,
        #             [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {'ORTH': 'JJS'}],
        #             [{'ORTH': 'TECH'}, {}, {'ORTH': 'VBZ'}, {'ORTH': 'JJS'}],
        #             [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {}, {'ORTH': 'JJS'}],
        #             [{'ORTH': 'TECH'}, {}, {'ORTH': 'VBZ'}, {}, {'ORTH': 'JJS'}])
        self.matcher.add(10, None, [{
            'ORTH': 'TECH'
        }, {
            'ORTH': 'VBZ'
        }, {
            'ORTH': 'RBR'
        }], [{
            'ORTH': 'TECH'
        }, {}, {
            'ORTH': 'VBZ'
        }, {
            'ORTH': 'RBR'
        }], [{
            'ORTH': 'TECH'
        }, {
            'ORTH': 'VBZ'
        }, {}, {
            'ORTH': 'RBR'
        }], [{
            'ORTH': 'TECH'
        }, {}, {
            'ORTH': 'VBZ'
        }, {}, {
            'ORTH': 'RBR'
        }])
        self.matcher.add(7, None, [{
            'ORTH': 'TECH'
        }, {
            'ORTH': 'VBZ'
        }, {
            'ORTH': 'JJR'
        }], [{
            'ORTH': 'TECH'
        }, {}, {
            'ORTH': 'VBZ'
        }, {
            'ORTH': 'JJR'
        }], [{
            'ORTH': 'TECH'
        }, {
            'ORTH': 'VBZ'
        }, {}, {
            'ORTH': 'JJR'
        }], [{
            'ORTH': 'TECH'
        }, {}, {
            'ORTH': 'VBZ'
        }, {}, {
            'ORTH': 'JJR'
        }])
def test_matcher_pattern_validation(en_vocab, pattern):
    matcher = Matcher(en_vocab, validate=True)
    with pytest.raises(MatchPatternError):
        matcher.add("TEST", [pattern])
def test_matcher_basic_check(en_vocab):
    matcher = Matcher(en_vocab)
    # Potential mistake: pass in pattern instead of list of patterns
    pattern = [{"TEXT": "hello"}, {"TEXT": "world"}]
    with pytest.raises(ValueError):
        matcher.add("TEST", pattern)
Example #27
def patternSearch(T_0, file):
    phrase_patterns = set()
    seed_pattern = [nlp(x) for x in T_0]
    phrase_matcher = PhraseMatcher(nlp.vocab)
    phrase_matcher.add('pattern search', None, *seed_pattern)
    # find occurrences of seed phrases
    with open(file, "r") as f:
        document = nlp(f.read().lower())
        matches = phrase_matcher(document)
        for match_id, start, end in matches:
            p = tuple((start, end))
            if p not in phrase_patterns:
                phrase_patterns.add(p)
    # find patterns around seed phrases
    unranked_patterns = []
    with open(file, "r") as f:
        text = nlp(f.read().lower())
        for phrase_pattern in phrase_patterns:
            start = phrase_pattern[0]
            end = phrase_pattern[1]
            if (text[start - 1].text == '\n'):
                continue
            # add context pattern
            tmp = []
            for i in range(2, 0, -1):
                tmp.append({"TEXT": text[start - i].text})
            # add content pattern
            span = text[start:end]
            for token in span:
                tmp.append({"POS": token.pos_})
            if tmp not in unranked_patterns:
                unranked_patterns.append(tmp)
                print(tmp)
    unranked_phrases = list(getPhrases(file, unranked_patterns))
    # build context graph
    context_graph = nx.Graph()
    # add tuples and patterns into graph
    for i in range(len(unranked_phrases)):
        node = 't' + str(i)
        context_graph.add_node(node, pos=(0, i))
    for i in range(len(unranked_patterns)):
        node = 'p' + str(i)
        context_graph.add_node(node, pos=(2, i))
    context_matrix = np.zeros((len(unranked_phrases), len(unranked_patterns)))
    # find c (t, p)
    with open(file, 'r') as f:
        t = f.read().lower()
        matcher = Matcher(nlp.vocab)
        doc = nlp(t)
        for i in range(len(unranked_patterns)):
            matcher.add("extraction", None, unranked_patterns[i])
            matches = matcher(doc)
            for match_id, start, end in matches:
                span = doc[start + 2:end].text
                j = unranked_phrases.index(span)
                context_matrix[j, i] += 1
            matcher.remove("extraction")
    # add context nodes into graph
    c_count = 0
    for i in range(context_matrix.shape[0]):
        for j in range(context_matrix.shape[1]):
            if context_matrix[i, j] != 0:
                occur = context_matrix[i, j]
                node_t = 't' + str(i)
                node_p = 'p' + str(j)
                node_c = 'c' + str(c_count)
                c_count += 1
                context_graph.add_node(node_c, pos=(1, c_count))
                context_graph.add_edge(node_t, node_c, weight=occur)
                context_graph.add_edge(node_c, node_p, weight=occur)
    # draw context graph
    plt.figure()
    pos = nx.get_node_attributes(context_graph, 'pos')
    nx.draw(context_graph, pos, with_labels=True)
    labels = nx.get_edge_attributes(context_graph, 'weight')
    nx.draw_networkx_edge_labels(context_graph, pos, edge_labels=labels)
    # return patterns
    return unranked_phrases
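
A possible variation on the add()/remove() loop above (a sketch, not the author's code): register every candidate pattern once under its own key and recover the pattern index from match_id, which avoids re-adding the "extraction" rule on every iteration:

matcher = Matcher(nlp.vocab)
for i, p in enumerate(unranked_patterns):
    matcher.add("extraction_%d" % i, None, p)   # spaCy 2.x signature, as above
for match_id, start, end in matcher(doc):
    i = int(nlp.vocab.strings[match_id].rsplit("_", 1)[1])
    span = doc[start + 2:end].text              # skip the two context tokens
    if span in unranked_phrases:
        context_matrix[unranked_phrases.index(span), i] += 1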
def test_matcher_valid_callback(en_vocab):
    """Test that on_match can only be None or callable."""
    matcher = Matcher(en_vocab)
    with pytest.raises(ValueError):
        matcher.add("TEST", [[{"TEXT": "test"}]], on_match=[])
    matcher(Doc(en_vocab, words=["test"]))
Example #29
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

doc = nlp(
    "Features of the app include a beautiful design, smart search, automatic "
    "labels and optional voice responses.")

# Write a pattern that matches an adjective followed by one or two nouns
pattern = [{"POS": "ADJ"}, {"POS": "NOUN"}, {"POS": "NOUN", "OP": "?"}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("ADJ_NOUN_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))

# Iterate over the matches and print the matched span of text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)
Example #30
def run_prdualrank(T_0, unranked_patterns, unranked_phrases, file):
    global final_patterns, final_keywords, pattern_to_score_map, keyword_to_score_map, ngram_prob_map, phrase_seg_score, removed_phrases, wiki_ir_cache, error_count, total_ngram_counts
    phrase2id = {}
    for i in range(len(unranked_phrases)):
        phrase2id[unranked_phrases[i]] = i

    id2phrase = {}
    for i in range(len(unranked_phrases)):
        id2phrase[i] = unranked_phrases[i]

    id2pattern = {}
    for i in range(len(unranked_patterns)):
        id2pattern[i] = unranked_patterns[i]

    seedIdwConfidence = {}
    for key, val in phrase2id.items():
        if key in T_0:
            seedIdwConfidence[val] = 0.0

    id2patterns = defaultdict(set)
    pattern2ids = defaultdict(set)

    context_matrix = np.zeros((len(unranked_phrases), len(unranked_patterns)))
    # find c (t, p)
    with open(file, 'r') as f:
        file_chunk = partition(f)
        matcher = Matcher(nlp.vocab)
        for t in file_chunk:
            doc = nlp(t)
            for i in range(len(unranked_patterns)):
                offset = 0
                for pattern_dict in unranked_patterns[i]:
                    if 'POS' in pattern_dict:
                        break
                    offset += 1
                matcher.add("extraction", None, unranked_patterns[i])
                matches = matcher(doc)
                for match_id, start, end in matches:
                    span = doc[start+offset:end].text
                    j = unranked_phrases.index(span) if span in unranked_phrases else -1
                    if j == -1:
                        continue
                    context_matrix[j, i] += 1
                    id2patterns[j].add(i)
                    pattern2ids[i].add(j)
                matcher.remove("extraction")


    id2sup = {}
    for i in range(len(unranked_phrases)):
        id2sup[i] = 0

    pattern2sup = {}
    for i in range(len(unranked_patterns)):
        pattern2sup[i] = 0

    for phrase_id in id2patterns.keys():
        total = 0
        for col in range(len(unranked_patterns)):
            total += context_matrix[phrase_id, col]
        id2sup[phrase_id] = total

    for pattern in pattern2ids.keys():
        total = 0
        for row in range(len(unranked_phrases)):
            total += context_matrix[row, pattern]
        pattern2sup[pattern] = total

    l1, l2, l3, l4, m1, m2, m3, m4 = prDualRank(seedIdwConfidence, [], id2patterns, pattern2ids, {},
             {}, {}, {}, id2phrase, context_matrix.tolist(), id2sup, pattern2sup,
             FLAGS_VERBOSE=False, FLAGS_DEBUG=False)

    return l1, l2, l3, l4, m1, m2, m3, m4