def __init__(self): self.count = { "0": 0, "1": 0, "2": 0, "3": 0, "4": 0, "5": 0, "6": 0, "7": 0, "8": 0, "9": 0, "10": 0 } self.compa_sent_count = 0 self.matcher = Matcher(nlp.vocab) self.matcher.add(0, None, [{ 'ORTH': 'JJR' }, { 'ORTH': 'CIN' }, { 'ORTH': 'TECH' }], [{ 'ORTH': 'JJR' }, {}, { 'ORTH': 'CIN' }, { 'ORTH': 'TECH' }], [{ 'ORTH': 'JJR' }, { 'ORTH': 'CIN' }, {}, { 'ORTH': 'TECH' }], [{ 'ORTH': 'JJR' }, {}, { 'ORTH': 'CIN' }, {}, { 'ORTH': 'TECH' }], [{ 'ORTH': 'JJ' }, { 'ORTH': 'CIN' }, { 'ORTH': 'TECH' }], [{ 'ORTH': 'JJ' }, {}, { 'ORTH': 'CIN' }, { 'ORTH': 'TECH' }], [{ 'ORTH': 'JJ' }, { 'ORTH': 'CIN' }, {}, { 'ORTH': 'TECH' }], [{ 'ORTH': 'JJ' }, {}, { 'ORTH': 'CIN' }, {}, { 'ORTH': 'TECH' }]) self.matcher.add( 1, None, [{ 'ORTH': 'VB' }, { 'ORTH': 'TECH' }, { 'ORTH': 'TO' }, { 'ORTH': 'VB' }], [{ 'ORTH': 'VB' }, { 'ORTH': 'TECH' }, {}, { 'ORTH': 'TO' }, { 'ORTH': 'VB' }], ) self.matcher.add(8, None, [{ 'ORTH': 'RBR' }, { 'ORTH': 'JJ' }, { 'ORTH': 'CIN' }, { 'ORTH': 'TECH' }], [{ 'ORTH': 'RBR' }, { 'ORTH': 'JJ' }, {}, { 'ORTH': 'CIN' }, { 'ORTH': 'TECH' }]) self.matcher.add(2, None, [{ 'ORTH': 'CV' }, { 'ORTH': 'CIN' }, { 'ORTH': 'TECH' }], [{ 'ORTH': 'CV' }, {}, { 'ORTH': 'CIN' }, { 'ORTH': 'TECH' }]) self.matcher.add(3, None, [{ 'ORTH': 'CV' }, { 'ORTH': 'VBG' }, { 'ORTH': 'TECH' }]) self.matcher.add( 5, None, [{ 'ORTH': 'TECH' }, { 'ORTH': 'VB' }, { 'ORTH': 'NN' }], [{ 'ORTH': 'TECH' }, {}, { 'ORTH': 'VB' }, { 'ORTH': 'NN' }], [{ 'ORTH': 'TECH' }, { 'ORTH': 'VB' }, {}, { 'ORTH': 'NN' }], [{ 'ORTH': 'TECH' }, {}, { 'ORTH': 'VB' }, {}, { 'ORTH': 'NN' }], [{ 'ORTH': 'TECH' }, { 'ORTH': 'VB' }, { 'ORTH': 'NN' }], [{ 'ORTH': 'TECH' }, {}, { 'ORTH': 'VB' }, { 'ORTH': 'NN' }], [{ 'ORTH': 'TECH' }, { 'ORTH': 'VB' }, {}, { 'ORTH': 'NN' }], [{ 'ORTH': 'TECH' }, {}, { 'ORTH': 'VB' }, {}, { 'ORTH': 'NN' }], ) # self.matcher.add(6, # None, # [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {'ORTH': 'JJS'}], # [{'ORTH': 'TECH'}, {}, {'ORTH': 'VBZ'}, {'ORTH': 'JJS'}], # [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {}, {'ORTH': 'JJS'}], # [{'ORTH': 'TECH'}, {}, {'ORTH': 'VBZ'}, {}, {'ORTH': 'JJS'}]) self.matcher.add(7, None, [{ 'ORTH': 'TECH' }, { 'ORTH': 'VB' }, { 'ORTH': 'JJR' }], [{ 'ORTH': 'TECH' }, {}, { 'ORTH': 'VB' }, { 'ORTH': 'JJR' }], [{ 'ORTH': 'TECH' }, { 'ORTH': 'VB' }, {}, { 'ORTH': 'JJR' }], [{ 'ORTH': 'TECH' }, { 'ORTH': 'VB' }, {}, { 'ORTH': 'JJR' }], [{ 'ORTH': 'TECH' }, { 'ORTH': 'VBZ' }, {}, {}, { 'ORTH': 'JJR' }], [{ 'ORTH': 'TECH' }, {}, { 'ORTH': 'VB' }, {}, { 'ORTH': 'JJR' }], [{ 'ORTH': 'TECH' }, { 'ORTH': 'VB' }, { 'ORTH': 'JJR' }], [{ 'ORTH': 'TECH' }, {}, { 'ORTH': 'VB' }, { 'ORTH': 'JJR' }], [{ 'ORTH': 'TECH' }, { 'ORTH': 'VB' }, {}, { 'ORTH': 'JJR' }], [{ 'ORTH': 'TECH' }, { 'ORTH': 'VB' }, {}, { 'ORTH': 'JJR' }], [{ 'ORTH': 'TECH' }, { 'ORTH': 'VB' }, {}, {}, {}, { 'ORTH': 'JJR' }], [{ 'ORTH': 'TECH' }, {}, { 'ORTH': 'VB' }, {}, { 'ORTH': 'JJR' }], [{ 'ORTH': 'TECH' }, {}, { 'ORTH': 'JJR' }]) self.matcher.add(10, None, [{ 'ORTH': 'TECH' }, { 'ORTH': 'VB' }, { 'ORTH': 'RBR' }], [{ 'ORTH': 'TECH' }, {}, { 'ORTH': 'VB' }, { 'ORTH': 'RBR' }], [{ 'ORTH': 'TECH' }, { 'ORTH': 'VB' }, {}, { 'ORTH': 'RBR' }], [{ 'ORTH': 'TECH' }, { 'ORTH': 'VB' }, {}, { 'ORTH': 'RBR' }], [{ 'ORTH': 'TECH' }, { 'ORTH': 'VB' }, {}, {}, {}, { 'ORTH': 'RBR' }], [{ 'ORTH': 'TECH' }, {}, { 'ORTH': 'VB' }, {}, { 'ORTH': 'RBR' }], [{ 'ORTH': 'TECH' }, { 'ORTH': 'VB' }, { 'ORTH': 'RBR' }], [{ 'ORTH': 'TECH' }, {}, { 'ORTH': 'VB' }, { 'ORTH': 'RBR' }], [{ 'ORTH': 'TECH' }, { 'ORTH': 'VB' }, {}, { 'ORTH': 'RBR' 
}], [{ 'ORTH': 'TECH' }, { 'ORTH': 'VB' }, {}, { 'ORTH': 'RBR' }], [{ 'ORTH': 'TECH' }, { 'ORTH': 'VB' }, {}, {}, { 'ORTH': 'RBR' }], [{ 'ORTH': 'TECH' }, {}, { 'ORTH': 'VB' }, {}, { 'ORTH': 'RBR' }], [{ 'ORTH': 'TECH' }, {}, { 'ORTH': 'RBR' }]) # self.matcher.add(9, # None, # [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {'ORTH': 'RBS'}], # [{'ORTH': 'TECH'}, {}, {'ORTH': 'VBZ'}, {'ORTH': 'RBS'}], # [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {}, {'ORTH': 'RBS'}], # [{'ORTH': 'TECH'}, {}, {'ORTH': 'VBZ'}, {}, {'ORTH': 'RBS'}]) self.matcher.add( 11, None, [{ 'ORTH': 'TECH' }, { 'ORTH': 'NP' }], [{ 'ORTH': 'TECH' }, {}, { 'ORTH': 'NP' }], [{ 'ORTH': 'TECH' }, {}, { 'ORTH': 'NP' }], )
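# --- Hedged sketch (not part of the original) --------------------------------
# The ORTH patterns above match literal tag strings such as 'JJR', 'CIN' and
# 'TECH', which only makes sense if the matcher is run over a doc whose tokens
# *are* tags rather than words. A minimal sketch of that setup, assuming a
# preprocessing step that rewrites a sentence into a tag sequence (technology
# mentions replaced by TECH, the comparative "than" marked CIN). The tech list
# and helper names are illustrative, and the sketch uses the current
# Matcher.add(key, [patterns]) signature rather than the older one used above.
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
tech_terms = {"python", "java"}  # assumed placeholder list of technology names


def to_tag_doc(sentence):
    """Rewrite a sentence as a whitespace-joined sequence of PTB-style tags."""
    tags = []
    for token in nlp(sentence):
        if token.lower_ in tech_terms:
            tags.append("TECH")   # technology-mention placeholder
        elif token.lower_ == "than":
            tags.append("CIN")    # assumed marker for the comparative "than"
        else:
            tags.append(token.tag_)
    return nlp.make_doc(" ".join(tags))


tag_matcher = Matcher(nlp.vocab)
tag_matcher.add("JJR_CIN_TECH", [[{"ORTH": "JJR"}, {"ORTH": "CIN"}, {"ORTH": "TECH"}]])

tag_doc = to_tag_doc("Python is faster than Java")
print([tag_doc[s:e].text for _, s, e in tag_matcher(tag_doc)])  # e.g. ['JJR CIN TECH']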
def get_search_result(): req = request.get_json() query = req.get('Query') process_query = query # query after removing all matched patterns doc = nlp(query) phraseMatcher = PhraseMatcher(nlp.vocab, attr='LOWER') tokenMatcher = Matcher(nlp.vocab) GE_terms = [ "lok sabha", "ls", "ge", "general election", "general elections", "national" ] GE_patterns = list(nlp.tokenizer.pipe(GE_terms)) phraseMatcher.add("GE_PATTERN", None, *GE_patterns) AE_terms = [ "ae", "vidhan sabha", "state election", "state elections", "assembly election", "assembly elections" ] AE_patterns = list(nlp.tokenizer.pipe(AE_terms)) phraseMatcher.add("AE_PATTERN", None, *AE_patterns) state_patterns = [nlp.make_doc(key) for key in stateNamesDict] phraseMatcher.add("STATE_PATTERN", None, *state_patterns) matches = phraseMatcher(doc) electionType = "" stateName = "Lok_Sabha" party = [] years = [] for i in range(len(matches)): string_id = nlp.vocab.strings[matches[i][0]] if string_id == "GE_PATTERN": electionType = "GE" elif string_id == "AE_PATTERN": electionType = "AE" elif string_id == "STATE_PATTERN": start, end = matches[i][1], matches[i][2] span = doc[start:end] stateName = stateNamesDict.get(span.text.lower()) if i < len(matches) - 1 and (matches[i][1] != matches[i + 1][1]): start, end = matches[i][1], matches[i][2] span = doc[start:end] process_query = re.sub(span.text, '', process_query) tokenMatcher.add("YEAR_PATTERN", None, [{ "TEXT": { "REGEX": "[1-9][0-9][0-9][0-9]" } }]) matches2 = tokenMatcher(doc) for match_id, start, end in matches2: span = doc[start:end] years.append(span.text) process_query = re.sub(span.text, '', process_query) new_doc = nlp(process_query) codes_json = open('ChartsMapsCodes.json') codes_data = json.load(codes_json) similar_modules = {} for code in codes_data: similar_modules[code['modulename']] = new_doc.similarity( nlp(code['title'])) sorted_modules = sorted(similar_modules.items(), key=operator.itemgetter(1), reverse=True) module = "" full_party_names = {} party_options_modules = [ "cvoteShareChart", "seatShareChart", "tvoteShareChart", "strikeRateChart" ] for i in range(len(sorted_modules)): module_name = sorted_modules[i][0] if module_name in party_options_modules: module = module_name break connection = connectdb(db_config) if connection.is_connected(): cursor = connection.cursor() cursor.execute("show tables") tables = cursor.fetchall() db_tables = [] for (table, ) in tables: db_tables.append(table) tableName = module_to_table(module) if tableName in db_tables: cursor = connection.cursor(prepared=True) query_input = list() get_table = "Select distinct Party from " + tableName get_count = "Select count(distinct Party) as count from " + tableName get_full_names = "Select distinct Party,Expanded_Party_Name from " + tableName # query_input.append(tableName) get_election = " where Election_Type = %s" if electionType == "": query_input.append("GE") else: query_input.append(electionType) get_state = "" if stateName is not None: get_state = " and State_Name = %s" query_input.append(stateName) party_names_query = get_full_names + get_election + get_state + " and position <10" cursor.execute(party_names_query, tuple(query_input)) party_names = cursor.fetchall() print(query_input) for (name, full_name) in party_names: # print(name) # print(full_name) full_party_names.update({name: full_name}) party_patterns = [] for key, value in full_party_names.items(): print(key, value) if key is not None: party_patterns.append(nlp.make_doc(key)) if value is not None: party_patterns.append(nlp.make_doc(value)) 
partyMatcher = PhraseMatcher(nlp.vocab, attr='LOWER') partyMatcher.add("PARTY_PATTERN", None, *party_patterns) party_matches = partyMatcher(new_doc) for match_id, start, end in party_matches: span = doc[start:end] party_match = span.text.upper() for key, value in full_party_names.items(): if party_match == key or party_match == value: party.append(key) results = {} results["electionType"] = electionType results["stateName"] = stateName results["year"] = years results["similarModules"] = sorted_modules results["party"] = party return jsonify({'results': results})
class NLP(): nlp = spacy.load('en_core_web_sm') nlp.add_pipe(Readability(), last=True) matcher = Matcher(nlp.vocab) def __init__(self, text): self.doc = self.nlp(text) self.blob = TextBlob(self.doc.text) self.readability = self.readability_indexes() self.word_tokens = self.tokenize_words(self.doc) self.sents = list(self.doc.sents) self.polysyllables = self.get_polysyllables(self.word_tokens[1]) self.nominalized_words = self.get_nominalized(self.word_tokens[1]) self.pos = self.get_pos(self.doc) self.prepositional_phrases = self.get_pps(self.doc) self.passive_phrases = self.get_passive_phrases(self.doc) self.get_pronouns(self.doc) self.get_weak_verbs(self.doc) self.sentence_count = len(self.sents) self.statistics() self.word_count = len(self.word_tokens[1]) self.get_freq_dist() #self.lexicon_count = len(self.lexicon) self.get_intities() def readability_indexes(self): readability_scores = {} readability_scores['ari'] = self.doc._.automated_readability_index readability_scores['coleman_liau_index'] = self.doc._.coleman_liau_index readability_scores['dale_chall'] = self.doc._.dale_chall readability_scores['flesch_kincaid_grade'] = self.doc._.flesch_kincaid_grade_level readability_scores['flesch_kincaid_re'] = self.doc._.flesch_kincaid_reading_ease readability_scores['forcast'] = self.doc._.forcast readability_scores['smog'] = self.doc._.smog return readability_scores def tokenize_words(self, document): spacy_word_tokens = [t.text for t in document] no_punct_word_tokens = [] for w in spacy_word_tokens: for p in punctuation: w = w.replace(p, "").replace("\n", "").replace("", '') no_punct_word_tokens.append(w.lower()) no_punct_word_tokens.remove('') return (spacy_word_tokens, no_punct_word_tokens) def get_polysyllables(self, some_list): polysyllables = [] for w in some_list: if syllables.estimate(w) > 3: polysyllables.append(w) return polysyllables # def get_polysyllables2(self, doc): # phoney = BigPhoney() # self.total_syllables = phoney.count_syllables(self.doc.text) # self.polys = [] # for token in doc: # if phoney.count_syllables(token.text) > 3: # self.polys.append(token.text) # else: # pass def get_nominalized(self, list): nominalized_words = {} nominalized_words['-tion words'] = [] for word in list: if word.endswith("tion"): nominalized_words['-tion words'].append(word) else: pass return nominalized_words def get_pos(self, nlp_doc): parts_of_speech = {} parts_of_speech['gerunds'] = [] parts_of_speech['adjectives'] = [] parts_of_speech['adverbs'] = [] parts_of_speech['prepositions'] = [] for token in nlp_doc: if token.tag_ == "VBG": parts_of_speech['gerunds'].append(token.text) elif token.pos_ == "ADJ": parts_of_speech['adjectives'].append(token.text) elif token.pos_ == "ADV": parts_of_speech['adverbs'].append(token.text) else: pass return parts_of_speech def get_pps(self, doc): #Function to get prepositions from a parsed document. 
pps = [] for token in doc: if token.pos_ == 'ADP': pp = ' '.join([tok.orth_ for tok in token.subtree]) pps.append(pp) return pps def get_passive_phrases(self, doc): self.passive_sents = [] passive_phrases = [] passive_rule = [{'DEP': 'nsubjpass'}, {'DEP':'aux','OP':'*'}, {'DEP':'auxpass'}, {'TAG':'VBN'} ] self.matcher.add('passive', None, passive_rule) sents = list(doc.sents) matches = self.matcher(doc) for match_id, start, end in matches: string_id = doc.vocab.strings[match_id] span = doc[start:end] passive_phrases.append(span.text) for s in self.sents: for p in passive_phrases: if p in s.text: self.passive_sents.append(s.text) #return passive_phrases def get_weak_verbs(self, doc): self.weak_verbs = {} self.weak_verbs['to be'] = [] self.weak_verbs['auxiliary'] = [] for token in doc: if token.lemma_ == "be": self.weak_verbs['to be'].append(token.text) elif token.pos_ == 'AUX': self.weak_verbs['auxiliary'].append(token.text) else: pass def get_pronouns(self, doc): self.personal_pronouns = {} self.personal_pronouns['first person pronouns'] = [] self.personal_pronouns['second person pronouns'] = [] self.pronouns = [] for token in doc: if token.tag_ == 'PRP' or token.tag_ == "PRP$": if token.text.lower() in ['i', 'me', 'mine', 'my', 'myself']: self.personal_pronouns['first person pronouns'].append(token.text) elif token.text.lower() in ['you', 'your', 'yours', 'yourself']: self.personal_pronouns['second person pronouns'].append(token.text) else: pass elif token.pos_ == "PRON": self.pronouns.append(token.text.lower()) else: pass def statistics(self): self.statistics = {} self.statistics['per sentence'] = {} # rate per sentence self.statistics['per sentence'].update({'preposition rate':len(self.prepositional_phrases)/self.sentence_count}) self.statistics['per sentence'].update({'be rate':len(self.weak_verbs['to be'])/self.sentence_count}) self.statistics['per sentence'].update({'passive rate':len(self.passive_sents)/self.sentence_count}) self.statistics['percent of sentences'] = {} self.statistics['percent of sentences'].update({'prepositions':self.statistics['per sentence']['preposition rate'] * 100}) self.statistics['percent of sentences'].update({'to be':self.statistics['per sentence']['be rate'] * 100}) self.statistics['percent of sentences'].update({'passives':self.statistics['per sentence']['passive rate'] * 100}) self.statistics['ratios'] = {} self.statistics['ratios'].update({'adverbs to adjectives':len(self.pos['adverbs'])/len(self.pos['adjectives'])}) def get_freq_dist(self): words = [token.text for token in self.doc if token.is_stop != True and token.is_punct != True and token.text.isalpha() == True] nouns = [token.text for token in self.doc if token.is_stop != True and token.is_punct != True and token.pos_ == "NOUN" and token.text.isalpha() == True] verbs = [token.text for token in self.doc if token.is_stop != True and token.is_punct != True and token.pos_ == "VERB" and token.text.isalpha() == True] word_freq = Counter(words) noun_freq = Counter(nouns) verb_freq = Counter(verbs) self.common_words = word_freq.most_common(10) self.common_nouns = noun_freq.most_common(10) self.common_verbs = verb_freq.most_common(10) def get_intities(self): self.entities = {} for ent in self.doc.ents: self.entities[ent.text] = ent.label_
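# --- Hedged usage sketch (not part of the original) ---------------------------
# Exercises a few of the attributes the NLP class above computes in __init__;
# assumes the same third-party packages it relies on (spacy_readability,
# textblob, syllables) are installed alongside a spaCy 2.x pipeline. The sample
# text is illustrative only.
sample = (
    "The lengthy proposal was rejected by the committee. "
    "I think the reviewers were unusually harsh in their evaluation."
)
report = NLP(sample)
print(report.readability["flesch_kincaid_grade"])
print(report.statistics["per sentence"])   # preposition / "to be" / passive rates
print(report.prepositional_phrases)        # e.g. ['by the committee', 'in their evaluation']
print(report.passive_sents)                # sentences hit by the passive-voice rule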
def __init__(self, vocab, boundary_protection_rules=[]): self.matcher = Matcher(vocab) for rule in boundary_protection_rules: self.matcher.add(rule['label'], None, rule['pattern'])
# -*- coding: utf-8 -*-
# !/usr/bin/python
__author__ = "biavarone"

from spacy.matcher import Matcher

from utils import nlp

interactions_matcher = Matcher(nlp.vocab, validate=True)

# alone
alone1 = [{'LEMMA': 'on'}, {'LOWER': 'my'}, {'LEMMA': 'own'}]
alone2 = [{'LEMMA': 'by'}, {'LOWER': 'myself'}]
alone3 = [{'LEMMA': 'alone', 'POS': {'IN': ['ADV', 'ADJ']}}]
interactions_matcher.add('alone', None, alone1, alone2, alone3)

# animal
animal1 = [{'LEMMA': {'IN': ['animal', 'cat', 'cub', 'dog', 'kitten', 'kitty',
                             'pet', 'pup', 'puppy']},
            'POS': 'NOUN'}]
animal2 = [{'LOWER': {'IN': ['doggie', 'doggo', 'doggy']}, 'POS': 'NOUN'}]
interactions_matcher.add('animal', None, animal1, animal2)
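# --- Hedged usage sketch (not part of the original) ---------------------------
# Runs the two rules above over an illustrative sentence, reusing the `nlp`
# pipeline imported from utils.
doc = nlp("I spent the evening by myself and then walked my dog.")
for match_id, start, end in interactions_matcher(doc):
    print(nlp.vocab.strings[match_id], "->", doc[start:end].text)
# expected, roughly: alone -> by myself, animal -> dog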
from src.utils.LoopTimer import LoopTimer

path_to_db = "/media/norpheo/mySQL/db/ssorc"
nlp_model = "en_wa_v2"
path_to_annotations = os.path.join(path_to_db, "annotations_version", nlp_model)
pandas_path = os.path.join(path_to_db, "pandas")
path_to_ner = os.path.join(path_to_db, "NER")

threshold = 3

print("Loading NLP Model and Vocab")
nlp = spacy.load(os.path.join(path_to_db, "models", nlp_model))
vocab = nlp.vocab.from_disk(os.path.join(path_to_annotations, "spacy.vocab"))
matcher = Matcher(vocab)

mla = set()
with open(os.path.join(path_to_ner, "ml_algos_noacronyms.txt"), "r") as handle:
    for line in handle:
        mla.add(line.replace("\n", ""))

for ml_algo in mla:
    ml_doc = nlp(ml_algo)
    pattern = [{"LOWER": token.lower_} for token in ml_doc]
    pattern_name = "".join([entity["LOWER"] for entity in pattern]).lower()
    matcher.add(pattern_name, None, pattern)

infoDF = pd.read_pickle(os.path.join(path_to_annotations, 'info_db.pandas'))
targ = len(infoDF)

TRAIN_DATA = list()
def matches( doc: Doc, patterns: Union[str, List[str], List[Dict[str, str]], List[List[Dict[str, str]]]], *, on_match: Callable = None, ) -> Iterable[Span]: """ Extract ``Span`` s from a ``Doc`` matching one or more patterns of per-token attr:value pairs, with optional quantity qualifiers. Args: doc patterns: One or multiple patterns to match against ``doc`` using a :class:`spacy.matcher.Matcher`. If List[dict] or List[List[dict]], each pattern is specified as attr: value pairs per token, with optional quantity qualifiers: * ``[{"POS": "NOUN"}]`` matches singular or plural nouns, like "friend" or "enemies" * ``[{"POS": "PREP"}, {"POS": "DET", "OP": "?"}, {"POS": "ADJ", "OP": "?"}, {"POS": "NOUN", "OP": "+"}]`` matches prepositional phrases, like "in the future" or "from the distant past" * ``[{"IS_DIGIT": True}, {"TAG": "NNS"}]`` matches numbered plural nouns, like "60 seconds" or "2 beers" * ``[{"POS": "PROPN", "OP": "+"}, {}]`` matches proper nouns and whatever word follows them, like "Burton DeWilde yaaasss" If str or List[str], each pattern is specified as one or more per-token patterns separated by whitespace where attribute, value, and optional quantity qualifiers are delimited by colons. Note that boolean and integer values have special syntax --- "bool(val)" and "int(val)", respectively --- and that wildcard tokens still need a colon between the (empty) attribute and value strings. * ``"POS:NOUN"`` matches singular or plural nouns * ``"POS:PREP POS:DET:? POS:ADJ:? POS:NOUN:+"`` matches prepositional phrases * ``"IS_DIGIT:bool(True) TAG:NNS"`` matches numbered plural nouns * ``"POS:PROPN:+ :"`` matches proper nouns and whatever word follows them Also note that these pattern strings don't support spaCy v2.1's "extended" pattern syntax; if you need such complex patterns, it's probably better to use a List[dict] or List[List[dict]], anyway. on_match: Callback function to act on matches. Takes the arguments ``matcher``, ``doc``, ``i`` and ``matches``. Yields: Next matching ``Span`` in ``doc``, in order of appearance Raises: TypeError ValueError See Also: * https://spacy.io/usage/rule-based-matching * https://spacy.io/api/matcher """ # noqa: E501 if isinstance(patterns, str): patterns = [_make_pattern_from_string(patterns)] elif isinstance(patterns, (list, tuple)): if all(isinstance(item, str) for item in patterns): patterns = [_make_pattern_from_string(pattern) for pattern in patterns] elif all(isinstance(item, dict) for item in patterns): patterns = [patterns] elif all(isinstance(item, (list, tuple)) for item in patterns): pass # already in the right format! else: raise TypeError( errors.type_invalid_msg( "patterns", type(patterns), Union[ str, List[str], List[Dict[str, str]], List[List[Dict[str, str]]] ], ) ) else: raise TypeError( errors.type_invalid_msg( "patterns", type(patterns), Union[str, List[str], List[Dict[str, str]], List[List[Dict[str, str]]]], ) ) matcher = Matcher(doc.vocab) matcher.add("match", patterns, on_match=on_match) for _, start, end in matcher(doc): yield doc[start:end]
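# --- Hedged usage sketch (not part of the original) ---------------------------
# Calls the matches() helper above with dict-style patterns only; the string
# form depends on the _make_pattern_from_string helper that is referenced but
# not shown here. The pipeline and sentence are illustrative.
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("The distant past holds 60 seconds of pure silence.")

# adjective followed by a noun
print([span.text for span in matches(doc, [{"POS": "ADJ"}, {"POS": "NOUN"}])])

# numbered plural nouns, as in the docstring's "60 seconds" example
print([span.text for span in matches(doc, [{"IS_DIGIT": True}, {"TAG": "NNS"}])])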
def __init__(self, nlp): self.nlp = nlp Token.set_extension('feature_is_mass_unit', default=False) nlp.entity.add_label('mass_unit') Token.set_extension('feature_is_volume_unit', default=False) nlp.entity.add_label('volume_unit') Token.set_extension('feature_is_time_unit', default=False) nlp.entity.add_label('time_unit') Token.set_extension('feature_is_route_type', default=False) nlp.entity.add_label('route_type') Token.set_extension('feature_is_form_unit', default=False) nlp.entity.add_label('form_unit') Token.set_extension('feature_is_frequency_indicator', default=False) nlp.entity.add_label('frequency_indicator') Token.set_extension('feature_is_measurement_unit', default=False) nlp.entity.add_label('measurement_unit') Token.set_extension('feature_is_measurement', default=False) nlp.entity.add_label('measurement') Token.set_extension('feature_is_duration_pattern', default=False) nlp.entity.add_label('duration_pattern') self.mass_matcher = Matcher(nlp.vocab) self.volume_matcher = Matcher(nlp.vocab) self.time_matcher = Matcher(nlp.vocab) self.route_matcher = Matcher(nlp.vocab) self.form_matcher = Matcher(nlp.vocab) self.unit_of_measurement_matcher = Matcher(nlp.vocab) self.measurement_matcher = Matcher(nlp.vocab) self.frequency_matcher = Matcher(nlp.vocab) self.duration_matcher = Matcher(nlp.vocab) self.mass_matcher.add('UNIT_OF_MASS', None, [{ 'LOWER': 'mcg' }], [{ 'LOWER': 'microgram' }], [{ 'LOWER': 'micrograms' }], [{ 'ORTH': 'mg' }], [{ 'LOWER': 'milligram' }], [{ 'LOWER': 'g' }], [{ 'LOWER': 'kg' }], [{ 'ORTH': 'mEq' }]) self.volume_matcher.add('UNIT_OF_VOLUME', None, [{ 'LOWER': 'ml' }], [{ 'ORTH': 'dL' }], [{ 'LOWER': 'cc' }], [{ 'ORTH': 'L' }]) self.time_matcher.add('UNIT_OF_TIME', None, [{ 'LOWER': 'sec' }], [{ 'LOWER': 'second' }], [{ 'LOWER': 'seconds' }], [{ 'LOWER': 'min' }], [{ 'LOWER': 'minute' }], [{ 'LOWER': 'minutes' }], [{ 'LOWER': 'hr' }], [{ 'LOWER': 'hour' }], [{ 'LOWER': 'day' }], [{ 'LOWER': 'days' }], [{ 'LOWER': 'week' }], [{ 'LOWER': 'weeks' }], [{ 'LOWER': 'month' }], [{ 'LOWER': 'months' }], [{ 'LOWER': 'year' }], [{ 'LOWER': 'years' }], [{ 'LOWER': 'yrs' }]) self.frequency_matcher.add('FREQUENCY_MATCHER', None, [{ 'LOWER': 'bid' }], [{ 'LOWER': 'prn' }], [{ 'LOWER': 'qid' }], [{ 'LOWER': 'tid' }], [{ 'LOWER': 'qd' }], [{ 'LOWER': 'daily' }], [{ 'LOWER': 'hs' }], [{ 'LOWER': 'as' }, { 'LOWER': 'needed' }], [{ 'LOWER': 'once' }, { 'LOWER': 'a' }, { 'LOWER': 'day' }], [{ 'LOWER': 'twice' }, { 'LOWER': 'a' }, { 'LOWER': 'day' }]) self.form_matcher.add('UNIT_OF_FORM', None, [{ 'ORTH': 'dose' }], [{ 'ORTH': 'doses' }], [{ 'LEMMA': 'pill' }], [{ 'LEMMA': 'tablet' }], [{ 'LEMMA': 'unit' }], [{ 'LEMMA': 'u' }], [{ 'LEMMA': 'patch' }], [{ 'LEMMA': 'unit' }], [{ 'ORTH': 'lotion' }], [{ 'ORTH': 'powder' }], [{ 'ORTH': 'amps' }], [{ 'LOWER': 'actuation' }], [{ 'LEMMA': 'suspension' }], [{ 'LEMMA': 'syringe' }], [{ 'LEMMA': 'puff' }], [{ 'LEMMA': 'liquid' }], [{ 'LEMMA': 'aerosol' }], [{ 'LEMMA': 'cap' }]) self.route_matcher.add('TYPE_OF_ROUTE', None, [{ 'LOWER': 'IV' }], [{ 'ORTH': 'intravenous' }], [{ 'LOWER': 'po' }], [{ 'ORTH': 'gtt' }], [{ 'LOWER': 'drip' }], [{ 'LOWER': 'inhalation' }], [{ 'LOWER': 'by' }, { 'LOWER': 'mouth' }], [{ 'LOWER': 'topical' }], [{ 'LOWER': 'subcutaneous' }], [{ 'LOWER': 'ophthalmic' }], [{ 'LEMMA': 'injection' }], [{ 'LOWER': 'mucous' }, { 'LOWER': 'membrane' }], [{ 'LOWER': 'oral' }], [{ 'LOWER': 'nebs' }], [{ 'LOWER': 'transdermal' }], [{ 'LOWER': 'nasal' }]) self.unit_of_measurement_matcher.add('UNIT_OF_MEASUREMENT', None, [{ 'ENT_TYPE': 
'mass_unit' }, { 'ORTH': '/' }, { 'ENT_TYPE': 'volume_unit' }], [{ 'ENT_TYPE': 'volume_unit' }, { 'ORTH': '/' }, { 'ENT_TYPE': 'time_unit' }], [{ 'ENT_TYPE': 'form_unit' }, { 'ORTH': '/' }, { 'ENT_TYPE': 'volume_unit' }]) self.measurement_matcher.add('MEASUREMENT', None, [{ 'LIKE_NUM': True }, { 'ORTH': '%' }], [{ 'LIKE_NUM': True }, { 'ENT_TYPE': 'measurement_unit' }], [{ 'LIKE_NUM': True }, { 'ENT_TYPE': 'mass_unit' }], [{ 'LIKE_NUM': True }, { 'ENT_TYPE': 'volume_unit' }], [{ 'LIKE_NUM': True }, { 'ENT_TYPE': 'form_unit' }], [{ 'LIKE_NUM': True }, { 'LOWER': 'x' }, { 'ENT_TYPE': 'form_unit' }]) self.duration_matcher.add('DURATION', None, [{ 'POS': 'PREP' }, { 'LIKE_NUM': True }, { 'ENT_TYPE': 'time_unit' }], [{ 'LIKE_NUM': True }, { 'ENT_TYPE': 'time_unit' }], [{ 'LOWER': 'in' }, { 'LIKE_NUM': True }, { 'ENT_TYPE': 'time_unit' }], [{ 'LOWER': 'prn' }])
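# --- Hedged usage sketch (not part of the original) ---------------------------
# Runs a couple of the simpler matchers built above. The class name is not
# shown in this snippet, so DrugDosageMatcher is a hypothetical stand-in; the
# compound matchers that key on ENT_TYPE would additionally need the custom
# entity labels to have been applied to the doc first.
import spacy

nlp = spacy.load("en_core_web_sm")
component = DrugDosageMatcher(nlp)  # hypothetical name for the class above
doc = nlp("Metoprolol 50 mg po twice a day for 2 weeks")
print([doc[s:e].text for _, s, e in component.mass_matcher(doc)])       # ['mg']
print([doc[s:e].text for _, s, e in component.route_matcher(doc)])      # ['po']
print([doc[s:e].text for _, s, e in component.frequency_matcher(doc)])  # ['twice a day']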
def custom_tokenizer_to_df(nlp, doc): # Initialize the Matcher with a vocab matcher = Matcher(nlp.vocab) ############################################################### # Add pattern for valid hashtag, i.e. '#' plus any ASCII token matcher.add("HASHTAG", None, [{"ORTH": "#"}, {"IS_ALPHA": True}]) # Register token extension for hashtag Token.set_extension("is_hashtag", default=False, force=True) # Fit in text in matcher matches = matcher(doc) # Find hashtag and merge, assign hashtag label hashtags = [] for match_id, start, end in matches: if doc.vocab.strings[match_id] == "HASHTAG": hashtags.append(doc[start:end]) with doc.retokenize() as retokenizer: for span in hashtags: retokenizer.merge(span) for token in span: token._.is_hashtag = True ############################################################## ############################################################## # Find number and merge, assign number label # Add pattern for valid hashtag, i.e. '#' plus any ASCII token matcher.add("LONG_NUMBER", None, [{ "IS_DIGIT": True }, { "ORTH": ',' }, { "IS_DIGIT": True }]) matcher.add("LONG_NUMBER", None, [{ "IS_DIGIT": True }, { "ORTH": '.' }, { "IS_DIGIT": True }]) # Register token extension for hashtag Token.set_extension("is_long_number", default=False, force=True) # Fit in text in matcher matches = matcher(doc) long_number = [] for match_id, start, end in matches: if doc.vocab.strings[match_id] == "LONG_NUMBER": long_number.append(doc[start:end]) with doc.retokenize() as retokenizer: for span in long_number: retokenizer.merge(span) for token in span: token._.is_long_number = True ############################################################## for i, token in enumerate(doc): if token._.is_hashtag: token.tag_ = 'Hashtag' if token.like_url: token.tag_ = 'URL' if token.like_email: token.tag_ = 'Email' if token.is_stop: token.tag_ = 'Stop Word' if token.like_num: token.tag_ = 'Number' if token._.is_long_number: token.tag_ = 'Number' if token.is_punct: token.tag_ = 'Punctuation' # Write the tokens to data frame df = pd.DataFrame() df['Token'] = [token.text for token in doc] df['POS'] = [token.pos_ for token in doc] df['NE'] = [token.ent_iob_ for token in doc] df['Lemma'] = [token.lemma_ for token in doc] df['Tag'] = [token.tag_ for token in doc] df['Language'] = np.nan df['Candidate'] = True df['Anglicism'] = np.nan return df
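# --- Hedged usage sketch (not part of the original) ---------------------------
# Feeds a small illustrative doc through custom_tokenizer_to_df(); assumes a
# loaded spaCy 2.x pipeline plus the pandas/numpy imports (pd, np) the
# function already relies on.
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Check out #MachineLearning, it raised 1,000 dollars via https://example.com")
df = custom_tokenizer_to_df(nlp, doc)
print(df[["Token", "Tag", "POS"]].head(10))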
        ])
    infix_re = compile_infix_regex(infixes)
    return Tokenizer(nlp.vocab,
                     prefix_search=nlp.tokenizer.prefix_search,
                     suffix_search=nlp.tokenizer.suffix_search,
                     infix_finditer=infix_re.finditer,
                     token_match=nlp.tokenizer.token_match,
                     rules=nlp.Defaults.tokenizer_exceptions)


nlp = spacy.load("en_core_web_sm")
nlp.tokenizer = custom_tokenizer(nlp)

MATCHER = Matcher(nlp.vocab)
RELATION_PATTERN = [{'DEP': 'ROOT'},
                    {'DEP': 'prep', 'OP': "?"},
                    {'DEP': 'agent', 'OP': "?"},
                    {'POS': 'ADJ', 'OP': "?"}]


def extract_noun_chunks(doc, dep_tag):
def __init__(self): self.count = { "0": 0, "1": 0, "2": 0, "3": 0, "4": 0, "5": 0, "6": 0, "7": 0, "8": 0, "9": 0, "10": 0 } self.compa_sent_count = 0 self.nlp = spacy.load("en") self.matcher = Matcher(self.nlp.vocab) # self.matcher.add(6, # None, # [{'ORTH': 'JJR'}, {'ORTH': 'CIN'}, {'ORTH': 'TECH'}], # [{'ORTH': 'JJR'}, {}, {'ORTH': 'CIN'}, {'ORTH': 'TECH'}], # [{'ORTH': 'JJR'}, {'ORTH': 'CIN'}, {}, {'ORTH': 'TECH'}], # [{'ORTH': 'JJR'}, {}, {'ORTH': 'CIN'}, {}, {'ORTH': 'TECH'}]) # self.matcher.add(7, # None, # [{'ORTH': 'RB'}, {'ORTH': 'JJ'}, {'ORTH': 'CIN'}, {'ORTH': 'TECH'}], # [{'ORTH': 'RB'}, {'ORTH': 'JJ'}, {}, {'ORTH': 'CIN'}, {'ORTH': 'TECH'}]) # self.matcher.add(8, # None, # [{'ORTH': 'RBR'}, {'ORTH': 'JJ'}, {'ORTH': 'CIN'}, {'ORTH': 'TECH'}], # [{'ORTH': 'RBR'}, {'ORTH': 'JJ'}, {}, {'ORTH': 'CIN'}, {'ORTH': 'TECH'}]) # # # self.matcher.add(4, # None, # [{'ORTH': 'NN'}, {'ORTH': 'IN'}, {'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {}, {'ORTH': 'RB'}], # [{'ORTH': 'NN'}, {'ORTH': 'IN'}, {'ORTH': 'TECH'}, {}, {'ORTH': 'VBZ'}], # [{'ORTH': 'NN'}, {'ORTH': 'IN'}, {'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {'ORTH': 'RB'}], # [{'ORTH': 'NN'}, {'ORTH': 'IN'}, {}, {'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {'ORTH': 'RB'}], # [{'ORTH': 'NN'}, {'ORTH': 'IN'}, {}, {'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {}, {'ORTH': 'RB'}], # # # ) # self.matcher.add( 5, None, [{ 'ORTH': 'TECH' }, { 'ORTH': 'VBP' }, { 'ORTH': 'NN' }], [{ 'ORTH': 'TECH' }, {}, { 'ORTH': 'VBP' }, { 'ORTH': 'NN' }], [{ 'ORTH': 'TECH' }, { 'ORTH': 'VBP' }, {}, { 'ORTH': 'NN' }], [{ 'ORTH': 'TECH' }, {}, { 'ORTH': 'VBP' }, {}, { 'ORTH': 'NN' }], [{ 'ORTH': 'TECH' }, { 'ORTH': 'VBZ' }, { 'ORTH': 'NN' }], [{ 'ORTH': 'TECH' }, {}, { 'ORTH': 'VBZ' }, { 'ORTH': 'NN' }], [{ 'ORTH': 'TECH' }, { 'ORTH': 'VBZ' }, {}, { 'ORTH': 'NN' }], [{ 'ORTH': 'TECH' }, {}, { 'ORTH': 'VBZ' }, {}, { 'ORTH': 'NN' }], ) self.matcher.add( 1, None, [{ 'ORTH': 'TECH' }, { 'ORTH': 'VBZ' }, { 'ORTH': 'JJ' }], [{ 'ORTH': 'TECH' }, {}, { 'ORTH': 'VBZ' }, { 'ORTH': 'JJ' }], [{ 'ORTH': 'TECH' }, { 'ORTH': 'VBZ' }, {}, { 'ORTH': 'JJ' }], [{ 'ORTH': 'TECH' }, { 'ORTH': 'VBZ' }, {}, { 'ORTH': 'JJ' }], # [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {}, {}, {}, {'ORTH': 'JJ'}], [{ 'ORTH': 'TECH' }, {}, { 'ORTH': 'VBZ' }, {}, { 'ORTH': 'JJ' }], [{ 'ORTH': 'TECH' }, { 'ORTH': 'VBD' }, { 'ORTH': 'JJ' }], [{ 'ORTH': 'TECH' }, {}, { 'ORTH': 'VBD' }, { 'ORTH': 'JJ' }], [{ 'ORTH': 'TECH' }, { 'ORTH': 'VBD' }, {}, { 'ORTH': 'JJ' }], [{ 'ORTH': 'TECH' }, { 'ORTH': 'VBD' }, {}, { 'ORTH': 'JJ' }], # [{'ORTH': 'TECH'}, {'ORTH': 'VBD'}, {}, {}, {}, {'ORTH': 'JJ'}], [{ 'ORTH': 'TECH' }, {}, { 'ORTH': 'VBD' }, {}, { 'ORTH': 'JJ' }]) self.matcher.add( 3, None, [{ 'ORTH': 'TECH' }, { 'ORTH': 'VBZ' }, { 'ORTH': 'RB' }], [{ 'ORTH': 'TECH' }, {}, { 'ORTH': 'VBZ' }, { 'ORTH': 'RB' }], [{ 'ORTH': 'TECH' }, { 'ORTH': 'VBZ' }, {}, { 'ORTH': 'RB' }], [{ 'ORTH': 'TECH' }, { 'ORTH': 'VBZ' }, {}, { 'ORTH': 'RB' }], # [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {}, {}, {}, {'ORTH': 'RB'}], [{ 'ORTH': 'TECH' }, {}, { 'ORTH': 'VBZ' }, {}, { 'ORTH': 'RB' }], [{ 'ORTH': 'TECH' }, { 'ORTH': 'VBD' }, { 'ORTH': 'RB' }], [{ 'ORTH': 'TECH' }, {}, { 'ORTH': 'VBD' }, { 'ORTH': 'RB' }], [{ 'ORTH': 'TECH' }, { 'ORTH': 'VBD' }, {}, { 'ORTH': 'RB' }], [{ 'ORTH': 'TECH' }, { 'ORTH': 'VBD' }, {}, { 'ORTH': 'RB' }], # [{'ORTH': 'TECH'}, {'ORTH': 'VBD'}, {}, {}, {}, {'ORTH': 'RB'}], [{ 'ORTH': 'TECH' }, {}, { 'ORTH': 'VBD' }, {}, { 'ORTH': 'RB' }])
def feature_extraction(df, ft_model, nlp): # Extracting all the single nouns in the corpus all_nouns = [] for review in df['spacyObj']: for token in review: if token.pos_ == "NOUN": all_nouns.append(token.text) all_nouns = pd.Series(all_nouns) # Finding unique nouns along with their counts sorted in descending order unique_nouns = all_nouns.value_counts() noun_phrases = [] # Pattern to match i.e. two nouns occuring together patterns = [[{'TAG': 'NN'}, {'TAG': 'NN'}]] matcher = Matcher(nlp.vocab) matcher.add('NounPhrasees', patterns) for review in df['spacyObj']: matches = matcher(review) for match_id, start, end in matches: noun_phrases.append(review[start:end].text) noun_phrases = pd.Series(noun_phrases) unique_noun_phrases = noun_phrases.value_counts() # Remove nouns with single or double character for noun in unique_nouns.index: # if noun length is less than 3 or if nouns contain any numbers, it is considered invalid if len(noun) < 3 or re.match(r".*[0-9].*", noun) is not None: del unique_nouns[noun] # Extracting Top Features top2 = len(unique_nouns) * 0.05 # considering top 5% of features top2 = int(top2) top_features = unique_nouns[0:top2] # this will contain all the final features features_bucket = OrderedDict() top_features_list = list(top_features.keys()) top_features_set = set(top_features.keys()) unique_noun_phrases_set = set(unique_noun_phrases.keys()) # Applying assocation rule mining to group nouns occuring together for feature1 in top_features_list: for feature2 in top_features_list: feature_phrase = feature1 + ' ' + feature2 if feature1 in top_features_set and feature2 in top_features_set and feature_phrase in unique_noun_phrases_set: # If the condition is true, we have identified a noun phrase which is a combination of two nouns # in the top_features. So one of the nouns cn be eliminated from top features. # Ex. if "battery life" is found, then "life" can be eliminated from top features as it is not a feature # by itself. It is just part of the feature "battery life" # Now we need to find out if frequency of the lesser occuring noun (in our ex., the word "life") matches # with the frequency of the noun phrase (in our ex., "battery life") by a certain confidence. # If it does so, then we can be sure that the lesser occuring noun occurs only in that particular noun_phrase # i.e in our ex "life" occurs primaryly in the phrase "battery life" lesser_occurring_noun = "" often_occurring_noun = "" if unique_nouns[feature1] < unique_nouns[feature2]: lesser_occurring_noun = feature1 often_occurring_noun = feature2 else: lesser_occurring_noun = feature2 often_occurring_noun = feature1 # assuming confidence interval of 40% # i.e. accordnig to 'battery life' example, out of total times that 'life' is seen, 'battery' is seen next to it 40% of the time. if unique_noun_phrases[feature_phrase] / unique_nouns[ lesser_occurring_noun] > 0.4: try: if often_occurring_noun not in features_bucket: features_bucket[often_occurring_noun] = [] features_bucket[often_occurring_noun].append( lesser_occurring_noun) top_features_set.remove(lesser_occurring_noun) # print(lesser_occurring_noun) except BaseException as error: print(error) continue main_features = list(features_bucket.keys()) top_features_to_add = set(top_features_list[:20]) # here we are manually adding adding 20 top nouns as features which were previously not # added by the assocation rule mining step above. # But before adding, we are checking if any similar nouns exist among the 20 nouns. # Ex. 
If 'display' and 'screen' occur in the top 20, we must add only the most commonly occuring # one among the two and remove the other. # Here we are only eliminating the nouns that are similar to existing ones in features_bucket. for feature1 in top_features_list[:20]: for feature2 in main_features: if feature1 not in features_bucket and feature1 in top_features_set: similarity = cosine_similarity( ft_model.get_word_vector(feature1).reshape(1, -1), ft_model.get_word_vector(feature2).reshape(1, -1)) if similarity[0][0] > 0.64: top_features_to_add.discard(feature1) else: top_features_to_add.discard(feature1) top_features_to_add_list = list(top_features_to_add) # Here we are eliminating nouns that are similar to one another in the top_features_to_add for feature1 in top_features_to_add_list: for feature2 in top_features_to_add_list: if feature1 in top_features_to_add and feature2 in top_features_to_add: similarity = cosine_similarity( ft_model.get_word_vector(feature1).reshape(1, -1), ft_model.get_word_vector(feature2).reshape(1, -1)) if similarity[0][0] < 0.99 and similarity[0][0] > 0.64: feature_to_remove = min( (unique_nouns[feature1], feature1), (unique_nouns[feature2], feature2))[1] top_features_to_add.remove(feature_to_remove) for feature in top_features_to_add: features_bucket[feature] = [] for main_noun in features_bucket.keys(): top_features_set.remove(main_noun) # Here we are going through the top 5% of the nouns that we originally considering and checking # if any of them are similar to the ones already present in features_bucket. top_features_copy = list(top_features_set) main_features = features_bucket.keys() for feature2 in top_features_copy: best_similarity = 0 most_matching_main_feature = "" for feature1 in main_features: if feature2 in top_features_set: similarity = cosine_similarity( ft_model.get_word_vector(feature1).reshape(1, -1), ft_model.get_word_vector(feature2).reshape(1, -1)) if similarity[0][0] <= 0.99 and similarity[0][0] > 0.62: if similarity[0][0] > best_similarity: best_similarity = similarity[0][0] most_matching_main_feature = feature1 if best_similarity != 0 and most_matching_main_feature != "": features_bucket[most_matching_main_feature].append(feature2) top_features_set.remove(feature2) # We finally sort the features in descending order based on how often they occur. final_features = list(features_bucket.items()) final_features_with_counts = [] for feature in final_features: count = unique_nouns[feature[0]] final_features_with_counts.append((feature, count)) final_features_with_counts.sort(key=lambda x: x[1], reverse=True) final_features = OrderedDict() for feature, count in final_features_with_counts: final_features[feature[0]] = feature[1] return final_features
    :param i: index of the current match
    :param matches: matches found in the text
    """
    match_id, start, end = matches[i]  # indices of the matched term
    span = doc[start:end]              # extract the matched term
    print('span: {} | start_ind:{:5} | end_ind:{:5} | id:{}'.format(
        span, start, end, match_id))


# set a pattern of text to collect
# find all mentions of the word "fees"
pattern = [{'LOWER': 'fees'}]  # LOWER converts words to lowercase before matching

# instantiate the matcher
matcher = Matcher(nlp.vocab)

# add the pattern to the matcher (one matcher can look for many unique patterns)
# provide a pattern name, a function to apply to matches, and the pattern to identify
matcher.add('fee', collect_sents, pattern)

# pass the doc to the matcher to run the collect_sents callback
matcher(doc)


# change the callback to print the sentence that contains the matched term (span)
def collect_sents(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    span = doc[start:end]
    # span.sent provides the sentence that contains the span
    print('SENTENCE: {}'.format(span.sent))
def task2(sentence,timestamp): # loading spacy model nlp = spacy.load("en_core_web_sm") import en_core_web_sm nlp = en_core_web_sm.load() print(sentence) if (isAlredyPresent(sentence) == False): processedTweets.append(sentence) call(["aplay", "Air.wav"]) doc = nlp(sentence) # print(sutime.SUTime(sentence)) # print([(X.text, X.label_) for X in doc.ents]) # Tokenization tokens = [] tokens = nltk.word_tokenize(sentence); #print("Tokens: ", tokens) # tweetFile = open("stanford-ner-2018-10-16/tweet.txt", 'w') nlp = spacy.load("en_core_web_sm") # Matcher class object matcher = Matcher(nlp.vocab) matcher.add("matching", None, [{'POS': 'PROPN'}, {'LOWER': {'IN': ['ave', 'avenue', 'st', 'street', 'rd', 'road', 'dr', 'drive', 'pkwy', 'parkway', 'bend', 'bnd', 'boulevard', 'blvd', 'court', 'ct', 'expressway', 'expy', 'freeway', 'fwy', 'highway', 'hwy', 'junction', 'jct', 'lane', 'ln', 'loop', 'motorway', 'mtwy', 'parkway', 'pkwy', 'point', 'pt', 'ramp', 'turnpike', 'tpke', 'tunnel', 'tunl', 'underpass']}}]) matches = matcher(doc) span = "" for match_id, start, end in matches: span = doc[start:end] # print(span) st = StanfordNERTagger('stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz', "stanford-ner-2018-10-16/stanford-ner.jar", encoding='utf-8') classifiedText = st.tag(tokens) location = "" #print(classifiedText) i = 0 locationMatches = [] for eachOut in classifiedText: if "LOCATION" in eachOut[1]: locationMatches.append(eachOut[0]) # print(locationMatches) span = str(span) #print(span) # Lemmatization without POS tags lems = [] lemmatizer = WordNetLemmatizer() pos_sen = nltk.pos_tag(tokens); #print("\n POS Tags: \n", pos_sen); pos_wn = [(s[0], penn_to_wn(s[1])) for s in pos_sen] # print("\n POS Tags for wordnet: \n", pos_wn) lems_pos = [] for w in pos_wn: if (w[1]): lems_pos.append(lemmatizer.lemmatize(w[0], pos=w[1])) else: lems_pos.append(lemmatizer.lemmatize(w[0])) # print("\n Lemmatization by taking into account the pos tags: \n") # print(lems_pos) if("on" in tokens): try: x = tokens.index("on") x+=1 while pos_sen[x][1]=="NNP": if pos_sen[x][0] not in locationMatches: locationMatches.append(pos_sen[x][0]) x+=1 if(pos_sen[x][1]=="CD" and pos_sen[x+1][1]=="NNP" and pos_sen[x+1][0]!="AM" and pos_sen[x+1][0]!="am" and pos_sen[x+1][0]!="pm" and pos_sen[x+1][0]!="PM" ): if pos_sen[x][0] not in locationMatches: locationMatches.append(pos_sen[x][0]) if pos_sen[x+1][0] not in locationMatches: locationMatches.append(pos_sen[x+1][0]) x+=1 x+=1 while pos_sen[x][1] == "NNP": if pos_sen[x][0] not in locationMatches: locationMatches.append(pos_sen[x][0]) x += 1 except: pass if ("at" in tokens): try: x = tokens.index("at") x += 1 while pos_sen[x][1] == "NNP": if pos_sen[x][0] not in locationMatches: locationMatches.append(pos_sen[x][0]) x+=1 if (pos_sen[x][1] == "CD" and pos_sen[x + 1][1] == "NNP" and pos_sen[x+1][0]!="AM" and pos_sen[x+1][0]!="am" and pos_sen[x+1][0]!="pm" and pos_sen[x+1][0]!="PM" ): if pos_sen[x][0] not in locationMatches: locationMatches.append(pos_sen[x][0]) if pos_sen[x + 1][0] not in locationMatches: locationMatches.append(pos_sen[x + 1][0]) x += 1 x += 1 while pos_sen[x][1] == "NNP": if pos_sen[x][0] not in locationMatches: locationMatches.append(pos_sen[x][0]) x += 1 except: pass if ("AT" in tokens): try: x = tokens.index("AT") x += 1 while pos_sen[x][1] == "NNP": if pos_sen[x][0] not in locationMatches: locationMatches.append(pos_sen[x][0]) x+=1 if (pos_sen[x][1] == "CD" and pos_sen[x + 1][1] == "NNP" and pos_sen[x+1][0]!="AM" and pos_sen[x+1][0]!="am" and 
pos_sen[x+1][0]!="pm" and pos_sen[x+1][0]!="PM" ): if pos_sen[x][0] not in locationMatches: locationMatches.append(pos_sen[x][0]) if pos_sen[x + 1][0] not in locationMatches: locationMatches.append(pos_sen[x + 1][0]) x += 1 x += 1 while pos_sen[x][1] == "NNP": if pos_sen[x][0] not in locationMatches: locationMatches.append(pos_sen[x][0]) x += 1 except: pass #print(locationMatches) removal=[] if (len(locationMatches) > 0 and len(span) > 0): for eachMatch in locationMatches: #print(len(locationMatches)) try: #print(span.find(eachMatch)) if span.find(eachMatch) != -1: removal.append(eachMatch) except: print("Exception Distinct") for removeItem in removal: locationMatches.remove(removeItem) location= (span + " " + " ".join(locationMatches)).strip() #Extracting Time using Regular Expression: re6 = r"(24:00|2[0-3]:[0-5][0-9]|[0-1][0-9]:[0-5][0-9]:[0-5][0-9])([\s]*[AaPp][Mm])" re2 = r"(24:00|2[0-3]:[0-5][0-9]|[0-1][0-9]:[0-5][0-9]:[0-5][0-9])" re3 = r"24:00|2[0-3]:[0-5][0-9]|[0-1][0-9]:[0-5][0-9]([\s]*[AaPp][Mm])" re4 = r"24:00|2[0-3]:[0-5][0-9]|[0-1][0-9]:[0-5][0-9]" re5 = r"([0-9][0-9]?:[0-5][0-9]|[0-1][0-9]:[0-5][0-9])([\s]*[AaPp]*[Mm]*)" re1 = r"([0-9][0-9]*:[0-5][0-9]:[0-5][0-9])([\s]*[AaPp]*[Mm]*)" re7 = r"([0-9][0-9]*:[0-5][0-9])" try: time=(re.compile("(%s|%s|%s|%s|%s|%s|%s)" % (re1, re2, re3, re4, re5, re6, re7)).findall(sentence))[0][0] time=str(time) if(len(time.strip())>0): print("Time: "+str(time)) timestamp=time except BaseException as e: print("Time : "+timestamp) severity= severity_classifier.severity_finder(sentence) severityStr="" for eachKeyword in severity: severityStr+=str(eachKeyword)+" " print("Severity: "+severityStr) if (len(location) > 0): print("Location: " + location) e2 = {"predictedClassLabel": "Accidental", "tweet": sentence, "timestamp": timestamp, "location":location,"severity":severityStr} else: e2 = {"predictedClassLabel": "Accidental", "tweet": sentence, "timestamp": timestamp,"severity":severityStr} res2 = es.index(index=indexName2, doc_type=typeName2, body=e2)
def test_matcher_no_zero_length(en_vocab):
    doc = Doc(en_vocab, words=["a", "b"], tags=["A", "B"])
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"TAG": "C", "OP": "?"}]])
    assert len(matcher(doc)) == 0
def ground_mentioned_concepts(nlp, matcher, s, ans=None): s = s.lower() doc = nlp(s) matches = matcher(doc) mentioned_concepts = set() span_to_concepts = {} # print('ground step 0') if ans is not None: ans_matcher = Matcher(nlp.vocab) ans_words = nlp(ans) # print(ans_words) ans_matcher.add(ans, None, [{ 'TEXT': token.text.lower() } for token in ans_words]) ans_match = ans_matcher(doc) ans_mentions = set() for _, ans_start, ans_end in ans_match: ans_mentions.add((ans_start, ans_end)) # print('ground step 1') for match_id, start, end in matches: if ans is not None: if (start, end) in ans_mentions: continue span = doc[start:end].text # the matched span # a word that appears in answer is not considered as a mention in the question # if len(set(span.split(" ")).intersection(set(ans.split(" ")))) > 0: # continue original_concept = nlp.vocab.strings[match_id] original_concept_set = set() original_concept_set.add(original_concept) # print("span", span) # print("concept", original_concept) # print("Matched '" + span + "' to the rule '" + string_id) # why do you lemmatize a mention whose len == 1? if len(original_concept.split("_")) == 1: # tag = doc[start].tag_ # if tag in ['VBN', 'VBG']: original_concept_set.update( lemmatize(nlp, nlp.vocab.strings[match_id])) if span not in span_to_concepts: span_to_concepts[span] = set() span_to_concepts[span].update(original_concept_set) # print('ground step 2') for span, concepts in span_to_concepts.items(): concepts_sorted = list(concepts) # print("span:") # print(span) # print("concept_sorted:") # print(concepts_sorted) concepts_sorted.sort(key=len) # mentioned_concepts.update(concepts_sorted[0:2]) shortest = concepts_sorted[0:3] for c in shortest: if c in blacklist: continue # a set with one string like: set("like_apples") lcs = lemmatize(nlp, c) intersect = lcs.intersection(shortest) if len(intersect) > 0: mentioned_concepts.add(list(intersect)[0]) else: mentioned_concepts.add(c) # if a mention exactly matches with a concept exact_match = set([ concept for concept in concepts_sorted if concept.replace("_", " ").lower() == span.lower() ]) # print("exact match:") # print(exact_match) # print('assert len exact match') assert len(exact_match) < 2 mentioned_concepts.update(exact_match) return mentioned_concepts
def test_invalid_greediness(doc, text):
    matcher = Matcher(doc.vocab)
    with pytest.raises(ValueError):
        matcher.add("RULE", [pattern1], greedy="GREEDY")
def load_date_matcher(nlp): # Create matcher object with list of rules and return matcher = Matcher(nlp.vocab) # Add to vocab add_to_vocab(nlp, months_dict.keys()) add_to_vocab(nlp, ordinals) add_to_vocab(nlp, date_delimiters) # Create flag for MONTH is_month = FLAG62 target_ids = {nlp.vocab.strings[s.lower()] for s in months_dict.keys()} for lexeme in nlp.vocab: if lexeme.lower in target_ids: lexeme.set_flag(is_month, True) # Create flag for ORDINALS is_ordinal = FLAG61 target_ids = {nlp.vocab.strings[s.lower()] for s in ordinals} for lexeme in nlp.vocab: if lexeme.lower in target_ids: lexeme.set_flag(is_ordinal, True) # Create flag for DATE_DELIMITER is_date_delimiter = FLAG60 target_ids = {nlp.vocab.strings[s.lower()] for s in date_delimiters} for lexeme in nlp.vocab: if lexeme.lower in target_ids: lexeme.set_flag(is_date_delimiter, True) # print('December', nlp.vocab.__contains__('December')) # print('Diciembre', nlp.vocab.__contains__('diciembre')) # print('December', nlp.vocab['december'].check_flag(is_month)) # print('Diciembre', nlp.vocab['diciembre'].check_flag(is_month)) # Add rules # March 25, 2017 # March 25th, 2017 # March 25th 2017 # March 25 2017 matcher.add_pattern('DATE', [{ is_month: True }, { IS_DIGIT: True, LENGTH: 1 }, { is_ordinal: True, 'OP': '?' }, { ORTH: ',', 'OP': '?' }, { IS_DIGIT: True, LENGTH: 4 }], label=1) matcher.add_pattern('DATE', [{ is_month: True }, { IS_DIGIT: True, LENGTH: 2 }, { is_ordinal: True, 'OP': '?' }, { ORTH: ',', 'OP': '?' }, { IS_DIGIT: True, LENGTH: 4 }], label=2) # 25 March, 2017 # 25th March, 2017 # 25th March 2017 # 25 March 2017 matcher.add_pattern('DATE', [{ IS_DIGIT: True, LENGTH: 1 }, { is_date_delimiter: True, 'OP': '?' }, { is_month: True }, { is_ordinal: True, 'OP': '?' }, { ORTH: ',', 'OP': '?' }, { IS_DIGIT: True, LENGTH: 4 }], label=3) matcher.add_pattern('DATE', [{ IS_DIGIT: True, LENGTH: 2 }, { is_date_delimiter: True, 'OP': '?' }, { is_month: True }, { is_ordinal: True, 'OP': '?' }, { ORTH: ',', 'OP': '?' 
}, { IS_DIGIT: True, LENGTH: 4 }], label=4) # 25/05/2016 matcher.add_pattern('DATE', [{ IS_DIGIT: True, LENGTH: 1 }, { is_date_delimiter: True, 'OP': '+' }, { is_month: True }, { is_date_delimiter: True, 'OP': '+' }, { IS_DIGIT: True, LENGTH: 4 }], label=5) matcher.add_pattern('DATE', [{ IS_DIGIT: True, LENGTH: 2 }, { is_date_delimiter: True, 'OP': '+' }, { is_month: True }, { is_date_delimiter: True, 'OP': '+' }, { IS_DIGIT: True, LENGTH: 4 }], label=6) # 05/25/2016 matcher.add_pattern('DATE', [{ is_month: True }, { is_date_delimiter: True, 'OP': '+' }, { IS_DIGIT: True, LENGTH: 1 }, { is_date_delimiter: True, 'OP': '+' }, { IS_DIGIT: True, LENGTH: 4 }], label=7) matcher.add_pattern('DATE', [{ is_month: True }, { is_date_delimiter: True, 'OP': '+' }, { IS_DIGIT: True, LENGTH: 2 }, { is_date_delimiter: True, 'OP': '+' }, { IS_DIGIT: True, LENGTH: 4 }], label=8) # Diciembre, 2009 # December 2009 matcher.add_pattern('DATE', [{ is_month: True, IS_DIGIT: False }, { ORTH: ',' }, { IS_DIGIT: True, LENGTH: 4 }], label=9) matcher.add_pattern('DATE', [{ is_month: True, IS_DIGIT: False }, { IS_DIGIT: True, LENGTH: 4 }], label=9) # 2013-12-04 matcher.add_pattern('DATE', [{ IS_DIGIT: True, LENGTH: 4 }, { is_date_delimiter: True, 'OP': '+' }, { is_month: True }, { is_date_delimiter: True, 'OP': '+' }, { IS_DIGIT: True, LENGTH: 2 }], label=10) matcher.add_pattern('DATE', [{ IS_DIGIT: True, LENGTH: 4 }, { is_date_delimiter: True, 'OP': '+' }, { is_month: True }, { is_date_delimiter: True, 'OP': '+' }, { IS_DIGIT: True, LENGTH: 1 }], label=11) # 9 days ago matcher.add_pattern('DATE', [{ IS_DIGIT: True }, { POS: 'NOUN' }, { LOWER: 'ago' }], label=12) # 1 Jul # 1. Jul matcher.add_pattern('DATE', [{ IS_DIGIT: True, LENGTH: 2 }, { is_ordinal: True }, { is_date_delimiter: True }, { is_month: True, IS_DIGIT: False }], label=13) matcher.add_pattern('DATE', [{ IS_DIGIT: True, LENGTH: 2 }, { is_ordinal: True }, { is_month: True, IS_DIGIT: False }], label=13) matcher.add_pattern('DATE', [{ IS_DIGIT: True, LENGTH: 2 }, { is_date_delimiter: True }, { is_month: True, IS_DIGIT: False }], label=13) matcher.add_pattern('DATE', [{ IS_DIGIT: True, LENGTH: 2 }, { is_month: True, IS_DIGIT: False }], label=13) matcher.add_pattern('DATE', [{ IS_DIGIT: True, LENGTH: 1 }, { is_ordinal: True }, { is_date_delimiter: True }, { is_month: True, IS_DIGIT: False }], label=14) matcher.add_pattern('DATE', [{ IS_DIGIT: True, LENGTH: 1 }, { is_ordinal: True }, { is_month: True, IS_DIGIT: False }], label=14) matcher.add_pattern('DATE', [{ IS_DIGIT: True, LENGTH: 1 }, { is_date_delimiter: True }, { is_month: True, IS_DIGIT: False }], label=14) matcher.add_pattern('DATE', [{ IS_DIGIT: True, LENGTH: 1 }, { is_month: True, IS_DIGIT: False }], label=14) # Jul 2nd matcher.add_pattern('DATE', [{ is_month: True, IS_DIGIT: False }, { is_date_delimiter: True }, { IS_DIGIT: True, LENGTH: 2 }, { is_ordinal: True }], label=15) matcher.add_pattern('DATE', [{ is_month: True, IS_DIGIT: False }, { is_date_delimiter: True }, { IS_DIGIT: True, LENGTH: 2 }], label=15) matcher.add_pattern('DATE', [{ is_month: True, IS_DIGIT: False }, { IS_DIGIT: True, LENGTH: 2 }, { is_ordinal: True }], label=15) matcher.add_pattern('DATE', [{ is_month: True, IS_DIGIT: False }, { IS_DIGIT: True, LENGTH: 2 }], label=15) matcher.add_pattern('DATE', [{ is_month: True, IS_DIGIT: False }, { is_date_delimiter: True }, { IS_DIGIT: True, LENGTH: 1 }, { is_ordinal: True }], label=16) matcher.add_pattern('DATE', [{ is_month: True, IS_DIGIT: False }, { is_date_delimiter: True }, { 
IS_DIGIT: True, LENGTH: 1 }], label=16) matcher.add_pattern('DATE', [{ is_month: True, IS_DIGIT: False }, { IS_DIGIT: True, LENGTH: 1 }, { is_ordinal: True }], label=16) matcher.add_pattern('DATE', [{ is_month: True, IS_DIGIT: False }, { IS_DIGIT: True, LENGTH: 1 }], label=16) return matcher
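# --- Hedged comparison sketch (not part of the original) ----------------------
# The matcher.add_pattern()/lexeme-flag API used above is from spaCy 1.x. For
# comparison, a rough dict-pattern equivalent of the first rule ("March 25,
# 2017"-style dates) in the spaCy 2/3 style; the month and ordinal lists are
# abbreviated and illustrative.
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

months = ["january", "february", "march", "april", "may", "june",
          "july", "august", "september", "october", "november", "december"]
pattern = [
    {"LOWER": {"IN": months}},                               # month name
    {"IS_DIGIT": True, "LENGTH": {"<=": 2}},                 # day, 1-2 digits
    {"LOWER": {"IN": ["st", "nd", "rd", "th"]}, "OP": "?"},  # optional ordinal suffix
    {"ORTH": ",", "OP": "?"},                                # optional comma
    {"IS_DIGIT": True, "LENGTH": 4},                         # 4-digit year
]
matcher.add("DATE", [pattern])

doc = nlp("The conference took place on March 25, 2017 in Oslo.")
print([doc[s:e].text for _, s, e in matcher(doc)])  # expect ['March 25, 2017']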
def make_matcher(vocab, max_length):
    abstract_patterns = []
    for length in range(1, max_length + 1):
        abstract_patterns.append([{tag: True} for tag in get_bilou(length)])
    return Matcher(vocab, {'Candidate': ('CAND', {}, abstract_patterns)})
def test_issue588(en_vocab):
    matcher = Matcher(en_vocab)
    with pytest.raises(ValueError):
        matcher.add("TEST", [[]])
edu = {}
# Extract education degree
for idx, text in enumerate(nlp_text):
    text_unigrams = text.split()
    text_bigrams = [tup[0] + tup[1] for tup in list(ngrams(text_unigrams, 2))]
    all_grams = text_unigrams + text_bigrams
    for tok in all_grams:
        # Replace special symbols and lowercase
        re_tok = re.sub(SYMBOLS_ext, '', tok.lower().strip())
        print(re_tok)
        if re_tok in EDUCATION and re_tok not in STOPWORDS:
            edu[tok] = text + nlp_text[idx + 1]

matcher = Matcher(cv_obj.nlp.vocab)
nlp_text = cv_obj.doc

# First name and last name are always proper nouns
pattern = [{'POS': 'PROPN'}, {'POS': 'PROPN'}]
matcher.add('NAME', None, pattern)

matches = matcher(nlp_text)
for match_id, start, end in matches:
    span = nlp_text[start:end]
    print(span.text)
# test
def identify_GROSS_TONNAGE_in_text(text): nlp = English() doc = nlp(text) matcher = Matcher(nlp.vocab) # # START - spaCy patterns # # GROSS_TONNAGE patterns = [[{ "LOWER": { "IN": ["bruttotonnasje"] } }, { "LOWER": { "IN": ["opp", "ned"] } }, { "LOWER": { "IN": ["til"] } }, { 'IS_DIGIT': True }], [{ "LOWER": { "IN": ["bruttotonnasje"] } }, { "LOWER": { "IN": ["under", "over"] } }, { 'IS_DIGIT': True }], [{ "LOWER": { "IN": ["bruttotonnasje"] } }, { 'IS_DIGIT': True }, { "LOWER": { "IN": ["eller"] } }, { "LOWER": { "IN": ["mer", "mindre"] } }]] matcher.add("GROSS_TONNAGE", patterns) # # END - spaCy patterns # result = [] for match_id, token_start, token_end in matcher(doc): match_id_as_string = nlp.vocab.strings[match_id] final_token_start = token_start final_token_end = token_end spacy_pattern_detection = doc[token_start:token_end] spacy_pattern_detection_as_lower_text = spacy_pattern_detection.text.lower( ) # # convert token_span to char_span. # char_span is needed to display correctly withdisplacy.render(). # span = doc[final_token_start:final_token_end] span_char_start = span[0].idx span_char_end = span[-1].idx + len(span[-1].text) # return result identified_entity = { 'start': span_char_start, 'end': span_char_end, 'label': match_id_as_string } result.append(identified_entity) return result
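# --- Hedged usage sketch (not part of the original) ---------------------------
# Shows how the char-span dicts returned above can be fed to displacy's manual
# entity renderer; the Norwegian sample sentence is illustrative.
from spacy import displacy

text = "Fartøy med bruttotonnasje under 500 skal ha eget sertifikat."
ents = identify_GROSS_TONNAGE_in_text(text)
print(ents)  # e.g. [{'start': 11, 'end': 35, 'label': 'GROSS_TONNAGE'}]
html = displacy.render({"text": text, "ents": ents}, style="ent", manual=True)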
def coronaAnalysis(sha, abstract, count, textcount): #doc = nlp(text) textcount = 0 cleantext = [ t.text for t in abstract if not t.is_stop and t.ent_type_ != 'GPE' ] # remove stop words. Exclude Geographic location # convert list to nlp doc cleandoc = Doc(nlp.vocab, words=cleantext) matcher = Matcher(nlp.vocab) #print("Search for ", pattern22) #matcher.add("medicalcare", None, pattern2, pattern3, pattern4, pattern5) #matcher.add("medicalcare", None, pattern2) #matcher.add("medicalcare", None, pattern5) #matcher.add("medicalcare", None, pattern6) matcher.add("medicalcare", None, pattern21) matches = matcher(cleandoc) #print(matches) for match_id, start, end in matches: moveleft = 0 moveright = 0 leftwords = [] rightwords = [] string_id = nlp.vocab.strings[match_id] # Get string representation span = cleandoc[start:end] # The matched span #print("Span :", span, '\n') print(start, end, span.text) #print("Len clean Doc :", len(cleandoc)) #print("Moveleft ", moveleft) #print(" Doc Lenght ", len(cleandoc)) #print(cleandoc[start-1]) while ((len(cleandoc) > start + moveleft) and (str(cleandoc[start - moveleft]) != ".")): #print("Prev Word :", cleandoc[start - moveleft]) leftwords.append(cleandoc[start - moveleft]) moveleft = moveleft + 1 #print("movement :", moveleftprint("Sum :", end + moveright)) #print("Sum Left :", start + moveleft) if len(cleandoc) == start + moveleft: break leftwords.reverse() #print("Left Words :", leftwords) #print("Moveright ", moveright) #print(" Doc Lenght ", len(cleandoc)) while ((len(cleandoc) > end + moveright) and (str(cleandoc[end + moveright]) != ".")): #print("Next Word :", cleandoc[end + moveright]) moveright = moveright + 1 #print("movement :", moveright) #print("MOVE RIGHT count :", moveright) #print("End", end) #print("Abstract Length : ", len(abstract)) #print("Clean Doc Size :", len(cleandoc)) #print("Sum :", end + moveright) if len(cleandoc) == end + moveright: break rightwords.append(cleandoc[end + moveright]) #rightwords.reverse() #print("Right Words :", rightwords) combinedList = leftwords + rightwords sentence = ' '.join(map(str, combinedList)) sentence.replace(".", "") #print("Combined Words ", combinedList, 'SHA ', sha, 'Keyword ', span.text) print("Sentence ", sentence, 'SHA ', sha, 'Keyword ', span.text) medical_care.append([sha, span.text, sentence]) #print(start, end, span.text, span.label) #print(doc) #print(cleandoc) #text_list.append([sha, cleandoc]) #word_dict[span.text] = {} # create dictionary for keyword #word_dict[span.text][cleandoc[start - 1]] = -1 textcount = +1
def __init__(self): self.count = { "0": 0, "1": 0, "2": 0, "3": 0, "4": 0, "5": 0, "6": 0, "7": 0, "8": 0, "9": 0, "10": 0 } self.compa_sent_count = 0 self.nlp = spacy.load("en_core_web_sm") self.matcher = Matcher(self.nlp.vocab) self.matcher.add(0, None, [{ 'ORTH': 'JJR' }, { 'ORTH': 'CIN' }, { 'ORTH': 'TECH' }], [{ 'ORTH': 'JJR' }, {}, { 'ORTH': 'CIN' }, { 'ORTH': 'TECH' }], [{ 'ORTH': 'JJR' }, { 'ORTH': 'CIN' }, {}, { 'ORTH': 'TECH' }], [{ 'ORTH': 'JJR' }, {}, { 'ORTH': 'CIN' }, {}, { 'ORTH': 'TECH' }]) self.matcher.add(8, None, [{ 'ORTH': 'RBR' }, { 'ORTH': 'JJ' }, { 'ORTH': 'CIN' }, { 'ORTH': 'TECH' }], [{ 'ORTH': 'RBR' }, { 'ORTH': 'JJ' }, {}, { 'ORTH': 'CIN' }, { 'ORTH': 'TECH' }]) self.matcher.add(2, None, [{ 'ORTH': 'CV' }, { 'ORTH': 'CIN' }, { 'ORTH': 'TECH' }], [{ 'ORTH': 'CV' }, {}, { 'ORTH': 'CIN' }, { 'ORTH': 'TECH' }]) self.matcher.add(3, None, [{ 'ORTH': 'CV' }, { 'ORTH': 'VBG' }, { 'ORTH': 'TECH' }]) # self.matcher.add(6, # None, # [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {'ORTH': 'JJS'}], # [{'ORTH': 'TECH'}, {}, {'ORTH': 'VBZ'}, {'ORTH': 'JJS'}], # [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {}, {'ORTH': 'JJS'}], # [{'ORTH': 'TECH'}, {}, {'ORTH': 'VBZ'}, {}, {'ORTH': 'JJS'}]) self.matcher.add(10, None, [{ 'ORTH': 'TECH' }, { 'ORTH': 'VBZ' }, { 'ORTH': 'RBR' }], [{ 'ORTH': 'TECH' }, {}, { 'ORTH': 'VBZ' }, { 'ORTH': 'RBR' }], [{ 'ORTH': 'TECH' }, { 'ORTH': 'VBZ' }, {}, { 'ORTH': 'RBR' }], [{ 'ORTH': 'TECH' }, {}, { 'ORTH': 'VBZ' }, {}, { 'ORTH': 'RBR' }]) self.matcher.add(7, None, [{ 'ORTH': 'TECH' }, { 'ORTH': 'VBZ' }, { 'ORTH': 'JJR' }], [{ 'ORTH': 'TECH' }, {}, { 'ORTH': 'VBZ' }, { 'ORTH': 'JJR' }], [{ 'ORTH': 'TECH' }, { 'ORTH': 'VBZ' }, {}, { 'ORTH': 'JJR' }], [{ 'ORTH': 'TECH' }, {}, { 'ORTH': 'VBZ' }, {}, { 'ORTH': 'JJR' }])
def test_matcher_pattern_validation(en_vocab, pattern):
    # `pattern` is supplied via pytest parametrization (decorator not shown here);
    # MatchPatternError comes from spacy.errors.
    matcher = Matcher(en_vocab, validate=True)
    with pytest.raises(MatchPatternError):
        matcher.add("TEST", [pattern])
def test_matcher_basic_check(en_vocab):
    matcher = Matcher(en_vocab)
    # Potential mistake: pass in pattern instead of list of patterns
    pattern = [{"TEXT": "hello"}, {"TEXT": "world"}]
    with pytest.raises(ValueError):
        matcher.add("TEST", pattern)
def patternSearch(T_0, file):
    # Relies on module-level names defined elsewhere: nlp (spaCy pipeline),
    # getPhrases, and the nx / np / plt imports (networkx, numpy, matplotlib).
    phrase_patterns = set()
    seed_pattern = [nlp(x) for x in T_0]
    phrase_matcher = PhraseMatcher(nlp.vocab)
    phrase_matcher.add('pattern search', None, *seed_pattern)

    # find occurrences of seed phrases
    with open(file, "r") as f:
        document = nlp(f.read().lower())
    matches = phrase_matcher(document)
    for match_id, start, end in matches:
        p = tuple((start, end))
        if p not in phrase_patterns:
            phrase_patterns.add(p)

    # find patterns around seed phrases
    unranked_patterns = []
    with open(file, "r") as f:
        text = nlp(f.read().lower())
    for phrase_pattern in phrase_patterns:
        start = phrase_pattern[0]
        end = phrase_pattern[1]
        if text[start - 1].text == '\n':
            continue
        # add context pattern: the two literal tokens preceding the phrase
        tmp = []
        for i in range(2, 0, -1):
            tmp.append({"TEXT": text[start - i].text})
        # add content pattern: the POS tags of the phrase itself
        span = text[start:end]
        for token in span:
            tmp.append({"POS": token.pos_})
        if tmp not in unranked_patterns:
            unranked_patterns.append(tmp)
            print(tmp)
    unranked_phrases = list(getPhrases(file, unranked_patterns))

    # build context graph: phrase nodes on the left, pattern nodes on the right
    context_graph = nx.Graph()
    for i in range(len(unranked_phrases)):
        context_graph.add_node('t' + str(i), pos=(0, i))
    for i in range(len(unranked_patterns)):
        context_graph.add_node('p' + str(i), pos=(2, i))
    context_matrix = np.zeros((len(unranked_phrases), len(unranked_patterns)))

    # find c(t, p): how often phrase t is extracted by pattern p
    with open(file, 'r') as f:
        t = f.read().lower()
    matcher = Matcher(nlp.vocab)
    doc = nlp(t)
    for i in range(len(unranked_patterns)):
        matcher.add("extraction", None, unranked_patterns[i])
        matches = matcher(doc)
        for match_id, start, end in matches:
            span = doc[start + 2:end].text
            j = unranked_phrases.index(span)
            context_matrix[j, i] += 1
        matcher.remove("extraction")

    # add context nodes into graph, one per non-zero (phrase, pattern) count
    c_count = 0
    for i in range(context_matrix.shape[0]):
        for j in range(context_matrix.shape[1]):
            if context_matrix[i, j] != 0:
                occur = context_matrix[i, j]
                node_t = 't' + str(i)
                node_p = 'p' + str(j)
                node_c = 'c' + str(c_count)
                c_count += 1
                context_graph.add_node(node_c, pos=(1, c_count))
                context_graph.add_edge(node_t, node_c, weight=occur)
                context_graph.add_edge(node_c, node_p, weight=occur)

    # draw context graph
    plt.figure()
    pos = nx.get_node_attributes(context_graph, 'pos')
    nx.draw(context_graph, pos, with_labels=True)
    labels = nx.get_edge_attributes(context_graph, 'weight')
    nx.draw_networkx_edge_labels(context_graph, pos, edge_labels=labels)

    # return the extracted phrases
    return unranked_phrases
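# getPhrases is not defined in this snippet. A plausible sketch of it, hypothetical
# and inferred only from how patternSearch indexes matches (doc[start + 2:end]):
# run each context pattern through a Matcher and keep the matched surface strings,
# dropping the two leading literal-context tokens.
def getPhrases(file, unranked_patterns):
    phrases = set()
    matcher = Matcher(nlp.vocab)
    with open(file, "r") as f:
        doc = nlp(f.read().lower())
    for pattern in unranked_patterns:
        matcher.add("extraction", None, pattern)
        for match_id, start, end in matcher(doc):
            phrases.add(doc[start + 2:end].text)  # drop the 2 context tokens
        matcher.remove("extraction")
    return phrases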
def test_matcher_valid_callback(en_vocab):
    """Test that on_match can only be None or callable."""
    matcher = Matcher(en_vocab)
    with pytest.raises(ValueError):
        matcher.add("TEST", [[{"TEXT": "test"}]], on_match=[])
    matcher(Doc(en_vocab, words=["test"]))
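# For contrast with the invalid-callback test above, a minimal runnable sketch
# (not from the original test suite) of a valid on_match callback; the callback
# signature (matcher, doc, i, matches) is the one spaCy expects.
import spacy
from spacy.matcher import Matcher

def print_match(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    print("matched:", doc[start:end].text)

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
matcher.add("TEST", [[{"TEXT": "test"}]], on_match=print_match)
matcher(nlp("test"))  # prints "matched: test"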
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
doc = nlp(
    "Features of the app include a beautiful design, smart search, automatic "
    "labels and optional voice responses.")

# Write a pattern that matches an adjective followed by one or two nouns
pattern = [{"POS": "ADJ"}, {"POS": "NOUN"}, {"POS": "NOUN", "OP": "?"}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("ADJ_NOUN_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))

# Iterate over the matches and print the matched span of text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)
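# Usage note on the pattern above: "OP": "?" makes the second NOUN token optional,
# so a span can match as ADJ + NOUN or as ADJ + NOUN + NOUN; the Matcher reports
# every possible match, so overlapping short and long variants can both appear.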
def run_prdualrank(T_0, unranked_patterns, unranked_phrases, file):
    # Relies on module-level names defined elsewhere: nlp (spaCy pipeline),
    # partition, prDualRank, and the numpy / defaultdict imports.
    global final_patterns, final_keywords, pattern_to_score_map, keyword_to_score_map, \
        ngram_prob_map, phrase_seg_score, removed_phrases, wiki_ir_cache, error_count, \
        total_ngram_counts

    phrase2id = {}
    for i in range(len(unranked_phrases)):
        phrase2id[unranked_phrases[i]] = i
    id2phrase = {}
    for i in range(len(unranked_phrases)):
        id2phrase[i] = unranked_phrases[i]
    id2pattern = {}
    for i in range(len(unranked_patterns)):
        id2pattern[i] = unranked_patterns[i]

    seedIdwConfidence = {}
    for key, val in phrase2id.items():
        if key in T_0:
            seedIdwConfidence[val] = 0.0

    id2patterns = defaultdict(set)
    pattern2ids = defaultdict(set)
    context_matrix = np.zeros((len(unranked_phrases), len(unranked_patterns)))

    # find c(t, p): how often phrase t is extracted by pattern p
    with open(file, 'r') as f:
        file_chunk = partition(f)
        matcher = Matcher(nlp.vocab)
        for t in file_chunk:
            doc = nlp(t)
            for i in range(len(unranked_patterns)):
                # offset past the leading literal-context tokens of the pattern
                offset = 0
                for pattern_dict in unranked_patterns[i]:
                    if 'POS' in pattern_dict:
                        break
                    offset += 1
                matcher.add("extraction", None, unranked_patterns[i])
                matches = matcher(doc)
                for match_id, start, end in matches:
                    span = doc[start + offset:end].text
                    j = unranked_phrases.index(span) if span in unranked_phrases else -1
                    if j == -1:
                        continue
                    context_matrix[j, i] += 1
                    id2patterns[j].add(i)
                    pattern2ids[i].add(j)
                matcher.remove("extraction")

    # support counts: row sums for phrases, column sums for patterns
    id2sup = {}
    for i in range(len(unranked_phrases)):
        id2sup[i] = 0
    pattern2sup = {}
    for i in range(len(unranked_patterns)):
        pattern2sup[i] = 0
    for id in id2patterns.keys():
        total = 0
        for col in range(len(unranked_patterns)):
            total += context_matrix[id, col]
        id2sup[id] = total
    for pattern in pattern2ids.keys():
        total = 0
        for row in range(len(unranked_phrases)):
            total += context_matrix[row, pattern]
        pattern2sup[pattern] = total

    l1, l2, l3, l4, m1, m2, m3, m4 = prDualRank(
        seedIdwConfidence, [], id2patterns, pattern2ids, {}, {}, {}, {},
        id2phrase, context_matrix.tolist(), id2sup, pattern2sup,
        FLAGS_VERBOSE=False, FLAGS_DEBUG=False)

    return l1, l2, l3, l4, m1, m2, m3, m4
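# partition() is not defined in this snippet. A plausible sketch, hypothetical and
# only assumed from how it is used above (iterating text chunks from an open file);
# the chunk size is an illustrative choice, e.g. to stay under spaCy's default
# maximum document length.
def partition(f, chunk_size=100000):
    text = f.read().lower()
    for i in range(0, len(text), chunk_size):
        yield text[i:i + chunk_size]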