def test_matcher_remove_zero_operator(en_vocab):
    matcher = Matcher(en_vocab)
    pattern = [{"OP": "!"}]
    matcher.add("Rule", [pattern])
    doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
    matches = matcher(doc)
    assert len(matches) == 0
    assert "Rule" in matcher
    matcher.remove("Rule")
    assert "Rule" not in matcher
def test_matcher_from_api_docs(en_vocab):
    matcher = Matcher(en_vocab)
    pattern = [{"ORTH": "test"}]
    assert len(matcher) == 0
    matcher.add("Rule", None, pattern)
    assert len(matcher) == 1
    matcher.remove("Rule")
    assert "Rule" not in matcher
    matcher.add("Rule", None, pattern)
    assert "Rule" in matcher
    on_match, patterns = matcher.get("Rule")
    assert len(patterns[0])
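# Note: the snippets above and below use the spaCy v2 signature
# Matcher.add(key, on_match, *patterns). In spaCy v3 the patterns are passed as
# a list and on_match became a keyword argument. A minimal sketch of the v3
# equivalent of the test above (assuming spaCy >= 3.0):
def test_matcher_from_api_docs_v3(en_vocab):
    matcher = Matcher(en_vocab)
    pattern = [{"ORTH": "test"}]
    matcher.add("Rule", [pattern], on_match=None)  # v3: patterns passed as a list
    assert "Rule" in matcher
    matcher.remove("Rule")
    assert "Rule" not in matcher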
def get_single_match(self, doc, pattern):
    matcher = Matcher(self.nlp.vocab)
    if "newMatch" in matcher:
        matcher.remove("newMatch")
    matcher.add("newMatch", None, pattern)
    matches = matcher(doc)
    try:
        if len(matches) > 0:
            # return the span of the first match only
            for match_id, start, end in matches:
                return doc[start:end]
    except Exception as e:
        return e
    return ""
def test_matcher_remove(en_vocab):
    matcher = Matcher(en_vocab)
    pattern = [{"ORTH": "test"}, {"OP": "?"}]
    assert len(matcher) == 0
    matcher.add("Rule", None, pattern)
    assert "Rule" in matcher
    # removing once should work
    matcher.remove("Rule")
    # removing again should throw an error
    with pytest.raises(ValueError):
        matcher.remove("Rule")
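# Since Matcher.remove raises a ValueError for unknown keys (as the test above
# asserts), a membership check makes removal idempotent. A minimal sketch of the
# guard used by get_single_match above:
def safe_remove(matcher, key):
    # "key in matcher" is True only while the rule is registered
    if key in matcher:
        matcher.remove(key)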
def getPhrases(file, context_pattern):
    new_phrases = set()
    with open(file, 'r') as f:
        t = f.read().lower()
        matcher = Matcher(nlp.vocab)
        doc = nlp(t)
        for cp in context_pattern:
            matcher.add("extraction", None, cp)
            matches = matcher(doc)
            for match_id, start, end in matches:
                span = doc[start + 2:end].text
                if span not in new_phrases:
                    new_phrases.add(span)
            matcher.remove("extraction")
    return new_phrases
def match_sentence(self, question):
    matcher = Matcher(self.nlp.vocab)
    sent_tokens = [self.nlp(i) for i in self.textsplit]
    question_nlp = self.nlp(question)
    # for t in q_token_set:
    #     question_pattern.extend([{'LEMMA': t, 'OP': '?'}])
    matched_scores = {}
    for sent_token in sent_tokens:
        sent_pattern = []
        this_length = 0
        sent_token_set = set([tok.lemma_.lower() for tok in sent_token])
        for t in sent_token_set:
            sent_pattern.extend([{'LEMMA': t, 'OP': '?'}])
        matcher.add("sent_pattern", None, sent_pattern)
        matches = matcher(question_nlp)
        for match_id, start, end in matches:
            # string_id = nlp.vocab.strings[match_id]  # get string representation
            # span = question_nlp[start:end]  # the matched span
            # this_length += len(span.text.split(" "))
            this_length += end - start
        matched_scores[sent_token] = this_length
        matcher.remove("sent_pattern")
    matched_scores = {k: v / len(question_nlp) for k, v in matched_scores.items()}
    # Use spaCy's default similarity function to score each sentence,
    # then weight it with the matching score.
    final_scores = {}
    if question_nlp[0].text in ["Did", "Do", "Does", "Is", "Are", "Were", "Was", "Had", "Has", "Have"]:
        for sent in matched_scores.keys():
            if len(sent) <= 2:
                continue
            similarity_score = sent.similarity(question_nlp)
            # print(sent)
            # print("match_score: %.3f" % matched_scores[sent])
            # print("similarity score: %.3f" % similarity_score)
            final_scores[sent.text] = 0.7 * matched_scores[sent] + 0.3 * similarity_score
    else:
        for sent in matched_scores:
            if len(sent) <= 2:
                continue
            similarity_score = sent.similarity(question_nlp)
            # print(sent)
            # print(matched_scores[sent])
            final_scores[sent.text] = 0.5 * matched_scores[sent] + 0.5 * similarity_score
    return final_scores
def getPhrases(file, context_pattern):
    new_phrases = set()
    with open(file, 'r') as f:
        matcher = Matcher(nlp.vocab)
        file_chunk = partition(f)
        for t in file_chunk:
            doc = nlp(t)
            for cp in context_pattern:
                # span of the POS tokens within the pattern
                pos_indices = [i for i in range(len(cp)) if 'POS' in cp[i]]
                start_offset = min(pos_indices)
                end_offset = max(pos_indices) + 1
                matcher.add("extraction", None, cp)
                matches = matcher(doc)
                for match_id, start, end in matches:
                    span = doc[start + start_offset:start + end_offset].text
                    if span not in new_phrases:
                        new_phrases.add(span)
                matcher.remove("extraction")
    return new_phrases
def getPhrases(file, context_pattern):
    new_phrases = set()
    with open(file, 'r') as f:
        matcher = Matcher(nlp.vocab)
        file_chunk = partition(f)
        for t in file_chunk:
            doc = nlp(t)
            for cp in context_pattern:
                # offset of the first POS token in the pattern
                offset = 0
                for i in range(len(cp)):
                    if 'POS' in cp[i]:
                        break
                    offset += 1
                matcher.add("extraction", None, cp)
                matches = matcher(doc)
                for match_id, start, end in matches:
                    span = doc[start + offset:end].text
                    if span not in new_phrases:
                        new_phrases.add(span)
                        # print(span)
                matcher.remove("extraction")
    return new_phrases
def is_valid_drivenRequirements(nlp_doc):
    # '(?i)^(when|if|while)$' matches exactly one of the trigger words,
    # case-insensitively (the original '*' quantifier also matched empty
    # and repeated words)
    pattern = [{'TEXT': {'REGEX': '(?i)^(when|if|while)$'}}]
    pattern1 = ''
    matcher = Matcher(nlp.vocab)
    matcher.add('validdriven', None, pattern)
    matches = matcher(nlp_doc)
    first_word = str(nlp_doc[0]).lower()
    if matches:
        if first_word == 'when':
            pattern1 = [{'TAG': 'NNP'}, {'TAG': ','}]
        if first_word == 'if':
            pattern1 = [{'TAG': 'VBN'}, {'TAG': ','}]
        if first_word == 'while':
            pattern1 = [{'TAG': 'NN'}, {'TAG': ','}]
        matcher.remove('validdriven')
        matcher.add("commaposition", None, pattern1)
        matches = matcher(nlp_doc)
        if matches:
            return True
        else:
            return False
    else:
        return False
def test_matcher_remove():
    nlp = English()
    matcher = Matcher(nlp.vocab)
    text = "This is a test case."
    pattern = [{"ORTH": "test"}, {"OP": "?"}]
    assert len(matcher) == 0
    matcher.add("Rule", None, pattern)
    assert "Rule" in matcher
    # should give two matches
    results1 = matcher(nlp(text))
    assert len(results1) == 2
    # removing once should work
    matcher.remove("Rule")
    # should not return any matches anymore
    results2 = matcher(nlp(text))
    assert len(results2) == 0
    # removing again should throw an error
    with pytest.raises(ValueError):
        matcher.remove("Rule")
def run_prdualrank(T_0, unranked_patterns, unranked_phrases, file):
    phrase2id = {}
    for i in range(len(unranked_phrases)):
        phrase2id[unranked_phrases[i]] = i
    id2phrase = {}
    for i in range(len(unranked_phrases)):
        id2phrase[i] = unranked_phrases[i]
    id2pattern = {}
    for i in range(len(unranked_patterns)):
        id2pattern[i] = unranked_patterns[i]
    seedIdwConfidence = {}
    for key, val in phrase2id.items():
        if key in T_0:
            seedIdwConfidence[val] = 0.0
    id2patterns = defaultdict(set)
    pattern2ids = defaultdict(set)
    context_matrix = np.zeros((len(unranked_phrases), len(unranked_patterns)))
    # find c(t, p)
    with open(file, 'r') as f:
        file_chunk = partition(f)
        matcher = Matcher(nlp.vocab)
        for t in file_chunk:
            doc = nlp(t)
            for i in range(len(unranked_patterns)):
                # offset of the first POS token in the pattern
                offset = 0
                for pattern_dict in unranked_patterns[i]:
                    if 'POS' in pattern_dict:
                        break
                    offset += 1
                matcher.add("extraction", None, unranked_patterns[i])
                matches = matcher(doc)
                for match_id, start, end in matches:
                    span = doc[start + offset:end].text
                    j = unranked_phrases.index(span)
                    context_matrix[j, i] += 1
                    id2patterns[j].add(i)
                    pattern2ids[i].add(j)
                matcher.remove("extraction")
    id2sup = {}
    pattern2sup = {}
    for phrase_id in id2patterns.keys():
        total = 0
        for col in range(len(unranked_patterns)):
            total += context_matrix[phrase_id, col]
        id2sup[phrase_id] = total
    for pattern in pattern2ids.keys():
        total = 0
        for row in range(len(unranked_phrases)):
            total += context_matrix[row, pattern]
        pattern2sup[pattern] = total
    l1, l2, l3, l4, m1, m2, m3, m4 = prDualRank(seedIdwConfidence, [], id2patterns, pattern2ids,
                                                {}, {}, {}, {}, id2phrase, context_matrix.tolist(),
                                                id2sup, pattern2sup,
                                                FLAGS_VERBOSE=False, FLAGS_DEBUG=False)
    return l1, l2, l3, l4, m1, m2, m3, m4
def manage_speaker(doc_clean, pattern_1, pattern_2, president_1, president_2):
    matcher = Matcher(nlp.vocab)
    matcher.add("speaker", None, pattern_1, pattern_2)
    matches = matcher(doc_clean)
    speaker = []
    i = 0
    for match_id, start, end in matches:
        i += 1
        # append span of speaker as list of tuples
        speaker.append((start + 1, end - 1))
    matcher.remove("speaker")

    # find speaker using party matching, the "vom Redner nicht autorisiert" note,
    # and punctuation
    matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
    terms = [
        "(von der rednerin nicht autorisiert):",
        "(vom redner nicht autorisiert):"
    ]
    patterns = [nlp.make_doc(text) for text in terms]
    matcher.add("nicht_autorisiert", None, *patterns)
    matches = matcher(doc_clean)
    for match_id, start, end in matches:
        for i in range(10):
            if doc_clean[start - i].is_sent_start and doc_clean[start - i].pos_ != 'PUNCT':
                # print(doc_clean[start - i:end - 1])
                speaker.append((start - i, end - 1))
                break
    matcher.remove("nicht_autorisiert")

    # matcher for (vice)president; the 'OP' operator does not behave greedily,
    # thus yielding redundant results
    matcher = Matcher(nlp.vocab)
    matcher.add("president", None, president_1, president_2)
    matches = matcher(doc_clean)
    # filter redundant results so that only the longest span is kept;
    # exploit the fact that the first result is always the longest
    seen = set()
    keep = []
    for match_id, start, end in matches:
        if end - 1 in seen:
            continue
        else:
            seen.add(end - 1)
            keep.append((start, end - 1))
    pres = keep
    # for start, end in pres:
    #     span = doc_clean[start:end]
    #     print(span.text, start, end)
    matcher.remove("president")
    # append presidents
    for i in pres:
        speaker.append(i)
    # sort speaker list by first element of the tuples
    speaker.sort(key=operator.itemgetter(0))
    # all speakers should be here!!
    # for start, end in speaker:
    #     print(doc_clean[start:end + 4])
    print('there are', len(speaker), 'speakers in this session')
    return speaker
def manage_interruptions(doc, party, exception):
    matcher = Matcher(nlp.vocab)
    # 'regular' speaker (e.g. Muller (spd):)
    party_parenth = [{"TEXT": "("}, {'LOWER': {'IN': party}}, {"TEXT": ")"}]
    matcher.add("party_parenthesis", None, party_parenth)
    matches = matcher(doc)
    end_p = []
    for match_id, start, end in matches:
        end_p.append(end)
    # identify interruptions in doc and print them
    tmp = []
    for token in doc:
        if token.text == '(' and doc[token.i + 1].lower_ not in party \
                and doc[token.i + 1].lower_ not in exception and token.i + 80 < len(doc):
            for i in range(1, 80):
                # find the last closing parenthesis that does not belong to a party
                if doc[token.i + i].text == ')' and token.i + i + 1 not in end_p:
                    # print(doc[token.i:token.i + i + 1], token.i, token.i + i, token.i + i + 1)
                    # print interruption, index and subsequent token
                    # store index of span of interruptions as list of tuples
                    tmp.append((token.i, token.i + i + 1))
                    # avoid capturing subsequent interruptions due to i going
                    # up to 80 tokens forward
                    break
    matcher.remove("party_parenthesis")
    # check long (> 50 tokens) interruptions among those above
    for i in range(len(tmp)):
        if (tmp[i][1] - tmp[i][0]) > 50:
            print('wow! This is a very long interruption: -->', doc[tmp[i][0]:tmp[i][1]], '\n')
    # create variable that contains the index of each token that is within interruptions
    seen = set()
    t = []
    for i in tmp:
        # print(doc[i[0]:i[1]])  # check it is printing all interruptions
        for token in doc[i[0]:i[1]]:
            if token.i in seen:
                # avoid adding parts of already identified interruptions
                continue
            else:
                seen.add(token.i)
                t.append(token.i)

    # define getter function that returns True if a token is part of interruptions
    def is_in_interruption(token):
        in_int = token.i in t
        return in_int

    # set a token custom extension to check whether a token is in an interruption
    Token.set_extension('is_in_interruption', getter=is_in_interruption)
    # store tokens that are not within interruptions in clean_doc
    clean_doc = []
    for token in doc:
        if not token._.is_in_interruption:
            clean_doc.append(token)
    Token.remove_extension('is_in_interruption')
    # create a new doc object that does not contain interruptions
    doc_clean = nlp(''.join(map(lambda x: x.text_with_ws, clean_doc)))
    return doc_clean
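# Note: Token.set_extension raises a ValueError if the extension name is already
# registered, so if an exception ever interrupts manage_interruptions between
# set_extension and remove_extension, the next call would fail. A minimal sketch
# of the usual guard (force=True silently re-registers the extension;
# register_interruption_flag is a hypothetical helper, not part of the code above):
from spacy.tokens import Token

def register_interruption_flag(interruption_token_ids):
    # interruption_token_ids: set of token indices considered part of an interruption
    Token.set_extension('is_in_interruption',
                        getter=lambda token: token.i in interruption_token_ids,
                        force=True)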
pattern_3 = [{'LOWER': 'solar'}, {'LOWER': 'power'}]
matcher.add('SolarPower', None, pattern_1, pattern_2, pattern_3)

doc = nlp(u'The Solar Power industry continues to grow as solarpower increases. Solar-Power is great.')


def format_matches(doc, matches):
    for match_id, start, end in matches:
        string_id = nlp.vocab.strings[match_id]
        span = doc[start:end]
        print(match_id, string_id, start, end, span.text)


format_matches(doc, matcher(doc))

# Remove pattern
matcher.remove('SolarPower')

# Compress pattern_2 and pattern_3
pattern_1 = [{'LOWER': 'solarpower'}]
# '*' means match 0 or more times
pattern_2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP': '*'}, {'LOWER': 'power'}]
matcher.add('SolarPower', None, pattern_1, pattern_2)

doc = nlp(u'The Solar Power industry continues to grow as solarpower increases. Solar--Power is great.')
format_matches(doc, matcher(doc))

from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)
def create_dataset_input(rule_based_candidates,
                         mention_context_cache_path,
                         data_folder_path,
                         overall_output_path=None,
                         is_test=False,
                         output_path=None,
                         overwrite_dataset=False):
    """Function to take in the rule based candidates and create the input format
       for the SGTB model. This function is intended to be used for processing
       test data, as the main function in this file will convert and save train,
       dev, and test output.

       @param rule_based_candidates: a list of candidates from the rule based model
       @param mention_context_cache_path: path to a dictionary mapping <pub_id>:<mention_text> pairs to all contexts
       @param data_folder_path: path to the data folder
       @param overall_output_path: path to the overall output folder (optional, used for SGTB training)
       @param is_test: parameter indicating whether or not the data being processed is test data
       @param output_path: the path to write the output to (if not processing test data)
       @param overwrite_dataset: whether or not to overwrite the existing dataset
                                 (will be true for train and false for dev and test)
    """
    scispacy_parser = scispacy_util.SciSpaCyParser()
    prior_entity_probs = compute_entity_probabilities()
    prior_entity_given_mention_probs = compute_entity_given_mention_probs()
    prior_mention_given_entity_probs = compute_mention_given_entity_probs()

    glove_path = os.path.abspath(os.path.join("project", "data", "glove", "glove.6B.50d.txt"))
    with open(glove_path, "r") as lines:
        glove = {line.split()[0]: np.array([float(value) for value in line.split()[1:]])
                 for line in lines}

    # I haven't run the experiments to tell if having a cache actually helps or not,
    # it takes a while to load the cache when it is used
    # if is_test:
    #     mention_context_cache = {}
    # else:
    #     try:
    #         print("Loading cache...")
    #         mention_context_cache = joblib.load(mention_context_cache_path)["cache"]
    #         print("Cache loaded...")
    #     except:
    #         mention_context_cache = {}
    mention_context_cache = {}

    kb_path = os.path.abspath(os.path.join("project", "data", "data_sets.json"))
    with open(kb_path) as kb_file:
        kb_json = json.load(kb_file)

    dataset_id_to_kb_entry = {}
    for dataset in kb_json:
        dataset_id_to_kb_entry[dataset["data_set_id"]] = dataset

    matcher = Matcher(scispacy_parser.nlp.vocab)
    section_matcher = Matcher(scispacy_parser.nlp.vocab)
    for section_name in SECTION_STRINGS:
        section_matcher.add(section_name, None,
                            [{"LOWER": section_name}, {"ORTH": "\n"}],
                            [{"LOWER": section_name}, {"ORTH": ":"}],
                            [{"ORTH": "\n"}, {"LOWER": section_name}, {"ORTH": "."}])

    output_docs = []
    pub_ids = []
    # we will write a new file on the first document, and append to it afterwards
    first_doc = True
    cache_changed = False
    for pub_id in tqdm(rule_based_candidates, desc='create dataset in create_sgtb_dataset.py'):
        spacy_doc = get_scispacy_doc(data_folder_path, pub_id, scispacy_parser)

        pub_ids.append(pub_id)
        doc_candidates = rule_based_candidates[pub_id]
        output_doc = []
        dataset_id_to_longest_mention_text = {}
        for row in doc_candidates:
            mention_text = row["mention"]
            dataset_id = row["candidate_dataset_ids"][0]
            if dataset_id in dataset_id_to_longest_mention_text:
                if len(mention_text) > len(dataset_id_to_longest_mention_text[dataset_id]):
                    dataset_id_to_longest_mention_text[dataset_id] = mention_text
            else:
                dataset_id_to_longest_mention_text[dataset_id] = mention_text

        for row in doc_candidates:
            mention_text = row["mention"]
            dataset_id = row["candidate_dataset_ids"][0]
            # if mention_text != dataset_id_to_longest_mention_text[dataset_id]:
            #     continue
            mention_context_cache_key = str(pub_id) + "_" + mention_text
            if mention_context_cache_key in mention_context_cache:
                mention_contexts = mention_context_cache[mention_context_cache_key]
            else:
                # search for the mention text in the doc
                spacy_mention_text = scispacy_parser.scispacy_create_doc(mention_text)

                pattern = []
                for token in spacy_mention_text:
                    pattern.append({"ORTH": token.text})
                try:
                    matcher.add("MENTION", None, pattern)
                    matches = list(matcher(spacy_doc))
                except ValueError:
                    continue

                # build and save a mapping of <pub_id>_<mention_text> to all contexts
                # the mention is found in
                cache_changed = True
                mention_contexts = []
                token_idx_to_sent_idx = {}
                sentences_list = list(spacy_doc.sents)
                context_size = 3
                for sent_idx, sent in enumerate(sentences_list):
                    for token in sent:
                        token_idx = token.i
                        token_idx_to_sent_idx[token_idx] = sent_idx

                for match_id, start, end in matches:
                    sentence_idx = token_idx_to_sent_idx[start]
                    start_context_sent_idx = max(0, sentence_idx - context_size)
                    if start_context_sent_idx == 0:
                        match_sentence_idx = sentence_idx
                    else:
                        match_sentence_idx = context_size
                    end_context_sent_idx = min(len(sentences_list), sentence_idx + context_size)
                    mention_context = sentences_list[start_context_sent_idx:end_context_sent_idx + 1]
                    sentences_as_docs = []
                    for sentence in mention_context:
                        sentences_as_docs.append(sentence.as_doc())

                    start_context_token_idx = sentences_list[start_context_sent_idx].start
                    end_context_token_idx = sentences_list[end_context_sent_idx - 1].end
                    context_with_offsets = (sentences_as_docs,
                                            (start_context_token_idx, end_context_token_idx),
                                            (start, end),
                                            match_sentence_idx)
                    mention_contexts.append(context_with_offsets)

                # limit featurizing to first 3 contexts in order of appearance
                mention_contexts = mention_contexts[:3]
                mention_context_cache[mention_context_cache_key] = mention_contexts
                matcher.remove("MENTION")

            if mention_contexts != []:
                output_mention = create_output_mention(is_test, row, prior_entity_probs,
                                                       prior_entity_given_mention_probs,
                                                       mention_text,
                                                       prior_mention_given_entity_probs,
                                                       dataset_id_to_kb_entry,
                                                       mention_contexts, scispacy_parser,
                                                       glove, spacy_doc, section_matcher)
                output_doc.append(output_mention)

        # only write output to file if not processing test data
        if not is_test:
            if first_doc:
                with open(output_path, "w") as output_file:
                    json.dump(output_doc, output_file)
                    output_file.write("\n")
                first_doc = False
                if overwrite_dataset:
                    with open(overall_output_path, "w") as overall_output_file:
                        json.dump(output_doc, overall_output_file)
                        overall_output_file.write("\n")
            else:
                with open(output_path, "a") as output_file:
                    json.dump(output_doc, output_file)
                    output_file.write("\n")
                with open(overall_output_path, "a") as overall_output_file:
                    json.dump(output_doc, overall_output_file)
                    overall_output_file.write("\n")
        output_docs.append(json.loads(json.dumps(output_doc)))

    # if cache_changed and not is_test:
    #     joblib.dump({"cache": mention_context_cache}, mention_context_cache_path)
    return output_docs, pub_ids
def patternSearch(T_0, file):
    phrase_patterns = set()
    seed_pattern = [nlp(x) for x in T_0]
    phrase_matcher = PhraseMatcher(nlp.vocab)
    phrase_matcher.add('pattern search', None, *seed_pattern)
    # find occurrences of seed phrases
    with open(file, "r") as f:
        document = nlp(f.read().lower())
        matches = phrase_matcher(document)
        for match_id, start, end in matches:
            p = tuple((start, end))
            if p not in phrase_patterns:
                phrase_patterns.add(p)
    # find patterns around seed phrases
    unranked_patterns = []
    with open(file, "r") as f:
        text = nlp(f.read().lower())
        for phrase_pattern in phrase_patterns:
            start = phrase_pattern[0]
            end = phrase_pattern[1]
            if text[start - 1].text == '\n':
                continue
            # add context pattern
            tmp = []
            for i in range(2, 0, -1):
                tmp.append({"TEXT": text[start - i].text})
            # add content pattern
            span = text[start:end]
            for token in span:
                tmp.append({"POS": token.pos_})
            if tmp not in unranked_patterns:
                unranked_patterns.append(tmp)
                print(tmp)
    unranked_phrases = list(getPhrases(file, unranked_patterns))

    # build context graph
    context_graph = nx.Graph()
    # add tuples and patterns into graph
    for i in range(len(unranked_phrases)):
        node = 't' + str(i)
        context_graph.add_node(node, pos=(0, i))
    for i in range(len(unranked_patterns)):
        node = 'p' + str(i)
        context_graph.add_node(node, pos=(2, i))
    context_matrix = np.zeros((len(unranked_phrases), len(unranked_patterns)))
    # find c(t, p)
    with open(file, 'r') as f:
        t = f.read().lower()
        matcher = Matcher(nlp.vocab)
        doc = nlp(t)
        for i in range(len(unranked_patterns)):
            matcher.add("extraction", None, unranked_patterns[i])
            matches = matcher(doc)
            for match_id, start, end in matches:
                span = doc[start + 2:end].text
                j = unranked_phrases.index(span)
                context_matrix[j, i] += 1
            matcher.remove("extraction")
    # add context nodes into graph
    c_count = 0
    for i in range(context_matrix.shape[0]):
        for j in range(context_matrix.shape[1]):
            if context_matrix[i, j] != 0:
                occur = context_matrix[i, j]
                node_t = 't' + str(i)
                node_p = 'p' + str(j)
                node_c = 'c' + str(c_count)
                c_count += 1
                context_graph.add_node(node_c, pos=(1, c_count))
                context_graph.add_edge(node_t, node_c, weight=occur)
                context_graph.add_edge(node_c, node_p, weight=occur)
    # draw context graph
    plt.figure()
    pos = nx.get_node_attributes(context_graph, 'pos')
    nx.draw(context_graph, pos, with_labels=True)
    labels = nx.get_edge_attributes(context_graph, 'weight')
    nx.draw_networkx_edge_labels(context_graph, pos, edge_labels=labels)
    # return patterns
    return unranked_phrases
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]  # get the string representation
    span = doc[start:end]  # get where the identified word starts and ends
    print(match_id, string_id, start, end, span.text)

matcher.remove('ArtificialIntelligence')

# artificialintelligence / ArtificialIntelligence
pattern1 = [{'LOWER': 'artificialintelligence'}]
# Artificial.Intelligence
pattern2 = [{'LOWER': 'artificial'}, {'IS_PUNCT': True, 'OP': '*'}, {'LOWER': 'intelligence'}]
class RefMatcher:
    def __init__(self, nlp):
        self.nlp = nlp
        self.matcher = Matcher(nlp.vocab)

    def clean_matcher(self):
        # no native method to clean a spaCy matcher
        # or retrieve pattern names,
        # so always add ints, starting from zero,
        # and clean ints from 0 until one is not found
        i = 0
        while len(self.matcher) > 0 and i < 100:
            if i in self.matcher:
                self.matcher.remove(i)
            i += 1

    @staticmethod
    def is_negative(p):
        if "OP" in p and p["OP"] == "!":
            return True
        return False

    @staticmethod
    def is_droppable(p):
        if "OP" in p and p["OP"] in ["*", "?"]:
            return True
        return False

    @staticmethod
    def is_multitoken(p):
        if "OP" in p and p["OP"] in ["*", "+"]:
            return True
        return False

    def remove_skipped_ops(self, span, pattern):
        skipped_idx = []
        op_tokens = [i for (i, p) in enumerate(pattern) if RefMatcher.is_droppable(p)]
        for op in op_tokens:
            op_pattern = copy.deepcopy(pattern)
            # remove "?" to require 1 instead of 0
            if op_pattern[op]["OP"] == "?":
                if len(op_pattern[op]) == 1:
                    # if there are no more props,
                    # add a dummy string that will never match,
                    # since it's not 1 token :)
                    op_pattern[op]["TEXT"] = "alice and bob"
                    op_pattern[op]["OP"] = "!"
                else:
                    del op_pattern[op]["OP"]
            # change "*" to "+", to require 1+ instead of 0+
            elif op_pattern[op]["OP"] == "*":
                op_pattern[op]["OP"] = "+"
            self.matcher.add(op, None, op_pattern)
        # check whether it still matches
        matches = self.matcher(span.as_doc())
        max_matches = [m for (m, s, e) in matches if (s == 0) and (e == len(span))]
        # clean the matcher
        self.clean_matcher()
        non_op_pattern = []
        for i, p in enumerate(pattern):
            # is optional
            if "OP" in p:
                # but not found
                if i not in max_matches and not RefMatcher.is_negative(p):
                    # => mark as not matched, skip
                    skipped_idx.append(i)
                    continue
                else:
                    if p["OP"] == "+":
                        if len(p) == 1:
                            # if there are no more props,
                            # add a dummy string that will never match,
                            # since it's not 1 token :)
                            p["TEXT"] = "alice and bob"
                            p["OP"] = "!"
                        else:
                            del p["OP"]
                    elif p["OP"] == "*":
                        p["OP"] = "+"
            non_op_pattern.append(p)
        return non_op_pattern, skipped_idx

    def insert_empty_idx(self, pattern_ref, idx):
        pattern_ref_insert = {}
        for p, v in pattern_ref.items():
            if p >= idx:
                pattern_ref_insert[p + 1] = v
            else:
                pattern_ref_insert[p] = v
        pattern_ref_insert[idx] = []
        return pattern_ref_insert

    def shift_pattern_ref(self, pattern_ref, skipped_idx):
        for idx in skipped_idx:
            pattern_ref = self.insert_empty_idx(pattern_ref, idx)
        return pattern_ref

    def __call__(self, span, orig_pattern):
        pattern = copy.deepcopy(orig_pattern)
        # remove props not supported by the spaCy matcher:
        for p in pattern:
            if "TEMPLATE_ID" in p:
                del p["TEMPLATE_ID"]
        # case I: tokens <-> patterns
        # if lengths match and there is no OP,
        # => everything has been matched
        if len(span) == len(pattern) and not any(["OP" in p for p in pattern]):
            return {k: [k] for k in range(len(pattern))}
        # check which tokens are matched, remove non-matched
        non_op_pattern, skipped_idx = self.remove_skipped_ops(span, pattern)
        # case II:
        # if lengths match and there are no multitoken OPs,
        # => everything has been matched
        if len(span) == len(non_op_pattern) and not any(
                [RefMatcher.is_multitoken(p) for p in non_op_pattern]):
            pattern_ref = {k: [k] for k in range(len(non_op_pattern))}
            return self.shift_pattern_ref(pattern_ref, skipped_idx)
        # case III: worst case
        # get shifts for multitokens,
        # i.e. rematch cropped spans and patterns
        # A. get cropped patterns
        for i in range(len(non_op_pattern)):
            self.matcher.add(i, None, non_op_pattern[i:])
        # B. get cropped spans
        docs = [span[i:].as_doc() for i in range(len(span))]
        # C. rematch
        matches = self.matcher.pipe(docs, batch_size=len(span), return_matches=True)
        # D. get pattern_ref
        pattern_ref = {}
        for i, (d, m) in enumerate(matches):
            # take the max span match for the doc
            if len(m):  # len 0 shouldn't happen except for weird white spaces
                m_id, m_start, m_end = max(m, key=lambda x: x[2] - x[1])
                # if the cropped span matches the cropped pattern,
                # the 1st token of the cropped span belongs to the 1st cropped pattern item
                if m_id not in pattern_ref:
                    pattern_ref[m_id] = [i]
                else:
                    # no changes in pattern:
                    # the pattern item had more tokens matched,
                    # e.g. "very fast ..." & "fast ..." both
                    # matched with {"POS": "ADJ", "OP": "+"} ...
                    pattern_ref[m_id].append(i)
        # clean
        self.clean_matcher()
        # shift by skipped ops
        pattern_ref = self.shift_pattern_ref(pattern_ref, skipped_idx)
        return pattern_ref
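# The clean_matcher workaround above only works because RefMatcher registers
# integer keys starting from zero. An alternative sketch that records keys at
# add time, so rules with arbitrary keys can be cleaned up (spaCy v2 add
# signature assumed; TrackedMatcher is a hypothetical helper, not part of spaCy):
from spacy.matcher import Matcher

class TrackedMatcher:
    def __init__(self, vocab):
        self.matcher = Matcher(vocab)
        self._keys = set()

    def add(self, key, on_match, *patterns):
        self.matcher.add(key, on_match, *patterns)
        self._keys.add(key)

    def clear(self):
        # remove every rule we registered, then forget the keys
        for key in self._keys:
            if key in self.matcher:
                self.matcher.remove(key)
        self._keys.clear()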
class AbbreviationDetector:
    """
    Detects abbreviations using the algorithm in "A simple algorithm for
    identifying abbreviation definitions in biomedical text." (Schwartz & Hearst, 2003).

    This class sets the `._.abbreviations` attribute on spaCy Doc.

    The abbreviations attribute is a `List[Span]` where each Span has the
    `Span._.long_form` attribute set to the long form definition of the abbreviation.

    Note that this class does not replace the spans, or merge them.
    """
    def __init__(self, nlp) -> None:
        Doc.set_extension("abbreviations", default=[], force=True)
        Span.set_extension("long_form", default=None, force=True)

        self.matcher = Matcher(nlp.vocab)
        self.matcher.add("parenthesis", None, [{"ORTH": "("}, {"OP": "+"}, {"ORTH": ")"}])
        self.global_matcher = Matcher(nlp.vocab)

    def find(self, span: Span, doc: Doc) -> Tuple[Span, Set[Span]]:
        """
        Functional version of calling the matcher for a single span.
        This method is helpful if you already have an abbreviation which
        you want to find a definition for.
        """
        dummy_matches = [(-1, int(span.start), int(span.end))]
        filtered = filter_matches(dummy_matches, doc)
        abbreviations = self.find_matches_for(filtered, doc)

        if not abbreviations:
            return span, set()
        else:
            return abbreviations[0]

    def __call__(self, doc: Doc) -> Doc:
        matches = self.matcher(doc)
        matches_no_brackets = [(x[0], x[1] + 1, x[2] - 1) for x in matches]
        filtered = filter_matches(matches_no_brackets, doc)
        occurences = self.find_matches_for(filtered, doc)

        for (long_form, short_forms) in occurences:
            for short in short_forms:
                short._.long_form = long_form
                doc._.abbreviations.append(short)
        return doc

    def find_matches_for(self, filtered: List[Tuple[Span, Span]],
                         doc: Doc) -> List[Tuple[Span, Set[Span]]]:
        rules = {}
        all_occurences: Dict[Span, Set[Span]] = defaultdict(set)
        already_seen_long: Set[str] = set()
        already_seen_short: Set[str] = set()
        for (long_candidate, short_candidate) in filtered:
            short, long = find_abbreviation(long_candidate, short_candidate)
            # We need the long and short form definitions to be unique, because we need
            # to store them so we can look them up later. This is a bit of a
            # pathological case also, as it would mean an abbreviation had been
            # defined twice in a document. There's not much we can do about this,
            # but at least the case which is discarded will be picked up below by
            # the global matcher. So it's likely that things will work out ok most of the time.
            new_long = long.string not in already_seen_long if long else False
            new_short = short.string not in already_seen_short
            if long is not None and new_long and new_short:
                already_seen_long.add(long.string)
                already_seen_short.add(short.string)
                all_occurences[long].add(short)
                rules[long.string] = long
                # Add a rule to a matcher to find exactly this substring.
                self.global_matcher.add(long.string, None, [{"ORTH": x.text} for x in short])
        to_remove = set()
        global_matches = self.global_matcher(doc)
        for match, start, end in global_matches:
            string_key = self.global_matcher.vocab.strings[match]
            to_remove.add(string_key)
            all_occurences[rules[string_key]].add(doc[start:end])
        for key in to_remove:
            # Clean up the global matcher.
            self.global_matcher.remove(key)

        return list((k, v) for k, v in all_occurences.items())
import spacy
from spacy.matcher import Matcher
import os.path
import io

data_folder = os.path.join(
    "/Users/juangarciaberdoy/Documents/GitHub/philhistcomp/projects",
    "piraha_language")
file_to_open = os.path.join(data_folder, "corpus.txt")
ff = io.open(file_to_open, 'r', encoding='utf-8')

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
patterns = [[{"LOWER": "one"}], [{"LOWER": "is"}]]
doc = nlp(ff.read())

for pattern in patterns:
    print(pattern)
    matcher.add("tempId", None, pattern)
    matches = matcher(doc)
    print(len(matches))
    matcher.remove("tempId")
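# Continuing the snippet above: the per-pattern add/match/remove cycle can be
# collapsed by registering both patterns under a single key, since the v2
# Matcher.add accepts multiple patterns for one rule (varargs signature shown):
matcher.add("allPatterns", None, *patterns)
all_matches = matcher(doc)
print(len(all_matches))  # total matches for both patterns in a single pass
matcher.remove("allPatterns")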
class NlpService(nlp_pb2_grpc.NlpServicer):
    def __init__(self):
        self.modelName = None
        self.nlp = None
        self.matcher = None

    def LoadModel(self, request, context):
        self.modelName = request.text
        self.nlp = spacy.load(request.text)
        response = nlp_pb2.TextResponse()
        response.message = "Model loaded '{}'".format(request.text)
        return response

    def NlpProcess(self, request, context):
        doc = self.nlp(request.text)
        response = utils.doc2proto(doc, self.modelName)
        return response

    def DocSimilarity(self, request, context):
        docA = self.nlp(request.texta)
        docB = self.nlp(request.textb)
        response = nlp_pb2.TextSimilarity()
        response.similarity = docA.similarity(docB)
        return response

    def AddRule(self, request, context):
        if self.matcher is None:
            self.matcher = Matcher(self.nlp.vocab)
        matcher_id = request.id
        patterns = [{pat.key: pat.value} for pat in request.patterns]
        self.matcher.add(matcher_id, None, patterns)
        response = nlp_pb2.TextResponse()
        response.message = "Rule with id '{}' added to matcher.".format(matcher_id)
        return response

    def RemoveRule(self, request, context):
        if self.matcher is None:
            return nlp_pb2.TextResponse(message="No rules exist with matcher")
        self.matcher.remove(request.text)
        return nlp_pb2.TextResponse(
            message="Rule with id '{}' removed from matcher.".format(request.text))

    def GetRule(self, request, context):
        if self.matcher is None:
            return nlp_pb2.TextResponse(message="No rules exist with matcher")
        _, patterns = self.matcher.get(request.text)
        return nlp_pb2.Rule(
            id=request.text,
            patterns=[
                nlp_pb2.Pattern(key=list(pat.keys())[0],
                                value=list(pat.values())[0])
                for pat in patterns[0]
            ],
        )

    def GetMatches(self, request, context):
        doc = self.nlp(request.text)
        matches = self.matcher(doc)
        response = nlp_pb2.Matches(matches=[
            nlp_pb2.Match(id=str(i[0]), start=i[1], end=i[2]) for i in matches
        ])
        return response

    def ResetMatcher(self, request, context):
        self.matcher = None
        return nlp_pb2.TextResponse(message="Matcher object reset successful.")
class Classifier():
    def __init__(self, inferenceEngine, colorFile="corpora/colors.csv",
                 sizeFile="corpora/sizes.txt", shapeFile="corpora/shapes.txt",
                 nerModel="models/nerModel"):
        self.query = ""
        self.nlp = spacy.load('en')
        ner = spacy.load(nerModel).pipeline[0][1]
        self.nlp.replace_pipe("ner", ner)
        self.inferenceEngine = inferenceEngine
        self.matcher = Matcher(self.nlp.vocab)
        self.lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
        self.scene = {"objects": [], "backgrounds": []}
        self.subjects = {}
        self.referenceWords = ["the", "it", "that", "his", "hers", "theirs"]
        self.colors = {}
        with open(colorFile, "r") as colorReader:
            for line in colorReader:
                colorValue = line.split(",")
                self.colors[colorValue[0].lower()] = colorValue[1].strip("\n")
        self.sizes = {}
        with open(sizeFile, "r") as sizeReader:
            for line in sizeReader:
                line = line.strip().lower()
                sizeValue = line.split(",")
                self.sizes[sizeValue[0]] = sizeValue[1].strip("\n")
        self.shapes = []
        with open(shapeFile, "r") as shapeReader:
            self.shapes = [shape.strip().lower() for shape in shapeReader]

    def getBlankObject(self):
        identifiedObject = {}
        identifiedObject["subject"] = None
        identifiedObject["modifiers"] = {}
        identifiedObject["modifiers"]["color"] = None
        identifiedObject["modifiers"]["shape"] = None
        identifiedObject["modifiers"]["size"] = None
        identifiedObject["modifiers"]["quantity"] = 1
        return identifiedObject

    def classifyDescriptors(self, descriptors):
        classifiedDescriptors = {}
        pastRef = False
        classifiedDescriptors["color"] = set()
        classifiedDescriptors["size"] = set()
        classifiedDescriptors["shape"] = set()
        classifiedDescriptors["quantity"] = 1
        classifiedDescriptors["entity"] = None
        for descriptor in descriptors:
            lemma = descriptor.lemma_.lower()
            if lemma in self.referenceWords:
                pastRef = True
            elif descriptor.text.lower() in self.colors:
                classifiedDescriptors["color"].add(self.colors[descriptor.text.lower()])
            elif lemma in self.sizes:
                classifiedDescriptors["size"].add(float(self.sizes[lemma]))
            elif lemma in self.shapes:
                classifiedDescriptors["shape"].add(lemma)
            elif descriptor.pos_ == "NUM":
                classifiedDescriptors["quantity"] = descriptor.lemma_
        return (classifiedDescriptors, pastRef)

    def addSubjectDescriptors(self, subject, descriptors, subjectEntType=None, pronoun=False):
        subject = self.lemmatizer(subject, "NOUN")[0]
        descriptors, pastRef = self.classifyDescriptors(descriptors)
        if subject not in self.subjects:
            self.subjects[subject] = [descriptors]
        else:
            # TODO: If past ref and referring to multiple quantities, then get lemma
            #       of subject and modify all subjects being referred to
            # TODO: Compare descriptors to existing descriptors and choose the one
            #       that best fits, preferring the most recent
            if pastRef or pronoun:
                for propertyName, props in self.subjects[subject][-1].items():
                    if isinstance(props, set):
                        self.subjects[subject][-1][propertyName] = self.subjects[subject][-1][propertyName].union(descriptors[propertyName])
            else:
                self.subjects[subject].append(descriptors)
        if subjectEntType:
            for individual in self.subjects[subject]:
                if "entity" not in individual or not individual["entity"]:
                    individual["entity"] = subjectEntType

    def detectBackground(self, match):
        return "entity" in match and match["entity"] in ["GPE", "LOC", "EVENT", "FAC"]

    def addSubjectsToScene(self):
        for subject, matches in self.subjects.items():
            for match in matches:
                appendTo = "objects"
                if self.detectBackground(match):
                    appendTo = "backgrounds"
                match.pop("entity", None)
                self.scene[appendTo].append({"subject": subject, "modifiers": match})

    def inferContext(self):
        for obj in self.scene["objects"]:
            descriptiveWords = self.inferenceEngine.getDescriptiveWords(obj["subject"])
            matchingColors = []
            matchingSizes = []
            for word in descriptiveWords:
                word = word.lower()
                lemma = self.lemmatizer(word, "ADJ")[0]
                if word in self.colors:
                    matchingColors.append(self.colors[word])
                if not obj["modifiers"]["size"] and lemma in self.sizes:
                    matchingSizes.append(float(self.sizes[lemma]))
            if matchingColors and not obj["modifiers"]["color"]:
                obj["modifiers"]["color"] = {random.choice(matchingColors)}
            if matchingSizes and not obj["modifiers"]["size"]:
                obj["modifiers"]["size"] = {random.choice(matchingSizes)}

    def addUniqueMatches(self, doc, subject, pronoun=False):
        matchedRanges = []
        for match_id, start, end in self.matcher(doc):
            skipMatch = False
            for prevStart, prevEnd in matchedRanges:
                if start >= prevStart and end <= prevEnd:
                    skipMatch = True
                    break
            if skipMatch:
                continue
            matchedRanges.append((start, end))
            print("match", doc[start:end])
            self.addSubjectDescriptors(subject,
                                       [token for token in doc[start:end] if token.text != subject],
                                       pronoun=pronoun)

    def matchPattern(self, doc, pattern, subject, pronoun=False):
        self.matcher.add(subject, None, pattern)
        self.addUniqueMatches(doc, subject, pronoun=pronoun)
        self.matcher.remove(subject)

    def matchPatterns(self, sentence):
        doc = self.nlp(sentence.text)
        for subject in self.subjects:
            pattern = [{'POS': 'DET', 'OP': '?'}, {'POS': 'ADJ', 'OP': '*'},
                       {'LOWER': subject}, {'LEMMA': 'be'}, {'POS': 'ADJ'}]
            self.matchPattern(doc, pattern, subject)
        pattern = [{'LEMMA': '-PRON-'}, {'LEMMA': 'be'}, {'POS': 'ADJ'}]
        subject = ""
        for subject in list(self.subjects)[::-1]:
            if not self.detectBackground(self.subjects[subject][-1]):
                self.matchPattern(doc, pattern, subject, pronoun=True)
                break

    def classify(self, query):
        self.scene = {"objects": [], "backgrounds": []}
        self.subjects = {}
        doc = self.nlp(query)
        for i, sentence in enumerate(doc.sents):
            for chunk in sentence.noun_chunks:
                subject = chunk.root.text
                if chunk.root.lemma_ == "-PRON-":
                    continue
                descriptors = [word for word in chunk if word.text != subject]
                self.addSubjectDescriptors(subject, descriptors, chunk.root.ent_type_)
            self.matchPatterns(sentence)
        self.addSubjectsToScene()
        self.inferContext()
        return self.scene
matcher.add('Solar', None, pattern1, pattern2, pattern3)
doc = nlp(u'The solar-power industry is growing up now-a-days. Solar Power distribution is involving by govt and now solarpower is getting popular day by day')
find = matcher(doc)
# print(find)
for matched_id, start, end in find:
    string_id = nlp.vocab.strings[matched_id]  # find the name of the matched pattern
    span = doc[start:end]  # find the string the pattern matched
    print(matched_id, string_id, start, end, span)

# To remove the saved pattern from the matcher object...
matcher.remove('Solar')

pattern1 = [{'LOWER': 'solarpower'}]
# the '*' means punctuation can appear zero or more times
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP': '*'}, {'LOWER': 'power'}]
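# Re-registering the corrected patterns under the same key (v2 varargs
# signature, as used above), so the matcher can be reused after the remove:
matcher.add('Solar', None, pattern1, pattern2)
find = matcher(doc)  # matches "solarpower" and "solar-power" variants again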