import json

import requests
from flashtext import KeywordProcessor


def rm_main():
    ScienceEvent = {
        "scienza", "universo", "geologia", "biologia",
        "scientifico", "scientifica", "scienziato", "scienzata"
    }
    VisualArtEvent = {
        "pittura", "scultura", "artista", "artisti",
        "art", "opere", "opera", "part"
    }
    keyPArt = KeywordProcessor()
    keyPScience = KeywordProcessor()
    for element in ScienceEvent:
        keyPScience.add_keyword(element)
    for element in VisualArtEvent:
        keyPArt.add_keyword(element)

    url = 'https://raw.githubusercontent.com/andreamatt/KDI/master/dataset/muse.json'
    obj = json.loads(requests.get(url).text)
    eventsArray = obj["events"]
    while {} in eventsArray:
        eventsArray.remove({})
    for event in eventsArray:
        for k, v in event.items():
            event[k] = v.replace('\n', '; ')
        artList = keyPArt.extract_keywords(event["description"])
        scienceList = keyPScience.extract_keywords(event["description"])
        # Label each event by whichever keyword set matched more often.
        # (The original referenced undefined names `visual` and `science`;
        # string labels appear intended.)
        if len(artList) > len(scienceList):
            event.update({'Subcategory': 'visual art'})
        else:
            event.update({'Subcategory': 'science'})
    obj["events"] = eventsArray
    return json.dumps(obj)
def filter_keywords(data: pd.DataFrame,
                    keywords: Iterable[str],
                    source: Union[str, List[str]] = "text",
                    case_sensitive: bool = True) -> pd.DataFrame:
    """Filters out rows that do not have any keywords in any source column(s).

    Args:
        data: dataframe containing [source] column(s) of type str
        keywords: iterable of strings to search for in text
        source: column name or names containing the source text
        case_sensitive: toggle keyword case sensitivity

    Returns:
        Original dataframe with rows filtered out
    """
    # Get keyword processor and add keywords.
    # flashtext requires a real list, so materialize the iterable first.
    proc = KeywordProcessor(case_sensitive=case_sensitive)
    proc.add_keywords_from_list(list(keywords))

    # If there is a single source column, only one element per row needs
    # checking; otherwise apply any(...) across all source columns in each row.
    if isinstance(source, str):
        mask = data[source].apply(
            lambda sent: bool(proc.extract_keywords(sent)))
    else:
        mask = data[source].apply(lambda sents: any(
            bool(proc.extract_keywords(sent)) for sent in sents))

    output = data[mask]  # Use mask to filter out rows without any keywords
    return output
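A minimal usage sketch for filter_keywords above, assuming pandas and flashtext are installed; the toy DataFrame and keyword list are invented for illustration.

import pandas as pd

df = pd.DataFrame({"text": ["I love the Big Apple", "nothing to see here"]})
# Case-insensitive matching keeps only the first row.
filtered = filter_keywords(df, ["big apple"], source="text", case_sensitive=False)
print(filtered)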
def get_eduYrs(s):
    keyword_processor = KeywordProcessor()
    education_yrs = 0.0
    keyword_processor.add_keyword('four-year')
    keyword_processor.add_keyword('four years')
    sentences = sent_tokenize(s)
    selected_sentences = [
        sent for sent in sentences if "degree" in word_tokenize(sent)
    ]
    selected_sentences1 = [
        sent for sent in sentences if "Graduation" in word_tokenize(sent)
    ]
    for i in range(len(selected_sentences)):
        keywords_found = keyword_processor.extract_keywords(
            selected_sentences[i])
        if len(keywords_found) > 0:
            education_yrs = 4.0
    for i in range(len(selected_sentences1)):
        keywords_found = keyword_processor.extract_keywords(
            selected_sentences1[i])
        if len(keywords_found) > 0:
            education_yrs = 4.0
    return education_yrs
def find(model, keyword, types):
    """
    Easily find a metabolite or reaction by a keyword.

    Parameters:
        model (obj)       : model
        keyword (str)     : keyword to search for
        types (str|tuple) : 'met' or 'reac'

    Returns:
        matches (list)    : list of matches
    """
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keyword(keyword)
    matches = []
    if "met" in types:
        for met in model.smx.rnames:
            match = keyword_processor.extract_keywords(met)
            if match:
                matches.append(met)
    if "reac" in types:
        for reac in model.smx.cnames:
            match = keyword_processor.extract_keywords(reac)
            if match:
                matches.append(reac)
    return matches
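To exercise find without a real metabolic model, a hypothetical stub exposing the smx.rnames and smx.cnames attributes it reads is enough:

from types import SimpleNamespace

stub_model = SimpleNamespace(smx=SimpleNamespace(
    rnames=["glucose", "pyruvate"],        # metabolite names
    cnames=["glucose exchange", "pyk"]))   # reaction names
# Matches the metabolite "glucose" and the reaction "glucose exchange".
hits = find(stub_model, "glucose", types=("met", "reac"))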
def UploadAction(event=None):
    filename = filedialog.askopenfilename()
    pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
    img = Image.open(filename)
    text = pytesseract.image_to_string(img)
    print(text)
    kp = KeywordProcessor()
    kp.add_keyword('invoice', ('Monument', 'Taj Mahal'))
    kp.add_keyword('bill', ('Location', 'Delhi'))
    # The extracted keywords were previously discarded; print them instead.
    print(kp.extract_keywords(text))
def find_needles():
    needles = []
    start = time.time()
    db = Database(None)
    print("Connected to DB in", time.time() - start, "secs")

    start = time.time()
    db.execute("SELECT STR FROM `Final_consolidated`")
    print("Retrieved info from DB in", time.time() - start, "secs")

    start = time.time()
    for ele in db.cursor:
        needles.append(ele[0].lower())
    print("Listed needles in", time.time() - start, "secs")

    start = time.time()
    processor = KeywordProcessor()
    processor.add_keywords_from_list(needles)
    print("Processed needles in", time.time() - start, "secs")

    start = time.time()
    file = open("hello.txt", "r")
    for haystack in file.readlines():
        print("Before hyphen removal:")
        # span_info=True would yield (keyword, start, end) tuples and break the
        # string handling below, so only the keywords are extracted here.
        found = list(set(processor.extract_keywords(haystack.lower())))
        string = ""
        for ele in found:
            ele = ele.replace('"', '\\"')
            string += ',"' + ele + '"'
        print(string, "\n\n")

        haystack = haystack.replace("-", " ")
        print("After hyphen removal:")
        found = list(set(processor.extract_keywords(haystack.lower())))
        string = ""
        for ele in found:
            ele = ele.replace('"', '\\"')
            string += ',"' + ele + '"'
        print(string, "\n\n\n")

    # haystacks = []
    # db.execute("SELECT id, detailed_description FROM `sherlock_ct_new`.`trials`")
    # for row in db.cursor:
    #     if row[1]:
    #         haystack = row[1].lower()
    #         found = list(set(processor.extract_keywords(haystack.lower())))
    #         file.write(str(row[0]) + "," + json.dumps(found))

    file.close()
    print("Found needles within haystack in", time.time() - start, "secs")
def extract_brands(
    processor: KeywordProcessor,
    text: str,
    data_source_name: str,
    automatic_processing: bool,
) -> List[Prediction]:
    predictions = []

    for (brand_tag, brand), span_start, span_end in processor.extract_keywords(
            text, span_info=True):
        match_str = text[span_start:span_end]
        predictions.append(
            Prediction(
                type=PredictionType.brand,
                value=brand,
                value_tag=brand_tag,
                automatic_processing=automatic_processing,
                predictor=data_source_name,
                data={"text": match_str, "notify": False},
            ))

    return predictions
class FlashtextBackend(Backend):
    """This backend recognizes entities using the FlashText algorithm.

    See https://flashtext.readthedocs.io/en/latest/ for more information
    about the FlashText algorithm.
    """

    def __init__(self):
        self.keyword_processor = KeywordProcessor()

    def register_recognizer(self, recognizer_cls: Type[FlashtextRecognizer]):
        recognizer = recognizer_cls()
        # Encode tag, score, and recognizer name into the keyword key so they
        # can be recovered from each match in run().
        key = f"{recognizer.TAG}:{recognizer.SCORE}:{recognizer_cls.__name__}"
        keyword_dict = {key: recognizer.keywords}
        self.keyword_processor.add_keywords_from_dict(keyword_dict)

    def run(self, text):
        keywords = self.keyword_processor.extract_keywords(text, span_info=True)
        ents = []
        for keyword_key, start, end in keywords:
            tag, score, recognizer_name = keyword_key.split(":")
            ent = NamedEntity(start, end, tag, text[start:end], float(score),
                              recognizer_name)
            ents.append(ent)
        return ents
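A hedged sketch of driving FlashtextBackend; CityRecognizer is a hypothetical FlashtextRecognizer subclass invented for the example, carrying only the TAG, SCORE, and keywords attributes that register_recognizer reads.

class CityRecognizer(FlashtextRecognizer):  # hypothetical, for illustration
    TAG = "CITY"
    SCORE = 0.85
    keywords = ["berlin", "new york"]

backend = FlashtextBackend()
backend.register_recognizer(CityRecognizer)
# Each NamedEntity carries span offsets, tag, matched text, score, and the
# recognizer's class name, decoded from the "TAG:SCORE:Name" keyword key.
found = backend.run("Flights from berlin to new york are cheap.")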
def search_from_keywords(synonyms_list, sentences, keywords_with_weight):
    # synonyms_list is a list of dictionaries
    global keywords_rank
    final_result = []
    for index in range(len(synonyms_list)):
        synonyms_dict = synonyms_list[index]
        keywords_list = list(synonyms_dict.keys())
        threshold = 0.5 * len(keywords_list)
        if not keywords_list:
            final_result.append(None)
            continue
        keywords_rank = keywords_with_weight[index]
        matched_sentences = PriorityQueue()
        keyword_processor = KeywordProcessor()
        keyword_processor.add_keywords_from_dict(synonyms_dict)
        for sent in sentences:
            keywords_found = set(keyword_processor.extract_keywords(sent))
            if keywords_found:
                entry_obj = Entry(list(keywords_found), sent)
                # Keep only sentences that match at least half the keywords.
                if entry_obj.matched >= threshold:
                    matched_sentences.put(entry_obj)
        if not matched_sentences.empty():
            best_sentence = matched_sentences.get()
            final_result.append(best_sentence.sentence)
        else:
            final_result.append(None)
    return final_result
def get_DL(s):
    dl = False
    dl_valid = False
    dl_State = ""
    arr = ['driver', 'license']
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keyword('california')
    # The original tested an undefined name `qual`; the full text `s` appears intended.
    if any(re.findall('|'.join(arr), s)):
        dl = True
    if dl:
        sentences = sent_tokenize(s)
        selected_sentence = [
            sent for sent in sentences if "driver" in word_tokenize(sent)
        ]
        if len(selected_sentence) > 0:
            words = selected_sentence[0].split()
            selected_word = [word for word in words if "valid" in word]
            if len(selected_word) > 0:
                dl_valid = True
            for i in range(len(selected_sentence)):
                keywords_found = keyword_processor.extract_keywords(
                    selected_sentence[i])
                for j in range(len(keywords_found)):
                    if keywords_found[j] == 'california':
                        dl_State = "CA"
    dl_valid = "R" if dl_valid else "P"
    return dl_valid, dl_State
def get_expYrs(s):
    keyword_processor = KeywordProcessor()
    exp_yrs = 0.0
    keyword_processor.add_keyword('four-year')
    keyword_processor.add_keyword('four years')
    keyword_processor.add_keyword('three years')
    keyword_processor.add_keyword('one year')
    keyword_processor.add_keyword('two years')
    keyword_processor.add_keyword('six years')
    sentences = sent_tokenize(s)
    selected_sentences = [
        sent for sent in sentences if "experience" in word_tokenize(sent)
    ]
    # Map matched phrases to year counts; the last match wins, as in the
    # original chain of elifs.
    phrase_to_years = {
        'one year': 1.0, 'two years': 2.0, 'three years': 3.0,
        'four years': 4.0, 'four-year': 4.0, 'six years': 6.0,
    }
    for sent in selected_sentences:
        for keyword in keyword_processor.extract_keywords(sent):
            exp_yrs = phrase_to_years.get(keyword, exp_yrs)
    return exp_yrs
def get_college(s):
    college = ""
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keyword('college or university')
    keyword_processor.add_keyword('college')
    keyword_processor.add_keyword('university')
    keyword_processor.add_keyword('high school')
    sentences = sent_tokenize(s)
    for sentence in sentences:
        keywords_found = keyword_processor.extract_keywords(sentence)
        if keywords_found:
            # Take the first match per sentence; later sentences overwrite
            # earlier ones, as in the original control flow.
            college = keywords_found[0]
    return college
def extract_frequencies(sentence: Sentence, words: KeywordProcessor,
                        frequencies: MutableMapping) -> MutableMapping:
    """Extracts statistical information for each word in words and mwes and
    stores it in w_stats.

    Args:
        sentence: A Sentence object.
        words: KeywordProcessor containing the list of words to count
            (see _get_wlist()).
        frequencies: A dictionary with the frequencies to update. Keys are
            lemmas, values are WordFrequency objects.

    Returns:
        Reference to the updated dictionary.
    """
    if not frequencies:
        frequencies['last_id'] = 0
    all_lemmas = ' '.join([word.lemma for word in sentence])
    matched_indexes = [
        match[1]
        for match in words.extract_keywords(all_lemmas, span_info=True)
    ]
    matched_lemmas = [w for w in sentence if w.lemma_i in matched_indexes]
    for word in matched_lemmas:
        lemma = word.lemma
        if lemma not in frequencies:
            frequencies['last_id'] += 1
            frequencies[lemma] = WordFrequencies(lemma, frequencies['last_id'])
        norm_token = word.token
        # Only the normalized form of proper nouns is left capitalized.
        if not word.token.istitle():
            norm_token = norm_token.lower()
        frequencies[lemma].freq += 1
        frequencies[lemma].norm[norm_token] += 1
        frequencies[lemma].cap[_cap_type(word.token)] += 1
        frequencies[lemma].pos_dep[word.pos + "_" + word.dep] += 1
    return frequencies
class Entity(object):
    def __init__(self, nlp, keywords_list=None, keywords_dict=None,
                 keywords_file=None, label='',
                 attrs=('has_entities', 'is_entity', 'entity_desc', 'entities')):
        self._has_entities, self._is_entity, self._entity_desc, self._entities = attrs
        self.keyword_processor = KeywordProcessor()
        # Avoid mutable default arguments.
        self.keyword_processor.add_keywords_from_list(keywords_list or [])
        self.keyword_processor.add_keywords_from_dict(keywords_dict or {})
        if keywords_file:
            self.keyword_processor.add_keyword_from_file(keywords_file)
        self.label = label
        Doc.set_extension(self._has_entities, getter=self.has_entities)
        Doc.set_extension(self._entities, getter=self.iter_entities)
        Span.set_extension(self._has_entities, getter=self.has_entities)
        Span.set_extension(self._entities, getter=self.iter_entities)
        Token.set_extension(self._is_entity, default=False)
        Token.set_extension(self._entity_desc, getter=self.get_entity_desc)

    def __call__(self, doc):
        matches = self.keyword_processor.extract_keywords(doc.text,
                                                          span_info=True)
        spans = []  # keep spans here to merge them later
        for _, start, end in matches:
            entity = doc.char_span(start, end, label=self.label)
            if entity is None:  # skip matches that do not align to token bounds
                continue
            for token in entity:
                token._.set(self._is_entity, True)
            spans.append(entity)
            doc.ents = list(doc.ents) + [entity]
        for span in spans:
            span.merge()  # note: Span.merge is the legacy spaCy v2 API
        return doc

    def has_entities(self, tokens):
        return any(token._.get(self._is_entity) for token in tokens)

    def iter_entities(self, tokens):
        return [(t.text, i, t._.get(self._entity_desc))
                for i, t in enumerate(tokens)
                if t._.get(self._is_entity)]

    def get_entity_desc(self, token):
        return token.text
def __call__(self, from_file=False, file_path=None, stdout=False):
    watchlist = self.open_watchlist()
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_list(watchlist)
    if from_file:
        egg_path = _ask_spy_file()
    elif file_path:
        egg_path = file_path
    else:
        raise ValueError("Expected egg, from_file or file_path argument.")
    gui = Scanner(egg_path)
    for line, reaction in enumerate(open(egg_path).readlines(), start=1):
        if reaction and not reaction.startswith('#'):
            match = keyword_processor.extract_keywords(reaction)
            if match:
                matches = ', '.join(match)
                reaction = reaction.strip()
                gui.insert_data(line, matches, reaction)
                if stdout:
                    log.info(f"Line {line}: {matches} \n{reaction}")
def prediction(text):
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keyword('school')
    keyword_processor.add_keyword('bus')
    keyword_processor.add_keyword('!@#$%')
    keywords_found = keyword_processor.extract_keywords(text, span_info=True)
    return keywords_found
def find_keyword(keyword_dict, text_to_test):
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_dict(keyword_dict)
    result = keyword_processor.extract_keywords(text_to_test)
    return result
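A usage sketch for find_keyword: flashtext's add_keywords_from_dict expects a mapping from a clean name to its list of variants, and extract_keywords returns the clean names.

keyword_dict = {
    "java": ["java_2e", "java programing"],
    "product management": ["PM", "product manager"],
}
# Returns ['product management', 'java'] for the variants found in the text.
print(find_keyword(keyword_dict, "I am a product manager for a java_2e project"))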
def cleanDomain(self):
    L = []
    black = []
    keyword_processor = KeywordProcessor()
    f = open(self.whitePath, "r")
    for i in f.readlines():
        L.append(i.strip('\r\n'))
    keyword_processor.add_keywords_from_list(L)
    try:
        for i in self.param:  # domains waiting to be filtered
            (hostname, ip, ipBelong, firstVisitTime, lastestVisitTime,
             userSet, visitNum, similarityValue, imitate) = i.strip('\r\n').split(";")
            hostname = hostname.strip('\r\n')
            try:
                # if re.search("\." + j.strip('\r\n') + '$', hostname):
                if len(keyword_processor.extract_keywords(hostname)) == 0:
                    black.append(i)
            except Exception:
                traceback.print_exc()
    except Exception:
        traceback.print_exc()
        print("match error")
    finally:
        f.close()
    return black
def get_sentences_for_keyword(keywords, sentences):
    keyword_processor = KeywordProcessor()
    keyword_sentences = {}
    for word in keywords:
        word = word.strip()
        keyword_sentences[word] = []
        keyword_processor.add_keyword(word)
    for sentence in sentences:
        keywords_found = keyword_processor.extract_keywords(sentence)
        for key in keywords_found:
            keyword_sentences[key].append(sentence)
    for key in keyword_sentences.keys():
        values = keyword_sentences[key]
        values = sorted(values, key=len, reverse=True)
        keyword_sentences[key] = values
    # Drop keywords that never matched any sentence.
    delete_keys = [k for k in keyword_sentences if len(keyword_sentences[k]) == 0]
    for del_key in delete_keys:
        del keyword_sentences[del_key]
    return keyword_sentences
class PhraseFeatsFromTopicModel(FeatsFromTopicModelBase, FeatsFromSpacyDoc):
    '''
    This class allows you to make use of a topic model which has multi-token
    entries (i.e., terms in topics which have spaces in them). It requires
    Flashtext to be installed.
    '''

    def __init__(self,
                 topic_model,
                 use_lemmas=False,
                 entity_types_to_censor=set(),
                 entity_types_to_use=None,
                 tag_types_to_censor=set(),
                 strip_final_period=False,
                 keyword_processor_args={'case_sensitive': False}):
        from flashtext import KeywordProcessor
        self._keyword_processor = KeywordProcessor(**keyword_processor_args)
        self._topic_model = topic_model
        for keyphrase in reduce(lambda x, y: set(x) | set(y),
                                topic_model.values()):
            self._keyword_processor.add_keyword(keyphrase)
        FeatsFromSpacyDoc.__init__(self, use_lemmas, entity_types_to_censor,
                                   tag_types_to_censor, strip_final_period)
        FeatsFromTopicModelBase.__init__(self, topic_model)

    def get_top_model_term_lists(self):
        return self._topic_model

    def _get_terms_from_doc(self, doc):
        return Counter(self._keyword_processor.extract_keywords(doc))

    def get_feats(self, doc):
        return Counter(self._get_terms_from_doc(str(doc)))
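A small usage sketch of PhraseFeatsFromTopicModel, assuming the FeatsFromTopicModelBase and FeatsFromSpacyDoc base classes are importable; the two-topic model below is a toy, not from the original source.

toy_topic_model = {
    'economy': ['interest rate', 'stock market'],
    'health': ['public health', 'flu shot'],
}
feats = PhraseFeatsFromTopicModel(toy_topic_model)
# FlashText counts multi-token phrases such as "interest rate" as single terms.
term_counts = feats.get_feats("The interest rate rattled the stock market.")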
class FeatsFromTopicModel(FeatsFromTopicModelBase, FeatsFromSpacyDoc):
    def __init__(self,
                 topic_model,
                 use_lemmas=False,
                 entity_types_to_censor=set(),
                 entity_types_to_use=None,
                 tag_types_to_censor=set(),
                 strip_final_period=False,
                 keyword_processor_args={'case_sensitive': False}):
        self._keyword_processor = KeywordProcessor(**keyword_processor_args)
        self._topic_model = topic_model.copy()
        # Lower-case topic terms up front when matching case-insensitively.
        if keyword_processor_args.get('case_sensitive', None) is False:
            for k, v in self._topic_model.items():
                self._topic_model[k] = [e.lower() for e in v]
        for keyphrase in reduce(lambda x, y: set(x) | set(y),
                                self._topic_model.values()):
            self._keyword_processor.add_keyword(keyphrase)
        FeatsFromSpacyDoc.__init__(self, use_lemmas, entity_types_to_censor,
                                   tag_types_to_censor, strip_final_period)
        FeatsFromTopicModelBase.__init__(self, topic_model)

    def get_top_model_term_lists(self):
        return self._topic_model

    def _get_terms_from_doc(self, doc):
        return Counter(self._keyword_processor.extract_keywords(str(doc)))

    def get_feats(self, doc):
        return Counter(self._get_terms_from_doc(str(doc)))
def get_sentences_for_keyword(keywords, sentences):
    """
    For each keyword, find the sentence(s) that correspond to that keyword.
    """
    # KeywordProcessor is a fast alternative to naive keyword matching.
    keyword_processor = KeywordProcessor()
    keyword_sentences = {}

    # Register every keyword with the processor.
    for word in keywords:
        keyword_sentences[word] = []
        keyword_processor.add_keyword(word)

    # Collect the sentences in which each keyword appears.
    for sentence in sentences:
        keywords_found = keyword_processor.extract_keywords(sentence)
        for key in keywords_found:
            keyword_sentences[key].append(sentence)

    # Longest sentences first.
    for key in keyword_sentences.keys():
        keyword_sentences[key] = sorted(keyword_sentences[key], key=len,
                                        reverse=True)
    return keyword_sentences
def test_flashtext():
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keyword("Big Apple", "New York")
    keyword_processor.add_keyword("Bay Area")
    keywords_found = keyword_processor.extract_keywords(
        "I love big Apple and Bay Area.", span_info=True)
    print(keywords_found)
def extract_summary(self):
    result = {}
    result['Development Languages'] = []
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_dict(self.skills_list)
    for keyword in self.candidate_keywords:
        matched_keyword = keyword_processor.extract_keywords(
            self.stemmer.stem(keyword))
        if not matched_keyword:
            # print('"{}" is not matched in our skill database.'.format(keyword))
            continue
        for i in range(len(matched_keyword)):
            category, name = str(matched_keyword[i]).split(": ", 1)
            if category not in result:
                result[category] = {}
            if name not in result[category]:
                result[category][name] = []
            result[category][name].append(keyword)
            if category == 'Tools & Technologies':
                result['Development Languages'].append(keyword)
    return result
def searchkeywords(key, values, words):
    keyword_processor = KeywordProcessor()
    for term_1 in values:
        keyword_processor.add_keyword(term_1)
    keywords_found = keyword_processor.extract_keywords(str(words))
    # print("Set corresponding to " + key + str(set(keywords_found)))
    return list(set(keywords_found)), key
class PhraseSubPopulation(SubPopulation):
    r"""
    Filter samples based on a group of phrases.

    For example, with phrase_name = 'question'::

        sample 1: "Who is Jack.", score: 1
        sample 2: "I am Jack.", score: 0

    """

    def __init__(self, phrase_name='negation'):
        super().__init__()
        self.phrase_name = phrase_name
        if self.phrase_name == 'negation':
            self.phrases = NEGATION
        elif self.phrase_name == 'question':
            self.phrases = QUESTION
        self.phrase_processor = KeywordProcessor(case_sensitive=True)
        self.phrase_processor.add_keywords_from_dict(
            {self.phrase_name: self.phrases})

    def __repr__(self):
        return "PhraseSubPopulation" + "-" + self.phrase_name

    def phrase_match(self, text):
        # Any phrase hit counts as a match.
        result = self.phrase_processor.extract_keywords(text)
        return bool(result)

    def _score(self, sample, fields, **kwargs):
        """
        1 or 0 indicates whether the sample's fields match the phrase group.

        :param sample: data sample
        :param list fields: list of field str
        :param kwargs:
        :return int: score for sample
        """
        text = ' '.join([sample.get_text(field) for field in fields])
        match = self.phrase_match(text)
        return match

    def get_slice(self, scores, dataset):
        r"""
        Save the samples that match the phrase group.
        """
        sub_samples = []
        for i, sample in enumerate(dataset):
            if scores[i]:
                sub_samples.append(sample)
        return sub_samples
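A brief sketch of PhraseSubPopulation in use; NEGATION and QUESTION are phrase lists defined elsewhere in the original module, so actual matches depend on their contents.

sub_pop = PhraseSubPopulation(phrase_name='question')
# True whenever any registered question phrase occurs in the text.
is_question = sub_pop.phrase_match("Who is Jack.")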
def flashmatch():
    lentities = entities['name'].tolist()
    processor = KeywordProcessor()
    processor.add_keywords_from_list(lentities)
    keywords['entity'] = ''
    for i in range(len(keywords)):
        # .at avoids pandas chained-assignment warnings.
        keywords.at[i, 'entity'] = processor.extract_keywords(
            keywords['keywords'][i])
    nermatch()
    return keywords
def flash_text():
    # haystack = "Narendra Narendra Modi is the Prime Minister of India. He was a Chief Minister of Gujarat. Gujarat is a nice place"
    haystack = "sof/vel is a short form for sofosbuvir/velpatasvir"
    # needles = ["Narendra Modi", "Prime Minister", "Gujarat", "Modi", "Narendra"]
    needles = ["sof/vel", "sofosbuvir/velpatasvir"]
    processor = KeywordProcessor()
    processor.add_keywords_from_list(needles)
    found = processor.extract_keywords(haystack)
    print(found)
def extract_image_flag_flashtext(processor: KeywordProcessor, text: str):
    # Return a flag for the first match only.
    for (_, key), span_start, span_end in processor.extract_keywords(
            text, span_info=True):
        match_str = text[span_start:span_end]
        return {
            "text": match_str,
            "type": "text",
            "label": key,
        }
class _AnalyzeDocument(object):
    def __init__(self, document_id, ip_document, dictionary_terms):
        self.document_id = document_id
        self.ip_document = ip_document
        self.dictionary_terms = dictionary_terms
        self.keyword_processor = KeywordProcessor()
        self.keyword_processor.add_keywords_from_list(list(dictionary_terms.keys()))
        self.result = []

    def analyze(self, doc_location):
        for ip_paragraph in self.ip_document.get_paragraphs():
            if ip_paragraph.get_paragraph_index() < doc_location.get_paragraph_index():
                continue
            text = ip_paragraph.get_text()
            match_results = self.keyword_processor.extract_keywords(text, span_info=True)
            if not match_results:
                continue
            for match, start, end in match_results:
                if doc_location.get_paragraph_index() == ip_paragraph.get_paragraph_index():
                    if start < doc_location.get_start_offset():
                        continue
                sbh_uri = self.dictionary_terms[match]
                analyze_result = AnalyzeResult(ip_paragraph.get_paragraph_index(),
                                               match, sbh_uri, start, end - 1)
                self.result.append(analyze_result)

    def remove_first_occurrence(self, paragraph_index, matching_term, sbh_uri,
                                start_offset, end_offset):
        for index in reversed(range(len(self.result))):
            analyze_result = self.result[index]
            # If users want to manually enter an sbh_uri, allow them to remove
            # the current result as long as the term and the position where the
            # term occurs in the document match.
            if (analyze_result.get_paragraph_index() == paragraph_index
                    and analyze_result.get_matching_term() == matching_term
                    and analyze_result.get_start_offset() == start_offset
                    and analyze_result.get_end_offset() == end_offset):
                self.result.pop(index)
                return True
        return False

    def remove_all(self, term):
        removed_item = []
        for index in reversed(range(len(self.result))):
            analyze_result = self.result[index]
            if analyze_result.get_matching_term() == term:
                self.result.pop(index)
                removed_item.append(analyze_result)
        return removed_item

    def get_result(self):
        return self.result
#!/usr/bin/python3
# coding: utf-8
# http://blog.csdn.net/CoderPai/article/details/78574863
# https://github.com/vi3k6i5/flashtext

# Keyword extraction
from flashtext import KeywordProcessor

keyword_processor = KeywordProcessor()
# keyword_processor.add_keyword(<alias>, <canonical name>)
keyword_processor.add_keyword('广东', '广东省')
keyword_processor.add_keyword('北京')
keywords_found = keyword_processor.extract_keywords('北京与广东都是中国的')
print(keywords_found)  # ['北京', '广东省']

# Keyword replacement
keyword_processor.add_keyword('A卡', '邂逅a卡')
new_sentence = keyword_processor.replace_keywords('a卡与b卡哪个卡好')
print(new_sentence)  # Out[40]: '邂逅a卡与b卡哪个卡好'

# Keyword extraction with case sensitivity enabled
keyword_processor = KeywordProcessor(case_sensitive=True)
keyword_processor.add_keyword('A卡', '邂逅a卡')
keyword_processor.add_keyword('b卡')
keywords_found = keyword_processor.extract_keywords('a卡与b卡哪个卡好?')
print(keywords_found)  # ['b卡']