def find_keyword(keyword_dict, text_to_test):
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_dict(keyword_dict)
    result = keyword_processor.extract_keywords(text_to_test)
    return result
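# A minimal usage sketch for find_keyword. The dictionary maps clean names to
# lists of surface variants, which is the shape flashtext's
# add_keywords_from_dict expects; the sample values are illustrative.
sample_dict = {
    "java": ["java_2e", "java programing"],
    "product management": ["PM", "product manager"],
}
print(find_keyword(sample_dict, "I am a product manager for a java_2e project"))
# ['product management', 'java']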
def __init__(self, f_dict=None, prefix=''):
    '''
    Initialize the parser.

    Args:
        f_dict: filename, location of the replacement dictionary.
        prefix: string, text to prefix each replacement.
    '''
    self.logger = logging.getLogger(__name__)

    if f_dict is None:
        local_path = os.path.dirname(__file__)
        f_dict = os.path.join(local_path, self.f_MeSH)
        self.logger.debug('Using default dictionary: %s' % f_dict)

    if not os.path.exists(f_dict):
        msg = "Can't find dictionary {}".format(f_dict)
        self.logger.error(msg)
        raise IOError(msg)

    self.prefix = prefix

    terms = collections.defaultdict(list)
    with open(f_dict) as FIN:
        csvfile = csv.DictReader(FIN)
        for row in csvfile:
            terms[row["replacement"]].append(row['term'])

    self.FT = KeywordProcessor()
    self.FT.add_keywords_from_dict(terms)
def load_datasets(self, entity_code_location_string_dict):
    for entity_code_location_string in entity_code_location_string_dict:
        entity_code = entity_code_location_string["code"]
        location_string = entity_code_location_string["location"]
        # remember location string
        self.location_strings[entity_code] = location_string
        # load entities into dataset
        new_data = DatasetManager.load_dataset_from_location_string(
            location_string, {
                "term": str,
                "entity_code": str,
                "parent_terms": str
            })[0]
        self.dataset = self.dataset.append(new_data)

    # update flashtext
    self.flashtext = KeywordProcessor()
    data_for_flashtext = pd.DataFrame({
        "against": [
            "`{}``SN``{}`´".format(row["term"], row["entity_code"])
            if not row["parent_terms"] else
            "`{}``PN``{}``{}`´".format(row["term"], row["entity_code"],
                                       row["parent_terms"])
            for index, row in self.dataset.iterrows()
        ],
        "replace": self.dataset["term"],
    })
    dict_for_flashtext = data_for_flashtext.set_index("against").T.to_dict("list")
    self.flashtext.add_keywords_from_dict(dict_for_flashtext)
def test_remove_keywords_dictionary_len(self):
    """For each of the test cases initialize a new KeywordProcessor.
    Add the keywords from the test case to the KeywordProcessor.
    Remove the keywords in remove_keyword_dict.
    Compare the length of the processor with one built from the expected
    remaining keywords.
    """
    for test_id, test_case in enumerate(self.test_cases):
        keyword_processor = KeywordProcessor()
        keyword_processor.add_keywords_from_dict(test_case['keyword_dict'])
        keyword_processor.remove_keywords_from_dict(
            test_case['remove_keyword_dict'])
        kp_len = len(keyword_processor)

        new_dictionary = defaultdict(list)
        for key, values in test_case['keyword_dict'].items():
            for value in values:
                if not (key in test_case['remove_keyword_dict']
                        and value in test_case['remove_keyword_dict'][key]):
                    new_dictionary[key].append(value)

        keyword_processor_two = KeywordProcessor()
        keyword_processor_two.add_keywords_from_dict(new_dictionary)
        kp_len_two = len(keyword_processor_two)
        self.assertEqual(
            kp_len, kp_len_two,
            "keyword processor length doesn't match for Text ID {}".format(test_id))
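# Illustrative shape of one entry in self.test_cases, as assumed by the tests
# in this file (the concrete values are made up):
example_test_case = {
    'sentence': 'i like java_2e and product management',
    'keyword_dict': {'java': ['java_2e'],
                     'product management': ['product management']},
    'remove_keyword_dict': {'java': ['java_2e']},
}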
def _create_flashtext_object():
    """
    Instantiates a flashtext KeywordProcessor.

    Extra characters are registered as non-word boundaries so that they are
    treated as part of a word rather than as word separators.
    """
    keyword_processor = KeywordProcessor()
    # accented and special characters are added explicitly because the
    # flashtext library does not treat them as word characters by default
    for separator in [
            "-", "_", "/", "é", "è", "ê", "â", "ô", "ö", "ü", "û", "ù",
            "ï", "î", "æ",
    ]:
        keyword_processor.add_non_word_boundary(separator)
    return keyword_processor
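# A quick sketch of what the extra non-word boundaries change: with a default
# processor '-' splits words, with _create_flashtext_object() it does not
# (the example data is illustrative).
from flashtext import KeywordProcessor

kp_default = KeywordProcessor()
kp_default.add_keyword("denis")
print(kp_default.extract_keywords("saint-denis"))  # ['denis']

kp_custom = _create_flashtext_object()
kp_custom.add_keyword("denis")
print(kp_custom.extract_keywords("saint-denis"))  # []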
def __call__(self, from_file=False, file_path=False, stdout=False):
    watchlist = self.open_watchlist()
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_list(watchlist)

    if from_file:
        egg_path = _ask_spy_file()
    elif file_path:
        egg_path = file_path
    else:
        raise ValueError("Expected egg, from_file or file_path argument.")

    gui = Scanner(egg_path)
    for line, reaction in enumerate(open(egg_path).readlines(), start=1):
        if reaction and not reaction.startswith('#'):
            match = keyword_processor.extract_keywords(reaction)
            if match:
                matches = ', '.join(match)
                reaction = reaction.strip()
                gui.insert_data(line, matches, reaction)
                if stdout:
                    log.info(f"Line {line}: {matches} \n{reaction}")
class PhraseFeatsFromTopicModel(FeatsFromTopicModelBase, FeatsFromSpacyDoc):
    '''
    This class allows you to make use of a topic model which has multi-token
    entries (i.e., terms in topics which contain spaces). It requires
    flashtext to be installed.
    '''
    def __init__(self,
                 topic_model,
                 use_lemmas=False,
                 entity_types_to_censor=set(),
                 entity_types_to_use=None,
                 tag_types_to_censor=set(),
                 strip_final_period=False,
                 keyword_processor_args={'case_sensitive': False}):
        from flashtext import KeywordProcessor
        self._keyword_processor = KeywordProcessor(**keyword_processor_args)
        self._topic_model = topic_model
        for keyphrase in reduce(lambda x, y: set(x) | set(y),
                                topic_model.values()):
            self._keyword_processor.add_keyword(keyphrase)
        FeatsFromSpacyDoc.__init__(self, use_lemmas, entity_types_to_censor,
                                   tag_types_to_censor, strip_final_period)
        FeatsFromTopicModelBase.__init__(self, topic_model)

    def get_top_model_term_lists(self):
        return self._topic_model

    def _get_terms_from_doc(self, doc):
        return Counter(self._keyword_processor.extract_keywords(doc))

    def get_feats(self, doc):
        return Counter(self._get_terms_from_doc(str(doc)))
def used_func_for_fast_key_word_matching():
    # Load tokenizer
    path_stanford_corenlp_full_2017_06_09 = str(
        config.PRO_ROOT / 'dep_packages/stanford-corenlp-full-2017-06-09/*')
    drqa.tokenizers.set_default('corenlp_classpath',
                                path_stanford_corenlp_full_2017_06_09)
    tok = CoreNLPTokenizer(annotators=['pos', 'lemma', 'ner'])

    keyword_processor = KeywordProcessor(case_sensitive=True)
    id_to_key_dict = load_keyword_dict(config.DATA_ROOT / "id_dict.jsonl")

    # Written as a for loop to keep track of the progress
    for clean_name, keywords in tqdm(id_to_key_dict.items()):
        if not isinstance(keywords, list):
            raise AttributeError("Value of key {} should be a list".format(clean_name))
        for keyword in keywords:
            keyword_processor.add_keyword(keyword, clean_name)

    # Load data for predicting
    d_list = load_data(config.FEVER_DEV_JSONL)
    sample_answer(d_list, tok, keyword_p=keyword_processor)

    # Save the results for evaluation
    out_fname = config.RESULT_PATH / "doc_retri" / f"{utils.get_current_time_str()}_r" / "dev.jsonl"
    save_intermidiate_results(d_list, out_filename=out_fname)

    # Evaluation
    # out_fname = '/Users/Eason/RA/FunEver/results/doc_retri/2018_06_29_17:41:14_r/dev.jsonl'
    d_list = load_data(out_fname)
    eval_mode = {'check_doc_id_correct': True, 'standard': False}
    # print(fever_score(d_list, d_list, mode=eval_mode, error_analysis_file=Path(out_fname).parent / "analysis.log"))
    print(fever_score(d_list, d_list, mode=eval_mode))
class FeatsFromTopicModel(FeatsFromTopicModelBase, FeatsFromSpacyDoc):
    def __init__(self,
                 topic_model,
                 use_lemmas=False,
                 entity_types_to_censor=set(),
                 entity_types_to_use=None,
                 tag_types_to_censor=set(),
                 strip_final_period=False,
                 keyword_processor_args={'case_sensitive': False}):
        self._keyword_processor = KeywordProcessor(**keyword_processor_args)
        self._topic_model = topic_model.copy()
        if keyword_processor_args.get('case_sensitive', None) is False:
            for k, v in self._topic_model.items():
                self._topic_model[k] = [e.lower() for e in v]
        for keyphrase in reduce(lambda x, y: set(x) | set(y),
                                self._topic_model.values()):
            self._keyword_processor.add_keyword(keyphrase)
        FeatsFromSpacyDoc.__init__(self, use_lemmas, entity_types_to_censor,
                                   tag_types_to_censor, strip_final_period)
        FeatsFromTopicModelBase.__init__(self, topic_model)

    def get_top_model_term_lists(self):
        return self._topic_model

    def _get_terms_from_doc(self, doc):
        return Counter(self._keyword_processor.extract_keywords(str(doc)))

    def get_feats(self, doc):
        return Counter(self._get_terms_from_doc(str(doc)))
def processing(i, file_list):
    tk = RegexpTokenizer(r'\w\S+\w')

    # create the English stop-word list and map each stop word to a space
    en_stop = get_stop_words('en')
    stopword_processor = KeywordProcessor()
    for w in en_stop:
        stopword_processor.add_keyword(w, ' ')
    with open('stopword_processor.pkl', 'wb') as f:
        pickle.dump(stopword_processor, f)

    p_stemmer = PorterStemmer()

    with codecs.open('whole_dialogs_stem_%d' % i, 'w', 'utf-8') as out:
        for fi in tqdm(file_list):
            with codecs.open(fi, 'r', 'utf-8') as f:
                sentences = [
                    stopword_processor.replace_keywords(
                        line.strip().split('\t')[-1].lower())
                    for line in f
                ]
            words = functools.reduce(lambda x, y: x + y,
                                     map(tk.tokenize, sentences))
            words = map(p_stemmer.stem, words)
            out.write(' '.join(words) + '\n')
class FlashtextBackend(Backend):
    """This backend recognizes entities using the FlashText algorithm.

    See https://flashtext.readthedocs.io/en/latest/ for more information
    about the FlashText algorithm.
    """

    def __init__(self):
        self.keyword_processor = KeywordProcessor()

    def register_recognizer(self, recognizer_cls: Type[FlashtextRecognizer]):
        recognizer = recognizer_cls()
        # encode tag, score and recognizer name into the clean name
        key = f"{recognizer.TAG}:{recognizer.SCORE}:{recognizer_cls.__name__}"
        keyword_dict = {key: recognizer.keywords}
        self.keyword_processor.add_keywords_from_dict(keyword_dict)

    def run(self, text):
        keywords = self.keyword_processor.extract_keywords(text, span_info=True)
        ents = []
        for keyword_key, start, end in keywords:
            tag, score, recognizer_name = keyword_key.split(":")
            ent = NamedEntity(start, end, tag, text[start:end], float(score),
                              recognizer_name)
            ents.append(ent)
        return ents
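# A hypothetical recognizer to illustrate the register/run flow. TAG, SCORE
# and keywords are assumed class attributes of FlashtextRecognizer; the real
# interface may differ.
class CityRecognizer(FlashtextRecognizer):
    TAG = "CITY"
    SCORE = 0.9
    keywords = ["berlin", "paris"]

backend = FlashtextBackend()
backend.register_recognizer(CityRecognizer)
print(backend.run("Moving from berlin to paris."))
# two NamedEntity objects tagged CITY, with spans into the input text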
async def main(job_title: str, sh_links_we_already_have: list[str],
               skills: dict[str, list[str]]):
    # Import this function to collect vacancies for a given job title.
    fake_agent = get_user_agent()
    async with ClientSession(headers={
            "user-agent": fake_agent,
            "Connection": "close"
    }) as session:
        all_links = await scan_all_search_results(job_title, session)
        for _ in range(10):
            try:
                vacancies_without_skills = await fetch_all_vacancy_pages(
                    all_links, sh_links_we_already_have, session)
                keyword_processor = KeywordProcessor()
                keyword_processor.add_keywords_from_dict(skills)
                collected_jobs = (
                    process_vacancy_content(vacancy_without_skills,
                                            keyword_processor)
                    for vacancy_without_skills in vacancies_without_skills
                    if vacancy_without_skills is not None)
                await asyncio.sleep(60)
                return collected_jobs
            except OSError:
                logger.warning(f"🚨 OSError occurred for {job_title}.")
        # If we couldn't recover after the retries, return an empty list.
        await asyncio.sleep(60)
        return []
def get_DL(s):
    dl = False
    dl_valid = False
    dl_State = ""
    arr = ['driver', 'license']
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keyword('california')
    if any(re.findall('|'.join(arr), s)):
        dl = True
    if dl:
        sentences = sent_tokenize(s)
        selected_sentence = [
            sent for sent in sentences if "driver" in word_tokenize(sent)
        ]
        if len(selected_sentence) > 0:
            words = selected_sentence[0].split()
            if "valid" in words:
                dl_valid = True
        for i in range(len(selected_sentence)):
            keywords_found = keyword_processor.extract_keywords(
                selected_sentence[i])
            for j in range(len(keywords_found)):
                if keywords_found[j] == 'california':
                    dl_State = "CA"
    dl_valid = "R" if dl_valid else "P"
    return dl_valid, dl_State
def search_from_keywords(synonyms_list, sentences, keywords_with_weight):
    # synonyms_list is a list of dictionaries
    global keywords_rank
    final_result = []
    for index in range(len(synonyms_list)):
        synonyms_dict = synonyms_list[index]
        keywords_list = list(synonyms_dict.keys())
        threshold = 0.5 * len(keywords_list)
        if not keywords_list:
            final_result.append(None)
            continue
        keywords_rank = keywords_with_weight[index]
        matched_sentences = PriorityQueue()
        keyword_processor = KeywordProcessor()
        keyword_processor.add_keywords_from_dict(synonyms_dict)
        for sent in sentences:
            keywords_found = set(keyword_processor.extract_keywords(sent))
            if keywords_found:
                entry_obj = Entry(list(keywords_found), sent)
                if entry_obj.matched >= threshold:
                    matched_sentences.put(entry_obj)
        if not matched_sentences.empty():
            best_sentence = matched_sentences.get()
            final_result.append(best_sentence.sentence)
        else:
            final_result.append(None)
    return final_result
def rm_main():
    # subcategory labels (assumed values; 'visual' and 'science' were left
    # undefined in the original -- adjust to the taxonomy in use)
    visual = "VisualArt"
    science = "Science"

    ScienceEvent = {
        "scienza", "universo", "geologia", "biologia", "scientifico",
        "scientifica", "scienziato", "scienzata"
    }
    VisualArtEvent = {
        "pittura", "scultura", "artista", "artisti", "art", "opere",
        "opera", "part"
    }
    keyPArt = KeywordProcessor()
    keyPScience = KeywordProcessor()
    for element in ScienceEvent:
        keyPScience.add_keyword(element)
    for element in VisualArtEvent:
        keyPArt.add_keyword(element)

    url = 'https://raw.githubusercontent.com/andreamatt/KDI/master/dataset/muse.json'
    obj = json.loads(requests.get(url).text)
    eventsArray = obj["events"]
    while {} in eventsArray:
        eventsArray.remove({})
    for event in eventsArray:
        for k, v in event.items():
            event[k] = v.replace('\n', '; ')
        artList = keyPArt.extract_keywords(event["description"])
        scienceList = keyPScience.extract_keywords(event["description"])
        if len(artList) > len(scienceList):
            event.update({'Subcategory': visual})
        else:
            event.update({'Subcategory': science})
    obj["events"] = eventsArray
    return json.dumps(obj)
def load_dataset(self, location_string):
    # update location_string
    self.location_string = location_string

    # load data
    self.dataset = DatasetManager.load_dataset_from_location_string(
        location_string, {
            "term": str,
            "parent_terms": str
        })[0]

    # set up flashtext for later string replacements
    temp_replace_against_dataset = self.dataset.copy()
    temp_replace_against_dataset["replace"] = temp_replace_against_dataset["term"]
    temp_replace_against_dataset["against"] = temp_replace_against_dataset["replace"]
    temp_replace_against_dataset.loc[
        temp_replace_against_dataset["parent_terms"] != "", "against"] = (
            "`" + temp_replace_against_dataset["term"] + "``PK``" +
            temp_replace_against_dataset["parent_terms"] + "`´")
    temp_replace_against_dataset.loc[
        temp_replace_against_dataset["parent_terms"] == "", "against"] = (
            "`" + temp_replace_against_dataset["term"] + "``SK`´")
    temp_replace_against_dataset = temp_replace_against_dataset[[
        "replace", "against"
    ]]
    temp_replace_against_dataset_as_dict = {
        row["against"]: [row["replace"]]
        for index, row in temp_replace_against_dataset.iterrows()
    }
    self.flashtext = KeywordProcessor()
    self.flashtext.add_keywords_from_dict(temp_replace_against_dataset_as_dict)
class Tokenizer(object):
    def __init__(self):
        log.info("Tokenizer initialization")
        from .lookup_dic import _PhraseDictionary as __PD
        log.debug("Tokenizer: calls lookup_dic.read_phrases")
        self.lookup_dic = __PD()
        log.debug("Instantiate flashtext.KeywordProcessor")
        self.__keyword_processor = KeywordProcessor()
        log.debug("Insert data into the flashtext.KeywordProcessor instance.")
        self.__keyword_processor.add_keywords_from_dict(
            self.lookup_dic.lookup_dic_CODE)
        log.info("Tokenizer initialization successful")

    def tokenize(self, text):
        log.debug(f"Tokenizer called on {text}")
        log.debug("Phase I: Replacing phrases.")
        text = self.__keyword_processor.replace_keywords(text)
        log.debug("Phase II: Split by space.")
        tokens_list = text.split()
        log.debug("Phase III: Replace token ids back with their original forms.")
        tokens_list = [
            self.lookup_dic.reverse_replace(token)
            if token in self.lookup_dic.lookup_dic_CODE else token
            for token in tokens_list
        ]
        return tokens_list

    def __call__(self, text):
        return self.tokenize(text)
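# Sketch of the intended round-trip, assuming the phrase dictionary maps
# multi-word phrases to single-token codes (the output shown is illustrative):
tokenizer = Tokenizer()
print(tokenizer("machine learning is fun"))
# e.g. ['machine learning', 'is', 'fun'] if "machine learning" is a known phrase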
def load_gbif_categories(
        gbif_extract_file="../data/gbif_extract_categories.csv"):
    """Load the GBIF categories dataset.

    Input: csv file name.
    Output: flashtext keyword_processor.
    """
    df_canonicalName = pd.read_csv(gbif_extract_file, sep=";")
    print(
        gbif_extract_file,
        ", loaded",
        df_canonicalName.shape,
        list(df_canonicalName.columns),
    )
    kwords = list(df_canonicalName["name"].values)
    keyword_processor = KeywordProcessor()
    for k in kwords:
        tab = k.split(" ")
        for y in tab:
            if len(y) > 0:
                keyword_processor.add_keyword(y)
    print("len(keyword_processor):", len(keyword_processor))
    # ['family' 'genus' 'species' 'subspecies']
    print(df_canonicalName["rank"].unique())
    return keyword_processor
def __init__(self, **kwargs):
    self.label = kwargs.get('label', True)
    filename_to_load = kwargs.get('filename', None)
    self.skip_misses = kwargs.get('skip_misses', True)
    self.include_hashtags = kwargs.get('include_hashtags', True)

    # Load the keywords
    '''keywords = []
    with open(filename_to_load) as my_input:
        for line in my_input:
            phrase = line.strip().lower()
            keywords.append(phrase)
            if self.include_hashtags and not phrase.startswith('#'):
                phrase = '#' + phrase
                keywords.append(phrase)
    self.trie = Trie(keywords)'''

    # flashtext implementation
    self.trie = KeywordProcessor()
    with open(filename_to_load) as my_input:
        for line in my_input:
            phrase = line.strip().lower()
            self.trie.add_keyword(phrase)
            if self.include_hashtags and not phrase.startswith('#'):
                phrase = '#' + phrase
                self.trie.add_keyword(phrase)
def get_sentences_for_keyword(keywords, sentences):
    """
    For each keyword, find the sentence(s) that contain that keyword.
    """
    # flashtext is used as a fast alternative to regex-based keyword matching
    keyword_processor = KeywordProcessor()
    keyword_sentences = {}

    # register every keyword
    for word in keywords:
        keyword_sentences[word] = []
        keyword_processor.add_keyword(word)

    # scan each sentence for keywords
    for sentence in sentences:
        keywords_found = keyword_processor.extract_keywords(sentence)
        for key in keywords_found:
            keyword_sentences[key].append(sentence)

    # longest sentences first
    for key in keyword_sentences.keys():
        values = keyword_sentences[key]
        values = sorted(values, key=len, reverse=True)
        keyword_sentences[key] = values
    return keyword_sentences
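# Minimal usage sketch with illustrative data; matching is case-insensitive
# because KeywordProcessor defaults to case_sensitive=False:
sample_keywords = ["python", "keyword extraction"]
sample_sentences = ["Python makes keyword extraction easy.",
                    "flashtext is fast."]
print(get_sentences_for_keyword(sample_keywords, sample_sentences))
# {'python': ['Python makes keyword extraction easy.'],
#  'keyword extraction': ['Python makes keyword extraction easy.']}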
def test_replace_keywords(self):
    """For each of the test cases initialize a new KeywordProcessor.
    Add the keywords from the test case to the KeywordProcessor.
    Replace keywords and check if they match the expected result for the test case.
    """
    for test_id, test_case in enumerate(self.test_cases):
        keyword_replacer = KeywordProcessor()
        # To handle issues like https://github.com/vi3k6i5/flashtext/issues/8
        # white space in clean names is replaced with "_".
        for key, values in test_case['keyword_dict'].items():
            for value in values:
                keyword_replacer.add_keyword(value, key.replace(" ", "_"))
        new_sentence = keyword_replacer.replace_keywords(test_case['sentence'])

        # build the expected sentence with plain regex substitution
        replaced_sentence = test_case['sentence']
        keyword_mapping = {}
        for key, values in test_case['keyword_dict'].items():
            for value in values:
                keyword_mapping[value] = key.replace(" ", "_")
        for key in sorted(keyword_mapping, key=len, reverse=True):
            pattern = re.compile(r'(?<!\w){}(?!\w)'.format(re.escape(key)))
            replaced_sentence = pattern.sub(keyword_mapping[key],
                                            replaced_sentence)
        self.assertEqual(
            new_sentence, replaced_sentence,
            "new_sentence doesn't match the expected results for test case: {}".format(test_id))
def __init__(self):
    self.req_params = None
    self.keywords = []
    self.islands_visited = {}
    self.keyword_processor = KeywordProcessor()
    self.price_threshold = 0
    self.response = None
def __init__(self, crawl_type="", keywords=[]):
    super(StdOutListener, self).__init__()
    self.crawl_type = crawl_type
    self.keywords = keywords
    self.tweet_dump = []
    self.keyword_processor = KeywordProcessor()
    self.keyword_processor.add_keywords_from_list(self.keywords)
def make_ne_founder(DICT_FILE):
    keyword_processor = KeywordProcessor(case_sensitive=False)
    with open(DICT_FILE, 'r') as r:
        vocab_list = [vocab for vocab in r]
    for vocab in vocab_list:
        keyword_processor.add_keyword(vocab.strip('\n'))
    return keyword_processor
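# Illustrative usage; the vocabulary file (one entry per line) and its path
# are hypothetical:
ne_founder = make_ne_founder("data/ne_vocab.txt")
print(ne_founder.extract_keywords("Barack Obama visited Berlin."))
# vocabulary entries found in the sentence, matched case-insensitively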
def __init__(self, document_id, ip_document, dictionary_terms):
    self.document_id = document_id
    self.ip_document = ip_document
    self.dictionary_terms = dictionary_terms
    self.keyword_processor = KeywordProcessor()
    self.keyword_processor.add_keywords_from_list(list(dictionary_terms.keys()))
    self.result = []
def get_sentences_for_keyword(keywords, sentences):
    keyword_processor = KeywordProcessor()
    keyword_sentences = {}
    for word in keywords:
        word = word.strip()
        keyword_sentences[word] = []
        keyword_processor.add_keyword(word)
    for sentence in sentences:
        keywords_found = keyword_processor.extract_keywords(sentence)
        for key in keywords_found:
            keyword_sentences[key].append(sentence)

    for key in keyword_sentences.keys():
        values = keyword_sentences[key]
        values = sorted(values, key=len, reverse=True)
        keyword_sentences[key] = values

    # drop keywords that matched no sentences
    delete_keys = []
    for k in keyword_sentences.keys():
        if len(keyword_sentences[k]) == 0:
            delete_keys.append(k)
    for del_key in delete_keys:
        del keyword_sentences[del_key]
    return keyword_sentences
def extract_summary(self):
    result = {}
    result['Development Languages'] = []

    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_dict(self.skills_list)
    for keyword in self.candidate_keywords:
        matched_keyword = keyword_processor.extract_keywords(
            self.stemmer.stem(keyword))
        if not matched_keyword:
            # print('"{}" is not matched in our skill database.'.format(keyword))
            continue
        for i in range(len(matched_keyword)):
            category, name = str(matched_keyword[i]).split(": ", 1)
            if category not in result:
                result[category] = {}
            if name not in result[category]:
                result[category][name] = []
            result[category][name].append(keyword)
            if category == 'Tools & Technologies':
                result['Development Languages'].append(keyword)
    return result
def test_remove_keywords_dictionary_compare(self):
    """For each of the test cases initialize a new KeywordProcessor.
    Add the keywords from the test case to the KeywordProcessor.
    Remove the keywords in remove_keyword_dict.
    Compare the resulting keyword trie with one built from the expected
    remaining keywords.
    """
    for test_id, test_case in enumerate(self.test_cases):
        keyword_processor = KeywordProcessor()
        keyword_processor.add_keywords_from_dict(test_case['keyword_dict'])
        keyword_processor.remove_keywords_from_dict(
            test_case['remove_keyword_dict'])
        keyword_trie_dict = keyword_processor.keyword_trie_dict

        new_dictionary = defaultdict(list)
        for key, values in test_case['keyword_dict'].items():
            for value in values:
                if not (key in test_case['remove_keyword_dict']
                        and value in test_case['remove_keyword_dict'][key]):
                    new_dictionary[key].append(value)

        keyword_processor_two = KeywordProcessor()
        keyword_processor_two.add_keywords_from_dict(new_dictionary)
        keyword_trie_dict_two = keyword_processor_two.keyword_trie_dict
        self.assertTrue(
            keyword_trie_dict == keyword_trie_dict_two,
            "keywords_extracted don't match the expected results for test case: {}"
            .format(test_id))
def __init__(self, type_link, isLower, iscase_sensitive, currFlag):
    self.keyword_processor = KeywordProcessor(case_sensitive=iscase_sensitive)
    if type_link is None:
        return
    curr_dict = {
        "USD": ["$", "$", "dollars", "U.S. Dollar", "USD", "US$",
                "United States dollar"]
    }
    # Temporary logic for hierarchy identification -- remove
    # asset_dict = {"Assets": ["assets", "total assets"]}
    # curAss_Dict = {"Current Assets": ["current assets", "total current assets"]}
    # nonCurAss_Dict = {"NonCurrent Assets": ["longterm assets", "long-term assets", "non current assets", "non-current assets", "total noncurrent assets", "total non-current assets"]}
    # liab_Dict = {"Liabilities": ["liabilities", "liabilities and common shareholders equity", "liabilities and shareholders equity", "total liabilities", "total liabilities and equity", "total liabilities and shareholders equity", "total liabilities and stockholders equity", "total liabilities and common shareholders equity"]}
    # curLiab_Dict = {"Current Liabilities": ["current liabilities", "total current liabilities"]}
    # nonCurrLiab_Dict = {"NonCurrent Liabilities": ["non current liabilities", "non-current liabilities", "longterm liabilities", "long-term liabilities", "total noncurrent liabilities", "total non-current liabilities"]}
    # shrEq_Dict = {"Shareholders Equity": ["shareholders equity", "equity", "stockholders equity", "total equity", "total stockholders equity", "total common shareholders equity", "total wells fargo stockholders equity"]}
    with open(type_link, encoding='utf-8', errors='ignore') as inf:
        lines = inf.readlines()
        for line in lines:
            if len(line.strip()) > 0:
                if isLower:
                    self.keyword_processor.add_keyword(line.strip().lower())
                else:
                    self.keyword_processor.add_keyword(line.strip())
    if currFlag:
        self.keyword_processor.add_keywords_from_dict(curr_dict)
def cleanDomain(self):
    L = []
    black = []
    keyword_processor = KeywordProcessor()
    f = open(self.whitePath, "r")
    for i in f.readlines():
        L.append(i.strip('\r\n'))
    keyword_processor.add_keywords_from_list(L)
    try:
        for i in self.param:  # domains to be filtered
            hostname, ip, ipBelong, firstVisitTime, lastestVisitTime, userSet, visitNum, similarityValue, imitate = \
                i.strip('\r\n').split(";")
            hostname = hostname.strip('\r\n')
            try:
                # if re.search("\." + j.strip('\r\n') + '$', hostname):
                if len(keyword_processor.extract_keywords(hostname)) == 0:
                    black.append(i)
            except:
                traceback.print_exc()
    except:
        traceback.print_exc()
        print("match error")
    finally:
        f.close()
    return black
#!/usr/bin/python3
# coding: utf-8

# http://blog.csdn.net/CoderPai/article/details/78574863
# https://github.com/vi3k6i5/flashtext

# Keyword extraction
from flashtext import KeywordProcessor
keyword_processor = KeywordProcessor()
# keyword_processor.add_keyword(<alias>, <standard name>)
keyword_processor.add_keyword('广东', '广东省')
keyword_processor.add_keyword('北京')
keywords_found = keyword_processor.extract_keywords('北京与广东都是中国的')
print(keywords_found)  # ['北京', '广东省']

# Keyword replacement
keyword_processor.add_keyword('A卡', '邂逅a卡')
new_sentence = keyword_processor.replace_keywords('a卡与b卡哪个卡好')
print(new_sentence)  # Out[40]: '邂逅a卡与b卡哪个卡好'

# Keyword extraction with case sensitivity enabled
keyword_processor = KeywordProcessor(case_sensitive=True)
keyword_processor.add_keyword('A卡', '邂逅a卡')
keyword_processor.add_keyword('b卡')
keywords_found = keyword_processor.extract_keywords('a卡与b卡哪个卡好?')
print(keywords_found)  # ['b卡']