class Entity(object):

    def __init__(self, nlp, keywords_list=[], keywords_dict={}, keywords_file=None,
                 label='', attrs=('has_entities', 'is_entity', 'entity_desc', 'entities')):
        self._has_entities, self._is_entity, self._entity_desc, self._entities = attrs
        self.keyword_processor = KeywordProcessor()
        self.keyword_processor.add_keywords_from_list(keywords_list)
        self.keyword_processor.add_keywords_from_dict(keywords_dict)
        if keywords_file:
            self.keyword_processor.add_keyword_from_file(keywords_file)
        self.label = label
        Doc.set_extension(self._has_entities, getter=self.has_entities)
        Doc.set_extension(self._entities, getter=self.iter_entities)
        Span.set_extension(self._has_entities, getter=self.has_entities)
        Span.set_extension(self._entities, getter=self.iter_entities)
        Token.set_extension(self._is_entity, default=False)
        Token.set_extension(self._entity_desc, getter=self.get_entity_desc)

    def __call__(self, doc):
        matches = self.keyword_processor.extract_keywords(doc.text, span_info=True)
        spans = []  # keep spans here to merge them later
        for _, start, end in matches:
            entity = doc.char_span(start, end, label=self.label)
            if entity is None:  # skip matches that don't align with token boundaries
                continue
            for token in entity:
                token._.set(self._is_entity, True)
            spans.append(entity)
            doc.ents = list(doc.ents) + [entity]
        for span in spans:
            span.merge()
        return doc

    def has_entities(self, tokens):
        return any(token._.get(self._is_entity) for token in tokens)

    def iter_entities(self, tokens):
        return [(t.text, i, t._.get(self._entity_desc))
                for i, t in enumerate(tokens)
                if t._.get(self._is_entity)]

    def get_entity_desc(self, token):
        return token.text
def extract_summary(self):
    result = {}
    result['Development Languages'] = []
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_dict(self.skills_list)
    for keyword in self.candidate_keywords:
        matched_keyword = keyword_processor.extract_keywords(
            self.stemmer.stem(keyword))
        if not matched_keyword:
            # print('"{}" is not matched in our skill database.'.format(keyword))
            continue
        for match in matched_keyword:
            category, name = str(match).split(": ", 1)
            if category not in result:
                result[category] = {}
            if name not in result[category]:
                result[category][name] = []
            result[category][name].append(keyword)
            if category == 'Tools & Technologies':
                result['Development Languages'].append(keyword)
    return result
def find_keyword(keyword_dict, text_to_test):
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_dict(keyword_dict)
    result = keyword_processor.extract_keywords(text_to_test)
    return result
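# A minimal usage sketch for find_keyword (the dict below is illustrative,
# not from the original source): dict keys are the clean names returned on
# a match; only the values are searched for in the text.
fruits = {"apple": ["granny smith", "red delicious"], "pear": ["bartlett"]}
print(find_keyword(fruits, "She bought a granny smith and a bartlett."))
# expected: ['apple', 'pear']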
class FlashtextBackend(Backend):
    """This backend recognizes entities using the FlashText algorithm.

    See https://flashtext.readthedocs.io/en/latest/ for more information
    about the FlashText algorithm.
    """

    def __init__(self):
        self.keyword_processor = KeywordProcessor()

    def register_recognizer(self, recognizer_cls: Type[FlashtextRecognizer]):
        recognizer = recognizer_cls()
        key = f"{recognizer.TAG}:{recognizer.SCORE}:{recognizer_cls.__name__}"
        keyword_dict = {key: recognizer.keywords}
        self.keyword_processor.add_keywords_from_dict(keyword_dict)

    def run(self, text):
        keywords = self.keyword_processor.extract_keywords(text, span_info=True)
        ents = []
        for keyword_key, start, end in keywords:
            tag, score, recognizer_name = keyword_key.split(":")
            ent = NamedEntity(start, end, tag, text[start:end], float(score),
                              recognizer_name)
            ents.append(ent)
        return ents
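# A hypothetical usage sketch (CityRecognizer is invented here; only the
# TAG/SCORE/keywords attributes read by register_recognizer are assumed,
# and TAG must not contain ':' since run() splits the key on colons):
class CityRecognizer(FlashtextRecognizer):
    TAG = "CITY"
    SCORE = 1.0
    keywords = ["Berlin", "Paris"]

backend = FlashtextBackend()
backend.register_recognizer(CityRecognizer)
print(backend.run("Trains from Berlin to Paris run daily."))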
def test_remove_keywords_dictionary_len(self):
    """For each of the test cases, initialize a new KeywordProcessor.
    Add the keywords from the test case to the KeywordProcessor.
    Remove the keywords in remove_keyword_dict.
    Extract keywords and check if they match the expected result for the test case.
    """
    for test_id, test_case in enumerate(self.test_cases):
        keyword_processor = KeywordProcessor()
        keyword_processor.add_keywords_from_dict(test_case['keyword_dict'])
        keyword_processor.remove_keywords_from_dict(
            test_case['remove_keyword_dict'])
        kp_len = len(keyword_processor)

        new_dictionary = defaultdict(list)
        for key, values in test_case['keyword_dict'].items():
            for value in values:
                if not (key in test_case['remove_keyword_dict']
                        and value in test_case['remove_keyword_dict'][key]):
                    new_dictionary[key].append(value)

        keyword_processor_two = KeywordProcessor()
        keyword_processor_two.add_keywords_from_dict(new_dictionary)
        kp_len_two = len(keyword_processor_two)
        self.assertEqual(
            kp_len, kp_len_two,
            "keyword processor length doesn't match for Text ID {}".format(test_id))
async def main(job_title: str, sh_links_we_already_have: list[str],
               skills: dict[str, list[str]]):
    # Import this function to collect vacancies for a given job title.
    fake_agent = get_user_agent()
    async with ClientSession(headers={
            "user-agent": fake_agent,
            "Connection": "close"
    }) as session:
        all_links = await scan_all_search_results(job_title, session)
        for _ in range(10):
            try:
                vacancies_without_skills = await fetch_all_vacancy_pages(
                    all_links, sh_links_we_already_have, session)
                keyword_processor = KeywordProcessor()
                keyword_processor.add_keywords_from_dict(skills)
                collected_jobs = (
                    process_vacancy_content(vacancy_without_skills,
                                            keyword_processor)
                    for vacancy_without_skills in vacancies_without_skills
                    if vacancy_without_skills is not None)
                await asyncio.sleep(60)
                return collected_jobs
            except OSError:
                logger.warning(f"🚨 OSError occurred for {job_title}.")
        # If we couldn't recover after the errors, return an empty list.
        await asyncio.sleep(60)
        return []
class Tokenizer(object):

    def __init__(self):
        log.info("Tokenizer initialization")
        from .lookup_dic import _PhraseDictionary as __PD
        log.debug("Tokenizer: calls lookup_dic.read_phrases")
        self.lookup_dic = __PD()
        log.debug("Instantiate flashtext.KeywordProcessor")
        self.__keyword_processor = KeywordProcessor()
        log.debug("Insert data into the flashtext.KeywordProcessor instance.")
        self.__keyword_processor.add_keywords_from_dict(
            self.lookup_dic.lookup_dic_CODE)
        log.info("Tokenizer initialization successful")

    def tokenize(self, text):
        log.debug(f"Tokenizer called on {text}")
        log.debug("Phase I: Replacing phrases.")
        text = self.__keyword_processor.replace_keywords(text)
        log.debug("Phase II: Split by space.")
        tokens_list = text.split()
        log.debug("Phase III: Replace token ids back with their original forms.")
        tokens_list = [
            self.lookup_dic.reverse_replace(token)
            if token in self.lookup_dic.lookup_dic_CODE else token
            for token in tokens_list
        ]
        return tokens_list

    def __call__(self, text):
        return self.tokenize(text)
def search_from_keywords(synonyms_list, sentences, keywords_with_weight):
    # synonyms_list is a list of dictionaries
    global keywords_rank
    final_result = []
    for index in range(len(synonyms_list)):
        synonyms_dict = synonyms_list[index]
        keywords_list = list(synonyms_dict.keys())
        threshold = 0.5 * len(keywords_list)
        if not keywords_list:
            final_result.append(None)
            continue
        keywords_rank = keywords_with_weight[index]
        matched_sentences = PriorityQueue()
        keyword_processor = KeywordProcessor()
        keyword_processor.add_keywords_from_dict(synonyms_dict)
        for sent in sentences:
            keywords_found = set(keyword_processor.extract_keywords(sent))
            if keywords_found:
                entry_obj = Entry(list(keywords_found), sent)
                if entry_obj.matched >= threshold:
                    matched_sentences.put(entry_obj)
        if not matched_sentences.empty():
            best_sentence = matched_sentences.get()
            final_result.append(best_sentence.sentence)
        else:
            final_result.append(None)
    return final_result
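# A hypothetical sketch of the Entry class assumed by search_from_keywords
# (not part of the original source; a real implementation might also weight
# matches by keywords_rank). queue.PriorityQueue returns the "smallest" item
# first, so __lt__ ranks entries with more matched keywords as smaller,
# which makes get() return the best-matching sentence.
class Entry:
    def __init__(self, keywords, sentence):
        self.keywords = keywords
        self.sentence = sentence
        self.matched = len(keywords)  # compared against the 0.5 threshold above

    def __lt__(self, other):
        return self.matched > other.matched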
def test_remove_keywords_len(self):
    """For each of the test cases, initialize a new KeywordProcessor.
    Add the keywords from the test case to the KeywordProcessor.
    Remove the keywords in remove_keyword_dict.
    Extract keywords and check if they match the expected result for the test case.
    """
    for test_id, test_case in enumerate(self.test_cases):
        keyword_processor = KeywordProcessor()
        keyword_processor.add_keywords_from_dict(test_case['keyword_dict'])
        # check length
        kp_len = len(keyword_processor)
        kp_len_expected = sum(
            len(values) for key, values in test_case['keyword_dict'].items())
        self.assertEqual(
            kp_len, kp_len_expected,
            "keyword processor length doesn't match for Text ID {}".format(test_id))

        keyword_processor.remove_keywords_from_dict(
            test_case['remove_keyword_dict'])
        # check length after removal
        kp_len = len(keyword_processor)
        kp_len_decreased = sum(
            len(values)
            for key, values in test_case['remove_keyword_dict'].items())
        self.assertEqual(
            kp_len, kp_len_expected - kp_len_decreased,
            "keyword processor length doesn't match for Text ID {}".format(test_id))
def test_remove_keywords_dictionary_compare(self):
    """For each of the test cases, initialize a new KeywordProcessor.
    Add the keywords from the test case to the KeywordProcessor.
    Remove the keywords in remove_keyword_dict.
    Extract keywords and check if they match the expected result for the test case.
    """
    for test_id, test_case in enumerate(self.test_cases):
        keyword_processor = KeywordProcessor()
        keyword_processor.add_keywords_from_dict(test_case['keyword_dict'])
        keyword_processor.remove_keywords_from_dict(
            test_case['remove_keyword_dict'])
        keyword_trie_dict = keyword_processor.keyword_trie_dict

        new_dictionary = defaultdict(list)
        for key, values in test_case['keyword_dict'].items():
            for value in values:
                if not (key in test_case['remove_keyword_dict']
                        and value in test_case['remove_keyword_dict'][key]):
                    new_dictionary[key].append(value)

        keyword_processor_two = KeywordProcessor()
        keyword_processor_two.add_keywords_from_dict(new_dictionary)
        keyword_trie_dict_two = keyword_processor_two.keyword_trie_dict
        self.assertTrue(
            keyword_trie_dict == keyword_trie_dict_two,
            "keywords_extracted don't match the expected results for test case: {}"
            .format(test_id))
def test_add_keyword_file_missing(self):
    keyword_processor = KeywordProcessor()
    # Dictionary values must be lists of unclean names; plain strings make
    # add_keywords_from_dict raise an AttributeError.
    keyword_dict = {
        "java": "java_2e",
        "product management": "product manager"
    }
    with pytest.raises(AttributeError):
        keyword_processor.add_keywords_from_dict(keyword_dict)
class PhraseSubPopulation(SubPopulation):
    r"""
    Filter samples based on a group of phrases.

    For example, with phrase_name = 'question'::

        sample 1: "Who is Jack.", score: 1
        sample 2: "I am Jack.", score: 0

    """

    def __init__(self, phrase_name='negation'):
        super().__init__()
        self.phrase_name = phrase_name
        if self.phrase_name == 'negation':
            self.phrases = NEGATION
        elif self.phrase_name == 'question':
            self.phrases = QUESTION
        self.phrase_processor = KeywordProcessor(case_sensitive=True)
        self.phrase_processor.add_keywords_from_dict(
            {self.phrase_name: self.phrases})

    def __repr__(self):
        return "PhraseSubPopulation" + "-" + self.phrase_name

    def phrase_match(self, text):
        match = False
        # Search for phrases
        result = self.phrase_processor.extract_keywords(text)
        if result:
            match = True
        return match

    def _score(self, sample, fields, **kwargs):
        """
        1 or 0 indicates whether the sample fields match the phrase groups.

        :param sample: data sample
        :param list fields: list of field str
        :param kwargs:
        :return int: score for sample
        """
        text = ' '.join([sample.get_text(field) for field in fields])
        match = self.phrase_match(text)
        return match

    def get_slice(self, scores, dataset):
        r"""
        Save the samples that match the phrase groups.
        """
        sub_samples = []
        for i, sample in enumerate(dataset):
            if scores[i]:
                sub_samples.append(sample)
        return sub_samples
def init():
    global __keyword_processor
    global __reverse_lookup_dict
    global lookup_dic_CODE
    global pattern
    lookup_dic.read_phrases()
    __keyword_processor = KeywordProcessor()
    __keyword_processor.add_keywords_from_dict(lookup_dic.lookup_dic_CODE)
def replace(text):
    kwp = KeywordProcessor()
    # Emoji are not word characters, so clear the non-word boundaries to let
    # FlashText match them mid-string.
    kwp.non_word_boundaries = set()
    kwp.add_keywords_from_dict(
        {" {} ".format(v): [k] for k, v in UNICODE_EMOJI.items()})
    clean_text = kwp.replace_keywords(text).strip()
    return clean_text
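# Usage sketch, assuming UNICODE_EMOJI (e.g. from an older release of the
# `emoji` package) maps each emoji character to a ":name:" style alias;
# under that assumption each emoji is replaced by its padded alias:
print(replace("🍕"))  # e.g. ":pizza:" (the exact alias depends on the mapping)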
def prepare_keyword_processor(synonyms: Dict[str, list]) -> KeywordProcessor:
    """
    28x faster than a compiled regexp for 1,000 keywords:
    https://github.com/vi3k6i5/flashtext

    :param synonyms: dict of entity synonyms
    :return: a KeywordProcessor loaded with the synonyms
    """
    kp = KeywordProcessor(case_sensitive=True)
    kp.add_keywords_from_dict(synonyms)
    return kp
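# A usage sketch with an illustrative synonym dict (not from the original
# source). Note case_sensitive=True: lowercase "nyc" would not match here.
kp = prepare_keyword_processor({"New York": ["NYC", "new york"]})
print(kp.extract_keywords("Flights to NYC are delayed."))  # ['New York']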
def keyword_processor_generation():
    keyword_processor = KeywordProcessor()
    with open('merge_syno.txt', 'r', encoding='utf-8') as f:
        data = f.readlines()
    word = []
    sys_word = []
    for i in data:
        word_item, sys_word_item = i.strip('\n').split(' ')
        word.append(word_item)
        sys_word.append(sys_word_item.split())
    keyword_dict = dict(zip(word, sys_word))
    keyword_processor.add_keywords_from_dict(keyword_dict)
    return keyword_processor
def count_corpus_mentions_cooccurs(mentions: Dict[str, List[str]]) -> Any:
    mention2idx = get_mention2idx_dict(mentions)

    def count_cooccur(
        bag: Iterable[int],
        cooccur: Dict[int, Dict[int, int]] = defaultdict(lambda: defaultdict(int))
    ) -> Dict[int, Dict[int, int]]:
        for a in bag:
            for b in bag:
                cooccur[a][b] += 1
        return cooccur

    nlp = spacy.load('en_core_web_sm')
    processor = KeywordProcessor()
    processor.add_keywords_from_dict(mentions)

    sent_cooccur, atk_cooccur = defaultdict(
        lambda: defaultdict(int)), defaultdict(lambda: defaultdict(int))
    atk_bags = []
    for i, hit in enumerate(es.scan_scraper_page("*cnbc*")):
        if i % 100 == 0:
            print(i)
        if i > 100:
            break
        sents = []
        try:
            sents.append(hit.article_title)
        except Exception:
            pass
        try:
            sents += [sent.text for sent in nlp(hit.article_text).sents]
        except Exception:
            pass
        atk_bag = set()
        for sent in sents:
            kws = processor.extract_keywords(sent, span_info=False)
            sent_bag = [mention2idx[kw] for kw in kws]
            sent_cooccur = count_cooccur(sent_bag, sent_cooccur)
            atk_bag |= set(sent_bag)
        atk_cooccur = count_cooccur(atk_bag, atk_cooccur)
        atk_bags.append(atk_bag)
    return (
        # how many times two entities co-occur within one sentence:
        # {a: {a: 4, b: 2, c: 1}, b: {...}}
        dict(sent_cooccur),
        # how many times two entities co-occur within one article:
        # {a: {a: 4, b: 2, c: 1}, b: {...}}
        dict(atk_cooccur),
        # the entities appearing in each article, collected into a bag:
        # [{a, b, c}, {a, c, f}, ...]
        set(map(frozenset, atk_bags)),
    )
def main(job_title: str, indeed_links_we_already_have: list[str],
         skills: dict[str, list[str]]):
    # Main flow of parsing the vacancies and counting the relevant skills.
    all_links = scan_all_search_results(job_title)
    vacancies_without_skills = fetch_all_vacancy_pages(
        all_links, indeed_links_we_already_have)
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_dict(skills)
    collected_jobs = (process_vacancy_content(vacancy_without_skills,
                                              keyword_processor)
                      for vacancy_without_skills in vacancies_without_skills
                      if vacancy_without_skills is not None)
    return collected_jobs
def test_remove_keywords(self):
    """For each of the test cases, initialize a new KeywordProcessor.
    Add the keywords from the test case to the KeywordProcessor.
    Remove the keywords in remove_keyword_dict.
    Extract keywords and check if they match the expected result for the test case.
    """
    for test_id, test_case in enumerate(self.test_cases):
        keyword_processor = KeywordProcessor()
        keyword_processor.add_keywords_from_dict(test_case['keyword_dict'])
        keyword_processor.remove_keywords_from_dict(test_case['remove_keyword_dict'])
        keywords_extracted = keyword_processor.extract_keywords(test_case['sentence'])
        self.assertEqual(
            keywords_extracted, test_case['keywords'],
            "keywords_extracted don't match the expected results for test case: {}"
            .format(test_id))
def test_dictionary_loading(self):
    keyword_processor = KeywordProcessor()
    keyword_dict = {
        "java": ["java_2e", "java programing"],
        "product management": ["product management techniques", "product management"]
    }
    keyword_processor.add_keywords_from_dict(keyword_dict)

    sentence = 'I know java_2e and product management techniques'
    keywords_extracted = keyword_processor.extract_keywords(sentence)
    self.assertEqual(keywords_extracted, ['java', 'product management'],
                     "Failed file format one test")
    sentence_new = keyword_processor.replace_keywords(sentence)
    self.assertEqual(sentence_new, "I know java and product management",
                     "Failed file format one test")
async def extract_keywords(article: ArticleContent):
    try:
        keyword_processor = KeywordProcessor()
        keyword_dict = {}
        for keyword in article.keywords:
            keyword_dict[keyword.name] = keyword.keywords
        keyword_processor.add_keywords_from_dict(keyword_dict)
        found_keywords = keyword_processor.extract_keywords(article.text)
        return found_keywords
    except Exception as e:
        # str(e) keeps the error detail JSON-serializable
        raise HTTPException(status_code=500, detail=str(e))
class KeyProc:
    with open('./data/cleaned_name.pickle', 'rb') as handle:
        dict_cleaned_name = pickle.load(handle)

    def __init__(self, keyword_dict=dict_cleaned_name):
        # self.text = text
        self.kp = KeywordProcessor()
        self.kp.add_keywords_from_dict(keyword_dict)

    def extractKeywords(self, text):
        return self.kp.extract_keywords(text)

    def replaceKeywords(self, text):
        return self.kp.replace_keywords(text)
class Entity(object):
    name = 'entity'

    def __init__(self, nlp, keywords_list=[], keywords_dict={}, keywords_file=None,
                 label='', attrs=('has_entities', 'is_entity', 'entity_desc', 'entities')):
        self._has_entities, self._is_entity, self._entity_desc, self._entities = attrs
        self.keyword_processor = KeywordProcessor()
        self.keyword_processor.add_keywords_from_list(keywords_list)
        self.keyword_processor.add_keywords_from_dict(keywords_dict)
        if keywords_file:
            self.keyword_processor.add_keyword_from_file(keywords_file)
        self.label = label

        # Add attributes
        Doc.set_extension(self._has_entities, getter=self.has_entities)
        Doc.set_extension(self._entities, getter=self.iter_entities)
        Span.set_extension(self._has_entities, getter=self.has_entities)
        Span.set_extension(self._entities, getter=self.iter_entities)
        Token.set_extension(self._is_entity, default=False)
        Token.set_extension(self._entity_desc, getter=self.get_entity_desc)

    def __call__(self, doc):
        matches = self.keyword_processor.extract_keywords(doc.text, span_info=True)
        for _, start, end in matches:
            entity = doc.char_span(start, end, label=self.label)
            if entity:
                for token in entity:
                    token._.set(self._is_entity, True)
                # Extend doc.ents with the new entity – be careful not to
                # replace the existing entities!
                doc.ents = list(doc.ents) + [entity]
        return doc

    def has_entities(self, tokens):
        return any(token._.get(self._is_entity) for token in tokens)

    def iter_entities(self, tokens):
        return [(t.text, i, t._.get(self._entity_desc))
                for i, t in enumerate(tokens)
                if t._.get(self._is_entity)]

    def get_entity_desc(self, token):
        return token.text
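# A minimal pipeline sketch, assuming spaCy v2.x (the component relies on
# the v2-style set_extension/add_pipe APIs); the keywords are illustrative:
import spacy

nlp = spacy.load('en_core_web_sm')
entity = Entity(nlp, keywords_list=['python', 'machine learning'], label='SKILL')
nlp.add_pipe(entity, last=True)
doc = nlp("I use python for machine learning.")
print(doc._.has_entities)  # True
print(doc._.entities)      # [(token_text, token_index, entity_desc), ...]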
def find_skills_in_resume(text_from_resume: str) -> set[str]:
    # Extract from the resume the set of unique skills to be matched against the vacancies.
    skills_from_db = cache.get("skills_from_db")
    # Cache skills for 12 hours – an additional check despite the custom cron
    # command that warms up the cache.
    if skills_from_db is None:
        skills_from_db = list(Skill.objects.all())
        cache.set("skills_from_db", skills_from_db, 12 * 60 * 60)
    skills = {
        skill.clean_name: ast.literal_eval(skill.unclean_names)
        for skill in skills_from_db
    }
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_dict(skills)
    skills_from_resume = set(
        keyword_processor.extract_keywords(text_from_resume))
    return skills_from_resume
class Keywords_Classifier:

    def __init__(self, dic_dir):
        self.dic_dir = Path(dic_dir)
        self.keywords_dict = {}
        self.processor = None
        self.label_dict = defaultdict(set)
        self.label_dict['other'] = set()
        type_of_dic = [i.stem for i in self.dic_dir.glob('*.dic')]
        for label in type_of_dic:
            self.label_dict[label] = set()

    def build_keyword_dict(self):
        '''Read the .dic text files, each containing one dictionary whose
        words or phrases are arranged line by line.'''
        dic_files = self.dic_dir.glob('**/*.dic')
        pbar = tqdm(dic_files)
        for dic_file in pbar:
            with dic_file.open() as f:
                dic_name = dic_file.stem
                dic_value = [word.strip() for word in f.readlines()]
                self.keywords_dict[dic_name] = dic_value
                pbar.set_description(f"Loaded the {dic_name} dictionary")
        self.processor = KeywordProcessor(case_sensitive=False)
        self.processor.add_keywords_from_dict(self.keywords_dict)

    def query(self, phrase):
        '''Query a keyword to get its related categories.'''
        cat = self.processor.extract_keywords(phrase)
        return cat

    def classifier(self, keywords):
        '''Take a list of keywords and return a dict whose keys are the
        labels of the words and whose values are sets of keywords.'''
        for i, keyword in enumerate(keywords):
            result = self.query(keyword)
            if result:
                for each_label in result:
                    self.label_dict[each_label].add(keyword)
            else:
                self.label_dict['other'].add(keyword)
        return self.label_dict.items()
def process(self, message, **kwargs):
    """Retrieve the text message, pass it to the classifier and append
    the prediction results to the message class."""
    # Extract the app entity
    app_keyword_processor = KeywordProcessor()
    query = InFo.select(InFo.content).where(InFo.name == "app_kw_dict")
    app_dict = ast.literal_eval(query[0].content)
    app_keyword_processor.add_keywords_from_dict(app_dict)
    # print(f'query: {message.text}')
    try:
        app_name = app_keyword_processor.extract_keywords(message.text)[0]
        # print(f'app_name slot: {app_name}')
        entity = self.save_to_app(app_name)
        message.set("entities", [entity], add_to_output=True)
    except Exception as ex:
        print(ex)
def custom_entity_extraction(uuid, date, scenario_id, text, dic_ref):
    """
    Goes through the news for the Custom Entities that we defined.
    """
    columns = [
        "story_uuid", "text", "label", "salience", "published_date",
        "scenario_id", "wiki", "mentions"
    ]
    processor = KeywordProcessor()
    processor.add_keywords_from_dict(dic_ref)
    found = processor.extract_keywords(text, span_info=True)

    dic = defaultdict(list)
    for label, start, end in found:
        dic[label].append(text[start:end])

    final = pd.DataFrame()
    for key in dic.keys():
        df = pd.DataFrame({'label': key, 'text': dic[key], 'mentions': 1})
        # DataFrame.append was removed in pandas 2.0; concat is equivalent.
        final = pd.concat([final, df])
    logging.info("{} Custom Entities found.".format(final.shape[0]))
    if final.shape[0] == 0:
        return []
    final = final.groupby(['label', 'text'], as_index=False)['mentions'].agg('sum')
    final['story_uuid'] = uuid
    final['published_date'] = date
    final['scenario_id'] = scenario_id
    final['salience'] = 0
    final['wiki'] = "custom"
    return final[columns].values.tolist()
def for_english():
    # all_posts = load_posts('analysis/_2_remove_unrelated_data/english.json')
    # The line above is kept to check whether this new algorithm differs from the previous one.
    all_posts = load_posts('analysis/_1_process_raw_data/output/english.json')
    leaders = json.load(open('keywords/target/leader.json'))
    parties = json.load(open('keywords/target/party.json'))
    combined = {**leaders, **parties}
    keyword_dict = {}
    for key, value in combined.items():
        keyword_dict[key] = [key] + combined[key]["alias_en"]
    kp = KeywordProcessor()
    kp.add_keywords_from_dict(keyword_dict)
    for p in all_posts:
        p["related_to"] = list(set(kp.extract_keywords(p["value"])))
    purified = [x for x in all_posts if len(x['related_to']) > 0]
    log(f"Number of removed posts = {len(all_posts) - len(purified)}", 1)
    save_posts(purified, 'analysis/_2_remove_unrelated_data/english.json')
class gazetteer():

    def __init__(self, type_link, isLower, iscase_sensitive, currFlag):
        self.keyword_processor = KeywordProcessor(case_sensitive=iscase_sensitive)
        if type_link is None:
            return
        curr_dict = {"USD": ["$", "$", "dollars", "U.S. Dollar", "USD", "US$",
                             "United States dollar"]}
        # Temp logic for hierarchy identification -- remove
        # asset_dict = {"Assets": ["assets", "total assets"]}
        # curAss_Dict = {"Current Assets": ["current assets", "total current assets"]}
        # nonCurAss_Dict = {"NonCurrent Assets": ["longterm assets", "long-term assets", "non current assets", "non-current assets", "total noncurrent assets", "total non-current assets"]}
        # liab_Dict = {"Liabilities": ["liabilities", "liabilities and common shareholders equity", "liabilities and shareholders equity", "total liabilities", "total liabilities and equity", "total liabilities and shareholders equity", "total liabilities and stockholders equity", "total liabilities and common shareholders equity"]}
        # curLiab_Dict = {"Current Liabilities": ["current liabilities", "total current liabilities"]}
        # nonCurrLiab_Dict = {"NonCurrent Liabilities": ["non current liabilities", "non-current liabilities", "longterm liabilities", "long-term liabilities", "total noncurrent liabilities", "total non-current liabilities"]}
        # shrEq_Dict = {"Shareholders Equity": ["shareholders equity", "equity", "stockholders equity", "total equity", "total stockholders equity", "total common shareholders equity", "total wells fargo stockholders equity"]}
        with open(type_link, encoding='utf-8', errors='ignore') as inf:
            lines = inf.readlines()
        for line in lines:
            if len(line.strip()) > 0:
                if isLower:
                    self.keyword_processor.add_keyword(line.strip().lower())
                else:
                    self.keyword_processor.add_keyword(line.strip())
        if currFlag:
            self.keyword_processor.add_keywords_from_dict(curr_dict)
        # temp logic for hierarchy identification -- remove
        # self.keyword_processor.add_keywords_from_dict(asset_dict)
        # self.keyword_processor.add_keywords_from_dict(curAss_Dict)
        # self.keyword_processor.add_keywords_from_dict(nonCurAss_Dict)
        # self.keyword_processor.add_keywords_from_dict(liab_Dict)
        # self.keyword_processor.add_keywords_from_dict(curLiab_Dict)
        # self.keyword_processor.add_keywords_from_dict(nonCurrLiab_Dict)
        # self.keyword_processor.add_keywords_from_dict(shrEq_Dict)

    def findPhrases(self, text):
        list_phrases = list()
        flash_text = self.keyword_processor.extract_keywords(text, span_info=True)
        for flash in flash_text:
            list_phrases.append(flash[0])
        return set(list_phrases)
def build_flashtext_trie(file, limit=numpy.Inf):
    id2phrases = {}

    def process_line(line):
        line = line.replace('\n', '')
        if '\t' in line:
            id, phrase = line.split('\t')
            if id in id2phrases:
                id2phrases[id].append(phrase)
            else:
                id2phrases[id] = [phrase]
        else:
            phrase = line
            id2phrases[len(id2phrases) + 1] = [phrase]

    for line in data_io.read_lines(file, limit=limit):
        process_line(line)
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_dict(id2phrases)
    # print(keyword_processor.extract_keywords('gesellschaftervertrag'))
    return keyword_processor
keyword_processor = KeywordProcessor()
keyword_processor.add_keyword('Big Apple')
keyword_processor.add_keyword('Bay Area')
keywords_found = keyword_processor.extract_keywords('I love big Apple and Bay Area.')
print(keywords_found)
# ['Big Apple', 'Bay Area']

# Add several keywords at once
keyword_processor = KeywordProcessor()
# For keywords in dict format, the key is the clean name returned by a match;
# the key itself is not searched for, only the values are.
keyword_dict = {
    "java": ["java_2e", "java programing"],
    "product management": ["PM", "product manager"]
}
# {'clean_name': ['list of unclean names']}
keyword_processor.add_keywords_from_dict(keyword_dict)
# Or add keywords from a list:
keyword_processor.add_keywords_from_list(["java", "python"])
keyword_processor.extract_keywords('I am a product manager for a java_2e platform')
# output: ['product management', 'java']

# Remove keywords
keyword_processor = KeywordProcessor()
keyword_dict = {
    "java": ["java_2e", "java programing"],
    "product management": ["PM", "product manager"]
}
keyword_processor.add_keywords_from_dict(keyword_dict)
print(keyword_processor.extract_keywords('I am a product manager for a java_2e platform'))
# output: ['product management', 'java']
keyword_processor.remove_keyword('java_2e')
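# After remove_keyword('java_2e'), 'java_2e' no longer matches, but
# 'product manager' still does, so re-running the extraction should yield:
print(keyword_processor.extract_keywords('I am a product manager for a java_2e platform'))
# output: ['product management']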