def getkeywords_key_bert(text):
    """Extract single-word keywords from *text* with KeyBERT.

    Loads a custom stop-word list from "stopwords.txt", then runs a
    distilbert-backed KeyBERT extractor with both Max-Sum and MMR
    candidate selection enabled. Prints the result before returning it.

    text: document to extract keywords from.
    return: list of (keyword, score) pairs as produced by KeyBERT.
    """
    custom_stops = stopwordslist("stopwords.txt")
    extractor = KeyBERT('distilbert-base-nli-mean-tokens')
    extracted = extractor.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 1),
        stop_words=custom_stops,
        min_df=1,
        use_maxsum=True,
        use_mmr=True,
    )
    print("Keywords of article", extracted)
    return extracted
def extract_paper_keywords(input_csv, out_csv, keywordCount):
    """Extract KeyBERT keywords for every repository abstract in *input_csv*.

    Reads the CSV (expects 'nameWithOwner' and 'allTitleAndAbstract'
    columns), extracts up to *keywordCount* diverse single-word keywords
    per document, lemmatises them, and writes a two-column CSV
    ('nameWithOwner', 'content') to *out_csv*.

    input_csv: path to the input CSV file.
    out_csv: path of the CSV file to write.
    keywordCount: desired number of keywords per document.
    """
    list_dic = pd.read_csv(input_csv).to_dict(orient='list')
    model = KeyBERT('distilbert-base-nli-mean-tokens')
    res = {}
    keyword_list_list = []
    total = len(list_dic['nameWithOwner'])
    for index, doc in enumerate(list_dic['allTitleAndAbstract']):
        print('-' * 100)
        print("index: " + str(index + 1) + "/" + str(total) + ", repo: " +
              str(list_dic['nameWithOwner'][index]))
        # MMR (Maximal Marginal Relevance) diversifies the cosine-similarity
        # based keywords; diversity=0.7 favours highly diverse results.
        tuple_list = model.extract_keywords(doc,
                                            keyphrase_ngram_range=(1, 1),
                                            stop_words='english',
                                            top_n=keywordCount,
                                            use_mmr=True,
                                            diversity=0.7)
        # If nothing came back, retry with progressively smaller top_n to
        # still collect as many keywords as possible.
        # BUGFIX: the original loop had no lower bound, so a persistently
        # empty result drove top_n negative and looped forever.
        tmp = keywordCount
        while len(tuple_list) == 0 and tmp > 5:
            tmp = tmp - 5
            tuple_list = model.extract_keywords(doc,
                                                keyphrase_ngram_range=(1, 1),
                                                stop_words='english',
                                                top_n=tmp,
                                                use_mmr=True,
                                                diversity=0.7)
        keyword_list_list.append([candidate[0] for candidate in tuple_list])
    pre_keyword_list_list = lemmatisation(keyword_list_list)
    res['nameWithOwner'] = list_dic['nameWithOwner']
    res['content'] = [' '.join(x) for x in pre_keyword_list_list]
    pd.DataFrame.from_dict(res, orient='columns').to_csv(out_csv, index=False)
def keybert_keyword_extractor(filename, keyphrase_range=(1, 2)):
    """Collect the unique KeyBERT keywords for each category in a JSON file.

    filename: path passed to load_as_json; records need 'category' and 'text'.
    keyphrase_range: ngram range forwarded to KeyBERT.
    return: dict mapping category -> list of unique keywords.
    """
    model = KeyBERT('distilbert-base-nli-mean-tokens')
    records = load_as_json(filename)

    # Bucket the raw texts per category, initialising the result slots too.
    texts_by_category = dict()
    result_dict = dict()
    for record in records:
        cat = record['category']
        if cat not in texts_by_category:
            texts_by_category[cat] = []
            result_dict[cat] = []
        texts_by_category[cat].append(record['text'])

    # Extract keywords text-by-text and deduplicate within each category.
    for category, texts in texts_by_category.items():
        print(category)
        unique_words = set()
        for doc in texts:
            extracted = model.extract_keywords(
                doc,
                keyphrase_ngram_range=keyphrase_range,
                stop_words='english')
            for kw, _score in extracted:
                unique_words.add(kw)
        word_array = list(unique_words)
        print(len(word_array))
        result_dict[category] = word_array
    return result_dict
class NLP_Wuwana():
    def __init__(
        self,
        db,
        languages,
        weight_field,
        spacy_model,
        remove_words="./data/words_to_remove.txt",
        replace_words="./data/words_to_replace.txt",
        tags_alwaysmain="./data/finaltags_alwaysmain.txt",
        tags_toremove="./data/finaltags_toremove.txt",
        empha_words=False,
        empha_multi=1,
        desc_field="description",
        max_words=5,
    ):
        """Class defined to process wuwana description tags.
        It attacks db and uses 3 NLP Libraries:
        - Spacy as tokenizer.
        - Wordcloud as tag modeller.
        - Gensim as tag modeller.
        Parameters
        -----------
        languages: list with languages in format: ["es","fr","zh-cn"].
        remove_words: path to file of words to be removed.
        replace_words: path to file of words to be replaced.
        tags_alwaysmain: tags that will always be the main tag (just first ocurrence, in order of appeareance).
        tags_toremove: tags that will never appear.
        spacy_mode: pretrained Spacy model.
        max_words: max words to be extracted from description texts.
        desc_field: field where text is stored in company table.
        weight_field: Field in company table where weights will be stored.
        """
        # NOTE(review): the file handles opened below are never closed in
        # this class, and self.file_words is reused for three different
        # files; only the read contents are kept.
        #file with words to be removed from tags
        self.file_words = open(remove_words, "r", encoding="utf-8")
        self.remove_words = self.file_words.read().split(";")
        #file with words to emphasize
        self.empha_multi = empha_multi
        self.empha_words = empha_words
        if (empha_words):
            self.file_empha = open(empha_words, "r", encoding="utf-8")
            self.words_to_emphasize = self.file_empha.read().split(";")
        #tags to remove
        self.file_words = open(tags_toremove, "r", encoding="utf-8")
        self.tags_toremove = self.file_words.read().split(";")
        #tags always as main
        self.file_words = open(tags_alwaysmain, "r", encoding="utf-8")
        self.tags_alwaysmain = self.file_words.read().split(";")
        #bag of words that should be replaced, such as abbreviations
        with open(replace_words, "r", encoding="utf-8") as f_in:
            self.replace_words = json.load(f_in)
        self.translator = google_translator()
        self.db = db
        self.cursor_tag = self.db.cursor()
        self.max_words = max_words
        self.desc_field = desc_field
        self.languages = languages
        self.weight_field = weight_field
        # English pretrained Spacy model
        # NOTE(review): the `spacy_model` parameter is accepted but never
        # used; en_core_web_lg is hardcoded here — confirm intent.
        try:
            self.nlp = spacy.load("en_core_web_lg")
        except:
            sys.exit(
                "ERROR: You must download en_core_web_lg spacy model. Use 'python -m spacy download en_core_web_lg' "
            )

    ########
    #MAIN##
    ########
    def process_query_companies(self, lib, onlyid=False, column_pos=1):
        """ Function that launch a sql query and extract main tags from column
        Parameters
        -----------
        lib: NLP Lib to use (Gensim or Wordcloud)
        onlyid: Check if changes only applied to one company id
        column_pos: position of the first column to extract text
        """
        # NOTE(review): the queries below are built by string concatenation;
        # `onlyid` is interpolated unescaped (SQL injection risk if it ever
        # comes from untrusted input) — consider parameterized queries.
        if (onlyid):
            query = "select company.ID, company." + str(
                self.desc_field) + " from company where ID = '" + str(
                    onlyid) + "'"
        else:
            query = "select company.ID, company." + str(
                self.desc_field) + " from company "
        if ((lib != "gensim") & (lib != "wordcloud") & (lib != "keybert")):
            sys.exit("ERROR: Unknown library: " + str(lib))
        else:
            pass
        if (lib == "keybert"):
            self.model = KeyBERT('distilbert-base-nli-mean-tokens')
        updates_list = []
        try:
            # Execute the SQL command
            cursor = self.db.cursor()
            result = cursor.execute(query)
            self.db.commit()
        except Exception as e:
            # NOTE(review): if this fails, `result`/`cursor` are unbound and
            # the `if (result)` below raises NameError — confirm intended.
            print("ERROR LOADING DB ", str(e))
            pass
        n = 0  # NOTE(review): unused.
        if (result):
            rows = cursor.fetchall()
            print("Processing " + str(result) + " companies.")
            for row in rows:
                try:
                    text = row[column_pos]
                    # Clean/translate the description, then extract keywords.
                    nouns_ex = self.process_text(text, lib)
                    tags_english = self.get_keywords(nouns_ex,
                                                     self.max_words,
                                                     lib=lib)
                    if (tags_english):
                        tags_main = dict()
                        tags_all = dict()
                        tags_english_split = tags_english[1].split(";")
                        #remove predefined tags. insert main tags. (from file)
                        tags_english_split = self.remove_finaltags(
                            tags_english_split)
                        tags_english_split = self.put_maintags(
                            tags_english_split)
                        # Translate the main tag and the full tag list into
                        # every configured language.
                        for l in self.languages:
                            tags_main[l] = self.get_first_text(
                                self.get_translation(tags_english[0], lang=l))
                            tags_all[l] = self.get_first_text(
                                self.get_translation(tags_english[1], lang=l))
                        tag_list = []
                        main_tag = False
                        second_tag = False
                        other_tags = False
                        # Slot tags into main/second/other and register each
                        # tag (with its translations) in the tag table.
                        for x in range(0, len(tags_english_split)):
                            if (len(tags_english_split[x]) > 0):
                                for s in tags_all.keys():
                                    try:
                                        tag_split = tags_all[s].split(
                                            ";")[x].strip()
                                    except:
                                        tag_split = "-"
                                    tag_list.append(tag_split)
                                self.check_and_insert_tag(
                                    tags_english_split[x], tag_list)
                                tag_list = []
                                if (x == 0):
                                    main_tag = tags_english_split[x].strip()
                                elif (x == 1):
                                    second_tag = tags_english_split[x].strip()
                                elif (x == 2):
                                    other_tags = tags_english_split[x].strip()
                                else:
                                    other_tags += ";" + tags_english_split[
                                        x].strip()
                        updates_list.append(
                            self.update_company_tags(main_tag, row[0],
                                                     tags_english[2],
                                                     second_tag, other_tags))
                    else:
                        print("WARNING: No tags extracted for ID", row[0],
                              "with text:", text)
                except Exception as e:
                    print("ERROR Processing query row: ", str(e))
        else:
            print("WARNING: NO rows for that ID.")
        #update tags - execute querys
        for i in updates_list:
            try:
                cursor.execute(i)
                self.db.commit()
            except Exception as e:
                print("Error ", str(e))

    def process_text(self, text, lib):
        """ Function that processes a text with a pipeline of tasks, and returns transformed and cleaned text to be used by NLP libs
        Parameters
        -----------
        text: text to extract tags
        lib: NLP library to be used afterwards (wordcloud or gensim)
        return: cleaned and transformed text
        """
        #remove hastags, mentions, and links. Comment this line to let hastags and metions appear.
        text = self.strip_all_entities(self.strip_links(text))
        #remove special chars.
        text = self.remove_special_characters(text)
        #remove emojis.
        text = self.remove_emojis(text)
        #detect source lang and translate to english if necessary.
        source_lang = self.detect_lang(text)
        #print("ORIG TEXT:", text)
        if source_lang:
            if source_lang != 'en':
                text = self.get_translation(text)
        else:
            print(
                "WARNING: No specific language detected. Translating sentences (slow)"
            )
            text = self.translate_sentence_by_sentence(text)
        #to lowercase.
        text = text.lower()
        #emphasize words if required. It repeats certain words in text (from file).
        if (self.empha_words):
            text = self.emphasize_words(text)
        # Spacy model and custom tokenizer
        self.nlp.tokenizer = self.custom_tokenizer()
        sentence = ''
        # Extract sentences
        text_lines = text.split(".")
        if (lib == "wordcloud"):
            #get nouns longer than 1 char
            for word in self.nlp(text):
                if ((word.pos_ in ['NOUN']) & (len(word.text) > 1)):
                    sentence += word.text + ' '
            #replace some words with others
            sentence = self.replace_dict(sentence)
            #remove specific words and lemmatize
            sentence = self.remove_common(sentence)
            #and lemmatize
            sentence = self.lemmatize(sentence)
            #last nouns filter
            fin_sent = ''
            for word in self.nlp(sentence):
                if word.pos_ in ["NOUN"]:
                    fin_sent += word.text + ' '
        elif ((lib == "gensim")):
            #get nouns and adjetives longer than 1 char
            for word in self.nlp(text):
                if ((word.pos_ in ["NOUN", "ADJ"]) & (len(word.text) > 1)):
                    sentence += word.text + ' '
            #replace some words with others
            sentence = self.replace_dict(sentence)
            #remove specific words
            sentence = self.remove_common(sentence)
            #and lemmatize
            sentence = self.lemmatize(sentence)
            fin_sent = sentence
        elif ((lib == "keybert")):
            # For keybert, keep sentence boundaries: filter each sentence to
            # its nouns/adjectives and re-join with ". ".
            new_lines = []
            for line in text_lines:
                new_line = []
                #get nouns and adjetives longer than 1 char
                for word in self.nlp(line):
                    if ((word.pos_ in ["NOUN", "ADJ"]) & (len(word.text) > 1)):
                        new_line.append(word.text)
                new_lines.append(" ".join(new_line))
            sentence = ". ".join(new_lines)
            #replace some words with others
            sentence = self.replace_dict(sentence)
            #remove specific words
            sentence = self.remove_common(sentence)
            #and lemmatize
            sentence = self.lemmatize(sentence)
            fin_sent = sentence
            #print("SENTENCE:",sentence)
        else:
            sys.exit("ERROR: LIB NOT FOUND: " + str(lib))
        return fin_sent

    ########
    #MYSQL##
    ########
    def update_company_tags(self,
                            first_tag,
                            idcomp,
                            weights,
                            second_tag=False,
                            other_tags=False):
        """Creates SQL Query for update the tag table
        Parameters
        -----------
        first_tag: main tag
        idcomp: id of company
        weights: weights of every tag
        second_tag: the second tag most relevant
        other_tags: rest of tags
        return: sql query to update
        """
        weights = self.get_weight_string(weights)
        print("\nID:", idcomp, "\nFIRST:", first_tag, "\nSECOND:", second_tag,
              "\nOTHERS:", other_tags, "\nWEIGHTS:", weights)
        # NOTE(review): queries are format-built, not parameterized.
        if (other_tags):
            sql_upd = "UPDATE company set FirstTagID='{0}', SecondTagID='{1}', OtherTags = '{2}', {5} = '{4}' where ID = {3}".format(
                first_tag, second_tag, other_tags, idcomp, weights,
                self.weight_field)
        elif (second_tag):
            sql_upd = "UPDATE company set FirstTagID='{0}', SecondTagID='{1}', OtherTags = '', {4} = '{3}' where ID = {2}".format(
                first_tag, second_tag, idcomp, weights, self.weight_field)
        else:
            sql_upd = "UPDATE company set FirstTagID='{0}', SecondTagID='', OtherTags = '', {3} = '{2}' where ID = {1}".format(
                first_tag, idcomp, weights, self.weight_field)
        return (sql_upd)

    def check_and_insert_tag(self, eng_tag, tags):
        """Checks if tag exists in table tag and creates if not
        Parameters
        -----------
        eng_tag: Tag in english
        tags: Rest languages tags
        return: main tag
        """
        tag_compo = ""
        for i in tags:
            tag_compo += i + ";"
        try:
            sql_tag = "Select * from tag where ID = '{0}'".format(eng_tag)
            count = self.cursor_tag.execute(sql_tag)
            if (count == 0):
                #not exists
                sql_tag = "Insert into tag (ID, Names) values ('{0}', '{1}') ".format(
                    eng_tag.lower().replace("'", ""),
                    tag_compo.lower().replace("'", ""))
                self.cursor_tag.execute(sql_tag)
                self.db.commit()
            return eng_tag
        except Exception as e:
            print("ERROR: check_and_insert_tag ", str(e))

    ##########
    ###NLP####
    ##########
    def detect_lang(self, text):
        """ Function that detects the language of a text
        Parameters
        -----------
        text: Text to be detected
        return: lang detected
        """
        try:
            lang = self.translator.detect(text)[0]
            return lang
        except:
            print("WARNING: No language detected in text")
            return False

    def get_translation(self, text, lang="en"):
        """ Function that translate text to english
        Parameters
        -----------
        text: Text to be translated
        return: translated text
        """
        max_len = 4900  #library limit 5000
        if (len(text) > max_len):
            # NOTE(review): this branch only re-concatenates the chunks —
            # the per-chunk translate call is commented out, so texts longer
            # than max_len are returned untranslated. Confirm intent.
            sub_text = ""
            for i in range(0, math.ceil(len(text) / max_len)):
                start = i * max_len
                end = (i + 1) * (max_len)
                sub_text += text[
                    start:
                    end]  #translator.translate(text[start:end], lang_tgt='en')
            text = sub_text
        else:
            text = self.translator.translate(text, lang_tgt=lang)
        # Commas would break the ';'-separated tag lists, so replace them.
        if (isinstance(text, list)):
            text = text[0].replace(",", ";")
        else:
            text = text.replace(",", ";")
        time.sleep(0.5)  #1 second delay in order to avoid ip blocking
        return text

    def translate_sentence_by_sentence(self, text):
        """ Function that translate sentece by sentence a string to english. Separated by '.'
        Parameters
        -----------
        text: Text to be translated
        return: translated text
        """
        sub_text = ""
        sentences = text.split(".")
        for s in sentences:
            sub_text += self.translator.translate(s, lang_tgt='en')
        return sub_text

    def replace_dict(self, sentence):
        """ Function that replace words in a sentence according to a dictionary or words (replace_words)
        Parameters
        -----------
        sentence: Text to be modified
        return: cleaned text
        """
        sentence = sentence.lower()  # convert to lower case
        for word, abbr in self.replace_words.items():
            sentence = sentence.replace(word.lower(), abbr)
        return sentence

    def remove_common(self, sentence):
        """ Function that remove words in a sentence according to a dictionary or words (remove_words)
        Parameters
        -----------
        sentence: Text to be modified
        return: cleaned text
        """
        final_sentence = ""
        stops = [" ", ".", ",", "-", ";"]
        # common_words to remove
        for word in sentence.split(" "):
            tmp = word.lower()
            # Strip separator chars before comparing against the stop list.
            for i in stops:
                tmp = tmp.replace(i, "")
            if tmp not in self.remove_words:
                final_sentence += word.lower() + " "
        return final_sentence

    def lemmatize(self, sentence):
        """ Function that extract lemmas from sentence
        Parameters
        -----------
        sentence: Text to be analysed
        return: transformed text
        """
        self.nlp.tokenizer = self.custom_tokenizer()
        final_sentence = ''
        # common_words to remove
        for word in self.nlp(sentence):
            final_sentence += word.lemma_.lower() + ' '
        return final_sentence

    def get_weight_string(self, weights):
        """ Function that transform weight object to string.
        Parameters
        -----------
        weights: Weight object returned by nlp
        return: weight transformed to string
        """
        if (isinstance(weights, dict)):
            #gensim
            weights = json.dumps(weights).replace("'", "")
        elif (isinstance(weights, list)):
            #wordcloud
            weights = ', '.join(str(e).replace(",", ":")
                                for e in weights).replace("'", '"').replace(
                                    "(", '').replace(")", '')
            weights = "{" + weights + "}"
        return weights

    def custom_tokenizer(self):
        """ Function that defines a tokenizer in order to be used
        Parameters
        -----------
        nlp: spacy loaded object
        return: prepared tokenizer
        """
        # Same as spaCy's default infixes except hyphen splitting is
        # disabled (the HYPHENS rule is commented out below).
        infixes = (
            LIST_ELLIPSES + LIST_ICONS + [
                r"(?<=[0-9])[+\-\*^](?=[0-9-])",
                r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
                    al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES),
                r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
                #r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
                r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
            ])
        infix_re = compile_infix_regex(infixes)
        return Tokenizer(self.nlp.vocab,
                         prefix_search=self.nlp.tokenizer.prefix_search,
                         suffix_search=self.nlp.tokenizer.suffix_search,
                         infix_finditer=infix_re.finditer,
                         token_match=self.nlp.tokenizer.token_match,
                         rules=self.nlp.Defaults.tokenizer_exceptions)

    def remove_special_characters(self, text):
        """ Function that removes special characters from a text
        Parameters
        -----------
        text: text to be modified
        return: cleaned text
        """
        bad_chars = [';', ':', '!', "*", "¿", "?", "¡"]
        for i in bad_chars:
            text = text.replace(i, ' ')
        return text

    def remove_emojis(self, text):
        """ Function that removes emojis from a text
        Parameters
        -----------
        text: text to be modified
        return: cleaned text
        """
        emoji_pattern = re.compile(
            "["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
            u"\U00002702-\U000027B0"
            u"\U000024C2-\U0001F251"
            "]+",
            flags=re.UNICODE)
        return emoji_pattern.sub(r'', text)

    def emphasize_words(self, text):
        """ Function that repeats emphasize_words if found in text
        text: text to find words and modify.
        """
        # Appending a word empha_multi times boosts its frequency for the
        # downstream frequency-based extractors.
        for i in self.words_to_emphasize:
            if i.lower() in text.lower():
                if (len(i) > 0):
                    for x in range(0, self.empha_multi):
                        text += ". " + i.lower()
        return text

    def get_keywords(self, words, amount=3, lib="wordcloud", sep=";"):
        """ Function that extract main keywords from processed text
        Parameters
        -----------
        words: bag of words to extract tags
        amount: amount of of words to be extracted. 3 max words for gensim
        lib: lib to be used - gensim, wordcloud, keybert
        sep: separator for returned words
        return: main tag, list with all tags, weighted tags
        """
        if (len(words) > 0):
            if (lib == "gensim"):
                tmp = keywords(words, words=min(amount, 3), split=True)
                info = keywords(words, words=min(amount, 3), scores=True)
                if (tmp):
                    return tmp[0], sep.join(tmp), info
                else:
                    return False
            elif (lib == "wordcloud"):
                listw = ""
                wcloud = wordcloud.WordCloud().generate(words)
                n = 0
                if (wcloud.words_):
                    for i in wcloud.words_:
                        if (n == 0):
                            main = i
                            listw += i + sep
                        else:
                            if (n < amount):
                                listw += i + sep
                        n += 1
                    return main, listw, wcloud.words_
                else:
                    return False
            elif (lib == "keybert"):
                # NOTE(review): keyphrase_ngram_range=(0, 2) has a zero
                # lower bound, and sep.join(tags) assumes extract_keywords
                # returns plain strings (recent KeyBERT returns
                # (keyword, score) tuples) — confirm the pinned version.
                tags = self.model.extract_keywords(words,
                                                   keyphrase_ngram_range=(0,
                                                                          2),
                                                   stop_words='english',
                                                   use_mmr=True,
                                                   diversity=0.2,
                                                   top_n=amount)
                if (len(tags) > 0):
                    return tags[0], sep.join(tags), ""
                else:
                    return "", "", ""
        else:
            #print("Warning: No words to extract tags: ", words)
            return False

    def strip_links(self, text):
        """ Removes urls from text
        Parameters
        -----------
        text: String to remove urls
        return: cleaned text
        """
        link_regex = re.compile(
            '((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)',
            re.DOTALL)
        links = re.findall(link_regex, text)
        for link in links:
            text = text.replace(link[0], ', ')
        return text

    def strip_all_entities(self, text):
        """ Removes rrss hastags and mentions from text
        Parameters
        -----------
        text: String to remove hastags
        return: cleaned text
        """
        entity_prefixes = ['@', '#']
        for separator in string.punctuation:
            if separator not in entity_prefixes:
                text = text.replace(separator, ' ')
        words = []
        for word in text.split():
            word = word.strip()
            if word:
                if word[0] not in entity_prefixes:
                    words.append(word)
        return ' '.join(words)

    def get_first_text(self, obj):
        """Extracts from abn objet:
        - first occurrence if array
        - text if string
        Parameters
        -----------
        obj: object to extract text (array or str)
        return: first ocurrence
        """
        if (isinstance(obj, list)):
            if (isinstance(obj[0], list)):
                # NOTE(review): this branch discards the stripped value and
                # falls through returning None — a `return` is likely
                # missing here; confirm against callers.
                obj[0][0].strip()
            else:
                return obj[0].strip()
        else:
            return obj.strip()

    def remove_finaltags(self, tags):
        """Remove tags from final processing
        Parameters
        -----------
        tags: list to be cleaned
        return: cleaned tag list
        """
        tmp_list = []
        for i in tags:
            if i not in self.tags_toremove:
                tmp_list.append(i)
        return tmp_list

    def put_maintags(self, tags):
        """Pririze some tags as main tag
        Parameters
        -----------
        tags: list to be modified
        return: modified tag list
        """
        # Only the first matching always-main tag is promoted (the early
        # return below stops after the first hit, per the class docstring).
        for i in self.tags_alwaysmain:
            if (i in tags):
                pos = (tags.index(i))
                tmp = tags[0]
                tags[pos] = tmp
                tags[0] = i
                return tags
        return tags
# Demo script: multilingual KeyBERT keyword extraction on a Portuguese
# Wikipedia-style paragraph about machine learning.
from keybert import KeyBERT

doc = """O aprendizado automático (português brasileiro) ou a aprendizagem automática (português europeu) ou também aprendizado de máquina (português brasileiro) ou aprendizagem de máquina (português europeu) (em inglês: machine learning) é um subcampo da Engenharia e da ciência da computação que evoluiu do estudo de reconhecimento de padrões e da teoria do aprendizado computacional em inteligência artificial[1]. Em 1959, Arthur Samuel definiu aprendizado de máquina como o "campo de estudo que dá aos computadores a habilidade de aprender sem serem explicitamente programados"[2](livre tradução). O aprendizado automático explora o estudo e construção de algoritmos que podem aprender de seus erros e fazer previsões sobre dados[3]. Tais algoritmos operam construindo um modelo a partir de inputs amostrais a fim de fazer previsões ou decisões guiadas pelos dados ao invés de simplesmente seguindo inflexíveis e estáticas instruções programadas. Enquanto que na inteligência artificial existem dois tipos de raciocínio (o indutivo, que extrai regras e padrões de grandes conjuntos de dados, e o dedutivo), o aprendizado de máquina só se preocupa com o indutivo."""

# Alternative multilingual embedding models that were tried:
#paraphrase-xlm-r-multilingual-v1
#bert-base-multilingual-cased
model = KeyBERT('bert-base-multilingual-cased')
keywords = model.extract_keywords(doc)
# NOTE(review): the two calls below recompute keywords with explicit ngram
# ranges but their results are not assigned — presumably exploratory
# (notebook-style) calls; the return values are discarded.
model.extract_keywords(doc, keyphrase_ngram_range=(1, 1), stop_words=None)
model.extract_keywords(doc, keyphrase_ngram_range=(1, 2), stop_words=None)
# NOTE(review): this fragment is the tail of a definition that starts before
# this chunk (`el`, `dict_`, `key`, `value` are bound there) — indentation
# reconstructed; confirm against the full file.
if el == 'error':
    dict_.update({key: [value]})
else:
    el.append(value)


def common_words(key_words: list) -> list:
    # Keep only the extracted keywords that also occur in the (module-level)
    # `data` collection.
    return [el for el in key_words if el in data]


# Tokenize the source text into sentences and accumulate KeyBERT keywords
# (1- and 2-word phrases, Max-Sum selection) into the `key_words` set.
with open(PATH_TO_TEXT, 'r') as code_book:
    for sent_index, sent in enumerate(sent_tokenize(code_book.read())):
        try:
            n1 = model.extract_keywords(sent.lower(),
                                        keyphrase_length=1,
                                        use_maxsum=True)
            n2 = model.extract_keywords(sent.lower(),
                                        keyphrase_length=2,
                                        use_maxsum=True)
            key_words = key_words.union(set(n1 + n2))
        except Exception as e:
            e = e  # NOTE(review): no-op; exception is deliberately ignored.
            # Fallback handler for sentences KeyBERT could not process.
            # NOTE(review): placement inside the except block is an
            # assumption from the mangled source — confirm.
            main(sent, sent_index)

with open(PATH_TO_SAVE, 'w') as fp:
    json.dump(result, fp)

bert_words = common_words(key_words=key_words)
# %% start = time.time() logging.info('Getting Happy Elements...') query_hm = args.query_hm top_n = args.num_he df = pd.read_csv(args.hm_file) df = df[df['cleaned_up_hm'] != query_hm].sample(args.num_hms) all_hms = [query_hm, *list(df['cleaned_up_hm'].values)] df = pd.DataFrame({'happy_moment': all_hms}) happy_elements = kb_model.extract_keywords(all_hms, keyphrase_ngram_range=(1, 2), stop_words='english', use_mmr=True, diversity=1, top_n=1) input_events = [he[0] for hes in happy_elements for he in hes] df['happy_element'] = input_events logging.info('Total Time Used: ' + str(time.time() - start) + "s") # %% [markdown] # input happy elements to conceptnet and get the outputs # %% relation = ['AtLocation', 'HasPrerequisite', 'Desires', 'UsedFor'] sampling_algorithm = args.sampling_algorithm
class KeyBertExtractor():
    """KeyBERT-backed keyphrase extractor that maps each extracted candidate
    back to its exact surface form in the original text.
    https://github.com/MaartenGr/KeyBERT"""

    #TODO there really are many many configs and I think changing these changes a great deal! see https://github.com/MaartenGr/KeyBERT and try out stuff!!
    #TODO there is a minimum-frequency-argument!! https://github.com/MaartenGr/KeyBERT/blob/master/keybert/_model.py#L83-L101
    #TODO does this use the phrase_in_text function? SHOULD IT?

    def __init__(self, is_multilan, faster=False, max_ngram=1):
        """available models: https://github.com/MaartenGr/KeyBERT#25-embedding-models"""
        from keybert import KeyBERT  #lazily loaded as it needs tensorflow which takes some time to init
        # `faster` and `is_multilan` are mutually exclusive model choices.
        assert not (is_multilan and faster)
        if faster:
            self.model_name = "paraphrase-MiniLM-L6-v2"
        elif is_multilan:
            self.model_name = "paraphrase-multilingual-MiniLM-L12-v2"
        else:
            self.model_name = "paraphrase-mpnet-base-v2"
        print(f"Using model {self.model_name}")
        self.kw_model = KeyBERT(self.model_name)
        self.max_ngram = max_ngram

    def _fix_hyphenated(self, cand, comparedtext):
        # it may be the case that the candiate is something like "particle systems", however the text only has "many-particle systems".
        # if so, then `(not phrase_in_text(cand, without_stops)) and cand in without_stops == True`
        # Returns the candidate extended to cover the hyphenated word it is
        # part of, or the sentinel string "NOPE" if no case matches.
        words_before_onset = comparedtext[:comparedtext.find(cand)].count(" ")
        chars_before_onset = len(" ".join(
            comparedtext.split(" ")[:words_before_onset]))
        if chars_before_onset > 0 and chars_before_onset + 1 != comparedtext.find(
                cand):
            # then the first word is hyphenated
            return comparedtext[chars_before_onset +
                                1:comparedtext.find(cand) + len(cand)]
        elif words_before_onset == 0 and bool(
                re.fullmatch(WORD_NUM_REGEX,
                             comparedtext[:comparedtext.find(cand)])):
            return comparedtext[:comparedtext.find(cand) + len(cand)]
        else:
            # then not the first word is hyphenated, but the last
            chars_after_hyphen = comparedtext[comparedtext.find(cand) +
                                              len(cand):].find(" ")
            if chars_after_hyphen > 0:
                return comparedtext[comparedtext.find(cand):comparedtext.
                                    find(cand) + len(cand) +
                                    chars_after_hyphen]
            elif re.fullmatch(
                    WORD_NUM_REGEX,
                    comparedtext[comparedtext.find(cand) + len(cand):]):
                return comparedtext[comparedtext.find(cand):]
        print("hm?!")
        return "NOPE"

    def extract_candidate(self, cand, text, without_stops, inds_without_stops,
                          only_words, inds_only_words):
        # Tries, in order: fixing hyphenation against the stop-word-free
        # text, re-expanding via token indices, the words-only text, and
        # finally a punctuation-insensitive character-level match.
        # Returns the in-text phrase, or None (implicitly) on failure.
        #TODO not sure if this version can also correct hyphenated stuff like the old one ARGH!!
        if (not phrase_in_text(cand, without_stops)) and cand in without_stops:
            cand = self._fix_hyphenated(cand, without_stops)
            if phrase_in_text(cand, text):
                #maybe we're already done here
                return cand
        #now the cand is fixed and you can continue to checking phrase_in_text
        if phrase_in_text(cand, without_stops):
            tokenized_with_stops = tokenize_text(text, stopwords=None)[1]
            startpos = without_stops.find(cand)
            start_ind = without_stops[:startpos].count(" ")
            stoppos = startpos + len(cand)
            stop_ind = start_ind + without_stops[startpos:stoppos].count(" ")
            # Map the stop-word-free span back onto the full token stream.
            actual_phrase = " ".join(tokenized_with_stops[
                inds_without_stops[start_ind]:inds_without_stops[stop_ind] +
                1])
            if phrase_in_text(actual_phrase, text):
                if actual_phrase.split(" ")[0] == cand.split(
                        " ")[0] and actual_phrase.split(" ")[-1] == cand.split(
                            " ")[-1]:
                    # print(f"FROM {cand} TO {actual_phrase}")
                    return actual_phrase
                else:
                    print()
                    return
            print()
            return
        if (not phrase_in_text(cand, only_words)) and cand in only_words:
            cand = self._fix_hyphenated(cand, only_words)
        #now the cand is fixed and you can continue to checking phrase_in_text
        if phrase_in_text(cand, only_words):
            tokenized_with_stops = tokenize_text(text, stopwords=None)[1]
            startpos = only_words.find(cand)
            start_ind = only_words[:startpos].count(" ")
            stoppos = startpos + len(cand)
            stop_ind = start_ind + only_words[startpos:stoppos].count(" ")
            actual_phrase = " ".join(tokenized_with_stops[
                inds_only_words[start_ind]:inds_only_words[stop_ind] + 1])
            if any(i in actual_phrase[:-1] for i in list("?!") + ['"']):
                #if the phrase is not an actual phrase but split by punctuation
                print(
                    f"{cand} is not an actual phrase - in the text it is `{actual_phrase}`"
                )
                return None
            if phrase_in_text(actual_phrase, text):
                if actual_phrase.split(" ")[0] == cand.split(
                        " ")[0] and actual_phrase.split(" ")[-1] == cand.split(
                            " ")[-1]:
                    # print(f"FROM {cand} TO {actual_phrase}")
                    return actual_phrase
                else:
                    print()
                    return
            print()
            return
        if cand in without_stops:
            print("In without_stops")
            return
        if cand in only_words:
            print("in only_words")
            return
        #another thing: cand is "internship self organization", but in the text it's "internship self-organization". Maybe remove everything but letters and then re-apply?
        c2 = re.sub(re.compile(r'[\W\d]', re.U), "|", cand)
        t2 = re.sub(re.compile(r'[\W\d]', re.U), "|", text).lower()
        if c2 in t2:
            cand = text[t2.find(c2):t2.find(c2) + len(c2)]
            if phrase_in_text(cand, text):
                return cand
            else:
                print("whatever.")
        w2 = re.sub(re.compile(r'[\W\d]', re.U), "|", without_stops)
        if c2 in w2:
            cand = without_stops[w2.find(c2):w2.find(c2) + len(c2)]
            return self.extract_candidate(cand, text, without_stops,
                                          inds_without_stops, only_words,
                                          inds_only_words)
        o2 = re.sub(re.compile(r'[\W\d]', re.U), "|", only_words)
        if c2 in o2:
            cand = only_words[o2.find(c2):o2.find(c2) + len(c2)]
            return self.extract_candidate(cand, text, without_stops,
                                          inds_without_stops, only_words,
                                          inds_only_words)
        print(f"This does not work: {cand}")

    def __call__(self, text, lang="en"):
        #TODO lang shouldn't be en!!!
        """see scripts/notebooks/proof_of_concept/proofofconcept_keyBERT.ipynb for why this is like this"""
        #TODO so extract_keywords can be passed a `vectorizer`, and that is by default Sklearn's CountVectorizer.
        # You can ALSO pass `candidates`, "to use instead of extracting them from the document(s)"!!!
        # Put a breakpoint in /home/chris/.local/lib/python3.8/site-packages/sklearn/feature_extraction/text.py:395 for details
        # TODO also why do I get this ^ warning ("Your stop_words may be inconsistent with your preprocessing") ??
        # Does KeyBERT need already preprocessed descriptions?! if so, how much preprocessed, and how do I know this??!
        stopwords = get_stopwords(lang)
        candidates = set()
        # NOTE(review): range(1, self.max_ngram) excludes max_ngram itself —
        # with the default max_ngram=1 this loop never runs and no
        # candidates are extracted. Suspected off-by-one; confirm.
        for nwords in range(1, self.max_ngram):
            n_candidates = self.kw_model.extract_keywords(
                text, keyphrase_ngram_range=(1, nwords), stop_words=stopwords)
            candidates |= set(i[0] for i in n_candidates)
        candidates = list(candidates)
        #TODO: what if there are special chars in the candidates? is everything ok then with the word-splitting?
        #TODO does this work for numbers?!
        inds_without_stops, without_stops = tokenize_text(text, stopwords)
        ind_word_list = [
            (ind, word)
            for ind, word in zip(inds_without_stops, without_stops)
            if WORD_NUM_REGEX.fullmatch(word)
        ]
        inds_only_words, only_words = list(zip(
            *ind_word_list)) if ind_word_list else ([], [])
        without_stops = " ".join(without_stops)
        only_words = " ".join(only_words)
        actual_keyphrases = []
        used_candidates = []
        n_immediateworking = n_fixed = n_errs = 0
        for cand in candidates:
            # if not all(WORD_REGEX.fullmatch(i) for i in cand.split(" ")):
            #     print(f"The candidate `{cand}` is not purely textual!")
            if phrase_in_text(cand, text):
                actual_keyphrases.append(cand)
                used_candidates.append(cand)
                n_immediateworking += 1
            else:
                intextcand = self.extract_candidate(cand, text, without_stops,
                                                    inds_without_stops,
                                                    only_words,
                                                    inds_only_words)
                #TODO wenn in candidate ne zahl oder so ist die entfernen und es neu versuchen
                if intextcand:
                    if phrase_in_text(intextcand, text):
                        actual_keyphrases.append(intextcand)
                        used_candidates.append(cand)
                        n_fixed += 1
                        continue
                    else:
                        print(
                            "The extracted candidate is STILL not in the text!"
                        )
                n_errs += 1
        return actual_keyphrases, used_candidates, (n_immediateworking,
                                                    n_fixed, n_errs)
def train(self, documents, **kwargs):
    """Extract the training keyword pool from *documents* with KeyBERT.

    Joins all documents into one text, extracts 1-5 gram keyphrases, and
    keeps the first `self.total_keywords_in_training` of them in
    `self.the_total_keywords`.

    documents: iterable of document strings.
    kwargs: supports 'stop_words' (default 'english'), forwarded to KeyBERT.
    """
    extractor = KeyBERT('distilbert-base-nli-mean-tokens')
    stop_words = kwargs.get('stop_words', 'english')
    # BUGFIX: stop_words was read from kwargs but never passed to
    # extract_keywords (the sibling set_keyword_score_list does pass it).
    self.the_total_keywords = extractor.extract_keywords(
        ' '.join(documents),
        keyphrase_ngram_range=(1, 5),
        stop_words=stop_words)[:self.total_keywords_in_training]
# Baseline 1: TopicRank keyword extraction on news vs. tweets, scored by
# keyword frequency in the news text.
t_keys = topic_rank_kw_extraction(
    os.path.join(TEMP_DIR, "news_temp.txt"), news)
t_kw_freq = nr_keywords_in_text(t_keys, news, news)
tw_keys = topic_rank_kw_extraction(
    os.path.join(TEMP_DIR, "news_temp.txt"), tweets)
tw_kw_freq = nr_keywords_in_text(tw_keys, news, tweets)
evaluate_baseline(t_kw_freq,
                  tw_kw_freq,
                  fill_results_dict=True,
                  base_name="TopicRank")
# Pre-trained KeyBert
# Baseline 2: same evaluation with KeyBERT 1-2 gram keywords (scores are
# dropped; only the keyword strings are kept).
topic_keywords = key_bert.extract_keywords(news,
                                           top_n=TOP_N_KEYWORDS,
                                           keyphrase_ngram_range=(1, 2))
t_keys = [kw for kw, _ in topic_keywords]
t_kw_freq = nr_keywords_in_text(t_keys, news, news)
tweets_keywords = key_bert.extract_keywords(tweets,
                                            top_n=TOP_N_KEYWORDS,
                                            keyphrase_ngram_range=(1, 2))
tw_keys = [kw for kw, score in tweets_keywords]
tw_kw_freq = nr_keywords_in_text(tw_keys, news, tweets)
evaluate_baseline(t_kw_freq,
                  tw_kw_freq,
                  fill_results_dict=True,
                  base_name="KeyBert")
def set_keyword_score_list(self, **kwargs):
    """Populate self._keyword_score_list with KeyBERT (keyword, score) pairs.

    Extracts 1-4 gram keyphrases from self._document, honouring an
    optional 'stop_words' kwarg (default 'english'), and truncates the
    result to len(self._document) entries.
    """
    model = KeyBERT('distilbert-base-nli-mean-tokens')
    chosen_stop_words = kwargs.get('stop_words', 'english')
    scored_keywords = model.extract_keywords(
        self._document,
        keyphrase_ngram_range=(1, 4),
        stop_words=chosen_stop_words)
    self._keyword_score_list = scored_keywords[:len(self._document)]
d = json.loads(line) texts.append("\n ".join([porterStem(p) for p in d['targetParagraphs']])) titles.append(d['targetTitle']) labels.append(dictLabels[d['id']]) index += 1 if (index == 10): break sim = scipy.spatial.distance.cosine n = 2 k = len(titles[n].split()) print(titles[n], "\nClickbait average: ", labels[n]) keyWords = " ".join(model.extract_keywords(texts[n], top_n=k)) u = vectorizer.encode(titles[n]) v = vectorizer.encode(keyWords) print(keyWords, sim(u, v)) keyWords = " ".join(model.extract_keywords(texts[n], use_maxsum=True, top_n=k)) v = vectorizer.encode(keyWords) print(keyWords, sim(u, v)) keyWords = " ".join( model.extract_keywords(texts[n], use_mmr=True, diversity=0.2, top_n=k)) v = vectorizer.encode(keyWords) print(keyWords, sim(u, v)) keyWords = " ".join( model.extract_keywords(texts[n], use_mmr=True, use_maxsum=True, top_n=k))