def get_important(splitted: List[str]):
    # Score each input term using RAKE's frequency- and degree-ranked phrase
    # lists (via the external find_freq / find_deg helpers), down-weight terms
    # whose POS tag is not in the external `good` set as well as common filler
    # words, and return the three highest-scoring terms.
    print("split", splitted)
    freq_r = rake_nltk.Rake(max_length=3, ranking_metric=Metric.WORD_FREQUENCY)
    freq_r.extract_keywords_from_text(" and ".join(splitted))
    freq_phrases = freq_r.get_ranked_phrases()

    deg_r = rake_nltk.Rake(max_length=3, ranking_metric=Metric.WORD_DEGREE)
    deg_r.extract_keywords_from_text(" ".join(splitted))
    deg_phrases = deg_r.get_ranked_phrases()

    freq_data = {i: find_freq(freq_phrases, i.lower()) for i in splitted}
    deg_data = {i: find_deg(deg_phrases, i.lower()) for i in splitted}

    data = {}
    for k, freq_v in freq_data.items():
        deg_v = deg_data[k]
        data[k] = freq_v + deg_v

    part_of_speech = nltk.pos_tag(splitted)
    print(part_of_speech)
    for i, part in part_of_speech:
        if part not in good:
            data[i] *= 0.5
        if i.lower() in [
                'thing', 'things', 'yeah', 'yes', 'something', 'actually',
                'really', 'life', 'lives', 'lot', 'ok', 'oh', 'well', 'stuff',
                'talking', 'look', 'looking', "isn't", "that's", 'right'
        ]:
            data[i] *= 0.1
    """
    POS tag reference for the `good` filter:
    FW    foreign word
    JJ    adjective               'big'
    JJR   adjective, comparative  'bigger'
    JJS   adjective, superlative  'biggest'
    NN    noun, singular          'desk'
    NNS   noun, plural            'desks'
    NNP   proper noun, singular   'Harrison'
    NNPS  proper noun, plural     'Americans'
    RB    adverb                  very, silently
    RBR   adverb, comparative     better
    RBS   adverb, superlative     best
    RP    particle                give up
    VB    verb, base form         take
    VBD   verb, past tense        took
    VBG   verb, gerund/present participle  taking
    VBN   verb, past participle   taken
    VBZ   verb, 3rd person sing. present   takes
    WP    wh-pronoun              who, what
    WRB   wh-adverb               where, when
    """
    # print("deg_phrases", deg_phrases)
    # print("deg_data", deg_data)
    # print("freq_phrases", freq_phrases)
    # print("freq_data", freq_data)
    return [j[0] for j in sorted(data.items(), key=lambda i: -i[1])[:3]]
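# A minimal, standalone sketch of the two rake_nltk ranking metrics used above.
# The sample sentence is illustrative, not from the original data.
import rake_nltk
from rake_nltk import Metric

sample = "deep learning models need large datasets and large compute budgets"
for metric in (Metric.WORD_FREQUENCY, Metric.WORD_DEGREE):
    r = rake_nltk.Rake(max_length=3, ranking_metric=metric)
    r.extract_keywords_from_text(sample)
    print(metric, r.get_ranked_phrases()[:3])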
def register_influencer():
    # Extract keywords from the influencer description and update compat scores.
    text = request.args['text'].lower()
    r = rake_nltk.Rake()
    r.extract_keywords_from_text(text)
    influencer_tags_set = set(r.get_ranked_phrases())
    most_used_word_tag = r.get_ranked_phrases_with_scores()[0]
    print(influencer_tags_set)

    db.collection('influencers').document(request.args['influencer_id']).set({
        'tags': list(influencer_tags_set),
        'most_used_word': str(most_used_word_tag)
    }, merge=True)

    for brand_doc_ref in db.collection('brands').stream():
        brand_tags_set = set(brand_doc_ref.to_dict()['tags'])
        common_tags = brand_tags_set.intersection(influencer_tags_set)
        db \
            .collection('compat') \
            .document(brand_doc_ref.id + "&" + request.args['influencer_id']) \
            .set({
                'common_tags': list(common_tags),
                'score': 100 if len(common_tags) == len(brand_tags_set)
                else len(common_tags) * 100 / len(brand_tags_set)
            })

    return jsonify({
        'tags_extracted': str(influencer_tags_set),
        'user': '******',
        'most_used_word': most_used_word_tag
    })
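# Worked example of the compat score used above, on hypothetical tag sets:
# the score is the percentage of the brand's tags that the influencer shares
# (100 when every brand tag is covered).
brand_tags_set = {"fitness", "yoga", "nutrition", "wellness"}
influencer_tags_set = {"yoga", "travel", "nutrition"}
common_tags = brand_tags_set & influencer_tags_set      # {'yoga', 'nutrition'}
score = len(common_tags) * 100 / len(brand_tags_set)    # 2 * 100 / 4 = 50.0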
def getMetadataSoup(metadata):
    m = metadata
    m['genre'] = metadata['genre'].map(lambda x: x.lower().split(' '))
    m['type_name'] = metadata['type_name'].map(lambda x: x.lower().split(' '))
    m['title'] = metadata['title'].map(lambda x: x.lower().split(' '))
    m['author'] = metadata['author'].map(lambda x: x.lower().split(' '))

    # Replace each free-text description with its RAKE keyword vocabulary.
    m['description_keywords'] = ""
    for i in m.index:
        description = m.at[i, 'description']
        r = rake_nltk.Rake()
        r.extract_keywords_from_text(description)
        keywords_dict_score = r.get_word_degrees()
        m.at[i, 'description_keywords'] = list(keywords_dict_score.keys())
    m.drop(columns=['description'], inplace=True)
    m.set_index('item_id', inplace=True)

    # Build the "soup": one space-separated string per item combining all columns.
    m['soup'] = ''
    columns = [col for col in m.columns if col != 'soup']
    for index, rows in m.iterrows():
        words = ''
        for col in columns:
            words = words + ' '.join(rows[col]) + ' '
        # write through the DataFrame (iterrows() yields copies)
        m.at[index, 'soup'] = words
    m.drop(columns=[col for col in m.columns if col != 'soup'], inplace=True)
    return metadata
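# Quick illustration of what rake_nltk's get_word_degrees() returns (the sample
# text is illustrative): a dict mapping each content word to its degree, which
# is why .keys() above yields the keyword vocabulary for a description.
import rake_nltk

r = rake_nltk.Rake()
r.extract_keywords_from_text("A gripping mystery novel set in a small coastal town.")
print(r.get_word_degrees())  # words from longer candidate phrases get higher degrees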
def get_keywords(text, cutoff_score=None, limit=None):
    rake = rake_nltk.Rake()
    rake.extract_keywords_from_text(text)
    score_words = rake.get_ranked_phrases_with_scores()
    if limit is not None:
        score_words = score_words[:limit]
    if cutoff_score is not None:
        score_words = filter(lambda score_word: score_word[0] >= cutoff_score,
                             score_words)
    return [score_word[1] for score_word in score_words]
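# Usage sketch for get_keywords above; the sample text is illustrative.
sample_text = ("Red apples and green apples were compared in a blind taste test, "
               "and the blind taste test favoured the green apples.")
print(get_keywords(sample_text, limit=5))           # top five phrases by RAKE score
print(get_keywords(sample_text, cutoff_score=4.0))  # only phrases scoring at least 4.0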
def get_rake_keywords(text, max_length=100000, metric=None):
    # Fall back to the degree/frequency ratio when no ranking metric is given.
    rake = rake_nltk.Rake(
        language='german',
        max_length=max_length,
        stopwords=stopwords_list,
        ranking_metric=metric if metric is not None
        else rake_nltk.Metric.DEGREE_TO_FREQUENCY_RATIO)
    rake.extract_keywords_from_text(text)
    # if not keywords_number:
    #     keywords_number = len(sentences) // 3
    return rake.get_ranked_phrases()
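# `stopwords_list` is defined elsewhere in the original module; one plausible way
# to build it for the German Rake above (an assumption, not the original code):
import nltk

stopwords_list = nltk.corpus.stopwords.words('german')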
def keyword_extraction(text):
    stoppath = 'data/stoplists/SmartStoplist.txt'
    # Classic RAKE implementation; the positional arguments are (presumably)
    # min characters per word, max words per phrase, and min keyword frequency.
    rake_object = rake.Rake(stoppath, 5, 3, 4)
    keywords = rake_object.run(text)  # list of (phrase, score) pairs
    final_list = list()
    for key in keywords:
        if key[1] >= 4.5:
            final_list.append(key[0])
            print(key[0])
        else:
            # assumes scores come back sorted from highest to lowest
            break
    return final_list
def __init__(self, algorithm: str, **kwargs) -> None:
    """
    Keyword Extractor constructor.

    Parameters
    ----------
    algorithm : str
        The algorithm used to extract keywords.
        Supported algorithms: `rake` and `yake`.
    **kwargs:
        Keyword arguments passed to the keyword extraction algorithm.

    Raises
    ------
    ImportError
        If the keyword extraction algorithm is not installed.
    ValueError
        If the specified extraction algorithm is not supported.
    """
    if algorithm == "rake":
        try:
            import rake_nltk
        except ImportError:
            print(
                "Problem occurred while trying to import rake-nltk. "
                "If the library is not installed visit "
                "https://csurfer.github.io/rake-nltk/_build/html/index.html "
                "for more details."
            )
            raise
        extractor = rake_nltk.Rake(**kwargs)
    elif algorithm == "yake":
        try:
            import yake
        except ImportError:
            print(
                "Problem occurred while trying to import yake. "
                "If the library is not installed visit "
                "https://github.com/LIAAD/yake for more details."
            )
            raise
        extractor = yake.KeywordExtractor(**kwargs)
    else:
        raise ValueError(
            f"{algorithm} is not supported as keyword extraction algorithm. "
            f"Available algorithms: {['rake', 'yake']}"
        )

    self._algorithm = algorithm
    self._kw_extractor = extractor
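# Usage sketch, assuming the enclosing class is named KeywordExtractor (the class
# name is not shown above); keyword arguments are forwarded to the chosen backend.
rake_extractor = KeywordExtractor("rake", max_length=3)  # becomes rake_nltk.Rake(max_length=3)
yake_extractor = KeywordExtractor("yake", n=3, top=10)   # becomes yake.KeywordExtractor(n=3, top=10)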
def __init__(self):
    self.rakeObj = rake_nltk.Rake()
    self.rakeDict = {}
    self.ranking = None
    self.wordList = None
    self.punct = set(string.punctuation)
    # self.snowStemmer = nltk.stem.snowball.SnowballStemmer("english")
    self.stopwords = set(nltk.corpus.stopwords.words('english'))
    self.chunkParser = nltk.RegexpParser(r"""
        NP: {<DT|JJ|NN.*>+}           # Chunk sequences of DT, JJ, NN
        PP: {<IN><NP>}                # Chunk prepositions followed by NP
        VP: {<VB.*><NP|PP|CLAUSE>+$}  # Chunk verbs and their arguments
        CLAUSE: {<NP><VP>}            # Chunk NP, VP
    """)
def register_brand():
    text = request.args['text'].lower()
    r = rake_nltk.Rake()
    r.extract_keywords_from_text(text)
    brands_tags_set = set(r.get_ranked_phrases())
    most_used_word_tag = r.get_ranked_phrases_with_scores()[0]
    print("Tags extracted from the description", brands_tags_set)

    db.collection('brands') \
        .document(request.args['brand_id']) \
        .set({
            'tags': list(brands_tags_set),
            'most_used_word': str(most_used_word_tag)
        }, merge=True)

    return jsonify({
        'tags_extracted': str(brands_tags_set),
        'user': '******',
        'most_used_word': most_used_word_tag
    })
def score(self, text: str, args: list) -> float:
    source_text = args[0]
    # checks if this is a rephrasing example
    process_parity = args[1]
    if not process_parity:
        return 0.0

    r = rake_nltk.Rake(max_length=3)
    r.extract_keywords_from_text(source_text)
    # keep roughly the top 5% of the source's tokens' worth of key phrases
    keyword_max = round(0.05 * len(nltk.word_tokenize(source_text)))
    if keyword_max == 0:
        # source text too short to yield any keywords; avoid dividing by zero
        return 0.0
    phrases_original = r.get_ranked_phrases()[:keyword_max]

    r.extract_keywords_from_text(text)
    phrases_text = r.get_ranked_phrases()[:round(0.10 * len(nltk.word_tokenize(text)))]

    # count how many of the ranked phrases appear as keywords in the user's text
    phrase_count = 0
    for phrase in phrases_original:
        if phrase in phrases_text:
            phrase_count += 1
    return phrase_count / keyword_max
def keywords_rake(text_file, num_keywords):
    # Read text file.
    with open(text_file, 'r') as f:
        text = f.read()

    rake = rake_nltk.Rake()
    # Extraction given the text.
    rake.extract_keywords_from_text(text)

    def no_numbers(phrase):
        for char in phrase:
            if char.isdigit():
                return False
        return True

    # To get keyword phrases ranked highest to lowest with scores.
    return [
        (keyword, np.exp(-score))
        for (score, keyword) in rake.get_ranked_phrases_with_scores()[:num_keywords]
        if no_numbers(keyword)
    ]
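# Usage sketch for keywords_rake above (the file name is hypothetical). Note that
# np.exp(-score) maps a high RAKE score to a value near zero, so smaller numbers
# indicate more salient phrases.
for keyword, weight in keywords_rake("speech.txt", num_keywords=10):
    print(f"{weight:.3f}  {keyword}")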
# Scoring lambdas: OM_Func weights installs by (6 - score) and by price,
# Like_Func just uses the app score.
OM_Func = lambda line, lib: line[lib['app_installs']] * (6.0 - line[lib['app_score']]) * (line[lib['app_price']] + 0.43)
Like_Func = lambda line, lib: line[lib['app_score']]

# -------------------------------------------------------------------------------------------------------------------
# END OF Data Predicting Selection Stage
# -------------------------------------------------------------------------------------------------------------------

# Get rid of unwanted columns and calculate the scores
OM_Input, OM_Targets = formatForMeasure(
    globalDat, globalAttributes,
    ['app_category_primary', 'app_category_secondary'], Like_Func)

# Expand categories to be mutually independent.
formatted = catagoryExpand(OM_Input)

# Get text features
text = [line[attrLib['app_description']] for line in globalDat]

# RAKE parses through and gets the words that seem the most important in the sentence.
raker = rake_nltk.Rake(max_length=2)
for num in range(len(text)):
    raker.extract_keywords_from_text(text[num])
    text[num] = raker.get_ranked_phrases_with_scores()  # list of (score, phrase) tuples
# text = [raker.extract_keywords_from_text(desc).get_ranked_phrases_with_scores() for desc in text]

# Stemming is like training -> train; it helps simplify the text.
snowy = SnowballStemmer("english")
text = [[(tuppy[0], snowy.stem(tuppy[1].lower())) for tuppy in desc]
        for desc in text]  # stem all the words

# Store the 'worth' of the features, which I am determining as a function
# of relevance and frequency.
wordBucket = {}
for desc in text:
    for tuppy in desc:
        if wordBucket.get(tuppy[1]) is None:
            wordBucket[tuppy[1]] = tuppy[0]
def __init__(self, classifier_dims, num_classes, embedding_dims, gaussian_noise,
             dropout, internal_dims, n_layers, featurizer, final_layer_builder,
             n_tokens_in=64, n_tokens_out=16, capabilities2dims=dict(),
             use_as_super=False, **kwargs):
    super(LangFeaturesModel, self).__init__(classifier_dims, num_classes,
                                            embedding_dims, gaussian_noise,
                                            dropout, internal_dims, n_layers,
                                            featurizer, final_layer_builder,
                                            n_tokens_in, n_tokens_out,
                                            use_as_super=True, **kwargs)
    assert "capabilities" in kwargs
    capabilities = kwargs["capabilities"]
    kwargs["rake_dims"] = kwargs["rake_dims"] if "rake_dims" in kwargs else 32
    kwargs["yake_dims"] = kwargs["yake_dims"] if "yake_dims" in kwargs else 32
    assert "key_phrases" not in capabilities or (
        "key_phrases" in capabilities and "spacy" in capabilities)
    use_layer_norm = kwargs["use_layer_norm"] if "use_layer_norm" in kwargs else False
    self.capabilities = capabilities
    embedding_dim = 8
    cap_to_dim_map = {
        "spacy": 128,
        "snlp": 32,
        "key_phrases": 64,
        "nltk": 192,
        "full_view": 64,
        "tmoji": 32,
        "ibm_max": 16,
        "gensim": 256,
        "fasttext_crawl": 256
    }
    cap_to_dim_map.update(capabilities2dims)
    all_dims = sum([cap_to_dim_map[c] for c in capabilities])
    self.cap_to_dim_map = cap_to_dim_map
    self.all_dims = all_dims

    if "spacy" in capabilities:
        tr = pytextrank.TextRank(token_lookback=7)
        self.nlp = spacy.load("en_core_web_lg", disable=[])
        self.nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
        spacy_in_dims = (96 * 2) + (11 * embedding_dim) + 2
        self.spacy_nn = ExpandContract(spacy_in_dims, cap_to_dim_map["spacy"],
                                       dropout, use_layer_norm=use_layer_norm,
                                       groups=(2, 4))

    if "fasttext_crawl" in capabilities:
        self.bpe = BPEmb(dim=200)
        self.cngram = CharNGram()
        fasttext_crawl_file = kwargs["fasttext_crawl_file"] \
            if "fasttext_crawl_file" in kwargs else "crawl-300d-2M-subword.bin"
        self.crawl = fasttext.load_model(fasttext_crawl_file)
        self.crawl_nn = ExpandContract(200 + 300 + 100,
                                       cap_to_dim_map["fasttext_crawl"], dropout,
                                       use_layer_norm=use_layer_norm, groups=(4, 4))

    if "gensim" in capabilities:
        gensim = [
            api.load("glove-twitter-50"),
            api.load("glove-wiki-gigaword-50"),
            api.load("word2vec-google-news-300"),
            api.load("conceptnet-numberbatch-17-06-300")
        ]
        self.gensim = gensim
        self.gensim_nn = ExpandContract(400, cap_to_dim_map["gensim"], dropout,
                                        use_layer_norm=use_layer_norm, groups=(4, 4))

    if "full_view" in capabilities:
        full_sent_in_dims = 300
        self.full_sent_nn = ExpandContract(full_sent_in_dims,
                                           cap_to_dim_map["full_view"], dropout,
                                           use_layer_norm=use_layer_norm,
                                           groups=(4, 4))

    if "snlp" in capabilities:
        import stanza
        self.snlp = stanza.Pipeline('en',
                                    processors='tokenize,pos,lemma,depparse,ner',
                                    use_gpu=False, pos_batch_size=2048)
        self.snlp_nn = ExpandContract(embedding_dim * 5, cap_to_dim_map["snlp"],
                                      dropout, use_layer_norm=use_layer_norm)

    if "key_phrases" in capabilities:
        import yake
        self.kw_extractor = yake.KeywordExtractor(lan="en", n=3, dedupLim=0.9,
                                                  dedupFunc='seqm', windowsSize=3,
                                                  top=10, features=None)
        self.key_occ_cnt_pytextrank = nn.Embedding(8, embedding_dim)
        nn.init.normal_(self.key_occ_cnt_pytextrank.weight, std=1 / embedding_dim)
        self.key_wc_pytextrank = nn.Embedding(4, embedding_dim)
        nn.init.normal_(self.key_wc_pytextrank.weight, std=1 / embedding_dim)

        yake_dims = kwargs["yake_dims"] if "yake_dims" in kwargs else 32
        self.yake_dims = yake_dims
        self.yake_nn = ExpandContract(300, yake_dims, dropout,
                                      use_layer_norm=use_layer_norm, groups=(2, 2))

        try:
            from multi_rake import Rake
            rake_dims = kwargs["rake_dims"] if "rake_dims" in kwargs else 32
            self.rake_dims = rake_dims
            self.rake_nn = ExpandContract(300, rake_dims, dropout,
                                          use_layer_norm=use_layer_norm,
                                          groups=(2, 2))
            self.rake = Rake(language_code="en")
            keyphrases_dim = 2 * embedding_dim + rake_dims + yake_dims
        except:
            self.rake = None
            keyphrases_dim = 2 * embedding_dim + yake_dims
        self.keyphrase_nn = ExpandContract(keyphrases_dim,
                                           cap_to_dim_map["key_phrases"], dropout,
                                           use_layer_norm=use_layer_norm,
                                           groups=(4, 4))

    fasttext_file = kwargs["fasttext_file"] \
        if "fasttext_file" in kwargs else "wiki-news-300d-1M-subword.bin"
    if not set(capabilities).isdisjoint({"key_phrases", "full_view", "nltk"}):
        self.text_model = fasttext.load_model(fasttext_file)

    self.pdict = get_all_tags()
    self.tag_em = nn.Embedding(len(self.pdict) + 1, embedding_dim)
    nn.init.normal_(self.tag_em.weight, std=1 / embedding_dim)
    self.sw_em = nn.Embedding(2, embedding_dim)
    nn.init.normal_(self.sw_em.weight, std=1 / embedding_dim)
    self.sent_start_em = nn.Embedding(2, embedding_dim)
    nn.init.normal_(self.sent_start_em.weight, std=1 / embedding_dim)
    self.is_oov_em = nn.Embedding(2, embedding_dim)
    nn.init.normal_(self.is_oov_em.weight, std=1 / embedding_dim)
    self.has_digit_em = nn.Embedding(2, embedding_dim)
    nn.init.normal_(self.has_digit_em.weight, std=1 / embedding_dim)
    self.is_mask_em = nn.Embedding(2, embedding_dim)
    nn.init.normal_(self.is_mask_em.weight, std=1 / embedding_dim)
    self.w_len = nn.Embedding(16, embedding_dim)
    nn.init.normal_(self.w_len.weight, std=1 / embedding_dim)
    self.wc_emb = nn.Embedding(16, embedding_dim)
    nn.init.normal_(self.wc_emb.weight, std=1 / embedding_dim)

    if "nltk" in capabilities:
        import rake_nltk
        from textblob import TextBlob
        from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as VaderSentimentIntensityAnalyzer
        self.stop_words = set(stopwords.words('english'))
        self.rake_nltk = rake_nltk.Rake()
        self.key_wc_rake_nltk = nn.Embedding(4, embedding_dim)
        nn.init.normal_(self.key_wc_rake_nltk.weight, std=1 / embedding_dim)
        self.nltk_sid = SentimentIntensityAnalyzer()
        self.vader_sid = VaderSentimentIntensityAnalyzer()
        in_dims = 310 + 5 * embedding_dim
        self.nltk_nn = ExpandContract(in_dims, cap_to_dim_map["nltk"], dropout,
                                      use_layer_norm=use_layer_norm, groups=(2, 4))

    if "ibm_max" in capabilities:
        from ..external import ModelWrapper
        self.ibm_max = ModelWrapper()
        for p in self.ibm_max.model.parameters():
            p.requires_grad = False
        self.ibm_nn = ExpandContract(6, cap_to_dim_map["ibm_max"], dropout,
                                     use_layer_norm=use_layer_norm, groups=(1, 1))

    if "tmoji" in capabilities:
        from torchmoji.sentence_tokenizer import SentenceTokenizer
        from torchmoji.model_def import torchmoji_emojis
        from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH
        with open(VOCAB_PATH, 'r') as f:
            maxlen = self.n_tokens_in
            self.vocabulary = json.load(f)
            self.st = SentenceTokenizer(self.vocabulary, maxlen)
        self.tmoji = torchmoji_emojis(PRETRAINED_PATH)
        for p in self.tmoji.parameters():
            p.requires_grad = False
        self.tm_nn = ExpandContract(64, cap_to_dim_map["tmoji"], dropout,
                                    use_layer_norm=use_layer_norm, groups=(1, 1))

    self.contract_nn = ExpandContract(self.all_dims, embedding_dims, dropout,
                                      use_layer_norm=True, unit_norm=False,
                                      groups=(4, 4))

    if not use_as_super:
        if featurizer == "cnn":
            self.featurizer = CNN1DFeaturizer(n_tokens_in, embedding_dims,
                                              n_tokens_out, classifier_dims,
                                              internal_dims, n_layers,
                                              gaussian_noise, dropout)
        elif featurizer == "gru":
            self.featurizer = GRUFeaturizer(n_tokens_in, embedding_dims,
                                            n_tokens_out, classifier_dims,
                                            internal_dims, n_layers,
                                            gaussian_noise, dropout)
        elif featurizer == "basic":
            self.featurizer = BasicFeaturizer(n_tokens_in, embedding_dims,
                                              n_tokens_out, classifier_dims,
                                              internal_dims, n_layers,
                                              gaussian_noise, dropout)
        elif featurizer == "transformer":
            self.attention_drop_proba = kwargs["attention_drop_proba"] \
                if "attention_drop_proba" in kwargs else 0.0
            n_encoders = kwargs.pop("n_encoders", n_layers)
            n_decoders = kwargs.pop("n_decoders", n_layers)
            self.featurizer = TransformerFeaturizer(n_tokens_in, embedding_dims,
                                                    n_tokens_out, classifier_dims,
                                                    internal_dims, n_encoders,
                                                    n_decoders, gaussian_noise,
                                                    dropout,
                                                    self.attention_drop_proba)
        else:
            raise NotImplementedError()
        self.final_layer = final_layer_builder(classifier_dims, n_tokens_out,
                                               num_classes, dropout, **kwargs)

    if "stored_model" in kwargs:
        load_stored_params(self, kwargs["stored_model"])

    self.reg_layers = get_regularization_layers(self)
def get_imp_terms_rake(x):
    r = rake_nltk.Rake()
    r.extract_keywords_from_text(x)
    return r.get_ranked_phrases()[0:4]
    x = pd.Series(x, index=tfidf_bigrams.get_feature_names())
    x = x.sort_values(ascending=False)
    return x.head(4).index.tolist()


transcripts['imp_terms_bigrams'] = [
    get_imp_terms_bigram(x) for x in matrix_bigrams
]
transcripts['imp_terms_bigrams'] = transcripts['imp_terms_bigrams'].map(
    lambda x: ",".join(x))

## Keywords using RAKE, didn't work
import rake_nltk

r = rake_nltk.Rake()


def get_imp_terms_rake(x):
    r = rake_nltk.Rake()
    r.extract_keywords_from_text(x)
    return r.get_ranked_phrases()[0:4]


transcripts['imp_terms_rake'] = [get_imp_terms_rake(x) for x in Text]
transcripts['imp_terms_rake'] = transcripts['imp_terms_rake'].map(
    lambda x: ",".join(x))
transcripts.drop("imp_terms_rake", axis=1, inplace=True)

transcripts.to_csv("transcripts_key_words.csv", index=False, encoding="utf-8")