def get_phrases(text=''):
    """Return at most five of the top-ranked RAKE phrases found in ``text``.

    ``text`` is passed through ``''.join`` first, so a plain string is used
    unchanged and an iterable of strings is concatenated with no separator.
    """
    extractor = Rake()
    extractor.extract_keywords_from_text(''.join(text))
    # A slice never over-runs, so shorter results come back whole.
    return extractor.get_ranked_phrases()[:5]
def phrase(ques):
    """Return the individual lower-cased words of every ranked phrase in ``ques``.

    Phrases are taken in ranked order and each one is split on whitespace,
    so a word that appears in several phrases appears several times.
    """
    extractor = Rake()
    extractor.extract_keywords_from_text(ques)
    return [
        word
        for ranked_phrase in extractor.get_ranked_phrases()
        for word in ranked_phrase.lower().split()
    ]
def extract_keywords_from_doc(doc, phrases=True, return_scores=False):
    """Extract keywords from a document given as a string or a list/tuple of strings.

    With ``phrases`` true, RAKE is used: a list/tuple is treated as sentences,
    anything else as one text. The result is the ranked phrases, or
    ``(phrase, score)`` pairs when ``return_scores`` is true.

    With ``phrases`` false, every token produced by ``nltk.word_tokenize``
    whose lower-cased form is not in ``stop_words`` is returned, in order
    (``return_scores`` is ignored on this path).
    """
    is_sequence = isinstance(doc, (list, tuple))

    if not phrases:
        texts = doc if is_sequence else [doc]
        return [
            token
            for piece in texts
            for token in nltk.word_tokenize(piece)
            if token.lower() not in stop_words
        ]

    extractor = Rake()
    if is_sequence:
        extractor.extract_keywords_from_sentences(doc)
    else:
        extractor.extract_keywords_from_text(doc)

    if not return_scores:
        return extractor.get_ranked_phrases()
    # RAKE yields (score, phrase); callers of this function get (phrase, score).
    return [
        (ranked_phrase, score)
        for score, ranked_phrase in extractor.get_ranked_phrases_with_scores()
    ]
def predict_rake(tasks, order, phrases):
    """Predict a label for each task by matching phrase terms against it.

    For every ``(task, actual)`` pair, each row of ``phrases`` is tokenized
    with ``word_tokenize``; terms that are stop words, shorter than three
    characters, or absent from ``model`` are dropped. Two scores are kept
    per ``row["expected"]`` label:

    * occurrence: the sum of ``task[term]`` over terms present in ``task``
    * coverage: the fraction of the row's terms present in ``task``

    Both score maps are passed through ``normalize_score`` and combined with
    weights 1 (occurrence) and 0 (coverage) before ``get_prediction`` picks
    the label.

    :param tasks: iterable of mappings from word to a numeric weight
        (presumably term counts -- confirm against the caller).
    :param order: iterable of the expected label for each task.
    :param phrases: object exposing ``iterrows()`` whose rows have
        ``"phrase"`` and ``"expected"`` entries.
    :return: ``(predictions, expected)`` lists, aligned by task.
    """
    predictions = []
    expected = []
    for task, actual in zip(tasks, order):
        expected.append(actual)
        scores = {}
        cover_scores = {}
        for _, row in phrases.iterrows():
            search_terms = [
                term
                for term in word_tokenize(row["phrase"])
                if term not in stop_words and len(term) > 2 and term in model
            ]
            covered = [term for term in search_terms if term in task]
            occurs = sum(task[term] for term in covered)
            # A phrase whose terms were all filtered out covers nothing;
            # guard the division instead of raising ZeroDivisionError.
            coverage = len(covered) / len(search_terms) if search_terms else 0
            scores[row["expected"]] = occurs
            cover_scores[row["expected"]] = coverage
        scores = normalize_score(scores)
        # Normalize the coverage map itself (not the occurrence scores again).
        # NOTE(review): assumes normalize_score keeps every key -- confirm.
        cover_scores = normalize_score(cover_scores)
        for label in scores:
            # Coverage currently carries zero weight in the final score.
            scores[label] = scores[label] * 1 + cover_scores[label] * 0
        predictions.append(get_prediction(scores))
    return predictions, expected
def get_key_phrases_from_text(text, max_length=None):
    """
    Extract the key phrases of an html page's text.

    :param text: the text from an html page.
    :type text: str
    :param max_length: the max length of each key phrase; ``None`` leaves
        the extractor's own default in place.
    :type max_length: int or None
    :return: the ranked phrases after ``filter_key_phrases`` has been applied.
    :rtype: list of str
    """
    # Only forward max_length when the caller actually supplied one.
    rake_options = {} if max_length is None else {"max_length": max_length}
    extractor = Rake(**rake_options)
    extractor.extract_keywords_from_text(text)
    ranked = extractor.get_ranked_phrases()
    return filter_key_phrases(ranked)
def _get_keyphrases(self):
    """Return the key phrases of ``self.instance.content`` to search for in ES.

    Phrases scoring at least ``self.min_rank_score`` are returned. When none
    reach that score, the first ``self.default_list_length`` ranked phrases
    are returned instead.
    """
    extractor = Rake()
    extractor.extract_keywords_from_text(self.instance.content)
    scored = extractor.get_ranked_phrases_with_scores()

    above_threshold = [
        text for score, text in scored if score >= self.min_rank_score
    ]
    if above_threshold:
        return above_threshold
    # Nothing was strong enough: fall back to the head of the ranking.
    return [text for _, text in scored[:self.default_list_length]]
class KeywordFinder():
    """Wrapper around one ``Rake`` instance limited to phrases of 1-5 words."""

    rake: Rake

    def __init__(self):
        self.rake = Rake(min_length=1, max_length=5)

    def find_keywords(self, text):
        """Return every ranked phrase found in ``text``."""
        self.rake.extract_keywords_from_text(text)
        return self.rake.get_ranked_phrases()

    def find_keyword(self, text):
        """Return only the top-ranked phrase found in ``text``.

        Indexing ``[0]`` fails if no phrase was extracted.
        """
        self.rake.extract_keywords_from_text(text)
        top_phrase = self.rake.get_ranked_phrases()[0]
        return top_phrase
def analyse(tab):
    """Run RAKE over selected fields of every record in ``tab``.

    For each record the values under the fixed set of (Russian) field names
    are joined with newlines and ranked.

    :param tab: iterable of mappings; each must contain every key in ``keys``
        with a string value.
    :return: one list per record, as returned by
        ``Rake.get_ranked_phrases_with_scores``.
    """
    keys = [
        'Наименование продукта/технологии',
        'Уникальные характеристики',
        'Задачи, которые решает продукт',
        'Технические характеристики',
        'Ожидаемые эффекты',
    ]
    # The stopword list is chosen inside the Rake constructor; assigning a
    # ``language`` attribute afterwards would leave English stopwords active.
    extractor = Rake(language="russian")

    res = []
    for record in tab:
        text = "\n".join(record[key] for key in keys)
        extractor.extract_keywords_from_text(text)
        res.append(extractor.get_ranked_phrases_with_scores())
    return res
def search_keyword(request):
    """Search articles by topic name or by shared RAKE key phrases.

    On POST:

    1. ``search_title`` is read from the form; when absent, the response
       body is ``"found"``.
    2. The search is stored as a ``History`` row for the entered user.
    3. If a topic's name equals the search title (case-insensitive), that
       topic's articles are rendered with ``check == "1"``.
    4. Otherwise every article whose description shares at least one ranked
       phrase with the search title is collected; if any exist, their
       descriptions are rendered with ``check == "2"``.

    In every other case the response body is ``"Notfound"``.
    """
    if request.method == "POST":
        # .get() returns None for a missing field, which is what the guard
        # below tests for; indexing would raise before the guard could run.
        search_title = request.POST.get("search_title")
        if search_title is None:
            return HttpResponse("found")

        entered_user = UserDataManagement.MainData.EnteredUser
        user = User.objects.get(pk=entered_user.UserEmail)

        history = History()
        history.Date_Time = datetime.datetime.now()
        history.SearchTitle = search_title
        history.UserEmailFK = user
        history.save()

        topics = Topic.objects.all()
        wanted_name = str(search_title).lower()
        for topic in topics:
            if str(topic.TopicName).lower() == wanted_name:
                return render(request, "Search.html", {
                    "articles": topic.article_set.all(),
                    "UserName": entered_user.UserName,
                    "check": "1",
                })

        extractor = Rake()
        extractor.extract_keywords_from_text(search_title)
        # Built once; every article is compared against this same set.
        query_phrases = set(extractor.get_ranked_phrases())

        matching_descriptions = []
        for topic in topics:
            for article in topic.article_set.all():
                extractor.extract_keywords_from_text(article.ArticleDescription)
                if query_phrases.intersection(extractor.get_ranked_phrases()):
                    matching_descriptions.append(article.ArticleDescription)

        if matching_descriptions:
            return render(request, "Search.html", {
                "articles": matching_descriptions,
                "UserName": entered_user.UserName,
                "check": "2",
            })

    return HttpResponse("Notfound")
def parse_keywords(self):
    """Extract keywords from ``self.sentence``.

    * ``self.keyword_limit == 0``: every phrase scoring above 1 is appended
      to ``self.keywords`` and that list is returned (repeated calls keep
      appending to it).
    * otherwise: the first ``self.keyword_limit`` ranked phrases are
      returned and ``self.keywords`` is left untouched.
    """
    extractor = Rake()

    if self.keyword_limit != 0:
        extractor.extract_keywords_from_sentences([self.sentence])
        ranked = extractor.ranked_phrases
        return ranked[0:self.keyword_limit]

    extractor.extract_keywords_from_text(self.sentence)
    for score, text in extractor.get_ranked_phrases_with_scores():
        if score > 1:
            self.keywords.append(text)
    return self.keywords
class KeywordExtraction:
    """RAKE keyword extraction over one fixed piece of text."""

    # Phrases must score strictly above this value to pass the threshold filter.
    THRESHOLD = 3.5

    def __init__(self, text):
        self.r = Rake()
        self.text = text

    def return_keywords(self) -> list:
        """Return the ranked phrases of ``self.text``."""
        self.r.extract_keywords_from_text(self.text)
        return self.r.get_ranked_phrases()

    def return_keywords_with_score(self) -> list:
        """Return the ranked phrases of ``self.text`` as (score, phrase) pairs."""
        self.r.extract_keywords_from_text(self.text)
        return self.r.get_ranked_phrases_with_scores()

    def return_keywords_with_score_more_than_threshold(self) -> list:
        """Return only the phrases whose score is greater than ``THRESHOLD``."""
        return [
            phrase
            for score, phrase in self.return_keywords_with_score()
            if score > self.THRESHOLD
        ]
def filter_bert(res, query, w, num, num_bert):
    """Keep the ``num_bert`` hits whose key text scores highest against ``query``.

    Each hit's ``['_source']['title_body']`` is reduced with
    ``extract_key(text, w, rake)``, the reduced texts and the query are
    encoded with ``model.encode``, and every text embedding is scored
    against the query embedding with ``scoring_bert``.

    :param res: sequence of hits, each indexable as ``hit['_source']['title_body']``.
    :param query: the query; converted with ``str``.
    :param w: forwarded unchanged to ``extract_key``.
    :param num: not used by this function.
    :param num_bert: how many hits to keep.
    :return: the selected hits, highest score first.
    """
    extractor = Rake()
    key_texts = [
        extract_key(str(hit['_source']['title_body']), w, extractor)
        for hit in res
    ]

    doc_embeddings = np.array(model.encode(key_texts))
    query_embedding = np.array(model.encode([str(query)]))[0]

    similarities = np.array(
        [scoring_bert(query_embedding, embedding) for embedding in doc_embeddings]
    )
    # argsort is ascending: take the tail and flip it for best-first order.
    best_first = similarities.argsort()[-num_bert:][::-1]
    return [res[position] for position in best_first]
def __init__(self, name):
    """Load the language models and helper objects used by the app.

    Loads the pretrained ``gpt2`` and ``t5-base`` models and tokenizers,
    overwrites the T5 weights with ``models/final.pt`` (resolved relative
    to this file), and creates the spaCy, GPT-2, T5, RAKE and Genius client
    wrappers. The artist list is read from ``data/artists.txt``, one entry
    per line with line endings kept. The Genius token comes from the
    ``ACCESS_TOKEN`` environment variable.

    :param name: forwarded to the parent class constructor.
    """
    super(App, self).__init__(name)

    print("[INFO] Loading models")
    cur_dir = os.path.dirname(os.path.realpath(__file__))

    gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    # add the EOS token as PAD token to avoid warnings
    gpt2_model = GPT2LMHeadModel.from_pretrained(
        "gpt2", pad_token_id=gpt2_tokenizer.eos_token_id
    )

    t5_model = T5ForConditionalGeneration.from_pretrained("t5-base")
    t5_tokenizer = T5Tokenizer.from_pretrained("t5-base")
    t5_state_path = os.path.join(cur_dir, "./models/final.pt")
    t5_model.load_state_dict(torch.load(t5_state_path))

    print("[INFO] Initializing classes")
    self.spacy_model = SpacyModel(size="md")
    self.gpt2 = GPT2(gpt2_tokenizer, gpt2_model)
    self.t5 = T5(t5_tokenizer, t5_model, 100)
    self.r = Rake()

    token = os.getenv("ACCESS_TOKEN")
    dataset = os.path.join(cur_dir, './data/artists.txt')
    # Context manager so the file handle is closed once the lines are read.
    with open(dataset) as artists_file:
        self.artists = artists_file.readlines()
    self.client = GeniusClient(token)
def shorten_title(title, max_title_len, alpha_only=True):
    """
    Shortens a title using important phrases and keywords in the title.

    The title is lower-cased first. If it already fits it is returned as-is.
    Otherwise the highest ranked RAKE phrase is used when it fits; failing
    that, words known to RAKE are appended in alphabetical order for as long
    as the result still fits. If RAKE extracts nothing at all (for example
    the title consists only of stop words), the cleaned title is truncated.

    Args:
        title (str): Title to shorten.
        max_title_len (int): Maximum length of the final title.
        alpha_only (bool): Whether to strip everything except ``a-z`` and
            spaces before extracting phrases.

    Returns:
        str: Shortened, all lower-case title whose length is at most
            `max_title_len`.
    """
    title = title.lower()
    if len(title) <= max_title_len:
        # Title is already short enough.
        return title

    if alpha_only:
        title = re.sub("[^a-z ]", "", title)

    # Try using the highest ranked phrase from the title.
    r = Rake()
    r.extract_keywords_from_text(title)
    phrases = r.get_ranked_phrases()
    if phrases and len(phrases[0]) <= max_title_len:
        return phrases[0]

    # Title is still too long. Use as many of the extracted words as will fit
    # within the max title length. sorted() over the word-degree mapping
    # orders the words alphabetically.
    words = sorted(r.get_word_degrees())
    if not words:
        # Nothing was extracted, so there is no keyword to build from.
        return title[:max_title_len].strip()

    new_title = words[0]
    if len(new_title) > max_title_len:
        # Cut the single-word title short.
        return new_title[:max_title_len]

    for w in words[1:]:
        append_title = "{} {}".format(new_title, w)
        if len(append_title) > max_title_len:
            break
        new_title = append_title
    return new_title
def test_extract_keywords_from_text(self):
    """Ranked phrases for ``self.test_text`` match the expected ordering.

    The same ordering must come back both from ``get_ranked_phrases`` and
    from the phrase half of ``get_ranked_phrases_with_scores``.
    """
    expected = [
        'minimal generating sets',
        'linear diophantine equations',
        'minimal supporting set',
        'minimal set',
        'linear constraints',
        'upper bounds',
        'strict inequations',
        'nonstrict inequations',
        'natural numbers',
        'mixed types',
        'corresponding algorithms',
        'considered types',
        'set',
        'types',
        'considered',
        'algorithms',
        'used',
        'systems',
        'system',
        'solving',
        'solutions',
        'given',
        'criteria',
        'construction',
        'constructing',
        'components',
        'compatibility',
    ]
    rake = Rake()
    rake.extract_keywords_from_text(self.test_text)

    self.assertEqual(rake.get_ranked_phrases(), expected)
    phrases_from_scores = [
        ranked for _, ranked in rake.get_ranked_phrases_with_scores()
    ]
    self.assertEqual(phrases_from_scores, expected)
def test_extract_keywords_from_text_word_frequency_metric():
    """Ranking with ``Metric.WORD_FREQUENCY`` yields the expected ordering.

    Uses the module-level ``text`` fixture. The expected list contains
    repeated entries (e.g. 'systems' four times).
    """
    expected = [
        'minimal supporting set',
        'minimal set',
        'minimal generating sets',
        'considered types',
        'systems',
        'systems',
        'systems',
        'systems',
        'mixed types',
        'linear diophantine equations',
        'types',
        'strict inequations',
        'solutions',
        'solutions',
        'solutions',
        'set',
        'nonstrict inequations',
        'linear constraints',
        'corresponding algorithms',
        'upper bounds',
        'natural numbers',
        'criteria',
        'criteria',
        'considered',
        'compatibility',
        'compatibility',
        'algorithms',
        'used',
        'system',
        'solving',
        'given',
        'construction',
        'constructing',
        'components',
    ]
    rake = Rake(ranking_metric=Metric.WORD_FREQUENCY)
    rake.extract_keywords_from_text(text)

    assert rake.get_ranked_phrases() == expected
    phrases_from_scores = [
        ranked for _, ranked in rake.get_ranked_phrases_with_scores()
    ]
    assert phrases_from_scores == expected
def Extract(train_data, test_data, max_score, j, Enter_rank=True):
    """Score ``test_data`` against the keyword weights stored in ``key[j]``.

    Keywords are extracted from both texts with RAKE, split with
    ``break_phrases`` and passed through ``lematize``. Only the test
    keywords are used afterwards: they are appended to the module-level
    ``testlist``, and each entry of ``testlist`` that is a key of
    ``key[j]`` adds ``weight * max_score / 100`` to the result. The result
    is then shown in a sub-window of the module-level ``app``.

    NOTE(review): the nesting below was reconstructed from a source whose
    indentation was lost -- confirm it against the original file, in
    particular the position of the ``i = i + 1`` statements and of the
    final ``app.addButton`` call.

    :param train_data: text the training keywords are extracted from
        (``train_keywords`` is computed but never read).
    :param test_data: text the test keywords are extracted from.
    :param max_score: scale applied to the matched weights.
    :param j: index into the module-level ``key`` container; the name is
        later reused as a loop variable.
    :param Enter_rank: not used by this function.
    """
    train, test = Rake(), Rake()
    train.extract_keywords_from_text(train_data)
    test.extract_keywords_from_text(test_data)
    train_keywords = lematize(break_phrases(train.get_ranked_phrases()))
    test_keywords = lematize(break_phrases(test.get_ranked_phrases()))
    # testlist is a module-level list, so entries accumulate across calls.
    for x in test_keywords:
        print(x)
        testlist.append(x)
    result = 0
    # NOTE: this local shadows the builtin ``dict`` for the rest of the function.
    dict = key[j]
    trainlist = []
    print(dict)
    for x in dict.keys():
        trainlist.append(x)
    print(trainlist)
    # i indexes trainlist in the synonym branch below.
    i = 0
    for x in testlist:
        if x in dict.keys():
            # Direct hit: add this keyword's weight scaled by max_score.
            print(x)
            result = result + (dict[x] * max_score) / 100
            print(result, dict[x])
            i = i + 1
        else:
            # No direct hit: look up synonyms of the i-th training keyword.
            # NOTE(review): trainlist[i] can run past the end of trainlist.
            syn = PyDictionary.PyDictionary().synonym(trainlist[i])
            if syn == None:
                continue
            print(syn)
            for j in syn:
                if j in testlist:
                    print(trainlist[i], j)
                    print(dict)
                    print(x)
                    # NOTE(review): x is not a key of dict in this branch,
                    # so dict[x] looks like it raises KeyError -- confirm
                    # whether trainlist[i] was intended.
                    dict[j] = (dict[x] * max_score) / 100
                    result = result + dict[j] * max_score
                    matched.append(i)
            i = i + 1
    # Show the computed result in a modal sub-window of the GUI.
    app.startSubWindow("one", modal=True)
    app.addLabel("l1", result)
    app.stopSubWindow()
    app.addButton("get score", score)
def keywords_extraction(article, method, k=20, with_weight=False):
    """Extract keywords from ``article`` with the selected method.

    :param article: a string or a list of strings (methods 1-3 wrap a lone
        string into a list; method 0 forwards it unchanged).
    :param method: 0 = LDA topic words, 1 = ``jieba.analyse.extract_tags``,
        2 = ``textrank.extract_key_phrases``, 3 = RAKE.
    :param k: number of keywords for methods 0 and 1.
    :param with_weight: request weights/scores (methods 0, 1 and 3).
    :return: method dependent. For RAKE without weights, the first
        ``len(rank) // 2 + 1`` ranked phrases; with weights, the first
        ``len(rank) // 2`` ``(score, phrase)`` pairs.
    :raises ValueError: if ``method`` is not one of 0-3.
    """
    if method == 0:
        model = lda.build_lda_model(article, 1)
        return lda.get_topic(
            model, num_topics=1, num_words=k, with_weight=with_weight
        )[0]

    if method not in (1, 2, 3):
        raise ValueError('wrong method code')

    if isinstance(article, str):
        article = [article]

    if method == 1:
        doc = "".join(text_process.general_processing_file(article))
        return jieba.analyse.extract_tags(
            doc, topK=k, withWeight=with_weight, allowPOS=()
        )

    if method == 2:
        doc = "".join(text_process.general_processing_file(article))
        return textrank.extract_key_phrases(doc)

    # method == 3: RAKE
    doc = "".join(text_process.text_processing_rake(article))
    r = Rake()
    r.extract_keywords_from_text(doc)
    rank = r.get_ranked_phrases()
    # Floor division: a float slice bound raises TypeError on Python 3.
    half = len(rank) // 2
    if not with_weight:
        return rank[:half + 1]
    # NOTE(review): the weighted branch keeps one item fewer than the
    # unweighted one; preserved as found -- confirm whether that is intended.
    return r.get_ranked_phrases_with_scores()[:half]
def extract_keywords():
    """Return one space-joined string of ranked RAKE phrases per review.

    Reviews are read from the second column of the module-level ``dataset``.
    Each review's ranked phrases are joined with single spaces, and the
    resulting strings are returned in review order.

    No sentiment scoring happens here; callers that need polarity scores
    should run their analyzer over the returned strings.
    """
    review_data = dataset.iloc[:, 1]

    rake = Rake()
    sentiment_result = []
    for review in review_data:
        rake.extract_keywords_from_text(review)
        ranked_phrases = rake.get_ranked_phrases()
        # Phrases are joined with spaces (not commas).
        sentiment_result.append(" ".join(str(p) for p in ranked_phrases))
    return sentiment_result
def run_rake_model(posts, rake_limit):
    """Return the top ``rake_limit`` RAKE phrases for the combined posts.

    The posts are joined with single spaces and ranked with phrases of one
    to three words, using the degree-to-frequency ratio metric.

    :param posts: iterable of strings.
    :param rake_limit: maximum number of phrases to return.
    :return: ranked phrases, highest first, truncated to ``rake_limit``.
    """
    combined_text = " ".join(posts)
    r = Rake(
        max_length=3,
        min_length=1,
        ranking_metric=Metric.DEGREE_TO_FREQUENCY_RATIO,
    )
    r.extract_keywords_from_text(combined_text)
    # Keyword phrases ranked highest to lowest; honour the caller's limit.
    return r.get_ranked_phrases()[:rake_limit]


# run_rake_model("F://Armitage_project/crawl_n_depth/extracted_json_files/www.axcelerate.com.au_0_data.json",50)
def tagging(filename):
    """Tag every column of an Excel sheet with its most prominent words.

    For each column, cells different from 0 are converted to strings and
    joined with spaces, then analysed with RAKE. Two lists of up to five
    words longer than three characters are produced: the most frequent
    words, and the words with the highest degree. Both lists are printed
    and stored as the two rows of that column in ``datafile/tagged.xlsx``.

    :param filename: path of the Excel file to read.
    """
    df = pd.read_excel(filename)
    extractor = Rake()
    tagged = pd.DataFrame()

    for column in df.columns.values:
        cells = [str(cell) for cell in df[column].values if cell != 0]
        extractor.extract_keywords_from_text(" ".join(cells))
        frequencies = extractor.get_word_frequency_distribution()
        degrees = extractor.get_word_degrees()

        top_by_frequency = [
            word for word, _ in frequencies.most_common() if len(word) > 3
        ][:5]
        # Words ordered by degree, highest first.
        top_by_degree = [
            word
            for word in sorted(degrees, key=degrees.get, reverse=True)
            if len(word) > 3
        ][:5]

        print(top_by_frequency)
        print(top_by_degree)
        tagged[column] = [top_by_frequency, top_by_degree]

    tagged.to_excel("datafile/tagged.xlsx")
def test_extract_keywords_from_text_word_frequency_metric(self):
    """Ranking ``self.text`` with ``Metric.WORD_FREQUENCY`` gives the expected order.

    The same ordering must come back both from ``get_ranked_phrases`` and
    from the phrase half of ``get_ranked_phrases_with_scores``.
    """
    expected = [
        "minimal supporting set",
        "minimal set",
        "minimal generating sets",
        "considered types",
        "mixed types",
        "linear diophantine equations",
        "types",
        "strict inequations",
        "set",
        "nonstrict inequations",
        "linear constraints",
        "corresponding algorithms",
        "upper bounds",
        "natural numbers",
        "considered",
        "algorithms",
        "used",
        "systems",
        "system",
        "solving",
        "solutions",
        "given",
        "criteria",
        "construction",
        "constructing",
        "components",
        "compatibility",
    ]
    rake = Rake(ranking_metric=Metric.WORD_FREQUENCY)
    rake.extract_keywords_from_text(self.text)

    self.assertEqual(rake.get_ranked_phrases(), expected)
    phrases_from_scores = [
        ranked for _, ranked in rake.get_ranked_phrases_with_scores()
    ]
    self.assertEqual(phrases_from_scores, expected)
def index(request):
    """Render the index page; on POST, store the text and its key phrases.

    On POST the submitted ``text`` is saved as a ``Text`` row, its ranked
    RAKE phrases are each looked up through ``API_ENDPOINT`` with the
    ``wbsearchentities`` action, and one key-phrase row is created per
    phrase with:

    * ``exist``: the search returned at least one result
    * ``disambiguation``: the search returned more than one result

    For any other method the page is rendered with the first ten entries of
    the reversed module-level ``df['phrase']`` column as ``top_keyphrase``.
    """
    textform = TextForm()

    if request.method != 'POST':
        return render(request, "index.html", {
            "form": textform,
            "top_keyphrase": df['phrase'].to_list()[::-1][:10]
        })

    text = request.POST.get("text")
    text_object = Text.objects.create(text=text)

    extractor = Rake()
    extractor.extract_keywords_from_text(text)

    for phrase in extractor.get_ranked_phrases():
        response = requests.get(API_ENDPOINT, params={
            'action': 'wbsearchentities',
            'format': 'json',
            'language': 'en',
            'search': phrase
        })
        result = response.json()['search']
        exist = result != []
        disambiguation = exist and len(result) > 1
        text_object.keyphrase_set.create(phrase=phrase,
                                         exist=exist,
                                         disambiguation=disambiguation)

    return render(request, "index.html", {"form": textform})
.map(lambda row : extract_with_row_id(row["id"], row["summary"]))\ .flatMap(lambda xs: [(x) for x in xs]) all_keywords_list = [ keywords_from_content, keywords_from_title, keywords_from_keywords_col, keywords_from_meta_keywords, keywords_from_meta_description, keywords_from_tags, keywords_from_summary ] all_keywords_rdd = sc.union(all_keywords_list) all_keywords_rdd = all_keywords_rdd\ .filter(lambda row: len(row[0]) > 2)\ .reduceByKey(concat) all_keywords_df = all_keywords_rdd.toDF(["Keyword", "RowId & Score"]) all_keywords_df.write.csv(outputfolderpath, header=True, quote='"', escape='"') sc.stop() rake = Rake() inputfolderpath = sys.argv[1] outputfolderpath = sys.argv[2] jobname = sys.argv[3] main(inputfolderpath, outputfolderpath, jobname)
df = pd.read_csv('https://query.data.world/s/uikepcpffyo2nhig52xxeevdialfl7')

df = df[['Title', 'Genre', 'Director', 'Actors', 'Plot']]

# cleaning

# A single Rake instance is reused for every plot. By default it uses the
# English stopwords from NLTK and discards all punctuation characters.
r = Rake()


def _plot_key_words(plot):
    """Return the words RAKE keeps from one plot description."""
    r.extract_keywords_from_text(plot)
    # get_word_degrees() maps each key word to its score; only the words
    # themselves are kept.
    return list(r.get_word_degrees().keys())


# Build the column in one assignment: writing to the ``row`` yielded by
# ``iterrows()`` is not guaranteed to reach the DataFrame, because the row
# may be a copy.
df['Key_words'] = df['Plot'].apply(_plot_key_words)

# dropping the Plot column
df.drop(columns=['Plot'], inplace=True)

# Next step: reduce every row to single, lower-case words to avoid
# duplicates. The title column is left as-is because it is the target
# variable of the system.
def SA():
    """Train a Naive Bayes classifier and classify one line of user input.

    Steps, each echoed to stdout:

    1. train on ``datasets/trainingData.csv`` and show the 15 most
       informative features;
    2. print the accuracy on ``datasets/testingData.csv``;
    3. read a line from the user, strip everything but letters and spaces,
       and drop English stop words;
    4. extract RAKE phrases from what is left and classify them, joined
       with spaces;
    5. print 1 when the classification is ``"pos"``, otherwise 0.
    """
    extractor = Rake()

    # NB classifier trains using the read in data
    with open("datasets/trainingData.csv", 'r') as trainingdata:
        classifier = NaiveBayesClassifier(trainingdata, format="csv")
        print("Training Data")
        classifier.show_informative_features(15)

    # Testing accuracy is informational only; not needed for the final product
    with open("datasets/testingData.csv", 'r') as testingdata:
        print("Testing data accuracy", classifier.accuracy(testingdata))

    raw_input_text = input("Please provide a test input: ")

    # Removes all non letter characters
    letters_only = re.sub('[^a-zA-Z ]', '', raw_input_text)
    print("Punctuation removed: ", letters_only)

    english_stop_words = set(stopwords.words('english'))
    kept_words = [
        token
        for token in word_tokenize(letters_only)
        if token not in english_stop_words
    ]
    print("Stopwords removed: ", kept_words)

    # Keywords are extracted from the stop-word-free sentence
    extractor.extract_keywords_from_text(' '.join(str(word) for word in kept_words))
    ranked = extractor.get_ranked_phrases()
    print("Keywords extracted: ", ranked)

    # The joined keywords are run through the trained NB classifier
    blob = TextBlob(' '.join(str(item) for item in ranked), classifier=classifier)
    print("String followed by classification: ", blob, blob.classify())

    is_positive = 1 if blob.classify() == ("pos") else 0
    print(is_positive)
def parser(text: str):
    """Rank the key phrases of ``text`` and return the extractor's ``rank_list``."""
    extractor = Rake()
    extractor.extract_keywords_from_text(text)
    # Return value unused; the ranking is read from the attribute below.
    extractor.get_ranked_phrases_with_scores()
    ranking = extractor.rank_list
    return ranking
ES = ES[['Titulo', 'autores', 'materia']]


# In[216]:


ES['materia'] = ES['materia'].astype(str)


# In[217]:


# A single Rake instance is reused for every row. It is built with
# language="spanish" so that the Spanish stopword list is applied.
r = Rake(language="spanish")


def _palabras_clave(materia):
    """Return the words RAKE keeps from one subject string."""
    r.extract_keywords_from_text(materia)
    return list(r.get_word_degrees().keys())


# Build the column in one assignment: writing to the ``row`` yielded by
# ``iterrows()`` is not guaranteed to reach the DataFrame, because the row
# may be a copy.
ES['palabras_clave'] = ES['materia'].apply(_palabras_clave)


# In[218]:


ES = ES.drop("materia", axis=1)


# In[219]:


ES.set_index('Titulo', inplace=True)
error_bad_lines=False) if __name__ == '__main__': #read data user_data = dataset.iloc[:, 0] review_data = dataset.iloc[:, 1] #new list to remove stopwords review = [] for data in review_data: review.append(data) #extracting keywords keywords = [] #creating object for the class rake = Rake() for data in review: extracted_keywords = rake.extract_keywords_from_text(data) ranked_phrase_keywords = rake.get_ranked_phrases() keywords.append(ranked_phrase_keywords) #print(keywords) sentiment_result = [] st = SentimentIntensityAnalyzer() #joininig the keywords separated by commas for stmt in keywords: words = " ".join(str(e) for e in stmt) sentiment_result.append(words) print(sentiment_result)
def __init__(self, text):
    """Store ``text`` and create the ``Rake`` instance kept on ``self.rake``."""
    extractor = Rake()
    self.text = text
    self.rake = extractor
def main():
    """Run the UI event loop until the window reports a ``None`` event.

    * ``PrimaryUI.SUBMIT``: the input is stripped and reduced to letters,
      digits, spaces and ``. ? ! " '``. Empty input triggers a warning
      dialog. Otherwise, depending on the selected extraction type, the
      output shows either ``keyword : count`` lines (most frequent first,
      skipping blanks and anything whose upper-cased form is in the
      module-level ``words``) or the ranked RAKE phrases, one per line.
    * ``PrimaryUI.CLEAR``: the input box is cleared.
    """
    primary_ui = PrimaryUI()
    rake = Rake()

    while True:
        event, value = primary_ui.Read()
        if event is None:
            break

        if event != PrimaryUI.SUBMIT:
            if event == PrimaryUI.CLEAR:
                primary_ui.clear_input_text()
            continue

        extraction_type = value[TYPE_SELECTION]
        input_text = re.sub(r'[^A-Za-z0-9\.?!"\' ]', '',
                            value[INPUT_TEXT].strip())

        if not input_text:
            primary_ui.display_warning_dialog(
                "No input text was provided. Please provide Input.")
            continue

        if extraction_type == PrimaryUI.KEYWORD_COUNT:
            counts = {}
            for line in input_text.splitlines():
                for keyword in line.split(' '):
                    if keyword.upper() in words or not keyword.strip():
                        continue
                    counts[keyword] = counts.get(keyword, 0) + 1

            # Most frequent first; ties keep first-seen order.
            by_count = sorted(counts, key=counts.get, reverse=True)
            primary_ui.set_output_text('\n'.join(
                f'{keyword} : {counts[keyword]}' for keyword in by_count))
        elif extraction_type == PrimaryUI.RANKED_PHRASES:
            rake.extract_keywords_from_text(input_text)
            primary_ui.set_output_text('\n'.join(
                rake.get_ranked_phrases()))
def postdata():
    """Return movie recommendations for the titles in the request's JSON body.

    The movie dataset is downloaded, a bag of words is built for every
    movie from its genre, director, actors and the RAKE key words of its
    plot, and movies are compared by cosine similarity of their word
    counts. For every value in the posted JSON object (each expected to be
    a title present in the dataset) the ten most similar titles are
    appended to the result.

    :return: JSON-encoded list of recommended titles, ten per posted title,
        in the order the titles were posted.
    :raises IndexError: if a posted title is not in the dataset.
    """
    data = request.get_json()
    print(data)

    import pandas as pd
    from rake_nltk import Rake
    from sklearn.metrics.pairwise import cosine_similarity
    from sklearn.feature_extraction.text import CountVectorizer

    pd.set_option('display.max_columns', 100)
    df = pd.read_csv(
        'https://query.data.world/s/uikepcpffyo2nhig52xxeevdialfl7')
    df = df[['Title', 'Genre', 'Director', 'Actors', 'Plot']]

    rake = Rake()

    def plot_key_words(plot):
        """Return the words RAKE keeps from one plot description."""
        rake.extract_keywords_from_text(plot)
        return list(rake.get_word_degrees().keys())

    # Columns are built with apply(): writing to the ``row`` yielded by
    # ``iterrows()`` is not guaranteed to reach the DataFrame.
    df['Key_words'] = df['Plot'].apply(plot_key_words)
    df.drop(columns=['Plot'], inplace=True)
    df.set_index('Title', inplace=True)

    def bag_of_words(row):
        """Join one movie's fields into a single space-separated string."""
        parts = []
        for value in row:
            # Strings are used whole; joining a string with ' ' would split
            # it into single characters, which CountVectorizer's default
            # tokenizer discards.
            parts.append(value if isinstance(value, str) else ' '.join(value))
        return ' '.join(parts)

    bags = df.apply(bag_of_words, axis=1)

    count_matrix = CountVectorizer().fit_transform(bags)
    all_titles = list(df.index)
    indices = pd.Series(df.index)
    cosine_sim = cosine_similarity(count_matrix, count_matrix)

    recommended_movies = []

    def recommendations(title, cosine_sim=cosine_sim):
        """Append the ten titles most similar to ``title`` to the result."""
        print("You are in the recommendations section")
        idx = indices[indices == title].index[0]
        score_series = pd.Series(cosine_sim[idx]).sort_values(ascending=False)
        # Position 0 is the movie itself, so take the next ten.
        top_10_indexes = list(score_series.iloc[1:11].index)
        recommended_movies.extend(all_titles[i] for i in top_10_indexes)
        return recommended_movies

    for value in data.values():
        recommendations(value)
    return json.dumps(recommended_movies)
#!/usr/bin/python3 # coding: utf-8 # pip install rake-nltk from rake_nltk import Rake from nltk import tokenize r = Rake() # Uses stopwords for english from NLTK, and all puntuation characters by default ################################################################## ## Extraction given the text. mytext = '''Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types of systems and systems of mixed types.''' r.extract_keywords_from_text(mytext) print(r.get_ranked_phrases()) # To get keyword phrases ranked highest to lowest. # ['linear diophantine equations', 'minimal generating sets', 'minimal supporting set', 'minimal set', 'upper bounds', 'strict inequations', 'nonstrict inequations', 'mixed types', 'corresponding algorithms', 'considered types', 'types', 'considered', 'algorithms', 'used', 'systems', 'system', 'solving', 'solutions', 'given', 'criteria', 'construction', 'constructing', 'components', 'compatibility'] print(r.get_ranked_phrases_with_scores()) # To get keyword phrases ranked highest to lowest with scores. 
# [(9.0, 'linear diophantine equations'), (8.666666666666666, 'minimal generating sets'), (8.166666666666666, 'minimal supporting set'), (5.166666666666666, 'minimal set'), (4.0, 'upper bounds'), (4.0, 'strict inequations'), (4.0, 'nonstrict inequations'), (3.666666666666667, 'mixed types'), (3.5, 'corresponding algorithms'), (3.166666666666667, 'considered types'), (1.6666666666666667, 'types'), (1.5, 'considered'), (1.5, 'algorithms'), (1.0, 'used'), (1.0, 'systems'), (1.0, 'system'), (1.0, 'solving'), (1.0, 'solutions'), (1.0, 'given'), (1.0, 'criteria'), (1.0, 'construction'), (1.0, 'constructing'), (1.0, 'components'), (1.0, 'compatibility')] ################################################################## ## Extraction given the list of strings where each string is a sentence. r.extract_keywords_from_sentences(tokenize.sent_tokenize(mytext)) print(r.get_ranked_phrases()) # ['linear diophantine equations', 'minimal generating sets', 'minimal supporting set', 'minimal set', 'upper bounds', 'strict inequations', 'nonstrict inequations', 'mixed types', 'corresponding algorithms', 'considered types', 'types', 'considered', 'algorithms', 'used', 'systems', 'system', 'solving', 'solutions', 'given', 'criteria', 'construction', 'constructing', 'components', 'compatibility'] print(r.get_ranked_phrases_with_scores()) # [(9.0, 'linear diophantine equations'), (8.666666666666666, 'minimal generating sets'), (8.166666666666666, 'minimal supporting set'), (5.166666666666666, 'minimal set'), (4.0, 'upper bounds'), (4.0, 'strict inequations'), (4.0, 'nonstrict inequations'), (3.666666666666667, 'mixed types'), (3.5, 'corresponding algorithms'), (3.166666666666667, 'considered types'), (1.6666666666666667, 'types'), (1.5, 'considered'), (1.5, 'algorithms'), (1.0, 'used'), (1.0, 'systems'), (1.0, 'system'), (1.0, 'solving'), (1.0, 'solutions'), (1.0, 'given'), (1.0, 'criteria'), (1.0, 'construction'), (1.0, 'constructing'), (1.0, 'components'), (1.0, 'compatibility')]