for line in docs: if re.search(term, line, flags=re.I): list_fact.append(line) return list_fact def sent_to_words(sent): # splits sentence to words, filtering out non-alphabetical terms words = nltk.word_tokenize(sent) words_filtered = filter(lambda x: x.isalpha(), words) return words_filtered movie_name = [] movie_dict = {} for name, vote in zip(read_predata()['movie'], read_predata()['votes']): movie_name.append(name) movie_dict[name] = vote index_orig_forms = build_inverted_index_orig_forms(movie_name) vocabulary = Counter() for name in movie_name: for word in sent_to_words(name.lower()): vocabulary[word] += 1 WORDS = vocabulary # Norvig's spellchecker def P(word, N=sum(WORDS.values())): """Probability of `word`."""
def title_handler(query):
    """Handle an inline-button callback whose payload is a movie title.

    ``query.data`` is either a plain movie title or a title prefixed with
    ``'*'``.  A prefixed title edits the message into the list of movies
    recommended for that title (one button per movie plus ``<<Try Again>>``);
    a plain title edits the message into a detail card for that movie with a
    ``<<Back>>`` button.

    Args:
        query: callback query object; ``data``, ``message.chat.id`` and
            ``message.message_id`` are read from it.
    """
    data = query.data
    show_list = data.startswith('*')
    # Drop only the leading mode marker.  Removing every '*' would mangle
    # titles that themselves contain an asterisk and break the title lookup.
    movie_name = data[1:] if show_list else data

    result = recommendations(movie_name)
    # The first entry is used as the row position of the selected movie
    # (presumably the movie itself ranks first - confirm against
    # `recommendations`); the remaining entries are the suggestions.
    one = result.pop(0)
    # NOTE(review): `save_one` is a module-level list that this function only
    # appends to; unless it is reset elsewhere, `save_one[0]` below always
    # refers to the first movie ever recorded - confirm.
    save_one.append(one)

    # Load the dataset once instead of once per field / per button.
    movies = read_predata()
    markups = telebot.types.InlineKeyboardMarkup()

    # NOTE(review): titles and descriptions are interpolated into Markdown
    # without escaping, and titles are used verbatim as callback_data (the
    # Telegram Bot API limits callback_data to 64 bytes) - verify the data.
    if show_list:
        titles = movies['movie'].values
        years = movies['year'].values
        for idx in result:
            markups.add(
                telebot.types.InlineKeyboardButton(
                    text='{} ({})'.format(titles[idx], years[idx]),
                    callback_data=str(titles[idx])))
        markups.add(
            telebot.types.InlineKeyboardButton(
                text="<<Try Again>>", callback_data="back"))
        reply = "Top 5 recommended movies to *{}* are:".format(movie_name)
    else:
        stars = ', '.join(movies['stars'].values[one])
        director = movies['director'].values[one].replace("'", "")
        genre = '[' + ', '.join(movies['genre'].values[one]) + ']'
        reply = (
            "*{} ({})*\n"
            "{} | {}\n"
            "IMDB Rating: *{}*\n\n"
            "*Director*: {}\n"
            "*Stars*: {}\n"
            "*Genre*: {}\n\n"
            "*Description*: _{}_\n"
            "[Movie Poster]({})"
        ).format(
            movie_name, movies['year'].values[one],
            movies['certificate'].values[one], movies['duration'].values[one],
            movies['imdb'].values[one],
            director,
            stars.replace("'", ""),
            genre,
            movies['description'].values[one],
            movies['movie_img'].values[one])
        # The '*' prefix makes the next callback re-open the recommendation
        # list for the movie stored first in `save_one`.
        markups.add(
            telebot.types.InlineKeyboardButton(
                text="<<Back>>",
                callback_data=str('*' + movies['movie'].values[save_one[0]])))

    bot.edit_message_text(reply, query.message.chat.id,
                          query.message.message_id,
                          parse_mode="Markdown", reply_markup=markups)
def get_title_from_index(index):
    """Return the ``movie`` value of the first row whose ``'Unnamed: 0'`` equals *index*.

    Args:
        index: value compared against the ``'Unnamed: 0'`` column.

    Returns:
        The ``movie`` column entry of the first matching row.

    Raises:
        IndexError: if no row has that ``'Unnamed: 0'`` value.
    """
    # Load the dataset once; the mask and the lookup share the same frame.
    movies = read_predata()
    matches = movies[movies['Unnamed: 0'] == index]["movie"].values
    if len(matches) == 0:
        # Same exception type the bare ``.values[0]`` would raise, but with a
        # message that names the failed lookup.
        raise IndexError("no movie with index {!r}".format(index))
    return matches[0]
def get_index_from_title(title):
    """Return the ``'Unnamed: 0'`` value of the first row whose ``movie`` equals *title*.

    Args:
        title: value compared (exact equality) against the ``movie`` column.

    Returns:
        The ``'Unnamed: 0'`` column entry of the first matching row.

    Raises:
        IndexError: if no row has that title.
    """
    # Load the dataset once; the mask and the lookup share the same frame.
    movies = read_predata()
    matches = movies[movies["movie"] == title]["Unnamed: 0"].values
    if len(matches) == 0:
        # Same exception type the bare ``.values[0]`` would raise, but with a
        # message that names the failed lookup.
        raise IndexError("no movie titled {!r}".format(title))
    return matches[0]
# Library from preprocess_data import read_predata from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity, linear_kernel # count = CountVectorizer(analyzer='word', stop_words='english') score = TfidfVectorizer(analyzer='word', stop_words='english') # movies_matrix = count.fit_transform(read_predata()['list_bag']) movies_matrix = score.fit_transform(read_predata()['list_bag']) # cosine_sim = cosine_similarity(movies_matrix) cosine_sim = linear_kernel(movies_matrix, movies_matrix) def get_title_from_index(index): return read_predata()[read_predata()['Unnamed: 0'] == index]["movie"].values[0] def get_index_from_title(title): return read_predata()[read_predata()["movie"] == title]["Unnamed: 0"].values[0] def recommendations(movie_user_likes): movie_index = get_index_from_title(movie_user_likes) similar_movies = list(enumerate(cosine_sim[movie_index])) sorted_similar_movies = sorted(similar_movies, key=lambda x: x[1], reverse=True)[0:6]