Python read_predata示例，preprocess_data.read_predata Python示例

示例#1

0

显示文件

        for line in docs:
            if re.search(term, line, flags=re.I):
                list_fact.append(line)
    return list_fact


def sent_to_words(sent):
    """Tokenize *sent* with NLTK and keep only purely alphabetic tokens.

    Tokens containing digits or punctuation are dropped. The result is
    whatever ``filter`` returns (a lazy, single-pass iterator on Python 3).
    """
    tokens = nltk.word_tokenize(sent)
    return filter(str.isalpha, tokens)


# Load the preprocessed dataset a single time; both columns walked below come
# from the same object instead of two separate read_predata() calls.
_predata = read_predata()

movie_name = []  # movie titles, in dataset order
movie_dict = {}  # movie title -> its 'votes' value
for name, vote in zip(_predata['movie'], _predata['votes']):
    movie_name.append(name)
    movie_dict[name] = vote
index_orig_forms = build_inverted_index_orig_forms(movie_name)

# Frequency of every alphabetic word across the lower-cased movie titles.
vocabulary = Counter()
for name in movie_name:
    vocabulary.update(sent_to_words(name.lower()))

# Alias under the name used by the spell-checker code that follows.
WORDS = vocabulary


# Norvig's spellchecker
def P(word, N=sum(WORDS.values())):
    """Probability of `word`."""

示例#2

0

显示文件

文件： benmovie.py 项目： zakariabeni/BenMovie

def title_handler(query):
    """Handle an inline-keyboard callback that carries a movie title.

    ``query.data`` holds a movie title, optionally prefixed with ``'*'``.
    With the prefix, the message is edited to show one button per remaining
    recommendation plus a ``<<Try Again>>`` button. Without it, the message
    is edited to show the details of the first recommended entry plus a
    ``<<Back>>`` button.

    Args:
        query: Callback query object. ``data``, ``message.chat.id`` and
            ``message.message_id`` are read from it.
    """
    data = query.data
    movie_name = data.replace('*', '')
    result = recommendations(movie_name)
    # The first recommendation is split off and remembered in the
    # module-level ``save_one`` list; the rest become buttons.
    one = result.pop(0)
    save_one.append(one)

    # Load the dataset once per callback instead of once per field access.
    predata = read_predata()
    titles = predata['movie'].values
    markups = telebot.types.InlineKeyboardMarkup()

    if data.startswith('*'):
        years = predata['year'].values
        for idx in result:
            title = titles[idx]
            markups.add(
                telebot.types.InlineKeyboardButton(
                    text='{} ({})'.format(title, years[idx]),
                    callback_data=str(title)))
        reply = "Top 5 recommended movies to *{}* are:".format(movie_name)
        markups.add(
            telebot.types.InlineKeyboardButton(text="<<Try Again>>",
                                               callback_data="back"))
    else:
        stars = ', '.join(predata['stars'].values[one])
        genres = ', '.join(predata['genre'].values[one])
        # Markdown detail card; quotes are stripped from director and stars.
        reply = (
            "*{} ({})*\n"
            "{} | {}\n"
            "IMDB Rating: *{}*\n\n"
            "*Director*: {}\n"
            "*Stars*: {}\n"
            "*Genre*: [{}]\n\n"
            "*Description*: _{}_\n"
            "[Movie Poster]({})"
        ).format(
            movie_name,
            predata['year'].values[one],
            predata['certificate'].values[one],
            predata['duration'].values[one],
            predata['imdb'].values[one],
            predata['director'].values[one].replace("'", ""),
            stars.replace("'", ""),
            genres,
            predata['description'].values[one],
            predata['movie_img'].values[one],
        )
        # NOTE(review): the Back button always targets save_one[0], i.e. the
        # oldest remembered entry, not the one appended above -- confirm that
        # save_one is reset elsewhere between searches.
        markups.add(
            telebot.types.InlineKeyboardButton(
                text="<<Back>>",
                callback_data=str('*' + titles[save_one[0]])))

    bot.edit_message_text(reply,
                          query.message.chat.id,
                          query.message.message_id,
                          parse_mode="Markdown",
                          reply_markup=markups)

示例#3

0

显示文件

文件： cosine_similarity.py 项目： zakariabeni/BenMovie

def get_title_from_index(index):
    """Return the ``'movie'`` value of the row whose ``'Unnamed: 0'`` equals *index*.

    The dataset is loaded once and reused for both the row mask and the
    lookup. Only the first matching row is used.

    Raises:
        IndexError: If no row matches *index*.
    """
    predata = read_predata()
    matches = predata[predata['Unnamed: 0'] == index]
    return matches["movie"].values[0]

示例#4

0

显示文件

文件： cosine_similarity.py 项目： zakariabeni/BenMovie

def get_index_from_title(title):
    """Return the ``'Unnamed: 0'`` value of the row whose ``'movie'`` equals *title*.

    The dataset is loaded once and reused for both the row mask and the
    lookup. Only the first matching row is used.

    Raises:
        IndexError: If no row matches *title*.
    """
    predata = read_predata()
    matches = predata[predata["movie"] == title]
    return matches["Unnamed: 0"].values[0]

示例#5

0

显示文件

文件： cosine_similarity.py 项目： zakariabeni/BenMovie

# Library
from preprocess_data import read_predata
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel

# Vectorize the 'list_bag' column with TF-IDF weights over word tokens,
# dropping English stop words. (A CountVectorizer with the same settings is
# the raw-count alternative.)
score = TfidfVectorizer(stop_words='english', analyzer='word')
_list_bag = read_predata()['list_bag']
movies_matrix = score.fit_transform(_list_bag)

# Pairwise similarity of every row of the matrix against every other row,
# computed with a linear kernel. (cosine_similarity(movies_matrix) is the
# alternative; presumably equivalent here -- verify.)
cosine_sim = linear_kernel(movies_matrix, movies_matrix)


def get_title_from_index(index):
    """Return the ``'movie'`` value of the row whose ``'Unnamed: 0'`` equals *index*.

    The dataset is loaded once and reused for both the row mask and the
    lookup. Only the first matching row is used.

    Raises:
        IndexError: If no row matches *index*.
    """
    predata = read_predata()
    matches = predata[predata['Unnamed: 0'] == index]
    return matches["movie"].values[0]


def get_index_from_title(title):
    """Return the ``'Unnamed: 0'`` value of the row whose ``'movie'`` equals *title*.

    The dataset is loaded once and reused for both the row mask and the
    lookup. Only the first matching row is used.

    Raises:
        IndexError: If no row matches *title*.
    """
    predata = read_predata()
    matches = predata[predata["movie"] == title]
    return matches["Unnamed: 0"].values[0]


def recommendations(movie_user_likes):
    movie_index = get_index_from_title(movie_user_likes)
    similar_movies = list(enumerate(cosine_sim[movie_index]))

    sorted_similar_movies = sorted(similar_movies,
                                   key=lambda x: x[1],
                                   reverse=True)[0:6]