def get_data():
    """Load the "::"-delimited work-experience ratings, shuffle, split 90/10.

    Returns:
        (df_train, df_test): randomly shuffled train (90%) and test (10%)
        DataFrames; the test frame's index is reset to start at 0.
    """
    frame = readers.read_file('./data/work_experiences.dat', sep="::")
    total = len(frame)
    # Shuffle by integer position so the split is random, not file-ordered.
    frame = frame.iloc[np.random.permutation(total)].reset_index(drop=True)
    cut = int(total * 0.9)
    df_train = frame[:cut]
    df_test = frame[cut:].reset_index(drop=True)
    return df_train, df_test
def get_data():
    """Prepare training and testing data from the MovieLens ratings file.

    Fix: the ratings file is "::"-delimited (rows look like
    "3::1196::4::978297539"), but this call omitted sep="::" while every
    other readers.read_file call on this file in the module passes it.
    Pass it explicitly so the file is parsed consistently.

    Returns:
        (train_df, test_df): randomly shuffled 90% / 10% split; the test
        frame's index is reset to start at 0.
    """
    df = readers.read_file('../trainer/ml-1m/ratings.dat', sep="::")
    rows = len(df)
    # Shuffle by integer position so the split is random, not file-ordered.
    df = df.iloc[np.random.permutation(rows)].reset_index(drop=True)
    split_index = int(rows * 0.9)
    train_df = df[0:split_index]
    test_df = df[split_index:].reset_index(drop=True)
    return train_df, test_df
def create_df(ratings_df=None):
    """Create a fully-filled user-item rating table using the trained SVD model.

    Fix: the original default argument ``ratings_df=readers.read_file(...)``
    was evaluated ONCE at function-definition time, so the data file was read
    at import even when the caller supplied its own frame, and a stale frame
    was reused across calls.  The default is now ``None`` and the file is read
    lazily inside the body — same behavior for callers, no import-time I/O.

    INPUTS:
        ratings_df: rating dataframe holding each user's ratings for the
            movies they rated (columns include 'user', 'item', 'rate', 'st');
            if None, read from FLAGS.data_file with the "::" delimiter.
    OUTPUT:
        Filled rating dataframe (rows = users, columns = items) where every
        missing user/item rating has been predicted by the restored SVD model;
        also pickled to user_item_table_train.pkl.
    """
    if ratings_df is None:
        ratings_df = readers.read_file(FLAGS.data_file, sep="::")
    # Reuse the cached sparse item-by-user pivot if a previous run built it.
    if os.path.isfile("./user_item_table.pkl"):
        df = pd.read_pickle("user_item_table.pkl")
    else:
        df = ratings_df.pivot(index='item', columns='user', values='rate').fillna(0)
        df.to_pickle("user_item_table.pkl")
    # Collect every (user, movie) pair the user has NOT rated; these are the
    # cells the model must predict.
    users = []
    items = []
    start = time.time()
    print("Start creating user-item dense table")
    total_movies = list(ratings_df.item.unique())
    for index in df.columns.tolist():
        rated_movie = list(
            ratings_df[ratings_df['user'] == index].drop(['st', 'user'], axis=1)['item'].values)
        unseen_movies = list(set(total_movies) - set(rated_movie))
        for movie in unseen_movies:
            users.append(index)
            items.append(movie)
    end = time.time()
    print(("Found in %.2f seconds" % (end - start)))
    # The pivot table is no longer needed; free it before running inference.
    del df
    rated_list = []
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    with tf.Session(config=session_conf) as sess:
        print("prediction started ...")
        # Restore the trained SVD graph from the latest checkpoint.
        new_saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_prefix))
        new_saver.restore(sess, tf.train.latest_checkpoint(FLAGS.checkpoint_dir))
        # `infer`, `user_batch`, `item_batch` are tensors defined with the
        # restored graph elsewhere in this module.
        rated_list = sess.run(infer, feed_dict={user_batch: users, item_batch: items})
        rated_list = clip(rated_list)
        print("Done !!!")
    # Merge the observed ratings with the predicted ones and pivot to a dense
    # user-by-item table; anything still missing becomes 0.
    df_dict = {'user': users, 'item': items, 'rate': rated_list}
    df = ratings_df.drop(['st'], axis=1).append(pd.DataFrame(df_dict)).pivot(
        index='user', columns='item', values='rate').fillna(0)
    df.to_pickle("user_item_table_train.pkl")
    return df
def get_data():
    """Read the "::"-delimited ratings file and split it into train/test.

    Columns are user ID, item ID, rating, and timestamp; a sample row is
    ``3::1196::4::978297539``.

    Returns:
        (df_train, df_test): randomly shuffled 90% train / 10% test split;
        the test frame's index is reset to start at 0.
    """
    ratings = readers.read_file("./ml-1m/ratings.dat", sep="::")
    n_rows = len(ratings)
    # iloc is purely positional; indexing by a random permutation shuffles.
    ratings = ratings.iloc[np.random.permutation(n_rows)].reset_index(drop=True)
    # 90% of the shuffled rows go to training, the remainder to testing.
    boundary = int(n_rows * 0.9)
    df_train = ratings[:boundary]
    df_test = ratings[boundary:].reset_index(drop=True)
    return df_train, df_test
def get_data():
    """Read the "::"-delimited ratings file, split 90/10, and cache the pivot.

    Download MovieLens data from:
    http://files.grouplens.org/datasets/movielens/ml-1m.zip
    Columns are user ID, item ID, rating, and timestamp; a sample row is
    ``3::1196::4::978297539``.

    Side effect: pickles the dense item-by-user rating table (missing
    ratings filled with 0) to user_item_table.pkl.

    Returns:
        (df_train, df_test, n_rows, n_cols) where n_rows/n_cols are the
        pivot table's dimensions (items x users).
    """
    print("Inside get data ...")
    ratings = readers.read_file(FLAGS.data_file, sep="::")
    n_rows = len(ratings)
    # iloc is purely positional; indexing by a random permutation shuffles.
    ratings = ratings.iloc[np.random.permutation(n_rows)].reset_index(drop=True)
    boundary = int(n_rows * 0.9)
    df_train = ratings[:boundary]
    df_test = ratings[boundary:].reset_index(drop=True)
    # Dense item-by-user table, cached for the later SVD / clustering stages.
    table = ratings.pivot(index='item', columns='user', values='rate').fillna(0)
    table.to_pickle("user_item_table.pkl")
    print("Done !!!")
    return df_train, df_test, table.shape[0], table.shape[1]
def get_data():
    """Read the "::"-delimited ratings file and split it 75/25 train/test.

    Download MovieLens data from:
    http://files.grouplens.org/datasets/movielens/ml-1m.zip
    Columns are user ID, item ID, rating, and timestamp; a sample row is
    ``3::1196::4::978297539``.

    Returns:
        (df_train, df_test, max_user_id, max_item_id): the last two are the
        largest 'user' and 'item' values seen, over the whole file.
    """
    print("Inside get data ...")
    ratings = readers.read_file(FLAGS.data_file, sep="::")
    row_count = len(ratings)
    # iloc is purely positional; indexing by a random permutation shuffles.
    ratings = ratings.iloc[np.random.permutation(row_count)].reset_index(drop=True)
    # 75% of the shuffled rows for training, the remaining 25% for testing.
    cut = int(row_count * 0.75)
    df_train = ratings[:cut]
    df_test = ratings[cut:].reset_index(drop=True)
    print("Done !!!")
    print(ratings.shape)
    return df_train, df_test, ratings['user'].max(), ratings['item'].max()
#checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir) session_conf = tf.ConfigProto( allow_soft_placement=FLAGS.allow_soft_placement, log_device_placement=FLAGS.log_device_placement) with tf.Session(config=session_conf) as sess: #sess.run(init_op) print("prediction started ...") new_saver = tf.train.import_meta_graph( "{}.meta".format(checkpoint_prefix)) new_saver.restore(sess, tf.train.latest_checkpoint(FLAGS.checkpoint_dir)) test_err2 = np.array([]) rated_list = sess.run(infer, feed_dict={ user_batch: users, item_batch: items }) rated_list = clip(rated_list) print("Done !!!") sess.close() df_dict = {'user': users, 'item': items, 'rate': rated_list} df = ratings_df.drop(['st'], axis=1).append(pd.DataFrame(df_dict)).pivot( index='user', columns='item', values='rate').fillna(0) df.to_pickle("user_item_table_train.pkl") return df create_df(ratings_df=readers.read_file(FLAGS.data_file, sep="::"))
# Notebook-style driver script: clusters movies with k-means and probes
# user/item similarity.  Bare expressions (e.g. cluster_df.head(10)) only
# display output in an interactive session; they are no-ops as a script.
import tensorflow as tf
import pandas as pd
import readers
import main
import kmean as km

# Dense user-by-item rating table produced earlier by the SVD inference step.
df = pd.read_pickle("user_item_table_train.pkl")
# Raw "::"-delimited ratings file (user, item, rating, timestamp).
ratings_df = readers.read_file("Input/ratings.dat", sep="::")
# Cluster the movies; TRAINED=False presumably means fit from scratch rather
# than reuse a saved model — TODO confirm against kmean.k_mean_clustering.
clusters, movies = km.k_mean_clustering(ratings_df=ratings_df, TRAINED=False)
# One row per movie with its assigned cluster id.
cluster_df = pd.DataFrame({'movies': movies, 'clusters': clusters})
cluster_df.head(10)
# Top 10 items most similar to item 9.
main.top_k_similar_items(9, ratings_df=ratings_df, k=10, TRAINED=False)
# Look up the cluster assignment of individual movie ids.
cluster_df[cluster_df['movies'] == 1721]
cluster_df[cluster_df['movies'] == 1369]
cluster_df[cluster_df['movies'] == 164]
cluster_df[cluster_df['movies'] == 3081]
cluster_df[cluster_df['movies'] == 732]
cluster_df[cluster_df['movies'] == 348]
cluster_df[cluster_df['movies'] == 647]
# Pearson Correlation between User-User. When you run this User Similarity function, on first run it will take time to give output but after that it's response is in real-time.
main.user_similarity(1, 345, ratings_df)  # Similarity between two users
#Rating of User - Aspected rating for a user
ratings_df.head()
main.user_rating(0, 1192)
changed = True iters = 0 while changed and iters < MAX_ITERS: iters += 1 [changed, _] = sess.run([did_assignments_change, do_updates]) [centers, assignments] = sess.run([centroids, cluster_assignments]) end = time.time() print (("Found in %.2f seconds" % (end-start)), iters, "iterations") cluster_df=pd.DataFrame({'movies':df.index.values,'clusters':assignments}) cluster_df.to_csv("clusters.csv",index=True) return assignments,df.index.values # Read the main file i.e. ratings.dat ratings_df = readers.read_file(data_file, sep="::") clusters,movies = k_mean_clustering(ratings_df,K=K,MAX_ITERS = MAX_ITERS,TRAINED=TRAINED) user_item=pd.read_pickle("user_item_table.pkl") cluster=pd.read_csv("clusters.csv", index_col=False) user_item=user_item.T pcs = PCA(n_components=2, svd_solver='full') cluster['x']=pcs.fit_transform(user_item)[:,0] cluster['y']=pcs.fit_transform(user_item)[:,1] fig = plt.figure() ax = plt.subplot(111)