def calculate_top_5(app, user_download_history):
    """Store the (up to) five apps most similar to ``app``.

    For every download history, the similarity between ``[app]`` and that
    history (``Helper.cosine_similarity``) is added to the running score of
    each app found in the history.  The highest-scoring app ids, excluding
    ``app`` itself, are written to the app's record as ``top_5_app``.

    Args:
        app: id of the app to find related apps for.
        user_download_history: iterable of download histories, each an
            iterable of app ids.

    Returns:
        None.  Nothing is stored when ``app`` occurs in no history.
    """
    app_similarity = {}  # {app_id: accumulated similarity}
    for apps in user_download_history:
        similarity = Helper.cosine_similarity([app], apps)
        for other_app in apps:
            app_similarity[other_app] = (
                app_similarity.get(other_app, 0) + similarity)

    # An app that appears in no download history has no related apps.
    if app not in app_similarity:
        return

    # The app must not be recommended as its own neighbour.
    app_similarity.pop(app)
    sorted_tups = sorted(app_similarity.items(),
                         key=operator.itemgetter(1),
                         reverse=True)
    # Slice instead of indexing 0..4 so that fewer than five candidates
    # no longer raise IndexError.
    top_5_app = [other_app for other_app, _ in sorted_tups[:5]]
    print("top_5_app for" + str(app) + ":\t" + str(top_5_app))

    # store the top 5
    DataService.update_app_info({'app_id': app},
                                {'$set': {'top_5_app': top_5_app}})
def main():
    """Load download history and app info, then persist top-5 apps.

    Connects to MongoDB on localhost:27017, hands the client to
    ``DataService`` and runs ``persist_top_5_apps_for_app`` on the loaded
    data.  Any exception is printed instead of propagated, and the client
    is closed on the way out if it was created.
    """
    # Sentinel so the cleanup step can tell whether the connection exists.
    client = None
    try:
        # To log to a file instead of stdout:
        #   f = file('log.txt', 'w')
        #   sys.stdout = f

        # get MongoDB client and set it in DataService
        # (alternative: MongoClient('mongodb://localhost:27017/'))
        client = MongoClient('localhost', 27017)
        DataService.init(client)

        # persist_download_history_and_app_info()
        history = load_user_download_history()
        apps = load_app_info()
        persist_top_5_apps_for_app(history, apps)

        # For testing:
        #   print "app_info: \n " + str(load_top_5_apps('C10107104'))
    except Exception as e:
        print("Exception! Go fix it!")
        print(e)
    finally:
        # clean up work
        if client is not None:
            client.close()
def run(self, app, download_history):
    """Store the (up to) five apps most similar to ``app``.

    Only download histories that contain ``app`` are considered.  For each
    of them the similarity returned by
    ``Helper.cosine_similarity_app_applist`` is added to the score of every
    other app in that history; the best-scoring ids are written to the
    app's record as ``top_5_app``.

    Args:
        app: id of the app to find related apps for.
        download_history: iterable of download histories, each a
            collection of app ids.

    Returns:
        None.  Nothing is stored when no other app shares a history
        with ``app``.
    """
    # NOTE (original author): this needs serious optimization -- the
    # similarity A->B and B->A is always computed twice.
    app_similarity = {}  # {app_id: similarity}
    for apps in download_history:
        if app not in apps:
            continue
        # calculate the similarity
        similarity = Helper.cosine_similarity_app_applist(app, apps)
        for other_app in apps:
            if other_app == app:
                continue
            app_similarity[other_app] = (
                app_similarity.get(other_app, 0) + similarity)

    # ignore apps not in any download history
    if not app_similarity:
        return

    # sort app_similarity dict by value and get the top 5 as recommendation
    sorted_tups = self.__sort_dict_by_value(app_similarity)
    # Slice instead of indexing 0..4 so that fewer than five related apps
    # no longer raise IndexError.
    top_5_app = [entry[0] for entry in sorted_tups[:5]]

    # store the top 5
    DataService.update_app_info({'app_id': app},
                                {'$set': {'top_5_app': top_5_app}})
def calculate_app_top_5(app, all_download_history):
    """Store the (up to) five apps most similar to ``app``.

    Each download history contributes its similarity to ``[app]``
    (``Helper.cosine_similarity``) to the score of every app it contains.
    The highest-scoring app ids other than ``app`` are written to the
    app's record as ``top_5_app``.

    Args:
        app: id of the app to find related apps for.
        all_download_history: iterable of download histories, each an
            iterable of app ids.

    Returns:
        None.  Nothing is stored when ``app`` occurs in no history.
    """
    app_similarity = {}  # {app_id: similarity}
    for apps in all_download_history:
        # similarity of the app and this user's download history
        similarity = Helper.cosine_similarity([app], apps)
        for other_app in apps:
            app_similarity[other_app] = (
                app_similarity.get(other_app, 0) + similarity)

    # there could be an app without related apps (not in any history)
    if app not in app_similarity:
        return

    # drop the app itself, then rank the rest by similarity
    app_similarity.pop(app)
    sorted_tups = sorted(app_similarity.items(),
                         key=operator.itemgetter(1),
                         reverse=True)
    # Slice instead of indexing 0..4 so that fewer than five candidates
    # no longer raise IndexError.
    top_5_app = [other_app for other_app, _ in sorted_tups[:5]]
    # print("top_5_app for " + str(app) + ":\t" + str(top_5_app))
    DataService.update_app_info({'app_id': app},
                                {'$set': {'top_5_app': top_5_app}})
def calculate_top_5(app, user_download_history):
    """Store the (up to) five apps most similar to ``app``.

    Uses the cosine similarity between ``[app]`` and each download
    history: that similarity is accumulated onto every app in the history,
    and the best-scoring ids other than ``app`` are written to the app's
    record as ``top_5_app``.

    Args:
        app: id of the app to find related apps for.
        user_download_history: iterable of download histories, each an
            iterable of app ids.

    Returns:
        None.  Nothing is stored when ``app`` occurs in no history.
    """
    # {app_id: accumulated similarity}
    app_similarity = collections.defaultdict(float)
    for apps in user_download_history:
        # calculate the similarity
        similarity = Helper.cosine_similarity([app], apps)
        # accumulate similarity
        for other_app in apps:
            app_similarity[other_app] += similarity

    # There could be an app without related apps (not in any history).
    # A membership test does not create a defaultdict entry.
    if app not in app_similarity:
        return

    # sort app_similarity by value and take the top 5 as recommendation
    app_similarity.pop(app)
    sorted_tups = sorted(app_similarity.items(),
                         key=operator.itemgetter(1),
                         reverse=True)
    # Slice instead of indexing 0..4 so that fewer than five candidates
    # no longer raise IndexError.
    top_5_app = [other_app for other_app, _ in sorted_tups[:5]]
    # print("top_5_app for " + str(app) + ":\t" + str(top_5_app))

    # store the top 5
    DataService.update_app_info({'app_id': app},
                                {'$set': {'top_5_app': top_5_app}})
def main():
    """Entry point: persist the top-5 related apps for every app.

    Opens a MongoDB client on localhost:27017, registers it with
    ``DataService``, loads the download history and app info and passes
    both to ``persist_top_5_apps_for_app``.  Exceptions are printed, not
    re-raised; the client is closed afterwards when it was created.
    """
    client = None  # stays None if the connection is never established
    try:
        # Optional redirection of stdout to a log file:
        #   f = file('log.txt', 'w')
        #   sys.stdout = f

        # get MongoDB client and set it in DataService
        # (or: MongoClient('mongodb://localhost:27017/'))
        client = MongoClient('localhost', 27017)
        DataService.init(client)

        # persist_download_history_and_app_info()
        download_history = load_user_download_history()
        app_records = load_app_info()
        persist_top_5_apps_for_app(download_history, app_records)

        # print for testing
        # print "app_info: \n " + str(load_top_5_apps('C10107104'))
    except Exception as e:
        print("Exception! Go fix it!")
        print(e)
    finally:
        # clean up work
        if client is not None:
            client.close()
def generate_recommendations_for_one_user(user_download_history,
                                          recommendations, one_user_id):
    """Store a score-ordered recommendation list for one user.

    Every app in the user's download history that has an entry in
    ``recommendations`` contributes its similar apps.  Those are ordered
    by descending similarity score and written to the user's record as
    ``recommended_apps``.

    Args:
        user_download_history: mapping of user id to an iterable of
            app ids.
        recommendations: mapping of app id to a sequence of entries whose
            item 0 is an app id and item 1 its score.
        one_user_id: id of the user to generate recommendations for.

    Returns:
        None.
    """
    scored_apps = []  # (score, app id) pairs in discovery order
    for app in user_download_history[one_user_id]:
        # Test membership on the mapping itself instead of scanning the
        # list returned by recommendations.keys() for every app.
        if app in recommendations:
            for one_sim_app in recommendations[app]:
                scored_apps.append((one_sim_app[1], one_sim_app[0]))

    # list.sort is stable, so equal scores keep their discovery order.
    scored_apps.sort(key=lambda pair: pair[0], reverse=True)
    sorted_list = [app_id for _, app_id in scored_apps]

    DataService.update_user_download_history(
        {'user_id': one_user_id},
        {'$set': {"recommended_apps": sorted_list}})
def main():
    """Compute and store the top-5 related apps for every known app.

    Connects to MongoDB on localhost:27017, initialises ``DataService``
    and calls ``calculate_top_5`` once per app id with all users' download
    histories.  Exceptions are printed; the client is always closed if it
    was created.
    """
    client = None
    try:
        client = MongoClient('localhost', 27017)
        DataService.init(client)

        histories_by_user = DataService.retrieve_user_download_history()
        apps = DataService.retrieve_app_info()
        for app_id in apps.keys():
            calculate_top_5(app_id, histories_by_user.values())
    except Exception as e:
        print(e)
    finally:
        if client is not None:
            client.close()
def run():
    """Build the combined contestant table and export it in three formats.

    Combines the contestants table with a language table, a track
    information table and a country information table, then writes the
    result as pickle, feather and CSV files in the working directory.
    The Musixmatch client is created from the ``API_KEY`` environment
    variable.
    """
    musixmatch_client = Musixmatch(environ["API_KEY"])
    data_service = DataService()
    country_service = CountryService()

    # Language table derived from all lyrics.
    language_table = LanguageDetectionService().get_language_table(
        data_service.all_lyrics)
    language_table.name = "language"

    # Track information labelled through Musixmatch.
    labelled_entries = musixmatch_client.label_entries(
        data_service.all_entries)
    track_table = DataCleanUpService.make_track_information_table(
        labelled_entries)

    # Country information.
    country_list = data_service.all_countries
    country_details = country_service.get_country_info(country_list)
    country_table = DataCleanUpService.make_country_information_table(
        data_service.all_countries, country_details)

    # Combine: left joins, so every contestant row is kept.
    # NOTE(review): join/merge/to_* look like the pandas API -- confirm.
    with_language = data_service.contestants.join(language_table, how="left")
    with_tracks = with_language.join(track_table, how="left")
    df = with_tracks.merge(country_table,
                           how="left",
                           left_on="to_country",
                           right_on="to_country")

    # Output.
    df.to_pickle("contestdata.pickle")
    df.to_feather("contestdata.arrow")
    df.to_csv("contestantdata.csv", quoting=csv.QUOTE_NONNUMERIC)
def main():
    """Run the top-5 calculation for every app stored in the database.

    A MongoDB client for localhost:27017 is handed to ``DataService``;
    each app id from the app info is then passed to ``calculate_top_5``
    with the download histories of all users.  Errors are printed and the
    client is closed at the end if it was opened.
    """
    client = None  # lets the cleanup know whether a connection was made
    try:
        # get MongoDB client and set it in DataService
        client = MongoClient('localhost', 27017)
        DataService.init(client)

        # work flow
        download_history = DataService.retrieve_user_download_history()
        all_apps = DataService.retrieve_app_info()
        for current_app in all_apps.keys():
            calculate_top_5(current_app, download_history.values())
    except Exception as e:
        print(e)
    finally:
        # clean up work
        if client is not None:
            client.close()
def main():
    """Calculate the top-5 related apps for each app in the app info.

    Connects to the local MongoDB instance (port 27017), initialises
    ``DataService`` and invokes ``calculate_top_5`` per app id.  Any
    exception is printed; the client is closed afterwards if created.
    """
    client = None
    try:
        client = MongoClient('localhost', 27017)
        DataService.init(client)

        history_by_user = DataService.retrieve_user_download_history()
        # Single-app debugging run:
        #   calculate_top_5('C10063783', history_by_user.values())
        app_records = DataService.retrieve_app_info()
        for app_id in app_records.keys():
            calculate_top_5(app_id, history_by_user.values())
    except Exception as e:
        print(e)
    finally:
        if client is not None:
            client.close()
def load_app_info():
    """Return the app info as retrieved by ``DataService``."""
    # Debug aid: print "app_info: \n " + str(<result>) + "\n"
    return DataService.retrieve_app_info()
def load_user_download_history():
    """Return the user download history as retrieved by ``DataService``."""
    # Debug aid: print "user_download_history: \n " + str(<result>) + "\n"
    return DataService.retrieve_user_download_history()
def main():
    """Compute and store the top-5 related apps for every app.

    Connects to MongoDB on localhost:27017, initialises ``DataService``
    and calls ``calculate_top_5`` for each app id with all users' download
    histories.  Exceptions are reported on stdout instead of propagated.
    The client is closed on every exit path.
    """
    client = None
    try:
        # get MongoDB client and set it in DataService
        client = MongoClient('localhost', 27017)
        DataService.init(client)

        # work flow
        user_download_history = DataService.retrieve_user_download_history()
        # calculate_top_5('C10107104', user_download_history.values())
        app_info = DataService.retrieve_app_info()
        for app in app_info.keys():
            calculate_top_5(app, user_download_history.values())
    except Exception as e:
        print("Exception detected:")
        print(e)
    finally:
        # Release the connection even when the work flow failed; it was
        # previously left open.
        if client is not None:
            client.close()
def main():
    """Time the top-5 calculation over all apps.

    Connects to MongoDB on localhost:27017, runs ``calculate_top_5`` for
    every app id and prints the elapsed ``time.clock()`` interval when the
    run completes without error.  Exceptions are printed; the client is
    closed afterwards if it was created.
    """
    client = None
    try:
        started_at = time.clock()
        client = MongoClient('localhost', 27017)
        DataService.init(client)

        download_history = DataService.retrieve_user_download_history()
        app_records = DataService.retrieve_app_info()
        for app_id in app_records.keys():
            calculate_top_5(app_id, download_history.values())

        elapsed = time.clock() - started_at
        print("time: " + str(elapsed))
    except Exception as e:
        print(e)
    finally:
        if client is not None:
            client.close()
def main():
    """Persist the top-5 related apps for each app in the database.

    Sets up ``DataService`` with a MongoDB client for localhost:27017 and
    feeds every app id, together with all download histories, to
    ``calculate_top_5``.  Exceptions are printed; the client is closed at
    the end when one was created.
    """
    client = None  # remains None when the connection attempt fails
    try:
        # get MongoDB client and set it in DataService
        client = MongoClient('localhost', 27017)
        DataService.init(client)

        # work flow
        histories = DataService.retrieve_user_download_history()
        catalogue = DataService.retrieve_app_info()
        for app_id in catalogue.keys():
            calculate_top_5(app_id, histories.values())
    except Exception as e:
        print(e)
    finally:
        # clean up
        if client is not None:
            client.close()
def calculate_top_k(app, user_download_history, num):
    """Store the ``num`` best-scoring related apps for ``app``.

    Each download history contributes its similarity to ``[app]``
    (``Helper.cosine_similarity``) to the score of every app it contains.
    The first ``num`` ``(app_id, score)`` pairs, ordered by descending
    score and excluding ``app`` itself, are written to the app's record
    under the key ``"top_<num>_app"``.

    Args:
        app: id of the app to find related apps for.
        user_download_history: iterable of download histories, each an
            iterable of app ids.
        num: maximum number of related apps to store.

    Returns:
        True when a result was stored, False when ``app`` occurs in no
        download history.
    """
    app_similarity = {}  # {app_id: accumulated similarity}
    for apps in user_download_history:
        similarity = Helper.cosine_similarity([app], apps)
        for other_app in apps:
            app_similarity[other_app] = (
                app_similarity.get(other_app, 0) + similarity)

    if app not in app_similarity:
        return False

    # The app must not appear among its own related apps.
    app_similarity.pop(app)
    sorted_tups = sorted(app_similarity.items(),
                         key=operator.itemgetter(1),
                         reverse=True)
    name = "top_" + str(num) + "_app"
    DataService.update_app_info({'app_id': app},
                                {'$set': {name: sorted_tups[:num]}})
    return True
def main():
    """Compute per-app top-5 lists, then per-user recommendations.

    After initialising ``DataService`` with a MongoDB client for
    localhost:27017, ``calculate_top_k`` is run with ``num=5`` for every
    app id.  The stored recommended items are then read back and passed to
    ``generate_recommendations`` together with the download history.
    Exceptions are printed; the client is closed if it was created.
    """
    client = None
    try:
        client = MongoClient('localhost', 27017)
        DataService.init(client)

        history_by_user = DataService.retrieve_user_download_history()
        for app_id in DataService.get_all_app_id():
            calculate_top_k(app_id, history_by_user.values(), 5)

        generate_recommendations(
            history_by_user, DataService.retrieve_recommended_items())
    except Exception as e:
        print(e)
    finally:
        if client is not None:
            client.close()
def calculate_user_top_5(user_id, user_download_history,
                         all_download_history):
    """Store the (up to) five best-scoring new apps for a user.

    Each history in ``all_download_history`` contributes its similarity to
    the user's own history (``Helper.cosine_similarity``) to the score of
    every app it contains.  Apps the user already has are removed, and the
    highest-scoring remaining ids are written to the user's record as
    ``top_5_app``.

    Args:
        user_id: id of the user to recommend apps for.
        user_download_history: iterable of app ids the user downloaded.
        all_download_history: iterable of download histories, each an
            iterable of app ids.

    Returns:
        None.
    """
    app_similarity = {}  # {app_id: accumulated similarity}
    for apps in all_download_history:
        similarity = Helper.cosine_similarity(user_download_history, apps)
        for other_app in apps:
            app_similarity[other_app] = (
                app_similarity.get(other_app, 0) + similarity)

    # Do not recommend what the user already has.  A default is supplied
    # so that a repeated app id, or one absent from every history, does
    # not raise KeyError.
    for app in user_download_history:
        app_similarity.pop(app, None)

    sorted_tups = sorted(app_similarity.items(),
                         key=operator.itemgetter(1),
                         reverse=True)
    # Slice instead of indexing 0..4 so that fewer than five candidates
    # no longer raise IndexError.
    top_5_app = [other_app for other_app, _ in sorted_tups[:5]]
    print("top_5_app for " + str(user_id) + ":\t" + str(top_5_app))
    DataService.update_user_info({'user_id': user_id},
                                 {'$set': {'top_5_app': top_5_app}})
def calculate_top_5(app, user_download_history):
    """Store the (up to) five apps most similar to ``app``.

    The similarity between ``[app]`` and each download history
    (``Helper.cosine_similarity``) is accumulated onto every app in that
    history.  The best-scoring ids other than ``app`` are written to the
    app's record as ``top_5_app``.

    Args:
        app: id of the app to find related apps for.
        user_download_history: iterable of download histories, each an
            iterable of app ids.

    Returns:
        None.  Nothing is stored when ``app`` occurs in no history.
    """
    app_similarity = {}  # {app_id: accumulated similarity}
    for apps in user_download_history:
        similarity = Helper.cosine_similarity([app], apps)
        for other_app in apps:
            app_similarity[other_app] = (
                app_similarity.get(other_app, 0) + similarity)

    # No history contains the app, so there is nothing to relate it to.
    if app not in app_similarity:
        return

    # Exclude the app itself before ranking.
    app_similarity.pop(app)
    sorted_tups = sorted(app_similarity.items(),
                         key=operator.itemgetter(1),
                         reverse=True)
    # Slice instead of indexing 0..4 so that fewer than five candidates
    # no longer raise IndexError.
    top_5_app = [other_app for other_app, _ in sorted_tups[:5]]
    print("top_5_app for" + str(app) + ":\t" + str(top_5_app))

    # store the top 5
    DataService.update_app_info({'app_id': app},
                                {'$set': {'top_5_app': top_5_app}})
def persist_download_history_and_app_info():
    """Reload the sample data into the database and print what was stored.

    Two collections are populated:
      * user_download_history - (user_id, download_history)
      * app_info - (app_id, app_name)

    Old data is removed first via ``DataService.clean_up()``; the new data
    comes from ``data/sample.data``.  Everything after the persist step is
    diagnostic output only.
    """

    def _dump(header, cursor):
        # Print the record count, each record, then a blank line.
        print(header + str(cursor.count()))
        for record in cursor:
            print(" " + str(record))
        print("")

    # clean up to remove all old data, then persist the sample file
    DataService.clean_up()
    DataService.read_then_persist('data/sample.data')

    # dbs and collections only show up if any data was persisted
    print("dbs: " + str(DataService.client.database_names()))
    print("collections: " + str(DataService.db.collection_names()) + "\n")

    _dump("user_download_history: count=",
          DataService.get_cursor_user_download_history())
    _dump("app_info: count=", DataService.get_cursor_app_info())
def persist_download_history_and_app_info():
    """Wipe old data, persist the sample file and print the stored data.

    Data is persisted into 2 tables:
      user_download_history - (user_id, download_history)
      app_info - (app_id, app_name)

    The print statements are for testing only and can be removed without
    affecting what is persisted.
    """
    # Remove all old data before loading the sample file.
    DataService.clean_up()
    DataService.read_then_persist('data/sample.data')

    # Databases and collections are listed only if data was persisted.
    database_names = DataService.client.database_names()
    print("dbs: " + str(database_names))
    database = DataService.db
    collection_names = database.collection_names()
    print("collections: " + str(collection_names) + "\n")

    # Dump user_download_history.
    history_cursor = DataService.get_cursor_user_download_history()
    print("user_download_history: count=" + str(history_cursor.count()))
    for history_record in history_cursor:
        print(" " + str(history_record))
    print("")

    # Dump app_info.
    info_cursor = DataService.get_cursor_app_info()
    print("app_info: count=" + str(info_cursor.count()))
    for info_record in info_cursor:
        print(" " + str(info_record))
    print("")
def run(self, app, download_history):
    """Store the (up to) five apps most similar to ``app``.

    Histories that do not contain ``app`` are skipped.  For the others,
    the similarity from ``Helper.cosine_similarity_app_applist`` is added
    to the score of each other app in the history, and the best-scoring
    ids are written to the app's record as ``top_5_app``.

    Args:
        app: id of the app to find related apps for.
        download_history: iterable of download histories, each a
            collection of app ids.

    Returns:
        None.  Nothing is stored when no other app shares a history
        with ``app``.
    """
    # NOTE (original author): this needs serious optimization -- the
    # similarity A->B and B->A is always computed twice.
    app_similarity = {}  # {app_id: similarity}
    for apps in download_history:
        if app not in apps:
            continue
        # calculate the similarity
        similarity = Helper.cosine_similarity_app_applist(app, apps)
        for other_app in apps:
            if other_app == app:
                continue
            app_similarity[other_app] = (
                app_similarity.get(other_app, 0) + similarity)

    # ignore apps not in any download history
    if not app_similarity:
        return

    # sort app_similarity dict by value and get the top 5 as recommendation
    sorted_tups = self.__sort_dict_by_value(app_similarity)
    # Slice instead of indexing 0..4 so that fewer than five related apps
    # no longer raise IndexError.
    top_5_app = [entry[0] for entry in sorted_tups[:5]]

    # store the top 5
    DataService.update_app_info({'app_id': app},
                                {'$set': {'top_5_app': top_5_app}})
def main():
    """Run the top-5 calculation for all apps and report the elapsed time.

    The clock is started before the MongoDB connection (localhost:27017)
    is opened.  Each app id is passed to ``calculate_top_5`` with all
    download histories.  Exceptions are printed, the client is closed if
    it was created, and the elapsed ``time.clock()`` interval is printed
    last.
    """
    started_at = time.clock()
    client = None
    try:
        client = MongoClient('localhost', 27017)
        DataService.init(client)

        download_history = DataService.retrieve_user_download_history()
        app_records = DataService.retrieve_app_info()
        for app_id in app_records.keys():
            calculate_top_5(app_id, download_history.values())
    except Exception as e:
        print("Exception detected:")
        print(e)
    finally:
        if client is not None:
            client.close()

    elapsed = time.clock() - started_at
    print("time eplapsed = " + str(elapsed))
def calculate_top_5(app, user_download_history):
    """Store the (up to) five apps most similar to ``app``.

    Only download histories that contain ``app`` are considered.  For each
    of them the similarity between ``[app]`` and the history
    (``Helper.cosine_similarity``) is added to the score of every app in
    it.  The best-scoring ids other than ``app`` are written to the app's
    record as ``top_5_app``.

    Args:
        app: id of the app to find related apps for.
        user_download_history: iterable of download histories, each a
            collection of app ids.

    Returns:
        None.  Nothing is stored when ``app`` was never downloaded.
    """
    similarity = {}  # {app_id: accumulated similarity}
    for u in user_download_history:
        if app in u:
            sim = Helper.cosine_similarity([app], u)
            for apps in u:
                similarity[apps] = similarity.get(apps, 0) + sim

    # The app has not been downloaded, therefore no related history
    if app not in similarity:
        return

    similarity.pop(app)
    sorted_result = sorted(similarity.items(),
                           key=operator.itemgetter(1),
                           reverse=True)
    # Slice instead of indexing 0..4 so that fewer than five candidates
    # no longer raise IndexError.
    top_5 = [entry[0] for entry in sorted_result[:5]]
    print(str(app) + " - top 5: " + str(top_5))
    DataService.update_app_info({'app_id': app},
                                {'$set': {'top_5_app': top_5}})
def main():
    """Compute the top-5 recommended apps for every user and time the run.

    Connects to MongoDB on localhost:27017, initialises ``DataService``
    and calls ``calculate_user_top_5`` for each (user id, download
    history) pair, passing all users' histories as the comparison set.
    The elapsed ``time.clock()`` interval is printed on success.
    Exceptions are printed; the client is closed if it was created.
    """
    client = None
    try:
        client = MongoClient('localhost', 27017)
        DataService.init(client)

        # work flow
        history_by_user = DataService.retrieve_user_download_history()
        # Only the disabled per-app pass below uses the app info.
        app_info = DataService.retrieve_app_info()

        # Disabled per-app pass:
        #   print("Calculating Top 5 related apps for apps...")
        #   start = time.clock()
        #   for app in app_info.keys():
        #       calculate_app_top_5(app, history_by_user.values())
        #   end = time.clock()
        #   print "time elapsed = " + str(end - start)

        print("Calculating Top 5 recommended apps for users...")
        started_at = time.clock()
        for user_id, own_history in history_by_user.iteritems():
            calculate_user_top_5(user_id, own_history,
                                 history_by_user.values())
        elapsed = time.clock() - started_at
        print("time elapsed = " + str(elapsed))
    except Exception as e:
        print(e)
    finally:
        # clean up work
        if client is not None:
            client.close()
def main():
    """Print, for each user, the apps most often related to their apps.

    ``calculate_top_5`` is run for every app id with a shared
    ``top_5_apps`` dict.  For each user, the entries of ``top_5_apps`` for
    the apps in their history are then counted, and the (up to) five most
    frequent app ids are printed.  The elapsed ``time.clock()`` interval
    is printed on success.  Exceptions are printed; the MongoDB client is
    closed if it was created.
    """
    client = None
    try:
        start = time.clock()
        client = MongoClient('localhost', 27017)
        DataService.init(client)
        user_download_history = DataService.retrieve_user_download_history()
        app_ids = DataService.retrieve_all_app_id()

        # presumably filled by calculate_top_5 as {app_id: related apps};
        # verify against that function.
        top_5_apps = {}
        for appp in app_ids:
            calculate_top_5(appp, user_download_history.values(), top_5_apps)

        for users, users_app in user_download_history.items():
            all_possible_app = {}  # {app_id: number of occurrences}
            for used_app_id in users_app:
                for i in top_5_apps[used_app_id]:
                    all_possible_app[i] = all_possible_app.get(i, 0) + 1

            sorted_tups = sorted(all_possible_app.items(),
                                 key=operator.itemgetter(1),
                                 reverse=True)
            # Slice instead of indexing 0..4 so that a user with fewer
            # than five candidates no longer aborts the run with
            # IndexError.
            top_5_user_app = [app_id for app_id, _ in sorted_tups[:5]]
            print("top_5_app " + str(users) + ":\t" + str(top_5_user_app))

        end = time.clock()
        print("time elapsed = " + str(end - start))
    except Exception as e:
        print(e)
    finally:
        if client is not None:
            client.close()
def main():
    """Compute and store the "Top 5" list for every app.

    Connects to MongoDB on localhost:27017 and initialises
    ``DataService``.  For each app id, the result of ``calculate_Top_5``
    is written to the app's record under the key ``"Top 5"``.  Exceptions
    are printed; the client is closed if it was created.
    """
    client = None
    try:
        client = MongoClient('localhost', 27017)
        DataService.init(client)
        apps = DataService.retrieve_appinfo()

        # Fetched once: the query takes no per-app argument, so repeating
        # it for every app only added database round trips.
        user_download_history = DataService.retrieve_user_download_history()

        # work flow
        for app in apps.keys():
            top_5_app = calculate_Top_5(app, user_download_history.values())
            DataService.update_app_info({"app_id": app},
                                        {"$set": {"Top 5": top_5_app}})
    except Exception as e:
        print(e)
    finally:
        # clean up work
        if client is not None:
            client.close()
def main():
    """Interactively run the per-app or per-user recommendation job.

    The user is prompted until they enter 1 (recommend apps for each app)
    or 2 (recommend apps for each user).  The work is split into batches
    of 500 ids, each handled by its own daemon process that writes into a
    shared manager dict.  Once all processes have finished, the results
    are written to MongoDB through ``DataService``.  Exceptions are
    printed, the client is closed if it was created, and the elapsed
    ``time.clock()`` interval is printed if a job was started.
    """
    noInput = True  # whether the user has yet to enter a valid command
    processes = []
    client = None
    # Stays None until a job starts, so the final report cannot fail on an
    # unbound name when an exception fires first.
    start = None
    try:
        while noInput:
            # NOTE(review): the int comparisons below rely on Python 2's
            # input(), which evaluates the typed text as an expression.
            # That is eval on user input; consider int(raw_input(...)).
            flag = input("If you want to calculate each app's recommend apps"
                         + " please press 1, If you want to calculate"
                         + " recommend apps for each user, please press"
                         + " 2: ")
            if flag == 1:
                noInput = False
                # start time
                start = time.clock()
                # set the shared dictionary
                result = multiprocessing.Manager().dict({})
                # get client and init it with DataService class
                client = MongoClient("localhost", 27017)
                DataService.init(client)
                # start analyzing
                userDownloadHistory = DataService.retrieveUserDownloadHistory()
                allApp = DataService.retrieveAppInfo()
                appIds = list(allApp.keys())
                for i in range(0, len(appIds), 500):
                    # each process will deal with 500 apps (slicing clamps
                    # the final, shorter batch)
                    p = multiprocessing.Process(
                        target=appWrapper,
                        args=(appIds[i:i + 500],
                              userDownloadHistory.values(),
                              result))
                    processes.append(p)
                # set process to daemon and start
                for p in processes:
                    p.daemon = True
                    p.start()
                # main process will wait other processes
                for p in processes:
                    p.join()
                # update MongoDB
                for key in result.keys():
                    DataService.updateAppInfo(
                        {"app_id": key},
                        {"$set": {"top 5 apps": result[key]}})
            elif flag == 2:
                noInput = False
                # start time
                start = time.clock()
                # set the shared dictionary
                result = multiprocessing.Manager().dict({})
                # user ids of the batch currently being collected
                keys = []
                # get client and init it with DataService class
                client = MongoClient("localhost", 27017)
                DataService.init(client)
                # start analyzing
                userDownloadHistory = DataService.retrieveUserDownloadHistory()
                for key in userDownloadHistory.keys():
                    keys.append(key)
                    # each process will deal with 500 users
                    if len(keys) == 500:
                        p = multiprocessing.Process(
                            target=userWrapper,
                            args=(keys, userDownloadHistory, result))
                        processes.append(p)
                        keys = []
                # the rest users
                p = multiprocessing.Process(
                    target=userWrapper,
                    args=(keys, userDownloadHistory, result))
                processes.append(p)
                # set process to daemon and start
                for p in processes:
                    p.daemon = True
                    p.start()
                # main process will wait other processes
                for p in processes:
                    p.join()
                # update MongoDB
                for key in result.keys():
                    DataService.updateUserInfo(
                        {"user_id": key},
                        {"$set": {"top 5 recommended apps": result[key]}})
            else:
                print("Sorry, you press the wrong number, please try again.")
    except Exception as e:
        print(e)
    finally:
        # close client
        if client is not None:
            client.close()

    if start is not None:
        end = time.clock()
        print("Finished! The elapsed time is " + str(end - start))
def load_top_5_apps(app_id):
    """Return the app info that ``DataService`` retrieves for ``app_id``.

    Args:
        app_id: id of the app, used as the ``app_id`` value of the query.
    """
    query = {'app_id': app_id}
    return DataService.retrieve_app_info(query)