def calculate_item_similarity(trainset, use_iuf_similarity=False):
    """
    Calculate the item similarity matrix by building a movie-users inverse table.
    Similarity is only calculated between items that are rated by common users.
    :param use_iuf_similarity: use Item-IUF similarity; if a person has watched
            many movies, their contribution to item similarity is weighted lower.
    :param trainset: trainset
    :return: similarity matrix
    """
    movie_popular, movie_count = calculate_movie_popular(trainset)

    # count co-rated items between users
    print('generate items co-rated similarity matrix...')
    # the keys of movie_sim_mat are movie1's id,
    # the values of movie_sim_mat are dicts which save {movie2's id: co-occurrence times},
    # so movie_sim_mat can be viewed as a two-dimensional table.
    # TODO: do not use a dict to store the matrix; use a list instead.
    # TODO: if a list is used, the matrix will be very sparse.
    movie_sim_mat = {}
    # record the time the calculation has spent.
    movie2users_time = LogTime(print_step=1000)
    for user, movies in trainset.items():
        for movie1 in movies:
            # default similarity between movie1 and other movies is zero
            movie_sim_mat.setdefault(movie1, defaultdict(int))
            for movie2 in movies:
                if movie1 == movie2:
                    continue
                # ignore the rating values:
                # the item similarity matrix only focuses on co-occurrence.
                if use_iuf_similarity:
                    # if a person has watched many movies, their contribution to item similarity is lower.
                    movie_sim_mat[movie1][movie2] += 1 / math.log(1 + len(movies))
                else:
                    # original method: item similarity based on co-occurrence count.
                    movie_sim_mat[movie1][movie2] += 1
        # log steps and times.
        movie2users_time.count_time()
    print('generate items co-rated similarity matrix success.')
    movie2users_time.finish()

    # calculate item-item similarity matrix
    print('calculate item-item similarity matrix...')
    # record the time the calculation has spent.
    movie_sim_mat_time = LogTime(print_step=1000)
    for movie1, related_items in movie_sim_mat.items():
        len_movie1 = movie_popular[movie1]
        for movie2, count in related_items.items():
            len_movie2 = movie_popular[movie2]
            # similarity of movie1 and movie2 is
            # len(common users) / sqrt(len(movie1 users) * len(movie2 users))
            movie_sim_mat[movie1][movie2] = count / math.sqrt(len_movie1 * len_movie2)
        # log steps and times.
        movie_sim_mat_time.count_time()
    print('calculate item-item similarity matrix success.')
    movie_sim_mat_time.finish()
    return movie_sim_mat, movie_popular, movie_count
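# A minimal usage sketch of calculate_item_similarity (illustrative only). It assumes
# the trainset is a nested dict of {user_id: {movie_id: rating}}, which is how the
# loops above iterate it; the ids and ratings below are made up.
def _item_similarity_example():
    toy_trainset = {
        'u1': {'m1': 5, 'm2': 3},
        'u2': {'m1': 4, 'm2': 2, 'm3': 1},
        'u3': {'m2': 4, 'm3': 5},
    }
    movie_sim_mat, movie_popular, movie_count = calculate_item_similarity(toy_trainset)
    # m1 and m2 are co-rated by two users, m1 is rated twice and m2 three times,
    # so sim(m1, m2) = 2 / sqrt(2 * 3) ≈ 0.816
    print(movie_sim_mat['m1']['m2'])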
def test(self, testset):
    """
    Test the recommendation system by recommending movies to all users in the testset.
    :param testset: test dataset
    :return: summary list [precision, recall, coverage, popularity]
    """
    if not self.n_rec_movie or not self.trainset or not self.movie_popular or not self.movie_count:
        raise ValueError('UserCF has not been initialized or the fit method has not been called yet.')
    self.testset = testset
    print('Test recommendation system start...')
    N = self.n_rec_movie
    # variables for precision and recall
    hit = 0
    rec_count = 0
    test_count = 0
    # variables for coverage
    all_rec_movies = set()
    # variables for popularity
    popular_sum = 0
    # record the time the test has spent.
    test_time = LogTime(print_step=1000)
    for i, user in enumerate(self.trainset):
        test_movies = self.testset.get(user, {})
        rec_movies = self.recommend(user)  # type: list
        for movie in rec_movies:
            if movie in test_movies:
                hit += 1
            all_rec_movies.add(movie)
            popular_sum += math.log(1 + self.movie_popular[movie])
        rec_count += N
        test_count += len(test_movies)
        # log steps and times.
        test_time.count_time()
    precision = hit / (1.0 * rec_count)
    recall = hit / (1.0 * test_count)
    coverage = len(all_rec_movies) / (1.0 * self.movie_count)
    popularity = popular_sum / (1.0 * rec_count)
    print('Test recommendation system success.')
    test_time.finish()
    print('precision=%.4f\trecall=%.4f\tcoverage=%.4f\tpopularity=%.4f\n' %
          (precision, recall, coverage, popularity))
    return [precision, recall, coverage, popularity]
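# A small worked example of the four metrics above, using made-up counts
# (the numbers below are illustrative, not from any real run):
def _metrics_example():
    n_rec_movie, n_users = 10, 100           # N recommendations for each of 100 users
    hit, test_count = 250, 1000              # 250 recommendations appear among 1000 test ratings
    distinct_recommended, movie_count = 400, 1682
    rec_count = n_rec_movie * n_users
    precision = hit / (1.0 * rec_count)                     # 250 / 1000 = 0.25
    recall = hit / (1.0 * test_count)                       # 250 / 1000 = 0.25
    coverage = distinct_recommended / (1.0 * movie_count)   # 400 / 1682 ≈ 0.238
    # popularity would be the mean of log(1 + movie_popular[movie]) over all recommendations.
    return precision, recall, coverage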
def __init__(self, store=None, date=None):
    """
    Initialization, takes a store object and prefix to populate the object.
    Sets self.metadata, a dictionary with path as key and a FileMetadata object as value.
    """
    # default the date at call time rather than definition time
    self.date = date if date is not None else datetime.today()
    if store is None:
        metadata = {}
    else:
        with LogTime(log.info, "Collected %s metadata" % store.__class__.__name__):
            metadata = store.get_metadata()
            log.info("\tCollected %d total files" % len(metadata))
    self.metadata = metadata  # a dictionary with path as key and a FileMetadata object as value
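# An illustrative sketch (based on how DirectoryMetadata is used by the backup and
# restore mains below) of comparing two snapshots with diff(); the store names are
# placeholders. diff() appears to return the paths unique to self first, then the
# paths unique to the other snapshot.
#
#   current = DirectoryMetadata(fs_store)
#   remote = DirectoryMetadata.load_pickle(swift_store, pickle_name)
#   to_download, to_delete = remote.diff(current)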
def test(self, testset):
    """
    Test the recommendation system by recommending movies to all users in the testset.
    :param testset: test dataset
    :return: summary list [precision, recall, coverage, popularity]
    """
    self.testset = testset
    print('Test recommendation system start...')
    # variables for precision and recall
    hit = 0
    rec_count = 0
    test_count = 0
    # variables for coverage
    all_rec_movies = set()
    # variables for popularity
    popular_sum = 0
    # record the time the test has spent.
    test_time = LogTime(print_step=1000)
    for user in self.users_set:
        test_movies = self.testset.get(user, {})
        rec_movies = self.recommend(user)  # type: list
        for movie in rec_movies:
            if movie in test_movies:
                hit += 1
            all_rec_movies.add(movie)
            popular_sum += math.log(1 + self.item_popular[movie])
        rec_count += self.n_rec_movie
        test_count += len(test_movies)
        # log steps and times.
        test_time.count_time()
    precision = hit / (1.0 * rec_count)
    recall = hit / (1.0 * test_count)
    coverage = len(all_rec_movies) / (1.0 * self.items_count)
    popularity = popular_sum / (1.0 * rec_count)
    print('Test recommendation system success.')
    test_time.finish()
    print('precision=%.4f\trecall=%.4f\tcoverage=%.4f\tpopularity=%.4f\n' %
          (precision, recall, coverage, popularity))
    return [precision, recall, coverage, popularity]
def predict(self, testset):
    """
    Recommend movies to all users in the testset.
    :param testset: test dataset
    :return: `dict`: recommend list for each user.
    """
    movies_recommend = defaultdict(list)
    print('Predict scores start...')
    # record the time the prediction has spent.
    predict_time = LogTime(print_step=500)
    for i, user in enumerate(testset):
        rec_movies = self.recommend(user)  # type: list
        movies_recommend[user].append(rec_movies)
        # log steps and times.
        predict_time.count_time()
    print('Predict scores success.')
    predict_time.finish()
    return movies_recommend
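# A brief usage sketch (hypothetical ids). Note that predict() appends each user's
# whole recommendation list, so every value in the returned dict is a list that
# contains one list of movie ids:
#
#   recommendations = model.predict(testset)
#   recommendations[some_user_id]  ->  [[movie_id_1, movie_id_2, ...]]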
        model = LFM(10, 10, 0.1, 0.01, 10)
    else:
        raise ValueError('No model named ' + model_name)
    model.fit(trainset)
    recommend_test(model, [1, 100, 233, 666, 888])
    model.test(testset)


def recommend_test(model, user_list):
    for user in user_list:
        recommend = model.recommend(str(user))
        print("recommend for userid = %s:" % user)
        print(recommend)
        print()


if __name__ == '__main__':
    main_time = LogTime(words="Main Function")
    dataset_name = 'ml-100k'
    # dataset_name = 'ml-1m'
    # model_type = 'UserCF'
    # model_type = 'UserCF-IIF'
    # model_type = 'ItemCF'
    # model_type = 'Random'
    # model_type = 'MostPopular'
    # model_type = 'ItemCF-IUF'
    model_type = 'LFM'
    test_size = 0.1
    run_model(model_type, dataset_name, test_size, False)
    main_time.finish()
def main(argv=None):
    if argv is None:
        argv = sys.argv
    if (len(argv) > 5) or (len(argv) < 4):
        print "Usage: " + argv[0] + " <config file> <domain> <v_node> [YYYY_MM_DD]"
        print "The config file is the same format as used for backups; the backup dir, snapshot name and swift credentials are used."
        print "The domain is the domain to restore from swift and v_node is the vertica node name to restore data for."
        print "If the year/month/day is specified, the most recent backup on that day is downloaded rather than prompting."
        return 1

    config_file = argv[1]
    domain = argv[2]
    v_node_name = argv[3]
    if len(argv) == 5:
        day = argv[4]
    else:
        day = None

    config = yaml.load(open(config_file, 'r'))

    # Setup logging
    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
    log = logging.getLogger(__name__)

    with LogTime(log.info, "Restore download completed"):
        # Setup swift/paths
        base_dir, prefix_dir = calculate_paths(config, v_node_name)
        swift_store = SwiftStore(config['swift_key'], config['swift_region'],
                                 config['swift_tenant'], config['swift_url'],
                                 config['swift_user'], prefix_dir,
                                 domain=domain, vnode=v_node_name)
        fs_store = FSStore(base_dir, prefix_dir)

        # Get the metadata from the last restore (if any)
        current_metadata = DirectoryMetadata(fs_store)

        # Grab the swift metadata we want to restore
        if day is None:
            pickle = choose_one(swift_store.list_pickles(),
                                "Please choose a pickle to restore from")
        else:
            # Since the list is sorted this finds the newest backup matching the given day, or None otherwise
            pickle = None
            for option in swift_store.list_pickles():
                if option.startswith(day):
                    pickle = option
        if pickle is None:
            log.error('No backups found in swift.')
            sys.exit(1)
        swift_metadata = DirectoryMetadata.load_pickle(swift_store, pickle)

        # Compare the files in the current restore and swift, then download/delete as necessary
        with LogTime(log.debug, "Diff completed", seconds=True):
            to_download, to_del = swift_metadata.diff(current_metadata)

        size_downloaded = 0
        with LogTime(log.info, "Download Completed"):
            for relative_path in to_download:
                size_downloaded += swift_store.download(relative_path, base_dir)
        log.info("\tDownloaded %s in %d items" %
                 (sizeof_fmt(size_downloaded), len(to_download)))

        with LogTime(log.info, "Deleted %d items" % len(to_del)):
            for relative_path in to_del:
                fs_store.delete(relative_path)

        EpochFiles(os.path.join(base_dir, prefix_dir),
                   config['snapshot_name'], swift_metadata.date).restore()

        # Save the swift metadata to the local fs to indicate the restore is done
        swift_metadata.save(fs_store)
        delete_pickles(fs_store)
def main(argv=None):
    if argv is None:
        argv = sys.argv
    if len(argv) != 2:
        print "Usage: " + argv[0] + " <config file>"
        return 1

    requests.packages.urllib3.disable_warnings()
    config_file = argv[1]
    config = yaml.load(open(config_file, 'r'))

    # Setup logging
    log_path = os.path.join(config['log_dir'],
                            'backup_' + datetime.today().strftime('%A') + '.log')
    logging.basicConfig(format='%(asctime)s %(message)s', filename=log_path,
                        level=logging.INFO)

    # LogTime is not used for the overall run so the timing can be reported to nagios
    start = time.time()
    exit_status = 0
    epoch_files = None

    # Run the vbr backup command - the vbr run is quite fast, typically completing in less than a minute
    if config['run_vbr']:
        run_vbr(config, 'init')
        run_vbr(config, 'backup')

    try:
        catalog_dir = config['catalog_dir']
        base_dir = config['backup_dir']
        prefix_dir = ''
        swift_store = SwiftStore(config['swift_key'], config['swift_region'],
                                 config['swift_tenant'], config['swift_url'],
                                 config['swift_user'], prefix_dir)
        fs_store = FSStore(base_dir, prefix_dir)

        upload_time = datetime.today()
        epoch_files = EpochFiles(os.path.join(base_dir, prefix_dir), catalog_dir,
                                 config['snapshot_name'], upload_time)
        epoch_files.archive()

        # Grab the local and swift metadata
        current_metadata = DirectoryMetadata(fs_store, upload_time)
        current_metadata.save(fs_store)
        swift_metadata = DirectoryMetadata(swift_store)

        # Compare the files in the current backup and swift, upload as necessary, then delete as necessary
        with LogTime(log.debug, "Diff operation completed", seconds=True):
            to_add, do_not_del = current_metadata.diff(swift_metadata)

        size_uploaded = 0
        with LogTime(log.info, "Upload Completed"):
            for relative_path in to_add:
                size_uploaded += swift_store.upload(relative_path, base_dir)
        log.info("\tUploaded %s in %d items" %
                 (sizeof_fmt(size_uploaded), len(to_add)))

        with LogTime(log.info,
                     "Determining items to delete, retaining %d backups" % config['retain']):
            # Grab the pickle names to combine, relying on these being ordered by date, newest first
            pickles = swift_store.list_pickles()
            combine_pickles = pickles[:config['retain']]

            # Combine the metadata from all these pickles.
            # It would be good to check that there is no overlap in filenames with different content.
            combined_metadata = DirectoryMetadata()
            for pickle in combine_pickles:
                pickle_metadata = DirectoryMetadata.load_pickle(swift_store, pickle)
                combined_metadata.metadata.update(pickle_metadata.metadata)

            # Diff against everything in swift; anything in swift but not in the combined set can be deleted.
            should_be_empty, to_del = combined_metadata.diff(swift_metadata)
            if len(should_be_empty) != 0:
                exit_status = 1
                log.error("ERROR: Found files in the %d combined retained backups that were not in swift.\n%s"
                          % (config['retain'], should_be_empty))

        with LogTime(log.info, "Deleted %d items" % len(to_del)):
            for relative_path in to_del:
                swift_store.delete(relative_path)

        # Upload today's metadata pickle last so its presence indicates the backup is done.
        current_metadata.save(swift_store)

        # Clean up old pickles
        delete_pickles(fs_store)
        delete_pickles(swift_store, config['retain'])
    except Exception:
        log.exception('Unhandled Exception in Backup upload')
        # Move the Epoch files back to their original names so a retry run does not encounter issues with them
        if epoch_files is not None:
            epoch_files.restore()
        exit_status = 1

    # Status message and exit
    stop = time.time()
    duration = (stop - start) / 60
    duration_msg = "Backup completed in %d minutes total. Thresholds, warn %d.|%d" % \
                   (duration, config['warning'], duration)
    log.info(duration_msg)
    nagios_exit(exit_status, duration_msg, duration, config['warning'])
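# For reference, an illustrative (not authoritative) list of the config keys the backup
# and restore scripts above read from the YAML file; only the key names come from the
# code, their formats and defaults are not specified here:
#   backup_dir, catalog_dir, log_dir, snapshot_name, run_vbr, retain, warning,
#   swift_key, swift_region, swift_tenant, swift_url, swift_user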
def calculate_user_similarity(trainset, use_iif_similarity=False):
    """
    Calculate the user similarity matrix by building a movie-users inverse table.
    Similarity is only calculated between users who have rated common items.
    :param use_iif_similarity: use User-IIF similarity; if an item is very popular,
            its contribution to user similarity is weighted lower.
    :param trainset: trainset
    :return: similarity matrix
    """
    # build inverse table for item-users
    # key = movieID, value = set of userIDs who have seen this movie
    print('building movie-users inverse table...')
    movie2users = collections.defaultdict(set)
    movie_popular = defaultdict(int)

    for user, movies in trainset.items():
        for movie in movies:
            movie2users[movie].add(user)
            movie_popular[movie] += 1
    print('building movie-users inverse table success.')

    # save the total movie number, which will be used in evaluation
    movie_count = len(movie2users)
    print('total movie number = %d' % movie_count)

    # count co-rated items between users
    print('generate user co-rated movies similarity matrix...')
    # the keys of usersim_mat are user1's id,
    # the values of usersim_mat are dicts which save {user2's id: co-occurrence times},
    # so usersim_mat can be viewed as a two-dimensional table.
    # TODO: do not use a dict to store the matrix; use a list instead.
    # TODO: if a list is used, the matrix will be very sparse.
    usersim_mat = {}
    # record the time the calculation has spent.
    movie2users_time = LogTime(print_step=1000)
    for movie, users in movie2users.items():
        for user1 in users:
            # default similarity between user1 and other users is zero
            usersim_mat.setdefault(user1, defaultdict(int))
            for user2 in users:
                if user1 == user2:
                    continue
                # ignore the rating values:
                # the user similarity matrix only focuses on co-occurrence.
                if use_iif_similarity:
                    # if the item is very popular, its contribution to user similarity is lower.
                    usersim_mat[user1][user2] += 1 / math.log(1 + len(users))
                else:
                    # original method: user similarity based on common items count.
                    usersim_mat[user1][user2] += 1
        # log steps and times.
        movie2users_time.count_time()
    print('generate user co-rated movies similarity matrix success.')
    movie2users_time.finish()

    # calculate user-user similarity matrix
    print('calculate user-user similarity matrix...')
    # record the time the calculation has spent.
    usersim_mat_time = LogTime(print_step=1000)
    for user1, related_users in usersim_mat.items():
        len_user1 = len(trainset[user1])
        for user2, count in related_users.items():
            len_user2 = len(trainset[user2])
            # similarity of user1 and user2 is
            # len(common movies) / sqrt(len(user1 movies) * len(user2 movies))
            usersim_mat[user1][user2] = count / math.sqrt(len_user1 * len_user2)
        # log steps and times.
        usersim_mat_time.count_time()
    print('calculate user-user similarity matrix success.')
    usersim_mat_time.finish()
    return usersim_mat, movie_popular, movie_count
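# A minimal usage sketch of calculate_user_similarity, mirroring the item example
# earlier; the toy data is made up. Without IIF, u1 and u2 share two movies (m1, m2),
# u1 has rated 2 movies and u2 has rated 3, so sim(u1, u2) = 2 / sqrt(2 * 3) ≈ 0.816.
# With use_iif_similarity=True each shared movie contributes 1 / log(1 + popularity)
# instead of 1, so sharing a very popular movie counts for less.
def _user_similarity_example():
    toy_trainset = {
        'u1': {'m1': 5, 'm2': 3},
        'u2': {'m1': 4, 'm2': 2, 'm3': 1},
        'u3': {'m2': 4, 'm3': 5},
    }
    usersim_mat, movie_popular, movie_count = calculate_user_similarity(toy_trainset)
    print(usersim_mat['u1']['u2'])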