def calculate_item_similarity(trainset, use_iuf_similarity=False):
    """
    Calculate item similarity matrix by building movie-users inverse table.
    The calculating will only between items which are voted by common users.

    :param use_iuf_similarity:  This is based on Item IUF similarity.
                                if a person views a lot of movies, items' similarity will be lower.
    :param trainset: trainset
    :return: similarity matrix
    """
    movie_popular, movie_count = calculate_movie_popular(trainset)

    # count co-occurrences between items (movies rated by the same user)
    print('generate items co-rated similarity matrix...')
    # the keys of movie_sim_mat are movie1's id;
    # the values of movie_sim_mat are dicts which save {movie2's id: co-occurrence count},
    # so you can view movie_sim_mat as a two-dimensional table.
    # TODO: consider a dense array instead of nested dicts,
    # although the matrix would be very sparse.
    movie_sim_mat = {}
    # record the time spent on the calculation.
    movie2users_time = LogTime(print_step=1000)
    for user, movies in trainset.items():
        for movie1 in movies:
            # set the default similarity between movie1 and other movies to zero
            movie_sim_mat.setdefault(movie1, defaultdict(int))
            for movie2 in movies:
                if movie1 == movie2:
                    continue
                # ignore the rating values; the item similarity matrix
                # focuses only on co-occurrence.
                if use_iuf_similarity:
                    # if a user has rated many movies, each pair's contribution is down-weighted.
                    movie_sim_mat[movie1][movie2] += 1 / math.log(1 +
                                                                  len(movies))
                else:
                    # original method: similarity based on the common-rater count.
                    movie_sim_mat[movie1][movie2] += 1
        # log steps and times.
        movie2users_time.count_time()
    print('generate items co-rated similarity matrix success.')
    movie2users_time.finish()

    # calculate item-item similarity matrix
    print('calculate item-item similarity matrix...')
    # record the time spent on the calculation.
    movie_sim_mat_time = LogTime(print_step=1000)
    for movie1, related_items in movie_sim_mat.items():
        len_movie1 = movie_popular[movie1]
        for movie2, count in related_items.items():
            len_movie2 = movie_popular[movie2]
            # similarity(movie1, movie2) =
            #     count(common raters) / sqrt(popularity(movie1) * popularity(movie2))
            movie_sim_mat[movie1][movie2] = count / math.sqrt(
                len_movie1 * len_movie2)
        # log progress every `print_step` movies.
        movie_sim_mat_time.count_time()

    print('calculate item-item similarity matrix success.')
    movie_sim_mat_time.finish()
    return movie_sim_mat, movie_popular, movie_count
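
To make the normalization concrete, here is a minimal, dependency-free sketch of the same co-occurrence-plus-cosine computation on a toy trainset (assuming the {user: {movie: rating}} layout used above; all names are illustrative):

import math
from collections import defaultdict

toy_trainset = {
    'u1': {'m1': 5, 'm2': 3},
    'u2': {'m1': 4, 'm3': 2},
    'u3': {'m1': 1, 'm2': 4, 'm3': 5},
}

popularity = defaultdict(int)  # movie -> number of raters
co_occurrence = defaultdict(lambda: defaultdict(int))
for user, movies in toy_trainset.items():
    for movie in movies:
        popularity[movie] += 1
    for m1 in movies:
        for m2 in movies:
            if m1 != m2:
                co_occurrence[m1][m2] += 1

similarity = {
    m1: {m2: count / math.sqrt(popularity[m1] * popularity[m2])
         for m2, count in related.items()}
    for m1, related in co_occurrence.items()
}
print(similarity['m1']['m2'])  # 2 common raters: 2 / sqrt(3 * 2) ~= 0.816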
Example #2
    def test(self, testset):
        """
        Test the recommendation system by recommending scores to all users in testset.
        :param testset: test dataset
        :return:
        """
        if not self.n_rec_movie or not self.trainset or not self.movie_popular or not self.movie_count:
            raise ValueError('UserCF has not init or fit method has not called yet.')
        self.testset = testset
        print('Test recommendation system start...')
        N = self.n_rec_movie
        #  variables for precision and recall
        hit = 0
        rec_count = 0
        test_count = 0
        # variables for coverage
        all_rec_movies = set()
        # variables for popularity
        popular_sum = 0

        # record the time spent on the test.
        test_time = LogTime(print_step=1000)
        for i, user in enumerate(self.trainset):
            test_movies = self.testset.get(user, {})
            rec_movies = self.recommend(user)  # type:list
            for movie in rec_movies:
                if movie in test_movies:
                    hit += 1
                all_rec_movies.add(movie)
                popular_sum += math.log(1 + self.movie_popular[movie])
            rec_count += N
            test_count += len(test_movies)
            # log progress every `print_step` users.
            test_time.count_time()
        precision = hit / (1.0 * rec_count)
        recall = hit / (1.0 * test_count)
        coverage = len(all_rec_movies) / (1.0 * self.movie_count)
        popularity = popular_sum / (1.0 * rec_count)

        print('Test recommendation system success.')
        test_time.finish()
        
        print('precision=%.4f\trecall=%.4f\tcoverage=%.4f\tpopularity=%.4f\n' %
              (precision, recall, coverage, popularity))
        summary = [precision, recall, coverage, popularity]
        
        return summary
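
The four metrics above reduce to simple ratios; a worked example with toy numbers (illustrative only, not from a real run):

import math

hit, rec_count, test_count = 30, 200, 150  # e.g. 20 users, N = 10 recs each
movie_count = 1000
all_rec_movies = set(range(120))           # 120 distinct movies recommended
popular_sum = 200 * math.log(1 + 50)       # as if every rec had 50 raters

precision = hit / rec_count                   # 0.15: hits per recommendation
recall = hit / test_count                     # 0.20: share of test items recovered
coverage = len(all_rec_movies) / movie_count  # 0.12: breadth of catalog used
popularity = popular_sum / rec_count          # ~3.93: avg log-popularity of recs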
Example #3
    def __init__(self, store=None, date=None):
        """ Initialization: takes a store object to populate the object.
            Sets self.metadata, a dictionary keyed by path whose values are
            FileMetadata objects.
        """
        # Avoid datetime.today() as a default argument value: it would be
        # evaluated only once, at function definition time.
        self.date = date if date is not None else datetime.today()
        if store is None:
            metadata = {}
        else:
            with LogTime(log.info,
                         "Collected %s metadata" % store.__class__.__name__):
                metadata = store.get_metadata()
            log.info("\tCollected %d total files" % len(metadata))

        self.metadata = metadata  # A dictionary with path as key and value a FileMetadata object
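
LogTime itself is not shown in these examples. A minimal sketch that is compatible with both usage styles seen in these snippets (the print_step counter in the recommender code and the logging context manager in this example) might look like this; the real implementations may differ:

import time

class LogTime:
    """Minimal sketch of a LogTime-compatible timer (assumed interface)."""

    def __init__(self, printer=print, words='', print_step=None, seconds=False):
        self.printer = printer
        self.words = words
        self.print_step = print_step
        self.seconds = seconds  # the real class may use this to switch units
        self.step = 0
        self.start = time.time()

    def count_time(self):
        # Report progress every print_step calls.
        self.step += 1
        if self.print_step and self.step % self.print_step == 0:
            self.printer('%d steps, %.2f seconds elapsed' %
                         (self.step, time.time() - self.start))

    def finish(self):
        self.printer('%s: %.2f seconds total' %
                     (self.words or 'elapsed', time.time() - self.start))

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.finish()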
Example #4
File: LFM.py  Project: suppawong/convcn
    def test(self, testset):
        """
        Test the recommendation system by recommending scores to all users in testset.
        :param testset: test dataset
        :return: None
        """
        self.testset = testset
        print('Test recommendation system start...')
        #  variables for precision and recall
        hit = 0
        rec_count = 0
        test_count = 0
        # variables for coverage
        all_rec_movies = set()
        # variables for popularity
        popular_sum = 0

        # record the time spent on the test.
        test_time = LogTime(print_step=1000)
        for user in self.users_set:
            test_movies = self.testset.get(user, {})
            rec_movies = self.recommend(user)  # type:list
            for movie in rec_movies:
                if movie in test_movies:
                    hit += 1
                all_rec_movies.add(movie)
                popular_sum += math.log(1 + self.item_popular[movie])
            rec_count += self.n_rec_movie
            test_count += len(test_movies)
            # log progress every `print_step` users.
            test_time.count_time()
        precision = hit / (1.0 * rec_count)
        recall = hit / (1.0 * test_count)
        coverage = len(all_rec_movies) / (1.0 * self.items_count)
        popularity = popular_sum / (1.0 * rec_count)
        print('Test recommendation system success.')
        test_time.finish()
        print('precision=%.4f\trecall=%.4f\tcoverage=%.4f\tpopularity=%.4f\n' %
              (precision, recall, coverage, popularity))
        
        summary = [precision, recall, coverage, popularity]
        return summary
Example #5
    def predict(self, testset):
        """
        Recommend movies to all users in testset.
        :param testset: test dataset
        :return: `dict`: recommendation list for each user.
        """
        movies_recommend = defaultdict(list)
        print('Predict scores start...')
        # record the time spent on prediction.
        predict_time = LogTime(print_step=500)
        for i, user in enumerate(testset):
            rec_movies = self.recommend(user)  # type: list
            movies_recommend[user].append(rec_movies)
            # log progress every `print_step` users.
            predict_time.count_time()
        print('Predict scores success.')
        predict_time.finish()
        return movies_recommend
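
One shape note on the defaultdict(list) above: append() nests the entire recommendation list, so each value ends up as a list of lists. A quick illustration (extend() would produce a flat list instead):

from collections import defaultdict

movies_recommend = defaultdict(list)
movies_recommend['u1'].append(['m1', 'm2'])  # {'u1': [['m1', 'm2']]}
movies_recommend['u2'].extend(['m1', 'm2'])  # {'u2': ['m1', 'm2']}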
Example #6
        model = LFM(10, 10, 0.1, 0.01, 10)
    else:
        raise ValueError('No model named ' + model_name)
    model.fit(trainset)
    recommend_test(model, [1, 100, 233, 666, 888])
    model.test(testset)


def recommend_test(model, user_list):
    for user in user_list:
        recommend = model.recommend(str(user))
        print("recommend for userid = %s:" % user)
        print(recommend)
        print()


if __name__ == '__main__':
    main_time = LogTime(words="Main Function")
    dataset_name = 'ml-100k'
    # dataset_name = 'ml-1m'
    # model_type = 'UserCF'
    # model_type = 'UserCF-IIF'
    # model_type = 'ItemCF'
    # model_type = 'Random'
    # model_type = 'MostPopular'
    # model_type = 'ItemCF-IUF'
    model_type = 'LFM'
    test_size = 0.1
    run_model(model_type, dataset_name, test_size, False)
    main_time.finish()
Example #7
def main(argv=None):
    if argv is None:
        argv = sys.argv
    if (len(argv) > 5) or (len(argv) < 4):
        print "Usage: " + argv[
            0] + " <config file> <domain> <v_node> [YYYY_MM_DD]"
        print "The config file is the same format as used for backups, backup dir, snapshot name and swift credentials are used"
        print 'The domain is the domain to be restored from swift and the v_node is the vertica node name to restore data for'
        print 'If the year/month/day is specified the most recent backup on that day will be downloaded rather than prompting'
        return 1

    config_file = argv[1]
    domain = argv[2]
    v_node_name = argv[3]
    if len(argv) == 5:
        day = argv[4]
    else:
        day = None
    config = yaml.safe_load(open(config_file, 'r'))  # safe_load: the config needs no arbitrary-object tags

    # Setup logging
    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
    log = logging.getLogger(__name__)

    with LogTime(log.info, "Restore download completed"):

        # Setup swift/paths
        base_dir, prefix_dir = calculate_paths(config, v_node_name)
        swift_store = SwiftStore(config['swift_key'],
                                 config['swift_region'],
                                 config['swift_tenant'],
                                 config['swift_url'],
                                 config['swift_user'],
                                 prefix_dir,
                                 domain=domain,
                                 vnode=v_node_name)
        fs_store = FSStore(base_dir, prefix_dir)

        # Get the metadata from the last restore (if any)
        current_metadata = DirectoryMetadata(fs_store)

        # Grab the swift metadata we want to restore
        if day is None:
            pickle = choose_one(swift_store.list_pickles(),
                                "Please choose a pickle to restore from")
        else:
            # The list is sorted newest first, so stop at the first pickle
            # matching the given day; pickle stays None if none match.
            pickle = None
            for option in swift_store.list_pickles():
                if option.startswith(day):
                    pickle = option
                    break

        if pickle is None:
            log.error('No backups found in swift.')
            sys.exit(1)
        swift_metadata = DirectoryMetadata.load_pickle(swift_store, pickle)

        # Compare the files in the current restore and swift and download/delete as necessary
        with LogTime(log.debug, "Diff completed", seconds=True):
            to_download, to_del = swift_metadata.diff(current_metadata)

        size_downloaded = 0
        with LogTime(log.info, "Download Completed"):
            for relative_path in to_download:
                size_downloaded += swift_store.download(
                    relative_path, base_dir)
        log.info("\tDownloaded %s in %d items" %
                 (sizeof_fmt(size_downloaded), len(to_download)))

        with LogTime(log.info, "Deleted %d items" % len(to_del)):
            for relative_path in to_del:
                fs_store.delete(relative_path)

        EpochFiles(os.path.join(base_dir, prefix_dir), config['snapshot_name'],
                   swift_metadata.date).restore()

        # Save the swift metadata to the local fs, to indicate the restore is done
        swift_metadata.save(fs_store)

    delete_pickles(fs_store)
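
DirectoryMetadata.diff is used in both directions in these scripts (choosing what to download during restore, and what to upload during backup). Its implementation is not shown; based on the call sites, a plausible sketch follows, though the real method may also compare sizes, hashes, or timestamps:

    def diff(self, other):
        # Paths present in self but missing or different in other:
        # to_download when called on swift metadata, to_add on local metadata.
        changed = [path for path, meta in self.metadata.items()
                   if other.metadata.get(path) != meta]
        # Paths only in other: candidates for deletion on the target side.
        only_in_other = [path for path in other.metadata
                         if path not in self.metadata]
        return changed, only_in_other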
Example #8
def main(argv=None):
    if argv is None:
        argv = sys.argv
    if len(argv) != 2:
        print "Usage: " + argv[0] + " <config file> "
        return 1

    requests.packages.urllib3.disable_warnings()
    config_file = argv[1]
    config = yaml.safe_load(open(config_file, 'r'))

    # Setup logging
    log_path = os.path.join(
        config['log_dir'],
        'backup_' + datetime.today().strftime('%A') + '.log')
    logging.basicConfig(format='%(asctime)s %(message)s',
                        filename=log_path,
                        level=logging.INFO)

    # LogTime is not used here so that the timing can be reported to nagios.
    start = time.time()
    exit_status = 0
    epoch_files = None

    # Run the vbr backup command; the vbr run is quite fast, typically completing in under a minute.
    if config['run_vbr']:
        run_vbr(config, 'init')
        run_vbr(config, 'backup')

    try:
        catalog_dir = config['catalog_dir']
        base_dir = config['backup_dir']
        prefix_dir = ''
        swift_store = SwiftStore(config['swift_key'], config['swift_region'],
                                 config['swift_tenant'], config['swift_url'],
                                 config['swift_user'], prefix_dir)
        fs_store = FSStore(base_dir, prefix_dir)
        upload_time = datetime.today()

        epoch_files = EpochFiles(os.path.join(base_dir,
                                              prefix_dir), catalog_dir,
                                 config['snapshot_name'], upload_time)
        epoch_files.archive()

        # Grab the local and swift metadata
        current_metadata = DirectoryMetadata(fs_store, upload_time)
        current_metadata.save(fs_store)
        swift_metadata = DirectoryMetadata(swift_store)

        # Compare the files in the current backup and swift and upload as necessary, then delete as necessary
        with LogTime(log.debug, "Diff operation completed", seconds=True):
            to_add, do_not_del = current_metadata.diff(swift_metadata)

        size_uploaded = 0
        with LogTime(log.info, "Uploaded Completed"):
            for relative_path in to_add:
                size_uploaded += swift_store.upload(relative_path, base_dir)
        log.info("\tUploaded %s in %d items" %
                 (sizeof_fmt(size_uploaded), len(to_add)))

        with LogTime(
                log.info, "Determining items to delete, retaining %d backups" %
                config['retain']):
            # Grab the pickle names I want to combine, relying on these being in order by date, newest first
            pickles = swift_store.list_pickles()
            combine_pickles = pickles[:config['retain']]

            # Take metadata in all these pickles combine.
            # It would be good to check that there is no overlap in filenames with different content.
            combined_metadata = DirectoryMetadata()
            for pickle in combine_pickles:
                pickle_metadata = DirectoryMetadata.load_pickle(
                    swift_store, pickle)
                combined_metadata.metadata.update(pickle_metadata.metadata)

            # Do a diff with all that is in swift, anything in swift but not in the combined set can be deleted.
            should_be_empty, to_del = combined_metadata.diff(swift_metadata)
            if len(should_be_empty) != 0:
                exit_status = 1
                log.error(
                    "ERROR: Found files in the %d combined retained backups that were not in swift.\n%s"
                    % (config['retain'], should_be_empty))

        with LogTime(log.info, "Deleted %d items" % len(to_del)):
            for relative_path in to_del:
                swift_store.delete(relative_path)

        # Upload today's metadata pickle; this is done last so that its presence
        # indicates the backup is complete.
        current_metadata.save(swift_store)

        # Clean up old pickles
        delete_pickles(fs_store)
        delete_pickles(swift_store, config['retain'])

    except Exception:
        log.exception('Unhandled Exception in Backup upload')
        # Move the Epoch files back to their original names so a retry run does not encounter issues with them
        if epoch_files is not None:
            epoch_files.restore()
        exit_status = 1

    # Status message and exit
    stop = time.time()
    duration = (stop - start) / 60
    duration_msg = "Backup completed in %d minutes total. Thresholds, warn %d.|%d" % \
                   (duration, config['warning'], duration)
    log.info(duration_msg)

    nagios_exit(exit_status, duration_msg, duration, config['warning'])
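
nagios_exit is not defined in this excerpt. Nagios plugins conventionally report via an exit code (0 OK, 1 WARNING, 2 CRITICAL) plus a status line, with optional perfdata after a '|', which matches the duration_msg format above. A hypothetical sketch, not the project's actual helper:

import sys

def nagios_exit(exit_status, msg, duration, warn_minutes):
    # Hypothetical: escalate to WARNING when the run exceeded its threshold.
    if exit_status == 0 and duration > warn_minutes:
        exit_status = 1
    print(msg)  # nagios reads the first line of plugin output
    sys.exit(exit_status)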
def calculate_user_similarity(trainset, use_iif_similarity=False):
    """
    Calculate user similarity matrix by building movie-users inverse table.
    The calculating will only between users which have common items votes.

    :param use_iif_similarity:  This is based on User IIF similarity.
                                if the item is very popular, users' similarity will be lower.
    :param trainset: trainset
    :return: similarity matrix
    """
    # build inverse table for item-users
    # key=movieID, value=list of userIDs who have seen this movie
    print('building movie-users inverse table...')
    movie2users = collections.defaultdict(set)
    movie_popular = defaultdict(int)

    for user, movies in trainset.items():
        for movie in movies:
            movie2users[movie].add(user)
            movie_popular[movie] += 1
    print('building movie-users inverse table success.')

    # save the total movie number, which will be used in evaluation
    movie_count = len(movie2users)
    print('total movie number = %d' % movie_count)

    # count co-rated items between users
    print('generate user co-rated movies similarity matrix...')
    # the keys of usersim_mat are user1's id;
    # the values of usersim_mat are dicts which save {user2's id: co-occurrence count},
    # so you can view usersim_mat as a two-dimensional table.
    # TODO: consider a dense array instead of nested dicts,
    # although the matrix would be very sparse.
    usersim_mat = {}
    # record the time spent on the calculation.
    movie2users_time = LogTime(print_step=1000)
    for movie, users in movie2users.items():
        for user1 in users:
            # set the default similarity between user1 and other users to zero
            usersim_mat.setdefault(user1, defaultdict(int))
            for user2 in users:
                if user1 == user2:
                    continue
                # ignore the rating values; the user similarity matrix
                # focuses only on co-occurrence.
                if use_iif_similarity:
                    # the more popular the item, the smaller its contribution
                    # to the pair's similarity.
                    usersim_mat[user1][user2] += 1 / math.log(1 + len(users))
                else:
                    # original method: similarity based on the common-item count.
                    usersim_mat[user1][user2] += 1
        # log steps and times.
        movie2users_time.count_time()
    print('generate user co-rated movies similarity matrix success.')
    movie2users_time.finish()

    # calculate user-user similarity matrix
    print('calculate user-user similarity matrix...')
    # record the time spent on the calculation.
    usersim_mat_time = LogTime(print_step=1000)
    for user1, related_users in usersim_mat.items():
        len_user1 = len(trainset[user1])
        for user2, count in related_users.items():
            len_user2 = len(trainset[user2])
            # similarity(user1, user2) =
            #     count(common movies) / sqrt(len(user1 movies) * len(user2 movies))
            usersim_mat[user1][user2] = count / math.sqrt(len_user1 * len_user2)
        # log progress every `print_step` users.
        usersim_mat_time.count_time()

    print('calculate user-user similarity matrix success.')
    usersim_mat_time.finish()
    return usersim_mat, movie_popular, movie_count
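
The IIF penalty in the inner loop above discounts popular items when accumulating user-user co-occurrence. A standalone illustration of the weight (not part of the original module):

import math

def iif_weight(n_raters):
    # Contribution of one co-rated item to a user pair's similarity.
    return 1 / math.log(1 + n_raters)

print(iif_weight(2))     # niche item:   ~0.910
print(iif_weight(1000))  # popular item: ~0.145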