Exemplo n.º 1
0
def naive_baseline(omitMovie=True):
    """Fit and score a libFM regression model with no active-learning step.

    The sorted rating data is split with ``make_slice`` into a training
    slice (``range(1, 1201)``) and a test slice (``range(1201, 1441)``).
    Each slice is stored as raw data and as a libFM file under the names
    ``train_step1.libfm`` / ``test_step1.libfm``.  libFM is then run in
    regression mode (``-task r``, MCMC) and ``compute_error.computer_error()``
    is called afterwards (presumably it scores ``Generate/prediction`` --
    defined elsewhere, confirm there).

    :param omitMovie: forwarded unchanged to ``generate_libfm_data`` for both
        the training and the test file.
    """
    train_slice = range(1, 1201)
    test_slice = range(1201, 1441)
    # An SGD variant of this run (-method sgd, -learn_rate 0.001, -iter 160)
    # was tried by the original author; the MCMC run below is the active one.
    regression_cmd = (
        "./Generate/libFM -task r -train Generate/train_step1.libfm -test Generate/test_step1.libfm "
        "-method mcmc -dim '1,1, 80' -out Generate/prediction"
    )

    db = MovieDataBase()
    db.generate_complete_rating_data(regenerate=False)

    # Sort once and keep a copy so the test slice can be cut from the same
    # ordering after the database has been narrowed to the training slice.
    db.make_sorted_rating_data([5, 1, 0])
    full_sorted_ratings = db.out_rating_data[:]
    db.synchronize()

    # Training slice.
    db.make_slice(count_movie_slice=train_slice)
    db.synchronize()
    db.store_data_to_file(fileName='train_original_data')
    db.generate_libfm_data(omitMovie=omitMovie)
    db.store_data_to_file(db.libfm_data, 'train_step1.libfm')

    # Test slice, restored from the saved full ordering; not shuffled.
    db.load_core_rating_data(full_sorted_ratings)
    db.make_slice(count_movie_slice=test_slice)
    db.synchronize()
    db.store_data_to_file(fileName='test_original_data')
    db.generate_libfm_data(omitMovie=omitMovie, shuffle=False)
    db.store_data_to_file(db.libfm_data, 'test_step1.libfm')

    subprocess.call(regression_cmd, shell=True)
    compute_error.computer_error()
def baseline1_random(active_learning_ratio=0.1):
    print '\n==================================='
    print 'baseline1 with random choosing active learning data started'
    print '==================================='

    """ Choose active learning data randomly """
    number_of_active_data = (int(dataProcess.movieDataBase.TOTAL_USERS*active_learning_ratio)-1)*240
    number_of_index = len(dataProcess.test_addAllNegative_data)
    selected_data_positions = set()
    while len(selected_data_positions) < number_of_active_data:
        selected_data_positions.add(random.randint(0, number_of_index-1))

    alternative_user_movie_list = []
    for index in selected_data_positions:
        userId = dataProcess.test_addAllNegative_data[index][0]
        movieId = dataProcess.test_addAllNegative_data[index][1]
        alternative_user_movie_list.append([userId, movieId])
    # movieDataBase.make_alternative_user_movie_matrix(alternative_user_movie_list)

    """ add active learning result into train_original_data """
    print '\n==================================='
    print 'active learning regression started'
    print '==================================='
    dataProcess.movieDataBase.make_user_movie_rating_matrix(dataProcess.test_original_data)
    active_learning_train_data = []

    # this scheme is adding every thing in active learning
    count = 0
    for values in alternative_user_movie_list:
        userId = values[0]
        movieId = values[1]
        ratings = dataProcess.movieDataBase.user_movie_rating_matrix.get(userId)
        if ratings is not None:
            rating = ratings.get(movieId)
            if rating is not None:
                active_learning_train_data.append([userId, movieId, rating])
                count += 1

    train_add_active_learning_data = dataProcess.train_original_data + active_learning_train_data
    dataProcess.movieDataBase.store_data_to_file(train_add_active_learning_data, fileName='train_add_active_learning_data')
    dataProcess.movieDataBase.generate_libfm_data(train_add_active_learning_data)
    dataProcess.movieDataBase.store_data_to_file(dataProcess.movieDataBase.libfm_data, fileName='train_step3.libfm')
    # subprocess.call("./Generate/libFM -task r -train Generate/train_step3.libfm -test Generate/test_step3.libfm "
    #                 "-method mcmc -out Generate/prediction", shell=True)
    # compute_error.computer_error()

    subprocess.call("./Generate/libFM -task r -train Generate/train_step3.libfm -test Generate/test_step3.libfm "
                    "-method sgd -dim '1,1, 200' -learn_rate 0.001 -iter 30 -out Generate/prediction", shell=True)
    print 'number of alternative user-movie requests=', len(alternative_user_movie_list)
    print 'number of gained active learning user-movie data=', count
    compute_error.computer_error()
def baseline2_random_after_classification(active_learning_ratio=0.1):
    print "\n==================================="
    print "baseline2 random choosing active learning data after classification started"
    print "==================================="

    """the test data is still in movieDataBase.core. Next is the step 1 """
    print "\n==================================="
    print "step 1 binary classification started"
    print "==================================="

    subprocess.call(
        "./Generate/libFM -task c -train Generate/train_step1.libfm -test Generate/test_step1.libfm "
        "-method mcmc -iter 1 -out Generate/prediction",
        shell=True,
    )
    # subprocess.call("./Generate/libFM -task c -train Generate/train_step1.libfm -test Generate/test_step1.libfm "
    #                 "-method sgd -learn_rate 0.01 -out Generate/prediction", shell=True)
    selected_data_positions = compute_classification.compute_classification(len(dataProcess.test_original_data))

    positive_user_id_movie_id = numpy.zeros(
        [dataProcess.movieDataBase.TOTAL_USERS + 1, dataProcess.movieDataBase.TOTAL_MOVIES + 1], dtype=numpy.bool_
    )

    for index in selected_data_positions:
        userId = dataProcess.test_addAllNegative_data[index][0]
        movieId = dataProcess.test_addAllNegative_data[index][1]
        positive_user_id_movie_id[userId][movieId] = numpy.True_

    alternative_user_movie_list = []
    for choose_movie_num in range(1, dataProcess.TEST_MOVIES + 1):
        movieId = dataProcess.test_from_movie_num_get_movie_id[choose_movie_num]
        choose_user_id_set = set()
        """first add all positive users"""
        for userId in range(1, dataProcess.movieDataBase.TOTAL_USERS + 1):
            if positive_user_id_movie_id[userId][movieId] == numpy.True_:
                choose_user_id_set.add(userId)

        if len(choose_user_id_set) > dataProcess.movieDataBase.TOTAL_USERS * active_learning_ratio:
            choose_user_id_set = list(choose_user_id_set)[
                : int(dataProcess.movieDataBase.TOTAL_USERS * active_learning_ratio)
            ]
        else:
            while len(choose_user_id_set) < dataProcess.movieDataBase.TOTAL_USERS * active_learning_ratio:
                choose_user_id_set.add(random.randint(1, dataProcess.movieDataBase.TOTAL_USERS))
        choose_user_id_set = list(choose_user_id_set)[
            : int(dataProcess.movieDataBase.TOTAL_USERS * active_learning_ratio) - 1
        ]

        for choose_user_id in choose_user_id_set:
            alternative_user_movie_list.append([choose_user_id, movieId])

    """ add active learning result into train_original_data """
    print "\n==================================="
    print "step active learning regression started"
    print "==================================="
    dataProcess.movieDataBase.make_user_movie_rating_matrix(dataProcess.test_original_data)
    active_learning_train_data = []

    # this scheme is adding every thing in active learning
    count = 0
    for values in alternative_user_movie_list:
        userId = values[0]
        movieId = values[1]
        ratings = dataProcess.movieDataBase.user_movie_rating_matrix.get(userId)
        if ratings is not None:
            rating = ratings.get(movieId)
            if rating is not None:
                active_learning_train_data.append([userId, movieId, rating])
                count += 1

    train_add_active_learning_data = dataProcess.train_original_data + active_learning_train_data
    dataProcess.movieDataBase.store_data_to_file(
        train_add_active_learning_data, fileName="train_add_active_learning_data"
    )
    dataProcess.movieDataBase.generate_libfm_data(train_add_active_learning_data)
    dataProcess.movieDataBase.store_data_to_file(dataProcess.movieDataBase.libfm_data, fileName="train_step3.libfm")
    # subprocess.call("./Generate/libFM -task r -train Generate/train_step3.libfm -test Generate/test_step3.libfm "
    #                 "-method mcmc -out Generate/prediction", shell=True)
    # compute_error.computer_error()

    subprocess.call(
        "./Generate/libFM -task r -train Generate/train_step3.libfm -test Generate/test_step3.libfm "
        "-method sgd -dim '1,1, 200' -learn_rate 0.01 -iter 30 -out Generate/prediction",
        shell=True,
    )
    print "number of alternative user-movie requests=", len(alternative_user_movie_list)
    print "number of gained active learning user-movie data=", count
    compute_error.computer_error()
def experiment2_user_qualification_invited_limitation(active_learning_ratio=0.1, MAX_INVITED=50):
    print "\n==================================="
    print "experiment2 choosing qualified active learning data after classification started with invitation limit"
    print "==================================="

    """the test data is still in movieDataBase.core. Next is the step 1 """
    print "\n==================================="
    print "step 1 binary classification started"
    print "==================================="

    subprocess.call(
        "./Generate/libFM -task c -train Generate/train_step1.libfm -test Generate/test_step1.libfm "
        "-method mcmc -iter 1 -out Generate/prediction",
        shell=True,
    )
    # subprocess.call("./Generate/libFM -task c -train Generate/train_step1.libfm -test Generate/test_step1.libfm "
    #                 "-method sgd -learn_rate 0.01 -out Generate/prediction", shell=True)
    selected_data_positions = compute_classification.compute_classification(len(dataProcess.test_original_data))

    print "\n==================================="
    print "choosing users with low rating MSE"
    print "==================================="

    positive_user_id_movie_id = numpy.zeros(
        [dataProcess.movieDataBase.TOTAL_USERS + 1, dataProcess.movieDataBase.TOTAL_MOVIES + 1], dtype=numpy.bool_
    )

    for index in selected_data_positions:
        userId = dataProcess.test_addAllNegative_data[index][0]
        movieId = dataProcess.test_addAllNegative_data[index][1]
        positive_user_id_movie_id[userId][movieId] = numpy.True_

    user_mse_list = list(enumerate(dataProcess.movieDataBase.user_MSE))
    user_mse_sorted_list = sorted(user_mse_list, key=lambda x: x[1])
    user_invited_times = [0] * (dataProcess.movieDataBase.TOTAL_USERS + 1)

    alternative_user_movie_list = []
    """ notice that I shuffle the list here"""
    movie_choose_list = range(1, dataProcess.TEST_MOVIES + 1)
    random.shuffle(movie_choose_list)
    for choose_movie_num in movie_choose_list:
        movieId = dataProcess.test_from_movie_num_get_movie_id[choose_movie_num]
        choose_user_id_set = set()
        """first add qualified and positive users"""
        for userId, _ in user_mse_sorted_list:
            if positive_user_id_movie_id[userId][movieId] == numpy.True_ and user_invited_times[userId] < MAX_INVITED:
                choose_user_id_set.add(userId)

        if len(choose_user_id_set) > dataProcess.movieDataBase.TOTAL_USERS * active_learning_ratio:
            choose_user_id_set = list(choose_user_id_set)[
                : int(dataProcess.movieDataBase.TOTAL_USERS * active_learning_ratio)
            ]
        else:
            """second add positive users"""
            for userId, _ in user_mse_sorted_list:
                if (
                    user_invited_times[userId] < MAX_INVITED
                    and positive_user_id_movie_id[userId][movieId] == numpy.True_
                ):
                    choose_user_id_set.add(userId)
                if len(choose_user_id_set) >= dataProcess.movieDataBase.TOTAL_USERS * active_learning_ratio:
                    break
            """third add users with low mse"""
            for userId, _ in user_mse_sorted_list:
                if user_invited_times[userId] < MAX_INVITED:
                    choose_user_id_set.add(userId)
                if len(choose_user_id_set) >= dataProcess.movieDataBase.TOTAL_USERS * active_learning_ratio:
                    break
        choose_user_id_set = list(choose_user_id_set)[
            : int(dataProcess.movieDataBase.TOTAL_USERS * active_learning_ratio) - 1
        ]
        # print len(choose_user_id_set)
        for choose_user_id in choose_user_id_set:
            alternative_user_movie_list.append([choose_user_id, movieId])
            user_invited_times[choose_user_id] += 1

    """ add active learning result into train_original_data """
    print "\n==================================="
    print "step active learning regression started"
    print "==================================="
    dataProcess.movieDataBase.make_user_movie_rating_matrix(dataProcess.test_original_data)
    active_learning_train_data = []

    # this scheme is adding every thing in active learning
    count = 0
    for values in alternative_user_movie_list:
        userId = values[0]
        movieId = values[1]
        ratings = dataProcess.movieDataBase.user_movie_rating_matrix.get(userId)
        if ratings is not None:
            rating = ratings.get(movieId)
            if rating is not None:
                active_learning_train_data.append([userId, movieId, rating])
                count += 1

    train_add_active_learning_data = dataProcess.train_original_data + active_learning_train_data
    dataProcess.movieDataBase.store_data_to_file(
        train_add_active_learning_data, fileName="train_add_active_learning_data"
    )
    dataProcess.movieDataBase.generate_libfm_data(train_add_active_learning_data)
    dataProcess.movieDataBase.store_data_to_file(dataProcess.movieDataBase.libfm_data, fileName="train_step3.libfm")
    # subprocess.call("./Generate/libFM -task r -train Generate/train_step3.libfm -test Generate/test_step3.libfm "
    #                 "-method mcmc -out Generate/prediction", shell=True)
    # compute_error.computer_error()

    subprocess.call(
        "./Generate/libFM -task r -train Generate/train_step3.libfm -test Generate/test_step3.libfm "
        "-method sgd -dim '1,1, 200' -learn_rate 0.01 -iter 30 -out Generate/prediction",
        shell=True,
    )
    print "number of alternative user-movie requests=", len(alternative_user_movie_list)
    print "number of gained active learning user-movie data=", count
    compute_error.computer_error()
def baseline1_random(active_learning_ratio=0.1):
    print "\n==================================="
    print "baseline1 with random choosing active learning data started"
    print "==================================="

    """ Choose active learning data randomly """
    number_of_index = len(dataProcess.test_addAllNegative_data)
    alternative_user_movie_list = []

    for choose_movie_num in range(1, dataProcess.TEST_MOVIES + 1):
        choose_user_id_set = set()
        while len(choose_user_id_set) < dataProcess.movieDataBase.TOTAL_USERS * active_learning_ratio:
            choose_user_id_set.add(random.randint(1, dataProcess.movieDataBase.TOTAL_USERS))
        choose_user_id_set = list(choose_user_id_set)[
            : int(dataProcess.movieDataBase.TOTAL_USERS * active_learning_ratio) - 1
        ]

        for choose_user_id in choose_user_id_set:
            alternative_user_movie_list.append(
                [choose_user_id, dataProcess.test_from_movie_num_get_movie_id[choose_movie_num]]
            )

    # movieDataBase.make_alternative_user_movie_matrix(alternative_user_movie_list)

    """ add active learning result into train_original_data """
    print "\n==================================="
    print "active learning regression started"
    print "==================================="
    dataProcess.movieDataBase.make_user_movie_rating_matrix(dataProcess.test_original_data)
    active_learning_train_data = []

    # this scheme is adding every thing in active learning
    count = 0
    for values in alternative_user_movie_list:
        userId = values[0]
        movieId = values[1]
        ratings = dataProcess.movieDataBase.user_movie_rating_matrix.get(userId)
        if ratings is not None:
            rating = ratings.get(movieId)
            if rating is not None:
                active_learning_train_data.append([userId, movieId, rating])
                count += 1

    train_add_active_learning_data = dataProcess.train_original_data + active_learning_train_data
    dataProcess.movieDataBase.store_data_to_file(
        train_add_active_learning_data, fileName="train_add_active_learning_data"
    )
    dataProcess.movieDataBase.generate_libfm_data(train_add_active_learning_data)
    dataProcess.movieDataBase.store_data_to_file(dataProcess.movieDataBase.libfm_data, fileName="train_step3.libfm")
    # subprocess.call("./Generate/libFM -task r -train Generate/train_step3.libfm -test Generate/test_step3.libfm "
    #                 "-method mcmc -out Generate/prediction", shell=True)
    # compute_error.computer_error()

    subprocess.call(
        "./Generate/libFM -task r -train Generate/train_step3.libfm -test Generate/test_step3.libfm "
        "-method sgd -dim '1,1, 200' -learn_rate 0.001 -iter 30 -out Generate/prediction",
        shell=True,
    )
    print "number of alternative user-movie requests=", len(alternative_user_movie_list)
    print "number of gained active learning user-movie data=", count
    compute_error.computer_error()
Exemplo n.º 6
0
def experiment1():
    """Run the classification -> regression -> active-learning pipeline once.

    Order of operations (each stage depends on the database state left by the
    previous one):

    * Cut the training slice (``range(1, 1201)``) and write
      ``train_step2.libfm``; after ``add_negative_data()`` write
      ``train_step1.libfm``.
    * Cut the test slice (``range(1201, 1441)``) and write
      ``test_step3.libfm``; after ``add_negative_data(addAllUsers=True)``
      write the same libFM data as ``test_step1.libfm`` and
      ``test_step2.libfm``.
    * Step 1: libFM classification; the positions returned by
      ``compute_classification`` index the current ``core_rating_data`` and
      become ``[userId, movieId]`` requests.
    * Step 2: libFM regression on the step-2 files.
    * Step 3: requests with a known test rating are appended to the original
      training rows, ``train_step3.libfm`` is written, libFM regression is
      run again and ``compute_error.computer_error()`` is called.
    """
    # MCMC is the active classifier; an SGD variant (-learn_rate 0.01) was
    # tried by the original author.
    step1_cmd = (
        "./Generate/libFM -task c -train Generate/train_step1.libfm -test Generate/test_step1.libfm "
        "-method mcmc -out Generate/prediction"
    )
    # For steps 2 and 3 the original author also tried "-method mcmc".
    step2_cmd = (
        "./Generate/libFM -task r -train Generate/train_step2.libfm -test Generate/test_step2.libfm "
        "-method sgd -learn_rate 0.001 -iter 70 -out Generate/prediction"
    )
    step3_cmd = (
        "./Generate/libFM -task r -train Generate/train_step3.libfm -test Generate/test_step3.libfm "
        "-method sgd -dim '1,1, 80' -learn_rate 0.001 -iter 160 -out Generate/prediction"
    )

    db = MovieDataBase()
    db.generate_complete_rating_data(regenerate=False)

    # Keep a copy of the full sorted data so the test slice can be cut from it
    # after the database has been narrowed to the training slice.
    db.make_sorted_rating_data([5, 1, 0])
    full_sorted_ratings = db.out_rating_data[:]
    db.synchronize()

    # ---- training slice --------------------------------------------------
    db.make_slice(count_movie_slice=range(1, 1201))
    db.synchronize()

    train_rows = db.core_rating_data[:]
    db.store_data_to_file(fileName='train_original_data')
    db.generate_libfm_data(omitMovie=True)
    db.store_data_to_file(db.libfm_data, 'train_step2.libfm')

    db.add_negative_data()
    db.synchronize()
    # Snapshot only; it is not read again in this function.
    train_rows_with_negatives = db.core_rating_data[:]
    db.store_data_to_file(fileName='train_addNegative_data')
    db.generate_libfm_data(omitMovie=True)
    db.store_data_to_file(db.libfm_data, 'train_step1.libfm')

    # ---- test slice ------------------------------------------------------
    db.load_core_rating_data(full_sorted_ratings)
    db.make_slice(count_movie_slice=range(1201, 1441))
    db.synchronize()

    test_rows = db.core_rating_data[:]
    db.store_data_to_file(fileName='test_original_data')
    db.generate_libfm_data(shuffle=False)
    db.store_data_to_file(db.libfm_data, fileName='test_step3.libfm')

    db.add_negative_data(addAllUsers=True)
    db.synchronize()
    # Snapshot only; the loop below reads db.core_rating_data directly.
    test_rows_all_negatives = db.core_rating_data[:]
    db.store_data_to_file(fileName='test_addAllNegative_data')
    db.generate_libfm_data(omitMovie=True, shuffle=False)
    for libfm_name in ('test_step1.libfm', 'test_step2.libfm'):
        db.store_data_to_file(db.libfm_data, libfm_name)

    # ---- step 1: classification (test data is still the core data) --------
    subprocess.call(step1_cmd, shell=True)
    picked_positions = compute_classification.compute_classification(len(test_rows))
    core_rows = db.core_rating_data
    alternative_user_movie_list = [
        [core_rows[position][0], core_rows[position][1]] for position in picked_positions
    ]
    db.make_alternative_user_movie_matrix(alternative_user_movie_list)

    # ---- step 2: regression ------------------------------------------------
    subprocess.call(step2_cmd, shell=True)

    # ---- step 3: add active learning results to the original training rows -
    db.make_user_movie_rating_matrix(test_rows)
    gained_rows = []
    for user_id, movie_id in alternative_user_movie_list:
        per_user = db.user_movie_rating_matrix.get(user_id)
        if per_user is None:
            continue
        rating = per_user.get(movie_id)
        if rating is None:
            continue
        # Every request with a known rating is added.
        gained_rows.append([user_id, movie_id, rating])

    augmented_train_rows = train_rows + gained_rows
    db.store_data_to_file(augmented_train_rows, fileName='train_add_active_learning_data')
    db.generate_libfm_data(augmented_train_rows)
    db.store_data_to_file(db.libfm_data, fileName='train_step3.libfm')

    subprocess.call(step3_cmd, shell=True)
    compute_error.computer_error()
def baseline2_random_after_classification(active_learning_ratio=0.1):
    print '\n==================================='
    print 'baseline2 random choosing active learning data after classification started'
    print '==================================='

    """the test data is still in movieDataBase.core. Next is the step 1 """
    print '\n==================================='
    print 'step 1 binary classification started'
    print '==================================='

    subprocess.call("./Generate/libFM -task c -train Generate/train_step1.libfm -test Generate/test_step1.libfm "
                    "-method mcmc -iter 1 -out Generate/prediction", shell=True)
    # subprocess.call("./Generate/libFM -task c -train Generate/train_step1.libfm -test Generate/test_step1.libfm "
    #                 "-method sgd -learn_rate 0.01 -out Generate/prediction", shell=True)
    selected_data_positions_tmp = compute_classification.compute_classification(len(dataProcess.test_original_data))

    number_of_active_data = (int(dataProcess.movieDataBase.TOTAL_USERS*active_learning_ratio)-1)*240
    number_of_index = len(dataProcess.test_addAllNegative_data)
    selected_data_positions = set()
    while len(selected_data_positions) < number_of_active_data:
        """ next line is wrong but has good experiment behavior i want to know why."""
        # selected_data_positions.add(random.randint(0, len(selected_data_positions_tmp)-1))
        selected_data_positions.add(random.choice(selected_data_positions_tmp))

    alternative_user_movie_list = []
    for index in selected_data_positions:
        userId = dataProcess.test_addAllNegative_data[index][0]
        movieId = dataProcess.test_addAllNegative_data[index][1]
        alternative_user_movie_list.append([userId, movieId])
    # movieDataBase.make_alternative_user_movie_matrix(alternative_user_movie_list)

    """ add active learning result into train_original_data """
    print '\n==================================='
    print 'step active learning regression started'
    print '==================================='
    dataProcess.movieDataBase.make_user_movie_rating_matrix(dataProcess.test_original_data)
    active_learning_train_data = []

    # this scheme is adding every thing in active learning
    count = 0
    for values in alternative_user_movie_list:
        userId = values[0]
        movieId = values[1]
        ratings = dataProcess.movieDataBase.user_movie_rating_matrix.get(userId)
        if ratings is not None:
            rating = ratings.get(movieId)
            if rating is not None:
                active_learning_train_data.append([userId, movieId, rating])
                count += 1

    train_add_active_learning_data = dataProcess.train_original_data + active_learning_train_data
    dataProcess.movieDataBase.store_data_to_file(train_add_active_learning_data, fileName='train_add_active_learning_data')
    dataProcess.movieDataBase.generate_libfm_data(train_add_active_learning_data)
    dataProcess.movieDataBase.store_data_to_file(dataProcess.movieDataBase.libfm_data, fileName='train_step3.libfm')
    # subprocess.call("./Generate/libFM -task r -train Generate/train_step3.libfm -test Generate/test_step3.libfm "
    #                 "-method mcmc -out Generate/prediction", shell=True)
    # compute_error.computer_error()

    subprocess.call("./Generate/libFM -task r -train Generate/train_step3.libfm -test Generate/test_step3.libfm "
                    "-method sgd -dim '1,1, 200' -learn_rate 0.001 -iter 120 -out Generate/prediction", shell=True)
    print 'number of alternative user-movie requests=', len(alternative_user_movie_list)
    print 'number of gained active learning user-movie data=', count
    compute_error.computer_error()
def experiment2_user_qualification(active_learning_ratio=0.1):
    print '\n==================================='
    print 'experiment2 choosing qualified active learning data after classification started'
    print '==================================='

    """the test data is still in movieDataBase.core. Next is the step 1 """
    print '\n==================================='
    print 'step 1 binary classification started'
    print '==================================='

    subprocess.call("./Generate/libFM -task c -train Generate/train_step1.libfm -test Generate/test_step1.libfm "
                    "-method mcmc -iter 1 -out Generate/prediction", shell=True)
    # subprocess.call("./Generate/libFM -task c -train Generate/train_step1.libfm -test Generate/test_step1.libfm "
    #                 "-method sgd -learn_rate 0.01 -out Generate/prediction", shell=True)
    selected_data_positions = compute_classification.compute_classification(len(dataProcess.test_original_data))

    print '\n==================================='
    print 'choosing users with low rating MSE'
    print '==================================='

    number_of_active_data = (int(dataProcess.movieDataBase.TOTAL_USERS*active_learning_ratio)-1)*240
    number_of_index = len(dataProcess.test_addAllNegative_data)
    alternative_user_movie_list = []
    for index in selected_data_positions:
        userId = dataProcess.test_addAllNegative_data[index][0]
        movieId = dataProcess.test_addAllNegative_data[index][1]
        alternative_user_movie_list.append([userId, movieId])

    alternative_user_movie_list = sorted(alternative_user_movie_list, key=lambda x: dataProcess.movieDataBase.user_MSE[x[0]])
    alternative_user_movie_list = alternative_user_movie_list[:number_of_active_data]

    # for x, y in alternative_user_movie_list:
    #     print x, y

    # movieDataBase.make_alternative_user_movie_matrix(alternative_user_movie_list)

    """ add active learning result into train_original_data """
    print '\n==================================='
    print 'step active learning regression started'
    print '==================================='
    dataProcess.movieDataBase.make_user_movie_rating_matrix(dataProcess.test_original_data)
    active_learning_train_data = []

    # this scheme is adding every thing in active learning
    count = 0
    for values in alternative_user_movie_list:
        userId = values[0]
        movieId = values[1]
        ratings = dataProcess.movieDataBase.user_movie_rating_matrix.get(userId)
        if ratings is not None:
            rating = ratings.get(movieId)
            if rating is not None:
                active_learning_train_data.append([userId, movieId, rating])
                count += 1

    train_add_active_learning_data = dataProcess.train_original_data + active_learning_train_data
    dataProcess.movieDataBase.store_data_to_file(train_add_active_learning_data, fileName='train_add_active_learning_data')
    dataProcess.movieDataBase.generate_libfm_data(train_add_active_learning_data)
    dataProcess.movieDataBase.store_data_to_file(dataProcess.movieDataBase.libfm_data, fileName='train_step3.libfm')
    # subprocess.call("./Generate/libFM -task r -train Generate/train_step3.libfm -test Generate/test_step3.libfm "
    #                 "-method mcmc -out Generate/prediction", shell=True)
    # compute_error.computer_error()

    subprocess.call("./Generate/libFM -task r -train Generate/train_step3.libfm -test Generate/test_step3.libfm "
                    "-method sgd -dim '1,1, 200' -learn_rate 0.001 -iter 50 -out Generate/prediction", shell=True)
    print 'number of alternative user-movie requests=', len(alternative_user_movie_list)
    print 'number of gained active learning user-movie data=', count
    compute_error.computer_error()
def baseline1_random(active_learning_ratio=0.1):
    """Run the active-learning baseline that picks its query pairs at random.

    The function prepares the libFM train/test files, randomly selects
    (user, movie) pairs from the negative-augmented test data, adds every
    selected pair that has a rating in the original test data to the training
    data, runs a libFM regression on the enlarged training set and finally
    calls ``compute_error.computer_error()``.

    Args:
        active_learning_ratio: Fraction of the rows of the negative-augmented
            test data that should be requested. The number of requests is the
            smallest integer that is not below ``rows * active_learning_ratio``,
            unless the per-user cap ``MAX_INVITATION`` makes that many
            requests impossible, in which case fewer are issued.

    Returns:
        None. Results are written through ``store_data_to_file`` and by the
        libFM subprocess (its ``-out`` target is ``Generate/prediction``).

    Note:
        Single-argument ``print(...)`` calls are used on purpose: they produce
        the same output under the Python 2 print statement and the Python 3
        print function.
    """
    movieDataBase = MovieDataBase()
    movieDataBase.generate_complete_rating_data(regenerate=False)

    # Keep a copy of the sorted ratings so the test slice can later be cut
    # from the same data the training slice was cut from.
    movieDataBase.make_sorted_rating_data([5, 1, 0])
    sorted_rating_data = movieDataBase.out_rating_data[:]
    movieDataBase.synchronize()

    # ---- training data: movie slice 1..1200 ----
    movieDataBase.make_slice(count_movie_slice=range(1, 1201))
    movieDataBase.synchronize()

    train_original_data = movieDataBase.core_rating_data[:]
    movieDataBase.store_data_to_file(fileName='train_original_data')
    movieDataBase.generate_libfm_data(omitMovie=True)
    movieDataBase.store_data_to_file(movieDataBase.libfm_data, 'train_step2.libfm')

    movieDataBase.add_negative_data()
    movieDataBase.synchronize()
    movieDataBase.store_data_to_file(fileName='train_addNegative_data')
    movieDataBase.generate_libfm_data(omitMovie=True)
    movieDataBase.store_data_to_file(movieDataBase.libfm_data, 'train_step1.libfm')

    # ---- test data: movie slice 1201..1440 ----
    movieDataBase.load_core_rating_data(sorted_rating_data)
    movieDataBase.make_slice(count_movie_slice=range(1201, 1441))
    movieDataBase.synchronize()

    test_original_data = movieDataBase.core_rating_data[:]
    movieDataBase.store_data_to_file(fileName='test_original_data')
    movieDataBase.generate_libfm_data(shuffle=False)
    movieDataBase.store_data_to_file(movieDataBase.libfm_data, fileName='test_step3.libfm')

    movieDataBase.add_negative_data(addAllUsers=True)
    movieDataBase.synchronize()
    test_addAllNegative_data = movieDataBase.core_rating_data[:]
    movieDataBase.store_data_to_file(fileName='test_addAllNegative_data')
    movieDataBase.generate_libfm_data(omitMovie=True, shuffle=False)
    # The same libFM data is written under both the step-1 and step-2 names.
    movieDataBase.store_data_to_file(movieDataBase.libfm_data, 'test_step1.libfm')
    movieDataBase.store_data_to_file(movieDataBase.libfm_data, 'test_step2.libfm')

    print('\n===================================')
    print('baseline1 with random choosing active learning data started')
    print('===================================')

    # ---- choose active learning data randomly ----
    number_of_index = len(test_addAllNegative_data)
    requested_count = number_of_index * active_learning_ratio
    print(requested_count)

    core_rows = movieDataBase.core_rating_data
    # Requests issued so far, indexed by the user id found in column 0 of a
    # row (the extra slot suggests 1-based ids -- TODO confirm).
    invited_user_movies = [0] * (movieDataBase.TOTAL_USERS + 1)
    selected_data_positions = set()

    # Visit every row position exactly once in random order. Compared with
    # repeatedly drawing random indices this (a) never charges a user twice
    # for the same position and (b) always terminates, even when the
    # MAX_INVITATION cap makes the requested count unreachable.
    candidate_positions = list(range(number_of_index))
    random.shuffle(candidate_positions)
    for position in candidate_positions:
        if len(selected_data_positions) >= requested_count:
            break
        user_id = core_rows[position][0]
        if invited_user_movies[user_id] < MAX_INVITATION:
            selected_data_positions.add(position)
            invited_user_movies[user_id] += 1

    alternative_user_movie_list = [
        [core_rows[position][0], core_rows[position][1]]
        for position in selected_data_positions
    ]
    movieDataBase.make_alternative_user_movie_matrix(alternative_user_movie_list)

    # ---- add active learning result into train_original_data ----
    print('\n===================================')
    print('active learning regression started')
    print('===================================')
    movieDataBase.make_user_movie_rating_matrix(test_original_data)
    rating_matrix = movieDataBase.user_movie_rating_matrix

    # Every requested pair that has a rating in the original test data is
    # added to the training data; pairs without a rating are dropped.
    active_learning_train_data = []
    for userId, movieId in alternative_user_movie_list:
        ratings = rating_matrix.get(userId)
        if ratings is None:
            continue
        rating = ratings.get(movieId)
        if rating is not None:
            active_learning_train_data.append([userId, movieId, rating])
    count = len(active_learning_train_data)

    train_add_active_learning_data = train_original_data + active_learning_train_data
    movieDataBase.store_data_to_file(train_add_active_learning_data, fileName='train_add_active_learning_data')
    movieDataBase.generate_libfm_data(train_add_active_learning_data)
    movieDataBase.store_data_to_file(movieDataBase.libfm_data, fileName='train_step3.libfm')

    subprocess.call("./Generate/libFM -task r -train Generate/train_step3.libfm -test Generate/test_step3.libfm "
                    "-method sgd -dim '1,1, 200' -learn_rate 0.001 -iter 10 -out Generate/prediction", shell=True)
    print('number of alternative user-movie requests= %d' % len(alternative_user_movie_list))
    print('number of gained active learning user-movie data= %d' % count)
    compute_error.computer_error()
Exemplo n.º 10
0
def experiment3_user_similarity(active_learning_ratio=0.1):
    """Run the classification and regression stages and export them for Matlab.

    Step 1 runs libFM as a binary classifier and turns the positions returned
    by ``compute_classification.compute_classification`` into a boolean
    user x test-movie matrix. Step 2 runs libFM as a regressor, reads the
    predictions from ``Generate/prediction`` and builds a rating matrix from
    them. The training rating matrix, the predicted rating matrix and the
    boolean matrix are then saved to ``step2.mat`` under the keys
    ``train_rating``, ``predict_rating`` and ``positive_rating``.

    The function relies on data already held by ``dataProcess`` and on the
    ``Generate/*_step1.libfm`` / ``Generate/*_step2.libfm`` files existing.

    Args:
        active_learning_ratio: Currently unused; kept so existing callers
            keep working.

    Returns:
        None.

    Raises:
        IndexError: If libFM produced fewer predictions than there are rows
            in ``dataProcess.test_addAllNegative_data``.
        ValueError: If a line of the prediction file is not a number.

    Note:
        Single-argument ``print(...)`` calls are used on purpose: they produce
        the same output under the Python 2 print statement and the Python 3
        print function.
    """
    print('\n===================================')
    print('experiment3 choosing special active learning users after classification started')
    print('===================================')

    # ---- step 1: binary classification ----
    print('\n===================================')
    print('step 1 binary classification started')
    print('===================================')

    subprocess.call("./Generate/libFM -task c -train Generate/train_step1.libfm -test Generate/test_step1.libfm "
                    "-method mcmc -iter 1 -out Generate/prediction", shell=True)
    selected_data_positions = compute_classification.compute_classification(len(dataProcess.test_original_data))

    test_rows = dataProcess.test_addAllNegative_data
    movie_num_by_id = dataProcess.test_from_movie_id_get_movie_num
    positive_user_id_movie_id = numpy.zeros(
        [dataProcess.movieDataBase.TOTAL_USERS, dataProcess.TEST_MOVIES], dtype=numpy.bool_)
    for index in selected_data_positions:
        userId = test_rows[index][0]
        movieId = test_rows[index][1]
        # User ids and movie numbers are shifted by one to get array indices.
        positive_user_id_movie_id[userId-1][movie_num_by_id[movieId]-1] = numpy.True_

    # ---- step 2: regression ----
    print('\n===================================')
    print('step2: regression started')
    print('===================================')

    subprocess.call("./Generate/libFM -task r -train Generate/train_step2.libfm -test Generate/test_step2.libfm "
                    "-method mcmc -iter 1 -out Generate/prediction", shell=True)
    # The context manager closes the file even when a line fails to parse.
    with open('Generate/prediction') as prediction_file:
        step2_pred_rating = [float(line.strip()) for line in prediction_file]

    # Pair every test row with the prediction on the same line number.
    # Indexing (rather than zip) keeps the IndexError when predictions are
    # missing instead of silently truncating the data.
    step2_pred_data = [
        [row[0], row[1], step2_pred_rating[i]]
        for i, row in enumerate(test_rows)
    ]

    dataProcess.movieDataBase.compute_numpy_matrix(step2_pred_data)
    pred_rating_numpy_matrix = dataProcess.movieDataBase.numpy_rating_matrix

    scipy.io.savemat('step2.mat', {
        'train_rating': dataProcess.train_rating_numpy_matrix,
        'predict_rating': pred_rating_numpy_matrix,
        'positive_rating': positive_user_id_movie_id,
    })
    # The next stage of the experiment is carried out in Matlab on step2.mat.
    # NOTE(review): a draft of the follow-up stage (choosing users by
    # similarity, then a step-3 regression on the enlarged training set) used
    # to sit below an unconditional return here. It could never run and it
    # referenced names that are not defined in this function, so it was
    # removed; it needs to be written against the Matlab output.