def test_dataframe_separation(self):
        """
        Test to check if the class SVDmodel is separating
        the dataframe in train, test and valid dataframes
        with the right proportions (train=0.8, test=0.1, valid=0.1).
        """
        path = parent_path + '/ml-1m/ratings.dat'
        df = dfFunctions.load_dataframe(path)
        model = re.SVDmodel(df, 'User', 'item', 'Rating')
        # The three splits together must account for every row of df.
        sum_of_sizes = len(model.train) + len(model.test) + len(model.valid)
        proportion_train = len(model.train) / len(df)
        proportion_test = len(model.test) / len(df)
        proportion_valid = len(model.valid) / len(df)
        right_proportions = np.array([0.8, 0.1, 0.1])
        proportions = np.array([proportion_train,
                                proportion_test,
                                proportion_valid])

        # Small deviations from the exact 80/10/10 split are tolerated.
        error = rmse(proportions, right_proportions)
        self.assertTrue(len(df) == sum_of_sizes)
        # BUG FIX: the failure message used '/n' instead of '\n'.
        self.assertTrue(error < 0.1,
                        '''\n The right proportions are (train,test,valid) =
                        {0}, but the model is separating the dataframe with the
                        proportions {1}'''
                        .format(right_proportions, proportions))
    def test_upperbound(self):
        """
        We run 5000 steps of training and check if the root mean square error
        from the valid dataset is less than 1.0 in the NSVD model.
        """
        path = parent_path + '/ml-1m/ratings.dat'
        df = dfFunctions.load_dataframe(path)
        model = re.SVDmodel(df, 'User', 'item', 'Rating', 'nsvd', 'mean')

        # Hyperparameters for a short smoke-training run.
        dimension = 12
        regularizer_constant = 0.05
        learning_rate = 0.001
        momentum_factor = 0.9
        batch_size = 1000
        num_steps = 5000

        print('\n')
        model.training(dimension,
                       regularizer_constant,
                       learning_rate,
                       momentum_factor,
                       batch_size,
                       num_steps)

        prediction = model.valid_prediction()
        # BUG FIX: the failure message used '/n' instead of '\n'.
        self.assertTrue(prediction <= 1.0,
                        '''\n with num_steps = {0} \n, the mean square
                        error of the valid dataset should be less
                        than 1 and not {1}'''
                        .format(num_steps, prediction))
 def test_batch(self):
     """
     Test to check if the batchgenerator class is creating
     different batches of the same size.
     """
     ratings_path = parent_path + '/ml-1m/ratings.dat'
     ratings_df = dfFunctions.load_dataframe(ratings_path)
     batch_size = 100
     generator = dfFunctions.BatchGenerator(ratings_df,
                                            batch_size,
                                            'User',
                                            'item',
                                            'Rating')
     num_of_tests = 200
     repeats = 0
     previous = None
     for _ in range(num_of_tests):
         batch = generator.get_batch()
         # Every component of the batch must have exactly batch_size rows.
         self.assertTrue(len(batch[0]) == batch_size)
         self.assertTrue(len(batch[1]) == batch_size)
         self.assertTrue(len(batch[2]) == batch_size)
         # Track how often the first observation repeats between draws;
         # identical batches every time would mean the generator is stuck.
         observation = (batch[0][0], batch[1][0], batch[2][0])
         if observation == previous:
             repeats += 1
         previous = observation
     self.assertTrue(repeats < num_of_tests)
 def test_load_dataframe(self):
     """
     Test to check if the function load_dataframe is working
     with the movielens 1m dataset (the 10m and 20m datasets are
     skipped because loading them makes the test too slow).
     """
     path1 = parent_path + '/ml-1m/ratings.dat'
     df1 = dfFunctions.load_dataframe(path1)
     # isinstance is the idiomatic type check; pd.DataFrame is the
     # public alias of pd.core.frame.DataFrame.
     self.assertTrue(isinstance(df1, pd.DataFrame))
 def test_rated_items(self):
     """
     Test to check if the method _set_item_dic creates
     a dic with User:rated_items such that
     len(rated_items) == max_size.
     """
     path = parent_path + '/ml-1m/ratings.dat'
     df = dfFunctions.load_dataframe(path)
     finder = dfFunctions.ItemFinder(df, 'User', 'item', 'Rating', 'mean')
     all_users = df['User'].unique()
     count = 0
     problem_users = []
     for user in all_users:
         r_items = finder.dic[user]
         # Each user's array must be padded to finder.size with int32 items.
         if len(r_items) == finder.size and r_items.dtype == 'int32':
             count += 1
         else:
             problem_users.append(user)
     # BUG FIX: the message previously reported `count` (the number of
     # VALID arrays) where it claims the number of problematic ones;
     # it also used '/n' instead of '\n'.
     self.assertTrue(count == len(all_users),
                     '''\n There are {0} arrays in dic such that
         len(dic[User]) != finder.size or with wrong types. And these
         users are {1}'''.format(len(problem_users), problem_users))
 def test_dataframe_intersection(self):
     """
     Test to check if the train, test and valid dataframes
     have no intersection between them.
     """
     path = parent_path + '/ml-1m/ratings.dat'
     df = dfFunctions.load_dataframe(path)
     model = re.SVDmodel(df, 'User', 'item', 'Rating')
     dic_intersection = dfFunctions.count_intersection(model.train,
                                                       model.test,
                                                       model.valid)
     # Keys follow count_intersection's convention:
     # 1 = train, 2 = test, 3 = valid.
     pairs = {'1-2': ('train', 'test'),
              '1-3': ('train', 'valid'),
              '2-3': ('test', 'valid')}
     # BUG FIX: the failure messages used '/n' instead of '\n'.
     for key, (first, second) in pairs.items():
         self.assertTrue(dic_intersection[key] == 0,
                         '\n The intersection between the {0} and {1} '
                         'dataframe is {2}'.format(first,
                                                   second,
                                                   dic_intersection[key]))
    def test_prediction(self):
        """
        We run 5000 steps of training and check if the difference
        between the prediction mean and the actual mean is less than
        0.9 in the NSVD model.
        """
        path = parent_path + '/ml-1m/ratings.dat'
        df = dfFunctions.load_dataframe(path)
        model = re.SVDmodel(df, 'User', 'item', 'Rating', 'nsvd', 'mean')

        # Hyperparameters for a short smoke-training run.
        dimension = 12
        regularizer_constant = 0.05
        learning_rate = 0.001
        momentum_factor = 0.9
        batch_size = 1000
        num_steps = 5000

        # BUG FIX: was print('/n'), which printed a literal slash-n.
        print('\n')
        model.training(dimension,
                       regularizer_constant,
                       learning_rate,
                       momentum_factor,
                       batch_size,
                       num_steps)

        # Compare the mean predicted rating against the mean observed
        # rating over the first ten validation examples.
        user_example = np.array(model.valid['User'])[0:10]
        movies_example = np.array(model.valid['item'])[0:10]
        actual_ratings = np.mean(np.array(model.valid['Rating'])[0:10])
        predicted_ratings = np.mean(model.prediction(user_example,
                                                     movies_example))
        difference = np.absolute(actual_ratings - predicted_ratings)

        # BUG FIX: the failure message used '/n' instead of '\n'.
        self.assertTrue(difference <= 0.9,
                        '''\n with num_steps = {0} \n, the difference should be
                        less than 0.9 and not {1}'''
                        .format(num_steps, difference))
# Example #8 (scraper artifact: "示例#8" header and vote count removed)
def initialize():
    """
    Parse command-line options, train the selected recommender model on a
    movielens ratings file, print the validation error and return the
    trained model.

    Returns
    -------
    model : the trained SVDmodel instance.
    """
    filename = os.path.join(
        root_dir,
        'machine_learning/prediction/new_algorithm/ml-1m/ratings.dat')
    default_path = os.path.abspath(os.path.realpath(filename))

    parser = argparse.ArgumentParser()

    parser.add_argument('-p',
                        '--path',
                        type=str,
                        default=default_path,
                        help=('ratings path\n'
                              '(default=pwd/movielens/ml-1m/ratings.dat)'))

    parser.add_argument('-e',
                        '--example',
                        type=str,
                        default='1',
                        help=('movielens dataset\n'
                              'examples (only 1, 10 or 20) (default=1)'))

    parser.add_argument('-b',
                        '--batch',
                        type=int,
                        default=700,
                        help='batch size (default=700)')

    # BUG FIX: the help text previously claimed default=7000 although the
    # actual default is 2000.
    parser.add_argument('-s',
                        '--steps',
                        type=int,
                        default=2000,
                        help='number of training steps (default=2000)')

    parser.add_argument('-d',
                        '--dimension',
                        type=int,
                        default=12,
                        help='embedding vector size (default=12)')

    parser.add_argument('-r',
                        '--reg',
                        type=float,
                        default=0.0003,
                        help=('regularizer constant for\n'
                              'the loss function  (default=0.0003)'))

    parser.add_argument('-l',
                        '--learning',
                        type=float,
                        default=0.001,
                        help='learning rate (default=0.001)')

    parser.add_argument('-m',
                        '--momentum',
                        type=float,
                        default=0.926,
                        help='momentum factor (default=0.926)')

    parser.add_argument('-i',
                        '--info',
                        type=str,
                        default='True',
                        help=('Training information.\n'
                              'Only True or False (default=True)'))

    parser.add_argument('-M',
                        '--model',
                        type=str,
                        default='svd',
                        help='models: either svd or nsvd (default=svd)')

    parser.add_argument('-S',
                        '--nsvd_size',
                        type=str,
                        default='mean',
                        help=('size of the vectors of the nsvd model:\n'
                              'either max, mean or min (default=mean)'))

    args = parser.parse_args()

    # Resolve which ratings file to load.  An explicit -p/--path wins;
    # otherwise -e/--example selects one of the bundled datasets.
    # BUG FIX: the path computed from --example was previously ignored,
    # because the dataframe was always loaded from args.path.
    path = args.path
    example_path = None
    if args.example == '20':
        example_path = parent_path + '/movielens/ml-20m/ratings.csv'
    elif args.example == '10':
        example_path = parent_path + '/movielens/ml-10m/ratings.dat'
    elif args.example != '1':
        print('Wrong parameter passed to the example option. '
              'Running default=1\n')
    if example_path is not None and args.path == default_path:
        path = example_path

    df = dfFunctions.load_dataframe(path)
    if args.model == 'svd':
        model = re.SVDmodel(df, 'User', 'item', 'Rating')
    else:
        model = re.SVDmodel(df,
                            'User',
                            'item',
                            'Rating',
                            args.model,
                            args.nsvd_size)

    dimension = args.dimension
    regularizer_constant = args.reg
    learning_rate = args.learning
    batch_size = args.batch
    num_steps = args.steps
    momentum_factor = args.momentum
    # argparse delivered a string, so compare against the literal 'True'.
    info = args.info == 'True'

    model.training(dimension,
                   regularizer_constant,
                   learning_rate,
                   momentum_factor,
                   batch_size,
                   num_steps,
                   info)

    prediction = model.valid_prediction()
    print('\nThe mean square error of the whole valid dataset is ', prediction)
    return model