Example #1
class DataFit:
    def __init__(self):
        self.dataset = None

    def fit(self):
        book_list = DataPrep.get_book_list()
        book_feature_list = DataPrep.get_feature_list()
        user_list = DataPrep.get_user_list()
        self.dataset = Dataset()
        self.dataset.fit(users=user_list,
                         items=book_list,
                         item_features=book_feature_list)

        rating_list = DataPrep.get_rating_list()
        interactions, weights = self.dataset.build_interactions(rating_list)

        book_features = DataPrep.create_features()
        books_features = self.dataset.build_item_features(book_features)
        return interactions, weights, books_features

    def create_new_interactions(self, checkpoint):
        rating_list = DataPrep.get_rating_list_from_checkpoint(checkpoint)
        interactions, weights = self.dataset.build_interactions(rating_list)
        return interactions, weights

    def get_user_mapping(self):
        user_id_map, user_feature_map, item_id_map, item_feature_map = self.dataset.mapping(
        )
        return user_id_map

    def get_book_mapping(self):
        user_id_map, user_feature_map, item_id_map, item_feature_map = self.dataset.mapping(
        )
        return item_id_map

    @staticmethod
    def fit_evaluate(test_percentage=0.1):
        book_list = DataPrep.get_book_list()
        book_feature_list = DataPrep.get_feature_list()
        user_list = DataPrep.get_user_list()
        dataset = Dataset()
        dataset.fit(users=user_list,
                    items=book_list,
                    item_features=book_feature_list)

        rating_list = DataPrep.get_rating_list()
        random.shuffle(rating_list)
        rating_list_test = rating_list[:int(test_percentage *
                                            len(rating_list))]
        rating_list_train = rating_list[int(test_percentage *
                                            len(rating_list)):]
        interactions_train, weights_train = dataset.build_interactions(
            rating_list_train)
        interactions_test, weights_test = dataset.build_interactions(
            rating_list_test)

        return interactions_train, weights_train, interactions_test, weights_test
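
For reference, DataFit above is a thin wrapper around LightFM's standard Dataset workflow. A minimal standalone sketch of the same fit / build_interactions / build_item_features / mapping sequence, with toy IDs standing in for the project's DataPrep helpers (all names below are illustrative assumptions), could look like this:

from lightfm.data import Dataset

# Toy stand-ins for DataPrep.get_user_list(), get_book_list(),
# get_feature_list() and get_rating_list()
user_list = ["u1", "u2"]
book_list = ["b1", "b2", "b3"]
feature_list = ["fiction", "non-fiction"]
rating_list = [("u1", "b1", 5), ("u2", "b3", 3)]  # (user, item, weight) triples

dataset = Dataset()
dataset.fit(users=user_list, items=book_list, item_features=feature_list)

# Same calls DataFit.fit() makes
interactions, weights = dataset.build_interactions(rating_list)
item_features = dataset.build_item_features([("b1", ["fiction"]),
                                             ("b3", ["non-fiction"])])

# mapping() returns dicts from external ids to internal matrix indices
user_id_map, _, item_id_map, _ = dataset.mapping()
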
Example #2

    def interactions(self):
        # If interactions have not been supplied, process the file provided in source
        # N.B. This property also sets weights, which is probably not a best practice
        if self._interactions is None:

            if self._category == 'ratings_matrix':
                rm_df = pd.read_csv(self.path)
                ids = rm_df['sub']
                rm_df = rm_df.set_index(keys='sub')
                if 'Unnamed: 0' in rm_df.columns:
                    rm_df.drop('Unnamed: 0', axis=1, inplace=True)
                dataset = Dataset()
                dataset.fit(list(ids), list(rm_df.columns))
                self.mapping = dataset.mapping()

                interactions = []

                for item in rm_df.columns.tolist():
                    users = rm_df.index[rm_df[item] >= 1].tolist()
                    counts = rm_df[item][rm_df[item] >= 1]
                    interactions.extend(
                        zip(users, itertools.repeat(item, len(users)), counts))

                (self._interactions,
                 self._weights) = dataset.build_interactions(interactions)

            else:
                int_df = pd.read_csv(self.path)
                if 'Unnamed: 0' in int_df.columns:
                    int_df.drop('Unnamed: 0', axis=1, inplace=True)
                int_df = int_df.groupby(['subscriber_id', 'ddi_block_id']).size().reset_index()\
                    .rename(columns={0:'count'})
                dataset = Dataset()
                ids = int_df['subscriber_id'].unique()
                items = int_df['ddi_block_id'].unique()
                dataset.fit(list(ids), list(items))
                self.mapping = dataset.mapping()

                if self._use_weights:
                    interactions = zip(int_df['subscriber_id'],
                                       int_df['ddi_block_id'], int_df['count'])
                else:
                    interactions = zip(int_df['subscriber_id'],
                                       int_df['ddi_block_id'])
                (self._interactions,
                 self._weights) = dataset.build_interactions(interactions)

        return self._interactions
Example #3
def train_model(df,
                user_id_col='user_id',
                item_id_col='business_id',
                item_name_col='name_business',
                evaluate=True):
    """Train the model using collaborative filtering.

    Args:
        df: the input dataframe.
        user_id_col: user id column.
        item_id_col: item id column.
        item_name_col: item name column.
        evaluate: whether to evaluate model performance.

    Returns:
        model_full: the trained model.
        df_interactions: dataframe with user-item interactions.
        user_dict: user dictionary containing user_id as
            key and interaction_index as value.
        item_dict: item dictionary containing item_id
            as key and item_name as value.

    """
    if evaluate:
        print('Evaluating model...')
        evaluate_model(df, user_id_col='user_id', item_id_col='business_id')

    print('Training model...')
    # build recommendations for known users and known businesses
    # with collaborative filtering method
    ds_full = Dataset()
    # we call fit to supply userid, item id and user/item features
    ds_full.fit(
        df[user_id_col].unique(),  # all the users
        df[item_id_col].unique(),  # all the items
    )
    (interactions, weights) = ds_full.build_interactions([(x[0], x[1], x[2])
                                                          for x in df.values])
    # model
    model_full = LightFM(no_components=100,
                         learning_rate=0.05,
                         loss='warp',
                         max_sampled=50)
    model_full.fit(interactions,
                   sample_weight=weights,
                   epochs=10,
                   num_threads=10)
    # mapping
    user_id_map, _, business_id_map, _ = ds_full.mapping()

    # data preparation
    df_interactions = pd.DataFrame(weights.todense())
    df_interactions.index = list(user_id_map.keys())
    df_interactions.columns = list(business_id_map.keys())
    user_dict = user_id_map
    item_dict = df.set_index(item_id_col)[item_name_col].to_dict()
    return model_full, df_interactions, user_dict, item_dict
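
A hedged usage sketch for the function above: the first three columns of df are consumed positionally as (user_id, item_id, weight) by build_interactions, and item_name_col fills item_dict. The toy frame, column values and evaluate=False (which skips the evaluate_model helper not shown here) are illustrative assumptions.

import pandas as pd

df = pd.DataFrame({
    "user_id": ["u1", "u1", "u2"],
    "business_id": ["b1", "b2", "b2"],
    "stars": [5, 3, 4],                 # third column is used as interaction weight
    "name_business": ["Cafe A", "Diner B", "Diner B"],
})

model_full, df_interactions, user_dict, item_dict = train_model(df, evaluate=False)
print(item_dict)  # {'b1': 'Cafe A', 'b2': 'Diner B'}
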
Example #4
    def __init__(self, dataset: Dataset) -> None:
        """
        userid: user_id
        row: internal user id
        itemid: recipe_id
        column: internal recipe id
        """
        userid2row, _, itemid2col, _ = dataset.mapping()
        self.userid2row = userid2row
        self.itemid2col = itemid2col
        # Invert dictionaries to get mapping in other direction
        self.row2userid = {
            value: key
            for key, value in self.userid2row.items()
        }
        self.col2itemid = {v: k for k, v in self.itemid2col.items()}
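
The class this __init__ belongs to is not shown above; assuming it is a small ID-translation helper (IdMapper below is a placeholder name), typical use might be:

from lightfm.data import Dataset

dataset = Dataset()
dataset.fit(users=[101, 102], items=["recipe_a", "recipe_b"])

mapper = IdMapper(dataset)            # hypothetical name for the class defined above
row = mapper.userid2row[101]          # external user_id -> internal row index
col = mapper.itemid2col["recipe_a"]   # external recipe_id -> internal column index
assert mapper.row2userid[row] == 101  # the inverted dict recovers the original id
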
Example #5
movies['year'] = years
print(movies.head())

print(f'# Ratings: {len(ratings)}')
print(f'# Users: {len(set(ratings["userId"]))}')
last_user = sorted(list(set(ratings['userId'])))[-1]
new_user = last_user + 1
print('Added new user: %s' % new_user)

dataset = Dataset()
dataset.fit(chain(ratings['userId'], [new_user]),
            movies['movieId'],
            item_features=(GENRES + list(movies['year']) +
                           list(set(movies['movieId']))))

_, _, item_mapping, _ = dataset.mapping()
rev_item_mapping = {y: x for (x, y) in item_mapping.items()}

matches = []
for rid, row in movies.iterrows():
    for m in match_lst:
        if m.lower() in row[1].lower():
            matches.append(row[0])

print(good_ratings.head())
rating_iter = zip(good_ratings['userId'], good_ratings['movieId'])
new_iter = ((new_user, x) for x in matches)
interactions, weights = dataset.build_interactions(chain(
    rating_iter, new_iter))

print(repr(interactions))
Example #6
print('Num users: {}, num_items {}.'.format(num_users, num_items))

# build user features from user interests
# user_features = dataset.build_user_features(((x['_id'], x['interests']) for x in full_users),normalize=False)

# build item features from location subcategories
# item_features = dataset.build_item_features(((x['_id'], x['subCategory']) for x in locations_data),normalize=False)
# print(repr(item_features))

# with open('data.json', 'w') as outfile:
#     json.dump(dataset.mapping(), outfile)

model = LightFM(loss='warp', no_components=30)
model.fit(interactions[0])

train_auc = auc_score(model, interactions[0], num_threads=2).mean()
print('Hybrid training set AUC: %s' % train_auc)

# np.set_printoptions(threshold=np.inf)

with open('virtual_mapping.json', 'w') as outfile:
    json.dump(dataset.mapping(), outfile)

score = model.predict(182, np.arange(num_items))
pdb.set_trace()
print(repr(score))

# np.set_printoptions(threshold=np.inf)
ranked_items = np.argsort(-score)
find_location_id(ranked_items)
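
find_location_id is not shown in this snippet; translating the ranked internal indices back to external item ids usually amounts to inverting the item mapping, roughly as in this sketch (dataset and ranked_items come from the code above):

# dataset.mapping() gives external id -> internal index; invert it for lookups
_, _, item_id_map, _ = dataset.mapping()
index_to_item = {internal: external for external, internal in item_id_map.items()}

top_locations = [index_to_item[i] for i in ranked_items[:10]]
print(top_locations)
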
Example #7
def run_learning_curve(test_fraction, max_epoch):

    # create data_train
    data  = Dataset(user_identity_features=True)
    
    # user features
    user_features, user_feature_names = get_user_features()
    
    # create map between user_id, post_id, user_features and internal indices
    data.fit((x['user_id'] for x in get_data()),(x['post_id'] for x in get_data()), user_features=user_features)
    
    # print shape
    num_users, num_items = data.interactions_shape()
    print('Num users: {}, num_items {}.'.format(num_users, num_items))
    
    #---------------------------
    # Building the interactions matrix
    #---------------------------
    # create interaction matrix to optimize
    (interactions, weights) = data.build_interactions(((x['user_id'], x['post_id'])) for x in get_data())
    print(repr(interactions))
    
    # retrieve mapping from dataset
    user_id_map, user_feature_map, item_id_map, item_feature_map = data.mapping()
    
    # split test and train
    interaction_train, interaction_test = cross_validation.random_train_test_split(interactions, test_fraction)
    
    #---------------------------
    # train model
    #---------------------------
    model_cs  = LightFM(learning_rate=0.05, loss='warp')
    model_ws  = LightFM(learning_rate=0.05, loss='warp', no_components=len(user_feature_names))

    precision_cs = []
    precision_ws = []

    recall_cs = []
    recall_ws = []

    for epoch in range(int(max_epoch/2)):

        model_cs.fit(interaction_train, epochs=int(epoch*2))
        model_ws.fit(interaction_train, user_features=user_features, epochs=int(epoch*2))
   
        # calculate precision and recall for each epoch
        precision_at_k_cs = evaluation.precision_at_k(model_cs, interaction_test, interaction_train)
        precision_at_k_ws = evaluation.precision_at_k(model_ws, interaction_test, interaction_train, user_features=user_features)

        recall_at_k_cs = evaluation.recall_at_k(model_cs, interaction_test, interaction_train)
        recall_at_k_ws = evaluation.recall_at_k(model_ws, interaction_test, interaction_train, user_features=user_features)

        # append to result
        precision_cs.append(sum(precision_at_k_cs) / len(precision_at_k_cs))
        precision_ws.append(sum(precision_at_k_ws) / len(precision_at_k_ws))
        recall_cs.append(sum(recall_at_k_cs) / len(recall_at_k_cs))
        recall_ws.append(sum(recall_at_k_ws) / len(recall_at_k_ws))

    df_result = pd.DataFrame({
        "precision_cs": precision_cs,
        "precision_ws": precision_ws,
        "recall_cs": recall_cs,
        "recall_ws": recall_ws,
        })

    # save to file
    df_result.to_csv("data/validation/df.epoch.csv", index=False)

    return
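
The per-epoch precision and recall written to data/validation/df.epoch.csv can be turned into a learning-curve plot. A minimal sketch assuming matplotlib is available (each row corresponds to a 2-epoch step, per the loop above; the output filename is illustrative):

import pandas as pd
import matplotlib.pyplot as plt

df_result = pd.read_csv("data/validation/df.epoch.csv")
df_result[["precision_cs", "precision_ws"]].plot()   # cold start vs. with user features
plt.xlabel("training step (2 epochs each)")
plt.ylabel("mean precision@k")
plt.savefig("learning_curve.png")
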
Example #8
def run_validation(test_fraction, max_val):

    # containers to hold results
    ave_precision_at_k_cs   = []
    ave_recall_at_k_cs      = []
    ave_auc_score_cs        = []

    ave_precision_at_k_ws   = []
    ave_recall_at_k_ws      = []
    ave_auc_score_ws        = []
   

    # perform validation
    validation_itr = 0

    while (validation_itr < max_val):

        print("Start validating cold, warm start, iteration %s" %validation_itr)

        # prevent random failure to abort entire job
        try:

            # count
            validation_itr += 1

            # create data_train
            data_cs = Dataset()
            data_ws = Dataset(user_identity_features=True)

            # user features
            user_features, user_feature_names = get_user_features()
            print(user_feature_names)

            # create map between user_id, post_id, user_features and internal indices
            data_cs.fit((x['user_id'] for x in get_data()),(x['post_id'] for x in get_data()))
            data_ws.fit((x['user_id'] for x in get_data()),(x['post_id'] for x in get_data()), user_features=user_features)
            
            # print shape
            num_users, num_items = data_ws.interactions_shape()
            print('Num users: {}, num_items {}.'.format(num_users, num_items))
            
            #---------------------------
            # Building the interactions matrix
            #---------------------------
            # create interaction matrix to optimize
            (interactions_cs, weights_cs) = data_cs.build_interactions(((x['user_id'], x['post_id'])) for x in get_data())
            (interactions_ws, weights_ws) = data_ws.build_interactions(((x['user_id'], x['post_id'])) for x in get_data())
            print(repr(interactions_ws))

            # retrieve mapping from dataset
            user_id_map_cs, user_feature_map_cs, item_id_map_cs, item_feature_map_cs = data_cs.mapping()
            user_id_map_ws, user_feature_map_ws, item_id_map_ws, item_feature_map_ws = data_ws.mapping()

            # split test and train
            interaction_train_cs, interaction_test_cs = cross_validation.random_train_test_split(interactions_cs, test_fraction)
            interaction_train_ws, interaction_test_ws = cross_validation.random_train_test_split(interactions_ws, test_fraction)

            #---------------------------
            # train model
            #---------------------------
            model_cs  = LightFM(learning_rate=0.05, loss='warp')
            model_ws  = LightFM(learning_rate=0.05, loss='warp', no_components=len(user_feature_names))

            model_cs.fit(interaction_train_cs, epochs=30)
            model_ws.fit(interaction_train_ws, user_features=user_features, epochs=30)

            #---------------------------
            # make predictions
            #---------------------------
            precision_at_k_cs = evaluation.precision_at_k(model_cs, interaction_test_cs, interaction_train_cs)
            recall_at_k_cs = evaluation.recall_at_k(model_cs, interaction_test_cs, interaction_train_cs)
            auc_score_cs = evaluation.auc_score(model_cs, interaction_test_cs, interaction_train_cs)

            precision_at_k_ws = evaluation.precision_at_k(model_ws, interaction_test_ws, interaction_train_ws, user_features=user_features)
            recall_at_k_ws = evaluation.recall_at_k(model_ws, interaction_test_ws, interaction_train_ws, user_features=user_features)
            auc_score_ws = evaluation.auc_score(model_ws, interaction_test_ws, interaction_train_ws, user_features=user_features)

            # append score from each iteration to results
            ave_precision_at_k_cs.append(sum(precision_at_k_cs) / len(precision_at_k_cs))
            ave_recall_at_k_cs.append(sum(recall_at_k_cs) / len(recall_at_k_cs))
            ave_auc_score_cs.append(sum(auc_score_cs) / len(auc_score_cs))

            ave_precision_at_k_ws.append(sum(precision_at_k_ws) / len(precision_at_k_ws))
            ave_recall_at_k_ws.append(sum(recall_at_k_ws) / len(recall_at_k_ws))
            ave_auc_score_ws.append(sum(auc_score_ws) / len(auc_score_ws))


        except Exception:
            print("Iteration %s failed. Skipping..." % validation_itr)


    print("Validation score for test")
    print(ave_precision_at_k_cs  )
    print(ave_recall_at_k_cs     )
    print(ave_auc_score_cs )
    print(ave_precision_at_k_ws  )
    print(ave_recall_at_k_ws     )
    print(ave_auc_score_ws )

    df_result = pd.DataFrame({
        'precision_at_k_cs': ave_precision_at_k_cs,
        'recall_at_k_cs': ave_recall_at_k_cs,
        'auc_score_cs': ave_auc_score_cs,
        'precision_at_k_ws': ave_precision_at_k_ws,
        'recall_at_k_ws': ave_recall_at_k_ws,
        'auc_score_ws': ave_auc_score_ws,
        })

    # save to file
    df_result.to_csv("data/validation/df.csv", index=False)

    return
Example #9
                    learning_rate=learning_rate,
                    item_alpha=item_alpha,
                    user_alpha=user_alpha)

    model.fit(interaction_matrix,
              sample_weight=interaction_weight,
              epochs=epochs,
              num_threads=4,
              verbose=True)
    print('[ %04ds ] Model fitted' % (time.time() - start_time))

    recommendations = []
    n_businesses = len(training_business_ids)
    # n_users = len(training_user_ids)
    best_k = 50
    user_id_map, _, business_id_map, __ = dataset.mapping()

    business_ids_list = list(training_business_ids)
    training_business_indices = np.array(
        list(map(lambda id: business_id_map[id], business_ids_list)))
    user_seen_businesses = Review.extract_user_seen_business(training_set)

    print('[ %04ds ] Ready to produce recommendations' %
          (time.time() - start_time))
    finished = 0
    with open('user_list.json', 'r') as f:
        recommendation_user_list = json.load(f)['users']

    n_users = len(recommendation_user_list)
    for user_id in recommendation_user_list:
        # user_recommendations = {'user_id': user_id, 'recommended_businesses': []}
Example #10

def main(train_file, val_file, test_file, weight, output_file):

    # Read data from parquet
    print('Reading data ...')
    train_df = pd.read_parquet(train_file)
    val_df = pd.read_parquet(val_file)
    test_df = pd.read_parquet(test_file)

    train_df = train_df[['user_id', 'book_id', 'rating']]
    val_df = val_df[['user_id', 'book_id', 'rating']]
    test_df = test_df[['user_id', 'book_id', 'rating']]

    # Build the ID mappings
    print('Building the ID mappings ...')
    train = Dataset()
    train.fit((x for x in train_df.user_id), (x for x in train_df.book_id))
    user_map = train.mapping()[0]
    item_map = train.mapping()[2]
    train_size = train.interactions_shape()
    with open(output_file, "a") as f:
        f.write(
            'There are {} interactions in the training data, including {} users and {} items \n'
            .format(len(train_df), train_size[0], train_size[1]))
    print(
        'There are {} interactions in the training data, including {} users and {} items'
        .format(len(train_df), train_size[0], train_size[1]))

    # Build the interactions matrix
    print('Building the interactions and weights matrix ...')
    if weight == 'True':
        train_df.rating = train_df.rating + 1  # use rating +1 as weights
        (train_int, train_weight) = train.build_interactions(
            ((i[1][0], i[1][1], i[1][2]) for i in train_df.iterrows()))
    else:
        (train_int, train_weight) = train.build_interactions(
            ((i[1][0], i[1][1]) for i in train_df.iterrows()))

    # filter out interactions with rating >= 3 as true label
    val_df = val_df[val_df.rating >= 3].reset_index(drop=True)
    val_user = np.array([user_map[i] for i in val_df.user_id])
    val_item = np.array([item_map[i] for i in val_df.book_id])
    val_data = val_df.rating
    val_int = coo_matrix((val_data, (val_user, val_item)), shape=train_size)

    test_df = test_df[test_df.rating >= 3].reset_index(drop=True)
    test_user = np.array([user_map[i] for i in test_df.user_id])
    test_item = np.array([item_map[i] for i in test_df.book_id])
    test_data = test_df.rating
    test_int = coo_matrix((test_data, (test_user, test_item)),
                          shape=train_size)

    print('Running grid search on ranks and regularizations ...')
    ranks = [10, 20, 30]
    regs = [0, 1e-5, 5e-5]
    max_precision = -1
    best_rank = None
    best_reg = None
    best_training_time = None
    best_eval_time = None
    best_model = None

    # Do grid search on ranks and regularizations using training and validation data
    for rank in ranks:
        for reg in regs:
            start_time = time.time()
            model = LightFM(no_components=rank,
                            item_alpha=reg,
                            user_alpha=reg,
                            loss='warp',
                            random_state=1211)  # OPTIMIZE: precision@k
            model.fit(train_int, sample_weight=train_weight, epochs=10)
            train_end_time = time.time()

            val_precision = precision_at_k(model,
                                           val_int,
                                           train_interactions=train_int,
                                           k=500).mean()
            eval_end_time = time.time()

            with open(output_file, "a") as f:
                f.write(
                    'Rank %2d & Reg %.5f Validation Precision@500: %.5f \n' %
                    (rank, reg, val_precision))
            print('Rank %2d & Reg %.5f Validation Precision@500: %.5f' %
                  (rank, reg, val_precision))

            if val_precision > max_precision:
                max_precision = val_precision
                best_rank = rank
                best_reg = reg
                best_training_time = train_end_time - start_time
                best_eval_time = eval_end_time - train_end_time
                best_model = model

    # Evaluate best model performance on test set
    test_precision = precision_at_k(best_model,
                                    test_int,
                                    train_interactions=train_int,
                                    k=500).mean()

    with open(output_file, "a") as f:
        f.write(
            'The best model with rank %2d and reg %.5f achieves test precision@500 of %.5f \n'
            % (best_rank, best_reg, test_precision))
        f.write('The training takes %ss and evaluation takes %ss \n' %
                (best_training_time, best_eval_time))
    print(
        'The best model with rank %2d and reg %.5f achieves test precision@500 of %.5f'
        % (best_rank, best_reg, test_precision))
    print('The training takes %ss and evaluation takes %ss' %
          (best_training_time, best_eval_time))
Example #11
def init_movielens(path,
                   min_rating=0.0,
                   k=3,
                   item_features=None,
                   cluster_n=18,
                   model='vgg19',
                   test_percentage=0.2):
    valid_item_features = {'genres': 'genres', 'clusters': 'clusters'}
    if item_features is not None:
        assert all(item in valid_item_features.values() for item in item_features), \
            'Your specified item features is invalid. You have to use one or more of this: ' \
            + ', '.join(valid_item_features)

    train_dataset = Dataset()
    test_dataset = Dataset()

    data = dict()
    min_interactions = dict()

    with open(path + '/ratings.csv', 'r') as ratings_file:
        reader = csv.reader(
            ratings_file,
            delimiter=',',
        )
        next(reader)  # skip header

        ratings = []
        users = set()
        items = set()
        for row in reader:
            user_id = int(row[0])
            item_id = int(row[1])

            users.add(user_id)
            items.add(item_id)

            rating = float(row[2])

            if rating >= min_rating:
                ratings.append((user_id, item_id, rating))
                __add_interaction(min_interactions, user_id)

        __info_no_of_min_interactions(
            k, 'No of interactions per user overall ==> ', min_interactions)

        users = list(users)
        items = list(items)

        users_column, items_column, ratings_column = zip(*ratings)
        ratings = sparse.coo_matrix(
            (ratings_column, (users_column, items_column)))

        ratings_train, ratings_test = random_train_test_split(
            ratings,
            test_percentage=test_percentage,
            random_state=np.random.RandomState(7))

        ratings_train_to_count = zip(ratings_train.row, ratings_train.col,
                                     ratings_train.data)
        ratings_train = zip(ratings_train.row, ratings_train.col,
                            ratings_train.data)

        ratings_test_to_count = zip(ratings_test.row, ratings_test.col,
                                    ratings_test.data)
        ratings_test = zip(ratings_test.row, ratings_test.col,
                           ratings_test.data)

        min_interactions = __count_train_test_min_interactions(
            ratings_train_to_count)
        __info_no_of_min_interactions(
            k, 'No of interactions per user on train ==> ', min_interactions)

        min_interactions = __count_train_test_min_interactions(
            ratings_test_to_count)
        __info_no_of_min_interactions(
            k, 'No of interactions per user on test ==> ', min_interactions)

        train_dataset.fit(users=users, items=items)
        test_dataset.fit(users=users, items=items)

        (train_interactions,
         train_weights) = train_dataset.build_interactions(ratings_train)
        (test_interactions,
         test_weights) = test_dataset.build_interactions(ratings_test)

        data.update({'train': train_interactions})
        data.update({'test': test_interactions})
        data.update({'train-mapping': train_dataset.mapping()})

    # add item features
    if item_features is not None:
        aggregated_features = []

        if valid_item_features.get('genres') in item_features:
            movie_genres, genres = __init_movies_genres(path)
            aggregated_features.append(movie_genres)

            train_dataset.fit_partial(item_features=genres)
            test_dataset.fit_partial(item_features=genres)

            train_dataset.fit_partial(items=list(movie_genres.keys()))
            test_dataset.fit_partial(items=list(movie_genres.keys()))

        if valid_item_features.get('clusters') in item_features:
            movies_posters_clusters, clusters = __init_movies_posters_clusters(
                path, cluster_n, model=model)
            aggregated_features.append(movies_posters_clusters)

            train_dataset.fit_partial(item_features=clusters)
            test_dataset.fit_partial(item_features=clusters)

            train_dataset.fit_partial(
                items=list(movies_posters_clusters.keys()))
            test_dataset.fit_partial(
                items=list(movies_posters_clusters.keys()))

        aggregated_features = __aggregate_features(aggregated_features)
        item_features = train_dataset.build_item_features(
            ((movie_id, aggregated_features.get(movie_id))
             for movie_id in aggregated_features.keys()))

        _ = test_dataset.build_item_features(
            ((movie_id, aggregated_features.get(movie_id))
             for movie_id in aggregated_features.keys()))

        data.update({'item_features': item_features})
    else:
        data.update({'item_features': None})

    return data
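
A usage sketch for init_movielens, assuming a MovieLens-style directory containing ratings.csv (the path and parameter values below are illustrative, not taken from the source):

data = init_movielens("data/ml-latest-small",
                      min_rating=3.5,
                      k=3,
                      item_features=["genres"],
                      test_percentage=0.2)

train_interactions = data["train"]
test_interactions = data["test"]
item_features = data["item_features"]   # None when no item features were requested
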
Example #12
             'writer_name']].drop_duplicates().reset_index(drop=True)
users = uid[['uid', 'popular_section', 'popular_platform',
             'popular_sources']].drop_duplicates()

dataset = Dataset()
features_list = create_feature_list(items,
                                    cols=['section_primary', 'writer_name'])
user_features_list = create_feature_list(
    users, cols=['popular_section', 'popular_platform', 'popular_sources'])

#features_list = list(set(items.writer_name.to_list()))
dataset.fit(users=uid.uid.unique(),
            items=uid.article_id.unique(),
            item_features=features_list,
            user_features=user_features_list)

(interactions, weights) = dataset.build_interactions(
    (x.uid, x.article_id) for x in uid.itertuples())
n_users, n_items = interactions.shape
1 - (interactions.getnnz() / (interactions.shape[0] * interactions.shape[1]))
item_features = dataset.build_item_features([
    (i.article_id, [i.section_primary, i.writer_name])
    for i in items.itertuples()
])
user_features = dataset.build_user_features([(u.uid, [u.popular_section])
                                             for u in users.itertuples()])

item_features = dataset.build_item_features(build_features(items))
user_features = dataset.build_user_features(build_features(users))
user_id_map, user_feature_map, item_id_map, item_feature_map = dataset.mapping(
)
Example #13
def predict_hard_users(
    train: pd.DataFrame,
    test: pd.DataFrame,
    genre: pd.DataFrame,
    education: pd.DataFrame,
    notices: pd.DataFrame,
    available_notices: set,
    applicant_notice: dict,
    header=None,
):
    user_feature = genre.merge(education, on="idpostulante", how="left")
    user_feature.drop(columns=["fechanacimiento"], inplace=True)
    user_feature_hard_user = user_feature[user_feature.idpostulante.isin(
        train.idpostulante)]

    uf = generate_features(user_feature[["sexo", "nombre", "estado"]])
    itf = generate_features(notices[[
        "nombre_zona", "tipo_de_trabajo", "nivel_laboral", "nombre_area"
    ]])

    dataset1 = Dataset()
    dataset1.fit(
        train.idpostulante.unique(),  # all the users
        notices.idaviso.unique(),
        user_features=uf,  # additional user features
        item_features=itf,  # additional item features
    )
    # plugging in the interactions and their weights
    (interactions, weights) = dataset1.build_interactions([
        (x[1], x[0], x[3]) for x in train.values
    ])

    user_feature_list = generate_in_use_features(
        user_feature_hard_user[["sexo", "nombre", "estado"]].values,
        ["sexo", "nombre", "estado"],
    )
    user_tuple = list(
        zip(user_feature_hard_user.idpostulante, user_feature_list))

    user_features = dataset1.build_user_features(user_tuple, normalize=False)

    (
        user_id_map,
        user_feature_map,
        item_id_map,
        item_feature_map,
    ) = dataset1.mapping()

    inv_item_id_map = {v: k for k, v in item_id_map.items()}

    # for component in [10, 35, 50, 80, 100, 200]:
    component = 35
    model = lfm.LightFM(no_components=component, loss="warp", random_state=42)
    model.fit(
        interactions,
        # user_features=user_features,
        # sample_weight=weights,
        epochs=150,
        num_threads=8,
        verbose=True,
    )

    test_precision = precision_at_k(
        model,
        interactions,
        # user_features=user_features,
        k=10,
        num_threads=8,
    ).mean()
    logger.info(
        f"Evaluation for LightFM is: {test_precision} with {component} number of component"
    )

    final_predictions = {}
    for a_user in tqdm(test.idpostulante.unique()):
        try:
            notices_by_user = applicant_notice[a_user]
        except KeyError:
            notices_by_user = set()
        try:
            user_x = user_id_map[a_user]
        except KeyError:
            user_x = 0
        n_users, n_items = interactions.shape
        prediction = np.argsort(
            model.predict(
                user_x,
                np.arange(n_items),
                # user_features=user_features,
            ))[::-1]
        prediction_for_user = []
        for pred in prediction:
            notice = inv_item_id_map[pred]
            should_add = (notice in available_notices
                          and notice not in notices_by_user)
            if should_add:
                prediction_for_user += [notice]
            if len(prediction_for_user) == 10:
                break
        final_predictions[a_user] = prediction_for_user

    write_dict(final_predictions, "lightfm", header)
    return ["lightfm"]
Example #14
def train_model():

    # user features
    user_features, user_feature_names = get_user_features()

    # create data
    data_ws = Dataset(user_identity_features=True)  # warm start

    # create map between user_id, post_id, user_features and internal indices
    data_ws.fit((x['user_id'] for x in get_data()),
                (x['post_id'] for x in get_data()),
                user_features=user_features)
    #user_biases =

    #---------------------------
    # Building the interactions matrix
    #---------------------------
    # create interaction matrix to optimize
    (interactions_ws, weights_ws) = data_ws.build_interactions(
        ((x['user_id'], x['post_id']) for x in get_data()))
    print(repr(interactions_ws))

    # retrieve mapping from dataset
    user_id_map, user_feature_map, item_id_map, item_feature_map = data_ws.mapping(
    )

    #---------------------------
    # train model
    #---------------------------
    # initialize model
    model_warp_ws = LightFM(learning_rate=0.05,
                            loss='warp',
                            no_components=len(user_feature_names))

    # train model
    model_warp_ws.fit(interactions_ws, user_features=user_features, epochs=30)

    #---------------------------
    # make predictions
    #---------------------------
    # make predictions for all user
    prediction_ws = model_warp_ws.predict_rank(interactions_ws,
                                               user_features=user_features)

    # create identity matrix that represent user features of hypothetical user
    user_features_identity = sparse.csr_matrix(
        np.identity(len(user_feature_names)))

    # make prediction for hypothetical user
    prediction_hypo = []

    for user_irt in range(len(user_feature_names)):

        # calculate prediction score for the hypothetical user that has only this feature
        prediction_score = model_warp_ws.predict(
            user_ids=user_irt,
            item_ids=np.array(list(item_id_map.values())),
            user_features=user_features_identity)

        # combine prediction score with item map
        prediction_zipped = zip(prediction_score, item_id_map)

        # sort by prediction score
        prediction_sorted = sorted(prediction_zipped,
                                   key=lambda x: x[0],
                                   reverse=True)

        # add to list of hypothetical users
        prediction_hypo.append(prediction_sorted)

    return prediction_hypo, prediction_ws, user_id_map, item_id_map, user_feature_names
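
Downstream, the hypothetical-user output could be inspected as in the sketch below: prediction_hypo[i] is a score-sorted list of (score, post_id) pairs for the synthetic user defined by user_feature_names[i]. The names follow the return values above; the print format is illustrative.

prediction_hypo, prediction_ws, user_id_map, item_id_map, user_feature_names = train_model()

for feature, ranked in zip(user_feature_names, prediction_hypo):
    best_score, best_post = ranked[0]
    print('%s -> top post %s (score %.3f)' % (feature, best_post, best_score))
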
Example #15
def lightfm_node(X1_train, X2_train, X1_test, X2_test):
    X2 = pd.concat([X2_train, X2_test])
    X1 = pd.concat([X1_train, X1_test]).set_index('id')

    X1.columns = ['X1_' + i for i in X1.columns]

    X1['X1_5'] = pd.qcut(X1['X1_5'], np.arange(0, 1, 0.1), duplicates='drop')
    X1['X1_8'] = pd.qcut(X1['X1_8'], np.arange(0, 1, 0.1), duplicates='drop')
    X1['X1_6'] = pd.qcut(X1['X1_6'], np.arange(0, 1, 0.1), duplicates='drop')

    for col in ['X1_6', 'X1_8', 'X1_5', 'X1_1', 'X1_13']:
        X1[col] = X1[col].map(lambda x: '{' + col + '}_{' + str(x) + '}')

    X1 = X1.reset_index()

    from lightfm.data import Dataset
    dataset = Dataset()
    dataset.fit(users=(x for x in X2['id']), items=(x for x in X2['A']))

    dataset.fit_partial(users=(x for x in X1['id']),
                        user_features=(x for x in X1['X1_1']))
    dataset.fit_partial(users=(x for x in X1['id']),
                        user_features=(x for x in X1['X1_13']))
    dataset.fit_partial(users=(x for x in X1['id']),
                        user_features=(x for x in X1['X1_5']))
    dataset.fit_partial(users=(x for x in X1['id']),
                        user_features=(x for x in X1['X1_8']))
    dataset.fit_partial(users=(x for x in X1['id']),
                        user_features=(x for x in X1['X1_6']))

    user_features = dataset.build_user_features(
        [(x[1]['id'], x[1][['X1_1', 'X1_13', 'X1_5', 'X1_8', 'X1_6'
                            ]].values.tolist()) for x in X1.iterrows()],
        normalize=True)

    (interactions,
     weights) = dataset.build_interactions(zip(*X2[['id', 'A']].values.T))

    model = LightFM(no_components=32,
                    learning_rate=0.04,
                    loss='bpr',
                    max_sampled=55,
                    random_state=0)
    num_epochs = 20
    for i in range(num_epochs):
        model.fit_partial(interactions, user_features=user_features)

    users_mapping, user_features_mapping, assets_mapping, asset_features_mapping = dataset.mapping(
    )
    user_features_mapping_inv = {
        j: i
        for i, j in user_features_mapping.items()
    }

    tag_embeddings = (model.user_embeddings.T /
                      np.linalg.norm(model.user_embeddings, axis=1)).T

    lightfm_embed = pd.DataFrame(tag_embeddings[:len(users_mapping)],
                                 index=X1['id'])

    return lightfm_embed
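
The frame returned above holds one L2-normalised embedding row per user, so a plain dot product gives cosine similarity between users. A short follow-up sketch (X1_train, X2_train, X1_test, X2_test are whatever frames the node is normally called with):

import pandas as pd

lightfm_embed = lightfm_node(X1_train, X2_train, X1_test, X2_test)

# rows were normalised inside lightfm_node, so X @ X.T is a cosine-similarity matrix
sim = lightfm_embed.values @ lightfm_embed.values.T
sim_df = pd.DataFrame(sim, index=lightfm_embed.index, columns=lightfm_embed.index)
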
Example #16
    print('Building LightFM Dataset...')
    print(50 * '-')
    lfm_dataset = Dataset(user_identity_features=False, item_identity_features=False)


    lfm_dataset.fit(
        users=u_list,
        items=i_list,
        user_features=np.concatenate((users.edad.drop_duplicates().values,
                                      users.sexo.drop_duplicates().values,
                                      users.educacion.drop_duplicates().values),
                                     axis=0)
    )

    print('Retrieving internal mappings and dictionaries...')
    u_map, u_feat_map, i_map, i_feat_map = lfm_dataset.mapping()

    print(50 * '-')
    print('Building Interactions...')
    print(50 * '-')
    interactions = train.groupby(['idpostulante','idaviso']).agg('count').rename(
        columns={'fechapostulacion': 'rating'}).reset_index()
    #print(interactions.sort_values('rating', ascending=False).head())

    interactions = np.array(
        [
            interactions.idpostulante.values,
            interactions.idaviso.values,
            interactions.rating.values
        ],
        dtype=object).T
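
The object array of (idpostulante, idaviso, rating) triples built above is the shape build_interactions expects. A hedged continuation of the function (assuming every id in the triples was already passed to lfm_dataset.fit):

    (interaction_matrix, weight_matrix) = lfm_dataset.build_interactions(
        (row[0], row[1], row[2]) for row in interactions)
    print(repr(interaction_matrix))
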
Example #17
# creating the interaction matrix for the model
(interactions, weights) = dataset.build_interactions(
    ((x['ProfielId'], x['VacatureId']) for x in qd.getMatchings()))
#print(interactions.toarray())

# creating the item feature matrix for the model
'''item_features = dataset.build_item_features(((x['VacatureId'], [x['Naam'],x['Taal'],x['Functie']])
                                              for x in qd.getVacancies()),normalize=False)
'''
item_features = dataset.build_item_features(
    ((x['VacatureId'], [x['Naam']]) for x in qd.getVacancies()),
    normalize=False)

# print(item_features.toarray())

print(dataset.mapping())
'''
user_features = dataset.build_user_features(((x['Id'], [x['Motivatie']])
                                             for x in qd.getProfiles()))
print(user_features)
'''

# Split the set in train and test
train, test = random_train_test_split(interactions,
                                      test_percentage=0.2,
                                      random_state=None)

# Start training the model
print("--- Start model training ---")
model = LightFM(no_components=1, learning_rate=0.027, loss='warp')
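
The model defined above would then typically be fitted on the training split and scored on the held-out interactions. A sketch using the standard LightFM evaluation helper (epoch count and thread count are illustrative assumptions):

from lightfm.evaluation import auc_score

model.fit(train, item_features=item_features, epochs=30, num_threads=2)
test_auc = auc_score(model, test, train_interactions=train,
                     item_features=item_features, num_threads=2).mean()
print('Test AUC: %s' % test_auc)
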
Example #18
def train_model(
               df, user_id_col='user_id', item_id_col='business_id',
               item_name_col='name_business', evaluate=True):
    """ Train the model using collaborative filtering.
    Args:
        df: the input dataframe.
        user_id_col: user id column.
        item_id_col: item id column.
        item_name_col: item name column.
        evaluate: whether to evaluate model performance.
    Returns:
        model_full: the trained model.
        df_interactions: dataframe with user-item interactions.
        user_dict: user dictionary containing user_id as key and
            interaction_index as value.
        item_dict: item dictionary containing item_id as key and
            item_name as value.
        user_feature_map: the feature map of users
        business_feature_map: the feature map of items
    """
    if evaluate:
        print('Evaluating model...')
        evaluate_model(df, user_id_col='user_id', item_id_col='business_id')
    print('Training model...')

    # build recommendations for known users and known businesses
    # with collaborative filtering method
    ds_full = Dataset()
    # we call fit to supply userid, item id and user/item features
    user_cols = ['user_id', 'average_stars']
    categories = [c for c in df.columns if c[0].isupper()]
    item_cols = ['business_id', 'state']

    for i in df.columns[10:]:
        item_cols.append(str(i))

    user_features = user_cols[1:]
    item_features = item_cols[2:]

    ds_full.fit(
        df[user_id_col].unique(),  # all the users
        df[item_id_col].unique(),  # all the items
        user_features=user_features,  # additional user features
        item_features=item_features
         )

    df_users = df.drop_duplicates(user_id_col)
    # df_users = df[df.duplicated(user_id_col) == False]
    users_features = []
    for i in range(len(df_users)):
        users_features.append(get_users_features_tuple(df_users.values[i]))
    users_features = ds_full.build_user_features(
        users_features, normalize=False)

    items = df.drop_duplicates(item_id_col)
    # items = df[df.duplicated(item_id_col) == False]
    items_features = []
    for i in range(len(items)):
        items_features.append(get_items_features_tuple(
            items.values[i], categories))
    items_features = ds_full.build_item_features(
        items_features, normalize=False)

    (interactions, weights) = ds_full.build_interactions(
        [(x[0], x[1], x[2]) for x in df.values])
    # model
    model_full = LightFM(
        no_components=100, learning_rate=0.05, loss='warp', max_sampled=50)
    model_full.fit(
        interactions, user_features=users_features,
        item_features=items_features, sample_weight=weights,
        epochs=10, num_threads=10)
    # mapping
    user_id_map, user_feature_map, business_id_map, business_feature_map = \
        ds_full.mapping()

    # data preparation
    df_interactions = pd.DataFrame(weights.todense())
    df_interactions.index = list(user_id_map.keys())
    df_interactions.columns = list(business_id_map.keys())
    user_dict = user_id_map
    item_dict = df.set_index(item_id_col)[item_name_col].to_dict()
    return model_full, df_interactions, user_dict, \
        item_dict, user_feature_map, business_feature_map
Example #19
    for u in users:
        #temp_df = pd.DataFrame(user_map[u]*k)
        scores = model.predict(u, litems)
        #sdict[user_map[u]] = [items_map[i] for i in np.argsort(-scores)[:k]]
        temp_df = pd.DataFrame({
            'user_id': [user_map[u]] * k,
            'recom': [items_map[i] for i in np.argsort(-scores)[:k]]
        })
        all_df = pd.concat([all_df, temp_df], ignore_index=True)
    return all_df


#recom = predict(model,range(num_users),dataset.mapping()[0],dataset.mapping()[2])
manager = mp.Manager()
sdict = manager.dict()
predict_mp(model, num_users, dataset.mapping()[0], dataset.mapping()[2])
recom_df = pd.DataFrame(dict(sdict).items(), columns=['user_id', 'Recom'])
recom_df = recom_df.explode('Recom').reset_index(drop=True)
recom_df.to_csv('train_predictions.csv')
"""

df_item_features = df[["city_id","hotel_country"]].drop_duplicates()
features_list = create_feature_list(df_item_features,cols=["hotel_country"])

dataset = Dataset()

# dataset.fit(df[USER_ID_COL].unique(), df[TARGET_COL].unique())
dataset.fit(df[USER_ID_COL], df[TARGET_COL],item_features=features_list)

num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))