Example #1
def prepareData(df, tags):
    df = df[df.actionCategory == "WebNei clicked"]
    actionByUsers = df.groupby(["userName", "actionName"]).size()
    uniqueUsers = df[df.userName.isin(
        actionByUsers.index.get_level_values(
            0).unique().values)].drop_duplicates('userName')
    uniqueUsers['user_features'] = uniqueUsers[[
        'title', 'team', 'organization', 'department'
    ]].values.tolist()
    dataset = Dataset()
    dataset.fit((list(actionByUsers.index.get_level_values(0))),
                (list(actionByUsers.index.get_level_values(1))))

    rowM, colM = prepareJson(tags)
    rowU, colU = prepareUserFeatures(uniqueUsers)

    dataset.fit_partial(items=rowM,
                        item_features=colM,
                        users=rowU,
                        user_features=colU)

    (interactions, weights) = dataset.build_interactions(
        zip(list(actionByUsers.index.get_level_values(0)),
            list(actionByUsers.index.get_level_values(1))))
    # build_item_features/build_user_features expect (id, [features]) pairs;
    # zip(rowM, [colM]) would yield just one pair, so rowM/colM (and rowU/colU)
    # are assumed here to be parallel lists of ids and their single features
    item_features = dataset.build_item_features(
        (i, [f]) for i, f in zip(rowM, colM))
    user_features = dataset.build_user_features(
        (u, [f]) for u, f in zip(rowU, colU))
    return interactions, item_features, user_features
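A minimal usage sketch (assuming a prepared interactions DataFrame `df` and a `tags` structure as consumed above) that feeds the returned matrices into a LightFM model:

from lightfm import LightFM

interactions, item_features, user_features = prepareData(df, tags)
model = LightFM(loss='warp')  # WARP is a common choice for implicit click data
model.fit(interactions,
          item_features=item_features,
          user_features=user_features,
          epochs=10, num_threads=4)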
Example #2
    def build_lightfm_dataset(self) -> None:
        """
        Builds final datasets for user-variant and variant-variant recommendations.
        """
        logging.info("Creating LightFM matrices...")
        lightfm_dataset = LFMDataset()
        ratings_list = self.interaction_list
        logging.info('#'*60)
        lightfm_dataset.fit_partial(
            (rating['user_id'] for rating in ratings_list),
            (rating['product_id'] for rating in ratings_list)
        )

        item_feature_names = self.item_df.columns
        logging.info(f'Logging item_feature_names - with product_id: \n{item_feature_names}')
        item_feature_names = item_feature_names[~item_feature_names.isin(['product_id'])]
        logging.info(f'Logging item_feature_names - without product_id: \n{item_feature_names}')

        for item_feature_name in item_feature_names:
            lightfm_dataset.fit_partial(
                items=(item['product_id'] for item in self.item_list),
                item_features=(item[item_feature_name] for item in self.item_list),
            )

        item_features_data = []
        for item in self.item_list:
            item_features_data.append(
                (
                    item['product_id'],
                    [
                        item['product_name'],
                        item['aisle'],
                        item['department']
                    ],
                )
            )
        logging.info(f'Logging item_features_data @build_lightfm_dataset: \n{item_features_data}')
        self.item_features = lightfm_dataset.build_item_features(item_features_data)
        self.interactions, self.weights = lightfm_dataset.build_interactions(
            ((rating['user_id'], rating['product_id']) for rating in ratings_list)
        )

        self.n_users, self.n_items = self.interactions.shape

        logging.info(f'Logging self.interactions @build_lightfm_dataset: \n{self.interactions}')
        logging.info(f'Logging self.weights @build_lightfm_dataset: \n{self.weights}')
        logging.info(
            f'The shape of self.interactions {self.interactions.shape} '
            f'and self.weights {self.weights.shape} represent the user-item matrix.')
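A hedged companion step for the class above; the method name and hyperparameters are placeholders, not part of the original code:

    def train_lightfm_model(self, epochs: int = 10) -> None:
        """Fit a LightFM model on the matrices built by build_lightfm_dataset()."""
        from lightfm import LightFM
        self.model = LightFM(no_components=30, loss='warp')
        self.model.fit(self.interactions,
                       item_features=self.item_features,
                       sample_weight=self.weights,
                       epochs=epochs, num_threads=4)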
Example #3
def test_exceptions():

    users, items = 10, 100

    dataset = Dataset()
    dataset.fit(range(users), range(items))

    with pytest.raises(ValueError):
        dataset.build_interactions([(users + 1, 0)])

    with pytest.raises(ValueError):
        dataset.build_interactions([(0, items + 1)])

    dataset.fit_partial([users + 1], [items + 1])
    dataset.build_interactions([(users + 1, 0)])
    dataset.build_interactions([(0, items + 1)])
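The test hinges on fit_partial only ever extending the existing id mappings; a minimal sketch of that behaviour:

dataset = Dataset()
dataset.fit(range(10), range(100))
print(dataset.interactions_shape())  # (10, 100)
dataset.fit_partial([10], [100])     # register one new user id and one new item id
print(dataset.interactions_shape())  # (11, 101)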
Example #4
def test_exceptions():

    users, items = 10, 100

    dataset = Dataset()
    dataset.fit(range(users), range(items))

    with pytest.raises(ValueError):
        dataset.build_interactions([(users + 1, 0)])

    with pytest.raises(ValueError):
        dataset.build_interactions([(0, items + 1)])

    dataset.fit_partial([users + 1], [items + 1])
    dataset.build_interactions([(users + 1, 0)])
    dataset.build_interactions([(0, items + 1)])
Example #5
def main():
    current_stage = 6
    model = LightFM(no_components=30)
    dataset = Dataset()

    for c in range(0, current_stage + 1):
        click_train = pd.read_csv(
            train_path + "/underexpose_train_click-{}.csv".format(c),
            header=None,
            names=["user_id", "item_id", "time"],
        )
        click_test = pd.read_csv(
            test_path + "/underexpose_test_click-{}.csv".format(c),
            header=None,
            names=["user_id", "item_id", "time"],
        )
        dataset.fit_partial(click_train["user_id"], click_train["item_id"])
        num_users, num_items = dataset.interactions_shape()
        log('Num users: {}, num_items {}.'.format(num_users, num_items))
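The snippet stops after fitting the id mappings; a hedged continuation (reusing the final stage's click_train frame) would build the interaction matrix and train the model declared above:

    (interactions, weights) = dataset.build_interactions(
        zip(click_train["user_id"], click_train["item_id"]))
    model.fit_partial(interactions, epochs=10, num_threads=4)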
Example #6
    def fit_data(self, matrix, user_features=None, item_features=None):
        """
        Create datasets for .fit() method.
        Args:
            matrix: User-item interactions matrix (weighted)
            user_features: User-features pandas dataframe which index contains user_ids (crd_no)
            item_features:  Item-features pandas dataframe which index contains good_ids (plu_id)
        Returns:
            Model with fitted (mapped) datasets
        """
        matrix.sort_index(inplace=True)
        matrix.sort_index(inplace=True, axis=1)
        dataset = Dataset()
        dataset.fit((x for x in matrix.index), (x for x in matrix.columns))
        # melt the wide matrix into (crd_no, plu_id, rating) triples;
        # note: value_vars must cover every item column, so matrix.columns[1:]
        # (which silently drops the first item) is replaced by matrix.columns
        interactions = pd.melt(
            matrix.replace(0, np.nan).reset_index(),
            id_vars='index',
            value_vars=list(matrix.columns),
            var_name='plu_id',
            value_name='rating').dropna().sort_values('index')
        interactions.columns = ['crd_no', 'plu_id', 'rating']
        self.interactions, self.weights = dataset.build_interactions(
            [tuple(x) for x in interactions.values])

        if user_features is not None:
            user_features.sort_index(inplace=True)
            dataset.fit_partial(users=user_features.index,
                                user_features=user_features)
            self.user_features = dataset.build_user_features(
                ((index, dict(row))
                 for index, row in user_features.iterrows()))
        else:
            self.user_features = None
        if item_features is not None:
            item_features.sort_index(inplace=True)
            dataset.fit_partial(items=item_features.index,
                                item_features=item_features)
            self.item_features = dataset.build_item_features(
                ((index, dict(row))
                 for index, row in item_features.iterrows()))
        else:
            self.item_features = None
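A hedged usage sketch for fit_data; `Recommender` (the class that defines it), `matrix`, `user_df`, and `item_df` are hypothetical stand-ins:

from lightfm import LightFM

rec = Recommender()
rec.fit_data(matrix, user_features=user_df, item_features=item_df)
model = LightFM(loss='warp')
model.fit(rec.interactions,
          user_features=rec.user_features,
          item_features=rec.item_features,
          sample_weight=rec.weights,
          epochs=10)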
Example #7
def load_parameter():
    ratings = get_ratings()
    books = get_books()
    users = get_users()
    books_pd = convert_pd(books)

    id_users_books = StoreValue()

    for x in ratings:
        id_users_books._user_id.append(x[0])
        id_users_books._book_id.append(x[1])

    # Created following the guide at https://making.lyst.com/lightfm/docs/examples/dataset.html
    dataset_explicit = Dataset()
    dataset_explicit.fit(id_users_books._user_id,
                         id_users_books._book_id)

    num_users, num_items = dataset_explicit.interactions_shape()
    print('Num users: {}, num_items {}.'.format(num_users, num_items))

    dataset_explicit.fit_partial(items=(x[0] for x in books),
                                 item_features=(x[7] for x in books))
    
    dataset_explicit.fit_partial(users=(x[0] for x in users))


    # create the id mappings
    # interactions: a COO matrix whose entries are (user_id, book_id) interactions
    # weights: the rating/voting weights
    (interactions_explicit, weights_explicit) = dataset_explicit.build_interactions(
        (id_users_books._user_id[i], id_users_books._book_id[i])
        for i in range(len(ratings)))

    # Item features extracted from the items (books), based on each book's author
    item_features = dataset_explicit.build_item_features(((x[0], [x[7]]) for x in books))
    # user_features = dataset_explicit.build_user_features(((x[0], [x[1]]) for x in users))

    model_explicit_ratings = LightFM_ext(loss='warp')

    (train, test) = random_train_test_split(interactions=interactions_explicit, test_percentage=0.02)

    model_explicit_ratings.fit(train, item_features=item_features, epochs=2, num_threads=4)
    return model_explicit_ratings, dataset_explicit, interactions_explicit, weights_explicit, item_features, books_pd
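The function fits on the 98% split but never evaluates on `test`; a hedged sketch that could sit just before the return:

    from lightfm.evaluation import auc_score, precision_at_k

    print('P@10: %.4f' % precision_at_k(
        model_explicit_ratings, test, item_features=item_features, k=10).mean())
    print('AUC:  %.4f' % auc_score(
        model_explicit_ratings, test, item_features=item_features).mean())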
Example #8
def predict(user_id: int) -> Optional[List[str]]:  # may return None; Optional assumed imported from typing
    model_file = Path(BASE_DIR).joinpath(MODEL_FILE_NAME)
    data_file = Path(BASE_DIR).joinpath(DATA_FILE_NAME)

    if not model_file.exists():
        return None

    if not data_file.exists():
        return None

    model: LightFM = pickle.load(open(model_file, "rb"))
    data: pd.DataFrame = pd.read_csv(data_file)

    dataset = Dataset()

    dataset.fit((cac for cac in data.cac.unique()),
                (product for product in data.product_code.unique()))

    features = ['product_code', 'country_code', 'cost_bin']

    for product_feature in features:
        dataset.fit_partial(
            users=(cac for cac in data.cac.unique()),
            items=(product for product in data.product_code.unique()),
            item_features=(feature
                           for feature in data[product_feature].unique()))

    # one (product_code, [feature values]) pair per row, skipping the id column itself
    item_features = dataset.build_item_features(
        ((row.product_code,
          [getattr(row, f) for f in features if f != 'product_code'])
         for row in data[features].itertuples()))

    predicted_products: List[str] = sample_recommendation(
        model=model,
        dataset=dataset,
        raw_data=data,
        item_features=item_features,
        user_ids=user_id)

    return predicted_products
Example #9
# keep only records that have both an author_id and a cat_id
# (the source variable shadows the builtin name `dict`; the filtered
#  copy is renamed to `records` from here on)
records = [d for d in dict if "author_id" in d and "cat_id" in d]

import numpy as np
records = np.array(records)  # an object array of dicts; a plain list works too

print(records)
from lightfm.data import Dataset

print("Build the dataset...")
dataset = Dataset()
dataset.fit((x['userid'] for x in records), (x['postid'] for x in records))
dataset.fit_partial(items=(x['postid'] for x in records),
                    item_features=(x["author_id"] for x in records))
dataset.fit_partial(items=(x['postid'] for x in records),
                    item_features=(x["cat_id"] for x in records))

num_users, num_items = dataset.interactions_shape()

(interactions, weights) = dataset.build_interactions(
    ((x['userid'], x['postid']) for x in records))

from lightfm import LightFM

print("Training the model...")
model = LightFM(loss='warp')
model.fit(interactions)
from lightfm.evaluation import precision_at_k
Example #10
dataset = Dataset()
dataset.fit((x['User-ID'] for x in get_ratings()),
            (x['ISBN'] for x in get_ratings()))

#This call will assign an internal numerical id to every user and item id we pass in. These will be contiguous (from 0 to however many users and items we have), and will also determine the dimensions of the resulting LightFM model.

#We can check that the mappings have been created by querying the dataset on how many users and books it knows about:

num_users, num_items = dataset.interactions_shape()

print('Num users: {}, num_items {}.'.format(num_users, num_items))
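Because the internal ids are opaque, Dataset.mapping() can translate between raw ids and internal indices; a minimal sketch (some_user_id is a placeholder):

user_id_map, user_feature_map, item_id_map, item_feature_map = dataset.mapping()
internal_uid = user_id_map[some_user_id]                # raw User-ID -> internal index
isbn_by_index = {v: k for k, v in item_id_map.items()}  # internal index -> ISBN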

#Note that if we don't have all user and items ids at once, we can repeatedly call `fit_partial` to supply additional ids. In this case, we will use this capability to add some item feature mappings:

dataset.fit_partial(items=(x['ISBN'] for x in get_book_features()),
                    item_features=(x['Book-Author']
                                   for x in get_book_features()))

#This will create a feature for every unique author name in the dataset.

#(Note that we fit some more item ids: this is to make sure our mappings are complete even if there are items in the features dataset that are not in the interactions set.)

## Building the interactions matrix

#Having created the mapping, we build the interaction matrix:

(interactions, weights) = dataset.build_interactions(
    ((x['User-ID'], x['ISBN']) for x in get_ratings()))
print(repr(interactions))

#This is the main input into a LightFM model: it encodes the interactions between users and items.
Example #11
    testrankings = rankings[101008:]

    #dividing the features into Train/Cv/Test
    #unused currently but usable later so it is being kept in
    trainfeats = winefeatures[0:90980]
    cvfeats = winefeatures[90980:110476]
    testfeats = winefeatures[110476:]

    # LightFM Dataset object
    dataset = Dataset()
    dataset.fit((x['taster'] for x in trainrankings),(y['title'] for y in winefeatures))
    #it needs to be fit by providing iterators for users and the corresponding items


    #manually add all features to the dataset
    dataset.fit_partial(item_features=(x['country'] for x in winefeatures))
    dataset.fit_partial(item_features=(x['province'] for x in winefeatures))
    dataset.fit_partial(item_features=(x['region_1'] for x in winefeatures))
    dataset.fit_partial(item_features=(x['variety'] for x in winefeatures))
    dataset.fit_partial(item_features=(x['winery'] for x in winefeatures))
    dataset.fit_partial(item_features=(x['points'] for x in winefeatures))
    dataset.fit_partial(item_features=(x['price'] for x in winefeatures))
    #then add our word vector features iteratively
    for i in range(9,209):
        dataset.fit_partial(item_features=(x[str(fields[i])] for x in winefeatures))

    num_users, num_items = dataset.interactions_shape()
    
    #building the interaction matrix for training ratings
    (interactions, weights) = dataset.build_interactions(((x['taster'],x['title']) for x in trainrankings))
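The snippet fits the feature vocabularies but stops before assembling the item-feature matrix; a hedged continuation (the word-vector columns are omitted for brevity):

    item_features = dataset.build_item_features(
        (x['title'],
         [x['country'], x['province'], x['region_1'], x['variety'],
          x['winery'], x['points'], x['price']])
        for x in winefeatures)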
Example #12
#print(json.dumps(line, indent=4))

# create a dataset and build the ID mappings
dataset = Dataset()
dataset.fit((x['User-ID'] for x in get_ratings()),
            (x['ISBN'] for x in get_ratings()))

# query the dataset to check how many users and items (i.e. books) it knows
num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))

# add some item feature mappings, creating a unique feature for each author
# NOTE: we fit more item ids than appear in the ratings, to make sure our mappings
# are complete even if the features dataset contains items not in the interaction set
dataset.fit_partial(items=(x['ISBN'] for x in get_book_features()),
                    item_features=(x['Book-Author']
                                   for x in get_book_features()))

# build the interaction matrix which is a main input to the LightFM model
# it encodes the interactions between the users and the items
(interactions, weights) = dataset.build_interactions(
    ((x['User-ID'], x['ISBN']) for x in get_ratings()))

# item_features matrix can also be created
item_features = dataset.build_item_features(
    ((x['ISBN'], [x['Book-Author']]) for x in get_book_features()))

# split the current dataset into a training and test dataset
train, test = random_train_test_split(interactions,
                                      test_percentage=0.01,
                                      random_state=None)
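A hedged continuation that trains and scores a model on the split above:

from lightfm import LightFM
from lightfm.evaluation import auc_score

model = LightFM(loss='warp')
model.fit(train, item_features=item_features, epochs=10, num_threads=4)
print('Test AUC: %.4f' % auc_score(
    model, test, train_interactions=train, item_features=item_features).mean())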
Example #13
    average_stay='average_stay',
    lat='lat',
    long='long',
    image_url='image_url',
    category='Category',
    voyager_id='voyager_id',
    airport_code='airport_code')

from lightfm.data import Dataset

dataset = Dataset()
dataset.fit((x[0] for i, x in users.iterrows()),
            (x[1] for i, x in users.iterrows()))
# needed below by runMF, so this line should not be commented out
(interactions, weights) = dataset.build_interactions(
    (x[0], x[1]) for i, x in users.iterrows())
dataset.fit_partial(items=(x['destinationid']
                           for i, x in destinations.iterrows()),
                    item_features=(x['Destination-tf-idf']
                                   for i, x in destinations.iterrows()))
# note: these are user ids and user features, so they belong under `users=`, not `items=`
dataset.fit_partial(users=(x['userid'] for i, x in users.iterrows()),
                    user_features=(x['age'] for i, x in users.iterrows()))

item_features = dataset.build_item_features(
    ((x['destinationid'], [x['Destination-tf-idf']])
     for i, x in destinations.iterrows()))
user_features = dataset.build_user_features(
    ((x['userid'], [x['age']]) for i, x in users.iterrows()))

mf_model = runMF(interactions=interactions,
                 item_features=item_features,
                 user_features=user_features,
                 n_components=30,
                 loss='warp',
Example #14
#################################
#                               #
#       Building the Model      #
#                               #
#################################

dataset = Dataset()
dataset.fit((x['User-ID'] for x in get_ratings()),
            (x['ISBN'] for x in get_ratings()))

num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))

dataset.fit_partial(users=(x['User-ID'] for x in get_user_features()),
                    items=(x['ISBN'] for x in get_book_features()),
                    item_features=(x['Book-Author'] for x in get_book_features()),
                    user_features=(x['Age'] for x in get_user_features()))

(interactions, weights) = dataset.build_interactions(((x['User-ID'], x['ISBN'])
                                                      for x in get_ratings()))

#print(repr(interactions))

item_features = dataset.build_item_features(((x['ISBN'], [x['Book-Author']])
                                              for x in get_book_features()))
#print(repr(item_features))


user_features = dataset.build_user_features(((x['User-ID'], [x['Age']])
                                              for x in get_user_features()))
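A hedged final step fitting a hybrid model on everything built above (the loss and epoch count are placeholders):

from lightfm import LightFM

model = LightFM(loss='warp')
model.fit(interactions,
          user_features=user_features,
          item_features=item_features,
          epochs=10, num_threads=4)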
Example #15
qd = VacancyData()

matchings, vacancies, profiles, profilestest = qd.getData()
# Creating a dataset
dataset = Dataset(user_identity_features=False, item_identity_features=False)
dataset.fit((x['ProfielId'] for x in qd.getMatchings()),
            (x['VacatureId'] for x in qd.getMatchings()))

# Check on items and users
num_users, num_items = dataset.interactions_shape()
print('--- Interaction set : Num users: {}, num_items {}. ---'.format(
    num_users, num_items))

# Adding the features in the mix
dataset.fit_partial(
    items=(x['VacatureId'] for x in qd.getVacancies()),
    item_features=(x['Naam'] for x in qd.getVacancies()),
)
'''dataset.fit_partial(items=(x['VacatureId'] for x in qd.getVacancies()),
                    item_features=(x['Taal'] for x in qd.getVacancies()),
                    )

dataset.fit_partial(items=(x['VacatureId'] for x in qd.getVacancies()),
                    item_features=(x['Functie'] for x in qd.getVacancies()),
                    )

dataset.fit_partial(users=(x['Id'] for x in qd.getProfiles()),
                    user_features=(x['Motivatie'] for x in qd.getProfiles())                    
                    )
'''
num_users, num_items = dataset.interactions_shape()
print('--- Total set : Num users: {}, num_items {}. ---'.format(
    num_users, num_items))
Example #16
def init_movielens(path,
                   min_rating=0.0,
                   k=3,
                   item_features=None,
                   cluster_n=18,
                   model='vgg19',
                   test_percentage=0.2):
    valid_item_features = {'genres': 'genres', 'clusters': 'clusters'}
    if item_features is not None:
        assert all(item in valid_item_features.values() for item in item_features), \
            'Your specified item features is invalid. You have to use one or more of this: ' \
            + ', '.join(valid_item_features)

    train_dataset = Dataset()
    test_dataset = Dataset()

    data = dict()
    min_interactions = dict()

    with open(path + '/ratings.csv', 'r') as ratings_file:
        reader = csv.reader(
            ratings_file,
            delimiter=',',
        )
        next(reader)  # skip header

        ratings = []
        users = set()
        items = set()
        for row in reader:
            user_id = int(row[0])
            item_id = int(row[1])

            users.add(user_id)
            items.add(item_id)

            rating = float(row[2])

            if rating >= min_rating:
                ratings.append((user_id, item_id, rating))
                __add_interaction(min_interactions, user_id)

        __info_no_of_min_interactions(
            k, 'No of interactions per user overall ==> ', min_interactions)

        users = list(users)
        items = list(items)

        users_column, items_column, ratings_column = zip(*ratings)
        ratings = sparse.coo_matrix(
            (ratings_column, (users_column, items_column)))

        ratings_train, ratings_test = random_train_test_split(
            ratings,
            test_percentage=test_percentage,
            random_state=np.random.RandomState(7))

        ratings_train_to_count = zip(ratings_train.row, ratings_train.col,
                                     ratings_train.data)
        ratings_train = zip(ratings_train.row, ratings_train.col,
                            ratings_train.data)

        ratings_test_to_count = zip(ratings_test.row, ratings_test.col,
                                    ratings_test.data)
        ratings_test = zip(ratings_test.row, ratings_test.col,
                           ratings_test.data)

        min_interactions = __count_train_test_min_interactions(
            ratings_train_to_count)
        __info_no_of_min_interactions(
            k, 'No of interactions per user on train ==> ', min_interactions)

        min_interactions = __count_train_test_min_interactions(
            ratings_test_to_count)
        __info_no_of_min_interactions(
            k, 'No of interactions per user on test ==> ', min_interactions)

        train_dataset.fit(users=users, items=items)
        test_dataset.fit(users=users, items=items)

        (train_interactions,
         train_weights) = train_dataset.build_interactions(ratings_train)
        (test_interactions,
         test_weights) = test_dataset.build_interactions(ratings_test)

        data.update({'train': train_interactions})
        data.update({'test': test_interactions})
        data.update({'train-mapping': train_dataset.mapping()})

    # add item features
    if item_features is not None:
        aggregated_features = []

        if valid_item_features.get('genres') in item_features:
            movie_genres, genres = __init_movies_genres(path)
            aggregated_features.append(movie_genres)

            train_dataset.fit_partial(item_features=genres)
            test_dataset.fit_partial(item_features=genres)

            train_dataset.fit_partial(items=list(movie_genres.keys()))
            test_dataset.fit_partial(items=list(movie_genres.keys()))

        if valid_item_features.get('clusters') in item_features:
            movies_posters_clusters, clusters = __init_movies_posters_clusters(
                path, cluster_n, model=model)
            aggregated_features.append(movies_posters_clusters)

            train_dataset.fit_partial(item_features=clusters)
            test_dataset.fit_partial(item_features=clusters)

            train_dataset.fit_partial(
                items=list(movies_posters_clusters.keys()))
            test_dataset.fit_partial(
                items=list(movies_posters_clusters.keys()))

        aggregated_features = __aggregate_features(aggregated_features)
        item_features = train_dataset.build_item_features(
            ((movie_id, aggregated_features.get(movie_id))
             for movie_id in aggregated_features.keys()))

        _ = test_dataset.build_item_features(
            ((movie_id, aggregated_features.get(movie_id))
             for movie_id in aggregated_features.keys()))

        data.update({'item_features': item_features})
    else:
        data.update({'item_features': None})

    return data
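A hedged usage sketch for the loader above; the path, rating threshold, and epoch count are placeholders:

from lightfm import LightFM

data = init_movielens('data/ml-latest-small', min_rating=3.5,
                      item_features=['genres'])
model = LightFM(loss='warp')
model.fit(data['train'], item_features=data['item_features'], epochs=10)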
Example #17
def preprocess():
    import pandas as pd
    import math
    import numpy as np 
            
    data_users = pd.read_csv('users_tag.csv',index_col=0)
    data_business = pd.read_csv('business_Nora.csv',index_col=0)
    data_review = pd.read_csv('reviews_cleaned.csv',index_col = 0)        
            
    data_users.review_count = pd.Series([math.log(x+1) for x in data_users.review_count])
    data_users.useful =  pd.Series([math.log(x+1) for x in data_users.useful])  
            
    # clean business skewness
    data_business.review_count =  pd.Series([math.log(x+1) for x in data_business.review_count])        
            
    from lightfm.data import Dataset        
            
    #model establishment
    dataset = Dataset()
    dataset.fit(data_review.user_id, data_review.business_id)
    num_users, num_items = dataset.interactions_shape()
            
    # fit item and user features. 
    dataset.fit_partial(items=data_business.business_id,
                        item_features=['stars'])
            
            
    dataset.fit_partial(items=data_business.business_id,
                        item_features=['review_count'])        
            
    tar_cols = [x for x in data_business.columns[24:]] 
            
    dataset.fit_partial(items = data_business.business_id,
                       item_features = tar_cols)        
            
    user_cols = [x for x in data_users[['review_count', 'useful',
                                       'Ice Cream & Frozen Yogurt', 'Korean', 'Tapas/Small Plates',
           'Vietnamese', 'Vegan', 'Caribbean', 'Food Delivery Services', 'Lounges',
           'Pubs', 'Greek', 'Cocktail Bars', 'Mexican', 'Wine Bars', 'Tea Rooms',
           'Delis', 'Vegetarian', 'Ethnic Food', 'Salad', 'Seafood', 'Beer',
           'American (New)', 'Juice Bars & Smoothies', 'Shopping', 'Barbeque',
           'Sports Bars', 'French', 'Chicken Wings', 'Gastropubs', 'Diners',
           'Gluten-Free', 'Thai', 'Comfort Food', 'Health Markets', 'Halal',
           'Caterers', 'Arts & Entertainment']]]        
            
    dataset.fit_partial(users=data_users.user_id,
                        user_features = user_cols)  
          
    print("Building Interactions")        
    (interactions, weights) = dataset.build_interactions([(x['user_id'],
                                                           x['business_id'],
                                                           x['stars']) for index,x in data_review.iterrows()])   
    print("Interactions Build")        
    # build user and item features
    
    def build_dict(df,tar_cols,val_list):
        rst = {}
        for col in tar_cols:
            rst[col] = df[col]
        sum_val = sum(list(rst.values())) # get sum of all the tfidf values
        
        if(sum_val == 0):
            return rst
        else:
            
            w = (2-sum(val_list))/sum_val # weight for each tag to be able to sum to 1
            for key,value in rst.items():
                rst[key] = value * w
        return rst
    
    # user_build_dict was an exact duplicate of build_dict; alias it instead
    user_build_dict = build_dict
    
    # get max of each column to regularize value to [0,1]
    max_star = max(data_business.stars)
    max_b_rc = max(data_business.review_count)
    print('max_b_rc')
    print(max_b_rc)
    
    # give CF info weight 0.5, all other 0.5. Then in others, give (star, review count) 0.25 and tags 0.25
    item_features = dataset.build_item_features(((x['business_id'], 
                                                  {'stars':0.5*x['stars']/max_star,
                                                   'review_count':0.5*x['review_count']/max_b_rc,
                                                   **build_dict(x,tar_cols,[0.5*x['stars']/max_star,
                                                               0.5*x['review_count']/max_b_rc])})
                                                  for index,x in data_business.iterrows()))
    
    
    # user_features = dataset.build_user_features(((x['user_id'],
    #                                              [x['is_elite'],x['year']])
    #                                            for index, x in data_users.iterrows()))
    max_u_rc = max(data_users.review_count)
    max_useful = max(data_users.useful)
    user_features = dataset.build_user_features(((x['user_id'],
                                                 {'review_count':0.35*x['review_count']/max_u_rc,
                                                  'useful':0.35*x['useful']/max_useful,
                                                 **user_build_dict(x,user_cols,[0.35*x['review_count']/max_u_rc,0.35*x['useful']/max_useful])}) for index, x in data_users.iterrows()))
            
    #train-test split
    
    # seed = 12345 #has multiple seeds set up to account for split biases
    # seed = 101
    # seed = 186
    seed = 123
    from lightfm.cross_validation import random_train_test_split
    train,test=random_train_test_split(interactions,test_percentage=0.2,random_state=np.random.RandomState(seed))
    
    print('The dataset has %s users and %s items, '
          'with %s interactions in the test and %s interactions in the training set.'
          % (train.shape[0], train.shape[1], test.getnnz(), train.getnnz()))
    
    assert train.multiply(test).nnz == 0  # make sure train and test are truly disjoint
    return train,test,data_business,dataset,user_features,item_features   
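A hedged continuation that consumes the returned split:

from lightfm import LightFM
from lightfm.evaluation import precision_at_k

train, test, data_business, dataset, user_features, item_features = preprocess()
model = LightFM(loss='warp')
model.fit(train, user_features=user_features, item_features=item_features,
          epochs=30, num_threads=4)
print('P@10: %.4f' % precision_at_k(
    model, test, train_interactions=train, k=10,
    user_features=user_features, item_features=item_features).mean())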
Example #18
def run_lightfm(ratings, train, test, k_items, dataset):
    def create_interaction_matrix(df,
                                  user_col,
                                  item_col,
                                  rating_col,
                                  norm=False,
                                  threshold=None):
        '''
        Function to create an interaction matrix dataframe from transactional type interactions
        Required Input -
            - df = Pandas DataFrame containing user-item interactions
            - user_col = column name containing user's identifier
            - item_col = column name containing item's identifier
            - rating col = column name containing user feedback on interaction with a given item
            - norm (optional) = True if a normalization of ratings is needed
            - threshold (required if norm = True) = value above which the rating is favorable
        Expected output -
            - Pandas dataframe with user-item interactions ready to be fed in a recommendation algorithm
        '''
        interactions = df.groupby([user_col, item_col])[rating_col] \
                .sum().unstack().reset_index(). \
                fillna(0).set_index(user_col)
        if norm:
            interactions = interactions.applymap(lambda x: 1
                                                 if x > threshold else 0)
        return interactions

    test_interactions = create_interaction_matrix(df=test,
                                                  user_col='userId',
                                                  item_col='movieId',
                                                  rating_col='rating')

    budget_l = dataset.budget.unique().tolist()
    gross_l = dataset.gross.unique().tolist()
    awards_l = dataset.awards.unique().tolist()
    nom_l = dataset.nominations.unique().tolist()
    votes_l = dataset.votes.unique().tolist()
    item_ids = np.unique(train.movieId.astype(int))
    print(f'length dataset: {len(dataset)}')
    dataset = dataset[dataset.movieId.isin(item_ids)]
    print(f'length dataset: {len(dataset)}')
    item_features_list = [f'rating_{f}' for f in range(11)]
    gen = [
        'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime',
        'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX',
        'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'
    ]  # 'unknown' add unknown for movielens100k
    item_features_list += gen
    item_features_list += budget_l
    item_features_list += gross_l
    item_features_list += awards_l
    item_features_list += nom_l
    item_features_list += votes_l
    item_features = []
    for y, x in dataset.iterrows():
        genres = x['genres']
        tmp_row = (int(x['movieId']), [
            x['rating'], x['budget'], x['gross'], x['awards'],
            x['nominations'], x['votes']
        ])
        for g in genres:
            tmp_row[1].append(g)
        item_features.append(tmp_row)
    #item_features = [(int(x['movieId']), [x['rating'], z, x['budget'], x['gross'], x['awards'], x['votes']]) for y, x in dataset.iterrows() for z in x['genres']] #x['nominations']
    user_ids = np.unique(train.userId)
    built_dif = Dataset()
    built_dif.fit_partial(users=user_ids)
    built_dif.fit_partial(items=item_ids)
    built_dif.fit_partial(item_features=item_features_list)
    dataset_item_features = built_dif.build_item_features(item_features)
    (interactions, weights) = built_dif.build_interactions(
        ((int(x['userId']), int(x['movieId'])) for y, x in train.iterrows()))
    modelx = LightFM(no_components=30, loss='bpr', k=15, random_state=1)
    modelx.fit(interactions,
               epochs=30,
               num_threads=4,
               item_features=dataset_item_features
               )  #item_features=dataset_item_features
    test = sparse.csr_matrix(test_interactions.values)
    test = test.tocoo()
    num_users, num_items = built_dif.interactions_shape()
    print('Num users: {}, num_items {}.'.format(num_users, num_items))

    prec_list = dict()
    rec_list = dict()

    for num_k in k_items:
        # note: despite the variable names, these metrics are computed on `test`
        trainprecision = precision_at_k(
            modelx, test, k=num_k,
            item_features=dataset_item_features).mean()
        print('Hybrid test set precision: %s' % trainprecision)
        trainrecall = recall_at_k(modelx,
                                  test,
                                  k=num_k,
                                  item_features=dataset_item_features).mean()
        print('Hybrid test set recall: %s' % trainrecall)
        # each num_k occurs once per call, so a plain assignment suffices;
        # the original append-on-scalar branch would have raised if it ever ran
        prec_list[num_k] = trainprecision
        rec_list[num_k] = trainrecall

    return prec_list, rec_list
Example #19
def main():
    #     n = len(sys.argv)
    #     if n > 0:
    #         f = sys.argv[0]
    #     else:
    #         f = 'new_sample.csv'

    # Start imports from s3
    bucket_name = 'forumrecbucket'
    samplecsv_key = 'new_sample.csv'
    pickle_key = 'savefile.pickle'
    item_features_key = 'item_features.npz'
    post_mappings_key = 'post_mappings.csv'

    client = boto3.client(
        's3')  #, aws_access_key_id=aws_id, aws_secret_access_key=aws_secret)
    csv_obj = client.get_object(
        Bucket=bucket_name, Key=samplecsv_key)['Body'].read().decode('utf-8')
    new = pd.read_csv(StringIO(csv_obj))

    s3 = S3FileSystem()
    user_indicies_key = 'user_indicies.npy'
    post_indicies_key = 'post_indicies.npy'

    user_indicies = np.load(
        s3.open('{}/{}'.format(bucket_name, user_indicies_key)))
    post_indicies = np.load(
        s3.open('{}/{}'.format(bucket_name, post_indicies_key)))
    post_mappings_obj = client.get_object(
        Bucket=bucket_name,
        Key=post_mappings_key)['Body'].read().decode('utf-8')
    post_mappings = pd.read_csv(StringIO(post_mappings_obj))

    post_mappings.columns = ['ParentId', 'post_indicies']
    post_mappings.index = post_mappings['ParentId']
    post_mappings = post_mappings['post_indicies']
    post_ind = lambda x: post_mappings.loc[x]

    model_client = client.get_object(Bucket=bucket_name,
                                     Key=pickle_key)['Body'].read()
    model = pickle.loads(model_client)
    print('user_indicies length:  ', len(user_indicies))
    print('post_indicies length:  ', len(post_indicies))
    # item_features_npz = client.get_object(Bucket=bucket_name, Key=item_features_key)['Body'].read()
    # item_features_npz = csr_matrix(item_features_npz)
    # user_indicies = np.load('user_indicies.npy')
    # print(max(user_indicies))
    # post_indicies = np.load('post_indicies.npy')
    # print(max(post_indicies))
    # model = pickle.load(open("savefile.pickle", "rb"))
    dataset = Dataset()
    dataset.fit((x for x in user_indicies), (x for x in post_indicies))
    dummies = range(max(user_indicies) + 1, 876)
    dataset.fit_partial((x for x in dummies))
    print(dataset.interactions_shape())
    # new = pd.read_csv(f)
    new['post_indicies'] = new['ParentId'].apply(post_ind)
    new_user_indicies = dict()
    for i in range(len(new.OwnerUserId.unique())):
        new_user_indicies[new.OwnerUserId.unique()[i]] = dummies[i]
    new['user_indicies'] = new.OwnerUserId.apply(
        lambda x: new_user_indicies[x])
    print(new['user_indicies'].values)
    #user_indicies = np.append(user_indicies, new.user_indicies.unique())
    #######
    #np.save('user_indicies.npy', user_indicies)
    #######
    new = new[[
        'user_indicies', 'post_indicies', 'Score', 'OwnerUserId', 'ParentId'
    ]]
    dataset.fit_partial((x for x in new.user_indicies.values),
                        (x for x in new.post_indicies.values))
    (new_interactions, new_weights) = dataset.build_interactions(
        ((x[0], x[1], x[2]) for x in new.values))
    print(new_interactions.shape)
    #interactions = sparse.load_npz("interactions.npz")
    item_features = sparse.load_npz("item_features.npz")
    print(item_features.shape)
    # item_features = sparse.load_npz(item_features_npz)
    for i in new.user_indicies.unique():
        print(i, 'mean user embedding before refitting :',
              np.mean(model.user_embeddings[i]))
    print(new_interactions.shape)
    model = model.fit_partial(new_interactions,
                              item_features=item_features,
                              sample_weight=new_weights,
                              epochs=10,
                              verbose=True)
    for i in new.user_indicies.unique():
        print(i, 'mean user embedding after refitting:',
              np.mean(model.user_embeddings[i]))

    nq = pd.read_csv('new_questions.csv')

    s3_resource = boto3.resource('s3')

    for i in new.user_indicies.unique():
        scores = pd.Series(
            model.predict(int(i),
                          nq.post_indicies.values,
                          item_features=item_features))
        temp = nq.copy()
        temp['reccomendation'] = scores.values

        # use a fresh buffer per user so earlier rows are not re-uploaded;
        # note that each iteration still overwrites the same S3 key
        csv_buffer = StringIO()
        temp.to_csv(csv_buffer, index=False)
        s3_resource.Object(bucket_name,
                           'new_recs.csv').put(Body=csv_buffer.getvalue())

    # with open('savefile.pickle', 'wb') as fle:
    #     pickle.dump(model, fle, protocol=pickle.HIGHEST_PROTOCOL)

    s3_resource.Object(bucket_name, pickle_key).put(
        Body=pickle.dumps(model))  #, protocol=pickle.HIGHEST_PROTOCOL))
Example #20
def lightfm_node(X1_train, X2_train, X1_test, X2_test):
    X2 = pd.concat([X2_train, X2_test])
    X1 = pd.concat([X1_train, X1_test]).set_index('id')

    X1.columns = ['X1_' + i for i in X1.columns]

    # qcut bin edges must reach 1.0, otherwise values above the 90th
    # percentile fall outside the last bin and become NaN
    X1['X1_5'] = pd.qcut(X1['X1_5'], np.linspace(0, 1, 11), duplicates='drop')
    X1['X1_8'] = pd.qcut(X1['X1_8'], np.linspace(0, 1, 11), duplicates='drop')
    X1['X1_6'] = pd.qcut(X1['X1_6'], np.linspace(0, 1, 11), duplicates='drop')

    for col in ['X1_6', 'X1_8', 'X1_5', 'X1_1', 'X1_13']:
        X1[col] = X1[col].map(lambda x: '{' + col + '}_{' + str(x) + '}')

    X1 = X1.reset_index()

    from lightfm.data import Dataset
    dataset = Dataset()
    dataset.fit(users=(x for x in X2['id']), items=(x for x in X2['A']))

    dataset.fit_partial(users=(x for x in X1['id']),
                        user_features=(x for x in X1['X1_1']))
    dataset.fit_partial(users=(x for x in X1['id']),
                        user_features=(x for x in X1['X1_13']))
    dataset.fit_partial(users=(x for x in X1['id']),
                        user_features=(x for x in X1['X1_5']))
    dataset.fit_partial(users=(x for x in X1['id']),
                        user_features=(x for x in X1['X1_8']))
    dataset.fit_partial(users=(x for x in X1['id']),
                        user_features=(x for x in X1['X1_6']))

    user_features = dataset.build_user_features(
        [(x[1]['id'], x[1][['X1_1', 'X1_13', 'X1_5', 'X1_8', 'X1_6'
                            ]].values.tolist()) for x in X1.iterrows()],
        normalize=True)

    (interactions,
     weights) = dataset.build_interactions(zip(X2['id'], X2['A']))

    model = LightFM(no_components=32,
                    learning_rate=0.04,
                    loss='bpr',
                    max_sampled=55,
                    random_state=0)
    num_epochs = 20
    for _ in range(num_epochs):  # each fit_partial call runs one training epoch
        model.fit_partial(interactions, user_features=user_features)

    (users_mapping, user_features_mapping,
     assets_mapping, asset_features_mapping) = dataset.mapping()
    user_features_mapping_inv = {
        j: i
        for i, j in user_features_mapping.items()
    }

    tag_embeddings = (model.user_embeddings.T /
                      np.linalg.norm(model.user_embeddings, axis=1)).T

    lightfm_embed = pd.DataFrame(tag_embeddings[:len(users_mapping)],
                                 index=X1['id'])

    return lightfm_embed
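user_features_mapping_inv is built above but never used; a hedged sketch (assuming the locals of lightfm_node) of what it enables: looking up the feature tags nearest to a given embedding row by cosine similarity.

def similar_tags(row_idx, topn=5):
    # rows of tag_embeddings are L2-normalized, so a dot product is cosine similarity
    sims = tag_embeddings @ tag_embeddings[row_idx]
    best = np.argsort(-sims)[1:topn + 1]  # skip the row itself
    return [user_features_mapping_inv[i] for i in best]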
Example #21
def lambda_handler(event, context):
    try:
        ## Fetch data from RDS code
        connection = pymysql.connect(
            host='fitbookdb.crm91a2epcbi.us-east-1.rds.amazonaws.com',
            user='******',
            passwd='postgres',
            db='fitbookdb',
            cursorclass=pymysql.cursors.DictCursor)

        print("Connection successful")
    except:
        print("Connection error")
        raise  # without a connection the rest of the handler cannot run

    # In[3]:

    #Get Food DataFrame
    dict_list = []

    with connection.cursor() as cur:
        cur.execute("select * from food_dataset")
        for row in cur:
            dict_list.append(row)

    food_rds_df = pd.DataFrame(dict_list)
    food_df = food_rds_df.copy()
    food_df.drop([
        'Portion_Default', 'Portion_Amount', 'Factor', 'Increment',
        'Multiplier', 'Portion_Display_Name', 'Food_Code', 'Display_Name'
    ],
                 axis=1,
                 inplace=True)
    # food_df.head()
    print('Food Dataframe imported')

    # In[4]:

    # # TODO: Perform Binning
    # food_30_bins = ['Alcohol', 'Calories', 'Saturated_Fats']
    # for each_column in food_30_bins:
    #     bins = np.linspace(food_df[each_column].min(), food_df[each_column].max(), 30)
    #     food_df[each_column+'bin'] = pd.cut(food_df[each_column], bins, labels=np.arange(0,len(bins)-1))
    # food_df

    # In[5]:

    # for each_column in food_30_bins:
    #     print(food_df[each_column].min())

    # In[6]:

    #Get User Dataframe
    # user_df = pd.read_csv('user_db_try.csv')
    # user_df.head()

    dict_list = []

    with connection.cursor() as cur:
        cur.execute("select * from tblUserData")
        for row in cur:
            dict_list.append(row)

    user_rds_df = pd.DataFrame(dict_list)
    user_df = user_rds_df.copy()
    user_df.drop([
        'cognitoAccessToken', 'cognitoIDToken', 'cognitoRefreshToken',
        'fitbitAccessToken', 'fitbitUserID', 'userName'
    ],
                 axis=1,
                 inplace=True)
    # user_df.head()

    print('User Dataframe imported')

    # In[7]:

    #Get userItem DataFrame
    # userItem_df = pd.read_csv('userItem_db_try_new.csv')
    # userItem_df.head()

    dict_list = []

    with connection.cursor() as cur:
        cur.execute("select * from tblUserRating")
        for row in cur:
            dict_list.append(row)

    userItem_rds_df = pd.DataFrame(dict_list)
    userItem_df = userItem_rds_df.copy()
    # userItem_df.head()
    print('UserItem Dataframe imported')

    # In[8]:

    #Make all the feature values unique
    for column_name in food_df.columns:
        if column_name != 'food_ID':
            food_df[column_name] = str(
                column_name) + ":" + food_df[column_name].astype(str)
    # food_df.head()

    # In[9]:

    # This dict will be useful when creating tuples
    food_features_df = food_df.drop(['food_ID'], axis=1).copy()
    food_features_dict = food_features_df.to_dict('split')
    # food_features_dict

    # In[10]:

    food_feature_values = []

    for column_name in food_features_df.columns:
        food_feature_values.extend(food_features_df[column_name].unique())

    # food_feature_values

    # In[11]:

    for column_name in user_df.columns:
        if column_name != 'userID':
            user_df[column_name] = str(
                column_name) + ":" + user_df[column_name].astype(str)

    user_features_df = user_df.drop(['userID'], axis=1).copy()

    user_features_dict = user_features_df.to_dict('split')
    # user_features_dict

    # In[12]:

    user_feature_values = []

    for column_name in user_features_df.columns:
        user_feature_values.extend(user_features_df[column_name].unique())

    # user_feature_values

    # In[13]:

    user_tuples = []
    food_tuples = []

    for index, row in user_df.iterrows():
        user_tuples.append((row['userID'], user_features_dict['data'][index]))

    for index, row in food_df.iterrows():
        food_tuples.append((row['food_ID'], food_features_dict['data'][index]))

    # food_tuples

    # In[14]:

    print("Creating LightFm dataset")
    dataset = Dataset()
    dataset.fit(users=(user_id for user_id in user_df['userID']),
                items=(food_id for food_id in food_df['food_ID']))

    print("Dataset Created")
    # In[15]:

    num_users, num_items = dataset.interactions_shape()
    print('Num users: {}, num_items {}.'.format(num_users, num_items))

    # In[16]:

    # dataset.fit_partial(items=(food_id for food_id in food_df['Food_Code']),
    #                            item_features=((each_feature for each_feature in food_features)for food_features in food_features_dict['data']))

    # In[17]:

    # dataset.fit_partial(items=(food_id for food_id in food_df['Food_Code']),
    #                            item_features=((row['Milk'], row['Meats'], row['Alcohol'], row['Calories'])for index,row in food_df.iterrows()))

    # In[18]:

    print("fittng item partial features")
    dataset.fit_partial(items=(food_id for food_id in food_df['food_ID']),
                        item_features=(each_value
                                       for each_value in food_feature_values))

    # In[19]:

    # dataset.fit_partial(users=(user_id for user_id in user_df['Id']),
    #                     user_features=((each_feature for each_feature in user_features)for user_features in user_features_dict['data']))

    # In[20]:
    print("fittng user partial features")

    dataset.fit_partial(users=(user_id for user_id in user_df['userID']),
                        user_features=(each_value
                                       for each_value in user_feature_values))

    # In[21]:

    # dataset.item_features_shape()
    # dataset.user_features_shape()

    # In[22]:

    print("Building Interactions")
    (interactions, weights) = dataset.build_interactions(
        ((x['userID'], x['food_ID'], x['rating'])
         for y, x in userItem_df.iterrows()))

    # print(repr(interactions))
    # print(weights)

    # In[23]:

    # interactions.shape

    # In[24]:

    print("Building item features")
    item_features = dataset.build_item_features(each_tuple
                                                for each_tuple in food_tuples)
    # print(item_features)

    # In[25]:

    user_features = dataset.build_user_features(each_tuple
                                                for each_tuple in user_tuples)
    # print(user_features)

    # In[26]:

    print("Fitting Model")
    model = LightFM(loss='warp')
    model.fit(interactions,
              item_features=item_features,
              user_features=user_features)

    print("Model trained!!")

    print("Pickle started!!")
    pickle.dump(model, open("/tmp/model.pkl", 'wb'), protocol=2)

    bucketName = "fitbook-lambda-packages"
    localPath = "/tmp/model.pkl"  # local file to upload
    outPutname = "model.pkl"      # S3 object key

    print("Uploading to S3")
    s3 = boto3.client('s3')
    s3.upload_file(localPath, bucketName, outPutname)
    print("Upload done")
    os.remove("/tmp/model.pkl")

    print("Pickle file deleted")
    print("Successssss!!!!!")
Example #22
# id_isbn (an object with _user_id and _isbn lists) and a counter k = 0 are
# assumed to be defined above this truncated snippet
for x in get_ratings():
    if k==5000:
        break
    id_isbn._user_id.append(x['user_id'])
    id_isbn._isbn.append(x['book_id'])
    k+=1
    
# print(id_isbn._user_id)
dataset = Dataset()
dataset.fit(id_isbn._user_id,
            id_isbn._isbn)

num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))

dataset.fit_partial(items=(x['book_id'] for x in get_book_features()),
                    item_features=(x['authors'] for x in get_book_features()))


(interactions, weights) = dataset.build_interactions(
    (id_isbn._user_id[i], id_isbn._isbn[i])
    for i in range(len(id_isbn._user_id)))  # len() guards against < 5000 ratings

item_features = dataset.build_item_features(((x['book_id'], [x['authors']])
                                              for x in get_book_features()))

print(item_features.shape)
print(interactions.shape)
# print(weights)

#################################
#                               #
#       Training the Model      #
#                               #