Example #1
class DataFit:
    def __init__(self):
        self.dataset = None

    def fit(self):
        book_list = DataPrep.get_book_list()
        book_feature_list = DataPrep.get_feature_list()
        user_list = DataPrep.get_user_list()
        self.dataset = Dataset()
        self.dataset.fit(users=user_list,
                         items=book_list,
                         item_features=book_feature_list)

        rating_list = DataPrep.get_rating_list()
        interactions, weights = self.dataset.build_interactions(rating_list)

        book_features = DataPrep.create_features()
        books_features = self.dataset.build_item_features(book_features)
        return interactions, weights, books_features

    def create_new_interactions(self, checkpoint):
        rating_list = DataPrep.get_rating_list_from_checkpoint(checkpoint)
        interactions, weights = self.dataset.build_interactions(rating_list)
        return interactions, weights

    def get_user_mapping(self):
        user_id_map, user_feature_map, item_id_map, item_feature_map = self.dataset.mapping(
        )
        return user_id_map

    def get_book_mapping(self):
        user_id_map, user_feature_map, item_id_map, item_feature_map = self.dataset.mapping(
        )
        return item_id_map

    @staticmethod
    def fit_evaluate(test_percentage=0.1):
        book_list = DataPrep.get_book_list()
        book_feature_list = DataPrep.get_feature_list()
        user_list = DataPrep.get_user_list()
        dataset = Dataset()
        dataset.fit(users=user_list,
                    items=book_list,
                    item_features=book_feature_list)

        rating_list = DataPrep.get_rating_list()
        random.shuffle(rating_list)
        rating_list_test = rating_list[:int(test_percentage *
                                            len(rating_list))]
        rating_list_train = rating_list[int(test_percentage *
                                            len(rating_list)):]
        interactions_train, weights_train = dataset.build_interactions(
            rating_list_train)
        interactions_test, weights_test = dataset.build_interactions(
            rating_list_test)

        return interactions_train, weights_train, interactions_test, weights_test
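
# A minimal usage sketch (an assumption, not part of the original class): the
# matrices returned by DataFit.fit() feed straight into a LightFM model.
from lightfm import LightFM

data_fit = DataFit()
interactions, weights, book_features = data_fit.fit()
model = LightFM(loss='warp')
model.fit(interactions, sample_weight=weights,
          item_features=book_features, epochs=10)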
Example #2
    def obtener_matrices(self):
        """
        Método obtener_matrices. Obtiene las matrices necesarias para la creación de los modelos de LightFM.

        Este método solo se utiliza en la interfaz de texto.
        """

        global train, test, modelo, item_features, user_features

        # Fetch the dataframes
        Entrada.obtener_datos()
        ratings_df = Entrada.ratings_df
        users_df = Entrada.users_df
        items_df = Entrada.items_df

        # Transform the dataframes into matrices the models can consume
        dataset = Dataset()
        dataset.fit(users_df[users_df.columns.values[0]],
                    items_df[items_df.columns.values[0]],
                    user_features=users_df[users_df.columns.values[1]],
                    item_features=items_df[items_df.columns.values[1]])

        # For collaborative or hybrid models, take the users' ratings into account
        if self.opcion_modelo == 1 or self.opcion_modelo == 2:
            (interacciones, pesos) = dataset.build_interactions(
                (row[ratings_df.columns.values[0]],
                 row[ratings_df.columns.values[1]],
                 row[ratings_df.columns.values[2]])
                for index, row in ratings_df.iterrows())
        else:
            (interacciones, pesos) = dataset.build_interactions(
                (row[ratings_df.columns.values[0]],
                 row[ratings_df.columns.values[1]])
                for index, row in ratings_df.iterrows())

        # Build the feature matrices and persist them
        item_features = dataset.build_item_features(
            (row[items_df.columns.values[0]],
             [row[items_df.columns.values[1]]])
            for index, row in items_df.iterrows())
        user_features = dataset.build_user_features(
            (row[users_df.columns.values[0]],
             [row[users_df.columns.values[1]]])
            for index, row in users_df.iterrows())
        print("Guarda la matriz de item features")
        guardar_datos_pickle(item_features, 'la matriz de item features')
        print("Guarda la matriz de user features")
        guardar_datos_pickle(user_features, 'la matriz de user feautures')

        # Split the interactions into train and test sets and persist them
        train, test = random_train_test_split(interacciones,
                                              test_percentage=0.2)
        print("Saving the train matrix")
        guardar_datos_pickle(train, 'the train matrix')
        print("Saving the test matrix")
        guardar_datos_pickle(test, 'the test matrix')
Example #3
def evaluate_model(df,
                   user_id_col='user_id',
                   item_id_col='business_id',
                   stratify=None):
    """ Model evaluation.

    Args:
        df: the input dataframe.
        user_id_col: user id column.
        item_id_col: item id column.
        stratify: if use stratification.

    Returns:
        train_auc: training set auc score.
        test_auc: testing set auc score.

    """
    # model evaluation
    # create test and train datasets
    print('model evaluation')
    train, test = train_test_split(df, test_size=0.2, stratify=stratify)
    ds = Dataset()

    # we call fit to supply userid, item id and user/item features
    ds.fit(
        df[user_id_col].unique(),  # all the users
        df[item_id_col].unique(),  # all the items
    )

    # plugging in the interactions
    (train_interactions, train_weights) = ds.build_interactions([
        (x[0], x[1], x[2]) for x in train.values
    ])
    (test_interactions, _) = ds.build_interactions([(x[0], x[1], x[2])
                                                    for x in test.values])
    # model
    model = LightFM(no_components=100,
                    learning_rate=0.05,
                    loss='warp',
                    max_sampled=50)
    model.fit(train_interactions,
              sample_weight=train_weights,
              epochs=10,
              num_threads=10)

    # auc-roc
    train_auc = auc_score(model, train_interactions, num_threads=20).mean()
    print('Training set AUC: %s' % train_auc)
    test_auc = auc_score(model, test_interactions, num_threads=20).mean()
    print('Testing set AUC: %s' % test_auc)
    return train_auc, test_auc
Example #4
    def interactions(self):
        # If interactions have not been supplied, process the file provided in source
        # N.B. This property also sets weights, which is probably not a best practice
        if self._interactions is None:

            if self._category == 'ratings_matrix':
                rm_df = pd.read_csv(self.path)
                ids = rm_df['sub']
                rm_df = rm_df.set_index(keys='sub')
                if 'Unnamed: 0' in rm_df.columns:
                    rm_df.drop('Unnamed: 0', axis=1, inplace=True)
                dataset = Dataset()
                dataset.fit(list(ids), list(rm_df.columns))
                self.mapping = dataset.mapping()

                interactions = []

                for item in rm_df.columns.tolist():
                    users = rm_df.index[rm_df[item] >= 1].tolist()
                    counts = rm_df[item][rm_df[item] >= 1]
                    interactions.extend(
                        zip(users, itertools.repeat(item, len(users)), counts))

                (self._interactions,
                 self._weights) = dataset.build_interactions(interactions)

            else:
                int_df = pd.read_csv(self.path)
                if 'Unnamed: 0' in int_df.columns:
                    int_df.drop('Unnamed: 0', axis=1, inplace=True)
                int_df = int_df.groupby(['subscriber_id', 'ddi_block_id']).size().reset_index()\
                    .rename(columns={0:'count'})
                dataset = Dataset()
                ids = int_df['subscriber_id'].unique()
                items = int_df['ddi_block_id'].unique()
                dataset.fit(list(ids), list(items))
                self.mapping = dataset.mapping()

                if self._use_weights:
                    interactions = zip(int_df['subscriber_id'],
                                       int_df['ddi_block_id'], int_df['count'])
                else:
                    interactions = zip(int_df['subscriber_id'],
                                       int_df['ddi_block_id'])
                (self._interactions,
                 self._weights) = dataset.build_interactions(interactions)

        return self._interactions
Example #5
def peuimportelenom():

    noms = request.form.getlist("dblst_artists")
    sugg = []

    for el in noms:
        artiste = ap[ap.name == el]
        lind = list(artiste.artistID)[0] - 1
        vecteur[lind] = artiste.playCountScaled.median()

    # build the matrix
    X = np.vstack((ratings, vecteur))

    # code imported from the Jupyter notebook
    n_users, n_items = X.shape

    Xcsr = csr_matrix(X)
    Xcoo = Xcsr.tocoo()
    data = Dataset()
    data.fit(np.arange(n_users), np.arange(n_items))
    interactions, weights = data.build_interactions(zip(Xcoo.row, Xcoo.col, Xcoo.data)) 
    train, test = random_train_test_split(interactions)

    model = LightFM(learning_rate=0.05, loss='warp')
    model.fit(train, epochs=10, num_threads=2)

    # score every item for the newly added user (the last row of X)
    scores = model.predict(n_users - 1, np.arange(n_items))
    top_items = ap["name"].unique()[np.argsort(-scores)]

    sugg = top_items[:10]

    return render_template("page.html", artist_names=artist_names, noms=noms, sugg=sugg)
Example #6
def create_dataset(df, item_features, list_item_features):
    """
	function to create the dataset based on df which stores all the data including
	features (tags) of each products
	Args: df(pandas dataframe) - 
	"""
    ## create a mapping between the user and item ids from our input data
    #to indices that will be used internally by the model
    dataset = Dataset(item_identity_features=True)
    list_user_names = list(df.index)
    list_items = df.columns.values

    dataset.fit(
        (user_name for user_name in list_user_names),
        (item for item in list_items),
        item_features=(item_feature for item_feature in list_item_features))

    ## Build the interaction matrix
    # it encodes the interactions between users and items.
    # need (user, item) pair that has 1's in df
    list_pairs = list(df.stack().index)
    (interactions, weights) = dataset.build_interactions(
        (pair for pair in list_pairs))

    item_feature_matrix = dataset.build_item_features(item_features)

    return dataset, interactions, weights, item_feature_matrix
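
# A possible follow-up (hedged sketch; assumes df has users on the index,
# items on the columns, and NaN where there is no interaction):
from lightfm import LightFM

dataset, interactions, weights, item_feature_matrix = create_dataset(
    df, item_features, list_item_features)
model = LightFM(loss='warp')
model.fit(interactions, item_features=item_feature_matrix, epochs=10)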
Example #7
def interactions(df):
    movie_genre = [x.split("|") for x in df["genre"]]
    all_movie_genre = sorted(
        list(set(itertools.chain.from_iterable(movie_genre))))

    all_occupations = sorted(list(set(df["occupation"])))

    dataset = Dataset()
    dataset.fit(
        df["userID"],
        df["itemID"],
        item_features=all_movie_genre,
        user_features=all_occupations,
    )

    item_features = dataset.build_item_features(
        (x, y) for x, y in zip(df.itemID, movie_genre))

    user_features = dataset.build_user_features(
        (x, [y]) for x, y in zip(df.userID, df["occupation"]))

    (interactions, _) = dataset.build_interactions(df.iloc[:, 0:3].values)

    train_interactions, test_interactions = cross_validation.random_train_test_split(
        interactions,
        test_percentage=TEST_PERCENTAGE,
        random_state=np.random.RandomState(SEEDNO),
    )
    return train_interactions, test_interactions, item_features, user_features
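
# One way the outputs might be consumed (a sketch; df, TEST_PERCENTAGE and
# SEEDNO are assumed to come from the surrounding module):
from lightfm import LightFM
from lightfm.evaluation import auc_score

train_interactions, test_interactions, item_features, user_features = interactions(df)
model = LightFM(loss='warp')
model.fit(train_interactions, item_features=item_features,
          user_features=user_features, epochs=10)
print(auc_score(model, test_interactions, item_features=item_features,
                user_features=user_features).mean())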
Example #8
def prepareData(df, tags):
    df = df[df.actionCategory == "WebNei clicked"]
    actionByUsers = df.groupby(["userName", "actionName"]).size()
    uniqueUsers = df[df.userName.isin(
        actionByUsers.index.get_level_values(
            0).unique().values)].drop_duplicates('userName')
    uniqueUsers['user_features'] = uniqueUsers[[
        'title', 'team', 'organization', 'department'
    ]].values.tolist()
    dataset = Dataset()
    dataset.fit((list(actionByUsers.index.get_level_values(0))),
                (list(actionByUsers.index.get_level_values(1))))

    rowM, colM = prepareJson(tags)
    rowU, colU = prepareUserFeatures(uniqueUsers)

    dataset.fit_partial(items=rowM,
                        item_features=colM,
                        users=rowU,
                        user_features=colU)

    (interactions, weights) = dataset.build_interactions(
        zip(list(actionByUsers.index.get_level_values(0)),
            list(actionByUsers.index.get_level_values(1))))
    item_features = dataset.build_item_features(zip(rowM, [colM]))
    user_features = dataset.build_user_features(zip(rowU, [colU]))
    return interactions, item_features, user_features
Example #9
def train_model(df,
                user_id_col='user_id',
                item_id_col='business_id',
                item_name_col='name_business',
                evaluate=True):
    """Train the model using collaborative filtering.

    Args:
        df: the input dataframe.
        user_id_col: user id column.
        item_id_col: item id column.
        item_name_col: item name column.
        evaluate: if evaluate the model performance.

    Returns:
        model_full: the trained model.
        df_interactions: dataframe with user-item interactions.
        user_dict: user dictionary containing user_id as
            key and interaction_index as value.
        item_dict: item dictionary containing item_id
            as key and item_name as value.

    """
    if evaluate:
        print('Evaluating model...')
        evaluate_model(df, user_id_col='user_id', item_id_col='business_id')

    print('Training model...')
    # build recommendations for known users and known businesses
    # with collaborative filtering method
    ds_full = Dataset()
    # we call fit to supply userid, item id and user/item features
    ds_full.fit(
        df[user_id_col].unique(),  # all the users
        df[item_id_col].unique(),  # all the items
    )
    (interactions, weights) = ds_full.build_interactions([(x[0], x[1], x[2])
                                                          for x in df.values])
    # model
    model_full = LightFM(no_components=100,
                         learning_rate=0.05,
                         loss='warp',
                         max_sampled=50)
    model_full.fit(interactions,
                   sample_weight=weights,
                   epochs=10,
                   num_threads=10)
    # mapping
    user_id_map, _, business_id_map, _ = ds_full.mapping()

    # data preparation
    df_interactions = pd.DataFrame(weights.todense())
    df_interactions.index = list(user_id_map.keys())
    df_interactions.columns = list(business_id_map.keys())
    user_dict = user_id_map
    item_dict = df.set_index(item_id_col)[item_name_col].to_dict()
    return model_full, df_interactions, user_dict, item_dict
Example #11
def test_fitting_no_identity():

    users, items = 10, 100

    dataset = Dataset(user_identity_features=False, item_identity_features=False)
    dataset.fit(range(users), range(items))

    assert dataset.interactions_shape() == (users, items)
    assert dataset.user_features_shape() == (users, 0)
    assert dataset.item_features_shape() == (items, 0)

    assert dataset.build_interactions([])[0].shape == (users, items)
    assert dataset.build_user_features([], normalize=False).getnnz() == 0
    assert dataset.build_item_features([], normalize=False).getnnz() == 0
Example #12
def create_recommender():
	# obtain interaction table from dynamodb, which is json data
	dynamodb = boto3.resource('dynamodb')
	table = dynamodb.Table('eye_video_vote')
	response = table.scan()
	raw_data = response['Items']

	# transform JSON-structured data into user-item-rating interaction format
	final_df = pd.DataFrame(columns=['userId', 'videoId', 'rating'])

	for i in raw_data:
		if any('upVote' in s for s in list(i.keys())):
			df1 = {k: i[k] for k in ('upVote', 'videoId')}
			df1['videoId'] = {df1['videoId']}
			df1 = pd.DataFrame.from_dict(df1, orient='index').T
			df1['rating'] = randint(4, 5)
			df1.fillna(value=np.nan, inplace=True)
			df1 = df1.fillna(method='ffill')
			df1.rename(columns={'upVote': 'userId'}, inplace=True)
			final_df = pd.concat([final_df, df1])
		if any('downVote' in s for s in list(i.keys())):
			df2 = {k: i[k] for k in ('downVote', 'videoId')}
			df2['videoId'] = {df2['videoId']}
			df2 = pd.DataFrame.from_dict(df2, orient='index').T
			df2['rating'] = randint(1, 2)
			df2.fillna(value=np.nan, inplace=True)
			df2 = df2.fillna(method='ffill')
			df2.rename(columns={'downVote': 'userId'}, inplace=True)
			final_df = pd.concat([final_df, df2])
	
	
	#rename the columns 
	final_df.rename(columns={'userId':'UserID', 'videoId':'MovieID', 'rating':'rating'}, inplace=True)
		
	#generate the appropriate lightfm dataset
	dataset = Dataset()
	dataset.fit(users = (row['UserID'] for index,row in final_df.iterrows()),
				items = (row['MovieID'] for index,row in final_df.iterrows()))
	
	(interactions, weights) = dataset.build_interactions((row['UserID'],row['MovieID'],row['rating']) for index,row in final_df.iterrows())
	
	#model collabrative filtering
	model_cf = LightFM(no_components=20, loss='warp')
	model_cf.fit(interactions, user_features=None, item_features=None, sample_weight=None, epochs=20, num_threads=4)
	with open('model_cf.pickle', 'wb') as fle:
		pickle.dump(model_cf, fle, protocol=pickle.HIGHEST_PROTOCOL)
	
	return 
Example #13
def test_fitting():

    users, items = 10, 100

    dataset = Dataset()
    dataset.fit(range(users), range(items))

    assert dataset.interactions_shape() == (users, items)
    assert dataset.user_features_shape() == (users, users)
    assert dataset.item_features_shape() == (items, items)

    assert dataset.build_interactions([])[0].shape == (users, items)
    assert dataset.build_user_features([]).getnnz() == users
    assert dataset.build_item_features([]).getnnz() == items
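
# A small contrasting sketch (assumed values, not from the original tests):
# with identity features enabled (the default), each user and item also gets
# its own indicator column next to any explicit features.
dataset = Dataset()
dataset.fit(range(10), range(100),
            user_features=['age'], item_features=['genre'])
assert dataset.user_features_shape() == (10, 10 + 1)
assert dataset.item_features_shape() == (100, 100 + 1)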
Example #16
    def build_lightfm_dataset(self) -> None:
        """
        Builds final datasets for user-variant and variant-variant recommendations.
        """
        logging.info("Creating LightFM matrices...")
        lightfm_dataset = LFMDataset()
        ratings_list = self.interaction_list
        logging.info('#'*60)
        lightfm_dataset.fit_partial(
            (rating['user_id'] for rating in ratings_list),
            (rating['product_id'] for rating in ratings_list)
        )

        item_feature_names = self.item_df.columns
        logging.info(f'Logging item_feature_names - with product_id: \n{item_feature_names}')
        item_feature_names = item_feature_names[~item_feature_names.isin(['product_id'])]
        logging.info(f'Logging item_feature_names - without product_id: \n{item_feature_names}')

        for item_feature_name in item_feature_names:
            lightfm_dataset.fit_partial(
                items=(item['product_id'] for item in self.item_list),
                item_features=((item[item_feature_name] for item in self.item_list)),
            )

        item_features_data = []
        for item in self.item_list:
            item_features_data.append(
                (
                    item['product_id'],
                    [
                        item['product_name'],
                        item['aisle'],
                        item['department']
                    ],
                )
            )
        logging.info(f'Logging item_features_data @build_lightfm_dataset: \n{item_features_data}')
        self.item_features = lightfm_dataset.build_item_features(item_features_data)
        self.interactions, self.weights = lightfm_dataset.build_interactions(
            ((rating['user_id'], rating['product_id']) for rating in ratings_list)
        )

        self.n_users, self.n_items = self.interactions.shape

        logging.info(f'Logging self.interactions @build_lightfm_dataset: \n{self.interactions}')
        logging.info(f'Logging self.weights @build_lightfm_dataset: \n{self.weights}')
        logging.info(
            f'The shape of self.interactions {self.interactions.shape} '
            f'and self.weights {self.weights.shape} represent the user-item matrix.')
Example #17
def predict_artist_list(artist_select):
    # Build a user-artist rating matrix
    ratings_df = ap.pivot(index='userID', columns='artistID', values='playCountScaled')
    ratings = ratings_df.fillna(0).values

    artist_names = ap.sort_values("artistID")["name"].unique()

    # Build the new user's rating row, then append it once
    add_user = [0] * 17632
    for item in artist_select:
        for j in artists.index[artists["name"] == item]:
            add_user[j] = 1
    ratings_df = pd.DataFrame(np.vstack((ratings_df, add_user)))
    new_userID = ratings_df.shape[0] - 1
    ratings = ratings_df.fillna(0).values

    # Build a sparse matrix
    X = csr_matrix(ratings)

    n_users, n_items = ratings_df.shape

    user_ids = ratings_df.index.values
    artist_names = ap.sort_values("artistID")["name"].unique()


    # Build data references + train test
    Xcoo = X.tocoo()
    data = Dataset()
    data.fit(np.arange(n_users), np.arange(n_items))
    interactions, weights = data.build_interactions(zip(Xcoo.row, Xcoo.col, Xcoo.data)) 
    train, test = random_train_test_split(interactions)



    model = LightFM(learning_rate=0.05, loss='warp')
    model.fit(train, epochs=10, num_threads=2)


    # Predict for the newly appended user (the last row)
    scores = model.predict(new_userID, np.arange(n_items))
    top_items = artist_names[np.argsort(-scores)]
    return top_items[0:10]
Example #18
def create_dataset(df):
    ## create a mapping between the user and item ids from our input data
    #to indices that will be used internally by the model
    dataset = Dataset()
    list_user_names = list(df.index)
    list_items = df.columns.values
    dataset.fit((user_name for user_name in list_user_names),
                (item for item in list_items))

    ## Build the interaction matrix
    # it encodes the interactions between users and items.
    # need (user, item) pair that has 1's in df
    list_pairs = list(df.stack().index)
    (interactions, weights) = dataset.build_interactions(
        (pair for pair in list_pairs))

    return dataset, interactions, weights
Example #19
def lightfm_trainer(train: np.ndarray, loss: str, n_components: int,
                    lam: float) -> LightFM:
    """Train a LightFM model on (user, item, label) rows."""
    model = LightFM(
        loss=loss,
        user_alpha=lam,
        item_alpha=lam,
        no_components=n_components,
        learning_rate=0.001,
        random_state=12345,
    )
    dataset = Dataset()
    dataset.fit(train[:, 0], train[:, 1])
    (interactions, weights) = dataset.build_interactions(
        ((x[0], x[1], 1) for x in train[train[:, 2] == 1]))
    model.fit(interactions, epochs=100)

    return model
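
# A quick usage sketch with synthetic data (columns assumed to be
# user id, item id, binary feedback):
import numpy as np

rng = np.random.default_rng(0)
train = np.column_stack([
    rng.integers(0, 100, size=1000),  # user ids
    rng.integers(0, 500, size=1000),  # item ids
    rng.integers(0, 2, size=1000),    # binary feedback
])
model = lightfm_trainer(train, loss='warp', n_components=32, lam=1e-5)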
Example #20
    def fit_data(self, matrix, user_features=None, item_features=None):
        """
        Create datasets for .fit() method.
        Args:
            matrix: User-item interactions matrix (weighted)
            user_features: User-features pandas dataframe whose index contains user_ids (crd_no)
            item_features: Item-features pandas dataframe whose index contains good_ids (plu_id)
        Returns:
            Model with fitted (mapped) datasets
        """
        matrix.sort_index(inplace=True)
        matrix.sort_index(inplace=True, axis=1)
        dataset = Dataset()
        dataset.fit((x for x in matrix.index), (x for x in matrix.columns))
        interactions = pd.melt(
            matrix.replace(0, np.nan).reset_index(),
            id_vars='index',
            value_vars=list(matrix.columns[1:]),
            var_name='plu_id',
            value_name='rating').dropna().sort_values('index')
        interactions.columns = ['crd_no', 'plu_id', 'rating']
        self.interactions, self.weights = dataset.build_interactions(
            [tuple(x) for x in interactions.values])

        if user_features is not None:
            user_features.sort_index(inplace=True)
            dataset.fit_partial(users=user_features.index,
                                user_features=user_features)
            self.user_features = dataset.build_user_features(
                ((index, dict(row))
                 for index, row in user_features.iterrows()))
        else:
            self.user_features = None
        if item_features is not None:
            item_features.sort_index(inplace=True)
            dataset.fit_partial(items=item_features.index,
                                item_features=item_features)
            self.item_features = dataset.build_item_features(
                ((index, dict(row))
                 for index, row in item_features.iterrows()))
        else:
            self.item_features = None
Example #21
def load_parameter():
    ratings = get_ratings()
    books = get_books()
    users = get_users()
    books_pd = convert_pd(books)

    id_users_books = StoreValue()

    for x in ratings:
        id_users_books._user_id.append(x[0])
        id_users_books._book_id.append(x[1])

    # Created following the guide at https://making.lyst.com/lightfm/docs/examples/dataset.html
    dataset_explicit = Dataset()
    dataset_explicit.fit(id_users_books._user_id,
                id_users_books._book_id)

    num_users, num_items = dataset_explicit.interactions_shape()
    print('Num users: {}, num_items {}.'.format(num_users, num_items))

    dataset_explicit.fit_partial(items=(x[0] for x in books),
                        item_features=(x[7] for x in books))
    
    dataset_explicit.fit_partial(users=(x[0] for x in users))


    # create ---> mapping
    # interactions: a COO matrix whose entries are (user_id, book_id) pairs
    # weights come from the ratings
    (interactions_explicit, weights_explicit) = dataset_explicit.build_interactions((id_users_books._user_id[i], id_users_books._book_id[i]) for i in range(len(ratings)))

    # Item features extracted from the items (books), based on each book's author
    item_features = dataset_explicit.build_item_features(((x[0], [x[7]]) for x in books))
    # user_features = dataset_explicit.build_user_features(((x[0], [x[1]]) for x in users))

    model_explicit_ratings = LightFM_ext(loss='warp')

    (train, test) = random_train_test_split(interactions=interactions_explicit, test_percentage=0.02)

    model_explicit_ratings.fit(train, item_features=item_features, epochs=2, num_threads=4)
    return model_explicit_ratings, dataset_explicit, interactions_explicit, weights_explicit, item_features, books_pd
Example #22
def fetch_data():
    # Create a SQL connection to our SQLite database
    con = sqlite3.connect("db.sqlite3")
    cur = con.cursor()

    # The result of a "cursor.execute" can be iterated over by row
    data = []
    users = []
    movies = []
    for row in cur.execute('SELECT id FROM RecoFramework_userinfo;'):
        users.append(row[0])

    for row in cur.execute('SELECT movieId FROM RecoFramework_movies;'):
        movies.append(row[0])

    for row in cur.execute(
            'SELECT userId, movieId, rating FROM RecoFramework_ratings WHERE rating = 5;'
    ):
        data.append(row)

    dataset = Dataset()
    #print("Loading dataset...")
    dataset.fit(users, movies)
    interactions, ratings = dataset.build_interactions(data)

    # Be sure to close the connection
    con.close()

    train, test = random_train_test_split(interactions)

    model = LightFM(loss='warp')

    # train lightFM model using fit method
    #print("Starting training the model...")
    model.fit(train, epochs=30, num_threads=2)

    # use the public mapping() accessor instead of private attributes
    user_dict, _, movie_dict, _ = dataset.mapping()

    return model, ratings, user_dict, movie_dict, train, test
def create_datasets(cluster_id):

    events_list = get_events_from_es(cluster_id)

    dataframe_interactions, dataframe_users_features, dataframe_item_features, user_tuple, item_tuple = create_interactions_and_features(events_list, cluster_id)

    print(dataframe_interactions, cluster_id, file=sys.stderr)
    print(dataframe_users_features, cluster_id, file=sys.stderr)
    print(dataframe_item_features, cluster_id, file=sys.stderr)

    # print(user_tuple)
    # print(item_tuple)

    user_features = format_users_features(dataframe_users_features)

    #print(user_features)

    item_features = format_items_features(dataframe_item_features)

    #print(item_features)

    dataset = Dataset()

    dataset.fit(
            dataframe_interactions['user'].unique(), # all the users
            dataframe_interactions['item'].unique(), # all the items
            user_features = user_features,
            item_features = item_features
    )

    (interactions, weights) = dataset.build_interactions([(x[0], x[1], x[2]) for x in dataframe_interactions.values ])

#    print(interactions)
#    print(weights)

    final_user_features = dataset.build_user_features(user_tuple, normalize=False)

    final_item_features = dataset.build_item_features(item_tuple, normalize=False)

    return dataset, interactions, weights, final_item_features, final_user_features
Example #24
def test_exceptions():

    users, items = 10, 100

    dataset = Dataset()
    dataset.fit(range(users), range(items))

    with pytest.raises(ValueError):
        dataset.build_interactions([(users + 1, 0)])

    with pytest.raises(ValueError):
        dataset.build_interactions([(0, items + 1)])

    dataset.fit_partial([users + 1], [items + 1])
    dataset.build_interactions([(users + 1, 0)])
    dataset.build_interactions([(0, items + 1)])
Example #26
def train_model():
    dataset = Dataset()
    dataset.fit((x['User_ID'] for x in get_ratings()),
                (x['Item_ID'] for x in get_ratings()))
    for i in range(25):
        add_item_features(dataset, paan_features[i])
    (interactions, weights) = dataset.build_interactions(
        ((x['User_ID'], x['Item_ID']) for x in get_ratings()))

    item_features = dataset.build_item_features(((x['Item_ID'], [
        x['Banaras'], x['Calcutta'], x['Maghai'], x['Sada'], x['Meetha'],
        x['Chocolate'], x['Dry Fruit'], x['Mango'], x['Strawberry'],
        x['Pineapple'], x['Kaju'], x['Jelly'], x['Rose'], x['Shahi'],
        x['Kesar'], x['Vanilla'], x['Masala'], x['Khatta'], x['Orange'],
        x['White'], x['Silver'], x['RaatRani'], x['Nutella'], x['Special'],
        x['Gold']
    ]) for x in get_item_features()))

    model = LightFM(loss='bpr')
    model.fit(interactions, item_features=item_features)

    labels = np.array([x['Item_ID'] for x in get_item_features()])
    print("Model Trained Successfully.....")
    return model, interactions, labels, item_features
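
# A hedged recommendation sketch built on the return values above; the
# ranking logic is an assumption, not part of the original function.
import numpy as np

model, interactions, labels, item_features = train_model()
n_users, n_items = interactions.shape
scores = model.predict(0, np.arange(n_items), item_features=item_features)
print("Top picks:", labels[np.argsort(-scores)][:3])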
Example #27
                                 User.extract_user_ids(user_stats)), True)

    business_features = dataset.build_item_features(
        Business.build_business_features(
            business_stats, Business.extract_business_ids(business_stats)),
        True)

    print('[ %04ds ] Dataset initialized' % (time.time() - start_time))

    user_avg, user_std = Review.extract_user_average_and_std(training_set)
    normalized_training_reviews = Review.normalize_by_user(
        training_set, user_avg)
    training_interactions = Review.extract_sparse_interaction_matrix(
        normalized_training_reviews)

    interaction_matrix, interaction_weight = dataset.build_interactions(
        training_interactions)

    print('[ %04ds ] Interactions built' % (time.time() - start_time))

    no_components = 50
    loss = 'bpr'
    learning_rate = 0.1
    item_alpha = 1e-5
    user_alpha = 1e-5
    epochs = 20

    model = LightFM(no_components=no_components,
                    loss=loss,
                    learning_rate=learning_rate,
                    item_alpha=item_alpha,
                    user_alpha=user_alpha)
Example #28
    def run(self,
            epochs: int = 1,
            no_components: int = 50,
            learning_rate: float = 0.05) -> Dict[str, float]:
        """
         build interaction matrix -> build movie features -> build model

        Example (5000 samples, 50 components, 5 epochs, learning_rate=0.05)
        =================================
        {'auc_train': 0.66268414, 'auc_test': 0.67257625,
         'precision_train@10': 0.035984848, 'precision_test@10': 0.014193548,
         'recall_train@10': 0.06827082513973247, 'recall_test@10': 0.0646373101211811}

        ###########################
        #### Random Stratified ####
        ###########################
        Example (2 million samples, 50 components, 1 epoch, learning_rate=0.05)
        =================================
        {'auc_train': 0.5171841, 'auc_test': 0.51610065,
         'precision_train@10': 0.018248174, 'precision_test@10': 0.0040145987,
         'recall_train@10': 0.0008001067196610589, 'recall_test@10': 0.0007001527280332769}

        ########################
        #### Popular Active ####
        ########################
        Example (333000 samples, 150 components, 1 epoch, learning_rate=0.05)  20% test data
        =================================
        {'auc_train': 0.63388383, 'auc_test': 0.5569484,
        'precision_train@10': 0.7255412, 'precision_test@10': 0.17099567,
        'recall_train@10': 0.006322884137545113, 'recall_test@10': 0.006053869700910709}

        Example (333000 samples, 50 components, 1 epoch, learning_rate=0.05)  40% test data
        =================================
        {'auc_train': 0.6001097, 'auc_test': 0.56429684,
         'precision_train@10': 0.56060606, 'precision_test@10': 0.33030304,
         'recall_train@10': 0.006517918240037026, 'recall_test@10': 0.005792534657980192}

        Example (333000 samples, 50 components, 20 epochs, learning_rate=0.05)  40% test data
        =================================
        {'auc_train': 0.6077434, 'auc_test': 0.5688331,
         'precision_train@10': 0.5874459, 'precision_test@10': 0.32424247,
         'recall_train@10': 0.0068082500065638684, 'recall_test@10': 0.005756504594433489}

        Example (333000 samples, 50 components, 1 epoch, learning_rate=0.05)  40% test data with normalization
        =================================
        {'auc_train': 0.60080063, 'auc_test': 0.56425303,
         'precision_train@10': 0.56926405, 'precision_test@10': 0.33679655,
         'recall_train@10': 0.006628036812872702, 'recall_test@10': 0.005913302996971047}
         """
        ## Build Matrix Factorization between Customer and Movie
        data = self._filter_data

        dataset = Dataset()
        dataset.fit(data['Cust_Id'].unique(),
                    data['Movie_Id'].unique(),
                    item_features=self.get_combination)
        (interactions, weights) = dataset.build_interactions([
            (x['Cust_Id'], x['Movie_Id'], x['Rating'])
            for index, x in data.iterrows()
        ])

        train, test = random_train_test_split(
            interactions,
            test_percentage=0.4,
            random_state=np.random.RandomState(7))
        print("Finished creating interactions matrix!")

        ## Build movie features
        movies_id, tfidf_data = self.get_tfidf_matrix
        features_lists = [list(x) for x in tfidf_data.values]
        movies_features = dataset.build_item_features(
            data=self.get_movies_tuple(features_lists, movies_id, tfidf_data),
            normalize=True)
        print("Finished building movie features!")

        ## Build model
        model = LightFM(no_components=no_components,
                        learning_rate=learning_rate,
                        loss='warp',
                        k=15)
        model.fit(train,
                  epochs=epochs,
                  item_features=movies_features,
                  num_threads=4)
        print("Finished building LightFM model!")

        with open('hybrid_model_popular_active.pickle', 'wb') as fle:
            pickle.dump(model, fle, protocol=pickle.HIGHEST_PROTOCOL)
        print("Finished saving LightFM model!")

        return {
            "auc_train":
            auc_score(model, train, item_features=movies_features).mean(),
            "auc_test":
            auc_score(model, test, item_features=movies_features).mean(),
            "precision_train@10":
            precision_at_k(model, train, item_features=movies_features,
                           k=10).mean(),
            "precision_test@10":
            precision_at_k(model, test, item_features=movies_features,
                           k=10).mean(),
            "recall_train@10":
            recall_at_k(model, train, item_features=movies_features,
                        k=10).mean(),
            "recall_test@10":
            recall_at_k(model, test, item_features=movies_features,
                        k=10).mean()
        }
Example #29
def train_model(
               df, user_id_col='user_id', item_id_col='business_id',
               item_name_col='name_business', evaluate=True):
    """ Train the model using collaborative filtering.
    Args:
        df: the input dataframe.
        user_id_col: user id column.
        item_id_col: item id column.
        item_name_col: item name column.
        evaluate: if evaluate the model performance.
    Returns:
        model_full: the trained model.
        df_interactions: dataframe with user-item interactions.
        user_dict: user dictionary containing user_id as key and
            interaction_index as value.
        item_dict: item dictionary containing item_id as key and
            item_name as value.
        user_feature_map: the feature map of users
        business_feature_map: the feature map of items
    """
    if evaluate:
        print('Evaluating model...')
        evaluate_model(df, user_id_col='user_id', item_id_col='business_id')
    print('Training model...')

    # build recommendations for known users and known businesses
    # with collaborative filtering method
    ds_full = Dataset()
    # we call fit to supply userid, item id and user/item features
    user_cols = ['user_id', 'average_stars']
    categories = [c for c in df.columns if c[0].isupper()]
    item_cols = ['business_id', 'state']

    for i in df.columns[10:]:
        item_cols.append(str(i))

    user_features = user_cols[1:]
    item_features = item_cols[2:]

    ds_full.fit(
        df[user_id_col].unique(),  # all the users
        df[item_id_col].unique(),  # all the items
        user_features=user_features,  # additional user features
        item_features=item_features
         )

    df_users = df.drop_duplicates(user_id_col)
    # df_users = df[df.duplicated(user_id_col) == False]
    users_features = []
    for i in range(len(df_users)):
        users_features.append(get_users_features_tuple(df_users.values[i]))
    users_features = ds_full.build_user_features(
        users_features, normalize=False)

    items = df.drop_duplicates(item_id_col)
    # items = df[df.duplicated(item_id_col) == False]
    items_features = []
    for i in range(len(items)):
        items_features.append(get_items_features_tuple(
            items.values[i], categories))
    items_features = ds_full.build_item_features(
        items_features, normalize=False)

    (interactions, weights) = ds_full.build_interactions(
        [(x[0], x[1], x[2]) for x in df.values])
    # model
    model_full = LightFM(
        no_components=100, learning_rate=0.05, loss='warp', max_sampled=50)
    model_full.fit(
        interactions, user_features=users_features,
        item_features=items_features, sample_weight=weights,
        epochs=10, num_threads=10)
    # mapping
    user_id_map, user_feature_map, business_id_map, business_feature_map = \
        ds_full.mapping()

    # data preparation
    df_interactions = pd.DataFrame(weights.todense())
    df_interactions.index = list(user_id_map.keys())
    df_interactions.columns = list(business_id_map.keys())
    user_dict = user_id_map
    item_dict = df.set_index(item_id_col)[item_name_col].to_dict()
    return model_full, df_interactions, user_dict, \
        item_dict, user_feature_map, business_feature_map
Example #30
def evaluate_model(
                  df, user_id_col='user_id',
                  item_id_col='business_id', stratify=None):
    """ Model evaluation.
    Args:
        df: the input dataframe.
        user_id_col: user id column.
        item_id_col: item id column.
        stratify: if use stratification.
    No return value
    """
    # create test and train datasets
    print('model evaluation')
    train, test = train_test_split(df, test_size=0.2, stratify=stratify)
    ds = Dataset()
    # we call fit to supply userid, item id and user/item features
    user_cols = ['user_id', 'average_stars']
    categories = [c for c in df.columns if c[0].isupper()]
    item_cols = ['business_id', 'state']

    for i in df.columns[10:]:
        item_cols.append(str(i))

    user_features = user_cols[1:]
    item_features = item_cols[2:]

    ds.fit(
        df[user_id_col].unique(),  # all the users
        df[item_id_col].unique(),  # all the items
        user_features=user_features,  # additional user features
        item_features=item_features
         )

    train_users = train.drop_duplicates('user_id')
    # train_users = train[train.duplicated('user_id') == False]
    train_user_features = []
    for i in range(len(train_users)):
        train_user_features.append(get_users_features_tuple(
            train_users.values[i]))
    train_user_features = ds.build_user_features(
        train_user_features, normalize=False)

    test_users = test.drop_duplicates('user_id')
    # test_users = test[test.duplicated('user_id') == False]
    test_user1_features = []
    for i in range(len(test_users)):
        test_user1_features.append(get_users_features_tuple(
            test_users.values[i]))
    test_user_features = ds.build_user_features(
        test_user1_features, normalize=False)

    train_items = train.drop_duplicates('business_id')
    # train_items = train[train.duplicated('business_id') == False]
    train_item1_features = []
    for i in range(len(train_items)):
        train_item1_features.append(get_items_features_tuple(
            train_items.values[i], categories))
    train_item_features = ds.build_item_features(
        train_item1_features, normalize=False)

    test_items = test.drop_duplicates('business_id')
    # test_items = test[test.duplicated('business_id') == False]
    test_item_features = []
    for i in range(len(test_items)):
        test_item_features.append(get_items_features_tuple(
            test_items.values[i], categories))
    test_item_features = ds.build_item_features(
        test_item_features, normalize=False)

    # plugging in the interactions and their weights
    (train_interactions, train_weights) = ds.build_interactions(
        [(x[0], x[1], x[2]) for x in train.values])
    (test_interactions, test_weights) = ds.build_interactions(
        [(x[0], x[1], x[2]) for x in test.values])

    # model
    model = LightFM(
        no_components=100, learning_rate=0.05, loss='warp', max_sampled=50)
    model.fit(
        train_interactions, user_features=train_user_features,
        item_features=train_item_features, sample_weight=train_weights,
        epochs=10, num_threads=10)

    # auc-roc
    train_auc = auc_score(
        model, train_interactions, user_features=train_user_features,
        item_features=train_item_features, num_threads=20).mean()
    print('Training set AUC: %s' % train_auc)
    test_auc = auc_score(
        model, test_interactions, user_features=test_user_features,
        item_features=test_item_features, num_threads=20).mean()
    print('Testing set AUC: %s' % test_auc)
Example #31
def lambda_handler(event, context):
    try:
        ## Fetch data from RDS code
        connection = pymysql.connect(
            host='fitbookdb.crm91a2epcbi.us-east-1.rds.amazonaws.com',
            user='******',
            passwd='postgres',
            db='fitbookdb',
            cursorclass=pymysql.cursors.DictCursor)

        print("Connection successful")
    except:
        print("Connection error")

    # In[3]:

    #Get Food DataFrame
    dict_list = []

    with connection.cursor() as cur:
        cur.execute("select * from food_dataset")
        for row in cur:
            dict_list.append(row)

    food_rds_df = pd.DataFrame(dict_list)
    food_df = food_rds_df.copy()
    food_df.drop([
        'Portion_Default', 'Portion_Amount', 'Factor', 'Increment',
        'Multiplier', 'Portion_Display_Name', 'Food_Code', 'Display_Name'
    ],
                 axis=1,
                 inplace=True)
    # food_df.head()
    print('Food Dataframe imported')

    # In[4]:

    # # TODO: Perform Binning
    # food_30_bins = ['Alcohol', 'Calories', 'Saturated_Fats']
    # for each_column in food_30_bins:
    #     bins = np.linspace(food_df[each_column].min(), food_df[each_column].max(), 30)
    #     food_df[each_column+'bin'] = pd.cut(food_df[each_column], bins, labels=np.arange(0,len(bins)-1))
    # food_df

    # In[5]:

    # for each_column in food_30_bins:
    #     print(food_df[each_column].min())

    # In[6]:

    #Get User Dataframe
    # user_df = pd.read_csv('user_db_try.csv')
    # user_df.head()

    dict_list = []

    with connection.cursor() as cur:
        cur.execute("select * from tblUserData")
        for row in cur:
            dict_list.append(row)

    user_rds_df = pd.DataFrame(dict_list)
    user_df = user_rds_df.copy()
    user_df.drop([
        'cognitoAccessToken', 'cognitoIDToken', 'cognitoRefreshToken',
        'fitbitAccessToken', 'fitbitUserID', 'userName'
    ],
                 axis=1,
                 inplace=True)
    # user_df.head()

    print('User Dataframe imported')

    # In[7]:

    #Get userItem DataFrame
    # userItem_df = pd.read_csv('userItem_db_try_new.csv')
    # userItem_df.head()

    dict_list = []

    with connection.cursor() as cur:
        cur.execute("select * from tblUserRating")
        for row in cur:
            dict_list.append(row)

    userItem_rds_df = pd.DataFrame(dict_list)
    userItem_df = userItem_rds_df.copy()
    # userItem_df.head()
    print('UserItem Dataframe imported')

    # In[8]:

    #Make all the feature values unique
    for column_name in food_df.columns:
        if column_name != 'food_ID':
            food_df[column_name] = str(
                column_name) + ":" + food_df[column_name].astype(str)
    # food_df.head()

    # In[9]:

    # This dict will be useful while creating tuples
    food_features_df = food_df.drop(['food_ID'], axis=1).copy()
    food_features_dict = food_features_df.to_dict('split')
    # food_features_dict

    # In[10]:

    food_feature_values = []

    for column_name in food_features_df.columns:
        food_feature_values.extend(food_features_df[column_name].unique())

    # food_feature_values

    # In[11]:

    for column_name in user_df.columns:
        if column_name != 'userID':
            user_df[column_name] = str(
                column_name) + ":" + user_df[column_name].astype(str)

    user_features_df = user_df.drop(['userID'], axis=1).copy()

    user_features_dict = user_features_df.to_dict('split')
    # user_features_dict

    # In[12]:

    user_feature_values = []

    for column_name in user_features_df.columns:
        user_feature_values.extend(user_features_df[column_name].unique())

    # user_feature_values

    # In[13]:

    user_tuples = []
    food_tuples = []

    for index, row in user_df.iterrows():
        user_tuples.append((row['userID'], user_features_dict['data'][index]))

    for index, row in food_df.iterrows():
        food_tuples.append((row['food_ID'], food_features_dict['data'][index]))

    # food_tuples

    # In[14]:

    print("Creating LightFm dataset")
    dataset = Dataset()
    dataset.fit(users=(user_id for user_id in user_df['userID']),
                items=(food_id for food_id in food_df['food_ID']))

    print("Dataset Created")
    # In[15]:

    num_users, num_items = dataset.interactions_shape()
    print('Num users: {}, num_items {}.'.format(num_users, num_items))

    # In[16]:

    # dataset.fit_partial(items=(food_id for food_id in food_df['Food_Code']),
    #                            item_features=((each_feature for each_feature in food_features)for food_features in food_features_dict['data']))

    # In[17]:

    # dataset.fit_partial(items=(food_id for food_id in food_df['Food_Code']),
    #                            item_features=((row['Milk'], row['Meats'], row['Alcohol'], row['Calories'])for index,row in food_df.iterrows()))

    # In[18]:

    print("fittng item partial features")
    dataset.fit_partial(items=(food_id for food_id in food_df['food_ID']),
                        item_features=(each_value
                                       for each_value in food_feature_values))

    # In[19]:

    # dataset.fit_partial(users=(user_id for user_id in user_df['Id']),
    #                     user_features=((each_feature for each_feature in user_features)for user_features in user_features_dict['data']))

    # In[20]:
    print("fittng user partial features")

    dataset.fit_partial(users=(user_id for user_id in user_df['userID']),
                        user_features=(each_value
                                       for each_value in user_feature_values))

    # In[21]:

    # dataset.item_features_shape()
    # dataset.user_features_shape()

    # In[22]:

    print("Building Interactions")
    (interactions, weights) = dataset.build_interactions(
        ((x['userID'], x['food_ID'], x['rating'])
         for y, x in userItem_df.iterrows()))

    # print(repr(interactions))
    # print(weights)

    # In[23]:

    # interactions.shape

    # In[24]:

    print("Building item features")
    item_features = dataset.build_item_features(each_tuple
                                                for each_tuple in food_tuples)
    # print(item_features)

    # In[25]:

    user_features = dataset.build_user_features(each_tuple
                                                for each_tuple in user_tuples)
    # print(user_features)

    # In[26]:

    print("Fitting Model")
    model = LightFM(loss='warp')
    model.fit(interactions,
              item_features=item_features,
              user_features=user_features)

    print("Model trained!!")

    print("Pickle started!!")
    pickle.dump(model, open("/tmp/model.pkl", 'wb'), protocol=2)

    bucketName = "fitbook-lambda-packages"
    Key = "/tmp/model.pkl"
    outPutname = "model.pkl"

    print("Uploading to S3")
    s3 = boto3.client('s3')
    s3.upload_file(Key, bucketName, outPutname)
    print("Upload done")
    os.remove("/tmp/model.pkl")

    print("Pickle file deleted")
    print("Successssss!!!!!")
Example #32
dataset = Dataset()
dataset.fit((x['User-ID'] for x in get_ratings()),
            (x['ISBN'] for x in get_ratings()))

# query the dataset to check how many users and items (i.e. books) it knows
num_users, num_items = dataset.interactions_shape()
print('Num users : {}, num_items {}.'.format(num_users, num_items))

# add some item feature mappings, and creates a unique feature for each author
# NOTE: more item ids are fitted than usual, to make sure our mappings are complete
# even if there are items in the features dataset that are not in the interaction set
dataset.fit_partial(items=(x['ISBN'] for x in get_book_features()),
                    item_features=(x['Book-Author']
                                   for x in get_book_features()))

# build the interaction matrix which is a main input to the LightFM model
# it encodes the interactions between the users and the items
(interactions, weights) = dataset.build_interactions(
    ((x['User-ID'], x['ISBN']) for x in get_ratings()))

# item_features matrix can also be created
item_features = dataset.build_item_features(
    ((x['ISBN'], [x['Book-Author']]) for x in get_book_features()))

# split the current dataset into a training and test dataset
train, test = random_train_test_split(interactions,
                                      test_percentage=0.01,
                                      random_state=None)

# build the model using the training dataset, notice the use of item_features as well,
# this is a hybrid model
model = LightFM(loss='warp',
                item_alpha=ITEM_ALPHA,
                no_components=NUM_COMPONENTS)
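
# A minimal continuation (assumed, mirroring the usual LightFM flow rather
# than anything stated in the snippet): train the hybrid model and report AUC.
from lightfm.evaluation import auc_score

model.fit(train, item_features=item_features, epochs=10)
test_auc = auc_score(model, test, train_interactions=train,
                     item_features=item_features).mean()
print('Test AUC: %s' % test_auc)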
Example #33
def calc(request):
    try:
        stores = Store.objects.all()
        reviews = Review.objects.all()

        stores = pd.DataFrame(list(stores.values('id', 'store_id','store_name', 'category',
        'address','latitude','longitude','average_rating')))
        reviews = pd.DataFrame(list(reviews.values('id', 'storeid','userid', 'score','reg_time')))

        reviews_source = [(reviews['userid'][i], reviews['storeid'][i]) for i in range(reviews.shape[0])]
        item_feature_source = [(stores['store_id'][i], [ stores['category'][i],stores['address'][i],stores['latitude'][i],stores['longitude'][i], stores['average_rating'][i]] ) for i in range(stores.shape[0]) ]

        dataset = Dataset()
        dataset.fit(users=reviews['userid'].unique(),
            items=reviews['storeid'].unique(),
            item_features=stores[stores.columns[1:]].values.flatten())

        interactions, weights = dataset.build_interactions(reviews_source)
        item_features = dataset.build_item_features(item_feature_source)

        # Split Train, Test data
        train, test = random_train_test_split(interactions, test_percentage=0.1)
        train, test = train.tocsr().tocoo(), test.tocsr().tocoo()
        train_weights = train.multiply(weights).tocoo()

        # Define Search Space
        trials = Trials()
        space = [hp.choice('no_components', range(10, 50, 10)), hp.uniform('learning_rate', 0.01, 0.05)]

        # Define Objective Function
        def objective(params):
            no_components, learning_rate = params
            global model
            model = LightFM(no_components=no_components,
                            learning_schedule='adagrad',
                            loss='warp',
                            learning_rate=learning_rate,
                            random_state=0)

            model.fit(interactions=train,
                    item_features=item_features,
                    sample_weight=train_weights,
                    epochs=3,
                    verbose=False)

            test_precision = precision_at_k(model, test, k=5, item_features=item_features).mean()
            print("no_comp: {}, lrn_rate: {:.5f}, precision: {:.5f}".format(
            no_components, learning_rate, test_precision))
            # test_auc = auc_score(model, test, item_features=item_features).mean()
            output = -test_precision

            if np.abs(output+1) < 0.01 or output < -1.0:
                output = 0.0

            return output

        # max_evals controls how many optimization iterations to run.
        best_params = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=10, trials=trials)

        # Save the item features
        with open('./saved_models/item_features.pickle', 'wb') as fle:
            pickle.dump(item_features, fle, protocol=pickle.HIGHEST_PROTOCOL)

        # Save the model
        with open('./saved_models/model.pickle', 'wb') as fle:
            pickle.dump(model, fle, protocol=pickle.HIGHEST_PROTOCOL)

        item_biases, item_embeddings = model.get_item_representations(features=item_features)
        # Save the item_embeddings
        with open('./saved_models/item_embeddings.pickle', 'wb') as fle:
            pickle.dump(item_embeddings, fle, protocol=pickle.HIGHEST_PROTOCOL)
        
        return Response({'result': True}) 
    
    except :
        return Response({'result': False})
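
# A hedged follow-up (assumption: the pickle paths written above): use the
# saved item_embeddings for item-to-item cosine similarity.
import pickle
import numpy as np

with open('./saved_models/item_embeddings.pickle', 'rb') as fle:
    item_embeddings = pickle.load(fle)

def most_similar(item_index, topn=5):
    # cosine similarity of one item's embedding against all items
    norms = np.linalg.norm(item_embeddings, axis=1)
    sims = item_embeddings @ item_embeddings[item_index] / (
        norms * norms[item_index] + 1e-9)
    return np.argsort(-sims)[1:topn + 1]  # skip the item itself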