Example #1
def test_sample_weight():

    model = LightFM()

    train = sp.coo_matrix(np.array([[0, 1], [0, 1]]))

    with pytest.raises(ValueError):
        # Wrong number of weights
        sample_weight = sp.coo_matrix(np.zeros((2, 2)))

        model.fit(train, sample_weight=sample_weight)

    with pytest.raises(ValueError):
        # Wrong shape
        sample_weight = sp.coo_matrix(np.zeros(2))
        model.fit(train, sample_weight=sample_weight)

    with pytest.raises(ValueError):
        # Wrong order of entries
        sample_weight = sp.coo_matrix((train.data, (train.row[::-1], train.col[::-1])))
        model.fit(train, sample_weight=sample_weight)

    sample_weight = sp.coo_matrix((train.data, (train.row, train.col)))
    model.fit(train, sample_weight=sample_weight)

    model = LightFM(loss="warp-kos")

    with pytest.raises(NotImplementedError):
        model.fit(train, sample_weight=np.ones(1))
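# For context, a minimal sketch of a *valid* sample_weight (the toy data here
# is an illustrative assumption): it must be a COO matrix with the same shape
# as the interactions and one weight per interaction, in the same row/col order.
import numpy as np
import scipy.sparse as sp
from lightfm import LightFM

interactions = sp.coo_matrix(np.array([[0, 1], [0, 1]]))
weights = sp.coo_matrix((np.array([0.5, 2.0]),
                         (interactions.row, interactions.col)),
                        shape=interactions.shape)

weighted_model = LightFM()
weighted_model.fit(interactions, sample_weight=weights)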
Example #2
def test_state_reset():

    model = LightFM()

    model.fit(train,
              epochs=1)

    assert np.mean(model.user_embedding_gradients) > 1.0

    model.fit(train,
              epochs=0)
    assert np.all(model.user_embedding_gradients == 1.0)
Example #3
def test_movielens_accuracy_fit():

    model = LightFM()
    model.fit(train,
              epochs=10)

    train_predictions = model.predict(train.row,
                                      train.col)
    test_predictions = model.predict(test.row,
                                     test.col)

    assert roc_auc_score(train.data, train_predictions) > 0.84
    assert roc_auc_score(test.data, test_predictions) > 0.76
Example #4
def test_movielens_accuracy_pickle():

    model = LightFM(random_state=SEED)
    model.fit(train,
              epochs=10)

    model = pickle.loads(pickle.dumps(model))

    train_predictions = model.predict(train.row,
                                      train.col)
    test_predictions = model.predict(test.row,
                                     test.col)

    assert roc_auc_score(train.data, train_predictions) > 0.84
    assert roc_auc_score(test.data, test_predictions) > 0.76
Example #5
def test_return_self():

    no_users, no_items = (10, 100)

    train = sp.coo_matrix((no_users, no_items), dtype=np.int32)

    model = LightFM()
    assert model.fit_partial(train) is model
    assert model.fit(train) is model
Example #6
LEARNING_RATE = 1e-4
LOSS = 'warp'

# Let's fit a WARP model: these generally have the best performance.
model = LightFM(loss=LOSS,
                item_alpha=ITEM_ALPHA,
                no_components=COMPONENTS,
                learning_schedule=LEARNING,
                learning_rate=LEARNING_RATE)

print(
    "Currently using LOSS:{0}, COMPONENTS:{1}, LEARNING:{2}, RATE:{3}".format(
        LOSS, COMPONENTS, LEARNING, LEARNING_RATE))

# Run 3 epochs and time it.
model = model.fit(URM_train, epochs=NUM_EPOCHS, verbose=True)
'''
train_precision = precision_at_k(model, URM_train, k=10).mean()
test_precision = precision_at_k(model, URM_test, k=10).mean()

train_auc = auc_score(model, URM_train).mean()
test_auc = auc_score(model, URM_test).mean()

print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))
print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))
'''

recommender = Recommender(URM_train, model)

cumulative_precision = 0.0
cumulative_recall = 0.0
# Set the number of threads; this can be increased if more
# physical cores are available. Note that on macOS, LightFM falls
# back to a single thread when built without OpenMP support.
NUM_THREADS = 2
NUM_COMPONENTS = 30
NUM_EPOCHS = 3
ITEM_ALPHA = 1e-6

# Try to fit a WARP model - this is generally the model with the best performance
model = LightFM(loss='warp',
                item_alpha=ITEM_ALPHA,
                no_components=NUM_COMPONENTS)

# run 3 epochs and time it
model = model.fit(train, epochs=NUM_EPOCHS, num_threads=NUM_THREADS)

# compute and print the AUC score
train_auc = auc_score(model, train, num_threads=NUM_THREADS).mean()
print('Collaborative filtering train AUC: %s' % train_auc)

# We pass in the train interactions to exclude them from predictions.
# This is to simulate a recommender system where we do not
# re-recommend things the user has already interacted with in the train set.
test_auc = auc_score(model, test, train_interactions=train, num_threads=NUM_THREADS).mean()
print('Collaborative filtering test AUC: %s' % test_auc)

# Set the item biases to zero to get rid of the pre-estimated per-item biases
model.item_biases *= 0.0

test_auc = auc_score(model, test, num_threads=NUM_THREADS).mean()
Example #8
class LFM(BaseRecommender, DataLoaderSaver):
    """
    Wrapper over LightFM model
    """
    def __init__(
        self,
        no_components=30,
        k=5,
        n=10,
        learning_schedule="adagrad",
        loss="logistic",
        learning_rate=0.05,
        rho=0.95,
        epsilon=1e-06,
        item_alpha=0.0,
        user_alpha=0.0,
        max_sampled=10,
        random_state=42,
        epochs=20,
        show_progress=True,
    ):
        """
        Source of descriptions:
        https://making.lyst.com/lightfm/docs/_modules/lightfm/lightfm.html#LightFM

        A hybrid latent representation recommender model.

        The model learns embeddings (latent representations in a high-dimensional
        space) for users and items in a way that encodes user preferences over items.
        When multiplied together, these representations produce scores for every item
        for a given user; items scored highly are more likely to be interesting to
        the user.

        The user and item representations are expressed in terms of representations
        of their features: an embedding is estimated for every feature, and these
        features are then summed together to arrive at representations for users and
        items. For example, if the movie 'Wizard of Oz' is described by the following
        features: 'musical fantasy', 'Judy Garland', and 'Wizard of Oz', then its
        embedding will be given by taking the features' embeddings and adding them
        together. The same applies to user features.

        The embeddings are learned through `stochastic gradient
        descent <http://cs231n.github.io/optimization-1/>`_ methods.

        Four loss functions are available:

        - logistic: useful when both positive (1) and negative (-1) interactions
        are present.
        - BPR: Bayesian Personalised Ranking [1]_ pairwise loss. Maximises the
        prediction difference between a positive example and a randomly
        chosen negative example. Useful when only positive interactions
        are present and optimising ROC AUC is desired.
        - WARP: Weighted Approximate-Rank Pairwise [2]_ loss. Maximises
        the rank of positive examples by repeatedly sampling negative
        examples until rank violating one is found. Useful when only
        positive interactions are present and optimising the top of
        the recommendation list (precision@k) is desired.
        - k-OS WARP: k-th order statistic loss [3]_. A modification of WARP that
        uses the k-th positive example for any given user as a basis for pairwise
        updates.

        Two learning rate schedules are available:

        - adagrad: [4]_
        - adadelta: [5]_

        Parameters
        ----------

        no_components: int, optional
            the dimensionality of the feature latent embeddings.
        k: int, optional
            for k-OS training, the k-th positive example will be selected from the
            n positive examples sampled for every user.
        n: int, optional
            for k-OS training, maximum number of positives sampled for each update.
        learning_schedule: string, optional
            one of ('adagrad', 'adadelta').
        loss: string, optional
            one of  ('logistic', 'bpr', 'warp', 'warp-kos'): the loss function.
        learning_rate: float, optional
            initial learning rate for the adagrad learning schedule.
        rho: float, optional
            moving average coefficient for the adadelta learning schedule.
        epsilon: float, optional
            conditioning parameter for the adadelta learning schedule.
        item_alpha: float, optional
            L2 penalty on item features. Tip: setting this number too high can slow
            down training. One good way to check is if the final weights in the
            embeddings turned out to be mostly zero. The same idea applies to
            the user_alpha parameter.
        user_alpha: float, optional
            L2 penalty on user features.
        max_sampled: int, optional
            maximum number of negative samples used during WARP fitting.
            It requires a lot of sampling to find negative triplets for users that
            are already well represented by the model; this can lead to very long
            training times and overfitting. Setting this to a higher number will
            generally lead to longer training times, but may in some cases improve
            accuracy.
        random_state: int seed, RandomState instance, or None
            The seed of the pseudo random number generator to use when shuffling
            the data and initializing the parameters.

        epochs: (int, optional) number of epochs to run
        """

        super().__init__()

        self.model = LightFM(
            no_components=no_components,
            k=k,
            n=n,
            learning_schedule=learning_schedule,
            loss=loss,
            learning_rate=learning_rate,
            rho=rho,
            epsilon=epsilon,
            item_alpha=item_alpha,
            user_alpha=user_alpha,
            max_sampled=max_sampled,
            random_state=random_state,
        )
        self.epochs = epochs

        # data
        self.interactions = None
        self.train_ui = None
        self.user_id_code = None
        self.user_code_id = None
        self.item_code_id = None

        self.show_progress = show_progress

    def preprocess(self):
        """
        Prepare interactions dataset for training model
        """

        data = self.interactions.copy()
        data["event_value"] = 1

        self.user_code_id = dict(enumerate(data["user"].unique()))
        self.user_id_code = {v: k for k, v in self.user_code_id.items()}
        data["user_code"] = data["user"].apply(self.user_id_code.get)

        self.item_code_id = dict(enumerate(data["item"].unique()))
        item_id_code = {v: k for k, v in self.item_code_id.items()}
        data["item_code"] = data["item"].apply(item_id_code.get)

        self.train_ui = sparse.csr_matrix(
            (data["event_value"], (data["user_code"], data["item_code"])))

    def fit(self):
        """
        Fit the model
        """
        self.model.fit(
            self.train_ui,
            epochs=self.epochs,
            num_threads=multiprocessing.cpu_count(),
            verbose=self.show_progress,
        )

    def recommend(
        self,
        target_users,
        n_recommendations,
        filter_out_interacted_items=True,
        show_progress=True,
    ) -> pd.DataFrame:
        """
            Recommends n_recommendations items for target_users
        :return:
            pd.DataFrame (user, item_1, item_2, ..., item_n)
        """
        items_to_recommend = np.arange(len(self.item_code_id))

        with ThreadPool() as thread_pool:
            recommendations = list(
                tqdm(
                    thread_pool.imap(
                        partial(
                            self.recommend_per_user,
                            n_recommendations=n_recommendations,
                            items_to_recommend=items_to_recommend,
                        ),
                        target_users,
                    ),
                    disable=not self.show_progress,
                ))

        return pd.DataFrame(recommendations)

    def recommend_per_user(self, user, n_recommendations, items_to_recommend):
        """
        Recommends n items per user
        :param user: User id
        :param n_recommendations: Number of recommendations
        :return: list of format [user_id, item1, item2 ...]
        """
        u_code = self.user_id_code.get(user)
        item_recommendations = []

        if u_code is not None:
            interacted_items = self.train_ui.indices[
                self.train_ui.indptr[u_code]:self.train_ui.indptr[u_code + 1]]

            scores = self.model.predict(int(u_code), items_to_recommend)

            item_recommendations = items_to_recommend[np.argsort(
                -scores)][:n_recommendations + len(interacted_items)]
            item_recommendations = [
                self.item_code_id[item] for item in item_recommendations
                if item not in interacted_items
            ][:n_recommendations]

        return ([user] + item_recommendations + [None] *
                (n_recommendations - len(item_recommendations)))
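# A minimal usage sketch for the LFM wrapper above (the data and parameter
# values are illustrative assumptions, not part of the original class).
# preprocess() expects self.interactions to be a DataFrame with 'user' and
# 'item' columns.
import pandas as pd

interactions = pd.DataFrame({"user": ["u1", "u1", "u2"],
                             "item": ["i1", "i2", "i1"]})

recommender = LFM(no_components=16, loss="warp", epochs=5, show_progress=False)
recommender.interactions = interactions
recommender.preprocess()   # builds the sparse user-item matrix and id mappings
recommender.fit()          # trains the underlying LightFM model
print(recommender.recommend(target_users=["u1", "u2"], n_recommendations=2))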
Example #9
def recommendations():
    """
    Render the trail_recommendations.html page

    Args:
        Nothing

    Returns:
        the trail_recommendations.html template, this includes hiking trails
        recommendations based on user-input. Up to 10 trails are provided.
        Trail options are presented in cards that include a photo taken of the
        trail, a short description of the trail, and a link to the trail
        profile page on AllTrails.com
    """
    # Gather user input from ideal hike text selection
    user_input = request.form.getlist('user_feature_options[]')
    input_user_features = pd.DataFrame([" ".join(user_input)])

    # Gather user filters - location, feature1, feature2
    user_location = request.form['user_location']
    trail_feature_select1 = request.form['trail_feature_select1']
    trail_feature_select2 = request.form['trail_feature_select2']

    # Parse user input
    user_feature_new = parse_input_descriptors(input_user_features)

    # Make connection to database
    # Database name
    dbname = 'pnw_hike'

    # Set postgres username
    username = '******'

    ## Using an engine to connect to the postgres db
    engine = create_engine('postgres://%s:insight@localhost/%s'%(username, dbname), paramstyle="format")

    # Connect to make queries using psycopg2
    con = None
    con = psycopg2.connect(database = dbname, user = username, password = '******', port = 5432)

    # User features
    user_features_query = """
    SELECT * FROM user_features;
    """
    user_features_from_sql = pd.read_sql_query(user_features_query, con, index_col='review_author')

    # Trail features raw
    trail_reviews_raw_query = """
    SELECT * FROM trail_reviews_raw;
    """
    trail_reviews_raw_from_sql = pd.read_sql_query(trail_reviews_raw_query, con, index_col="index")

    # Trail urls and filtering info
    trail_urls_info_query = """
    SELECT * FROM trail_urls_info;
    """
    trail_urls_info = pd.read_sql_query(trail_urls_info_query,con, index_col="index")

    # User features
    user_features_df = user_features_from_sql.drop(["index", "review_text", "clean_review"], axis = 1)
    user_features = user_features_df.fillna(0)

    # Trail features filling blanks with 0
    trail_features = trail_reviews_raw_from_sql.fillna(0)

    # Convert user-feature space to sparse matrix
    user_features = sparse.csr_matrix(user_features.values)

    # Create a large sparse dataframe of extant user reviews/ratings
    interactions = create_interaction_matrix(trail_reviews_raw_from_sql, user_col='review_author', item_col='trail_name', rating_col='review_rating', norm=False, threshold=None)

    # Align users in the interaction and user matrices due to dropping some trails
    # Identify which users are in the interaction matrix and not in user feature space
    key_diff = set(interactions.index).difference(user_features_from_sql.index)
    where_diff = interactions.index.isin(key_diff)

    # Filter interactions based on users present in user features
    interactions = interactions.loc[~interactions.index.isin(interactions[where_diff].index)]

    # Convert sparse dataframe into a sparse matrix
    interactions_matrix = sparse.csr_matrix(interactions.values)

    # Prep for trail dict
    trail_urls = trail_urls_info[['trail_name', 'trail_url']]

    # Convert new user features to a sparse matrix
    user_feature_new_sparse = sparse.csr_matrix(user_feature_new.values)

    ## Combine new user-feature sparse matrix with current users' sparse matrix
    new_user_features = concatenate_csc_matrices_by_columns(user_feature_new_sparse, user_features)

    # Incorporate new user's selections into the interaction matrix
    interactions_new_user_df = pd.DataFrame().reindex_like(interactions).iloc[0:0]
    interactions_new_user_df.loc["new_user"] = 0
    new_interactions_df = pd.concat([interactions_new_user_df, interactions])
    interactions_new_user = sparse.csr_matrix(interactions_new_user_df.values)
    new_interactions_matrix = concatenate_csc_matrices_by_columns(interactions_new_user, interactions_matrix)

    # Make trail dict
    trails_in_interaction_matrix = pd.DataFrame(interactions_new_user_df.columns.T)
    trail_dict_prep = trails_in_interaction_matrix.merge(trail_urls, on='trail_name')

    # Add unique identifier to trail dict
    trail_dict_prep['trail_id'] = trail_dict_prep.index+1

    # Make trail dict
    trails_dict = create_trail_dict(trail_dict_prep, id_col = 'trail_name', name_col = 'trail_id')

    # With the new interactions df we can define a user dictionary
    user_dict = create_user_dict(interactions = new_interactions_df)

    # Run model with new user features and interactions
    NUM_THREADS = 4 # The t2.xlarge instance supports up to 4 cores, we'll use all 4 here
    NUM_COMPONENTS = 30
    NUM_EPOCHS = 5
    ITEM_ALPHA = 1e-6

    # Let's train a WARP model: these generally have the best performance.
    model = LightFM(loss='warp', item_alpha=ITEM_ALPHA, no_components=NUM_COMPONENTS, random_state=15)

    # Fit model
    model = model.fit(interactions=new_interactions_matrix, user_features=new_user_features,
                      epochs=NUM_EPOCHS, num_threads=NUM_THREADS)

    # Run the model
    trail_names, trail_overviews, trail_urls, card_image_urls = new_user_recommendation(model,
                                                                                        new_interactions_df,
                                                                                        user_id="new_user",
                                                                                        trail_urls_info=trail_urls_info,
                                                                                        user_location=user_location,
                                                                                        trail_feature_select1=trail_feature_select1,
                                                                                        trail_feature_select2=trail_feature_select2,
                                                                                        user_dict=user_dict, trail_dict=trails_dict,
                                                                                        nrec_items=1500,
                                                                                        threshold=4)

    # If the 'e' (everywhere) option was selected, show a friendly label
    if user_location == 'e':
      user_location = "all of the Pacific Northwest"

    return render_template('trail_recommendations.html',
                            trail_names = trail_names,
                            trail_overviews = trail_overviews,
                            trail_urls = trail_urls,
                            card_image_urls = card_image_urls,
                            trail_feature_select1 = trail_feature_select1,
                            trail_feature_select2 = trail_feature_select2,
                            user_location = user_location,
                            input_user_features = user_input)
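# The helpers used above (create_interaction_matrix, create_user_dict,
# create_trail_dict, ...) are project-specific. Based only on how it is called
# above, create_interaction_matrix plausibly looks something like the sketch
# below (an assumption, not the original implementation):
def create_interaction_matrix(df, user_col, item_col, rating_col,
                              norm=False, threshold=None):
    # Pivot raw reviews into a dense user x item rating table, 0 for no rating.
    interactions = (df.groupby([user_col, item_col])[rating_col]
                      .sum().unstack().fillna(0))
    if norm:
        # Optionally binarise ratings at the given threshold.
        interactions = (interactions > threshold).astype(float)
    return interactions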
Example #10
import numpy as np
from lightfm.datasets import fetch_movielens
from lightfm import LightFM


data = fetch_movielens(min_rating=4.0)
#print(repr(data['train']))
#print(repr(data['test']))
model = LightFM(loss='warp')

model.fit(data['train'], epochs=50,num_threads=4)

def sample_recommendation(model, data, user_ids):

    n_users, n_items = data['train'].shape
    for user_id in user_ids:

        # movies they already like
        known_positives = data['item_labels'][data['train'].tocsr()[user_id].indices]
        # movies our model predicts they will like
        scores = model.predict(user_id, np.arange(n_items))
        # rank them in order of most liked to least
        top_items = data['item_labels'][np.argsort(-scores)]

        # print out the results
        print("User %s" % user_id)
        print("     Known positives:")

        for x in known_positives[:3]:
            print("        %s" % x)

        print("     Recommended:")
        for x in top_items[:3]:
            print("        %s" % x)
Example #11
                                             for x in qd.getProfiles()))
print(user_features)
'''

# Creating the user feature set
# Split the set in train and test
# random_train_test_split returns (train, test), in that order
train, test = random_train_test_split(interactions,
                                      test_percentage=0.2,
                                      random_state=None)

# Start training the model
print("--- Start model training ---")
model = LightFM(no_components=1, learning_rate=0.027, loss='warp')
model.fit(train,
          item_features=item_features,
          epochs=100,
          num_threads=4,
          verbose=False)
# model.fit(train,epochs=12,num_threads=4)

modelnofeatures = LightFM(no_components=1, learning_rate=0.027, loss='warp')
modelnofeatures.fit(train, epochs=100, num_threads=4, verbose=False)

# model.fit(train,epochs=12,num_threads=4)
'''

with open('saved_model','wb') as f:
     saved_model={'model':model}
     pickle.dump(saved_model, f)
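# Loading the model back later would mirror the save format above
# (a sketch, not part of the original snippet):
with open('saved_model', 'rb') as f:
    model = pickle.load(f)['model']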

Example #12
print(test.shape)

(train_interactions,
 train_weights) = dataset.build_interactions(train[[3, 1]].values)
(test_interactions,
 test_weights) = dataset.build_interactions(test[[3, 1]].values)

# arr = sparse.coo_matrix(np.tile(list(range(2,10)), (len(items), 1)))
# items['features'] = arr.toarray().tolist()
# # item_features = dataset.build_item_features()
# # items2 = items.to_dict('records')

from lightfm import LightFM

model = LightFM(loss='warp', random_state=0)
model.fit(train_interactions, epochs=100, num_threads=1)

from lightfm.evaluation import recall_at_k
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import reciprocal_rank

print("Train recall@7: %.2f" %
      recall_at_k(model, train_interactions, k=7).mean())
print("Test recall@7: %.2f" %
      recall_at_k(model, test_interactions, train_interactions, k=7).mean())
print("Train precision@7: %.2f" %
      precision_at_k(model, train_interactions, k=7).mean())
print("Test precision@7: %.2f" %
      precision_at_k(model, test_interactions, train_interactions, k=7).mean())
print("Train reciprocal rank: %.2f" %
      reciprocal_rank(model, train_interactions).mean())
print("Test reciprocal rank: %.2f" %
Example #13
#pip install numpy
#pip install scipy
#pip install lightfm (implements a number of popular recommendation algorithms)

import numpy as np
from lightfm.datasets import fetch_movielens
from lightfm import LightFM
from fetch_amazonratingonly import fetch_amazonratingonly

#fetch data and format it
data = fetch_amazonratingonly(min_rating=3.0)

#create model WARP
model = LightFM(loss='warp') #Weighted Approximate-Rank Pairwise
#train model
model.fit(data['matrix'], epochs=30, num_threads=2)

# model2 = LightFM(loss='warp-kos') #A modification of WARP that uses the k-th positive example for any given user as a basis for pairwise updates.
# #train model
# model2.fit(data['matrix'], epochs=30, num_threads=2)

# #('logistic', 'warp', 'bpr', 'warp-kos')

# model3 = LightFM(loss='bpr') #Bayesian Personalised Ranking pairwise loss; maximises the prediction difference between a positive and a randomly chosen negative example.
# #train model
# model3.fit(data['matrix'], epochs=30, num_threads=2)


# model4 = LightFM(loss='logistic') #Logistic loss; useful when both positive and negative interactions are present.
# #train model
# model4.fit(data['matrix'], epochs=30, num_threads=2)
# from lightfm.datasets import fetch_movielens
# data = fetch_movielens(min_rating=5.0)
# plt.imshow(data['item_features'].toarray())
# data_dict = {'train':train_mat, 
# 			 'test':test_mat, 
# 			 'item_features': ,
# 			 'item_feature_labels': ,
# 			 'item_labels':}


## Create a model instance with the desired latent dimensionality:
model = LightFM(no_components=30)

## Assuming train is a (no_users, no_items) sparse matrix (with 1s denoting 
## positive, and -1s negative interactions), you can fit a traditional matrix factorization model by calling:
model.fit(train_mat, epochs=20)


print("Train precision: %.2f" % precision_at_k(model, train_mat, k=5).mean())
print("Test precision: %.2f" % precision_at_k(model, test_mat, k=5).mean())

## This will train a traditional MF model, as no user or item features have been supplied.
## To get predictions, call model.predict:
predictions = model.predict(test_user_ids, test_item_ids)



	

model = LightFM(loss='warp',
                random_state=2016)
Example #15
# free memory
del job_embeddings
del resume_embeddings
del interaction_sparse
gc.collect()

##### create and train LightFM model ######
NUM_THREADS = 4
NUM_COMPONENTS = 30
NUM_EPOCHS = 50
ITEM_ALPHA = 1e-6
K_num = 5

model = LightFM(loss='warp'
               , item_alpha=ITEM_ALPHA
               , no_components=NUM_COMPONENTS)

%time model = model.fit(interactions=train, user_features=job_features_sparse, item_features=resume_features_sparse, epochs=NUM_EPOCHS, num_threads=NUM_THREADS)

%time test_precision = precision_at_k(model, test, user_features=job_features_sparse, item_features=resume_features_sparse, k=K_num).mean()
print('test precision at k: %s' %test_precision)

%time train_precision = precision_at_k(model, train, user_features=job_features_sparse, item_features=resume_features_sparse, k=K_num).mean()
print('train precision at k: %s' %train_precision)

%time test_auc = auc_score(model, test,user_features=job_features_sparse, item_features=resume_features_sparse, num_threads=NUM_THREADS).mean()
print('test AUC: %s' %test_auc)

%time train_auc = auc_score(model, train,user_features=job_features_sparse, item_features=resume_features_sparse, num_threads=NUM_THREADS).mean()
print('train AUC: %s' %train_auc)
import numpy
from lightfm import LightFM
from ratingsData import fetch_ratings

#fetch dataset using our own method fetch_data
data = fetch_ratings()

#creating a model using LightFM class off lightfm module
model = LightFM(loss='warp')
model.fit(data['ratings'], epochs=30, num_threads=2)


def recommend_match(model, data, user_ids):
    n_user, n_matches = data['ratings'].shape

    for user_id in user_ids:

        scores = model.predict(user_id, numpy.arange(n_matches))
        topScores = numpy.argsort(-scores)[:3]

        print('recommendation for user : %s' % user_id)

        for x in topScores[:3]:
            print("    %s" % x)


recommend_match(model, data, [1])
Example #17
 def runMF(self, interactions, n_components, learning_rate, loss, k, epoch, n_jobs):
     from lightfm import LightFM
     model = LightFM(no_components=n_components, learning_rate=learning_rate,
                     loss=loss, k=k)
     model.fit(interactions, epochs=epoch, num_threads=n_jobs)
     return model
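# A minimal sketch of calling runMF (assumes `rec` is an instance of the class
# defining the method above; the toy interaction matrix is illustrative):
import numpy as np
import scipy.sparse as sp

toy_interactions = sp.coo_matrix(np.array([[1, 0, 0, 1],
                                           [0, 1, 1, 0],
                                           [1, 1, 0, 0]]))
mf_model = rec.runMF(toy_interactions, n_components=10, learning_rate=0.05,
                     loss='warp', k=5, epoch=10, n_jobs=2)
scores = mf_model.predict(0, np.arange(toy_interactions.shape[1]))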
Example #18
games_df.columns = ['user_id', 'title', 'action', 'hours', 'hz']

games_df = games_df[games_df['action'] == 'purchase']

le = LabelEncoder()
le.fit(games_df['title'])

games_df['title'] = le.transform(games_df['title'])

games_df_pivot = games_df.pivot_table(columns=['title'],
                                      index=['user_id'],
                                      values=['hours'])
games_df_pivot.fillna(value=0, inplace=True)

games_df_pivot_train = games_df_pivot.sample(frac=0.8)
games_df_pivot_test = games_df_pivot.loc[games_df_pivot.index.difference(
    games_df_pivot_train.index)]

games_df_pivot_train_sparse = coo_matrix(games_df_pivot_train.values)
games_df_pivot_test_sparse = coo_matrix(games_df_pivot_test.values)

model = LightFM(loss='warp', random_state=42)
model.fit(games_df_pivot_train_sparse, epochs=150, num_threads=2)

return model.predict([3], [1])

# print("Train precision: %.2f" % precision_at_k(model, games_df_pivot_train_sparse, k=5).mean())
# print("Test precision: %.2f" % precision_at_k(model, games_df_pivot_test_sparse, k=5).mean())

# pickle.dump(model,open('model.pickle','wb'))
Example #19
import numpy as np
import pandas as pd
from lightfm.datasets import fetch_movielens
from lightfm import LightFM
from movies import fetch_movies
import random
movies = fetch_movies()

model = LightFM(loss="warp")
model.fit(movies, epochs=30, num_threads=2)

users = pd.read_csv('users.dat', sep='::')
movies_data = pd.read_csv('movies.dat', sep='::')

user1 = random.choice(users['UserID'])
user2 = random.choice(users['UserID'])
user3 = random.choice(users['UserID'])


def get_recommendation(users, model, movies_matrix, movies_data):
    n_items = movies_matrix.shape[1]
    for user in users:
        scores = model.predict(user, np.arange(n_items))
        topscore = np.argsort(-scores)[:3]

        print('For User ', user)
        print('\t Recommended Movies:')

        for movie in topscore:
            movie_index = np.where(movie == movies_data['MovieID'])[0]
            movie_title = movies_data['Title'][movie_index[0]]
            print('\t\t', movie_title)
Example #20
def process_mpd(playlists_path, target_playlists, output_file,
                prev_songs_window):
    max_prev_song = 0
    previous_tracks = defaultdict(lambda: defaultdict(int))
    playlists_tracks = []
    playlists = []
    playlists_extra = {'name': []}
    filenames = os.listdir(playlists_path)
    for filename in sorted(filenames):
        if filename.startswith("mpd.slice.") and filename.endswith(".json"):
            fullpath = os.sep.join((playlists_path, filename))
            f = open(fullpath)
            js = f.read()
            f.close()
            mpd_slice = json.loads(js)
            for playlist in mpd_slice['playlists']:
                nname = normalize_name(playlist['name'])
                playlists_extra['name'].append(nname)
                tracks = defaultdict(int)

                sorted_tracks = sorted(playlist['tracks'],
                                       key=lambda k: k['pos'])
                prev_track = []
                for track in sorted_tracks:
                    tracks[track['track_uri']] += 1
                    curr_prev_tracks = len(prev_track)
                    for i, song_in_window in enumerate(prev_track):
                        previous_tracks[song_in_window][
                            track['track_uri']] += (i + 1) / curr_prev_tracks
                        previous_tracks[track['track_uri']][
                            song_in_window] += (i + 1) / curr_prev_tracks
                        #previous_tracks[song_in_window][track['track_uri']] += 1
                        #previous_tracks[track['track_uri']][song_in_window] += 1
                        max_prev_song = max(
                            max_prev_song, previous_tracks[track['track_uri']]
                            [song_in_window])
                        max_prev_song = max(
                            max_prev_song, previous_tracks[song_in_window][
                                track['track_uri']])
                    if len(prev_track) == prev_songs_window:
                        prev_track.pop(0)
                    prev_track.append(track['track_uri'])
                playlists_tracks.append(tracks)
                playlists.append(str(playlist['pid']))

    top_pop = []
    for i in previous_tracks.keys():
        top_pop.append((i, np.sum(list(previous_tracks[i].values()))))
    top_pop = sorted(top_pop, key=lambda x: x[1], reverse=True)[:10000]
    top_pop = [t[0] for t in top_pop]

    # Add playlists on testing set
    test_playlists = []
    target = json.load(open(target_playlists))
    train_playlists_count = len(playlists)
    test_playlists_recommended_sum = []
    for playlist in target["playlists"]:
        nname = ""
        if 'name' in playlist:
            nname = normalize_name(playlist['name'])
        playlists_extra['name'].append(nname)
        playlists.append(str(playlist['pid']))
        test_playlists.append(str(playlist['pid']))
        if len(playlist['tracks']) == 0:
            test_playlists_recommended_sum.append(top_pop)
            playlists_tracks.append({})
            continue

        tracks = defaultdict(int)
        for track in playlist['tracks']:
            tracks[track['track_uri']] += 1

        playlists_tracks.append(tracks)
        recommended_pop = defaultdict(list)
        for t in tracks.keys():
            for pt in previous_tracks[t].keys():
                if pt not in tracks:
                    recommended_pop[pt].append(previous_tracks[t][pt] /
                                               max_prev_song)

        recommended_pop_sum = [(t, np.sum(recommended_pop[t]))
                               for t in recommended_pop.keys()]
        recommended_pop_sum = sorted(recommended_pop_sum,
                                     key=lambda x: x[1],
                                     reverse=True)
        recommended_pop_sum = [t[0] for t in recommended_pop_sum]
        test_playlists_recommended_sum.append(recommended_pop_sum)

    print("Data loaded. Creating features matrix")

    dv = DictVectorizer()
    interaction_matrix = dv.fit_transform(playlists_tracks)

    lb = LabelBinarizer(sparse_output=True)
    pfeat = lb.fit_transform(playlists_extra['name'])
    playlist_features = pfeat

    # Need to hstack playlist_features
    eye = sparse.eye(playlist_features.shape[0],
                     playlist_features.shape[0]).tocsr()
    playlist_features_concat = sparse.hstack((eye, playlist_features))

    item_prev = []
    highlevel = []
    for track in dv.feature_names_:
        try:
            f = get_audio_features_dict(track.replace('spotify:track:', ''),
                                        False)
        except ValueError:
            print("Failed loading json", track)
            f = None
        curr_highlevel = {}
        if f is not None:
            curr_highlevel = {k: v for k, v in f.items() if 'class_f' in k}
        highlevel.append(curr_highlevel)

    ifeat_highlevel = DictVectorizer().fit_transform(highlevel)
    item_prev = ifeat_highlevel
    eye = sparse.eye(item_prev.shape[0], item_prev.shape[0]).tocsr()
    item_feat = sparse.hstack((eye, item_prev))

    print("Features matrix created. Training model")
    model = LightFM(loss='warp',
                    no_components=200,
                    max_sampled=30,
                    item_alpha=1e-06,
                    user_alpha=1e-06,
                    random_state=SEED)
    model = model.fit(interaction_matrix,
                      user_features=playlist_features_concat,
                      item_features=item_feat,
                      epochs=150,
                      num_threads=32)
    print("Model Trained")

    user_biases, user_embeddings = model.get_user_representations(
        playlist_features_concat)
    item_biases, item_embeddings = model.get_item_representations(item_feat)

    fuse_perc = 0.7
    with open(output_file, 'w') as fout:
        print('team_info,cocoplaya,creative,[email protected]', file=fout)
        for i, playlist in enumerate(test_playlists):
            playlist_pos = train_playlists_count + i
            y_pred = user_embeddings[playlist_pos].dot(
                item_embeddings.T) + item_biases
            topn = np.argsort(-y_pred)[:len(playlists_tracks[playlist_pos]) +
                                       4000]
            rets = [(dv.feature_names_[t], float(y_pred[t])) for t in topn]
            songids = [
                s for s, _ in rets if s not in playlists_tracks[playlist_pos]
            ]
            songids_dict = {s: 1 for s in songids}
            max_score = max(len(songids),
                            len(test_playlists_recommended_sum[i]))
            pop_sum = {
                s: (max_score - p)
                for p, s in enumerate(test_playlists_recommended_sum[i])
            }
            fuse_sum = []
            for p, s in enumerate(songids):
                pop_val_sum = 0
                if s in pop_sum:
                    pop_val_sum = pop_sum[s]
                fuse_sum.append(
                    (s, ((max_score - p) * fuse_perc + pop_val_sum *
                         (1 - fuse_perc)) / 2))
            for s in pop_sum.keys():
                if s not in songids_dict:
                    fuse_sum.append((s, (pop_sum[s] * (1 - fuse_perc)) / 2))
            fuse_sum = sorted(fuse_sum, key=lambda x: x[1], reverse=True)
            print(' , '.join([playlist] + [x[0] for x in fuse_sum[:500]]),
                  file=fout)
Example #21
    ratings.nonzero()[0])) / (ratings.shape[0] * ratings.shape[1]) * 100

X = csr_matrix(ratings)
n_users, n_items = ratings_df.shape
user_ids = ratings_df.index.values
artist_names = ap.sort_values("artistID")["name"].unique()

Xcoo = X.tocoo()
data = Dataset()
data.fit(np.arange(n_users), np.arange(n_items))
interactions, weights = data.build_interactions(
    zip(Xcoo.row, Xcoo.col, Xcoo.data))
train, test = random_train_test_split(interactions)

model = LightFM(learning_rate=0.05, loss='warp')
model.fit(train, epochs=10, num_threads=2)

# Generating the list of artists at start-up:
artIDs = ap['artistID'].unique()
numarts = len(ap['artistID'].unique())
listart = ""
for it, artName in enumerate(ap['name'].unique()):
    listart = listart + '<input type="checkbox" name="' + str(
        artIDs[it]) + '" value="' + str(artName) + '">' + artName + '<br>'


# get_recommendation from Jupyter notebook:
def get_recommendation(userid, ratings=ratings):
    X = csr_matrix(ratings)
    svd = TruncatedSVD(n_components=1000, n_iter=7, random_state=0)
    X_matrix_svd = svd.fit_transform(X)
Example #22
import numpy as np
from lightfm.datasets import fetch_movielens
from lightfm import LightFM

data = fetch_movielens(min_rating=4.0)

print(repr(data["train"]))
print(repr(data["test"]))

# creating a model
model = LightFM(loss="warp")

#  training the model
model.fit(data["train"], epochs=30, num_threads=2)


def recommendations(model, data, user_ids):

    # num of users and movies in the matrix
    number_users, number_items = data["train"].shape

    for user_id in user_ids:

        # movies they already like
        liked_movies = data["item_labels"][data["train"].tocsr()
                                           [user_id].indices]

        # movies we predict they will like
        M_list = model.predict(user_id, np.arange(number_items))

        # rank them in order of most liked to least
        top_items = data["item_labels"][np.argsort(-M_list)]
Example #23
def main(spark, train_data, validation_data):
    spark_session = SparkSession.builder.appName('extension1').master('yarn').config('spark.executor.memory', '15g').config('spark.driver.memory', '15g').getOrCreate()
    spark.conf.set("spark.sql.execution.arrow.enabled", "true")
    
    #####################################################################
    #LIGHTFM Model
    
    # Read data from parquet
    train_df = spark.read.parquet('hdfs:/user/smt570/small_train.parquet')
    train_df.createOrReplaceTempView('train')
    train_df = train_df.select('user_id','book_id','rating')

    val = spark.read.parquet('hdfs:/user/smt570/small_val.parquet')
    val.createOrReplaceTempView('val')
    val_df = val.select('user_id','book_id','rating')
    #remove ratings less than 3 from ground truth
    val_df = val_df.filter(val_df.rating >= 3)

    #all positive instances for training (rating >=3) keep their values, anything else becomes 0
    eq = udf(lambda x: x if x >=3 else 0, IntegerType())
    train_df = train_df.withColumn('rating',eq(train_df.rating))

    #need to sort first
    train_df = train_df.orderBy('user_id')
    
    print('Building input sparse matrices...')
    #convert to pandas for pre-processing
    train_df = train_df.toPandas()
    val_df = val_df.toPandas()

    #initialize dicts
    transf_train = dict()
    transf_val = dict()

    enc = preprocessing.LabelEncoder()

    #transform data values for train and val
    transf_train['user_id']=enc.fit_transform(train_df['user_id'].values)
    transf_train['book_id'] = enc.fit_transform(train_df['book_id'].values)
    transf_train['rating']=enc.fit_transform(train_df['rating'].values)

    transf_val['user_id']=enc.fit_transform(val_df['user_id'].values)
    transf_val['book_id'] = enc.fit_transform(val_df['book_id'].values)
    transf_val['rating']=enc.fit_transform(val_df['rating'].values)

    #get size of COO matrix
    n_users = len(np.unique(transf_train['user_id']))
    n_items = len(np.unique(transf_train['book_id']))

    #create COO matrices 
    train = coo_matrix((transf_train['rating'],(transf_train['user_id'],transf_train['book_id'])),shape=(n_users,n_items))
    val = coo_matrix((transf_val['rating'],(transf_val['user_id'],transf_val['book_id'])),shape=(n_users,n_items))

    #Build LightFM model
    print('Building LightFM model...')
    model = LightFM(loss = 'warp', no_components = 30)

    #Train LightFM model and check time to fit
    print('Training LightFM model...')
    start_time = time.time()
    model.fit(train)

    print('Run time: {} mins'.format((time.time() - start_time)/60))

    #Get data ready for evaluation, use top k predictions for metrics
    print('Evaluating...')
    pak_train = precision_at_k(model,train,k=125).mean()
    pak_val = precision_at_k(model,val,k=125).mean()

    print('Train precision@K = {}:'.format(pak_train))
    print('Test precision@K = {}:'.format(pak_val))
    
    auc_train = auc_score(model, train).mean()
    auc_test = auc_score(model, val).mean()

    print("Train AUC Score: {}".format(auc_train))
    print("Test AUC Score: {}".format(auc_test))
    
    ###################################################################
    #ALS Model
    
    # Read data from parquet
    train = spark.read.parquet(train_data)
    train.createOrReplaceTempView('train')
    train_data = train.select('user_id','book_id','rating')
    train_data = train_data.filter(train_data.rating !=0)

    val = spark.read.parquet(validation_data)
    val.createOrReplaceTempView('val')
    val_data = val.filter(val.rating >= 3)
    val_data = val.select('user_id','book_id','rating')
    
    #creating ground truth df
    w = Window.partitionBy('user_id').orderBy(col('rating').desc())
    actual = val_data.withColumn("sorted_vals_by_rating", F.collect_list('book_id').over(w))
    actual = actual.groupBy('user_id').agg(F.max('sorted_vals_by_rating').alias('items'))
    
    # Go through parameters
    
    # Build ALS model
    print('Building ALS model...')
    als=ALS(maxIter=5,regParam=0.1,rank=2,userCol="user_id",itemCol="book_id",ratingCol="rating",coldStartStrategy="drop",nonnegative=True)

    #Train ALS model
    print('Training ALS model...')
    start_time = time.time()
    model = als.fit(train_data)

    print('Run time: {} mins'.format((time.time() - start_time)/60))

    # Make predictions on val_data
    print('Making predictions...')
    predictions = model.transform(val_data)

    ####
    #MAP (Method 1)
    predictions = model.transform(val_data)

    #model makes top k predictions for all users
    preds = model.recommendForAllUsers(125)

    #remove StructType
    preds = preds.withColumn('recommendations',explode('recommendations')).select('*')
    preds = preds.select('user_id','recommendations.*')

    #build predictions df: group books by user_id, store as single array of books in rating column
    w = Window.partitionBy('user_id').orderBy(col('rating').desc())
    perUserPredictedItemsDF = preds.select('user_id', 'book_id', 'rating', F.rank().over(w).alias('rank')).where('rank <= 500').groupBy('user_id').agg(expr('collect_list(book_id) as books'))
    windowSpec = Window.partitionBy('user_id').orderBy(col('rating').desc())
    perUserActualItemsDF = val.select('user_id', 'book_id', 'rating', F.rank().over(windowSpec).alias('rank')).groupBy('user_id').agg(expr('collect_list(book_id) as books')) 

    #build df of predictions and ground truth, convert to RDD
    perUserItemsRDD = perUserPredictedItemsDF.join(perUserActualItemsDF, 'user_id').rdd.map(lambda row: (row[1], row[2]))
    rankingMetrics = RankingMetrics(perUserItemsRDD)                            
    pak = rankingMetrics.precisionAt(125)

    print('Precision at k is {}'.format(pak))
Example #24
pos1_train, pos1_test = random_train_test_split(pos1_spr
                                                , test_percentage=0.25
                                                , random_state = None)

### create and train LightFM model ###
NUM_THREADS = 4
NUM_COMPONENTS = 5
NUM_EPOCHS = 30
ITEM_ALPHA = 1e-6

pos1_model = LightFM(loss='warp'
                    , item_alpha=ITEM_ALPHA
                    , no_components=NUM_COMPONENTS)


%time pos1_model = pos1_model.fit(pos1_train, epochs=NUM_EPOCHS, num_threads=NUM_THREADS)

train_auc = auc_score(pos1_model, pos1_train, num_threads=NUM_THREADS).mean()
print('train AUC: %s' %train_auc)
test_auc = auc_score(pos1_model, pos1_test, num_threads=NUM_THREADS).mean()
print('test AUC: %s' %test_auc)

train_precision = precision_at_k(pos1_model, pos1_train, k=10).mean()
print('train precision at k: %s' %train_precision)
test_precision = precision_at_k(pos1_model, pos1_test, k=10).mean()
print('test precision at k: %s' %test_precision)

#################### 4 original resume and FULL job description with tf-idf embeddings ####################
stages = []
for one_job_id in jobIDs:
  pos_tfidf = GenerateTfidfEmbedding(one_job_id, job_text, resume_text)
from lightfm import LightFM

# Fetch data and format it
data = fetch_movielens(min_rating=4.0)

# Print training and testing data
print(repr(data['train']))
print(repr(data['test']))

# Create model
model1 = LightFM(loss='warp')
model2 = LightFM(loss='logistic')
model3 = LightFM(loss='bpr')
model4 = LightFM(loss='warp-kos')
# Train model
model1.fit(data['train'], epochs=30, num_threads=2)
model2.fit(data['train'], epochs=30, num_threads=2)
model3.fit(data['train'], epochs=30, num_threads=2)
model4.fit(data['train'], epochs=30, num_threads=2)


def sample_recommendation(model, data, user_ids):

    # Number of users and movies in training data
    n_users, n_items = data['train'].shape

    # Generate recommendations for each user we input
    for user_id in user_ids:

        # Movies they already like
        # CSR stands for Compressed Sparse Row format. We find all the movies in the training dataset that user_id likes (>=4.0 rating) and we find its indices to index the item labels dataset to get the actual movie names
                                            'book_id', 'rating')
    #     train_csr = scipy.sparse.csr_matrix((train_df['rating'].values, (train_df['user_id'].values, train_df['book_id'].values)))

    #     val_csr = scipy.sparse.csr_matrix((val_df['rating'].values, (val_df['user_id'].values, val_df['book_id'].values)), shape = train_csr.shape)

    assert train_csr.shape == val_csr.shape

    for epoch in epochs:

        for rank in ranks:

            model = LightFM(no_components=rank,
                            loss='warp',
                            learning_rate=0.05)
            start = time.time()
            model.fit(train_coo, epochs=epoch, num_threads=10)
            time_taken_to_fit = time.time() - start
            total = 0

            avg_precision = None
            if calculate_precision_at_k is True:

                for k in ks:

                    _p = precision_at_k(model,
                                        test_interactions=val_csr,
                                        train_interactions=train_csr,
                                        k=k)

                    avg_precision = _p.sum() / len(_p)
    features_generator = ((item_id, ele) for item_id in list_features.keys()
                          for ele in list_features[item_id])
    item_features = train.build_item_features(features_generator,
                                              normalize=False)
    print('End Loading Features.')

    # Train the Model
    print('Training...')
    start = time()
    model = LightFM(no_components=args.emb_K,
                    loss=args.loss,
                    learning_rate=args.lr,
                    random_state=0)
    model.fit(train_interactions,
              item_features=item_features,
              epochs=args.epoch,
              num_threads=args.num_threads,
              verbose=True)
    print('End Training in {0}.'.format(time() - start))

    with open(weight_directory + '_step{0}_LFM.pickle'.format(args.epoch),
              'wb') as dump:
        pickle.dump(model, dump, protocol=pickle.HIGHEST_PROTOCOL)

    # # Evaluation
    print("Evaluation...")
    with open(
            result_directory +
            '_top{0}_ep{1}_LFM.tsv'.format(args.topk, args.epoch), 'w') as out:

        for user_id in range(df_train[0].nunique()):
Example #28
    missing_n = 100
    epoch = 5
    data = np.load(
        os.path.join(tensorflow_data_3_dir, str(missing_n),
                     'ori_matrix_sample_{}.npy'.format(missing_n)))
    a = np.where(data == -1)
    data[a[0], a[1]] = 0
    print(np.sum(data))
    data = coo_matrix(data)

    # print(data.toarray())
    '''repr() converts an object into a form that the interpreter can read'''
    result = np.zeros(data.shape)
    # create model
    model = LightFM(no_components=30,
                    loss='bpr')  # bpr = Bayesian Personalised Ranking pairwise loss

    print(datetime.datetime.now())
    model.fit(data, epochs=epoch, num_threads=2, verbose=True)
    print(datetime.datetime.now())

    n_users, n_items = data.shape

    for i in range(n_users):
        scores = model.predict(i, np.arange(n_items))
        result[i] = scores

    np.save(
        os.path.join(baseline_output_dir,
                     'bpr_{}_wtreview.npy'.format(missing_n)), result)
Example #29
#

#fetch data and format it
data = fetch_movielens(min_rating=4.0)

#print training and testing data
print(repr(data['train']))
print(repr(data['test']))

#CHALLENGE part 2 of 3 - use 3 different loss functions (so 3 different models), compare results, print results for
#the best one. - Available loss functions are warp, logistic, bpr, and warp-kos.

#create model
model = LightFM(loss='warp')
#train model
model.fit(data['train'], epochs=30, num_threads=2)

#CHALLENGE part 3 of 3 - Modify this function so that it parses your dataset correctly to retrieve
#the necessary variables (products, songs, tv shows, etc.)
#then print out the recommended results


def sample_recommendation(model, data, user_ids):

    #number of users and movies in training data
    n_users, n_items = data['train'].shape

    #generate recommendations for each user we input
    for user_id in user_ids:

        #movies they already like
        known_positives = data['item_labels'][data['train'].tocsr()[user_id].indices]
Example #30
from lightfm.data import Dataset

print(get_ratings())
dataset = Dataset()
dataset.fit((x['User-ID'] for x in get_ratings()),
            (x['ISBN'] for x in get_ratings()))

num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))

dataset.fit_partial(items=(x['ISBN'] for x in get_book_features()),
                    item_features=(x['Book-Author']
                                   for x in get_book_features()))

(interactions, weights) = dataset.build_interactions(
    (x['User-ID'], x['ISBN']) for x in get_ratings())
print(repr(interactions))

item_features = dataset.build_item_features(
    ((x['ISBN'], [x['Book-Author']]) for x in get_book_features()))
print(repr(item_features))

from lightfm import LightFM
model = LightFM(loss='bpr')
model.fit(interactions, item_features=item_features)

from lightfm.evaluation import precision_at_k

print("Train precision: %.2f" %
      precision_at_k(model, interactions, k=5).mean())
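# A possible extension (a sketch, not part of the original example): evaluate
# on a random held-out split instead of on the training interactions.
from lightfm.cross_validation import random_train_test_split

train, test = random_train_test_split(interactions, test_percentage=0.2)

model = LightFM(loss='bpr')
model.fit(train, item_features=item_features)

print("Test precision: %.2f" %
      precision_at_k(model, test, train_interactions=train,
                     item_features=item_features, k=5).mean())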
Example #31
import numpy as np
from lightfm.datasets import fetch_movielens
from lightfm import LightFM
from lightfm.evaluation import precision_at_k,auc_score

data = fetch_movielens(min_rating=4.0)

print(repr(data['train']))
print(repr(data['test']))

# model with warp

model_warp = LightFM(loss='warp')
model_warp.fit(data['train'], epochs=30, num_threads=2)

#model with bpr 

model_bpr = LightFM(loss='bpr')
model_bpr.fit(data['train'], epochs=30, num_threads=2)


def recommender(model, data, user_ids):

    n_users, n_items = data['train'].shape

    for user_id in user_ids:
        known_positives = data['item_labels'][data['train'].tocsr()[user_id].indices]

        scores = model.predict(user_id, np.arange(n_items))

        top_items = data['item_labels'][np.argsort(-scores)]
Example #32
# print('Optimal parameters:')
# params = ['epochs', 'learning_rate', 'no_components', 'alpha', 'max_sampled']
# for (p, x_) in zip(params, res_fm.x):
#     print('{}: {}'.format(p, x_))

######## train the model  ########
model = LightFM(
    loss='warp',
    learning_rate=0.036281404040243825,
    no_components=29,
    user_alpha=0.00048625731451155697,
    item_alpha=0.00048625731451155697,
    max_sampled=37,
)
# model.fit(train_data, user_features, food_features, epochs=10, num_threads=20)
model.fit(all_data, epochs=197, num_threads=10)

# patks = evaluation.precision_at_k(model, val_data,
#                                   train_interactions=None,
#                                   # user_features = user_features,
#                                   # item_features = food_features,
#                                   k=20, num_threads=20)

# mapatk = np.mean(patks)
# print(mapatk)

######## predict  ########
preds = []
food_ids_vocab = np.array(list(food_ids_set))
usr_ids_vocab = np.array(list(usr_ids_set))
Example #33
import numpy as np
from lightfm.datasets import fetch_movielens
from lightfm import LightFM

# fetch data and format it
data = fetch_movielens(min_rating=4.0)

# print training and testing data
print(repr(data['train']))
print(repr(data['test']))

# create model
model = LightFM(loss='warp')

#train model
model.fit(data['train'], epochs=30, num_threads=2)

def sample_recommendation(model, data, user_ids):

    # number of users and movies in training datasets
    n_users, n_items = data['train'].shape

    # generate recommendations for each user we input
    for user_id in user_ids:

        # movies they already like
        known_positives = data['item_labels'][data['train'].tocsr()[user_id].indices]

        # movies our model predicts they will like
        scores = model.predict(user_id, np.arange(n_items))
Example #34
train = data['train']
user_idxs = data['user_idxs']
idx_to_userid = data['idx_to_userid']
userid_to_idx = data['userid_to_idx']
idx_to_itemid = data['idx_to_itemid']
itemid_to_idx = data['itemid_to_idx']

fundid_names_df = pd.read_csv('./funds-dataset/fundid_to_name.csv',encoding='cp950')
fundid_to_names = {}

for d in fundid_names_df.to_dict('records'):
    fundid_to_names[d['基金代碼']] = d['基金中文名稱']
#%% 
t1 = time.time()
model_lr = LightFM(learning_rate=0.01, loss='warp')
model_lr.fit(train, epochs=10)
t2 = time.time()
print('model built (lightfm) cost :{:.1f} s'.format(t2-t1))
train_precision = precision_at_k(model_lr, train, k=10).mean()
test_precision = precision_at_k(model_lr, test, k=10).mean()
train_recall = recall_at_k(model_lr,train,k=10).mean()
test_recall = recall_at_k(model_lr,test,k=10).mean()

train_auc = auc_score(model_lr, train).mean()
test_auc = auc_score(model_lr, test).mean()
## on test : Recall- 19.30%, Precision- 1.93%, (AUC-0.91)
print('Recall: train {:.2f}%, test {:.2f}%'.format(100*train_recall,100*test_recall)) 
print('Precision: train {:.2f}% , test {:.2f}%.'.format(100*train_precision, 100*test_precision))
print('AUC: train {:.2f}, test {:.2f}.'.format(train_auc, test_auc))

Example #35
    def build_model(self) -> None:
        """
        Fits model for user-variant recommendations and similar variant recommendations.
        """
        if hasattr(self, 'input_file'):
            logging.info(f'Training the main model with dataset {self.input_file}...')
        else:
            logging.info('Training the model...')

        train_validation, test = train_test_split(
            self.dataset.interactions, **self.config.VALIDATION_PARAMS
        )
        train, validation = train_test_split(
            train_validation, **self.config.VALIDATION_PARAMS
        )

        logging.info(f'train: Type: {type(train)}, Shape: {train.shape}')
        logging.info(f'validation: Type: {type(validation)}, Shape: {validation.shape}')
        logging.info(f'test: Type: {type(test)}, Shape: {test.shape}')

        model = LightFM(**self.config.LIGHTFM_PARAMS)
        warp_auc: List[float] = []
        no_improvement_rounds = 0
        best_auc = 0.0
        epochs = self.config.FIT_PARAMS['epochs']
        early_stopping_rounds = self.config.FIT_PARAMS['early_stopping_rounds']

        logging.info(
            f'Training until validation AUC shows no improvement for {early_stopping_rounds} consecutive rounds...'
        )

        for epoch in range(epochs):
            logging.info(f'Epoch {epoch}...')
            if no_improvement_rounds >= early_stopping_rounds:
                break

            # Use fit_partial so each round continues training the same model;
            # calling fit() here would re-initialise the model every round and
            # defeat the early-stopping logic below.
            model.fit_partial(
                interactions=train,
                item_features=self.dataset.item_features,
                epochs=self.config.FIT_PARAMS['epochs_per_round'],
                num_threads=self.config.FIT_PARAMS['core_count'],
            )
            warp_auc.append(
                auc_score(
                    model=model,
                    test_interactions=validation,
                    item_features=self.dataset.item_features,
                ).mean()
            )

            if warp_auc[-1] > best_auc:
                best_auc = warp_auc[-1]
                no_improvement_rounds = 0
            else:
                no_improvement_rounds += 1

            logging.info(f'[{epoch}]\tvalidation_warp_auc: {warp_auc[-1]}')

        # Rounds up to (and including) the best validation score, whether or
        # not early stopping actually triggered.
        self.num_epochs = len(warp_auc) - no_improvement_rounds
        logging.info('Stopping. Best iteration:')
        logging.info(
            f'[{self.num_epochs - 1}]\tvalidation_warp_auc: {warp_auc[self.num_epochs - 1]}'
        )

        logging.info(f'Calculating AUC score on test set...')
        test_score = auc_score(
            model=model,
            test_interactions=test,
            item_features=self.dataset.item_features,
        ).mean()
        logging.info(f'Test Set AUC Score: {test_score}')

        self.model = model
        self.test_score = test_score
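
    # Hedged sketch (not part of the original class): the early-stopped round
    # count could be reused to retrain on all interactions before serving:
    #
    #     final_model = LightFM(**self.config.LIGHTFM_PARAMS)
    #     for _ in range(self.num_epochs):
    #         final_model.fit_partial(
    #             interactions=self.dataset.interactions,
    #             item_features=self.dataset.item_features,
    #             epochs=self.config.FIT_PARAMS['epochs_per_round'],
    #             num_threads=self.config.FIT_PARAMS['core_count'],
    #         )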
Exemplo n.º 36
0
# NOTE: this fragment assumes that pandas, `tbl3`, `data` (the rating values)
# and `movie_id` are defined earlier in the full script.
import numpy as np
from scipy.sparse import coo_matrix

row = tbl3['UserId'].values - 1
col = tbl3['movie_id_index'].values


shape = (10000, len(movie_id))

sparse_matrix = coo_matrix((data,(row,col)), shape = shape)
print(repr(sparse_matrix))


#tbl2
#
#data2 = tbl2['IfExists'].values
#row2 = tbl2['MovieId'].values - 1
#col2 = tbl2['GenreId'].values - 1
#
#
#sparse_matrix2 = coo_matrix((data2,(row2,col2)), shape = (max(row2)+1,max(col2)+1))
#print(repr(sparse_matrix2))
#print(str(sparse_matrix2.getrow(1)))

from lightfm.datasets import fetch_movielens
from lightfm import LightFM
model = LightFM(loss='warp')
model.fit(sparse_matrix, epochs=30, num_threads=2)
n_users, n_items = sparse_matrix.shape

scores = model.predict(0,np.arange(n_items))
top_items = np.argsort(-scores)
top_items
from lightfm import LightFM

# Set the number of threads; you can increase this
# if you have more physical cores available.
NUM_THREADS = 2
NUM_COMPONENTS = 30
NUM_EPOCHS = 3
ITEM_ALPHA = 1e-6

# Let's fit a WARP model: these generally have the best performance.
model = LightFM(loss='warp',
                item_alpha=ITEM_ALPHA,
                no_components=NUM_COMPONENTS)

# Run 3 epochs and time it.
model = model.fit(train, epochs=NUM_EPOCHS, num_threads=NUM_THREADS)

# Import the evaluation routines
from lightfm.evaluation import auc_score

# Compute and print the AUC score
train_auc = auc_score(model, train, num_threads=NUM_THREADS).mean()
print('Collaborative filtering train AUC: %s' % train_auc)


# We pass in the train interactions to exclude them from predictions.
# This is to simulate a recommender system where we do not
# re-recommend things the user has already interacted with in the train
# set.
test_auc = auc_score(model, test, train_interactions=train, num_threads=NUM_THREADS).mean()
print('Collaborative filtering test AUC: %s' % test_auc)
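
# Hedged addition (not in the original snippet): precision@k complements AUC
# as a measure of top-N recommendation quality.
from lightfm.evaluation import precision_at_k

test_precision = precision_at_k(model, test, train_interactions=train,
                                k=10, num_threads=NUM_THREADS).mean()
print('Collaborative filtering test precision@10: %s' % test_precision)
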
def do_fiber_training(visualization=False):

    if not os.path.isfile(rc.RECOMMENDER_TRAINING) or not os.path.isfile(rc.RECOMMENDER_MODEL):

        yarn_data_matrix = pickle.load(open( rc.YARN_DATA_MATRIX, "rb" ))
        yarn_data_train = sps.coo_matrix(
                                yarn_data_matrix[:int(len(yarn_data_matrix)*0.5)]
                        ) > 0
        yarn_data_test = sps.coo_matrix(
                                yarn_data_matrix[int(len(yarn_data_matrix)*0.5):]
                        ) > 0
        if visualization:
            print(yarn_data_train.shape[0], yarn_data_test.shape[0], len(yarn_data_matrix))

        # Taken from: https://github.com/lyst/lightfm/blob/master/examples/stackexchange/hybrid_crossvalidated.ipynb
        # Set the number of threads; you can increase this
        # if you have more physical cores available.
        NUM_THREADS = 2
        NUM_COMPONENTS = 30
        NUM_EPOCHS = 3
        ITEM_ALPHA = 1e-6

        # Let's fit a WARP model: these generally have the best performance.
        model = LightFM(loss='warp',
                        item_alpha=ITEM_ALPHA,
                        no_components=NUM_COMPONENTS)

        # Run 3 epochs and time it.
        model = model.fit(yarn_data_train, epochs=NUM_EPOCHS, num_threads=NUM_THREADS)



        # Compute and print the AUC score
        train_auc = auc_score(model, yarn_data_train, num_threads=NUM_THREADS).mean()
        print('Collaborative filtering train AUC: %s' % train_auc)


        # We pass in the train interactions to exclude them from predictions.
        # This is to simulate a recommender system where we do not
        # re-recommend things the user has already interacted with in the train
        # set.
        test_auc = auc_score(model, yarn_data_test, train_interactions=yarn_data_train, num_threads=NUM_THREADS).mean()
        print('Collaborative filtering test AUC: %s' % test_auc)

        pickle.dump(yarn_data_matrix,open(rc.RECOMMENDER_TRAINING, 'wb'))
        pickle.dump(model,open(rc.RECOMMENDER_MODEL, 'wb'))
    else:
        yarn_data_matrix = pickle.load(open(rc.RECOMMENDER_TRAINING, 'rb'))
        model = pickle.load(open(rc.RECOMMENDER_MODEL, 'rb'))


    translation_dict = pickle.load(open(rc.YARN_TRANSLATION_DATA, 'rb'))
    print(len(yarn_data_matrix))
    for matrix_id in range(len(yarn_data_matrix)):
        print(matrix_id)
        predictions = model.predict(matrix_id, yarn_data_matrix[matrix_id])
        matches = []
        predictions += abs(np.min(predictions))  # make non-negative
        _max = np.max(predictions)  # find max for normalization
        predictions /= _max  # normalize predictions
        for prediction in range(len(predictions)):
            if predictions[prediction] > 0.9:
                matches.append([translation_dict[prediction], prediction, predictions[prediction]])

        print(translation_dict[matrix_id], matches)
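
# Hedged usage sketch: run the training / prediction pipeline directly.
if __name__ == '__main__':
    do_fiber_training(visualization=True)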