def train_trim_nmf(data, R):
    """Evaluate NMF over k = 2..50 (step 2) with 10-fold cross-validation.

    Each fold's test set is split by trim(testset, R) into three subsets
    (popular / unpopular / high-variance); the mean RMSE per subset is
    accumulated per k.

    Returns:
        list of three lists of mean RMSE values (one entry per k).
    """
    kfold = KFold(n_splits=10)
    rmse_list = [[], [], []]
    for k in range(2, 52, 2):
        print("using k = %d" % k)
        # per-fold RMSE accumulators, one per trimmed subset
        fold_rmse = ([], [], [])
        nmf = NMF(n_factors=k)
        for trainset, testset in kfold.split(data):
            nmf.fit(trainset)
            trimmed_sets = trim(testset, R)
            for scores, subset in zip(fold_rmse, trimmed_sets):
                scores.append(accuracy.rmse(nmf.test(subset)))
        for totals, scores in zip(rmse_list, fold_rmse):
            totals.append(np.mean(scores))
    print("NMF with trim is finished!!")
    return rmse_list
예제 #2
0
def nmf_ratings_predicate(observed_ratings_df,
                          truth_ratings_df,
                          fold='0',
                          phase='eval'):
    """
    nmf_ratings Predicates

    Fits a surprise NMF model on the observed (userId, movieId, rating)
    triples and writes one predicted rating per (user, movie) pair of the
    truth frame via write(..., 'nmf_rating_obs', fold, phase).

    Assumes both frames are indexed by a (userId, movieId) MultiIndex —
    TODO confirm against callers.
    """
    print("NMF predicates")
    nmf_model = NMF()
    reader = Reader(rating_scale=(0.2, 1))
    train_dataset = Dataset.load_from_df(df=observed_ratings_df.reset_index(
    ).loc[:, ['userId', 'movieId', 'rating']],
                                         reader=reader)
    nmf_model.fit(train_dataset.build_full_trainset())

    # make predictions
    predictions = pd.DataFrame(index=truth_ratings_df.index,
                               columns=['rating'])

    # FIX: iterate the (userId, movieId) index directly instead of
    # materializing rows with iterrows() only to read row[0] — the rating
    # column was never used.
    for uid, iid in truth_ratings_df.index:
        predictions.loc[(uid, iid), 'rating'] = nmf_model.predict(uid, iid).est

    write(predictions, 'nmf_rating_obs', fold, phase)
def nmf_compute_high_var_trim_rmse(k):
    """Mean 10-fold RMSE of NMF(n_factors=k) on high-variance-trimmed test sets."""
    model = NMF(n_factors=k, random_state=42)
    fold_scores = []
    for trainset, testset in KFold(n_splits=10, random_state=42).split(R):
        model.fit(trainset)
        trimmed = high_variance_trimming(testset, frequency, variance)
        fold_scores.append(accuracy.rmse(model.test(trimmed), verbose=False))
    mean_rmse = np.mean(fold_scores)
    print('k: %s | RMSE: %f' % (k, mean_rmse))
    return mean_rmse
def nmf_compute_prec_rec(t):
    """Mean 10-fold precision and recall of the best-k NMF on test sets
    trimmed of unpopular users (recommendation-list length t)."""
    fold_precisions, fold_recalls = [], []
    for trainset, testset in KFold(n_splits=10, random_state=42).split(R):
        model = NMF(n_factors=nmf_best_k, random_state=42)
        model.fit(trainset)
        pred = model.test(trim_unpopular_user(testset, t, threshold))

        precision_dict, recall_dict = calculate_precision_recall(
            pred, t, threshold)
        # average the per-user scores for this fold
        fold_precisions.append(np.mean(list(precision_dict.values())))
        fold_recalls.append(np.mean(list(recall_dict.values())))
    return np.mean(fold_precisions), np.mean(fold_recalls)
예제 #5
0
파일: Project3.py 프로젝트: shacocn/EE219
def Q23(col=0):
    """Print the last-listed genre of the top-10 movies for latent-factor
    column `col` of the NMF item-factor matrix V (k = 20).

    Args:
        col: index of the latent-factor column of V to inspect.
    """
    print('Chosen column is ' + str(col))
    data = np.loadtxt('ml-latest-small/ratings.csv',
                      delimiter=',',
                      skiprows=1,
                      usecols=(0, 1, 2))

    row_userId = data[:, :1].astype(int)
    row_movieId = data[:, 1:2].astype(int)
    row_rating = data[:, 2:3]

    # map raw movieId -> dense column index (in sorted-id order)
    sortedId = np.sort(row_movieId.transpose()[0])
    m = {}
    idx = 0
    last = None
    for i in sortedId.tolist():
        if i != last:
            m[i] = idx
            idx += 1
        last = i

    data = load_data()
    model = NMF(n_factors=20)
    # tiny test split: effectively train on (almost) everything
    trainset, testset = train_test_split(data, test_size=0.0001)
    model.fit(trainset)
    U = model.pu  # user-latent factors
    V = model.qi  # item-latent factors

    import csv
    dict_ID_to_genre = {}
    with open('ml-latest-small/movies.csv', 'rt') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='|')
        cnt = 0
        for row in reader:
            if cnt != 0:  # skip the header row
                dict_ID_to_genre[row[0]] = row[1:]
            cnt += 1

    # invert the movieId -> column mapping
    dict_col_to_ID = {v: k for k, v in m.items()}

    # FIX: take the top-10 row indices with argsort instead of looking
    # sorted values up with list.index(), which returns the FIRST
    # occurrence and therefore maps duplicated factor values to the
    # wrong movie.
    top10_idx = np.argsort(V[:, col])[::-1][:10]
    for ind in top10_idx.tolist():
        m_id = dict_col_to_ID[ind]
        genre = dict_ID_to_genre[str(m_id)]
        print(genre[-1])
예제 #6
0
파일: P5.py 프로젝트: EVANMON/BigDataMining
def NMF_trim_filter(ratings, dims, func, mv_dict):
    """10-fold cross-validate NMF over the factor counts in `dims`.

    Each fold's test set is trimmed IN PLACE by func(mv_dict, testset)
    before evaluation.  Plots RMSE and MAE versus the factor count and
    prints the best (minimum-error) factor counts.

    Args:
        ratings: DataFrame with userId / movieId / rating columns.
        dims: sequence of latent-factor counts to try.
        func: trimming callable applied as func(mv_dict, testset).
        mv_dict: auxiliary movie data passed through to `func`.
    """
    reader = Reader(rating_scale=(0.0, 5.0))
    data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']],
                                reader)
    RMSE = np.empty([len(dims)])
    MAE = np.empty([len(dims)])
    # FIX: use None as the "no minimum yet" sentinel; the original used
    # False, which a legitimate 0.0 mean error would be confused with.
    min_RMSE = None
    min_MAE = None
    fac_num_RMSE = 0
    fac_num_MAE = 0
    kf = KFold(n_splits=10, random_state=42)

    for k in range(len(dims)):
        nmf = NMF(n_factors=dims[k], random_state=42)
        test_rmse = np.array([])
        test_mae = np.array([])
        for trainset, testset in kf.split(data):
            nmf.fit(trainset)
            # FIX: removed the unused `full_data = trainset.build_testset()
            # + testset` — dead code that rebuilt the full test set on
            # every fold for nothing.
            func(mv_dict, testset)  # trim the test set in place
            pred = nmf.test(testset)
            test_rmse = np.append(test_rmse, accuracy.rmse(pred,
                                                           verbose=False))
            test_mae = np.append(test_mae, accuracy.mae(pred, verbose=False))
        RMSE[k] = np.mean(test_rmse)
        if min_RMSE is None or RMSE[k] < min_RMSE:
            min_RMSE = RMSE[k]
            fac_num_RMSE = dims[k]

        MAE[k] = np.mean(test_mae)
        if min_MAE is None or MAE[k] < min_MAE:
            min_MAE = MAE[k]
            fac_num_MAE = dims[k]
        print('For k = %i :' % dims[k])
        print('RMSE: ', RMSE[k])
        print('MAE: ', MAE[k])

    plt.plot(dims, RMSE)
    plt.plot(dims, MAE)
    plt.legend(['RMSE', 'MAE'])
    plt.show()
    print('Finishing Plotting...')
    print('For RMSE:')
    print('\t---Optimal number of latent factors is ', fac_num_RMSE)
    print('\t---Minumun Average RMSE is ', min_RMSE)
    print('\nFor MAE:')
    print('\t---Optimal number of latent factors is ', fac_num_MAE)
    print('\t---Minumun Average MAE is ', min_MAE)
예제 #7
0
파일: Project3.py 프로젝트: shacocn/EE219
def Q19to21(qNum):
    """Mean 10-fold RMSE of NMF for k = 2..18 (step 2) on a trimmed test set.

    Args:
        qNum: 12, 13 or 14 — selects popular / unpopular / high-variance
            trimming of each fold's test set.

    Returns:
        list of mean RMSE values, one per k.
    """
    data = load_data()
    kf = KFold(n_splits=10)

    trimFun = {12: popularTrim, 13: unpopularTrim, 14: highVarTrim}
    RMSE = []
    for k in range(2, 20, 2):
        # FIX: the original built NMF() with default factors, silently
        # ignoring the loop variable k.
        nmf = NMF(n_factors=k)
        subRMSE = []
        for trainSet, testSet in kf.split(data):
            nmf.fit(trainSet)
            testSet = trimFun[qNum](testSet)
            nTest = len(testSet)
            # FIX: the original passed nTest as a second print() argument,
            # so %d was never substituted.
            print("test set size after trimming: %d" % nTest)
            predictions = nmf.test(testSet)
            sse = sum(pow(p.est - p.r_ui, 2) for p in predictions)
            # FIX: the original accumulated the squared error but never
            # appended a fold score, so np.mean(subRMSE) was NaN.
            subRMSE.append(np.sqrt(sse / nTest))
        # average of all train-test splits of k-NN
        RMSE.append(np.mean(subRMSE))
    return RMSE
def use_nmf():
    """Fit NMF on the full ml-100k trainset and score the anti-testset.

    Returns:
        [rmse, mae, elapsed_seconds]
    """
    start = time.time()

    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()

    print('Using NMF')
    algo_NMF = NMF()
    algo_NMF.fit(trainset)

    # score every (user, item) pair absent from the trainset
    predictions_NMF = algo_NMF.test(trainset.build_anti_testset())

    performance = [
        accuracy.rmse(predictions_NMF),
        accuracy.mae(predictions_NMF),
    ]
    performance.append(time.time() - start)

    return performance
예제 #9
0
파일: P5.py 프로젝트: EVANMON/BigDataMining
def NMF_bin_pre(ratings, ts, nmf_fac, thrd):
    """Train/test-split NMF and return (binary truth, predicted scores).

    True ratings >= thrd map to 1.0, the rest to 0.0; the scores are the
    raw NMF estimates (for ROC plotting).
    """
    reader = Reader(rating_scale=(0.0, 5.0))
    data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']],
                                reader)
    trainset, testset = train_test_split(data, test_size=ts)
    algo = NMF(n_factors=nmf_fac, random_state=42)
    algo.fit(trainset)
    pre = algo.test(testset)

    # surprise Prediction tuples: index 2 = true rating, index 3 = estimate
    true_rating = np.array([p[2] for p in pre])
    pred_rating = np.array([p[3] for p in pre])

    # binarize the ground truth around the threshold
    bi_rating = np.where(true_rating >= thrd, 1.0, 0.0)

    return bi_rating, pred_rating
예제 #10
0
파일: P5.py 프로젝트: EVANMON/BigDataMining
    bi_rating[zero_idx] = 0.0

    return bi_rating, pred_rating


# Q22: ROC curves for several rating-binarization thresholds
threshold = np.array([2.5, 3, 3.5, 4])
for td in threshold:
    # 10% test split, 18 latent factors, binarize truth at td
    tar, pre = NMF_bin_pre(ratings, 0.1, 18, td)
    plot_roc(pre, tar)

# Q23
reader = Reader(rating_scale=(0.0, 5.0))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
data = data.build_full_trainset()
nmf = NMF(n_factors=20, random_state=42)
nmf.fit(data)
# for each of the first 10 latent factors, print the genres of the 10
# movies with the largest item-factor weights in that column of qi
for i in range(10):
    col = nmf.qi[:, i]
    top_movie = col.argsort()[::-1][:10]
    print('For the %i th column, the top 10 movie genres are:' % (i + 1))
    for j in range(10):
        raw_iid = nmf.trainset.to_raw_iid(top_movie[j])
        gen = movies.loc[movies['movieId'] == raw_iid]['genres'].values
        print('\t--%i :' % (j + 1), gen)

# MF With Bias Filter
# Q24 & 25


def MF_bias_filter(ratings, dims):
    reader = Reader(rating_scale=(0.0, 5.0))
예제 #11
0
plt.show()
fig3.savefig(path + 'fig/Part_8_knn_preVSrec.png')

#define model for training
k_min_rmse = 18  # factor count that minimized RMSE in the earlier sweep
nnmf = NMF(n_factors=k_min_rmse, random_state=1)

#train, test and rank
# Sweep the recommendation-list length t, averaging precision/recall over
# the CV folds.  NOTE(review): only the precision list is appended in this
# excerpt; the matching rec_list_nnmf append presumably follows it.
top_t_list = range(1, 26)
pre_list_nnmf = []
rec_list_nnmf = []
for top_t in top_t_list:
    pre = 0
    rec = 0
    for trainset, testset in kf.split(data_raw):
        nnmf.fit(trainset)
        prediction = nnmf.test(testset)
        # G: per-user true ratings; G_s: per-user predicted ratings
        G = create_dict(testset)
        G_s = create_dict(prediction, if_pred=1)
        R, R_s = threshold_rank_filter(G, G_s, thre=3, top_t=top_t)
        #precision and recall for each fold
        pre_fold = 0
        rec_fold = 0
        for key in R.keys():
            pre_temp, rec_temp = precision_recall(R[key], R_s[key])
            pre_fold += pre_temp
            rec_fold += rec_temp
        pre += pre_fold / len(R)
        rec += rec_fold / len(R)

    pre_list_nnmf.append(pre / num_fold)
예제 #12
0
    y_score = [prediction.est for prediction in predictions]
    fpr, tpr, thresholds = roc_curve(y_true=y_true, y_score=y_score)
    roc_auc = auc(fpr, tpr)

    plotROC(fpr, tpr, roc_auc, threshold)
"""
Question 23: Movie-Latent Factor Interaction
"""
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)
data = data.build_full_trainset()

movieDat = pd.read_csv('ml-latest-small/movies.csv')

nmf = NMF(n_factors=20, biased=False)
nmf.fit(data)

movies = df['movieId'].unique(
)  # identify unique movie IDs from the ratings CSV (9724, already sorted)
V = nmf.qi

# get top 10 movie genres for the first 20 columns of the V matrix
for i in range(20):
    Vcol = V[:, i]

    # convert column of V into a list for processing
    VcolOrig = []
    VcolSort = []
    for j in range(len(Vcol)):
        VcolOrig.append(Vcol[j])  # original array for looking up movie index
        VcolSort.append(Vcol[j])  # sorted array for getting top movies
class DataLoader(object):
    """Load Amazon ratings/price data, fit a surprise NMF model, and cache
    the derived features (predicted ratings, per-category rankings, price
    maps) in a pickle file under ../feature; later runs reload the cache
    instead of recomputing.

    Relies on module-level helpers not visible here: parse (reads the
    gzipped metadata JSON), sample (presumably random.sample), NMF,
    Reader, Dataset (surprise), sparse_matrix — TODO confirm.
    """
    def __init__(self,category,save_name):
        # category: iterable of Amazon category names; save_name: cache
        # file name created under ../feature.
        self.category=category
        self.max_user=10000 #maximum number of user
        self.price_dict={}
        self.price_dict_temp={}
        self.cate_dict={}
        self.cate_dict_temp={}
        self.top_value=15 # top x features in SVD
        self.model=NMF()
        self.topk=500 #maximum items in each category, finding the top k popular
        self.max_price={}
        self.save_path= os.path.join("..", "feature", save_name)
        if not os.path.isfile(self.save_path):
            self.load_data() #load raw data
            #self.create_user_item_matrix()
            self.create_ratings()
            self.gen_new_price_dict()
            self.save_data(self.save_path) #save the feature
        else:
            self.load(self.save_path) #load the feature
        
    
    def load_ratings(self, filename):
        """Read ../data/<filename> as a (user, item, rating, timestamp) CSV."""
        with open(os.path.join("..", "data", filename), "rb") as f:
            ratings = pd.read_csv(f,names=("user","item","rating","timestamp"))
        return ratings
    
    def load_prices(self,filename):
        """Return {asin: price} for items ranked better than self.topk.

        Reviews missing price/salesRank fields are counted and skipped.
        NOTE(review): the bare except also swallows unrelated errors.
        """
        price_dict = {}
        num_no_price=0
        for review in parse(os.path.join("..", "data", filename)):
            try:
                price=review['price']
                asin=review['asin']
                v=list(review['salesRank'].values())[0]
                if v<self.topk:
                    price_dict[asin]=price
            except:
                num_no_price+=1
                continue
        print("filename:",filename)
        print("length of price dict:", len(price_dict))
        print("# of items without price", num_no_price)
        return price_dict
    
    def load_data(self):
        """Load ratings and prices for every category, keeping only items
        with a known price; merge all category ratings into self.ratings."""
        print("Loading data:")
        for i in self.category:
            ratings_name= "ratings_"+i+".csv"
            price_name="meta_"+i+".json.gz"
            ratings_temp=self.load_ratings(ratings_name)
            print(len(ratings_temp))
            price_temp=self.load_prices(price_name)
            # drop ratings for items without a price
            ratings_temp=ratings_temp[ratings_temp['item'].isin(price_temp.keys())]
            print(len(ratings_temp))
            self.price_dict_temp.update(price_temp)
            self.max_price[i]=max(list(price_temp.values()))
            cate_temp={}
            for j in price_temp.keys():
                cate_temp[j]=i
            self.cate_dict_temp.update(cate_temp)
            price_temp.clear()
            # first category: self.ratings does not exist yet, so the
            # merge raises and the except branch initializes it
            try:
                self.ratings=pd.merge(self.ratings,ratings_temp, how='outer')
            except:
                self.ratings=ratings_temp
        print(self.max_price)
        
    #old method
    def create_user_item_matrix(self, user_key="user",item_key="item"):
        """Build a sparse user-item matrix (ratings shifted by -3) plus
        the forward/inverse id mappings.  Unused by __init__ (old method)."""
        n = len(set(self.ratings[user_key]))
        d = len(set(self.ratings[item_key]))
        self.user_mapper = dict(zip(np.unique(self.ratings[user_key]), list(range(n))))
        self.item_mapper = dict(zip(np.unique(self.ratings[item_key]), list(range(d))))

        self.user_inverse_mapper = dict(zip(list(range(n)), np.unique(self.ratings[user_key])))
        self.item_inverse_mapper = dict(zip(list(range(d)), np.unique(self.ratings[item_key])))

        self.user_ind = [self.user_mapper[i] for i in self.ratings[user_key]]
        self.item_ind = [self.item_mapper[i] for i in self.ratings[item_key]]

        self.ratings_matrix = sparse_matrix((self.ratings["rating"]-3, (self.user_ind, self.item_ind)), shape=(n,d))
        print("user-item matrix generated.")
    
    def create_ratings(self):
        """Fit NMF on a sample of max_user users, predict a dense n x d
        rating matrix, and rank every item within its category per user."""
        #C=MBRecsys(self.ratings_matrix,top_value)
        S=set(self.ratings['user'])
        S=sample(S,self.max_user)
        n = len(S)
        d = len(set(self.ratings['item']))
        self.ratings=self.ratings[self.ratings['user'].isin(S)]
        reader=Reader(rating_scale=(1,5))
        data = Dataset.load_from_df(self.ratings[['user', 'item', 'rating']], reader)
        train_set=data.build_full_trainset()
        self.model.fit(train_set)
        
        # re-key price/category dicts from raw asin to inner item id
        self.inv_cate_dict={} #{'categoryA':[],'categoryB':[]}
        for i in self.category:
            self.inv_cate_dict[i]=[]
        for j in train_set.all_items():
            item_raw=train_set.to_raw_iid(j)
            self.inv_cate_dict[self.cate_dict_temp[item_raw]].append(j)
            self.price_dict[j]=self.price_dict_temp[item_raw]
            self.cate_dict[j]=self.cate_dict_temp[item_raw]
        self.cate_dict_temp.clear()
        self.price_dict_temp.clear()
        print("inv_cate_dict constructed.")
        d=0
        for i in self.category:
            d+=len(self.inv_cate_dict[i])
            print(i,':',len(self.inv_cate_dict[i]))
            
        # dense predicted-rating matrix; predict(...)[3] is the estimate
        self.ratings_predict=np.zeros([n,d])
        for i in train_set.all_users():
            user_raw=train_set.to_raw_uid(i)
            for j in train_set.all_items():
                item_raw=train_set.to_raw_iid(j)
                self.ratings_predict[i][j]=self.model.predict(user_raw, item_raw)[3]
        print("predicted ratings generated.")
        
        # rank of each item within its category, per user (1 = best)
        self.ranking=np.zeros([n,d])
        temp={}
        for i in range(n):
            for c in self.category:
                temp[c]=sorted(self.ratings_predict[i][self.inv_cate_dict[c]],reverse=True)
            for j in range(d):
                c=self.cate_dict[j]
                self.ranking[i][j]= temp[c].index(self.ratings_predict[i][j])+1
        print("user_item rankings generated.")
    
    def save_data(self,save_path):
        """Pickle all derived features to save_path."""
        self.dict_all={'prices':self.price_dict,#'raw_ratings':self.ratings_matrix,
                           'new_ratings':self.ratings_predict,'cate':self.cate_dict,
                           'rankings': self.ranking,'max_price':self.max_price,
                           'new_price':self.new_price_dict}
                           #'user_mapper':self.user_mapper, 'item_mapper':self.item_mapper, 
                           #'user_inverse_mapper':self.user_inverse_mapper, 'item_inverse_mapper':self.item_inverse_mapper}
        with open(save_path,'wb') as f:
            pickle.dump(self.dict_all, f)
        print("data saved in ", save_path)
            
    def load(self,save_path):
        """Unpickle the feature cache from save_path into attributes."""
        with open(save_path,'rb') as f:
            self.dict_all=pickle.load(f)
        #self.ratings_matrix =self.dict_all['raw_ratings']
        self.ratings_predict=self.dict_all['new_ratings']
        self.price_dict=self.dict_all['prices']
        self.cate_dict=self.dict_all['cate']
        self.ranking=self.dict_all['rankings']
        self.max_price=self.dict_all['max_price']
        self.new_price_dict=self.dict_all['new_price']
        #self.user_mapper=self.dict_all['user_mapper']
        #self.item_mapper=self.dict_all['item_mapper']
        #self.user_inverse_mapper=self.dict_all['user_inverse_mapper']
        #self.item_inverse_mapper=self.dict_all['item_inverse_mapper']
        self.dict_all.clear()
        del self.dict_all
        print("Saved data loaded.")
    
    def gen_new_price_dict(self):
        """Group item prices by category: {category: {item_id: price}}."""
        self.new_price_dict={}
        for i in self.category:
            self.new_price_dict[i]={}
        for i in range(len(self.cate_dict)):
            self.new_price_dict[self.cate_dict[i]][i]=self.price_dict[i]
        print("new price dictionary generated.")
예제 #14
0
    plt.plot(fpr,
             tpr,
             color='darkorange',
             lw=lw,
             label='ROC curve (area = %0.4f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve for threshold = %0.2f' % thresh)
    plt.legend(loc="lower right")
    plt.show()

#23
trainset, testset = train_test_split(data, test_size=.1, random_state=100)
nmf = NMF(n_factors=20, verbose=False, random_state=100)
nmf.fit(trainset).test(testset)
V = nmf.qi  # item-latent factor matrix (n_items x 20)
k = [item for item in range(0, 20)]
df = pd.read_csv(movies_file_path,
                 names=['movieid', 'title', 'genres'],
                 header=0)
# for each latent-factor column, print genres of the 10 highest-weight rows
# NOTE(review): this indexes df by the inner item id, assuming it matches
# the movies-file row order — confirm against the data.
for i in k:
    print(i)
    mov = V[:, i]
    mov1 = [(n, j) for n, j in enumerate(mov)]
    mov1.sort(key=lambda x: x[1], reverse=True)
    for a in mov1[:10]:
        print(df['genres'][a[0]])
           'RMSE after High Variance Movie Trimming')

# In[42]:

print("Minimum average RMSE after high variance movie trimming: %.4f" %
      np.min(nmf_rmse_high_var_trim))

# <font size=4>**Question 22:** Plot the ROC curves for the NNMF-based collaborative filter designed in Question 17 for threshold values [2.5,3,3.5,4]. For the ROC plotting use the optimal number of latent factors found in Question 18. For each of the plots, also report the area under the curve (AUC) value.</font>

# In[43]:

nmf_best_k = ks[np.argmin(nmf_rmse)]

trainset, testset = train_test_split(R, test_size=0.1, random_state=42)
nmf_best = NMF(n_factors=nmf_best_k, random_state=42)
nmf_best.fit(trainset)
nmf_best_pred = nmf_best.test(testset)

plot_roc_curves(testset, nmf_best_pred, 'NNMF')

# <font size=4>**Question 23:** Perform Non-negative matrix factorization on the ratings matrix R to obtain the factor matrices U and V , where U represents the user-latent factors interaction and V represents the movie-latent factors interaction (use k = 20). For each column of V , sort the movies in descending order and report the genres of the top 10 movies. Do the top 10 movies belong to a particular or a small collection of genre? Is there a connection between the latent factors and the movie genres?</font>

# In[44]:

nmf_k20 = NMF(n_factors=20, random_state=42)
nmf_k20.fit(R.build_full_trainset())

# In[45]:

item_factors = nmf_k20.qi
def main():
    """Run the NMF collaborative-filtering questions (Q17-Q29): k sweeps
    on trimmed datasets, ROC curves at several thresholds, and top-10
    genre inspection of the item-factor columns."""
    # Load data
    reader = Reader(sep=',', rating_scale=(0.0, 5.0), skip_lines=1)
    allMoives = Dataset.load_from_file('ratings.csv', reader=reader)
    popMoives = Dataset.load_from_file('popular.csv', reader=reader)
    unpopMoives = Dataset.load_from_file('unpopular.csv', reader=reader)
    varMoives = Dataset.load_from_file('variance.csv', reader=reader)
    binary = []
    binary.append(Dataset.load_from_file('bin2.5.csv', reader=reader))
    binary.append(Dataset.load_from_file('bin3.csv', reader=reader))
    binary.append(Dataset.load_from_file('bin3.5.csv', reader=reader))
    binary.append(Dataset.load_from_file('bin4.csv', reader=reader))
    with open('movies.csv', 'r', encoding='utf8') as f:
        reader = csv.reader(f, delimiter=',', quotechar='"')
        next(reader, None)  # skip the header row
        movies = {int(movie[0]): movie[2] for movie in reader}

    # NMFs
    ks = range(2, 52, 2)
    mae, rmse = [0] * len(ks), [0] * len(ks)

    def nmf(dataName, data, biased=True):
        """10-fold cross-validate NMF for every k in ks and plot MAE/RMSE."""
        print('Start building NMF with ' + dataName + '!')
        for i, k in enumerate(ks):
            model = NMF(n_factors=k, biased=biased)
            scores = cross_validate(model, data, cv=10)
            mae[i] = scores['test_mae'].mean()
            rmse[i] = scores['test_rmse'].mean()
            print('k = ' + str(k) + ' finished!')
        plt.figure()
        plt.subplot(211)
        plt.plot(ks, mae)
        plt.xlabel('k')
        plt.ylabel('mean absolute error')
        plt.title('Mean absolute error vs. k of ' + dataName)
        plt.subplot(212)
        plt.plot(ks, rmse)
        plt.xlabel('k')
        plt.ylabel('root mean squared error')
        plt.title('Root mean squared error vs. k of ' + dataName)
        print('mae:')
        print(mae)
        print('rmse:')
        print(rmse)
        print('Finish building NMF with ' + dataName + '!')

    # Q17
    nmf('all movies', allMoives)

    # Q18
    optimalK = 4
    print('The optimal number of latent factors is ' + str(optimalK))

    # Q19
    nmf('popular movies', popMoives)

    # Q20
    nmf('unpopular movies', unpopMoives)

    # Q21
    nmf('high variance movies', varMoives)

    # Draw ROC Curve
    thresholds = [2.5, 3, 3.5, 4]

    def drawRoc(model, i, k):
        """Fit `model` on a 90/10 split of binary[i] and plot its ROC/AUC."""
        print('Start drawing ROC curve of NMF with optimal k = ' + str(k) +
              ', threshold = ' + str(thresholds[i]) + '!')
        train, test = train_test_split(binary[i],
                                       train_size=0.9,
                                       test_size=0.1)
        model.fit(train)
        labels = model.test(test)
        y_true = [label.r_ui for label in labels]
        y_pred = [label.est for label in labels]
        fpr, tpr, _ = roc_curve(y_true, y_pred)
        roc_auc = auc(fpr, tpr)

        plt.figure()
        plt.plot(fpr,
                 tpr,
                 color='darkorange',
                 lw=2,
                 label='ROC curve (area = %0.2f)' % roc_auc)
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlim([-0.05, 1.05])
        plt.ylim([-0.05, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC curve of NMF with optimal k = ' + str(k) +
                  ', threshold = ' + str(thresholds[i]))
        plt.legend(loc="lower right")
        print('Finish drawing ROC curve of NMF with optimal k = ' + str(k) +
              ', threshold = ' + str(thresholds[i]) + '!')

    # Q22
    # FIX: the original assigned this model to the name `nmf`, shadowing
    # the helper function above so Q24/Q26-Q28 crashed with
    # "'NMF' object is not callable".
    roc_model = NMF(n_factors=optimalK)
    for i in range(len(thresholds)):
        drawRoc(roc_model, i, optimalK)

    # Q23
    print("Start finding top K!")
    k, col = 20, 5
    topk_model = NMF(n_factors=k)
    trainAllMovies = allMoives.build_full_trainset()
    topk_model.fit(trainAllMovies)
    ids = [[] for _ in range(col)]
    for i in range(col):
        factors = topk_model.qi[:, i]
        s = sorted([[j, factor] for j, factor in enumerate(factors)],
                   key=lambda x: x[1],
                   reverse=True)
        # FIX: the original reused `k` here, clobbering the outer k = 20
        for rank in range(10):
            ids[i].append(s[rank][0])
    genres = [[] for _ in range(col)]
    for i in range(col):
        for j in range(10):
            genres[i].append(movies[int(trainAllMovies.to_raw_iid(ids[i][j]))])
    for i in range(col):
        print('Col ' + str(i + 1) + ':')
        for genre in genres[i]:
            print(genre, end=', ')
        print('')
    print("Finish finding top K!")

    # Q24
    nmf('all movies', allMoives, True)

    # Q25
    optimalKBiased = 2
    # FIX: the original concatenated str + int, raising TypeError
    print('The optimal number of latent factors is ' + str(optimalKBiased))

    # Q26
    nmf('popular movies', popMoives, True)

    # Q27
    nmf('unpopular movies', unpopMoives, True)

    # Q28
    nmf('high variance movies', varMoives, True)

    # Q29
    optimalKBiased = 2
    nmfBiased = NMF(n_factors=optimalKBiased, biased=True)
    for i in range(len(thresholds)):
        drawRoc(nmfBiased, i, optimalKBiased)

    plt.show()
예제 #17
0
    plt.savefig('plot/q22_nmf_roc_' + str(threshold) + '.png')
    plt.clf()


if __name__ == "__main__":
    threshold = [2.5, 3, 3.5, 4]
    file_path = os.path.expanduser("ml-latest-small/ratings_new.csv")
    reader = Reader(sep=',')
    data = Dataset.load_from_file(file_path, reader=reader)

    sim_options = {'name': 'pearson', 'user_based': True}

    trainset, testset = train_test_split(data, test_size=0.1)

    for th in threshold:
        algo = NMF(n_factors=16)
        algo.fit(trainset)
        predictions = algo.test(testset)

        y_true = []
        y_estimate = []

        for row in predictions:
            if row[2] >= th:
                y_true.append(1)
            else:
                y_true.append(0)
            y_estimate.append(row[3])

        plot_roc(y_true, y_estimate, th)
def problems_19_20_21_rmse_pop_unpop_hv():
    """Q19-Q21: mean 10-fold RMSE of NMF for k = 2..50 (step 2) on test
    sets restricted to popular, unpopular and high-variance movies;
    plots each sweep and saves the high-variance scores to CSV.
    """
    x_axis = range(2, 52, 2)

    # group raw ratings by movie id: {movieId: [rating, ...]}
    ratings = {}
    for r in data.raw_ratings:
        if r[1] not in ratings:
            ratings[r[1]] = []
        ratings[r[1]].append(r[2])

    ###############################################################################################

    popular_movies = [x for x in ratings if len(ratings[x]) > 2]
    unpopular_movies = [x for x in ratings if len(ratings[x]) <= 2]

    def _mean_rmse_per_k(keep_movies):
        """Mean 10-fold RMSE of NMF for each k in x_axis, evaluating only
        test ratings whose movie id is in keep_movies."""
        # FIX: set membership — the original scanned a list for every
        # single test rating (accidental O(n*m))
        keep = set(keep_movies)
        kf = KFold(n_splits=10)
        scores = []
        for i in x_axis:
            algo = NMF(i, verbose=False)
            accu = []
            for trainset, testset in kf.split(data):
                algo.fit(trainset)
                test_trim = [x for x in testset if x[1] in keep]
                predictions = algo.test(test_trim)
                accu.append(accuracy.rmse(predictions, verbose=True))
            scores.append(np.mean(accu))
        return scores

    # FIX: the three near-identical CV loops were collapsed into the
    # helper above; behavior (fits, prints, plots) is unchanged.
    rmse_popular_store = _mean_rmse_per_k(popular_movies)
    plotgraphs(x_axis, rmse_popular_store, 'K', 'Mean RMSE scores',
               'Plot of popular movies', 'q19_rmse_popular_movies.png')
    plotgraphs(x_axis, rmse_popular_store, 'K', 'Mean RMSE scores',
               'Plot of popular movies')

    ##########################################################################################

    rmse_unpopular_store = _mean_rmse_per_k(unpopular_movies)
    plotgraphs(x_axis, rmse_unpopular_store, 'K', 'Mean RMSE scores',
               'Plot of unpopular movies', 'q20_rmse_unpopular_movies.png')
    plotgraphs(x_axis, rmse_unpopular_store, 'K', 'Mean RMSE scores',
               'Plot of unpopular movies')

    ############ rates  "key" id, values are ratings #######################################
    movie_var = {k: np.var(v) for k, v in ratings.items()}

    ####################################################################################
    highvar_movies = [
        x for x in ratings if len(ratings[x]) >= 5 and movie_var[x] >= 2
    ]
    ##################################################################################

    rmse_highvar_store = _mean_rmse_per_k(highvar_movies)

    pd.DataFrame(rmse_highvar_store).to_csv("rmse_highvar_store_21.csv")
    plotgraphs(x_axis, rmse_highvar_store, 'K', 'Mean RMSE scores',
               'Plot of high variance movies', 'q21_rmse_high_var_movies.png')