def eval_branches(tree_results):
    """Score tree-level predictions.

    Args:
        tree_results: Mapping whose values each provide the entries
            ``'true_label'``, ``'tree_prediction_from_avg_softmax'`` and
            ``'softmax_raw_avg'``.

    Returns:
        dict with the keys ``'accuracy'``, ``'macroF'`` (macro-averaged F1
        over the labels 0, 1 and 2) and ``'rmse_softmax'``.
    """
    Y_true = []
    Y_pred = []
    confidence_softmax = []

    # One pass over the entries instead of three separate comprehensions.
    for tree_id in list(tree_results.keys()):
        entry = tree_results[tree_id]
        Y_true.append(entry['true_label'])
        Y_pred.append(entry['tree_prediction_from_avg_softmax'])
        # Confidence is the largest value of the averaged softmax output.
        confidence_softmax.append(np.max(entry['softmax_raw_avg']))

    return {
        'accuracy': accuracy_score(Y_true, Y_pred),
        'macroF': f1_score(Y_true, Y_pred, average='macro', labels=[0, 1, 2]),
        # rmse() is the project's three-argument helper (labels, predictions,
        # confidences); its exact weighting is defined outside this block.
        'rmse_softmax': rmse(Y_true, Y_pred, confidence_softmax),
    }
Пример #2
0
    def fitting_wrapper(self, df, df_out):
        """Fit the model to ``df`` and add one summary row to ``df_out``.

        A ``FitMedlyn(fluxnet=False)`` instance is fitted against
        ``df["Cond"]``. When the fit reports success, a row labelled with
        ``self.out_cols`` is built from the fitted ``g1`` (value and standard
        error), the squared Pearson correlation and RMSE between ``df["Cond"]``
        and the model output, the number of points, and the first-row
        metadata of ``df``. ``g0`` is fixed at 0.0 when evaluating the model.

        Args:
            df: DataFrame with at least the columns read below.
            df_out: DataFrame collecting one row per successful fit.

        Returns:
            ``df_out`` unchanged when the fit failed, otherwise a new
            DataFrame with the summary row appended.
        """
        F = FitMedlyn(fluxnet=False)
        params = F.setup_model_params()
        (result, success) = F.minimise_params(params, df, df["Cond"])

        if not success:
            return df_out

        g1 = result.params['g1'].value
        g1_se = result.params['g1'].stderr
        g0 = 0.0
        model = F.gs_model(df["VPD"], df["Photo"], df["CO2S"], g0, g1)
        rsq = (pearsonr(df["Cond"], model)[0])**2
        num_pts = len(df["Cond"])
        rmse_val = rmse(df["Cond"], model)

        # Metadata is constant within the group, so the first row is used.
        meta_names = ("Species", "Datacontrib", "Location", "latitude",
                      "longitude", "PFT", "Pathway", "Type", "Plantform",
                      "Leafspan", "Tregion")
        values = [df.Scale.iloc[0], g1, g1_se, rsq, rmse_val, num_pts]
        values.extend(getattr(df, name).iloc[0] for name in meta_names)
        row = pd.Series(values, index=self.out_cols)

        # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
        # pd.concat with a one-row frame is the supported equivalent.
        # infer_objects() restores per-column dtypes lost by the transpose.
        row_frame = row.to_frame().T.infer_objects()
        return pd.concat([df_out, row_frame], ignore_index=True)
    def fitting_wrapper(self, df, df_out):
        """Run the fit for one group and record its statistics in ``df_out``.

        Fits ``FitMedlyn(fluxnet=False)`` to ``df["Cond"]``. On success a
        row indexed by ``self.out_cols`` is created holding: scale, ``g1``,
        its standard error, r-squared (squared Pearson correlation between
        ``df["Cond"]`` and the model), RMSE, point count and the first-row
        metadata columns of ``df``. The model is evaluated with ``g0 = 0.0``.

        Args:
            df: Input DataFrame for a single group.
            df_out: Accumulator DataFrame.

        Returns:
            A DataFrame: ``df_out`` itself if the fit was unsuccessful,
            otherwise ``df_out`` with the new row concatenated at the end.
        """
        F = FitMedlyn(fluxnet=False)
        params = F.setup_model_params()
        (result, success) = F.minimise_params(params, df, df["Cond"])

        if success:
            g1 = result.params['g1'].value
            g1_se = result.params['g1'].stderr
            g0 = 0.0
            model = F.gs_model(df["VPD"], df["Photo"], df["CO2S"], g0, g1)
            rsq = (pearsonr(df["Cond"], model)[0])**2
            num_pts = len(df["Cond"])
            rmse_val = rmse(df["Cond"], model)

            row = pd.Series(
                [
                    df.Scale.iloc[0], g1, g1_se, rsq, rmse_val, num_pts,
                    df.Species.iloc[0], df.Datacontrib.iloc[0],
                    df.Location.iloc[0], df.latitude.iloc[0],
                    df.longitude.iloc[0], df.PFT.iloc[0],
                    df.Pathway.iloc[0], df.Type.iloc[0],
                    df.Plantform.iloc[0], df.Leafspan.iloc[0],
                    df.Tregion.iloc[0],
                ],
                index=self.out_cols,
            )

            # DataFrame.append no longer exists in pandas >= 2.0. Build a
            # one-row frame (re-inferring dtypes after the transpose) and
            # concatenate it instead.
            df_out = pd.concat(
                [df_out, row.to_frame().T.infer_objects()],
                ignore_index=True,
            )

        return df_out
Пример #4
0
    def update_fit_stats(self, d, result, F, df):
        """Store the fit statistics in ``d`` and return it with the model.

        Args:
            d: Mutable mapping that receives ``g1``, ``g1_se``, ``g0``,
                ``rsq``, ``num_pts`` and ``rmse_val``.
            result: Fit result exposing ``params['g1']`` with ``value`` and
                ``stderr`` attributes.
            F: Object providing ``gs_model``.
            df: Data with the columns ``VPD_f``, ``GPP_f``, ``CO2`` and
                ``gs_est``.

        Returns:
            Tuple ``(d, model)`` where ``model`` is the ``gs_model`` output.
        """
        g1_param = result.params['g1']
        d['g1'] = g1_param.value
        d['g1_se'] = g1_param.stderr
        d['g0'] = 0.0  # intercept is held at zero

        model = F.gs_model(
            df["VPD_f"], df["GPP_f"], df["CO2"], d['g0'], d['g1']
        )

        observed = df["gs_est"]
        correlation = pearsonr(observed, model)[0]
        d['rsq'] = correlation**2
        d['num_pts'] = len(observed)
        d['rmse_val'] = rmse(observed, model)

        return (d, model)
    def update_fit_stats(self, d, result, F, df):
        """Fill ``d`` with the fitted parameter and goodness-of-fit values.

        Writes ``g1`` / ``g1_se`` (from ``result.params['g1']``), a fixed
        ``g0`` of 0.0, the squared Pearson correlation between
        ``df["gs_est"]`` and the model (``rsq``), the number of points
        (``num_pts``) and the RMSE (``rmse_val``).

        Returns:
            ``(d, model)``; ``model`` is what ``F.gs_model`` returned.
        """
        fitted = result.params['g1']
        d['g1'] = fitted.value
        d['g1_se'] = fitted.stderr
        d['g0'] = 0.0

        drivers = (df["VPD_f"], df["GPP_f"], df["CO2"])
        model = F.gs_model(drivers[0], drivers[1], drivers[2],
                           d['g0'], d['g1'])

        gs_est = df["gs_est"]
        r_value = pearsonr(gs_est, model)[0]
        d['rsq'] = r_value**2
        d['num_pts'] = len(gs_est)
        d['rmse_val'] = rmse(gs_est, model)

        return (d, model)
Пример #6
0
def nfoldCrossValidation(Xtrain, Ytrain, K, maxIter, nfold=5, LambdaRange=None, MuRange=None, GammaRange=None, BetaRange=None):
    """Grid-search (Lambda, Mu, Gamma, Beta) with n-fold cross-validation.

    For every combination of the four ranges, ``MTL`` is trained on each of
    the ``nfold`` splits produced by ``nfoldsplit`` and evaluated with
    ``rmse`` on the held-out part. The combination with the lowest mean
    RMSE across folds wins.

    Args:
        Xtrain, Ytrain: Training data, passed through to ``nfoldsplit``.
        K: Passed through to ``MTL``.
        maxIter: Passed through to ``MTL``.
        nfold: Number of folds; folds are numbered 1..nfold.
        LambdaRange, MuRange, GammaRange, BetaRange: Iterables of candidate
            values. A range left as ``None`` falls back to the single value
            0.0, which is what the formerly hard-coded
            ``np.linspace(0, end, 1)`` grids produced.

    Returns:
        ``(Lambda_opt, Mu_opt, Gamma_opt, Beta_opt, parameters)``.
        ``parameters`` holds ``'W'``, ``'L'``, ``'S'`` and ``'Omega'`` from
        the LAST fold trained with the winning combination. The optima are
        empty lists and ``parameters`` is empty if no combination produced
        a finite mean RMSE.
    """
    import itertools

    # Iterating a None default raised an opaque TypeError; use the legacy
    # single-point grid instead.
    fallback = (0.0,)
    LambdaRange = fallback if LambdaRange is None else LambdaRange
    MuRange = fallback if MuRange is None else MuRange
    GammaRange = fallback if GammaRange is None else GammaRange
    BetaRange = fallback if BetaRange is None else BetaRange

    minRMSE = float("inf")
    Lambda_opt, Mu_opt, Gamma_opt, Beta_opt = [], [], [], []
    parameters = {}

    # product() visits combinations in the same order as four nested loops
    # and also copes with one-shot iterators.
    grid = itertools.product(LambdaRange, MuRange, GammaRange, BetaRange)
    for Lambda, Mu, Gamma, Beta in grid:
        print('Lambda={},Mu={},Gamma={},Beta={}'.format(Lambda, Mu, Gamma, Beta))
        err = np.zeros((nfold, 1))
        for i in range(1, nfold + 1):
            print('i={}'.format(i))
            # Data for the i-th fold.
            X, Xcv, Y, Ycv = nfoldsplit(Xtrain, Ytrain, nfold, i)
            W, L, S, Omega = MTL(X, Y, K, Lambda, Mu, Gamma, Beta, maxIter)
            # Cross-validation error, averaged over whatever rmse() returns.
            testrmse = rmse(Xcv, Ycv, W)
            err[i - 1, 0] = np.mean(testrmse)

        meanTestRMSE = np.mean(err)
        if minRMSE > meanTestRMSE:
            minRMSE = meanTestRMSE
            Lambda_opt, Mu_opt, Gamma_opt, Beta_opt = Lambda, Mu, Gamma, Beta
            parameters['W'] = W
            parameters['L'] = L
            parameters['S'] = S
            parameters['Omega'] = Omega

    return Lambda_opt, Mu_opt, Gamma_opt, Beta_opt, parameters
        


          

    
Пример #7
0
def run():
    """Compute and persist the test RMSE for folds 1 to 5.

    For each fold a shell pipeline pastes field 3 of
    ``<source>/<fold>/test`` next to ``<output>/<fold>/test.predict`` into
    ``tmp.txt``. ``rmse.rmse`` is then run on that file and the result is
    stored on the module-level ``entity``, which is printed and persisted.
    """
    import subprocess

    for i in range(1, 6):
        fold = 'fold' + str(i)
        command = ("cut " + source + "/" + fold + "/test"
                   + " -d' ' -f 3 | paste -d ' ' - "
                   + output + "/" + fold + "/test.predict  > tmp.txt")
        # os.popen() returns without waiting for the pipeline, so tmp.txt
        # could be read while still empty or half written.
        # subprocess.call() blocks until the shell command has finished.
        subprocess.call(command, shell=True)
        r = rmse.rmse(r'tmp.txt')
        entity.setValue('fold_name', 'FOLD' + str(i))
        entity.setValue('subset_name', 'TEST')
        entity.setValue('rmse', str(r))
        print(entity)
        entity.persist()
Пример #8
0
def run():
    """Evaluate the five test folds and persist one RMSE record per fold.

    Each iteration builds ``tmp.txt`` with a ``cut | paste`` shell pipeline
    (field 3 of the fold's ``test`` file beside its ``test.predict`` file),
    computes ``rmse.rmse('tmp.txt')`` and writes fold name, subset name and
    RMSE to the module-level ``entity`` before printing and persisting it.
    """
    import subprocess

    for i in range(1, 6):
        fold = 'fold' + str(i)
        command = "cut " + source + "/" + fold + "/test" + " -d' ' -f 3 | paste -d ' ' - " + output + "/" + fold + "/test.predict  > tmp.txt"
        # Wait for the pipeline to finish before reading tmp.txt.
        # The previous os.popen() call did not block, leaving a race
        # between the shell writing the file and rmse.rmse reading it.
        subprocess.call(command, shell=True)
        r = rmse.rmse(r'tmp.txt')
        entity.setValue('fold_name', 'FOLD' + str(i))
        entity.setValue('subset_name', 'TEST')
        entity.setValue('rmse', str(r))
        print(entity)
        entity.persist()
def eval_timeline(predictions):
    """Score timeline-level predictions.

    Args:
        predictions: Mapping with the entries ``'true_label'``,
            ``'prediction_from_softmax_raw'`` and ``'softmax_raw'``; the
            latter is indexed by position 0..len(true_label)-1.

    Returns:
        dict with the keys ``'accuracy'``, ``'macroF'`` (macro-averaged F1
        over the labels 0, 1 and 2) and ``'rmse'``.
    """
    Y_true = predictions['true_label']
    Y_pred = predictions['prediction_from_softmax_raw']

    # Confidence of each prediction is the largest raw softmax value.
    confidence = []
    for position in range(len(Y_true)):
        confidence.append(np.max(predictions['softmax_raw'][position]))

    return {
        'accuracy': accuracy_score(Y_true, Y_pred),
        'macroF': f1_score(Y_true, Y_pred, average='macro', labels=[0, 1, 2]),
        'rmse': rmse(Y_true, Y_pred, confidence),
    }
        basic_u[:, u] = numpy.dot(numpy.dot(R_matrix[u, :], basic_v.T), numpy.linalg.pinv(numpy.asmatrix(numpy.dot(basic_v, basic_v.T))))

    for v in range(m): #for each row
        basic_v[:, v] = numpy.dot(numpy.dot(R_matrix[:, v].T, basic_u.T), numpy.linalg.pinv(numpy.asmatrix(numpy.dot(basic_u, basic_u.T))))

    basic_R = numpy.dot(basic_u.T, basic_v)
# NOTE(review): unresolved merge conflict. The marker lines below are not valid
# Python, so this script cannot run until one side (or a merge of both) is
# chosen. The HEAD side stops the profiler `pr` and dumps its statistics; the
# other side recomputes `err` from `basic_R` and increments `i`, indented as
# loop-body code. Which resolution is intended cannot be told from here.
<<<<<<< HEAD
pr.disable()
s = StringIO.StringIO()
sortby = "cumulative"
ps = pstats.Stats(pr, stream = s).sort_stats(sortby)
ps.print_stats()
ps.dump_stats("output_stats.txt")
#print s.getvalue()
=======
    err = rmse(R_matrix, basic_R)
    i += 1
>>>>>>> 3db25c535f07af3fc549e118689b68ad6a31d0d1
# Elapsed wall-clock time of the preceding (basic) factorization.
t1 = time.time()
basic_time = t1 - t0
# ZERO MATRIX FACTORIZATION
# Same alternating update as above, but each update only uses the entries of
# R_matrix that are non-zero (selected through the `!= 0` masks).

# NOTE(review): `un` is not used in the visible code; `um` bounds the row
# slice of R_matrix in the v-update below.
un,um = zero_u.shape
t0 = time.time()
for i in range(iterations):
    for u in range (n): # u = row of R_matrix
        # Update column u of zero_u from the non-zero entries of row u.
        zero_u[:, u] = numpy.dot(numpy.dot(R_matrix[u,R_matrix[u,:]!=0], zero_v[:,R_matrix[u,:]!=0].T), numpy.linalg.pinv(numpy.asmatrix(numpy.dot(zero_v[:,R_matrix[u,:]!=0], zero_v[:,R_matrix[u,:]!=0].T))))

    for v in range(m): # v = column of R_matrix
        # NOTE(review): R_matrix is masked with R_matrix[:, v] but zero_u with
        # R_matrix[0:um, v]; the two masks only have the same length when
        # R_matrix has exactly `um` rows. Confirm.
        zero_v[:, v] = numpy.dot(numpy.dot(R_matrix[R_matrix[:,v]!=0, v].T, zero_u[:, R_matrix[0:um,v]!=0].T), numpy.linalg.pinv(numpy.asmatrix(numpy.dot(zero_u[:, R_matrix[0:um,v]!=0], zero_u[:, R_matrix[0:um,v]!=0].T))))
Пример #11
0
    diff = ground_truth - predicted_value
    diff_square = np.dot(diff, diff)
    #rmse = np.sqrt(np.divide(diff_square, ground_truth.shape[0]))
    rmse = np.sqrt(diff_square/ground_truth.shape[0])
    return rmse
    """


# In[ ]:


# 1-dimensional input: evaluate the single-feature model on the test set.
# Column 0 of the test data is taken as an (n, 1) column vector.
test_fixed_acidity = red_test_data[:, 0][:, np.newaxis]
# Append a constant column of ones (bias term) to the feature column.
test_X_acidity = np.concatenate(
    (test_fixed_acidity,
     np.ones((test_fixed_acidity.shape[0], 1), dtype=int)),
    axis=1,
)
predicted_score_acidity = np.dot(test_X_acidity, train_w_acidity.T)
#predicted_score_acidity = predicted_value(train_fixed_acidity, test_fixed_acidity, red_test_score)
rmse.rmse(predicted_score_acidity, red_test_score)
#0.7860892754162216


# In[ ]:


# Full input: all columns of the test data plus the bias column.
test_X = np.concatenate(
    (red_test_data, np.ones((red_test_data.shape[0], 1), dtype=int)),
    axis=1,
)
predicted_score = np.dot(test_X, w_all.T)
rmse.rmse(predicted_score, red_test_score)
#0.644717277241364

Пример #12
0
   def hierarchical(self, cluster_goal=25):
      """Cluster ``self.elements`` agglomeratively and label each document.

      Starting from one cluster per element, the two clusters returned by
      ``self.findCloseClusters`` are merged with ``self.combineClusters``
      until at most ``cluster_goal`` clusters remain. Every document is then
      labelled with the most frequent file abbreviation in its cluster. A
      report (per-document guesses, an RMSE over the 0/1 miss indicators,
      and the contents of ``self.global_matrix``) is written to the file
      ``matrix`` in the current directory.

      Args:
         cluster_goal: Stop merging once this many clusters (or fewer)
            remain.

      Returns:
         List of ``(short_file, predicted_abbreviation)`` tuples, one per
         document.
      """
      clusters = [[element] for element in self.elements]

      while len(clusters) > cluster_goal:
         sys.stdout.write('\rCombining %d clusters' % len(clusters))
         (clust_A, clust_B) = self.findCloseClusters(clusters)
         clusters = self.combineClusters(clusters, clust_A, clust_B)
         gc.collect()

      predictions = []
      for cluster in clusters:
         # Count how often each file abbreviation occurs in this cluster.
         doc_freq = dict()
         for doc in cluster:
            file_abbrev = re.sub('_.*-', '', doc.get_short_file())
            doc_freq[file_abbrev] = doc_freq.get(file_abbrev, 0) + 1

         # Pick the abbreviation with the highest count. A bare
         # max(doc_freq) would compare the keys themselves and return the
         # alphabetically last abbreviation instead of the plurality one.
         plurality = max(doc_freq, key=doc_freq.get)
         for doc in cluster:
            predictions.append((doc.get_short_file(), plurality))

      # The with-block closes the report even if a write or rmse() fails.
      with open('matrix', 'w+') as sim_matrix:
         author_rmse_calc = []
         for prediction in predictions:
            # The label minus its last two characters is used as a regex
            # prefix pattern against the document's short file name.
            if re.match(prediction[1][:-2], prediction[0]):
               author_rmse_calc.append(0)
               sim_matrix.write("**match**\n")
            else:
               author_rmse_calc.append(1)
            sim_matrix.write("guess %s => %s\n" % (prediction[0], prediction[1]))

         # A perfect run has zero misses, so compare against all zeros.
         author_actual = [0] * len(author_rmse_calc)
         rmse_val = rmse(author_rmse_calc, author_actual)
         sim_matrix.write("\nrmse: '%f'\n\n" % rmse_val)

         # Header row followed by one row per author of the matrix.
         for author in self.global_matrix.keys():
            sim_matrix.write(",%s" % author.strip())
         sim_matrix.write("\n")

         for auth_A in self.global_matrix.keys():
            sim_matrix.write("%s," % auth_A.strip())
            tmp_matrix = self.global_matrix[auth_A]

            for auth_B in self.global_matrix.keys():
               if auth_B in tmp_matrix:
                  sim_matrix.write("%.03f," % tmp_matrix[auth_B])
               else:
                  sim_matrix.write("--,")

            sim_matrix.write("\n")

      return predictions
Пример #13
0
def problem2(test_docs, train_docs, lower_bound=0.25, upper_bound=0.75,curr_classifier="NaiveBayes"):
    """Cross-validate a rating classifier on the training documents.

    Only entry 3 of each document's ``get_pars()`` / ``get_ratings()`` is
    used. The usable documents are split at random into folds; for each
    fold a classifier is trained on the remaining folds after a per-rating
    word-frequency filter, the held-out fold is scored, and the fold RMSE is
    printed. The average RMSE over all folds is printed at the end.

    Args:
        test_docs: Not used anywhere in this function body.
        train_docs: Documents providing ``get_pars()``, ``get_ratings()``,
            ``get_filename()``, ``filename`` and ``author``.
        lower_bound: Fraction (by frequency rank, ascending) below which
            words are put into the per-rating filter list.
        upper_bound: Fraction (by frequency rank, ascending) from which
            words are put into the per-rating filter list.
        curr_classifier: ``"NaiveBayes"``, ``"Maxent"``, ``"DecisionTree"``
            or ``"MaxBayes"`` (NaiveBayes and Maxent averaged); any other
            value falls back to NaiveBayes.

    Returns:
        None. Results are only printed.
    """
    data_list = []
    two_count, three_count, four_count, five_count, all_count = 0,0,0,0,0

    #creates awesome data object for each document
    for doc in train_docs:
        #pars = [par.lower() for par in doc.get_pars() if par is not None]
        #ratings = [rating for rating in doc.get_ratings() if rating is not None]
        par = doc.get_pars()[3]
        rating = doc.get_ratings()[3]
        if (par is not None and rating is not None):
            #Tally up the rating counts from the training data
            # NOTE(review): ratings are compared as strings; the chain mixes
            # `elif` and `if`, which is harmless here because the values are
            # mutually exclusive.
            if(rating == '2'):
                two_count += 1
            elif(rating == '3'):
                three_count += 1
            if(rating == '4'):
                four_count += 1
            if(rating == '5'):
                five_count += 1
            data_list.append(data(par, rating, doc.filename, doc.author))
        else:
            print "Found bad review by -> " + doc.author + " (this comes from: par = None) in file '%s'" % doc.get_filename()

    all_count = two_count + three_count + four_count + five_count

    #4 fold cross validation
    fold_size = math.floor(len(data_list) / DEFAULT_NUM_FOLDS)

    # NOTE(review): exactly four fold lists are created, but the fold index
    # below is drawn from range(DEFAULT_NUM_FOLDS); this only works when
    # DEFAULT_NUM_FOLDS <= 4.
    folds = [[], [], [], []]

    # Work on a deep copy because items are deleted as they are assigned.
    temp_data_list = copy.deepcopy(data_list)
    
    #divide into 4 folds
    # Pick a random remaining item and a random fold; the item is only moved
    # when that fold is not yet over its size cap, otherwise the loop retries.
    while len(temp_data_list) != 0:
        ndx = random.randrange(0, len(temp_data_list))
        fold = random.randrange(0, DEFAULT_NUM_FOLDS)

        if len(folds[fold]) <= fold_size + 1:
            folds[fold].append(temp_data_list[ndx])
            del(temp_data_list[ndx])
    
    rmses = []
    #Big loop: For each fold
    for fold_num in range(DEFAULT_NUM_FOLDS):
        # The current fold is held out; all other folds form the training set.
        test_data = folds[fold_num]
        train_data = []
        for ndx in range(len(folds)): 
            if ndx != fold_num:
                train_data.extend(folds[ndx])

        # Begin Filtering
        # (bag of words, rating) pairs for the training data.
        bag_words_train = []
        
        for train_datum in train_data:
            bag_words_train.extend([(train_datum.get_bag_of_words(), train_datum.rating)])
            
        # NOTE(review): bag_words_test is built but not read again in this
        # function.
        bag_words_test = []

        for test_datum in test_data:
            bag_words_test.extend([(test_datum.get_bag_of_words(), test_datum.rating)])

        # All training words pooled per rating.
        all_rating_dist = dict()

        for bag, rating in bag_words_train:
            if rating not in all_rating_dist:
                all_rating_dist[rating] = []
            all_rating_dist[rating].extend(bag)

        # Word frequency distribution per rating.
        rating_freq_dist = dict()
        for rating, words in all_rating_dist.iteritems():
            rating_freq_dist[rating] = nltk.FreqDist(words)

        # Per rating: the (word, freq) pairs whose frequency rank lies outside
        # the [lower_bound, upper_bound) band, i.e. the rarest and the most
        # frequent words.
        rating_filters = dict()
        for rating in all_rating_dist.keys():
            word_freqs = [(word, freq) for (word, freq) in rating_freq_dist[rating].iteritems()]

            #Ascending sort
            sorted_wfreq = sorted(word_freqs, key=lambda x:x[1])
 
            start = int(lower_bound * len(sorted_wfreq))
            end = int(upper_bound * len(sorted_wfreq))

            rating_filters[rating] = sorted_wfreq[:start] + sorted_wfreq[end:]

        # Hand each training datum the filter list of its own rating.
        # NOTE(review): presumably these are the words to drop from its bag
        # of words; set_filtered_words is defined elsewhere, so confirm.
        for train_datum in train_data:
            train_datum.set_filtered_words(rating_filters[train_datum.rating])

        # Generate mega-cool feature set thing
        # Pool the (now filtered) bags of words per rating.
        filtered_bag_of_words_train = dict()
        for train_datum in train_data:
            if train_datum.rating not in filtered_bag_of_words_train:
                filtered_bag_of_words_train[train_datum.rating] = []
            filtered_bag_of_words_train[train_datum.rating].extend(train_datum.get_bag_of_words())


        # One training example per rating: (presence features, rating).
        feature_set = []
        for rating in filtered_bag_of_words_train:
            feature_set.append((bag_of_words_to_presence(filtered_bag_of_words_train[rating]), rating))

        #Train with the classifier specified by the user, default is NaiveBayes
        if(curr_classifier == "NaiveBayes"):
            classifier = nltk.classify.NaiveBayesClassifier.train(feature_set)
        elif(curr_classifier == "Maxent"):
            algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[0]
            classifier = nltk.classify.MaxentClassifier.train(feature_set, algorithm,max_iter=0, trace=0)
        elif(curr_classifier == "DecisionTree"):
            classifier = nltk.classify.DecisionTreeClassifier.train(feature_set,binary=True)
        elif (curr_classifier == "MaxBayes"):
            # MaxBayes trains both models; their outputs are averaged below.
            classifier1 = nltk.classify.NaiveBayesClassifier.train(feature_set)
            algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[0]
            classifier2 = nltk.classify.MaxentClassifier.train(feature_set, algorithm,max_iter=0, trace=0)
        else:
            classifier = nltk.classify.NaiveBayesClassifier.train(feature_set)


        if(curr_classifier == "MaxBayes"):
            guesses, new_guesses1, new_guesses2, actuals, guesses1_dist, guesses2_dist = [], [], [], [], [], []

            #Classify the test data using the probability classifier for both MaxEnt and NaiveBayes (returns probabilities)
            for tdata in test_data:
                guesses1_dist.append(classifier1.prob_classify(bag_of_words_to_presence(tdata.get_bag_of_words())))
                guesses2_dist.append(classifier2.prob_classify(bag_of_words_to_presence(tdata.get_bag_of_words())))
                actuals.append(tdata.rating)
   
            #Calculate the probabilities for a rating to occur (from the training data)
            # NOTE(review): all_count == 0 would raise ZeroDivisionError here;
            # three_prob is assigned twice with the same expression.
            two_prob = float(two_count) / float(all_count)
            three_prob = three_count / float(all_count)
            three_prob = three_count / float(all_count)
            four_prob = four_count / float(all_count)
            five_prob = five_count / float(all_count)
 
            #Get the final probabilities for NaiveBayes
            # Expected rating, using the mean of the classifier probability
            # and the training prior as the weight of each rating value.
            for dist in guesses1_dist:
                new_guesses1.append( (float(dist.prob('2')+two_prob)/2*2) + (float(dist.prob('3')+three_prob)/2*3) + (float(dist.prob('4')+four_prob)/2*4) + (float(dist.prob('5')+five_prob)/2*5) )  

            #Get the final probabilities for MaxEnt
            for dist in guesses2_dist:
                new_guesses2.append( (float(dist.prob('2')+two_prob)/2*2) + (float(dist.prob('3')+three_prob)/2*3) + (float(dist.prob('4')+four_prob)/2*4) + (float(dist.prob('5')+five_prob)/2*5) )            

            #Average the probabilities of NaiveBayes and MaxEnt
            for x in range(len(new_guesses1)):
                val =  float(new_guesses1[x])*0.5 + float(new_guesses2[x])*0.5
                guesses.append(val)

            # NOTE(review): actuals holds the ratings exactly as stored on the
            # data objects (compared as strings above); rmse() is defined
            # elsewhere and must cope with that.
            rmses.append(rmse(guesses, actuals))

            print "Validation run " + str(fold_num + 1)
            print "Need to output validation set filenames...."
            print " RMSE: " + str(rmses[fold_num])
        else:
            guesses, guesses_dist, actuals, new_guesses = [], [], [], []

            #Classify the test data using the probability classifier (returns probabilities)
            # DecisionTree is classified directly; every other classifier
            # contributes a probability distribution.
            for tdata in test_data:
                if(curr_classifier == "DecisionTree"):
                    guesses.append(classifier.classify(bag_of_words_to_presence(tdata.get_bag_of_words())))
                else:
                    guesses_dist.append(classifier.prob_classify(bag_of_words_to_presence(tdata.get_bag_of_words())))
                actuals.append(tdata.rating)

            #Calculate the probabilities for a rating to occur (from the training data)
            # NOTE(review): these priors are computed but not used in this
            # branch (the line that used them is commented out below).
            two_prob = float(two_count) / float(all_count)
            three_prob = three_count / float(all_count)
            three_prob = three_count / float(all_count)
            four_prob = four_count / float(all_count)
            five_prob = five_count / float(all_count)

            #Get the final probabilities for the selected classifier
            if(curr_classifier == "DecisionTree"):
                new_guesses = guesses
            else:               
                for dist in guesses_dist:
                    #new_guesses.append( (float(dist.prob('2')+two_prob)/2*2) + (float(dist.prob('3')+three_prob)/2*3) + (float(dist.prob('4')+four_prob)/2*4) + (float(dist.prob('5')+five_prob)/2*5) )     
                    # Expected rating under the classifier's distribution.
                    new_guesses.append( float(dist.prob('2')*2) + float(dist.prob('3')*3) + float(dist.prob('4')*4) + float(dist.prob('5')*5) )  

            rmses.append(rmse(new_guesses, actuals))
            print "Validation run " + str(fold_num + 1)
            #print set(test_data)
            print " RMSE: " + str(rmses[fold_num])

    print "\nAVG RMSE: " + str(sum(rmses)/DEFAULT_NUM_FOLDS)  
Пример #14
0
import scipy.sparse as sp
from scipy.sparse.linalg import svds
from rmse import rmse
import numpy as np

from work_with_data import train_data_matrix, test_data_matrix, n_users, n_items

# Compute a rank-10 truncated SVD of the training matrix.
u, s, vt = svds(train_data_matrix, k=10)
s_diag_matrix = np.diag(s)

# Predict by multiplying the factors back together.
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)

# Print the metric. The value is the RMSE of the SVD reconstruction, so the
# label says so (it used to read "User-based CF MSE", which named neither the
# model nor the metric actually computed).
print('SVD-based CF RMSE: ' + str(rmse(X_pred, test_data_matrix)))
    # print()
    # plt.plot(Y_pred_nat, Y_test, 'bo')
    # plt.show()
    #
    # time0_gd = time.time()
    # time1_gd = time.time() - time0_gd
    # theta_gd = gradient_descent(X_train, Y_train)
    # Y_pred_gd = np.dot(X_test, theta_gd)
    # loss_gd = rmse(Y_pred_gd, Y_test)
    # print('Theta, gradient descent:')
    # print(theta_gd)
    # print('Time to find solution with gradient descent: ', time1_gd)
    # print('Gradient descent RMSE: ', loss_gd)
    # print()
    # plt.plot(Y_pred_gd, Y_test, 'bo')
    # plt.show()

    # Evolution-strategy solution: fit with gen(), predict on the test set,
    # report the RMSE and plot predictions against the true values.
    # NOTE(review): time1_gen is computed immediately after time0_gen, before
    # gen() is called, so the printed time does not include the solve itself.
    time0_gen = time.time()
    time1_gen = time.time() - time0_gen
    theta_gen = gen(X_train, Y_train)
    Y_pred_gen = np.dot(X_test, theta_gen)
    # NOTE(review): the evolution-strategy loss is stored under the name
    # loss_gd, which reads like a gradient-descent value.
    loss_gd = rmse(Y_pred_gen, Y_test)
    print('Theta, evolution:')
    print(theta_gen)
    print('Time to find solution with evolution strategy: ', time1_gen)
    print('Evolution strategy RMSE: ', loss_gd)
    print()
    plt.plot(Y_pred_gen, Y_test, 'bo')
    plt.show()

Пример #16
0
#GammaRange = [0.001,0.1,10]
#BetaRange = [0.001,0.1,10]  
# =============================================================================
# Lambda,Mu,Gamma,Beta, Parameters = nfoldCrossValidation(Xtrain,Ytrain, K, maxIter,nfold = 5,
#                                                         LambdaRange=LambdaRange,
#                                                         MuRange=MuRange,
#                                                         GammaRange=GammaRange,
#                                                         BetaRange=BetaRange) 
# #Lambda,Mu,Gamma,Beta = 1e-3, 1e-3, 0.1, 1e-3
# =============================================================================
# Hand-picked hyper-parameters (toy-3tasks-nonoverlap-DATA); the grid search
# above is commented out.
Lambda = 0.1
Mu = 0.1
Gamma = 0.1
Beta = 50.

# Train on the complete training set.
W, L, S, Omega = MTL(Xtrain, Ytrain, K, Lambda, Mu, Gamma, Beta, maxIter)

# Test error and its mean.
testrmse = rmse(Xtest, Ytest, W)
meanTestRMSE = np.mean(testrmse)
print('the mean test RMSE is ' + str(meanTestRMSE))

# Collect everything that is to be saved.
import scipy.io as sio
result = {
    'W_est': W,
    'L_est': L,
    'S_est': S,
    'Omega': Omega,
    'testrmse': testrmse,
    'Lambda': Lambda,
    'Mu': Mu,
    'Gamma': Gamma,
    'Beta': Beta,
}
Пример #17
0
    def complete(self, data_list):
        """Score a word-sentiment rating predictor with random folds.

        The items of ``data_list`` are distributed at random over
        ``DEFAULT_NUM_FOLDS`` folds. For every item, each word of its bag of
        words is mapped to a score (6 for a positive seed word, -1 for a
        negative seed word, otherwise a value derived from ``buildSenti()``
        when the word is known there). The prediction is the mean score
        rounded to one decimal. Items with no scored word are skipped. The
        RMSE of every fold and the average over the folds are printed.

        Args:
            data_list: Items providing ``get_bag_of_words()`` and ``rating``.

        Returns:
            None. Results are only printed.
        """
        good_seed = frozenset([
            'excellent', 'amazing', 'best', 'delicious', 'tradition',
            'fastest', 'clean', 'favorite', 'taste', 'worth', 'nice',
            'friendly', 'positive', 'quality', 'great', 'prompt',
        ])
        bad_seed = frozenset([
            'horrible', 'terrible', 'metro', 'alright', 'cannot', 'mediocre',
            'bad', 'wrong', 'messing', 'long', 'took', 'unfortunately',
            'obvious', 'drops', 'incorrect',
        ])

        fold_size = math.floor(len(data_list) / DEFAULT_NUM_FOLDS)

        # One list per fold; a literal with four lists would break for any
        # other DEFAULT_NUM_FOLDS.
        folds = [[] for _ in range(DEFAULT_NUM_FOLDS)]
        # Deep copy because items are removed as they are assigned.
        temp_data_list = copy.deepcopy(data_list)

        # Draw a random item and a random fold; the item only moves when the
        # fold is not over its size cap, otherwise the loop simply retries.
        while temp_data_list:
            ndx = random.randrange(0, len(temp_data_list))
            fold = random.randrange(0, DEFAULT_NUM_FOLDS)

            if len(folds[fold]) <= fold_size + 1:
                folds[fold].append(temp_data_list.pop(ndx))

        # Sentiment values for words.
        sent = buildSenti()

        rmses = []
        for fold_items in folds:
            guesses = []
            actuals = []
            for datum in fold_items:
                senti_word = []
                for word in datum.get_bag_of_words():
                    if word in good_seed:
                        senti_word.append(6)
                    elif word in bad_seed:
                        senti_word.append(-1)
                    elif word in sent:
                        sentiment = sent[word]
                        # Map the sentiment pair onto the rating scale.
                        if sentiment[1] > sentiment[0]:
                            senti_word.append(round((sentiment[1] * 0.9) + 4.6))
                        else:
                            senti_word.append(round((sentiment[0] * 5) + 1.5))

                if senti_word:
                    # float() forces true division: on Python 2 an all-integer
                    # score list would otherwise be floor-divided before
                    # rounding.
                    mean_score = sum(senti_word) / float(len(senti_word))
                    guesses.append(round(mean_score, 1))
                    actuals.append(datum.rating)

            temp_rm = rmse(guesses, actuals)
            print("For this fold: %f" % (temp_rm))
            rmses.append(temp_rm)

        print("Average RMSE: %f" % (sum(rmses) / len(rmses)))
Пример #18
0
from sklearn.metrics.pairwise import pairwise_distances
import numpy as np
from rmse import rmse

from work_with_data import train_data_matrix, test_data_matrix, n_users, n_items


def predict(ratings, similarity, type='user'):
    """Predict ratings with similarity-weighted collaborative filtering.

    Args:
        ratings: 2-D array of ratings, one row per user and one column per
            item.
        similarity: Square similarity matrix, user-by-user for
            ``type='user'`` and item-by-item for ``type='item'``.
        type: ``'user'`` or ``'item'``. (The name shadows the builtin but is
            kept because callers pass it by keyword.)

    Returns:
        Array of predicted ratings with the same shape as ``ratings``.

    Raises:
        ValueError: If ``type`` is neither ``'user'`` nor ``'item'``.
    """
    if type == 'user':
        # Centre each user's ratings on that user's mean, combine the
        # deviations weighted by similarity, then add the mean back.
        mean_user_rating = ratings.mean(axis=1)
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        return mean_user_rating[:, np.newaxis] + similarity.dot(
            ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    if type == 'item':
        # Similarity-weighted sum of the ratings, normalised by the summed
        # absolute similarities.
        return ratings.dot(similarity) / np.array(
            [np.abs(similarity).sum(axis=1)])
    # An unknown type used to fall through to `return pred` with `pred`
    # unbound, which surfaced as a confusing UnboundLocalError.
    raise ValueError(
        "type must be 'user' or 'item', got {!r}".format(type))


# Compute the pairwise cosine distance between users (rows of the training
# matrix) and between movies (its columns).
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

# Predictions from each of the two matrices.
user_prediction = predict(train_data_matrix, user_similarity, type='user')
item_prediction = predict(train_data_matrix, item_similarity, type='item')

# Report both scores against the held-out test matrix.
user_rmse = rmse(user_prediction, test_data_matrix)
print('User-based CF RMSE: {}'.format(user_rmse))
item_rmse = rmse(item_prediction, test_data_matrix)
print('Item-based CF RMSE: {}'.format(item_rmse))