def test(p, T):  # Only when this value is less than 1 is the point-difference tangent vector valid
    # Data to compare the relative merits of the three methods
    p, pder = data(T)  # extract the data points
    '''Start the experiment'''
##    fder = fmill(p, tlimit)  # has a problem, needs debugging
##    print error.rmse(pder, fder)  # unify the dimensions
    bder = bessell(p, tlimit)
    print error.rmse(pder, bder)
def matrix_factorization(users, movies, ratings, test_users, test_movies, test_ratings,
                         K=10, steps=10, alpha=0.0002, beta=0.01, delta=0.01):
    t0 = time.clock()
    P, Q = np.ones((np.max(users) + 1, K)) * .2, np.ones((np.max(movies) + 1, K)) * .2  # initPQ(R.shape[0], K, R.shape[1])
    for step in range(steps):
        print("Step : ", step)
        for idx in range(len(users)):
            if (ratings[idx] > 0):
                i = users[idx]
                j = movies[idx]
                eij = ratings[idx] - np.dot(P[i, :], Q[j, :])
                for k in range(K):
                    P[i, k] = P[i, k] + alpha * (2 * eij * Q[j, k] - beta * P[i, k])
                    Q[j, k] = Q[j, k] + alpha * (2 * eij * P[i, k] - beta * Q[j, k])
        print("Time till now :", round(time.clock() - t0, 2),
              "Train error", round(rmse(users, movies, ratings, P, Q), 4),
              "Test error", round(rmse(test_users, test_movies, test_ratings, P, Q), 4))
    return P, Q
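# The rmse(users, movies, ratings, P, Q) helper used above and in the factorize
# functions below is defined elsewhere in the project. The sketch that follows is
# only an assumption of what it computes (predicted rating = dot product of the
# user and movie latent-factor rows), not the original implementation.
import numpy as np

def rmse(users, movies, ratings, P, Q):
    # per-pair prediction from the latent factors, then root-mean-square error
    preds = np.sum(P[np.asarray(users), :] * Q[np.asarray(movies), :], axis=1)
    return np.sqrt(np.mean((np.asarray(ratings) - preds) ** 2))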
def cross_validate(X, y, model, folds=5, random_seed=42, test_size=.2,
                   model_name='', parameters={}, plot=False):
    """
    -----------------------
    cross validate models
    -----------------------
    """
    print ' ------------------ Cross Validation using %s model -------------------- ' % model_name
    mses = []; rmses = []; rmsles = []
    for fold in range(folds):
        #! create a test and train cv set
        train_cv, test_cv, y_target, y_true = cross_validation.train_test_split(
            X, y, test_size=test_size, random_state=fold * random_seed)
        #! train model and make predictions
        model.fit(train_cv, y_target)
        preds = model.predict(test_cv)
        #! measure the error (difference between the predictions and the actual targets)
        mse = error.mse(y_true, preds)
        rmse = error.rmse(y_true, preds)
        rmsle = error.rmsle(y_true, preds)
        print "(fold %d of %d) MSE : %f | RMSE: %f | RMSLE: %f %s" % (fold + 1, folds, mse, rmse, rmsle, '')
        mses.append(mse); rmses.append(rmse); rmsles.append(rmsle)
    print ">>> Mean MSE: %f | Mean RMSE: %f | Mean RMSLE: %f <<<" % (np.mean(mses), np.mean(rmses), np.mean(rmsles))
    print "______________________________________________________________"
    if plot:
        plot_error(range(folds), rmsles, "Fold", "RMSLE",
                   "Cross Validation using %s model" % model_name, ["RMSLE"])

# ---------------------- Scrap Notes ---------------------------------
# models = [(linear_model.SGDRegressor, ), (linear_model.Ridge, ), ()]
# param_grid = {'alpha': [0.001, 0.01, 0.5]}  # ,1,5, 10, 100, 1000] }
# clf = linear_model.Ridge(alpha=a)
# clf = linear_model.SGDRegressor(alpha=0.2, n_iter=1000, shuffle=True)
# clf = linear_model.LassoCV(cv=3)
# clf = linear_model.ElasticNet()
# clf = linear_model.BayesianRidge()
# clf = ensemble.RandomForestRegressor(n_estimators=100, random_state=42*idx*10, max_depth=4)
# clf = ensemble.ExtraTreesRegressor(n_estimators=100, random_state=42*idx*10, max_depth=4)
# clf = ensemble.GradientBoostingRegressor(alpha=a, n_estimators=100, random_state=42, max_depth=4)
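# Hypothetical usage of cross_validate (X and y are assumed to be numpy arrays
# loaded elsewhere; the error and plot_error helpers it relies on must be in scope):
from sklearn import linear_model
ridge = linear_model.Ridge(alpha=0.5)
cross_validate(X, y, ridge, folds=5, model_name='Ridge', plot=True)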
def search_forward(X_train, X_test, y_train, y_test):
    """
    implements search forward algorithm, returns features selected

    Keyword arguments:
    X_train - training data set
    y_train - training target
    X_test - testing data set
    y_test - testing target
    """
    beta0 = learn_linreg_NormEq(X_train, y_train)
    y_hat = lin_reg(X_test, beta0)
    e_all_best = 2000000
    V = []
    v_best = 1
    all_features = list(range(len(X_test.T)))
    while (v_best != 100):
        v_best = 100
        e_best = e_all_best
        for v in (list(set(all_features) - set(V))):
            current_feature = []
            current_feature.append(v)
            V_prime = V + current_feature
            beta_ = learn_linreg_NormEq(X_train[:, V_prime], y_train)
            y_hat = lin_reg(X_test[:, V_prime], beta_)
            e = rmse(y_test, y_hat)
            if e < e_best:
                e_best = e
                v_best = v
        if e_best < e_all_best:
            V.append(v_best)
            e_all_best = e_best
    return V
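# learn_linreg_NormEq and lin_reg are helpers from the surrounding project; the
# sketch below is a plausible normal-equation version under that assumption,
# not the original code.
import numpy as np

def learn_linreg_NormEq(X, y):
    # beta = (X^T X)^{-1} X^T y, solved as a linear system for numerical stability
    return np.linalg.solve(X.T.dot(X), X.T.dot(y))

def lin_reg(X, beta):
    # linear predictions y_hat = X beta
    return X.dot(beta)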
def factorize(users, movies, ratings, test_users, test_movies, test_ratings,
              latent=30, steps=10, gpu_steps=1, alpha=0.0002, beta=0.02,
              delta=0.01, rmse_repeat_count=5, debug=1):
    U, V = initUV(int(np.max(users) + 1), latent, int(np.max(movies) + 1))
    U, V = np.array(U).astype(np.float32), np.array(V).astype(np.float32).transpose()
    print("Shape of P,Q : ", U.shape, V.shape)
    start_time = time.clock()
    y1, y2 = [], []
    error, count = rmse(test_users, test_movies, test_ratings, U, V.T), 0
    print("Initial test error :", round(error, 4))
    for k in range(steps):
        if debug > 1:
            print("Step : ", k)
        t6 = time.clock()
        uu, mm, rr = np.array(users).astype(np.int32), np.array(movies).astype(np.int32), np.array(ratings).astype(np.int32)
        t7 = time.clock()
        tools.clear_context_caches()
        u_gpu = gpuarray.to_gpu(uu)
        v_gpu = gpuarray.to_gpu(mm)
        r_gpu = gpuarray.to_gpu(rr)
        a_gpu = gpuarray.to_gpu(U)
        b_gpu = gpuarray.to_gpu(V)
        if debug > 1:
            print("Length of uu,mm ", len(uu), len(mm), np.max(users), np.max(movies), U.shape, V.shape)
        if (len(uu) != 0 and len(mm) != 0):
            matrixfact(
                u_gpu, v_gpu, r_gpu, a_gpu, b_gpu,
                np.int32(np.max(users)), np.int32(latent), np.int32(np.max(movies)),
                np.int32(len(uu)), np.int32(len(mm)), np.int32(gpu_steps),
                np.float32(alpha), np.float32(beta), np.float32(delta),
                block=(16, 16, 1),
                grid=(3, 4)  # always keep blockIdx.z as 1 - the kernel expects no threads in the z axis
            )
            P = a_gpu.get()
            Q = b_gpu.get()
            U, V = np.array(P), np.array(Q)
        t8 = time.clock()
        if debug > 1:
            t9 = time.clock()
            if debug > 2:
                np.savetxt('U' + str(k), U, fmt='%.4f')
                np.savetxt('V' + str(k), V, fmt='%.4f')
            print("Timer :", round(t7 - t6, 4), round(t8 - t7, 4), round(t9 - t8, 4))
        t5 = time.clock()
        if debug > 1:
            print("Step time taken : ", round(t5 - t7, 2))
        y1.append(t5 - start_time)
        test_rmse = rmse(test_users, test_movies, test_ratings, U, V.T)
        print("Step test error :", round(test_rmse, 4))
        train_rmse = rmse(users, movies, ratings, U, V.T)
        y2.append([train_rmse, test_rmse])
        step_error = round(test_rmse, 4)
        if step_error < delta:
            break
        elif step_error == error:
            count = count + 1
        elif step_error > error:
            break
        elif rmse_repeat_count == count:
            break
        else:
            error = step_error
    if debug > 1:
        np.savetxt('gpmf-' + str(start_time) + '-y1.txt', y1, fmt='%.4f')
        np.savetxt('gpmf-' + str(start_time) + '-y2.txt', y2, fmt='%.4f')
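# initUV is defined elsewhere in the project; a minimal sketch consistent with
# how the factorize functions call it (assumed: constant-initialized
# users-by-latent and movies-by-latent factor lists), not the original code.
def initUV(n_users, latent, n_movies):
    U = [[0.2] * latent for _ in range(n_users)]
    V = [[0.2] * latent for _ in range(n_movies)]
    return U, V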
def factorize(users, movies, ratings, test_users, test_movies, test_ratings,
              blocks=1, latent=30, steps=10, block_steps=1, alpha=0.00001,
              beta=0.01, delta=0.01, rmse_repeat_count=3, debug=2, dataset=''):
    global U, V
    U, V = initUV(np.max(users) + 1, latent, np.max(movies) + 1)
    R = csr_matrix((ratings, (users, movies))).todense()
    size = max(np.max(users) + 1, np.max(movies) + 1)
    split = int(size / blocks)
    us = int(math.ceil(np.float(np.max(users)) / split))
    vs = int(math.ceil(np.float(np.max(movies)) / split))
    if debug > 1:
        print("Total splits : ", split, us, vs, us * vs)
        print("U, V shapes :", U.shape, V.shape)
    start_time = time.clock()
    y1, y2 = [], []
    count, error = 0, 100
    for k in range(steps):
        if debug > 1:
            print("Step : ", k)
        u1, v1 = 0, 0
        t4 = time.clock()
        for i in range(us):
            u1 = i * split
            if np.max(users) < u1:
                u1 = int(np.max(users))
            u2 = ((i + 1) * split - 1)
            if np.max(users) < u2:
                u2 = int(np.max(users))
            stemp = 0
            tpool = [None] * vs
            for j in range(vs):
                xtemp = int((i + stemp) % us)
                if debug > 1:
                    print("i, j, ii, jj ", i, j, xtemp, j)
                u1 = xtemp * split
                if np.max(users) < u1:
                    u1 = int(np.max(users))
                u2 = ((xtemp + 1) * split - 1)
                if np.max(users) < u2:
                    u2 = int(np.max(users))
                v1 = j * split
                if np.max(movies) < v1:
                    v1 = int(np.max(movies))
                v2 = (j + 1) * split - 1
                if np.max(movies) < v2:
                    v2 = int(np.max(movies))
                if debug > 1:
                    print("Processing split : ", i, j, u1, u2, v1, v2)
                uu, mm, rr = fetch(u1, u2, v1, v2, users, movies, ratings)
                if debug > 1:
                    print("Shapes of uu,mm,rr :", uu.shape, mm.shape, rr.shape)
                t6 = time.clock()
                P, Q = U[u1:u2 + 1, 0:latent], V[v1:v2 + 1, 0:latent]
                if debug > 1:
                    print("P Q shapes : ", P.shape, Q.shape)
                t7 = time.clock()
                if debug > 1:
                    print("Length of uu,mm ", len(uu), len(mm), u2 - u1 + 1, v2 - v1 + 1, P.shape, Q.shape)
                if (len(uu) != 0 and len(mm) != 0):
                    t = tpool[j]
                    if t is not None:
                        while t.isAlive():
                            print('waiting for the thread ...')
                            time.sleep(5)
                    t = threading.Thread(target=block_factorization,
                                         args=(P, Q, R, u1, u2, v1, v2, block_steps))
                    tpool[j] = t
                    t.start()
                t8 = time.clock()
                stemp += 1
        t5 = time.clock()
        if debug > 1:
            print(" Step time taken : ", round(t5 - t4, 2))
        y1.append(round(t5 - start_time, 3))
        test_rmse = rmse(test_users, test_movies, test_ratings, U, V)
        print("Step error :", round(test_rmse, 3))
        y2.append(round(test_rmse, 3))
        step_error = round(test_rmse, 4)
        if step_error < delta:
            break
        elif error < step_error:
            break
        elif rmse_repeat_count < count:
            break
        elif error == step_error:
            count = count + 1
        else:
            count = 0
            error = step_error
    np.savetxt(str(blocks * blocks) + 'blocks_y2.txt', y2, fmt='%.3f')
    np.savetxt(str(blocks * blocks) + 'blocks_y1.txt', y1, fmt='%.3f')
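# fetch is assumed to return only the rating triples that fall inside the given
# user/movie block; a hypothetical sketch, not the original implementation:
import numpy as np

def fetch(u1, u2, v1, v2, users, movies, ratings):
    users, movies, ratings = np.asarray(users), np.asarray(movies), np.asarray(ratings)
    mask = (users >= u1) & (users <= u2) & (movies >= v1) & (movies <= v2)
    return users[mask], movies[mask], ratings[mask]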
def factorize(users, movies, ratings, test_users, test_movies, test_ratings,
              blocks=1, latent=10, steps=10, gpu_steps=2, alpha=0.0002,
              beta=0.01, delta=0.01, rmse_repeat_count=3, debug=2, dataset=''):
    U, V = initUV(np.max(users) - np.min(users) + 1, latent,
                  np.max(movies) - np.min(movies) + 1)
    U = np.array(U)
    V = np.array(V)
    size = max(np.max(users) + 1, np.max(movies) + 1)
    split = int(size / blocks)
    us = int(math.ceil(np.float(np.max(users)) / split))
    vs = int(math.ceil(np.float(np.max(movies)) / split))
    if debug > 1:
        print("Total splits : ", split, us, vs, us * vs)
        print("U, V shapes :", U.shape, V.shape)
    start_time = time.clock()
    y1, y2 = [], []
    count, error = 0, 100
    for k in range(steps):
        if debug > 1:
            print("Step : ", k)
        u1, v1 = 0, 0
        t4 = time.clock()
        for i in range(us):
            u1 = i * split
            if np.max(users) < u1:
                u1 = int(np.max(users))
            u2 = ((i + 1) * split - 1)
            if np.max(users) < u2:
                u2 = int(np.max(users))
            stemp = 0
            UU, MM, RR = [], [], []
            ulimits = [0]
            for j in range(vs):
                xtemp = int((i + stemp) % us)
                print("i, j, ii, jj ", i, j, xtemp, j)
                u1 = xtemp * split
                if np.max(users) < u1:
                    u1 = int(np.max(users))
                u2 = ((xtemp + 1) * split - 1)
                if np.max(users) < u2:
                    u2 = int(np.max(users))
                v1 = j * split
                if np.max(movies) < v1:
                    v1 = int(np.max(movies))
                v2 = (j + 1) * split - 1
                if np.max(movies) < v2:
                    v2 = int(np.max(movies))
                print("Processing split : ", i, j, u1, u2, v1, v2)
                uu, mm, rr = fetch(u1, u2, v1, v2, users, movies, ratings)
                if (len(uu) != 0 and len(mm) != 0):
                    UU, MM, RR, ulimits = pack(UU, MM, RR, uu, mm, rr, ulimits)
                stemp += 1
            U, V = matrix_factorization(UU, MM, RR, U, V, ulimits,
                                        np.min(users), np.min(movies))
        t5 = time.clock()
        if debug > 1:
            print(" Step time taken : ", round(t5 - t4, 2))
        y1.append(round(t5 - start_time, 3))
        train_rmse = rmse(users, movies, ratings, U, V)
        test_rmse = rmse(test_users, test_movies, test_ratings, U, V)
        print("Train error:", round(train_rmse, 3), " Test error:", round(test_rmse, 3))
        y2.append(round(test_rmse, 3))
        step_error = round(test_rmse, 4)
        if step_error < delta:
            break
        elif error < step_error:
            break
        elif rmse_repeat_count < count:
            break
        elif step_error == error:
            count = count + 1
        else:
            count = 0
            error = step_error
    np.savetxt('blocks_' + str(gpu_steps) + 'iterations_y2.txt', y2, fmt='%.3f')
    np.savetxt('blocks_' + str(gpu_steps) + 'iterations_y1.txt', y1, fmt='%.3f')
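# pack is assumed to concatenate each block's rating triples and record, in
# ulimits, the offset at which every block ends so the packed arrays can be
# walked block by block; a hypothetical sketch, not the original code:
def pack(UU, MM, RR, uu, mm, rr, ulimits):
    UU, MM, RR = list(UU) + list(uu), list(MM) + list(mm), list(RR) + list(rr)
    ulimits.append(len(UU))
    return UU, MM, RR, ulimits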
train_input = train_data[:, 1:]
train_output = train_data[:, 0]
test_input = test_data[:, 1:]
test_output = test_data[:, 0]

# Linear Regression
m = test_input.shape[0]
ones = np.ones((m, 1))
test_input1 = np.hstack((ones, test_input))  # prepend a bias column

learning_rate = 1e-8
max_steps = 100000
obj_linear_regression = linear_regression(train_input, train_output, learning_rate, max_steps, C=0.0)
linear_regression_weights = obj_linear_regression.weights

test_loss = mse(prediction=np.dot(test_input1, linear_regression_weights)[:, 0], target=test_output) / 100
test_loss1 = rmse(prediction=np.dot(test_input1, linear_regression_weights)[:, 0], target=test_output) / 100
print(f'\nMSE test loss in Linear Regression is \t {test_loss}')
print(f'RMSE test loss in Linear Regression is \t {test_loss1}\n')

# Neural Network
nn_max_epochs = 1000
nn_batch_size = 128
nn_learning_rate = 5e-8
num_layers = 1
num_units = 32
lamda = 0.00002
network = neural_network(train_input, num_layers, num_units)
optimizer = optimizer(nn_learning_rate)
train(network, optimizer, lamda, nn_batch_size, nn_max_epochs, train_input, train_output)
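# mse and rmse are called above with prediction/target keyword arguments; a
# minimal sketch of what these helpers are assumed to compute:
import numpy as np

def mse(prediction, target):
    return np.mean((prediction - target) ** 2)

def rmse(prediction, target):
    return np.sqrt(np.mean((prediction - target) ** 2))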
    'Dses': forecast.dses(main_df, usr_input[1], usr_input[3]),
    'Hles': forecast.hles(main_df, usr_input[1], usr_input[3])
}
f_df = forecasts[usr_input[2]]  # f_df is the forecast data frame with respect to the usr_input
v_df = error.val_period(f_df, today)  # get the validation data frame from the forecast data frame

# compute all error values for the data frame
mae = error.mae(v_df)
avg_err = error.avg_err(v_df)
mape = error.mape(v_df)
rmse = error.rmse(v_df)

print('MAE = ' + str(mae) + '\nAverage Error = ' + str(avg_err) +
      '\nMAPE = ' + str(mape) + '%\nRMSE = ' + str(rmse))  # prints the error names with their values

plottin.b_plot(f_df, usr_input[1], usr_input[3], usr_input[0],
               usr_input[2], today)  # makes a basic plot but doesn't show it
plt.show()  # shows the plot that was made in plottin
# see if you can put a timer on it