def test_regressor_partial_fit():
    y_bin = y.copy()
    y_bin[y != 1] = -1

    for data in (X, X_csr):
        for average in (False, True):
            reg = PassiveAggressiveRegressor(
                C=1.0, fit_intercept=True, random_state=0,
                average=average, max_iter=100)
            for t in range(50):
                reg.partial_fit(data, y_bin)
            pred = reg.predict(data)
            assert_less(np.mean((pred - y_bin) ** 2), 1.7)
            if average:
                assert hasattr(reg, 'average_coef_')
                assert hasattr(reg, 'average_intercept_')
                assert hasattr(reg, 'standard_intercept_')
                assert hasattr(reg, 'standard_coef_')
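# The `average=True` branch above checks the extra attributes exposed by
# averaged training. As a rough sketch of the idea (an illustration, not
# scikit-learn's actual implementation), averaging keeps a running mean of
# the weight vector alongside the standard, most-recent one:
import numpy as np

def averaged_weights(weight_deltas):
    """Apply each update and keep a running mean of the weights.

    The final pair corresponds loosely to (standard_coef_, average_coef_).
    """
    w = np.zeros_like(weight_deltas[0])
    w_avg = np.zeros_like(w)
    for t, delta in enumerate(weight_deltas, start=1):
        w = w + delta                     # standard (last-step) weights
        w_avg = w_avg + (w - w_avg) / t   # incremental mean of all steps
    return w, w_avg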
def refit_from_scratch(self):
    temp_model = PassiveAggressiveRegressor()
    temp_enc = CountVectorizer()
    X = []  # binary matrix of tag presence
    Z = []  # additional numerical data
    Y = []  # target (to predict) values
    db_size = self.db.size()
    for data in self.db.yield_all():
        feedback = data["feedback"]
        tags = data["tags"]
        if feedback and tags:
            Y.append(feedback)
            X.append(" ".join(tags))
            Z.append(self.fmt_numerical(data))
    X = temp_enc.fit_transform(X)
    X = hstack((X, coo_matrix(Z)))
    self.allX = X
    for i in range(X.shape[0]):
        # Bug fix: train on the i-th target, not the first one every time.
        temp_model.partial_fit(X.getrow(i), [Y[i]])
    self.model = temp_model
    self.enc = temp_enc
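# A possible incremental counterpart to refit_from_scratch (a hedged sketch,
# not part of the original class): once `self.enc` has a fitted vocabulary,
# a single new example can be folded into the model with one partial_fit
# call. The method name `update_one` is hypothetical; `fmt_numerical` and
# the record layout are assumed to match refit_from_scratch above.
def update_one(self, data):
    feedback, tags = data["feedback"], data["tags"]
    if not (feedback and tags):
        return
    x_tags = self.enc.transform([" ".join(tags)])  # reuse the fixed vocabulary
    x = hstack((x_tags, coo_matrix([self.fmt_numerical(data)])))
    self.model.partial_fit(x, [feedback])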
print(np.sum(train_clusters), np.sum(test_clusters))
train, test = dg.generate_train_test_splits(train_clusters, test_clusters)
train = shuffle(train).astype(np.float32)
test = shuffle(test).astype(np.float32)
train_x, train_y = train.iloc[:, 1:], train.iloc[:, 0]
test_x, test_y = test.iloc[:, 1:], test.iloc[:, 0]
dg.add_drift(0.5, True)

n, d = train_x.shape
_n += n
for i in range(0, n, batch_size):
    count += 1
    pa.partial_fit(train_x[i:i + batch_size], train_y[i:i + batch_size])
    sgd.partial_fit(train_x[i:i + batch_size], train_y[i:i + batch_size])
    if i % 50 == 0 or i == n - 1:
        pred1 = pa.predict(test_x)
        pred2 = sgd.predict(test_x)
        # Note: np.sqrt(mean_squared_error) is the RMSE; it is stored here
        # under the original mse names.
        mse1, mae1 = (np.sqrt(mean_squared_error(test_y, pred1)),
                      mean_absolute_error(test_y, pred1))
        mse2, mae2 = (np.sqrt(mean_squared_error(test_y, pred2)),
                      mean_absolute_error(test_y, pred2))
        pa_mse_arr.append(mse1)
        pa_mae_arr.append(mae1)
        sgd_mse_arr.append(mse2)
        sgd_mae_arr.append(mae2)
        count_arr.append(count)
Drawbacks: unknown for now. The algorithm's exact inner workings are still
unclear to me, so treat it as a black box for the time being.
'''
rg = PassiveAggressiveRegressor(C=1.0, fit_intercept=True, n_iter=5,
                                shuffle=True, verbose=0,
                                loss='epsilon_insensitive', epsilon=0.1,
                                random_state=None, warm_start=False)
rg.fit(X_train, Y_train)
rg.partial_fit(X_train, Y_train)  # incremental learning
Y_pre = rg.predict(X_test)
rg.score(X_test, Y_test)
rg.coef_
rg.intercept_
'''
C              regularization strength
fit_intercept  whether to fit the intercept
n_iter         number of training iterations
shuffle        whether to shuffle the training data after each epoch
verbose        verbosity level
loss           loss function
epsilon        threshold for the epsilon-insensitive loss
random_state   seed for the random number generator
warm_start     whether to reuse the result of the previous fit as the
               initialization for the next one
'''
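# In recent scikit-learn (>= 0.21) the n_iter parameter above no longer
# exists; the closest equivalent construction uses max_iter together with
# tol=None, so that all epochs run:
from sklearn.linear_model import PassiveAggressiveRegressor

rg = PassiveAggressiveRegressor(C=1.0, fit_intercept=True, max_iter=5,
                                tol=None, shuffle=True, verbose=0,
                                loss='epsilon_insensitive', epsilon=0.1,
                                random_state=None, warm_start=False)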
def main(input_path, output_attribute_index, scikit_output_path,
         spark_output_path):
    # Instantiate the Passive Aggressive Regressor model
    regressor = PassiveAggressiveRegressor()

    for file_path in hdfs.ls(input_path):
        # Load the file contents and build a string matrix from them
        content = hdfs.load(file_path)
        temp = content.split("\n")
        temp = list(map(lambda x: x.split(","), temp))
        temp = list(filter(lambda x: len(x) > 1, temp))
        raw_matrix = np.array(temp)

        # Parse the raw matrix into matrices of real values,
        # which are then used to train the model
        # raw_matrix = np.genfromtxt(file_path, delimiter=',', dtype='string')
        input_matrix = raw_matrix[1:, 3:-5].astype('float64')
        output_vector = raw_matrix[1:, -5 + output_attribute_index].astype('float64')

        # Train the model by iterative improvement
        regressor.partial_fit(input_matrix, output_vector)

        # Print the path of the processed file to the console
        print(file_path)

    # Save the created model to the output path
    # passed as an argument
    with hdfs.open(scikit_output_path, 'w') as opened_file:
        pickle.dump(regressor, opened_file)

    # Initialize the application's configuration and execution context
    configuration = SparkConf().setAppName("BigDataProj3_Trainer")
    context = SparkContext(conf=configuration)
    context.setLogLevel("ERROR")

    # Initialize the session
    # (required in order to write the model)
    session = SparkSession(context)

    # Load RDD data from the input path
    input_data = context.textFile(input_path)

    # Split each line into fields
    input_data = input_data.map(lambda x: x.split(","))

    # Skip the headers
    input_data = input_data.filter(lambda x: x[0] != "Timestamp")

    # Skip the first three fields (Timestamp, Latitude and Longitude)
    # and select the appropriate output column
    # (depending on the output_attribute_index variable)
    input_data = input_data.map(
        lambda x: list(map(lambda y: float(y), x[3:-5]))
        + [float(x[-5 + output_attribute_index])])

    # Build the corresponding DataFrame object
    # (VectorAssembler builds the features column that
    # the linear regression fit method requires)
    input_cols = []
    for i in range(15):
        input_cols.append("_" + str(i + 1))

    assembler = VectorAssembler(inputCols=input_cols, outputCol='features')
    data_frame = assembler.transform(input_data.toDF())

    # Instantiate a LinearRegression object, train it,
    # and save it to the given path
    regression = LinearRegression(featuresCol='features', labelCol='_16')
    model = regression.fit(data_frame)
    model.write().overwrite().save(spark_output_path)
                                                  random_state=42)
scaler.fit(Xtrain)
Xtrain = scaler.transform(Xtrain)
Xtest = scaler.transform(Xtest)
scaleTime = time.time() - st
print("Time to split and Scale Data: " + str(scaleTime))

# X = [[0, 0], [2, 2]]
# y = [0.5, 2.5]
# clf = MLPRegressor(hidden_layer_sizes=(1000,), random_state=1, max_iter=1,
#                    warm_start=True)
clf = PassiveAggressiveRegressor(random_state=1, warm_start=True,
                                 max_iter=100)

st = time.time()
ttList = []
for i in range(len(Xtrain)):
    tt = time.time()
    # Bug fix: feed one sample per step; the original passed the whole
    # training set to every partial_fit call.
    clf = clf.partial_fit(Xtrain[i:i + 1], Ytrain[i:i + 1])
    print(i / len(Xtrain))
    ttList.append(time.time() - tt)
trainTime = time.time() - st

joblib.dump(clf, 'currentModel.mod')

pred = []
st = time.time()
for x in range(len(Xtest)):
    pred.append(clf.predict([Xtest[x]]))
score = metrics.mean_absolute_error(Ytest, clf.predict(Xtest))
predictTime = time.time() - st

print("Time to import: " + str(importTime))
print("Time to Read File: " + str(readFileTime))
print("Time to split and Scale Data: " + str(scaleTime))
print("Time to train: " + str(trainTime))
print("Time to predict: " + str(predictTime))
# TODO
# Remove text columns that have already been converted into numeric features
train_features = train.drop(['Text', 'Summary'], axis='columns')

# Convert features to sparse matrix
train_features = csr_matrix(train_features.values)

# Combine sparse matrices
train = hstack([train_summary_dtm, train_text_dtm, train_features])

# Scale
train = scaler.transform(train)

# Compute partial fit
lm.partial_fit(train, train_score)

# Create empty lists
validation_pred = []
validation_score = []

# Loop through chunks for validation
for reviews in pd.read_csv('Reviews.csv', index_col='Id',
                           usecols=['Id', 'Summary', 'Text', 'Score'],
                           chunksize=chunksize):
    # Only need validation data
    validation = reviews.iloc[reviews.index.isin(validation_indices[fold])]
def main():
    X, y, coef = make_regression(1000, 200, 10, 1, noise=0.05, coef=True,
                                 random_state=42)
    # X = np.column_stack((X, np.ones(X.shape[0])))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                        random_state=42)

    # sca = StandardScaler()
    # sca.fit(X_train)
    # X_train = sca.transform(X_train)
    # X_test = sca.transform(X_test)

    # print(X.shape)
    # print(y.shape)
    # print(coef.shape)

    param_grid = {
        "C": [0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 0.2,
              0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 10, 100, 1000],
        "epsilon": [0.0001, 0.001, 0.01, 0.1]}
    param_grid_kern = {
        "C": [0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 0.2,
              0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 10, 100, 1000],
        "epsilon": [0.0001, 0.001, 0.01, 0.1],
        "gamma": [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100]}
    # "loss": ["pa", "pai", "paii"]}

    my_pa = PARegressor(loss="paii", C=1, epsilon=0.001, n_iter=1,
                        fit_intercept=False)
    # search = GridSearchCV(my_pa, param_grid,
    #     scoring='mean_absolute_error', n_jobs=8, iid=True, refit=True,
    #     cv=5, verbose=1)
    # search.fit(X_train, y_train)
    # print(search.best_params_)
    my_pa.fit(X_train, y_train)
    print(my_pa.coef_)
    # y_preds = search.predict(X_test)
    y_preds = my_pa.predict(X_test)
    mae_my_pa = mean_absolute_error(y_test, y_preds)
    print("My PA MAE = %2.4f" % mae_my_pa)

    my_kpa_linear = KernelPARegressor(kernel="linear", loss="paii", C=1,
                                      epsilon=0.001, n_iter=1,
                                      fit_intercept=False)
    my_kpa_linear.fit(X_train, y_train)
    print("alphas", len(my_kpa_linear.alphas_), my_kpa_linear.alphas_)
    y_preds = my_kpa_linear.predict(X_test)
    mae_kpa_linear = mean_absolute_error(y_test, y_preds)
    print("My KPA linear MAE = %2.4f" % mae_kpa_linear)

    my_kpa_rbf = KernelPARegressor(kernel="rbf", loss="paii", gamma=0.001,
                                   C=1, epsilon=0.001, n_iter=1,
                                   fit_intercept=False)
    # search = GridSearchCV(my_kpa_rbf, param_grid_kern,
    #     scoring='mean_absolute_error', n_jobs=8, iid=True, refit=True,
    #     cv=5, verbose=1)
    # search.fit(X_train, y_train)
    my_kpa_rbf.fit(X_train, y_train)
    print("alphas", len(my_kpa_rbf.alphas_), my_kpa_rbf.alphas_)
    print("support", len(my_kpa_rbf.support_))
    # print("alphas", len(search.best_estimator_.alphas_))
    # print("support", len(search.best_estimator_.support_))
    # print(search.best_params_)
    y_preds = my_kpa_rbf.predict(X_test)
    # y_preds = search.predict(X_test)
    mae_my_kpa = mean_absolute_error(y_test, y_preds)
    print("My Kernel PA MAE = %2.4f" % mae_my_kpa)
    # print(search.best_estimator_)
    # print(np.corrcoef(search.best_estimator_.coef_, coef))

    # param_grid = {
    #     "C": [0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9,
    #           1.0, 10, 100, 1000, 10000],
    #     "epsilon": [0.0001, 0.001, 0.01, 0.1],
    #     # "loss": ["epsilon_insensitive", "squared_epsilon_insensitive"]}
    #     "loss": ["squared_epsilon_insensitive"]}
    # search = GridSearchCV(PassiveAggressiveRegressor(fit_intercept=True),
    #     param_grid, scoring='mean_absolute_error', n_jobs=8, iid=True,
    #     refit=True, cv=5, verbose=1)
    # search.fit(X_train, y_train)
    sk_pa = PassiveAggressiveRegressor(loss="squared_epsilon_insensitive",
                                       C=1, epsilon=0.001, n_iter=1,
                                       fit_intercept=False, warm_start=True)
    for i in range(X_train.shape[0]):
        # for x_i, y_i in zip(X_train, y_train):
        x = np.array(X_train[i], ndmin=2)
        y = np.array(y_train[i], ndmin=1)
        sk_pa.partial_fit(x, y)
    # sk_pa.fit(X_train, y_train)
    # y_preds = search.predict(X_test)
    y_preds = sk_pa.predict(X_test)
    mae_sk_pa = mean_absolute_error(y_test, y_preds)
    print("Sklearn PA MAE = %2.4f" % mae_sk_pa)
X, Y = make_regression(n_samples=nb_samples_1, n_features=5,
                       random_state=1000)

# Create the model
par = PassiveAggressiveRegressor(C=0.01,
                                 loss='squared_epsilon_insensitive',
                                 epsilon=0.001, max_iter=2000,
                                 random_state=1000)

# Fit the model incrementally and collect the squared errors
squared_errors = []
for (x, y) in zip(X, Y):
    par.partial_fit(x.reshape(1, -1), y.ravel())
    y_pred = par.predict(x.reshape(1, -1))
    squared_errors.append(np.power(y_pred - y, 2))

# Show the error plot
fig, ax = plt.subplots(figsize=(18, 8))
ax.plot(squared_errors)
ax.set_xlabel('Sample')
ax.set_ylabel('Squared error')
ax.grid()
plt.show()

# Repeat the example with a discontinuous dataset
X1, Y1 = make_regression(n_samples=nb_samples_2,
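# For reference, a minimal numpy sketch of the PA-II regression update that
# one partial_fit step performs (after Crammer et al., 2006). This
# illustrates the update rule itself, not scikit-learn's internal code:
import numpy as np

def pa2_step(w, x, y, C=0.01, epsilon=0.001):
    """Move w just enough to shrink the epsilon-insensitive loss on (x, y),
    with the step size capped through the aggressiveness parameter C."""
    loss = max(0.0, abs(y - w @ x) - epsilon)
    tau = loss / (x @ x + 1.0 / (2.0 * C))
    return w + tau * np.sign(y - w @ x) * x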