def test_regressor_partial_fit():
    y_bin = y.copy()
    y_bin[y != 1] = -1

    for data in (X, X_csr):
        reg = PassiveAggressiveRegressor(C=1.0,
                                         fit_intercept=True,
                                         random_state=0)
        for t in range(50):
            reg.partial_fit(data, y_bin)
        pred = reg.predict(data)
        assert_less(np.mean((pred - y_bin)**2), 1.7)
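These tests reference module-level fixtures X, y and X_csr defined elsewhere in the test module. A minimal sketch of how they might be set up (an assumption modelled on scikit-learn's own test file, which uses the iris dataset):

import numpy as np
import scipy.sparse as sp
from sklearn import datasets
from sklearn.linear_model import PassiveAggressiveRegressor

# Dense iris features/targets plus a sparse CSR copy, as the tests above expect.
iris = datasets.load_iris()
X = iris.data
y = iris.target
X_csr = sp.csr_matrix(X)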
Example no. 3
def test_regressor_partial_fit():
    y_bin = y.copy()
    y_bin[y != 1] = -1

    for data in (X, X_csr):
        for average in (False, True):
            reg = PassiveAggressiveRegressor(random_state=0,
                                             average=average,
                                             max_iter=100)
            for t in range(50):
                reg.partial_fit(data, y_bin)
            pred = reg.predict(data)
            assert np.mean((pred - y_bin) ** 2) < 1.7
            if average:
                assert hasattr(reg, 'average_coef_')
                assert hasattr(reg, 'average_intercept_')
                assert hasattr(reg, 'standard_intercept_')
                assert hasattr(reg, 'standard_coef_')
def test_regressor_partial_fit():
    y_bin = y.copy()
    y_bin[y != 1] = -1

    for data in (X, X_csr):
        for average in (False, True):
            reg = PassiveAggressiveRegressor(
                C=1.0, fit_intercept=True, random_state=0,
                average=average, max_iter=100)
            for t in range(50):
                reg.partial_fit(data, y_bin)
            pred = reg.predict(data)
            assert_less(np.mean((pred - y_bin) ** 2), 1.7)
            if average:
                assert hasattr(reg, 'average_coef_')
                assert hasattr(reg, 'average_intercept_')
                assert hasattr(reg, 'standard_intercept_')
                assert hasattr(reg, 'standard_coef_')
Example no. 5
def test_regressor_partial_fit():
    y_bin = y.copy()
    y_bin[y != 1] = -1

    for data in (X, X_csr):
        for average in (False, True):
            reg = PassiveAggressiveRegressor(C=1.0,
                                             fit_intercept=True,
                                             random_state=0,
                                             average=average)
            for t in range(50):
                reg.partial_fit(data, y_bin)
            pred = reg.predict(data)
            assert_less(np.mean((pred - y_bin) ** 2), 1.7)
            if average:
                assert_true(hasattr(reg, 'average_coef_'))
                assert_true(hasattr(reg, 'average_intercept_'))
                assert_true(hasattr(reg, 'standard_intercept_'))
                assert_true(hasattr(reg, 'standard_coef_'))
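When average=True, the averaged weights are exposed alongside the per-step weights; a quick way to inspect them after training, using only the attribute names asserted above:

# Averaged vs. last-step parameters of the fitted regressor.
print(reg.average_coef_.shape, reg.standard_coef_.shape)
print(reg.average_intercept_, reg.standard_intercept_)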
Example no. 6
    def refit_from_scratch(self):
        temp_model = PassiveAggressiveRegressor()
        temp_enc = CountVectorizer()
        X = []   # binary matrix of tag presence
        Z = []   # additional numerical data
        Y = []   # target (to predict) values
        db_size = self.db.size()
        for data in self.db.yield_all():
            feedback = data["feedback"]
            tags = data["tags"]
            if feedback and tags:
                Y.append(feedback)
                X.append(" ".join(tags))
                Z.append(self.fmt_numerical(data))

        X = temp_enc.fit_transform(X)
        X = hstack((X, coo_matrix(Z)))
        self.allX = X
        for i in range(X.shape[0]):
            # Fit one row at a time, pairing each row with its own target
            # (the original passed [Y[0]], which reused the first target).
            temp_model.partial_fit(X.getrow(i), [Y[i]])
        self.model = temp_model
        self.enc = temp_enc
        print(np.sum(train_clusters), np.sum(test_clusters))

        train, test = dg.generate_train_test_splits(train_clusters, test_clusters)
        train, test = shuffle(train).astype(np.float32), shuffle(test).astype(np.float32)

        train_x, train_y = train.iloc[:, 1:], train.iloc[:, 0]
        test_x, test_y = test.iloc[:, 1:], test.iloc[:, 0]

        dg.add_drift(0.5, True)

        n, d = train_x.shape
        _n += n
        for i in range(0, n, batch_size):
            count += 1
            pa.partial_fit(train_x[i:i + batch_size], train_y[i:i + batch_size])
            sgd.partial_fit(train_x[i:i + batch_size], train_y[i:i + batch_size])

            if i % 50 == 0 or i == n - 1:
                pred1 = pa.predict(test_x)
                pred2 = sgd.predict(test_x)

                # Note: np.sqrt(mean_squared_error(...)) is the RMSE,
                # despite the mse* variable names.
                mse1, mae1 = np.sqrt(mean_squared_error(test_y, pred1)), mean_absolute_error(test_y, pred1)
                mse2, mae2 = np.sqrt(mean_squared_error(test_y, pred2)), mean_absolute_error(test_y, pred2)

                pa_mse_arr.append(mse1)
                pa_mae_arr.append(mae1)
                sgd_mse_arr.append(mse2)
                sgd_mae_arr.append(mae2)
                count_arr.append(count)
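The fragment above assumes a model pair and bookkeeping variables created earlier in the (elided) script; a sketch of plausible definitions, with names inferred from usage. dg (the drift data generator) and the train_clusters/test_clusters arrays are left out because their implementation is not shown:

import numpy as np
from sklearn.linear_model import PassiveAggressiveRegressor, SGDRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.utils import shuffle

# Online learners compared head-to-head on drifting data.
pa = PassiveAggressiveRegressor()
sgd = SGDRegressor()
batch_size = 32
count, _n = 0, 0
pa_mse_arr, pa_mae_arr = [], []
sgd_mse_arr, sgd_mae_arr = [], []
count_arr = []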
Example no. 8
        Drawbacks:
            unknown for now
        The algorithm's inner workings are not yet entirely clear,
        so treat it as a black box for the time being.

'''
rg = PassiveAggressiveRegressor(C=1.0,
                                fit_intercept=True,
                                n_iter=5,
                                shuffle=True,
                                verbose=0,
                                loss='epsilon_insensitive',
                                epsilon=0.1,
                                random_state=None,
                                warm_start=False)
rg.fit(X_train, Y_train)
rg.partial_fit(X_train, Y_train)  # incremental learning
Y_pre = rg.predict(X_test)
rg.score(X_test, Y_test)
rg.coef_
rg.intercept_
'''
    C                           regularization strength coefficient
    fit_intercept               whether to fit the intercept
    n_iter                      number of iterations
    shuffle                     whether to shuffle the data
    verbose                     verbosity level
    loss                        loss function
    epsilon                     threshold for the epsilon-insensitive loss
    random_state                random number seed
    warm_start                  whether to reuse the previous fit's final result
                                as the initialization when a new fit starts
'''
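Note that this example targets an older scikit-learn: the n_iter constructor argument was removed (from 0.21 onward) in favor of max_iter and tol. A rough modern equivalent of the constructor above, as a sketch to be checked against your installed version:

rg = PassiveAggressiveRegressor(C=1.0,
                                fit_intercept=True,
                                max_iter=1000,   # replaces n_iter
                                tol=1e-3,        # stopping criterion
                                shuffle=True,
                                verbose=0,
                                loss='epsilon_insensitive',
                                epsilon=0.1,
                                random_state=None,
                                warm_start=False)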
Example no. 9
def main(input_path, output_attribute_index, scikit_output_path,
         spark_output_path):

    # Instantiate the Passive Aggressive Regressor model
    regressor = PassiveAggressiveRegressor()
    for file_path in hdfs.ls(input_path):
        # Load the file contents and build a string matrix from them
        content = hdfs.load(file_path)
        temp = content.split("\n")
        temp = list(map(lambda x: x.split(","), temp))
        temp = list(filter(lambda x: len(x) > 1, temp))
        raw_matrix = np.array(temp)
        # The numpy matrix is loaded and then parsed into a matrix of real
        # values, which is afterwards used to train the model
        # raw_matrix = np.genfromtxt(file_path, delimiter=',', dtype='string')
        input_matrix = raw_matrix[1:, 3:-5].astype('float64')
        output_vector = raw_matrix[1:, -5 +
                                   output_attribute_index].astype('float64')
        # The model is trained through iterative improvement
        regressor.partial_fit(input_matrix, output_vector)
        # Print the path of the processed file to the console
        print(file_path)

    # Save the created model to the output path
    # passed in as an argument
    with hdfs.open(scikit_output_path, 'w') as opened_file:
        pickle.dump(regressor, opened_file)

    # Initialize the application's configuration and execution context
    configuration = SparkConf().setAppName("BigDataProj3_Trainer")
    context = SparkContext(conf=configuration)
    context.setLogLevel("ERROR")
    # Initialize the session
    # (required in order to write the model)
    session = SparkSession(context)

    # Load the RDD data from the input path
    input_data = context.textFile(input_path)
    # Split each row into tokens
    input_data = input_data.map(lambda x: x.split(","))
    # Skip the headers
    input_data = input_data.filter(lambda x: x[0] != "Timestamp")
    # Drop the first three columns (Timestamp, Latitude and Longitude)
    # and select the appropriate output column
    # (depending on the output_attribute_index variable)
    input_data = input_data.map(lambda x: list(map(lambda y: float(y), x[
        3:-5])) + [float(x[-5 + output_attribute_index])])

    # Build the corresponding DataFrame object
    # (VectorAssembler is used to build the columns
    # that make linear regression's fit method usable)
    input_cols = []
    for i in range(15):
        input_cols.append("_" + str(i + 1))
    assembler = VectorAssembler(inputCols=input_cols, outputCol='features')
    data_frame = assembler.transform(input_data.toDF())

    # Instantiate a LinearRegression object, train it,
    # and then save it to the given path
    regression = LinearRegression(featuresCol='features', labelCol='_16')
    model = regression.fit(data_frame)
    model.write().overwrite().save(spark_output_path)
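To reuse the pickled scikit-learn model later, it can be read back through the same pydoop API; a sketch mirroring the save call above:

with hdfs.open(scikit_output_path, 'r') as opened_file:
    regressor = pickle.load(opened_file)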
Example no. 10
                                                    random_state=42)
    scaler.fit(Xtrain)
    Xtrain = scaler.transform(Xtrain)
    Xtest = scaler.transform(Xtest)
scaleTime = time.time() - st
print("Time to split and Scale Data: " + str(scaleTime))

#X = [[0, 0], [2, 2]]
#y = [0.5, 2.5]
#clf = MLPRegressor(hidden_layer_sizes=(1000,), random_state=1, max_iter=1, warm_start=True)
clf = PassiveAggressiveRegressor(random_state=1, warm_start=True, max_iter=100)
st = time.time()
ttList = []
for i in range(len(Xtrain)):
    tt = time.time()
    # Fit on one sample per iteration (the original passed the full
    # Xtrain/Ytrain here, refitting the whole set on every pass).
    clf = clf.partial_fit(Xtrain[i:i + 1], Ytrain[i:i + 1])
    print(i / len(Xtrain))
    ttList.append(time.time() - tt)
trainTime = time.time() - st
joblib.dump(clf, 'currentModel.mod')
pred = []
st = time.time()
for x in range(len(Xtest)):
    pred.append(clf.predict([Xtest[x]]))
score = metrics.mean_absolute_error(Ytest, clf.predict(Xtest))
predictTime = time.time() - st
print("Time to import: " + str(importTime))
print("Time to Read File: " + str(readFileTime))
print("Time to split and Scale Data: " + str(scaleTime))
print("Time to train: " + str(trainTime))
print("Time to predict: " + str(time.time() - st))
Example no. 11
            #TO DO

            #Remove text columns that have already been converted into numeric features
            train_features = train.drop(['Text', 'Summary'], axis='columns')

            #Convert features to sparse matrix
            train_features = csr_matrix(train_features.values)

            #Combine sparse matrices
            train = hstack([train_summary_dtm, train_text_dtm, train_features])

            #Scale
            train = scaler.transform(train)

            #Compute partial fit
            lm.partial_fit(train, train_score)

        #Create empty lists
        validation_pred = []
        validation_score = []

        #Loop through chunks for validation
        for reviews in pd.read_csv('Reviews.csv',
                                   index_col='Id',
                                   usecols=['Id', 'Summary', 'Text', 'Score'],
                                   chunksize=chunksize):

            #Only need validation data
            validation = reviews.iloc[reviews.index.isin(
                validation_indices[fold])]
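This fragment also depends on objects created earlier in the (elided) function; a sketch of plausible definitions, with names inferred from usage and labelled as assumptions:

import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import PassiveAggressiveRegressor
from sklearn.preprocessing import MaxAbsScaler

chunksize = 10000
lm = PassiveAggressiveRegressor()
scaler = MaxAbsScaler()  # sparse-friendly scaling, assumed pre-fitted on a first pass
# train_summary_dtm and train_text_dtm would come from vectorizing the
# 'Summary' and 'Text' columns, e.g. with a stateless HashingVectorizer;
# train_score would be the 'Score' column of the current chunk.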
Example no. 12
def main():
    X, y, coef = make_regression(1000, 200, 10, 1, noise=0.05, coef=True,
                                 random_state=42)

    # X = np.column_stack((X, np.ones(X.shape[0])))

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                        random_state=42)

    # sca = StandardScaler()
    # sca.fit(X_train)
    # X_train = sca.transform(X_train)
    # X_test = sca.transform(X_test)

    # print X.shape
    # print y.shape
    # print coef.shape

    param_grid = {
        "C": [0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 10,
              100, 1000],
        "epsilon": [0.0001, 0.001, 0.01, 0.1]}

    param_grid_kern = {
        "C": [0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 10,
              100, 1000],
        "epsilon": [0.0001, 0.001, 0.01, 0.1],
        "gamma": [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100]}
    # "loss": ["pa", "pai", "paii"]}}

    my_pa = PARegressor(loss="paii", C=1, epsilon=0.001, n_iter=1,
                        fit_intercept=False)
    #
    # search = GridSearchCV(my_pa, param_grid,
    #                       scoring='mean_absolute_error', n_jobs=8, iid=True, refit=True, cv=5,
    #                       verbose=1)
    # search.fit(X_train, y_train)
    # print search.best_params_

    my_pa.fit(X_train, y_train)
    print(my_pa.coef_)

    # y_preds = search.predict(X_test)
    y_preds = my_pa.predict(X_test)

    mae_my_pa = mean_absolute_error(y_test, y_preds)
    print "My PA MAE = %2.4f" % mae_my_pa

    my_kpa_linear = KernelPARegressor(kernel="linear", loss="paii", C=1, epsilon=0.001, n_iter=1, fit_intercept=False)
    my_kpa_linear.fit(X_train, y_train)
    print "alphas", len(my_kpa_linear.alphas_), my_kpa_linear.alphas_
    y_preds = my_kpa_linear.predict(X_test)
    mae_kpa_linear = mean_absolute_error(y_test, y_preds)
    print "My KPA linear MAE = %2.4f" % mae_kpa_linear

    my_kpa_rbf = KernelPARegressor(kernel="rbf", loss="paii", gamma=0.001, C=1, epsilon=0.001, n_iter=1, fit_intercept=False)
    # search = GridSearchCV(my_kpa_rbf, param_grid_kern,
    #                       scoring='mean_absolute_error', n_jobs=8, iid=True, refit=True, cv=5,
    #                       verbose=1)
    # search.fit(X_train, y_train)

    my_kpa_rbf.fit(X_train, y_train)
    print "alphas", len(my_kpa_rbf.alphas_), my_kpa_rbf.alphas_
    print "support", len(my_kpa_rbf.support_)
    # print "alphas", len(search.best_estimator_.alphas_)  # , my_kpa_rbf.alphas_
    # print "support", len(search.best_estimator_.support_)
    # print search.best_params_
    y_preds = my_kpa_rbf.predict(X_test)
    # y_preds = search.predict(X_test)
    mae_my_kpa = mean_absolute_error(y_test, y_preds)
    print "My Kernel PA MAE = %2.4f" % mae_my_kpa

    # print search.best_estimator_
    # print np.corrcoef(search.best_estimator_.coef_, coef)

    # param_grid = {
    # "C": [0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 10,
    #           100, 1000, 10000],
    #     "epsilon": [0.0001, 0.001, 0.01, 0.1],
    #     # "loss": ["epsilon_insensitive", "squared_epsilon_insensitive"]}
    #     "loss": ["squared_epsilon_insensitive"]}


    # search = GridSearchCV(PassiveAggressiveRegressor(fit_intercept=True),
    # param_grid, scoring='mean_absolute_error', n_jobs=8, iid=True,
    # refit=True, cv=5, verbose=1)
    # search.fit(X_train, y_train)

    sk_pa = PassiveAggressiveRegressor(loss="squared_epsilon_insensitive", C=1,
                                       epsilon=0.001, n_iter=1,
                                       fit_intercept=False,
                                       warm_start=True)
    for i in range(X_train.shape[0]):
        # for x_i, y_i in zip(X_train, y_train):
        x = np.array(X_train[i], ndmin=2)
        y = np.array(y_train[i], ndmin=1)
        # print x.shape
        # print y
        sk_pa.partial_fit(x, y)

    # sk_pa.fit(X_train, y_train)

    # y_preds = search.predict(X_test)
    y_preds = sk_pa.predict(X_test)
    mae_sk_pa = mean_absolute_error(y_test, y_preds)
    print("Sklearn PA MAE = %2.4f" % mae_sk_pa)
Example no. 13
    X, Y = make_regression(n_samples=nb_samples_1,
                           n_features=5,
                           random_state=1000)

    # Create the model
    par = PassiveAggressiveRegressor(C=0.01,
                                     loss='squared_epsilon_insensitive',
                                     epsilon=0.001,
                                     max_iter=2000,
                                     random_state=1000)

    # Fit the model incrementally and collect the squared errors
    squared_errors = []

    for (x, y) in zip(X, Y):
        par.partial_fit(x.reshape(1, -1), y.ravel())
        y_pred = par.predict(x.reshape(1, -1))
        squared_errors.append(np.power(y_pred - y, 2))

    # Show the error plot
    fig, ax = plt.subplots(figsize=(18, 8))

    ax.plot(squared_errors)
    ax.set_xlabel('Sample')
    ax.set_ylabel('Squared error')
    ax.grid()

    plt.show()

    # Repeat the example with a discontinuous dataset
    X1, Y1 = make_regression(n_samples=nb_samples_2,