Example #1
def scaler_constructor(flags: list):
    if ('sine_advanced' in flags) or ('quad_advanced' in flags):
        if ('sine_advanced' in flags) and ('quad_advanced' in flags):
            features = FeatureUnion([
                ("sine", preprocessing.FunctionTransformer(np.sin)),
                ("quadratic", preprocessing.FunctionTransformer(np.square))
            ])
            print('a')
        elif ('sine_advanced' in flags):
            features = preprocessing.FunctionTransformer(np.sin)
            print('b')
        else:
            features = preprocessing.FunctionTransformer(np.square)
            print('c')
        if ('norm_advanced' in flags):
            scaler = Pipeline([('features', features),
                               ('norm', preprocessing.StandardScaler()),
                               ('final_operation',
                                preprocessing.MinMaxScaler())])
            print('d')
        else:
            scaler = Pipeline([('features', features),
                               ('final_operation',
                                preprocessing.MinMaxScaler())])
            print('e')
    elif ('norm_advanced' in flags):
        scaler = Pipeline([('norm', preprocessing.StandardScaler()),
                           ('final_operation', preprocessing.MinMaxScaler())])
        print('f')
    else:
        scaler = preprocessing.MinMaxScaler()
        print('g')
    return scaler
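A minimal usage sketch (hypothetical data; it assumes the same imports the snippet relies on: numpy as np, sklearn's preprocessing, Pipeline and FeatureUnion):

import numpy as np

X = np.linspace(0.0, 1.0, 8).reshape(-1, 1)
scaler = scaler_constructor(['sine_advanced', 'norm_advanced'])  # prints 'b', then 'd'
X_scaled = scaler.fit_transform(X)
print(X_scaled.min(), X_scaled.max())  # the final MinMaxScaler maps the output into [0, 1]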
Example #2
def create_estimator(ml_obj, numeric_features, cat_features, date_features):
    estimator = pipeline.Pipeline(steps=[
        ('Feature_processing', pipeline.FeatureUnion(transformer_list=[
            ('Numeric_features', pipeline.Pipeline(steps=[
                ('selecting', preprocessing.FunctionTransformer(
                    lambda data: data[:, numeric_features], validate=True)),
                ('scaling', preprocessing.StandardScaler(with_mean=0., with_std=1)),
            ])),
            ('Categorical_features', pipeline.Pipeline(steps=[
                ('selecting', preprocessing.FunctionTransformer(
                    lambda data: data[:, cat_features], validate=True)),
                ('hot_encoding', preprocessing.OneHotEncoder(handle_unknown='ignore')),
            ])),
            ('Date_features', pipeline.Pipeline(steps=[
                ('selecting', preprocessing.FunctionTransformer(
                    lambda data: data[:, date_features], validate=True)),
                ('hot_encoding', preprocessing.OneHotEncoder(handle_unknown='ignore')),
            ])),
        ])),
        ('Model_fitting', ml_obj),
    ])
    return estimator
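A hypothetical usage sketch: the selecting steps slice with data[:, indices], so the estimator expects a plain 2-D numpy array, with the column index lists chosen by the caller:

from sklearn.linear_model import LogisticRegression

est = create_estimator(LogisticRegression(),
                       numeric_features=[0], cat_features=[1], date_features=[2])
# est.fit(X, y) and est.predict(X) then work on a 2-D numeric array X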


Example #3
# TODO: make a custom scorer
def sk_function_transformer():
    def simple_preprocessor(numpy_x):
        return numpy_x**2

    transformer = sk_preprocessing.FunctionTransformer(simple_preprocessor,
                                                       validate=True)
    return transformer
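A quick check of the transformer above (assuming sk_preprocessing is sklearn.preprocessing):

import numpy as np

t = sk_function_transformer()
print(t.fit_transform(np.array([[1.0, 2.0, 3.0]])))  # [[1. 4. 9.]]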
Example #4
def define_pipeline():
    categorical = ('season', 'holiday', 'workingday', )
    numerical = ('datetime', 'weather', 'temp', 'atemp', 'humidity', 'windspeed',) # Datetime isn't numerical, but needs to be in the numeric branch
    pipeline = Pipeline([
        # Process cat & num separately, then join back together
        ('union', FeatureUnion([ 
            ('categorical', Pipeline([
                ('select_cat', fe.SelectCols(cols = categorical)),
                ('onehot', OneHotEncoder()),    
            ])),    
            ('numerical', Pipeline([
                ('select_num', fe.SelectCols(cols = numerical)),
                ('prog_age', fe.AddProgAge()),
                ('date', fe.DateFormatter()),
                ('daily_max', fe.DailyGroup(func = np.max, cols = ['weather'], rsuffix = '_dailymax')),
                ('daily_mean', fe.DailyGroup(func = np.mean, cols = ['temp'], rsuffix = '_dailymean')),
                ('drop_datetime', fe.SelectCols(cols = ('datetime', 'month'), invert = True)),
                ('temp', fe.ProcessNumerical(cols_to_square = ('temp', 'atemp', 'humidity'),)),
                ('rollingweather', fe.RollingWindow(cols = ('weather', ))),
                ('forecast', fe.WeatherForecast()),
                # ('bad_weather', fe.BinarySplitter(col = 'weather', threshold = 2)),
                # ('filter', fe.PassFilter(col='atemp', lb = 15, replacement_style = 'mean'))
                ('scale', StandardScaler()),    
            ])),    
        ])),
        ('to_dense', preprocessing.FunctionTransformer(lambda x: x.todense(), accept_sparse=True)), 
        ('clf', GradientBoostingRegressor(n_estimators=100,random_state=2)),
    ])
    return pipeline
Example #5
def main():
    data = pd.read_csv(sys.argv[1])

    # Build model_rgb to predict the colour word from RGB values scaled to 0-1,
    # then model_lab, which converts to LAB colour space before fitting.
    rgb_columns = ["R", "G", "B"]
    data[rgb_columns] = data[rgb_columns].values / 255
    X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
        data[rgb_columns].values, data["Label"].values)

    model_rgb = GaussianNB()
    model_rgb.fit(X_train, Y_train)
    Y_predicted = model_rgb.predict(X_test)
    print(accuracy_score(Y_test, Y_predicted))

    model_lab = pipeline.make_pipeline(
        preprocessing.FunctionTransformer(my_rgb2lab), GaussianNB())
    model_lab.fit(X_train, Y_train)
    Y_predicted_lab = model_lab.predict(X_test)
    print(accuracy_score(Y_test, Y_predicted_lab))

    plot_predictions(model_rgb)
    plt.savefig('predictions_rgb.png')
    plot_predictions(model_lab)
    plt.savefig('predictions_lab.png')
Example #6
def fit_encoder(encoding_method, df):
    """

    Parameters
    ----------
    encoding_method: {"OneHot", "OneHot_drop_binary", "Identity"}
        String indicating what encoding method to use.
    df: pd.DataFrame
        DataFrame containing only categorical data.

    Returns
    -------
    sklearn.base.BaseEstimator

    """
    # TODO: rather than passing a string, this could accept a function
    if encoding_method == "OneHot":
        fitted_encoder = preprocessing.OneHotEncoder(handle_unknown="error",
                                                     sparse=False).fit(df)
    elif encoding_method == "OneHot_drop_binary":
        fitted_encoder = preprocessing.OneHotEncoder(drop="if_binary",
                                                     handle_unknown="error",
                                                     sparse=False).fit(df)
    elif encoding_method is None or encoding_method == "Identity":
        fitted_encoder = preprocessing.FunctionTransformer(func=None,
                                                           inverse_func=None)
    else:
        raise ValueError("Encoding Method not known")
    return fitted_encoder
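A small usage sketch with made-up data. Note that OneHotEncoder's sparse= argument was renamed to sparse_output= in scikit-learn 1.2, so this snippet targets older versions:

import pandas as pd

df = pd.DataFrame({"colour": ["red", "blue", "red"], "size": ["S", "L", "S"]})
enc = fit_encoder("OneHot", df)
print(enc.transform(df))  # one 0/1 column per category, dense because sparse=False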
Example #7
def main(infile):
    data = pd.read_csv(infile)
    X = data[[
        'R', 'G', 'B'
    ]] / 255  # array with shape (n, 3). Divide by 255 so components are all 0-1.
    y = data['Label']  # array with shape (n,) of colour words.
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=0.2)
    model_rgb = GaussianNB()

    model_rgb.fit(X_train, y_train)
    accuracy_score = model_rgb.score(X_test, y_test)
    print("The accuracy score of RGB is %.3g" % accuracy_score)
    plot_predictions(model_rgb)
    plt.savefig('predictions_rgb.png')

    model_lab = sp.make_pipeline(spr.FunctionTransformer(RGB_LAB),
                                 GaussianNB(priors=None))
    model_lab.fit(X_train, y_train)
    accuracy_score = model_lab.score(X_test, y_test)
    print("The accuracy score of LAB is %.3g" % accuracy_score)
    plot_predictions(model_lab)
    plt.savefig('predictions_lab.png')
Example #8
def log_transform(x_train_dum, scale_list):
    ''' Log Transformer '''
    logtf = preprocessing.FunctionTransformer(np.log1p)
    x_train_logtf = x_train_dum.copy()
    for i in scale_list:
        x_train_logtf.iloc[:, i] = logtf.transform(x_train_dum.iloc[:, i])
    return x_train_logtf
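A worked toy example (log1p(10) ≈ 2.398, log1p(100) ≈ 4.615). scale_list holds positional column indices; with validate left at its modern default of False, the 1-D column passes straight through to np.log1p:

import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [0.0, 1.0], "b": [10.0, 100.0]})
print(log_transform(df, scale_list=[1]))  # column "b" becomes [2.3979, 4.6151]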
Example #9
def fit_scaler(scaling_method, df):
    """

    Parameters
    ----------
    scaling_method: {"MinMax", "Standard", "Identity"}
        String indicating what scaling method to use.
    df: pd.DataFrame
        DataFrame only containing continuous data.

    Returns
    -------
    sklearn.base.BaseEstimator

    """
    # TODO: rather than passing a string, this could accept a function
    if scaling_method == "MinMax":
        fitted_scaler = preprocessing.MinMaxScaler().fit(df)
    elif scaling_method == "Standard":
        fitted_scaler = preprocessing.StandardScaler().fit(df)
    elif scaling_method is None or scaling_method == "Identity":
        fitted_scaler = preprocessing.FunctionTransformer(func=None,
                                                          inverse_func=None)
    else:
        raise ValueError("Scaling Method not known")
    return fitted_scaler
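Usage sketch on toy data:

import pandas as pd

df = pd.DataFrame({"x": [0.0, 5.0, 10.0]})
scaler = fit_scaler("MinMax", df)
print(scaler.transform(df))  # [[0. ], [0.5], [1. ]]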
Example #10
def transform(type, train_frame, validation_frame, test_frame, columns):
    test_frame_X = test_frame.drop(['y'], axis=1)
    test_frame_Y = test_frame['y']
    train_frame_X = train_frame.drop(['y'], axis=1)
    train_frame_Y = train_frame['y']
    validation_frame_X = validation_frame.drop(['y'], axis=1)
    validation_frame_Y = validation_frame['y']

    if type == 'log':
        function = preprocessing.FunctionTransformer(log01p, validate=False)
    elif type == 'bi':
        function = preprocessing.Binarizer(threshold=0)
    elif type == 'std':
        function = preprocessing.StandardScaler().fit(train_frame_X)
    else:
        raise ValueError("type must be one of 'log', 'bi' or 'std'")

    test_data_X = function.transform(test_frame_X)
    train_data_X = function.transform(train_frame_X)
    validation_data_X = function.transform(validation_frame_X)

    test_set = pd.DataFrame(test_data_X, columns=columns)
    test_set['y'] = test_frame_Y.values
    train_set = pd.DataFrame(train_data_X, columns=columns)
    train_set['y'] = train_frame_Y.values
    validation_set = pd.DataFrame(validation_data_X, columns=columns)
    validation_set['y'] = validation_frame_Y.values
    return [train_set, validation_set, test_set]
Example #11
    def __init__(self, estimator, transform=None):
        """a container for a trained estimator and transform

    Input:
        estimator: a fitted sklearn estimator
        transform: a fitted sklearn transform

    For example:
        >>> from sklearn.datasets import load_iris
        >>> data = load_iris()
        >>> d = MLData(*traintest(data.data[:,:3], data.data[:,3], .2))
        >>> from sklearn.linear_model import LinearRegression
        >>> from sklearn.preprocessing import StandardScaler
        >>> xfm = StandardScaler().fit(d.xtrain)
        >>> lnr = LinearRegression().fit(xfm.transform(d.xtrain), d.ytrain)
        >>> e = Estimator(lnr, xfm)
        >>> [e(*i) for i in d.xtest[:2]]
        [1.7802194778123053, 1.3775908988859642]
        >>> e.test(d.xtest)[:2].tolist()
        [1.7802194778123053, 1.3775908988859642]
        >>> d.ytest[:2].tolist()
        [1.8, 1.3]
        >>> e.score(d.xtest, d.ytest)
        0.9440222526291645
        """
        self.estimator = estimator
        if transform is None:
            import sklearn.preprocessing as pre
            transform = pre.FunctionTransformer()  #XXX: or StandardScaler ?
        self.transform = transform
        self.function = lambda *x: float(
            self.test(np.array(x).reshape(1, -1)).reshape(-1))
Example #12
def define_fuc(x):
    '''
    Put simply: feed the original features through a function and take the
    function's output as the new features.
    '''
    fuc = np.log1p
    transformer = preprocessing.FunctionTransformer(fuc)
    x_t = transformer.transform(x)
    return x_t
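For instance (log1p(0) = 0 and log1p(e - 1) = 1):

import numpy as np

print(define_fuc(np.array([[0.0, 1.0, np.e - 1]])))  # [[0.     0.6931 1.    ]]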
Example #13
def make_log_plot(state_name, visible=True):
    state_data = states_data[states_data['state'] == state_name]
    log_transformer = preprocessing.FunctionTransformer(np.log, validate=True)
    confirmed_log = log_transformer.fit_transform(
        state_data['confirmed'].values.reshape(-1, 1))
    return go.Scatter(x=state_data['date'], y=confirmed_log.ravel(), mode='lines',
                      line=dict(color='cornflowerblue', width=1.5),
                      name='Confirmed cases',
                      visible=visible)
Example #14
 def transform_normalize(self, X):
     print("Transforming with log1p and scaling with MinMaxScaler")
     # transform data with log1p function - data is right skewed
     transformer = preprocessing.FunctionTransformer(np.log1p,
                                                     validate=True)
     X = transformer.transform(X)
     # normalize - to similarly scale the data
     X = preprocessing.MinMaxScaler().fit_transform(X)
     return X
Example #15
def main():
    data = pd.read_csv(sys.argv[1])
    X = data[['R', 'G', 'B']].values / 255
    y = data['Label'].values
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=0.3)

    bayes_rgb_model = GaussianNB(priors=None)

    bayes_lab_model = sp.make_pipeline(spr.FunctionTransformer(RGB_LAB),
                                       GaussianNB(priors=None))

    knn_rgb_model = KNeighborsClassifier(n_neighbors=9)

    knn_lab_model = sp.make_pipeline(spr.FunctionTransformer(RGB_LAB),
                                     KNeighborsClassifier(n_neighbors=9))

    svc_rgb_model = SVC(kernel='linear', C=3)

    svc_lab_model = sp.make_pipeline(spr.FunctionTransformer(RGB_LAB),
                                     SVC(kernel='linear', C=0.1))

    # train each model and output image of predictions
    models = [
        bayes_rgb_model, bayes_lab_model, knn_rgb_model, knn_lab_model,
        svc_rgb_model, svc_lab_model
    ]
    # models = [svc_lab_model]
    for i, m in enumerate(
            models):  # yes, you can leave this loop in if you want.
        m.fit(X_train, y_train)
        plot_predictions(m)
        plt.savefig('predictions-%i.png' % (i, ))

    print(
        OUTPUT_TEMPLATE.format(
            bayes_rgb=bayes_rgb_model.score(X_test, y_test),
            bayes_lab=bayes_lab_model.score(X_test, y_test),
            knn_rgb=knn_rgb_model.score(X_test, y_test),
            knn_lab=knn_lab_model.score(X_test, y_test),
            svm_rgb=svc_rgb_model.score(X_test, y_test),
            svm_lab=svc_lab_model.score(X_test, y_test),
        ))
Example #16
def gene_feature(data_pd):
    # binary columns
    col_binary = ['holiday', 'workingday']
    index_binary = np.asarray([(col in col_binary) for col in data_pd.columns],
                              dtype=bool)
    # categorical columns
    col_cate = ['season', 'weather']
    index_cate = np.asarray([(col in col_cate) for col in data_pd.columns],
                            dtype=bool)
    # numeric columns
    col_num = ['temp', 'atemp', 'humidity', 'windspeed']
    index_num = np.asarray([(col in col_num) for col in data_pd.columns],
                           dtype=bool)
    # pass-through columns (kept as-is)
    col_normal = ['month', 'day', 'hour']
    normal_num = np.asarray([(col in col_normal) for col in data_pd.columns],
                            dtype=bool)

    # One selector+transform pipeline per column group. The FunctionTransformer
    # selectors index a 2-D array, so pass numpy values (or rely on an older
    # sklearn, <0.22, where validate=True was the default and converted the frame).
    feature_trans_list = [
        ('binary_value', Pipeline(steps=[
            ('select', preprocessing.FunctionTransformer(lambda x: x[:, index_binary])),
            ('transform', preprocessing.OneHotEncoder()),
        ])),
        ('cate_value', Pipeline(steps=[
            ('select', preprocessing.FunctionTransformer(lambda x: x[:, index_cate])),
            ('transform', preprocessing.OneHotEncoder()),
        ])),
        ('numeric_value', Pipeline(steps=[
            ('select', preprocessing.FunctionTransformer(lambda x: x[:, index_num])),
            ('transform', preprocessing.StandardScaler(with_mean=0)),
        ])),
        ('normal_value', Pipeline(steps=[
            ('select', preprocessing.FunctionTransformer(lambda x: x[:, normal_num])),
        ])),
    ]
    feature_union = FeatureUnion(feature_trans_list)
    feature_set = feature_union.fit_transform(data_pd).toarray()
    return feature_set
Example #17
 def _log_transform(self):
     for c in self.num_feats:
         logt = preprocessing.FunctionTransformer(np.log1p,
                                                  inverse_func=np.expm1,
                                                  validate=True)
         col = self.df[c].values.reshape(-1, 1)
         logt.fit(col)
         self.output_df.loc[:, c] = logt.transform(col)
         self.log_transform[c] = logt
     return self.output_df, self.log_transform
Example #18
 def get_estimator(self):
     binary = ('binary_variables_processing',
               preprocessing.FunctionTransformer(
                   lambda data: data[:, Model.binary_data_indices],
                   validate=True))
     categorical = ('categorical_variables_processing',
                    pipeline.Pipeline(steps=[
                        ('selecting', preprocessing.FunctionTransformer(
                            lambda data: data[:, Model.categorical_data_indices],
                            validate=True)),
                        ('hot_encoding', preprocessing.OneHotEncoder(
                            handle_unknown='ignore', sparse=False)),
                    ]))
     estimator = pipeline.Pipeline(steps=[
         ('feature_processing', pipeline.FeatureUnion(
             transformer_list=[binary, categorical])),
         ('model_fitting', self.regressor),
     ])
     return estimator
Example #19
def param_tuning_graphs(train_data, dev_data, train_label, pipeline, parameter, param_values):

    categorical = ('season', 'holiday', 'workingday', )
    numerical = ('datetime', 'weather', 'temp', 'atemp', 'humidity', 'windspeed',) # Datetime isn't numerical, but needs to be in the numeric branch
    # Note: the `pipeline` argument is immediately shadowed by the Pipeline built below.
    pipeline = Pipeline([
        # Process cat & num separately, then join back together
        ('union', FeatureUnion([ 
            ('categorical', Pipeline([
                ('select_cat', fe.SelectCols(cols = categorical)),
                ('onehot', OneHotEncoder()),    
            ])),    
            ('numerical', Pipeline([
                ('select_num', fe.SelectCols(cols = numerical)),
                ('date', fe.DateFormatter()),
                #('drop_datetime', fe.SelectCols(cols = ('datetime'), invert = True)),
                ('temp', fe.ProcessNumerical(cols_to_square = ('temp', 'atemp', 'humidity'),)),
                # ('bad_weather', fe.BinarySplitter(col = 'weather', threshold = 2)),
                # ('filter', fe.PassFilter(col='atemp', lb = 15, replacement_style = 'mean'))
                ('scale', StandardScaler()),    
            ])),    
        ])),
        ('to_dense', preprocessing.FunctionTransformer(lambda x: x.todense(), accept_sparse=True)), 
        #('clf', GradientBoostingRegressor(n_estimators=100,random_state=2)),
    ])

    # Run train and dev data through pipeline for feature engineering
    features = [c for c in train_data.columns if c not in ['count', 'casual', 'registered', 'log_casual', 'log_registered', 'prog_age']]
    fe_train_data = pipeline.fit_transform(train_data[features])
    fe_dev_data = pipeline.transform(dev_data[features])

    row_format = "{:>10}" * 6
    rmse_list=[]
    for i in param_values:
        t0 = time()
        if parameter == 'n_estimators':
            gb = GradientBoostingRegressor(n_estimators=i,learning_rate=0.05,max_depth=10, min_samples_leaf=20,random_state=2)
        if parameter == 'learning_rate': 
            gb = GradientBoostingRegressor(n_estimators=115,learning_rate=i,max_depth=10, min_samples_leaf=20,random_state=2)
        if parameter == 'max_depth': 
            gb = GradientBoostingRegressor(n_estimators=115,learning_rate=0.05,max_depth=i, min_samples_leaf=20,random_state=2)
        if parameter == 'min_samples_leaf': 
            gb = GradientBoostingRegressor(n_estimators=115,learning_rate=0.05,max_depth=10, min_samples_leaf=i,random_state=2)
        gb.fit(fe_train_data, train_data[train_label])
        predicted_y = gb.predict(fe_dev_data)
        rmse = get_RMSE(actual_values = dev_data[train_label], predicted_values = predicted_y)
        rmse_list.append(round(rmse,3))
        print(row_format.format(parameter + ":", i, "RMSE:", round(rmse, 3),
                                "Runtime:", round(time() - t0, 3)))
    plt.plot(param_values,rmse_list)
    plt.show()
    return rmse_list
Example #20
def main():
    data = pd.read_csv(sys.argv[1])

    # Scale the RGB components to 0-1 and split off the colour-word labels.
    rgb_columns = ["R", "G", "B"]
    data[rgb_columns] = data[rgb_columns].values / 255
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        data[rgb_columns].values, data["Label"].values)

    bayes_rgb_model = GaussianNB()

    bayes_lab_model = pipeline.make_pipeline(preprocessing.FunctionTransformer(my_rgb2lab),GaussianNB())

    knn_rgb_model = KNeighborsClassifier(15)
    
    knn_lab_model = pipeline.make_pipeline(preprocessing.FunctionTransformer(my_rgb2lab),KNeighborsClassifier(15))
    
    svc_rgb_model = svm.SVC(C=30)
    
    svc_lab_model = pipeline.make_pipeline(preprocessing.FunctionTransformer(my_rgb2lab),svm.SVC(C=1.0,kernel="linear", decision_function_shape="ovr"))
    
    # train each model and output image of predictions
    models = [bayes_rgb_model, bayes_lab_model, knn_rgb_model, knn_lab_model, svc_rgb_model, svc_lab_model]
    for i, m in enumerate(models):  # yes, you can leave this loop in if you want.
        m.fit(X_train, y_train)
        plot_predictions(m)
        plt.savefig('predictions-%i.png' % (i,))

    print(OUTPUT_TEMPLATE.format(
        bayes_rgb=bayes_rgb_model.score(X_test, y_test),
        bayes_lab=bayes_lab_model.score(X_test, y_test),
        knn_rgb=knn_rgb_model.score(X_test, y_test),
        knn_lab=knn_lab_model.score(X_test, y_test),
        svm_rgb=svc_rgb_model.score(X_test, y_test),
        svm_lab=svc_lab_model.score(X_test, y_test),
    ))
Example #21
def generate_model(pred_vars,
                   log_transform=True,
                   one_hot_week=False,
                   method="lm"):
    """
    Generate the model for transforming and predicting.
    ...
    """
    assert method in ['lm',
                      'poisson'], "method must be one of 'lm' or 'poisson'"
    if log_transform:
        ft = preprocessing.FunctionTransformer(np.log)
    else:
        ft = preprocessing.FunctionTransformer()

    if one_hot_week:
        model_prep = compose.ColumnTransformer(
            [("onehot_categorical", preprocessing.OneHotEncoder(),
              ["week_num"]), ("num_scaler", ft, pred_vars)],
            remainder="drop",
        )
    else:
        model_prep = compose.ColumnTransformer(
            [("num_scaler", ft, pred_vars + ['ca_prop'])],
            remainder="drop",
        )
    if method == 'lm':
        pipe = pipeline.Pipeline([("preprocessor", model_prep),
                                  ("regressor",
                                   linear_model.LinearRegression())])
    elif method == 'poisson':
        pipe = pipeline.Pipeline([
            ("preprocessor", model_prep),
            ("regressor",
             linear_model.PoissonRegressor(alpha=1e-12, max_iter=10000))
        ])
    return pipe
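A hypothetical call (the column names are made up; pred_vars must be columns of the DataFrame later passed to fit, and week_num must exist when one_hot_week=True):

pipe = generate_model(['temp', 'rainfall'], log_transform=True,
                      one_hot_week=True, method='poisson')
# pipe.fit(df_train, y_train); pipe.predict(df_test)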
Example #22
    def __init__(self, verbose=False, have_cache_data=False):
        self.__net = None
        self.__verbose = verbose

        log_transformer = preprocessing.FunctionTransformer(
            np.log1p, _inv_log1p,
            validate=True)
        scale_transformer = preprocessing.MinMaxScaler()

        self.__pipeline = Pipeline([("log", log_transformer),
                                    ("scale", scale_transformer)])
        
        self.__tree_transform = TreeFeaturizer()
        self.__have_cache_data = have_cache_data
        self.__in_channels = None
        self.__n = 0
Example #23
File: sklearn.py  Project: esvhd/mlstack
def make_transformer(func: Callable, **kwargs):
    """Make an sklearn transformer, to use with transform() function.

    Parameters
    ----------
    func : Callable
        function to perform the transform

    Returns
    -------
    sklearn.preprocessing.FunctionTransformer
        transformer wrapping ``func``, with ``kwargs`` forwarded as ``kw_args``
    """

    transformer = skp.FunctionTransformer(func, kw_args=kwargs)
    return transformer
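For example, the extra keyword arguments are forwarded to func through kw_args:

import numpy as np

clip01 = make_transformer(np.clip, a_min=0.0, a_max=1.0)
print(clip01.fit_transform(np.array([[-0.5, 0.3, 2.0]])))  # [[0.  0.3 1. ]]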
Example #24
def define_pipeline():
    categorical = (
        'season',
        'holiday',
        'workingday',
    )
    numerical = (
        'datetime',
        'weather',
        'temp',
        'atemp',
        'humidity',
        'windspeed',
    )  # Datetime isn't numerical, but needs to be in the numeric branch
    pipeline = Pipeline([
        # Process cat & num separately, then join back together
        ('union',
         FeatureUnion([
             ('categorical',
              Pipeline([
                  ('select_cat', fe.SelectCols(cols=categorical)),
                  ('onehot', OneHotEncoder()),
              ])),
             ('numerical',
              Pipeline([
                  ('select_num', fe.SelectCols(cols=numerical)),
                  ('date', fe.DateFormatter()),
                  ('drop_datetime',
                   fe.SelectCols(cols=('datetime', 'month'), invert=True)),
                  ('fix_bad_vals',
                   fe.FillData(cols=('windspeed', 'humidity'), threshold=1)),
                  ('temp',
                   fe.ProcessNumerical(cols_to_square=('temp', 'atemp',
                                                       'humidity'), )),
                  ('rollingweather', fe.RollingWindow(cols=('weather', ))),
                  ('forecast', fe.WeatherForecast()),
                  ('scale', StandardScaler()),
              ])),
         ])),
        ('to_dense',
         preprocessing.FunctionTransformer(lambda x: x.todense(),
                                           accept_sparse=True)),
        ('clf', GradientBoostingRegressor(n_estimators=100, random_state=2)),
    ])
    return pipeline
Example #25
 def fit(self, on_engine, velocities, accelerations, *args):
     if on_engine.all():
         self.base = self.model = DefaultStartStopModel()
     else:
         X = np.column_stack((velocities, accelerations) + args)
         model = sk_tree.DecisionTreeClassifier(random_state=0, max_depth=4)
         self.model = sk_pip.Pipeline([('feature_selection',
                                        sk_fsel.SelectFromModel(model)),
                                       ('classification', model)])
         self.model.fit(X, on_engine)
         model = sk_tree.DecisionTreeClassifier(random_state=0, max_depth=3)
         self.base = sk_pip.Pipeline([
             ('feature_selection',
              sk_prep.FunctionTransformer(lambda X: X[:, :2])),
             ('classification', model)
         ])
         self.base.fit(X, on_engine)
     return self
Example #26
    print("二值化: \n",
          preprocessing.Binarizer(threshold=3).fit_transform(iris_x))
    print("哑编码: \n",
          preprocessing.OneHotEncoder().fit_transform(iris_y.reshape(-1, 1)))

    from numpy import vstack, array, nan
    print(
        "填充缺失值:\n",
        preprocessing.Imputer().fit_transform(
            vstack((array([nan, nan, nan, nan]), iris_x))))

    print("多项式变化:\n", preprocessing.PolynomialFeatures().fit_transform(iris_x))

    from numpy import log1p
    print("自定义转换函数:\n",
          preprocessing.FunctionTransformer(log1p).fit_transform(iris_x))

    from sklearn.feature_selection import VarianceThreshold
    print("方差选择法:\n", VarianceThreshold(threshold=3).fit_transform(iris_x))

    from sklearn.feature_selection import SelectKBest, chi2
    print("卡方检验:\n", SelectKBest(chi2, k=2).fit_transform(iris_x, iris_y))

    from sklearn.feature_selection import RFE
    from sklearn.linear_model import LogisticRegression
    # 参数estimator为基模型
    # 参数n_features_to_select为选择的特征个数
    print(
        "递归特征消除法:\n",
        RFE(estimator=LogisticRegression(),
            n_features_to_select=2).fit_transform(iris_x, iris_y))
Example #27
result = polynomial.fit_transform(matrix)
print(result)

polynomial = preprocessing.PolynomialFeatures(degree = 3, include_bias = False)
result = polynomial.fit_transform(matrix)
print(result)

# Applying a function
matrix = np.array([[100, 200], [300, 150]])
print(matrix)

# add 100 to each element
def intconvert(x):
    return x + 100

transformer = preprocessing.FunctionTransformer(intconvert)
result = transformer.transform(matrix)
print(result)

print(data['국어'])
print(data['국어'].apply(intconvert))

import numpy as np
import pandas as pd

# a function that takes an array and returns the values that lie outside the
# z-score range (more than 3 standard deviations from the mean)

def z_score_outlier(ar) :
    threshold = 3
    # get the mean
Example #28
                'numerical',
                Pipeline([
                    ('select_num', fe.SelectCols(cols=numerical)),
                    ('date', fe.DateFormatter()),
                    ('drop_datetime',
                     fe.SelectCols(cols=('datetime',), invert=True)),
                    ('temp',
                     fe.ProcessNumerical(cols_to_square=('temp', 'atemp',
                                                         'humidity'))),
                    # ('bad_weather', fe.BinarySplitter(col = 'weather', threshold = 2)),
                    # ('filter', fe.PassFilter(col='atemp', lb = 15, replacement_style = 'mean'))
                    ('scale', StandardScaler()),
                ])),
        ])),
    ('to_dense',
     preprocessing.FunctionTransformer(lambda x: x.todense(),
                                       accept_sparse=True)),
    ('clf', GradientBoostingRegressor(n_estimators=100, random_state=2)),
])


# Helper function to calculate root mean squared log error (RMSLE)
def get_RMSE(actual_values, predicted_values):
    n = len(actual_values)
    RMSE = np.sqrt(
        np.sum(
            ((np.log(predicted_values + 1) - np.log(actual_values + 1))**2) /
            n))
    return RMSE
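A quick sanity check of the metric (it is computed on log(x + 1), i.e. it is really RMSLE):

import numpy as np

print(get_RMSE(np.array([1.0, 10.0]), np.array([1.0, 12.0])))  # ~0.118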


Example #29
def Standard_Features(X_train, mode, X_test):
    '''
    :param X_train: DataFrame or 2-D array, m x n: m rows, n feature columns
    :param mode: feature-scaling mode: 'zscore', 'zscore_t', 'maxmin', 'maxmin_t',
                 'log', 'scale', 'maxminscale' or 'logscale'
    :param X_test: DataFrame or 2-D array, m' x n': m' rows, n' feature columns.
                   When only X_train is scaled (i.e., mode='scale'), X_test need not be supplied.
    :return: Standard_X_train: 2-D array, m x n
             Standard_X_test: 2-D array, m' x n'
    '''

    # Each project is scaled with its own mean/variance (or left unnormalized),
    # since the test set may not contain the extreme values; min-max normalization
    # is kept so its effect can be compared against standardization.
    # CC holds that changing the data distribution is acceptable, because
    # cross-project distributions differ from the source distribution anyway.
    # Xy = Sample.values  # DataFrame2Array
    # X = Sample[:, :-1]  # mX(n-1): m samples, n features
    # y = Sample[:, -1]  # mX1: m label values
    # X_Mean = X.mean(axis=0)
    # X_Std = X.std(axis=0)
    # X_Max = X.max(axis=0)
    # X_Min = X.min(axis=0)

    # Standardization: z-score, zero mean and unit variance; suitable when the min
    # and max of an attribute are unknown, or when outliers fall outside the
    # expected value range.
    if mode == 'zscore':
        print(
            '*                  Do Z-score on source & target datasets according to source ...'
        )
        zscore_scaler = preprocessing.StandardScaler(
            copy=True, with_mean=True,
            with_std=True)  # Binarizer, Imputer, LabelBinarizer
        zscore_scaler.fit(X_train)
        X_train_zscore = zscore_scaler.transform(X_train)
        X_test_zscore = zscore_scaler.transform(X_test)
        return X_train_zscore, X_test_zscore

    # Normalization: max-min normalization, a linear transform of the raw data that
    # maps it into [0, 1]; robust for features with very small variance and
    # preserves zero entries in sparse matrices.
    elif mode == 'maxmin':
        print(
            '*                  Do max-min on source & target datasets according to source ...'
        )
        min_max_scaler = preprocessing.MinMaxScaler()
        X_train_minmax = min_max_scaler.fit_transform(X_train)
        X_test_minmax = min_max_scaler.transform(X_test)
        return X_train_minmax, X_test_minmax
    elif mode == 'zscore_t':
        print(
            '*                  Do Z-score on source & target datasets according to target ...'
        )
        zscore_scaler = preprocessing.StandardScaler(
            copy=True, with_mean=True,
            with_std=True)  # Binarizer, Imputer, LabelBinarizer
        zscore_scaler.fit(X_test)
        X_test_zscore = zscore_scaler.transform(X_test)
        X_train_zscore = zscore_scaler.transform(X_train)
        return X_train_zscore, X_test_zscore

    # Normalization: max-min normalization (same as above), mapping the raw data
    # linearly into [0, 1].
    elif mode == 'maxmin_t':
        print(
            '*                  Do max-min on source and target datasets according to target ...'
        )
        min_max_scaler = preprocessing.MinMaxScaler()
        X_test_minmax = min_max_scaler.fit_transform(X_test)
        X_train_minmax = min_max_scaler.transform(X_train)
        return X_train_minmax, X_test_minmax

    elif mode == 'log':
        # log1p requires non-negative inputs. np.all(X >= 0) checks every element;
        # the original `X_train.any() >= 0` compared a bool against 0 and was
        # always True, which is why negative values slipped through.
        if np.all(X_train >= 0):
            # print('Do log(x+1) on source and target datasets...')
            log_scaler = preprocessing.FunctionTransformer(
                np.log1p, validate=True)  # log1p = log(1+x)
            X_train_log = log_scaler.fit_transform(X_train)
            if np.all(X_test >= 0):
                X_test_log = log_scaler.transform(X_test)
                return X_train_log, X_test_log
            else:
                raise ValueError('test data contains negative values')
                # return None

        else:
            raise ValueError('training data contains negative values')
            # return None

    elif mode == 'scale':
        print(
            '*                  Do score(mean=0, std=1) on source and target separately....'
        )
        X_train_scaled = preprocessing.scale(X_train)
        X_test_scaled = preprocessing.scale(X_test)
        return X_train_scaled, X_test_scaled

    elif mode == 'maxminscale':
        print(
            '*                  Do max-min on source and target separately....'
        )
        X_train_minmaxscaled = preprocessing.MinMaxScaler().fit_transform(
            X_train)
        X_test_minmaxscaled = preprocessing.MinMaxScaler().fit_transform(
            X_test)
        return X_train_minmaxscaled, X_test_minmaxscaled

    elif mode == 'logscale':
        print(
            '*                  Do log(x+1) on source and target separately....'
        )
        X_train_logscaled = preprocessing.FunctionTransformer(
            np.log1p, validate=True).fit_transform(X_train)  # log1p = log(1+x)
        X_test_logscaled = preprocessing.FunctionTransformer(
            np.log1p, validate=True).fit_transform(X_test)
        return X_train_logscaled, X_test_logscaled

    else:
        raise ValueError('the value of mode is wrong, please check...')
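Usage sketch for the 'maxmin' mode, which fits the scaler on X_train and applies it to X_test:

import numpy as np

Xs, Xt = Standard_Features(np.array([[0.0], [10.0]]), 'maxmin', np.array([[5.0]]))
print(Xs.ravel(), Xt.ravel())  # [0. 1.] [0.5]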
Example #30

# generate some sparse data
xtrain = np.random.uniform(0, 100, size=(10, 4))
target = model(xtrain.T).T
xtest = np.random.uniform(0, 100, size=(10, 4))
test = model(xtest.T).T

# define some model constraints
equations = """
3*b + c > -0.75
4.5*b - d > 11.0
"""
var = list('abcd')
equations = simplify(equations, variables=var)
cf = generate_constraint(generate_solvers(equations, variables=var))

if __name__ == '__main__':
    # build a kernel-transformed regressor
    ta = pre.FunctionTransformer(func=vectorize(cf, axis=1))
    tp = pre.PolynomialFeatures(degree=3)
    e = lin.LinearRegression()

    # train and score, then test and score
    xtrain_ = tp.fit_transform(ta.fit_transform(xtrain))
    assert 1.0 == e.fit(xtrain_, target).score(xtrain_, target)
    xtest_ = tp.fit_transform(ta.fit_transform(xtest))
    assert 1 - e.score(xtest_, test) <= 1e-2

# EOF