Example #1
def test_column_transformer_get_feature_names():
    X_array = np.array([[0., 1., 2.], [2., 4., 6.]]).T
    ct = ColumnTransformer([('trans', Trans(), [0, 1])])
    # raise correct error when not fitted
    assert_raises(NotFittedError, ct.get_feature_names)
    # raise correct error when no feature names are available
    ct.fit(X_array)
    assert_raise_message(
        AttributeError, "Transformer trans (type Trans) does not provide "
        "get_feature_names", ct.get_feature_names)

    # working example
    X = np.array([[{
        'a': 1,
        'b': 2
    }, {
        'a': 3,
        'b': 4
    }], [{
        'c': 5
    }, {
        'c': 6
    }]],
                 dtype=object).T
    ct = ColumnTransformer([('col' + str(i), DictVectorizer(), i)
                            for i in range(2)])
    ct.fit(X)
    assert_equal(ct.get_feature_names(), ['col0__a', 'col0__b', 'col1__c'])

    # passthrough transformers not supported
    ct = ColumnTransformer([('trans', 'passthrough', [0, 1])])
    ct.fit(X)
    assert_raise_message(NotImplementedError,
                         'get_feature_names is not yet supported',
                         ct.get_feature_names)

    ct = ColumnTransformer([('trans', DictVectorizer(), 0)],
                           remainder='passthrough')
    ct.fit(X)
    assert_raise_message(NotImplementedError,
                         'get_feature_names is not yet supported',
                         ct.get_feature_names)

    # drop transformer
    ct = ColumnTransformer([('col0', DictVectorizer(), 0),
                            ('col1', 'drop', 1)])
    ct.fit(X)
    assert_equal(ct.get_feature_names(), ['col0__a', 'col0__b'])
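
Note: the Trans helper used in this test is not shown here. A minimal stand-in (an assumption; the real definition lives in scikit-learn's test module) is simply a transformer with fit/transform but no get_feature_names, which is exactly what the AttributeError branch above exercises:

from sklearn.base import BaseEstimator, TransformerMixin

class Trans(BaseEstimator, TransformerMixin):
    # deliberately provides no get_feature_names method
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X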
Example #2
def load_credita():
    path = os.path.join('datasets', 'credit-a.arff')
    raw_data = loadarff(path)
    df = pd.DataFrame(raw_data[0])

    y = df.pop('class')
    X = df

    y_label_encoder = LabelEncoder()
    y = y_label_encoder.fit_transform(y)

    # fill missing numerical values
    X.fillna(X.mean(), inplace=True)

    # fill missing categorical values
    categ_cols = X.select_dtypes(include=['category', object]).columns
    for col in categ_cols:
        X[col].replace(b'?', X[col].mode()[0], inplace=True)

    # standardize numerical features
    num_cols = X.select_dtypes(include=['number']).columns
    mm_scaler = MinMaxScaler()
    X[num_cols] = mm_scaler.fit_transform(X[num_cols])

    # use one transformer per feature to preserve its name in the generated features
    # since new feature names are based on the transformer's name
    transformers = [(col, OneHotEncoder(drop='first'), [col])
                    for col in categ_cols]
    col_transformer = ColumnTransformer(transformers, remainder='passthrough')
    X_arr = col_transformer.fit_transform(X)

    X = pd.DataFrame(X_arr, columns=col_transformer.get_feature_names())

    return X, y
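
The comment above about one transformer per feature is worth illustrating: ColumnTransformer.get_feature_names() prefixes each output column with the transformer's name, so naming each transformer after its input column keeps that name in the generated features. A small sketch with made-up column names (not the actual credit-a schema), assuming a scikit-learn version that still provides get_feature_names (later releases replace it with get_feature_names_out):

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

toy = pd.DataFrame({'A9': ['t', 'f', 't'], 'A12': ['g', 'p', 'g']})
ct = ColumnTransformer([(col, OneHotEncoder(), [col]) for col in toy.columns])
ct.fit(toy)
print(ct.get_feature_names())
# ['A9__x0_f', 'A9__x0_t', 'A12__x0_g', 'A12__x0_p']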
Example #3
File: AE.py  Project: HypnosPy/HypnosPy
class BoutDataset(torch.utils.data.Dataset):
    def __init__(self, df):
        ## - onehotencode hyp_time_col
        ## - scale sedentary, light, medium, vigorous
        ## - make sure activities are in order
        ## - bout_train = BoutDataset(df_per_hour.loc[train_ids.pid])
        ## - bout_test = BoutDataset(df_per_hour.loc[test_ids.pid])

        # One-Hot-Encode the hyp_time_col
        self.columnTransformer = ColumnTransformer(
            [('hour', OneHotEncoder(handle_unknown='ignore',
                                    sparse=False), ['hour'])],
            remainder='passthrough')
        index = df.index
        df = self.columnTransformer.fit_transform(df)
        df = pd.DataFrame(df,
                          columns=self.columnTransformer.get_feature_names(),
                          index=index)

        filter_columns = df.columns.str.startswith('hour')
        columns = df.columns[filter_columns].str.split('__x0_').str.join('_')

        # append last columns to renamed beginning columns
        columns = columns.append(df.columns[len(columns):])
        df.columns = columns

        # reorder
        reorder_columns = [
            'sedentary_bins', 'light_bins', 'medium_bins', 'vigorous_bins'
        ]
        reorder_columns.extend(df.columns[filter_columns].tolist())
        df = df[reorder_columns]

        # Scale
        self.scaler = preprocessing.StandardScaler()
        scaled = self.scaler.fit_transform(df[[
            'sedentary_bins', 'light_bins', 'medium_bins', 'vigorous_bins'
        ]])
        df[['sedentary_bins', 'light_bins', 'medium_bins',
            'vigorous_bins']] = scaled

        # Set class object
        self.df = df.sort_index()
        self.df = self.df[[
            'sedentary_bins', 'light_bins', 'medium_bins', 'vigorous_bins'
        ]]
        self.hours_in_day = 24

    def __len__(self):
        return len(self.df)

    def __getitem__(self, pid):
        # Select sample
        X = self.df.loc[pid].values
        X = np.expand_dims(X, axis=0)
        y = self.df.loc[pid].values
        y = np.expand_dims(y, axis=0)
        return X, y
Example #4
def apply_haar_smooth(data):
    transformers = [
        ('orig', PassthroughTransformer(), ['pct_change__close']),
        ('haar_smooth', HaarSmoothTransformer(.4), ['pct_change__close']),
    ]
    ct = ColumnTransformer(transformers=transformers,
                           remainder='drop',
                           n_jobs=-1)
    return pd.DataFrame(data=ct.fit_transform(data),
                        columns=ct.get_feature_names())
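
PassthroughTransformer and HaarSmoothTransformer are project-specific helpers and are not shown. A minimal sketch of what a pass-through transformer that cooperates with ColumnTransformer.get_feature_names() might look like (an assumption about the project's helper, not its actual code):

from sklearn.base import BaseEstimator, TransformerMixin

class PassthroughTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        # remember the input column names so they survive into get_feature_names()
        self.columns_ = list(X.columns)
        return self

    def transform(self, X):
        return X.values

    def get_feature_names(self):
        return self.columns_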
Example #5
class FeatureTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        # Ugly, but otherwise col_transformer.get_feature_names() doesn't work
        StandardScaler.get_feature_names = get_empty_feature_names
        FunctionTransformer.get_feature_names = get_empty_feature_names
        OrdinalEncoder.get_feature_names = get_empty_feature_names
        SimpleImputer.get_feature_names = get_empty_feature_names
        RobustScaler.get_feature_names = get_empty_feature_names

        # Transformer which returns the same result
        identity = FunctionTransformer(func=lambda x: x, validate=False)
        # transformer 1/x
        reciprocal = FunctionTransformer(func=lambda x: 1 / x, validate=False)

        # ColumnTransformer allows different columns or column subsets of the input
        # to be transformed separately and the results combined into a single
        # feature space.
        self.col_transformer = ColumnTransformer(
            [
                # (name, transformer, column(s))

                # ==categorical==

                # OneHotEncoder - M categories in column -> M columns
                ("Transmission Type", OneHotEncoder(), ["Transmission Type"]),

                # OrdinalEncoder - encodes categories to integer
                ("Vehicle Size",
                 OrdinalEncoder([['Compact', 'Midsize', 'Large']
                                 ]), ["Vehicle Size"]),

                # ==numerical==

                # Leave column as it is
                ("Number of Doors", identity, ["Number of Doors"]),
                ("Engine HP", identity, ["Engine HP"]),

                # calculate 1/x
                ("city mpg trans", reciprocal, ["city mpg"]),

                # Leave column as it is
                ("Year", identity, ["Year"]),
            ],
            remainder='drop'  # Drop all other remaining columns
        )

    def fit(self, X):
        self.col_transformer.fit(X)
        return self

    def transform(self, X):
        return self.col_transformer.transform(X)

    def get_feature_names(self):
        return self.col_transformer.get_feature_names()
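
get_empty_feature_names is referenced above but never defined in this snippet. Presumably it just returns an empty list so that ColumnTransformer.get_feature_names() does not raise for transformers that lack the method; a sketch under that assumption:

def get_empty_feature_names(self):
    # assumed definition: report no feature names instead of raising
    # "Transformer ... does not provide get_feature_names"
    return []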
Example #6
class OneHotEncoderPrim(primitive):
    # can handle missing values. turns nans to extra category
    def __init__(self, random_state=0):
        super(OneHotEncoderPrim, self).__init__(name='OneHotEncoder')
        self.id = 4
        self.hyperparams = []
        self.type = 'data preprocess'
        self.description = "Encode categorical integer features as a one-hot numeric array. The input to this transformer should be an array-like of integers or strings, denoting the values taken on by categorical (discrete) features. The features are encoded using a one-hot (aka ‘one-of-K’ or ‘dummy’) encoding scheme. This creates a binary column for each category and returns a sparse matrix or dense array. By default, the encoder derives the categories based on the unique values in each feature. Alternatively, you can also specify the categories manually. The OneHotEncoder previously assumed that the input features take on values in the range [0, max(values)). This behaviour is deprecated. This encoding is needed for feeding categorical data to many scikit-learn estimators, notably linear models and SVMs with the standard kernels."
        self.hyperparams_run = {'default': True}
        self.preprocess = None
        self.cat_cols = None
        self.accept_type = 'b'

    def can_accept(self, data):
        return self.can_accept_b(data)

    def is_needed(self, data):
        # data = handle_data(data)
        cols = data['X']
        num_cols = data['X']._get_numeric_data().columns
        cat_cols = list(set(cols) - set(num_cols))
        if len(cat_cols) == 0:
            return False
        return True

    def fit(self, data):
        data = handle_data(data)
        if not self.is_needed(data):
            return
        x = deepcopy(data['X'])
        cols = data['X'].columns
        num_cols = data['X']._get_numeric_data().columns
        self.cat_cols = list(set(cols) - set(num_cols))
        x[self.cat_cols] = x[self.cat_cols].fillna('NaN')
        self.preprocess = ColumnTransformer([
            ("one_hot", OneHotEncoder(handle_unknown='ignore'), self.cat_cols)
        ])
        x[self.cat_cols] = x[self.cat_cols].astype(str)
        self.preprocess.fit(x)  # .astype(str)

    def produce(self, data):
        output = handle_data(data)
        if not self.is_needed(output):
            final_output = {0: output}
            return final_output
        output['X'][self.cat_cols] = output['X'][self.cat_cols].fillna('NaN')
        result = self.preprocess.transform(output['X'])
        if isinstance(result, csr_matrix):
            result = result.toarray()
        output['X'] = pd.DataFrame(
            result,
            columns=self.preprocess.get_feature_names()).infer_objects()
        output['X'] = output['X'].loc[:, ~output['X'].columns.duplicated()]
        final_output = {0: output}
        return final_output
Example #7
def test_feature_names_empty_columns(empty_col):
    pd = pytest.importorskip('pandas')

    df = pd.DataFrame({"col1": ["a", "a", "b"], "col2": ["z", "z", "z"]})

    ct = ColumnTransformer(transformers=[
        ("ohe", OneHotEncoder(), ["col1", "col2"]),
        ("empty_features", OneHotEncoder(), empty_col),
    ], )

    ct.fit(df)
    assert ct.get_feature_names() == ['ohe__x0_a', 'ohe__x0_b', 'ohe__x1_z']
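
The empty_col argument is injected by pytest parametrization, which is not shown in this excerpt. The decorator presumably supplies an empty column selection in several equivalent forms, along the lines of (an assumption; the actual test may list slightly different selections):

import numpy as np
import pytest

@pytest.mark.parametrize("empty_col", [[], np.array([], dtype=int), lambda x: []])
def test_feature_names_empty_columns(empty_col):
    ...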
Example #9
def load_sick():
    raw_data = loadarff('datasets/sick.arff')
    df = pd.DataFrame(raw_data[0])

    y = df.pop('class')
    X = df

    X.drop('TBG', axis=1, inplace=True)  # all NaN, useless

    implicit_cols = [col for col in X.columns if col.endswith('_measured')]
    X.drop(implicit_cols, axis=1, inplace=True)

    # Replace NaN values
    X.fillna(X.mean(), inplace=True)
    X['sex'].replace(b'?', X['sex'].mode()[0], inplace=True)

    # Standardize numerical features
    num_cols = X.select_dtypes(include=['number']).columns
    scaler = MinMaxScaler()
    X[num_cols] = scaler.fit_transform(X[num_cols])

    # Encode categorical features
    categ_cols = X.select_dtypes(include=['category', object]).columns
    categ_cols = categ_cols.drop('referral_source')

    # we use a dict where each feature has an entry with its encoder
    # for future or inverse transformations
    label_encoders = defaultdict(LabelEncoder)
    X[categ_cols] = X[categ_cols].apply(
        lambda x: label_encoders[x.name].fit_transform(x))

    ohe_encoder = OneHotEncoder()  # save for future or inverse transformations
    ohe_transformer = ColumnTransformer(
        [('referral_source', ohe_encoder, ['referral_source'])],
        remainder='passthrough')
    X_arr = ohe_transformer.fit_transform(X)

    X = pd.DataFrame(X_arr, columns=ohe_transformer.get_feature_names())

    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

    candidates = []
    for eps in range(1, 11):
        for ms in range(4, 21):
            model = DBSCAN(eps=eps / 10, min_samples=ms).fit(X)
            counts = np.unique(model.labels_, return_counts=True)[1]
            if len(counts) == 3:
                print(model, counts)
                candidates.append(model)
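
The label_encoders defaultdict is kept so that each categorical column can later be decoded again. A standalone sketch of that pattern, with illustrative column values rather than the real sick.arff data:

from collections import defaultdict

import pandas as pd
from sklearn.preprocessing import LabelEncoder

demo = pd.DataFrame({'sex': [b'F', b'M', b'F'], 'on_thyroxine': [b'f', b't', b'f']})
encoders = defaultdict(LabelEncoder)
coded = demo.apply(lambda col: encoders[col.name].fit_transform(col))
restored = coded.apply(lambda col: encoders[col.name].inverse_transform(col))
assert restored.equals(demo)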
Example #10
class FeatureColumnTransformer(DfTransformer):
    
    def __init__(self, transformers, remainder="passthrough", n_jobs=-1):
        self.name = "FeatureColumnTransformer"
        super().log_start(self.name)
        
        self.transformers = transformers
        self.remainder = remainder
        self.n_jobs = n_jobs
        self.column_transfomer = ColumnTransformer(
            transformers = self.transformers,
            remainder = self.remainder,
            n_jobs=self.n_jobs
            )
        self.columns = None
        self.column_types = None
    
    def fit(self, X, y=None):
        self.column_transfomer.fit(X)
        return self
    
    def transform(self, X, y=None):
        X_concat = self.column_transfomer.transform(X)
        self.columns = self.column_transfomer.get_feature_names()
        self.rename_df_columns()
                
        X_concat = pd.DataFrame(X_concat, index = X.index, columns = self.columns)
        X_concat_df = self.redefine_column_types(X,X_concat)

        super().log_end(self.name)
        return X_concat_df
    
    def rename_df_columns(self):
        for i,col in enumerate(self.columns):
            self.columns[i] = col.split(sep="__")[-1]
            
    def redefine_column_types(self, X_input, X_output):
        for feature in X_input.columns:
            if feature in X_output.columns:
                X_output[feature]=X_output[feature].astype(X_input[feature].dtypes.name)
        return X_output
Example #11
def test_ColumnTransformer():
    import pandas as pd
    from sklearn.compose import ColumnTransformer
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.preprocessing import OneHotEncoder
    from ML_in_business.hw6.TransformerLib import MyTempEncoder, tempEstimator
    from sklearn import set_config

    X = pd.DataFrame(
        {'city': ['London', 'London', 'Paris', 'Sallisaw'],
        'title': ["His Last Bow", "How Watson Learned the Trick", "A Moveable Feast", "The Grapes of Wrath"],
        'expert_rating': [5, 3, 4, 5],
        'user_rating': [4, 5, 4, 3]})

    # column_trans = ColumnTransformer(
    #     [('city_category', OneHotEncoder(dtype='int'),['city']),
    #     ('title_bow', CountVectorizer(), 'title')],
    #     remainder='drop')
    
    # column_trans= Pipeline([
    #             ('selector', MyTempEncoder())
    #         ])

    column_trans = ColumnTransformer(
        [   
            #('city_category', OneHotEncoder(dtype='int'),['city']),
            ('myEncoder', tempEstimator('AAAA'), ['title'])],
        remainder='passthrough'
        #remainder='drop'
    )
    
    #HTML representation of Pipeline
    #set_config(display='diagram')
    set_config(display='text')
    column_trans

    column_trans.fit_transform(X)
    names = column_trans.get_feature_names()
    arr = column_trans.transform(X)
    assert True
Example #12
def feature_generation(df):
    """
    Create useful new features such as:
    - A boolean variable indicating whether the call came from 911/066 or not.
    - Categorical variables transformed with OneHotEncoder.

    :param df: DataFrame from which the new variables are generated
    :return:
    """

    # Create the boolean variable
    print("Creating boolean variable.")
    df["bool_llamada"] = np.where((df.tipo_entrada == "LLAMADA DEL 911") |
                                  (df.tipo_entrada == "LLAMADA DEL 066"), 1, 0)

    print("Transforming discrete variables...")
    # Apply OneHotEncoder to the categorical variables
    transformers = [('one_hot', OneHotEncoder(), [
        'delegacion_inicio', 'incidente_c4', 'tipo_entrada', 'espacio_del_dia'
    ])]

    col_trans = ColumnTransformer(transformers,
                                  remainder="passthrough",
                                  n_jobs=-1)

    # Sort the dataframe by time
    df = df.sort_values(
        by=["año_creacion", "mes_creacion", "dia_creacion", "hora_simple"])

    X = col_trans.fit_transform(df.drop(columns="label"))
    y = df.label.values.reshape(X.shape[0], )
    print("Successfully transformation of the discrete variables.'")
    print(X.shape)

    print("Converting to dataframe...")
    X = X.todense()
    df = pd.DataFrame(X, columns=col_trans.get_feature_names())
    df['label'] = y

    return df, X, y
Example #13
class DataFrameOHETransformer(BaseEstimator, TransformerMixin):
    def __init__(self, feature_names=None):
        self.fnames = feature_names
        self.col_transf = None
        self.fit_est = None
        self.features = None

    def fit(self, X, y=None):
        ohes = []
        for feature in self.fnames:
            ohes.append((feature, OneHotEncoder(dtype='int'), [feature]))
        self.col_transf = ColumnTransformer(ohes, remainder='drop')
        self.col_transf.fit(X, y)
        return self

    def fit_transform(self, X, y=None, **fit_params):
        return self.fit(X, y).transform(X, y)

    def transform(self, X, y=None):
        tf = pandas.DataFrame(self.col_transf.transform(X),
                              columns=self.col_transf.get_feature_names(),
                              index=X.index)
        return pandas.concat([tf, X.drop(columns=self.fnames)], axis=1)
Example #14
def make_features(input_df,
                  target_col,
                  keep_cols=None,
                  ma_lags=None,
                  ma_cols=None,
                  n_samples=None) -> pd.DataFrame:
    transformers = list()
    if keep_cols:
        transformers.extend([('passthrough', PassthroughTransformer(),
                              keep_cols)])
    if ma_lags and ma_cols:
        transformers.extend([('ma' + str(n), MovingAverageTransformer(n),
                              ma_cols) for n in ma_lags])
    transformers.extend([('target', PercentChangeTransformer(), [target_col])])
    ct = ColumnTransformer(transformers=transformers,
                           remainder='drop',
                           n_jobs=-1)

    arr = ct.fit_transform(input_df)
    arr = strip_nan_rows(arr)
    if n_samples:
        arr = keep_last_n_rows(arr, n_samples)
    return pd.DataFrame(data=arr, columns=list(ct.get_feature_names()))
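
strip_nan_rows and keep_last_n_rows are project helpers that are not shown; minimal sketches inferred from how they are used here (assumptions, not the project's actual code):

import numpy as np

def strip_nan_rows(arr):
    # drop rows containing NaN, e.g. the warm-up rows of a moving average
    return arr[~np.isnan(arr).any(axis=1)]

def keep_last_n_rows(arr, n):
    # keep only the trailing n rows
    return arr[-n:]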
Example #15
class FeatureTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        # Ugly, but otherwise col_transformer.get_feature_names() doesn't work
        StandardScaler.get_feature_names = get_empty_feature_names
        FunctionTransformer.get_feature_names = get_empty_feature_names
        OrdinalEncoder.get_feature_names = get_empty_feature_names
        SimpleImputer.get_feature_names = get_empty_feature_names
        RobustScaler.get_feature_names = get_empty_feature_names

        identity = FunctionTransformer(func=lambda x: x, validate=False)
        reciprocal = FunctionTransformer(func=lambda x: 1 / x, validate=False)

        self.col_transformer = ColumnTransformer(
            [
                # categorical
                ("Transmission Type", OneHotEncoder(), ["Transmission Type"]),
                ("Vehicle Size",
                 OrdinalEncoder([['Compact', 'Midsize', 'Large']
                                 ]), ["Vehicle Size"]),

                # numerical
                ("city mpg", reciprocal, ["city mpg"]),
                ("Year", identity, ["Year"]),
                ("Engine HP", identity, ["Engine HP"]),
            ],
            remainder='drop')

    def fit(self, X):
        self.col_transformer.fit(X)
        return self

    def transform(self, X):
        return self.col_transformer.transform(X)

    def get_feature_names(self):
        return self.col_transformer.get_feature_names()
Example #16
    list_cat = [
        2, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 21, 22, 23, 24, 25, 27,
        28, 29, 30, 31, 32, 33, 35, 39, 40, 41, 42, 53, 55, 57, 58, 60, 63, 64,
        65, 72, 73, 74, 78, 79
    ]
    list_num = [
        0, 1, 3, 4, 17, 18, 19, 20, 26, 34, 36, 37, 38, 43, 44, 45, 46, 47, 48,
        49, 50, 51, 52, 54, 56, 59, 61, 62, 66, 67, 68, 69, 70, 71, 75, 76, 77,
        80
    ]

    ct = ColumnTransformer([('oneHot',
                             OneHotEncoder(categories='auto',
                                           sparse=False), list_cat)])
    ct_result = pd.DataFrame(ct.fit_transform(data))
    ct_result.columns = ct.get_feature_names()

    ct_result.insert(0, "Id", ct_result.index + 1)

    # merge the categorical dataframe with the numerical dataframe on Id
    # and store the result in the processed full-data dataframe

    numeric_df = data.iloc[:, list_num]
    p_full_data = pd.merge(ct_result,
                           numeric_df,
                           left_on='Id',
                           right_on='Id',
                           how='inner')

    #Split data set in training and testing subset
Example #17
def load_credita(weighting=None, **extra_kwargs):
    cv_splits = []

    # preprocess the first fold keeping statistics for next folds
    train_path = os.path.join('datasetsCBR', 'credit-a',
                              f'credit-a.fold.000000.train.arff')
    test_path = os.path.join('datasetsCBR', 'credit-a',
                             f'credit-a.fold.000000.test.arff')

    df_train = pd.DataFrame(loadarff(train_path)[0])
    df_test = pd.DataFrame(loadarff(test_path)[0])

    X = df_train.append(df_test)
    y = X.pop('class')

    y_label_encoder = LabelEncoder()
    y = y_label_encoder.fit_transform(y)

    # fill missing numerical values
    means = X.mean()
    X.fillna(means, inplace=True)

    # fill missing categorical values
    categ_cols = X.select_dtypes(include=['category', object]).columns
    modes = X[categ_cols].mode()
    for col in categ_cols:
        X[col].replace(b'?', modes[col][0], inplace=True)

    # standardize numerical features
    num_cols = X.select_dtypes(include=['number']).columns
    mm_scaler = MinMaxScaler()
    X[num_cols] = mm_scaler.fit_transform(X[num_cols])

    # use one transformer per feature to preserve its name in the generated features
    # since new feature names are based on the transformer's name
    transformers = [(col, OneHotEncoder(drop='first'), [col])
                    for col in categ_cols]
    col_transformer = ColumnTransformer(transformers, remainder='passthrough')
    X_arr = col_transformer.fit_transform(X)

    X = pd.DataFrame(X_arr, columns=col_transformer.get_feature_names())

    p = len(df_train)
    X_train, X_test, y_train, y_test = X[:p], X[p:], y[:p], y[p:]

    # feature selection
    if weighting == 'mutual_info':
        weights = mutual_info(X, y)

        # apply weights to features
        X_train *= weights
        X_test *= weights

    elif weighting == 'relief':
        weights = relief(X, y)

        # apply weights to features
        X_train *= weights
        X_test *= weights

    cv_splits.append((X_train, X_test, y_train, y_test))

    # preprocess rest of folds
    for i in range(1, K_FOLDS):
        train_path = os.path.join('datasetsCBR', 'credit-a',
                                  f'credit-a.fold.00000{str(i)}.train.arff')
        test_path = os.path.join('datasetsCBR', 'credit-a',
                                 f'credit-a.fold.00000{str(i)}.test.arff')

        df_train = pd.DataFrame(loadarff(train_path)[0])
        df_test = pd.DataFrame(loadarff(test_path)[0])

        X = df_train.append(df_test)
        y = X.pop('class')

        y = y_label_encoder.transform(y)

        # fill missing numerical values
        X.fillna(means, inplace=True)

        # fill missing categorical values
        for col in categ_cols:
            X[col].replace(b'?', modes[col][0], inplace=True)

        # normalize numerical features
        X[num_cols] = mm_scaler.transform(X[num_cols])

        # one hot encode
        X_arr = col_transformer.transform(X)
        X = pd.DataFrame(X_arr, columns=col_transformer.get_feature_names())

        p = len(df_train)
        X_train, X_test, y_train, y_test = X[:p], X[p:], y[:p], y[p:]

        # feature selection
        if weighting == 'mutual_info':
            weights = mutual_info(X_train, y_train)

            # apply weights to features
            X_train *= weights
            X_test *= weights

        elif weighting == 'relief':
            weights = relief(X_train, y_train)

            # apply weights to features
            X_train *= weights
            X_test *= weights

        cv_splits.append((X_train, X_test, y_train, y_test))

    return cv_splits
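
mutual_info (and relief) are feature-weighting helpers defined elsewhere in the project. A plausible sketch of the mutual-information variant built on scikit-learn, assuming it returns one weight per feature (the project's actual implementation may normalise these differently before they are multiplied into X_train/X_test):

from sklearn.feature_selection import mutual_info_classif

def mutual_info(X, y):
    # one mutual-information weight per feature
    return mutual_info_classif(X, y)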
Example #18
# In[4]:
"""
Apply DWT Smooth.
"""
transformers = [
    ('haar_smooth', HaarSmoothTransformer(.05),
     list(feature_data_train.columns)),
    ('orig', PassthroughTransformer(), ['target__close']),
]

ct = ColumnTransformer(transformers=transformers, n_jobs=-1)

smooth_arr_train = ct.fit_transform(feature_data_train)
smooth_data_train = pd.DataFrame(smooth_arr_train,
                                 columns=ct.get_feature_names())

smooth_arr_test = ct.fit_transform(feature_data_test)
smooth_data_test = pd.DataFrame(smooth_arr_test,
                                columns=ct.get_feature_names())

smooth_data_train.plot()
plt.show()

# In[5]:
"""
Make time-series data.
"""
X_train, y_train = data_to_supervised(input_df=smooth_data_train,
                                      target_ix=-1,
                                      Tx=Tx,
Example #19
def data_manipulation(data_set):
    """
    Prepare the dataset for training the regression model.
    This function takes the csv file location as a parameter
    Parameters
    ----------
    data_set: str
        data file location
    Returns
    -------
    data_set: csv file
        turns all the categorical data into numerical data.
    """
    print("Loading the Dataset...")
    df = pd.read_csv(data_set)
    print("Data Manipulation...")
    # remove all the null value from the dataset
    df.dropna(inplace=True)
    # create the variable that contains the combined source-destination
    source_destination = df.source + '-' + df.destination
    # create the new column and assign the combined value to it
    df['source_destination'] = source_destination
    # since we created a separate source_destination column, drop the originals
    df.drop(['source', 'destination'], axis=1, inplace=True)

    # create a dictionary to convert cab_type data to number
    cab_type = {'Lyft': 0, 'Uber': 1}

    # map the above dictionary to the cab_type dataset
    df.cab_type = df['cab_type'].map(cab_type)
    # drop the id column since it has lot of unique variables
    df.drop('id', axis=1, inplace=True)

    print("One Hot Encoding...")
    # create a list for categorical labels
    categorical_label = ['product_id', 'source_destination', 'name']
    # create a OneHotEncoder object
    one_hot_encoding = OneHotEncoder()
    # perform the columnTransformer and use oneHotEncoder as a transformer
    transformer = ColumnTransformer(
        [('one_hot', one_hot_encoding, categorical_label)],
        remainder='passthrough')
    # fit the transformer with the dataframe
    transform_df = transformer.fit_transform(df).toarray()

    # get the feature name
    columns = transformer.get_feature_names()
    new_column = []
    # get the column name using iteration and append it to new_column
    for i in range(len(columns) - 5):
        new_column.append(columns[i][12:])
    for i in range(len(columns) - 5, len(columns)):
        new_column.append(columns[i])

    print("Transforming data...")
    # create a dataset using the transformed dataframe
    data_set = pd.DataFrame(transform_df)
    # replace the column with the new_column list that we created
    data_set.columns = new_column
    # return the manipulated dataset
    return data_set
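
The slice columns[i][12:] strips the fixed-width 'one_hot__x<k>_' prefix that ColumnTransformer generates. An assumed-equivalent helper that splits on the separators instead of relying on a character count (the category value in the comment is illustrative):

def strip_one_hot_prefix(name):
    # 'one_hot__x0_UberX' -> 'UberX'
    return name.split('__', 1)[1].split('_', 1)[1]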
Example #20
def examples():
    from sklearn.pipeline import Pipeline
    from sklearn.svm import SVC
    from sklearn.decomposition import PCA
    estimators = [('reduce_dim', PCA()), ('clf', SVC())]
    pipe = Pipeline(estimators)
    print(pipe)
    print(pipe.steps[0])
    print(pipe.named_steps['reduce_dim'])

    pipe.set_params(clf__C=10)
    print(pipe.named_steps['clf'])

    ###################################################
    # Grid search over the pipeline's parameters (important)
    from sklearn.model_selection import GridSearchCV
    param_grid = dict(reduce_dim__n_components=[2, 5, 10],
                      clf__C=[0.1, 10, 100])
    grid_search = GridSearchCV(pipe, param_grid=param_grid)
    print(grid_search)

    ###################################################
    # Grid search over the pipeline's steps and parameters (important)
    from sklearn.linear_model import LogisticRegression

    param_grid = dict(reduce_dim=[None, PCA(5), PCA(10)],
                      clf=[SVC(), LogisticRegression()],
                      clf__C=[0.1, 10, 100])  # multiple candidates can be listed
    grid_search = GridSearchCV(pipe, param_grid=param_grid)
    print(grid_search)

    ###################################################
    from sklearn.pipeline import make_pipeline
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.preprocessing import Binarizer
    pipe = make_pipeline(Binarizer(), MultinomialNB())
    print(pipe)

    ###################################################
    # Use the memory argument to cache fitted transformers and avoid recomputation
    from tempfile import mkdtemp
    from shutil import rmtree
    from sklearn.decomposition import PCA
    from sklearn.svm import SVC
    from sklearn.pipeline import Pipeline
    estimators = [('reduce_dim', PCA()), ('clf', SVC())]
    cachedir = mkdtemp()
    pipe = Pipeline(estimators, memory=cachedir)
    print(pipe)

    # Clear the cache directory when you don't need it anymore
    rmtree(cachedir)

    #####################################################
    #  Transforming target in regression
    import numpy as np
    from sklearn.datasets import load_boston
    from sklearn.compose import TransformedTargetRegressor
    from sklearn.preprocessing import QuantileTransformer
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import train_test_split
    boston = load_boston()
    X = boston.data
    y = boston.target
    transformer = QuantileTransformer(output_distribution='normal')
    regressor = LinearRegression()
    regr = TransformedTargetRegressor(regressor=regressor,
                                      transformer=transformer)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    regr.fit(X_train, y_train)

    print('R2 score: {0:.2f}'.format(regr.score(X_test, y_test)))

    raw_target_regr = LinearRegression().fit(X_train, y_train)
    print('R2 score: {0:.2f}'.format(raw_target_regr.score(X_test, y_test)))

    ##########################################################
    # Column-wise preprocessing
    import pandas as pd
    X = pd.DataFrame({
        'city': ['London', 'London', 'Paris', 'Sallisaw'],
        'title': [
            "His Last Bow", "How Watson Learned the Trick", "A Moveable Feast",
            "The Grapes of Wrath"
        ],
        'expert_rating': [5, 3, 4, 5],
        'user_rating': [4, 5, 4, 3]
    })

    from sklearn.compose import ColumnTransformer
    from sklearn.feature_extraction.text import CountVectorizer
    column_trans = ColumnTransformer(
        [('city_category', CountVectorizer(analyzer=lambda x: [x]), 'city'),
         ('title_bow', CountVectorizer(), 'title')],
        remainder='drop')

    print(column_trans.fit(X))
    print(column_trans.get_feature_names())
    print(column_trans.transform(X).toarray())


Example #21
train_df = pd.DataFrame(train,
                        columns=numerical_features + list(preprocessor.named_transformers_.cat))


train_df.head()

    
ct = ColumnTransformer([
    ('oh_enc', OneHotEncoder(sparse=False), [8, 9, 10, 11]),
])
d_1he = ct.fit_transform(Xtrain_new)
d_encoded_data = pd.DataFrame(d_1he, columns=ct.get_feature_names())
d_encoded_data.drop(['oh_enc__x0_2016', 'oh_enc__x1_1', 'oh_enc__x2_0',
                     'oh_enc__x3_0', 'oh_enc__x4_0', 'oh_enc__x5_fall'],
                    inplace=True, axis=1)
df_concat = pd.concat([Xtrain_new.reset_index(drop=True),
                       d_encoded_data.reset_index(drop=True)], axis=1)
df_concat.drop(['season', 'year', 'month', 'hours', 'is_business_day',
                'is_holiday'], inplace=True, axis=1)
X_trained = df_concat[:dataInt.shape[0]]



# Numerical features
 
ct_num = ColumnTransformer([
        ('stdScal', StandardScaler(), ['temp_1','temp_2','mean_national_temp','humidity_1',
         'humidity_2','consumption_secondary_1','consumption_secondary_2','consumption_secondary_3'])],
    remainder='passthrough')
        
X_tr = ct_num.fit_transform(numerical_features)
Example #22
print(X_df.shape, y.shape)

column_trans = ColumnTransformer([
    ('system_category', OneHotEncoder(dtype='int'), ['systems']),
    ('genre_category', OneHotEncoder(dtype='int'), ['genres']),
    ('playModes_category', OneHotEncoder(dtype='int'), ['playModes']),
    ('themes_category', OneHotEncoder(dtype='int'), ['themes']),
    ('series_category', OneHotEncoder(dtype='int'), ['series']),
    ('playerPerspectives', OneHotEncoder(dtype='int'), ['playerPerspectives']),
    ('TfIdf', TfidfVectorizer(stop_words='english'), 'gameDescription')
],
                                 remainder='drop')

column_trans.fit(X_df)
column_trans.get_feature_names()
X = column_trans.transform(X_df).toarray()
print(X)

# Split data into test and train
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.30,
                                                    random_state=42,
                                                    stratify=y)

#####################
### RANDOM FOREST ###
#####################

# Model (can also use single decision tree)
Example #23
X = pd.DataFrame({
    'city': ['London', 'London', 'Paris', 'Sallisaw'],
    'title': [
        "His Last Bow", "How Watson Learned the Trick", "A Moveable Feast",
        "The Grapes of Wrath"
    ],
    'expert_rating': [5, 3, 4, 5],
    'user_rating': [4, 5, 4, 3]
})

column_trans = ColumnTransformer(
    [
        # OneHotEncoder requires 2D input, so the column name is passed as a
        # list of strings, as is the case with most transformers.
        ('city category', OneHotEncoder(dtype='int'), ['city']),
        # CountVectorizer takes a 1D array as input, so the column is passed
        # as a plain string.
        ('title bow', CountVectorizer(), 'title')
    ],
    # The 'remainder' parameter determines what happens to the remaining
    # columns: 'drop' ignores them, 'passthrough' keeps them, and it can also
    # be set to an estimator (e.g. remainder=MinMaxScaler()) to transform them.
    remainder='drop'
)
column_trans.fit(X)

print(column_trans.get_feature_names())

# The make_column_transformer function is a useful alternative as it automatically assigns names
col_tran = make_column_transformer((OneHotEncoder(), ['city']),
                                   (CountVectorizer(), 'title'),
                                   remainder=MinMaxScaler())
print(col_tran)
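
As the comment above notes, make_column_transformer assigns the names automatically; they are derived from the lowercased class name of each transformer, so the prefixes in get_feature_names() become 'onehotencoder__' and 'countvectorizer__' rather than the explicit 'city category__' / 'title bow__' used earlier. A quick check, continuing the snippet above (with remainder='drop' so get_feature_names stays supported on this scikit-learn version):

auto_trans = make_column_transformer((OneHotEncoder(dtype='int'), ['city']),
                                     (CountVectorizer(), 'title'),
                                     remainder='drop')
auto_trans.fit(X)
print(auto_trans.get_feature_names())
# e.g. ['onehotencoder__x0_London', ..., 'countvectorizer__bow', ...]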
Example #24
def test_transformer_get_feature_names(self):
    transformers = [('transformer name', BaseTransformer(), self.test_cols)]
    ct = ColumnTransformer(transformers=transformers)
    ct.fit(self.data)
    self.assertListEqual(['transformer name__' + col for col in self.test_cols],
                         ct.get_feature_names())
Example #25
File: DataLoader.py  Project: BigDaMa/DFS
	def get_data(self, dataset='Adult', random_number=42):

		if isinstance(dataset, str):
			dataset_key = self.map_name2id[dataset]
		else:
			dataset_key = str(dataset)

		number_instances = []
		number_attributes = []
		number_features = []

		def get_class_attribute_name(df):
			for i in range(len(df.columns)):
				if str(df.columns[i]).startswith('class@'):
					return str(df.columns[i])

		def get_sensitive_attribute_id(df, sensitive_attribute_name):
			for i in range(len(df.columns)):
				if str(df.columns[i]) == sensitive_attribute_name:
					return i

		key = dataset_key
		if type(dataset_key) == type(None):
			key = list(self.map_dataset.keys())[random.randint(0, len(self.map_dataset) - 1)]

		data_path = './google_drive_data'
		if not os.path.isdir(data_path):
			print("Downloading Datasets ...")
			download_file_from_google_drive("19Qj3T9Yt_hQ4bM0Ac9D2MS7x507sTJRU", 'DFS_datasets.zip')

			with zipfile.ZipFile('DFS_datasets.zip') as zf:
				zf.extractall('google_drive_data')
			os.remove('DFS_datasets.zip')

			print("Downloading Query Optimizer Models ...")
			download_file_from_google_drive("1lxbcs9vS6U8t-5II2qpx0OIv08EON7NL", 'DFS_models.zip')
			with zipfile.ZipFile('DFS_models.zip') as zf:
				zf.extractall('google_drive_models')
			os.remove('DFS_models.zip')

		value = self.map_dataset[key]
		with open(data_path + "/dfs_datasets/" + str(key) + ".arff") as f:
			df = a2p.load(f)

			number_instances.append(df.shape[0])
			number_attributes.append(df.shape[1])

			y = copy.deepcopy(df[get_class_attribute_name(df)])
			X = df.drop(columns=[get_class_attribute_name(df)])

			categorical_features = []
			continuous_columns = []
			for type_i in range(len(X.columns)):
				if X.dtypes[type_i] == object:
					categorical_features.append(type_i)
				else:
					continuous_columns.append(type_i)

			sensitive_attribute_id = get_sensitive_attribute_id(X, value)

			#print(sensitive_attribute_id)

			X_datat = X.values
			for x_i in range(X_datat.shape[0]):
				for y_i in range(X_datat.shape[1]):
					if type(X_datat[x_i][y_i]) == type(None):
						if X.dtypes[y_i] == object:
							X_datat[x_i][y_i] = 'missing'
						else:
							X_datat[x_i][y_i] = np.nan

			X_temp, X_test, y_temp, y_test = train_test_split(X_datat, y.values.astype('str'), test_size=0.2,
															  random_state=random_number,
															  stratify=y.values.astype('str'))

			X_train, X_validation, y_train, y_validation = train_test_split(X_temp, y_temp, test_size=0.25,
																			random_state=random_number, stratify=y_temp)

			cat_sensitive_attribute_id = -1
			for c_i in range(len(categorical_features)):
				if categorical_features[c_i] == sensitive_attribute_id:
					cat_sensitive_attribute_id = c_i
					break

			my_transformers = []
			if len(categorical_features) > 0:
				ct = ColumnTransformer(
					[("onehot", OneHotEncoder(handle_unknown='ignore', sparse=False), categorical_features)])
				my_transformers.append(("o", ct))
			if len(continuous_columns) > 0:
				scale = ColumnTransformer([("scale", Pipeline(
					[('impute', SimpleImputer(missing_values=np.nan, strategy='mean')), ('scale', MinMaxScaler())]),
											continuous_columns)])
				my_transformers.append(("s", scale))

			pipeline = FeatureUnion(my_transformers)
			pipeline.fit(X_train)
			X_train = pipeline.transform(X_train)
			X_validation = pipeline.transform(X_validation)
			X_test = pipeline.transform(X_test)

			number_features.append(X_train.shape[1])

			all_columns = []
			for ci in range(len(X.columns)):
				all_columns.append(str(X.columns[ci]).split('@')[0])
			X.columns = all_columns

			names = ct.get_feature_names()
			for c in continuous_columns:
				names.append(str(X.columns[c]))

			for n_i in range(len(names)):
				if names[n_i].startswith('onehot__x'):
					tokens = names[n_i].split('_')
					category = ''
					for ti in range(3, len(tokens)):
						category += '_' + tokens[ti]
					cat_id = int(names[n_i].split('_')[2].split('x')[1])
					names[n_i] = str(X.columns[categorical_features[cat_id]]) + category

			sensitive_ids = []
			all_names = ct.get_feature_names()
			for fname_i in range(len(all_names)):
				if all_names[fname_i].startswith('onehot__x' + str(cat_sensitive_attribute_id) + '_'):
					sensitive_ids.append(fname_i)

			le = preprocessing.LabelEncoder()
			le.fit(y_train)
			y_train = le.fit_transform(y_train)
			y_validation = le.transform(y_validation)
			y_test = le.transform(y_test)

			return X_train, X_validation, X_test, y_train, y_validation, y_test, names, sensitive_ids
Example #26
File: test_adult.py  Project: BigDaMa/DFS
xshape = X_train.shape[1]
if one_hot:
    ct = ColumnTransformer([
        ("onehot", OneHotEncoder(handle_unknown='ignore',
                                 sparse=False), [1, 3, 5, 6, 7, 8, 9, 13])
    ])
    scale = ColumnTransformer([("scale", MinMaxScaler(), continuous_columns)])

    pipeline = FeatureUnion([("o", ct), ("s", scale)])

    X_train = pipeline.fit_transform(X_train)
    xshape = X_train.shape[1]
    print(xshape)
    X_test = pipeline.transform(X_test)

    print(ct.get_feature_names())

names = ct.get_feature_names()
for c in continuous_columns:
    names.append(str(X.columns[c]))

pickle.dump(names, open("/home/felix/phd/ranking_exeriments/names.p", "wb"))

print(np.array(names))

#ranking by accuracy
ranking_model = ExtraTreesClassifier(n_estimators=n_estimators, random_state=0)
ranking_model.fit(X_train, y_train)
accuracy_ranking = ranking_model.feature_importances_
pickle.dump(
    accuracy_ranking,
Example #27
def get_fair_data1(dataset_key=None):
	map_dataset = {}

	map_dataset['31'] = 'foreign_worker@{yes,no}'
	map_dataset['802'] = 'sex@{female,male}'
	map_dataset['1590'] = 'sex@{Female,Male}'
	map_dataset['1461'] = 'AGE@{True,False}'
	map_dataset['42193'] = 'race_Caucasian@{0,1}'
	map_dataset['1480'] = 'V2@{Female,Male}'
	# map_dataset['804'] = 'Gender@{0,1}'
	map_dataset['42178'] = 'gender@STRING'
	map_dataset['981'] = 'Gender@{Female,Male}'
	map_dataset['40536'] = 'samerace@{0,1}'
	map_dataset['40945'] = 'sex@{female,male}'
	map_dataset['451'] = 'Sex@{female,male}'
	# map_dataset['945'] = 'sex@{female,male}'
	map_dataset['446'] = 'sex@{Female,Male}'
	map_dataset['1017'] = 'sex@{0,1}'
	map_dataset['957'] = 'Sex@{0,1,4}'
	map_dataset['41430'] = 'SEX@{True,False}'
	map_dataset['1240'] = 'sex@{Female,Male}'
	map_dataset['1018'] = 'sex@{Female,Male}'
	# map_dataset['55'] = 'SEX@{male,female}'
	map_dataset['38'] = 'sex@{F,M}'
	map_dataset['1003'] = 'sex@{male,female}'
	map_dataset['934'] = 'race@{black,white}'


	number_instances = []
	number_attributes = []
	number_features = []

	def get_class_attribute_name(df):
		for i in range(len(df.columns)):
			if str(df.columns[i]).startswith('class@'):
				return str(df.columns[i])

	def get_sensitive_attribute_id(df, sensitive_attribute_name):
		for i in range(len(df.columns)):
			if str(df.columns[i]) == sensitive_attribute_name:
				return i

	key = dataset_key
	if type(dataset_key) == type(None):
		key = list(map_dataset.keys())[random.randint(0, len(map_dataset) - 1)]

	value = map_dataset[key]
	with open(Config.get('data_path') + "/downloaded_arff/" + str(key) + ".arff") as f:
		df = a2p.load(f)

		print("dataset: " + str(key))

		number_instances.append(df.shape[0])
		number_attributes.append(df.shape[1])

		y = copy.deepcopy(df[get_class_attribute_name(df)])
		X = df.drop(columns=[get_class_attribute_name(df)])

		categorical_features = []
		continuous_columns = []
		for type_i in range(len(X.columns)):
			if X.dtypes[type_i] == object:
				categorical_features.append(type_i)
			else:
				continuous_columns.append(type_i)

		sensitive_attribute_id = get_sensitive_attribute_id(X, value)

		print(sensitive_attribute_id)

		X_datat = X.values
		for x_i in range(X_datat.shape[0]):
			for y_i in range(X_datat.shape[1]):
				if type(X_datat[x_i][y_i]) == type(None):
					if X.dtypes[y_i] == object:
						X_datat[x_i][y_i] = 'missing'
					else:
						X_datat[x_i][y_i] = np.nan


		X_train, X_test, y_train, y_test = train_test_split(X_datat, y.values.astype('str'), test_size=0.5,
															random_state=42, stratify=y.values.astype('str'))
		'''
		X_train, X_test, y_train, y_test = train_test_split(X_datat[0:200,:], y.values[0:200].astype('str'), test_size=0.5,
															random_state=42, stratify=y.values[0:200].astype('str'))
		'''

		cat_sensitive_attribute_id = -1
		for c_i in range(len(categorical_features)):
			if categorical_features[c_i] == sensitive_attribute_id:
				cat_sensitive_attribute_id = c_i
				break

		my_transformers = []
		if len(categorical_features) > 0:
			ct = ColumnTransformer(
				[("onehot", OneHotEncoder(handle_unknown='ignore', sparse=False), categorical_features)])
			my_transformers.append(("o", ct))
		if len(continuous_columns) > 0:
			scale = ColumnTransformer([("scale", Pipeline(
				[('impute', SimpleImputer(missing_values=np.nan, strategy='mean')), ('scale', MinMaxScaler())]),
										continuous_columns)])
			my_transformers.append(("s", scale))

		pipeline = FeatureUnion(my_transformers)
		pipeline.fit(X_train)
		X_train = pipeline.transform(X_train)
		X_test = pipeline.transform(X_test)

		number_features.append(X_train.shape[1])

		all_columns = []
		for ci in range(len(X.columns)):
			all_columns.append(str(X.columns[ci]).split('@')[0])
		X.columns = all_columns

		names = ct.get_feature_names()
		for c in continuous_columns:
			names.append(str(X.columns[c]))

		for n_i in range(len(names)):
			if names[n_i].startswith('onehot__x'):
				tokens = names[n_i].split('_')
				category = ''
				for ti in range(3, len(tokens)):
					category += '_' + tokens[ti]
				cat_id = int(names[n_i].split('_')[2].split('x')[1])
				names[n_i] = str(X.columns[categorical_features[cat_id]]) + category

		print(names)

		sensitive_ids = []
		all_names = ct.get_feature_names()
		for fname_i in range(len(all_names)):
			if all_names[fname_i].startswith('onehot__x' + str(cat_sensitive_attribute_id) + '_'):
				sensitive_ids.append(fname_i)

		le = preprocessing.LabelEncoder()
		le.fit(y_train)
		y_train = le.fit_transform(y_train)
		y_test = le.transform(y_test)

		return X_train, X_test, y_train, y_test, names, sensitive_ids, key, sensitive_attribute_id
Example #28
                           remainder='passthrough')

train_new = ct_num.fit_transform(numeric_features)

#TEST
test_new = ct_num.fit(numeric_features)

# Handle the categorical variables
ct = ColumnTransformer([
    ('oh_enc', OneHotEncoder(sparse=False), [8, 9, 10, 11, 12, 13]),
])
d_1he = ct.fit_transform(train_new)
#Get Feature Names of Encoded columns
#ct.get_feature_names()
# Converting the numpy array into a pandas dataframe
d_encoded_data = pd.DataFrame(d_1he, columns=ct.get_feature_names())
d_encoded_data.drop([
    'oh_enc__x0_2016', 'oh_enc__x1_1', 'oh_enc__x2_0', 'oh_enc__x3_0',
    'oh_enc__x4_0', 'oh_enc__x5_fall'
],
                    inplace=True,
                    axis=1)
#Concatenating the encoded dataframe with the original dataframe
df_concat = pd.concat(
    [train_new.reset_index(drop=True),
     d_encoded_data.reset_index(drop=True)],
    axis=1)
# Dropping the original season, year, month, hours, is_business_day and is_holiday columns as they are now encoded
df_concat.drop(
    ['season', 'year', 'month', 'hours', 'is_business_day', 'is_holiday'],
    inplace=True,
Example #29
print("..Training Result:")
print(f"....acc: {accuracy_score(y_train, pred_train)}")
print(f"....precision: {precision_score(y_train, pred_train)}")
print(f"....recall: {recall_score(y_train, pred_train)}")
print(f"....f1: {f1_score(y_train, pred_train)}")
print("..Testing Result:")
print(f"....acc: {accuracy_score(y_test, pred_test)}")
print(f"....precision: {precision_score(y_test, pred_test)}")
print(f"....recall: {recall_score(y_test, pred_test)}")
print(f"....f1: {f1_score(y_test, pred_test)}")


# %% plot the decision tree and look for important features
from sklearn.tree import plot_tree

plot_tree(clf, filled=True, max_depth=6, feature_names=ct.get_feature_names())

# %% apply logistic regression classifier and check results
from sklearn.linear_model import LogisticRegression


clf = LogisticRegression()
clf.fit(x_train, y_train)
pred_train = clf.predict(x_train)
pred_test = clf.predict(x_test)

print(clf.__class__.__name__)
print("..Training Result:")
print(f"....acc: {accuracy_score(y_train, pred_train)}")
print(f"....precision: {precision_score(y_train, pred_train)}")
print(f"....recall: {recall_score(y_train, pred_train)}")
Example #30
pc = preprocessing_config

# In[4]:

transforms = [
    ('passthrough', PassthroughTransformer(), pc['passthrough']),
    ('ma03', MovingAverageTransformer(3), pc['moving_average']),
    ('ma06', MovingAverageTransformer(6), pc['moving_average']),
    ('ma12', MovingAverageTransformer(12), pc['moving_average']),
    ('ma24', MovingAverageTransformer(24), pc['moving_average']),
    ('ma48', MovingAverageTransformer(48), pc['moving_average']),
    ('make_target', PercentChangeTransformer(), [pc['target']]),
]
ct = ColumnTransformer(transforms, remainder='drop', n_jobs=-1)
ct = ct.fit(data)
features = ct.get_feature_names()
features

# In[5]:

arr = ct.transform(data)
arr = arr[~np.isnan(arr).any(axis=1)]
arr.view()

# In[6]:

plt.figure()
plt.plot(arr[:, features.index('passthrough__close')])
plt.title('close')
plt.figure()
plt.plot(arr[:, features.index('make_target__close')])
Example #31
        X_train = pipeline.transform(X_train)
        X_validation = pipeline.transform(X_validation)
        X_test = pipeline.transform(X_test)

        number_features = X_train.shape[1]

        print(name_dataset + ": instances = " + str(number_instances) +
              " attributes = " + str(number_attributes) + " features = " +
              str(number_features))

        all_columns = []
        for ci in range(len(X.columns)):
            all_columns.append(str(X.columns[ci]).split('@')[0])
        X.columns = all_columns

        names = ct.get_feature_names()
        for c in continuous_columns:
            names.append(str(X.columns[c]))

        for n_i in range(len(names)):
            if names[n_i].startswith('onehot__x'):
                tokens = names[n_i].split('_')
                category = ''
                for ti in range(3, len(tokens)):
                    category += '_' + tokens[ti]
                cat_id = int(names[n_i].split('_')[2].split('x')[1])
                names[n_i] = str(
                    X.columns[categorical_features[cat_id]]) + category

        print(names)
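
The 'onehot__x<k>_<category>' name-recovery loop above appears almost verbatim in Examples #25, #27 and this one. An assumed-equivalent helper that factors it out (same behaviour for prefixes of that form):

def recover_feature_names(onehot_names, X, categorical_features):
    # map 'onehot__x<k>_<category>' back to '<original column name>_<category>'
    names = []
    for name in onehot_names:
        if name.startswith('onehot__x'):
            idx, category = name[len('onehot__x'):].split('_', 1)
            names.append(str(X.columns[categorical_features[int(idx)]]) + '_' + category)
        else:
            names.append(name)
    return names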