Пример #1
0
def data_prepartion():
    """Assemble the (unfitted) preprocessing transformer for the job features.

    Two branches are built and joined with a ``FeatureUnion``:

    * a numeric branch: ``DataFrameImputer`` constructed with the numeric
      column names, followed by ``StandardScaler``;
    * a categorical branch: ``DataFrameImputer`` constructed with the
      categorical column names, followed by ``OneHotEncoder``.

    Returns:
        The ``FeatureUnion`` holding both branches under the names
        ``"num_pipeline"`` and ``"cat_pipeline"``. Nothing is fitted here.
    """
    numeric_columns = ["yearsExperience", "milesFromMetropolis"]
    categorical_columns = ["companyId", "jobType", "degree", "major", "industry"]

    numeric_branch = Pipeline([
        ("imputer", DataFrameImputer(numeric_columns)),
        ("std_scaler", StandardScaler()),
    ])

    categorical_branch = Pipeline([
        ("imputer", DataFrameImputer(categorical_columns)),
        # The step keeps its existing name although it wraps a OneHotEncoder;
        # step names are part of the pipeline's parameter interface.
        ("label_binarizer", OneHotEncoder()),
    ])

    branches = [
        ("num_pipeline", numeric_branch),
        ("cat_pipeline", categorical_branch),
    ]
    return FeatureUnion(transformer_list=branches)
 def transform_data(self, housing_data):
    """Build the preprocessing pipelines for ``housing_data`` and store them on ``self``.

    The target column ``'median_house_value'`` is dropped, the remaining
    columns are split by dtype into numeric and object-typed columns, and one
    pipeline per group is created together with a ``FeatureUnion`` that joins
    them. Nothing is fitted or transformed in the code shown here.

    Args:
        housing_data: DataFrame that contains a ``'median_house_value'``
            column (it is dropped before the dtype split).

    Attributes set on ``self``:
        housing_num: numeric-only view of the feature columns.
        num_attribs: list of numeric column names.
        cat_attribs: list of object-dtype column names.
        num_pipeline, cat_pipeline, full_pipeline: the unfitted transformers.
    """
    data = housing_data.drop('median_house_value', axis=1)

    self.housing_num = data.select_dtypes(include=[np.number])
    self.num_attribs = list(self.housing_num)
    # `np.object` was merely an alias of the builtin `object`; the alias was
    # deprecated in NumPy 1.20 and removed in 1.24, so use the builtin, which
    # selects exactly the same columns on every NumPy version.
    self.cat_attribs = list(data.select_dtypes(include=[object]))

    self.num_pipeline = Pipeline([
        ('selector', DataFrameSelector(self.num_attribs)),
        ('imputer', Imputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        # Step name kept as-is (sic): renaming it would change the
        # `<step>__<param>` names callers may already rely on.
        ('std_caller', StandardScaler()),
    ])

    self.cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(self.cat_attribs)),
        ('cat_encoder', OneHotEncoder(sparse=False)),
    ])

    self.full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", self.num_pipeline),
        ("cat_pipeline", self.cat_pipeline),
    ])
Пример #3
0
    return pd.read_csv('datasets/'+fileName)

# Load both CSV files through load_titanic_data (defined outside this block).
# testData is loaded here but not used in the statements below.
trainData = load_titanic_data('train.csv')
testData = load_titanic_data('test.csv')

# Numeric branch: select four numeric columns, then impute with the
# "median" strategy.
# NOTE(review): `Imputer` was removed from recent scikit-learn releases in
# favour of `sklearn.impute.SimpleImputer` -- confirm the targeted version.
num_pipeline = Pipeline([
        ("select_numeric", DataFrameSelector(["Age", "SibSp", "Parch", "Fare"])),
        ("imputer", Imputer(strategy="median"))
        ])
    
# Notebook-style call: the transformed array is discarded when run as a script.
num_pipeline.fit_transform(trainData)    

# Categorical branch: select three columns, impute with MostFrequentImputer
# (project-defined), then one-hot encode with a dense output requested.
cat_pipeline = Pipeline([
        ("select_cat", DataFrameSelector(["Pclass", "Sex", "Embarked"])),
        ("imputer", MostFrequentImputer()),
        ("cat_encoder", OneHotEncoder(sparse=False)),
    ])
    
# Notebook-style call: the transformed array is discarded when run as a script.
cat_pipeline.fit_transform(trainData)    

# Join the outputs of both branches into a single feature matrix.
preprocessed_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline)
        ])

# Fit the combined transformer on the training data and take the labels
# from the "Survived" column.
train_data_preprocessed = preprocessed_pipeline.fit_transform(trainData)
train_data_labels = trainData["Survived"]

# Train a support vector classifier with default settings on the result.
svm_classifier = SVC()
svm_classifier.fit(train_data_preprocessed, train_data_labels)
Пример #4
0
# Fit an OrdinalEncoder on the categorical frame and encode it in one step.
ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
housing_cat_encoded[:10]  # notebook-style peek at the first ten rows; no effect as a script

# In[62]:

ordinal_encoder.categories_  # notebook-style display of the categories learned during fitting

# **Warning**: earlier versions of the book used the `LabelBinarizer` or `CategoricalEncoder` classes to convert each categorical value to a one-hot vector. It is now preferable to use the `OneHotEncoder` class. Right now it can only handle integer categorical inputs, but in Scikit-Learn 0.20 it will also handle string categorical inputs (see [PR #10521](https://github.com/scikit-learn/scikit-learn/issues/10521)). So for now we import it from `future_encoders.py`, but when Scikit-Learn 0.20 is released, you can import it from `sklearn.preprocessing` instead:

# In[63]:

from future_encoders import OneHotEncoder

# One-hot encode the same categorical frame with the encoder's default settings.
cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot  # notebook-style display; no effect as a script

# By default, the `OneHotEncoder` class returns a sparse array, but we can convert it to a dense array if needed by calling the `toarray()` method:

# In[64]:

housing_cat_1hot.toarray()  # result is displayed in a notebook and otherwise discarded

# Alternatively, you can set `sparse=False` when creating the `OneHotEncoder`:

# In[65]:

# Rebinds cat_encoder and housing_cat_1hot, replacing the values assigned above.
cat_encoder = OneHotEncoder(sparse=False)
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
########################################################################################################################
# Pipeline for preprocessing the data
num_attribs = list(housing_num)  # presumably the column names of a DataFrame -- verify housing_num's type
cat_attribs = ["ocean_proximity"]

# Numeric branch: select columns, impute with the "median" strategy, add the
# combined attributes, then standardise.
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', Imputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

# Categorical branch: select the single text column and one-hot encode it
# with a dense output requested.
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('cat_encoder', OneHotEncoder(sparse=False)),
])
# Concatenate the outputs of the numeric and categorical branches
full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])
housing_prepared = full_pipeline.fit_transform(housing)

########################################################################################################################
# Perform grid search on the data
######################################### grid search CV

# Take a small extract of the prepared features and the matching labels.
# NOTE(review): the slice 1:100 skips row 0 and yields 99 rows -- confirm
# this is intended rather than [:100].
dataExtract = housing_prepared[1:100]
labelExtract = housing_labels[1:100]
Пример #6
0
num_pipeline = Pipeline([('std_scalar', StandardScaler())])
housing_num_tr = num_pipeline.fit_transform(housing_tr)
'''
#%%
'''
housing['total_bedrooms'] = housing_tr['total_bedrooms']
housing['rooms_per_household'] = housing_tr['rooms_per_household']
housing['population_per_household'] = housing_tr['population_per_household']
housing['bedrooms_per_room'] = housing_tr['bedrooms_per_room']
'''
# Every column of `housing` except 'ocean_proximity' is treated as numeric.
num_attribs = list(housing)
num_attribs.remove('ocean_proximity')
cat_attribs = ['ocean_proximity']
# Route the numeric columns through num_pipeline (defined outside this
# block) and one-hot encode the categorical column.
full_pipeline = ColumnTransformer([('num', num_pipeline, num_attribs),
                                   ('cat', OneHotEncoder(), cat_attribs)])
housing_prepared = full_pipeline.fit_transform(housing)

#%% Select and train a model
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

#%%
# Predictions are made on the same data the model was fitted on, so the
# metrics below are training-set errors.
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)  # root of the mean squared error

#%%
lin_mae = mean_absolute_error(housing_labels, housing_predictions)

#%%
Пример #7
0
# Taking care of missing data
# NOTE(review): `Imputer` was removed from recent scikit-learn releases in
# favour of `sklearn.impute.SimpleImputer` -- confirm the targeted version.
from sklearn.preprocessing import Imputer
imputer=Imputer(strategy="median")                   # Imputer configured with the "median" strategy
housing_num=housing.drop("ocean_proximity",axis=1)   # Copy without the text attribute
imputer.fit(housing_num)                             # Learn the per-attribute statistics from the numeric data
imputer.statistics_                                  # Notebook-style display of the learned statistics; no effect as a script
X=imputer.transform(housing_num)                     # Transform the training set by replacing the missing values with the learned values
housing_tr=pd.DataFrame(X,columns=housing_num.columns)# Wrap the array in a DataFrame; no index is passed, so housing_num's index is not carried over

# Handling text and categorical attributes
housing_cat=housing["ocean_proximity"]                      # Categorical attribute, selected as a Series
housing_cat=pd.DataFrame(data=housing_cat)                  # Series --> single-column DataFrame (the index is kept)

from future_encoders import OneHotEncoder                   # One-hot encoder: marks with 1 the category each instance belongs to
cat_encoder=OneHotEncoder(sparse=False)
housing_cat_1hot=cat_encoder.fit_transform(housing_cat)     # sparse=False was requested above, so a dense array (not a sparse matrix) is expected here
housing_cat_1hot                                            # Notebook-style display; no effect as a script
cat_encoder.categories_                                     # Notebook-style display of the learned categories


# Custom transformer
from sklearn.base import BaseEstimator,TransformerMixin

rooms_ix,bedrooms_ix,population_ix,household_ix=3,4,5,6                                    # Column positions of each attribute -- presumably in housing_num's column order; verify

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Custom transformer built on ``BaseEstimator`` and ``TransformerMixin``.

    The part shown here covers the constructor, which records one option,
    and a ``fit`` that learns nothing.
    """

    def __init__(self, add_bedrooms_per_room=True):
        """Store the ``add_bedrooms_per_room`` flag unchanged on the instance."""
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Return ``self`` without inspecting ``X`` or ``y``; nothing is learned."""
        return self
Пример #8
0
    # Drop the "income_cat" column from both the train and the test set, in place.
    for set_ in (train_set, test_set):
        set_.drop("income_cat", axis=1, inplace=True)

    # Separate the predictors from the target column of the training set.
    housing = train_set.drop("median_house_value", axis=1)
    housing_labels = train_set["median_house_value"].copy()

    # Every column except "ocean_proximity" is treated as numeric.
    num_attribs = list(housing.drop("ocean_proximity", axis=1))
    cat_attribs = ["ocean_proximity"]

    # Numeric steps: impute with the "median" strategy, add the combined
    # attributes, then standardise.
    num_pipeline = Pipeline([
        ('imputer', Imputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

    # Route numeric columns through num_pipeline and one-hot encode the
    # categorical column.
    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

    housing_prepared = full_pipeline.fit_transform(housing)
    print(housing_prepared)

    # Fit a linear regression on the prepared training data.
    lin_reg = LinearRegression()
    lin_reg.fit(housing_prepared, housing_labels)

    # Predictions are made on the same data used for fitting, so the RMSE
    # printed below is a training-set error.
    housing_predictions = lin_reg.predict(housing_prepared)
    lin_mse = mean_squared_error(housing_labels, housing_predictions)
    print("RMSE: ", np.sqrt(lin_mse))
Пример #9
0
# Column-name lists used to select the numeric and categorical attributes
num_attribs = list(housing.drop('ocean_proximity', axis=1))
cat_attribs = ['ocean_proximity']

# Build the pipelines for pre-processing
# NOTE(review): `CombinedAttributesAddr` may be a misspelling of
# `CombinedAttributesAdder` -- confirm against the class definition.
num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        ('imputer', Imputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAddr()),
        ('std_scaler', StandardScaler()),
     ])

cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs)),
        ('cat_encoder', OneHotEncoder(sparse=False)),  #Rem sparse see ln 19
    ])

# Join the outputs of the numeric and categorical pipelines
full_pipeline = FeatureUnion(transformer_list=[
        ('num_pipeline', num_pipeline),
        ('cat_pipeline', cat_pipeline),
    ])

# Process data with pipeline
housing_prepared = full_pipeline.fit_transform(housing)

## Train some Models

# Create a support vector regressor with default settings (not fitted here)
svm_reg = SVR()
Пример #10
0

# Script entry point: load the data, preprocess it and fit a random forest.
if __name__ == "__main__":
    # Load the dataset
    df = load_data("Data/data.json")

    # Select features and target
    features = df.drop("score", axis=1)
    y = df["score"].copy()

    # Everything except the three categorical columns is treated as numeric.
    numeric_values = features.drop(["mood", "weather", "activity"], axis=1)  # returns a copy of the dataframe
    num_attribs = list(numeric_values)

    cat_attribs = ["mood", "weather", "activity"]

    # Numeric steps: column selection, imputation with the "median"
    # strategy, then standardisation.
    # NOTE(review): the ColumnTransformer below is already given num_attribs
    # for this pipeline, so the 'selector' step looks redundant -- confirm.
    num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),  # Own transformation
        ('imputer', Imputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

    # Route numeric columns through num_pipeline and one-hot encode the
    # categorical ones.
    full_pipeline = ColumnTransformer([
        ("num_pipline", num_pipeline, num_attribs),
        ("cat_pipline", OneHotEncoder(), cat_attribs),
    ])

    X = full_pipeline.fit_transform(features)

    # Fit a random forest regressor with default settings on the result.
    forest_reg = RandomForestRegressor()
    forest_reg.fit(X, y)
Пример #11
0
    def transform(self, X, y=None):
        """Return the configured columns of ``X`` via its ``.values`` attribute.

        ``X`` is indexed with ``self.attributes_names`` and the ``.values``
        of that selection is returned. ``y`` is accepted but never used.
        """
        selected = X[self.attributes_names]
        return selected.values
# Pipeline to transform the numeric data
num_pipeline = Pipeline([
                         ('selector', DataFrameSelector(num_attribs)), # select the numeric columns
                         ('imputer', Imputer(strategy="median")), # fill in missing values
                         ('attribs_adder', AddAttributes()), # custom step that adds columns
                         ('std_scaler', StandardScaler()),  # feature scaling
                         ])
# The first transformer, Imputer: input is a dataframe, output is a numpy array
#housing_num_tr = num_pipeline.fit_transform(housing_num)
# Pipeline to transform the text data
from future_encoders import OneHotEncoder, OrdinalEncoder
cat_pipeline = Pipeline([
                         ('selector', DataFrameSelector(cat_attribs)), # select the text columns
                         ('cat_pipline', OneHotEncoder(sparse=False)), # turn text into numbers
                         ])
# Join the features produced by the two pipelines
full_pipeline = FeatureUnion(transformer_list=[
            ('num_pipeline', num_pipeline),
            ('cat_pipeline', cat_pipeline),
            ])
#housing_prepared = cat_pipeline.fit_transform(housing)
# NOTE(review): only num_pipeline is applied here; full_pipeline is built
# above but not used in this block -- confirm this is intended.
housing_prepared = num_pipeline.fit_transform(housing)
#encoder=OrdinalEncoder() #OneHotEncoder(sparse=False)
#b = housing[cat_attribs]
#a = encoder.fit_transform(b)
"""
------------            Train and Evaluate          -------------------------
  4. Training
    4.1  LinearRegression():  input: numpy array or dataframe, output: np array