        rhinoceros.append(col)
    print(df_train.info())
    print(df_train.head())
else:
    zebra = []
    rhinoceros = []

''' Features combination '''
if combination:
    print('combination')
    if poly:
        poly = PolynomialFeatures(degree=2, interaction_only=True)
        output_array = poly.fit_transform(df_train.loc[:, 'f28':])
        # df_train.columns cannot be sliced by label; take the column index
        # of the same label-based slice that was transformed above
        df_output = pd.DataFrame(
            output_array,
            columns=poly.get_feature_names(df_train.loc[:, 'f28':].columns))
        print(df_output.info())
        print(df_output.head())
        sys.exit(0)
    donkey = []
    cat_zebra = False
    cat_rhinoceros = False
    date_cat = False
    f5_cat = False
    catD = True
    if cat_zebra:
        for col1 in cat:
            for col2 in zebra:
                df_train[col1 + col2] = df_train[col1] + df_train[col2] * 10
from sklearn.linear_model import Ridge
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder()
X_hour_week_onehot = enc.fit_transform(X_hour_week).toarray()
eval_on_features(X_hour_week_onehot, y, Ridge())  # use ridge - with regularization

# Pre_Processing 2: Polynomial Features
from sklearn.preprocessing import PolynomialFeatures

poly_transformer = PolynomialFeatures(degree=2,
                                      interaction_only=True,
                                      include_bias=False)
X_hour_week_onehot_poly = poly_transformer.fit_transform(X_hour_week_onehot)
# with normalization you could not fit train and test together like this
lr = Ridge()
eval_on_features(X_hour_week_onehot_poly, y, lr)

## plot coefficients learned by the model (not available for a random forest)
hour = ["%02d:00" % i for i in range(0, 24, 3)]
day = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
features = day + hour

# name all the interaction features, and keep only the features with nonzero coefficients:
features_poly = poly_transformer.get_feature_names(features)
features_nonzero = np.array(features_poly)[lr.coef_ != 0]
coef_nonzero = lr.coef_[lr.coef_ != 0]

# visualize the coefficients learned by the linear model
plt.figure(figsize=(15, 2))
plt.plot(coef_nonzero, 'o')
plt.xticks(np.arange(len(coef_nonzero)), features_nonzero, rotation=90)
plt.xlabel("Feature name")
plt.ylabel("Feature magnitude")
plt.show()
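# The eval_on_features helper used above is not defined in this snippet. A
# minimal sketch of what it presumably does, assuming a fixed chronological
# train/test cutoff n_train (the value 184 and the plotting details are
# assumptions, not taken from the original):
import matplotlib.pyplot as plt


def eval_on_features(features, target, regressor, n_train=184):
    # split without shuffling: earlier rows train, later rows test
    X_train, X_test = features[:n_train], features[n_train:]
    y_train, y_test = target[:n_train], target[n_train:]
    regressor.fit(X_train, y_train)
    print("Test-set R^2: {:.2f}".format(regressor.score(X_test, y_test)))
    plt.figure(figsize=(10, 3))
    plt.plot(target, label="target")
    plt.plot(regressor.predict(X_train), '--', label="prediction train")
    plt.plot(range(n_train, len(target)), regressor.predict(X_test), '--',
             label="prediction test")
    plt.legend(loc="best")
    plt.xlabel("Sample index")
    plt.ylabel("Target")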
def featureEngineer(self, data, ntrain):
    data.loc[(data.PoolArea > 0), ['MiscFeature']] = 'Pool'
    data.loc[(data.PoolArea > 0),
             ['MiscVal']] = data.loc[(data.PoolArea > 0),
                                     ['MiscVal', 'PoolArea']].apply(
                                         lambda x: (x.MiscVal + x.PoolArea),
                                         axis=1)

    data['TotalExtraPoints'] = (data.HeatingQC + data.PoolQC +
                                data.FireplaceQu + data.KitchenQual)
    data['TotalPoints'] = (
        data.ExterQual + data.FireplaceQu + data.GarageQual +
        data.KitchenQual + data.BsmtQual + data.BsmtExposure +
        data.BsmtFinType1 + data.PoolQC + data.ExterCond + data.BsmtCond +
        data.GarageCond + data.OverallCond + data.BsmtFinType2 +
        data.HeatingQC) + data.OverallQual**2

    df = data.loc[(data.SalePrice > 0), [
        'TotalPoints', 'TotalExtraPoints', 'OverallQual', 'OverallCond',
        'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
        'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'PoolQC', 'KitchenQual',
        'FireplaceQu', 'GarageQual', 'GarageCond', 'SalePrice'
    ]]

    data['GarageArea_x_Car'] = data.GarageArea * data.GarageCars
    data['TotalBsmtSF_x_Bsm'] = data.TotalBsmtSF * data['1stFlrSF']

    # We don't have a feature with all constructed area; it may be an
    # interesting feature to create.
    data['ConstructArea'] = (data.TotalBsmtSF + data.WoodDeckSF +
                             data.GrLivArea + data.OpenPorchSF +
                             data.TSsnPorch + data.ScreenPorch +
                             data.EnclosedPorch + data.MasVnrArea +
                             data.GarageArea + data.PoolArea)
    # all_data['TotalArea'] = all_data.ConstructArea + all_data.LotArea

    data['Garage_Newest'] = data.YearBuilt > data.GarageYrBlt
    data.Garage_Newest = data.Garage_Newest.apply(lambda x: 1 if x else 0)

    data['TotalPorchSF'] = (data.OpenPorchSF + data.EnclosedPorch +
                            data.TSsnPorch + data.ScreenPorch +
                            data.WoodDeckSF)
    data.EnclosedPorch = data.EnclosedPorch.apply(lambda x: 1 if x else 0)

    data['LotAreaMultSlope'] = data.LotArea * data.LandSlope
    data['BsmtSFPoints'] = (data.BsmtQual**2 + data.BsmtCond +
                            data.BsmtExposure + data.BsmtFinType1 +
                            data.BsmtFinType2)
    data['BsmtSFMultPoints'] = data.TotalBsmtSF * (
        data.BsmtQual**2 + data.BsmtCond + data.BsmtExposure +
        data.BsmtFinType1 + data.BsmtFinType2)

    data['TotBathrooms'] = data.FullBath + (
        data.HalfBath * 0.5) + data.BsmtFullBath + (data.BsmtHalfBath * 0.5)
    data.FullBath = data.FullBath.apply(lambda x: 1 if x else 0)
    data.HalfBath = data.HalfBath.apply(lambda x: 1 if x else 0)
    data.BsmtFullBath = data.BsmtFullBath.apply(lambda x: 1 if x else 0)
    data.BsmtHalfBath = data.BsmtHalfBath.apply(lambda x: 1 if x else 0)

    data.MSSubClass = data.MSSubClass.astype('str')
    data.MoSold = data.MoSold.astype('str')

    data, dummies = self.one_hot_encode(data)

    # Find dummies where all test observations are equal to 0
    ZeroTest = data[dummies][ntrain:].sum() == 0
    data.drop(dummies[ZeroTest], axis=1, inplace=True)
    print('Dummies in test dataset with all observations equal to 0:',
          len(dummies[ZeroTest]), 'of \n', dummies[ZeroTest], '\n')
    dummies = dummies.drop(dummies[ZeroTest])

    # Find dummies where all training observations are equal to 0
    ZeroTest = data[dummies][:ntrain].sum() == 0
    data.drop(dummies[ZeroTest], axis=1, inplace=True)
    print('Dummies in training dataset with all observations equal to 0:',
          len(dummies[ZeroTest]), 'of \n', dummies[ZeroTest], '\n')
    dummies = dummies.drop(dummies[ZeroTest])
    del ZeroTest

    data['Remod'] = 2
    data.loc[(data.YearBuilt == data.YearRemodAdd), ['Remod']] = 0
    data.loc[(data.YearBuilt != data.YearRemodAdd), ['Remod']] = 1
    # all_data['Age'] = all_data.YearRemodAdd - all_data.YrSold  # since I convert both to age

    data["WasNew"] = 2
    data.loc[(data.YearBuilt == data.YrSold), ['WasNew']] = 1
    data.loc[(data.YearBuilt != data.YrSold), ['WasNew']] = 0

    data.drop([
        'FireplaceQu', 'BsmtSFPoints', 'TotalBsmtSF', 'GarageArea',
        'GarageCars', 'OverallQual', 'GrLivArea', 'TotalBsmtSF_x_Bsm',
        '1stFlrSF', 'PoolArea', 'LotArea', 'SaleCondition_Partial',
        'Exterior1st_VinylSd', 'GarageCond', 'HouseStyle_2Story',
        'BsmtSFMultPoints', 'ScreenPorch', 'LowQualFinSF', 'BsmtFinSF2',
        'TSsnPorch'
    ],
              axis=1,
              inplace=True)

    data.rename(columns={'2ndFlrSF': 'SndFlrSF'}, inplace=True)

    # Remove the highest correlations and run a multiple regression
    cols = data.columns
    print(cols)
    cols = cols.drop(['SalePrice'])
    # vif = self.VRF('SalePrice', all_data.loc[all_data.SalePrice > 0, cols],
    #                all_data.SalePrice[all_data.SalePrice > 0], cols)
    cols = cols.drop([
        'Condition1_PosN', 'Neighborhood_NWAmes', 'Exterior1st_CBlock',
        'BldgType_1Fam', 'RoofStyle_Flat', 'MSZoning_Call', 'Alley_Grvl',
        'LandContour_Bnk', 'LotConfig_Corner', 'GarageType_2Types',
        'MSSubClass_45', 'MasVnrType_BrkCmn', 'Foundation_CBlock',
        'MiscFeature_Gar2', 'SaleType_COD', 'Exterior2nd_CBlock'
    ])
    # vif = self.VRF('SalePrice', all_data.loc[all_data.SalePrice > 0, cols],
    #                all_data.SalePrice[all_data.SalePrice > 0], cols)
    cols = cols.drop([
        'PoolQC', 'BldgType_TwnhsE', 'BsmtFinSF1', 'BsmtUnfSF',
        'Electrical_SBrkr', 'Exterior1st_MetalSd', 'Exterior2nd_VinylSd',
        'GarageQual', 'GarageType_Attchd', 'HouseStyle_1Story',
        'MasVnrType_None', 'MiscFeature_NA', 'MSZoning_RL', 'RoofStyle_Gable',
        'SaleCondition_Normal', 'MoSold_10', 'SaleType_New', 'SndFlrSF',
        'TotalPorchSF', 'WoodDeckSF', 'BldgType_Duplex', 'MSSubClass_90'
    ])
    print(cols)
    # print(vif)

    df_copy = data[data.SalePrice > 0].copy()

    data.CentralAir = data.CentralAir.astype('uint8')
    data.Garage_Newest = data.Garage_Newest.astype('uint8')
    data.EnclosedPorch = data.EnclosedPorch.astype('uint8')
    data.FullBath = data.FullBath.astype('uint8')
    data.HalfBath = data.HalfBath.astype('uint8')
    data.BsmtFullBath = data.BsmtFullBath.astype('uint8')
    data.BsmtHalfBath = data.BsmtHalfBath.astype('uint8')
    data.Remod = data.Remod.astype('uint8')
    data.WasNew = data.WasNew.astype('uint8')
    data.Street = data.Street.astype('uint8')  # ordinal
    data.PavedDrive = data.PavedDrive.astype('uint8')  # ordinal
    data.Functional = data.Functional.astype('uint8')  # ordinal
    data.LandSlope = data.LandSlope.astype('uint8')  # ordinal

    numeric_features = list(data.loc[:, cols].dtypes[
        (data.dtypes != "category") & (data.dtypes != 'uint8')].index)
    '''
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
    '''
    # compute skewness
    skewed_features = data[numeric_features].apply(
        lambda x: skew(x.dropna())).sort_values(ascending=False)
    skewness = pd.DataFrame({'Skew': skewed_features})

    # Keep only the most highly skewed features
    skewness = skewness[abs(skewness) > 0.7]
    skewness = skewness.dropna()

    l_opt = {}
    for feat in skewness.index:
        data[feat], l_opt[feat] = boxcox((data[feat] + 1))

    # recompute skewness after the Box-Cox transform
    skewed_features2 = data[skewness.index].apply(
        lambda x: skew(x.dropna())).sort_values(ascending=False)
    skewness2 = pd.DataFrame({'New Skew': skewed_features2})

    y = data.SalePrice[data.SalePrice > 0]
    X = data.loc[data.SalePrice > 0, ['ConstructArea']]
    # self.poly(X, y, 'ConstructArea')
    X = data.loc[data.SalePrice > 0, ['ConstructArea', 'TotalPoints']]
    # self.poly(X, y)
    X = data.loc[data.SalePrice > 0, [
        'ConstructArea', 'TotalPoints', 'LotAreaMultSlope', 'GarageArea_x_Car'
    ]]
    # self.poly(X, y)

    poly_cols = [
        'ConstructArea', 'TotalPoints', 'LotAreaMultSlope', 'GarageArea_x_Car'
    ]
    pf = PolynomialFeatures(degree=3,
                            interaction_only=False,
                            include_bias=False)
    res = pf.fit_transform(data.loc[:, poly_cols])
    target_feature_names = [
        feat.replace(' ', '_') for feat in pf.get_feature_names(poly_cols)
    ]
    output_df = pd.DataFrame(res,
                             columns=target_feature_names,
                             index=data.index).iloc[:, len(poly_cols):]
    print('Polynomial Features included:', output_df.shape[1])
    # display(output_df.head())
    data = pd.concat([data, output_df], axis=1)
    print('Total Features after Polynomial Features included:',
          data.shape[1])
    colsP = output_df.columns

    del output_df, target_feature_names, res, pf

    y_train = (data.SalePrice[data.SalePrice > 0].reset_index(
        drop=True, inplace=False))
    # self.trainingData = all_data.loc[(all_data.SalePrice > 0), cols].reset_index(drop=True, inplace=False)
    # self.testingData = all_data.loc[(all_data.SalePrice == 0), cols].reset_index(drop=True, inplace=False)

    return data, y_train, cols, colsP
x = pd.DataFrame(np.c_[df['LSTAT'], df['RM']], columns=['LSTAT', 'RM'])
Y = df['MEDV']

from sklearn.model_selection import train_test_split

x_train, x_test, Y_train, Y_test = train_test_split(x,
                                                    Y,
                                                    test_size=0.3,
                                                    random_state=5)

#---use a polynomial function of degree 2---
degree = 2  # quadratic
polynomial_features = PolynomialFeatures(degree=degree)
x_train_poly = polynomial_features.fit_transform(x_train)

#---print out the formula---
print(polynomial_features.get_feature_names(['x', 'y']))

model = LinearRegression()
model.fit(x_train_poly, Y_train)

# transform (not refit) the test set with the transformer fitted on train
x_test_poly = polynomial_features.transform(x_test)
print('R-Squared: %.4f' % model.score(x_test_poly, Y_test))
print(model.intercept_)
print(model.coef_)

fig = plt.figure(figsize=(18, 15))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x['LSTAT'], x['RM'], Y, c='b')
ax.set_xlabel("LSTAT")
# impute missing values
imputer = SimpleImputer(strategy='median')

poly_target = poly_features['TARGET']
poly_features = poly_features.drop(columns=['TARGET'])
poly_features = imputer.fit_transform(poly_features)
poly_features_test = imputer.transform(poly_features_test)

# create polynomial features
poly_transformer = PolynomialFeatures(degree=3)
poly_transformer.fit(poly_features)

# transform the features
poly_features = poly_transformer.transform(poly_features)
poly_features_test = poly_transformer.transform(poly_features_test)
print('Polynomial Features shape: ', poly_features.shape)

poly_transformer.get_feature_names(input_features=[
    'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH'
])[:15]

# create df for features
poly_features = pd.DataFrame(poly_features,
                             columns=poly_transformer.get_feature_names([
                                 'EXT_SOURCE_1', 'EXT_SOURCE_2',
                                 'EXT_SOURCE_3', 'DAYS_BIRTH'
                             ]))

# add in the target
poly_features['TARGET'] = poly_target

# find the correlations with the target
poly_corrs = poly_features.corr()['TARGET'].sort_values()

# display most negative and most positive
print(poly_corrs.head(10))
print(poly_corrs.tail(5))
# ### 1.b Extract polynomial features and interactions up to a degree of 2
X = data.drop('MEDV', axis=1)
Y = data['MEDV']

# ##### By default, train_test_split assigns 25% of the sample to the test group

from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2, include_bias=False).fit(X)
X_poly = poly.transform(X)
print("X_poly.shape: {}".format(X_poly.shape))

# ##### The polynomial transformation dataset includes 104 features: the 13
# original features, 13 new squared values of the originals, and 78
# interactions among these variables. The inclusion of these new variables
# should improve model fitting over a linear version, which does not take
# into account non-linear relations between the features (explanatory
# variables) and the dependent variable (av. price of Boston houses, MEDV)

print("Polynomial feature names:\n{}".format(poly.get_feature_names()))

# ### 1.c Create a pandas DataFrame using the polynomials and save the file
# (use the transformer's generated names rather than listing all 104 columns
# 'x0', 'x1', ..., 'x0^2', 'x0 x1', ... by hand)
polynomials0 = pd.DataFrame(X_poly, columns=poly.get_feature_names())
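# A quick sanity check (not in the original) of the 104-feature count quoted
# above: with 13 inputs, degree 2, and include_bias=False, PolynomialFeatures
# produces 13 linear terms, 13 squares, and C(13, 2) pairwise interactions.
from math import comb

n = 13
print(n + n + comb(n, 2))  # 13 + 13 + 78 = 104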
#      [3*x for x in range(max_range)],
#      [4*x for x in range(max_range)],
#      [5*x for x in range(max_range)]]
X = la.transpose(X)
# print(X)

# Fitting Polynomial Regression to the dataset
from sklearn.preprocessing import PolynomialFeatures

poly_reg = PolynomialFeatures(degree=2)
X_poly_features = poly_reg.fit(X)
# print(X_poly_features)
# print()

# fit or fit_transform must be called before this is called
feature_names = poly_reg.get_feature_names()
feature_names.sort()
print(feature_names)
print()

# X_poly_transform = poly_reg.transform(X)
# print(len(poly_reg.get_feature_names()))
# print(X_poly_transform)
# print()
# print(poly_reg.get_feature_names())
# print()
# print(poly_reg.get_params())
# print()
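# A small illustration (not from the original) of why the comment above says
# fit or fit_transform must be called first: get_feature_names is derived
# from the fitted powers_ attribute, so an unfitted transformer raises.
from sklearn.preprocessing import PolynomialFeatures

try:
    PolynomialFeatures(degree=2).get_feature_names()
except Exception as exc:  # NotFittedError on the sklearn versions used here
    print(type(exc).__name__, exc)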
def feature_adder_poly(df, *cols, degree=2, include_bias=False):
    poly = PolynomialFeatures(degree=degree, include_bias=include_bias)
    poly.fit(df[list(cols)])
    # slice off the first len(cols) columns/names so only the newly
    # generated polynomial terms (and their names) are returned
    return (poly.transform(df[list(cols)])[:, len(cols):],
            poly.get_feature_names(cols)[len(cols):])
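# A hypothetical usage sketch of feature_adder_poly (the toy frame and column
# names 'a'/'b' are illustrative, not from the original): since the helper
# returns only the generated terms, they can be concatenated onto the source
# frame.
import pandas as pd

df = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': [4.0, 5.0, 6.0]})
values, names = feature_adder_poly(df, 'a', 'b', degree=2)
df = pd.concat([df, pd.DataFrame(values, columns=names, index=df.index)],
               axis=1)
print(df.columns.tolist())  # ['a', 'b', 'a^2', 'a b', 'b^2']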
df_label = df_label[(z < threshold)]

# Reset the index for the polynomial features merge
df_label = df_label.reset_index(drop=True)

# Get polynomial features
polyTrans = PolynomialFeatures(degree=2, include_bias=False)
df_label_Num = df_label[[
    "level", "temperature", "usage", "Brightness", "RAM"
]]
# Drop them to get back later the poly Trans of them
df_label = df_label.drop(
    ["level", "temperature", "usage", "Brightness", "RAM"], axis=1)
polyData_Num = polyTrans.fit_transform(df_label_Num)
columnNames = polyTrans.get_feature_names(
    ["level", "temperature", "usage", "Brightness", "RAM"])
df_label_Num = pandas.DataFrame(polyData_Num, columns=columnNames)
for column in columnNames:
    df_label[column] = pandas.Series(df_label_Num[column])

# Get dataframes
y_label = df_label["output"]
X_label = df_label.drop(["output"], axis=1)

# Keep only the selected columns for each label
X_label = X_label[selColumns[idx]]

# Split data into training and testing ...
X_train_label, X_test_label, y_train_label, y_test_label = train_test_split(
    X_label, y_label, test_size=0.25, random_state=42)
def featureSelection(data, labels):
    plotROCCurveBase()
    crossValidationTestAndPlot(LogisticRegression(),
                               "Full Feature Set",
                               data,
                               labels,
                               cvNum=5,
                               addAverage=True)

    # Create and fit selector
    selector = SelectKBest(k=100)
    selector.fit(data, labels)
    # Get columns to keep
    cols = selector.get_support()
    print(len(cols))
    # Create new dataframe with only desired columns, or overwrite existing
    data = data[data.columns[cols]]
    print(data.shape)
    crossValidationTestAndPlot(LogisticRegression(),
                               "100-Best Features",
                               data,
                               labels,
                               cvNum=5,
                               addAverage=True)
    print(data.columns.values.tolist())

    poly = PolynomialFeatures(interaction_only=True)
    polyData = pd.DataFrame(poly.fit_transform(data),
                            columns=poly.get_feature_names(data.columns))
    crossValidationTestAndPlot(LogisticRegression(),
                               "Features with Interaction",
                               polyData,
                               labels,
                               cvNum=5,
                               addAverage=True)
    # print("Interaction Features", poly.get_feature_names(data.columns))

    # Create and fit selector
    selector = SelectKBest(k=100)
    selector.fit(polyData, labels)
    # Get columns to keep
    cols = selector.get_support()
    # Create new dataframe with only desired columns, or overwrite existing
    polyData = polyData[polyData.columns[cols]]
    crossValidationTestAndPlot(LogisticRegression(),
                               "100-Best Features With Interaction",
                               polyData,
                               labels,
                               cvNum=5,
                               addAverage=True)
    print(polyData.columns.values.tolist())
    # print(polyData.get_feature_names(data.columns))

    poly3 = PolynomialFeatures(degree=2)
    poly3Data = pd.DataFrame(poly3.fit_transform(data),
                             columns=poly3.get_feature_names(data.columns))
    crossValidationTestAndPlot(LogisticRegression(),
                               "Features with Polynomials up to n^2",
                               poly3Data,
                               labels,
                               cvNum=5,
                               addAverage=True)

    # Create and fit selector
    selector = SelectKBest(k=100)
    selector.fit(poly3Data, labels)
    # Get columns to keep
    cols = selector.get_support()
    # Create new dataframe with only desired columns, or overwrite existing
    poly3Data = poly3Data[poly3Data.columns[cols]]
    crossValidationTestAndPlot(LogisticRegression(),
                               "100-Best Features With Polynomials up to n^2",
                               poly3Data,
                               labels,
                               cvNum=5,
                               addAverage=True)
    plt.legend()
    plt.show()
train_performances = train_scenario.performance_data
print(train_performances)
train_features = train_scenario.feature_data

# preprocessing
imputer = SimpleImputer()
polytransform = PolynomialFeatures(2)
scaler = StandardScaler()

# Impute
train_features[train_features.columns] = imputer.fit_transform(
    train_features[train_features.columns])

# Create polynomial features
if use_quadratic_transform:
    quad_data = polytransform.fit_transform(train_features.to_numpy())
    new_cols = polytransform.get_feature_names(train_features.columns)
    train_features = pd.DataFrame(data=quad_data,
                                  index=train_features.index,
                                  columns=new_cols)

# Standardize
train_features[train_features.columns] = scaler.fit_transform(
    train_features[train_features.columns])

# inst, perf, rank = util.construct_numpy_representation_with_pairs_of_rankings(
#     train_features, train_performances,
#     max_pairs_per_instance=max_pairs_per_instance, seed=seed)

cutoff = scenario.algorithm_cutoff_time
par10 = cutoff * 10
perf = train_performances.to_numpy()
def test_heat_capacity(self):
    # Heat capacity data as a function of CMU size, % solid, thickness, and
    # density of concrete in the CMU (lb/ft³).
    # The 6.9 point on row 5 is bad data, but I am leaving it here because
    # I eliminate it by deleting it. I want the original data from the table.
    hc_data = {
        "heat_capacity_IMP": [
            3.40, 3.78, 4.17, 4.55, 4.93, 5.56, 5.96, 4.01, 4.47, 4.94, 5.40,
            5.86, 6.60, 7.08, 5.05, 5.64, 6.23, 6.82, 7.41, 8.37, 8.99, 4.36,
            4.87, 5.37, 5.87, 6.38, 7.19, 7.72, 6.04, 6.76, 7.47, 8.18, 6.90,
            10.05, 10.80, 5.57, 6.23, 6.88, 7.52, 8.17, 9.21, 9.89, 8.17,
            9.14, 10.11, 11.08, 12.04, 13.61, 14.63, 6.50, 7.25, 8.01, 8.76,
            9.51, 10.60, 11.38, 10.26, 11.48, 12.71, 13.93, 15.15, 17.13,
            18.41, 7.75, 8.66, 9.57, 10.48, 11.39, 12.86, 13.81, 12.30,
            13.77, 15.25, 16.37, 18.20, 20.59, 22.14
        ],
        "percent_solid": [
            65, 65, 65, 65, 65, 65, 65, 78, 78, 78, 78, 78, 78, 78, 100, 100,
            100, 100, 100, 100, 100, 55, 55, 55, 55, 55, 55, 55, 78, 78, 78,
            78, 78, 78, 78, 52, 52, 52, 52, 52, 52, 52, 78, 78, 78, 78, 78,
            78, 78, 48, 48, 48, 48, 48, 48, 48, 78, 78, 78, 78, 78, 78, 78,
            48, 48, 48, 48, 48, 48, 48, 78, 78, 78, 78, 78, 78, 78
        ],
        "thickness_in": [
            4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 6,
            6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8,
            8, 8, 8, 8, 8, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
            10, 10, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12
        ],
        "density_IMP": [
            80, 90, 100, 110, 120, 130, 140, 80, 90, 100, 110, 120, 130, 140,
            80, 90, 100, 110, 120, 130, 140, 80, 90, 100, 110, 120, 130, 140,
            80, 90, 100, 110, 120, 130, 140, 80, 90, 100, 110, 120, 130, 140,
            80, 90, 100, 110, 120, 130, 140, 80, 90, 100, 110, 120, 130, 140,
            80, 90, 100, 110, 120, 130, 140, 80, 90, 100, 110, 120, 130, 140,
            80, 90, 100, 110, 120, 130, 140
        ]
    }
    df_IMP = pd.DataFrame(hc_data)
    df = pd.DataFrame({
        "heat capacity":
        self.Btu_per_ft2F_to_J_per_m2K * df_IMP["heat_capacity_IMP"],
        "percent solid":
        df_IMP["percent_solid"],
        "thickness":
        self.in_to_m * df_IMP["thickness_in"],
        "density":
        self.lbpft3_to_kgpm3 * df_IMP["density_IMP"]
    })

    if self.include_plots:
        fig, axl = plt.subplots(3, 1, figsize=(10, 20))
        axl[0].scatter(df["percent solid"], df["heat capacity"], color="red")
        axl[1].scatter(df["thickness"], df["heat capacity"], color="blue")
        axl[2].scatter(df["density"], df["heat capacity"], color="green")

    X = df[["percent solid", "thickness", "density"]]
    Y = df[["heat capacity"]]

    model = sm.OLS(Y, X).fit()
    predictions = model.predict(X)
    print_model = model.summary()

    # from multi-variate linear to multi-variate polynomial fits
    if self.include_plots:
        fig2, axl = plt.subplots(1, 2, figsize=(20, 10))
    for i in range(5):
        poly = PolynomialFeatures(degree=i + 1)
        X_ = poly.fit_transform(X, y=Y)
        model = sm.OLS(Y, X_).fit()
        predictions = model.predict(X_)
        df['heat capacity fit polynomial order {0:2d}'.format(
            i + 1)] = predictions
        df['heat capacity errors polynomial order {0:2d}'.format(
            i + 1)] = 100 * (predictions -
                             df["heat capacity"]) / df["heat capacity"]
        if self.include_plots:
            df['heat capacity errors polynomial order {0:2d}'.format(
                i + 1)].plot(ax=axl[0], label="poly {0:2d}".format(i + 1))
    if self.include_plots:
        axl[0].legend()
        axl[0].set_title("Fits with 1 bad data point")

    # The dropped value used to be 6.9, but that makes no sense with respect
    # to the data; the polynomial fits and the associated errors proved this
    # by being unable to fit the spurious point.
    df.drop(index=32, inplace=True)

    # redo X and Y because we have dropped a point
X = df[["percent solid", "thickness", "density"]] Y = df[["heat capacity"]] if self.include_plots: fig3, ax3 = plt.subplots(1, 1) # we not stop at order 3 because it has the best combination of error and complexity: # poly 2 error max: 8.279 min: -8.256 average magnitude: 1.888 # poly 3 error max: 2.006 min: -1.714 average magnitude: 0.731 X # poly 4 error max: 2.044 min: -1.665 average magnitude: 0.655 # poly 5 error max: 1.958 min: -1.561 average magnitude: 0.635 for i in range(3): poly = PolynomialFeatures(degree=i + 1) X_ = poly.fit_transform(X, y=Y) model = sm.OLS(Y, X_).fit() predictions = model.predict(X_) df['heat capacity fit polynomial order {0:2d}'.format( i + 1)] = predictions err = 100 * (predictions - df["heat capacity"]) / df["heat capacity"] df['heat capacity errors polynomial order {0:2d}'.format(i + 1)] = err if self.include_plots: df['heat capacity errors polynomial order {0:2d}'.format( i + 1)].plot(ax=axl[1], label="poly {0:2d}".format(i + 1)) print( "poly {0:2d} error max: {1:5.3f} min: {2:5.3f} average magnitude: {3:5.3f}" .format(i + 1, np.max(np.max(err.values)), np.min(err.values), np.mean(abs(err.values)))) if self.include_plots: axl[1].legend() axl[1].set_title("Fits bad data removed") model.summary() print("The model parameters are:") params = model.params for param in params: print("{0:12.8e}".format(param)) print(poly.get_feature_names()) # now test the implementation in ElCanoBuildingEnergy_Demand_Load_Model.py.concrete_wall HC_val1 = 19.312 / self.Btu_per_hft2F_to_W_per_m2K * self.Btu_per_ft2F_to_J_per_m2K HC_val2 = 42.7136 / self.Btu_per_hft2F_to_W_per_m2K * self.Btu_per_ft2F_to_J_per_m2K wall1 = ec_be.concrete_wall(1280, 0.35, 0.1016, 0.0) wall2 = ec_be.concrete_wall(1760, (100 - 52) / 100, 0.2032, 0.0) err1 = np.abs(100 * (wall1.HC_value - HC_val1) / HC_val1) err2 = np.abs(100 * (wall2.HC_value - HC_val2) / HC_val2) self.assertTrue(err1 < 2.006) self.assertTrue(err2 < 2.006)
def task_2b():
    # read csv files as dataframes
    life_df = pd.read_csv("life.csv")
    world_df = pd.read_csv("world.csv")

    # PREPROCESSING - from Question 2A
    # merge dataframes on common columns (country and country code)
    world_df = world_df.rename(columns={
        'Country Name': 'Country',
        'Time': 'Year'
    })
    new_df = pd.merge(life_df,
                      world_df,
                      how='inner',
                      on=['Country', 'Country Code'])

    # remove rows with null 'life expectancy' values
    new_df = new_df.dropna(axis=0,
                           subset=['Life expectancy at birth (years)'])

    # split into training and test sets with random state of 100
    X1 = new_df.iloc[:, 5:]  # learn from these data
    # only keep Country Code alongside the 20 original features; create a
    # pointer to this df for reference to the country split later
    X2 = new_df.iloc[:, 1]
    X = pd.concat([X2.reset_index(drop=True),
                   X1.reset_index(drop=True)],
                  axis=1)
    y = new_df.loc[:, 'Life expectancy at birth (years)']  # expected results
    X_train_with_country, X_test_with_country, y_train, y_test = ms.train_test_split(
        X, y, train_size=2 / 3, test_size=1 / 3, random_state=100)
    X_train_with_country_df = pd.DataFrame(
        X_train_with_country,
        index=X_train_with_country.index,
        columns=X_train_with_country.columns)
    X_test_with_country_df = pd.DataFrame(
        X_test_with_country,
        index=X_test_with_country.index,
        columns=X_test_with_country.columns)

    # reassign pointers to purely quantitative features in X_train and X_test
    X_train = X_train_with_country.iloc[:, 1:]
    X_test = X_test_with_country.iloc[:, 1:]

    # turn strings from X_train and X_test to NaN (inputs)
    for column in X_train.columns:
        X_train[column] = pd.to_numeric(X_train[column], errors='coerce')
    for column in X_test.columns:
        X_test[column] = pd.to_numeric(X_test[column], errors='coerce')

    # fill the NaN values in X_test and X_train with the median of X_train
    for col in X_train.select_dtypes(include=np.number):
        X_train[col] = X_train[col].fillna(X_train[col].median())
    for col in X_test.select_dtypes(include=np.number):
        X_test[col] = X_test[col].fillna(X_train[col].median())

    # scale training set and test set
    scaler = preprocessing.StandardScaler()
    scaled_X_train = scaler.fit_transform(X_train)
    scaled_X_test = scaler.transform(X_test)
    scaled_X_train = pd.DataFrame(scaled_X_train,
                                  index=X_train.index,
                                  columns=X_train.columns)
    scaled_X_test = pd.DataFrame(scaled_X_test,
                                 index=X_test.index,
                                 columns=X_test.columns)

    # PART 1: FEATURE ENGINEERING
    # INTERACTION TERM PAIRS
    print(DIVIDER + "INTERACTION TERM PAIRS" + DIVIDER)
    # degree of terms = 2
    # interaction_only means no feature multiplied by itself (x^2), only x*y
    # include_bias would add a constant term acting as the intercept of a
    # linear model --> False
    poly = PolynomialFeatures(degree=2,
                              interaction_only=True,
                              include_bias=False)
    # numpy array of world.csv with 210 features
    world_int_term_pairs = poly.fit_transform(scaled_X_train)
    int_pair_list = poly.get_feature_names(scaled_X_train.columns.tolist())
    int_term_pairs = pd.DataFrame({'features': int_pair_list})
    world_int_term_pairs_df = pd.DataFrame(data=world_int_term_pairs[:],
                                           columns=int_pair_list)
    print("Number of interaction term pairs:", len(int_term_pairs))
    print(int_term_pairs)
    print("\n")

    # CLUSTERING LABELS
    print(DIVIDER + "CLUSTERING LABELS" + DIVIDER)
    # use the elbow method to determine a suitable k value via the
    # within-cluster sum of squares (WCSS)
    wcss = []
    # this loop fits the k-means algorithm to the data, computes the WCSS,
    # and appends it to the list
    for i in range(1, 20):
        kmeans = KMeans(n_clusters=i, init='k-means++')
        kmeans.fit(scaled_X_train)
        # kmeans' inertia attribute is the sum of squared distances of
        # samples to their closest cluster center
        wcss.append(kmeans.inertia_)

    # plot the elbow graph - choose n=5 for clustering as this is the point
    # where the graph plateaus
    plt.figure(figsize=(12, 6))
    plt.plot(range(1, 20), wcss, marker='o')
    plt.title('Elbow Method Graph for Selection of Appropriate k Value')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Within-Cluster Sum of Squares (WCSS)')
    plt.savefig('task2bElbowGraph.png', bbox_inches='tight')
    plt.show()
    plt.close()

    # form clusters
    # any number between 5-7 should be good; higher = not much difference
    kmeans = KMeans(n_clusters=7)
    clusters = kmeans.fit(scaled_X_train)
    prediction = kmeans.predict(scaled_X_train)

    # see the count of data points in each cluster (store the labels on the
    # training frame itself so the scatter plot and later drop() can see them)
    frame = scaled_X_train
    frame['k-means cluster'] = prediction
    counts_series = frame['k-means cluster'].value_counts().sort_index()
    counts_df = pd.DataFrame({
        'cluster number': counts_series.index,
        'counts': counts_series.values
    })
    print("\nCount of countries in each cluster:")
    print(counts_df.to_string(index=False))

    # see which cluster each country from the training set is in (should be
    # a number between 0-6 because there are 7 clusters)
    print("\nCountries from training set with new feature (f-clusterlabels):")
    cluster_label_df = pd.DataFrame({
        'Country Code': X_train_with_country_df.iloc[:, 0],
        'f-clusterlabel': clusters.labels_
    })
    print("\n", cluster_label_df.to_string(index=False))

    # Turn the 20 features from world.csv into 2 dimensions with PCA
    pca_2 = PCA(n_components=2)
    plot_columns = pca_2.fit_transform(scaled_X_train.iloc[:, :21])

    # Plot each cluster and shade by its cluster label
    plt.scatter(x=plot_columns[:, 0],
                y=plot_columns[:, 1],
                c=scaled_X_train["k-means cluster"])
    plt.title('K-Means Clustering with 7 Clusters and 2 Principal Components')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.savefig('task2bKMeansClustering.png', bbox_inches='tight')
    plt.show()
    plt.close()

    # PART 2: FEATURE SELECTION
    # SELECT 4 FEATURES IN A PRINCIPLED MANNER
    print("\n" + DIVIDER + "SELECTING 4 FEATURES IN A PRINCIPLED MANNER" +
          DIVIDER)
    model = ExtraTreesClassifier(random_state=200)
    model.fit(scaled_X_train, y_train)

    # plot graph of the top 10 most important features for better
    # visualization
    feat_importances = pd.Series(model.feature_importances_,
                                 index=scaled_X_train.columns)
    feat_importances.nlargest(10).plot(kind='barh')
    plt.title('Top 10 Most Important Features to Life Expectancy')
    plt.xlabel('Importance Score with Extra-Trees Classifier')
    plt.ylabel('Feature Name')
    plt.savefig('task2bTop10ImportantFeatures.png', bbox_inches='tight')
    plt.show()
    plt.close()

    print(
        "The 4 chosen features and their relative importance (descending order) to predicting Life Expectancy are:"
    )
    feat_importance_dict = feat_importances.nlargest(4).to_dict()
    # sort the dictionary in descending order of importance
    feat_importance_dict = {
        key: value
        for key, value in sorted(feat_importance_dict.items(),
                                 key=lambda item: item[1],
                                 reverse=True)
    }
    dct_for_knn = {}
    dct_for_knn_test = {}
    i = 1
    for key, item in feat_importance_dict.items():
        dct_for_knn[key] = scaled_X_train[key]
        dct_for_knn_test[key] = scaled_X_test[key]
        print("Chosen feature number " + str(i) + ": " + key +
              " with an importance of " + str(round(item, 3)))
        i += 1
    principled_df = pd.DataFrame(dct_for_knn)
    principled_df_test = pd.DataFrame(dct_for_knn_test)

    # PCA
    print("\n" + DIVIDER + "PRINCIPAL COMPONENT ANALYSIS" + DIVIDER)
    pca = PCA(n_components=4)
    scaled_X_train = scaled_X_train.drop('k-means cluster', axis=1)
    pca.fit(scaled_X_train)
    principal_components = pca.transform(scaled_X_train)
    principal_df = pd.DataFrame(data=principal_components,
                                columns=['PC-1', 'PC-2', 'PC-3', 'PC-4'])
    principal_components_test = pca.transform(scaled_X_test)
    principal_test = pd.DataFrame(data=principal_components_test,
                                  columns=['PC-1', 'PC-2', 'PC-3', 'PC-4'])
    print("Reduced training set data from " +
          str(len(scaled_X_train.columns)) + " features to " +
          str(len(principal_df.columns)) + " Principal Components:")
    print(principal_df)

    # FIRST 4 FEATURES
    print("\n" + DIVIDER + "FIRST 4 FEATURES" + DIVIDER)
    first_4_features = scaled_X_train.iloc[:, :4]
    first_4_features_test = scaled_X_test.iloc[:, :4]
    print("The first 4 features of the original dataset are:")
    i = 1
    for column in first_4_features.columns:
        print(str(i) + "." + " " + column)
        i += 1

    # PART 3: PERFORM 5-NN CLASSIFICATION USING THE FEATURES SELECTED ABOVE
    print(
        "\n==========ACCURACY OF EACH FEATURE GROUP IN 5-NN CLASSIFICATION=========="
    )
    k5_classifier = KNeighborsClassifier(n_neighbors=5)

    # 5-NN with the 4 features selected in a principled manner
    k5_classifier.fit(principled_df, y_train)
    k5_test_accu_principled = k5_classifier.score(principled_df_test, y_test)

    # 5-NN with the 4 features from PCA
    k5_classifier.fit(principal_df, y_train)
    k5_test_accu_PCA = k5_classifier.score(principal_test, y_test)

    # 5-NN with the first 4 features
    k5_classifier.fit(first_4_features, y_train)
    k5_test_accu_first_4 = k5_classifier.score(first_4_features_test, y_test)

    print('Accuracy of feature engineering: {:.{width}f}'.format(
        k5_test_accu_principled, width=3))
    print('Accuracy of PCA: {:.{width}f}'.format(k5_test_accu_PCA, width=3))
    print('Accuracy of first four features: {:.{width}f}'.format(
        k5_test_accu_first_4, width=3))
# ================== Fit a polynomial regression model =========================
# Load the library required for feature engineering
from sklearn.preprocessing import PolynomialFeatures

# Extract the predictor from the dataframe df
X = df.iloc[:, 0:1].values

# Calculate the MSE with polynomials of varying degree
degrees = [2, 3, 4, 5, 6, 7, 8, 9]
mse = []
for degree in degrees:
    poly = PolynomialFeatures(degree, include_bias=False)
    X_poly = poly.fit_transform(X)
    X_poly_feature_name = poly.get_feature_names(
        ['Feature' + str(l) for l in range(1, 6)])
    df_poly = pd.DataFrame(X_poly, columns=X_poly_feature_name)
    df_poly['y'] = df['Y']
    X_train = df_poly.drop('y', axis=1)
    y_train = df_poly['y']
    # use a separate name for the regressor so it does not shadow the
    # PolynomialFeatures transformer above
    linreg = LinearRegression(normalize=True)
    model_poly = linreg.fit(X_train, y_train)
    y_poly = linreg.predict(X_train)
    mse.append(mean_squared_error(y_poly, y_train))

# Analyze the MSE of polynomials of varying degree
plt.figure(figsize=(12, 8))
plt.xlabel("Degrees", fontsize=20)
plt.ylabel("Mean-squared Error", fontsize=20)
plt.grid(1)
plt.scatter(degrees, mse, edgecolors=(0, 0, 0), lw=2, s=80)
# Need to impute missing values
poly_features = imputer.fit_transform(poly_features)
poly_features_test = imputer.transform(poly_features_test)

poly_transformer = PolynomialFeatures(degree=2)
poly_transformer.fit(poly_features)

# Transform the features
poly_features = poly_transformer.transform(poly_features)
poly_features_test = poly_transformer.transform(poly_features_test)
print('Polynomial Features shape: ', poly_features.shape)

poly_features = pd.DataFrame(poly_features,
                             columns=poly_transformer.get_feature_names(
                                 ['板温', '现场温度', '光照强度', '风速', '风向']))

# Add in the target
poly_features['TARGET'] = poly_target

# Find the correlations with the target
poly_corrs = poly_features.corr()['TARGET'].sort_values()

# Display most negative and most positive
# print(poly_corrs)

# Put test features into dataframe
poly_features_test = pd.DataFrame(poly_features_test,
                                  columns=poly_transformer.get_feature_names(
                                      ['板温', '现场温度', '光照强度', '风速', '风向']))
scores_lr = []
errors_lr = []
print('[Linear Regression] running 10-fold cross-validation')
for train_indices, val_indices in kf.split(X, Y):
    # split into training and test set
    X_train = X[train_indices]
    X_test = X[val_indices]
    Y_train = Y[train_indices]
    Y_test = Y[val_indices]

    # transform to the format f(x, theta) = theta0 + theta1*x1 + theta2*x2 + theta3*x1*x2
    p = PolynomialFeatures(interaction_only=True,
                           include_bias=False,
                           degree=2)
    X_train = p.fit_transform(X_train)
    # reuse the transformer fitted on the training fold instead of refitting
    X_test = p.transform(X_test)
    poly_feature_names = p.get_feature_names()

    # train model
    lr.fit(X_train, Y_train)

    # model parameters
    beta_est = [lr.coef_[0], lr.intercept_]
    intercept = lr.intercept_
    coefficients = lr.coef_  # theta values
    assert (len(coefficients) == len(poly_feature_names))

    # predict values for the testing set
    Y_est = lr.predict(X_test)

    # calculate mean squared error and accuracy
    MSE = evaluate_predictions(y_true=Y_test, y_pred=Y_est)
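# A tiny standalone check (not from the original) that the transformer above
# really produces the f(x, theta) = theta0 + theta1*x1 + theta2*x2 + theta3*x1*x2
# design described in the comment: with interaction_only=True, degree=2, and
# include_bias=False the columns are [x1, x2, x1*x2], and theta0 comes from
# the fitted model's intercept.
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

demo = PolynomialFeatures(interaction_only=True, include_bias=False, degree=2)
print(demo.fit_transform(np.array([[2.0, 3.0]])))  # [[2., 3., 6.]]
print(demo.get_feature_names())                    # ['x0', 'x1', 'x0 x1']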
Pred1 = np.exp(Pred1)
Pred1[Pred1 > df_train['price'].max()] = df_train['price'].max()
Pred1[Pred1 < df_train['price'].min()] = df_train['price'].min()
print('Linear')
print('RMSE is:', round(np.sqrt(mse(y, Pred1)), 2))
print('R2 is:', round(r2_score(y, Pred1), 2))
plt.plot(y, Pred1, '.')
plt.scatter(y, Pred1)
plt.show()

##############################
# poly
poly = PolynomialFeatures(2)
X_train2 = poly.fit_transform(X_train1)
polynames = poly.get_feature_names(X_train1.columns)
X_train2 = pd.DataFrame(X_train2, columns=polynames)

reg2 = lm.LinearRegression()
y_log = np.log(y)
reg2.fit(X_train2, y_log)

Pred2 = reg2.predict(X_train2)
Pred2 = np.exp(Pred2)
Pred2[Pred2 > df_train['price'].max()] = df_train['price'].max()
Pred2[Pred2 < df_train['price'].min()] = df_train['price'].min()
print('Polynomial fit')
print('RMSE is:', round(np.sqrt(mse(y, Pred2)), 2))
print('R2 is:', round(r2_score(y, Pred2), 2))
plt.plot(y, Pred2, '.')
plt.xlabel("Univariate Regression Coefficients") plt.ylabel("Multiple Regression Coefficients") plt.grid() plt.show() # ## (f) Non-linear association between any of the predictors and the response. # Model of the form y = β<sub>0</sub> + β<sub>1</sub>X + β<sub>2</sub>X<sup>2</sup> + β<sub>3</sub>X<sup>3</sup> + ε beta_nl = [] poly = PolynomialFeatures(3) for i in df_X.columns: print("Predictor-", i) col = df[i].values.reshape(-1, 1) X_poly = poly.fit_transform(col) df_poly = pd.DataFrame(X_poly, columns=poly.get_feature_names(i)) print("Dataframe with non-linear terms-") print(df_poly.head()) model = sm.OLS(y, df_poly).fit() predictions_nl = model.predict( df_poly) # make the predictions by the model # Print out the statistics stats_nl = model.summary().tables[1] print(stats_nl) print("The accuracy is {}%".format((model.rsquared) * 100)) beta_nl.append(model.params[1]) print("") print("")
def poly(df, col_names):
    data = df[col_names]
    p = PolynomialFeatures(2).fit(data)
    features = pd.DataFrame(p.transform(data),
                            columns=p.get_feature_names(data.columns))
    return features
#                           led_current="25 mA")
# print(x.shape)
#
# pls_screen_as726x(x, y, n_comps=10)
# print(type(x))

poly = PolynomialFeatures(degree=1)
x_trans = poly.fit_transform(x)
# pls.fit(x_trans, y)
# y_predict = pls.predict(x_trans)
# print(mean_absolute_error(y, y_predict))
# ham
# n_comps = 6
# regr = PLSRegression(n_components=n_comps)
# print(x_trans.shape)
# print(poly.get_feature_names())

# convert to a DataFrame so the columns can be selected by name below
x_trans = pd.DataFrame(x_trans, columns=poly.get_feature_names())
print(x_trans)
cols_to_use = []
for column in poly.get_feature_names():
    # interaction terms contain a space in their generated names;
    # keep only the single-feature terms
    if ' ' not in column:
        cols_to_use.append(column)
print(cols_to_use)
x_trans = x_trans[cols_to_use]
print(x_trans)
# svr = SVR()
# pls = PLSRegression(n_components=6)
# regr = pls
# print(y.columns)
#
# pls.fit(x, y['Avg Total Chlorophyll (µg/cm2)'])
# # print(pls.coef_)
# plot_learning_curve(pls, "", x_trans, y['Avg Total Chlorophyll (µg/cm2)'])
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from math import sqrt

import pandas as pd
import numpy as np

File_Path = '../datasets/admission_data.csv'
admission_df = pd.read_csv(File_Path).drop('Serial No.', axis=1)
admission_df.head()

polynomial_transformer = PolynomialFeatures(6)
polynomial_features = polynomial_transformer.fit_transform(admission_df.values)
features = polynomial_transformer.get_feature_names(admission_df.columns)

X = pd.DataFrame(polynomial_features, columns=features)
label = admission_df[['Chance of Admit ']]

train_data, test_data, train_label, test_label = train_test_split(
    X, label, test_size=0.3, random_state=5)

model = Lasso(alpha=0.001, max_iter=1000, normalize=True)
"""
alpha: the lambda value in the regularization term.
max_iter: the number of gradient-descent iterations.
normalize: if True, feature scaling is applied automatically.
The same options apply to the L2-regularized Ridge model.
"""
model.fit(train_data, train_label)
plt.vlines(kb.bin_edges_[0], -3, 3, linewidth=1, alpha=.2)
plt.plot(X[:, 0], y, 'o', c='k')
plt.ylabel("Regression output")
plt.xlabel("Input feature")
plt.legend(loc="best")

from sklearn.preprocessing import PolynomialFeatures

# include polynomials up to x ** 10:
# the default "include_bias=True" adds a feature that's constantly 1
poly = PolynomialFeatures(degree=10, include_bias=False)
poly.fit(X)
X_poly = poly.transform(X)
print("X_poly.shape: {}".format(X_poly.shape))
print("Entries of X:\n{}".format(X[:5]))
print("Entries of X_poly:\n{}".format(X_poly[:5]))
print("Polynomial feature names:\n{}".format(poly.get_feature_names()))

reg = LinearRegression().fit(X_poly, y)
line_poly = poly.transform(line)
plt.plot(line, reg.predict(line_poly), label='polynomial linear regression')
plt.plot(X[:, 0], y, 'o', c='k')
plt.ylabel("Regression output")
plt.xlabel("Input feature")
plt.legend(loc="best")

from sklearn.svm import SVR

for gamma in [1, 10]:
    svr = SVR(gamma=gamma).fit(X, y)
    plt.plot(line, svr.predict(line), label='SVR gamma={}'.format(gamma))
plt.plot(X[:, 0], y, 'o', c='k')
plt.ylabel("Regression output")
insurance_df = pd.read_csv(INSURANCE_FILE_PATH)

# one-hot encode the columns that need it
insurance_df = pd.get_dummies(data=insurance_df,
                              columns=['sex', 'smoker', 'region'])

# input feature data
X = insurance_df.drop(['charges'], axis=1)

# build the polynomial features
polynomial_transformer = PolynomialFeatures(4)  # define a degree-4 transformer
polynomial_features = polynomial_transformer.fit_transform(
    X.values)  # transform into degree-4 features

# generate the new feature names
features = polynomial_transformer.get_feature_names(X.columns)

# polynomial input features
X = pd.DataFrame(polynomial_features, columns=features)

# target variable
y = insurance_df[['charges']]

# split into training and evaluation data
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=5)

# train the Lasso model
model = Lasso(alpha=1, max_iter=2000, normalize=True)
def process_data(self):
    # Training data
    app_train = pd.read_csv('./Downloads/application_train.csv')
    # Testing data features
    app_test = pd.read_csv('./Downloads/application_test.csv')

    # Number of unique classes in each object column
    app_train.select_dtypes('object').apply(pd.Series.nunique, axis=0)

    # Create a label encoder object
    le = LabelEncoder()
    le_count = 0
    # Iterate through the columns
    for col in app_train:
        if app_train[col].dtype == 'object':
            # If 2 or fewer unique categories
            if len(list(app_train[col].unique())) <= 2:
                # Train on the training data
                le.fit(app_train[col])
                # Transform both training and testing data
                app_train[col] = le.transform(app_train[col])
                app_test[col] = le.transform(app_test[col])
                # Keep track of how many columns were label encoded
                le_count += 1
    print('%d columns were label encoded.' % le_count)
    '''
    Encoding of discrete features falls into two cases:
    1. The values have no ordinal meaning, e.g. color: [red, blue] -- use
       one-hot encoding.
    2. The values have an ordinal meaning, e.g. size: [X, XL, XXL] -- use a
       numeric mapping such as {X: 1, XL: 2, XXL: 3}.
    get_dummies makes one-hot encoding of discrete features very convenient.
    '''
    # one-hot encoding of categorical variables
    app_train = pd.get_dummies(app_train)
    app_test = pd.get_dummies(app_test)
    print('Training Features shape: ', app_train.shape)
    print('Testing Features shape: ', app_test.shape)

    # ### Aligning Training and Testing Data
    train_labels = app_train['TARGET']
    # Align the training and testing data, keep only columns present in both
    # dataframes
    app_train, app_test = app_train.align(app_test, join='inner', axis=1)
    # Add the target back in
    app_train['TARGET'] = train_labels
    print('Training Features shape: ', app_train.shape)
    print('Testing Features shape: ', app_test.shape)

    anom = app_train[app_train['DAYS_EMPLOYED'] == 365243]
    non_anom = app_train[app_train['DAYS_EMPLOYED'] != 365243]
    print('The non-anomalies default on %0.2f%% of loans' %
          (100 * non_anom['TARGET'].mean()))
    print('The anomalies default on %0.2f%% of loans' %
          (100 * anom['TARGET'].mean()))
    print('There are %d anomalous days of employment' % len(anom))

    # Create an anomalous flag column
    app_train['DAYS_EMPLOYED_ANOM'] = app_train["DAYS_EMPLOYED"] == 365243
    # Replace the anomalous values with nan
    app_train['DAYS_EMPLOYED'].replace({365243: np.nan}, inplace=True)
    app_train['DAYS_EMPLOYED'].plot.hist(title='Days Employment Histogram')

    app_test['DAYS_EMPLOYED_ANOM'] = app_test["DAYS_EMPLOYED"] == 365243
    app_test["DAYS_EMPLOYED"].replace({365243: np.nan}, inplace=True)
    print('There are %d anomalies in the test data out of %d entries' %
          (app_test["DAYS_EMPLOYED_ANOM"].sum(), len(app_test)))

    correlations = app_train.corr()['TARGET'].sort_values()
    # Display correlations
    print('Most Positive Correlations:\n', correlations.tail(15))
    print('\nMost Negative Correlations:\n', correlations.head(15))

    # Find the correlation of the positive days since birth and target
    app_train['DAYS_BIRTH'] = abs(app_train['DAYS_BIRTH'])

    app_train.to_csv('./data/app_train.csv', index=False)
    app_test.to_csv('./data/app_test.csv', index=False)

    # Make a new dataframe for polynomial features
    poly_features = app_train[[
        'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH',
        'TARGET'
    ]]
    poly_features_test = app_test[[
        'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH'
    ]]

    imputer = Imputer(strategy='median')
    poly_target = poly_features['TARGET']
    poly_features = poly_features.drop(columns=['TARGET'])

    # Need to impute missing values
    poly_features = imputer.fit_transform(poly_features)
    poly_features_test = imputer.transform(poly_features_test)

    # Create the polynomial object with specified degree
    poly_transformer = PolynomialFeatures(degree=3)
    # Train the polynomial features
    poly_transformer.fit(poly_features)

    # Transform the features
    poly_features = poly_transformer.transform(poly_features)
    poly_features_test = poly_transformer.transform(poly_features_test)
    print('Polynomial Features shape: ', poly_features.shape)

    poly_features = pd.DataFrame(
        poly_features,
        columns=poly_transformer.get_feature_names(
            ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']))

    # Add in the target
    poly_features['TARGET'] = poly_target

    # Put test features into dataframe
    poly_features_test = pd.DataFrame(
        poly_features_test,
        columns=poly_transformer.get_feature_names(
            ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']))

    poly_features.to_csv('./data/poly_features.csv', index=False)
    poly_features_test.to_csv('./data/poly_features_test.csv', index=False)

    # Merge polynomial features into the training dataframe
    poly_features['SK_ID_CURR'] = app_train['SK_ID_CURR']
    app_train_poly = app_train.merge(poly_features,
                                     on='SK_ID_CURR',
                                     how='left')

    # Merge polynomial features into the testing dataframe
    poly_features_test['SK_ID_CURR'] = app_test['SK_ID_CURR']
    app_test_poly = app_test.merge(poly_features_test,
                                   on='SK_ID_CURR',
                                   how='left')

    # Align the dataframes
    app_train_poly, app_test_poly = app_train_poly.align(app_test_poly,
                                                         join='inner',
                                                         axis=1)
    app_train_poly['TARGET'] = poly_target

    # Print out the new shapes
    print('Training data with polynomial features shape: ',
          app_train_poly.shape)
    print('Testing data with polynomial features shape: ',
          app_test_poly.shape)

    app_train_poly.to_csv('./data/app_train_poly.csv', index=False)
    app_test_poly.to_csv('./data/app_test_poly.csv', index=False)

    app_train_domain = app_train.copy()
    app_test_domain = app_test.copy()

    app_train_domain['CREDIT_INCOME_PERCENT'] = app_train_domain[
        'AMT_CREDIT'] / app_train_domain['AMT_INCOME_TOTAL']
    app_train_domain['ANNUITY_INCOME_PERCENT'] = app_train_domain[
        'AMT_ANNUITY'] / app_train_domain['AMT_INCOME_TOTAL']
    app_train_domain['CREDIT_TERM'] = app_train_domain[
        'AMT_ANNUITY'] / app_train_domain['AMT_CREDIT']
    app_train_domain['DAYS_EMPLOYED_PERCENT'] = app_train_domain[
        'DAYS_EMPLOYED'] / app_train_domain['DAYS_BIRTH']

    app_test_domain['CREDIT_INCOME_PERCENT'] = app_test_domain[
        'AMT_CREDIT'] / app_test_domain['AMT_INCOME_TOTAL']
    app_test_domain['ANNUITY_INCOME_PERCENT'] = app_test_domain[
        'AMT_ANNUITY'] / app_test_domain['AMT_INCOME_TOTAL']
    app_test_domain['CREDIT_TERM'] = app_test_domain[
        'AMT_ANNUITY'] / app_test_domain['AMT_CREDIT']
    app_test_domain['DAYS_EMPLOYED_PERCENT'] = app_test_domain[
        'DAYS_EMPLOYED'] / app_test_domain['DAYS_BIRTH']

    app_train_domain.to_csv('./data/app_train_domain.csv', index=False)
    app_test_domain.to_csv('./data/app_test_domain.csv', index=False)
print(train.describe())
train = train.drop(['TenantHasSubscription'], axis=1)
test = test.drop(['TenantHasSubscription'], axis=1)
colnames = train.columns.values
train.describe().to_csv(data_dir + 'data-description.csv')

print('going through the columns to find out if they have missing values:')
for n in colnames:
    if any(pd.isna(train[n])):
        print(n)

# adding interaction terms:
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
train_transformed = poly.fit_transform(train[colnames[1:-1]])
print('Shape of train after adding interactions= ', train_transformed.shape)
feat_names = poly.get_feature_names(colnames[1:-1])
# join multi-token interaction names with '-' so each name is a single token
feat_names = [
    '-'.join(name.split()) if len(name.split()) > 1 else name
    for name in feat_names
]
train_transformed = pd.DataFrame(train_transformed, columns=feat_names)
train_transformed['OMSTenantId'] = train['OMSTenantId']
train_transformed['Label'] = train['Label']
train_transformed = train_transformed[['OMSTenantId'] + feat_names +
                                      ['Label']]

poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
test_transformed = poly.fit_transform(test[colnames[1:-1]])
print('Shape of test after adding interactions= ', test_transformed.shape)
feat_names = poly.get_feature_names(colnames[1:-1])
def clean_data(dataframe):
    survival = dataframe.pop('Survived')  # Pop Survived column for now.
    dataframe['Sex'] = dataframe['Sex'].map({
        'female': 1,
        'male': 0
    })  # Turn gender into a binary value.
    temp_embarked = pd.get_dummies(
        dataframe['Embarked']
    )  # Split port of embarkation into separate feature columns.
    dataframe = pd.concat([dataframe, temp_embarked], axis=1)

    # Replace NaN age values with the average age of their Pclass.
    p1_avg = dataframe[dataframe['Pclass'] == 1]['Age'].mean()
    p2_avg = dataframe[dataframe['Pclass'] == 2]['Age'].mean()
    p3_avg = dataframe[dataframe['Pclass'] == 3]['Age'].mean()
    dataframe.loc[dataframe['Pclass'] == 1,
                  'Age'] = dataframe.loc[dataframe['Pclass'] == 1,
                                         'Age'].fillna(p1_avg)
    dataframe.loc[dataframe['Pclass'] == 2,
                  'Age'] = dataframe.loc[dataframe['Pclass'] == 2,
                                         'Age'].fillna(p2_avg)
    dataframe.loc[dataframe['Pclass'] == 3,
                  'Age'] = dataframe.loc[dataframe['Pclass'] == 3,
                                         'Age'].fillna(p3_avg)

    # Add adult/child data.
    dataframe['Adult'] = [1 if age >= 18 else 0 for age in dataframe['Age']]
    dataframe['Old'] = [
        1 if age >= (dataframe['Age'].mean()) else 0
        for age in dataframe['Age']
    ]

    # Fill in missing Fare values w/ column mean, get FarePerPerson.
    dataframe.loc[dataframe['Fare'].isnull(),
                  'Fare'] = dataframe.loc[dataframe['Fare'].isnull(),
                                          'Fare'].fillna(
                                              dataframe['Fare'].mean())
    dataframe['FarePerPerson'] = dataframe['Fare'] / (dataframe['SibSp'] +
                                                      dataframe['Parch'] + 1)

    # Get titles from the Name field.
    unique_titles = list(
        set(
            re.search('^[^,]+, ([\w\s]+)\.', name).group(1)
            for name in dataframe['Name']))
    unique_titles.sort()
    title_dict = {}
    for i, name in enumerate(unique_titles):
        title_dict[name] = i
    dataframe['Title'] = [
        title_dict[re.search('^[^,]+, ([\w\s]+)\.', name).group(1)]
        for name in dataframe['Name']
    ]

    # Get surnames from the Name field, exclude adult men.
    all_surnames = [name.split(',')[0] for name in dataframe['Name']]
    unique_surnames = list(set(all_surnames))
    unique_surnames.sort()
    surname_dict = {}
    for i, name in enumerate(unique_surnames):
        surname_dict[name] = [i + 1, 0]
    dataframe['SurnameWomenChildren'] = [
        surname_dict[name][0] for name in all_surnames
    ]
    dataframe.loc[(dataframe['Adult'] == 1) & (dataframe['Sex'] == 0),
                  'SurnameWomenChildren'] = 0

    # Identify single passengers.
    for name in all_surnames:
        surname_dict[name] = [
            surname_dict[name][0], surname_dict[name][1] + 1
        ]
    single_dict = {}
    for name in surname_dict:
        if surname_dict[name][1] == 1:
            single_dict[name] = 1
        else:
            single_dict[name] = 0
    dataframe['Single'] = [single_dict[name] for name in all_surnames]

    # Add explicit 1/0 to average family survival for women/child groups.
    dataframe['WomenChildGroupSurvival'] = -3
    dataframe['Surname'] = all_surnames
    dataframe = pd.concat([survival, dataframe], axis=1)
    for name in unique_surnames:
        avg_survival = dataframe.loc[(dataframe['Surname'] == name)
                                     & ~((dataframe['Adult'] == 1) &
                                         (dataframe['Sex'] == 0)),
                                     'Survived'].dropna().mean()
        if pd.isnull(avg_survival):
            avg_survival = 0.0
        dataframe.loc[(dataframe['Surname'] == name)
                      & ~((dataframe['Adult'] == 1) &
                          (dataframe['Sex'] == 0)),
                      'WomenChildGroupSurvival'] = avg_survival
    dataframe.loc[dataframe['Single'] == 1,
                  'WomenChildGroupSurvival'] = -1  # Single passengers.
    dataframe.loc[(dataframe['Adult'] == 1) & (dataframe['Sex'] == 0),
                  'WomenChildGroupSurvival'] = -2  # Adult men.
    dataframe = dataframe.drop(['Survived', 'Surname'], axis=1)

    # Cabin data.
    # dataframe['Cabin'] = [1 if len(name) > 0 else 0 for name in dataframe['Cabin'].fillna('')]
    # dataframe['Cabin'] = [(ord(c[0].lower()) - 96) for c in dataframe['Cabin'].fillna('U')]

    # Remove data items from the frame that are not numerical.
    dataframe = dataframe.drop(['Name', 'Ticket', 'Cabin', 'Embarked'],
                               axis=1)

    # Add polynomial features.
    pass_id = dataframe.pop('PassengerId')
    poly = PolynomialFeatures(2)
    temp = poly.fit_transform(dataframe)
    poly_header = poly.get_feature_names(dataframe.columns)
    dataframe = pd.DataFrame(data=temp,
                             index=dataframe.index,
                             columns=poly_header)
    dataframe = pd.concat([pass_id, dataframe], axis=1)

    # Perform feature scaling.
    scaler = StandardScaler()
    pass_id = dataframe.pop('PassengerId')
    dataframe[dataframe.columns] = scaler.fit_transform(
        dataframe[dataframe.columns])
    dataframe = pd.concat([pass_id, dataframe], axis=1)

    # Drop features that have the same value for all rows.
    cols = list(dataframe)
    nunique = dataframe.apply(pd.Series.nunique)
    cols_to_drop = nunique[nunique == 1].index
    dataframe = dataframe.drop(cols_to_drop, axis=1)

    # Add survival back in.
    dataframe = pd.concat([survival, dataframe], axis=1)

    # Return data.
    return (dataframe)
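# The expand-then-scale steps above are a common enough pattern that they can
# be packaged as a scikit-learn Pipeline. A minimal sketch (the variable names
# X_train / X_test are illustrative, not taken from the snippet above): the
# pipeline keeps the fitted polynomial and scaling parameters tied to the
# training data, so the test set never influences them.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

poly_scale = Pipeline([
    ("poly", PolynomialFeatures(degree=2)),
    ("scale", StandardScaler()),
])
# X_train_ps = poly_scale.fit_transform(X_train)  # fit on train only
# X_test_ps = poly_scale.transform(X_test)        # transform-only on test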
# Section 2: Get fake data in the correct format
X = la.transpose([X])
Y = la.transpose([Y])

# Section 3: Pure Python Tools Fit
poly_pp = ml.Poly_Features_Pure_Py(order=2)
Xp = poly_pp.fit_transform(X)
print('PP Feature Names:', poly_pp.get_feature_names())
ls_pp = ml.Least_Squares(tol=2, add_ones_column=False)
ls_pp.fit(Xp, Y)
print()

# Section 4: SciKit Learn Fit
poly_sk = PolynomialFeatures(degree=2)
X_poly = poly_sk.fit_transform(X)
print('SK Feature Names:', poly_sk.get_feature_names())
ls_sk = LinearRegression()
ls_sk.fit(X_poly, Y)
print()

# Section 5: Coefficients Comparison
tmp_ls_pp_coefs = sorted(ls_pp.coefs)
rounded_ls_pp_coefs = [
    round(x, 8) + 0 for x in la.transpose(tmp_ls_pp_coefs)[0]
]
print('PurePy LS coefficients:', rounded_ls_pp_coefs)

tmp_ls_sk_coefs = ls_sk.intercept_.tolist() + ls_sk.coef_[0][1:].tolist()
tmp_ls_sk_coefs = sorted(tmp_ls_sk_coefs)
rounded_ls_sk_coefs = [round(x, 8) + 0 for x in tmp_ls_sk_coefs]
print('SKLearn LS coefficients:', rounded_ls_sk_coefs, '\n')
# In[84]:

from sklearn.preprocessing import PolynomialFeatures

# fit on the scaled training matrix, since the scaled matrices are what get
# transformed below
poly = PolynomialFeatures(degree=2).fit(X_train_transformed_scaled)
X_train_poly = poly.transform(X_train_transformed_scaled)
X_test_poly = poly.transform(X_test_transformed_scaled)

# In[68]:

# Debug
print(poly.get_feature_names())

# In[77]:

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

## Get score using original model
logreg = LogisticRegression(C=1)
logreg.fit(X_train, y_train)
scores = cross_val_score(logreg, X_train, y_train, cv=10)
print('CV accuracy (original): %.3f +/- %.3f' %
      (np.mean(scores), np.std(scores)))
highest_score = np.mean(scores)
if DIVISION:
    X_recip = (1 / X).copy()
    X_recip.columns = ["recip_" + str(c) for c in X_recip.columns]
    X_recip = X_recip.replace([np.inf, -np.inf], np.nan).dropna(axis=1)
    X = pd.concat([X, X_recip], axis=1)
    del X_recip

# determine if we are building a classifier model
classifier = np.all(np.unique(Y.to_numpy()) == [0, 1])
outputs = Y.shape[1]

# add 2nd order polynomial features to X
poly = PolynomialFeatures(2, include_bias=False)
x_columns = X.columns
X = pd.DataFrame(poly.fit_transform(X))
X.columns = poly.get_feature_names(x_columns)

# set up the model
if classifier:
    selector = RFE(RandomForestClassifier(n_estimators=50,
                                          max_depth=14,
                                          min_samples_leaf=5,
                                          max_features="sqrt",
                                          random_state=42,
                                          class_weight="balanced_subsample",
                                          n_jobs=1),
                   step=0.05,
                   verbose=1)
else:
    selector = RFE(RandomForestRegressor(n_estimators=50,
                                         max_depth=14,
print(features)

poly = PolynomialFeatures(poly_degree)
Y = np.array(y_id)
reg_u = np.full(X_deg, avg_u_id)
reg_y = np.full(AR_deg, avg_y_id)
PHI = []
for i in tqdm(range(ID_LENGHT)):
    if i != 0:
        reg_y = np.append(reg_y, Y[i])[1:]
        reg_u = np.append(reg_u, u_id[i])[1:]
    regressors = np.append(reg_u, reg_y)
    PHI.append(poly.fit_transform([regressors])[0])
PHI = np.array(PHI)
regressor_terms = poly.get_feature_names(features)
print("Regressors: ", regressor_terms)

# FROE 2
poly = PolynomialFeatures(poly_degree)
Y_val = np.array(y_val)
reg_u = np.full(X_deg, avg_u_val)
reg_y = np.full(AR_deg, avg_y_val)
PHI_val = []
for i in tqdm(range(VAL_LENGHT)):
    if i != 0:
        reg_y = np.append(reg_y, Y_val[i])[1:]
        reg_u = np.append(reg_u, u_val[i])[1:]
    regressors = np.append(reg_u, reg_y)
    PHI_val.append(poly.fit_transform([regressors])[0])
def analysis_7(df_Coredata):
    """ Multidimensional polynomial model """
    # https://www.jeremyjordan.me/polynomial-regression/
    X = df_Coredata[['d', 'e', 'f', 'g', 'i']]
    y = df_Coredata['j']

    # set the plot style
    sns.set(style='whitegrid', context='notebook')

    # plot pairwise relationships between the variables
    # sns.pairplot(df_Coredata)
    # plt.show()

    # X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    # lr = linear_model.LinearRegression().fit(X_train, y_train)
    # print("Training set score: {:.2f}".format(lr.score(X_train, y_train)))
    # print("Test set score: {:.2f}".format(lr.score(X_test, y_test)))

    ### Rescale the data
    # standardization
    std_Scaler = StandardScaler()
    data_std = std_Scaler.fit_transform(X)

    mmx_Scaler = MinMaxScaler()
    X_scaled = mmx_Scaler.fit_transform(X)
    # X_test_scaled = scaler.transform(X_test)
    # print(X_train_scaled)

    poly = PolynomialFeatures(degree=2).fit(data_std)
    print(poly.get_feature_names())