def test_imputation_error_invalid_strategy(strategy):
    X = np.ones((3, 5))
    X[0, 0] = np.nan
    with pytest.raises(ValueError, match=str(strategy)):
        imputer = SimpleImputer(strategy=strategy)
        imputer.fit_transform(X)
def test_imputation_deletion_warning(strategy):
    X = np.ones((3, 5))
    X[:, 0] = np.nan
    with pytest.warns(UserWarning, match="Deleting"):
        imputer = SimpleImputer(strategy=strategy, verbose=True)
        imputer.fit_transform(X)
def test_imputation_mean_median_error_invalid_type(strategy, dtype):
    X = np.array([["a", "b", 3],
                  [4, "e", 6],
                  ["g", "h", 9]], dtype=dtype)
    with pytest.raises(ValueError, match="non-numeric data"):
        imputer = SimpleImputer(strategy=strategy)
        imputer.fit_transform(X)
def test_imputation_constant_error_invalid_type(X_data, missing_value):
    # Verify that exceptions are raised on invalid fill_value type
    X = np.full((3, 5), X_data, dtype=float)
    X[0, 0] = missing_value
    with pytest.raises(ValueError, match="imputing numerical"):
        imputer = SimpleImputer(missing_values=missing_value,
                                strategy="constant", fill_value="x")
        imputer.fit_transform(X)
def test_imputation_shape():
    # Verify the shapes of the imputed matrix for different strategies.
    X = np.random.randn(10, 2)
    X[::2] = np.nan

    for strategy in ['mean', 'median', 'most_frequent']:
        imputer = SimpleImputer(strategy=strategy)
        X_imputed = imputer.fit_transform(X)
        assert_equal(X_imputed.shape, (10, 2))
        X_imputed = imputer.fit_transform(sparse.csr_matrix(X))
        assert_equal(X_imputed.shape, (10, 2))
def test_imputation_shape():
    # Verify the shapes of the imputed matrix for different strategies.
    X = np.random.randn(10, 2)
    X[::2] = np.nan

    for strategy in ['mean', 'median', 'most_frequent', "constant"]:
        imputer = SimpleImputer(strategy=strategy)
        X_imputed = imputer.fit_transform(sparse.csr_matrix(X))
        assert X_imputed.shape == (10, 2)
        X_imputed = imputer.fit_transform(X)
        assert X_imputed.shape == (10, 2)

        iterative_imputer = IterativeImputer(initial_strategy=strategy)
        X_imputed = iterative_imputer.fit_transform(X)
        assert X_imputed.shape == (10, 2)
def test_imputation_add_indicator(marker):
    X = np.array([
        [marker, 1, 5, marker, 1],
        [2, marker, 1, marker, 2],
        [6, 3, marker, marker, 3],
        [1, 2, 9, marker, 4]
    ])
    X_true = np.array([
        [3., 1., 5., 1., 1., 0., 0., 1.],
        [2., 2., 1., 2., 0., 1., 0., 1.],
        [6., 3., 5., 3., 0., 0., 1., 1.],
        [1., 2., 9., 4., 0., 0., 0., 1.]
    ])

    imputer = SimpleImputer(missing_values=marker, add_indicator=True)
    X_trans = imputer.fit_transform(X)

    assert_allclose(X_trans, X_true)
    assert_array_equal(imputer.indicator_.features_, np.array([0, 1, 2, 3]))
def test_simple_imputation_add_indicator_sparse_matrix(arr_type):
    X_sparse = arr_type([
        [np.nan, 1, 5],
        [2, np.nan, 1],
        [6, 3, np.nan],
        [1, 2, 9]
    ])
    X_true = np.array([
        [3., 1., 5., 1., 0., 0.],
        [2., 2., 1., 0., 1., 0.],
        [6., 3., 5., 0., 0., 1.],
        [1., 2., 9., 0., 0., 0.],
    ])

    imputer = SimpleImputer(missing_values=np.nan, add_indicator=True)
    X_trans = imputer.fit_transform(X_sparse)

    assert sparse.issparse(X_trans)
    assert X_trans.shape == X_true.shape
    assert_allclose(X_trans.toarray(), X_true)
def __call__(self, data):
    from Orange.data.sql.table import SqlTable
    if isinstance(data, SqlTable):
        return Impute()(data)
    imputer = SimpleImputer(strategy=self.strategy)
    X = imputer.fit_transform(data.X)
    # Create new variables with appropriate `compute_value`, but
    # drop the ones which do not have valid `imputer.statistics_`
    # (i.e. all-NaN columns). `sklearn.preprocessing.Imputer` already
    # drops them from the transformed X.
    features = [impute.Average()(data, var, value)
                for var, value in zip(data.domain.attributes, imputer.statistics_)
                if not np.isnan(value)]
    assert X.shape[1] == len(features)
    domain = Orange.data.Domain(features, data.domain.class_vars,
                                data.domain.metas)
    new_data = data.transform(domain)
    new_data.X = X
    return new_data
def test_imputation_constant_integer():
    # Test imputation using the constant strategy on integers
    X = np.array([
        [-1, 2, 3, -1],
        [4, -1, 5, -1],
        [6, 7, -1, -1],
        [8, 9, 0, -1]
    ])
    X_true = np.array([
        [0, 2, 3, 0],
        [4, 0, 5, 0],
        [6, 7, 0, 0],
        [8, 9, 0, 0]
    ])

    imputer = SimpleImputer(missing_values=-1, strategy="constant",
                            fill_value=0)
    X_trans = imputer.fit_transform(X)
    assert_array_equal(X_trans, X_true)
def test_imputation_constant_object(marker):
    # Test imputation using the constant strategy on objects
    X = np.array([
        [marker, "a", "b", marker],
        ["c", marker, "d", marker],
        ["e", "f", marker, marker],
        ["g", "h", "i", marker]
    ], dtype=object)
    X_true = np.array([
        ["missing", "a", "b", "missing"],
        ["c", "missing", "d", "missing"],
        ["e", "f", "missing", "missing"],
        ["g", "h", "i", "missing"]
    ], dtype=object)

    imputer = SimpleImputer(missing_values=marker, strategy="constant",
                            fill_value="missing")
    X_trans = imputer.fit_transform(X)
    assert_array_equal(X_trans, X_true)
def test_imputation_constant_pandas(dtype):
    # Test imputation using the constant strategy on pandas df
    pd = pytest.importorskip("pandas")

    f = io.StringIO("Cat1,Cat2,Cat3,Cat4\n"
                    ",i,x,\n"
                    "a,,y,\n"
                    "a,j,,\n"
                    "b,j,x,")
    df = pd.read_csv(f, dtype=dtype)

    X_true = np.array([
        ["missing_value", "i", "x", "missing_value"],
        ["a", "missing_value", "y", "missing_value"],
        ["a", "j", "missing_value", "missing_value"],
        ["b", "j", "x", "missing_value"]
    ], dtype=object)

    imputer = SimpleImputer(strategy="constant")
    X_trans = imputer.fit_transform(df)
    assert_array_equal(X_trans, X_true)
def test_imputation_constant_float(array_constructor):
    # Test imputation using the constant strategy on floats
    X = np.array([
        [np.nan, 1.1, 0, np.nan],
        [1.2, np.nan, 1.3, np.nan],
        [0, 0, np.nan, np.nan],
        [1.4, 1.5, 0, np.nan]
    ])
    X_true = np.array([
        [-1, 1.1, 0, -1],
        [1.2, -1, 1.3, -1],
        [0, 0, -1, -1],
        [1.4, 1.5, 0, -1]
    ])

    X = array_constructor(X)
    X_true = array_constructor(X_true)

    imputer = SimpleImputer(strategy="constant", fill_value=-1)
    X_trans = imputer.fit_transform(X)
    assert_allclose_dense_sparse(X_trans, X_true)
    # print_histogram(d)
    return results


x_train = pd.read_csv("X_train.csv")
y_train = pd.read_csv("y_train.csv")
y_train = y_train.drop('id', axis=1)
x_train = x_train.drop('id', axis=1)

imputer = SimpleImputer(missing_values=numpy.nan, strategy='median')
# est = ExtraTreesRegressor(n_estimators=10, random_state=42, max_features='sqrt', n_jobs=-1, verbose=0)
# imputer = IterativeImputer(estimator=est, max_iter=10, tol=0.001, n_nearest_features=100,
#                            initial_strategy='median', imputation_order='ascending',
#                            verbose=2, random_state=0)
x_train_filled = imputer.fit_transform(x_train)
x_train = pd.DataFrame(x_train_filled)

results = [
    44, 108, 137, 268, 332, 341, 461, 502, 580, 606,
    664, 797, 833, 839, 882, 1007, 1018, 1148
]  # this is the result after 1000 isolation forests

# 3. Scaling
scaler = RobustScaler()
x_train_new = scaler.fit_transform(x_train)
cols = list(x_train.columns.values)
x_train = pd.DataFrame(data=x_train_new, columns=cols, index=x_train.index)
# results = detect_outliers(x_train)
import numpy as np
import pandas as pd

print("Loading the dataset...")
baseDeDados = pd.read_csv('admission.csv', delimiter=';')
X = baseDeDados.iloc[:, :-1].values
y = baseDeDados.iloc[:, -1].values
print("ok!")

print("Filling in missing data...")
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
# assign the imputed values back into X (the original overwrote `imputer` instead)
X[:, 1:] = imputer.fit_transform(X[:, 1:])
print("ok!")

print("Computing label encoding...")
from sklearn.preprocessing import LabelEncoder
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
X = X[:, 1:]
D = pd.get_dummies(X[:, 0])
X = np.insert(X, 0, D.values, axis=1)
print("ok!")

print("Splitting into training and test sets...")
from sklearn.model_selection import train_test_split
XTrain, XTest, yTrain, yTest = train_test_split(X, y, test_size=0.2)
print("ok!")
# suppress the DataConversionWarning
df = df.drop(['branch_id', 'seller_code', 'item_total_price', 'register_date'], axis=1)

# Filling in missing values.
# The column 'is_churn' is used as the label of the classes (or dependent variable Y),
# so this is a binary classification problem and the mean strategy would not be suitable
# because it would create a third class.
# The most-frequent strategy is adopted due to the imbalance between the classes,
# which makes the probability of filling in the values correctly extremely high.
# Another possibility would be using clustering on the other features of the vectors with
# non-missing values, so as to create two clusters and then predict in which cluster the
# vectors with missing values fall; that cluster could be used as the missing value.
# But this would require handling the categorical features, normalizing all features and
# finding an appropriate clustering method. That task could even involve developing a
# specific distance metric, since the categorical features do not lie in a Euclidean space.
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
df.loc[:, :] = imputer.fit_transform(df)

# Feature extraction.
X = compute_features(df)
# Keep feature names to identify the likely reasons later on.
feature_names = [str(name).replace('_', ' ') for name in X.columns.tolist()][1:]
# Separate features and labels.
y = X['is_churn'].to_numpy()
X = X.drop(['is_churn'], axis=1).to_numpy()
# Split train and test sets, 25% for test.
X_train, X_test, y_train, y_test = train_test_split(X, y,
print(i)
print(min)
print(max)
num_data.loc[num_data[i] < min, i] = np.nan
num_data.loc[num_data[i] > max, i] = np.nan

import matplotlib.pyplot as plt
plt.hist(num_data.avg_training_score)

mis_val(num_data)

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
num_data = pd.DataFrame(imputer.fit_transform(num_data), columns=num_cols)


# FINDING highly correlated variables
def corr_matrix(data):
    # extract numeric data
    num_data = data.select_dtypes('float64').copy()
    # Create correlation matrix
    corr_matrix = num_data.corr().abs()
    # Select upper triangle of correlation matrix (np.bool is removed in newer NumPy, use bool)
    corr_mat = corr_matrix.where(
        np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    upper = corr_matrix.where(
        np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    # Find index of feature columns with correlation greater than 0.95
    to_drop = [
# then one-hot encode categorical variables
if args.dataset == "flchain":
    df = pd.read_csv("./data/surv/flchain.csv")
    E = df["death"]
    Y = df["futime"]
    X = (df >> drop(X.death, X.futime, X.chapter) >> mutate(
        mgus=X.mgus.astype(float), age=X.age.astype(float)))
    X = X[Y > 0]
    E = E[Y > 0]
    Y = Y[Y > 0]
    # Y = np.c_[np.log(T) - np.mean(np.log(T)), C]
    X_num = X.select_dtypes(include=["float"])
    X_cat = X.select_dtypes(exclude=["float"])
    imputer = SimpleImputer(strategy="median")
    X_num = imputer.fit_transform(X_num.values)
    imputer = SimpleImputer(strategy="most_frequent")
    X_cat = imputer.fit_transform(X_cat.values)
    encoder = OneHotEncoder(sparse=False)
    X_cat = encoder.fit_transform(X_cat)
    X = np.c_[X_num, X_cat]
elif args.dataset == "support":
    df = pd.read_csv("./data/surv/support2.csv")
    df = df.rename(columns={"d.time": "dtime"})
    Y = df["dtime"]
    E = df["death"]
    # Y = np.c_[np.log(Y) - np.mean(np.log(Y)), C]
    df >>= drop(
        X.dtime, X.death,
y = train['Survived']

# Select features
features = ['Pclass', 'Age', 'Fare', 'SibSp', 'Parch']
X = train[features]
X_test = test[features]
# -

# ## Missing Values

# +
# In order to use 'Age' as a feature we need to impute missing values
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='median')
X_imp = pd.DataFrame(imputer.fit_transform(X))
X_test_imp = pd.DataFrame(imputer.transform(X_test))
X_imp.columns = X.columns
X_test_imp.columns = X_test.columns
X = X_imp
X_test = X_test_imp
# -

X

X_test

# ## Scaling
# data.index = data.TIME
data = data.drop(columns=['TIME'])
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
d = {}
by_device = data.groupby('DEVICENAME')
for device, device_df in by_device:
    print(device)
    by_day = device_df.groupby('Day')
    for day, day_df in by_day:
        if 24 > len(day_df) > 12:
            day_df = day_df.sort_values('SSCPUIDLE').drop_duplicates(
                subset=['Day', 'Hour'], keep='last').sort_values('Hour')
            day_df.index = day_df.Hour
            new_df = pd.DataFrame(index=list(range(0, 24)), columns=day_df.columns)
            new_df.update(day_df)
            new_df.DEVICENAME = device
            new_df.Day = day
            new_df.Hour = new_df.index
            new_df.SSCPUIDLE = imp_mean.fit_transform(new_df.SSCPUIDLE.values.reshape(-1, 1))
            d.update({device: [device, day, new_df.SSCPUIDLE.values]})
        else:
            continue

df = pd.DataFrame.from_dict(d, orient='index',
                            columns=['Device', 'Day', 'TimeSeries']).reset_index(drop=True)
df['Location'] = df.Device.str.extract(r'([^-]*).*')
ts_data = df.reindex(['Location', 'Device', 'Day', 'TimeSeries'], axis=1)
joblib.dump([data, ts_data], "data/data.job")
# Data Preprocessing Template

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset (raw string so the backslashes are not treated as escapes)
dataset = pd.read_csv(r'c:\src\ML-AtoZ\Part 1 - Data Preprocessing\Data.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 3].values

# Cleaning Data - Using the new functions.
from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, 1:3] = imp_mean.fit_transform(X[:, 1:3])

# Encode Category Data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
le_X = LabelEncoder()
X[:, 0] = le_X.fit_transform(X[:, 0])
# categorical_features was removed in newer scikit-learn; the ColumnTransformer below is the replacement
ohe_X = OneHotEncoder(categorical_features=[0])
X = ohe_X.fit_transform(X).toarray()

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
process = make_column_transformer((OneHotEncoder(), [0]), remainder="passthrough")
A = process.fit_transform(X)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from io import StringIO

csv_data = '''A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
10.0,11.0,12.0,'''
df = pd.read_csv(StringIO(csv_data))

from sklearn.impute import SimpleImputer
imr = SimpleImputer(strategy='constant', fill_value=0)
imr.fit(df)
imr.fit_transform(df)
imputed_data = imr.transform(df.values)
print(imputed_data)
train_X = data_set["train_X"].values
train_y = data_set["train_Y"].values
for i in range(len(data_set["train_X"])):
    if train_y[i] < 0:
        # note: np.delete returns a new array, so these calls do not modify train_y/train_X in place
        np.delete(train_y, i, 0)
        np.delete(train_X, i, 0)

X_train, X_test, y_train, y_test = train_test_split(
    train_X, train_y, test_size=0.1, random_state=0)

onehot_cats = list()
for (i, _) in ONEHOT_CATEGORICAL_FEATURE_KEYS:
    cat_impute = SimpleImputer(strategy='constant')
    X_train[:, i] = cat_impute.fit_transform(X_train[:, i].reshape(-1, 1)).reshape(-1)
    onehot = OneHotEncoder()
    onehot_model = onehot.fit(X_train[:, i].reshape(-1, 1))
    onehot_cats.append(onehot_model.categories_)

tmp_l = list()
for l in onehot_cats:
    tmp_l.append(l[0].tolist())

X_train = preprocessing(X_train, categories=tmp_l)
X_test = preprocessing(X_test, categories=tmp_l)
X_pred = preprocessing(data_set["pred_X"].values, categories=tmp_l)

poly = PolynomialFeatures(degree=2)
X_train = poly.fit_transform(X_train)
X_test = poly.fit_transform(X_test)
def init_gmm(features, n_components):
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    init_x = imp.fit_transform(features)
    gmm = GaussianMixture(n_components=n_components,
                          covariance_type='diag').fit(init_x)
    return gmm
### Approach 1: drop columns with missing values

# Get names of columns with missing values
cols_with_missing = [col for col in X.columns if X[col].isnull().any()]

# Drop columns in training and validation data
reduced_X_train = X_train.drop(cols_with_missing, axis=1)
reduced_X_valid = X_valid.drop(cols_with_missing, axis=1)

# Measure quality of approach 1
print("MAE (drop columns with missing values):")
print(score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid))

### Approach 2: imputation

# Imputation
my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid))

# Imputation removes column names; put them back
imputed_X_train.columns = X_train.columns
imputed_X_valid.columns = X_valid.columns

# Measure quality of approach 2
print("MAE (imputation):")
print(score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid))

### Approach 3: an extension to imputation

# Make copies to avoid changing original data (when imputing)
X_train_plus = X_train.copy()
X_valid_plus = X_valid.copy()
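# A minimal sketch of how "Approach 3" is usually completed (hedged: it assumes the
# cols_with_missing, score_dataset, X_train, X_valid, y_train and y_valid objects from the
# surrounding code): mark which rows were imputed with extra boolean columns, then impute.
for col in cols_with_missing:
    X_train_plus[col + '_was_missing'] = X_train_plus[col].isnull()
    X_valid_plus[col + '_was_missing'] = X_valid_plus[col].isnull()

# Impute, then restore the column names that SimpleImputer drops
my_imputer = SimpleImputer()
imputed_X_train_plus = pd.DataFrame(my_imputer.fit_transform(X_train_plus))
imputed_X_valid_plus = pd.DataFrame(my_imputer.transform(X_valid_plus))
imputed_X_train_plus.columns = X_train_plus.columns
imputed_X_valid_plus.columns = X_valid_plus.columns

# Measure quality of approach 3
print("MAE (an extension to imputation):")
print(score_dataset(imputed_X_train_plus, imputed_X_valid_plus, y_train, y_valid))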
# In[ ]:

# let's turn sex into a numerical feature instead of categorical
from sklearn.preprocessing import LabelEncoder
train_data['Sex'] = LabelEncoder().fit_transform(train_data['Sex'])

# In[ ]:

# handling missing values
# print(train_data.isnull().sum())
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
train_data['Age'] = imp.fit_transform(train_data['Age'].values.reshape(-1, 1)).reshape(-1)
print(train_data.isnull().sum())

# In[ ]:

# Find correlations with the target and sort
correlations = train_data.corr()['Survived'].sort_values()

# Display correlations
print('Correlations: \n', correlations)

# In[ ]:
p1d = combine[((combine["Pclass"] == 3) & (combine["Embarked"] == "S"))]
combine["Fare"][1043] = p1d['Fare'].median()

ddf = combine.copy()
ddf.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'Family', 'surname', 'Survived'],
         axis=1, inplace=True)

from sklearn.impute import SimpleImputer
imp = SimpleImputer(strategy="most_frequent")
X = imp.fit_transform(ddf)
X = pd.DataFrame(X).copy()
X.columns = ["Age", "Embarked", "Fare", "Parch", "Pclass", "Sex", "SibSp",
             "title", "Fsize", "FsizeD", "Deck"]

combine["Age"] = X["Age"]
combine["IsAdult"] = np.where(combine['Age'] < 18, '0', '1')
combine["IsMother"] = np.where((combine['Sex'] == "female") & (combine["Parch"] > 0) &
                               (combine["Age"] > 18) & (combine["title"] != "Miss"),
                               '1', '0')
def Train(gender, age, educ, SES, MMSE, CDR, eTIV, nWBV, ASF):
    import pandas as pd
    import numpy as np
    import seaborn as sns

    oasis_long = pd.read_csv('data\\oasis_longitudinal.csv')
    oasis_long = oasis_long.drop(columns=['Hand', 'MRI ID', 'MR Delay', 'Subject ID', 'Visit'])
    y = oasis_long['Group'].astype('category')
    X = oasis_long.iloc[:, 1:]
    # X1 = X
    X["M/F"].fillna("M", inplace=True)
    X["Age"].fillna(method='ffill', inplace=True)
    X["EDUC"].fillna(method='ffill', inplace=True)
    X["CDR"].fillna("0", inplace=True)
    X["eTIV"].fillna(method='ffill', inplace=True)

    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    X.iloc[:, 0] = le.fit_transform(X.iloc[:, 0])  # Female 0, Male 1

    from sklearn.impute import SimpleImputer
    imputer_SES = SimpleImputer(missing_values=np.nan, strategy='median')
    imputer_MMSE = SimpleImputer(missing_values=np.nan, strategy='mean')
    imputer_nWBV = SimpleImputer(missing_values=np.nan, strategy='mean')
    imputer_ASF = SimpleImputer(missing_values=np.nan, strategy='median')
    X.iloc[:, 3:4] = imputer_SES.fit_transform(X.iloc[:, 3:4])
    X.iloc[:, 4:5] = imputer_MMSE.fit_transform(X.iloc[:, 4:5])
    X.iloc[:, 7:8] = imputer_nWBV.fit_transform(X.iloc[:, 7:8])
    X.iloc[:, 8:] = imputer_ASF.fit_transform(X.iloc[:, 8:])

    '''
    from sklearn.preprocessing import StandardScaler
    sc = StandardScaler()
    X = sc.fit_transform(X)
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
    '''

    # def __Predict__(self, gender, age, educ, SES, MMSE, CDR, eTIV, nWBV, ASF):
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB
    from sklearn.svm import SVC
    from sklearn.model_selection import cross_val_score

    models_list = []
    models_list.append(('LOG', LogisticRegression()))
    models_list.append(('RFC', RandomForestClassifier()))
    models_list.append(('SVM', SVC(gamma='scale')))
    models_list.append(('NB', GaussianNB()))
    models_list.append(('KNN', KNeighborsClassifier()))

    results = []
    names = []
    accuracy_score = {}
    for name, model in models_list:
        cv_results = cross_val_score(estimator=model, X=X, y=y, cv=10,
                                     scoring='accuracy', n_jobs=-1)
        results.append(cv_results)
        names.append(name)
        # print("%s: %f " % (name, cv_results.mean()))
        accuracy_score[name] = cv_results.mean()
    # print(accuracy_score)

    Pred_Form1 = [[gender, age, educ, SES, MMSE, CDR, eTIV, nWBV, ASF]]
    # Pred_Form2 = [[0, 60, 16, 4, 30, 0, 1500, 0.8, 1]]
    max_key = max(accuracy_score, key=accuracy_score.get)
    # print(max_key)
    if max_key == 'RFC':
        from sklearn.ensemble import RandomForestClassifier
        # rf = RandomForestClassifier()
        rf = RandomForestClassifier(n_estimators=50, criterion='entropy',
                                    bootstrap=True, max_features='auto')
        rf.fit(X, y)
        y_pred1 = rf.predict(Pred_Form1)
        # print(y_pred1)
        return y_pred1
    elif max_key == 'LOG':
        # print("CheckPoint1")
        # LogReg
        from sklearn import linear_model
        log_reg = linear_model.LogisticRegression(penalty='l2')
        log_reg.fit(X, y)
        y_pred_log1 = log_reg.predict(Pred_Form1)
        # print(y_pred_log1)
        return y_pred_log1
    elif max_key == 'NB':
        # GAUBAS
        from sklearn.naive_bayes import GaussianNB
        GB = GaussianNB()
        GB.fit(X, y)
        y_pred_GB1 = GB.predict(Pred_Form1)
        return y_pred_GB1
def random_boruta(self):
    with open(self.result_folder + '/param_CB_{}.json'.format(self.epoch)) as f:
        dati = json.load(f)
    for data in dati:
        del data['value']
    cb_model = CatBoostClassifier(**data)
    cv = StratifiedKFold(n_splits=5, shuffle=True)
    for train_index, test_index in cv.split(self.X, self.y):
        X_train = self.X.iloc[lambda x: train_index]
        X_test = self.X.iloc[lambda x: test_index]
        y_train = np.take(self.y, train_index)
        y_test = np.take(self.y, test_index)
        median_imputer = SimpleImputer(missing_values=np.NaN, strategy='median')
        imputer = median_imputer.fit(X_train)
        vX_train = imputer.transform(X_train)
        imputertest = median_imputer.fit(X_test)
        vX_test = imputertest.transform(X_test)
        X_train = pd.DataFrame(vX_train, columns=X_train.columns, index=X_train.index)
        X_test = pd.DataFrame(vX_test, columns=X_test.columns, index=X_test.index)
        Feature_Selector = BorutaShap(model=cb_model,
                                      importance_measure='shap',
                                      percentile=90, pvalue=0.1,
                                      classification=True)
        Feature_Selector.fit(X_train, y_train, n_trials=500, random_state=0)
        Feature_Selector.TentativeRoughFix()
        Feature_Selector.plot(X_size=12, figsize=(12, 8), y_scale='log', which_features='all')
        Xstrain = Feature_Selector.Subset()
        selected = [x for x in Xstrain.columns]
        print('features selected', selected)
        v_test_X = median_imputer.fit_transform(self.X_test)
        test_X = pd.DataFrame(v_test_X, columns=self.X_test.columns, index=self.X_test.index)
        cb_model.fit(Xstrain, y_train)
        print('AUC')
        cb_model.fit(X_train, y_train)
        roc = roc_auc_score(y_test, cb_model.predict_proba(X_test)[:, 1])
        print(roc)
        print('AUC TEST')
        roc_test = roc_auc_score(self.y_test, cb_model.predict_proba(test_X)[:, 1])
        print(roc_test)
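# Note on the fold loop above: the median imputer is re-fitted on X_test and on self.X_test,
# so test-set medians leak into the transform. A minimal sketch of the usual pattern, assuming
# the same variable names as inside the loop: fit on the training fold only, transform both.
median_imputer = SimpleImputer(missing_values=np.NaN, strategy='median')
median_imputer.fit(X_train)  # learn the medians from the training fold only
X_train = pd.DataFrame(median_imputer.transform(X_train),
                       columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(median_imputer.transform(X_test),
                      columns=X_test.columns, index=X_test.index)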
    imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
elif configs['imputer'] == 'min':
    imputer = SimpleImputer(missing_values=np.nan, strategy='constant',
                            fill_value=configs['min_val'])
else:
    logger.warning("No imputer selected!")

# run grid search
if configs['metric'] == 'corr':
    rcv = GridSearchCV(reg, param_grid, n_jobs=100, cv=cv, scoring=corr_score, refit=True)
else:
    rcv = GridSearchCV(reg, param_grid, n_jobs=100, cv=cv, scoring=configs['metric'], refit=True)

if configs['imputer'].lower() == 'none':
    rcv.fit(X_train, y_train)
    y_pred = rcv.best_estimator_.predict(X_test)
else:
    rcv.fit(imputer.fit_transform(X_train), y_train)
    y_pred = rcv.best_estimator_.predict(imputer.transform(X_test))

# gather results
if 'sign' in configs:
    sign = configs['sign']
else:
    sign = -1 if configs['task'] == 'regression' else 1

if configs['task'].lower() == 'classification':
    test_auc = roc_auc_score(y_test, y_pred)
    score_dict = {'drug_id': drug_id, 'val_mae': sign * rcv.best_score_, 'test_score': test_auc}
else:
    test_mae = mean_absolute_error(y_test, y_pred)
    test_rmse = mean_squared_error(y_test, y_pred, squared=False)
    test_r2 = r2_score(y_test, y_pred)
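# An alternative sketch (not what the script above does): wrapping the imputer and the
# estimator in a Pipeline lets GridSearchCV re-fit the imputer inside every CV fold, so no
# fold statistics leak across splits. Assumes `reg`, `param_grid` (a plain dict), `cv`,
# `corr_score`, `X_train` and `y_train` from the surrounding code; the "reg__" prefix
# re-targets the grid at the wrapped estimator.
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)),
    ('reg', reg),
])
prefixed_grid = {'reg__' + key: values for key, values in param_grid.items()}
rcv = GridSearchCV(pipe, prefixed_grid, n_jobs=100, cv=cv, scoring=corr_score, refit=True)
rcv.fit(X_train, y_train)
y_pred = rcv.best_estimator_.predict(X_test)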
def nan_padding(data, columns):
    for column in columns:
        imputer = SimpleImputer()
        data[column] = imputer.fit_transform(data[column].values.reshape(-1, 1))
    return data
def function_q17(event):
    global screen, df
    df = pd.read_csv("DATA SET-2.csv")
    root = Toplevel(screen)
    big_frame = tk.Frame(root, bg='white', width='600', height='630', bd=4, relief=RIDGE)
    big_frame.place(x=50, y=60)
    w = 700
    h = 700
    ws = screen.winfo_screenwidth()
    hs = screen.winfo_screenheight()
    x = (ws / 2) - (w / 2)
    y = (hs / 2) - (h / 2)
    root.geometry("%dx%d+%d+%d" % (w, h, x, y))
    root.configure(background='white')

    df.drop(9148, axis=0, inplace=True)
    df.drop(10472, axis=0, inplace=True)
    # print(df['Installs'].head(5))
    df['Installs'] = df['Installs'].map(lambda x: x.rstrip('+'))
    df['Installs'] = df['Installs'].map(lambda x: ''.join(x.split(',')))
    df['Installs'] = pd.to_numeric(df['Installs'])

    # Data cleaning for "Size" column
    df['Size'] = df['Size'].map(lambda x: x.rstrip('M'))
    df['Size'] = df['Size'].map(lambda x: str(round((float(x.rstrip('k')) / 1024), 1))
                                if x[-1] == 'k' else x)
    df['Size'] = df['Size'].map(lambda x: np.nan if x.startswith('Varies') else x)
    df['Size'] = pd.to_numeric(df['Size'])

    # Replace "NaN" with mean
    imputer = SimpleImputer()
    df['Size'] = imputer.fit_transform(df[['Size']])
    df['Installs'] = imputer.fit_transform(df[['Installs']])

    # now creating the linear approximation
    # reshape converts the data into the 2-D format required by fit()
    x = df['Size'].values.reshape(-1, 1)
    y = df['Installs'].values.reshape(-1, 1)
    reg = LinearRegression()
    reg.fit(x, y)
    # reg.coef_ gives the slope, reg.intercept_ gives the intercept 'C'
    # print(reg.coef_)
    # print(reg.score(x, y))

    # now creating the prediction
    prediction = reg.predict(x)

    # now assessing efficiency using an R-squared (OLS) model
    x = df['Size']
    y = df['Installs']
    # statsmodels is used here; a constant term is added to the independent variable x
    x2 = sm.add_constant(x)
    # Ordinary least squares is the simplest and most common estimator, in which the two
    # betas are chosen to minimize the squared distance between prediction and observation
    est = sm.OLS(y, x2)
    est2 = est.fit()
    # print(est2.summary())

    figure3 = plt.Figure(figsize=(5, 4), dpi=100)
    ax3 = figure3.add_subplot(111)
    ax3.scatter(df['Size'], df['Installs'], color='y')
    ax3.plot(df['Size'], prediction, color='r')
    scatter_plot = FigureCanvasTkAgg(figure3, big_frame)
    scatter_plot.get_tk_widget().place(x=50, y=20)
    ax3.legend()
    ax3.set_xlabel("Size of the App")
    ax3.set_ylabel("Installs")
    ax3.set_title("Trend of Install")

    String = """ Conclusion : -
    Here we have applied Linear Regression to find the Trend.
    As we can observe from the above graph, there is a Positive Trend:
    an increase in the size of an App influences the number of installs."""
    tk.Label(big_frame, text=String, font=("Calibri", 13, 'italic'),
             fg='#ad023e', bg='white').place(x=10, y=450)
    root.mainloop()
def getPrediction(big_frame):
    global rating, size, installs, price, type, android, df
    df = pd.read_csv("DATA SET-2.csv")

    category = {
        'SPORTS': 0,
        'ENTERTAINMENT': 1,
        'SOCIAL': 2,
        'NEWS_AND_MAGAZINES': 3,
        'EVENTS': 4,
        'TRAVEL_AND_LOCAL': 5,
        'GAME': 6
    }
    for index in range(len(df['Category'])):
        if df['Category'][index] in category:
            continue
        else:
            df.drop(index, axis=0, inplace=True)
    df['Category'] = df['Category'].map(lambda x: category[x] if (x in category) else -1)

    dict_content_rating = {
        "Adults only 18+": 0,
        "Everyone": 1,
        "Everyone 10+": 2,
        "Mature 17+": 3,
        "Teen": 4
    }
    df['Content Rating NUM'] = df['Content Rating'].map(
        lambda x: dict_content_rating[x] if (x in dict_content_rating) else -1)

    # Data cleaning for "Size" column
    df['Size'] = df['Size'].map(lambda x: x.rstrip('M'))
    df['Size'] = df['Size'].map(lambda x: str(round((float(x.rstrip('k')) / 1024), 1))
                                if x[-1] == 'k' else x)
    df['Size'] = df['Size'].map(lambda x: np.nan if x.startswith('Varies') else x)
    df['Price'] = df['Price'].map(lambda x: x if x == 0 else x.lstrip('$').rstrip())
    df['Installs'] = df['Installs'].map(lambda x: x.rstrip('+'))
    df['Installs'] = df['Installs'].map(lambda x: ''.join(x.split(',')))

    # Change datatype
    df['Reviews'] = pd.to_numeric(df['Reviews'])
    df['Installs'] = pd.to_numeric(df['Installs'])
    df['Price'] = pd.to_numeric(df['Price'])

    # Replace "NaN" with mean
    imputer = SimpleImputer()
    df['Rating'] = imputer.fit_transform(df[['Rating']])
    # Rounding the mean value to 1 decimal place
    df['Rating'].round(1)
    df.dropna(axis=0, inplace=True)
    # sns.heatmap(df.isnull())

    df['Type'] = df['Type'].map(lambda x: 1 if (x == "Free") else 0)

    global And, val
    And = {}
    val = -1
    df['Android Ver'] = df['Android Ver'].map(lambda x: And[str(x)] if (str(x) in And) else value(x))

    # Features selection
    features = ['Rating', 'Size', 'Installs', 'Price', 'Type', 'Android Ver']

    # Splitting the data for training and testing
    train, test = train_test_split(df, test_size=0.3)
    # creating the response and target variables
    # taking the training data input
    train_x = train[features]    # multiple independent variables
    train_y = train['Category']  # only one dependent variable
    # print(list(train.columns))
    train, test = train_test_split(df, test_size=0.3)
    # train, test = train_test_split(df, test_size=0.2)
    # taking the testing data input
    test_x = test[features]
    test_y = test['Category']
    # print(list(test.columns))

    """
    # Creating a decision tree model based on the training data
    model = tree.DecisionTreeClassifier()
    model.fit(train_x, train_y)
    # now prediction using the trained model
    prediction = model.predict(test_x)
    # now displaying the predicted vs actual values
    # dataframe = pd.DataFrame(prediction, test_y)
    """
    # idea of random forest to improve efficiency
    # will create many small, different trees
    """ RANDOM FOREST """
    model = RandomForestClassifier(n_estimators=100)  # this will create a group of 100 trees
    model.fit(train_x, train_y)
    prediction = model.predict(test_x)
    # now displaying the predicted vs actual values
    # print(metrics.accuracy_score(prediction, test_y))
    # print(classification_report(test_y, prediction))

    rating_app = float(rating.get())
    size_app = float(size.get())
    installs_app = int(installs.get())
    if type == "Free":
        price_app = 0
    else:
        price_app = int(price.get())
    if type == "Free":
        type_app = 1
    else:
        type_app = 0
    android_app = int(And[android.get()])

    prediction = model.predict(np.array([rating_app, size_app, installs_app,
                                         price_app, type_app, android_app]).reshape(1, -1))
    # print(prediction)
    # print(model.score(test_x, test_y))
    # print(category)

    val = ""
    for val in category:
        if category[val] == prediction:
            # print(val)
            break

    tk.Label(big_frame, text="-----RESULT-----", height='2',
             font=("Calibri", 19, 'bold'), fg='#ad023e', bg='white').place(x=250, y=400)
    # print(val)
    string = "With help of parameters, the {} category is most likely to be downloaded in coming years".format(val)
    tk.Label(big_frame, text=string, height='2', font=("Calibri", 10, 'italic'),
             fg='#ad023e', bg='white').place(x=0, y=450)
    string = "Accuracy score for this model is {:.2f}%".format(model.score(test_x, test_y) * 100)
    tk.Label(big_frame, text=string, height='2', font=("Calibri", 11, 'italic'),
             fg='#ad023e', bg='white').place(x=0, y=500)
# In[17]:

df[colsobject].head()

# In[18]:

### Use SimpleImputer with most_frequent for imputation of categorical variables
from sklearn.impute import SimpleImputer
imp_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
df[colsobject] = imp_mode.fit_transform(df[colsobject])

# In[19]:

### Use SimpleImputer with median for imputation of continuous variables
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
df[colsnumeric] = imp_median.fit_transform(df[colsnumeric])

# In[20]:
X_train_copy = X_train.select_dtypes(exclude=['object'])
X_val_copy = X_val.select_dtypes(exclude=['object'])
object_cols = [col for col in X_train.columns if X_train[col].dtype == 'object']
X_train_object = X_train.select_dtypes(exclude=['int64', 'float64'])
X_val_object = X_val.select_dtypes(exclude=['int64', 'float64'])
X_test_copy = X_test.select_dtypes(exclude=['object'])
X_test_object = X_test.select_dtypes(exclude=['int64', 'float64'])

# ---------------------------------------------------------------------------------------------
# Imputing missing values
from sklearn.impute import SimpleImputer

final_imputer = SimpleImputer(strategy='most_frequent')
X_train_imputed = pd.DataFrame(final_imputer.fit_transform(X_train_copy),
                               columns=X_train_copy.columns)
X_val_imputed = pd.DataFrame(final_imputer.transform(X_val_copy),
                             columns=X_val_copy.columns)
X_test_imputed = pd.DataFrame(final_imputer.fit_transform(X_test_copy),
                              columns=X_test_copy.columns)

X_train_imputed['temp'] = 1
X_train_object['temp'] = 1
X_train_copy = pd.merge(X_train_object, X_train_imputed, on=['temp']).reindex(X_train.index)
X_train_copy = X_train_copy.drop('temp', axis=1)
X_train_object = X_train_object.drop('temp', axis=1)
X_train_imputed = X_train_imputed.drop('temp', axis=1)
X_train_copy = X_train_copy[X_train.columns]
X_train_imputed.index = X_train_copy.index
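# The merge on the constant 'temp' column above behaves like a cross join that the later
# reindex has to undo. A simpler sketch under the same assumptions (X_train_object and
# X_train_imputed exactly as built right after the imputation step; X_train_joined is a
# hypothetical name): restore the original row index on the imputed frame, then concatenate
# column-wise and put the columns back in the original order.
X_train_imputed.index = X_train.index
X_train_joined = pd.concat([X_train_object, X_train_imputed], axis=1)[X_train.columns]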
# ids : all ids of cascades that have emo AND are in size range
IDs = list(set(tweets.cascade_id).intersection(emos.cascade_id))
shuffle(IDs)
split = int(len(IDs) * split_ratio)
train_ids, test_ids = pd.DataFrame({'cascade_id': IDs[:split]}), pd.DataFrame(
    {'cascade_id': IDs[split:]})
tweets_train = pd.merge(tweets, train_ids, how='inner').reset_index(drop=True)
tweets_test = pd.merge(tweets, test_ids, how='inner').reset_index(drop=True)
emo_train = pd.merge(emos, train_ids, how='inner').reset_index(drop=True)
emo_test = pd.merge(emos, test_ids, how='inner').reset_index(drop=True)

tweets_train[['user_followers', 'user_followees', 'user_account_age']] = si.fit_transform(
    tweets_train[['user_followers', 'user_followees', 'user_account_age']].values)
tweets_test[['user_followers', 'user_followees', 'user_account_age']] = si.transform(
    tweets_test[['user_followers', 'user_followees', 'user_account_age']].values)

# get log of vars
for cname in ['user_followers', 'user_followees', 'user_engagement',
              'user_account_age', 'retweet_delay']:
    tweets_train[cname + '_log'] = logp(tweets_train[cname].values)
    tweets_test[cname + '_log'] = logp(tweets_test[cname].values)

tweets_train[to_standardize] = ss.fit_transform(
def exploring_data(housing):
    # Display info about the data
    if False:
        display_info(housing)
        print('# of ocean_prox. categories: \n',
              housing["ocean_proximity"].value_counts(), '\n')
    if False:
        plot_hist(housing)

    # ------------------------------
    # Split Data
    # ------------------------------
    if False:
        # normal
        train, test = split_train_test(housing, 0.2)
        # by id
        housing_with_id = housing.reset_index()
        train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "index")
        housing_with_id["id"] = housing["longitude"] * 10**3 + housing["latitude"]
        train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "id")
        # use scikit-learn (equivalent to split_train_test)
        train_set, test_set = model_selection.train_test_split(housing, test_size=0.2,
                                                               random_state=42)
    if True:
        # important to keep the distribution of income_cat
        bins = [0., 1.5, 3.0, 4.5, 6., np.inf]
        test_size = 0.2
        strat_train_set, strat_test_set = stratified_split(housing, cat="median_income",
                                                           bins=bins, test_size=test_size)
    housing = strat_train_set.copy()

    # ------------------------------
    # Investigate Data
    # ------------------------------
    if False:
        housing.plot(
            kind="scatter",
            x="longitude",
            y="latitude",
            alpha=0.4,
            s=housing["population"] / 100,
            label="population",
            figsize=(10, 7),
            c="median_house_value",
            cmap=plt.get_cmap("jet"),
            colorbar=True,
        )
        plt.legend()
        plt.show()

    # Correlation
    if False:
        corr_matrix = housing.corr()
        print(corr_matrix["median_house_value"].sort_values(ascending=False))
        # Plot correlation as scatter plots for diff attributes
        attributes = ["median_house_value", "median_income", "total_rooms",
                      "housing_median_age"]
        pd.plotting.scatter_matrix(housing[attributes], figsize=(12, 8))
        plt.show()
        housing.plot(kind="scatter", x="median_income", y="median_house_value", alpha=0.1)
        plt.show()
        # -> reveals horizontal lines that we may want to remove

    # Attribute Combination
    if False:
        housing["rooms_per_household"] = housing["total_rooms"] / housing["households"]
        housing["bedrooms_per_room"] = housing["total_bedrooms"] / housing["total_rooms"]
        # housing["population_per_household"] = housing["population"] / housing["households"]
        if False:
            corr_matrix = housing.corr()
            print(corr_matrix["median_house_value"].sort_values(ascending=False))

    # Preparing Data for machine learning
    if False:
        housing = strat_train_set.drop("median_house_value", axis=1)
        # housing_labels = strat_train_set["median_house_value"].copy()

    # missing values: 3 possibilities
    if False:
        housing.dropna(subset=["total_bedrooms"])  # Get rid of the data
        housing.drop("total_bedrooms", axis=1)     # Get rid of the whole attribute
        median = housing["total_bedrooms"].median()
        housing["total_bedrooms"].fillna(median, inplace=True)  # set missing value to zero/median/mean
    if False:
        # Median of a text category cannot be calculated -> create a copy without that category
        housing_num = housing.drop("ocean_proximity", axis=1)
        imputer = SimpleImputer(strategy="median")
        imputer.fit(housing_num)
        print(imputer.statistics_, housing_num.median().values)
        # Transform data
        X = imputer.transform(housing_num)
        # Combines the fit and the transform in one action
        imputer.fit_transform(housing_num)
        # Recreate a new DataFrame
        housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)
        if False:
            print(housing_tr)

    # Handle Categorical and text
    if False:
        housing_cat = housing[["ocean_proximity"]]
        print(housing_cat.head(10))
        # convert Cat to number
        ordinal_encoder = OrdinalEncoder()
        housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
        print(ordinal_encoder.categories_)
        print(housing_cat_encoded[:10])
        # Problem with this is that 0 and 1 would be seen as close by the algorithm,
        # which is not necessarily true
        # -> Prefer to OneHotEncode: one new column per category, each either 1 or 0
        cat_encoder = OneHotEncoder()
        housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
        print(cat_encoder.categories_)
        print(housing_cat_1hot)
    if False:
        # This can be done with pandas directly
        housing = pd.get_dummies(housing, prefix='', prefix_sep='')

    # Custom transformers can be created (like OrdinalEncoder, OneHotEncoder, Imputer, ...)
    # -> Create a class with fit() (returning itself), transform() and fit_transform()
    #    (not needed if TransformerMixin is used as a base class); with BaseEstimator as a
    #    base class you also get get_params() and set_params()
    if False:
        attr_adder = CombinedAttributesDivide(housing, add_bedrooms_per_room=False)
        housing_extra_attribs = attr_adder.transform(housing.values)

    # Feature Scaling: fit on training data and then transform training and test set
    # 2 methods: -> min-max scaling: normalization
    #            -> Standardization
    if False:
        scaler = MinMaxScaler()
        housing_scaled = scaler.fit_transform(housing_extra_attribs)
    if False:
        scaler = StandardScaler()
        housing_scaled = scaler.fit_transform(housing_extra_attribs)
        print(housing_scaled)

    # Pipeline: to organise all transformations in a simpler manner
    # On numerical attributes:
    if False:
        num_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy="median")),
            ('attribs_adder', CombinedAttributesDivide(housing)),
            ('std_scaler', StandardScaler()),
        ])
        housing_num_tr = num_pipeline.fit_transform(housing_num)
        print(housing_num_tr)

    # To also take care of categorical attributes:
    if True:
        housing_num = housing.drop("ocean_proximity", axis=1)
        num_attribs = list(housing_num)
        cat_attribs = ["ocean_proximity"]
        num_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy="median")),
            ('attribs_adder', CombinedAttributesDivide(housing)),
            ('std_scaler', StandardScaler()),
        ])
        full_pipeline = ColumnTransformer([("num", num_pipeline, num_attribs),
                                           ("cat", OneHotEncoder(), cat_attribs)])
        housing_prepared = full_pipeline.fit_transform(housing)
        print(housing_prepared)
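# A minimal sketch of the custom-transformer pattern described in the comments above.
# The real CombinedAttributesDivide class is not shown in this snippet, so the column
# positions and the added ratio below are illustrative assumptions only.
from sklearn.base import BaseEstimator, TransformerMixin

class RatioAdder(BaseEstimator, TransformerMixin):
    """Append a rooms-per-household ratio column (hypothetical column positions)."""

    def __init__(self, rooms_ix=3, households_ix=6):
        self.rooms_ix = rooms_ix
        self.households_ix = households_ix

    def fit(self, X, y=None):
        return self  # nothing to learn

    def transform(self, X):
        rooms_per_household = X[:, self.rooms_ix] / X[:, self.households_ix]
        return np.c_[X, rooms_per_household]

# Usage sketch: adder = RatioAdder(); extra = adder.fit_transform(housing_num.values)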
def manual_preprocess(self, config, folderLocation):
    """
    This function is for preprocessing the data when the user selects manual preprocessing.
    """
    # config = open("preprocess_config.yaml", 'r')
    config_data = yaml.safe_load(open(config, 'r'))
    df = pd.read_csv(config_data["raw_data_address"])

    #### Handling missing data
    # drop columns
    def drop_NA(df):
        # Calling this function drops all the columns and rows which are completely null.
        nan_value = config_data["na_notation"]
        df.replace("", nan_value, inplace=True)
        df = df.dropna(how='all', axis=1, inplace=True)
        df = df.dropna(how='all', inplace=True)

    if config_data['drop_column_name'][0] != None:
        df = df.drop(config_data["drop_column_name"], axis=1)
        drop_NA(df)
    else:
        drop_NA(df)

    # imputation
    if config_data['imputation_column_name'][0] != None:
        strategy_values_list = []
        for index, column in enumerate(config_data["imputation_column_name"]):
            type = config_data["impution_type"][index]
            df_value = df[[column]].values
            if type == "mean":
                imputer = SimpleImputer(missing_values=config_data["na_notation"],
                                        strategy="mean")
                strategy_values_list.append(df[column].mean())
            elif type == "median":
                imputer = SimpleImputer(missing_values=config_data["na_notation"],
                                        strategy="median")
                strategy_values_list.append(df[column].median())
            elif type == "most_frequent":
                imputer = SimpleImputer(missing_values=config_data["na_notation"],
                                        strategy="most_frequent")
                strategy_values_list.append(df[column].mode())
            elif type == 'knn':
                imputer = KNNImputer(n_neighbors=4, weights="uniform",
                                     missing_values=config_data["na_notation"])
            df[[column]] = imputer.fit_transform(df_value)
        df.replace(to_replace=[config_data["na_notation"]], value=0)
        if strategy_values_list != []:
            config_data['mean_median_mode_values'] = strategy_values_list
    else:
        ## Checking the z-score and replacing with the mean if z < 3
        df.replace(to_replace=[config_data["na_notation"]], value=0)
        #### using others for object-type data.

    # feature scaling
    if config_data['scaling_column_name'][0] != None:
        for index, column in enumerate(config_data["scaling_column_name"]):
            type = config_data["scaling_type"][index]
            config_data['scaling_values'] = {}
            df_value = df[[column]].values
            if type == "normalization":
                df_std = (df_value - df_value.min(axis=0)) / (
                    df_value.max(axis=0) - df_value.min(axis=0))
                scaled_value = df_std * (1 - 0)
                config_data['scaling_values'][index] = {
                    "min": df_value.min(axis=0),
                    "max": df_value.max(axis=0)
                }
            elif type == 'standarization':
                df_std = (df_value - df_value.min(axis=0)) / (
                    df_value.max(axis=0) - df_value.min(axis=0))
                scaled_value = (df_value - df.value.mean()) / df_std
                config_data['scaling_values'][index] = {
                    "std": df_std,
                    "mean": df.value.mean()
                }
            df[[column]] = scaled_value

    #### handling categorical data
    # encoding
    # Under the following if block, only the columns selected by the user will be encoded,
    # using the encoding type chosen by the user.
    if config_data['encode_column_name'][0] != None:
        for index, column in enumerate(config_data["encode_column_name"]):
            type = config_data["encoding_type"][index]
            if type == "Label Encodeing":
                encoder = LabelEncoder()
                df[column] = encoder.fit_transform(df[column])
                label_encoding_dict = dict(zip(encoder.classes_, range(len(encoder.classes_))))
                config_data['labels'] = {}
                config_data['labels'] = [label_encoding_dict]
            elif type == "One-Hot Encoding":
                encoder = OneHotEncoder(drop='first', sparse=False)
                df_encoded = pd.DataFrame(encoder.fit_transform(df[[column]]))
                df_encoded.columns = encoder.get_feature_names([column])
                df.drop([column], axis=1, inplace=True)
                df = pd.concat([df, df_encoded], axis=1)

    # In case the user missed any column which is object type and needs to be encoded,
    # it will be encoded using One-Hot encoding.
    objest_type_column_list = []
    for col_name in df.columns:
        if df[col_name].dtype == 'object':
            objest_type_column_list.append(col_name)
            config_data['encodeing_type'].extend(['One-Hot Encoding'])
    if objest_type_column_list != []:
        config_data['encode_column_name'] = objest_type_column_list
        encoder = OneHotEncoder(drop='first', sparse=False)
        df_encoded = pd.DataFrame(encoder.fit_transform(df[objest_type_column_list]))
        df_encoded.columns = encoder.get_feature_names(objest_type_column_list)
        df.drop(objest_type_column_list, axis=1, inplace=True)
        df = pd.concat([df, df_encoded], axis=1)

    # Feature engineering & Feature Selection

    ### Outlier detection & Removal
    # We remove outliers on the basis of the z-score.
    if config_data["Remove_outlier"] == True:
        z = np.abs(stats.zscore(df))
        df = df[(z < 3).all(axis=1)]

    # Here we drop columns that are highly correlated (absolute correlation above 0.90)
    # with another column: for each such pair, the first correlated feature found is removed.
    if config_data["feature_selection"] == True:
        col_corr = set()
        corr_matrix = df.corr()
        for i in range(len(corr_matrix.columns)):
            for j in range(i):
                if abs(corr_matrix.iloc[i, j]) > 0.90:
                    col_corr.add(corr_matrix.columns[i])
        df = df.drop(col_corr, axis=1)

    # Dropping the object-type columns which are left behind and could cause
    # problems at the time of model training.
    for col_name in df.columns:
        if df[col_name].dtype == 'object':
            df = df.drop(col_name, axis=1)

    df.to_csv('clean_data.csv')
    shutil.move("clean_data.csv", folderLocation)
    clean_data_address = os.path.abspath(os.path.join(folderLocation, "clean_data.csv"))
    config_data['clean_data_address'] = clean_data_address
    with open(config, 'w') as yaml_file:
        yaml_file.write(yaml.dump(config_data, default_flow_style=False))

    return clean_data_address
import pandas as pd
import numpy as np

dataset = pd.read_csv("framingham_heart_disease.csv")
X = dataset.iloc[:, :15]
Y = dataset.iloc[:, 15:16]
X = X.drop(columns=['currentSmoker', "education"])

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='most_frequent')
X.iloc[:, 1:14] = imputer.fit_transform(X.iloc[:, 1:14])

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

'''
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
'''

import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Input
from keras import regularizers
from keras.models import Model, load_model
from keras.callbacks import ModelCheckpoint, TensorBoard

'''
# Initialising the ANN
classifier = Sequential()

# Adding the input layer and the first hidden layer
def getResult(rating, installs, big_frame):
    df = pd.read_csv("DATA SET-2.csv")

    # Replace "NaN" with mean
    imputer = SimpleImputer()
    df['Rating'] = imputer.fit_transform(df[['Rating']])

    temp = []
    for index in range(len(df['Rating'])):
        if df['Rating'][index] >= rating:
            temp.append(1)
        else:
            temp.append(0)
    cat_rating = pd.DataFrame(zip(temp, temp), columns=["cat_Ratings", "ignore"])
    df = pd.concat([df, cat_rating], axis=1)
    df.drop("ignore", axis=1, inplace=True)
    df.drop(df.index[9148], inplace=True)

    # Data cleaning for "Installs" column
    df['Installs'] = df['Installs'].map(lambda x: x.rstrip('+'))
    df['Installs'] = df['Installs'].map(lambda x: ''.join(x.split(',')))
    df['Installs'] = pd.to_numeric(df['Installs'])

    rating_sum = 0
    rate = []  # 1169
    counter = 0
    for index in range(len(df)):
        try:
            if df['Installs'][index] >= installs:
                # if df['Rating'][index] >= rating:
                rate.append(1)
                rating_sum += df['Rating'][index]
                counter += 1
            else:
                rate.append(0)
        except:
            # print(index)
            continue
    # print(len(rate))

    avg_rating = (rating_sum / counter)
    # print(df['Installs'].corr(df['Rating']))
    val = "Yes" if (rating_sum / counter) >= rating else "No"
    rel = "Greater than" if val == "Yes" else "Lesser than"

    fig, ax = plt.subplots(figsize=(10, 10))
    l1 = '{}>='.format(installs)
    l2 = '<{}'.format(installs)
    size = [rate.count(1), rate.count(0)]
    label = [l1, l2]
    title = 'Count of {}'.format(rating)
    figure1 = plt.Figure(figsize=(5, 5), dpi=70)
    # color = cm.rainbow(np.linspace(0, 1, 10))
    # fig1, ax1 = plt.subplots()
    ax3 = figure1.add_subplot(111)
    ax3.pie(size, labels=label, colors=['red', 'blue'], autopct='%1.1f%%', startangle=200)
    ax3.set_title(title)
    # ax3.xlim(0, 3.0)
    pie_plot = FigureCanvasTkAgg(figure1, big_frame)
    pie_plot.get_tk_widget().place(x=80, y=190)

    tk.Label(big_frame, text="--Results--", font=("Calibri", 13, 'italic'),
             fg='#ad023e', bg='white').place(x=220, y=470)
    String = "Average rating of all the apps that managed to get over {} downloads is {:.1f}".format(
        installs, avg_rating)
    tk.Label(big_frame, text=String, font=("Calibri", 13, 'italic'),
             fg='#ad023e', bg='white').place(x=0, y=500)
    String = """{}! All those apps that have managed to get over {} downloads
    have an average rating of {:.1f}, which is {} {}""".format(
        val, installs, avg_rating, rel, rating)
    tk.Label(big_frame, text=String, font=("Calibri", 13, 'italic'),
             fg='#ad023e', bg='white').place(x=0, y=530)
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor

# Read the data
trainData = pd.read_csv('../input/train.csv')
testData = pd.read_csv('../input/test.csv')

# Select Predictors
trainData.dropna(axis=0, subset=['SalePrice'], inplace=True)
trainy = trainData.SalePrice
trainX = trainData.drop(['SalePrice'], axis=1).select_dtypes(exclude=['object'])
testX = testData.select_dtypes(exclude=['object'])

# Impute NaN columns
myImputer = SimpleImputer()
train_X = myImputer.fit_transform(trainX)
test_X = myImputer.transform(testX)

# Fit model
my_model = XGBRegressor()
my_model.fit(train_X, trainy, verbose=False)

# Make prediction
prediction = my_model.predict(test_X)

# Make result submission file
my_submission = pd.DataFrame({'Id': testData.Id, 'SalePrice': prediction})
my_submission.to_csv('submission.csv', index=False)
    '糖尿病家族史', '一级亲属', '二级亲属', '父亲', '母亲', '父系', '母系', '孕次(次)', '产次(次)',
    '新生儿性别(男=1,女=2)', '胎膜早破(无=0,有=1)', '早产(无=0,有=1)', '羊水过多(无=0,有=1)',
    '妊娠期高血压(无=0,有=1)', '产后出血(无=0,有=1)', '胎膜早剥(无=0,有=1)', '羊水过少(无=0,有=1)',
    '流产(无=0,有=1)', '孕期并发症(无=0,有=1)', '胎儿宫内生长受限/发育迟缓(无=0,有=1)',
    '巨大儿(无=0,有=1)', '胎儿宫内窘迫(无=0,有=1)', '新生儿窒息(无=0,有=1)',
    '新生儿黄疸/高胆红素血症(无=0,有=1)', '低体重儿或小于胎龄儿(无=0,有=1)', '先天畸形(无=0,有=1)',
    '新生儿低血糖(无=0,有=1)', '新生儿合并症(无=0,有=1)',
    '营养咨询或治疗(GDM患者进行营养治疗,NGT接受孕妇学校讲座。咨询或治疗=1,无=0)',
    'GDM孕妇用胰岛素治疗(是=1,否=0)'
]
X_cat = X[cat_list]
X_num = X.drop(columns=cat_list)

# Fill the categorical variables with the mode
from sklearn.impute import SimpleImputer
impute = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
y_tr = pd.DataFrame(impute.fit_transform(y), columns=y.columns)
X_necessary_tr = pd.DataFrame(impute.fit_transform(X_necessary), columns=X_necessary.columns)
X_cat_tr = pd.DataFrame(impute.fit_transform(X_cat), columns=X_cat.columns)

# Fill the numeric variables with a random forest
X_num_tr = X_num.copy()
df = pd.concat([X_necessary_tr, y_tr], axis=1)
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor(n_estimators=100)
forest_reg.get_params()
for col in list(X_num_tr.columns):
    if X_num_tr[col].isna().sum() == 0:
        continue
    fill = X_num_tr[col]
    Ytrain = fill[fill.notnull()]