# Standardize X, fill or drop its missing values, normalize it, and save it.
#
# Reads the globals X, y and winmults (and iterationnum, when it exists).
# kmeans_missing, save and pd must already be defined/imported in this file.
from sklearn.preprocessing import Normalizer, StandardScaler

# standardize. Grab the column labels first because X is rebound to the
# scaler's output on the next line.
cols = X.columns
X = StandardScaler().fit_transform(X)

# Normalizer won't work with NAs, so this is a good time to deal with them.
if ('iterationnum' not in globals()) or (iterationnum >= 1):
    # iterationnum is undefined or >= 1: use kmeans clustering to fill them.
    labels, centroids, X = kmeans_missing(X, n_clusters=20, max_iter=10)
    del labels, centroids
else:
    # if not, drop every row that contains an NA.
    X = pd.DataFrame(X)
    # Keep the mask as a plain boolean array so that y and winmults are
    # filtered by row position. X was just rebuilt with a fresh 0..n-1 index,
    # so a boolean Series would be matched by label against whatever index
    # y / winmults carry, which can raise or pick the wrong rows.
    nas = X.isnull().any(axis=1).values
    X = X[~nas]
    y = y[~nas]
    winmults = winmults[~nas]
    del nas

# normalize.
X = Normalizer().fit_transform(X)
save('../out/d2-fight-level-standardize-normalize-kmeansNA.pkl', X, y, cols, winmults)
del X, y, cols
'pl_bmassj','pl_radj','st_mass', 'st_teff', 'st_rad', 'st_metfe'] df = df.loc[:, prop] df = df.dropna(subset = prop, how = 'any', axis = 0) pl_prop = ['pl_orbper','pl_orbsmax','pl_orbeccen', 'pl_bmassj','pl_radj'] X = df.loc[:, pl_prop] X = StandardScaler().fit_transform(X.values) X = pd.DataFrame(X, columns = pl_prop) st_prop = ['st_mass', 'st_teff', 'st_rad', 'st_metfe'] y = df.loc[:, st_prop] y = StandardScaler().fit_transform(y.values) y = pd.DataFrame(y, columns = st_prop ) y.isnull().sum() sns.regplot(df['st_mass'], df['pl_bmassj']) sns.set(font='serif', font_scale=1.4, style='ticks') palette = sns.hls_palette(8, l=.3, s=.8) pal = palette.as_hex() from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(y,X,train_size = 0.8)