def test_inverse_transform(strategy):
    X = np.random.RandomState(0).randn(100, 3)
    kbd = KBinsDiscretizer(n_bins=3, strategy=strategy, encode='ordinal')
    Xt = kbd.fit_transform(X)
    assert_array_equal(Xt.max(axis=0) + 1, kbd.n_bins_)

    X2 = kbd.inverse_transform(Xt)
    X2t = kbd.fit_transform(X2)
    assert_array_equal(X2t.max(axis=0) + 1, kbd.n_bins_)
    assert_array_equal(Xt, X2t)
def test_nonuniform_strategies(strategy, expected_2bins, expected_3bins):
    X = np.array([0, 1, 2, 3, 9, 10]).reshape(-1, 1)

    # with 2 bins
    est = KBinsDiscretizer(n_bins=2, strategy=strategy, encode='ordinal')
    Xt = est.fit_transform(X)
    assert_array_equal(expected_2bins, Xt.ravel())

    # with 3 bins
    est = KBinsDiscretizer(n_bins=3, strategy=strategy, encode='ordinal')
    Xt = est.fit_transform(X)
    assert_array_equal(expected_3bins, Xt.ravel())
def test_inverse_transform(strategy, encode):
    X = np.random.RandomState(0).randn(100, 3)
    kbd = KBinsDiscretizer(n_bins=3, strategy=strategy, encode=encode)
    Xt = kbd.fit_transform(X)
    X2 = kbd.inverse_transform(Xt)
    X2t = kbd.fit_transform(X2)
    if encode == 'onehot':
        assert_array_equal(Xt.todense(), X2t.todense())
    else:
        assert_array_equal(Xt, X2t)
    if 'onehot' in encode:
        Xt = kbd._encoder.inverse_transform(Xt)
        X2t = kbd._encoder.inverse_transform(X2t)

    assert_array_equal(Xt.max(axis=0) + 1, kbd.n_bins_)
    assert_array_equal(X2t.max(axis=0) + 1, kbd.n_bins_)
def test_overwrite():
    X = np.array([0, 1, 2, 3])[:, None]
    X_before = X.copy()

    est = KBinsDiscretizer(n_bins=3, encode="ordinal")
    Xt = est.fit_transform(X)
    assert_array_equal(X, X_before)

    Xt_before = Xt.copy()
    Xinv = est.inverse_transform(Xt)
    assert_array_equal(Xt, Xt_before)
    assert_array_equal(Xinv, np.array([[0.5], [1.5], [2.5], [2.5]]))
# Using the sum of squared distances (SSD) to pick k (elbow method)
import seaborn as sns

def ssd_plot(dataframe, col, start_k, end_k):  # e.g. ssd_plot(df, "balance", 2, 7)
    res = pd.DataFrame(columns=["k", "ssd"])
    for k in range(start_k, end_k):
        model_clus = KMeans(n_clusters=k, max_iter=100)
        model_clus.fit(np.array(dataframe[col]).reshape((len(dataframe[col]), 1)))
        # DataFrame.append was removed in pandas 2.0; build rows with pd.concat instead
        res = pd.concat([res, pd.DataFrame([{"k": k, "ssd": model_clus.inertia_}])],
                        ignore_index=True)
    sns.lineplot(x="k", y="ssd", data=res)

# v5: Discretizing immediately using user-defined k
def discre_cols(dataframe, col_to_disc, k):
    discretizer = KBinsDiscretizer(n_bins=k, encode='ordinal', strategy='kmeans')
    for i in col_to_disc:
        res_col = np.array(dataframe[i]).reshape((len(dataframe[i]), 1))
        res2_col = discretizer.fit_transform(res_col)
        dataframe[i + "_disc"] = res2_col
        # drop in place so the change persists in the caller's DataFrame
        dataframe.drop(columns=[i], inplace=True)

discre_cols(df, col_to_disc, 3)

## Using iloc to select rows
df.iloc[0:3]      # Returns rows with index values 0, 1, 2
df = df.iloc[1:]  # Select row 2 onwards
df.iloc[2:, -3:]  # Returns from row 3 onwards, columns 3rd last to last
df.iloc[(df['Age'] < 30).values, [1, 3]]  # Can only use integers for column reference

## Using loc to select specific rows based on condition(s)
# https://www.shanelynn.ie/select-pandas-dataframe-rows-and-columns-using-iloc-loc-and-ix/
df.set_index("last_name", inplace=True)  # Setting column ["last_name"] as index
name = ["Andreas", "Veness"]
df.loc[name]  # Returns df with index values "Andreas" and "Veness"
# Transformation that converts a continuous variable into a categorical one
# #### Binarizer
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Binarizer.html
#
# Note: I skip the full example since it is very simple (see the short sketch
# below, after the pd.qcut cell)
# ##### KBinsDiscretizer
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.KBinsDiscretizer.html

# +
enc = KBinsDiscretizer(n_bins=4, encode='ordinal')
_df = df[['weight']].dropna().reset_index(drop=True)
X_binned = enc.fit_transform(_df)
X_binned = pd.DataFrame(X_binned.astype(int), columns=['weight_bins'])
result = pd.concat([_df, X_binned], axis=1)
display(result.head(10))
print("Bin edges:", enc.bin_edges_)
# -

# ##### pd.qcut

# +
# same example with pandas
_df = df.copy()
result, bins = pd.qcut(_df['weight'], 4, labels=[0, 1, 2, 3], retbins=True)
_df['weight_bins'] = result
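# Since the Binarizer example was skipped above, here is a minimal sketch of
# what it does, assuming the same `df` with a numeric `weight` column: it
# thresholds values to 0/1 instead of binning them into several intervals.
# The threshold value is an arbitrary assumption for illustration.

# +
from sklearn.preprocessing import Binarizer

binarizer = Binarizer(threshold=70)  # hypothetical cut-off, not from the original
_df = df[['weight']].dropna().reset_index(drop=True)
# values > 70 become 1, the rest become 0
_df['weight_bin'] = binarizer.fit_transform(_df[['weight']]).astype(int).ravel()
display(_df.head(10))
# -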
def test_inverse_transform(strategy, encode, expected_inv):
    kbd = KBinsDiscretizer(n_bins=3, strategy=strategy, encode=encode)
    Xt = kbd.fit_transform(X)
    Xinv = kbd.inverse_transform(Xt)
    assert_array_almost_equal(expected_inv, Xinv)
plt.ylabel("Average price") plt.show() columns = ['accommodates','bathrooms','bedrooms','guests_included','beds','price'] sb.heatmap(listings[columns].corr(),annot=True) plt.show() #Analysis of Rating for various Room types based on is loaction exact sb.violinplot("room_type", "review_scores_rating", hue="is_location_exact", data=listings,palette='rainbow') plt.show() # binning the price column bins = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform') listings['price']=bins.fit_transform(listings[['price']]) print(listings['price'].unique()) # printing the number of values by count print(listings["price"].value_counts()) plt.figure(figsize=(8,6)) sb.countplot(listings["price"]) plt.show() #first converting the last review column to date farm and then subtracting it from todays date to calculate no of days listings['last_review']= pd.to_datetime(listings['last_review']).dt.date listings['no_days']= (pd.datetime.now().date()-listings['last_review']).dt.days
def analise_temporal_base(X,
                          X_perc=0.25,
                          coluna_target=None,
                          coluna_mes='mes',
                          dsvpd_thr=0.25,
                          nome_arquivo="analise_dados_mes",
                          MI=False,
                          Fast_Analysis=False,
                          log_cat=False,
                          log_fig=False,
                          transpor=False,
                          subs_miss_espec=True,
                          vetor_esp_missing=[-1, -9, -99, -999, -999999999]):
    """
    #Temporal analysis of datasets#
    Args:
        X: Dataset to analyze (pandas DataFrame required).
        X_perc: Fraction of the input dataset to use (0 < X_perc <= 1). Sampling makes the analysis faster.
        coluna_target: Column holding the target; if None, the per-target analysis is skipped.
        coluna_mes: Column where the data is split by month, default "NUM_MES_REF".
        dsvpd_thr: Monthly threshold on the coefficient of variation (problem detector).
        nome_arquivo: Name of the Excel file to be saved.
        MI: Mutual information (True - computed, False - not computed).
        Fast_Analysis: Run a quicker, more focused analysis with fewer details.
        log_cat: Text logs in Jupyter/terminal.
        log_fig: Whether to show figures in Jupyter.
        transpor: Transpose the information in the worksheets.
        subs_miss_espec: Whether to replace special missing codes with np.nan.
        vetor_esp_missing: Vector of special missing codes, used when subs_miss_espec == True.
    Returns:
        Excel spreadsheet file with the consolidated data.
    Author:
        Vinícius Ormenesse
    """
    warnings.filterwarnings('ignore')
    if str(type(X)) == "<class 'pandas.core.frame.DataFrame'>":
        writer = pd.ExcelWriter(nome_arquivo + '.xlsx', engine='xlsxwriter')
        if X_perc > 1:
            X_perc = 1
            print('Invalid sampling percentage; this analysis will use 100% of the sample.')
        elif X_perc <= 0:
            X_perc = 0.1
            print('Invalid sampling percentage; this analysis will use 10% of the sample.')
        if dsvpd_thr > 1:
            dsvpd_thr = 1
            print('Invalid dsvpd_thr; 1.0 (100%) will be used in this analysis.')
        elif dsvpd_thr <= 0:
            dsvpd_thr = 0.1
            print('Invalid dsvpd_thr; 0.1 (10%) will be used in this analysis.')
        X = X.sample(frac=X_perc)  # sample the input so the analysis runs faster
        meses = X[coluna_mes].fillna(method='ffill').unique()
        meses.sort()
        # Turning special missing codes into true missing values.
        if subs_miss_espec:
            print('Replacing special missing codes')
            X = X.replace(vetor_esp_missing, np.full(len(vetor_esp_missing), np.nan))
        # handling categorical columns
        categoricas = []
        for cols in X.columns:
            if cols is not coluna_mes:
                if str(X[cols].dtypes) == 'object':
                    if log_cat == True:
                        print("Order of the label classes for column: " + cols)
                        print('Before:')
                        print(X[cols].unique())
                    categoricas.append(cols)
                    le = LabelEncoder()
                    X[cols] = le.fit_transform(X[cols].astype(str))
                    if log_cat == True:
                        print('After:')
                        print(list(le.classes_))
        # Starting the analysis when a target variable is available.
        if coluna_target is not None:
            colunas_var = X.columns.tolist()
            colunas_var.remove(coluna_mes)
            colunas_var.remove(coluna_target)
            # for the variables:
            """
                0 - std
                1 - mean
                2 - missing
                3 - outliers
                4 - coefficient of variation
                5 - KS
                6 - MI
                7 - PSI
            """
            analises = []
            for i in range(0, 8):
                analises.append(pd.DataFrame([], index=colunas_var))
            # target values
            valores_targets = X[coluna_target].unique().tolist()
            # looking at missing values, mean and standard deviation
            for i, mes in enumerate(meses):
                print("Computing data for month: " + str(int(mes)))
                for valor in valores_targets:
                    valor = int(valor)
                    outlier = []
                    apsi = []
                    describe = X[colunas_var][(X[coluna_mes] == mes) &
                                              (X[coluna_target] == valor)].describe().transpose()
                    pdstd = pd.DataFrame(describe['std'].values[:, None],
                                         columns=[str(mes) + "_" + str(valor)],
                                         index=describe.index)
                    pdmedia = pd.DataFrame(describe['mean'].values[:, None],
                                           columns=[str(mes) + "_" + str(valor)],
                                           index=describe.index)
                    pdcv = pdstd / pdmedia  # coefficient of variation
                    c_shape = X[colunas_var][(X[coluna_mes] == mes) &
                                             (X[coluna_target] == valor)].shape[0]
                    pdmissing = pd.DataFrame((describe['count'].values / c_shape),
                                             columns=[str(mes) + "_" + str(valor)],
                                             index=describe.index)
                    for coluna in colunas_var:
                        # flag values more than three standard deviations from the monthly mean
                        out = X[coluna][(X[coluna_mes] == mes) &
                                        (np.abs(X[coluna] - describe['mean'].loc[coluna]) >
                                         (3 * describe['std'].loc[coluna]))].count()
                        outlier.append(out)
                        if i != 0 and not Fast_Analysis:
                            apsi.append(
                                calculate_psi(
                                    X[coluna][(X[coluna_mes] == mes) &
                                              (X[coluna_target] == valor)].dropna(),
                                    X[coluna][(X[coluna_mes] == meses[i - 1]) &
                                              (X[coluna_target] == valor)].dropna()))
                    pdpsi = pd.DataFrame(apsi,
                                         columns=[str(mes) + "_" + str(valor)],
                                         index=colunas_var)
                    pdoutlier = pd.DataFrame(outlier,
                                             columns=[str(mes) + "_" + str(valor)],
                                             index=colunas_var)
                    analises[0] = pd.concat([analises[0], pdstd], axis=1)
                    analises[1] = pd.concat([analises[1], pdmedia], axis=1)
                    analises[2] = pd.concat([analises[2], pdmissing], axis=1)
                    analises[3] = pd.concat([analises[3], pdoutlier], axis=1)
                    analises[4] = pd.concat([analises[4], pdcv], axis=1)
                    if not Fast_Analysis:
                        analises[7] = pd.concat([analises[7], pdpsi], axis=1)
            # KS test - works with one or more classes.
            if not Fast_Analysis:
                combinacoes_target = list(itertools.combinations_with_replacement(valores_targets, 2))
                for c_t in combinacoes_target:
                    if c_t[0] != c_t[1]:
                        for mes in meses:
                            ksarray = []
                            for coluna in colunas_var:
                                ks, pvalor = stats.ks_2samp(
                                    X[coluna][(X[coluna_mes] == mes) & (X[coluna_target] == c_t[0])],
                                    X[coluna][(X[coluna_mes] == mes) & (X[coluna_target] == c_t[1])])
                                ksarray.append(ks)
                            pdks = pd.DataFrame(ksarray,
                                                columns=[str(mes) + "_" + str(str(c_t))],
                                                index=colunas_var)
                            analises[5] = pd.concat([analises[5], pdks], axis=1)
            # Mutual information
            if MI:
                table_mi = X.copy()
                KB = KBinsDiscretizer(n_bins=10, encode='ordinal')
                colunas = []
                for coluna in X.columns:
                    if coluna not in categoricas and (coluna not in [coluna_mes, coluna_target]):
                        colunas.append(coluna)
                table_mi[colunas] = KB.fit_transform(table_mi[colunas].fillna(0))
                for mes in meses:
                    mi = mutual_info_classif(
                        table_mi[(table_mi[coluna_mes] == mes)].drop(
                            [coluna_mes, coluna_target], axis=1).fillna(0),
                        table_mi[coluna_target][(table_mi[coluna_mes] == mes)])
                    # mi /= np.max(mi)  # better not to normalize here
                    pdmi = pd.DataFrame(mi,
                                        columns=[str(mes)],
                                        index=X.drop([coluna_mes, coluna_target], axis=1).columns)
                    analises[6] = pd.concat([analises[6], pdmi], axis=1)
                del table_mi, colunas, KB
            # Analyzing the extracted statistics
            lista_histogramas = set()  # avoid printing the same histogram several times
            for i, analise in enumerate(analises):
                dsvpd = pd.DataFrame([], columns=['DESVPAD_FEATURE'])
                meanpd = pd.DataFrame([], columns=['MEAN_FEATURE'])
                for ind in analise.index:
                    dsvpd.loc[ind] = analise.loc[ind].std()
                    meanpd.loc[ind] = analise.loc[ind].mean()
                    # flag features whose monthly statistics vary too much (or suspiciously
                    # little), skipping the missing (2) and outlier (3) sheets
                    if (dsvpd.loc[ind].values / meanpd.loc[ind].values >= dsvpd_thr
                            or dsvpd.loc[ind].values / meanpd.loc[ind].values <= 0.03) \
                            and i not in (2, 3):
                        lista_histogramas.add(ind)
                analises[i] = pd.concat([analise, dsvpd, meanpd], axis=1)
            # writing the data to Excel
            sheets = ['DESVPAD', 'MEDIA', 'MISSING', 'OUTLIERS', 'COEFF VAR', 'KS', 'MI', 'PSI']
            if not transpor:
                for i in range(0, 8):
                    analises[i].to_excel(writer, sheets[i])
                for sh in sheets:
                    worksheet = writer.sheets[sh]
                    for i in range(2, analises[0].shape[0] + 2):
                        coluna_excel = num_to_col_letters(i)
                        worksheet.conditional_format(
                            'B' + str(i) + ':' + num_to_col_letters(analises[0].shape[1] - 1) + str(i),
                            {'type': '3_color_scale',
                             'min_type': 'percent',
                             'mid_type': 'percent',
                             'max_type': 'percent'})
            else:
                for i in range(0, 8):
                    analises[i].transpose().to_excel(writer, sheets[i])
                for sh in sheets:
                    worksheet = writer.sheets[sh]
                    for i in range(2, analises[0].shape[0] + 2):
                        coluna_excel = num_to_col_letters(i)
                        worksheet.conditional_format(
                            coluna_excel + '2' + ':' + coluna_excel +
                            str(1 + len(meses) * len(valores_targets)),
                            {'type': '3_color_scale',
                             'min_type': 'percent',
                             'mid_type': 'percent',
                             'max_type': 'percent'})
            # Saving images
            if not Fast_Analysis:
                worksheet = writer.book.add_worksheet(name="Análises_Imagens")
                row = 0
                for ind in lista_histogramas:
                    col = 0
                    # histograms
                    plt.figure(figsize=(10, 5))
                    for mes in meses:
                        try:
                            for valor in valores_targets:
                                sns.distplot(X[ind][(X[coluna_target] == valor) &
                                                    (X[coluna_mes] == mes)].dropna())
                            plt.legend(valores_targets)
                            plt.title("Distribution of " + str(ind) + " " + str(mes))
                            imgdata = BytesIO()
                            plt.savefig(imgdata, format="png")
                            imgdata.seek(0)
                            worksheet.insert_image(row, col, "", {'image_data': imgdata})
                            col += 17
                            if not log_fig:
                                plt.close()
                            else:
                                plt.show()
                                plt.close()
                        except:
                            pass
                    row += 25
                    col = 0
                # Covariance matrix
                worksheet = writer.book.add_worksheet(name="Covariância")
                correlations = X.corr()
                fig = plt.figure(figsize=(1 + int(len(X.columns) * 0.2401 + 0.8911),
                                          1 + int(len(X.columns) * 0.2401 + 0.8911)))
                ax = fig.add_subplot(111)
                # cax = ax.matshow(correlations, extent=[0, len(colunas_var), 0, len(colunas_var)], vmin=-1, vmax=1)
                cax = ax.matshow(correlations, vmin=-1, vmax=1)
                fig.colorbar(cax)
                ticks = np.arange(0, len(X.columns), 1)
                ax.set_xticks(ticks - 0.5)
                ax.set_yticks(ticks - 0.5)
                ax.set_xticklabels(X.columns, rotation=90, ma='center', size='medium')
                ax.set_yticklabels(X.columns, ma='center', size='medium')
                imgdata = BytesIO()
                fig.savefig(imgdata, format="png")
                imgdata.seek(0)
                worksheet.insert_image(0, 0, "", {'image_data': imgdata})
                if not log_fig:
                    plt.close()
                else:
                    plt.show()
                    plt.close()
            # Finally saving the file.
            print("Saving the data as: " + nome_arquivo + ".xlsx")
            writer.save()
        else:
            colunas_var = X.columns.tolist()
            colunas_var.remove(coluna_mes)
            # for the variables:
            """
                0 - std
                1 - mean
                2 - missing
                3 - outliers
                4 - PSI
                5 - coefficient of variation
            """
            analises = []
            for i in range(0, 6):
                analises.append(pd.DataFrame([], index=colunas_var))
            # looking at missing values, mean and standard deviation
            for i, mes in enumerate(meses):
                print("Computing data for month: " + str(int(mes)))
                outlier = []
                apsi = []
                describe = X[colunas_var][(X[coluna_mes] == mes)].describe().transpose()
                pdstd = pd.DataFrame(describe['std'].values[:, None],
                                     columns=[str(mes)], index=describe.index)
                pdmedia = pd.DataFrame(describe['mean'].values[:, None],
                                       columns=[str(mes)], index=describe.index)
                pdcv = pdstd / pdmedia
                c_shape = X[colunas_var][(X[coluna_mes] == mes)].shape[0]
                pdmissing = pd.DataFrame((describe['count'].values / c_shape),
                                         columns=[str(mes)], index=describe.index)
                for coluna in colunas_var:
                    # flag values more than three standard deviations from the monthly mean
                    out = X[coluna][(X[coluna_mes] == mes) &
                                    (np.abs(X[coluna] - describe['mean'].loc[coluna]) >
                                     (3 * describe['std'].loc[coluna]))].count()
                    outlier.append(out)
                    if i != 0:
                        apsi.append(
                            calculate_psi(
                                X[coluna][(X[coluna_mes] == mes)].dropna(),
                                X[coluna][(X[coluna_mes] == meses[i - 1])].dropna()))
                pdoutlier = pd.DataFrame(outlier, columns=[str(mes)], index=colunas_var)
                pdpsi = pd.DataFrame(apsi, columns=[str(mes)], index=colunas_var)
                analises[0] = pd.concat([analises[0], pdstd], axis=1)
                analises[1] = pd.concat([analises[1], pdmedia], axis=1)
                analises[2] = pd.concat([analises[2], pdmissing], axis=1)
                analises[3] = pd.concat([analises[3], pdoutlier], axis=1)
                analises[4] = pd.concat([analises[4], pdpsi], axis=1)
                analises[5] = pd.concat([analises[5], pdcv], axis=1)
            # Analyzing the extracted statistics
            lista_histogramas = set()  # avoid printing the same histogram several times
            for i, analise in enumerate(analises):
                dsvpd = pd.DataFrame([], columns=['DESVPAD_FEATURE'])
                meanpd = pd.DataFrame([], columns=['MEAN_FEATURE'])
                for ind in analise.index:
                    dsvpd.loc[ind] = analise.loc[ind].std()
                    meanpd.loc[ind] = analise.loc[ind].mean()
                    # flag features whose monthly statistics vary too much (or suspiciously
                    # little), skipping the missing (2) and outlier (3) sheets
                    if (dsvpd.loc[ind].values / meanpd.loc[ind].values >= dsvpd_thr
                            or dsvpd.loc[ind].values / meanpd.loc[ind].values <= 0.03) \
                            and i not in (2, 3):
                        lista_histogramas.add(ind)
                analises[i] = pd.concat([analise, dsvpd, meanpd], axis=1)
            # writing the data to Excel
            sheets = ['DESVPAD', 'MEDIA', 'MISSING', 'OUTLIERS', 'PSI', 'COEFF VAR']
            if not transpor:
                for i in range(0, 6):
                    analises[i].to_excel(writer, sheets[i])
                for sh in sheets:
                    worksheet = writer.sheets[sh]
                    for i in range(2, analises[0].shape[0] + 2):
                        coluna_excel = num_to_col_letters(i)
                        worksheet.conditional_format(
                            'B' + str(i) + ':' + num_to_col_letters(analises[0].shape[1] - 1) + str(i),
                            {'type': '3_color_scale',
                             'min_type': 'percent',
                             'mid_type': 'percent',
                             'max_type': 'percent'})
            else:
                for i in range(0, 6):
                    analises[i].transpose().to_excel(writer, sheets[i])
                for sh in sheets:
                    worksheet = writer.sheets[sh]
                    for i in range(2, analises[0].shape[0] + 2):
                        coluna_excel = num_to_col_letters(i)
                        worksheet.conditional_format(
                            coluna_excel + '2' + ':' + coluna_excel + str(1 + len(meses)),
                            {'type': '3_color_scale',
                             'min_type': 'percent',
                             'mid_type': 'percent',
                             'max_type': 'percent'})
            # Saving images
            if not Fast_Analysis:
                worksheet = writer.book.add_worksheet(name="Análises_Imagens")
                row = 0
                for ind in lista_histogramas:
                    col = 0
                    # histograms
                    plt.figure(figsize=(10, 5))
                    try:
                        for mes in meses:
                            sns.distplot(X[ind][(X[coluna_mes] == mes)].dropna())
                    except:
                        print('Error plotting the chart for index', str(ind), 'in month', str(mes))
                    plt.title("Distribution of " + str(ind))
                    imgdata = BytesIO()
                    plt.savefig(imgdata, format="png")
                    imgdata.seek(0)
                    worksheet.insert_image(row, col, "", {'image_data': imgdata})
                    col += 17
                    if not log_fig:
                        plt.close()
                    else:
                        plt.show()
                        plt.close()
                    row += 25
                    col = 0
                # Covariance matrix
                worksheet = writer.book.add_worksheet(name="Covariância")
                correlations = X.corr()
                fig = plt.figure(figsize=(1 + int(len(X.columns) * 0.2401 + 0.8911),
                                          1 + int(len(X.columns) * 0.2401 + 0.8911)))
                ax = fig.add_subplot(111)
                cax = ax.matshow(correlations, vmin=-1, vmax=1)
                fig.colorbar(cax)
                ticks = np.arange(0, len(X.columns), 1)
                ax.set_xticks(ticks - 0.5)
                ax.set_yticks(ticks - 0.5)
                ax.set_xticklabels(X.columns, rotation=90, ma='center', size='medium')
                ax.set_yticklabels(X.columns, ma='center', size='medium')
                imgdata = BytesIO()
                fig.savefig(imgdata, format="png")
                imgdata.seek(0)
                worksheet.insert_image(0, 0, "", {'image_data': imgdata})
                if not log_fig:
                    plt.close()
                else:
                    plt.show()
                    plt.close()
            # Saving the result to an Excel spreadsheet for analysis.
            print("Saving the data as: " + nome_arquivo + ".xlsx")
            writer.save()
    else:
        print("Please use a pandas DataFrame with this function.\nNothing was done.")
def transform(self, X):
    # NOTE: this re-fits the discretizer on every call, so the bin edges are
    # re-learned from X instead of being reused from the training data
    discretizer = KBinsDiscretizer(n_bins=self.__num_bins,
                                   encode=self.__encoder,
                                   strategy=self.__strategy)
    return discretizer.fit_transform(X)
def q2():
    discretizer = KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="quantile")
    pop_bins = discretizer.fit_transform(countries[['Pop_density']])
    return pop_bins[pop_bins == 9].shape[0]
def load_mimic(base_dir: str = './data/'):
    data = pd.read_csv(f'{base_dir}/mimic-ii/full_cohort_data.csv')
    # data.drop('hgb_first')
    fs = [
        'aline_flg',
        'gender_num',
        # 'hosp_exp_flg',
        # 'icu_exp_flg',
        # 'day_28_flg',
        # 'censor_flg',
        'sepsis_flg', 'chf_flg', 'afib_flg',
        'renal_flg', 'liver_flg', 'copd_flg', 'cad_flg', 'stroke_flg',
        'mal_flg', 'resp_flg',
    ]
    features = fs
    data1 = data[fs].values
    imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
    data1 = imp_mean.fit_transform(data1)

    f2 = fs.copy()
    f2.append('day_icu_intime')
    f2.append('service_unit')
    f2.append('day_28_flg')
    f2.append('hospital_los_day')
    f2.append('icu_exp_flg')
    f2.append('hosp_exp_flg')
    f2.append('censor_flg')
    f2.append('mort_day_censored')
    f2 = data.columns.difference(f2)
    data2 = data[f2].values
    imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
    data2 = imp_mean.fit_transform(data2)
    scaler = MinMaxScaler((0, 1))
    data2 = scaler.fit_transform(data2)
    features = features + list(f2)

    est = KBinsDiscretizer(n_bins=3, encode='onehot-dense', strategy='uniform')
    data2d = est.fit_transform(data2)
    f2d = []
    for feature in f2:
        # f2d.append(feature + '_VLOW')
        f2d.append(feature + '_LOW')
        f2d.append(feature + '_NORMAL')
        f2d.append(feature + '_HIGH')
        # f2d.append(feature + '_VHIGH')
    features = fs + f2d
    datax = np.hstack((data1, data2d))

    # datay = data['day_28_flg'].values
    # datay = (data['hospital_los_day']>6).values
    # datay = data['hosp_exp_flg'].values
    datay = (data['day_28_flg'].values + data['hosp_exp_flg'].values +
             data['icu_exp_flg'].values + (1 - data['censor_flg'].values)) > 0
    # datay = data['day_28_flg'].values

    # model = DecisionTreeClassifier(max_depth=3)
    # # model = RandomForestClassifier()
    # scores = cross_val_score(model, datax, datay, cv=10)
    # print(scores.mean())

    # datax = np.vstack((datax, datax[datay==1], datax[datay==1]))
    # datay = np.hstack((datay, datay[datay==1], datay[datay==1]))

    x = torch.FloatTensor(datax)
    y = one_hot(torch.tensor(datay).to(torch.long)).to(torch.float)
    return x, y, features
       [0],
       [0],
       [2],
       [3]])
'''
np.digitize(age, bins=[18])
'''
array([[0],
       [0],
       [1],
       [1],
       [1]])
'''
from sklearn.preprocessing import KBinsDiscretizer

# split into 4 bins
kb = KBinsDiscretizer(4, encode='ordinal', strategy='quantile')
kb.fit_transform(age)

# return a one-hot encoding
kb = KBinsDiscretizer(4, encode='onehot-dense', strategy='quantile')
kb.fit_transform(age)

# bins of equal width
kb = KBinsDiscretizer(4, encode='onehot-dense', strategy='uniform')
kb.fit_transform(age)
kb.bin_edges_
# array([array([ 6.  , 20.75, 35.5 , 50.25, 65.  ])], dtype=object)
def q2():
    kbins = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
    intervals = kbins.fit_transform(countries[['Pop_density']])
    return int((intervals >= 9).sum())
def load_vDem(base_dir='./data'):
    data = pd.read_csv(f'{base_dir}/vdem/V-Dem-CY-Core-v10.csv')
    data['country_name_year'] = data['country_name'] + '_' + data['year'].astype(str)
    data_2000 = data[data['year'] > 2000].iloc[:, 12:-1].dropna(axis=1)

    high_level_indicators = [
        'v2x_polyarchy',
        # 'v2x_libdem',
        # 'v2x_partipdem',
        'v2x_delibdem',
        'v2x_egaldem',
    ]
    mid_level_indicators = [
        'v2x_api',
        'v2x_mpi',
        'v2x_freexp_altinf',
        'v2x_frassoc_thick',
        'v2x_suffr',
        'v2xel_frefair',
        'v2x_elecoff',
        # 'v2x_liberal',
        'v2xcl_rol',
        # 'v2x_jucon',
        # 'v2xlg_legcon',
        # 'v2x_partip',
        'v2x_cspart',
        # 'v2xdd_dd',
        # 'v2xel_locelec',
        # 'v2xel_regelec',
        'v2xdl_delib',
        'v2x_egal',
        'v2xeg_eqprotec',
        'v2xeg_eqaccess',
        'v2xeg_eqdr',
    ]

    # drop_list = ['codelow', 'codehigh', 'sd', 'osp', 'nr', 'mean']
    low_level_indicators = []
    for f in data_2000.columns:
        if f.endswith('_ord') and f not in high_level_indicators and f not in mid_level_indicators:
            low_level_indicators.append(f)

    low_level_indicators_continuous = []
    for f in data_2000.columns:
        # parentheses ensure the suffix check applies before the exclusion check
        if (f.endswith('_codehigh') or f.endswith('_codelow')) and \
                f not in high_level_indicators and f not in mid_level_indicators:
            low_level_indicators_continuous.append(f)

    print(f'Main {len(high_level_indicators)} - Area {len(mid_level_indicators)} - Raw {len(low_level_indicators)}')

    data_low_continuous = data_2000[low_level_indicators_continuous]

    data_low_raw = data_2000[low_level_indicators]
    one_hots = []
    for indicator in low_level_indicators:
        c = data_low_raw[indicator].values
        n_bins = int(c.max())
        kbin = KBinsDiscretizer(n_bins=n_bins, encode='onehot-dense', strategy='uniform')
        c1h = kbin.fit_transform(c.reshape(-1, 1))
        one_hots.append(c1h)

    new_indicator_names = []
    for clist, cname in zip(one_hots, low_level_indicators):
        if clist.shape[1] > 1:
            for i in range(clist.shape[1]):
                new_indicator_names.append(f'{cname}_{i}')
        else:
            new_indicator_names.append(f'{cname}')

    data_low = pd.DataFrame(np.hstack(one_hots), columns=new_indicator_names)
    data_mid = data_2000[mid_level_indicators] > 0.5
    data_high = data_2000[high_level_indicators].iloc[:, 0] > 0.5

    # data_mid = pd.DataFrame(np.hstack([data_low, data_mid]), columns=data_low.columns.append(data_mid.columns))
    # scores = cross_val_score(LogisticRegression(), data_mid.values, data_high.values, cv=10)
    # print(scores.mean())
    # scores = cross_val_score(DecisionTreeClassifier(), data_mid.values, data_high.values, cv=10)
    # print(scores.mean())
    # scores = cross_val_score(RandomForestClassifier(), data_mid.values, data_high.values, cv=10)
    # print(scores.mean())

    x = torch.FloatTensor(data_low.values)
    c = torch.FloatTensor(data_mid.values)
    y = one_hot(torch.tensor(data_high.values).to(torch.long)).to(torch.float)
    return x, c, y, data_mid.columns
def preprocessing_discrete(data_path, img, pctl, feat_list_all, batch, test):
    img_path = data_path / 'images' / img
    stack_path = img_path / 'stack' / 'stack.tif'

    # load cloudmasks
    clouds_dir = data_path / 'clouds'

    with rasterio.open(str(stack_path), 'r') as ds:
        data = ds.read()
        data = data.transpose((1, -1, 0))
        data[data == -999999] = np.nan
        data[np.isneginf(data)] = np.nan
        data_vector = data.reshape([data.shape[0] * data.shape[1], data.shape[2]])
        data_vector = data_vector[~np.isnan(data_vector).any(axis=1)]

    # Get indices of non-nan values
    nans = np.sum(data, axis=2)
    data_ind = np.where(~np.isnan(nans))
    rows, cols = data_ind

    # Discretize continuous features
    cts_feats = ['GSW_distSeasonal', 'aspect', 'curve', 'elevation', 'hand', 'slope',
                 'spi', 'twi', 'sti']
    non_cts_feats = ['developed', 'forest', 'planted', 'wetlands', 'openspace', 'carbonate',
                     'noncarbonate', 'akl_intrusive', 'silicic_resid', 'silicic_resid',
                     'extrusive_volcanic', 'colluvial_sed', 'glacial_till_clay',
                     'glacial_till_loam', 'glacial_till_coarse', 'glacial_lake_sed_fine',
                     'glacial_outwash_coarse', 'hydric', 'eolian_sed_coarse',
                     'eolian_sed_fine', 'saline_lake_sed', 'alluv_coastal_sed_fine',
                     'coastal_sed_coarse', 'GSW_perm', 'flooded']

    feats_disc = []
    all_edges = pd.DataFrame([])

    def discretize_feature(feat_name, prefix, n_bins):
        """Quantile-bin one continuous feature into one-hot columns, re-embed the
        result at the non-nan (row, col) positions of the image array, and return
        the embedded array, the lower bin edges, and the fitted discretizer."""
        discretizer = KBinsDiscretizer(n_bins=n_bins, encode='onehot-dense', strategy='quantile')
        disc = discretizer.fit_transform(
            data_vector[:, feat_list_all.index(feat_name)].reshape(-1, 1))
        for i in range(n_bins):
            feats_disc.append(prefix + str(i + 1))
        disc_nan = np.full(data[:, :, 0:n_bins].shape, np.nan)
        for b in range(n_bins):
            disc_nan[rows, cols, b] = disc[:, b]
        edges = [edge for arr in discretizer.bin_edges_ for edge in arr[:-1]]
        return disc_nan, pd.DataFrame(edges), discretizer

    # GSW_distSeasonal, elevation, slope, twi, spi: 5 quantile bins each; sti: 2 bins
    GSW_distSeasonal_disc, edges, discretizer = discretize_feature('GSW_distSeasonal', 'GSW_distSeasonal_', 5)
    all_edges = pd.concat([all_edges, edges], axis=0)
    elevation_disc, edges, discretizer = discretize_feature('elevation', 'elevation', 5)
    all_edges = pd.concat([all_edges, edges], axis=0)
    slope_disc, edges, discretizer = discretize_feature('slope', 'slope', 5)
    all_edges = pd.concat([all_edges, edges], axis=0)
    twi_disc, edges, discretizer = discretize_feature('twi', 'twi', 5)
    all_edges = pd.concat([all_edges, edges], axis=0)
    spi_disc, edges, discretizer = discretize_feature('spi', 'spi', 5)
    all_edges = pd.concat([all_edges, edges], axis=0)
    sti_disc, edges, discretizer = discretize_feature('sti', 'sti', 2)
    all_edges = pd.concat([all_edges, edges], axis=0)

    # Curve (flat, convex, concave)
    convex = np.zeros((data_vector.shape[0],))
    concave = np.zeros((data_vector.shape[0],))
    flat = np.zeros((data_vector.shape[0],))
    convex[np.where(data_vector[:, feat_list_all.index('curve')] < 0)] = 1
    concave[np.where(data_vector[:, feat_list_all.index('curve')] > 0)] = 1
    flat[np.where(data_vector[:, feat_list_all.index('curve')] == 0)] = 1
    names = ['convex', 'concave', 'flat']
    bins = len(names)
    for name in names:
        feats_disc.append(name)
    curve = np.column_stack([convex, concave, flat])
    disc_nan = np.full(data[:, :, 0:curve.shape[1]].shape, np.nan)
    for b in range(bins):
        disc_nan[rows, cols, b] = curve[:, b]
    curve = disc_nan
    del disc_nan, convex, concave, flat
    # curve has no quantile edges of its own; the bookkeeping below reuses the
    # edges of the last fitted discretizer (sti)
    edges = [edge for arr in discretizer.bin_edges_ for edge in arr[:-1]]
    all_edges = pd.concat([all_edges, pd.DataFrame(edges)], axis=0)

    # Aspect: one-hot encode the eight compass directions
    aspect_vals = data_vector[:, feat_list_all.index('aspect')]
    names = ['north', 'northeast', 'east', 'southeast', 'south', 'southwest', 'west', 'northwest']
    bins = len(names)
    for name in names:
        feats_disc.append(name)
    directions = []
    # north wraps around 0/360 degrees, so it is a logical OR of the two ends
    directions.append(np.logical_or(aspect_vals >= 337.5, aspect_vals < 22.5).astype(float))
    for low, high in [(22.5, 67.5), (67.5, 112.5), (112.5, 157.5), (157.5, 202.5),
                      (202.5, 247.5), (247.5, 292.5), (292.5, 337.5)]:
        directions.append(np.logical_and(aspect_vals >= low, aspect_vals < high).astype(float))
    aspect = np.column_stack(directions)
    disc_nan = np.full(data[:, :, 0:aspect.shape[1]].shape, np.nan)
    for b in range(bins):
        disc_nan[rows, cols, b] = aspect[:, b]
    aspect = disc_nan
    del disc_nan, directions
    edges = [edge for arr in discretizer.bin_edges_ for edge in arr[:-1]]
    all_edges = pd.concat([all_edges, pd.DataFrame(edges)], axis=0)

    # Get original discrete features
    orig_disc_inds = []
    for feat in non_cts_feats:
        orig_disc_inds.append(feat_list_all.index(feat))
    orig_disc_data = data[:, :, orig_disc_inds]

    # Combine with new discrete features
    new_disc_data = np.dstack([GSW_distSeasonal_disc, elevation_disc, slope_disc,
                               twi_disc, spi_disc, sti_disc, curve, aspect])
    data = np.dstack([new_disc_data, orig_disc_data])
    del orig_disc_data, new_disc_data

    edges = [edge for arr in discretizer.bin_edges_ for edge in arr[:-1]]
    all_edges = pd.concat([all_edges, pd.DataFrame(edges)], axis=0)

    # Combine all edges and features
    feature_edges = pd.concat([all_edges, pd.DataFrame(data=feats_disc)], axis=1)
    feature_edges.columns = ['edge', 'feature']

    # If a feat has only zeros or 1s in test OR train set, it is removed from both
    # Check train set
    clouds = np.load(clouds_dir / '{0}'.format(img + '_clouds.npy'))
    clouds[np.isnan(data[:, :, 0])] = np.nan
    cloudmask = np.less(clouds, np.nanpercentile(clouds, pctl), where=~np.isnan(clouds))
    data_train = data.copy()
    data_train[cloudmask] = -999999
    data_train[data_train == -999999] = np.nan
    data_vector_train = data_train.reshape([data.shape[0] * data_train.shape[1], data_train.shape[2]])
    data_vector_train = data_vector_train[~np.isnan(data_vector_train).any(axis=1)]
    train_std = data_vector_train[:, 0:data_vector_train.shape[1] - 2].std(0)
    del data_train, data_vector_train

    # Check test set
    clouds = np.load(clouds_dir / '{0}'.format(img + '_clouds.npy'))
    clouds[np.isnan(data[:, :, 0])] = np.nan
    cloudmask = np.less(clouds, np.nanpercentile(clouds, pctl), where=~np.isnan(clouds))
    data_test = data.copy()
    data_test[cloudmask] = -999999
    data_test[data_test == -999999] = np.nan
    data_vector_test = data_test.reshape([data.shape[0] * data_test.shape[1], data_test.shape[2]])
    data_vector_test = data_vector_test[~np.isnan(data_vector_test).any(axis=1)]
    test_std = data_vector_test[:, 0:data_vector_test.shape[1] - 2].std(0)
    del data_test, data_vector_test

    remove_inds = []
    if 0 in train_std.tolist():
        zero_inds = np.where(train_std == 0)[0].tolist()
        for ind in zero_inds:
            remove_inds.append(ind)
    if 0 in test_std.tolist():
        zero_inds = np.where(test_std == 0)[0].tolist()
        for ind in zero_inds:
            remove_inds.append(ind)
    remove_inds = np.unique(remove_inds).tolist()

    # Mask clouds
    clouds = np.load(clouds_dir / '{0}'.format(img + '_clouds.npy'))
    clouds[np.isnan(data[:, :, 0])] = np.nan
    if test:
        cloudmask = np.greater(clouds, np.nanpercentile(clouds, pctl), where=~np.isnan(clouds))
    if not test:
        cloudmask = np.less(clouds, np.nanpercentile(clouds, pctl), where=~np.isnan(clouds))

    # And mask clouds
    data[cloudmask] = -999999
    data[data == -999999] = np.nan

    # Get indices of non-nan values. These are the indices of the original image array
    nans = np.sum(data, axis=2)
    data_ind = np.where(~np.isnan(nans))

    # Create data vector
    data_vector = data.reshape([data.shape[0] * data.shape[1], data.shape[2]])
    data_vector = data_vector[~np.isnan(data_vector).any(axis=1)]

    feat_list_stack = feats_disc + feat_list_all
    remove_feats = [feat_list_stack[ind] for ind in remove_inds]
    data_vector = np.delete(data_vector, remove_inds, axis=1)
    feat_keep = [x for x in feat_list_all if x not in remove_feats]
    feature_edges_keep = feature_edges[~feature_edges.feature.isin(remove_feats)]

    # Save feature class bin edges
    if test:
        filedir = data_path / batch / 'class_bins' / 'test'
    else:
        filedir = data_path / batch / 'class_bins' / 'train'
    try:
        filedir.mkdir()
    except FileExistsError:
        pass
    filename = filedir / '{}'.format('feature_edges_' + str(pctl) + '.csv')
    feature_edges_keep.to_csv(filename, index=False)

    return data, data_vector, data_ind, feat_keep, feature_edges_keep
def fit(self, data: pd.DataFrame, target='class', n_bins=3, strategy='quantile'):
    """
    Induce a ruleset from the given set of instances.

    Parameters
    ----------
    data : DataFrame
        Input training dataset used to induce the rules.
    target : str, default='class'
        Name of the attribute that represents class labels.
    n_bins : int, default=3
        Number of bins that numeric attributes will be discretized to.
    strategy : {'uniform', 'quantile', 'kmeans'}, default='quantile'
        Strategy used to define the widths of the bins.
        See `sklearn.preprocessing.KBinsDiscretizer` for more info.

    Attributes
    ----------
    ruleset_ : list
        The list of rules induced from the dataset.
    target_ : str
        Name of the attribute that represents class labels.
    majority_ : any
        Label with the largest amount of instances.
    """
    # discretize numerical attributes if there are any
    num_attr = data.select_dtypes(include=['number']).columns
    num_attr = num_attr.drop(target) if target in num_attr else num_attr
    if len(num_attr) > 0:
        data = data.copy()
        discretizer = KBinsDiscretizer(n_bins=n_bins, strategy=strategy, encode='ordinal')
        with warnings.catch_warnings():
            # sometimes bins are so small that they are merged together;
            # that is ok, so we do not want to worry about that warning
            warnings.filterwarnings('ignore', category=UserWarning)
            data[num_attr] = discretizer.fit_transform(data[num_attr]).astype(np.int8)
        bin_edges = discretizer.bin_edges_

    data = data.drop_duplicates(data.columns.drop(target))
    # count how many instances each class has
    classes, counts = np.unique(data[target], return_counts=True)

    ruleset = []
    all_attr = data.columns.drop(target)
    # prepare a progress bar if user has the module
    pbar = tqdm(total=len(data)) if tqdm is not None else None

    # main loop - generate rules for each class
    for label, unclass_count in zip(classes, counts):
        instance_set = data
        total_tp = unclass_count
        while unclass_count > 0:
            rule = Rule(label=label)
            unused_attr = list(all_attr)
            rule_coverage = instance_set
            precision = 0
            # construct the rule by adding selectors to the antecedent
            while len(unused_attr) > 0 and precision != 1:
                precision, best_tp = 0, 0
                best_attr, best_value = None, None
                best_selector = None
                # look for the best attribute-value pair in terms of precision
                for attr in unused_attr:
                    for value in rule_coverage[attr].unique():
                        selector = rule_coverage[attr].values == value
                        tp = (rule_coverage[target].values[selector] == label).sum()
                        tp_fp = selector.sum()
                        selector_precision = tp / tp_fp
                        if selector_precision > precision or \
                                selector_precision == precision and tp > best_tp:
                            precision = selector_precision
                            best_attr, best_value = attr, value
                            best_tp = tp
                            best_selector = selector
                rule_coverage = rule_coverage[best_selector]
                unused_attr.remove(best_attr)
                # append the best selector to the antecedent of the rule
                if best_attr in num_attr:
                    idx = num_attr.get_loc(best_attr)
                    edges = bin_edges[idx]
                    if best_value == 0:
                        # lower interval
                        rule.antecedent.append((best_attr, '<', edges[1]))
                    elif best_value == len(edges) - 2:
                        # higher interval
                        rule.antecedent.append((best_attr, '>=', edges[-2]))
                    else:
                        # anything in between
                        rule.antecedent.append((best_attr, '>=', edges[best_value]))
                        rule.antecedent.append((best_attr, '<', edges[best_value + 1]))
                else:
                    rule.antecedent.append((best_attr, '==', best_value))

            rule.label = label
            rule.precision = precision
            rule.recall = best_tp / total_tp
            ruleset.append(rule)
            instance_set = instance_set.drop(rule_coverage.index)
            unclass_count -= best_tp
            # update progress bar
            if pbar is not None:
                pbar.update(best_tp)

    self.ruleset_ = ruleset
    self.target_ = target
    self.majority_ = data[target].mode().values[0]
    return self
ALPHA = 1
ALPHA_DECAY = 1  # 1 0.9 # 0.9999 #0.9975
ALPHA_MIN = 0.0001

ssc = StandardScaler()
ssc = MinMaxScaler()  # overrides the StandardScaler above; MinMaxScaler is used
digitizer = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='kmeans')

data = load_breast_cancer()
train_data = data.data
y = data.target
labels = y
z = np.where(y == 1, 100, -100)
# oe = OneHotEncoder(handle_unknown='ignore', sparse=False)
# train_labels = oe.fit_transform(y.reshape(-1,1))

# digitizer followed by MinMaxScaler
train_data = digitizer.fit_transform(train_data)
ssc = MinMaxScaler()
train_data = ssc.fit_transform(train_data)


class DQN():
    def __init__(self, input_size, output_size, data, labels):
        self.model = self.create_model(input_size, output_size)
        self.target_model = self.create_model(input_size, output_size)
        self.obs = data
        self.labels = labels
        self.replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE)

    def create_model(self, input_size, output_size):
def _prep_data(self, json):
    # read the file
    cmp = pd.read_csv(self.file)
    # save the user information
    user = json['Company name']
    # merging input user details
    df_json = pd.DataFrame(json, index=[0])
    df_json.drop(['Name'], axis=1, inplace=True)
    cmp_final = pd.concat([cmp, df_json], axis=0, sort=True).reset_index()
    cmp_final.fillna(0, inplace=True)
    # drop out the policy details
    cmp2 = cmp_final.drop(self.insurance_details, axis=1)
    # drop out variables not needed for model fitting
    cmp2 = cmp2.drop(["Company name", "Total # of employees", "Business Start year"], axis=1)
    # feature extraction of address
    cmp2['Address_group'] = cmp2['Address'].astype(str).str[:2]
    cmp2.drop('Address', axis=1, inplace=True)

    binscategorical = [
        'Number of employees full time', 'Number of employees parttime',
        '# of years experience in Industry', 'Projected Annual Revenue',
        'Projected Payroll for employers in next 12 months '
    ]
    # binning continuous variables
    kbd = KBinsDiscretizer(n_bins=10, encode='ordinal',
                           strategy='quantile')  # read documentation for encode and strategy
    dfkbd = pd.DataFrame(kbd.fit_transform(cmp2[binscategorical]),
                         columns=['kbd_' + x for x in binscategorical])
    cmp_bins = pd.concat([cmp2, dfkbd], axis=1)
    cmp_bins = cmp_bins.drop(binscategorical, axis=1)

    # feature engineering for categorical variables - get_dummies
    categorical_new = [
        'Industry', 'Type of Ownership', 'Address_group',
        'kbd_Number of employees full time', 'kbd_Number of employees parttime',
        'kbd_# of years experience in Industry', 'kbd_Projected Annual Revenue',
        'kbd_Projected Payroll for employers in next 12 months '
    ]
    cmp_bins[categorical_new] = cmp_bins[categorical_new].astype(str)
    cmp_catdumm = pd.get_dummies(cmp_bins[categorical_new], drop_first=True)  # dummy_na=True
    cmp_catdumm = pd.concat([cmp_bins, cmp_catdumm], axis=1)
    cmp_catdumm = cmp_catdumm.drop(categorical_new, axis=1)

    # Scaling numerical variables
    numericvars = ['Number of owners active ', 'Number of locations']
    ss = StandardScaler(with_mean=True, with_std=True)
    cmp_catdummss = pd.DataFrame(ss.fit_transform(cmp_catdumm[numericvars]),
                                 columns=['ss_' + x for x in numericvars])
    cmp_catdummss = pd.concat([cmp_catdumm, cmp_catdummss], axis=1)
    cmp_catdummss = cmp_catdummss.drop(numericvars, axis=1)

    return self._inference(cmp_final, cmp_catdummss, user)
def main(name, output_dir, transparent, context, style, palette, width, height,
         aspect, dpi, extension, seed):
    num_samples = 256
    random_state = np.random.RandomState(seed)

    # preamble
    if height is None:
        height = width / aspect
    # height *= num_iterations
    # figsize = size(width, aspect)
    figsize = (width, height)
    suffix = f"{width*dpi:.0f}x{height*dpi:.0f}"

    rc = {
        "figure.figsize": figsize,
        "font.serif": ["Times New Roman"],
        "text.usetex": True,
    }
    sns.set(context=context, style=style, palette=palette, font="serif", rc=rc)

    output_path = Path(output_dir).joinpath(name)
    output_path.mkdir(parents=True, exist_ok=True)
    # / preamble

    benchmark = Branin()
    bounds = benchmark.get_bounds()
    (low, high), dim = from_bounds(bounds)

    x = random_state.uniform(low=low, high=high, size=(num_samples, dim))
    X = np.expand_dims(x, axis=1)
    y = benchmark(x[::, 0], x[::, 1])

    n_bins = 10
    scaler = KBinsDiscretizer(n_bins=n_bins, encode="ordinal", strategy="quantile")
    z = 1 + scaler.fit_transform(y.reshape(-1, 1)).squeeze()

    frame = pd.DataFrame(data=x).assign(y=y, z=z)

    fig, ax = plt.subplots()
    pd.plotting.parallel_coordinates(frame, class_column="z", use_columns=False,
                                     colormap="turbo", sort_labels=False,
                                     linewidth=0.25, alpha=0.7, ax=ax)
    ax.legend()
    plt.tight_layout()

    for ext in extension:
        fig.savefig(output_path.joinpath(f"foo_{suffix}.{ext}"),
                    dpi=dpi, transparent=transparent)
    plt.show()

    return 0
def main(use_simple_lr_pca_pipeline, kbins_strat, train_split, test_split,
         exclude_pca, hyperparameters, output_size, validation_size, n_process,
         precached_pkl, prestore_data, return_mode, use_simple_lin_reg_pca_pipeline,
         use_simple_lstm, discretize_age, kbins_encoding, num_epochs, num_pca_comp):
    if precached_pkl is not None:
        allData = pkl.load(open(precached_pkl, 'rb'))
        data = allData["data"]
        # clinical_txt_paths = precached_pkl["clinical_txt_paths"]
        ages = allData["ages"]
        testAges = allData["testAges"]
        testData = allData["testData"]
        # test_clinical_txt_paths = precached_pkl["test_clinical_txt_paths"]
    else:
        data, ages, clinical_txt_paths = get_data(split=train_split)
        testData, testAges, test_clinical_txt_paths = get_data(split=test_split)
    return_dict = Dict()
    if prestore_data:
        toStore = Dict()
        toStore.data = data
        toStore.ages = ages
        toStore.clinical_txt_paths = clinical_txt_paths
        toStore.testData = testData
        toStore.testAges = testAges
        toStore.test_clinical_txt_paths = test_clinical_txt_paths
        if return_mode == "age":
            pkl.dump(toStore, open("agePredictionData.pkl", 'wb'))
        elif return_mode == "bpm":
            pkl.dump(toStore, open("bpmPredictionData.pkl", 'wb'))
        return return_mode
    if discretize_age:
        kbins = KBinsDiscretizer(output_size, encode=kbins_encoding, strategy=kbins_strat)
        ages = np.array(ages).reshape(-1, 1)
        ages = kbins.fit_transform(ages)
        return_dict['kbins'] = kbins.bin_edges_
        testAges = np.array(testAges).reshape(-1, 1)
        testAges = kbins.transform(testAges)
        print("KBins used! Edges are: {}".format(kbins.bin_edges_))
    if use_simple_lstm:
        ageScaler = StandardScaler()
        ages = np.array(ages).reshape(-1, 1)
        ages = ageScaler.fit_transform(ages)
        testAges = np.array(testAges).reshape(-1, 1)
        testAges = ageScaler.transform(testAges)
        model = get_lstm()
        x = pad_sequences(data)
        model.fit(x, ages, epochs=num_epochs,
                  validation_split=validation_size,
                  callbacks=get_early_stopping())
        testX = pad_sequences(testData)
        score = model.evaluate(testX, testAges)
        y_pred = model.predict(testX)
        ages = ageScaler.inverse_transform(ages)
        testAges = ageScaler.inverse_transform(testAges)
        mse = mean_squared_error(y_pred, testAges)
        r2 = r2_score(y_pred, testAges)
        print("MSE: {}".format(mse))
        print("R2: {}".format(r2))
        fn = "model_{}_epochs{}.h5".format(return_mode, num_epochs)
        model.save(fn)
        ex.add_artifact(fn)
        return score, mse, r2
    if use_simple_lin_reg_pca_pipeline:
        ages = np.array(ages).reshape(-1, 1)
        testAges = np.array(testAges).reshape(-1, 1)
        data = np.stack(data).reshape(len(data), -1)
        testData = np.stack(testData).reshape(len(testData), -1)
        steps = [
            ('pca', PCA(n_components=num_pca_comp)),
            ('scaler', StandardScaler()),
            ('lin_reg', LinearRegression()),
        ]
        if exclude_pca:
            steps = steps[1:]
        p = Pipeline(steps)
        cv = int(1 / validation_size)
        gridsearch = GridSearchCV(p, hyperparameters,
                                  scoring=make_scorer(r2_score),
                                  cv=cv, n_jobs=n_process)
        gridsearch.fit(data, ages)
        return_dict["gridsearch_best_estimator"] = gridsearch.best_estimator_
        return_dict["best_cv_score"] = gridsearch.best_score_
        print("best cv score was {}".format(gridsearch.best_score_))
        best_pipeline = gridsearch.best_estimator_
        best_pipeline.fit(data, ages)
        y_pred = best_pipeline.predict(data)
        y_pred[y_pred < 0] = 0
        y_pred[y_pred > 90] = 90
        print("train r^2 was {}".format(r2_score(ages, y_pred)))
        y_pred = best_pipeline.predict(testData)
        y_pred[y_pred < 0] = 0
        y_pred[y_pred > 90] = 90
        test_score = mean_squared_error(testAges, y_pred)
        print("test_score: {}".format(test_score))
        print("test r^2 was {}".format(r2_score(testAges, y_pred)))
return_dict["test_score"] = test_score pkl.dump(return_dict, open("predict_{}Exp.pkl".format(return_mode), 'wb')) ex.add_artifact("predict_{}Exp.pkl".format(return_mode)) return test_score, r2_score(testAges, y_pred) if use_simple_lr_pca_pipeline: data = np.stack(data).reshape(len(data), -1) testData = np.stack(testData).reshape(len(testData), -1) steps = [ ('pca', PCA(n_components=num_pca_comp)), ('scaler', StandardScaler()), ('lr', LogisticRegression()), ] if exclude_pca: steps = steps[1:] p = Pipeline(steps) cv = int(1 / validation_size) gridsearch = GridSearchCV(p, hyperparameters, scoring=make_scorer(r2_score), cv=cv, n_jobs=n_process) gridsearch.fit(data, ages) return_dict["gridsearch_best_estimator"] = gridsearch.best_estimator_ return_dict["best_cv_score"] = gridsearch.best_score_ print("best cv score was {}".format(gridsearch.best_score_)) best_pipeline = gridsearch.best_estimator_ best_pipeline.fit(data, ages) y_pred = best_pipeline.predict(data) print("train r^2 was {}".format(r2_score(ages, y_pred))) y_pred = best_pipeline.predict(testData) test_score = f1_score(testAges, y_pred, average="weighted") y_pred_orig = kbins.inverse_transform(y_pred.reshape(-1, 1)) test_ages_orig = kbins.inverse_transform(testAges.reshape(-1, 1)) print("test r^2 was {}".format(r2_score(testAges, y_pred))) print("test mse was {}".format( mean_squared_error(test_ages_orig, y_pred_orig))) print("test_score: f1 {}".format(test_score)) print("test_score: accuracy {}".format(accuracy_score( testAges, y_pred))) return_dict["test_score"] = test_score pkl.dump(return_dict, open("predict_{}Exp.pkl".format(return_mode), 'wb')) ex.add_artifact("predict_{}Exp.pkl".format(return_mode)) return test_score raise Exception("Valid config not set")
# In[539]:

q1()

# ## Question 2
#
# Discretizing the `Pop_density` variable into 10 intervals with `KBinsDiscretizer`, using
# `ordinal` encoding and the `quantile` strategy, how many countries are above the 90th
# percentile? Answer as a single integer scalar.

# In[540]:

# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.KBinsDiscretizer.html
discretizer = KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="quantile")
pop_density_bins = discretizer.fit_transform(countries[["Pop_density"]])
pop_density_bins[:5]

# In[541]:

np.unique(pop_density_bins)

# In[542]:

def q2():
    # Return the result of question 2 here.
    return int((pop_density_bins >= 9).sum())
## impute missing values with mean and transform the test instance
col_names = train_set.columns
imputer = SimpleImputer(strategy='mean')
train_set = imputer.fit_transform(train_set)
test_instance = imputer.transform(test_instance.values.reshape(1, -1))

## normalize features
scaler = StandardScaler()
train_set = scaler.fit_transform(train_set)
test_instance = scaler.transform(test_instance)

## discretize features
disc = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='quantile')
train_set = disc.fit_transform(train_set)
test_instance = disc.transform(test_instance)

## re-assign colnames to train and test (convert back to dataframe and series)
train_set = pd.DataFrame(train_set, columns=col_names)
test_instance = pd.Series(test_instance[0], index=col_names)

## second, drop constant columns to speed up feature selection
train_set = train_set.loc[:, train_set.nunique() != 1]
test_instance = test_instance[train_set.columns]

## feature selection for each target column
selected_features = utils.anova_feature_selection(train_set, targets[target_grp], pval_threshold=0.01)

## union of the selected features across all targets
union_selected_features = reduce(np.union1d,
                                 tuple([feature_list for _, feature_list in selected_features.items()]))
from sklearn.preprocessing import KBinsDiscretizer

df = pd.read_csv(open('../data/magic.csv', 'rb'), header=None)

est = KBinsDiscretizer(n_bins=6, encode='ordinal', strategy='quantile')
cat_df = pd.DataFrame(est.fit_transform(df[df.columns[:10]]), dtype='int')
cat_df['TARGET'] = df[10]
cat_df = cat_df.apply(lambda c: c.astype('category'))

import pickle
pickle.dump(cat_df, open('magic.pkl', 'wb'))
countries.dtypes

# ## Start your analysis from here

# In[30]:

# Your analysis starts here.
list(countries.Region.sort_values().unique())

# In[33]:

discretizer = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
bin_pop_density = discretizer.fit_transform(countries[['Pop_density']])
int(sum(bin_pop_density[:, 0] == 9))

# In[36]:

encoded = pd.get_dummies(countries[['Region', 'Climate']].fillna(''))
int(encoded.shape[1])

# num_pipeline = Pipeline(steps=[
#     ("imputer", SimpleImputer(strategy="median")),
#     ("standart_scaler", StandardScaler())
# ])
#
# numeric_features = countries.select_dtypes(include=['float64', 'int64'])
# num_pipeline.fit(numeric_features)
# test_country_transform = num_pipeline.transform([test_country[2:]])
def q2():
    # Return the result of question 2 here.
    discretizer = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
    discretizer_pop = discretizer.fit_transform(countries[['Pop_density']])
    # the lower edge of the last quantile bin is the 90th percentile
    above_p90 = discretizer.bin_edges_[0][9]
    return int(countries[countries['Pop_density'] >= above_p90]['Pop_density'].count())
# We recall that one way to accelerate gradient boosting is to reduce the
# number of splits considered during tree building. One option is to bin the
# data before feeding it to the gradient boosting model. A transformer called
# `KBinsDiscretizer` performs such a transformation, so we can pipeline this
# preprocessing with the gradient boosting.
#
# We first demonstrate the transformation done by the `KBinsDiscretizer`.

# %%
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

discretizer = KBinsDiscretizer(n_bins=256, encode="ordinal", strategy="quantile")
X_trans = discretizer.fit_transform(X_train)
X_trans

# %% [markdown]
# ```{note}
# The code cell above will generate a couple of warnings. Indeed, for some of
# the features, we requested too many bins with regard to the data dispersion
# of those features. Bins that are too small will be removed.
# ```
# We see that the discretizer transforms the original data into integers. Each
# integer represents the bin index when the quantile-based partition is
# performed. We can check the number of bins per feature.

# %%
[len(np.unique(col)) for col in X_trans.T]
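# %% [markdown]
# Since the text above mentions pipelining this preprocessing with gradient
# boosting, here is a minimal sketch of that idea. It assumes the same
# `X_train`/`y_train` and a regression task; the choice of estimator is an
# assumption for illustration, not necessarily the lesson's exact cell.

# %%
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline

# bin each feature into at most 256 quantile bins, then fit the boosted trees
# on the ordinal-encoded bin indices
model = make_pipeline(
    KBinsDiscretizer(n_bins=256, encode="ordinal", strategy="quantile"),
    GradientBoostingRegressor(),
)
model.fit(X_train, y_train)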
class WOEEncoder(BaseEstimator, TransformerMixin): """Weight of Evidence (WoE) encoder: encodes categorical features as a numerical vector using weight of evidence encoding. This is only supported with binary targets. Both the features and the target are assumed to be free of missing values, missing values should be handled separately before the encoding. A binning function can be provided to handle numerical features which are then binned first then encoded. Note that the sign of the weight of evidence values depends on the order in which the categories of the target column are detected. This does not affect the performance of any supervised model applied thereafter. See [1] for more details on WoE. Parameters ---------- binning: {'uniform', 'quantile', 'kmeans', None}, default=None What binning method to apply, no binning applied if set to None. This uses ScikitLearn's KBinsDiscretizer (see [2]). uniform: all bins in each feature have identical width. quantile: all bins in each feature have the same number of points. kmeans: values in each bin have the same nearest center of a 1D kmeans cluster. n_bins: int (default=10), greater than 2 Number of bins to use when binning is applied. alpha: float (default = 0.5), non-negative Regularization value to avoid numerical errors due to division by zero in the computation of the weight of evidence (e.g. in case the data points corresponding to one category of a feature all have the same target value). laplace: boolean (default = False) If alpha is positive, adds Laplace smoothing to the computation of the weight of evidence. Example ------- >>> import numpy as np >>> from sagemaker_sklearn_extension.preprocessing import WOEEncoder >>> np.random.seed(112) >>> N = 10 >>> y = np.random.choice([0, 1], size=N) >>> y array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0]) >>> sex = np.random.choice(['m', 'f'], size=N) >>> sex array(['m', 'f', 'm', 'm', 'f', 'm', 'f', 'm', 'm', 'm'], dtype='<U1') >>> WOEEncoder().fit_transform(sex.reshape(-1, 1), y) array([[ 1.06087196], [-2.35137526], [ 1.06087196], [ 1.06087196], [-2.35137526], [ 1.06087196], [-2.35137526], [ 1.06087196], [ 1.06087196], [ 1.06087196]]) >>> age = np.random.randint(low=25, high=95, size=N) >>> age array([54, 73, 76, 30, 53, 33, 28, 51, 62, 43]) >>> WOEEncoder(binning='quantile', n_bins=2).fit_transform(age.reshape(-1, 1), y) array([[-0.74193734], [-0.74193734], [-0.74193734], [ 0.69314718], [-0.74193734], [ 0.69314718], [ 0.69314718], [ 0.69314718], [-0.74193734], [ 0.69314718]]) Attributes ---------- binner_: estimator trained to bin numerical data if binning is not None. woe_pairs_: list of pairs (codex, woe) of size n_encoded_features The codex has the mapping feature_value => woe_index and woe has the weight of evidence values. References ---------- [1] https://www.listendata.com/2015/03/weight-of-evidence-woe-and-information.html [2] https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.KBinsDiscretizer.html [3] https://en.wikipedia.org/wiki/Additive_smoothing """ def __init__(self, binning=None, n_bins=10, alpha=0.5, laplace=False): self.binning = binning self.n_bins = n_bins self.alpha = alpha self.laplace = laplace def _woe(self, x, count_y_0, mask_y_0, beta): """Return the categories for a feature vector `x` as well as the corresponding weight of evidence value for each of those categories. Parameters ---------- x: vector, shape (n_samples,) Feature vector to encode. count_y: list of length 2 List of counts for the number of observations with the first (resp. 
        mask_y_0: vector, shape (n_samples,)
            Mask of observations with the first target category.

        beta: float
            Value to use for Laplace smoothing (0 if laplace is False).
        """
        cat_x = np.unique(x)
        mask_y_1 = np.logical_not(mask_y_0)
        count_y_1 = len(mask_y_0) - count_y_0
        # Computation of the Weight of Evidence for a category c in cat_x and
        # with regularization α
        #
        #   woe_c = log( { #(y==0 | x==c) + α / #(y==1 | x==c) + α } *
        #                { #(y==1) + β / #(y==0) + β } )
        #
        # where β = 2α if laplace == True, 0 otherwise. The second factor can
        # be computed once; call it `r10`, then
        #
        #   woe_c = log( r10 * ratio(c) )
        #
        # where
        #
        #   ratio(c) = { #(y==0 | x==c) + α } / { #(y==1 | x==c) + α }
        #
        def ratio(c):
            x_c = x == c
            # retrieve the number of (y == 0 | x == c) and same for y == 1
            y_0_c = sum(np.logical_and(mask_y_0, x_c))
            y_1_c = sum(np.logical_and(mask_y_1, x_c))
            # compute the ratio with regularization for 0 events
            return (y_0_c + self.alpha) / (y_1_c + self.alpha)

        # computation of woe possibly using Laplace smoothing (beta factor)
        r10 = (count_y_1 + beta) / (count_y_0 + beta)
        woe = np.log(r10 * np.array([ratio(c) for c in cat_x]))
        # encoder from unique values of x to index
        codex = {c: i for (i, c) in enumerate(cat_x)}
        return (codex, woe)

    def fit(self, X, y):
        """Fit Weight of Evidence encoder to `X` and `y`.

        Parameters
        ----------
        X: array-like, shape (n_samples, n_features)
            The data to encode.

        y: array-like, shape (n_samples,)
            The binary target vector.

        Returns
        -------
        self: WOEEncoder.
        """
        # Validate parameters
        if self.binning:
            assert self.binning in ("uniform", "quantile", "kmeans"), WOEAsserts.BINNING
            assert self.n_bins >= 2, WOEAsserts.NBINS
        assert self.alpha >= 0, WOEAsserts.ALPHA
        # Validate data
        X, y = check_X_y(X, y)
        # Keep track of number of features encoded
        self._dim = X.shape[1]
        # recover the target categories and check there's only two
        cat_y = np.unique(y)
        # it should be == 2 but relax to <= 2 for a single-sample test by check_estimator
        assert len(cat_y) <= 2, WOEAsserts.BINARY
        # value for laplace smoothing
        beta = 2 * self.alpha * self.laplace
        # count the number of occurrences per target class and form the mask
        # for the rows for which y==0
        mask_y_0 = y == cat_y[0]
        count_y_0 = sum(mask_y_0)
        if self.binning:
            self.binner_ = KBinsDiscretizer(n_bins=self.n_bins, strategy=self.binning, encode="ordinal")
            Xp = self.binner_.fit_transform(X)
        else:
            Xp = X
        # go over each column and compute the woe
        self.woe_pairs_ = list(map(lambda x: self._woe(x, count_y_0, mask_y_0, beta), Xp.T))
        return self

    def transform(self, X):
        """Transform each column of `X` using the Weight-of-Evidence encoding.

        Returns
        -------
        X_encoded: array, shape (n_samples, n_encoded_features)
            Array with each of the encoded columns.
""" # check is fitted check_is_fitted(self, "woe_pairs_") # check input X = check_array(X) if X.shape[1] != self._dim: raise ValueError(f"The input dimension is {X.shape[1]} instead of the expected {self._dim}") if self.binning: Xp = self.binner_.transform(X) else: Xp = X Xe = np.zeros(Xp.shape) for (i, x) in enumerate(Xp.T): codex, woe = self.woe_pairs_[i] # check that the data to encode doesn't have classes yet unseen assert all([e in codex.keys() for e in np.unique(x)]), WOEAsserts.UNSEEN_CAT # construct the encoded column by inverting the codex, if the category # is not recognised (not a key of the codex), a np.nan is inputted Xe[:, i] = np.array([woe[codex[xi]] for xi in x]) return Xe def fit_transform(self, X, y): return self.fit(X, y).transform(X) def _more_tags(self): return {"X_types": ["categorical"], "binary_only": True, "requires_y": True}
from math import ceil

from sklearn.preprocessing import KBinsDiscretizer


def q2():
    # `countries` is assumed to be a DataFrame with a `Pop_density` column,
    # loaded earlier. Bin the column into 10 quantile bins (deciles).
    discretizar = KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="quantile")
    intervalo = discretizar.fit_transform(countries[["Pop_density"]])
    # Count of observations above the 90th percentile, i.e. 10% of the rows.
    resposta = len(intervalo) - (0.9 * len(intervalo))
    return ceil(resposta)
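# An equivalent (hypothetical) formulation that actually uses the discretizer
# output: with strategy="quantile" and 10 bins, bin index 9 holds the values
# above the 90th percentile, so counting its rows gives the answer directly
# (ties on the decile boundary can shift the count slightly).
def q2_alt():
    discretizar = KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="quantile")
    intervalo = discretizar.fit_transform(countries[["Pop_density"]])
    return int((intervalo == 9).sum())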
##########################################
if norm_target == 1:
    # Target normalization for continuous values
    target_np = scale(target_np)

if norm_features == 1:
    # Feature normalization for continuous values
    data_np = scale(data_np)

if binning == 1:
    # Discretize the target variable with KBinsDiscretizer. The strategy matters:
    # 'quantile' creates equal-frequency bins, while 'kmeans' may yield more
    # natural "clusters".
    enc = KBinsDiscretizer(n_bins=[bin_cnt], encode='ordinal', strategy='quantile')
    target_np_bin = enc.fit_transform(target_np.reshape(-1, 1))

    # Collect the target values falling into each bin, then print each bin's
    # min, max, and count
    temp = [[] for _ in range(bin_cnt)]
    for i in range(len(target_np)):
        for j in range(bin_cnt):
            if target_np_bin[i] == j:
                temp[j].append(target_np[i])
    for j in range(bin_cnt):
        print('Bin', j, ':', min(temp[j]), max(temp[j]), len(temp[j]))
    print('\n')

    # Convert the binned target back to a flat array
    target_np = np.ravel(target_np_bin)
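# A more direct cross-check of the loop above (sketch, valid once `enc` has
# been fitted): KBinsDiscretizer stores the learned boundaries in `bin_edges_`,
# one array per feature (here a single target column).
print('Learned bin edges:', enc.bin_edges_[0])  # bin_cnt + 1 edge values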
# We recall that a way to accelerate gradient boosting is to reduce the
# number of splits considered during tree building. One way is to bin the
# data before feeding them to the gradient boosting model. A transformer
# called `KBinsDiscretizer` performs such a transformation. Thus, we can
# pipeline this preprocessing with the gradient boosting.
#
# We can first demonstrate the transformation done by the `KBinsDiscretizer`.

# %%
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

discretizer = KBinsDiscretizer(n_bins=256, encode="ordinal", strategy="quantile")
data_trans = discretizer.fit_transform(data)
data_trans

# %% [markdown]
# ```{note}
# The code cell above will generate a couple of warnings. Indeed, for some of
# the features, we requested too many bins relative to the data dispersion of
# those features. Bins that would be too small are removed.
# ```
# We see that the discretizer transforms the original data into integers.
# Each integer represents the bin index when the distribution by quantile is
# performed. We can check the number of bins per feature.

# %%
[len(np.unique(col)) for col in data_trans.T]
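# %% [markdown]
# As a side note (added here for context, not part of the original notebook):
# scikit-learn's histogram-based gradient boosting performs this kind of
# binning internally, controlled by its `max_bins` parameter (at most 255 bins
# per feature), so the explicit `KBinsDiscretizer` step is mainly illustrative.

# %%
from sklearn.ensemble import HistGradientBoostingRegressor

hist_gbdt = HistGradientBoostingRegressor(max_bins=255)  # internal binning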
ax1.plot(X[:, 0], y, 'o', c='k')
# other plotting options
ax1.legend(loc="best")
ax1.set_ylabel("Regression output")
ax1.set_xlabel("Input feature")
ax1.set_title("Result before discretization")
plt.tight_layout()
plt.show()

from sklearn.preprocessing import KBinsDiscretizer

# bin the data
enc = KBinsDiscretizer(n_bins=10,        # how many bins?
                       encode="onehot")  # alternative: "ordinal"
X_binned = enc.fit_transform(X)
# encode="onehot": discretizes by building dummy variables, returning a sparse
# matrix of shape (m, n_bins) where each column is one bin; for each sample,
# the bin it falls into is marked 1 and all other bins are 0
X.shape
X_binned
# use pandas to inspect the sparse matrix
import pandas as pd
pd.DataFrame(X_binned.toarray()).head()

# We will train the model on the binned data. In sklearn, the test set must
# have the same structure as the training set, otherwise prediction errors out.
LinearR_ = LinearRegression().fit(X_binned, y)
LinearR_.predict(line)  # `line` as test set: fails, its shape doesn't match
line.shape      # test set shape
X_binned.shape  # training set shape
# So we must build a binned test set: bin `line` with the already fitted model
line_binned = enc.transform(line)
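# Completing the step the comments lead up to (sketch): once `line` is binned
# with the same fitted encoder, the model trained on X_binned can predict on it.
LinearR_.predict(line_binned)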
from sklearn.preprocessing import KBinsDiscretizer


def qualify(dataframe, name, quantity, strategy):
    # Replace the numeric column `name` with its ordinal bin index, using
    # `quantity` bins and the given binning strategy.
    est = KBinsDiscretizer(n_bins=[quantity], encode='ordinal', strategy=strategy)
    dataframe[name] = est.fit_transform(dataframe[[name]])
    return dataframe
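# Hypothetical usage, assuming a DataFrame `df` with a numeric "age" column:
# bin the ages into 5 equal-frequency groups, overwriting the column in place.
df = qualify(df, "age", 5, "quantile")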
from sklearn.preprocessing import KBinsDiscretizer

KBTT = KBinsDiscretizer(n_bins=10, encode="onehot-dense")
X_ti = titanic.loc[:, ["pclass", "age", "sex"]]
Y_ti = titanic.loc[:, ["survived"]]
print(X_ti.info())
print(Y_ti.info())
"""
From the info above we plan the following preprocessing tasks:
1. The `age` column has only 633 non-null values; fill in the missing ones
   with the mean or the median.
2. `sex` and `pclass` are categorical columns; convert them to numerical
   features, using 0/1 indicators.
"""
# the info shows that `age` has missing values, so fill them with the mean
X_ti["age"].fillna(X_ti["age"].mean(), inplace=True)
# inplace=False (default): modify and return a copy, leaving the original
# unchanged; inplace=True: modify the original object directly
print(KBTT.fit_transform(X_ti.loc[:, ["age"]]))
print("######################################")
print(X_ti.info())
print("######################################")
Xtraint, Xtestt, Ytraint, Ytestt = train_test_split(X_ti, Y_ti, random_state=33, test_size=0.25)

# Feature transformation: use the transformer from sklearn.feature_extraction
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer(sparse=False)  # sparse=False returns a dense array
# DictVectorizer expects dicts as input, so the DataFrame must be converted.
# DataFrame.to_dict() supports six orientations, selected via its parameter, e.g.:
#   'dict':   {column -> {index -> value}}, accessed as data_dict[key1][key2]
#   'list':   {column -> [values]},         accessed as data_list[key][index]
#   'series': {column -> Series(values)},   accessed as data_series[key1][key2]
#   'split':  {'index': [...], 'columns': [...], 'data': [...]}, accessed as
#             data_split['index'], data_split['data'], data_split['columns']
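# A sketch of the conversion the comments describe: to_dict(orient="records")
# yields one {column: value} dict per row, which is the form DictVectorizer
# consumes. Fit the vocabulary on the training rows only, then reuse it.
X_train_dicts = Xtraint.to_dict(orient="records")
X_test_dicts = Xtestt.to_dict(orient="records")
X_train_vec = vec.fit_transform(X_train_dicts)  # learn feature names on train
X_test_vec = vec.transform(X_test_dicts)        # reuse the fitted vocabulary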
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.tree import DecisionTreeRegressor

print(__doc__)

# construct the dataset
rnd = np.random.RandomState(42)
X = rnd.uniform(-3, 3, size=100)
y = np.sin(X) + rnd.normal(size=len(X)) / 3
X = X.reshape(-1, 1)

# transform the dataset with KBinsDiscretizer
enc = KBinsDiscretizer(n_bins=10, encode='onehot')
X_binned = enc.fit_transform(X)

# predict with original dataset
fig, (ax1, ax2) = plt.subplots(ncols=2, sharey=True, figsize=(10, 4))
line = np.linspace(-3, 3, 1000, endpoint=False).reshape(-1, 1)
reg = LinearRegression().fit(X, y)
ax1.plot(line, reg.predict(line), linewidth=2, color='green',
         label="linear regression")
reg = DecisionTreeRegressor(min_samples_split=3, random_state=0).fit(X, y)
ax1.plot(line, reg.predict(line), linewidth=2, color='red',
         label="decision tree")
ax1.plot(X[:, 0], y, 'o', c='k')
ax1.legend(loc="best")
ax1.set_ylabel("Regression output")
ax1.set_xlabel("Input feature")
ax1.set_title("Result before discretization")