Example #1
def test_inverse_transform(strategy):
    X = np.random.RandomState(0).randn(100, 3)
    kbd = KBinsDiscretizer(n_bins=3, strategy=strategy, encode='ordinal')
    Xt = kbd.fit_transform(X)
    assert_array_equal(Xt.max(axis=0) + 1, kbd.n_bins_)

    X2 = kbd.inverse_transform(Xt)
    X2t = kbd.fit_transform(X2)
    assert_array_equal(X2t.max(axis=0) + 1, kbd.n_bins_)
    assert_array_equal(Xt, X2t)
Example #2
def test_nonuniform_strategies(strategy, expected_2bins, expected_3bins):
    X = np.array([0, 1, 2, 3, 9, 10]).reshape(-1, 1)

    # with 2 bins
    est = KBinsDiscretizer(n_bins=2, strategy=strategy, encode='ordinal')
    Xt = est.fit_transform(X)
    assert_array_equal(expected_2bins, Xt.ravel())

    # with 3 bins
    est = KBinsDiscretizer(n_bins=3, strategy=strategy, encode='ordinal')
    Xt = est.fit_transform(X)
    assert_array_equal(expected_3bins, Xt.ravel())
Example #3
def test_inverse_transform(strategy, encode):
    X = np.random.RandomState(0).randn(100, 3)
    kbd = KBinsDiscretizer(n_bins=3, strategy=strategy, encode=encode)
    Xt = kbd.fit_transform(X)
    X2 = kbd.inverse_transform(Xt)
    X2t = kbd.fit_transform(X2)
    if encode == 'onehot':
        assert_array_equal(Xt.todense(), X2t.todense())
    else:
        assert_array_equal(Xt, X2t)
    if 'onehot' in encode:
        Xt = kbd._encoder.inverse_transform(Xt)
        X2t = kbd._encoder.inverse_transform(X2t)

    assert_array_equal(Xt.max(axis=0) + 1, kbd.n_bins_)
    assert_array_equal(X2t.max(axis=0) + 1, kbd.n_bins_)
Example #4
def test_overwrite():
    X = np.array([0, 1, 2, 3])[:, None]
    X_before = X.copy()

    est = KBinsDiscretizer(n_bins=3, encode="ordinal")
    Xt = est.fit_transform(X)
    assert_array_equal(X, X_before)

    Xt_before = Xt.copy()
    Xinv = est.inverse_transform(Xt)
    assert_array_equal(Xt, Xt_before)
    assert_array_equal(Xinv, np.array([[0.5], [1.5], [2.5], [2.5]]))
Example #5
# Using sum of squared distance (SSD) to pick k (elbow method)
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import KBinsDiscretizer

def ssd_plot(dataframe, col, start_k, end_k):  # e.g. ssd_plot(df, "balance", 2, 7)
    res = pd.DataFrame(columns=["k", "ssd"])
    for k in range(start_k, end_k):
        model_clus = KMeans(n_clusters=k, max_iter=100)
        model_clus.fit(np.array(dataframe[col]).reshape((len(dataframe[col]), 1)))
        # record the inertia (SSD) for this k
        res = pd.concat([res, pd.DataFrame([{"k": k, "ssd": model_clus.inertia_}])],
                        ignore_index=True)
    sns.lineplot(x="k", y="ssd", data=res)

# v5: Discretizing immediately using user-defined k
def discre_cols(dataframe, col_to_disc, k):
  discretizer = KBinsDiscretizer(n_bins=k, encode='ordinal', strategy='kmeans')
  for i in col_to_disc:
    res_col = np.array(dataframe[i]).reshape((len(dataframe[i]), 1))
    res2_col = discretizer.fit_transform(res_col)
    dataframe[i + "_disc"] = res2_col.ravel()
    # drop in place so the caller's dataframe is actually updated
    dataframe.drop(columns=[i], inplace=True)
discre_cols(df, col_to_disc, 3)
  
## Using iloc to select rows
df.iloc[0:3] # Returns the first three rows (positions 0, 1, 2)
df = df.iloc[1:] # Select row 2 onwards
df.iloc[2:, -3:] # Returns from row 3 onwards, columns 3rd last to last
df.iloc[(df['Age'] < 30).values, [1, 3]] # Boolean mask must be a NumPy array; columns must be given as integer positions

## Using loc to select specific rows based on condition(s)
# https://www.shanelynn.ie/select-pandas-dataframe-rows-and-columns-using-iloc-loc-and-ix/
df.set_index("last_name", inplace=True) # Setting column ["last_name"] as index
name = ["Andreas", "Veness"]
df.loc[name] # Returns df with index values "Andreas" and "Veness"
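
# A small sketch of condition-based selection with .loc (the 'Age' column from the
# iloc example above is assumed to exist in this DataFrame):
df.loc[df['Age'] < 30]           # all columns for the rows matching the condition
df.loc[df['Age'] < 30, ['Age']]  # same rows, restricted to specific column(s)
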
# Transformation by which we convert a continuous variable into a categorical one

# #### Binarizer
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Binarizer.html
#
# Note: the example is skipped here since it is very simple (a minimal sketch follows)
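#
# A minimal sketch of Binarizer (added for illustration; the threshold of 70 is an
# arbitrary assumption, not taken from the original notebook):

# +
from sklearn.preprocessing import Binarizer

binarizer = Binarizer(threshold=70)  # values > 70 map to 1, the rest to 0
display(binarizer.fit_transform(df[['weight']].dropna())[:5])
# -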

# ##### KBinsDiscretizer
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.KBinsDiscretizer.html

# +
enc = KBinsDiscretizer(n_bins=4, encode='ordinal')

_df = df[['weight']].dropna().reset_index(drop=True)
X_binned = enc.fit_transform(_df)
X_binned = pd.DataFrame(X_binned.astype(int), columns=['weight_bins'])
result = pd.concat([_df, X_binned], axis=1)

display(result.head(10))
print("Límites bins:", enc.bin_edges_)
# -

# ##### pd.qcut

# +
# same example with pandas
_df = df.copy()

result, bins = pd.qcut(_df['weight'], 4, labels=[0, 1, 2, 3], retbins=True)
_df['weight_bins'] = result
Example #7
def test_inverse_transform(strategy, encode, expected_inv):
    kbd = KBinsDiscretizer(n_bins=3, strategy=strategy, encode=encode)
    Xt = kbd.fit_transform(X)
    Xinv = kbd.inverse_transform(Xt)
    assert_array_almost_equal(expected_inv, Xinv)
plt.ylabel("Average price")
plt.show()

columns = ['accommodates','bathrooms','bedrooms','guests_included','beds','price']
sb.heatmap(listings[columns].corr(),annot=True)
plt.show()

#Analysis of rating for various room types based on is_location_exact
sb.violinplot("room_type", "review_scores_rating", hue="is_location_exact", data=listings,palette='rainbow')
plt.show()

# binning the price column 

bins = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')

listings['price']=bins.fit_transform(listings[['price']])

print(listings['price'].unique())

# printing the number of values by count
print(listings["price"].value_counts())

plt.figure(figsize=(8,6))
sb.countplot(listings["price"])
plt.show()

#first converting the last_review column to datetime and then subtracting it from today's date to calculate the number of days
listings['last_review'] = pd.to_datetime(listings['last_review'])

listings['no_days'] = (pd.Timestamp.now().normalize() - listings['last_review']).dt.days
Example #9
def analise_temporal_base(X,
                          X_perc=0.25,
                          coluna_target=None,
                          coluna_mes='mes',
                          dsvpd_thr=0.25,
                          nome_arquivo="analise_dados_mes",
                          MI=False,
                          Fast_Analysis=False,
                          log_cat=False,
                          log_fig=False,
                          transpor=False,
                          subs_miss_espec=True,
                          vetor_esp_missing=[-1, -9, -99, -999, -999999999]):
    """
    #Análise Temporal de Bases#
    Args:
        X: Base para analise (requerido pandas DataFrame).
        X_perc: Amostragem da base inicial que será utilizado (0 < X_perc <= 1). Análise fica mais rápida, caso realizado desta maneira.
        coluna_target: Coluna da base onde está o target, se None, análise por target não será feita.
        coluna_mes:  coluna onde os dados são separados mensalmente, padrão "NUM_MES_REF".
        dsvpd_thr: Limiar mensal (threshold) da variação de coeficiente (detector de problemas) 
        nome_arquivo: Nome do arquivo Excel que será salvo.
        MI: Mutual Information (true - retorna, False - não retorna)
        Fast_Analysis: Trabalhando com uma análise pontual e mais rápida sem tantos detalhes.
        log_cat: Logs em texto no Jupyter/terminal.
        log_fig: Plot de figuras no Jupyter ou não.
        transpor: Transpor informações nas planilhas
        subs_miss_espec: Substituição de missing especial por np.nan, sim ou não
        vetor_esp_missing: Vetor de Substituição de Missings Especiais, se subs_miss_espec == True
        
    Returns:
        Arquivo planilha Excel com dados consolidados.
    
    Author:
        Vinícius Ormenesse
    """
    warnings.filterwarnings('ignore')
    if str(type(X)) == "<class 'pandas.core.frame.DataFrame'>":
        writer = pd.ExcelWriter(nome_arquivo + '.xlsx', engine='xlsxwriter')
        if X_perc > 1:
            X_perc = 1
            print(
                'Percentual de amostragem incorreto, o percentual utilizado nesta análise será de 100% da amostra.'
            )
        elif X_perc <= 0:
            X_perc = 0.1
            print(
                'Percentual de amostragem incorreto, o percentual utilizado nesta análise será de 10% da amostra.'
            )
        if dsvpd_thr > 1:
            dsvpd_thr = 1
            print(
                'dsvpd_thr incorreto, o percentual utilizado nesta análise será de 100% da amostra.'
            )
        elif dsvpd_thr <= 0:
            dsvpd_thr = 0.1
            print(
                'dsvpd_thr incorreto, o percentual utilizado nesta análise será de 10% da amostra.'
            )
        X = X.sample(
            frac=X_perc
        )  # subsample the input so the analysis runs faster.
        meses = X[coluna_mes].fillna(method='ffill').unique()
        meses.sort()
        # Replacing special missing codes with real NaNs.
        if subs_miss_espec:
            print('Transformando Missings Especiais')
            X = X.replace(vetor_esp_missing,
                          np.full(len(vetor_esp_missing), np.nan))
        # handling categorical columns
        categoricas = []
        for cols in X.columns:
            if cols is not coluna_mes:
                if str(X[cols].dtypes) == 'object':
                    if log_cat == True:
                        print("Ordem das classes de label para a coluna: " +
                              cols)
                        print('Antes:')
                        print(X[cols].unique())
                    categoricas.append(cols)
                    le = LabelEncoder()
                    X[cols] = le.fit_transform(X[cols].astype(str))
                    if log_cat == True:
                        print('Depois')
                        print(list(le.classes_))
        # Starting the analysis for datasets that have a target variable.
        if coluna_target is not None:
            colunas_var = X.columns.tolist()
            colunas_var.remove(coluna_mes)
            colunas_var.remove(coluna_target)
            # for the variables
            """
                0 - desvpad
                1 - mean 
                2 - missing
                3 - outliers
                4 - Coefficient Variation
                5 - KS
                6 - MI
                7 - PSI
            """
            analises = []
            for i in range(0, 8):
                analises.append(pd.DataFrame([], index=colunas_var))
            # target values
            valores_targets = X[coluna_target].unique().tolist()

            # looking at missing rate, mean and standard deviation
            for i, mes in enumerate(meses):
                print("Calculando dados no mês: " + str(int(mes)))
                for valor in valores_targets:
                    valor = int(valor)
                    outlier = []
                    apsi = []
                    describe = X[colunas_var][(X[coluna_mes] == mes) & (
                        X[coluna_target] == valor)].describe().transpose()
                    pdstd = pd.DataFrame(describe['std'].values[:, None],
                                         columns=[str(mes) + "_" + str(valor)],
                                         index=describe.index)
                    pdmedia = pd.DataFrame(
                        describe['mean'].values[:, None],
                        columns=[str(mes) + "_" + str(valor)],
                        index=describe.index)
                    pdcv = pdstd / pdmedia  #coefficient variation
                    c_shape = X[colunas_var][(X[coluna_mes] == mes) & (
                        X[coluna_target] == valor)].shape[0]
                    pdmissing = pd.DataFrame(
                        (describe['count'].values / c_shape),
                        columns=[str(mes) + "_" + str(valor)],
                        index=describe.index)
                    for coluna in colunas_var:
                        out = X[coluna][(X[coluna_mes] == mes) & (
                            np.abs(X[coluna] - describe['mean'].loc[coluna]) >
                            (3 * X[coluna].mean() -
                             describe['std'].loc[coluna]))].count()
                        outlier.append(out)
                        if i != 0 and not Fast_Analysis:
                            apsi.append(
                                calculate_psi(
                                    X[coluna][(X[coluna_mes] == mes) & (
                                        X[coluna_target] == valor)].dropna(),
                                    X[coluna]
                                    [(X[coluna_mes] == meses[i - 1])
                                     & (X[coluna_target] == valor)].dropna()))
                    pdpsi = pd.DataFrame(apsi,
                                         columns=[str(mes) + "_" + str(valor)],
                                         index=colunas_var)
                    pdoutlier = pd.DataFrame(
                        outlier,
                        columns=[str(mes) + "_" + str(valor)],
                        index=colunas_var)
                    analises[0] = pd.concat([analises[0], pdstd], axis=1)
                    analises[1] = pd.concat([analises[1], pdmedia], axis=1)
                    analises[2] = pd.concat([analises[2], pdmissing], axis=1)
                    analises[3] = pd.concat([analises[3], pdoutlier], axis=1)
                    analises[4] = pd.concat([analises[4], pdcv], axis=1)
                    if not Fast_Analysis:
                        analises[7] = pd.concat([analises[7], pdpsi], axis=1)
                # running the KS test - works with 1 or more classes.
                if not Fast_Analysis:
                    combinacoes_target = list(
                        itertools.combinations_with_replacement(
                            valores_targets, 2))
                    for c_t in combinacoes_target:
                        if c_t[0] != c_t[1]:
                            for mes in meses:
                                ksarray = []
                                for coluna in colunas_var:
                                    ks, pvalor = stats.ks_2samp(
                                        X[coluna][(X[coluna_mes] == mes) & (
                                            X[coluna_target] == c_t[0])],
                                        X[coluna][(X[coluna_mes] == mes) & (
                                            X[coluna_target] == c_t[1])])
                                    ksarray.append(ks)
                                pdks = pd.DataFrame(
                                    ksarray,
                                    columns=[str(mes) + "_" + str(str(c_t))],
                                    index=colunas_var)
                            analises[5] = pd.concat([analises[5], pdks],
                                                    axis=1)
            # Mutual Information
            if MI:
                table_mi = X.copy()
                KB = KBinsDiscretizer(n_bins=10, encode='ordinal')
                colunas = []
                for coluna in X.columns:
                    if coluna not in categoricas and (coluna not in [
                            coluna_mes, coluna_target
                    ]):
                        colunas.append(coluna)
                table_mi[colunas] = KB.fit_transform(
                    table_mi[colunas].fillna(0))
                for mes in meses:
                    mi = mutual_info_classif(
                        table_mi[(table_mi[coluna_mes] == mes)].drop(
                            [coluna_mes, coluna_target], axis=1).fillna(0),
                        table_mi[coluna_target][(table_mi[coluna_mes] == mes)])
                    # mi /= np.max(mi)  # better not to apply this normalization
                    pdmi = pd.DataFrame(mi,
                                        columns=[str(mes)],
                                        index=X.drop(
                                            [coluna_mes, coluna_target],
                                            axis=1).columns)
                    analises[6] = pd.concat([analises[6], pdmi], axis=1)
                del table_mi, colunas, KB
            # Analyzing the extracted statistics
            lista_histogramas = set(
            )  # we do not want to print these several times
            for i, analise in enumerate(analises):
                dsvpd = pd.DataFrame([], columns=['DESVPAD_FEATURE'])
                meanpd = pd.DataFrame([], columns=['MEAN_FEATURE'])
                for ind in analise.index:
                    dsvpd.loc[ind] = analise.loc[ind].std()
                    meanpd.loc[ind] = analise.loc[ind].mean()
                    if (dsvpd.loc[ind].values / meanpd.loc[ind].values >=
                            dsvpd_thr
                            or dsvpd.loc[ind].values / meanpd.loc[ind].values
                            <= 0.03) and i not in (2, 3):
                        lista_histogramas.add(ind)
                analises[i] = pd.concat([analise, dsvpd, meanpd], axis=1)
            # writing the data to Excel
            sheets = [
                'DESVPAD', 'MEDIA', 'MISSING', 'OUTLIERS', 'COEFF VAR', 'KS',
                'MI', 'PSI'
            ]
            if not transpor:
                for i in range(0, 8):
                    analises[i].to_excel(writer, sheets[i])
                for sh in sheets:
                    worksheet = writer.sheets[sh]
                    for i in range(2, analises[0].shape[0] + 2):
                        coluna_excel = num_to_col_letters(i)
                        worksheet.conditional_format(
                            'B' + str(i) + ':' +
                            num_to_col_letters(analises[0].shape[1] - 1) +
                            str(i), {
                                'type': '3_color_scale',
                                'min_type': 'percent',
                                'mid_type': 'percent',
                                'max_type': 'percent'
                            })
            else:
                for i in range(0, 8):
                    analises[i].transpose().to_excel(writer, sheets[i])
                for sh in sheets:
                    worksheet = writer.sheets[sh]
                    for i in range(2, analises[0].shape[0] + 2):
                        coluna_excel = num_to_col_letters(i)
                        worksheet.conditional_format(
                            coluna_excel + '2' + ':' + coluna_excel +
                            str(1 + len(meses) * len(valores_targets)), {
                                'type': '3_color_scale',
                                'min_type': 'percent',
                                'mid_type': 'percent',
                                'max_type': 'percent'
                            })
            # Saving images
            if not Fast_Analysis:
                worksheet = writer.book.add_worksheet(name="Análises_Imagens")
                row = 0
                for ind in lista_histogramas:
                    col = 0
                    # histograms
                    plt.figure(figsize=(10, 5))
                    for mes in meses:
                        try:
                            for valor in valores_targets:
                                sns.distplot(
                                    X[ind][(X[coluna_target] == valor)
                                           & (X[coluna_mes] == mes)].dropna())
                            plt.legend(valores_targets)
                            plt.title("Distribuição de " + str(ind) + " " +
                                      str(mes))
                            imgdata = BytesIO()
                            plt.savefig(imgdata, format="png")
                            imgdata.seek(0)
                            worksheet.insert_image(row, col, "",
                                                   {'image_data': imgdata})
                            col += 17
                            if not log_fig:
                                plt.close()
                            else:
                                plt.show()
                                plt.close()
                        except:
                            pass
                    row += 25
                col = 0
                # Correlation matrix
                worksheet = writer.book.add_worksheet(name="Covariância")
                correlations = X.corr()
                fig = plt.figure(
                    figsize=(1 + int(len(X.columns) * 0.2401 + 0.8911),
                             1 + int(len(X.columns) * 0.2401 + 0.8911)))
                ax = fig.add_subplot(111)
                #cax = ax.matshow(correlations,extent=[0,len(colunas_var),0,len(colunas_var)],vmin=-1, vmax=1)
                cax = ax.matshow(correlations, vmin=-1, vmax=1)
                fig.colorbar(cax)
                ticks = np.arange(0, len(X.columns), 1)
                ax.set_xticks(ticks - 0.5)
                ax.set_yticks(ticks - 0.5)
                ax.set_xticklabels(X.columns,
                                   rotation=90,
                                   ma='center',
                                   size='medium')
                ax.set_yticklabels(X.columns, ma='center', size='medium')
                imgdata = BytesIO()
                fig.savefig(imgdata, format="png")
                imgdata.seek(0)
                worksheet.insert_image(0, 0, "", {'image_data': imgdata})
                if not log_fig:
                    plt.close()
                else:
                    plt.show()
                    plt.close()
            # Finally saving the file.
            print("Salvando os dados como:" + nome_arquivo + ".xlsx")
            writer.save()

        else:
            colunas_var = X.columns.tolist()
            colunas_var.remove(coluna_mes)
            # for the variables
            """
                0 - desvpad
                1 - mean 
                2 - missing
                3 - outliers
                4.- PSI
                5 - Coefficient Variation
            """
            analises = []
            for i in range(0, 6):
                analises.append(pd.DataFrame([], index=colunas_var))
            # looking at missing rate, mean and standard deviation
            for i, mes in enumerate(meses):
                print("Calculando dados no mês: " + str(int(mes)))
                outlier = []
                apsi = []
                describe = X[colunas_var][(
                    X[coluna_mes] == mes)].describe().transpose()
                pdstd = pd.DataFrame(describe['std'].values[:, None],
                                     columns=[str(mes)],
                                     index=describe.index)
                pdmedia = pd.DataFrame(describe['mean'].values[:, None],
                                       columns=[str(mes)],
                                       index=describe.index)
                pdcv = pdstd / pdmedia
                c_shape = X[colunas_var][(X[coluna_mes] == mes)].shape[0]
                pdmissing = pd.DataFrame((describe['count'].values / c_shape),
                                         columns=[str(mes)],
                                         index=describe.index)
                for coluna in colunas_var:
                    out = X[coluna][(X[coluna_mes] == mes) & (
                        np.abs(X[coluna] - describe['mean'].loc[coluna]) >
                        (3 * X[coluna].mean() -
                         describe['std'].loc[coluna]))].count()
                    outlier.append(out)
                    if i != 0:
                        apsi.append(
                            calculate_psi(
                                X[coluna][(X[coluna_mes] == mes)].dropna(),
                                X[coluna][(
                                    X[coluna_mes] == meses[i - 1])].dropna()))
                pdoutlier = pd.DataFrame(outlier,
                                         columns=[str(mes)],
                                         index=colunas_var)
                pdpsi = pd.DataFrame(apsi,
                                     columns=[str(mes)],
                                     index=colunas_var)
                analises[0] = pd.concat([analises[0], pdstd], axis=1)
                analises[1] = pd.concat([analises[1], pdmedia], axis=1)
                analises[2] = pd.concat([analises[2], pdmissing], axis=1)
                analises[3] = pd.concat([analises[3], pdoutlier], axis=1)
                analises[4] = pd.concat([analises[4], pdpsi], axis=1)
                analises[5] = pd.concat([analises[5], pdcv], axis=1)
            # Analyzing the extracted statistics
            lista_histogramas = set(
            )  # we do not want to print these several times
            for i, analise in enumerate(analises):
                dsvpd = pd.DataFrame([], columns=['DESVPAD_FEATURE'])
                meanpd = pd.DataFrame([], columns=['MEAN_FEATURE'])
                for ind in analise.index:
                    dsvpd.loc[ind] = analise.loc[ind].std()
                    meanpd.loc[ind] = analise.loc[ind].mean()
                    if (dsvpd.loc[ind].values / meanpd.loc[ind].values >=
                            dsvpd_thr
                            or dsvpd.loc[ind].values / meanpd.loc[ind].values
                            <= 0.03) and i not in (2, 3):
                        lista_histogramas.add(ind)
                analises[i] = pd.concat([analise, dsvpd, meanpd], axis=1)
            # writing the data to Excel
            sheets = [
                'DESVPAD', 'MEDIA', 'MISSING', 'OUTLIERS', 'PSI', 'COEFF VAR'
            ]
            if not transpor:
                for i in range(0, 6):
                    analises[i].to_excel(writer, sheets[i])
                for sh in sheets:
                    worksheet = writer.sheets[sh]
                    for i in range(2, analises[0].shape[0] + 2):
                        coluna_excel = num_to_col_letters(i)
                        worksheet.conditional_format(
                            'B' + str(i) + ':' +
                            num_to_col_letters(analises[0].shape[1] - 1) +
                            str(i), {
                                'type': '3_color_scale',
                                'min_type': 'percent',
                                'mid_type': 'percent',
                                'max_type': 'percent'
                            })
            else:
                for i in range(0, 6):
                    analises[i].transpose().to_excel(writer, sheets[i])
                for sh in sheets:
                    worksheet = writer.sheets[sh]
                    for i in range(2, analises[0].shape[0] + 2):
                        coluna_excel = num_to_col_letters(i)
                        worksheet.conditional_format(
                            coluna_excel + '2' + ':' + coluna_excel +
                            str(1 + len(meses)), {
                                'type': '3_color_scale',
                                'min_type': 'percent',
                                'mid_type': 'percent',
                                'max_type': 'percent'
                            })
            # Saving images
            if not Fast_Analysis:
                worksheet = writer.book.add_worksheet(name="Análises_Imagens")
                row = 0
                for ind in lista_histogramas:
                    col = 0
                    # histograms
                    plt.figure(figsize=(10, 5))
                    try:
                        for mes in meses:
                            sns.distplot(
                                X[ind][(X[coluna_mes] == mes)].dropna())
                    except:
                        print('Erro plotar gráfico no índice', str(ind),
                              'no mes', str(mes))
                    plt.title("Distribuição de " + str(ind))
                    imgdata = BytesIO()
                    plt.savefig(imgdata, format="png")
                    imgdata.seek(0)
                    worksheet.insert_image(row, col, "",
                                           {'image_data': imgdata})
                    col += 17
                    if not log_fig:
                        plt.close()
                    else:
                        plt.show()
                        plt.close()
                    row += 25
                col = 0
                # Correlation matrix
                worksheet = writer.book.add_worksheet(name="Covariância")
                correlations = X.corr()
                fig = plt.figure(
                    figsize=(1 + int(len(X.columns) * 0.2401 + 0.8911),
                             1 + int(len(X.columns) * 0.2401 + 0.8911)))
                ax = fig.add_subplot(111)
                cax = ax.matshow(correlations, vmin=-1, vmax=1)
                fig.colorbar(cax)
                ticks = np.arange(0, len(X.columns), 1)
                ax.set_xticks(ticks - 0.5)
                ax.set_yticks(ticks - 0.5)
                ax.set_xticklabels(X.columns,
                                   rotation=90,
                                   ma='center',
                                   size='medium')
                ax.set_yticklabels(X.columns, ma='center', size='medium')
                imgdata = BytesIO()
                fig.savefig(imgdata, format="png")
                imgdata.seek(0)
                worksheet.insert_image(0, 0, "", {'image_data': imgdata})
                if not log_fig:
                    plt.close()
                else:
                    plt.show()
                    plt.close()
            # Saving the result to an Excel spreadsheet for analysis.
            print("Salvando os dados como:" + nome_arquivo + ".xlsx")
            writer.save()
    else:
        print(
            "Por favor, utilizar um dataframe pandas para utilizar essa função.\nNada foi feito."
        )
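
# A hypothetical usage sketch (the DataFrame and column names below are assumptions,
# not taken from the original):
# analise_temporal_base(df, X_perc=0.5, coluna_target='target', coluna_mes='mes',
#                       nome_arquivo='analise_dados_mes')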
Example #10
	def transform(self,X):

		discretizer = KBinsDiscretizer(n_bins=self.__num_bins, encode=self.__encoder, strategy=self.__strategy)
		return discretizer.fit_transform(X)
Example #11
def q2():
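    # Count how many countries fall in the top decile of Pop_density
    # (ordinal code 9 of the 10 quantile bins).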
    discretizer = KBinsDiscretizer(n_bins=10,
                                   encode="ordinal",
                                   strategy="quantile")
    pop_bins = discretizer.fit_transform(countries[['Pop_density']])
    return pop_bins[pop_bins == 9].shape[0]
Example #12
def load_mimic(base_dir: str = './data/'):
    data = pd.read_csv(f'{base_dir}/mimic-ii/full_cohort_data.csv')
    # data.drop('hgb_first')
    fs = [
        'aline_flg',
        'gender_num',
        # 'hosp_exp_flg',
        # 'icu_exp_flg',
        # 'day_28_flg',
        # 'censor_flg',
        'sepsis_flg',
        'chf_flg',
        'afib_flg',
        'renal_flg',
        'liver_flg',
        'copd_flg',
        'cad_flg',
        'stroke_flg',
        'mal_flg',
        'resp_flg',
    ]
    features = fs
    data1 = data[fs].values
    imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
    data1 = imp_mean.fit_transform(data1)

    f2 = fs.copy()
    f2.append('day_icu_intime')
    f2.append('service_unit')
    f2.append('day_28_flg')
    f2.append('hospital_los_day')
    f2.append('icu_exp_flg')
    f2.append('hosp_exp_flg')
    f2.append('censor_flg')
    f2.append('mort_day_censored')
    f2 = data.columns.difference(f2)
    data2 = data[f2].values
    imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
    data2 = imp_mean.fit_transform(data2)
    scaler = MinMaxScaler((0, 1))
    data2 = scaler.fit_transform(data2)
    features = features + list(f2)
    est = KBinsDiscretizer(n_bins=3, encode='onehot-dense', strategy='uniform')
    data2d = est.fit_transform(data2)
    f2d = []
    for feature in f2:
        # f2d.append(feature + '_VLOW')
        f2d.append(feature + '_LOW')
        f2d.append(feature + '_NORMAL')
        f2d.append(feature + '_HIGH')
        # f2d.append(feature + '_VHIGH')
    features = fs + f2d

    datax = np.hstack((data1, data2d))
    # datay = data['day_28_flg'].values
    # datay = (data['hospital_los_day']>6).values
    # datay = data['hosp_exp_flg'].values

    datay = (data['day_28_flg'].values + data['hosp_exp_flg'].values +
             data['icu_exp_flg'].values + (1 - data['censor_flg'].values)) > 0
    # datay = data['day_28_flg'].values

    # model = DecisionTreeClassifier(max_depth=3)
    # # model = RandomForestClassifier()
    # scores = cross_val_score(model, datax, datay, cv=10)
    # print(scores.mean())

    # datax = np.vstack((datax, datax[datay==1], datax[datay==1]))
    # datay = np.hstack((datay, datay[datay==1], datay[datay==1]))

    x = torch.FloatTensor(datax)
    y = one_hot(torch.tensor(datay).to(torch.long)).to(torch.float)
    return x, y, features
       [0],
       [0],
       [2],
       [3]])
'''

np.digitize(age, bins=[18])
'''
array([[0],
       [0],
       [1],
       [1],
       [1]])
'''

from sklearn.preprocessing import KBinsDiscretizer

# split into 4 bins
kb = KBinsDiscretizer(4, encode='ordinal', strategy='quantile')
kb.fit_transform(age)

# return one-hot encoding
kb = KBinsDiscretizer(4, encode='onehot-dense', strategy='quantile')
kb.fit_transform(age)

# bins of equal width
kb = KBinsDiscretizer(4, encode='onehot-dense', strategy='uniform')
kb.fit_transform(age)

kb.bin_edges_
#array([array([ 6.  , 20.75, 35.5 , 50.25, 65.  ])], dtype=object)
def q2():
    kbins = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
    intervals = kbins.fit_transform(countries[['Pop_density']])
    
    return int((intervals >= 9).sum())
Example #15
def load_vDem(base_dir='./data'):
    data = pd.read_csv(f'{base_dir}/vdem/V-Dem-CY-Core-v10.csv')
    data['country_name_year'] = data['country_name'] + '_' + data[
        'year'].astype(str)
    data_2000 = data[data['year'] > 2000].iloc[:, 12:-1].dropna(axis=1)

    high_level_indicators = [
        'v2x_polyarchy',
        # 'v2x_libdem',
        # 'v2x_partipdem',
        'v2x_delibdem',
        'v2x_egaldem'
    ]
    mid_level_indicators = [
        'v2x_api',
        'v2x_mpi',
        'v2x_freexp_altinf',
        'v2x_frassoc_thick',
        'v2x_suffr',
        'v2xel_frefair',
        'v2x_elecoff',
        # 'v2x_liberal',
        'v2xcl_rol',
        # 'v2x_jucon',
        # 'v2xlg_legcon',
        # 'v2x_partip',
        'v2x_cspart',
        # 'v2xdd_dd',
        # 'v2xel_locelec',
        # 'v2xel_regelec',
        'v2xdl_delib',
        'v2x_egal',
        'v2xeg_eqprotec',
        'v2xeg_eqaccess',
        'v2xeg_eqdr',
    ]

    # drop_list = ['codelow', 'codehigh', 'sd', 'osp', 'nr', 'mean']
    low_level_indicators = []
    for f in data_2000.columns:
        if f.endswith(
                '_ord'
        ) and f not in high_level_indicators and f not in mid_level_indicators:
            low_level_indicators.append(f)

    low_level_indicators_continuous = []
    for f in data_2000.columns:
        if (f.endswith('_codehigh') or f.endswith('_codelow')) and \
                f not in high_level_indicators and f not in mid_level_indicators:
            low_level_indicators_continuous.append(f)

    print(
        f'Main {len(high_level_indicators)} - Area {len(mid_level_indicators)} - Raw {len(low_level_indicators)}'
    )

    data_low_continuous = data_2000[low_level_indicators_continuous]

    data_low_raw = data_2000[low_level_indicators]
    one_hots = []
    for indicator in low_level_indicators:
        c = data_low_raw[indicator].values
        n_bins = int(c.max())
        kbin = KBinsDiscretizer(n_bins=n_bins,
                                encode='onehot-dense',
                                strategy='uniform')
        c1h = kbin.fit_transform(c.reshape(-1, 1))
        one_hots.append(c1h)

    new_indicator_names = []
    for clist, cname in zip(one_hots, low_level_indicators):
        if clist.shape[1] > 1:
            for i in range(clist.shape[1]):
                new_indicator_names.append(f'{cname}_{i}')
        else:
            new_indicator_names.append(f'{cname}')

    data_low = pd.DataFrame(np.hstack(one_hots), columns=new_indicator_names)
    data_mid = data_2000[mid_level_indicators] > 0.5
    data_high = data_2000[high_level_indicators].iloc[:, 0] > 0.5

    # data_mid = pd.DataFrame(np.hstack([data_low, data_mid]), columns=data_low.columns.append(data_mid.columns))

    # scores = cross_val_score(LogisticRegression(), data_mid.values, data_high.values, cv=10)
    # print(scores.mean())
    # scores = cross_val_score(DecisionTreeClassifier(), data_mid.values, data_high.values, cv=10)
    # print(scores.mean())
    # scores = cross_val_score(RandomForestClassifier(), data_mid.values, data_high.values, cv=10)
    # print(scores.mean())

    x = torch.FloatTensor(data_low.values)
    c = torch.FloatTensor(data_mid.values)
    y = one_hot(torch.tensor(data_high.values).to(torch.long)).to(torch.float)
    return x, c, y, data_mid.columns
Example #16
def preprocessing_discrete(data_path, img, pctl, feat_list_all, batch, test):
    img_path = data_path / 'images' / img
    stack_path = img_path / 'stack' / 'stack.tif'

    # load cloudmasks
    clouds_dir = data_path / 'clouds'

    with rasterio.open(str(stack_path), 'r') as ds:
        data = ds.read()
        data = data.transpose((1, -1, 0))
        data[data == -999999] = np.nan
        data[np.isneginf(data)] = np.nan
        data_vector = data.reshape([data.shape[0] * data.shape[1], data.shape[2]])
        data_vector = data_vector[~np.isnan(data_vector).any(axis=1)]

    # Get indices of non-nan values
    nans = np.sum(data, axis=2)
    data_ind = np.where(~np.isnan(nans))
    rows, cols = zip(data_ind)

    # Discretize continuous features
    cts_feats = ['GSW_distSeasonal', 'aspect', 'curve', 'elevation', 'hand', 'slope', 'spi',
                 'twi', 'sti']
    non_cts_feats = ['developed', 'forest', 'planted', 'wetlands', 'openspace', 'carbonate', 'noncarbonate',
                     'akl_intrusive', 'silicic_resid', 'silicic_resid', 'extrusive_volcanic', 'colluvial_sed',
                     'glacial_till_clay', 'glacial_till_loam', 'glacial_till_coarse', 'glacial_lake_sed_fine',
                     'glacial_outwash_coarse', 'hydric', 'eolian_sed_coarse', 'eolian_sed_fine', 'saline_lake_sed',
                     'alluv_coastal_sed_fine', 'coastal_sed_coarse', 'GSW_perm', 'flooded']

    feats_disc = []
    all_edges = pd.DataFrame([])

    # GSW_distSeasonal
    bins = 5
    discretizer = KBinsDiscretizer(n_bins=bins, encode='onehot-dense', strategy='quantile')
    GSW_distSeasonal_disc = discretizer.fit_transform(
        data_vector[:, feat_list_all.index('GSW_distSeasonal')].reshape(-1, 1))
    for i in range(bins):
        feats_disc.append('GSW_distSeasonal_' + str(i + 1))

    disc_nan = np.zeros(data[:, :, 0:bins].shape)
    disc_nan[~np.isnan(disc_nan)] = np.nan
    for bin in range(bins):
        disc_nan[rows, cols, bin] = GSW_distSeasonal_disc[:, bin]

    GSW_distSeasonal_disc = disc_nan
    del disc_nan

    edges = []
    for arr in discretizer.bin_edges_:
        for edge in arr[:-1]:
            edges.append(edge)

    all_edges = pd.concat([all_edges, pd.DataFrame(edges)], axis=0)

    # Elevation
    bins = 5
    discretizer = KBinsDiscretizer(n_bins=bins, encode='onehot-dense', strategy='quantile')
    elevation_disc = discretizer.fit_transform(data_vector[:, feat_list_all.index('elevation')].reshape(-1, 1))
    for i in range(bins):
        feats_disc.append('elevation' + str(i + 1))

    disc_nan = np.zeros(data[:, :, 0:bins].shape)
    disc_nan[~np.isnan(disc_nan)] = np.nan
    for bin in range(bins):
        disc_nan[rows, cols, bin] = elevation_disc[:, bin]

    elevation_disc = disc_nan
    del disc_nan

    edges = []
    for arr in discretizer.bin_edges_:
        for edge in arr[:-1]:
            edges.append(edge)

    all_edges = pd.concat([all_edges, pd.DataFrame(edges)], axis=0)

    # Slope
    bins = 5
    discretizer = KBinsDiscretizer(n_bins=bins, encode='onehot-dense', strategy='quantile')
    slope_disc = discretizer.fit_transform(data_vector[:, feat_list_all.index('slope')].reshape(-1, 1))
    for i in range(bins):
        feats_disc.append('slope' + str(i + 1))

    disc_nan = np.zeros(data[:, :, 0:bins].shape)
    disc_nan[~np.isnan(disc_nan)] = np.nan
    for bin in range(bins):
        disc_nan[rows, cols, bin] = slope_disc[:, bin]

    slope_disc = disc_nan
    del disc_nan

    edges = []
    for arr in discretizer.bin_edges_:
        for edge in arr[:-1]:
            edges.append(edge)

    all_edges = pd.concat([all_edges, pd.DataFrame(edges)], axis=0)

    # TWI
    bins = 5
    discretizer = KBinsDiscretizer(n_bins=bins, encode='onehot-dense', strategy='quantile')
    twi_disc = discretizer.fit_transform(data_vector[:, feat_list_all.index('twi')].reshape(-1, 1))
    for i in range(bins):
        feats_disc.append('twi' + str(i + 1))

    disc_nan = np.zeros(data[:, :, 0:bins].shape)
    disc_nan[~np.isnan(disc_nan)] = np.nan
    for bin in range(bins):
        disc_nan[rows, cols, bin] = twi_disc[:, bin]

    twi_disc = disc_nan
    del disc_nan

    edges = []
    for arr in discretizer.bin_edges_:
        for edge in arr[:-1]:
            edges.append(edge)

    all_edges = pd.concat([all_edges, pd.DataFrame(edges)], axis=0)

    # SPI
    bins = 5
    discretizer = KBinsDiscretizer(n_bins=bins, encode='onehot-dense', strategy='quantile')
    spi_disc = discretizer.fit_transform(data_vector[:, feat_list_all.index('spi')].reshape(-1, 1))
    for i in range(bins):
        feats_disc.append('spi' + str(i + 1))

    disc_nan = np.zeros(data[:, :, 0:bins].shape)
    disc_nan[~np.isnan(disc_nan)] = np.nan
    for bin in range(bins):
        disc_nan[rows, cols, bin] = spi_disc[:, bin]

    spi_disc = disc_nan
    del disc_nan

    edges = []
    for arr in discretizer.bin_edges_:
        for edge in arr[:-1]:
            edges.append(edge)

    all_edges = pd.concat([all_edges, pd.DataFrame(edges)], axis=0)

    # STI
    bins = 2
    discretizer = KBinsDiscretizer(n_bins=bins, encode='onehot-dense', strategy='quantile')
    sti_disc = discretizer.fit_transform(data_vector[:, feat_list_all.index('sti')].reshape(-1, 1))
    for i in range(bins):
        feats_disc.append('sti' + str(i + 1))

    disc_nan = np.zeros(data[:, :, 0:bins].shape)
    disc_nan[~np.isnan(disc_nan)] = np.nan
    for bin in range(bins):
        disc_nan[rows, cols, bin] = sti_disc[:, bin]

    sti_disc = disc_nan
    del disc_nan

    edges = []
    for arr in discretizer.bin_edges_:
        for edge in arr[:-1]:
            edges.append(edge)

    all_edges = pd.concat([all_edges, pd.DataFrame(edges)], axis=0)

    # Curve (flat, convex, concave)
    convex = np.zeros((data_vector.shape[0],))
    concave = np.zeros((data_vector.shape[0],))
    flat = np.zeros((data_vector.shape[0],))
    convex[np.where(data_vector[:, feat_list_all.index('curve')] < 0)] = 1
    concave[np.where(data_vector[:, feat_list_all.index('curve')] > 0)] = 1
    flat[np.where(data_vector[:, feat_list_all.index('curve')] == 0)] = 1
    names = ['convex', 'concave', 'flat']
    bins = len(names)
    for name in names:
        feats_disc.append(name)

    curve = np.column_stack([convex, concave, flat])

    shape = data[:, :, 0:curve.shape[1]].shape
    disc_nan = np.zeros(shape)
    disc_nan[~np.isnan(disc_nan)] = np.nan
    for bin in range(bins):
        disc_nan[rows, cols, bin] = curve[:, bin]

    curve = disc_nan

    del disc_nan, convex, concave, flat

    edges = []
    for arr in discretizer.bin_edges_:
        for edge in arr[:-1]:
            edges.append(edge)

    all_edges = pd.concat([all_edges, pd.DataFrame(edges)], axis=0)

    # Aspect (north, northeast, northwest, south, southeast, southwest, east, west)
    north = np.zeros((data_vector.shape[0],))
    northeast = np.zeros((data_vector.shape[0],))
    east = np.zeros((data_vector.shape[0],))
    southeast = np.zeros((data_vector.shape[0],))
    south = np.zeros((data_vector.shape[0],))
    southwest = np.zeros((data_vector.shape[0],))
    west = np.zeros((data_vector.shape[0],))
    northwest = np.zeros((data_vector.shape[0],))

    # Assign each pixel to one of eight 45-degree compass sectors
    # (north wraps around 0/360 degrees)
    aspect_vals = data_vector[:, feat_list_all.index('aspect')]
    north[np.where(np.logical_or(aspect_vals >= 337.5, aspect_vals < 22.5))] = 1
    northeast[np.where(np.logical_and(aspect_vals >= 22.5, aspect_vals < 67.5))] = 1
    east[np.where(np.logical_and(aspect_vals >= 67.5, aspect_vals < 112.5))] = 1
    southeast[np.where(np.logical_and(aspect_vals >= 112.5, aspect_vals < 157.5))] = 1
    south[np.where(np.logical_and(aspect_vals >= 157.5, aspect_vals < 202.5))] = 1
    southwest[np.where(np.logical_and(aspect_vals >= 202.5, aspect_vals < 247.5))] = 1
    west[np.where(np.logical_and(aspect_vals >= 247.5, aspect_vals < 292.5))] = 1
    northwest[np.where(np.logical_and(aspect_vals >= 292.5, aspect_vals < 337.5))] = 1
    names = ['north', 'northeast', 'east', 'southeast', 'south', 'southwest', 'west', 'northwest']
    bins = len(names)
    for name in names:
        feats_disc.append(name)

    aspect = np.column_stack([north, northeast, east, southeast, south, southwest, west, northwest])

    shape = data[:, :, 0:aspect.shape[1]].shape
    disc_nan = np.zeros(shape)
    disc_nan[~np.isnan(disc_nan)] = np.nan
    for bin in range(bins):
        disc_nan[rows, cols, bin] = aspect[:, bin]

    aspect = disc_nan

    del disc_nan, north, northeast, east, southeast, south, southwest, west, northwest

    edges = []
    for arr in discretizer.bin_edges_:
        for edge in arr[:-1]:
            edges.append(edge)

    all_edges = pd.concat([all_edges, pd.DataFrame(edges)], axis=0)

    # Get original discrete features
    orig_disc_inds = []
    for feat in non_cts_feats:
        orig_disc_inds.append(feat_list_all.index(feat))
    orig_disc_data = data[:, :, orig_disc_inds]

    # Combine with new discrete features
    new_disc_data = np.dstack([GSW_distSeasonal_disc, elevation_disc, slope_disc, twi_disc,
                               spi_disc, sti_disc, curve, aspect])
    data = np.dstack([new_disc_data, orig_disc_data])

    del orig_disc_data, new_disc_data

    edges = []
    for arr in discretizer.bin_edges_:
        for edge in arr[:-1]:
            edges.append(edge)

    all_edges = pd.concat([all_edges, pd.DataFrame(edges)], axis=0)

    # Combine all edges and features
    feature_edges = pd.concat([all_edges, pd.DataFrame(data=feats_disc)], axis=1)
    feature_edges.columns = ['edge', 'feature']

    # If a feat has only zeros or 1s in test OR train set, it is removed from both
    # Check train set
    clouds = np.load(clouds_dir / '{0}'.format(img + '_clouds.npy'))
    clouds[np.isnan(data[:, :, 0])] = np.nan
    cloudmask = np.less(clouds, np.nanpercentile(clouds, pctl), where=~np.isnan(clouds))
    data_train = data.copy()
    data_train[cloudmask] = -999999
    data_train[data_train == -999999] = np.nan
    data_vector_train = data_train.reshape([data.shape[0] * data_train.shape[1], data_train.shape[2]])
    data_vector_train = data_vector_train[~np.isnan(data_vector_train).any(axis=1)]
    train_std = data_vector_train[:, 0:data_vector_train.shape[1] - 2].std(0)
    del data_train, data_vector_train

    # Check test set
    clouds = np.load(clouds_dir / '{0}'.format(img + '_clouds.npy'))
    clouds[np.isnan(data[:, :, 0])] = np.nan
    cloudmask = np.less(clouds, np.nanpercentile(clouds, pctl), where=~np.isnan(clouds))
    data_test = data.copy()
    data_test[cloudmask] = -999999
    data_test[data_test == -999999] = np.nan
    data_vector_test = data_test.reshape([data.shape[0] * data_test.shape[1], data_test.shape[2]])
    data_vector_test = data_vector_test[~np.isnan(data_vector_test).any(axis=1)]
    test_std = data_vector_test[:, 0:data_vector_test.shape[1] - 2].std(0)
    del data_test, data_vector_test

    remove_inds = []
    if 0 in train_std.tolist():
        zero_inds = np.where(train_std == 0)[0].tolist()
        for ind in zero_inds:
            remove_inds.append(ind)

    if 0 in test_std.tolist():
        zero_inds = np.where(test_std == 0)[0].tolist()
        for ind in zero_inds:
            remove_inds.append(ind)

    remove_inds = np.unique(remove_inds).tolist()

    # Mask clouds
    clouds = np.load(clouds_dir / '{0}'.format(img + '_clouds.npy'))
    clouds[np.isnan(data[:, :, 0])] = np.nan
    if test:
        cloudmask = np.greater(clouds, np.nanpercentile(clouds, pctl), where=~np.isnan(clouds))
    if not test:
        cloudmask = np.less(clouds, np.nanpercentile(clouds, pctl), where=~np.isnan(clouds))

    # And mask clouds
    data[cloudmask] = -999999
    data[data == -999999] = np.nan

    # Get indices of non-nan values. These are the indices of the original image array
    nans = np.sum(data, axis=2)
    data_ind = np.where(~np.isnan(nans))

    # Create data vector
    data_vector = data.reshape([data.shape[0] * data.shape[1], data.shape[2]])
    data_vector = data_vector[~np.isnan(data_vector).any(axis=1)]

    feat_list_stack = feats_disc + feat_list_all
    remove_feats = [feat_list_stack[ind] for ind in remove_inds]
    data_vector = np.delete(data_vector, remove_inds, axis=1)
    feat_keep = [x for x in feat_list_all if x not in remove_feats]

    feature_edges_keep = feature_edges[~feature_edges.feature.isin(remove_feats)]

    # Save feature class bin edges
    if test:
        filedir = data_path / batch / 'class_bins' / 'test'
    else:
        filedir = data_path / batch / 'class_bins' / 'train'

    try:
        filedir.mkdir()
    except FileExistsError:
        pass

    filename = filedir / '{}'.format('feature_edges_' + str(pctl) + '.csv')
    feature_edges_keep.to_csv(filename, index=False)

    return data, data_vector, data_ind, feat_keep, feature_edges_keep
Example #17
    def fit(self,
            data: pd.DataFrame,
            target='class',
            n_bins=3,
            strategy='quantile'):
        """
        Induce a ruleset from the given set of instances.

        Parameters
        ----------
        data : DataFrame
            Input training dataset used to induce the rules.
        
        target : str, default='class'
            Name of the attribute that represents class labels.

        n_bins : int, default=3
            Number of bins that numeric attributes will be discretized to.
        
        strategy : {'uniform', 'quantile', 'kmeans'}, default='quantile'
            Strategy used to define the widths of the bins.
            See `sklearn.preprocessing.KBinsDiscretizer` for more info.

        Attributes
        ----------
        ruleset_ : list
            The list of rules induced from the dataset.

        target_ : str
            Name of the attribute that represents class labels.

        majority_ : any
            Label with the largest amount of instances.
        """
        # discretize numerical attributes if there are any
        num_attr = data.select_dtypes(include=['number']).columns
        num_attr = num_attr.drop(target) if target in num_attr else num_attr
        if len(num_attr) > 0:
            data = data.copy()
            discretizer = KBinsDiscretizer(n_bins=n_bins,
                                           strategy=strategy,
                                           encode='ordinal')
            with warnings.catch_warnings():
                # sometimes bins are so small that they are merged together
                # that is ok so we do not want to worry about that warning
                warnings.filterwarnings('ignore', category=UserWarning)
                data[num_attr] = discretizer.fit_transform(
                    data[num_attr]).astype(np.int8)
            bin_edges = discretizer.bin_edges_

        data = data.drop_duplicates(data.columns.drop(target))

        # count how many instances each class has
        classes, counts = np.unique(data[target], return_counts=True)
        ruleset = []
        all_attr = data.columns.drop(target)

        # prepare a progress bar if user has the module
        pbar = tqdm(total=len(data)) if tqdm is not None else None

        # main loop - generate rules for each class
        for label, unclass_count in zip(classes, counts):
            instance_set = data
            total_tp = unclass_count

            while unclass_count > 0:
                rule = Rule(label=label)
                unused_attr = list(all_attr)
                rule_coverage = instance_set
                precision = 0

                # construct the rule by adding selectors to the antecedent
                while len(unused_attr) > 0 and precision != 1:
                    precision, best_tp = 0, 0
                    best_attr, best_value = None, None
                    best_selector = None

                    # look for the best attribute-value pair in terms of precision
                    for attr in unused_attr:
                        for value in rule_coverage[attr].unique():
                            selector = rule_coverage[attr].values == value
                            tp = (rule_coverage[target].values[selector] ==
                                  label).sum()
                            tp_fp = selector.sum()
                            selector_precision = tp / tp_fp
                            if selector_precision > precision or \
                               selector_precision == precision and tp > best_tp:
                                precision = selector_precision
                                best_attr, best_value = attr, value
                                best_tp = tp
                                best_selector = selector

                    rule_coverage = rule_coverage[best_selector]
                    unused_attr.remove(best_attr)

                    # append the best selector to the antecedent of the rule
                    if best_attr in num_attr:
                        idx = num_attr.get_loc(best_attr)
                        edges = bin_edges[idx]
                        if best_value == 0:  # lower interval
                            rule.antecedent.append((best_attr, '<', edges[1]))
                        elif best_value == len(edges) - 2:  # higher interval
                            rule.antecedent.append(
                                (best_attr, '>=', edges[-2]))
                        else:  # anything in between
                            rule.antecedent.append(
                                (best_attr, '>=', edges[best_value]))
                            rule.antecedent.append(
                                (best_attr, '<', edges[best_value + 1]))
                    else:
                        rule.antecedent.append((best_attr, '==', best_value))

                rule.label = label
                rule.precision = precision
                rule.recall = best_tp / total_tp
                ruleset.append(rule)
                instance_set = instance_set.drop(rule_coverage.index)
                unclass_count -= best_tp

                # update progress bar
                if pbar is not None:
                    pbar.update(best_tp)

        self.ruleset_ = ruleset
        self.target_ = target
        self.majority_ = data[target].mode().values[0]
        return self
ALPHA = 1
ALPHA_DECAY = 1  # 1 0.9 # 0.9999 #0.9975
ALPHA_MIN = 0.0001
ssc = StandardScaler()
ssc = MinMaxScaler()
digitizer = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='kmeans')

data = load_breast_cancer()
train_data = data.data
y = data.target
labels = y
z = np.where(y == 1, 100, -100)
#oe = OneHotEncoder(handle_unknown='ignore', sparse=False)
#train_labels= oe.fit_transform(y.reshape(-1,1))
#digitizer followed by MinMaxScaler
train_data = digitizer.fit_transform(train_data)
ssc = MinMaxScaler()
train_data = ssc.fit_transform(train_data)


class DQN():
    def __init__(self, input_size, output_size, data, labels):
        self.model = self.create_model(input_size, output_size)
        self.target_model = self.create_model(input_size, output_size)
        self.obs = data
        self.labels = labels
        self.replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE)
        pass

    def create_model(self, input_size, output_size):
예제 #19
0
    def _prep_data(self, json):
        # read the file
        cmp = pd.read_csv(self.file)

        #save the user information
        user = json['Company name']

        #merging input user details
        df_json = pd.DataFrame(json, index=[0])
        df_json.drop(['Name'], axis=1, inplace=True)
        cmp_final = pd.concat([cmp, df_json], axis=0, sort=True).reset_index()
        cmp_final.fillna(0, inplace=True)

        #drop out the policy details
        cmp2 = cmp_final.drop(self.insurance_details, axis=1)

        #drop out variables not needed for model fitting
        cmp2 = cmp2.drop(
            ["Company name", "Total # of employees", "Business Start year"],
            axis=1)

        #feature extraction of address
        cmp2['Address_group'] = cmp2['Address'].astype(str).str[:2]
        cmp2.drop('Address', axis=1, inplace=True)

        binscategorical = [
            'Number of employees full time', 'Number of employees parttime',
            '# of years experience in Industry', 'Projected Annual Revenue',
            'Projected Payroll for employers in next 12 months '
        ]

        #binning continuous variables
        kbd = KBinsDiscretizer(
            n_bins=10, encode='ordinal',
            strategy='quantile')  # read documentation for encode and strategy
        dfkbd = pd.DataFrame(kbd.fit_transform(cmp2[binscategorical]),
                             columns=['kbd_' + x for x in binscategorical])
        cmp_bins = pd.concat([cmp2, dfkbd], axis=1)
        cmp_bins = cmp_bins.drop(binscategorical, axis=1)

        #feature engineering for categorical variables- get_dummies
        categorical_new = [
            'Industry', 'Type of Ownership', 'Address_group',
            'kbd_Number of employees full time',
            'kbd_Number of employees parttime',
            'kbd_# of years experience in Industry',
            'kbd_Projected Annual Revenue',
            'kbd_Projected Payroll for employers in next 12 months '
        ]
        cmp_bins[categorical_new] = cmp_bins[categorical_new].astype(str)
        cmp_catdumm = pd.get_dummies(cmp_bins[categorical_new],
                                     drop_first=True)  # dummy_na=True
        cmp_catdumm = pd.concat([cmp_bins, cmp_catdumm], axis=1)
        cmp_catdumm = cmp_catdumm.drop(categorical_new, axis=1)

        #Scaling numerical variables
        numericvars = ['Number of owners active ', 'Number of locations']
        ss = StandardScaler(with_mean=True, with_std=True)
        cmp_catdummss = pd.DataFrame(ss.fit_transform(
            cmp_catdumm[numericvars]),
                                     columns=['ss_' + x for x in numericvars])
        cmp_catdummss = pd.concat([cmp_catdumm, cmp_catdummss], axis=1)
        cmp_catdummss = cmp_catdummss.drop(numericvars, axis=1)

        return self._inference(cmp_final, cmp_catdummss, user)
예제 #20
0
def main(name, output_dir, transparent, context, style, palette, width, height,
         aspect, dpi, extension, seed):

    num_samples = 256
    random_state = np.random.RandomState(seed)

    # preamble
    if height is None:
        height = width / aspect
    # height *= num_iterations
    # figsize = size(width, aspect)
    figsize = (width, height)

    suffix = f"{width*dpi:.0f}x{height*dpi:.0f}"

    rc = {
        "figure.figsize": figsize,
        "font.serif": ["Times New Roman"],
        "text.usetex": True,
    }
    sns.set(context=context, style=style, palette=palette, font="serif", rc=rc)

    output_path = Path(output_dir).joinpath(name)
    output_path.mkdir(parents=True, exist_ok=True)
    # / preamble

    benchmark = Branin()
    bounds = benchmark.get_bounds()

    (low, high), dim = from_bounds(bounds)

    x = random_state.uniform(low=low, high=high, size=(num_samples, dim))
    X = np.expand_dims(x, axis=1)

    y = benchmark(x[::, 0], x[::, 1])

    n_bins = 10
    scaler = KBinsDiscretizer(n_bins=n_bins,
                              encode="ordinal",
                              strategy="quantile")
    z = 1 + scaler.fit_transform(y.reshape(-1, 1)).squeeze()

    frame = pd.DataFrame(data=x).assign(y=y, z=z)

    fig, ax = plt.subplots()

    pd.plotting.parallel_coordinates(frame,
                                     class_column="z",
                                     use_columns=False,
                                     colormap="turbo",
                                     sort_labels=False,
                                     linewidth=0.25,
                                     alpha=0.7,
                                     ax=ax)

    ax.legend()

    plt.tight_layout()

    for ext in extension:
        fig.savefig(output_path.joinpath(f"foo_{suffix}.{ext}"),
                    dpi=dpi,
                    transparent=transparent)

    plt.show()

    return 0
예제 #21
0
def main(use_simple_lr_pca_pipeline, kbins_strat, train_split, test_split,
         exclude_pca, hyperparameters, output_size, validation_size, n_process,
         precached_pkl, prestore_data, return_mode,
         use_simple_lin_reg_pca_pipeline, use_simple_lstm, discretize_age,
         kbins_encoding, num_epochs, num_pca_comp):
    if precached_pkl is not None:
        allData = pkl.load(open(precached_pkl, 'rb'))
        data = allData["data"]
        # clinical_txt_paths = precached_pkl["clinical_txt_paths"]
        ages = allData["ages"]
        testAges = allData["testAges"]
        testData = allData["testData"]
        # test_clinical_txt_paths = precached_pkl["test_clinical_txt_paths"]
    else:
        data, ages, clinical_txt_paths = get_data(split=train_split)
        testData, testAges, test_clinical_txt_paths = get_data(
            split=test_split)
    return_dict = Dict()

    if prestore_data:
        toStore = Dict()
        toStore.data = data
        toStore.ages = ages
        toStore.clinical_txt_paths = clinical_txt_paths
        toStore.testData = testData
        toStore.testAges = testAges
        toStore.test_clinical_txt_paths = test_clinical_txt_paths
        if return_mode == "age":
            pkl.dump(toStore, open("agePredictionData.pkl", 'wb'))
        elif return_mode == "bpm":
            pkl.dump(toStore, open("bpmPredictionData.pkl", 'wb'))
        return return_mode

    if discretize_age:
        kbins = KBinsDiscretizer(output_size,
                                 encode=kbins_encoding,
                                 strategy=kbins_strat)
        ages = np.array(ages).reshape(-1, 1)
        ages = kbins.fit_transform(ages)
        return_dict['kbins'] = kbins.bin_edges_
        testAges = np.array(testAges).reshape(-1, 1)
        testAges = kbins.transform(testAges)
        print("KBins used!  Edges are: {}".format(kbins.bin_edges_))

    if use_simple_lstm:
        ageScaler = StandardScaler()
        ages = np.array(ages).reshape(-1, 1)
        ages = ageScaler.fit_transform(ages)
        testAges = np.array(testAges).reshape(-1, 1)
        testAges = ageScaler.transform(testAges)
        model = get_lstm()
        x = pad_sequences(data)
        model.fit(x,
                  ages,
                  epochs=num_epochs,
                  validation_split=validation_size,
                  callbacks=get_early_stopping())
        testX = pad_sequences(testData)
        score = model.evaluate(testX, testAges)
        y_pred = model.predict(testX)

        ages = ageScaler.inverse_transform(ages)
        testAges = ageScaler.inverse_transform(testAges)
        mse = mean_squared_error(y_pred, testAges)
        r2 = r2_score(y_pred, testAges)
        print("MSE: {}".format(mse))
        print("R2: {}".format(r2))
        fn = "model_{}_epochs{}.h5".format(return_mode, num_epochs)
        model.save(fn)
        ex.add_artifact(fn)
        return score, mse, r2

    if use_simple_lin_reg_pca_pipeline:
        ages = np.array(ages).reshape(-1, 1)
        testAges = np.array(testAges).reshape(-1, 1)
        data = np.stack(data).reshape(len(data), -1)
        testData = np.stack(testData).reshape(len(testData), -1)

        steps = [
            ('pca', PCA(n_components=num_pca_comp)),
            ('scaler', StandardScaler()),
            ('lin_reg', LinearRegression()),
        ]
        if exclude_pca:
            steps = steps[1:]
        p = Pipeline(steps)
        cv = int(1 / validation_size)
        gridsearch = GridSearchCV(p,
                                  hyperparameters,
                                  scoring=make_scorer(r2_score),
                                  cv=cv,
                                  n_jobs=n_process)
        gridsearch.fit(data, ages)
        return_dict["gridsearch_best_estimator"] = gridsearch.best_estimator_
        return_dict["best_cv_score"] = gridsearch.best_score_
        print("best cv score was {}".format(gridsearch.best_score_))
        best_pipeline = gridsearch.best_estimator_
        best_pipeline.fit(data, ages)

        y_pred = best_pipeline.predict(data)
        y_pred[y_pred < 0] = 0
        y_pred[y_pred > 90] = 90
        print("train r^2 was {}".format(r2_score(ages, y_pred)))

        y_pred = best_pipeline.predict(testData)
        y_pred[y_pred < 0] = 0
        y_pred[y_pred > 90] = 90
        test_score = mean_squared_error(testAges, y_pred)
        print("test_score: {}".format(test_score))
        print("test r^2 was {}".format(r2_score(testAges, y_pred)))
        return_dict["test_score"] = test_score
        pkl.dump(return_dict,
                 open("predict_{}Exp.pkl".format(return_mode), 'wb'))
        ex.add_artifact("predict_{}Exp.pkl".format(return_mode))
        return test_score, r2_score(testAges, y_pred)

    if use_simple_lr_pca_pipeline:
        data = np.stack(data).reshape(len(data), -1)
        testData = np.stack(testData).reshape(len(testData), -1)

        steps = [
            ('pca', PCA(n_components=num_pca_comp)),
            ('scaler', StandardScaler()),
            ('lr', LogisticRegression()),
        ]
        if exclude_pca:
            steps = steps[1:]
        p = Pipeline(steps)
        cv = int(1 / validation_size)
        gridsearch = GridSearchCV(p,
                                  hyperparameters,
                                  scoring=make_scorer(r2_score),
                                  cv=cv,
                                  n_jobs=n_process)
        gridsearch.fit(data, ages)
        return_dict["gridsearch_best_estimator"] = gridsearch.best_estimator_
        return_dict["best_cv_score"] = gridsearch.best_score_
        print("best cv score was {}".format(gridsearch.best_score_))
        best_pipeline = gridsearch.best_estimator_
        best_pipeline.fit(data, ages)
        y_pred = best_pipeline.predict(data)
        print("train r^2 was {}".format(r2_score(ages, y_pred)))

        y_pred = best_pipeline.predict(testData)
        test_score = f1_score(testAges, y_pred, average="weighted")

        y_pred_orig = kbins.inverse_transform(y_pred.reshape(-1, 1))
        test_ages_orig = kbins.inverse_transform(testAges.reshape(-1, 1))

        print("test r^2 was {}".format(r2_score(testAges, y_pred)))
        print("test mse was {}".format(
            mean_squared_error(test_ages_orig, y_pred_orig)))

        print("test_score: f1 {}".format(test_score))
        print("test_score: accuracy {}".format(accuracy_score(
            testAges, y_pred)))

        return_dict["test_score"] = test_score
        pkl.dump(return_dict,
                 open("predict_{}Exp.pkl".format(return_mode), 'wb'))
        ex.add_artifact("predict_{}Exp.pkl".format(return_mode))
        return test_score

    raise Exception("Valid config not set")
예제 #22
0
# In[539]:

q1()

# ## Question 2
#
# Discretizing the `Pop_density` variable into 10 intervals with `KBinsDiscretizer`, using the `ordinal` encoding and the `quantile` strategy, how many countries fall above the 90th percentile? Answer with a single integer scalar.

# In[540]:

#https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.KBinsDiscretizer.html
discretizer = KBinsDiscretizer(n_bins=10,
                               encode="ordinal",
                               strategy="quantile")

pop_density_bins = discretizer.fit_transform(countries[["Pop_density"]])

pop_density_bins[:5]

# In[541]:

np.unique(pop_density_bins)

# In[542]:


def q2():
    # Return the result of question 2 here.
    return int((pop_density_bins >= 9).sum())

예제 #23
0

        ## impute missing values with the mean and apply the same transform to test_instance
        col_names = train_set.columns
        imputer = SimpleImputer(strategy='mean')
        train_set = imputer.fit_transform(train_set)
        test_instance = imputer.transform(test_instance.values.reshape(1, -1))

        ## normalize features
        scaler = StandardScaler()
        train_set = scaler.fit_transform(train_set)
        test_instance = scaler.transform(test_instance)
        
        ## discretize features
        disc = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='quantile')
        train_set = disc.fit_transform(train_set)
        test_instance = disc.transform(test_instance)
        
        ## re-assign colnames to train and test (convert back to dataframe and series)
        train_set = pd.DataFrame(train_set, columns = col_names)
        test_instance = pd.Series(test_instance[0], index=col_names)
        
        ## second, drop constant columns to speed up feature selection
        train_set = train_set.loc[:, train_set.nunique() != 1]
        test_instance = test_instance[train_set.columns]

        ## feature-selection for each target columns
        selected_features = utils.anova_feature_selection(train_set, targets[target_grp], pval_threshold=0.01)

        ## union selected features for all targets
        union_selected_features = reduce(np.union1d, tuple([feature_list for _, feature_list in selected_features.items()]))
예제 #24
0
from sklearn.preprocessing import KBinsDiscretizer

df = pd.read_csv(open('../data/magic.csv', 'rb'), header=None)
est = KBinsDiscretizer(n_bins=6, encode='ordinal', strategy='quantile')

cat_df = pd.DataFrame( est.fit_transform(df[df.columns[:10]]), dtype='int' )
cat_df['TARGET'] = df[10]
cat_df = cat_df.apply(lambda c: c.astype('category'))

import pickle
pickle.dump(cat_df, open('magic.pkl', 'wb'))
예제 #25
0
countries.dtypes

# ## Start your analysis from here

# In[30]:

# Your analysis starts here.

list(countries.Region.sort_values().unique())

# In[33]:

discretizer = KBinsDiscretizer(n_bins=10,
                               encode='ordinal',
                               strategy='quantile')
bin_pop_density = discretizer.fit_transform(countries[['Pop_density']])
int(sum(bin_pop_density[:, 0] == 9))

# In[36]:

encoded = pd.get_dummies(countries[['Region', 'Climate']].fillna(''))
int(encoded.shape[1])

# num_pipeline = Pipeline(steps = [
#     ("imputer", SimpleImputer(strategy="median")),
#     ("standart_scaler", StandardScaler())
# ])
#
# numeric_features = countries.select_dtypes(include=['float64', 'int64'])
# num_pipeline.fit(numeric_features)
# test_country_transform = num_pipeline.transform([test_country[2:]])
예제 #26
0
def q2():
    # Return the result of question 2 here.
    discretizer = KBinsDiscretizer(n_bins = 10, encode = 'ordinal', strategy = 'quantile')
    discretizer_pop = discretizer.fit_transform(countries[['Pop_density']])
    above_p90 = discretizer.bin_edges_[0][9]
    return int(countries[countries['Pop_density'] >= above_p90]['Pop_density'].count())
예제 #27
0
# We recall that one way to accelerate gradient boosting is to reduce the
# number of splits considered during tree building. One option is to bin the
# data before feeding them to the gradient boosting. A transformer called
# `KBinsDiscretizer` performs such a transformation. Thus, we can pipeline
# this preprocessing with the gradient boosting.
#
# We can first demonstrate the transformation done by the `KBinsDiscretizer`.

# %%
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

discretizer = KBinsDiscretizer(n_bins=256,
                               encode="ordinal",
                               strategy="quantile")
X_trans = discretizer.fit_transform(X_train)
X_trans

# %% [markdown]
# ```{note}
# The code cell above will generate a couple of warnings. Indeed, for some of
# the features, we requested too many bins relative to the dispersion of the
# data for those features; bins that would be too small are removed.
# ```
# We see that the discretizer transforms the original data into integers.
# Each integer represents the index of the bin when the data are split by
# quantiles. We can check the number of bins per feature.

# %%
[len(np.unique(col)) for col in X_trans.T]
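# %% [markdown]
# As a small aside (not part of the original notebook), the thresholds learned for
# each feature are stored in the fitted discretizer's `bin_edges_` attribute;
# inspecting the first feature shows the quantile cut points that define its bins.

# %%
discretizer.bin_edges_[0]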
class WOEEncoder(BaseEstimator, TransformerMixin):
    """Weight of Evidence (WoE) encoder: encodes categorical features as a numerical vector
    using weight of evidence encoding. This is only supported with binary targets. Both the
    features and the target are assumed to be free of missing values, missing values should
    be handled separately before the encoding. A binning function can be provided to handle
    numerical features which are then binned first then encoded.
    Note that the sign of the weight of evidence values depends on the order in which the
    categories of the target column are detected. This does not affect the performance of
    any supervised model applied thereafter.
    See [1] for more details on WoE.

    Parameters
    ----------
    binning: {'uniform', 'quantile', 'kmeans', None}, default=None
        What binning method to apply, no binning applied if set to None.
        This uses ScikitLearn's KBinsDiscretizer (see [2]).
        uniform: all bins in each feature have identical width.
        quantile: all bins in each feature have the same number of points.
        kmeans: values in each bin have the same nearest center of a 1D kmeans cluster.

    n_bins: int (default=10), at least 2
        Number of bins to use when binning is applied.

    alpha: float (default = 0.5), non-negative
        Regularization value to avoid numerical errors due to division by zero in the
        computation of the weight of evidence (e.g. in case the data points corresponding
        to one category of a feature all have the same target value).

    laplace: boolean (default = False)
        If alpha is positive, adds Laplace smoothing to the computation of the weight of
        evidence.

    Example
    -------
    >>> import numpy as np
    >>> from sagemaker_sklearn_extension.preprocessing import WOEEncoder
    >>> np.random.seed(112)
    >>> N = 10
    >>> y = np.random.choice([0, 1], size=N)
    >>> y
    array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0])
    >>> sex = np.random.choice(['m', 'f'], size=N)
    >>> sex
    array(['m', 'f', 'm', 'm', 'f', 'm', 'f', 'm', 'm', 'm'], dtype='<U1')
    >>> WOEEncoder().fit_transform(sex.reshape(-1, 1), y)
    array([[ 1.06087196],
           [-2.35137526],
           [ 1.06087196],
           [ 1.06087196],
           [-2.35137526],
           [ 1.06087196],
           [-2.35137526],
           [ 1.06087196],
           [ 1.06087196],
           [ 1.06087196]])
    >>> age = np.random.randint(low=25, high=95, size=N)
    >>> age
    array([54, 73, 76, 30, 53, 33, 28, 51, 62, 43])
    >>> WOEEncoder(binning='quantile', n_bins=2).fit_transform(age.reshape(-1, 1), y)
    array([[-0.74193734],
           [-0.74193734],
           [-0.74193734],
           [ 0.69314718],
           [-0.74193734],
           [ 0.69314718],
           [ 0.69314718],
           [ 0.69314718],
           [-0.74193734],
           [ 0.69314718]])

    Attributes
    ----------
    binner_: estimator trained to bin numerical data if binning is not None.

    woe_pairs_: list of pairs (codex, woe) of size n_encoded_features
        The codex has the mapping feature_value => woe_index and woe has the weight of
        evidence values.

    References
    ----------
    [1] https://www.listendata.com/2015/03/weight-of-evidence-woe-and-information.html
    [2] https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.KBinsDiscretizer.html
    [3] https://en.wikipedia.org/wiki/Additive_smoothing
    """

    def __init__(self, binning=None, n_bins=10, alpha=0.5, laplace=False):
        self.binning = binning
        self.n_bins = n_bins
        self.alpha = alpha
        self.laplace = laplace

    def _woe(self, x, count_y_0, mask_y_0, beta):
        """Return the categories for a feature vector `x` as well as the corresponding
        weight of evidence value for each of those categories.

        Parameters
        ----------
        x: vector, shape (n_samples,)
            Feature vector to encode.
        count_y_0: int
            Number of observations with the first target category.
        mask_y_0: vector, shape (n_samples,)
            Mask of observations with the first target category.
        beta: float
            Value to use for Laplace Smoothing (0 if laplace is False).
        """
        cat_x = np.unique(x)
        mask_y_1 = np.logical_not(mask_y_0)
        count_y_1 = len(mask_y_0) - count_y_0

        # Computation of the Weight of Evidence for a category c in cat_x and with
        # regularization α
        #
        #   woe_c = log( { (#(y==0 | c) + α) / (#(y==1 | c) + α) } *
        #                { (#(y==1) + β) / (#(y==0) + β) } )
        #
        # where β = 2α if laplace == True, 0 otherwise. The second factor can be computed
        # once, call it `r10` then
        #
        #   woe_c = log( r10 * ratio(c) )
        #
        # where
        #
        #   ratio(c) = { #(y==0 | c) + α } / { #(y==1 | x==c) + α }
        #

        def ratio(c):
            x_c = x == c
            # retrieve the number of (y == 0 | x == c) and same for y == 1
            y_0_c = sum(np.logical_and(mask_y_0, x_c))
            y_1_c = sum(np.logical_and(mask_y_1, x_c))
            # compute the ratio with regularization for 0 events
            return (y_0_c + self.alpha) / (y_1_c + self.alpha)

        # computation of woe possibly using Laplace smoothing (beta factor)
        r10 = (count_y_1 + beta) / (count_y_0 + beta)
        woe = np.log(r10 * np.array([ratio(c) for c in cat_x]))
        # encoder from unique values of x to index
        codex = {c: i for (i, c) in enumerate(cat_x)}
        return (codex, woe)

    def fit(self, X, y):
        """Fit Weight of Evidence encoder to `X` and `y`.

        Parameters
        ----------
        X: array-like, shape (n_samples, n_features)
            The data to encode.

        y: array-like, shape (n_samples,)
            The binary target vector.

        Returns
        -------
        self: WOEEncoder.
        """
        # Validate parameters
        if self.binning:
            assert self.binning in ("uniform", "quantile", "kmeans"), WOEAsserts.BINNING
            assert self.n_bins >= 2, WOEAsserts.NBINS
        assert self.alpha >= 0, WOEAsserts.ALPHA
        # Validate data
        X, y = check_X_y(X, y)
        # Keep track of number of features encoded
        self._dim = X.shape[1]
        # recover the target categories and check there's only two
        cat_y = np.unique(y)
        # it should be == 2 but relax to <= 2 for a single-sample test by check_estimator
        assert len(cat_y) <= 2, WOEAsserts.BINARY

        # value for laplace smoothing
        beta = 2 * self.alpha * self.laplace

        # count the number of occurrences per target class and form the mask
        # for the rows for which y==0
        mask_y_0 = y == cat_y[0]
        count_y_0 = sum(mask_y_0)

        if self.binning:
            self.binner_ = KBinsDiscretizer(n_bins=self.n_bins, strategy=self.binning, encode="ordinal")
            Xp = self.binner_.fit_transform(X)
        else:
            Xp = X
        # go over each column and compute the woe
        self.woe_pairs_ = list(map(lambda x: self._woe(x, count_y_0, mask_y_0, beta), Xp.T))
        return self

    def transform(self, X):
        """Transform each column of `X` using the Weight-of-Evidence encoding.

        Returns
        -------
        X_encoded: array, shape (n_samples, n_encoded_features)
            Array with each of the encoded columns.
        """
        # check is fitted
        check_is_fitted(self, "woe_pairs_")
        # check input
        X = check_array(X)

        if X.shape[1] != self._dim:
            raise ValueError(f"The input dimension is {X.shape[1]} instead of the expected {self._dim}")

        if self.binning:
            Xp = self.binner_.transform(X)
        else:
            Xp = X

        Xe = np.zeros(Xp.shape)
        for (i, x) in enumerate(Xp.T):
            codex, woe = self.woe_pairs_[i]
            # check that the data to encode doesn't have classes yet unseen
            assert all([e in codex.keys() for e in np.unique(x)]), WOEAsserts.UNSEEN_CAT
            # construct the encoded column by looking up each value's index in the
            # codex (the assert above guarantees every category was seen during fit)
            Xe[:, i] = np.array([woe[codex[xi]] for xi in x])

        return Xe

    def fit_transform(self, X, y):
        return self.fit(X, y).transform(X)

    def _more_tags(self):
        return {"X_types": ["categorical"], "binary_only": True, "requires_y": True}
예제 #29
0
def q2():
    discretizar = KBinsDiscretizer(n_bins=10, encode="ordinal",strategy="quantile")
    intervalo = discretizar.fit_transform(countries[["Pop_density"]])
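    # with 10 quantile bins, the top bin (values above the 90th percentile) holds roughly 10% of the rows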
    
    resposta = len(intervalo) - (0.9 * len(intervalo))
    return ceil(resposta)
예제 #30
0
def test_inverse_transform(strategy, encode, expected_inv):
    kbd = KBinsDiscretizer(n_bins=3, strategy=strategy, encode=encode)
    Xt = kbd.fit_transform(X)
    Xinv = kbd.inverse_transform(Xt)
    assert_array_almost_equal(expected_inv, Xinv)
예제 #31
0
##########################################

if norm_target == 1:
    #Target normalization for continuous values
    target_np = scale(target_np)

if norm_features == 1:
    #Feature normalization for continuous values
    data_np = scale(data_np)

if binning == 1:
    #Discretize Target variable with KBinsDiscretizer
    enc = KBinsDiscretizer(
        n_bins=[bin_cnt], encode='ordinal', strategy='quantile'
    )  # Strategy here is important: 'quantile' creates equally populated bins, while 'kmeans' probably yields more valid "clusters"
    target_np_bin = enc.fit_transform(target_np.reshape(-1, 1))

    #Get Bin min/max
    temp = [[] for x in range(bin_cnt + 1)]
    for i in range(len(target_np)):
        for j in range(bin_cnt):
            if target_np_bin[i] == j:
                temp[j].append(target_np[i])

    for j in range(bin_cnt):
        print('Bin', j, ':', min(temp[j]), max(temp[j]), len(temp[j]))
    print('\n')

    #Convert Target array back to correct shape
    target_np = np.ravel(target_np_bin)
예제 #32
0
# We recall that one way of accelerating the gradient boosting is to reduce the
# number of splits considered during tree building. One option is to bin the
# data before feeding them to the gradient boosting. A transformer called
# `KBinsDiscretizer` performs such a transformation. Thus, we can pipeline
# this preprocessing with the gradient boosting.
#
# We can first demonstrate the transformation done by the `KBinsDiscretizer`.

# %%
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

discretizer = KBinsDiscretizer(n_bins=256,
                               encode="ordinal",
                               strategy="quantile")
data_trans = discretizer.fit_transform(data)
data_trans

# %% [markdown]
# ```{note}
# The code cell above will generate a couple of warnings. Indeed, for some of
# the features, we requested too many bins relative to the dispersion of the
# data for those features; the smallest bins will be removed.
# ```
# We see that the discretizer transforms the original data into integers.
# Each integer represents the index of the bin when the data are split by
# quantiles. We can check the number of bins per feature.

# %%
[len(np.unique(col)) for col in data_trans.T]
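# %% [markdown]
# A minimal sketch of the pipelining mentioned above (an illustration, not the
# notebook's own model): chaining the discretizer with a gradient boosting estimator
# keeps the binning inside the pipeline, so it is learned on the training data only.
# The estimator choice is an assumption, and `target` stands for the target vector in scope.

# %%
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline

binned_gbrt = make_pipeline(
    KBinsDiscretizer(n_bins=256, encode="ordinal", strategy="quantile"),
    GradientBoostingRegressor(),
)
# 'target' is a placeholder for the target vector that pairs with `data`
binned_gbrt.fit(data, target)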
예제 #33
0
File: 分箱操作.py  Project: HG1227/ML
ax1.plot(X[:, 0], y, 'o', c='k')
#other plot options
ax1.legend(loc="best")
ax1.set_ylabel("Regression output")
ax1.set_xlabel("Input feature")
ax1.set_title("Result before discretization")
plt.tight_layout()
plt.show()

from sklearn.preprocessing import KBinsDiscretizer
#bin the data
enc = KBinsDiscretizer(
    n_bins=10,  #how many bins?
    encode="onehot")  #alternative: "ordinal"
X_binned = enc.fit_transform(X)
#encode="onehot": discretization is done via dummy variables,
#returning a sparse matrix of shape (m, n_bins) where each column is one bin;
#for each sample, the bin it falls into is marked 1 and every other bin is 0
X.shape
X_binned
#inspect the sparse matrix with pandas
import pandas as pd
pd.DataFrame(X_binned.toarray()).head()
#we will train the model on the binned data; in sklearn the test set must have the same structure as the training set, otherwise an error is raised
LinearR_ = LinearRegression().fit(X_binned, y)
LinearR_.predict(line)  #line serves as the test set
line.shape  #test set shape
X_binned.shape  #training set shape
#therefore we need a binned test set: bin line with the already-fitted binning model
line_binned = enc.transform(line)
def qualify(dataframe, name, quantity, strategy):
    est = KBinsDiscretizer(n_bins=[quantity],
                           encode='ordinal',
                           strategy=strategy)
    dataframe[name] = est.fit_transform(dataframe[[name]])
    return dataframe
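# hypothetical usage (dataframe/column names assumed):
# qualify(countries, 'Pop_density', 10, 'quantile')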
예제 #35
0
from sklearn.preprocessing import KBinsDiscretizer
KBTT = KBinsDiscretizer(n_bins=10, encode="onehot-dense")
X_ti = titanic.loc[:, ["pclass", "age", "sex"]]
Y_ti = titanic.loc[:, ["survived"]]
print(X_ti.info())
print(Y_ti.info())
"""
由上面的信息我们设计如下几个数据处理的任务 
1:age这个数据列 只有633个 需要补完 使用平均数或者中位数
2:sex 与 pclass 这两个列 都是类别类型的 需要转化为数值特征 用0/1代替
#观察信息 发现age列有缺失值 对于这个
"""

X_ti["age"].fillna(X_ti["age"].mean(),
                   inplace=True)  #False: create a copy, modify the copy, leave the original unchanged (default); True: modify the original object directly
print(KBTT.fit_transform(X_ti.loc[:, ["age"]]))
print("######################################")
print(X_ti.info())
print("######################################")
Xtraint, Xtestt, Ytraint, Ytestt = train_test_split(X_ti,
                                                    Y_ti,
                                                    random_state=33,
                                                    test_size=0.25)
#for feature conversion we use the feature transformer from sklearn.feature_extraction
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer(sparse=False)  # sparse=False means dense output rather than a sparse matrix
# this transformer expects dicts as input, so the DataFrame has to be converted
#to_dict() supports six conversion types, selected via its parameter:
# 'dict' ({column -> {index -> value}}, accessed as data_dict[key1][key2]), 'list' ({column -> [values]}, data_list[keys][index]),
# 'series' ({column -> Series(values)}, data_series[key1][key2] or data_dict[key1]),
# 'split' ({index -> [index], columns -> [columns], data -> [values]}, data_split['index'], data_split['data'], data_split['columns']),
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.tree import DecisionTreeRegressor

print(__doc__)

# construct the dataset
rnd = np.random.RandomState(42)
X = rnd.uniform(-3, 3, size=100)
y = np.sin(X) + rnd.normal(size=len(X)) / 3
X = X.reshape(-1, 1)

# transform the dataset with KBinsDiscretizer
enc = KBinsDiscretizer(n_bins=10, encode='onehot')
X_binned = enc.fit_transform(X)

# predict with original dataset
fig, (ax1, ax2) = plt.subplots(ncols=2, sharey=True, figsize=(10, 4))
line = np.linspace(-3, 3, 1000, endpoint=False).reshape(-1, 1)
reg = LinearRegression().fit(X, y)
ax1.plot(line, reg.predict(line), linewidth=2, color='green',
         label="linear regression")
reg = DecisionTreeRegressor(min_samples_split=3, random_state=0).fit(X, y)
ax1.plot(line, reg.predict(line), linewidth=2, color='red',
         label="decision tree")
ax1.plot(X[:, 0], y, 'o', c='k')
ax1.legend(loc="best")
ax1.set_ylabel("Regression output")
ax1.set_xlabel("Input feature")
ax1.set_title("Result before discretization")