Example #1
import pandas as pd
from sklearn.preprocessing import StandardScaler


def main(params, inputs, outputs):
    ### Read the input data ###
    df = inputs.df
    df_new = outputs.df_new
    df = pd.read_pickle(df)

    ### Split the features into continuous and categorical variables ###
    df_x = df.iloc[:, :-1]
    df_y = df.iloc[:, -1]
    df_standard = df_x.drop(columns=df_x.select_dtypes(['object']).columns)
    df_label = df_x.select_dtypes(['object'])

    ### Standardize the continuous variables ###
    numeric_columns = df_standard.columns
    df_standard = StandardScaler().fit_transform(df_standard)
    df_standard = pd.DataFrame(df_standard, columns=numeric_columns,
                               index=df_x.index)

    ### Merge the standardized and categorical columns ###
    df_combine = df_standard.join(df_label)

    ### Write the output dataset ###
    df_combine.to_pickle(df_new)
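To make the standardize-and-rejoin step above concrete, here is a minimal sketch on a small in-memory frame instead of the pickled inputs/outputs objects; the toy column names and values are assumptions for illustration.

import pandas as pd
from sklearn.preprocessing import StandardScaler

# Hypothetical toy frame: two numeric features, one categorical column, one target.
toy = pd.DataFrame({
    'age': [21, 35, 52],
    'income': [30000, 52000, 87000],
    'segment': ['a', 'b', 'a'],   # object dtype, left unscaled
    'target': [0, 1, 1],
})

x = toy.iloc[:, :-1]
numeric = x.drop(columns=x.select_dtypes(['object']).columns)
categorical = x.select_dtypes(['object'])

scaled = pd.DataFrame(StandardScaler().fit_transform(numeric),
                      columns=numeric.columns, index=x.index)
print(scaled.join(categorical))   # standardized numeric columns plus the untouched object column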
Example #2
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, AgglomerativeClustering
from scipy.cluster.hierarchy import linkage, dendrogram

#`users` is the numeric customer-feature DataFrame prepared earlier in this script
#Label the elbow-curve plot
plt.ylabel('Score')
plt.title('Elbow Curve')
#plt.show()
#Run KMeans
kmeans = KMeans(n_clusters=7).fit(users)
#Plot the KMeans results
df_labeled = pd.DataFrame(kmeans.labels_, columns=['labels'])
df_labeled['labels'] = df_labeled['labels'].astype('category')
plt.figure(figsize=(10, 8))
df_labeled['labels'].value_counts().plot.bar(color='y')
plt.xlabel("Cluster")
plt.ylabel("Number of customers")
plt.title("Number of customers per cluster")
#plt.show()
#Append the KMeans cluster labels to users
users = users.join(df_labeled)
#Plot the dendrogram
plt.figure(figsize=(20, 10))
merg = linkage(users.drop('labels', axis=1), method='ward')
dendrogram(merg, leaf_rotation=360)
plt.title('Dendrogram')
#plt.show()
#Define the hierarchical clustering
hier_clus = AgglomerativeClustering(n_clusters=5,
                                    affinity='euclidean',
                                    linkage='ward')
cluster = hier_clus.fit_predict(users.drop('labels', axis=1))
#Append the hierarchical clustering labels
users['Agg_label'] = cluster
#CH (Calinski-Harabasz) plot
df_labeled = pd.DataFrame(hier_clus.labels_, columns=['labels'])
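The final comment above refers to a Calinski-Harabasz (CH) plot; a minimal sketch of how such a curve could be produced with sklearn's calinski_harabasz_score is shown below, assuming a purely numeric stand-in for the users features (the random data and the 2-10 cluster range are illustrative).

import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import calinski_harabasz_score

# Hypothetical stand-in for the numeric part of the users frame.
rng = np.random.default_rng(0)
features = rng.normal(size=(200, 4))

ks = range(2, 11)
ch_scores = []
for k in ks:
    labels = AgglomerativeClustering(n_clusters=k, linkage='ward').fit_predict(features)
    ch_scores.append(calinski_harabasz_score(features, labels))

plt.plot(list(ks), ch_scores, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Calinski-Harabasz score')
plt.title('CH index by cluster count')
plt.show()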
Example #3
 # Assumed imports for this class: pandas as pd, numpy as np, functools.reduce,
 # and sklearn's StandardScaler and PCA.
 def pca(self, x, components=None):
     if self.cols_to_pca is None:
         return x
     # Split off the columns that are not reduced by PCA
     x_other = x.iloc[:, list(set(range(len(x.columns))) - set(self.cols_to_pca))]
     x = x.iloc[:, self.cols_to_pca]
     scaled_x = StandardScaler().fit_transform(x)
     if self.pcaler is None:
         if components is None:
             components = scaled_x.shape[1]
         pcaler = PCA(n_components=components)
         self.pcaler = pcaler
         pca_x = pcaler.fit_transform(scaled_x)
     else:
         # Reuse the already-fitted PCA instead of refitting it
         pcaler = self.pcaler
         pca_x = pcaler.transform(scaled_x)
     scaled_x = pd.DataFrame(pca_x, columns=x.columns, index=x.index)
     x = scaled_x.join(x_other, how='outer')
     return x
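The pca method above boils down to: standardize a chosen subset of columns, project it with PCA, and join the components back with the untouched columns. A standalone sketch of that idea, outside the class; the column names, cols_to_pca indices and the two-component choice are assumptions.

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Hypothetical frame: reduce the first three columns, keep the last one as-is.
df = pd.DataFrame(np.random.default_rng(1).normal(size=(50, 4)),
                  columns=['f1', 'f2', 'f3', 'other'])
cols_to_pca = [0, 1, 2]

subset = df.iloc[:, cols_to_pca]
rest = df.drop(df.columns[cols_to_pca], axis=1)

scaled = StandardScaler().fit_transform(subset)
components = PCA(n_components=2).fit_transform(scaled)
reduced = pd.DataFrame(components, columns=['pc1', 'pc2'], index=df.index)
print(reduced.join(rest).head())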
     
 # Rolling scaling to ensure out-of-sample testing
 # TODO
 # def roll_scale(self, x_test, x):
 #     if self.cols_to_scale == None:
 #         return x_test
 #     x_other = x.iloc[:, list(set(list(range(0, len(x.columns)))) - set(self.cols_to_scale))]
 #     x = x.iloc[:, self.cols_to_scale]
 #     x_test = x_test.iloc[:, self.cols_to_scale]
 #     x_all = x.append(x_test)
 #     scaled_x = x_all.expanding(min_periods=len(x)).apply(lambda x: self.scale(pd.DataFrame(x).iloc[len(x)-1:,], pd.DataFrame(x)[:len(x)-1]))
 #     scaled_x = pd.DataFrame(scaled_x, columns=x_all.columns, index=x_all.index)
 #     x = scaled_x.join(x_other, how='outer')
 #     return x
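One way to realize the out-of-sample scaling sketched in the commented-out roll_scale is to fit the scaler on past rows only and transform just the next row. This is a minimal sketch under that assumption, not the author's implementation; the single feature and the min_hist window are made up.

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

x_all = pd.DataFrame({'f': np.arange(1.0, 11.0)})   # hypothetical single-feature history
min_hist = 5                                        # start once this much history exists

rows = []
for i in range(min_hist, len(x_all)):
    scaler = StandardScaler().fit(x_all.iloc[:i])          # fit on past observations only
    rows.append(scaler.transform(x_all.iloc[i:i + 1])[0])  # scale the current row out of sample
rolled = pd.DataFrame(rows, columns=x_all.columns, index=x_all.index[min_hist:])
print(rolled)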
 
 #Merge data_dict into one DataFrame
 def merge_data(self, data_dict):
     data = data_dict.values()
     temp_data = reduce(lambda x, y: x.join(y, how='outer'), data)
     temp_data = temp_data.ffill()          # forward-fill gaps left by the outer join
     self.data_df = temp_data.dropna(axis=0)
     #self.data_df.sort_index(inplace=True)
     self.data_df.columns = list(data_dict.keys())
     return self.data_df
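A small sketch of what merge_data does, assuming the dict values are single-column DataFrames on a shared date index; the names, dates and values below are made up.

import pandas as pd
from functools import reduce

idx = pd.date_range('2020-01-01', periods=4)
data_dict = {
    'price':  pd.DataFrame({'close': [1.0, 1.1, None, 1.3]}, index=idx),
    'volume': pd.DataFrame({'vol': [10.0, 20.0, 30.0, None]}, index=idx),
}

merged = reduce(lambda a, b: a.join(b, how='outer'), data_dict.values())
merged = merged.ffill().dropna(axis=0)     # pad gaps forward, then drop rows that are still missing
merged.columns = list(data_dict.keys())    # rename columns to the dict keys, as merge_data does
print(merged)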
 
 #Ensure data is in a pd.DataFrame before splitting into train/test
 #Split chronologically: the first `threshold` fraction of rows becomes the training set
 #Returns y_train, y_test as 1-D arrays and x_train, x_test as DataFrames
 def get_traintest(self, x, y, threshold=0.8):
     common_ind = x.index.intersection(y.index)
     y = y.loc[common_ind]
     x = x.loc[common_ind]
     merged = y.to_frame().join(x)
     merged = merged.dropna(how='all')
     y = merged.iloc[:, 0]
     x = merged.iloc[:, 1:]
     cutoff = int(np.floor(len(merged) * threshold))
     end_cut = len(merged)
     y_train = y.iloc[0:cutoff]
     y_test = y.iloc[cutoff:end_cut]          # start the test set at cutoff so no row is skipped
     x_train = x.iloc[0:cutoff, :]
     x_test = x.iloc[cutoff:end_cut, :]
     return y_train.to_frame().values.ravel(), y_test.to_frame().values.ravel(), x_train, x_test
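Finally, a quick sketch of the chronological 80/20 split that get_traintest performs, stripped of the class plumbing; the date index, feature column and target series are illustrative.

import numpy as np
import pandas as pd

idx = pd.date_range('2021-01-01', periods=10)
x = pd.DataFrame({'feat': np.arange(10.0)}, index=idx)
y = pd.Series(np.arange(10) % 2, index=idx, name='target')

merged = y.to_frame().join(x).dropna(how='all')
cutoff = int(np.floor(len(merged) * 0.8))                # first 80% train, rest test, no shuffling
y_train, y_test = merged.iloc[:cutoff, 0], merged.iloc[cutoff:, 0]
x_train, x_test = merged.iloc[:cutoff, 1:], merged.iloc[cutoff:, 1:]
print(len(x_train), len(x_test))                         # 8 2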