def main(params, inputs, outputs):
    """Standardize the continuous feature columns of a pickled DataFrame.

    Loads the DataFrame pickled at ``inputs.df``, z-score scales every
    non-object column among all-but-the-last columns (the last column is
    treated as the target and excluded), re-attaches the object-dtype
    (categorical/label) columns unchanged, and pickles the result to
    ``outputs.df_new``.

    Args:
        params: pipeline parameters (unused here).
        inputs: object with attribute ``df`` — path to the input pickle.
        outputs: object with attribute ``df_new`` — path for the output pickle.
    """
    ### Load data ###
    in_path = inputs.df
    out_path = outputs.df_new
    df = pd.read_pickle(in_path)

    ### Select continuous vs. object-dtype feature columns (drop target) ###
    df_x = df.iloc[:, :-1]
    df_numeric = df_x.select_dtypes(exclude=['object'])
    df_label = df_x.select_dtypes(['object'])

    ### Standardize the continuous variables ###
    scaled = StandardScaler().fit_transform(df_numeric)
    # BUG FIX: original referenced the misspelled name `df_standrad`
    # (NameError). Also preserve the original index so the join below
    # aligns correctly even when df does not have a default RangeIndex.
    df_numeric = pd.DataFrame(scaled,
                              columns=df_numeric.columns,
                              index=df_numeric.index)

    ### Merge scaled continuous columns with the label columns ###
    df_combine = df_numeric.join(df_label)

    ### Write output dataset ###
    df_combine.to_pickle(out_path)
plt.ylabel('Score')
plt.title('Elbow Curve')
#plt.show()

# Run KMeans on the users table
kmeans = KMeans(n_clusters=7).fit(users)

# Plot the KMeans results: number of customers per cluster
df_labeled = pd.DataFrame(kmeans.labels_, columns=['labels'])
df_labeled['labels'] = df_labeled['labels'].astype('category')
plt.figure(figsize=(10, 8))
df_labeled['labels'].value_counts().plot.bar(color='y')
plt.xlabel("Cluster")
plt.ylabel("Número de clientes")
plt.title("Número clientes por Cluster")
#plt.show()

# Append the KMeans cluster labels to users
users = users.join(df_labeled)

# Plot the dendrogram
plt.figure(figsize=(20, 10))
# BUG FIX: `drop('labels', 1)` passed axis positionally, which was
# deprecated in pandas 1.1 and removed in pandas 2.0 — use the keyword.
merg = linkage(users.drop('labels', axis=1), method='ward')
dendrogram(merg, leaf_rotation=360)
plt.title('Dendrogram')
#plt.show()

# Define the hierarchical clustering.
# NOTE: the original passed affinity='euclidean'; that keyword was renamed
# to `metric` (sklearn 1.2) and removed (sklearn 1.4). Euclidean is both
# the default and the only metric valid with ward linkage, so omitting it
# is behavior-identical and works across scikit-learn versions.
hier_clus = AgglomerativeClustering(n_clusters=5, linkage='ward')
cluster = hier_clus.fit_predict(users.drop('labels', axis=1))

# Append the hierarchical-clustering labels
users['Agg_label'] = cluster

# Labels for the CH plot
df_labeled = pd.DataFrame(hier_clus.labels_, columns=['labels'])
def pca(self, x, components=None):
    """Standardize and PCA-transform the configured columns of x.

    Columns listed in ``self.cols_to_pca`` are z-score scaled and projected
    onto principal components; the remaining columns are joined back
    unchanged. The fitted PCA is cached on ``self.pcaler`` so later calls
    reuse the same projection.

    Args:
        x: input DataFrame.
        components: number of components to keep on the first (fitting)
            call; defaults to all columns.

    Returns:
        DataFrame of PCA-transformed columns joined (outer) with the
        untouched columns.
    """
    if self.cols_to_pca is None:
        return x
    other_pos = list(set(range(len(x.columns))) - set(self.cols_to_pca))
    x_other = x.iloc[:, other_pos]
    x = x.iloc[:, self.cols_to_pca]
    scaled_x = StandardScaler().fit_transform(x)
    if self.pcaler is None:
        if components is None:
            components = scaled_x.shape[1]
        pcaler = PCA(n_components=components)
        self.pcaler = pcaler
        # First call: fit and transform.
        pca_x = pcaler.fit_transform(scaled_x)
    else:
        # BUG FIX: the original called fit_transform here too, refitting
        # the cached PCA on every call and defeating its own caching —
        # reuse the stored projection instead.
        pca_x = self.pcaler.transform(scaled_x)
    # BUG FIX: the original always used columns=x.columns, which raised a
    # shape mismatch whenever components < number of input columns. Keep
    # the original names when counts match (backward compatible), else
    # generate component names.
    if pca_x.shape[1] == len(x.columns):
        pc_cols = x.columns
    else:
        pc_cols = ['pc_%d' % i for i in range(pca_x.shape[1])]
    scaled_x = pd.DataFrame(pca_x, columns=pc_cols, index=x.index)
    return scaled_x.join(x_other, how='outer')

#Rolling scaling to ensure out of sample testing
#TODO
# def roll_scale(self, x_test, x):
#     if self.cols_to_scale == None:
#         return x_test
#     x_other = x.iloc[:,list(set(list(range(0, len(x.columns))))-set(self.cols_to_scale))]
#     x = x.iloc[:,self.cols_to_scale]
#     x_test = x_test.iloc[:,self.cols_to_scale]
#     x_all = x.append(x_test)
#     scaled_x = x_all.expanding(min_periods=len(x)).apply(lambda x: self.scale(pd.DataFrame(x).iloc[len(x)-1:,], pd.DataFrame(x)[:len(x)-1]))
#     scaled_x = pd.DataFrame(scaled_x, columns = x_all.columns, index = x_all.index)
#     x = scaled_x.join(x_other, how='outer')
#     return x

def merge_data(self, data_dict):
    """Outer-join every frame in data_dict into one DataFrame.

    Values are joined on their indices, forward-filled, and rows still
    containing NaN are dropped. The result is stored on ``self.data_df``
    and its columns are renamed to the dict keys (assumes each value
    contributes exactly one column — TODO confirm with callers).
    """
    frames = data_dict.values()
    merged = reduce(lambda left, right: left.join(right, how='outer'), frames)
    # .ffill() is equivalent to the original fillna(method='pad'); the
    # method= keyword is deprecated (pandas 2.1) and slated for removal.
    merged = merged.ffill()
    self.data_df = merged.dropna(axis=0)
    #self.data_df.sort_index(inplace=True)
    self.data_df.columns = list(data_dict.keys())
    return self.data_df

#Ensure data is in a pd.Dataframe before splitting in train/test
#If threshold given, use that, else use the train/test cutoff points with optional scaling
#Returns y_train, y_test (as 1-D arrays), x_train, x_test (as DataFrames)
def get_traintest(self, x, y, threshold=0.8):
    """Align x and y on their common index and split chronologically.

    Args:
        x: feature DataFrame.
        y: target Series (must support .to_frame()).
        threshold: fraction of rows assigned to the training split.

    Returns:
        (y_train, y_test, x_train, x_test) — targets as flat ndarrays,
        features as DataFrames.
    """
    #pdb.set_trace()
    common_ind = x.index.intersection(y.index)
    y = y.loc[common_ind]
    x = x.loc[common_ind]
    merged = y.to_frame().join(x)
    merged = merged.dropna(how='all')
    y = merged.iloc[:, 0]
    x = merged.iloc[:, 1:]
    #pdb.set_trace()
    cutoff = int(np.floor(len(merged) * threshold))
    # BUG FIX: the original sliced the test sets as [cutoff+1:], silently
    # dropping the row at position `cutoff` from BOTH splits. The test
    # split now starts exactly where the train split ends.
    y_train = y.iloc[:cutoff]
    y_test = y.iloc[cutoff:]
    x_train = x.iloc[:cutoff, :]
    x_test = x.iloc[cutoff:, :]
    return (y_train.to_frame().values.ravel(),
            y_test.to_frame().values.ravel(),
            x_train, x_test)