import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

plt.style.use('fivethirtyeight')

from warnings import filterwarnings

# Silence all warnings before the remaining libraries are imported.
filterwarnings('ignore')

from sklearn.cluster import KMeans
from common import utils as u
from plotly.offline import init_notebook_mode

init_notebook_mode(connected=True)

import plotly.graph_objs as go

# %%
# Load the zip archive through the shared helper and select the 'data' entry.
# NOTE(review): the helper presumably returns a mapping of name -> DataFrame;
# confirm against common.utils.
file_path = r'C:\Users\bolat\Desktop\file\DataAnalysis\data.zip'
file_dict = u.unpack_file_to_df(file_path)
df = file_dict['data']
u.show_info(df)

# %%
# Check the number of duplicated records.
# NOTE(review): the original note also said "and de-duplicate", but no explicit
# drop_duplicates call is visible in this cell -- confirm whether the helper
# removes duplicates or only reports them.
u.show_duplicated(df)

# %%
# Check NA values.
u.show_na(df)

# %%
# First look at outliers / missing values: summary statistics and row count.
print(df.describe())
print(len(df))
'''
event_time    - purchase time
event_type    - kind of event / behaviour
product_id    - product number
category_id   - ID of the product's category
category_code - product category taxonomy (code name)
brand         - brand name
price         - product price
user_id       - user ID
'''

# %%
# 1. Read the file and look at its basic information.
file_path = r'C:\Users\bolat\Desktop\file\DataAnalysis\archive.zip'
# Only the 'kz' entry of the loader's result is used.
df = u.unpack_file_to_df(file_path)['kz']
u.show_info(df)
print(df.shape)
u.show_na(df)
u.show_duplicated(df)
print(df.describe())

# %%
# Fix up the individual fields.
# 1. Drop duplicated records in place and renumber the index from 0.
df.drop_duplicates(inplace=True, ignore_index=True)

# %%
# 2. Convert event_time to a datetime column and derive a month column from it.
df['event_time'] = pd.to_datetime(df['event_time'])
df['month'] = df['event_time'].dt.month
from pyecharts.charts import *
from pyecharts import options as opt
from common import utils as u
import pandas as pd

# %%
file_path = r'D:\file\DataAnalysis\GENDER8810.zip'
file_dict = u.unpack_file_to_df(file_path, encoding='utf-8')

# Re-key the loaded tables with short names to make the later statistics
# easier to write. Old keys are replaced positionally, in their current order.
file_name = list(file_dict.keys())
file_name_new = ['town', 'country', 'city', 'area', 'all']
for i, new_key in enumerate(file_name_new):
    file_dict[new_key] = file_dict.pop(file_name[i])


# Header handling
def rename(tab):
    """Collapse the two header rows of ``tab`` into column labels, in place.

    Cells in row 1 equal to '小计' (subtotal), '男' (male) or '女' (female)
    are suffixed with ``'_'`` plus the row-0 value found at the same column,
    one column to the left, or two columns to the left respectively. The
    first cell of row 1 is then set to '地 区' (region), row 1 becomes the
    column index, rows 0 and 1 are dropped and the row index is reset.

    Args:
        tab: DataFrame whose rows labelled 0 and 1 hold the header; it is
            modified in place.

    Returns:
        None.
    """
    # Sub-header label paired with how far to the left its row-0 value sits.
    lookback = (('小计', 0), ('男', 1), ('女', 2))
    for col in range(tab.shape[1]):
        cell = tab.iloc[1, col]
        for label, shift in lookback:
            if cell == label:
                tab.iloc[1, col] = label + '_' + str(tab.iloc[0, col - shift])
                break

    tab.iloc[1, 0] = '地 区'
    # Promote the rewritten row 1 to the header, then discard both header rows.
    tab.columns = tab.iloc[1]
    tab.drop(index=[0, 1], inplace=True)
    tab.reset_index(drop=True, inplace=True)