import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
from warnings import filterwarnings
filterwarnings('ignore')
from sklearn.cluster import KMeans
from common import utils as u
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)
import plotly.graph_objs as go

#%%
# Unpack the zip archive through the project helper and pick the entry stored
# under the key 'data' from the returned mapping.
file_path = r'C:\Users\bolat\Desktop\file\DataAnalysis\data.zip'
file_dict = u.unpack_file_to_df(file_path)
df = file_dict['data']
u.show_info(df)

# %%
# Check the number of duplicated records, and de-duplicate
# NOTE(review): only u.show_duplicated() is called here; whether it also drops
# the duplicates is not visible in this file — confirm.
u.show_duplicated(df)

# %%
# Check for NA values
u.show_na(df)

# %%
# Handle anomalous / missing values
# NOTE(review): this cell only prints summary statistics and the row count;
# no cleaning step is performed here.
print(df.describe())
print(len(df))
# 示例#2  (Example #2 — separator left by the snippet listing, not code)
# 0
# Field reference for the dataset (English gloss of the note below):
#   event_time    - purchase time
#   event_type    - kind of action
#   product_id    - product ID
#   category_id   - ID of the product's category
#   category_code - category taxonomy of the product (code name)
#   brand         - brand name
#   price         - product price
#   user_id       - user ID
'''
event_time -购买时间
event_type -行为类别
product_id -产品编号
category_id -产品的类别ID
category_code -产品的类别分类法(代码名称)
brand -品牌名称
price -产品价格
user_id -用户ID
'''

# %%
# 1. Read the file and look at its basic information
file_path = r'C:\Users\bolat\Desktop\file\DataAnalysis\archive.zip'

# The helper's result is first bound to `df`, then replaced by the entry
# stored under the key 'kz'.
df = u.unpack_file_to_df(file_path)
df = df['kz']
u.show_info(df)
print(df.shape)
u.show_na(df)
u.show_duplicated(df)
print(df.describe())

# %%
# Fix up the individual fields
# 1. Remove duplicated records (in place, renumbering the index)
df.drop_duplicates(inplace=True, ignore_index=True)
# %%
# 2. Convert the event_time field to datetime and add a month field
df['event_time'] = pd.to_datetime(df['event_time'])
df['month'] = df['event_time'].dt.month
# 示例#3  (Example #3 — separator left by the snippet listing, not code)
# 0
from pyecharts.charts import *
from pyecharts import options as opt
from common import utils as u
import pandas as pd

# %%
# Unpack the zip archive through the project helper (files are read as UTF-8).
file_path = r'D:\file\DataAnalysis\GENDER8810.zip'
file_dict = u.unpack_file_to_df(file_path, encoding='utf-8')

# Re-key the tables with short names to simplify the statistics further down.
# NOTE(review): the mapping is purely positional — the first five keys returned
# by the helper are assumed to be town, country, city, area, all in that
# order; confirm against the archive contents.
file_name = list(file_dict.keys())
file_name_new = ['town', 'country', 'city', 'area', 'all']
# Fail before touching the dict, so a short archive cannot leave it
# half-renamed (the index-based loop raised IndexError midway instead).
if len(file_name) < len(file_name_new):
    raise IndexError(
        f'expected at least {len(file_name_new)} tables, got {len(file_name)}'
    )
# Pair old and new keys positionally; extra tables beyond the first five keep
# their original keys.
for old_key, new_key in zip(file_name, file_name_new):
    file_dict[new_key] = file_dict.pop(old_key)


# Process the table header
def rename(tab):
    """Collapse the two header rows of ``tab`` into its column labels, in place.

    Every cell in row 1 that equals '小计', '男' or '女' is suffixed with an
    underscore and the row-0 value found 0, 1 or 2 columns to its left
    respectively (e.g. '男' becomes '男_<row-0 value one column to the left>').
    The first cell of row 1 is then set to '地 区', row 1 becomes the column
    labels, the rows labelled 0 and 1 are dropped and the index is renumbered.

    ``tab`` itself is modified.
    NOTE(review): rows are dropped by the labels 0 and 1, so this presumably
    expects ``tab`` to still carry its default integer index — confirm.
    """
    # (sub-header, how many columns to the left its group label sits in row 0)
    lookback = (('小计', 0), ('男', 1), ('女', 2))

    for col in range(tab.shape[1]):
        cell = tab.iloc[1, col]
        for label, shift in lookback:
            if cell == label:
                group = tab.iloc[0, col - shift]
                tab.iloc[1, col] = '_'.join((label, str(group)))
                break

    tab.iloc[1, 0] = '地 区'

    # Promote the rewritten row 1 to the header, then discard both header rows.
    header = tab.iloc[1]
    tab.columns = header
    tab.drop(index=[0, 1], inplace=True)
    tab.reset_index(drop=True, inplace=True)