def biao1(df):
    """Count positive cases per HPV type and export the summary table.

    df: must contain a '结果' column holding either '阴性' (negative) or a
    comma-separated list of positive HPV types (produced by the collapsing
    script, which joins with ", " — note the padding space).

    Reads the 'MAP' workbook to attach each type's 类型 (risk class) and
    writes 'HPV各型的阳性例数' via or_path.
    """
    # Split each result into individual types, stripping the padding space
    # left by the ", " join. BUGFIX: the original called
    # s1.replace(' ', inplace=True) with no replacement value, which strips
    # nothing, so tokens like ' HPV16' were counted separately from 'HPV16'.
    df['结果2'] = df['结果'].apply(lambda x: [t.strip() for t in x.split(',')])
    list1 = []
    for x in range(df.shape[0]):
        if df.loc[x, '结果'] != '阴性':
            list1 = list1 + df.loc[x, '结果2']
    s1 = pd.DataFrame(list1)
    s1 = pd.DataFrame(s1.groupby(0).size())
    s1.columns = ['阳性例数']
    s1.reset_index(inplace=True)
    s1.columns = ['HPV类型', '阳性例数']
    # 构成比: share of all positive hits; 阳性率: rate over all samples.
    s1['构成比'] = round(s1['阳性例数'] / s1['阳性例数'].sum() * 100, 2)
    s1['阳性率'] = round(s1['阳性例数'] / df.shape[0] * 100, 2)
    df2 = pd.read_excel(or_path('MAP'))
    df = pd.merge(s1, df2, on='HPV类型', how='left')
    # Types missing from the MAP file default to high-risk (高危型).
    df['类型'] = df['类型'].fillna('高危型')
    df.to_excel(or_path('HPV各型的阳性例数'))
def biao2(total=714):
    """Summarise TCT/HPV positives per age bracket and export the table.

    Uses the module-level `df` (must have 年龄, TCT, HPV阳性, 感染层 columns)
    and writes '各个年龄阶段的总数以及总计' via or_path.

    total: denominator for the cohort-wide percentage shown in the '个数'
           column. Defaults to 714, the value hard-coded by the original
           script — NOTE(review): this probably should equal df.shape[0];
           confirm against the data source.
    """
    def age(x):
        # Age brackets. NOTE(review): x > 90 falls through and returns
        # None, so such rows are silently dropped by the groupby below —
        # confirm that is intended.
        if x <= 10:
            return '0-10'
        elif x <= 30:
            return '10-30'
        elif x <= 50:
            return '30-50'
        elif x <= 70:
            return '50-70'
        elif x <= 90:
            return '70-90'

    df['年龄2'] = df['年龄'].apply(lambda x: age(x))
    s1 = pd.DataFrame(df.groupby('年龄2').size())
    s1.columns = ['个数']
    # TCT positive = categories L / H / A.
    df2 = df[(df['TCT'] == 'L') | (df['TCT'] == 'H') | (df['TCT'] == 'A')]
    s1['TCT阳性'] = df2.groupby('年龄2').size()
    df3 = df[(df['HPV阳性'].notnull())]
    s1['HPV阳性'] = df3.groupby('年龄2').size()
    df4 = df[(df['感染层'] == '单一感染')]
    s1['单一感染'] = df4.groupby('年龄2').size()
    df5 = df[(df['感染层'] == '多重感染')]
    s1['多重感染'] = df5.groupby('年龄2').size()
    s1.fillna(0, inplace=True)
    # Totals row: column-wise sums.
    s1.loc['汇总'] = s1.apply(lambda x: sum(x))

    def more(col):
        # Reformat a count column as "count(pct%)" of the bracket's row count.
        s1[col] = s1[col].apply(lambda x: str(int(x))) + '(' + (
            s1[col] / s1['个数']).apply(lambda x: str('%.2f%%' % (x * 100))) + ')'

    more('TCT阳性')
    more('HPV阳性')
    more('单一感染')
    more('多重感染')
    # '个数' itself is shown as a share of the whole cohort (previously a
    # hard-coded 714; now the `total` parameter).
    s1['个数'] = s1['个数'].apply(lambda x: str(int(x))) + '(' + (
        s1['个数'] / total).apply(lambda x: str('%.2f%%' % (x * 100))) + ')'
    s1.reset_index(inplace=True)
    s1.rename(columns={'年龄2': '年龄(岁)'}, inplace=True)
    s1.to_excel(or_path('各个年龄阶段的总数以及总计'), index=False)
    print(s1)
# NOTE(review): `itemsets` and `rules` here are produced by code above this
# chunk (not visible); the exit() below ends the run, so everything after it
# is currently dead code.
print(itemsets)
print(rules)
exit()
# Official-site example, kept for reference:
# transactions = [['eggs', 'bacon', 'soup'],
#                 ['eggs', 'bacon', 'apple'],
#                 ['soup', 'bacon', 'banana']]
# print(type(transactions))
# exit()
# itemsets, rules = apriori(transactions, min_support=0.5, min_confidence=1)
# print(itemsets)
# print(rules)
# exit()

# Association-rule mining over actor co-appearance lists.
# NOTE(review): the path string contains an unescaped backslash ('\w');
# Python leaves it as a literal backslash here, but a raw string would be
# safer — verify the file actually loads.
df = pd.read_excel(or_path('\wx\张艺谋合作影视明星 - 副本'))
# Drop the first entry of each ' / '-separated name list.
df['name2'] = df['name'].apply(lambda x: x.split(' / ')[1:])
data = df['name2'].tolist()
# print(type(data))
# print(data)
# exit()
itemsets, rules = apriori(data, min_support=0.5, min_confidence=1)
print(itemsets)
print(rules)
# Scrape daily user-retention data for every day from 2018/11/14 up to
# (but not including) today, then export one formatted table.
today = datetime.date.today()
# Days elapsed since 2018/11/14, parsed out of the Timedelta's string repr.
day_cut = int(
    str(pd.to_datetime(today) - pd.to_datetime('2018/11/14')).split(' ')[0])
df_all = pd.DataFrame()
for x in range(day_cut, 0, -1):
    day = today - datetime.timedelta(days=x)
    # `date` and `lc_url` are project helpers defined elsewhere in the file.
    df = date(lc_url(day))
    df['time'] = day
    df_all = df_all.append(df)
    print('{}号抓取完毕!'.format(day))
df_all.reset_index(inplace=True)
df_all = df_all[['time', 'counts', 'day1', 'day3', 'day7', 'day14', 'day30']]

# For testing only:
# df_all.to_excel(or_path('TTT'))
# df_all = pd.read_excel(or_path('TTT'))

def more(form, col):
    # Reformat a retention column as "count(pct%)" relative to `counts`.
    form[col] = form[col].apply(lambda x: str(int(x))) + '(' + (
        form[col] / form['counts']).apply(lambda x: str('%.0f%%' % (x * 100))) + ')'

# Columns [2:] are the day1..day30 retention columns.
for col in df_all.columns[2:]:
    more(df_all, col)
df_all.to_excel(or_path('趣头条每次用户留存'))
# NOTE(review): the first three statements (through `return response`) belong
# to a function whose `def` line is above this chunk and not visible here.
predict_y = gridsearch.predict(test_x)
print("#准确率: %0.4lf" % accuracy_score(test_y, predict_y))
response['predict_y'] = predict_y
return response

from build.Func import or_path

# Compare several classifiers: scale the features, grid-search each model's
# parameters, and collect the per-model results into a single table.
# `classifiers`, `classifier_names`, `classifier_param_grid` and
# `GridSearchCV_work` are defined elsewhere in the file.
df_all = pd.DataFrame()
for model, model_name, model_param_grid in zip(classifiers, classifier_names, classifier_param_grid):
    # print(model_name, '\n', '-' * 50)
    # print(model_param_grid, '\n', '-' * 50)
    # print(model)
    # Pipeline: standardise first, then fit the model.
    pipeline = Pipeline([('scaler', StandardScaler()), (model_name, model)])
    print('\n{}模型输出结果:'.format(model_name))
    # Parameter tuning via the project's GridSearchCV wrapper.
    result = GridSearchCV_work(pipeline, model_param_grid, score='accuracy')
    result.columns = [model_name]
    df_all = pd.concat([df_all, result], axis=1)
    print('-' * 50)
df_all.to_excel(or_path('各算法预测结果'))
# -*- coding: utf-8 -*- # author:Super.Shen import pandas as pd from build.Func import or_path, gb pd.set_option('expand_frame_repr', False) pd.set_option('display.max_rows', 1000) import warnings from Func import append_excel warnings.filterwarnings('ignore') df = pd.read_excel(or_path('晶\\1107')) zhuce = pd.read_excel(or_path('晶\\注册')) zhuce['flag'] = 'new' zhuce = zhuce[['用户ID', 'flag']] zhuce.rename(columns={'用户ID': 'player_id'}, inplace=True) df = pd.merge(left=df, right=zhuce, on='player_id', how='left') df['flag'].fillna('old', inplace=True) df = df[['player_id', 'flag']] print(df.groupby(['flag']).size()) # exit() # print(df.groupby('flag').size())
def old_app(pingtai='qi'):
    """Merge the per-day report workbooks of one platform into a single
    Excel file with one sheet per report type.

    pingtai: 'qi' = 奇奇乐 platform, anything else = 浪仔. The parameter
    selects the source folder, the output workbook name and a few
    platform-specific column layouts.

    Walks every file in the folder, normalises each sheet, de-duplicates by
    date (keeping the latest file's rows) and writes the result out.
    """
    if pingtai == 'qi':
        file_path = 'C:\\Users\Administrator\Desktop\图表数据\奇奇乐'
        writer = pd.ExcelWriter(or_path('奇奇乐周报-报表'))
    else:
        file_path = 'C:\\Users\Administrator\Desktop\图表数据\浪仔'
        writer = pd.ExcelWriter(or_path('浪仔周报-报表'))
    # Same sheet list for both platforms (was duplicated in each branch).
    list1 = ['宝石分类', '宝石明细', '税收']
    os.chdir(file_path)

    # Full sheet list, kept for reference:
    # list1 = ['渠道', '充值支付类型占比', '新注册其次占比', '金币产出', '金币消耗', '金币系统赠送', '宝石分类', '宝石明细', '奖品发放', '税收', '我要赚钱', '回收比']
    for name in list1:
        df_all = pd.DataFrame()
        for x, y, z in os.walk(file_path):
            for file in z:
                df = pd.read_excel(file, sheet_name=name, index=False, encoding='utf8')
                if name == '宝石明细':
                    # BUGFIX: the original chain tested '宝石明细' twice, so
                    # its second branch (del df['总和']) was unreachable dead
                    # code; both normalisations now happen here.
                    # NOTE(review): confirm every 宝石明细 sheet has a 总和
                    # column, otherwise this raises KeyError.
                    df = df.drop([2, 3])
                    del df['总和']
                elif name == '新注册其次占比':
                    df = df[['日期', '新用户量', '次日再消费用户量']]
                elif name == '金币产出':
                    df['其他'] = df['领取邮件'] + df['系统赠送']
                    if pingtai == 'qi':
                        df = df[['时间', '用户充值', '兑换红宝石', '兑换鱼雷', '其他']]
                    else:
                        # print(file)
                        df = df[['时间', '用户充值', '兑换红包券', '兑换鱼雷', '其他']]
                elif name == '宝石分类':
                    try:
                        df['其他'] = df['充值礼包'] + df['分享抽奖'] + df['成就任务'] + df['新手礼包']
                        df['其他2'] = df['幸运抽奖'] + df['欢乐夺宝'] + df['购买物品']
                        df = df[['时间', '游戏产出', '其他', '玩家兑换红包', '玩家兑换话费', '玩家兑换金币', '其他2']]
                    except KeyError:
                        # Some files lack a column; fall back per platform.
                        if pingtai == 'qi':
                            df['其他'] = df['充值礼包'] + df['成就任务'] + df['新手礼包']
                            df['其他2'] = df['幸运抽奖'] + df['欢乐夺宝'] + df['购买物品']
                            df = df[['时间', '游戏产出', '其他', '玩家兑换红包', '玩家兑换话费', '玩家兑换金币', '其他2']]
                        else:
                            df['其他'] = df['充值礼包'] + df['分享抽奖'] + df['新手礼包']
                            df['其他2'] = df['幸运抽奖'] + df['欢乐夺宝'] + df['购买物品']
                            df = df[['时间', '游戏产出', '其他', '玩家兑换红包', '玩家兑换金币', '其他2']]
                elif name == '税收':
                    if pingtai == 'qi':
                        df['鱼雷场'] = df['鱼雷初级场'] + df['鱼雷中级场'] + df['鱼雷高级场']
                        df = df[['日期', '红包场', '鱼雷场', '猜猜乐']]
                df_all = df_all.append(df)
        # Sort by the date column, drop duplicate dates keeping the latest.
        df_all.sort_values(df_all.columns[0], inplace=True)
        df_all.drop_duplicates(keep='last', inplace=True)
        df_all.to_excel(writer, sheet_name=name, index=False)
    writer.save()
# NOTE(review): this chunk begins mid-list — the opening of the
# `features_remain` literal is above and not visible here.
                   u'最大每5秒回蓝', u'初始每5秒回蓝', u'最大攻速', u'攻击范围']
data = df[features_remain]
# '最大攻速' arrives as a percent string like '12%'; convert to a float ratio.
data[u'最大攻速'] = data[u'最大攻速'].apply(lambda x: float(x.strip('%')) / 100)
# Encode attack range: ranged (远程) = 1, melee (近战) = 0.
data[u'攻击范围'] = data[u'攻击范围'].map({'远程': 1, '近战': 0})

# Z-Score standardisation: zero mean, unit variance per feature.
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
data = ss.fit_transform(data)

# Fit a 30-component Gaussian Mixture Model and assign each hero a group.
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=30, covariance_type='full')
gmm.fit(data)
prediction = gmm.predict(data)
# print(prediction)
# exit()

# Export grouping result (Excel, despite the original "CSV" comment).
df.insert(0, '分组', prediction)
df.sort_values('分组', ascending=1, inplace=True)
df.to_excel(or_path('英雄分类结果'), index=False)
print(df.groupby('分组').apply(lambda x: list(x['英雄'])))
# 构造高斯模型 gmm = GaussianMixture(n_components=no, covariance_type='full') gmm.fit(train_x) # 训练数据 predict_x = gmm.predict(train_x) # 训练结果插入原始数据 result = pd.concat((pd.DataFrame(predict_x), data), axis=1) result.rename(columns={0: u'EM聚类'}, inplace=True) # ------------------------------------------------------------------- # k-Means 算法 from sklearn.cluster import KMeans kmeans = KMeans(n_clusters=no) kmeans.fit(train_x) predict_y = kmeans.predict(train_x) # 合并聚类结果,插入到原数据中 result = pd.concat((pd.DataFrame(predict_y), result), axis=1) result.rename(columns={0: u'K-Means聚类'}, inplace=True) # 结果查看 print(result.head()) print('-' * 50) # 输出到桌面 result.to_excel(or_path('聚类结果'))
# -*- coding: utf-8 -*- # author:Super import pandas as pd from build.Func import or_path import numpy as np pd.set_option('expand_frame_repr', False) pd.set_option('display.max_rows', 1000) df = pd.read_excel(or_path('data')) df.replace('阴性(-)', '', inplace=True) df.replace(np.NaN, '', inplace=True) result = pd.DataFrame() for x in range(df.shape[0]): list1 = [] for y in df.columns[1:]: if df.loc[x, y] != '': list1.append(y[:5].strip() + '+') if len(list1) == 0: result.loc[x, '结果'] = np.NaN else: result.loc[x, '结果'] = ", ".join(list1) result.fillna('阴性', inplace=True) print(result.groupby('结果').size()) result.to_excel(or_path('ttt'))
# 存放到df df['price_KNN'] = pred_y # 使用SVM 回归模型 from sklearn import svm model = svm.SVR() model.fit(train_x, train_y) pred_y = model.predict(test_x) mse = mean_squared_error(test_y, pred_y) print("SVM 均方误差 = ", round(mse, 2)) # 存放到df df['price_SVM'] = pred_y # 将数据放到桌面 df.to_excel(or_path('各模型回归预测')) # 画图 df = pd.read_excel(or_path('各模型回归预测')) print(df.head()) for col in df.columns[-4:]: fig = plt.figure(figsize=(13, 7)) df['price'].plot(color='black') df[col].plot(color='lime', linestyle='-.') plt.legend(loc='upper right') plt.savefig('C:\\Users\Administrator\Desktop\\{}'.format(col))
def init():
    """Scrape BOSS 直聘 data-analyst listings (pages 1-9) into an Excel file.

    Relies on module-level names defined elsewhere in the file: `list1`
    (proxy IPs), `list2` (user agents), `url_b` (site base URL), plus
    requests / lxml.etree / PyQuery / pandas imports.

    NOTE(review): indentation below is reconstructed from a collapsed
    source; the hard-coded cookie and the fixed 30-listings-per-page
    assumption are both fragile — confirm against the live site.
    """
    df_all = pd.DataFrame()
    for count in range(1, 10):
        url = 'https://www.zhipin.com/c101210100/?query=%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90%E5%B8%88&page={}'.format(
            count)
        # Page 1 has an extra wrapper div, shifting the xpath div index.
        if count == 1:
            div = 3
        else:
            div = 2
        ip = random.choice(list1)
        ua = random.choice(list2)
        psoxy = {'http': 'http://' + ip}
        headers = {
            'accept': "application/json, text/javascript, */*; q=0.01",
            'accept-encoding': "gzip, deflate, br",
            'accept-language': "zh-CN,zh;q=0.9,en;q=0.8",
            'content-type': "application/x-www-form-urlencoded; charset=UTF-8",
            'cookie': "JSESSIONID=" "; __c=1530137184; sid=sem_pz_bdpc_dasou_title; __g=sem_pz_bdpc_dasou_title; __l=r=https%3A%2F%2Fwww.zhipin.com%2Fgongsi%2F5189f3fadb73e42f1HN40t8~.html&l=%2Fwww.zhipin.com%2Fgongsir%2F5189f3fadb73e42f1HN40t8~.html%3Fka%3Dcompany-jobs&g=%2Fwww.zhipin.com%2F%3Fsid%3Dsem_pz_bdpc_dasou_title; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1531150234,1531231870,1531573701,1531741316; lastCity=101010100; toUrl=https%3A%2F%2Fwww.zhipin.com%2Fjob_detail%2F%3Fquery%3Dpython%26scity%3D101010100; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1531743361; __a=26651524.1530136298.1530136298.1530137184.286.2.285.199",
            'origin': "https://www.zhipin.com",
            'referer': "https://www.zhipin.com/job_detail/?query=python&scity=101010100",
            'user-agent': ua
        }
        html = requests.get(url, headers=headers, proxies=psoxy).text
        tree = etree.HTML(html)
        res = PyQuery(html)
        df = pd.DataFrame()
        # Assumes exactly 30 listings per page — TODO confirm.
        for i in range(30):
            df.loc[i, '公司'] = res(
                'li:nth-child({}) > div > div.info-company > div > h3 > a'.format(i + 1)).text()
            df.loc[i, '职位'] = res(
                'li:nth-child({}) > div > div.info-primary > h3 > a > div.job-title'.format(i + 1)).text()
            df.loc[i, '薪水'] = res(
                'li:nth-child({}) > div > div.info-primary > h3 > a > span'.format(i + 1)).text()
            df.loc[i, '地区'] = \
                tree.xpath('//*[@id="main"]/div/div[{}]/ul/li[{}]/div/div[1]/p/text()[1]'.format(div, i + 1))[0]
            df.loc[i, '经验要求'] = \
                tree.xpath('//*[@id="main"]/div/div[{}]/ul/li[{}]/div/div[1]/p/text()[2]'.format(div, i + 1))[0]
            df.loc[i, '学历要求'] = \
                tree.xpath('//*[@id="main"]/div/div[{}]/ul/li[{}]/div/div[1]/p/text()[3]'.format(div, i + 1))[0]
            df.loc[i, '行业'] = \
                tree.xpath('//*[@id="main"]/div/div[{}]/ul/li[{}]/div/div[2]/div/p/text()[1]'.format(div, i + 1))[0]
            df.loc[i, '融资'] = \
                tree.xpath('//*[@id="main"]/div/div[{}]/ul/li[{}]/div/div[2]/div/p/text()[2]'.format(div, i + 1))[0]
            try:
                # Headcount is sometimes missing from the listing.
                df.loc[i, '人数'] = \
                    tree.xpath('//*[@id="main"]/div/div[{}]/ul/li[{}]/div/div[2]/div/p/text()[3]'.format(div, i + 1))[0]
            except IndexError:
                print('\n{} - 该公司有数据缺失!\n'.format(df.loc[i, '公司']))
            df.loc[i, 'url'] = url_b + res(
                'li:nth-child({}) > div > div.info-primary > h3 > a'.format(
                    i + 1)).attr.href
        # Throttle between pages to avoid getting blocked.
        time.sleep(5)
        df_all = df_all.append(df, ignore_index=True)
        print('第{}页抓取完毕!……'.format(count))
    df_all.to_excel(or_path('boss直聘数据分析岗位'))
# NOTE(review): the first three statements duplicate the tail of init()
# above — this chunk overlaps the previous one; at top level they reference
# names (`df_all`, `df`, `count`) that only exist inside init().
df_all = df_all.append(df, ignore_index=True)
print('第{}页抓取完毕!……'.format(count))
df_all.to_excel(or_path('boss直聘数据分析岗位'))

import os

# Only re-scrape if the export is not already on the desktop.
file = 'C:\\Users\Administrator\Desktop\\boss直聘数据分析岗位.xlsx'
if os.path.exists(file):
    print('\n{} - 已存在!\n'.format('岗位数据已存在'))
else:
    init()

# Load the listing data and drop duplicate postings (same detail URL).
df = pd.read_excel(or_path('boss直聘数据分析岗位'))
df.drop_duplicates('url', inplace=True)

def content():
    # Fetch each posting's detail page via a random proxy / user-agent pair.
    # NOTE(review): this function is truncated at the end of the chunk —
    # the headers dict (and the rest of the body) continues out of view.
    df2 = pd.DataFrame()
    for i in range(df.shape[0]):
        try:
            ip = random.choice(list1)
            print(ip)
            ua = random.choice(list2)
            psoxy = {'http': 'http://' + ip}
            headers = {
                'accept': "application/json, text/javascript, */*; q=0.01",
# -*- coding: utf-8 -*- # author:Super import pandas as pd import numpy as np pd.set_option('expand_frame_repr', False) pd.set_option('display.max_rows', 1000) from build.Func import or_path, hash_data df = pd.read_excel(or_path('HPV07原始数据')) def biao1(df): df['结果2'] = df['结果'].apply(lambda x: x.split(',')) list1 = [] for x in range(df.shape[0]): if df.loc[x, '结果'] != '阴性': list1 = list1 + df.loc[x, '结果2'] s1 = pd.DataFrame(list1) s1.replace(' ', inplace=True) s1 = pd.DataFrame(s1.groupby(0).size()) s1.columns = ['阳性例数'] s1.reset_index(inplace=True) s1.columns = ['HPV类型', '阳性例数'] s1['构成比'] = round(s1['阳性例数'] / s1['阳性例数'].sum() * 100, 2)
def run3():
    """Build the 奇奇乐 gold-coin tables and return them.

    Returns (df, df2, df3):
      df  — gold production pivot (per day, per source),
      df2 — gold consumption pivot,
      df3 — system-gift gold breakdown since `bef_yesterday`.

    BUGFIX: the original body called exit() right after exporting the
    production table, which terminated the process and made the final
    print and `return df, df2, df3` unreachable; the leftover debug exit()
    is removed. Relies on module-level `or_path` and `bef_yesterday`.
    """
    df = pd.read_excel(or_path('奇奇乐'))
    df_map = pd.read_excel('C:\\Users\Administrator\Desktop\map.xlsx')

    # Normalise irregular cells: values like "123(鱼雷)" encode the reason
    # inside the numeric column, so rewrite the 原因 column accordingly.
    for x in range(df.shape[0]):
        if '鱼雷' in str(df.loc[x, '数值']):
            df.loc[x, '原因'] = '玩家兑换鱼雷'
        elif '红宝石' in str(df.loc[x, '数值']):
            df.loc[x, '原因'] = '玩家兑换红宝石'

    def change_col(x):
        # Strip the "(…)" annotation and keep the leading number.
        # (The two original branches were identical and are merged.)
        if '鱼雷' in x or '红宝石' in x:
            return int(x.split('(')[0])
        return int(x)

    df['数值'] = df['数值'].apply(lambda x: change_col(str(x)))

    # ---------- 系统赠送金币分类 (system-gift gold breakdown) ----------
    df3 = df[df.columns[:3]]
    df3.dropna(axis=0, how='any', inplace=True)
    df3 = pd.pivot_table(df3, values='数值', index='时间', columns='原因')
    df3 = df3[['每日登录抽奖', 'VIP奖励', '新手礼包', '成就任务', '分享抽奖']]
    df3.reset_index(inplace=True)
    # Keep only rows from the day before yesterday onwards.
    df3 = df3[df3['时间'] >= pd.to_datetime('{}'.format(bef_yesterday))]
    df3['时间'] = df3['时间'].apply(lambda x: str(x)[:10])

    # ---------- 金币消耗汇总表 (gold consumption summary) ----------
    df2 = df[df.columns[-3:]]
    df2.dropna(axis=0, how='any', inplace=True)
    df2 = pd.pivot_table(df2, values='数值2', index='时间2', columns='原因2')
    del df2['单局结算']
    df2.reset_index(inplace=True)
    df2['时间2'] = df2['时间2'].apply(lambda x: str(x)[:10])

    # ---------- 金币产出汇总表 (gold production summary) ----------
    df = df[df.columns[:3]]
    df_map = df_map[['原因', 'jinbi']]
    df_map.dropna(inplace=True)
    # Attach the production category via the mapping table.
    df = pd.merge(left=df, right=df_map, on='原因', how='left')
    df = df.groupby(['时间', 'jinbi'])['数值'].sum()
    df = pd.DataFrame(df)
    df.reset_index(inplace=True)
    df = pd.pivot_table(df, values='数值', index='时间', columns='jinbi')
    df.reset_index(inplace=True)
    df = df[['时间', '用户充值', '系统赠送', '兑换红宝石', '兑换鱼雷', '领取邮件']]
    df['时间'] = df['时间'].apply(lambda x: str(x)[:10])
    df.to_excel(or_path('奇奇乐-红宝石兑换'))
    # print(df)

    print('\n第三个表运行完毕……')
    return df, df2, df3
# Per-user change summary. `df`, `zl` and `sx` come from code above this
# chunk (zl/sx look like category-mapping helpers — confirm).
df['游戏种类'] = df['游戏种类'].apply(lambda x: zl(x))
df['变动属性'] = df['变动属性'].apply(lambda x: sx(x))
out = pd.DataFrame()
for x, y in df.groupby('用户ID'):
    # Pivot one user's rows: net change per game type x asset type.
    y = pd.pivot_table(y, values='差值', index='游戏种类', columns='变动属性')
    y.fillna(0, inplace=True)
    # Convert to a common unit: 1 红包券 = 2000, 1 鱼雷 = 10000 (金币 units)
    # — NOTE(review): confirm these exchange rates.
    y['求和'] = y['红包券'] * 2000 + y['金币'] + y['鱼雷'] * 10000
    y['ID'] = x
    y.reset_index(inplace=True)
    # Re-pivot so each user becomes one row with game types as columns.
    y = pd.pivot_table(y, values='求和', index='ID', columns='游戏种类')
    out = out.append(y)
del out['大厅']
out.fillna(0, inplace=True)
out['求和'] = out.apply(lambda x: x.sum(), axis=1)
# Scaled copy (divisor 20000 — presumably a currency unit; verify).
out2 = out / 20000
out.to_excel(or_path('四个用户变动详情'))
out2.to_excel(or_path('四个用户变动详情2'))
# -*- coding: utf-8 -*- # author:Super.Shen import pandas as pd pd.set_option('expand_frame_repr', False) pd.set_option('display.max_rows', 1000) import warnings warnings.filterwarnings('ignore') from build.database import url11, date, url77 from build.Func import or_path, gb # # # 导出数据 date(url77).to_excel(or_path('注册充值用户2')) date(url11).to_excel(or_path('变动日志2')) # exit() # # 读取充值新用户 df_reg = pd.read_excel(or_path('注册充值用户2')) df_reg = gb(df_reg, '用户id', '充值金额') df_reg.rename(columns={'用户id': '用户ID'}, inplace=True) # 读取变动日志 df = pd.read_excel(or_path('变动日志2')) # 数据分析 df['变动时间'] = df['变动时间'].apply(lambda x: pd.to_datetime(x))
# print('{}号抓取完毕!'.format(day)) # # df_all. # # 筛选出属于CPA的变动日志 # df = pd.read_hdf('C:\\Users\Administrator\Desktop\\bdrz.h5', key='data') # df_reg = pd.read_excel(or_path('新闻资讯注册')) # df_reg['flag'] = 'new' # df_reg = df_reg[['用户ID', 'flag']] # df = pd.merge(left=df, right=df_reg, on='用户ID', how='left') # df = df[df['flag'].notnull()] # df.to_hdf('C:\\Users\Administrator\Desktop\\test.h5', key='data') # print(df.shape[0]) # exit() df_pay = pd.read_excel(or_path('新闻资讯充值')) df_pay = gb(df_pay, 'player_id', 'amount') df = pd.read_hdf('C:\\Users\Administrator\Desktop\\test.h5', key='data') df = pd.DataFrame(df.groupby(['游戏种类', '用户ID']).size()) df.reset_index(inplace=True) df2 = pd.pivot_table(df, values=0, index='用户ID', columns='游戏种类') df2 = df2[['大厅', '红包场', '鱼雷场', '水果狂欢', '鱼乐场']] df2.reset_index(inplace=True) df2.rename(columns={'用户ID': 'player_id'}, inplace=True) df2 = pd.merge(left=df2, right=df_pay, on='player_id', how='left')
# -*- coding: utf-8 -*- # author:Super.Shen import pandas as pd pd.set_option('expand_frame_repr', False) pd.set_option('display.max_rows', 1000) import warnings warnings.filterwarnings('ignore') from build.database import url1, date, url7 from build.Func import or_path, gb # # 导出数据 date(url7).to_excel(or_path('注册充值用户')) date(url1).to_excel(or_path('变动日志')) # # 读取充值新用户 df_reg = pd.read_excel(or_path('注册充值用户')) df_reg = gb(df_reg, '用户id', '充值金额') df_reg.rename(columns={'用户id': '用户ID'}, inplace=True) # 读取变动日志 df = pd.read_excel(or_path('变动日志')) # 数据分析 df['变动时间'] = df['变动时间'].apply(lambda x: pd.to_datetime(x)) df.sort_values('变动时间', inplace=True)
# 转化数值类型 from sklearn import preprocessing dvec = preprocessing.LabelEncoder() for col in features[1:]: train_features[col] = dvec.fit_transform(train_features[col]) # 规范化到 [0,1] 空间 min_max_scaler = preprocessing.MinMaxScaler() train_x = min_max_scaler.fit_transform(train_features) print(train_features.head()) print('-'*50) # k-Means 算法 from sklearn.cluster import KMeans kmeans = KMeans(n_clusters=10) kmeans.fit(train_x) predict_y = kmeans.predict(train_x) # 合并聚类结果,插入到原数据中 result = pd.concat((pd.DataFrame(predict_y), data), axis=1) result.rename(columns={0: u'聚类'}, inplace=True) result.to_excel(or_path('聚类结果')) print(result.head()) print('-'*50) # 输出聚类效果 df = pd.DataFrame(result.groupby('聚类').apply(lambda x: list(x['公司'] + '-' + x['职位']))) df.to_excel(or_path('职位分类')) print(df)