def read(self, table, features = None):
    """Load labeled user profiles for purchase prediction.

    Queries ``table`` for users with more than one active day, labels
    paying users (purchase_times > 0) with 1 and non-paying users with 0,
    optionally down-samples the non-paying group, and returns everything
    as a single DataFrame.

    Arguments:
        table {str} -- user-profile table to read from
        features {list} -- column names to select; None selects all columns

    Returns:
        pandas.DataFrame -- combined rows with an added ``purchase`` label column
    """
    conn = MysqlConnection(config.dbhost, config.dbuser, config.dbpassword, config.dbname)
    # Select every column unless an explicit feature list was given.
    if features is None:
        fs = '*'
    else:
        fs = ",".join(features)
    # NOTE(review): `table` is interpolated directly into the SQL string;
    # only trusted table names may be passed (identifiers cannot be
    # bound as query parameters).
    sql = "select " + fs + " from " + table + " where purchase_times > 0 and active_days > 1"
    result_pay = conn.query(sql)
    pay_num = len(result_pay)
    df_pay = pd.DataFrame(result_pay)
    df_pay["purchase"] = 1
    sql = "select " + fs + " from " + table + " where purchase_times = 0 and active_days > 1"
    result_no_pay = conn.query(sql)
    if self.proportion > 0:
        # Keep the class ratio at `proportion` non-payers per payer.
        # Clamp the sample size: random.sample raises ValueError when
        # asked for more items than the population contains.
        sample_size = min(self.proportion * pay_num, len(result_no_pay))
        result_no_pay = random.sample(result_no_pay, sample_size)
    df_no_pay = pd.DataFrame(result_no_pay)
    df_no_pay["purchase"] = 0
    df = pd.concat([df_pay, df_no_pay])
    conn.close()
    return df
def levelTotal(start, end):
    """Plot the total 7-day churned-user count per level.

    Fetches the accumulated 7-day churn totals, draws a bar chart with
    level on the x-axis and user count on the y-axis, saves the figure
    under the "level_left" directory and shows it.

    Arguments:
        start {int} -- first level to include
        end {int} -- slice end for the level range (exclusive)
    """
    # config.dbpassword: fixed from `dbpassward`, the misspelling only a few
    # functions in this module used — every other connection here passes
    # config.dbpassword.
    connection = MysqlConnection(config.dbhost, config.dbuser,
                                 config.dbpassword, config.dbname)
    path = utils.figure_path("level_left")
    sql = "select * from log_level_left_total"
    result = connection.query(sql)
    # Transpose row tuples into parallel columns: (levels, 7-day counts, ...).
    result = list(zip(*result))
    level = result[0][start:end]
    user_7day = result[1][start:end]
    plt.grid(True)
    plt.bar(level, user_7day)
    plt.gca().set_xlabel('level')
    plt.gca().set_ylabel('user_7day')
    plt.savefig(
        os.path.join(path, "level_left_total_" + str(start) + "_" + str(end)))
    plt.show()
    connection.close()
def level7DayLeft(levels):
    """Plot per-level 7-day retention over time.

    For each requested level, plots the 7-day retention count by date
    (date on the x-axis, count on the y-axis) and saves one figure per
    level under the "7DayLeft_level" directory.

    Arguments:
        levels {list} -- levels to plot
    """
    # config.dbpassword: fixed from `dbpassward` for consistency with the
    # rest of this module.
    connection = MysqlConnection(config.dbhost, config.dbuser,
                                 config.dbpassword, config.dbname)
    sql = "select date,user_7day from log_level_left_s_wja_1 where level = %s"
    for level in levels:
        result = connection.query(sql, level)
        # Transpose rows into (dates, counts) columns. The original also
        # pre-assigned empty lists here that were immediately overwritten;
        # those dead assignments are removed.
        result = list(zip(*result))
        dates_num = result[0]
        number = result[1]
        dates = [str(x) for x in dates_num]
        plt.gca().xaxis.set_major_formatter(
            mdate.DateFormatter('%Y-%m-%d'))  # date-label display format
        plt.gca().xaxis.set_major_locator(mdate.AutoDateLocator())
        plt.xticks(pd.date_range(dates[0], dates[-1], freq='5d'))  # tick interval
        plt.xticks(rotation=90)
        plt.plot(dates, number, 'r--')
        path = utils.figure_path("7DayLeft_level")
        plt.savefig(os.path.join(path, str(level)))
        plt.show()
        plt.cla()
    connection.close()
def date7DayLeft(dates):
    """Plot the per-level 7-day retention for each given date.

    For each date, queries the 7-day retention of every level on that day
    and plots level vs. retention count, one figure per date.

    Arguments:
        dates {list} -- dates to plot
    """
    # config.dbpassword: fixed from `dbpassward` for consistency with the
    # rest of this module.
    connection = MysqlConnection(config.dbhost, config.dbuser,
                                 config.dbpassword, config.dbname)
    sql = "select * from log_level_left_s_wja_1 where date = %s"
    for date in dates:
        print("------------", date, "--------------")
        result = connection.query(sql, date)
        level = []
        number = []
        # NOTE(review): record[1]/record[2] are presumably (level, user_7day)
        # columns of the table — confirm against the schema.
        for record in result:
            print(record[1], record[2])
            level.append(record[1])
            number.append(record[2])
        plt.plot(level, number, 'ro-')
        plt.show()
    connection.close()
def dauAndDnu_Bar():
    """Draw a stacked bar chart of DAU split into new users (dnu) and old users."""
    connection = MysqlConnection(config.dbhost, config.dbuser,
                                 config.dbpassword, config.dbname)
    sql = "select date, login_count, register_count from log_return_s_wja_1_percent where channel_id = %s"
    rows = connection.query(sql, -2)
    dates = [str(row['date']) for row in rows]
    dau = [row['login_count'] for row in rows]
    dnu = [row['register_count'] for row in rows]
    fig = plt.gcf()
    fig.set_size_inches(18, 9)
    axes = plt.gca()
    axes.xaxis.set_major_formatter(mdate.DateFormatter('%Y-%m-%d'))  # date-label format
    axes.xaxis.set_major_locator(mdate.AutoDateLocator())
    plt.xticks(pd.date_range(dates[0], dates[-1], freq='5d'))  # tick spacing
    plt.xticks(rotation=90)
    # Returning users = total logins minus that day's registrations.
    old_user = [total - new for total, new in zip(dau, dnu)]
    plt.bar(dates, old_user, width=0.35, label="old_user")
    plt.bar(dates, dnu, bottom=old_user, width=0.35, label="dnu")
    plt.legend(loc='upper right')
    path = file_util.get_figure_path("dau and dnu test")
    plt.savefig(os.path.join(path, "dau and dnu bar"), dpi=100)
    connection.close()
def sim():
    """Print the pairwise cosine similarity of every item column.

    Reads the user-item matrix, transposes it into one vector per item,
    and prints sim_cosin(i, j) for every item pair (i < j).
    """
    # Argument order fixed: MysqlConnection is called as
    # (host, user, password, dbname) everywhere else in this module; the
    # original passed dbname and dbuser in swapped positions.
    connection = MysqlConnection(config.dbhost, config.dbuser,
                                 config.dbpassword, config.dbname)
    sql = "select * from user_item"
    result = connection.query(sql)
    # Drop the leading id column of each row, keeping only item counts.
    # (The unused centralized copy of R_u computed by the original has
    # been removed — its only consumer was commented out.)
    R_u = [x[1:] for x in result]
    # Columns of the user-item matrix, i.e. one vector per item.
    items = np.array(list(zip(*R_u)))
    for i in range(len(items)):
        for j in range(i + 1, len(items)):
            sim_ij = sim_cosin(items[i], items[j])
            print(sim_ij)
    connection.close()
def dayReturn(days=None):
    """Plot n-day retention over time.

    For each n in ``days`` (2 <= n <= 30), fetches the n-day retention
    percentage per date, draws a date/retention curve plus a histogram of
    the percentages, and saves both under "return_day_test".

    Keyword Arguments:
        days {list} -- which n-day retentions to plot; None means 2..30
                       (default: {None})
    """
    # Fixed: the original used the mutable default `days=list(range(2,31))`;
    # a None sentinel expanded here is the safe, equivalent idiom.
    if days is None:
        days = list(range(2, 31))
    connection = MysqlConnection(config.dbhost, config.dbuser,
                                 config.dbpassword, config.dbname)
    for day in days:
        column = str(day) + "day"
        sql = "select date, " + column + " from log_return_s_wja_1_percent where channel_id = %s"
        result = connection.query(sql, -2)
        return_percent = [x[column] for x in result]
        dates = [str(x['date']) for x in result]
        fig = plt.gcf()
        fig.set_size_inches(18, 9)
        plt.gca().xaxis.set_major_formatter(
            mdate.DateFormatter('%Y-%m-%d'))  # date-label display format
        plt.gca().xaxis.set_major_locator(mdate.AutoDateLocator())
        plt.xticks(pd.date_range(dates[0], dates[-1], freq='5d'))  # tick interval
        plt.xticks(rotation=90)
        plt.plot(dates, return_percent, 'r-o', label=column + " return")
        plt.gca().set_xlabel('date')
        plt.gca().set_ylabel('return percent(%)')
        plt.legend(loc="upper right")
        plt.grid(True)
        path = file_util.get_figure_path("return_day_test")
        plt.savefig(os.path.join(path, column))
        plt.cla()
        # Second figure: distribution of the retention percentages.
        fig.set_size_inches(10, 5)
        plt.hist(return_percent)
        path = file_util.get_figure_path("return_day_test", "hist")
        plt.savefig(os.path.join(path, column))
    connection.close()
def relativeLevelLeft(start, end):
    """Plot the relative churn curve for a level range.

    For each level, divides its 7-day churn count by the mean churn of the
    (up to) five preceding levels, then plots the ratio against the level.

    Arguments:
        start {int} -- first level
        end {int} -- last level (inclusive)
    """
    # config.dbpassword: fixed from `dbpassward` for consistency with the
    # rest of this module.
    connection = MysqlConnection(config.dbhost, config.dbuser,
                                 config.dbpassword, config.dbname)
    path = utils.figure_path("level_left")
    sql = "select * from log_level_left_total"
    result = connection.query(sql)
    result = list(zip(*result))
    level = result[0]
    user_7day = result[1]
    # Map level -> 7-day churn count for O(1) lookups below.
    left_total = {}
    for i in range(len(level)):
        left_total[level[i]] = user_7day[i]
    relative_left = []
    for i in range(start, end + 1):
        pre = 0
        n = 0
        # Sum churn over the five preceding levels, skipping missing ones.
        for j in range(i - 5, i):
            if j in left_total:
                n += 1
                pre = pre + left_total[j]
        if pre != 0:
            # NOTE(review): assumes level i itself exists in the table;
            # a gap at i would raise KeyError here — confirm upstream data.
            relative_left.append(left_total[i] / (pre / n))
        else:
            relative_left.append(0)
    x = list(range(start, end + 1))
    plt.gca().set_xlabel('level')
    plt.gca().set_ylabel('relative left rate')
    plt.plot(x, relative_left)
    plt.grid(True)
    plt.savefig(
        os.path.join(path, "relative_level_left" + str(start) + "_" + str(end)))
    plt.show()
    connection.close()
def read_positive(self, table, features):
    """Load the paying users of ``table`` as a labeled feature matrix.

    Selects every user with purchase_times > 0 and more than one active
    day, extracts the requested feature columns, and labels each row 1.

    Arguments:
        table {str} -- user-profile table to read
        features {list} -- column names to extract, in order

    Returns:
        tuple -- (numpy feature matrix, numpy label vector of ones)
    """
    conn = MysqlConnection(config.dbhost, config.dbuser, config.dbpassword, config.dbname)
    sql = "select * from " + table + " where purchase_times > 0 and active_days > 1"
    result_pay = conn.query(sql)
    # One feature row per paying user, in the order `features` lists them.
    x = [[record[feature] for feature in features] for record in result_pay]
    y = [1] * len(result_pay)
    conn.close()
    return np.array(x), np.array(y)
def dnuOfChannelID_Percent(channels=[-1]):
    """Plot each channel's share of the daily registration total.

    Keyword Arguments:
        channels {list} -- channel ids to plot (default: {[-1]})
    """
    connection = MysqlConnection(config.dbhost, config.dbuser,
                                 config.dbpassword, config.dbname)
    sql = "select date,register_count from log_return_s_wja_1_percent where channel_id = %s"
    rows = connection.query(sql, -2)
    dates = [str(row['date']) for row in rows]
    total_register = [row['register_count'] for row in rows]
    sql = "select date,register_count from log_return_s_wja_1_percent where date = %s and channel_id = %s"
    for channel in channels:
        path = file_util.get_figure_path()
        register_count = []
        # One query per date: the channel's registrations divided by the
        # day's total; 0 when the channel has no row for that date.
        for index, date in enumerate(dates):
            day_rows = connection.query(sql, [date, channel])
            if len(day_rows):
                register_count.append(day_rows[0]['register_count'] / total_register[index])
            else:
                register_count.append(0)
        fig = plt.gcf()
        fig.set_size_inches(18, 9)
        axes = plt.gca()
        axes.xaxis.set_major_formatter(mdate.DateFormatter('%Y-%m-%d'))  # date-label format
        axes.xaxis.set_major_locator(mdate.AutoDateLocator())
        plt.xticks(pd.date_range(dates[0], dates[-1], freq='5d'))  # tick spacing
        plt.xticks(rotation=90)
        plt.plot(dates, register_count, label=channel)
        plt.legend(loc='upper right')
        plt.cla()
    connection.close()
def dnuOfChannelId(channels=[-2]):
    """Draw a stacked bar chart of daily registrations per channel.

    The first channel's bars sit on the axis; each subsequent channel is
    stacked on top of the counts drawn so far.

    Keyword Arguments:
        channels {list} -- channel ids to plot, -2 meaning all channels
                           (default: {[-2]})
    """
    connection = MysqlConnection(config.dbhost, config.dbuser,
                                 config.dbpassword, config.dbname)
    sql = "select date, register_count from log_return_s_wja_1_percent where channel_id = %s"
    # Cumulative bar heights that the next channel's bars stack on.
    bottoms = None
    for channel in channels:
        result = connection.query(sql, channel)
        dates = [str(x['date']) for x in result]
        register_count = [x['register_count'] for x in result]
        fig = plt.gcf()
        fig.set_size_inches(18, 9)
        plt.gca().xaxis.set_major_formatter(
            mdate.DateFormatter('%Y-%m-%d'))  # date-label display format
        plt.gca().xaxis.set_major_locator(mdate.AutoDateLocator())
        plt.xticks(pd.date_range(dates[0], dates[-1], freq='5d'))  # tick interval
        plt.xticks(rotation=90)
        if bottoms is None:
            plt.bar(dates, register_count, width=0.35, label=channel)
            bottoms = list(register_count)
        else:
            # Bug fix: the original passed the previous *channel id* (a
            # scalar) as `bottom`, which shifted every later bar by that
            # id instead of stacking it on the previous counts.
            plt.bar(dates, register_count, bottom=bottoms, width=0.35, label=channel)
            bottoms = [b + c for b, c in zip(bottoms, register_count)]
    plt.legend(loc='upper right')
    path = file_util.get_figure_path("dau and dnu")
    plt.show()
    connection.close()
def dauAndDnu():
    """Plot daily active users, daily new users and their difference (old users)."""
    connection = MysqlConnection(config.dbhost, config.dbuser,
                                 config.dbpassword, config.dbname)
    sql = "select date, login_count, register_count from log_return_s_wja_1_percent where channel_id = %s"
    rows = connection.query(sql, -2)
    dates = [str(row['date']) for row in rows]
    axes = plt.gca()
    axes.xaxis.set_major_formatter(mdate.DateFormatter('%Y-%m-%d'))  # date-label format
    axes.xaxis.set_major_locator(mdate.AutoDateLocator())
    plt.xticks(pd.date_range(dates[0], dates[-1], freq='5d'))  # tick spacing
    plt.xticks(rotation=90)
    dau = [row['login_count'] for row in rows]
    dnu = [row['register_count'] for row in rows]
    # Old users = everyone who logged in but did not register that day.
    old_user = [active - new for active, new in zip(dau, dnu)]
    plt.plot(dates, dau, 'r-o', label="dau")
    plt.plot(dates, dnu, 'b-o', label="dnu")
    plt.plot(dates, old_user, 'g-o', label="old_user")
    fig = plt.gcf()
    fig.set_size_inches(19, 9)
    axes.set_xlabel('date')
    axes.set_ylabel('user')
    plt.legend(loc='upper right')
    path = file_util.get_figure_path("dau and dnu test")
    plt.show()
    connection.close()
def fitDateReturn(game_id, locales = [-2], channels = [-2]):
    """Fit each day's new-user 2..30-day retention curve.

    Uses scipy's curve_fit (least squares) on the per-date retention series
    read from the tmp log files; 10-fold cross-validation picks the fold
    whose parameters give the smallest validation error. The winning
    parameters are plotted against every date's data and written to the
    game's result file.

    Arguments:
        game_id {int} -- game id

    Keyword Arguments:
        locales {list} -- locales to fit, -2 meaning the total (default: {[-2]})
        channels {list} -- channels to fit, -2 meaning the total (default: {[-2]})
    """
    # Legacy DB-based implementation, kept for reference:
    # connection = MysqlConnection(config.dbhost,config.dbuser,config.dbpassword,config.dbname)
    # days = list(range(2,31))
    # log_type_tmp_path = utils.get_log_type_tmp_path("return",game_id)
    # dates = os.listdir(log_type_tmp_path)
    # for channel in channels:
    #     for locale in locales:
    #         all_data = np.ones(29)
    #         sql = "select * from log_return_s_wja_1_percent where channel_id = %s and locale = %s"
    #         result = connection.query(sql,[channel,locale])
    #         dates = []
    #         for i in range(len(result)):
    #             return_percent= []
    #             for j in range(2,31):
    #                 return_percent.append(result[i][str(j) + 'day'])
    #             if return_percent.count(0) < 5:
    #                 all_data = np.row_stack((all_data,return_percent))
    #                 dates.append(result[i]["date"])
    #             else:
    #                 print("data dropped: %s %s %s" %(channel,locale,result[i]['date']))
    connection = MysqlConnection(config.dbhost,config.dbuser,config.dbpassword,config.dbname)
    days = list(range(2,31))
    log_type_tmp_path = file_util.get_log_type_tmp_path("return",game_id)
    date_list = os.listdir(log_type_tmp_path)
    for channel in channels:
        for locale in locales:
            # all_data starts as a row of ones so np.row_stack has a seed row;
            # that placeholder row is sliced off again below (y = all_data[1:]).
            all_data = np.ones(29)
            dates = []
            for date in date_list:
                log_tmp_path = os.path.join(log_type_tmp_path,date)
                log_file = os.path.join(log_tmp_path, "channel_" + str(channel) + "_locale_" + str(locale))
                with open(log_file, 'r') as f:
                    # One space-separated retention percentage per day (2..30).
                    return_percent = [float(x) for x in f.read().strip().split(" ")]
                # Drop a date's series when five or more of its values are 0 —
                # too sparse to fit.
                if return_percent.count(0) < 5:
                    all_data = np.row_stack((all_data,return_percent))
                    dates.append(date)
                else:
                    print("data dropped: %s %s %s" %(channel,locale,date))
            path = file_util.get_figure_path("return_date_fit","channel_" + str(channel) + "_locale_" + str(locale))
            # NOTE(review): `break` leaves the locale loop entirely when one
            # locale lacks data — `continue` may have been intended; confirm.
            if(all_data.shape[0] < 11 or len(all_data.shape) == 1):
                print("The valid data size is too small (less than 10) for channel %d locale %s" %(channel,locale))
                break
            # y is a matrix whose rows are one date's 2..30-day retention
            # values; x has the same shape, every row being 2,3,...,30.
            y = all_data[1:]  # row 0 is the all-ones placeholder, not data
            x = np.array(days)
            x = np.tile(x,(y.shape[0],1))
            min_err = float("inf")
            min_i = 0  # NOTE(review): never updated or read afterwards
            # 10-fold cross-validation via sklearn's KFold: each split trains
            # on 9 folds and scores on the held-out fold; the split with the
            # smallest validation error wins.
            kf = KFold(n_splits=10)
            for train, test in kf.split(y):
                # train/test hold row indices into the data set.
                x_train, x_test, y_train, y_test = x[train], x[test], y[train], y[test]
                # ravel() flattens the 2-D folds into the 1-D arrays curve_fit
                # expects; bounds keep the decay base popt[1] within [0, 1].
                popt, pcov = curve_fit(func, x_train.ravel(), y_train.ravel(), bounds = ((0,0,0),(np.inf,1,np.inf)))
                err_sum = 0
                y_hat = np.array([func(day,popt[0],popt[1],popt[2]) for day in days])
                # y_hat = np.array([func(day,popt[0],popt[1]) for day in days])
                for i in range(y_test.shape[0]):
                    err_sum += sum((y_test[i] - y_hat) ** 2)
                if err_sum < min_err:
                    min_err = err_sum
                    best_popt = popt
                    best_pcov = pcov  # NOTE(review): stored but never used
            # Evaluate the winning fitted curve over all days.
            y_hat = [func(day,best_popt[0],best_popt[1],best_popt[2]) for day in days]
            # y_hat = [func(day,best_popt[0],best_popt[1]) for day in days]
            plt.plot(days, y_hat,'b')
            # plt.show()
            plt.cla()
            # One figure per date: original series vs. fitted curve.
            for i in range(len(dates)):
                print("------",dates[i],"------")
                plt.plot(days,y[i],'r--',label = 'origin')
                plt.plot(days,y_hat,'b--',label = 'fit curve')
                plt.legend(loc = 'upper right')
                plt.gca().set_xlabel("days")
                plt.gca().set_ylabel("return_percent")
                plt.grid(True)
                # plt.show()
                plt.savefig(os.path.join(path,str(dates[i])))
                plt.cla()
            print("result for channel %d locale %s is %f * (%f ** x) + %f" %(channel, locale, best_popt[0],best_popt[1],best_popt[2]))
            # Persist the winning parameters, space-separated, for the
            # __main__ loader below to insert into the result table.
            result_path = file_util.get_result_path("return", game_id)
            result_file = os.path.join(result_path,"channel_" + str(channel) + "_locale_" + str(locale))
            with open(result_file, 'w') as f:
                for p in best_popt:
                    f.write(str(p) + " ")
                # f.write("%f * (%f ** x) + %f" %(best_popt[0],best_popt[1],best_popt[2]))
    connection.close()
def sim(date_end, game_id):
    """Compute pairwise item similarity (Pearson correlation) and pickle it.

    Loads the accumulated item -> {user: count} usage totals for
    ``date_end`` from the item_used tmp files, computes the correlation of
    every item pair over the union of their users (missing users count as
    0), and dumps the nested {item_i: {item_j: sim}} dict to the game's
    item_used result path.

    Arguments:
        date_end {str} -- date whose accumulated totals file to load
        game_id {int} -- game id
    """
    connection = MysqlConnection(config.dbhost, config.dbuser,
                                 config.dbpassword, config.dbname)
    log_type_tmp_path = file_util.get_log_type_tmp_path("item_used", game_id)
    max_date = date_end
    total_file = file_util.item_used_total_file(game_id, max_date)
    with open(total_file, 'rb') as f:
        total_data = pickle.load(f)
    item_item_relation = {}
    # Sort item ids numerically so the result is deterministic.
    items = sorted(total_data.keys(), key=lambda x: int(x))
    for i in range(len(items)):
        item_item_relation[items[i]] = {}
        for j in range(i, len(items)):
            print("------%s:%s------" % (items[i], items[j]))
            # Users that used either item; absent users default to count 0.
            users = set(total_data[items[i]].keys()) | set(
                total_data[items[j]].keys())
            corated = []
            for user in users:
                corated.append([
                    total_data[items[i]].get(user, 0),
                    total_data[items[j]].get(user, 0)
                ])
            corated = np.array(corated)
            if len(corated) != 0:
                corr_coef = np.corrcoef(corated, rowvar=False)  # 2x2 correlation matrix
                sim_ij = corr_coef[0, 1]
                # Bug fix: the original tested `corr_coef[0,1] != np.nan`,
                # which is always True (NaN never compares equal), so NaN
                # similarities leaked into the result. np.isnan does the
                # intended check; constant columns now yield 0.
                item_item_relation[items[i]][items[j]] = 0 if np.isnan(sim_ij) else sim_ij
            else:
                item_item_relation[items[i]][items[j]] = 0
            print(item_item_relation[items[i]][items[j]])
    result_path = file_util.get_result_path("item_used", game_id)
    with open(os.path.join(result_path, "result"), 'wb') as f:
        pickle.dump(item_item_relation, f)
    connection.close()
#plt.savefig(os.path.join(path,"dnu_percent_" + str(channel)),dpi=100) #plt.show() plt.cla() connection.close() if __name__ == '__main__': game_id = sys.argv[1] connection = MysqlConnection(config.dbhost, config.dbuser, config.dbpassword, config.dbname) result_path = file_util.get_result_path("return", game_id) for file in os.listdir(result_path): full_file = os.path.join(result_path, file) if os.path.isfile(full_file): file_name_split = file.split('_') channel = file_name_split[1] locale = file_name_split[-1] with open(full_file, 'r') as f: popt = f.read().split() result_table = db_util.get_result_table("return", game_id) sql = "delete from " + result_table + " where channel = %s and locale = %s" connection.query(sql, (channel, locale)) sql = "insert into " + result_table + "(channel, locale, a, b, c) values (%s, %s, %s, %s, %s)" connection.query(sql, (channel, locale, popt[0], popt[1], popt[2])) connection.close()