def main(): db_base_option = get_mysql_option(sys.argv[1]) db_target_option = get_mysql_option(sys.argv[2]) no_color_option = False if len(sys.argv) > 3 and sys.argv[3] == '--no-color': no_color_option = True db_base = MySQLHelper(**db_base_option) db_target = MySQLHelper(**db_target_option) if db_base_option['passwd']: db_base_option['passwd'] = '***' if db_target_option['passwd']: db_target_option['passwd'] = '***' print '基准数据库:', ', '.join(['{}={}'.format(k, v) for k, v in db_base_option.items()]) print '目标数据库:', ', '.join(['{}={}'.format(k, v) for k, v in db_target_option.items()]) db_base_schema = get_mysql_schema(db_base) db_target_schema = get_mysql_schema(db_target) schema_diff = compare_schema(db_base_schema, db_target_schema) if schema_diff: print '\n目标数据库相对于基准数据库存在以下差异:' print_schema_diff(schema_diff, no_color_option) else: print COLOR_GREEN + '\n数据库结构完全一致' + COLOR_RESET
def __stop_command(self): try: self.orig_master_host = getattr(self, "orig_master_host") self.orig_master_config = ConfigHelper(self.orig_master_host) except Exception as e: print("Failed to read configuration for original master: %s" % str(e)) return False # Original master try: orig_master_ip = getattr(self, "orig_master_ip", self.orig_master_host) orig_master_mysql_port = getattr(self, "orig_master_port", None) orig_master_ssh_ip = getattr(self, "orig_master_ssh_ip", orig_master_ip) orig_master_ssh_port = getattr(self, "orig_master_ssh_port", None) orig_master_ssh_user = getattr(self, "orig_master_ssh_user", None) orig_master_mysql_user = self.__unescape_from_shell(getattr(self, "orig_master_user")) orig_master_mysql_pass = self.__unescape_from_shell(getattr(self, "orig_master_password")) except AttributeError as e: print("Failed to read one or more required original master parameter(s): %s" % str(e)) return False # Setup MySQL connections mysql_orig_master = MySQLHelper(orig_master_ip, orig_master_mysql_port, orig_master_mysql_user, orig_master_mysql_pass) try: print("Connecting to mysql on the original master '%s'" % self.orig_master_host) if not mysql_orig_master.connect(): return False if self.orig_master_config.get_manage_vip(): vip_type = self.orig_master_config.get_vip_type() print("Removing the vip using the '%s' provider from the original master '%s'" % (vip_type, self.orig_master_host)) if not self.__remove_vip_from_host(vip_type, self.orig_master_host, orig_master_ssh_ip, orig_master_ssh_user, orig_master_ssh_port, self.FAILOVER_TYPE_ONLINE): return False if self.orig_master_config.get_super_read_only() and mysql_orig_master.super_read_only_supported(): print("Setting super_read_only to '1' on the original master '%s'" % self.orig_master_host) if not mysql_orig_master.set_super_read_only() or not mysql_orig_master.is_super_read_only(): return False else: print("Setting read_only to '1' on the original master '%s'" % self.orig_master_host) if not mysql_orig_master.set_read_only() or not mysql_orig_master.is_read_only(): return False if not self.__mysql_kill_threads(self.orig_master_host, mysql_orig_master): return False except Exception as e: print("Unexpected error: %s" % str(e)) return False finally: print("Disconnecting from mysql on the original master '%s'" % self.orig_master_host) mysql_orig_master.disconnect() return True
def __init__(self): self.conf = (SparkConf() .setAppName("BandCard") .set("spark.cores.max", "2") .set('spark.executor.extraClassPath', '/usr/local/env/lib/mysql-connector-java-5.1.38-bin.jar')) self.sc = SparkContext(conf=self.conf) self.sqlctx = SQLContext(self.sc) self.mysql_helper = MySQLHelper('core', host='10.9.29.212')
def __rollback_stop_command(self): try: self.orig_master_host = getattr(self, "orig_master_host") self.orig_master_config = ConfigHelper(self.orig_master_host) except Exception as e: print("Failed to read configuration for original master: %s" % str(e)) return False # Original master try: orig_master_ip = getattr(self, "orig_master_ip", self.orig_master_host) orig_master_mysql_port = getattr(self, "orig_master_port", None) orig_master_mysql_user = self.__unescape_from_shell(getattr(self, "orig_master_user")) orig_master_mysql_pass = self.__unescape_from_shell(getattr(self, "orig_master_password")) orig_master_ssh_ip = getattr(self, "orig_master_ssh_ip", orig_master_ip) orig_master_ssh_port = getattr(self, "orig_master_ssh_port", None) orig_master_ssh_user = getattr(self, "orig_master_ssh_user", None) orig_master_ssh_options = getattr(self, "ssh_options", None) except AttributeError as e: print("Failed to read one or more required original master parameter(s): %s" % str(e)) return False # Setup MySQL connections mysql_orig_master = MySQLHelper(orig_master_ip, orig_master_mysql_port, orig_master_mysql_user, orig_master_mysql_pass) print("Rolling back the failover changes on the original master '%s'" % self.orig_master_host) try: if not mysql_orig_master.connect(): print("Failed to connect to mysql on the original master '%s'" % self.orig_master_host) return False if not mysql_orig_master.unset_read_only() or mysql_orig_master.is_read_only(): print("Failed to reset read_only to '0' on the original master '%s'" % self.orig_master_host) return False print("Set read_only back to '0' on the original master '%s'" % self.orig_master_host) if self.orig_master_config.get_manage_vip(): vip_type = self.orig_master_config.get_vip_type() if not self.__add_vip_to_host(vip_type, self.orig_master_host, orig_master_ssh_ip, orig_master_ssh_user, orig_master_ssh_port, orig_master_ssh_options): print("Failed to add back the vip using the '%s' provider to the original master '%s'" % (vip_type, self.orig_master_host)) return False print("Added back the vip to the original master '%s'" % self.orig_master_host) except Exception as e: print("Unexpected error: %s" % str(e)) return False finally: mysql_orig_master.disconnect() return True
class DataHandler: def __init__(self): self.conf = (SparkConf() .setAppName("BandCard") .set("spark.cores.max", "2") .set('spark.executor.extraClassPath', '/usr/local/env/lib/mysql-connector-java-5.1.38-bin.jar')) self.sc = SparkContext(conf=self.conf) self.sqlctx = SQLContext(self.sc) self.mysql_helper = MySQLHelper('core', host='10.9.29.212') def load_from_mysql(self, table, database='core'): url = "jdbc:mysql://10.9.29.212:3306/%s?user=root&characterEncoding=UTF-8" % database df = self.sqlctx.read.format("jdbc").options(url=url, dbtable=table, driver="com.mysql.jdbc.Driver").load() return df def prepare_life_cycle(self, year, season): ''' 准备生命周期数据 从t_CMMS_ASSLIB_ASSET中获取每日AUM数据 prepare data saum1 (last season sum aum) saum2 (current season sum aum) aum_now account_age (months) last_tr_date (days) :param year: :param season: 1,2,3,4 :return: ''' # 计算月份 print('----------------------生命周期-Start----------------------') print('开始准备生命周期数据...') print('开始计算月份') if season == 1: # date1 当前季度月份 date1 = [str(year) + '-01', str(year) + '-02', str(year) + '-03'] # date2 上一季月份 date2 = [str(year - 1) + '-10', str(year - 1) + '-11', str(year - 1) + '-12'] elif season == 4: date1 = [str(year) + '-10', str(year) + '-11', str(year) + '-12'] date2 = [str(year) + '-07', str(year) + '-08', str(year) + '-9'] else: date1 = [str(year) + '-0' + str(3 * season - 2), str(year) + '-0' + str(3 * season - 1), str(year) + '-0' + str(3 * season)] date2 = [str(year) + '-0' + str(3 * season - 5), str(year) + '-0' + str(3 * season - 4), str(year) + '-0' + str(3 * season - 3)] print('当前季度月份 new:', date1) print('上一季度月份 old:', date2) # 加载AUM表 aum = self.load_from_mysql('t_CMMS_ASSLIB_ASSET_c').cache() # 拼接每季度三个月断数据 season_new = aum.filter(aum.STAT_DAT == date1[0]).unionAll(aum.filter(aum.STAT_DAT == date1[1])).unionAll( aum.filter(aum.STAT_DAT == date1[2])) season_old = aum.filter(aum.STAT_DAT == date2[0]).unionAll(aum.filter(aum.STAT_DAT == date2[1])).unionAll( aum.filter(aum.STAT_DAT == date2[2])) # 计算每季度AUM aum_season_old = season_old.select('CUST_NO', season_old.AUM.alias('AUM1')).groupBy('CUST_NO').sum('AUM1') aum_season_new = season_new.select('CUST_NO', season_new.AUM.alias('AUM2')).groupBy('CUST_NO').sum('AUM2') # 两个季度进行外联接 ''' +-----------+---------+---------+ | CUST_NO|sum(AUM2)|sum(AUM1)| +-----------+---------+---------+ |81005329523| null|294844.59| |81011793167| null| 365.20| |81015319088| null| 9640.96| +-----------+---------+---------+ ''' union_season = aum_season_old.join(aum_season_new, 'CUST_NO', 'outer') # 筛选当前AUM temp_result = aum.select('CUST_NO', 'AUM', 'STAT_DAT').groupBy('CUST_NO', 'STAT_DAT').sum('AUM').sort( 'CUST_NO').sort(aum.STAT_DAT.desc()) temp_result.select('CUST_NO', temp_result['sum(AUM)'].alias('AUM'), 'STAT_DAT').registerTempTable('group_in') aum_now_sql = "select CUST_NO,first(AUM) as AUM_NOW from group_in group by CUST_NO" aum_now = self.sqlctx.sql(aum_now_sql) # 清除缓存表 self.sqlctx.dropTempTable('group_in') # 联合 union_season_aumnow = union_season.join(aum_now, 'CUST_NO', 'outer') # 计算用户开户至今时间(months) # 载入账户表 account = self.load_from_mysql('t_CMMS_ACCOUNT_LIST').cache() account.select('CUST_NO', 'OPEN_DAT').registerTempTable('account') account_age_aql = "select CUST_NO, first(ACCOUNT_AGE) as ACCOUNT_AGE from " \ "(select CUST_NO, round(datediff(now(), OPEN_DAT) / 30) as ACCOUNT_AGE " \ "from account order by CUST_NO, ACCOUNT_AGE desc ) as t group by CUST_NO" account_age = self.sqlctx.sql(account_age_aql) # calculate last tran date account_1 = account.select('CUST_NO', 'ACC_NO15') detail = self.load_from_mysql('t_CMMS_ACCOUNT_DETAIL').select('ACC_NO15', 'TRAN_DAT') a_d = account_1.join(detail, 'ACC_NO15', 'outer') a_d.filter(a_d.CUST_NO != '').registerTempTable('adtable') last_tr_date_sql = "select CUST_NO,first(TRAN_DAT) as LAST_TR_DATE from (select CUST_NO,TRAN_DAT from adtable order by TRAN_DAT desc) as t group by CUST_NO" last_tr_date = self.sqlctx.sql(last_tr_date_sql) # 联合 season aum_now account_age last_tr_date unions = union_season_aumnow.join(account_age, 'CUST_NO', 'outer').join(last_tr_date, 'CUST_NO', 'outer') # 清除缓存表 self.sqlctx.dropTempTable('account') self.sqlctx.dropTempTable('adtable') self.sqlctx.clearCache() # 结果插入表 print('结果插入临时表:t_CMMS_TEMP_LIFECYCLE...') insert_lifecycle_sql = "replace into t_CMMS_TEMP_LIFECYCLE(CUST_NO,SAUM1,SAUM2,INCREASE,ACCOUNT_AGE,AUM_NOW,LAST_TR_DATE) values(%s,%s,%s,%s,%s,%s,%s)" # 缓冲区 temp = [] for row in unions.collect(): row_dic = row.asDict() if len(temp) >= 1000: # 批量写入数据库 self.mysql_helper.executemany(insert_lifecycle_sql, temp) temp.clear() # 加载数据到缓冲区 try: # 计算增长率 increase = (row_dic['sum(AUM2)'] - row_dic['sum(AUM1)']) / row_dic['sum(AUM1)'] except Exception: increase = 0 # 计算开户时长(月份数) 若无则视为6个月以上 if row_dic['ACCOUNT_AGE'] is None: row_dic['ACCOUNT_AGE'] = 7 # 最后交易日期 ltd = row_dic['LAST_TR_DATE'] if ltd is not None: try: ltd = datetime.datetime.strptime(ltd, '%Y-%m-%d') except Exception: ltd = ltd[:4] + '-' + ltd[4:6] + '-' + ltd[6:] ltd = datetime.datetime.strptime(ltd, '%Y-%m-%d') days = (datetime.datetime.now() - ltd).days else: days = 366 temp.append((row_dic['CUST_NO'], row_dic['sum(AUM1)'], row_dic['sum(AUM2)'], increase, row_dic['ACCOUNT_AGE'], row_dic['AUM_NOW'], days)) if len(temp) != 0: self.mysql_helper.executemany(insert_lifecycle_sql, temp) temp.clear() def calculate_life_cycle(self): ''' 根据AUM变化情况计算生命周期阶段 calculate life cycle period :return: ''' print('开始计算生命周期...') life_cycle = self.load_from_mysql('t_CMMS_TEMP_LIFECYCLE').cache() def clcmap(line): cust_no = line['CUST_NO'] account_age = line['ACCOUNT_AGE'] last_tr_date = line['LAST_TR_DATE'] aum_now = line['AUM_NOW'] increase = line['INCREASE'] period = 0 if aum_now is None: period = 9 # 未知 elif aum_now < 1000 and last_tr_date > 365: period = 3 # 流失期 else: if increase > 20 or account_age < 6: period = 0 # 成长期 elif increase >= -20 and increase <= 20: period = 1 # 成熟期 else: period = 2 # 稳定期 return period, cust_no map_result = life_cycle.map(clcmap).collect() # clear the life_cycle cache self.sqlctx.clearCache() temp = [] print('结果更新到临时表:t_CMMS_TEMP_LIFECYCLE...') update_life_period_sql = "update t_CMMS_TEMP_LIFECYCLE set PERIOD = %s where CUST_NO = %s" for row in map_result: if len(temp) >= 1000: self.mysql_helper.executemany(update_life_period_sql, temp) temp.clear() temp.append(row) if len(temp) != 1000: self.mysql_helper.executemany(update_life_period_sql, temp) temp.clear() def lifecycle_to_real_table(self, year, season): ''' 将生命周期数据写入正式表中 put life_cycle tmp table to real table :return: ''' print('开始将生命周期数据写入正式表中...') life_cycle = self.load_from_mysql('t_CMMS_TEMP_LIFECYCLE').select('CUST_NO', 'PERIOD') cust_info = self.load_from_mysql('t_CMMS_INFO_CUSTOMER').select('CUST_NO', 'CUST_ID', 'CUST_NAM') union = life_cycle.join(cust_info, 'CUST_NO', 'left_outer').cache() temp = [] sql = "replace into t_CMMS_ANALYSE_LIFE(CUST_NO,CUST_ID,CUST_NM,LIFE_CYC,QUARTER,UPDATE_TIME) values(%s,%s,%s,%s,%s,now())" quarter = str(year) + '-' + str(season) for row in union.collect(): if len(temp) >= 1000: self.mysql_helper.executemany(sql, temp) temp.clear() cust_id = row['CUST_ID'] if row['CUST_ID'] is not None else '0' temp.append((row['CUST_NO'], cust_id, row['CUST_NAM'], row['PERIOD'], quarter)) if len(temp) != 1000: self.mysql_helper.executemany(sql, temp) temp.clear() self.sqlctx.clearCache() def run_life_cycle(self,year,season): ''' 运行完整的生命周期流程 1 准备生命周期数据,计算AUM及其变化幅度 2 根据变化幅度计算生命周期阶段 3 将数据从缓存表放到实际表 :param year: :param season: :return: ''' self.prepare_life_cycle(year,season) self.calculate_life_cycle() self.lifecycle_to_real_table(year,season) #------------------------------------------------------------------------生命周期结束------------------------------------------------------------------------# def customer_value(self, year, half_year): ''' 计算客户价值 calculate customer value :param year: which year to calculate :param half_year: 0 for month 1-6 , 1 for month 7-12 :return: ''' print('---------------------------客户价值-Start--------------------------') cust_info = self.load_from_mysql('t_CMMS_INFO_CUSTOMER').select('CUST_NO', 'CUST_ID', 'CUST_NAM').cache() aum = self.load_from_mysql('t_CMMS_ASSLIB_ASSET_c').select('CUST_NO', 'STAT_DAT', 'AUM', 'ASS_TYPE').cache() base = half_year * 6 aum_slot_filter = None for i in range(1, 7): i = base + i if i < 10: i = '0' + str(i) else: i = str(i) slot = str(year) + '-' + i slot_filter = aum.filter(aum.STAT_DAT == slot) if aum_slot_filter is None: aum_slot_filter = slot_filter else: aum_slot_filter = aum_slot_filter.unionAll(slot_filter) # CUST_NO sum(AUM) huoqi_aum = aum_slot_filter.select('CUST_NO', 'ASS_TYPE', aum_slot_filter['AUM'].alias('AUM_HQ')).filter( aum_slot_filter.ASS_TYPE == '1').groupBy('CUST_NO').sum('AUM_HQ') dingqi_aum = aum_slot_filter.select('CUST_NO', 'ASS_TYPE', (aum_slot_filter.AUM * 0.8).alias('AUM_DQ')).filter( aum_slot_filter.ASS_TYPE == '2').groupBy('CUST_NO').sum('AUM_DQ') # 定期活期已计算好,sum(AUM_HQ),sum(AUM_DQ) j = huoqi_aum.join(dingqi_aum, 'CUST_NO', 'outer') # j.show() # 清除原有数据 self.mysql_helper.execute('truncate core.t_CMMS_ANALYSE_VALUE') # 开始联合其他表 all_col = j.join(cust_info, 'CUST_NO', 'outer') print(j.count(), cust_info.count()) # all_col.show() #根据客户价值计算客户等级 def calculate_rank(value): if value < 1000: return 0 elif value < 10000: return 1 elif value < 100000: return 2 elif value < 500000: return 3 elif value < 2000000: return 4 elif value < 5000000: return 5 else: return 6 temp = [] print('将数据replace到正式表...') update_value_sql = "replace into t_CMMS_ANALYSE_VALUE(CUST_ID,CUST_NO,CUST_NM,CUST_VALUE,CUST_RANK,SLOT,UPDATE_TIME) values(%s,%s,%s,%s,%s,%s,now())" for row in all_col.collect(): if len(temp) >= 1000: self.mysql_helper.executemany(update_value_sql, temp) temp.clear() val_dq = row['sum(AUM_DQ)'] if row['sum(AUM_DQ)'] is not None else 0 val_hq = row['sum(AUM_HQ)'] if row['sum(AUM_HQ)'] is not None else 0 cust_val = float(val_dq) + float(val_hq) cust_rank = calculate_rank(cust_val) slot = str(year) + '-' + str(half_year) cust_id = row['CUST_ID'] if row['CUST_ID'] is not None else 1 temp.append((cust_id, row['CUST_NO'], row['CUST_NAM'], cust_val, cust_rank, slot)) if len(temp) != 1000: self.mysql_helper.executemany(update_value_sql, temp) temp.clear() def aum_total(self): ''' 计算AUM总和 data for t_CMMS_ASSLIB_ASSTOT :return: ''' print('---------------------------总资产-Start--------------------------') # TODO t_CMMS_ASSLIB_ASSET_c 要改成正式表t_CMMS_ASSLIB_ASSET df_asset = self.load_from_mysql('t_CMMS_ASSLIB_ASSET_c').select('CUST_NO', 'CUST_ID', 'STAT_DAT', 'AUM', 'CUR', 'ACC_NAM').cache() # print(df_asset.count(), df_asset.columns) other_col = df_asset.select('CUST_NO', 'CUST_ID', 'CUR', 'ACC_NAM').distinct() # print(other_col.count(),other_col.columns) aum = df_asset.select('CUST_NO', 'STAT_DAT', 'AUM') # print(aum.count(), aum.columns) aum = aum.select('CUST_NO', 'STAT_DAT', 'AUM').groupBy(['CUST_NO', 'STAT_DAT']).sum('AUM').sort( ['CUST_NO', aum.STAT_DAT.desc()]) \ .groupBy('CUST_NO').agg({'sum(AUM)': 'first', 'STAT_DAT': 'first'}) # print(aum.count(), aum.columns) total = aum.select('CUST_NO', aum['first(sum(AUM))'].alias('AUM'), aum['first(STAT_DAT)'].alias('STAT_DAT')). \ join(other_col, 'CUST_NO', 'left_outer').distinct() # total.filter(total.STAT_DAT == '2016-06-') .show() # prepare params def list_map(line): return line['CUST_ID'], line['CUST_NO'], line['ACC_NAM'], line['STAT_DAT'], line['CUR'], line['AUM'] df = total.map(list_map) # clear old data self.mysql_helper.execute('truncate t_CMMS_ASSLIB_ASSTOT') sql = "insert into t_CMMS_ASSLIB_ASSTOT(CUST_ID,CUST_NO,ACC_NAM,STAT_DAT,CUR,AUM) values(%s,%s,%s,%s,%s,%s)" # execute sql self.mysql_helper.batch_operate(sql, df, 100) def debt_total(self): ''' prepare data for total debt :return: ''' print('---------------------------总负债-Start--------------------------') df_debt = self.load_from_mysql('t_CMMS_ASSLIB_DEBT').select('LOAN_ACC', 'CUST_NO', 'CUST_ID', 'CUST_NAM', 'BAL_AMT', 'GRANT_AMT', 'CUR') df_debt = df_debt.filter(df_debt.LOAN_ACC != '') df_sum = df_debt.groupBy('CUST_NO').sum('GRANT_AMT', 'BAL_AMT') df_other = df_debt.groupBy('CUST_NO').agg({'CUST_ID': 'first', 'CUST_NAM': 'first', 'CUR': 'first'}) df_total = df_sum.join(df_other, 'CUST_NO', 'left_outer').distinct() stat_dat = datetime.datetime.now().strftime('%Y%m%d') def m(line): return line['CUST_NO'], line['first(CUST_ID)'], line['first(CUST_NAM)'], line['first(CUR)'], line[ 'sum(GRANT_AMT)'], line['sum(BAL_AMT)'], stat_dat df = df_total.map(m) sql = "replace into t_CMMS_ASSLIB_DEBTOT(CUST_NO,CUST_ID,ACC_NAM,CUR,LOAN_AMT,BAL_AMT,STAT_DAT) values(%s,%s,%s,%s,%s,%s,%s)" self.mysql_helper.batch_operate(sql, df) def run(self): # 生命周期 年份 季度1,2,3,4 dh.run_life_cycle(2016, 2) # 客户价值 上半年:0,下半年:1 dh.customer_value(2016, 0) # 总资产 dh.aum_total() # 总负债 dh.debt_total()
def __start_command(self): try: self.orig_master_host = getattr(self, "orig_master_host") self.orig_master_config = ConfigHelper(self.orig_master_host) except Exception as e: print("Failed to read configuration for original master: %s" % str(e)) return False try: self.new_master_host = getattr(self, "new_master_host") self.new_master_config = ConfigHelper(self.new_master_host) except Exception as e: print("Failed to read configuration for new master: %s" % str(e)) return False # New master try: new_master_ip = getattr(self, "new_master_ip", self.new_master_host) new_master_mysql_port = getattr(self, "new_master_port", None) new_master_mysql_user = self.__unescape_from_shell(getattr(self, "new_master_user")) new_master_mysql_pass = self.__unescape_from_shell(getattr(self, "new_master_password")) new_master_ssh_ip = getattr(self, "new_master_ssh_ip", new_master_ip) new_master_ssh_port = getattr(self, "new_master_ssh_port", None) new_master_ssh_options = getattr(self, "ssh_options", None) if self.failover_type == self.FAILOVER_TYPE_HARD: new_master_ssh_user = getattr(self, "ssh_user", None) else: new_master_ssh_user = getattr(self, "new_master_ssh_user", None) except AttributeError as e: print("Failed to read one or more required new master parameter(s): %s" % str(e)) return False # Setup MySQL connection mysql_new_master = MySQLHelper(new_master_ip, new_master_mysql_port, new_master_mysql_user, new_master_mysql_pass) try: print("Connecting to mysql on the new master '%s'" % self.new_master_host) if not mysql_new_master.connect(): return False print("Setting read_only to '0' on the new master '%s'" % self.new_master_host) if not mysql_new_master.unset_read_only() or mysql_new_master.is_read_only(): return False if self.new_master_config.get_manage_vip(): vip_type = self.new_master_config.get_vip_type() print("Adding the vip using the '%s' provider to the new master '%s'" % (vip_type, self.new_master_host)) if not self.__add_vip_to_host(vip_type, self.new_master_host, new_master_ssh_ip, new_master_ssh_user, new_master_ssh_port, new_master_ssh_options): return False except Exception as e: print("Unexpected error: %s" % str(e)) return False finally: print("Disconnecting from mysql on the new master '%s'" % self.new_master_host) mysql_new_master.disconnect() return True
def __stop_command(self): try: self.orig_master_host = getattr(self, "orig_master_host") self.orig_master_config = ConfigHelper(self.orig_master_host) except Exception as e: print("Failed to read configuration for original master: %s" % str(e)) return False # Original master try: orig_master_ip = getattr(self, "orig_master_ip", self.orig_master_host) orig_master_mysql_port = getattr(self, "orig_master_port", None) orig_master_ssh_ip = getattr(self, "orig_master_ssh_ip", orig_master_ip) orig_master_ssh_port = getattr(self, "orig_master_ssh_port", None) orig_master_ssh_user = getattr(self, "orig_master_ssh_user", None) orig_master_mysql_user = self.__unescape_from_shell( getattr(self, "orig_master_user")) orig_master_mysql_pass = self.__unescape_from_shell( getattr(self, "orig_master_password")) except AttributeError as e: print( "Failed to read one or more required original master parameter(s): %s" % str(e)) return False # Setup MySQL connections mysql_orig_master = MySQLHelper(orig_master_ip, orig_master_mysql_port, orig_master_mysql_user, orig_master_mysql_pass) try: print("Connecting to mysql on the original master '%s'" % self.orig_master_host) if not mysql_orig_master.connect(): return False if self.orig_master_config.get_manage_vip(): vip_type = self.orig_master_config.get_vip_type() print( "Removing the vip using the '%s' provider from the original master '%s'" % (vip_type, self.orig_master_host)) if not self.__remove_vip_from_host( vip_type, self.orig_master_host, orig_master_ssh_ip, orig_master_ssh_user, orig_master_ssh_port, self.FAILOVER_TYPE_ONLINE): return False if self.orig_master_config.get_super_read_only( ) and mysql_orig_master.super_read_only_supported(): print( "Setting super_read_only to '1' on the original master '%s'" % self.orig_master_host) if not mysql_orig_master.set_super_read_only( ) or not mysql_orig_master.is_super_read_only(): return False else: print("Setting read_only to '1' on the original master '%s'" % self.orig_master_host) if not mysql_orig_master.set_read_only( ) or not mysql_orig_master.is_read_only(): return False if not self.__mysql_kill_threads(self.orig_master_host, mysql_orig_master): return False except Exception as e: print("Unexpected error: %s" % str(e)) return False finally: print("Disconnecting from mysql on the original master '%s'" % self.orig_master_host) mysql_orig_master.disconnect() return True
def __rollback_stop_command(self): try: self.orig_master_host = getattr(self, "orig_master_host") self.orig_master_config = ConfigHelper(self.orig_master_host) except Exception as e: print("Failed to read configuration for original master: %s" % str(e)) return False # Original master try: orig_master_ip = getattr(self, "orig_master_ip", self.orig_master_host) orig_master_mysql_port = getattr(self, "orig_master_port", None) orig_master_mysql_user = self.__unescape_from_shell( getattr(self, "orig_master_user")) orig_master_mysql_pass = self.__unescape_from_shell( getattr(self, "orig_master_password")) orig_master_ssh_ip = getattr(self, "orig_master_ssh_ip", orig_master_ip) orig_master_ssh_port = getattr(self, "orig_master_ssh_port", None) orig_master_ssh_user = getattr(self, "orig_master_ssh_user", None) except AttributeError as e: print( "Failed to read one or more required original master parameter(s): %s" % str(e)) return False # Setup MySQL connections mysql_orig_master = MySQLHelper(orig_master_ip, orig_master_mysql_port, orig_master_mysql_user, orig_master_mysql_pass) print("Rolling back the failover changes on the original master '%s'" % self.orig_master_host) try: if not mysql_orig_master.connect(): print( "Failed to connect to mysql on the original master '%s'" % self.orig_master_host) return False if not mysql_orig_master.unset_read_only( ) or mysql_orig_master.is_read_only(): print( "Failed to reset read_only to '0' on the original master '%s'" % self.orig_master_host) return False print("Set read_only back to '0' on the original master '%s'" % self.orig_master_host) if self.orig_master_config.get_manage_vip(): vip_type = self.orig_master_config.get_vip_type() if not self.__add_vip_to_host( vip_type, self.orig_master_host, orig_master_ssh_ip, orig_master_ssh_user, orig_master_ssh_port): print( "Failed to add back the vip using the '%s' provider to the original master '%s'" % (vip_type, self.orig_master_host)) return False print("Added back the vip to the original master '%s'" % self.orig_master_host) except Exception as e: print("Unexpected error: %s" % str(e)) return False finally: mysql_orig_master.disconnect() return True
def __start_command(self): try: self.orig_master_host = getattr(self, "orig_master_host") self.orig_master_config = ConfigHelper(self.orig_master_host) except Exception as e: print("Failed to read configuration for original master: %s" % str(e)) return False try: self.new_master_host = getattr(self, "new_master_host") self.new_master_config = ConfigHelper(self.new_master_host) except Exception as e: print("Failed to read configuration for new master: %s" % str(e)) return False # New master try: new_master_ip = getattr(self, "new_master_ip", self.new_master_host) new_master_mysql_port = getattr(self, "new_master_port", None) new_master_mysql_user = self.__unescape_from_shell( getattr(self, "new_master_user")) new_master_mysql_pass = self.__unescape_from_shell( getattr(self, "new_master_password")) new_master_ssh_ip = getattr(self, "new_master_ssh_ip", new_master_ip) new_master_ssh_port = getattr(self, "new_master_ssh_port", None) if self.failover_type == self.FAILOVER_TYPE_HARD: new_master_ssh_user = getattr(self, "ssh_user", None) else: new_master_ssh_user = getattr(self, "new_master_ssh_user", None) except AttributeError as e: print( "Failed to read one or more required new master parameter(s): %s" % str(e)) return False # Setup MySQL connection mysql_new_master = MySQLHelper(new_master_ip, new_master_mysql_port, new_master_mysql_user, new_master_mysql_pass) try: print("Connecting to mysql on the new master '%s'" % self.new_master_host) if not mysql_new_master.connect(): return False print("Setting read_only to '0' on the new master '%s'" % self.new_master_host) if not mysql_new_master.unset_read_only( ) or mysql_new_master.is_read_only(): return False if self.new_master_config.get_manage_vip(): vip_type = self.new_master_config.get_vip_type() print( "Adding the vip using the '%s' provider to the new master '%s'" % (vip_type, self.new_master_host)) if not self.__add_vip_to_host( vip_type, self.new_master_host, new_master_ssh_ip, new_master_ssh_user, new_master_ssh_port): return False except Exception as e: print("Unexpected error: %s" % str(e)) return False finally: print("Disconnecting from mysql on the new master '%s'" % self.new_master_host) mysql_new_master.disconnect() return True
class KMData: def __init__(self): self.conf = (SparkConf() .setAppName("KMeans") .set("spark.cores.max", "2") .set('spark.executor.extraClassPath', '/usr/local/env/lib/mysql-connector-java-5.1.38-bin.jar')) self.sc = SparkContext(conf=self.conf) self.sqlctx = SQLContext(self.sc) self.mysql_helper = MySQLHelper('core', host='10.9.29.212') self.base = 'hdfs://master:9000/gmc/' def load_from_mysql(self, table, database='core'): url = "jdbc:mysql://10.9.29.212:3306/%s?user=root&characterEncoding=UTF-8" % database df = self.sqlctx.read.format("jdbc").options(url=url, dbtable=table, driver="com.mysql.jdbc.Driver").load() return df def rfm(self): life_cycle = self.load_from_mysql('t_CMMS_ANALYSE_LIFE').select('CUST_NO', 'LIFE_CYC') value = self.load_from_mysql('t_CMMS_ANALYSE_VALUE').select('CUST_NO', 'CUST_VALUE', 'CUST_RANK') loyalty = self.load_from_mysql('t_CMMS_ANALYSE_LOYALTY').select('CUST_NO', 'LOYALTY') rfm = loyalty.join(value, 'CUST_NO', 'left_outer').join(life_cycle, 'CUST_NO', 'left_outer').distinct() \ .map(lambda x: (x['CUST_NO'], x['LIFE_CYC'], x['CUST_VALUE'], x['CUST_RANK'], x['LOYALTY'])) temp = [] sql = "replace into t_CMMS_TEMP_KMEANS_COLUMNS(CUST_NO,LIFE_CYC,CUST_VALUE,CUST_RANK,LOYALTY) values(%s,%s,%s,%s,%s)" for row in rfm.collect(): if len(temp) >= 1000: self.mysql_helper.executemany(sql, temp) temp.clear() temp.append(row) if len(temp) != 1000: self.mysql_helper.executemany(sql, temp) temp.clear() def idcard(self): ''' get sex age and local from id card number :return: ''' kcolums = self.load_from_mysql('t_CMMS_TEMP_KMEANS_COLUMNS').select('CUST_NO') cust_info = self.load_from_mysql('t_CMMS_INFO_CUSTOMER') j = kcolums.join(cust_info, 'CUST_NO', 'left_outer').select('CUST_NO', 'CUST_ID') year_now = datetime.datetime.now().year def split_map(line): no = line['CUST_NO'] id = line['CUST_ID'] if id is not None: sex_flag = 2 year = 0 if len(id) == 21: year = id[9:13] sex_flag = id[-2:-1] elif len(line) == 18: year = '19' + id[9:11] sex_flag = id[-1] # 1 man 2 woman sex = int(sex_flag) % 2 age = year_now - int(year) if age > 100: age = 0 local = id[3:9] return age, sex, local, no else: return 0, 2, '000000', no asl = j.map(split_map).collect() temp = [] sql = "update t_CMMS_TEMP_KMEANS_COLUMNS set AGE = %s, SEX = %s, LOCAL = %s where CUST_NO = %s" for row in asl: if len(temp) >= 1000: self.mysql_helper.executemany(sql, temp) temp.clear() temp.append(row) if len(temp) != 1000: self.mysql_helper.executemany(sql, temp) temp.clear() def aum(self): kcolums = self.load_from_mysql('t_CMMS_TEMP_KMEANS_COLUMNS').select('CUST_NO') aum = self.load_from_mysql('t_CMMS_ASSLIB_ASSTOT').select('CUST_NO', 'AUM') j = kcolums.join(aum, 'CUST_NO', 'left_outer').distinct() def m(line): aum = line['AUM'] if line['AUM'] is not None else 0 return aum, line['CUST_NO'] df = j.map(m) sql = "update t_CMMS_TEMP_KMEANS_COLUMNS set AUM = %s where CUST_NO = %s" self.sql_operate(sql, df) def sql_operate(self, sql, df, once_size=1000): ''' 批量数据库操作 :param sql: :param df: :param once_size: :return: ''' temp = [] for row in df.collect(): if len(temp) >= once_size: self.mysql_helper.executemany(sql, temp) temp.clear() temp.append(row) if len(temp) != 0: self.mysql_helper.executemany(sql, temp) temp.clear() def cust_info(self): self.rfm() self.idcard() self.aum() ''' ************************************************************ ''' #TODO 时间跨度未设置 def credit_card(self): #清空缓存表,导入新的额度数据 init_credit_sql = "replace into t_CMMS_TEMP_KMEANS_CREDIT(ACCTNBR,CREDIT) (select XACCOUNT,CRED_LIMIT from core.ACCT)" self.mysql_helper.execute('truncate t_CMMS_TEMP_KMEANS_CREDIT') self.mysql_helper.execute(init_credit_sql) #信用卡交易记录 credit_df = self.load_from_mysql('t_CMMS_CREDIT_TRAN') credit_df = credit_df.filter(credit_df['BILL_AMTFLAG'] == '+').cache() #计算循环(此处为刷卡金额占额度的比例,不是真正的循环信用) # join = credit_df.groupBy('ACCTNBR').sum('BILL_AMT').join(credit_limit_df,'ACCTNBR','outer') # # cycle = join.select('ACCTNBR',join['sum(BILL_AMT)'].alias('CYCLE_AMT'),'CREDIT') #刷卡金额与次数 swipe_amt = credit_df.groupBy('ACCTNBR').sum('BILL_AMT') swipe_times = credit_df.groupBy('ACCTNBR').count() # swipe_amt.show() # swipe_times.show() swipe_result = swipe_amt.join(swipe_times,'ACCTNBR','outer') swipe_result = swipe_result.select('ACCTNBR',swipe_result['sum(BILL_AMT)'].alias('SWIPE_AMT'),swipe_result['count'].alias('SWIPE_TIMES')) #取现金额与次数 codes = [2010,2050,2060,2182,2184] cash = credit_df.filter(credit_df['TRANS_TYPE'] == codes.pop()) print(type(cash)) for c in codes: cash = cash.unionAll(credit_df.filter(credit_df['TRANS_TYPE'] == c)) cash_amt = cash.groupBy('ACCTNBR').sum('BILL_AMT') cash_count = cash.groupBy('ACCTNBR').count() cash_result = cash_amt.join(cash_count,'ACCTNBR','outer') cash_result = cash_result.select('ACCTNBR',cash_result['sum(BILL_AMT)'].alias('CASH_AMT'),cash_result['count'].alias('CASH_TIMES')) #cash_amt.show() #cash_count.show() #分期付款金额与次数 installment_df = self.load_from_mysql('MPUR_D').cache() im_amt = installment_df.groupBy('XACCOUNT').sum('ORIG_PURCH') im_times = installment_df.groupBy('XACCOUNT').count() im_result = im_amt.join(im_times,'XACCOUNT','outer') im_result = im_result.select(im_result['XACCOUNT'].alias('ACCTNBR'),im_result['sum(ORIG_PURCH)'].alias('INSTALLMENT_AMT'),im_result['count'].alias('INSTALLMENT_TIMES')) #汇总 # 加载额度数据 credit_limit_df = self.load_from_mysql('t_CMMS_TEMP_KMEANS_CREDIT').select('ACCTNBR', 'CREDIT') all_join = swipe_result.join(cash_result,'ACCTNBR','outer').join(im_result,'ACCTNBR','outer').join(credit_limit_df,'ACCTNBR','outer') all_join.show() def m(line): acctnbr = line['ACCTNBR'] credit = line['CREDIT'] if line['CREDIT'] is not None else 0 # cycle_amt = line['CYCLE_AMT'] if line['CYCLE_AMT'] is not None else 0 # try: # cycle_times = cycle_amt/credit # except ZeroDivisionError: # cycle_times = 0 swipe_amt = line['SWIPE_AMT'] if line['SWIPE_AMT'] is not None else 0 swipe_times = line['SWIPE_TIMES'] if line['SWIPE_TIMES'] is not None else 0 cash_amt = line['CASH_AMT'] if line['CASH_AMT'] is not None else 0 cash_times = line['CASH_TIMES'] if line['CASH_TIMES'] is not None else 0 installment_amt = line['INSTALLMENT_AMT'] if line['INSTALLMENT_AMT'] is not None else 0 installment_times = line['INSTALLMENT_TIMES'] if line['INSTALLMENT_TIMES'] is not None else 0 return acctnbr,credit,swipe_amt,swipe_times,cash_amt,cash_times,installment_amt,installment_times sql = "replace into t_CMMS_TEMP_KMEANS_CREDIT(ACCTNBR,CREDIT,SWIPE_AMT,SWIPE_TIMES,CASH_AMT,CASH_TIMES,INSTALLMENT_AMT,INSTALLMENT_TIMES) values(%s,%s,%s,%s,%s,%s,%s,%s)" df = all_join.map(m) self.sql_operate(sql,df)
class KMAnalyse: def __init__(self): self.conf = (SparkConf() .setAppName("KMeans") .set("spark.cores.max", "2") .set('spark.executor.extraClassPath', '/usr/local/env/lib/mysql-connector-java-5.1.38-bin.jar')) self.sc = SparkContext(conf=self.conf) self.sqlctx = SQLContext(self.sc) self.mysql_helper = MySQLHelper('core', host='10.9.29.212') self.base = 'hdfs://master:9000/gmc/' def load_from_mysql(self, table, database='core'): url = "jdbc:mysql://10.9.29.212:3306/%s?user=root&characterEncoding=UTF-8" % database df = self.sqlctx.read.format("jdbc").options(url=url, dbtable=table, driver="com.mysql.jdbc.Driver").load() return df @staticmethod def clustering_score(data, k): model = KMeans.train(data, k=k) def distance(v1, v2): s = 0 # [1,2,3] [4,5,6] --> [(1,4),(2,5),(3,6)] pairs = zip(v1, v2) for p in pairs: sub = float(p[0]) - float(p[1]) s = s + sub * sub return math.sqrt(s) def dist_to_centroid(datum): # predict the data cluster = model.predict(datum) # get the current centroid --> means center point centroid = model.clusterCenters[cluster] # call distance method return distance(centroid, datum) return data.map(dist_to_centroid).mean() def try_different_k(self, dataframe, max_k, min_k=2): data = self.prepare_data(dataframe) get_id_sql = "select ID from t_CMMS_TEMP_KMEANS_RESULT order by ID desc limit 1" try: id = int(self.mysql_helper.fetchone(get_id_sql)[0]) + 1 except: id = 1 columns = str(dataframe.columns) l = [] for k in range(min_k, max_k): sorce = self.clustering_score(data, k) print(k, sorce) l.append(sorce) self.mysql_helper.execute('insert into t_CMMS_TEMP_KMEANS_RESULT(ID,K,SORCE,COLUMNS) values(%s,%s,%s,%s)', (id, k, sorce, columns)) return id def prepare_data(self, dataframe): ''' format data :param dataframe: :return: ''' def v_map(line): lst = [] for c in line: if c is not None: lst.append(float(c)) else: lst.append(0.0) return lst return dataframe.rdd.cache().map(v_map) def train_model(self, dataframe, k, model_name): ''' use data to train model :param dataframe: all columns for train :param k:k value :param model_name:the trained model :return:None ''' data = self.prepare_data(dataframe) # train to get model model = KMeans.train(data, k) # create model saving path path = self.base + model_name # try to delete the old model if it exists try: import subprocess subprocess.call(["hadoop", "fs", "-rm", "-f", path]) except: pass # save new model on hdfs model.save(self.sc, path) # print all cluster of the model for c in model.clusterCenters: l = [] for i in c: i = decimal.Decimal(i).quantize(decimal.Decimal('0.01')) l.append(float(i)) print(l) def predict(self, model_name, data): ''' predict unknown data :param model_name: the trained model saving on hdfs :param data: unknown data :return: (cluster_index, cluster) ''' # try to load the specified model path = self.base + model_name try: model = KMeansModel.load(self.sc, path) except: raise Exception('No such model found on hdfs!') # get the predict : means which cluster it belongs to index = model.predict(data) print('Data:%s belongs to cluster:%s. The index is %s' % (data, model.clusterCenters[index], index)) return index, model.clusterCenters[index] def vaildate(self, validate_data, model_name): correct = 0 error = 0 for line in validate_data: known_cluster = line[0] stay_predict_data = line[1] predict_cluster = self.predict(model_name, stay_predict_data)[0] # Only get the index if known_cluster == predict_cluster: correct += 1 else: error += 1 total = len(validate_data) result = {'total': total, 'correct': correct, 'error': error, 'accurancy': correct / total} print(result) return result def find_best_k(self, df, times, min_k=2, max_k=15): l = [] for i in range(times): id = kma.try_different_k(df, max_k) l.append(id) sql = "SELECT k,avg(SORCE) FROM t_CMMS_TEMP_KMEANS_RESULT where ID in %s group by K" % str(tuple(l)) result = self.mysql_helper.fetchall(sql) for i in result: print(i[0]) print('\n') for i in result: print(i[1]) def print_model(self,model_name): # try to load the specified model path = self.base + model_name try: model = KMeansModel.load(self.sc, path) except: raise Exception('No such model found on hdfs!') for c in model.clusterCenters: print(c) for c in model.clusterCenters: l = [] for i in c: i = decimal.Decimal(i).quantize(decimal.Decimal('0.01')) l.append(float(i)) print(l) def test_rfm_data(self): life_cycle = self.load_from_mysql('t_CMMS_ANALYSE_LIFE').select('CUST_NO', 'LIFE_CYC') value = self.load_from_mysql('t_CMMS_ANALYSE_VALUE').select('CUST_NO', 'CUST_VALUE') loyalty = self.load_from_mysql('t_CMMS_ANALYSE_LOYALTY').select('CUST_NO', 'LOYALTY') rfm = loyalty.join(value, 'CUST_NO', 'left_outer').join(life_cycle, 'CUST_NO', 'left_outer').select('LIFE_CYC', 'CUST_VALUE', 'LOYALTY') return rfm def test_cust_info_data(self): return self.load_from_mysql('t_CMMS_TEMP_KMEANS_COLUMNS').select('LIFE_CYC', 'LOYALTY', 'CUST_RANK', 'AGE', 'LOCAL', 'SEX', 'AUM') def test_credit_data(self): return self.load_from_mysql('t_CMMS_TEMP_KMEANS_CREDIT').select('CREDIT', 'CYCLE_TIMES', 'CYCLE_AMT', 'INSTALLMENT_TIMES', 'INSTALLMENT_AMT', 'SWIPE_TIMES', 'SWIPE_AMT', 'CASH_TIMES', 'CASH_AMT') def full_keys_data(self): return self.load_from_mysql('t_CMMS_TEMP_KMEANS_CREDIT').select('CREDIT', 'CYCLE_TIMES', 'CYCLE_AMT', 'CYCLE_RATE', 'INSTALLMENT_TIMES', 'INSTALLMENT_AMT', 'SWIPE_TIMES', 'SWIPE_AMT', 'CASH_TIMES', 'CASH_AMT') def no_cycle_data(self): return self.load_from_mysql('t_CMMS_TEMP_KMEANS_CREDIT').select('CREDIT', 'INSTALLMENT_TIMES', 'INSTALLMENT_AMT', 'SWIPE_TIMES', 'SWIPE_AMT', 'CASH_TIMES', 'CASH_AMT') def no_amt_data(self): return self.load_from_mysql('t_CMMS_TEMP_KMEANS_CREDIT').select('CREDIT', 'CYCLE_TIMES', 'CYCLE_RATE', 'INSTALLMENT_TIMES', 'SWIPE_TIMES', 'CASH_TIMES') def no_cycle_amt(self): return self.load_from_mysql('t_CMMS_TEMP_KMEANS_CREDIT').select('CREDIT', 'CYCLE_TIMES', 'CYCLE_RATE', 'INSTALLMENT_TIMES', 'INSTALLMENT_AMT', 'SWIPE_TIMES', 'SWIPE_AMT', 'CASH_TIMES', 'CASH_AMT')
class Credit: def __init__(self): self.conf = (SparkConf() .setAppName("CREDIT") .set("spark.cores.max", "2") .set('spark.executor.extraClassPath', '/usr/local/env/lib/mysql-connector-java-5.1.38-bin.jar')) self.sc = SparkContext(conf=self.conf) self.sqlctx = SQLContext(self.sc) self.mysql_helper = MySQLHelper('core', host='10.9.29.212') self.base = 'hdfs://master:9000/gmc/' def load_from_mysql(self, table, database='core'): url = "jdbc:mysql://10.9.29.212:3306/%s?user=root&characterEncoding=UTF-8" % database df = self.sqlctx.read.format("jdbc").options(url=url, dbtable=table, driver="com.mysql.jdbc.Driver").load() return df def sql_operate(self, sql, rdd, once_size=1000): temp = [] for row in rdd.collect(): # print(row) if len(temp) >= once_size: self.mysql_helper.executemany(sql, temp) temp.clear() temp.append(row) if len(temp) != 0: self.mysql_helper.executemany(sql, temp) temp.clear() def prepare_fpgrowth_data(self): tran_df = self.load_from_mysql('t_CMMS_CREDIT_TRAN').filter("BILL_AMTFLAG = '+'").select('ACCTNBR', 'MER_CAT_CD') \ .filter("MER_CAT_CD != 0").filter("MER_CAT_CD != 6013") result = tran_df.map(lambda x: (str(int(x['ACCTNBR'])), [str(int(x['MER_CAT_CD'])), ])).groupByKey() def m(x): k = x[0] l = list(x[1]) v = set() for i in l: v.add(i[0]) return set(v) result = result.map(m) for i in result.take(10): print(i) model = FPGrowth.train(result, minSupport=0.05, numPartitions=10) result = model.freqItemsets().collect() for r in result: print(r) def cycle_credit(self): ''' 信用卡聚类数据预处理 :return: ''' print('---------------------------信用卡-Start--------------------------') # 交易流水 credit_tran_df = self.load_from_mysql('t_CMMS_CREDIT_TRAN').select('ACCTNBR', 'MONTH_NBR', 'BILL_AMT', 'BILL_AMTFLAG').filter( "BILL_AMTFLAG ='-'").cache() # 卡账户信息 credit_acct_df = self.load_from_mysql('ACCT_D').select('ACCTNBR', 'MONTH_NBR', 'STM_MINDUE') # 还款计算 return_amt = credit_tran_df.groupBy('ACCTNBR', 'MONTH_NBR').sum('BILL_AMT') return_amt = return_amt.select('ACCTNBR', 'MONTH_NBR', return_amt['sum(BILL_AMT)'].alias('RETURNED')) # 去除0最低还款额,即未消费的账单月 join = credit_acct_df.join(return_amt, ['ACCTNBR', 'MONTH_NBR'], 'outer').filter('STM_MINDUE != 0') # 清除缓存 self.sqlctx.clearCache() def which_cycle_type(line): mindue = line['STM_MINDUE'] returned = line['RETURNED'] ''' 0:normal,all returned 1:cycle credit 2:overdue,don't return money ''' if mindue is not None and returned is None: flag = 2 elif returned >= mindue * 10: flag = 0 elif returned > mindue and returned < mindue * 10: flag = 1 else: flag = 9 return Row(ACCTNBR=int(line['ACCTNBR']), MONTH_NBR=line['MONTH_NBR'], DUE_FLAG=flag, STM_MINDUE=line['STM_MINDUE']) # 返回为PipelinedRDD join = join.map(which_cycle_type) # 转为DataFrame join = self.sqlctx.createDataFrame(join) ''' +-------+--------+-----+ | ACCTNBR | DUE_FLAG | count | +-------+--------+-----+ | 608126 | 2 | 1 | | 608126 | 0 | 6 | | 608868 | 0 | 4 | ''' # 按还款类型分类 each_type = join.groupBy(['ACCTNBR', 'DUE_FLAG']) # 计算每种还款情况数量 each_type_count = each_type.count() # 计算每种还款情况的最低还款额之和 each_type_mindue_sum = each_type.sum('STM_MINDUE') # 计算还款情况总数 all_type_count = each_type_count.groupBy('ACCTNBR').sum('count') # join 上述三表 rate = each_type_count.join(each_type_mindue_sum, ['ACCTNBR', 'DUE_FLAG'], 'outer').join(all_type_count, 'ACCTNBR', 'outer') # print(rate.columns) # ['ACCTNBR', 'DUE_FLAG', 'count', 'sum(STM_MINDUE)', 'sum(count)'] # 筛选出循环信用的数据 # TODO 暂时只取了循环信用的 rate = rate.filter(rate['DUE_FLAG'] == 1) # 计算进入循环信用的比例 rate = rate.select('ACCTNBR', (rate['sum(STM_MINDUE)'] * 10).alias('CYCLE_AMT'), rate['count'].alias('CYCLE_TIMES'), (rate['count'] / rate['sum(count)']).alias('CYCLE_RATE')) # rate.show() # print(rate.count()) def m(line): return line['CYCLE_TIMES'], line['CYCLE_AMT'], line['CYCLE_RATE'], line['ACCTNBR'] sql = "update t_CMMS_TEMP_KMEANS_CREDIT set CYCLE_TIMES=%s,CYCLE_AMT=%s,CYCLE_RATE=%s where ACCTNBR=%s" df = rate.map(m) print('将数据更新至数据库...') self.sql_operate(sql, df) # 将未进入循环的 设为0 print('将未进入循环的 设为0...') self.mysql_helper.execute( "update t_CMMS_TEMP_KMEANS_CREDIT set CYCLE_TIMES=0,CYCLE_AMT=0,CYCLE_RATE=0 where CYCLE_TIMES is null ") def losing_warn(self,year,month): # #计算月份 # if season == 1: # months_now = [1,2,3] # months_before = [10,11,12] # year_before = year - 1 # else: # months_now = [season * 3 - 2, season * 3 - 1, season * 3] # months_before = [season * 3 - 5, season * 3 - 4, season * 3-3] # year_before = year # # # # # 抽取每个季度数据 # # for m in months_now: # 最近一个月 month = '%02d' % month for i in range(2,29): day = '%02d' % i date = str(year) + month + day print(date) sql = "select MONTH_NBR from t_CMMS_CREDIT_TRAN where INP_DATE = %s limit 1" try: month_nbr = self.mysql_helper.fetchone(sql,(date,)) if month_nbr is None: continue else: month_nbr = int(month_nbr[0]) break except Exception: continue if month_nbr is None: raise Exception("There is no data in database for month:%s" % month) else: print('the latest month_nbr is %s' % month_nbr) months_now = [month_nbr-2,month_nbr-1,month_nbr] months_before = [month_nbr-5,month_nbr-4,month_nbr-3] print('months_now',months_now) print('months_before',months_before) # 交易流水 credit_tran_df = self.load_from_mysql('t_CMMS_CREDIT_TRAN').select('ACCTNBR', 'INP_DATE', 'MONTH_NBR', 'BILL_AMT', 'BILL_AMTFLAG').filter("BILL_AMTFLAG ='+'").cache() # 筛选出两个季度对应的流水 months_now_filter = None months_before_filter = None for i in months_now: f = credit_tran_df.filter(credit_tran_df['MONTH_NBR'] == i) if months_now_filter is None: months_now_filter = f else: months_now_filter = months_now_filter.unionAll(f) for i in months_before: f = credit_tran_df.filter(credit_tran_df['MONTH_NBR'] == i) if months_before_filter is None: months_before_filter = f else: months_before_filter = months_before_filter.unionAll(f) months_now_filter.groupBy('MONTH_NBR').count().show() months_before_filter.groupBy('MONTH_NBR').count().show() months_now_count = months_now_filter.groupBy('ACCTNBR').count() months_before_count = months_before_filter.groupBy('ACCTNBR').count() months_now_count.show() months_before_count.show() join = months_now_count.select('ACCTNBR',months_now_count['count'].alias('NOW_COUNT')).join( months_before_count.select('ACCTNBR',months_before_count['count'].alias('BEFORE_COUNT')),'ACCTNBR','outer' ) join.show() def m(line): ncount = line['NOW_COUNT'] bcount = line['BEFORE_COUNT'] ''' 计算增长率 9999:两季度均无数据 8888:仅第一季度有数据,流失客户 7777:仅第二季度有数据,新增客户 其他数字:第二个季度较第一季度增长率 (s2-s1)/s1*100 ''' if ncount is None: if bcount is None:# n none,b none pass else:# n none, b not none increment = -9999 else: if bcount is None:# n not none, b none increment = 8888 else:# n not none,b not none increment = round((ncount-bcount)/bcount*100) ''' 计算信用卡生命周期(以增长率计算) 100+ 快速增长 50-100 增长 -50-50 稳定 -50- 衰退 9999 流失 ''' if increment > 100 and increment != -9999: life = 1 # fast growing elif increment <=100 and increment >50: life = 2 # growing elif increment <= 50 and increment > -50: life = 3 # stable elif increment <= -50: life = 4 # losing else: life = 9 # no more tran completely lost return line['ACCTNBR'],month_nbr,increment,life sql = "replace into t_CMMS_ANALYSE_CREDIT(ACCTNBR, MONTH_NBR, INCREMENT,LIFE, UPDATE_TIME) values(%s,%s,%s,%s,now())" rdd = join.map(m) print(type(rdd)) self.sql_operate(sql,rdd)