def etl_fact_market(*args):
    """Run the ETL job that fills the fact_market table.

    :param args: exactly three positional arguments, in this order:
        engine_zone_macro, engine_draw, engine_target. All three are passed
        to ``Extract``; engine_target is also passed to ``Load``.
    :raises TypeError: if the number of positional arguments is not three.
    """
    # The engines arrive positionally; bind them to names before use
    # (previously the names were referenced without ever being unpacked).
    if len(args) != 3:
        raise TypeError(
            'etl_fact_market() expects 3 positional arguments '
            '(engine_zone_macro, engine_draw, engine_target), got %d'
            % len(args)
        )
    engine_zone_macro, engine_draw, engine_target = args

    # Build the extract / transform / load helpers.
    extract = Extract(engine_zone_macro, engine_draw, engine_target)
    transform = Transform()
    load = Load(engine_target)

    # Markets that a previous run has already loaded.
    # NOTE(review): the meaning of `in done_market` below depends on the type
    # extract.done_market() returns (a pandas Series would test its index,
    # not its values) -- confirm.
    done_market = extract.done_market()
    df_tag_counts = extract.tag_counts()
    df_industry = extract.industry()

    # Ids already handled during this run; a set keeps the lookup O(1).
    handled_ids = set()

    for i, sample_tag_counts in df_tag_counts.iterrows():
        grandParentId = sample_tag_counts['grandParentId']

        # Only ids that are exactly 36 characters long are treated as valid.
        if len(grandParentId) != 36:
            logging.warning('Round %d, %s is invalid ,skipped.', i, grandParentId)
            continue
        # Skip markets loaded by an earlier run or earlier in this run.
        if grandParentId in done_market or grandParentId in handled_ids:
            logging.warning('Round %d, %s etl before', i, grandParentId)
            continue
        handled_ids.add(grandParentId)

        # Extract.
        zone_grandparent = extract.zone_grandparent(grandParentId)
        # Explicit length check: the truthiness of the returned object may be
        # ambiguous (e.g. if it is a DataFrame).
        if len(zone_grandparent) == 0:
            logging.warning('Round %d, has no draw samples', i)
            continue
        rent = extract.rent_details(grandParentId)
        industry_tmp = df_industry[df_industry['grandParentId'] == grandParentId]

        # Transform.
        rent = transform.rent_calculate(rent)
        industry_dict = transform.reshape_industry(industry_tmp)

        # Combine everything into the row(s) to load.
        clean = transform.compile_dfs(
            sample_tag_counts, rent, industry_dict, zone_grandparent
        )

        # A failed load is logged and the loop moves on to the next row.
        try:
            load.loading(clean)
        except Exception as e:
            logging.error('Round %d, %s', i, e)
        else:
            logging.info('Round %d, %s etl secceed', i, grandParentId)
def etl_fact_market(source_engine, target_engine, rec_path):
    """Run the fact_market ETL between a source and a target engine.

    :param source_engine: engine passed to ``Extract`` as the data source.
    :param target_engine: engine passed to ``Extract`` and to ``Load``.
    :param rec_path: path handed to ``Record`` (previously ignored in favour
        of the hard-coded ``'rec.cfg'``).

    NOTE(review): this function looks unfinished. ``unique_marketguid`` and
    ``done_market`` are hard-coded empty lists, so the loop body never runs,
    and the loop references ``industry`` and ``sample_tag_counts``, which are
    not defined in this function. Confirm where they should come from before
    populating the id list.
    """
    extract = Extract(source_engine, target_engine)
    transform = Transform()
    load = Load(target_engine)

    # Honour the caller-supplied record path.
    record = Record(rec_path)
    # NOTE(review): the result is not consumed yet; the call is kept in case
    # get_record() has side effects.
    start_params = record.get_record()

    unique_marketguid = []  # NOTE(review): never populated
    done_market = []        # NOTE(review): never populated

    # Ids already handled during this run; a set keeps the lookup O(1).
    handled_ids = set()

    for i, grandParentId in enumerate(unique_marketguid):
        # Only ids that are exactly 36 characters long are treated as valid.
        if len(grandParentId) != 36:
            logging.error('Round %d, %s is not valid.', i, grandParentId)
            continue
        # Skip markets loaded by an earlier run or earlier in this run.
        if grandParentId in done_market or grandParentId in handled_ids:
            logging.warning('Round %d, %s etl before', i, grandParentId)
            continue
        handled_ids.add(grandParentId)

        zone_grandparent = extract.zone_grandparent(grandParentId)
        # Explicit length check: the truthiness of the returned object may be
        # ambiguous (e.g. if it is a DataFrame).
        if len(zone_grandparent) == 0:
            logging.warning('Round %d, has no draw samples', i)
            continue
        rent = extract.rent_details(grandParentId)
        # NOTE(review): `industry` is not defined in this function.
        industry_tmp = industry[industry['grandParentId'] == grandParentId]

        # Transform.
        rent = transform.rent_calculate(rent)
        industry_dict = transform.reshape_industry(industry_tmp)

        # Combine everything into the row(s) to load.
        # NOTE(review): `sample_tag_counts` is not defined in this function.
        clean = transform.compile_dfs(
            sample_tag_counts, rent, industry_dict, zone_grandparent
        )

        # A failed load is logged and the loop moves on to the next id.
        try:
            load.loading(clean)
        except Exception as e:
            logging.error('Round %d, %s', i, e)
def market_to_api2(source, target, record_file='api2.record'):
    """ETL entry point for the api2 table of the anti_fraud database.

    :param source: source database engine
    :param target: target database engine
    :param record_file: name of the file used to record loaded ids;
        defaults to ``'api2.record'``
    """
    # Set up the three pipeline stages.
    extractor = Extract(source, target, record_file)
    transformer = Transform()
    loader = Load(target, record_file)

    # Extract: pull both inputs before any transformation starts.
    raw_market, raw_samples = extractor.market(), extractor.draw_samples()

    # Transform, combine, and load in one step. Arguments are evaluated left
    # to right, so the market reshape runs before the sample aggregation.
    loader.loading(
        transformer.compile_dfs(
            transformer.reshape_market(raw_market),
            transformer.aggregate_from_samples(raw_samples),
        )
    )