from datetime import datetime, timedelta
import timeit

from pyspark import SparkContext
from pyspark.sql import HiveContext

# load_batch_config, print_batching_info, clean_batched_log and write_to_table
# are pipeline helpers defined elsewhere in this repo.


def clean_logs(cfg, df_persona, df_keywords, log_table_names):
    sc = SparkContext.getOrCreate()
    sc.setLogLevel(cfg['log']['level'])
    hive_context = HiveContext(sc)
    cfg_clean = cfg['pipeline']['main_clean']
    conditions = cfg_clean['conditions']
    start_date, end_date, load_minutes = load_batch_config(cfg)
    timer_start = timeit.default_timer()
    showlog_table, showlog_output_table, clicklog_table, clicklog_output_table = log_table_names

    starting_time = datetime.strptime(start_date, "%Y-%m-%d")
    ending_time = datetime.strptime(end_date, "%Y-%m-%d")

    batched_round = 1
    while starting_time < ending_time:
        time_start = starting_time.strftime("%Y-%m-%d %H:%M:%S")
        batch_time_end = starting_time + timedelta(minutes=load_minutes)
        batch_time_end = min(batch_time_end, ending_time)
        time_end = batch_time_end.strftime("%Y-%m-%d %H:%M:%S")
        print_batching_info("Main clean", batched_round, time_start, time_end)

        # The same query template serves both logs; only the time column and
        # source table differ between the show-log and click-log reads.
        command = """select did, adv_id, adv_type as media, slot_id, spread_app_id,
        device_name, net_type, adv_bill_mode_cd as price_model, {time} as action_time
        from {table} where {time} >= '{time_start}' and {time} < '{time_end}'"""
        df_clicklog_batched = hive_context.sql(
            command.format(time='click_time', table=clicklog_table,
                           time_start=time_start, time_end=time_end))
        df_showlog_batched = hive_context.sql(
            command.format(time='show_time', table=showlog_table,
                           time_start=time_start, time_end=time_end))

        # The first batch overwrites any stale output; later batches append.
        mode = 'overwrite' if batched_round == 1 else 'append'

        if not df_showlog_batched.rdd.isEmpty():
            df_showlog_batched = clean_batched_log(df_showlog_batched, df_persona,
                                                   conditions, df_keywords)
            write_to_table(df_showlog_batched, showlog_output_table, mode=mode)

        if not df_clicklog_batched.rdd.isEmpty():
            df_clicklog_batched = clean_batched_log(df_clicklog_batched, df_persona,
                                                    conditions, df_keywords)
            write_to_table(df_clicklog_batched, clicklog_output_table, mode=mode)

        batched_round += 1
        starting_time = batch_time_end

    timer_end = timeit.default_timer()
    print('Total batching seconds: ' + str(timer_end - timer_start))
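# A minimal usage sketch for clean_logs, assuming a YAML config whose keys
# mirror the lookups above. Every path and table name below is a hypothetical
# placeholder, not taken from the source.
if __name__ == '__main__':
    import yaml

    with open('config.yml') as f:  # hypothetical config path
        cfg = yaml.safe_load(f)

    hive_context = HiveContext(SparkContext.getOrCreate())
    df_persona = hive_context.sql('select * from persona')    # hypothetical table
    df_keywords = hive_context.sql('select * from keywords')  # hypothetical table

    # (raw show table, cleaned show output, raw click table, cleaned click output)
    log_table_names = ('showlog', 'showlog_cleaned', 'clicklog', 'clicklog_cleaned')
    clean_logs(cfg, df_persona, df_keywords, log_table_names)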
def run(hive_context, cfg):
    # Build the train-ready data set from the unified logs table.
    cfg_logs = cfg['pipeline']['main_logs']
    logs_table_name = cfg_logs['logs_output_table_name']
    interval_time_in_seconds = cfg_logs['interval_time_in_seconds']
    cfg_train = cfg['pipeline']['main_trainready']
    trainready_table = cfg_train['trainready_output_table']
    batch_config = load_batch_config(cfg)
    generate_trainready(hive_context, batch_config, interval_time_in_seconds,
                        logs_table_name, trainready_table)
def run(hive_context, cfg):
    # Prepare parameters for processing batched logs.
    cfg_clean = cfg['pipeline']['main_clean']
    cfg_clean_output = cfg_clean['data_output']
    batch_config = load_batch_config(cfg)
    clicklog_table_name = cfg_clean_output['clicklog_output_table']
    showlog_table_name = cfg_clean_output['showlog_output_table']
    cfg_logs = cfg['pipeline']['main_logs']
    logs_table_name = cfg_logs['logs_output_table_name']
    interval_time_in_seconds = cfg_logs['interval_time_in_seconds']
    log_table_names = (showlog_table_name, clicklog_table_name, logs_table_name)
    # Join the cleaned show and click logs into a single unified logs table.
    join_logs(hive_context, batch_config, interval_time_in_seconds, log_table_names)
def run(hive_context, cfg):
    batch_config = load_batch_config(cfg)
    cfg_logs = cfg['pipeline']['main_logs']
    logs_table = cfg_logs['logs_output_table_name']
    # Add region ids to the logs table.
    add_region_to_logs(hive_context, batch_config, logs_table)
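# A sketch of one plausible end-to-end ordering of these entry points: clean the
# raw logs, join them, tag regions, then build the train-ready set. The module
# names are illustrative assumptions; in the repo each run() presumably lives in
# its own pipeline script, and the actual step order may differ.
import main_logs, main_regions, main_trainready  # hypothetical module names


def run_pipeline(hive_context, cfg, df_persona, df_keywords, log_table_names):
    clean_logs(cfg, df_persona, df_keywords, log_table_names)
    main_logs.run(hive_context, cfg)        # join cleaned show/click logs
    main_regions.run(hive_context, cfg)     # add region ids to the joined logs
    main_trainready.run(hive_context, cfg)  # generate the train-ready table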