def band(self):
    """Wire one ``TComplicatedTask`` per date of the period and publish outputs.

    A ``TDataSource`` is created for every date yielded by
    ``period_dates(self.task_target_date, self.period)`` and handed to a
    ``TComplicatedTask`` for that same date.  The method then sets:

    * ``self.combined_output`` -- ``data_combine`` applied to each task's
      ``simplest_output``
    * ``self.list_output`` -- a list of each task's ``simplest_output``
    * ``self.list_tasks_output`` -- a new list containing the task objects
    * ``self.nested`` -- ``some_output`` of two ``TNestedPipeline1`` instances
    * ``self.nested2`` -- ``some_output`` of ``TNestedPipeline2``
    * ``self.nested3`` -- ``some_output`` of ``TSuperNestedPipeline`` built
      with ``self.list_param``
    """
    config = SomeConfig()

    tasks = []
    for day in period_dates(self.task_target_date, self.period):
        # One source object supplies both the specific input and the logs.
        day_source = TDataSource(task_target_date=day)
        tasks.append(
            TComplicatedTask(
                specific_input=day_source,
                some_param=config.some_param,
                task_input=day_source.logs,
                # empty_input=None,
                task_target_date=day,
            )
        )

    self.combined_output = data_combine([task.simplest_output for task in tasks])
    self.list_output = [task.simplest_output for task in tasks]
    self.list_tasks_output = list(tasks)

    # Same pipeline class instantiated twice under different task names.
    new_pipe_output = TNestedPipeline1(task_name="MyNewPipe").some_output
    custom_name_output = TNestedPipeline1(task_name="custom_task_name").some_output
    self.nested = [new_pipe_output, custom_name_output]

    self.nested2 = TNestedPipeline2().some_output

    super_nested = TSuperNestedPipeline(list_parameter=self.list_param)
    self.nested3 = super_nested.some_output
def fetch_data(task_target_date, period=datetime.timedelta(days=7)):
    """Fetch wine-quality data for every date in the period and combine it.

    Args:
        task_target_date: Reference date passed to ``period_dates``.
        period: Span passed to ``period_dates``; defaults to seven days.

    Returns:
        The result of ``data_combine(..., sort=True)`` over the per-date
        results of ``fetch_wine_quality``.
    """
    per_day = [
        fetch_wine_quality(task_target_date=day)
        for day in period_dates(task_target_date, period)
    ]
    return data_combine(per_day, sort=True)
def band(self):
    """Project the raw device logs of each date and combine the projections.

    For every date yielded by
    ``period_dates(self.task_target_date, self.period)`` the ``logs`` of a
    ``RawDeviceLog`` are fed into a ``DeviceLogProjection``; the
    ``projected_logs`` of all projections are passed to ``data_combine`` and
    stored on ``self.projected``.
    """
    per_day_projections = [
        DeviceLogProjection(
            raw_logs=RawDeviceLog(task_target_date=day).logs,
            task_target_date=day,
        ).projected_logs
        for day in period_dates(self.task_target_date, self.period)
    ]
    self.projected = data_combine(per_day_projections)
def top_artists_report(task_target_date, period=timedelta(days=2)):
    """Build the top-artists report from the streams of each date in the period.

    Args:
        task_target_date: Reference date passed to ``period_dates``.
        period: Span passed to ``period_dates``; defaults to two days.

    Returns:
        The result of ``top_n_artists`` applied to the artists aggregated
        from the combined per-date streams.
    """
    logging.info("top_artists_report")

    dates = period_dates(task_target_date, period)
    # Each stream gets a distinct task name derived from its position.
    daily_streams = [
        stream(task_name="Stream_%s" % index, task_target_date=day)
        for index, day in enumerate(dates)
    ]

    combined = data_combine(daily_streams)
    return top_n_artists(artists=aggregate_artists(stream=combined))
def band(self):
    """Fetch ids and their data for each date of the period and combine both.

    For every date yielded by
    ``period_dates(self.task_target_date, self.period)`` the ids come from
    ``FetchIds`` (with ``period=one_day``) and the data from ``FetchData``
    for those ids.  Results are stored per ``YYYY-MM-DD`` key, so a repeated
    date overwrites the earlier entry.  ``self.ids`` and ``self.data``
    receive ``data_combine(..., sort=True)`` over the stored values.
    """
    ids_by_day = {}
    data_by_day = {}

    for day in period_dates(self.task_target_date, self.period):
        # Disabled alternative kept from the original implementation:
        # if self.task_env == TaskEnv.prod and not self.run_on_prod:
        #     ids = cb_data_dump_path(task_target_date=d, name="ids")
        #     data = cb_data_dump_path(task_target_date=d, name="data")
        # else:
        day_ids = FetchIds(task_target_date=day, period=one_day).ids
        day_data = FetchData(task_target_date=day, ids=day_ids).data

        day_key = day.strftime("%Y-%m-%d")
        ids_by_day[day_key] = day_ids
        data_by_day[day_key] = day_data

    self.ids = data_combine(ids_by_day.values(), sort=True)
    self.data = data_combine(data_by_day.values(), sort=True)
def fetch_partner_data(
    task_target_date, selected_partners: List[str], period=datetime.timedelta(days=7)
) -> List[pd.DataFrame]:
    """Ingest and combine the data of each selected partner over the period.

    Args:
        task_target_date: Reference date passed to ``period_dates``.
        selected_partners: Partner identifiers; each must be ``"a"``, ``"b"``
            or ``"c"``.
        period: Span passed to ``period_dates``; defaults to seven days.

    Returns:
        A list with one entry per selected partner, in the given order; each
        entry is ``data_combine(..., sort=True)`` over that partner's
        per-date ingest results.

    Raises:
        ValueError: If a partner identifier is not recognised.  The check
            runs once per partner, before any date is processed, so an
            unknown partner is rejected even when the period yields no dates.
    """
    partner_data = []
    for partner in selected_partners:
        # Resolve the ingest callable up front: the choice does not depend on
        # the date, and failing here avoids doing work for an invalid partner.
        if partner == "a":
            ingest = ingest_partner_a
        elif partner == "b":
            ingest = ingest_partner_b
        elif partner == "c":
            ingest = ingest_partner_c
        else:
            raise ValueError(
                "Partner not found! Unknown partner %r (expected 'a', 'b' or 'c')"
                % (partner,)
            )

        per_day = [
            ingest(task_target_date=day)
            for day in period_dates(task_target_date, period)
        ]
        partner_data.append(data_combine(per_day, sort=True))
    return partner_data
def generate_partner_data(
    seed: pd.DataFrame = demo_data_repo.seed,
    task_target_date=datetime.datetime.now().date(),
    period=datetime.timedelta(days=7),
    bad_labels_date=datetime.datetime.strptime("2018-01-01", "%Y-%m-%d").date(),
):
    """Generate partner files for each date of the period plus a noisy date.

    The seed is passed through ``rename_columns`` and
    ``calculate_target_variable``; the prepared data is then used for every
    ``generate_for_date`` call and for ``create_customer_files``.

    NOTE(review): the ``task_target_date`` default calls
    ``datetime.datetime.now()`` in the signature, so it is evaluated once
    when the function is defined, not on each call -- confirm this is
    intended.

    Args:
        seed: Input data handed to ``rename_columns``.
        task_target_date: Reference date passed to ``period_dates``.
        period: Span passed to ``period_dates``; defaults to seven days.
        bad_labels_date: Date for which data is generated with ``noise=True``.

    Returns:
        A dict mapping ``str(date)`` to the ``generate_for_date`` result for
        each date of the period, ``str(bad_labels_date)`` to the noisy
        result, and ``"customers"`` to the ``create_customer_files`` result.
    """
    prepared = calculate_target_variable(data=rename_columns(data=seed))

    results = {
        str(day): generate_for_date(
            task_target_date=day,
            data=prepared,
            a_out=demo_data_repo.partner_a_file(day),
            b_out=demo_data_repo.partner_b_file(day),
            c_out=demo_data_repo.partner_c_file(day),
        )
        for day in period_dates(task_target_date, period)
    }

    # Stored after the regular dates, so it replaces a regular entry if the
    # two keys coincide.
    results[str(bad_labels_date)] = generate_for_date(
        task_target_date=bad_labels_date,
        data=prepared,
        noise=True,
        a_out=demo_data_repo.partner_a_file(bad_labels_date),
        b_out=demo_data_repo.partner_b_file(bad_labels_date),
        c_out=demo_data_repo.partner_c_file(bad_labels_date),
    )

    results["customers"] = create_customer_files(
        seed=prepared,
        a_out=partner_data_file("customer_a.csv"),
        b_out=partner_data_file("customer_b.csv"),
    )
    return results