def run(self):
    df_interventions = s3.read_parquet(self.input()[0].path).reset_index()
    df_interventions = self.group_efa(df_interventions)
    df_interventions = self.add_training(df_interventions)
    df_interventions = self.group_training(df_interventions)
    df_interventions = self.filter_interventions(df_interventions)
    df_interventions = self.map_old_new_interventions(df_interventions)
    df_journeys = s3.read_parquet(self.input()[1].path)
    df_output = self.transform_interventions(df_interventions, df_journeys)
    # Recount journeys, because the filtering above removed journeys in between.
    df_output["journey_count"] = 1
    df_output["journey_count"] = df_output.groupby(
        ["user_id"])["journey_count"].cumsum()
    s3.write_parquet(df_output, self.output().path)

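# A minimal toy illustration of the recount above: after filtering leaves
# gaps in journey_count, resetting the column to 1 and taking a per-user
# cumulative sum rebuilds a dense 1..n numbering. The data below is made up.
import pandas as pd

df = pd.DataFrame({
    "user_id": [1, 1, 1, 2, 2],
    "journey_count": [1, 3, 4, 2, 5],  # gaps left by removed journeys
})
df["journey_count"] = 1
df["journey_count"] = df.groupby(["user_id"])["journey_count"].cumsum()
# user 1 now gets 1, 2, 3 and user 2 gets 1, 2
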
def run(self):
    with open("./conf/base/parameters.yml") as f:
        params = yaml.load(f, Loader=yaml.FullLoader)["evaluation_params"]
    model = s3.read_pickle(self.input()[0].path)
    model_id, test_path, train_path = get_model_info_by_path(
        self.input()[0].path)
    df_train = s3.read_parquet(train_path)
    df_test = s3.read_parquet(test_path)
    rec_error = get_aggregate_recommendation_error(
        df_train,
        df_test,
        model,
        params["set_size"],
        params["num_recs"],
        params["percent_sample"],
    )
    write_recommendation_eval(get_db_engine(), rec_error, model_id, params)
    self.task_complete = True

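# For reference, the evaluation_params block this task expects in
# conf/base/parameters.yml would look roughly like the sketch below.
# The keys come from the code above; the values are placeholders.
#
# evaluation_params:
#   set_size: 3
#   num_recs: 5
#   percent_sample: 0.1
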
def run(self):
    df_modelling = s3.read_parquet(self.input().path)
    df_train, df_test = self.train_test_split(df_modelling)
    df_train, df_test = self.scale_numeric_feats(df_train, df_test)
    # NOTE: Save both datasets twice:
    # - one copy that stays tied to the trained model,
    # - one copy that gets overwritten with the current run.
    s3.write_parquet(df_train, self.output()[0].path)
    s3.write_parquet(df_test, self.output()[1].path)
    s3.write_parquet(df_train, self.output()[2].path)
    s3.write_parquet(df_test, self.output()[3].path)

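# A possible shape for scale_numeric_feats, assuming scikit-learn's
# StandardScaler and that only float columns need scaling. The scaler is
# fit on the training split only, so no test-set statistics leak into
# training. This is a hypothetical sketch, not the project's actual code.
from sklearn.preprocessing import StandardScaler

def scale_numeric_feats(self, df_train, df_test):
    numeric_cols = df_train.select_dtypes("float").columns
    scaler = StandardScaler().fit(df_train[numeric_cols])
    df_train[numeric_cols] = scaler.transform(df_train[numeric_cols])
    df_test[numeric_cols] = scaler.transform(df_test[numeric_cols])
    return df_train, df_test
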
def concat_parquet(paths, s3path):
    dfs = []
    for path in paths:
        df = s3.read_parquet(path)
        # NOTE: Downcast datetimes to second resolution because the
        # parquet engine cannot handle nanosecond timestamps.
        df_dates = df.select_dtypes("datetime")
        df_dates = df_dates.astype("datetime64[s]")
        df[df_dates.columns] = df_dates
        dfs.append(df)
    df = pd.concat(dfs)
    s3.write_parquet(df, s3path)

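# Example use of concat_parquet: merge several exports into one file.
# The S3 URIs are placeholders, not paths from the project.
concat_parquet(
    paths=[
        "s3://bucket/raw/2020-01.parquet",
        "s3://bucket/raw/2020-02.parquet",
    ],
    s3path="s3://bucket/raw/combined.parquet",
)
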
def run(self):
    df_train = s3.read_parquet(self.input()[0].path)
    y_train = df_train.loc[:, "ttj_sub_12"]
    X_train = df_train.drop(["ttj", "ttj_sub_12"], axis="columns")
    with open("./conf/base/parameters.yml") as f:
        grid = yaml.load(f, Loader=yaml.FullLoader)["rf_small_grid"]
    model = self.train_rf_cv(X_train, y_train, scoring_metric="f1", grid=grid)
    s3.write_pickle(model, self.output().path)

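# One way train_rf_cv could look, assuming scikit-learn's GridSearchCV over
# a RandomForestClassifier; the project's actual implementation may differ.
# `grid` is the rf_small_grid mapping loaded from parameters.yml above.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

def train_rf_cv(self, X_train, y_train, scoring_metric, grid, cv=5):
    search = GridSearchCV(
        RandomForestClassifier(),
        param_grid=grid,
        scoring=scoring_metric,
        cv=cv,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)
    return search.best_estimator_
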
def cli(recommendations, set_size, journey_id):
    model_path, _, test_path = postgres.get_best_model_paths()
    df_test = s3.read_parquet(test_path)
    model = s3.read_pickle(model_path)
    if not journey_id:
        click.echo("No journey ID specified. Try for example {}".format(
            df_test.sample(5).index.tolist()))
        return
    elif journey_id not in df_test.index.tolist():
        click.echo("Journey ID {} not found. Try for example {}".format(
            journey_id, df_test.sample(5).index.tolist()))
        return
    observation = df_test.loc[journey_id, :].copy()
    observation = observation.drop(["ttj_sub_12", "ttj"])
    interv_cols = [col for col in observation.index if col.startswith("i_")]
    dem_cols = [col for col in observation.index if col.startswith("d_")]
    click.echo("Journey {} --- Demographics".format(journey_id))
    click.echo("---------------")
    output = observation[dem_cols]
    click.echo(output[output == 1])
    # NOTE: Un-normalize age here (it was scaled by a hard-coded maximum of 78).
    # TODO: Read the maximum age from the journey data instead.
    click.echo("Age: {}".format(round(output["d_age"] * 78)))
    click.echo("---------------")
    click.echo("Use model: {}".format(model.__class__.__name__))
    click.echo("---------------")
    observation[interv_cols] = 0
    base_probability = model.predict_proba(
        observation.to_numpy().reshape(1, -1))
    click.echo("Base employment probability {:.4f}".format(
        base_probability[0][1]))
    click.echo("---------------")
    click.echo("Intervention Recommendations")
    click.echo("---------------")
    df_recs = get_top_recommendations(model, observation,
                                      set_size=set_size, n=recommendations)
    click.echo(df_recs)

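# A brute-force sketch of what get_top_recommendations might do: toggle every
# combination of `set_size` interventions on the zeroed observation, score
# each with predict_proba, and return the n best. Hypothetical; the real
# function may search differently.
from itertools import combinations
import pandas as pd

def get_top_recommendations(model, observation, set_size, n):
    interv_cols = [col for col in observation.index if col.startswith("i_")]
    rows = []
    for combo in combinations(interv_cols, set_size):
        candidate = observation.copy()
        candidate[list(combo)] = 1
        proba = model.predict_proba(candidate.to_numpy().reshape(1, -1))[0][1]
        rows.append({"interventions": combo, "employment_probability": proba})
    return pd.DataFrame(rows).nlargest(n, "employment_probability")
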
def run(self):
    df_test = s3.read_parquet(self.input()[0][1].path)
    y_test = df_test.loc[:, "ttj_sub_12"]
    X_test = df_test.drop(["ttj", "ttj_sub_12"], axis="columns")
    lg = s3.read_pickle(self.input()[1].path)
    metrics = evaluate(lg, X_test, y_test)
    model_info_to_db(
        engine=get_db_engine(),
        model=lg,
        metrics=metrics,
        features=X_test.columns.tolist(),
        date=self.date,
        model_path=self.input()[1].path,
        train_data_path=self.input()[0][2].path,
        test_data_path=self.input()[0][3].path,
    )
    # NOTE: Mark the task as complete manually. Using the built-in
    # luigi.contrib.postgres.CopyToTable task would be the cleaner approach.
    self.task_complete = True

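# A plausible evaluate helper, assuming standard scikit-learn metrics on the
# held-out test split; treat this as a sketch of the interface, not the
# project's exact metric set.
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

def evaluate(model, X_test, y_test):
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]
    return {
        "accuracy": accuracy_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred),
        "roc_auc": roc_auc_score(y_test, y_proba),
    }
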
def run(self):
    df_journeys = s3.read_parquet(self.input().path)
    df_journeys = df_journeys.set_index(["user_id", "journey_count"])
    df_interventions = self.dummy_interventions(df_journeys)
    df_feats = self.transform_features(df_journeys)
    df_model = df_feats.merge(df_interventions,
                              right_index=True, left_index=True)
    # NOTE: Cut off the modelling table at the configured start date.
    with open("./conf/base/parameters.yml") as f:
        modelling_params = yaml.load(f, Loader=yaml.FullLoader)
    df_model = df_model[
        df_model["register_date"] >= modelling_params["data_set"]["start"]
    ].reset_index()
    df_model = df_model.drop(["user_id", "register_date"], axis="columns")
    df_model = self.drop_empty_cols(df_model)
    df_model = self.set_target_variables(df_model)
    s3.write_parquet(df_model, self.output().path)

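# drop_empty_cols as it could plausibly be written: remove columns that carry
# no signal (all-NaN or a single constant value). A hypothetical sketch based
# on the method name, not the project's code.
def drop_empty_cols(self, df):
    df = df.dropna(axis="columns", how="all")
    constant = [col for col in df.columns
                if df[col].nunique(dropna=False) <= 1]
    return df.drop(constant, axis="columns")
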
def run(self):
    df_journeys = s3.read_parquet(self.input().path)
    df_journeys = self.add_outcomes(df_journeys)
    s3.write_parquet(df_journeys, self.output().path)

def run(self):
    df = s3.read_parquet(self.input().path)
    df = self.translate_intervention_codes(df)
    s3.write_parquet(df, self.output().path)

def run(self):
    df = s3.read_parquet(self.input().path)
    df = self.transform_journeys(df)
    s3.write_parquet(df, self.output().path)

def run(self):
    df_pedidos = s3.read_parquet(self.input()[0].path)
    df_journeys = s3.read_parquet(self.input()[1].path)
    df_journeys = self.add_demographics(df_pedidos, df_journeys)
    s3.write_parquet(df_journeys, self.output().path)

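# add_demographics presumably joins user-level demographic columns from the
# pedidos table onto the journeys. A hypothetical sketch, assuming the "d_"
# column prefix used elsewhere in this codebase and a shared user_id key:
def add_demographics(self, df_pedidos, df_journeys):
    dem_cols = [col for col in df_pedidos.columns if col.startswith("d_")]
    return df_journeys.merge(
        df_pedidos[["user_id"] + dem_cols], on="user_id", how="left")
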
def run(self):
    df_intermediate = s3.read_parquet(self.input().path)
    df_intermediate = self.add_mappings(df_intermediate)
    s3.write_parquet(df_intermediate, self.output().path)