def apply_reference_action_type(df):
    """Split the reference column into one column per action type."""
    action_types = [
        "interaction item image",
        "search for poi",
        "interaction item rating",
        "clickout item",
        "interaction item deals",
        "change of sort order",
        "search for item",
        "search for destination",
        "filter selection",
        "interaction item info",
    ]
    # Keep the reference value only on rows of the matching action type,
    # otherwise fill with the "<none>" placeholder.
    for action_type in action_types:
        df["reference_" + clean_filename(action_type)] = df["reference"].where(
            df["action_type"] == action_type, "<none>"
        )
    return df
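# Minimal usage sketch (illustration only): the toy data below is made up, and it
# assumes clean_filename replaces spaces with underscores, which matches the
# "reference_search_for_poi"-style column names used further down the pipeline.
import pandas as pd

toy_df = pd.DataFrame({
    "action_type": ["clickout item", "search for poi"],
    "reference": ["123456", "Paris"],
})
toy_df = apply_reference_action_type(toy_df)
# toy_df["reference_clickout_item"]  -> ["123456", "<none>"]
# toy_df["reference_search_for_poi"] -> ["<none>", "Paris"]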
def output(self):
    return (
        luigi.LocalTarget(
            os.path.join(
                DATASET_DIR,
                clean_filename(self.filter_city),
                "train_indexed__size=%d.csv" % (self.sample_size),
            )),
        luigi.LocalTarget(
            os.path.join(
                DATASET_DIR,
                clean_filename(self.filter_city),
                "item_metadata_indexed__size=%d.csv" % (self.sample_size),
            )),
    )
def apply_reference_action_type(df): """ Split click type in columns """ df['reference_' + clean_filename("interaction item image")] = df.apply( lambda row: row['reference'] if row['action_type'] == "interaction item image" else "<none>", axis=1) df['reference_' + clean_filename("search for poi")] = df.apply( lambda row: row['reference'] if row['action_type'] == "search for poi" else "<none>", axis=1) df['reference_' + clean_filename("interaction item rating")] = df.apply( lambda row: row['reference'] if row['action_type'] == "interaction item rating" else "<none>", axis=1) df['reference_' + clean_filename("clickout item")] = df.apply( lambda row: row['reference'] if row['action_type'] == "clickout item" else "<none>", axis=1) df['reference_' + clean_filename("interaction item deals")] = df.apply( lambda row: row['reference'] if row['action_type'] == "interaction item deals" else "<none>", axis=1) df['reference_' + clean_filename("change of sort order")] = df.apply( lambda row: row['reference'] if row['action_type'] == "change of sort order" else "<none>", axis=1) df['reference_' + clean_filename("search for item")] = df.apply( lambda row: row['reference'] if row['action_type'] == "search for item" else "<none>", axis=1) df['reference_' + clean_filename("search for destination")] = df.apply( lambda row: row['reference'] if row['action_type'] == "search for destination" else "<none>", axis=1) df['reference_' + clean_filename("filter selection")] = df.apply( lambda row: row['reference'] if row['action_type'] == "filter selection" else "<none>", axis=1) df['reference_' + clean_filename("interaction item info")] = df.apply( lambda row: row['reference'] if row['action_type'] == "interaction item info" else "<none>", axis=1) return df
def output(self):
    return luigi.LocalTarget(
        os.path.join(
            DATASET_DIR,
            clean_filename(self.filter_city),
            "train__agg_indexed__size=%d_window=%d.csv"
            % (self.sample_size, self.window_hist),
        ))
def output(self):
    return (
        luigi.LocalTarget(
            os.path.join(DATASET_DIR, clean_filename(self.filter_city),
                         "item_indices__size=%d.csv" % (self.sample_size))),
        luigi.LocalTarget(
            os.path.join(DATASET_DIR, clean_filename(self.filter_city),
                         "user_indices__size=%d.csv" % (self.sample_size))),
        luigi.LocalTarget(
            os.path.join(DATASET_DIR, clean_filename(self.filter_city),
                         "session_indices__size=%d.csv" % (self.sample_size))),
        luigi.LocalTarget(
            os.path.join(DATASET_DIR, clean_filename(self.filter_city),
                         "action_type_indices__size=%d.csv" % (self.sample_size))),
        luigi.LocalTarget(
            os.path.join(DATASET_DIR, clean_filename(self.filter_city),
                         "platform_indices__size=%d.csv" % (self.sample_size))),
        luigi.LocalTarget(
            os.path.join(DATASET_DIR, clean_filename(self.filter_city),
                         "city_indices__size=%d.csv" % (self.sample_size))),
        luigi.LocalTarget(
            os.path.join(DATASET_DIR, clean_filename(self.filter_city),
                         "device_indices__size=%d.csv" % (self.sample_size))),
    )
def output(self):
    return (
        luigi.LocalTarget(
            os.path.join(
                DATASET_DIR,
                clean_filename(self.filter_city),
                "train_transform__size=%d.csv" % (self.sample_size),
            )),
        luigi.LocalTarget(
            os.path.join(
                DATASET_DIR,
                clean_filename(self.filter_city),
                "text_vocabulary__size=%d.csv" % (self.sample_size),
            )),
        luigi.LocalTarget(
            os.path.join(
                DATASET_DIR,
                clean_filename(self.filter_city),
                "filter_session_size=%d.csv" % (self.sample_size),
            )),
    )
def main(self, sc: SparkContext, *args):
    os.makedirs(os.path.join(DATASET_DIR, clean_filename(self.filter_city)),
                exist_ok=True)

    spark = SparkSession(sc)
    train_df = spark.read.csv(self.input()[0].path, header=True, inferSchema=True)
    train_df = train_df.withColumn("impressions_array",
                                   F.split(train_df.impressions, r"\|"))

    meta_df = spark.read.csv(self.input()[1].path, header=True, inferSchema=True)

    # Filter dataset by city
    if self.filter_city != 'all':
        if self.filter_city == 'recsys':
            train_df = train_df.filter(train_df.city.isin(RECSYS_CITIES))
        else:
            train_df = train_df.filter(train_df.city == self.filter_city)

        # Filter reference: distinct referenced items
        reference_df = train_df.select("reference").distinct()

        # Filter item impressions: distinct items appearing in impressions
        item_id_df = train_df.select(
            posexplode("impressions_array").alias(
                "pos_item_idx", "reference")).select("reference").distinct()

        item_id_df = item_id_df.union(reference_df).select(
            "reference").distinct()

        # Keep metadata only for items seen in the filtered interactions
        meta_df = meta_df.join(
            item_id_df, meta_df.item_id == item_id_df.reference).select(
                "item_id", "properties")

    if self.sample_size > 0:
        train_df = train_df.sort("timestamp",
                                 ascending=False).limit(self.sample_size)

    # Save
    train_df.toPandas().to_csv(self.output()[0].path, index=False)
    meta_df.toPandas().to_csv(self.output()[1].path, index=False)
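# Illustration only: how posexplode turns the impressions array into
# (position, item) rows before distinct item ids are collected. The toy data
# and the local SparkSession are assumptions, not part of the pipeline.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
demo_df = spark.createDataFrame(
    [("s1", ["101", "102", "103"])], ["session_id", "impressions_array"])
exploded = demo_df.select(
    F.posexplode("impressions_array").alias("pos_item_idx", "reference"))
# exploded contains the rows (0, "101"), (1, "102"), (2, "103")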
def dataset_dir(self) -> str:
    return os.path.join(DATASET_DIR, clean_filename(self.filter_city))
def main(self, sc: SparkContext, *args):
    os.makedirs(DATASET_DIR, exist_ok=True)

    spark = SparkSession(sc)
    print("Load Data...")

    # Load
    df = spark.read.csv(self.input()[0].path, header=True,
                        inferSchema=True)  # .limit(500000)
    df = df.withColumn("idx", F.monotonically_increasing_id())

    print("Transform Interactions data...")

    # Parse the pipe-separated impressions and prices columns into arrays
    def to_int_array(x):
        return [] if x == "" or x is None else [
            int(i) or 0 for i in x.split("|")
        ]

    to_int_array_udf = udf(lambda x: to_int_array(x), ArrayType(IntegerType()))

    def to_float_array(x):
        return [] if x == "" or x is None else [
            float(i) or 0 for i in x.split("|")
        ]

    to_float_array_udf = udf(lambda x: to_float_array(x), ArrayType(FloatType()))

    df = df.\
        withColumn("impressions", to_int_array_udf(col('impressions'))).\
        withColumn("prices", to_float_array_udf(col('prices')))

    # Keep the reference value only on rows of the matching action type
    def to_reference_action(action_type, text, reference):
        return reference if action_type == text else "<none>"

    to_reference_action_udf = udf(
        lambda a, b, c: to_reference_action(a, b, c), StringType())

    for ref in [
            "interaction item image", "search for poi",
            "interaction item rating", "clickout item",
            "interaction item deals", "change of sort order",
            "search for item", "search for destination", "filter selection",
            "interaction item info"
    ]:
        df = df.\
            withColumn('reference_' + clean_filename(ref),
                       to_reference_action_udf(col('action_type'), lit(ref), col('reference')))

    print("Split filter session data...")
    df, df_filters = self.split_columns_filter(df)
    print(df.columns)

    print("Tokenizer reference search...")

    # Transform columns with text
    columns_with_string = [
        "reference_search_for_poi", "reference_change_of_sort_order",
        "reference_search_for_destination", "reference_filter_selection"
    ]
    df_text = df.select(["idx"] + columns_with_string).toPandas()

    # Vocabulary
    vocab = ["<none>"]
    for c in columns_with_string:
        df_text[c] = df_text[c].fillna("<none>")
        vocab += df_text[c].tolist()

    # Tokenizer
    tokenizer = StaticTokenizerEncoder(
        vocab,
        tokenize=lambda x: re.split(r'\W+', x),
        min_occurrences=10,
        reserved_tokens=[])
    df_vocabulary = pd.DataFrame(tokenizer.vocab, columns=['vocabulary'])

    # Apply tokenizer
    for text_column in columns_with_string:
        df_text[text_column] = tokenizer.batch_encode(
            df_text[text_column])[0].cpu().detach().numpy().tolist()
        df_text[text_column + '_max_words'] = len(df_text[text_column][0])

    df_text = spark.createDataFrame(df_text)
    df = df.drop(*columns_with_string).join(df_text, ['idx'])

    # Save
    df.toPandas().to_csv(self.output()[0].path, index=False)
    df_vocabulary.to_csv(self.output()[1].path)
    df_filters.toPandas().to_csv(self.output()[2].path)
    return
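# Illustration only: standalone equivalents of the pipe-parsing helpers defined
# inside main() above, applied to made-up impression and price strings.
def _to_int_array(x):
    return [] if x == "" or x is None else [int(i) or 0 for i in x.split("|")]

def _to_float_array(x):
    return [] if x == "" or x is None else [float(i) or 0 for i in x.split("|")]

assert _to_int_array("101|102|103") == [101, 102, 103]  # impressions
assert _to_float_array("80.0|95.5") == [80.0, 95.5]     # prices
assert _to_int_array(None) == []                        # missing values map to []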
def output(self):
    return luigi.LocalTarget(
        os.path.join(
            DATASET_DIR, clean_filename(self.filter_city),
            "item_metadata_transform__size=%d.csv" % (self.sample_size)))