def test_extract_pe_components():
    """Check page-event splitting for the event (i=1) and page (i=0) cases."""
    journey = [('page1', 'eventCategory1<:<eventAction1'),
               ('page2', 'eventCategory2<:<eventAction2'),
               ('page3', 'eventCategory2<:<eventAction1')]
    # i=1 selects the event component, split on "<:<" into (category, action).
    expected_events = [('eventCategory1', 'eventAction1'),
                       ('eventCategory2', 'eventAction2'),
                       ('eventCategory2', 'eventAction1')]
    assert preprocess.extract_pe_components(journey, 1) == expected_events
    # i=0 currently yields an empty list -- should it? Flagged as a potential
    # bug in the function under test.
    assert preprocess.extract_pe_components(journey, i=0) == []
def event_preprocess(user_journey_df):
    """Add event-derived columns to user_journey_df, in place.

    Must run after sequence_preprocess(user_journey_df), which creates the
    Page_Event_List column this function reads.

    :param user_journey_df: dataframe to enrich in place.
    :return: None; columns are added in place.
    """
    logger.info("Preprocess and aggregate events...")
    logger.debug("Page_Event_List to Event_List...")

    def _events_only(page_event_list):
        # Index 1 selects the event component of each (page, event) pair.
        return prep.extract_pe_components(page_event_list, 1)

    user_journey_df['Event_List'] = (
        user_journey_df['Page_Event_List'].map(_events_only))
    logger.debug("Computing event-related counts and frequencies...")
    event_counters(user_journey_df)
def sequence_preprocess(user_journey_df):
    """Derive page-level columns from raw BigQuery Sequence strings, in place.

    Builds Page_Event_List from Sequence, Page_List from Page_Event_List,
    and a ">>"-joined page sequence (required for dataframe
    groupbys/filtering).

    :param user_journey_df: dataframe to enrich in place.
    :return: None; columns are added in place.
    """
    logger.info("BQ Sequence string to Page_Event_List...")
    user_journey_df['Page_Event_List'] = (
        user_journey_df['Sequence'].map(prep.bq_journey_to_pe_list))
    logger.info("Page_Event_List to Page_List...")
    user_journey_df['Page_List'] = user_journey_df['Page_Event_List'].map(
        lambda pe_list: prep.extract_pe_components(pe_list, 0))
    logger.info("Page_List to PageSequence...")

    def _join_pages(page_list):
        return ">>".join(page_list)

    # TODO: Remove condition + internal PageSequence post-testing/debugging.
    if 'PageSequence' in user_journey_df.columns:
        target_column = 'PageSequence_internal'
    else:
        target_column = 'PageSequence'
    user_journey_df[target_column] = (
        user_journey_df['Page_List'].map(_join_pages))
def read_write_file(input_path, output_path, number_lines):
    """Stream a gzipped TSV of BigQuery journey rows, appending derived columns.

    Reads the header to locate the "Sequence" column, then for every data row
    writes the original cells (quoted) followed by derived page, event, taxon
    and de-looped columns computed via `prep` and `feat`.

    :param input_path: path to a gzipped, tab-separated input file whose first
        line is a header containing a "Sequence" column.
    :param output_path: path of the gzipped, tab-separated output file.
    :param number_lines: expected number of data rows; used only for logging
        when the final expected row is reached.
    :return: no return; writes the transformed file as a side effect.
    """
    with gzip.open(output_path, "w") as write_file:
        with gzip.open(input_path, "r") as read_file:
            df_columns = read_file.readline().decode().replace(
                "\n", "").split("\t")
            print(df_columns)
            # Work on a copy: the original mutated the module-level
            # OTHER_COLUMNS via .insert(), growing the global on every call.
            extra_columns = list(OTHER_COLUMNS)
            if 'PageSequence' not in df_columns:
                extra_columns.insert(2, 'PageSequence')
            sequence_index = df_columns.index("Sequence")
            logging.info("Write headers...")
            all_cols = df_columns + extra_columns
            print(all_cols)
            write_to_file = "\t".join(all_cols) + "\n"
            logging.info("Iteration...")
            for i, line in enumerate(read_file):
                row = line.decode().replace("\n", "").split("\t")
                # str.split always yields str, so every original cell is
                # quoted (the original's isinstance branch was dead code).
                for element in row:
                    write_to_file += "\"" + element + "\"" + "\t"
                sequence = row[sequence_index]
                # Derived page columns.
                page_event_list = prep.bq_journey_to_pe_list(sequence)
                write_to_file += "\"" + str(page_event_list) + "\"" + "\t"
                page_list = prep.extract_pe_components(page_event_list, 0)
                write_to_file += "\"" + str(page_list) + "\"" + "\t"
                if 'PageSequence' not in df_columns:
                    write_to_file += "\"" + ">>".join(page_list) + "\"" + "\t"
                # Derived event columns.
                event_list = prep.extract_pe_components(page_event_list, 1)
                write_to_file += "\"" + str(event_list) + "\"" + "\t"
                write_to_file += "\"" + str(
                    feat.count_event_cat(event_list)) + "\"" + "\t"
                write_to_file += "\"" + str(
                    feat.aggregate_event_cat(event_list)) + "\"" + "\t"
                write_to_file += "\"" + str(
                    feat.aggregate_event_cat_act(event_list)) + "\"" + "\t"
                # Taxon columns.
                write_to_file += "\"" + str(
                    prep.extract_cd_components(page_event_list, 2)) + "\"" + "\t"
                write_to_file += "\"" + str(
                    prep.extract_pcd_list(page_event_list, 2)) + "\"" + "\t"
                # De-looped page columns.
                de_looped = prep.collapse_loop(page_list)
                write_to_file += "\"" + str(de_looped) + "\"" + "\t"
                write_to_file += "\"" + ">>".join(de_looped) + "\"" + "\n"
                # Flush the buffer in large batches to bound memory use.
                if i % 500000 == 0:
                    logging.info("At index: {}".format(i))
                    write_file.write(write_to_file.encode())
                    write_to_file = ""
                    write_file.flush()
                if i == number_lines - 1:
                    logging.info("At index via last: {}".format(i))
            # Always flush the remaining buffer. The original only wrote the
            # tail when i == number_lines - 1, silently dropping buffered rows
            # whenever the file was shorter (or longer) than number_lines.
            if write_to_file:
                write_file.write(write_to_file.encode())
                write_file.flush()