Example #1
import preprocess  # assuming the module under test is importable as `preprocess`


def test_extract_pe_components():
    # extract (event category, event action) tuples when parameter i == 1
    assert preprocess.extract_pe_components([('page1', 'eventCategory1<:<eventAction1'),
                                             ('page2', 'eventCategory2<:<eventAction2'),
                                             ('page3', 'eventCategory2<:<eventAction1')], 1) ==\
           [('eventCategory1', 'eventAction1'), ('eventCategory2', 'eventAction2'), ('eventCategory2', 'eventAction1')]
    # should this return an empty list? Potential bug?
    assert preprocess.extract_pe_components(
        [('page1', 'eventCategory1<:<eventAction1'),
         ('page2', 'eventCategory2<:<eventAction2'),
         ('page3', 'eventCategory2<:<eventAction1')],
        i=0) == []
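# A minimal sketch of an implementation consistent with both assertions
# above. The i == 0 filtering rule (keep a page only when its event slot
# carries no '<:<' payload) is an assumption, not taken from the source.
def extract_pe_components_sketch(page_event_list, i):
    components = []
    for page, event in page_event_list:
        if i == 0 and '<:<' not in event:
            # Every event in the test above is populated, so this yields []
            # -- the behaviour the test flags as a potential bug.
            components.append(page)
        elif i == 1:
            components.append(tuple(event.split('<:<')))
    return components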
Example #2
import logging

# `prep` is assumed to be the project's preprocessing module, e.g.
# `import preprocess as prep`.

logger = logging.getLogger(__name__)


def event_preprocess(user_journey_df):
    """
    Bulk-execute event-related functions. Run after
    sequence_preprocess(user_journey_df) so that the Page_Event_List column
    exists.
    :param user_journey_df: dataframe
    :return: no return, columns added in place.
    """
    logger.info("Preprocess and aggregate events...")
    logger.debug("Page_Event_List to Event_List...")
    user_journey_df['Event_List'] = user_journey_df['Page_Event_List'].map(
        lambda x: prep.extract_pe_components(x, 1))
    logger.debug("Computing event-related counts and frequencies...")
    event_counters(user_journey_df)
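# event_counters is not defined in this excerpt. A plausible sketch, assuming
# it maps Event_List through the same feat.count_event_cat /
# feat.aggregate_event_cat / feat.aggregate_event_cat_act helpers that
# read_write_file (Example #3) applies per row; the output column names here
# are assumptions.
def event_counters_sketch(user_journey_df):
    user_journey_df['Event_cats'] = user_journey_df['Event_List'].map(
        feat.count_event_cat)
    user_journey_df['Event_cats_agg'] = user_journey_df['Event_List'].map(
        feat.aggregate_event_cat)
    user_journey_df['Event_cat_act_agg'] = user_journey_df['Event_List'].map(
        feat.aggregate_event_cat_act)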
Example #3
import gzip
import logging

# `prep` and `feat` are assumed to be the project's preprocessing and
# feature-extraction modules; OTHER_COLUMNS is assumed to be the module-level
# list of derived column names that read_write_file appends below.

logger = logging.getLogger(__name__)


def sequence_preprocess(user_journey_df):
    """
    Bulk-execute the main input pre-processing functions: from BigQuery
    journey strings to Page_Event_List to Page_List. PageSequence is required
    for dataframe groupbys/filtering.
    :param user_journey_df: dataframe
    :return: no return, columns added in place.
    """
    logger.info("BQ Sequence string to Page_Event_List...")
    user_journey_df['Page_Event_List'] = user_journey_df['Sequence'].map(
        prep.bq_journey_to_pe_list)
    logger.info("Page_Event_List to Page_List...")
    user_journey_df['Page_List'] = user_journey_df['Page_Event_List'].map(
        lambda x: prep.extract_pe_components(x, 0))
    logger.info("Page_List to PageSequence...")
    # TODO: Remove condition + internal PageSequence post-testing/debugging.
    if 'PageSequence' not in user_journey_df.columns:
        user_journey_df['PageSequence'] = user_journey_df['Page_List'].map(
            lambda x: ">>".join(x))
    else:
        user_journey_df['PageSequence_internal'] = user_journey_df[
            'Page_List'].map(lambda x: ">>".join(x))
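# Usage sketch: call order matters, since event_preprocess reads the
# Page_Event_List column created by sequence_preprocess. The file path is
# hypothetical; the raw journey-string format is whatever
# prep.bq_journey_to_pe_list expects and is not shown here.
import pandas as pd

journeys = pd.read_csv("journeys.tsv.gz", sep="\t", compression="gzip")
sequence_preprocess(journeys)  # adds Page_Event_List, Page_List, PageSequence
event_preprocess(journeys)     # adds Event_List plus the event counters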
def read_write_file(input_path, output_path, number_lines):
    """
    Stream a gzipped tab-separated file of journeys, append the derived
    page/event columns to each row, and write the result to a gzipped
    output file.
    :param input_path: path to the gzipped, tab-separated input file.
    :param output_path: path for the gzipped, tab-separated output file.
    :param number_lines: expected number of data rows; used to trigger the
        final buffer flush.
    :return: no return, writes the output file.
    """
    with gzip.open(output_path, "w") as write_file:
        with gzip.open(input_path, "r") as read_file:
            df_columns = read_file.readline().decode().replace("\n", "").split("\t")
            logging.debug("Input columns: {}".format(df_columns))
            if 'PageSequence' not in df_columns:
                OTHER_COLUMNS.insert(2, 'PageSequence')
            sequence_index = df_columns.index("Sequence")
            logging.info("Write headers...")
            all_cols = df_columns + OTHER_COLUMNS
            logging.debug("Output columns: {}".format(all_cols))
            write_to_file = "\t".join(all_cols) + "\n"
            logging.info("Iteration...")

            for i, line in enumerate(read_file):

                line = line.decode().replace("\n", "")
                row = line.split("\t")

                # str.split always yields strings, so every field is quoted.
                for element in row:
                    write_to_file += "\"" + element + "\"" + "\t"

                sequence = row[sequence_index]
                # Writing sequence columns
                # Page event
                page_event_list = prep.bq_journey_to_pe_list(sequence)
                write_to_file += "\"" + str(page_event_list) + "\"" + "\t"
                # Page list
                page_list = prep.extract_pe_components(page_event_list, 0)
                write_to_file += "\"" + str(page_list) + "\"" + "\t"

                if 'PageSequence' not in df_columns:
                    write_to_file += "\"" + ">>".join(page_list) + "\"" + "\t"

                # Writing events columns
                event_list = prep.extract_pe_components(page_event_list, 1)
                write_to_file += "\"" + str(event_list) + "\"" + "\t"
                write_to_file += "\"" + str(
                    feat.count_event_cat(event_list)) + "\"" + "\t"
                write_to_file += "\"" + str(
                    feat.aggregate_event_cat(event_list)) + "\"" + "\t"
                write_to_file += "\"" + str(
                    feat.aggregate_event_cat_act(event_list)) + "\"" + "\t"

                # Writing taxon_list
                write_to_file += "\"" + str(
                    prep.extract_cd_components(page_event_list,
                                               2)) + "\"" + "\t"
                write_to_file += "\"" + str(
                    prep.extract_pcd_list(page_event_list, 2)) + "\"" + "\t"

                # Writing de-looped page columns
                de_looped = prep.collapse_loop(page_list)
                write_to_file += "\"" + str(de_looped) + "\"" + "\t"
                write_to_file += "\"" + ">>".join(de_looped) + "\""

                write_to_file += "\n"

                # Flush the buffer every 500,000 rows to bound memory use;
                # the first flush (i == 0) also writes the header line.
                if i % 500000 == 0:
                    logging.info("At index: {}".format(i))
                    write_file.write(write_to_file.encode())
                    write_to_file = ""
                    write_file.flush()

                if i == number_lines - 1 and write_to_file != "":
                    logging.info("At index via last: {}".format(i))
                    write_file.write(write_to_file.encode())
                    write_to_file = ""
                    write_file.flush()

            # Safety net: if the file has fewer rows than number_lines, the
            # check above never fires, so flush whatever is still buffered.
            if write_to_file:
                write_file.write(write_to_file.encode())
                write_file.flush()
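# Usage sketch (paths and row count are hypothetical). Rows are buffered into
# a single string and flushed every 500,000 lines, trading memory for fewer
# gzip writes.
logging.basicConfig(level=logging.INFO)
read_write_file("data/journeys_raw.tsv.gz",
                "data/journeys_processed.tsv.gz",
                number_lines=1500000)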