Example #1
def al_pkdd(dataset_number):
    """
    Load Activation Log number `dataset_number`

    Parameters
    ----------
    dataset_number: int
        index of the Activation Log; must be lower than 100

    Returns
    -------
    al: logs.ActivationLog
        The Activation Log with given index

    Notes
    -----
    Multiple Activation Logs are stored in one file (to limit the number of files); this function retrieves the rows that belong to the requested log.

    """
    assert dataset_number < 100

    df = dataframe_operations \
             .import_df('data/public/PKDD/ActivationLogs2.csv') \
             .iloc[dataset_number * 1000: (dataset_number + 1) * 1000]

    return logs.ActivationLog(df)
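
A minimal usage sketch, assuming the module defining `al_pkdd` (together with `dataframe_operations` and `logs`) is importable and that `data/public/PKDD/ActivationLogs2.csv` indeed holds 100 logs of 1000 rows each, as the slicing above implies:

# Hypothetical usage of al_pkdd; row counts follow the slicing in the function.
al = al_pkdd(2)     # rows 2000..2999 of the combined CSV
print(type(al))     # logs.ActivationLog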
Example #2
def al2dsx(dataset, hierarchy, additional_rp=None):
    """
    Wrapper to adapt the dataset and the hierarchy to the new formulations of Subsection \\ref{sec:compopt:ws2w}

    Parameters
    ----------
    dataset: logs.ActivationLog, str, or Path
        The value of :math:`\mathcal{D'}`. If str or Path, it is first imported as an ActivationLog; the log is then transformed into a bag of :math:`r'` values
    hierarchy: hierarchy.Hierarchy
        Hierarchy object defining the relations between (super-)categories
    additional_rp: Counter
        Additional r_prime values to be added apart from the dataset. These values are not part of visit_ids_per_rp

    Returns
    -------
    dataset: DSX
        D initiated with the adaptation presented in Subsection \\ref{sec:compopt:ws2w}

        - :math:`r'`, :math:`\mathcal{R}'`
        - hierarchy class as given
        - visit_ids_per_rp, reference to the original ids that belong to each rp
    """
    # import the data
    if isinstance(dataset, (str, Path)):
        dataset = logs.ActivationLog(dataset)
    else:
        assert isinstance(dataset, logs.ActivationLog)

    assert isinstance(hierarchy, Hierarchy)

    if additional_rp is None:
        additional_rp = Counter()
    else:
        assert isinstance(additional_rp, Counter)

    df = dataset.df.copy()

    # One bitmask per super-category: which base categories it contains.
    h_vector = [
        bitops.subset2int(hierarchy[k], hierarchy.c) for k in hierarchy.sc
    ]

    def row2rp(row):
        # Pack the 0/1 category indicators of the row into an integer.
        r = int(''.join([str(row[i]) for i in hierarchy.c]), 2)
        # Add one higher-order bit per super-category: set iff the row
        # activates at least one of its member categories.
        return r + (int(
            ''.join([
                str(1 if (hamming_weight(ci & r) > 0) else 0)
                for ci in h_vector
            ]), 2) << hierarchy.num_cat)

    df['rp'] = df.apply(row2rp, axis=1)
    dataset = Counter(df['rp'])

    dataset += additional_rp
    # noinspection PyTypeChecker
    visit_ids_per_rp = {
        v: list(df[df['rp'] == v].index)
        for v in df['rp'].unique()
    }

    return DSX(dataset, hierarchy, visit_ids_per_rp)
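
To make the `row2rp` packing concrete, here is a self-contained sketch of the same bit arithmetic on a toy hierarchy of 4 categories and 2 super-categories. The names (`categories`, `super_masks`) are illustrative, and `(mask & r) != 0` stands in for the `hamming_weight(ci & r) > 0` test above.

categories = ['c0', 'c1', 'c2', 'c3']       # plays the role of hierarchy.c
num_cat = len(categories)
super_masks = [0b1100, 0b0011]              # toy h_vector: {c0, c1} and {c2, c3}

row = {'c0': 1, 'c1': 0, 'c2': 0, 'c3': 1}  # one activation-log row

# Pack the 0/1 category indicators into an integer (first category = highest bit).
r = int(''.join(str(row[c]) for c in categories), 2)    # 0b1001 == 9

# One extra bit per super-category: set iff the row hits any of its members.
super_bits = int(''.join('1' if (m & r) else '0' for m in super_masks), 2)

rp = r + (super_bits << num_cat)            # 0b111001 == 57
print(bin(rp))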
Example #3
def al_apkdd(dataset_number):
    """
    Load Activation Log `dataset_number` (< 100) from the combined PKDD file,
    taking the rows interleaved: within every block of 1000 rows, the 10 rows
    at in-block positions 10 * dataset_number .. 10 * dataset_number + 9.
    """
    assert dataset_number < 100
    sr = pd.Series(
        data=range(100000)).apply(lambda x: (x % 1000) // 10 == dataset_number)
    df = dataframe_operations.import_df(
        'data/public/PKDD/ActivationLogs2.csv')[sr]
    return logs.ActivationLog(df)
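
A quick check of the selector arithmetic used in `al_apkdd` (pandas only; the indices shown assume the 100000-row layout of the combined file):

import pandas as pd

dataset_number = 3
mask = pd.Series(range(100000)).apply(lambda x: (x % 1000) // 10 == dataset_number)
picked = mask[mask].index
print(list(picked[:12]))   # [30, ..., 39, 1030, 1031]: 10 rows per 1000-row block
print(len(picked))         # 1000 rows in total, same size as an al_pkdd log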
Example #4
def al_data(dataset_exponent, dataset_number):
    assert isinstance(dataset_exponent, int)
    assert isinstance(dataset_number, int)

    if dataset_exponent in [3, 4, 5]:
        # These sizes are stored with multiple Activation Logs per file
        assert dataset_number < 10**(7 - dataset_exponent)
        size = 10**dataset_exponent
        number_of_datasets_per_file = 500000 // size
        file_number = dataset_number // number_of_datasets_per_file
        df = dataframe_operations.import_df(
            retailer_folder / f'D{dataset_exponent}' / f'{file_number}.csv')
        dataset_number_in_file = dataset_number % number_of_datasets_per_file
        df = df.iloc[dataset_number_in_file *
                     size:(dataset_number_in_file + 1) * size]
        return logs.ActivationLog(df)
    elif dataset_exponent in [6, 7]:
        assert (dataset_exponent == 6
                and dataset_number < 10) or (dataset_exponent == 7
                                             and dataset_number == 0)
        return logs.ActivationLog(retailer_folder / f'D{dataset_exponent}' /
                                  f'{dataset_number}.csv')
    else:
        raise ValueError('Illegal dataset_exponent')
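
For `dataset_exponent` in {3, 4, 5}, the branch above maps a global `dataset_number` to a (file, row range) pair under the assumption, taken from the code, that each CSV holds 500000 rows. A small standalone sketch of that arithmetic (the helper name `locate` is illustrative):

def locate(dataset_exponent, dataset_number, rows_per_file=500000):
    size = 10 ** dataset_exponent             # rows per Activation Log
    per_file = rows_per_file // size          # logs stored in a single CSV
    file_number = dataset_number // per_file  # which D<exponent>/<n>.csv to open
    first = (dataset_number % per_file) * size
    return file_number, first, first + size   # file and half-open row range

print(locate(4, 123))   # (2, 230000, 240000): file 2, rows 230000..239999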
Example #5
def al_extended_example2():
    return logs.ActivationLog(public_data_folder / 'samples' /
                              'implementation guide' / 'extended_example2.csv',
                              categories=example_cat_list())
Example #6
def al_running_example():
    return logs.ActivationLog(public_data_folder / 'samples' /
                              'implementation guide' / 'running_example.csv',
                              categories=example_cat_list())
Example #7
def al_sample(exp):
    assert exp in range(1, 5), 'given exp must be in 1...4'
    return logs.ActivationLog(public_data_folder / 'samples' / 'real dataset' /
                              f'S{exp}.csv')