Example #1
def train_deepdb(seed, dataset, version, workload, params, sizelimit):
    L.info(f"params: {params}")
    args = Args(**params)

    # for sampling
    np.random.seed(seed)

    table = load_table(dataset, version)
    # load validation queries and labels
    valid_queries = load_queryset(dataset,
                                  workload)['valid'][:VALID_NUM_DATA_DRIVEN]
    labels = load_labels(dataset, version,
                         workload)['valid'][:VALID_NUM_DATA_DRIVEN]

    schema = construct_schema(table)

    # convert data from csv to hdf
    hdf_path = DATA_ROOT / dataset / 'deepdb' / f"hdf-{version}"
    if hdf_path.is_dir():
        L.info('Use existing hdf file!')
    else:
        hdf_path.mkdir(parents=True)
        prepare_all_tables(schema,
                           str(hdf_path),
                           csv_seperator=',',
                           max_table_data=args.max_rows_per_hdf_file)

    # generate SPN for table
    prep = JoinDataPreparator(hdf_path / 'meta_data.pkl',
                              schema,
                              max_table_data=args.max_rows_per_hdf_file)
    spn_ensemble = SPNEnsemble(schema)
    table_obj = schema.tables[0]
    L.info(f"table name: {table_obj.table_name}")
    df_samples, meta_types, null_values, full_join_est = prep.generate_n_samples(
        args.hdf_sample_size,
        single_table=table_obj.table_name,
        post_sampling_factor=1.0)
    assert len(df_samples) == min(args.hdf_sample_size,
                                  table.row_num), '{} != min({}, {})'.format(
                                      len(df_samples), args.hdf_sample_size,
                                      table.row_num)

    model_path = MODEL_ROOT / table.dataset
    model_path.mkdir(parents=True, exist_ok=True)
    model_file = model_path / f"{table.version}-spn_sample{len(df_samples)}_rdc{args.rdc_threshold}_ms{args.ratio_min_instance_slice}-{seed}.pkl"

    # learn spn
    L.info(f"Start learning SPN for {table_obj.table_name}.")
    start_stmp = time.time()
    aqp_spn = AQPSPN(meta_types,
                     null_values,
                     full_join_est,
                     schema,
                     relationship_list=None,
                     full_sample_size=len(df_samples),
                     table_set={table_obj.table_name},
                     column_names=list(df_samples.columns),
                     table_meta_data=prep.table_meta_data)
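    # slices smaller than this many rows are not split further during SPN learning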
    min_instance_slice = args.ratio_min_instance_slice * len(df_samples)
    aqp_spn.learn(df_samples.values,
                  min_instances_slice=min_instance_slice,
                  bloom_filters=False,
                  rdc_threshold=args.rdc_threshold)
    spn_ensemble.add_spn(aqp_spn)
    dur_min = (time.time() - start_stmp) / 60

    mb = get_deepdb_size(spn_ensemble)
    L.info(
        f"SPN build finished, time spent since start: {dur_min:.1f} mins, model size: {mb:.2f}MB"
    )
    L.info(f'Final SPN: {get_structure_stats_dict(spn_ensemble.spns[0].mspn)}')

    if sizelimit > 0 and mb > (sizelimit * table.data_size_mb):
        L.info(
            f"Exceeds size limit {mb:.2f}MB > {sizelimit} x {table.data_size_mb}, do not conintue!"
        )
        return

    L.info(f"Evaluating on valid set with {VALID_NUM_DATA_DRIVEN} queries...")
    estimator = DeepDB(spn_ensemble, table, schema, 'valid')
    preds = []
    for q in valid_queries:
        est_card, _ = estimator.query(q)
        preds.append(est_card)
    _, metrics = evaluate(preds, [l.cardinality for l in labels])

    spn_ensemble.state = {
        'train_time': dur_min,
        'model_size': mb,
        'args': args,
        'device': 'cpu',
        'threads': NUM_THREADS,
        'dataset': table.dataset,
        'version': table.version,
        'valid_error': {
            workload: metrics
        }
    }

    # save spn to file
    spn_ensemble.save(model_file)
    L.info(
        f'Training finished! Saved model to {model_file}. Time spent since start: {dur_min:.2f} mins'
    )
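
A minimal invocation sketch for Example #1. The keys in `params` mirror the `args.*` attributes read inside `train_deepdb`; the dataset, version, and workload names and all numeric values below are hypothetical placeholders, and `sizelimit=0` skips the model-size check (the function only enforces it when `sizelimit > 0`).

params = {
    'hdf_sample_size': 1_000_000,         # rows sampled for SPN learning (assumed value)
    'max_rows_per_hdf_file': 20_000_000,  # HDF chunking limit (assumed value)
    'rdc_threshold': 0.3,                 # RDC independence threshold (assumed value)
    'ratio_min_instance_slice': 0.01,     # min slice size as a fraction of the sample (assumed value)
}
train_deepdb(seed=123,
             dataset='census',            # hypothetical dataset name
             version='original',          # hypothetical version tag
             workload='base',             # hypothetical workload name
             params=params,
             sizelimit=0)                 # 0 disables the size-limit check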
Example #2
    # Generate HDF files for simpler sampling
    if args.generate_hdf:
        logger.info(
            f"Generating HDF files for tables in {args.csv_path} and store to path {args.hdf_path}"
        )

        if os.path.exists(args.hdf_path):
            logger.info(f"Removing target path {args.hdf_path}")
            shutil.rmtree(args.hdf_path)

        logger.info(f"Making target path {args.hdf_path}")
        os.makedirs(args.hdf_path)

        prepare_all_tables(schema,
                           args.hdf_path,
                           csv_seperator=args.csv_seperator,
                           max_table_data=args.max_rows_per_hdf_file)
        logger.info(f"Files successfully created")

    # Generate sampled HDF files for fast join calculations
    if args.generate_sampled_hdfs:
        logger.info(
            f"Generating sampled HDF files for tables in {args.csv_path} and store to path {args.hdf_path}"
        )
        prepare_sample_hdf(schema, args.hdf_path, args.max_rows_per_hdf_file,
                           args.hdf_sample_size, version)  # `version` passed as an extra parameter
        logger.info("Files successfully created")
        # Pretime

    # Generate ensemble for cardinality schemas
    if args.generate_ensemble:
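
Example #2 is a fragment of a CLI-style preparation script and assumes an `args` namespace built elsewhere. Below is a hypothetical `argparse` sketch that would supply the attributes the fragment reads; the flag names simply follow those attributes, and every default value is an assumption.

import argparse

parser = argparse.ArgumentParser(description='DeepDB data preparation (sketch)')
parser.add_argument('--generate_hdf', action='store_true')
parser.add_argument('--generate_sampled_hdfs', action='store_true')
parser.add_argument('--generate_ensemble', action='store_true')
parser.add_argument('--csv_path', default='../data/csv')    # assumed default
parser.add_argument('--csv_seperator', default=',')         # spelling follows the attribute used above
parser.add_argument('--hdf_path', default='../data/hdf')    # assumed default
parser.add_argument('--max_rows_per_hdf_file', type=int, default=20000000)
parser.add_argument('--hdf_sample_size', type=int, default=1000000)
args = parser.parse_args()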
Example #3
def update_deepdb(seed: int, dataset: str, new_version: str, workload: str,
                  params: Dict[str, Any], overwrite: bool) -> None:
    # for sampling
    np.random.seed(seed)
    # load old model
    new_table = load_table(dataset, new_version)
    model_path = MODEL_ROOT / new_table.dataset
    model_file = model_path / f"{params['model']}.pkl"
    L.info(f"load model from {model_file} ...")
    estimator, state = load_deepdb(dataset, params['model'])
    spn_ensemble = estimator.spn_ensemble

    old_version = state['version']
    args = state['args']
    old_table = load_table(dataset, old_version)
    # load updated data and save to csv
    updated_dataset = load_table(dataset, new_version)
    # keep only the rows appended after the old version and take a 1% sample of them
    updated_dataset.data = updated_dataset.data.iloc[len(old_table.data):].sample(
        frac=0.01)
    updated_dataset.data = updated_dataset.data.reset_index(drop=True)
    updated_dataset.version += '_cut'
    updated_dataset.name += '_cut'
    updated_dataset.data.to_csv(DATA_ROOT / dataset /
                                f"{updated_dataset.version}.csv",
                                index=False)
    updated_dataset.row_num = len(updated_dataset.data)

    L.info(f"Updated size {updated_dataset.row_num}")

    # load validation queries and labels
    valid_queries = load_queryset(dataset,
                                  workload)['valid'][:VALID_NUM_DATA_DRIVEN]
    labels = load_labels(dataset, new_version,
                         workload)['valid'][:VALID_NUM_DATA_DRIVEN]

    schema = construct_schema(updated_dataset)
    L.info(f"{schema}")
    # convert data from csv to hdf
    hdf_path = DATA_ROOT / dataset / 'deepdb' / f"hdf-{updated_dataset.version}"
    if hdf_path.is_dir():
        L.info('Use existing hdf file!')
    else:
        hdf_path.mkdir(parents=True)
    prepare_all_tables(schema,
                       str(hdf_path),
                       csv_seperator=',',
                       max_table_data=args.max_rows_per_hdf_file)

    # generate SPN for table
    prep = JoinDataPreparator(hdf_path / 'meta_data.pkl',
                              schema,
                              max_table_data=args.max_rows_per_hdf_file)
    table_obj = schema.tables[0]
    L.info(f"table name: {table_obj.table_name}")
    L.info(f"table attributes: {schema.tables[0].attributes}")
    df_samples, meta_types, null_values, full_join_est = prep.generate_n_samples(
        args.hdf_sample_size,
        single_table=table_obj.table_name,
        post_sampling_factor=1.0)
    # assert len(df_samples) == min(args.hdf_sample_size, old_table.row_num), '{} != min({}, {})'.format(len(df_samples), args.hdf_sample_size, old_table.row_num)

    # Update model
    L.info(f"Start learning SPN for {table_obj.table_name}.")
    start_stmp = time.time()
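    # incrementally fold the new sample rows into the existing SPN instead of re-learning it from scratch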
    spn_ensemble.spns[0].learn_incremental(df_samples.to_numpy())
    dur_min = (time.time() - start_stmp) / 60

    L.info(f"SPN update finished, time spent since start: {dur_min:.4f} mins")
    L.info(f'Final SPN: {get_structure_stats_dict(spn_ensemble.spns[0].mspn)}')

    # L.info(f"Evaluating on valid set with {VALID_NUM_DATA_DRIVEN} queries...")
    # estimator = DeepDB(spn_ensemble, new_table, schema, 'valid')
    # preds = []
    # for q in valid_queries:
    #     est_card, _ = estimator.query(q)
    #     preds.append(est_card)
    # _, metrics = evaluate(preds, [l.cardinality for l in labels])

    spn_ensemble.state['update_time'] = dur_min
    # save spn to file
    sample_size = min(args.hdf_sample_size, old_table.row_num)
    new_model_file = model_path / f"{new_table.version}-spn_sample{sample_size}_rdc{args.rdc_threshold}_ms{args.ratio_min_instance_slice}-{seed}.pkl"

    spn_ensemble.save(new_model_file)
    L.info(
        f'Update finished! Saved model to {new_model_file}. Time spent since start: {dur_min:.4f} mins'
    )
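
A minimal invocation sketch for Example #3. `params['model']` must be the file stem of a model previously saved by `train_deepdb` under `MODEL_ROOT/<dataset>`; the name below only illustrates the naming pattern produced at the end of Example #1, and the dataset, version, and workload names are hypothetical. Note that `overwrite` is accepted but not used in the body shown.

update_deepdb(seed=123,
              dataset='census',               # hypothetical dataset name
              new_version='original+update',  # hypothetical updated version tag
              workload='base',                # hypothetical workload name
              params={'model': 'original-spn_sample1000000_rdc0.3_ms0.01-123'},  # stem of a saved .pkl
              overwrite=False)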