Exemplo n.º 1
0
 def MakeProgressiveSamplers(self,
                             model,
                             train_data,
                             do_fanout_scaling=False):
     estimators = []
     dropout = self.dropout or self.per_row_dropout
     for n in self.eval_psamples:
         if self.factorize:
             estimators.append(
                 estimators_lib.FactorizedProgressiveSampling(
                     model,
                     train_data,
                     n,
                     self.join_spec,
                     device=train_utils.get_device(),
                     shortcircuit=dropout,
                     do_fanout_scaling=do_fanout_scaling))
         else:
             estimators.append(
                 estimators_lib.ProgressiveSampling(
                     model,
                     train_data,
                     n,
                     self.join_spec,
                     device=train_utils.get_device(),
                     shortcircuit=dropout,
                     do_fanout_scaling=do_fanout_scaling))
     return estimators
def Main():
    os.environ['CUDA_VISIBLE_DEVICES'] = '1'
    all_ckpts = glob.glob('./models/{}'.format(args.glob))
    all_ckpts = glob.glob('./models/' + str(args.dataset) + '*.pt')
    print(args.blacklist)
    if args.blacklist:
        all_ckpts = [ckpt for ckpt in all_ckpts if args.blacklist not in ckpt]
    print(all_ckpts)
    selected_ckpts = all_ckpts
    oracle_cards = LoadOracleCardinalities()
    print('ckpts', selected_ckpts)
    os.environ['CUDA_VISIBLE_DEVICES'] = '1'
    if not args.run_bn:
        # OK to load tables now
        table, train_data, oracle_est = MakeTable()
        cols_to_train = table.columns

    Ckpt = collections.namedtuple(
        'Ckpt', 'epoch model_bits bits_gap path loaded_model seed')
    parsed_ckpts = []

    for s in selected_ckpts:
        if args.order is None:
            z = re.match('.+model([\d\.]+)-data([\d\.]+).+seed([\d\.]+).*.pt',
                         s)
        else:
            z = re.match(
                '.+model([\d\.]+)-data([\d\.]+).+seed([\d\.]+)-order.*.pt', s)
        assert z
        model_bits = float(z.group(1))
        data_bits = float(z.group(2))
        seed = int(z.group(3))
        print('model_bits:', model_bits)
        bits_gap = model_bits - data_bits

        order = None
        if args.order is not None:
            order = list(args.order)

        if args.heads > 0:
            model = MakeTransformer(cols_to_train=table.columns,
                                    fixed_ordering=order,
                                    seed=seed)
        else:
            if args.dataset in ['dmv-tiny', 'dmv', 'forest', 'power']:
                model = MakeMade(
                    scale=args.fc_hiddens,
                    cols_to_train=table.columns,
                    seed=seed,
                    fixed_ordering=order,
                )
            else:
                assert False, args.dataset

        assert order is None or len(order) == model.nin, order
        ReportModel(model)
        print('Loading ckpt:', s)
        model.load_state_dict(torch.load(s))
        model.eval()

        print(s, bits_gap, seed)

        parsed_ckpts.append(
            Ckpt(path=s,
                 epoch=None,
                 model_bits=model_bits,
                 bits_gap=bits_gap,
                 loaded_model=model,
                 seed=seed))

    # Estimators to run.
    if args.run_bn:
        estimators = RunNParallel(estimator_factory=MakeBnEstimators,
                                  parallelism=50,
                                  rng=np.random.RandomState(1234),
                                  num=args.num_queries,
                                  num_filters=None,
                                  oracle_cards=oracle_cards)
    else:
        print(parsed_ckpts)
        for c in parsed_ckpts:
            print(c.loaded_model, table, args.psample, args.column_masking)
        estimators = [
            estimators_lib.ProgressiveSampling(
                c.loaded_model,
                table,
                args.psample,
                device=DEVICE,
                shortcircuit=args.column_masking) for c in parsed_ckpts
        ]
        print(estimators)
        for est, ckpt in zip(estimators, parsed_ckpts):
            est.name = str(est) + '_{}_{:.3f}'.format(ckpt.seed, ckpt.bits_gap)

        if args.inference_opts:
            print('Tracing forward_with_encoded_input()...')
            for est in estimators:
                encoded_input = est.model.EncodeInput(
                    torch.zeros(args.psample, est.model.nin, device=DEVICE))

                # NOTE: this line works with torch 1.0.1.post2 (but not 1.2).
                # The 1.2 version changes the API to
                # torch.jit.script(est.model) and requires an annotation --
                # which was found to be slower.
                est.traced_fwd = torch.jit.trace(
                    est.model.forward_with_encoded_input, encoded_input)

        if args.run_sampling:
            SAMPLE_RATIO = {'dmv': [0.0013]}  # ~1.3MB.
            for p in SAMPLE_RATIO.get(args.dataset, [0.01]):
                estimators.append(estimators_lib.Sampling(table, p=p))

        if args.run_maxdiff:
            estimators.append(
                estimators_lib.MaxDiffHistogram(table, args.maxdiff_limit))

        # Other estimators can be appended as well.

        if len(estimators):
            RunN(table,
                 cols_to_train,
                 estimators,
                 rng=np.random.RandomState(1234),
                 num=args.num_queries,
                 log_every=1,
                 num_filters=None,
                 oracle_cards=oracle_cards,
                 oracle_est=oracle_est)
    print(len(estimators))
    SaveEstimators(args.err_csv, estimators)
    print('...Done, result:', args.err_csv)
Exemplo n.º 3
0
def Main():
    all_ckpts = glob.glob('./models/{}'.format(args.glob))
    if args.blacklist:
        all_ckpts = [ckpt for ckpt in all_ckpts if args.blacklist not in ckpt]

    selected_ckpts = all_ckpts
    oracle_cards = LoadOracleCardinalities()
    print('ckpts', selected_ckpts)

    if not args.run_bn:
        # OK to load tables now
        table, train_data, oracle_est = MakeTable()
        cols_to_train = table.columns

    Ckpt = collections.namedtuple(
        'Ckpt', 'epoch model_bits bits_gap path loaded_model seed')
    parsed_ckpts = []

    for s in selected_ckpts:
        if args.order is None:
            z = re.match('.+model([\d\.]+)-data([\d\.]+).+seed([\d\.]+).*.pt',
                         s)
        else:
            z = re.match(
                '.+model([\d\.]+)-data([\d\.]+).+seed([\d\.]+)-order.*.pt', s)
        assert z
        model_bits = float(z.group(1))
        data_bits = float(z.group(2))
        seed = int(z.group(3))
        bits_gap = model_bits - data_bits

        order = None
        if args.order is not None:
            order = list(args.order)

        if args.heads > 0:
            model = MakeTransformer(cols_to_train=table.columns,
                                    fixed_ordering=order,
                                    seed=seed)
        else:
            if args.dataset in ['dmv-tiny', 'dmv']:
                model = MakeMade(
                    scale=args.fc_hiddens,
                    cols_to_train=table.columns,
                    seed=seed,
                    fixed_ordering=order,
                )
            else:
                assert False, args.dataset

        assert order is None or len(order) == model.nin, order
        ReportModel(model)
        print('Loading ckpt:', s)
        model.load_state_dict(torch.load(s))
        model.eval()

        print(s, bits_gap, seed)

        parsed_ckpts.append(
            Ckpt(path=s,
                 epoch=None,
                 model_bits=model_bits,
                 bits_gap=bits_gap,
                 loaded_model=model,
                 seed=seed))

    # Estimators to run.
    if args.run_bn:
        ests, truths, training_time, test_time, estimators = RunNParallelNew(estimator_factory=MakeBnEstimatorsNew,
                                                                             parallelism=50,
                                                                             rng=np.random.RandomState(1234),
                                                                             num=args.num_queries,
                                                                             num_filters=None,
                                                                             oracle_cards=oracle_cards,
                                                                             test_file_path=args.test_file_path,
                                                                             single_data_path=args.single_data_path,
                                                                             join_data_path=args.join_data_path,
                                                                             join_num_path=args.join_num_path,
                                                                             join_sample_size=args.join_sample_size)
        # with open(args.test_file_path+'.bayesian.model', 'wb') as f:
        # dill.dump(models, f)  #pickle.dump(models)
        ests_new = []
        truths_new = []
        for e, t in zip(ests, truths):
            if e >= 0:
                ests_new.append(e)
                truths_new.append(t)
        print('Training Time {}s'.format(training_time))
        print('Testing Time Per Query {}ms'.format(test_time))
        print_qerror(np.array(ests_new), np.array(truths_new))
        print_mse(np.array(ests_new), np.array(truths_new))
        print_mape(np.array(ests_new), np.array(truths_new))
        print_pearson_correlation(np.array(ests_new), np.array(truths_new))
    else:
        estimators = [
            estimators_lib.ProgressiveSampling(c.loaded_model,
                                               table,
                                               args.psample,
                                               device=DEVICE,
                                               shortcircuit=args.column_masking)
            for c in parsed_ckpts
        ]
        for est, ckpt in zip(estimators, parsed_ckpts):
            est.name = str(est) + '_{}_{:.3f}'.format(ckpt.seed, ckpt.bits_gap)

        if args.inference_opts:
            print('Tracing forward_with_encoded_input()...')
            for est in estimators:
                encoded_input = est.model.EncodeInput(
                    torch.zeros(args.psample, est.model.nin, device=DEVICE))

                # NOTE: this line works with torch 1.0.1.post2 (but not 1.2).
                # The 1.2 version changes the API to
                # torch.jit.script(est.model) and requires an annotation --
                # which was found to be slower.
                est.traced_fwd = torch.jit.trace(
                    est.model.forward_with_encoded_input, encoded_input)

        if args.run_sampling:
            SAMPLE_RATIO = {'dmv': [0.0013]}  # ~1.3MB.
            for p in SAMPLE_RATIO.get(args.dataset, [0.01]):
                estimators.append(estimators_lib.Sampling(table, p=p))

        if args.run_maxdiff:
            estimators.append(
                estimators_lib.MaxDiffHistogram(table, args.maxdiff_limit))

        # Other estimators can be appended as well.

        if len(estimators):
            RunN(table,
                 cols_to_train,
                 estimators,
                 rng=np.random.RandomState(1234),
                 num=args.num_queries,
                 log_every=1,
                 num_filters=None,
                 oracle_cards=oracle_cards,
                 oracle_est=oracle_est)