Example #1
def main():
    config = experiments.JOB_FULL
    join_spec = join_utils.get_join_spec(config)
    prepare_utils.prepare(join_spec)
    loaded_tables = []
    for t in join_spec.join_tables:
        print('Loading', t)
        table = datasets.LoadImdb(t, use_cols=config["use_cols"])
        table.data.info()
        loaded_tables.append(table)

    t_start = time.time()
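    # Sampler dataset that yields tuples drawn from the join described by join_spec.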
    join_iter_dataset = FactorizedSamplerIterDataset(
        loaded_tables,
        join_spec,
        sample_batch_size=1000 * 100,
        disambiguate_column_names=True)

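    # Virtual table whose column set is the concatenation of the loaded tables' columns.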
    table = common.ConcatTables(loaded_tables,
                                join_spec.join_keys,
                                sample_from_join_dataset=join_iter_dataset)

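    # Wrap the sampler so that wide columns (and fanout columns, since
    # factorize_fanouts=True) are split into sub-columns of word_size_bits bits.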
    join_iter_dataset = common.FactorizedSampleFromJoinIterDataset(
        join_iter_dataset,
        base_table=table,
        factorize_blacklist=[],
        word_size_bits=10,
        factorize_fanouts=True)
    t_end = time.time()
    log.info(f"> Initialization took {t_end - t_start} seconds.")

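    # Trigger one sampling batch on the inner (wrapped) dataset.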
    join_iter_dataset.join_iter_dataset._sample_batch()
    print('-' * 60)
    print("Done")
Example #2
def test_sampler(join_spec, batch_size=1024 * 16):
    tables = [
        datasets.LoadImdb(t, use_cols="multi") for t in join_spec.join_tables
    ]
    sampler = FairSampler(join_spec, tables, batch_size)
    print("-" * 60)
    print("initialization done")
    print("-" * 60)

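    # time_this is assumed to be a timing decorator defined elsewhere in this module.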
    sample = time_this(sampler.run)()
    print(sample)
    print(sample.columns)
Example #3
    def testSimple(self):
        t = datasets.LoadImdb('title')
        self.assertEqual('production_year', t.columns[-1].name)
        production_year = t.columns[-1]

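        # LoadImdb is expected to place NaN (when present) at index 0 of a column's distinct values.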
        self.assertTrue(pd.isnull(production_year.all_distinct_values[0]))
        min_val = production_year.all_distinct_values[1]
        # 'RuntimeWarning: invalid value encountered in less' expected.
        s = (production_year.all_distinct_values < min_val).sum()
        self.assertEqual(0, s, 'np.nan should not be considered as < value')

        s = (production_year.all_distinct_values == np.nan).sum()
        self.assertEqual(0, s, 'np.nan not == np.nan either')
Example #4
def generate_title_movie_companies(p):
    table2alias = {'title': 't', 'movie_companies': 'mc', 'company_name': 'cn'}
    join_tables = ['title', 'movie_companies', 'company_name']
    join_keys = {'title': ['id'], 'movie_companies': ['movie_id', 'company_id'], 'company_name': ['id']}
    join_clauses = {'title': 'title.id=movie_companies.movie_id',
                    'company_name': 'company_name.id=movie_companies.company_id'}
    # all_cols = {
    #         'title': [
    #             'title','kind_id','production_year','id2', 'id'
    #         ],
    #         'movie_companies': [
    #             'company_type_id', 'company_id', 'movie_id'
    #         ],
    #         'company_name': ['name', 'country_code', 'id'],
    #     }

    config = JOB_jintao
    p = p + ['movie_companies']
    key = '_'.join(sorted([table2alias[x] for x in p]))
    join_spec = join_utils.get_join_spec(config)
    prepare_utils.prepare(join_spec)
    loaded_tables = []
    for t in join_spec.join_tables:
        print('Loading', t)
        table = datasets.LoadImdb(t, use_cols=config["use_cols"])
        table.data.info()
        loaded_tables.append(table)
    t_start = time.time()
    join_iter_dataset = FactorizedSamplerIterDataset(
        loaded_tables,
        join_spec,
        sample_batch_size=51000 * 100,
        disambiguate_column_names=True)
    table = common.ConcatTables(loaded_tables,
                                join_spec.join_keys,
                                sample_from_join_dataset=join_iter_dataset)

    join_iter_dataset = common.FactorizedSampleFromJoinIterDataset(
        join_iter_dataset,
        base_table=table,
        factorize_blacklist=[],
        word_size_bits=10,
        factorize_fanouts=True)
    t_end = time.time()
    log.info(f"> Initialization took {t_end - t_start} seconds.")
    print(join_iter_dataset.join_iter_dataset.combined_columns)
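    # Each next() call is assumed to yield one sampled join tuple as a pandas
    # Series; five million of them are stacked column-wise and transposed so
    # that rows correspond to samples.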
    samples = []
    for i in tqdm(range(5000000)):
        samples.append(next(join_iter_dataset.join_iter_dataset))
    df = pd.DataFrame(data=pd.concat(samples, axis=1)).T
    df.to_csv('/home/jintao/{}.csv'.format(key), index=False)
Example #5
def load_or_create_join_count_table(join_clauses, join_how, filename=None):
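    """Load a cached join-count DataFrame from CACHE_DIR, or build and cache it."""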
    if filename is None:
        filename = ",".join(join_clauses + [join_how])
        max_filename_length = 240
        if len(filename) > max_filename_length:
            h = hashlib.sha1(filename.encode()).hexdigest()[:8]
            filename = filename[:max_filename_length] + "_" + h
        filename += ".df"
    save_path = os.path.join(CACHE_DIR, filename)
    try:
        with open(save_path, "rb") as f:
            log.info("Loading join count table from {}".format(save_path))
            df = pickle.load(f)
        assert isinstance(df, pd.DataFrame), type(df)
        return df
    except Exception:
        log.info("Creating the join count table.")
        table_info, parsed_join_clauses = get_table_info(join_clauses)
        log.info("Loading tables {}".format(", ".join(t for t in table_info)))
        data_tables = {
            table: datasets.LoadImdb(table, use_cols=None).data
            for table, cols in table_info.items()
        }

        log.info("Making count tables")
        count_tables = make_count_tables(table_info, data_tables)

        log.info("Joining count tables")
        df = join_count_tables(table_info, parsed_join_clauses, count_tables,
                               join_how)

        log.info("Calculated full join size = {}".format(df["cnt"].sum()))

        log.info("Saving join count table to {}".format(save_path))
        with open(save_path, "wb") as f:
            pickle.dump(df, f, protocol=4)
        return df
Example #6
    def _setup(self, config):
        self.config = config
        print('NeuroCard config:')
        pprint.pprint(config)
        os.chdir(config['cwd'])
        for k, v in config.items():
            setattr(self, k, v)

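        # No GPU assigned to this trial: limit Torch's intra-op threads to the allotted CPUs.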
        if config['__gpu'] == 0:
            torch.set_num_threads(config['__cpu'])

        # W&B.
        # Doing wandb.init() after the os.chdir() above makes sure that the Git
        # diff file (diff.patch) is taken w.r.t. the directory this file lives
        # in, rather than w.r.t. Ray's package dir.
        wandb_project = config['__run']
        wandb.init(name=os.path.basename(
            self.logdir if self.logdir[-1] != '/' else self.logdir[:-1]),
            sync_tensorboard=True,
            config=config,
            project=wandb_project)

        self.epoch = 0

        if isinstance(self.join_tables, int):
            # Hack to support training single-table models.
            sorted_table_names = sorted(
                list(datasets.JoinOrderBenchmark.GetJobLightJoinKeys().keys()))
            self.join_tables = [sorted_table_names[self.join_tables]]

        # Try to make all the runs the same, except for input orderings.
        torch.manual_seed(0)
        np.random.seed(0)

        # Common attributes.
        self.loader = None
        self.join_spec = None
        join_iter_dataset = None
        table_primary_index = None

        # New datasets should be loaded here.
        assert self.dataset in ['imdb']
        if self.dataset == 'imdb':
            print('Training on Join({})'.format(self.join_tables))
            loaded_tables = []
            for t in self.join_tables:
                print('Loading', t)
                table = datasets.LoadImdb(t, use_cols=self.use_cols)
                table.data.info()
                loaded_tables.append(table)
            if len(self.join_tables) > 1:
                join_spec, join_iter_dataset, loader, table = self.MakeSamplerDatasetLoader(
                    loaded_tables)

                self.join_spec = join_spec
                self.train_data = join_iter_dataset
                self.loader = loader

                table_primary_index = [t.name for t in loaded_tables
                                       ].index('title')

                table.cardinality = datasets.JoinOrderBenchmark.GetFullOuterCardinalityOrFail(
                    self.join_tables)
                self.train_data.cardinality = table.cardinality

                print('rows in full join', table.cardinality,
                      'cols in full join', len(table.columns), 'cols:', table)
            else:
                # Train on a single table.
                table = loaded_tables[0]

        if self.dataset != 'imdb' or len(self.join_tables) == 1:
            table.data.info()
            self.train_data = self.MakeTableDataset(table)

        self.table = table
        # Provide true cardinalities in a file or implement an oracle CardEst.
        self.oracle = None
        self.table_bits = 0

        # A fixed ordering?
        self.fixed_ordering = self.MakeOrdering(table)

        model = self.MakeModel(self.table,
                               self.train_data,
                               table_primary_index=table_primary_index)

        # NOTE: ReportModel()'s returned value is the true model size in
        # megabytes, counting all *trainable* parameters.  As an implementation
        # convenience, the checkpoints saved on disk have a slightly bigger
        # footprint because non-trainable constants (the masks in each layer)
        # are saved as well.  They can be deterministically reconstructed from
        # RNG seeds and so should not be counted as model size.
        self.mb = train_utils.ReportModel(model)
        if not isinstance(model, transformer.Transformer):
            print('applying train_utils.weight_init()')
            model.apply(train_utils.weight_init)
        self.model = model

        if self.use_data_parallel:
            self.model = DataParallelPassthrough(self.model)

        wandb.watch(model, log='all')

        if self.use_transformer:
            opt = torch.optim.Adam(
                list(model.parameters()),
                2e-4,
                # betas=(0.9, 0.98),  # B in Lingvo; in Trfmr paper.
                betas=(0.9, 0.997),  # A in Lingvo.
                eps=1e-9,
            )
        else:
            if self.optimizer == 'adam':
                opt = torch.optim.Adam(list(model.parameters()), 2e-4)
            else:
                print('Using Adagrad')
                opt = torch.optim.Adagrad(list(model.parameters()), 2e-4)
        print('Optimizer:', opt)
        self.opt = opt

        total_steps = self.epochs * self.max_steps
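        # self.lr_scheduler starts out as a config string and is replaced below by the
        # actual scheduler object (or by a custom LR lambda for the 'wd_*' form).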
        if self.lr_scheduler == 'CosineAnnealingLR':
            # Starts decaying to 0 immediately.
            self.lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                opt, total_steps)
        elif self.lr_scheduler == 'OneCycleLR':
            # Warms up to max_lr, then decays to ~0.
            self.lr_scheduler = torch.optim.lr_scheduler.OneCycleLR(
                opt, max_lr=2e-3, total_steps=total_steps)
        elif self.lr_scheduler is not None and self.lr_scheduler.startswith(
                'OneCycleLR-'):
            warmup_percentage = float(self.lr_scheduler.split('-')[-1])
            # Warms up to max_lr, then decays to ~0.
            self.lr_scheduler = torch.optim.lr_scheduler.OneCycleLR(
                opt,
                max_lr=2e-3,
                total_steps=total_steps,
                pct_start=warmup_percentage)
        elif self.lr_scheduler is not None and self.lr_scheduler.startswith(
                'wd_'):
            # Warmups and decays.
            splits = self.lr_scheduler.split('_')
            assert len(splits) == 3, splits
            lr, warmup_fraction = float(splits[1]), float(splits[2])
            self.custom_lr_lambda = train_utils.get_cosine_learning_rate_fn(
                total_steps,
                learning_rate=lr,
                min_learning_rate_mult=1e-5,
                constant_fraction=0.,
                warmup_fraction=warmup_fraction)
        else:
            assert self.lr_scheduler is None, self.lr_scheduler

        self.tbx_logger = tune_logger.TBXLogger(self.config, self.logdir)

        if self.checkpoint_to_load:
            self.LoadCheckpoint()

        self.loaded_queries = None
        self.oracle_cards = None
        if self.dataset == 'imdb' and len(self.join_tables) > 1:
            queries_job_format = utils.JobToQuery(self.queries_csv)
            self.loaded_queries, self.oracle_cards = utils.UnpackQueries(
                self.table, queries_job_format)  # Query parsing step; needs to be replaced.
        timepre1 = time.time()
        print('Pretime:\n', "{:.2f}".format(timepre1 - gettimest1()))
        if config['__gpu'] == 0:
            print('CUDA not available, using # cpu cores for intra-op:',
                  torch.get_num_threads(), '; inter-op:',
                  torch.get_num_interop_threads())
Example #7
def main(argv):
    del argv  # Unused.

    # conn = pg.connect(FLAGS.db)
    # conn.set_session(autocommit=True)
    # cursor = conn.cursor()
    cursor = None

    tables = datasets.LoadImdb(use_cols=None)
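    # Called without a table name, LoadImdb returns a dict mapping table name -> Table
    # (all IMDB tables, all columns), which the .items() iteration below relies on.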

    # Load all templates in original JOB-light.
    queries = utils.JobToQuery(FLAGS.tds_light_csv, use_alias_keys=False)
    tables_to_join_keys = {}
    for query in queries:
        key = MakeTablesKey(query[0])
        if key not in tables_to_join_keys:
            join_dict = query[1]
            # Disambiguate: title->id changed to title->title.id.
            for table_name in join_dict.keys():
                # TODO: only a single join key is supported.
                join_key = next(iter(join_dict[table_name]))
                join_dict[table_name] = common.JoinTableAndColumnNames(
                    table_name, join_key, sep='.')
            tables_to_join_keys[key] = join_dict

    num_templates = len(tables_to_join_keys)
    num_queries_per_template = FLAGS.num_queries // num_templates
    logging.info('%d join templates', num_templates)

    rng = np.random.RandomState(1234)
    queries = []  # [(cols, ops, vals)]

    # Disambiguate to not prune away stuff during join sampling.
    for table_name, table in tables.items():
        for col in table.columns:
            col.name = common.JoinTableAndColumnNames(table.name,
                                                      col.name,
                                                      sep='.')
        table.data.columns = [col.name for col in table.columns]

    if FLAGS.print_sel:
        # Print true selectivities.
        df = pd.read_csv(FLAGS.output_csv, sep='#', header=None)
        assert len(df) == FLAGS.num_queries, (len(df), FLAGS.num_queries)

        inner = []
        true_inner_card_cache = {}

        for row in df.iterrows():
            vs = row[1]
            table_names, join_clauses, true_card = vs[0], vs[1], vs[3]
            table_names = table_names.split(',')
            print('Template: {}\tTrue card: {}'.format(table_names, true_card))

            # JOB-light: contains 'full_name alias'.
            # JOB-light-ranges: just 'full_name'.
            if ' ' in table_names[0]:
                table_names = [n.split(' ')[0] for n in table_names]

            tables_in_templates = [tables[n] for n in table_names]
            key = MakeTablesKey(table_names)
            join_keys_list = tables_to_join_keys[key]

            if key not in true_inner_card_cache:
                join_spec = join_utils.get_join_spec({
                    "join_tables":
                    table_names,
                    "join_keys":
                    dict(
                        zip(table_names,
                            [[k.split(".")[1]] for k in join_keys_list])),
                    "join_root":
                    "item",
                    "join_how":
                    "inner",
                })
                ds = FactorizedSamplerIterDataset(
                    tables_in_templates,
                    join_spec,
                    sample_batch_size=FLAGS.num_queries,
                    disambiguate_column_names=False,
                    add_full_join_indicators=False,
                    add_full_join_fanouts=False)
                true_inner_card_cache[key] = ds.sampler.join_card
            inner.append(true_inner_card_cache[key])

        pd.DataFrame({
            'true_cards': df[3],
            'true_inner': inner,
            'inner_sel': df[3] * 1.0 / inner,
            'outer_sel': df[3] * 1.0 / TDS_LIGHT_OUTER_CARDINALITY
        }).to_csv(FLAGS.output_csv + '.sel', index=False)
        print('Done:', FLAGS.output_csv + '.sel')

    else:
        # Generate queries.
        last_run_queries = file_len(FLAGS.output_csv) if os.path.exists(
            FLAGS.output_csv) else 0
        next_template_idx = last_run_queries // num_queries_per_template
        print('next_template_idx', next_template_idx)
        print(tables_to_join_keys.items())

        spark = StartSpark()
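        # StartSpark and MakeQueries are helpers defined elsewhere in this script;
        # MakeQueries presumably uses the Spark session (and the optional DB cursor)
        # to evaluate the generated predicates.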
        for i, (tables_to_join,
                join_keys) in enumerate(tables_to_join_keys.items()):

            if i < next_template_idx:
                print('Skipping template:', tables_to_join)
                continue
            print('Template:', tables_to_join)

            if i == num_templates - 1:
                num_queries_per_template += FLAGS.num_queries % num_templates

            # Generate num_queries_per_template.
            table_names = tables_to_join.split('-')

            tables_in_templates = [tables[n] for n in table_names]

            queries.extend(
                MakeQueries(spark, cursor, num_queries_per_template,
                            tables_in_templates, table_names, join_keys, rng))
Example #8
def main():
    table2alias = {'title': 't', 'cast_info': 'ci', 'movie_companies': 'mc', 'movie_info': 'mi',
                   'movie_info_idx': 'mi_idx', 'movie_keyword': 'mk'}
    join_tables = ['title', 'cast_info', 'movie_companies', 'movie_info', 'movie_info_idx', 'movie_keyword']
    join_keys = {'title': ['id'], 'cast_info': ['movie_id'], 'movie_companies': ['movie_id'],
                 'movie_info': ['movie_id'], 'movie_info_idx': ['movie_id'], 'movie_keyword': ['movie_id']}
    join_clauses = {'cast_info': 'title.id=cast_info.movie_id', 'movie_companies': 'title.id=movie_companies.movie_id',
                    'movie_info': 'title.id=movie_info.movie_id', 'movie_info_idx': 'title.id=movie_info_idx.movie_id',
                    'movie_keyword': 'title.id=movie_keyword.movie_id'}
    all_cols = {
        'title': [
            'kind_id', 'production_year', 'episode_nr', 'imdb_index', 'phonetic_code', 'season_nr', 'series_years'
        ],
        'cast_info': [
            'nr_order', 'role_id'
        ],
        'movie_companies': [
            'company_type_id'
        ],
        'movie_info_idx': ['info_type_id'],
        'movie_info': ['info_type_id'],
        'movie_keyword': ['keyword_id']
    }

    tables = ['cast_info', 'movie_companies', 'movie_info', 'movie_info_idx', 'movie_keyword']
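    # Enumerate every non-empty subset of up to five dimension tables, join each
    # subset with 'title', and export 1M sampled tuples per join to CSV.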
    for num in range(1, 6):
        for p in combinations(tables, num):
            config = JOB_MY
            config['join_clauses'] = []
            p = [x for x in p]
            for t in p:
                config['join_clauses'].append(join_clauses[t])
            p = p + ['title']
            key = '_'.join(sorted([table2alias[x] for x in p]))
            config['join_tables'] = p
            config['join_keys'] = {}
            for t in p:
                config['join_keys'][t] = join_keys[t]
            col_num = 0
            for t in p:
                col_num += len(all_cols[t])
            join_spec = join_utils.get_join_spec(config)
            prepare_utils.prepare(join_spec)
            loaded_tables = []
            for t in join_spec.join_tables:
                print('Loading', t)
                table = datasets.LoadImdb(t, use_cols=config["use_cols"])
                table.data.info()
                loaded_tables.append(table)

            t_start = time.time()
            join_iter_dataset = FactorizedSamplerIterDataset(
                loaded_tables,
                join_spec,
                sample_batch_size=1000 * 100,
                disambiguate_column_names=True)

            table = common.ConcatTables(loaded_tables,
                                        join_spec.join_keys,
                                        sample_from_join_dataset=join_iter_dataset)

            join_iter_dataset = common.FactorizedSampleFromJoinIterDataset(
                join_iter_dataset,
                base_table=table,
                factorize_blacklist=[],
                word_size_bits=10,
                factorize_fanouts=True)
            t_end = time.time()
            log.info(f"> Initialization took {t_end - t_start} seconds.")
            print(join_iter_dataset.join_iter_dataset.combined_columns)
            samples = []
            for i in tqdm(range(1000000)):
                samples.append(next(join_iter_dataset.join_iter_dataset))
            df = pd.DataFrame(data=pd.concat(samples, axis=1)).T.iloc[:, :col_num]
            df.to_csv('../train-test-data/join_samples/{}.csv'.format(key), index=False)
            # join_iter_dataset.join_iter_dataset._sample_batch()
            print('-' * 60)
            print("Done {}".format(key))