def main():
    config = experiments.JOB_FULL
    join_spec = join_utils.get_join_spec(config)
    prepare_utils.prepare(join_spec)
    loaded_tables = []
    for t in join_spec.join_tables:
        print('Loading', t)
        table = datasets.LoadImdb(t, use_cols=config["use_cols"])
        table.data.info()
        loaded_tables.append(table)

    t_start = time.time()
    join_iter_dataset = FactorizedSamplerIterDataset(
        loaded_tables,
        join_spec,
        sample_batch_size=1000 * 100,
        disambiguate_column_names=True)

    table = common.ConcatTables(loaded_tables,
                                join_spec.join_keys,
                                sample_from_join_dataset=join_iter_dataset)

    join_iter_dataset = common.FactorizedSampleFromJoinIterDataset(
        join_iter_dataset,
        base_table=table,
        factorize_blacklist=[],
        word_size_bits=10,
        factorize_fanouts=True)
    t_end = time.time()
    log.info(f"> Initialization took {t_end - t_start} seconds.")

    join_iter_dataset.join_iter_dataset._sample_batch()
    print('-' * 60)
    print("Done")
def test_sampler(join_spec, batch_size=1024 * 16):
    tables = [
        datasets.LoadImdb(t, use_cols="multi") for t in join_spec.join_tables
    ]
    sampler = FairSampler(join_spec, tables, batch_size)
    print("-" * 60)
    print("initialization done")
    print("-" * 60)
    sample = time_this(sampler.run)()
    print(sample)
    print(sample.columns)
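# Hypothetical usage sketch (not part of the original module): test_sampler
# expects a join spec object; one way to build it, assuming the same config
# objects used elsewhere in this repo, is via join_utils.get_join_spec.
#
#   join_spec = join_utils.get_join_spec(experiments.JOB_FULL)
#   test_sampler(join_spec, batch_size=1024 * 4)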
def testSimple(self):
    t = datasets.LoadImdb('title')
    self.assertEqual('production_year', t.columns[-1].name)
    production_year = t.columns[-1]
    self.assertTrue(pd.isnull(production_year.all_distinct_values[0]))
    min_val = production_year.all_distinct_values[1]
    # 'RuntimeWarning: invalid value encountered in less' expected.
    s = (production_year.all_distinct_values < min_val).sum()
    self.assertEqual(0, s, 'np.nan should not be considered as < value')
    s = (production_year.all_distinct_values == np.nan).sum()
    self.assertEqual(0, s, 'np.nan not == np.nan either')
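# Illustrative sketch (added for clarity; the array values are made up): the
# assertions above rely on NumPy's NaN semantics, where NaN compares False
# against everything, including itself.
#
#   import numpy as np
#   vals = np.array([np.nan, 1880.0, 2019.0])
#   (vals < 1880.0).sum()   # 0 -- NaN is never "less than" a value
#   (vals == np.nan).sum()  # 0 -- NaN != NaN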
def generate_title_movie_companies(p):
    table2alias = {'title': 't', 'movie_companies': 'mc', 'company_name': 'cn'}
    join_tables = ['title', 'movie_companies', 'company_name']
    join_keys = {
        'title': ['id'],
        'movie_companies': ['movie_id', 'company_id'],
        'company_name': ['id']
    }
    join_clauses = {
        'title': 'title.id=movie_companies.movie_id',
        'company_name': 'company_name.id=movie_companies.company_id'
    }
    # all_cols = {
    #     'title': ['title', 'kind_id', 'production_year', 'id2', 'id'],
    #     'movie_companies': ['company_type_id', 'company_id', 'movie_id'],
    #     'company_name': ['name', 'country_code', 'id'],
    # }
    config = JOB_jintao
    p = p + ['movie_companies']
    key = '_'.join(sorted([table2alias[x] for x in p]))

    join_spec = join_utils.get_join_spec(config)
    prepare_utils.prepare(join_spec)
    loaded_tables = []
    for t in join_spec.join_tables:
        print('Loading', t)
        table = datasets.LoadImdb(t, use_cols=config["use_cols"])
        table.data.info()
        loaded_tables.append(table)

    t_start = time.time()
    join_iter_dataset = FactorizedSamplerIterDataset(
        loaded_tables,
        join_spec,
        sample_batch_size=51000 * 100,
        disambiguate_column_names=True)

    table = common.ConcatTables(loaded_tables,
                                join_spec.join_keys,
                                sample_from_join_dataset=join_iter_dataset)

    join_iter_dataset = common.FactorizedSampleFromJoinIterDataset(
        join_iter_dataset,
        base_table=table,
        factorize_blacklist=[],
        word_size_bits=10,
        factorize_fanouts=True)
    t_end = time.time()
    log.info(f"> Initialization took {t_end - t_start} seconds.")

    print(join_iter_dataset.join_iter_dataset.combined_columns)
    samples = []
    for i in tqdm(range(5000000)):
        samples.append(next(join_iter_dataset.join_iter_dataset))
    df = pd.DataFrame(data=pd.concat(samples, axis=1)).T
    df.to_csv('/home/jintao/{}.csv'.format(key), index=False)
def load_or_create_join_count_table(join_clauses, join_how, filename=None):
    if filename is None:
        filename = ",".join(join_clauses + [join_how])
        max_filename_length = 240
        if len(filename) > max_filename_length:
            h = hashlib.sha1(filename.encode()).hexdigest()[:8]
            filename = filename[:max_filename_length] + "_" + h
        filename += ".df"
    save_path = os.path.join(CACHE_DIR, filename)
    try:
        with open(save_path, "rb") as f:
            log.info("Loading join count table from {}".format(save_path))
            df = pickle.load(f)
            assert isinstance(df, pd.DataFrame), type(df)
            return df
    except:
        log.info("Creating the join count table.")
        table_info, parsed_join_clauses = get_table_info(join_clauses)
        log.info("Loading tables {}".format(", ".join(t for t in table_info)))
        data_tables = {
            table: datasets.LoadImdb(table, use_cols=None).data
            for table, cols in table_info.items()
        }
        log.info("Making count tables")
        count_tables = make_count_tables(table_info, data_tables)
        log.info("Joining count tables")
        df = join_count_tables(table_info, parsed_join_clauses, count_tables,
                               join_how)
        log.info("Calculated full join size = {}".format(df["cnt"].sum()))
        log.info("Saved join count table to {}".format(save_path))
        with open(save_path, "wb") as f:
            pickle.dump(df, f, protocol=4)
        return df
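# Hypothetical usage sketch (clause strings taken from elsewhere in this repo,
# the call itself is illustrative): the first call builds the count table and
# pickles it under CACHE_DIR; later calls with the same arguments load the
# cached DataFrame instead of recomputing it.
#
#   df = load_or_create_join_count_table(
#       ['title.id=movie_companies.movie_id',
#        'title.id=movie_keyword.movie_id'],
#       join_how='inner')
#   print(df['cnt'].sum())  # full join size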
def _setup(self, config):
    self.config = config
    print('NeuroCard config:')
    pprint.pprint(config)
    os.chdir(config['cwd'])
    for k, v in config.items():
        setattr(self, k, v)

    if config['__gpu'] == 0:
        torch.set_num_threads(config['__cpu'])

    # W&B.
    # Doing wandb.init() after the os.chdir() above makes sure that the Git
    # diff file (diff.patch) is w.r.t. the directory where this file is in,
    # rather than w.r.t. Ray's package dir.
    wandb_project = config['__run']
    wandb.init(name=os.path.basename(
        self.logdir if self.logdir[-1] != '/' else self.logdir[:-1]),
               sync_tensorboard=True,
               config=config,
               project=wandb_project)

    self.epoch = 0

    if isinstance(self.join_tables, int):
        # Hack to support training single-model tables.
        sorted_table_names = sorted(
            list(datasets.JoinOrderBenchmark.GetJobLightJoinKeys().keys()))
        self.join_tables = [sorted_table_names[self.join_tables]]

    # Try to make all the runs the same, except for input orderings.
    torch.manual_seed(0)
    np.random.seed(0)

    # Common attributes.
    self.loader = None
    self.join_spec = None
    join_iter_dataset = None
    table_primary_index = None

    # New datasets should be loaded here.
    assert self.dataset in ['imdb']
    if self.dataset == 'imdb':
        print('Training on Join({})'.format(self.join_tables))
        loaded_tables = []
        for t in self.join_tables:
            print('Loading', t)
            table = datasets.LoadImdb(t, use_cols=self.use_cols)
            table.data.info()
            loaded_tables.append(table)

        if len(self.join_tables) > 1:
            join_spec, join_iter_dataset, loader, table = self.MakeSamplerDatasetLoader(
                loaded_tables)

            self.join_spec = join_spec
            self.train_data = join_iter_dataset
            self.loader = loader

            table_primary_index = [t.name for t in loaded_tables
                                  ].index('title')

            table.cardinality = datasets.JoinOrderBenchmark.GetFullOuterCardinalityOrFail(
                self.join_tables)
            self.train_data.cardinality = table.cardinality

            print('rows in full join', table.cardinality,
                  'cols in full join', len(table.columns), 'cols:', table)
        else:
            # Train on a single table.
            table = loaded_tables[0]

    if self.dataset != 'imdb' or len(self.join_tables) == 1:
        table.data.info()
        self.train_data = self.MakeTableDataset(table)

    self.table = table
    # Provide true cardinalities in a file or implement an oracle CardEst.
    self.oracle = None
    self.table_bits = 0

    # A fixed ordering?
    self.fixed_ordering = self.MakeOrdering(table)

    model = self.MakeModel(self.table,
                           self.train_data,
                           table_primary_index=table_primary_index)

    # NOTE: ReportModel()'s returned value is the true model size in
    # megabytes, counting all *trainable* parameters.  As an implementation
    # convenience, the saved ckpts on disk have a slightly bigger footprint
    # due to also saving non-trainable constants (the masks in each layer).
    # They can be deterministically reconstructed based on RNG seeds and so
    # should not be counted as model size.
    self.mb = train_utils.ReportModel(model)
    if not isinstance(model, transformer.Transformer):
        print('applying train_utils.weight_init()')
        model.apply(train_utils.weight_init)
    self.model = model

    if self.use_data_parallel:
        self.model = DataParallelPassthrough(self.model)

    wandb.watch(model, log='all')

    if self.use_transformer:
        opt = torch.optim.Adam(
            list(model.parameters()),
            2e-4,
            # betas=(0.9, 0.98),  # B in Lingvo; in Trfmr paper.
            betas=(0.9, 0.997),  # A in Lingvo.
            eps=1e-9,
        )
    else:
        if self.optimizer == 'adam':
            opt = torch.optim.Adam(list(model.parameters()), 2e-4)
        else:
            print('Using Adagrad')
            opt = torch.optim.Adagrad(list(model.parameters()), 2e-4)
    print('Optimizer:', opt)
    self.opt = opt

    total_steps = self.epochs * self.max_steps
    if self.lr_scheduler == 'CosineAnnealingLR':
        # Starts decaying to 0 immediately.
        self.lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            opt, total_steps)
    elif self.lr_scheduler == 'OneCycleLR':
        # Warms up to max_lr, then decays to ~0.
        self.lr_scheduler = torch.optim.lr_scheduler.OneCycleLR(
            opt, max_lr=2e-3, total_steps=total_steps)
    elif self.lr_scheduler is not None and self.lr_scheduler.startswith(
            'OneCycleLR-'):
        warmup_percentage = float(self.lr_scheduler.split('-')[-1])
        # Warms up to max_lr, then decays to ~0.
        self.lr_scheduler = torch.optim.lr_scheduler.OneCycleLR(
            opt,
            max_lr=2e-3,
            total_steps=total_steps,
            pct_start=warmup_percentage)
    elif self.lr_scheduler is not None and self.lr_scheduler.startswith(
            'wd_'):
        # Warms up, then decays.
        splits = self.lr_scheduler.split('_')
        assert len(splits) == 3, splits
        lr, warmup_fraction = float(splits[1]), float(splits[2])
        self.custom_lr_lambda = train_utils.get_cosine_learning_rate_fn(
            total_steps,
            learning_rate=lr,
            min_learning_rate_mult=1e-5,
            constant_fraction=0.,
            warmup_fraction=warmup_fraction)
    else:
        assert self.lr_scheduler is None, self.lr_scheduler

    self.tbx_logger = tune_logger.TBXLogger(self.config, self.logdir)

    if self.checkpoint_to_load:
        self.LoadCheckpoint()

    self.loaded_queries = None
    self.oracle_cards = None
    if self.dataset == 'imdb' and len(self.join_tables) > 1:
        queries_job_format = utils.JobToQuery(self.queries_csv)
        # Parsing step; needs to be replaced.
        self.loaded_queries, self.oracle_cards = utils.UnpackQueries(
            self.table, queries_job_format)

    timepre1 = time.time()
    print('Pretime:\n', "{:.2f}".format(timepre1 - gettimest1()))

    if config['__gpu'] == 0:
        print('CUDA not available, using # cpu cores for intra-op:',
              torch.get_num_threads(), '; inter-op:',
              torch.get_num_interop_threads())
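# Illustrative note on the string-valued lr_scheduler configs parsed above
# (example values are made up, the parsing rules are from the code):
#
#   lr_scheduler='OneCycleLR-0.1' -> OneCycleLR with pct_start=0.1
#   lr_scheduler='wd_2e-4_0.1'    -> cosine schedule with learning_rate=2e-4
#                                    and warmup_fraction=0.1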
def main(argv):
    del argv  # Unused.
    # conn = pg.connect(FLAGS.db)
    # conn.set_session(autocommit=True)
    # cursor = conn.cursor()
    cursor = None

    tables = datasets.LoadImdb(use_cols=None)

    # Load all templates in original JOB-light.
    queries = utils.JobToQuery(FLAGS.tds_light_csv, use_alias_keys=False)
    tables_to_join_keys = {}
    for query in queries:
        key = MakeTablesKey(query[0])
        if key not in tables_to_join_keys:
            join_dict = query[1]
            # Disambiguate: title->id changed to title->title.id.
            for table_name in join_dict.keys():
                # TODO: only a single join key per table is supported.
                join_key = next(iter(join_dict[table_name]))
                join_dict[table_name] = common.JoinTableAndColumnNames(
                    table_name, join_key, sep='.')
            tables_to_join_keys[key] = join_dict

    num_templates = len(tables_to_join_keys)
    num_queries_per_template = FLAGS.num_queries // num_templates
    logging.info('%d join templates', num_templates)

    rng = np.random.RandomState(1234)
    queries = []  # [(cols, ops, vals)]

    # Disambiguate so that nothing gets pruned away during join sampling.
    for table_name, table in tables.items():
        for col in table.columns:
            col.name = common.JoinTableAndColumnNames(table.name,
                                                      col.name,
                                                      sep='.')
        table.data.columns = [col.name for col in table.columns]

    if FLAGS.print_sel:
        # Print true selectivities.
        df = pd.read_csv(FLAGS.output_csv, sep='#', header=None)
        assert len(df) == FLAGS.num_queries, (len(df), FLAGS.num_queries)
        inner = []
        true_inner_card_cache = {}
        for row in df.iterrows():
            vs = row[1]
            table_names, join_clauses, true_card = vs[0], vs[1], vs[3]
            table_names = table_names.split(',')
            print('Template: {}\tTrue card: {}'.format(table_names, true_card))

            # JOB-light: contains 'full_name alias'.
            # JOB-light-ranges: just 'full_name'.
            if ' ' in table_names[0]:
                table_names = [n.split(' ')[0] for n in table_names]
            tables_in_templates = [tables[n] for n in table_names]
            key = MakeTablesKey(table_names)
            join_keys_list = tables_to_join_keys[key]

            if key not in true_inner_card_cache:
                join_spec = join_utils.get_join_spec({
                    "join_tables": table_names,
                    "join_keys": dict(
                        zip(table_names,
                            [[k.split(".")[1]] for k in join_keys_list])),
                    "join_root": "item",
                    "join_how": "inner",
                })
                ds = FactorizedSamplerIterDataset(
                    tables_in_templates,
                    join_spec,
                    sample_batch_size=num_queries,
                    disambiguate_column_names=False,
                    add_full_join_indicators=False,
                    add_full_join_fanouts=False)
                true_inner_card_cache[key] = ds.sampler.join_card
            inner.append(true_inner_card_cache[key])

        pd.DataFrame({
            'true_cards': df[3],
            'true_inner': inner,
            'inner_sel': df[3] * 1.0 / inner,
            'outer_sel': df[3] * 1.0 / TDS_LIGHT_OUTER_CARDINALITY
        }).to_csv(FLAGS.output_csv + '.sel', index=False)
        print('Done:', FLAGS.output_csv + '.sel')
    else:
        # Generate queries.
        last_run_queries = file_len(FLAGS.output_csv) if os.path.exists(
            FLAGS.output_csv) else 0
        next_template_idx = last_run_queries // num_queries_per_template
        print('next_template_idx', next_template_idx)
        print(tables_to_join_keys.items())

        spark = StartSpark()

        for i, (tables_to_join,
                join_keys) in enumerate(tables_to_join_keys.items()):
            if i < next_template_idx:
                print('Skipping template:', tables_to_join)
                continue
            print('Template:', tables_to_join)
            if i == num_templates - 1:
                num_queries_per_template += FLAGS.num_queries % num_templates

            # Generate num_queries_per_template queries for this template.
            table_names = tables_to_join.split('-')
            tables_in_templates = [tables[n] for n in table_names]

            queries.extend(
                MakeQueries(spark, cursor, num_queries_per_template,
                            tables_in_templates, table_names, join_keys, rng))
def main():
    table2alias = {
        'title': 't',
        'cast_info': 'ci',
        'movie_companies': 'mc',
        'movie_info': 'mi',
        'movie_info_idx': 'mi_idx',
        'movie_keyword': 'mk'
    }
    join_tables = [
        'title', 'cast_info', 'movie_companies', 'movie_info',
        'movie_info_idx', 'movie_keyword'
    ]
    join_keys = {
        'title': ['id'],
        'cast_info': ['movie_id'],
        'movie_companies': ['movie_id'],
        'movie_info': ['movie_id'],
        'movie_info_idx': ['movie_id'],
        'movie_keyword': ['movie_id']
    }
    join_clauses = {
        'cast_info': 'title.id=cast_info.movie_id',
        'movie_companies': 'title.id=movie_companies.movie_id',
        'movie_info': 'title.id=movie_info.movie_id',
        'movie_info_idx': 'title.id=movie_info_idx.movie_id',
        'movie_keyword': 'title.id=movie_keyword.movie_id'
    }
    all_cols = {
        'title': [
            'kind_id', 'production_year', 'episode_nr', 'imdb_index',
            'phonetic_code', 'season_nr', 'series_years'
        ],
        'cast_info': ['nr_order', 'role_id'],
        'movie_companies': ['company_type_id'],
        'movie_info_idx': ['info_type_id'],
        'movie_info': ['info_type_id'],
        'movie_keyword': ['keyword_id']
    }
    tables = [
        'cast_info', 'movie_companies', 'movie_info', 'movie_info_idx',
        'movie_keyword'
    ]
    for num in range(1, 6):
        for p in combinations(tables, num):
            config = JOB_MY
            config['join_clauses'] = []
            p = [x for x in p]
            for t in p:
                config['join_clauses'].append(join_clauses[t])
            p = p + ['title']
            key = '_'.join(sorted([table2alias[x] for x in p]))
            config['join_tables'] = p
            config['join_keys'] = {}
            for t in p:
                config['join_keys'][t] = join_keys[t]
            col_num = 0
            for t in p:
                col_num += len(all_cols[t])

            join_spec = join_utils.get_join_spec(config)
            prepare_utils.prepare(join_spec)
            loaded_tables = []
            for t in join_spec.join_tables:
                print('Loading', t)
                table = datasets.LoadImdb(t, use_cols=config["use_cols"])
                table.data.info()
                loaded_tables.append(table)

            t_start = time.time()
            join_iter_dataset = FactorizedSamplerIterDataset(
                loaded_tables,
                join_spec,
                sample_batch_size=1000 * 100,
                disambiguate_column_names=True)

            table = common.ConcatTables(
                loaded_tables,
                join_spec.join_keys,
                sample_from_join_dataset=join_iter_dataset)

            join_iter_dataset = common.FactorizedSampleFromJoinIterDataset(
                join_iter_dataset,
                base_table=table,
                factorize_blacklist=[],
                word_size_bits=10,
                factorize_fanouts=True)
            t_end = time.time()
            log.info(f"> Initialization took {t_end - t_start} seconds.")

            print(join_iter_dataset.join_iter_dataset.combined_columns)
            samples = []
            for i in tqdm(range(1000000)):
                samples.append(next(join_iter_dataset.join_iter_dataset))
            df = pd.DataFrame(
                data=pd.concat(samples, axis=1)).T.iloc[:, :col_num]
            df.to_csv('../train-test-data/join_samples/{}.csv'.format(key),
                      index=False)
            # join_iter_dataset.join_iter_dataset._sample_batch()
            print('-' * 60)
            print("Done {}".format(key))