Exemplo n.º 1
0
def main():
    """Initialize the factorized join sampler for ``experiments.JOB_FULL``.

    Loads every table listed in the join spec, chains the sampler, the
    concatenated table and the factorized dataset together, logs how long
    that initialization took, and then calls ``_sample_batch()`` once on the
    wrapped sampler.
    """
    config = experiments.JOB_FULL
    join_spec = join_utils.get_join_spec(config)
    prepare_utils.prepare(join_spec)

    # Load tables in the order given by the join spec, printing each one's
    # ``data.info()`` summary as it is loaded.
    loaded_tables = []
    for table_name in join_spec.join_tables:
        print('Loading', table_name)
        imdb_table = datasets.LoadImdb(table_name, use_cols=config["use_cols"])
        imdb_table.data.info()
        loaded_tables.append(imdb_table)

    # Everything from here to the log line is counted as initialization time.
    init_started = time.time()
    sampler = FactorizedSamplerIterDataset(
        loaded_tables,
        join_spec,
        sample_batch_size=1000 * 100,
        disambiguate_column_names=True,
    )
    concat_table = common.ConcatTables(
        loaded_tables,
        join_spec.join_keys,
        sample_from_join_dataset=sampler,
    )
    factorized_dataset = common.FactorizedSampleFromJoinIterDataset(
        sampler,
        base_table=concat_table,
        factorize_blacklist=[],
        word_size_bits=10,
        factorize_fanouts=True,
    )
    init_elapsed = time.time() - init_started
    log.info(f"> Initialization took {init_elapsed} seconds.")

    factorized_dataset.join_iter_dataset._sample_batch()
    print('-' * 60)
    print("Done")
Exemplo n.º 2
0
def generate_title_movie_companies(p):
    """Sample the join described by ``JOB_jintao`` and write it to a CSV file.

    The join itself is fully determined by the module-level ``JOB_jintao``
    config; ``p`` only influences the name of the output file.

    Args:
        p: Iterable of table names, each one of ``'title'``,
            ``'movie_companies'`` or ``'company_name'``. ``'movie_companies'``
            is always appended before the aliases are sorted and joined with
            ``'_'`` to form the file stem. Any iterable is accepted (list,
            tuple, ...); the argument is never mutated.

    Raises:
        KeyError: If ``p`` contains a table name with no known alias.
    """
    table2alias = {'title': 't', 'movie_companies': 'mc', 'company_name': 'cn'}

    config = JOB_jintao
    # list(p) lets callers pass a tuple (e.g. from itertools.combinations)
    # without hitting "can only concatenate tuple (not list) to tuple".
    tables_for_key = list(p) + ['movie_companies']
    key = '_'.join(sorted(table2alias[name] for name in tables_for_key))

    join_spec = join_utils.get_join_spec(config)
    prepare_utils.prepare(join_spec)

    loaded_tables = []
    for table_name in join_spec.join_tables:
        print('Loading', table_name)
        table = datasets.LoadImdb(table_name, use_cols=config["use_cols"])
        table.data.info()
        loaded_tables.append(table)

    # Time the construction of the sampler chain only (not table loading).
    t_start = time.time()
    sampler = FactorizedSamplerIterDataset(
        loaded_tables,
        join_spec,
        sample_batch_size=51000 * 100,
        disambiguate_column_names=True)
    base_table = common.ConcatTables(loaded_tables,
                                     join_spec.join_keys,
                                     sample_from_join_dataset=sampler)
    factorized_dataset = common.FactorizedSampleFromJoinIterDataset(
        sampler,
        base_table=base_table,
        factorize_blacklist=[],
        word_size_bits=10,
        factorize_fanouts=True)
    t_end = time.time()
    log.info(f"> Initialization took {t_end - t_start} seconds.")

    # Samples are drawn from the sampler wrapped by the factorized dataset.
    inner_sampler = factorized_dataset.join_iter_dataset
    print(inner_sampler.combined_columns)

    samples = [next(inner_sampler) for _ in tqdm(range(5000000))]
    # Each sample becomes one column of the concat result; transpose so that
    # every sample ends up as one row of the CSV.
    df = pd.DataFrame(data=pd.concat(samples, axis=1)).T
    df.to_csv('/home/jintao/{}.csv'.format(key), index=False)
Exemplo n.º 3
0
    def __init__(self,
                 loaded_tables,
                 join_spec,
                 sample_batch_size,
                 rng=None,
                 disambiguate_column_names=True,
                 add_full_join_indicators=True,
                 add_full_join_fanouts=True):
        """Create the per-table actors and bookkeeping used for join sampling.

        Args:
            loaded_tables: Tables exposing ``.name`` and ``.data``; one
                ``DataTableActor`` is built per entry.
            join_spec: Join description. This method reads its ``join_keys``,
                ``join_name``, ``join_tables`` and ``join_root`` attributes.
            sample_batch_size: Stored unchanged on the instance.
            rng: Stored unchanged on the instance.
            disambiguate_column_names: Stored unchanged on the instance.
            add_full_join_indicators: Stored unchanged on the instance.
            add_full_join_fanouts: Stored unchanged; when falsy,
                ``fanout_columns`` is an empty list instead of the result of
                ``get_fanout_columns``.
        """
        prepare_utils.prepare(join_spec)

        # Constructor arguments kept verbatim.
        self.join_spec = join_spec
        self.sample_batch_size = sample_batch_size
        self.rng = rng
        self.disambiguate_column_names = disambiguate_column_names
        self.add_full_join_indicators = add_full_join_indicators
        self.add_full_join_fanouts = add_full_join_fanouts

        # One data actor per loaded table.
        join_name = join_spec.join_name
        self.dt_actors = [
            DataTableActor(loaded.name, join_spec.join_keys[loaded.name],
                           loaded.data, join_name)
            for loaded in loaded_tables
        ]

        # Load every join count table first, then wrap each in its actor.
        loaded_jcts = {
            table_name: load_jct(table_name, join_name)
            for table_name in join_spec.join_tables
        }
        self.jct_actors = {
            table_name: JoinCountTableActor(table_name, count_table, join_spec)
            for table_name, count_table in loaded_jcts.items()
        }

        self.sampling_tables_ordering = _make_sampling_table_ordering(
            loaded_tables, join_spec.join_root)

        # Not known yet at construction time.
        self.all_columns = None
        self.rename_dict = None

        self.jct_count_columns = get_jct_count_columns(self.join_spec)
        if add_full_join_fanouts:
            self.fanout_columns = get_fanout_columns(self.join_spec)
        else:
            self.fanout_columns = []

        # Join cardinality: sum of the root table's "<root>.weight" column in
        # its join count table.
        root = join_spec.join_root
        weight_column = "{}.weight".format(root)
        self.join_card = self.jct_actors[root].jct[weight_column].sum()
Exemplo n.º 4
0
def main():
    """Sample every star join of ``title`` with a subset of its satellite tables.

    For each non-empty combination of the five satellite tables
    (``cast_info``, ``movie_companies``, ``movie_info``, ``movie_info_idx``,
    ``movie_keyword``), a join config is derived from ``JOB_MY``, 1,000,000
    samples are drawn from the join, and the first ``col_num`` columns are
    written to ``../train-test-data/join_samples/<aliases>.csv``, where
    ``<aliases>`` is the sorted, ``'_'``-joined list of table aliases.
    """
    table2alias = {'title': 't', 'cast_info': 'ci', 'movie_companies': 'mc', 'movie_info': 'mi',
                   'movie_info_idx': 'mi_idx', 'movie_keyword': 'mk'}
    join_keys = {'title': ['id'], 'cast_info': ['movie_id'], 'movie_companies': ['movie_id'],
                 'movie_info': ['movie_id'], 'movie_info_idx': ['movie_id'], 'movie_keyword': ['movie_id']}
    join_clauses = {'cast_info': 'title.id=cast_info.movie_id', 'movie_companies': 'title.id=movie_companies.movie_id',
                    'movie_info': 'title.id=movie_info.movie_id', 'movie_info_idx': 'title.id=movie_info_idx.movie_id',
                    'movie_keyword': 'title.id=movie_keyword.movie_id'}
    all_cols = {
        'title': [
            'kind_id', 'production_year', 'episode_nr', 'imdb_index', 'phonetic_code', 'season_nr', 'series_years'
        ],
        'cast_info': [
            'nr_order', 'role_id'
        ],
        'movie_companies': [
            'company_type_id'
        ],
        'movie_info_idx': ['info_type_id'],
        'movie_info': ['info_type_id'],
        'movie_keyword': ['keyword_id']
    }

    tables = ['cast_info', 'movie_companies', 'movie_info', 'movie_info_idx', 'movie_keyword']
    for num in range(1, len(tables) + 1):
        for combo in combinations(tables, num):
            satellites = list(combo)
            joined = satellites + ['title']
            key = '_'.join(sorted(table2alias[name] for name in joined))

            # Work on a shallow copy so the shared module-level JOB_MY dict is
            # not rewritten on every iteration; only top-level keys are
            # replaced below, so a shallow copy is sufficient.
            config = dict(JOB_MY)
            config['join_clauses'] = [join_clauses[name] for name in satellites]
            config['join_tables'] = joined
            config['join_keys'] = {name: join_keys[name] for name in joined}

            # Number of leading columns kept in the output: the total count of
            # columns listed in all_cols for the participating tables.
            col_num = sum(len(all_cols[name]) for name in joined)

            join_spec = join_utils.get_join_spec(config)
            prepare_utils.prepare(join_spec)
            loaded_tables = []
            for table_name in join_spec.join_tables:
                print('Loading', table_name)
                table = datasets.LoadImdb(table_name, use_cols=config["use_cols"])
                table.data.info()
                loaded_tables.append(table)

            # Time the construction of the sampler chain only.
            t_start = time.time()
            sampler = FactorizedSamplerIterDataset(
                loaded_tables,
                join_spec,
                sample_batch_size=1000 * 100,
                disambiguate_column_names=True)

            base_table = common.ConcatTables(loaded_tables,
                                             join_spec.join_keys,
                                             sample_from_join_dataset=sampler)

            factorized_dataset = common.FactorizedSampleFromJoinIterDataset(
                sampler,
                base_table=base_table,
                factorize_blacklist=[],
                word_size_bits=10,
                factorize_fanouts=True)
            t_end = time.time()
            log.info(f"> Initialization took {t_end - t_start} seconds.")

            # Samples are drawn from the sampler wrapped by the factorized
            # dataset.
            inner_sampler = factorized_dataset.join_iter_dataset
            print(inner_sampler.combined_columns)

            samples = [next(inner_sampler) for _ in tqdm(range(1000000))]
            # Each sample becomes one column of the concat result; transpose
            # so every sample is a row, then keep the first col_num columns.
            df = pd.DataFrame(data=pd.concat(samples, axis=1)).T.iloc[:, :col_num]
            df.to_csv('../train-test-data/join_samples/{}.csv'.format(key), index=False)
            print('-' * 60)
            print("Done {}".format(key))