Exemplo n.º 1
0
def gen2(csv_path):
    """
    Just like the full IMDB schema but without tables that are not used in the job-light benchmark.
    """

    schema = SchemaGraph()

    # tables
    # id, title, imdb_index, kind_id, production_year, imdb_id,phonetic_code, episode_of_id, season_nr, episode_nr,series_years, md5sum
    # title
    schema.add_table(
        Table('auth_user',
              attributes=['id_copy', 'id', 'is_active'],
              irrelevant_attributes=['is_active'],
              csv_file_location=csv_path.format('auth_user'),
              table_size=91889))  # 无用列增加了title

    # movie_info
    schema.add_table(
        Table('student_courseenrollment',
              attributes=['id_copy', 'id', 'user_id', 'is_active'],
              irrelevant_attributes=['id_copy'],
              csv_file_location=csv_path.format('student_courseenrollment'),
              table_size=85637))  # 无用列增加了info

    # relationships

    schema.add_relationship('student_courseenrollment', 'user_id', 'auth_user',
                            'id')

    return schema
Exemplo n.º 2
0
def cols6(csv_path):
    """
    Just like the full IMDB schema but without tables that are not used in the job-light benchmark.
    """

    schema = SchemaGraph()

    # tables
    # id, title, imdb_index, kind_id, production_year, imdb_id,phonetic_code, episode_of_id, season_nr, episode_nr,series_years, md5sum
    # title
    schema.add_table(
        Table('title',
              attributes=[
                  'id', 'title', 'imdb_index', 'kind_id', 'production_year',
                  'imdb_id', 'phonetic_code', 'episode_of_id', 'season_nr',
                  'episode_nr', 'series_years', 'md5sum'
              ],
              irrelevant_attributes=[
                  'title', 'imdb_index', 'imdb_id', 'episode_of_id',
                  'season_nr', 'episode_nr', 'md5sum'
              ],
              no_compression=['kind_id'],
              csv_file_location=csv_path.format('title'),
              table_size=3486660))  # 无用列增加了title

    # movie_info
    schema.add_table(
        Table('movie_info',
              attributes=['id', 'movie_id', 'info_type_id', 'info', 'note'],
              csv_file_location=csv_path.format('movie_info'),
              irrelevant_attributes=['info', 'note'],
              no_compression=['info_type_id'],
              table_size=24988000))  # 无用列增加了info

    # cast_info
    schema.add_table(
        Table('cast_info',
              attributes=[
                  'id', 'person_id', 'movie_id', 'person_role_id', 'note',
                  'nr_order', 'role_id'
              ],
              csv_file_location=csv_path.format('cast_info'),
              irrelevant_attributes=[
                  'person_id', 'person_role_id', 'note', 'nr_order'
              ],
              no_compression=['role_id'],
              table_size=63475800))

    # relationships

    schema.add_relationship('movie_info', 'movie_id', 'title', 'id')
    schema.add_relationship('cast_info', 'movie_id', 'title', 'id')

    return schema
Exemplo n.º 3
0
def gen10(csv_path):
    """
    Just like the full IMDB schema but without tables that are not used in the job-light benchmark.
    """

    schema = SchemaGraph()

    # tables
    # id, title, imdb_index, kind_id, production_year, imdb_id,phonetic_code, episode_of_id, season_nr, episode_nr,series_years, md5sum
    # title
    schema.add_table(
        Table('auth_user',
              attributes=['id_copy', 'id', 'is_active'],
              csv_file_location=csv_path.format('auth_user'),
              table_size=91889))  # 无用列增加了title

    # movie_info
    schema.add_table(
        Table('student_courseenrollment',
              attributes=['id_copy', 'id', 'user_id', 'is_active'],
              csv_file_location=csv_path.format('student_courseenrollment'),
              table_size=85637))  # 无用列增加了info

    # cast_info
    schema.add_table(
        Table('organization_account_userorgprofile',
              attributes=[
                  'id', 'org_id', 'user_id', 'origin', 'role',
                  '_first_level_id'
              ],
              csv_file_location=csv_path.format(
                  'organization_account_userorgprofile'),
              table_size=99832))

    schema.add_table(
        Table('auth_userprofile',
              attributes=['id', 'user_id', 'level_of_education', 'host'],
              csv_file_location=csv_path.format('auth_userprofile'),
              table_size=99349))
    # relationships

    schema.add_relationship('student_courseenrollment', 'user_id', 'auth_user',
                            'id')
    schema.add_relationship('organization_account_userorgprofile', 'user_id',
                            'auth_user', 'id')
    schema.add_relationship('auth_userprofile', 'user_id', 'auth_user', 'id')

    return schema
Exemplo n.º 4
0
def gen_imdb_schema(csv_path):
    """
    Specifies full imdb schema. Also tables not in the job-light benchmark.
    """
    schema = SchemaGraph()

    # tables

    # title
    schema.add_table(
        Table('title',
              attributes=[
                  'id', 'title', 'imdb_index', 'kind_id', 'production_year',
                  'imdb_id', 'phonetic_code', 'episode_of_id', 'season_nr',
                  'episode_nr', 'series_years', 'md5sum'
              ],
              irrelevant_attributes=['episode_of_id'],
              csv_file_location=csv_path.format('title'),
              table_size=3486660))

    # movie_info_idx
    schema.add_table(
        Table('movie_info_idx',
              attributes=['id', 'movie_id', 'info_type_id', 'info', 'note'],
              csv_file_location=csv_path.format('movie_info_idx'),
              table_size=3147110))

    # movie_info
    schema.add_table(
        Table('movie_info',
              attributes=['id', 'movie_id', 'info_type_id', 'info', 'note'],
              csv_file_location=csv_path.format('movie_info'),
              table_size=24988000))

    # info_type
    schema.add_table(
        Table('info_type',
              attributes=['id', 'info'],
              csv_file_location=csv_path.format('info_type'),
              table_size=113))

    # cast_info
    schema.add_table(
        Table('cast_info',
              attributes=[
                  'id', 'person_id', 'movie_id', 'person_role_id', 'note',
                  'nr_order', 'role_id'
              ],
              csv_file_location=csv_path.format('cast_info'),
              table_size=63475800))

    # char_name
    schema.add_table(
        Table('char_name',
              attributes=[
                  'id', 'name', 'imdb_index', 'imdb_id', 'name_pcode_nf',
                  'surname_pcode', 'md5sum'
              ],
              csv_file_location=csv_path.format('char_name'),
              table_size=4314870))

    # role_type
    schema.add_table(
        Table('role_type',
              attributes=['id', 'role'],
              csv_file_location=csv_path.format('role_type'),
              table_size=0))

    # complete_cast
    schema.add_table(
        Table('complete_cast',
              attributes=['id', 'movie_id', 'subject_id', 'status_id'],
              csv_file_location=csv_path.format('complete_cast'),
              table_size=135086))

    # comp_cast_type
    schema.add_table(
        Table('comp_cast_type',
              attributes=['id', 'kind'],
              csv_file_location=csv_path.format('comp_cast_type'),
              table_size=0))

    # name
    schema.add_table(
        Table('name',
              attributes=[
                  'id', 'name', 'imdb_index', 'imdb_id', 'gender',
                  'name_pcode_cf', 'name_pcode_nf', 'surname_pcode', 'md5sum'
              ],
              csv_file_location=csv_path.format('name'),
              table_size=6379740))

    # aka_name
    schema.add_table(
        Table('aka_name',
              attributes=[
                  'id', 'person_id', 'name', 'imdb_index', 'name_pcode_cf',
                  'name_pcode_nf', 'surname_pcode', 'md5sum'
              ],
              csv_file_location=csv_path.format('aka_name'),
              table_size=1312270))

    # movie_link, is empty
    # schema.add_table(Table('movie_link', attributes=['id', 'movie_id', 'linked_movie_id', 'link_type_id'],
    #                        csv_file_location=csv_path.format('movie_link')))

    # link_type, no relationships
    # schema.add_table(Table('link_type', attributes=['id', 'link'],
    #                        csv_file_location=csv_path.format('link_type')))

    # movie_keyword
    schema.add_table(
        Table('movie_keyword',
              attributes=['id', 'movie_id', 'keyword_id'],
              csv_file_location=csv_path.format('movie_keyword'),
              table_size=7522600))

    # keyword
    schema.add_table(
        Table('keyword',
              attributes=['id', 'keyword', 'phonetic_code'],
              csv_file_location=csv_path.format('keyword'),
              table_size=236627))

    # person_info
    schema.add_table(
        Table('person_info',
              attributes=['id', 'person_id', 'info_type_id', 'info', 'note'],
              csv_file_location=csv_path.format('person_info'),
              table_size=4130210))

    # movie_companies
    schema.add_table(
        Table('movie_companies',
              attributes=[
                  'id', 'movie_id', 'company_id', 'company_type_id', 'note'
              ],
              csv_file_location=csv_path.format('movie_companies'),
              table_size=4958300))

    # company_name
    schema.add_table(
        Table('company_name',
              attributes=[
                  'id', 'name', 'country_code', 'imdb_id', 'name_pcode_nf',
                  'name_pcode_sf', 'md5sum'
              ],
              csv_file_location=csv_path.format('company_name'),
              table_size=362131))

    # company_type
    schema.add_table(
        Table('company_type',
              attributes=['id', 'kind'],
              csv_file_location=csv_path.format('company_type'),
              table_size=0))

    # aka_title
    schema.add_table(
        Table('aka_title',
              attributes=[
                  'id', 'movie_id', 'title', 'imdb_index', 'kind_id',
                  'production_year', 'phonetic_code', 'episode_of_id',
                  'season_nr', 'episode_nr', 'note', 'md5sum'
              ],
              irrelevant_attributes=['episode_of_id'],
              csv_file_location=csv_path.format('aka_title'),
              table_size=528268))

    # kind_type
    schema.add_table(
        Table('kind_type',
              attributes=['id', 'kind'],
              csv_file_location=csv_path.format('kind_type'),
              table_size=0))

    # relationships

    # title
    # omit self-join for now
    # schema.add_relationship('title', 'episode_of_id', 'title', 'id')
    schema.add_relationship('title', 'kind_id', 'kind_type', 'id')

    # movie_info_idx
    schema.add_relationship('movie_info_idx', 'info_type_id', 'info_type',
                            'id')
    schema.add_relationship('movie_info_idx', 'movie_id', 'title', 'id')

    # movie_info
    schema.add_relationship('movie_info', 'info_type_id', 'info_type', 'id')
    schema.add_relationship('movie_info', 'movie_id', 'title', 'id')

    # info_type, no relationships

    # cast_info
    schema.add_relationship('cast_info', 'movie_id', 'title', 'id')
    schema.add_relationship('cast_info', 'person_id', 'name', 'id')
    schema.add_relationship('cast_info', 'person_role_id', 'char_name', 'id')
    schema.add_relationship('cast_info', 'role_id', 'role_type', 'id')

    # char_name, no relationships

    # role_type, no relationships

    # complete_cast
    schema.add_relationship('complete_cast', 'movie_id', 'title', 'id')
    schema.add_relationship('complete_cast', 'status_id', 'comp_cast_type',
                            'id')
    schema.add_relationship('complete_cast', 'subject_id', 'comp_cast_type',
                            'id')

    # comp_cast_type, no relationships

    # name, no relationships

    # aka_name
    schema.add_relationship('aka_name', 'person_id', 'name', 'id')

    # movie_link, is empty
    # schema.add_relationship('movie_link', 'link_type_id', 'link_type', 'id')
    # schema.add_relationship('movie_link', 'linked_movie_id', 'title', 'id')
    # schema.add_relationship('movie_link', 'movie_id', 'title', 'id')

    # link_type, no relationships

    # movie_keyword
    schema.add_relationship('movie_keyword', 'keyword_id', 'keyword', 'id')
    schema.add_relationship('movie_keyword', 'movie_id', 'title', 'id')

    # keyword, no relationships

    # person_info
    schema.add_relationship('person_info', 'info_type_id', 'info_type', 'id')
    schema.add_relationship('person_info', 'person_id', 'name', 'id')

    # movie_companies
    schema.add_relationship('movie_companies', 'company_id', 'company_name',
                            'id')
    schema.add_relationship('movie_companies', 'company_type_id',
                            'company_type', 'id')
    schema.add_relationship('movie_companies', 'movie_id', 'title', 'id')

    # company_name, no relationships

    # company_type, no relationships

    # aka_title
    schema.add_relationship('aka_title', 'movie_id', 'title', 'id')
    schema.add_relationship('aka_title', 'kind_id', 'kind_type', 'id')

    # kind_type, no relationships

    return schema
Exemplo n.º 5
0
def gen_job_light_imdb_schema(csv_path):
    """
    Just like the full IMDB schema but without tables that are not used in the job-light benchmark.
    """

    schema = SchemaGraph()

    # tables

    # title
    schema.add_table(
        Table('title',
              attributes=[
                  'id', 'title', 'imdb_index', 'kind_id', 'production_year',
                  'imdb_id', 'phonetic_code', 'episode_of_id', 'season_nr',
                  'episode_nr', 'series_years', 'md5sum'
              ],
              irrelevant_attributes=[
                  'episode_of_id', 'title', 'imdb_index', 'phonetic_code',
                  'season_nr', 'imdb_id', 'episode_nr', 'series_years',
                  'md5sum'
              ],
              no_compression=['kind_id'],
              csv_file_location=csv_path.format('title'),
              table_size=3486660))

    # movie_info_idx
    schema.add_table(
        Table('movie_info_idx',
              attributes=['id', 'movie_id', 'info_type_id', 'info', 'note'],
              csv_file_location=csv_path.format('movie_info_idx'),
              irrelevant_attributes=['info', 'note'],
              no_compression=['info_type_id'],
              table_size=3147110))

    # movie_info
    schema.add_table(
        Table('movie_info',
              attributes=['id', 'movie_id', 'info_type_id', 'info', 'note'],
              csv_file_location=csv_path.format('movie_info'),
              irrelevant_attributes=['info', 'note'],
              no_compression=['info_type_id'],
              table_size=24988000))

    # cast_info
    schema.add_table(
        Table('cast_info',
              attributes=[
                  'id', 'person_id', 'movie_id', 'person_role_id', 'note',
                  'nr_order', 'role_id'
              ],
              csv_file_location=csv_path.format('cast_info'),
              irrelevant_attributes=[
                  'nr_order', 'note', 'person_id', 'person_role_id'
              ],
              no_compression=['role_id'],
              table_size=63475800))

    # movie_keyword
    schema.add_table(
        Table('movie_keyword',
              attributes=['id', 'movie_id', 'keyword_id'],
              csv_file_location=csv_path.format('movie_keyword'),
              no_compression=['keyword_id'],
              table_size=7522600))

    # movie_companies
    schema.add_table(
        Table('movie_companies',
              attributes=[
                  'id', 'movie_id', 'company_id', 'company_type_id', 'note'
              ],
              csv_file_location=csv_path.format('movie_companies'),
              irrelevant_attributes=['note'],
              no_compression=['company_id', 'company_type_id'],
              table_size=4958300))

    # relationships
    schema.add_relationship('movie_info_idx', 'movie_id', 'title', 'id')
    schema.add_relationship('movie_info', 'movie_id', 'title', 'id')
    schema.add_relationship('cast_info', 'movie_id', 'title', 'id')
    schema.add_relationship('movie_keyword', 'movie_id', 'title', 'id')
    schema.add_relationship('movie_companies', 'movie_id', 'title', 'id')

    return schema
Exemplo n.º 6
0
def gen_500gb_ssb_schema(csv_path):
    """
    SSB schema for SF=500.
    """

    schema = SchemaGraph()

    # tables
    # lineorder
    schema.add_table(Table('lineorder',
                           attributes=['lo_orderkey', 'lo_linenumber', 'lo_custkey', 'lo_partkey', 'lo_suppkey',
                                       'lo_orderdate', 'lo_orderpriority', 'lo_shippriority', 'lo_quantity',
                                       'lo_extendedprice', 'lo_ordertotalprice', 'lo_discount', 'lo_revenue',
                                       'lo_supplycost', 'lo_tax', 'lo_commitdate', 'lo_shipmode'],
                           csv_file_location=csv_path.format('lineorder_sampled'),
                           irrelevant_attributes=['lo_commitdate'],
                           table_size=3000028242, primary_key=['lo_orderkey', 'lo_linenumber'], sample_rate=0.003333))

    # dwdate
    # dwdate.d_dayofweek -> dwdate.d_daynuminweek
    # dwdate.d_dayofweek -> dwdate.d_lastdayinweekfl
    # dwdate.d_month -> dwdate.d_monthnuminyear
    # dwdate.d_monthnuminyear -> dwdate.d_sellingseason
    # dwdate.d_daynuminyear -> dwdate.d_weeknuminyear
    schema.add_table(
        Table('dwdate',
              attributes=['d_datekey', 'd_date', 'd_dayofweek', 'd_month', 'd_year', 'd_yearmonthnum', 'd_yearmonth',
                          'd_daynuminweek', 'd_daynuminmonth', 'd_daynuminyear', 'd_monthnuminyear', 'd_weeknuminyear',
                          'd_sellingseason', 'd_lastdayinweekfl', 'd_lastdayinmonthfl', 'd_holidayfl', 'd_weekdayfl'],
              csv_file_location=csv_path.format('date'),
              table_size=2556, primary_key=["d_datekey"],
              fd_list=[('d_dayofweek', 'd_daynuminweek'), ('d_dayofweek', 'd_lastdayinweekfl'),
                       ('d_month', 'd_monthnuminyear'), ('d_monthnuminyear', 'd_sellingseason'),
                       ('d_daynuminyear', 'd_weeknuminyear')]))

    # customer
    # customer.c_city -> customer.c_nation
    # customer.c_nation -> customer.c_region
    schema.add_table(
        Table('customer',
              attributes=['c_custkey', 'c_name', 'c_address', 'c_city', 'c_nation', 'c_region', 'c_phone',
                          'c_mktsegment'],
              csv_file_location=csv_path.format('customer'),
              table_size=15000000, primary_key=["c_custkey"],
              fd_list=[('c_city', 'c_nation'), ('c_nation', 'c_region')]))

    # part
    # part.p_brand1 -> part.p_category
    # part.p_category -> part.p_mfgr
    schema.add_table(
        Table('part',
              attributes=['p_partkey', 'p_name', 'p_mfgr', 'p_category', 'p_brand1', 'p_color', 'p_type', 'p_size',
                          'p_container'],
              csv_file_location=csv_path.format('part'),
              table_size=1800000, primary_key=["p_partkey"],
              fd_list=[('p_category', 'p_mfgr'), ('p_brand1', 'p_category')]))

    # supplier
    # supplier.s_city -> supplier.s_nation
    # supplier.s_nation -> supplier.s_region
    schema.add_table(
        Table('supplier', attributes=['s_suppkey', 's_name', 's_address', 's_city', 's_nation', 's_region', 's_phone'],
              csv_file_location=csv_path.format('supplier'),
              table_size=1000000, primary_key=["s_suppkey"],
              fd_list=[('s_city', 's_nation'), ('s_nation', 's_region')]))

    # relationships
    schema.add_relationship('lineorder', 'lo_custkey', 'customer', 'c_custkey')
    schema.add_relationship('lineorder', 'lo_partkey', 'part', 'p_partkey')
    schema.add_relationship('lineorder', 'lo_suppkey', 'supplier', 's_suppkey')
    schema.add_relationship('lineorder', 'lo_orderdate', 'dwdate', 'd_datekey')
    # schema.add_relationship('lineorder', 'lo_commitdate', 'dwdate', 'd_datekey')

    return schema