Exemplo n.º 1
0
# Sources
author_src = SQLSource(connection=author_conn, query="SELECT * FROM author")
book_src = SQLSource(connection=book_conn, query="SELECT * FROM book")
country_src = CSVSource(f=country_handle, delimiter=",")


# Tables
author_dim = Dimension(name="authordim", key="aid", attributes=["name", "city", "cid"])

book_dim = SlowlyChangingDimension(
    name="bookdim", key="bid", attributes=["title", "year", "version"], lookupatts=["title"], versionatt="version"
)

country_dim = Dimension(name="countrydim", key="cid", attributes=["country"], lookupatts=["country"])

fact_table = FactTable(name="facttable", keyrefs=["aid", "bid"])

snowflake = SnowflakedDimension([(author_dim, country_dim)])

# We map cities to countries and populate the countrydim
cid_map = {}
for row in country_src:
    cid = country_dim.ensure(row)
    cid_map[row["city"]] = cid

# We populate the authordim and the fact table
for row in author_src:
    if row["city"] in ["Hadsten", "Skanderborg", "Kobenhavn"]:
        row["cid"] = cid_map[row["city"]]
    else:
        row["cid"] = None
Exemplo n.º 2
0
 def cls_to_pygram_fact(cls, schema_name):
     fct = FactTable(
         name=schema_name + '.' + cls.cls_get_name(),
         keyrefs=cls.cls_get_key_names(),
         measures=cls.cls_get_measure_names())
     return fct
Exemplo n.º 3
0
company_info = [('The Hobbit', 1), ('The Bobbit', 5)]

# ... and inserting the necessary data.
c.executemany("INSERT INTO FACTTABLE (BOOK,ISSUE) VALUES (?,?)", company_info)

conn.commit()

ConnectionWrapper(conn)

dim = Dimension('COMPANY', 'ID', ['NAME', 'AGE', 'ADDRESS', 'SALARY'], ['NAME'])
dim_rep = DimRepresentation(dim, conn)

dim2 = Dimension('LASTNAME', 'NAME', ['LAST'])
dim_rep2 = DimRepresentation(dim2, conn)

ft = FactTable('FACTTABLE', ['Book'], ['Issue'])
ft_rep = FTRepresentation(ft, conn)

dw = DWRepresentation([dim_rep, ft_rep, dim_rep2], conn)

expected_list1 = [
    {'NAME': 'Anders', 'AGE': 43, 'SALARY': 21000.0, 'ADDRESS': 'Denmark',
     'ID': 1},
    {'NAME': 'Charles', 'AGE': 50, 'SALARY': 25000.0, 'ADDRESS': 'Texas',
     'ID': 2},
    {'NAME': 'Wolf', 'AGE': 28, 'SALARY': 19000.0, 'ADDRESS': 'Sweden',
     'ID': 3},
    {'NAME': 'Hannibal', 'AGE': 45, 'SALARY': 65000.0, 'ADDRESS': 'America',
     'ID': 4},
    {'NAME': 'Buggy', 'AGE': 67, 'SALARY': 2000.0, 'ADDRESS': 'America',
     'ID': 5}
Exemplo n.º 4
0
def setup(dbname, number):

    open(os.path.expanduser(dbname), 'w')

    output_conn = sqlite3.connect(dbname)
    output_cur = output_conn.cursor()

    output_cur.execute("CREATE TABLE dim1 "
                       "(key1 INTEGER PRIMARY KEY, attr1 INTEGER, "
                       "attr2 INTEGER)")

    output_cur.execute("CREATE TABLE dim2 "
                       "(key2 INTEGER PRIMARY KEY, attr3 INTEGER, "
                       "attr4 INTEGER)")

    output_cur.execute("CREATE TABLE ft1 "
                       "(measure INTEGER, key1 INTEGER, key2 INTEGER, "
                       "PRIMARY KEY(key1, key2))")

    output_conn.commit()
    output_wrapper = pygrametl.ConnectionWrapper(connection=output_conn)

    dim1 = Dimension(
        name='dim1',
        key='key1',
        attributes=['attr1', 'attr2']
    )

    dim2 = Dimension(
        name='dim2',
        key='key2',
        attributes=['attr3', 'attr4'],
    )

    ft1 = FactTable(
        name='ft1',
        keyrefs=['key1', 'key2'],
        measures=['measure']
    )

    facts = []
    dim1_data = []
    dim2_data = []

    total_elapsed = 0.
    print('Generating ft1 data')
    start = time.monotonic()
    counter = 0
    for i in range(1, number + 1):
        for j in range(1, number + 1):
            counter += 10
            facts.append({'key1': i, 'key2': j, 'measure': counter})
    end = time.monotonic()
    elapsed = end - start
    print('Generated: {}{}'.format(round(elapsed, 3), 's'))
    total_elapsed += elapsed

    print('Generating dim1 data')
    start = time.monotonic()
    for i in range(1, number + 1):
        dim1_data.append({'attr1': i, 'attr2': number + 1 - i})
    end = time.monotonic()
    elapsed = end - start
    print('Generated: {}{}'.format(round(elapsed, 3), 's'))
    total_elapsed += elapsed

    print('Generating dim2 data')
    start = time.monotonic()
    for i in range(1, number + 1):
        dim2_data.append({'attr3': i, 'attr4': number + 1 - i})
    end = time.monotonic()
    elapsed = end - start
    print('Generated: {}{}'.format(round(elapsed, 3), 's'))
    total_elapsed += elapsed

    print('Inserting data into ft1')
    start = time.monotonic()
    for row in facts:
        ft1.insert(row)
    end = time.monotonic()
    elapsed = end - start
    print('Inserted: {}{}'.format(round(elapsed, 3), 's'))
    total_elapsed += elapsed

    print('Inserting data into dim1')
    start = time.monotonic()
    for row in dim1_data:
        dim1.insert(row)
    end = time.monotonic()
    elapsed = end - start
    print('Inserted: {}{}'.format(round(elapsed, 3), 's'))
    total_elapsed += elapsed

    print('Inserting data into dim2')
    start = time.monotonic()
    for row in dim2_data:
        dim2.insert(row)
    end = time.monotonic()
    elapsed = end - start
    print('Inserted: {}{}'.format(round(elapsed, 3), 's'))
    total_elapsed += elapsed

    print('DW populated')
    print('Total time: {}{}\n'.format(round(total_elapsed, 3), 's'))
    output_conn.commit()
Exemplo n.º 5
0
    key='bookid',
    attributes=['book', 'genre'])

time_dimension = Dimension(
    name='time',
    key='timeid',
    attributes=['day', 'month', 'year'])

location_dimension = Dimension(
    name='location',
    key='locationid',
    attributes=['city', 'region'],
    lookupatts=['city'])

fact_table = FactTable(
    name='facttable',
    keyrefs=['bookid', 'locationid', 'timeid'],
    measures=['sale'])

def split_timestamp(row):
    timestamp = row['timestamp']
    timestamp_split = timestamp.split('/')

    row['year'] = timestamp_split[0]
    row['month'] = timestamp_split[1]
    row['day'] = timestamp_split[2]

[location_dimension.insert(row) for row in region_source]
region_file_handle.close()
for row in sales_source:
    split_timestamp(row)
    row['bookid'] = book_dimension.ensure(row)
Exemplo n.º 6
0
    key='bookid',
    attributes=['book', 'genre'])

time_dimension = Dimension(
    name='timeDim',
    key='timeid',
    attributes=['day', 'month', 'year'])

location_dimension = Dimension(
    name='locationDim',
    key='locationid',
    attributes=['city', 'region'],
    lookupatts=['city'])

fact_table = FactTable(
    name='factTable',
    keyrefs=['bookid', 'locationid', 'timeid'],
    measures=['sale'])


# NOTE: Most of the following code is taken directly form pygrametl.org and has little to do with
# making this example portable to SQLite and this particular implementation. It is however down here
# that a lot of the cool stuff happens, i.e. the ETL stuff.
# Python function needed to split the timestamp into its three parts
def split_timestamp(row):
    """Splits a timestamp containing a date into its three parts
    """

    # Splitting of the timestamp into parts
    timestamp = row['timestamp']
    timestamp_split = timestamp.split('/')
Exemplo n.º 7
0
""" A sample pygrametl program
"""

__author__ = 'Mathias Claus Jensen'

import pygrametl
from pygrametl.datasources import SQLSource
from pygrametl.tables import Dimension, FactTable
import sqlite3

input_conn = sqlite3.connect('input.db')
output_conn = sqlite3.connect('output.db')

input_src = SQLSource(input_conn, query='SELECT * FROM table')
output_wrapper = pygrametl.ConnectionWrapper(connection=output_conn)

dim1 = Dimension('dim1', 'key1', ['attr1', 'attr2'])

dim2 = Dimension(name='dim2', key='key2', attributes=['attr3', 'attr4'])

ft1 = FactTable(name='ft1', keyrefs=[
    'key1',
])

input_conn.close()
output_conn.close()
Exemplo n.º 8
0
    def __create_tables(self):

        # Systems
        self.system_dimension = CachedDimension(name='system',
                                                key='system_id',
                                                attributes=['system_name'],
                                                lookupatts=['system_name'])

        # Stations

        self.start_station_dimension = CachedDimension(
            name='start_station',
            key='start_station_id',
            attributes=[
                'system_id', 'start_station_short_name', 'start_station_name',
                'start_station_latitude', 'start_station_longitude',
                'start_station_capacity'
            ],
            lookupatts=['system_id', 'start_station_short_name'],
            rowexpander=start_station_missing_data_expander)

        self.end_station_dimension = CachedDimension(
            name='end_station',
            key='end_station_id',
            attributes=[
                'system_id', 'end_station_short_name', 'end_station_name',
                'end_station_latitude', 'end_station_longitude',
                'end_station_capacity'
            ],
            lookupatts=['system_id', 'end_station_short_name'],
            rowexpander=end_station_missing_data_expander)

        # Trip dates and times

        self.date_dimension = CachedDimension(
            name='bdate',
            key='date_id',
            attributes=['year', 'month', 'day', 'day_of_week', 'date_string'],
            lookupatts=['date_string'],
            rowexpander=date_row_expander)

        self.time_dimension = CachedDimension(
            name='btime',
            key='time_id',
            attributes=['hour', 'minute', 'time_string', 'time_of_day'],
            lookupatts=['time_string'],
            rowexpander=time_row_expander)

        # Trips

        self.trip_fact_table = FactTable(
            name='trips',
            measures=['duration_s'],
            keyrefs=[
                'system_id', 'start_station_id', 'end_station_id', 'date_id',
                'time_id', 'customer_birthyear_id', 'customer_gender_id',
                'customer_type_id', 'bike_id', 'trip_category_id'
            ])

        # weather fact table and date dimension

        self.weather_fact_table = FactTable(name='weather',
                                            measures=[
                                                'precipitation_in', 'snow_in',
                                                'temp_avg_f', 'temp_min_f',
                                                'temp_max_f', 'wind_mph'
                                            ],
                                            keyrefs=['system_id', 'date_id'])

        self.trip_category = CachedDimension(name='trip_category',
                                             key='trip_category_id',
                                             attributes=['trip_category'])

        self.bike_dimension = CachedDimension(
            name='bikes',
            key='bike_id',
            attributes=['system_id', 'bike_name'],
            lookupatts=['system_id', 'bike_name'],
            defaultidvalue=-1)

        self.customer_gender_dimension = CachedDimension(
            name='customer_gender',
            key='customer_gender_id',
            attributes=['customer_gender'],
            lookupatts=['customer_gender'])

        self.customer_birthyear_dimension = CachedDimension(
            name='customer_birthyear',
            key='customer_birthyear_id',
            attributes=['customer_birthyear'],
            lookupatts=['customer_birthyear'])

        self.customer_type_dimension = CachedDimension(
            name='customer_type',
            key='customer_type_id',
            attributes=['customer_type'],
            lookupatts=['customer_type'])

        # Station status
        self.station_status_fact_table = FactTable(
            name='station_status',
            keyrefs=['system_id', 'start_station_id', 'date_id', 'time_id'],
            measures=['bikes_available', 'docks_available'])

        # Non-cached version of stations for use only with updating Indego stations.

        self.start_station_noncached_dimension = Dimension(
            name='start_station',
            key='start_station_id',
            attributes=[
                'system_id', 'start_station_short_name', 'start_station_name',
                'start_station_latitude', 'start_station_longitude',
                'start_station_capacity'
            ],
            lookupatts=['system_id', 'start_station_short_name'],
            rowexpander=start_station_missing_data_expander,
            defaultidvalue=-1)

        self.end_station_noncached_dimension = Dimension(
            name='end_station',
            key='end_station_id',
            attributes=[
                'system_id', 'end_station_short_name', 'end_station_name',
                'end_station_latitude', 'end_station_longitude',
                'end_station_capacity'
            ],
            lookupatts=['system_id', 'end_station_short_name'],
            rowexpander=end_station_missing_data_expander,
            defaultidvalue=-1)