Example #1
data = [{  # variable name assumed; the original opening of this list is missing
    'attr1': 74,
    'attr2': 74,
    'attr3': 74,
    'attr4': 74
}, {
    'attr1': 75,
    'attr2': 75,
    'attr3': 75,
    'attr4': 75
}, {
    'attr1': 76,
    'attr2': 76,
    'attr3': 76,
    'attr4': 76
}]

wrapper = pygrametl.ConnectionWrapper(connection=conn)

dim1 = Dimension(name='dim1',
                 key='key1',
                 attributes=['attr1', 'key2', 'key3'],
                 lookupatts=['attr1'])

dim2 = Dimension(name='dim2',
                 key='key2',
                 attributes=['attr2', 'key4'],
                 lookupatts=['attr2'])

dim3 = Dimension(name='dim3', key='key3', attributes=['attr3'])

dim4 = Dimension(name='dim4', key='key4', attributes=['attr4'])
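
# The attribute lists above reference the keys of the other dimensions (key2,
# key3, key4), which suggests a snowflake schema. A minimal sketch, not part of
# the original example, of how these dimensions could be tied together with a
# SnowflakedDimension and filled with the rows from data:
from pygrametl.tables import SnowflakedDimension

snowflake = SnowflakedDimension([(dim1, [dim2, dim3]), (dim2, dim4)])
for row in data:
    snowflake.insert(row)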
Example #2
import sys
dir = sys.path[0]
# path to the lib directory
sys.path.append(dir + '\\lib')

import pygrametl, MySQLdb
from pygrametl.datasources import *
from pygrametl.tables import *

mysql_conn_target = MySQLdb.connect(host='localhost', user='******', passwd='', db='dwh')
mysql_conn_source = MySQLdb.connect(host='localhost', user='******', passwd='', db='karty')

mysql_conn_source.set_character_set('utf8')
mysql_conn_target.set_character_set('utf8')

conn_source = pygrametl.ConnectionWrapper(mysql_conn_source)
conn_target = pygrametl.ConnectionWrapper(mysql_conn_target)

query_target = 'TRUNCATE TABLE facilitytype'
conn_target.execute(query_target)
conn_target.commit()

# data source for the dimension
query_source = 'SELECT id, name, description FROM facilitytype'
facilitytype_source = SQLSource(connection=conn_source, query=query_source, names=(), initsql=None, cursorarg=None)

# dimension
facility_dim = CachedDimension(
    targetconnection=conn_target,
    name='facilitytype',
    key='id',
    attributes=['name', 'description'])  # attributes inferred from the SELECT above; the original call is cut off here
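
# A minimal sketch, not part of the original snippet, of how the dimension is
# typically filled from the source defined above:
for row in facilitytype_source:
    facility_dim.insert(row)
conn_target.commit()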
Example #3
DB_NAME='csi4142'
DB_USER='******'
DB_HOST='localhost'
DB_PASS=''

# Global variables used for in-memory stores.
POP_DATA = {}
LIFE_EXPECTANCY_DATA = {}
GNI_DATA = {}
NUTRITION_DATA = {}


# Connection to the target data warehouse:
pgconn = psycopg2.connect(dbname=DB_NAME, user=DB_USER, host=DB_HOST,
        password=DB_PASS)
connection = pygrametl.ConnectionWrapper(pgconn)
connection.setasdefault()
connection.execute('set search_path to csi4142project')


# Methods
def pgcopybulkloader(name, atts, fieldsep, rowsep, nullval, filehandle):
    # Here we use driver-specific code to get fast bulk loading.
    # You can change this method if you use another driver, or you can
    # use the FactTable or BatchFactTable classes (which don't require
    # driver-specific code) instead of the BulkFactTable class.
    global connection
    curs = connection.cursor()
    curs.copy_from(file=filehandle, table=name, sep=fieldsep,
                   null=str(nullval), columns=atts)
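
# A minimal sketch, not part of the original program, of how a loader with this
# signature is typically plugged into a BulkFactTable via its bulkloader
# parameter (the table and column names below are assumptions):
from pygrametl.tables import BulkFactTable

fact_table = BulkFactTable(name='facts',
                           keyrefs=['country_id', 'year_id'],  # hypothetical keys
                           measures=['population'],            # hypothetical measure
                           bulkloader=pgcopybulkloader,
                           nullsubst='None')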
Example #4
data = [{
    'attr1': None
}, {
    'attr1': 1
}, {
    'attr1': 25
}, {
    'attr1': None
}, {
    'attr1': 5
}, {
    'attr1': None
}, {
    'attr1': None
}]

wrapper = pygrametl.ConnectionWrapper(connection=null_conn)

dim1 = Dimension(
    name='dim1',
    key='key1',
    attributes=['attr1'],
)

for row in data:
    dim1.insert(row)

dim_rep = DimRepresentation(dim1, null_conn)
notnull_tester = ColumnNotNullPredicate('dim1')
null_rep = DWRepresentation([dim_rep], null_conn)

print(notnull_tester.run(null_rep))
Example #5
from pygrametl.tables import Dimension, FactTable

# Creation of a database connection to the sales database with a simple
# connection string specifying the necessary host, username, and password
sales_string = "host='localhost' dbname='source' user='******' password='******' port=54320"
sales_pgconn = psycopg2.connect(sales_string)

# A connection is also created for the data warehouse. The connection is
# then given to a ConnectionWrapper so it can be implicitly shared between
# all the pygrametl abstractions that need it without being passed around
dw_string = "host='localhost' dbname='etl' user='******' password='******' port=54320"
dw_pgconn = psycopg2.connect(dw_string)

# Although the ConnectionWrapper is shared automatically between pygrametl
# abstractions, we still save it in a variable so that it can be closed later
dw_conn_wrapper = pygrametl.ConnectionWrapper(connection=dw_pgconn)

# As the location dimension stores the name of a location in the attribute
# "city" instead of in the attribute "store" as in the input data from the
# sales relation, a sequence of names matching the number of attributes in
# the relation is created, allowing the SQLSource to do the mapping for us
name_mapping = 'book', 'genre', 'city', 'timestamp', 'sale'

# Extraction of rows from a database using a PEP 249 connection and SQL
sales_source = SQLSource(connection=sales_pgconn,
                         query="SELECT * FROM sales", names=name_mapping)

# Extraction of rows from a CSV file does not require SQL, just an open file
# handle to the file, as pygrametl uses Python's csv.DictReader for CSV files,
# and the header of the CSV file contains information about each column.
region_file_handle = open('c:\\work\\python\\region.csv', 'r', 16384)
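
# A minimal sketch, not part of the original example, of how the open file
# handle is typically wrapped in a CSVSource, which yields one dict per row
# with the CSV header supplying the keys:
from pygrametl.datasources import CSVSource

region_source = CSVSource(region_file_handle, delimiter=',')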
Example #6

# Set up the connection to the database
config = configparser.ConfigParser()
config.read('/opt/projects/wob_zz/config.ini')

login = {
    'user': config.get('local_mssql', 'user'),
    'password': config.get('local_mssql', 'password'),
    'server': config.get('local_mssql', 'server'),
    'port': config.get('local_mssql', 'port'),
    'database': config.get('wob_zz', 'database')
    }

cnx = sql.connect(**login)
cur = cnx.cursor()
connection = etl.ConnectionWrapper(cnx)
connection.setasdefault()


# define dimension object for ETL
# Note that:
# - pygrametl object table names are DIM_xxx, FCT_yyy
# - MS SQL schema.table names are DIM.xxx, FCT.yyy
DIM_AFSLUITREDEN = CachedDimension(
    name='DIM.AFSLUITREDEN',
    key='afs_id',
    attributes=['afs_afsluitreden_code'],
    size=0,
    prefill=True
)
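
# A minimal sketch, not part of the original code, of how the prefilled cached
# dimension is typically used; ensure() returns the surrogate key afs_id and
# inserts the row first if it is not already present (the code value is assumed):
afs_id = DIM_AFSLUITREDEN.ensure({'afs_afsluitreden_code': 'X01'})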
Example #7
""" A sample pygrametl program
"""

__author__ = 'Mathias Claus Jensen'

import pygrametl
from pygrametl.datasources import SQLSource
from pygrametl.tables import Dimension, FactTable
import sqlite3

input_conn = sqlite3.connect('input.db')
output_conn = sqlite3.connect('output.db')

input_src = SQLSource(input_conn, query='SELECT * FROM dim1')
output_wrapper = pygrametl.ConnectionWrapper(connection=output_conn)

dim1 = Dimension(name='dim1', key='key1', attributes=['attr1', 'attr2'])

dim2 = Dimension(name='dim2', key='key2', attributes=['attr3', 'attr4'])

ft1 = FactTable(name='ft1', keyrefs=[
    'key1',
])

input_conn.close()
output_conn.close()
Example #8
import pyodbc
import pygrametl
import ConnectionStrings as CS
import SourceSQLQueries as SSQ
from datetime import datetime

from pygrametl.datasources import TypedCSVSource, SQLSource
from pygrametl.tables import Dimension, TypeOneSlowlyChangingDimension, FactTable, AccumulatingSnapshotFactTable

# Open a connection to the OLTP AntBil
AntBilReplication_conn = pyodbc.connect(CS.AntBilReplication_string)

# Open a connection to the DW AntBil and create a ConnectionWrapper
AntBilDW_conn = pyodbc.connect(CS.AntBilDW_string)
AntBilDW_conn_wrapper = pygrametl.ConnectionWrapper(AntBilDW_conn)
AntBilDW_conn_wrapper.setasdefault()

# Create the data source of each dimension table
attribute_mapping = {'DateKey': int, 'DayOfWeek': int, 'DayOfMonth': int,
                     'DayOfYear': int, 'WeekOfYear': int, 'MonthOfYear': int,
                     'CalendarQuarter': int, 'CalendarYear': int,
                     'FiscalMonthOfYear': int, 'FiscalQuarter': int,
                     'FiscalYear': int}
DimDate_source = TypedCSVSource(f=open('DimDate_2017-2037.csv', 'r', 16384),
                                casts=attribute_mapping,
                                delimiter=',')
DimGroup_source = SQLSource(connection=AntBilReplication_conn,
                            query=SSQ.DimGroup_query)
DimGroupCategory_source = SQLSource(connection=AntBilReplication_conn,
                                    query=SSQ.DimGroupCategory_query)
DimRole_source = SQLSource(connection=AntBilReplication_conn,
                           query=SSQ.DimRole_query)
DimCandidate_source = SQLSource(connection=AntBilReplication_conn,
                                query=SSQ.DimCandidate_query)  # query name inferred from the pattern above; the original line is cut off
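
# A minimal sketch, not part of the original program, of how one of the sources
# above is typically loaded into a dimension; the attribute selection below is
# an assumption based on the CSV columns:
DimDate = Dimension(name='DimDate',
                    key='DateKey',
                    attributes=['DayOfWeek', 'DayOfMonth', 'DayOfYear',
                                'WeekOfYear', 'MonthOfYear', 'CalendarQuarter',
                                'CalendarYear', 'FiscalMonthOfYear',
                                'FiscalQuarter', 'FiscalYear'])
for row in DimDate_source:
    DimDate.insert(row)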
Example #9

# -*- coding: utf-8 -*-

import datetime
import sys
import time

sys.path.append('/home/Documents')  # directory where pygrametl is located

import pygrametl
from pygrametl.datasources import CSVSource, MergeJoiningSource
from pygrametl.tables import CachedDimension, SnowflakedDimension,BulkFactTable

# Connection to target DW:
import MySQLdb
myconn = MySQLdb.connect(user='******', passwd='hola',db='Estadisticas')
connection=pygrametl.ConnectionWrapper(myconn)
connection.setasdefault()


def loader(name, atts, fieldsep, rowsep, nullval, filehandle):
    # Bulk loader in the style of psycopg2's copy_from; note that MySQLdb cursors
    # do not provide copy_from, so with MySQL a LOAD DATA statement would be needed instead.
    curs = myconn.cursor()
    curs.copy_from(file=filehandle, table=name, sep=fieldsep, null=str(nullval), columns=atts)

# database
sgbstdn = CachedDimension(
	name='SGBSTDN',
	key = 'matricula',
	attributes = ['nombre','paterno', 'materno', 'degc_code','class_code'],
	lookupatts = ['matricula']
)
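
# A minimal sketch, not part of the original program, of how rows are typically
# ensured into the dimension defined above; the file name and the assumption
# that its columns match the attributes are hypothetical:
students = CSVSource(open('sgbstdn.csv', 'r', 16384), delimiter=',')
for row in students:
    sgbstdn.ensure(row)
connection.commit()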
Example #10
def load_dimensions(output_conn):
    dw_conn_wrapper = pygrametl.ConnectionWrapper(connection=output_conn)
    ret = dict()
    ret['dim_datetime'] = CachedDimension(name='dim_datetime',
                                          key='datetime_id',
                                          attributes=[
                                              'epoch', 'minute', 'minute_20',
                                              'minute_30', 'hour',
                                              'day_of_week', 'day_of_month',
                                              'week', 'month', 'year', 'period'
                                          ],
                                          lookupatts=['epoch'],
                                          size=0,
                                          prefill=True,
                                          targetconnection=dw_conn_wrapper)
    ret['dim_location'] = TypeOneSlowlyChangingDimension(
        name='dim_location',
        key='location_id',
        attributes=[
            'lookup_location',
            'initial_id',
            'company_code',
            'street',
            'ward',
            'district',
            'city',
            'area',
            'country',
            'level1flag',
            'level2flag',
            'level3flag',
            'level4flag',
            'level5flag',
            'level6flag',
        ],
        lookupatts=['lookup_location'],
        cachesize=0,
        prefill=True,
        targetconnection=dw_conn_wrapper)
    ret['dim_employee'] = TypeOneSlowlyChangingDimension(
        name='dim_employee',
        key='employee_id',
        attributes=[
            'lookup_employee', 'initial_id', 'company_code', 'login', 'name',
            'active', 'mobile', 'email'
        ],
        lookupatts=['lookup_employee'],
        cachesize=0,
        prefill=True,
        targetconnection=dw_conn_wrapper)
    ret['dim_partner'] = TypeOneSlowlyChangingDimension(
        name='dim_partner',
        key='partner_id',
        attributes=[
            'lookup_partner', 'initial_id', 'company_code', 'name', 'ref',
            'is_company', 'active', 'customer', 'supplier', 'employee',
            'state', 'seq', 'seq_order', 'street_id', 'classify', 'total_sh'
        ],
        lookupatts=['lookup_partner'],
        cachesize=0,
        prefill=True,
        targetconnection=dw_conn_wrapper)

    ret['dim_company'] = TypeOneSlowlyChangingDimension(
        name='dim_company',
        key='company_id',
        attributes=['company_code', 'company_name'],
        lookupatts=['company_code'],
        cachesize=0,
        prefill=True,
        targetconnection=dw_conn_wrapper)

    return ret
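
# A minimal sketch, not part of the original code, of how the returned dictionary
# is typically used; each value is a pygrametl dimension whose ensure() returns
# the surrogate key for a row (the connection and row contents are assumptions):
dims = load_dimensions(output_conn)
company_id = dims['dim_company'].ensure({'company_code': 'C01',
                                         'company_name': 'Example Co'})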
Example #11
def run_fact_etl(fact_name,
                 class_name,
                 pygram_fact_factory,
                 source_sql,
                 source_conn,
                 output_conn,
                 create_sql,
                 dimensions={}):
    # print current time
    print('current time is {}'.format(datetime.datetime.now()))

    # create connection to dw
    dw_conn_wrapper = pygrametl.ConnectionWrapper(connection=output_conn)
    # TODO: add try statement to raise error

    # create fact_table_object
    pygram_fact_class = pygram_fact_factory["class"]
    pygram_fact_object = pygram_fact_class(
        name=pygram_fact_factory["name"],
        measures=pygram_fact_factory["measures"],
        keyrefs=pygram_fact_factory["keyrefs"],
        targetconnection=dw_conn_wrapper)

    # create fact table by create_sql
    cursor = output_conn.cursor()
    logger.info('create {} if not exist'.format(fact_name))
    print('create {} if not exist'.format(fact_name))
    cursor.execute(create_sql)
    output_conn.commit()

    # create index for each item of primary key group
    logger.info('create index of {} if not exist'.format(fact_name))
    print('create index of {} if not exist'.format(fact_name))
    for keyref in pygram_fact_factory['keyrefs']:
        cursor.execute('''CREATE INDEX IF NOT EXISTS {}_{}_idx
                  ON {}({})'''.format(fact_name, keyref, fact_name, keyref))
    output_conn.commit()

    # Create data_source
    logger.info('start query {}'.format(fact_name))
    print('start query {}'.format(fact_name))
    data_source = SQLSource(connection=source_conn, query=source_sql)

    # handle fact
    final_source = transform_handle(class_name, fact_name, data_source)

    # ensure into fact table
    list_data_source = list(final_source)
    length_source = len(list_data_source)
    if length_source == 0:
        logger.info('no record in query period')
        print('no record in query period')
    else:
        count = 1
        for row in list_data_source:
            row = add_foreign_keys(row, pygram_fact_factory["keyrefs"],
                                   dimensions)
            # logger debug pkey and value of row
            dict_keyref = {}
            for keyref in pygram_fact_factory['keyrefs']:
                dict_keyref[keyref] = row[keyref]
            for measure in pygram_fact_factory['measures']:
                dict_keyref[measure] = row[measure]
            logger.debug('row {}:{}'.format(count, dict_keyref))
            # The row can then be inserted into the fact table
            pygram_fact_object.ensure(row)
            progress(count, length_source, status='{}'.format(fact_name))
            count += 1
    print('done')
    output_conn.commit()
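
# add_foreign_keys is referenced above but not defined in this example. A minimal
# sketch, an assumption rather than the original implementation, of what such a
# helper typically does: resolve each keyref through its dimension and store the
# returned surrogate key in the row (here dimensions is assumed to map keyref
# names to pygrametl dimension objects):
def add_foreign_keys(row, keyrefs, dimensions):
    for keyref in keyrefs:
        if keyref in dimensions:
            row[keyref] = dimensions[keyref].ensure(row)
    return row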
Example #12
def run_dimension_etl(dimension_name, class_name, pygram_dimension_factory,
                      source_sql, source_conn, output_conn, create_sql):
    """
    This function can be used in any kind of workflow (for example in a celery
    task) or in a simple main program.
    """
    # TODO: add null user to employee dimension
    # print current time
    print('current time is {}'.format(datetime.datetime.now()))
    # connection wrapper
    dw_conn_wrapper = pygrametl.ConnectionWrapper(connection=output_conn)

    # create dimension table by create_sql
    cursor = output_conn.cursor()
    logger.info('create {} if not exist'.format(dimension_name))
    print('create {} if not exist'.format(dimension_name))
    cursor.execute(create_sql)
    output_conn.commit()

    # create index for dimension
    logger.info('create index of {} if not exist'.format(dimension_name))
    print('create index of {} if not exist'.format(dimension_name))
    for lookupatt in pygram_dimension_factory['lookupatts']:
        cursor.execute('''CREATE INDEX IF NOT EXISTS {}_{}_idx
                      ON {}({})'''.format(dimension_name, lookupatt,
                                          dimension_name, lookupatt))
    output_conn.commit()

    # Create dimension
    pygram_dim_class = pygram_dimension_factory["class"]
    pygram_dim_object = pygram_dim_class(
        name=pygram_dimension_factory["name"],
        key=pygram_dimension_factory["key"],
        attributes=pygram_dimension_factory["attributes"],
        lookupatts=pygram_dimension_factory["lookupatts"],
        targetconnection=dw_conn_wrapper,
        cachesize=0,
        prefill=True)

    # TODO: handle datetime dimension here

    # Create data_source
    logger.info('start query {}'.format(dimension_name))
    print('start query {}'.format(dimension_name))
    if dimension_name in [
            'dim_datetime', 'dim_company', 'dim_call_center', 'dim_dong_ho_o',
            'dim_dong_ho_tong', 'dim_hoa_don_tai_chinh'
    ]:
        final_source = source_sql

    else:
        data_source = SQLSource(connection=source_conn, query=source_sql)
        final_source = transform_handle(class_name, dimension_name,
                                        data_source)

    # Ensure row into dimension
    list_data_source = list(final_source)
    length_source = len(list_data_source)
    count = 1
    for row in list_data_source:
        pygram_dim_object.scdensure(row)
        progress(count, length_source, status='{}'.format(dimension_name))
        count += 1
    print('done')

    output_conn.commit()
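
# A minimal sketch, not part of the original code, of the factory dictionary and
# call that run_dimension_etl expects; the names, SQL, and connections below are
# assumptions:
employee_factory = {
    'class': TypeOneSlowlyChangingDimension,
    'name': 'dim_employee',
    'key': 'employee_id',
    'attributes': ['lookup_employee', 'name', 'email'],
    'lookupatts': ['lookup_employee'],
}
run_dimension_etl('dim_employee', 'EmployeeTransform', employee_factory,
                  source_sql, source_conn, output_conn, create_sql)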
Example #13
    def run(self):
        self.file_name = True
        self.configure()
        if not self.file_name: return
        self.configureAudit()

        if self.verbose:
            print('Running job:', self.__class__.__name__)
            # print('Configuration:', self.conf)
            print('Arguments:', self.args)

        try:

            self.beforeJobAudit()

            counts = {
                'extract': 0,
                'insert': 0,
                'update': 0,
                'error': 0,
            }

            # Wrap the connection to use by pygrametl
            self.target_connection_wrap = pygrametl.ConnectionWrapper(
                connection=self.target_connection)
            source = self.getSource()
            target = self.getTarget()

            for row in source:
                # Catch any incompatible data warnings
                counts['extract'] += 1
                try:
                    prepared = self.prepareRow(row)

                    if self.verbose:
                        print('Inserting row:', prepared)

                    self.insertRow(target, prepared)
                    is_insert = True  # where to get this info
                    if is_insert:
                        counts['insert'] += 1
                    else:
                        counts['update'] += 1

                    if self.debug:
                        # Commit in debug mode so we can see inserted rows
                        self.target_connection_wrap.commit()
                        input(
                            'Row inserted successfully. Press Enter to continue...'
                        )
                except Exception as e:
                    counts['error'] += 1  # Should we log something here?
                    print(e)
                    self.logWarning(row, counts['extract'])
                    if self.verbose or self.debug:
                        print('Row could not be inserted due to an error.',
                              row)

                    if self.debug:
                        input('Press Enter to continue...')

            self.afterJobAudit(counts)
            if self.verbose:
                print("Commit the target database")
            self.target_connection.commit()
            self.target_connection.close()

        except Exception as e:
            self.logError(e)

        self.close()