예제 #1
0
def func1():
    """Load the student and score CSV files into two MySQL dimensions.

    Opens a MySQL connection, registers it as pygrametl's default target
    connection, merge-joins ``./resource/student.txt`` with
    ``./resource/studentscore.txt`` on their ``no`` column, ensures every
    joined row exists in the ``Student`` and ``StudentScore`` dimensions,
    and finally commits.

    The input files are opened with ``open`` inside a ``with`` statement so
    both handles are closed even if loading a row fails. ``open`` is used
    instead of ``file`` because ``file`` only exists on Python 2.
    """
    # NOTE(review): host and credentials are hard-coded here; consider moving
    # them to configuration.
    pgconn = MySQLdb.Connect(host="192.168.1.42", port=3307, user="******",
                             passwd="0.618", db="test", charset="utf8")
    connection = pygrametl.ConnectionWrapper(pgconn)
    connection.setasdefault()

    studentdim = CachedDimension(name="Student",
                                 key="id",
                                 attributes=("studentid", 'name', "birthday"),
                                 lookupatts=("studentid",))

    scoredim = CachedDimension(name="StudentScore",
                               key="id",
                               attributes=("studentid", 'coursename', "score"),
                               lookupatts=("studentid",))

    with open("./resource/student.txt", 'r', 100000) as student_file, \
            open("./resource/studentscore.txt", 'r', 100000) as score_file:
        student = CSVSource(student_file, delimiter=',')
        score = CSVSource(score_file, delimiter=',')

        # Join the two sources on the 'no' column of each file.
        # NOTE(review): a merge join presumably needs both files sorted on
        # 'no' -- confirm against the input data.
        mjdata = MergeJoiningSource(student, 'no', score, 'no')

        for row in mjdata:
            # Convert the textual birthday (YYYY-MM-DD) into a date object.
            row['birthday'] = datetime.strptime(row['birthday'],
                                                '%Y-%m-%d').date()
            # The name mappings tell each dimension which source column
            # feeds which dimension attribute.
            studentdim.ensure(row, {'studentid': 'no'})
            scoredim.ensure(row, {'studentid': 'no', 'coursename': 'course'})

    connection.commit()
예제 #2
0
def func1():
    """Load adjusted stock prices from a CSV file into ``SFM_StockAdjPrices``.

    Opens a MySQL connection, registers it as pygrametl's default target
    connection, reads ``d:\\temp\\ReHis000001.csv`` row by row, stamps each
    row with a fixed ``MarketCode`` and ``PriceAdjType``, ensures it exists
    in the dimension, and finally commits.

    The CSV file is opened with ``open`` inside a ``with`` statement so the
    handle is always closed. The path is written as a ``u"..."`` literal with
    escaped backslashes: it is the same string as the former ``ur"..."``
    literal, which is a syntax error on Python 3.
    """
    # NOTE(review): host and credentials are hard-coded here; consider moving
    # them to configuration.
    pgconn = MySQLdb.Connect(host="192.168.1.42", port=3307, user="******",
                             passwd="0.618",
                             db="Data_Fundamental_Master_Genius",
                             charset="utf8")
    connection = pygrametl.ConnectionWrapper(pgconn)
    connection.setasdefault()

    rehabdim = CachedDimension(
        name="SFM_StockAdjPrices",
        key="RecordID",
        attributes=("MarketCode", 'StockListID', "TradeDate",
                    "PreClosePrice", "OpenPrice", "ClosePrice", "HighPrice",
                    "LowPrice", "PriceAdjType"),
        lookupatts=("MarketCode", 'StockListID', "TradeDate", "PriceAdjType"))

    # Maps dimension attribute names to the column names used in the CSV.
    # NOTE(review): "OpenPrice" -> "OpenClosePrice" looks like a possible typo
    # for the source column name -- verify against the CSV header.
    title_mapping = {"StockListID": "stockcode",
                     "TradeDate": "tradedate",
                     "PreClosePrice": "preClosePrice",
                     "OpenPrice": "OpenClosePrice",
                     "ClosePrice": "ClosePrice"}

    with open(u"d:\\temp\\ReHis000001.csv", 'r', 10000) as rehab_file:
        rehabdata = CSVSource(rehab_file, delimiter=',')

        for row in rehabdata:
            # These two values do not come from the file; every row gets the
            # same constants.
            row['MarketCode'] = 102
            row['PriceAdjType'] = 1
            rehabdim.ensure(row, title_mapping)

    connection.commit()
예제 #3
0
파일: dm.py 프로젝트: yanpan2017/PYELT
    def cls_to_pygram_dim(cls, schema_name, lookup_fields=[]):
        """Build the pygrametl dimension object for this class.

        The dimension is named ``<schema_name>.<cls.cls_get_name()>``, uses
        ``'id'`` as its key and ``cls.cls_get_column_names_no_id()`` as its
        attributes.

        Note that the ``lookup_fields`` argument is accepted but not used:
        it is always replaced by ``cls.cls_get_lookup_fields()``.

        Returns a ``CachedDimension`` (caching full rows) when the class
        reports lookup fields, otherwise a plain ``Dimension``.
        """
        lookup_fields = cls.cls_get_lookup_fields()
        table_name = schema_name + '.' + cls.cls_get_name()
        column_names = cls.cls_get_column_names_no_id()

        # Without lookup fields there is nothing to key a cache on.
        if not lookup_fields:
            return Dimension(name=table_name,
                             key='id',
                             attributes=column_names)

        return CachedDimension(name=table_name,
                               key='id',
                               attributes=column_names,
                               lookupatts=lookup_fields,
                               cachefullrows=True)
예제 #4
0
    # Convert the date from a string to a python `Date` object.
    date = datetime.strptime(date, '%Y-%m-%d').date()
    row['location_year'] = date.year

    # The year for which to retrieve the GDP is hard-coded to simplify the ETL
    # process, and because the data only covers 2012.
    row['gdp'] = pygrametl.getvalue(row, '2012', namemapping)

    return row

# Data dimensions

# Location members are looked up by 'location_key'; `locationhandling` is
# registered as the dimension's row expander.
locationdim = CachedDimension(
    name='Location',
    key='location_skey',
    attributes=[
        'location_type',
        'location_key',
        'city',
        'country',
        'gdp',
        'population',
        'life_expectancy',
        'anav_income',
        'location_year',
    ],
    lookupatts=['location_key'],
    rowexpander=locationhandling,
)

# Product members are looked up by 'product_key'; `producthandling` is
# registered as the dimension's row expander.
productdim = CachedDimension(
    name='Product',
    key='product_skey',
    attributes=[
        'product_key',
        'product_name',
        'category',
        'energy',
        'carbohydrates',
        'fat',
        'protein',
        'product_year',
    ],
    lookupatts=['product_key'],
    rowexpander=producthandling,
)

datedim = CachedDimension(
    name='Date',
    key='date_key',
예제 #5
0
    }

# Open the database connection and register it as pygrametl's default target.
cnx = sql.connect(**login)
cur = cnx.cursor()
connection = etl.ConnectionWrapper(cnx)
connection.setasdefault()


# Dimension objects used by the ETL.
# Naming convention:
# - the Python objects are called DIM_xxx / FCT_yyy
# - the MS SQL tables they map to are called DIM.xxx / FCT.yyy (schema.table)
# NOTE(review): per the pygrametl documentation a non-positive ``size`` should
# mean an unbounded cache -- confirm.
DIM_AFSLUITREDEN = CachedDimension(
    name='DIM.AFSLUITREDEN',
    key='afs_id',
    attributes=['afs_afsluitreden_code'],
    size=0,
    prefill=True,
)

DIM_BEHANDELING = CachedDimension(
    name='DIM.BEHANDELING',
    key='beh_id',
    attributes=[
        'beh_dbc_specialisme_code',
        'beh_dbc_behandeling_code',
    ],
    size=0,
    prefill=True,
)

DIM_DAG = CachedDimension(
    name='DIM.DAG',
    key='dag_id',
예제 #6
0
    #               null=str(nullval), columns=atts)


def datehandling(row):
    """Derive ``readdate`` and ``readtime`` from the row's ``EndTime``.

    ``row['EndTime']`` is parsed as ``YYYY-MM-DD HH:MM:SS``. The row is then
    updated in place with ``readdate`` (the date part as a ``YYYY-MM-DD``
    string) and ``readtime`` (the hour as an integer). Nothing is returned.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    date_format = '%Y-%m-%d'

    end_time = datetime.datetime.strptime(row['EndTime'], timestamp_format)

    row['readdate'] = end_time.strftime(date_format)
    row['readtime'] = end_time.hour


def accounttypehandling(row):
    """Set ``row['type']`` to the part of ``row['subtype']`` before the first '-'.

    When the subtype contains no '-', the whole subtype is used. The row is
    updated in place and nothing is returned.
    """
    subtype_parts = row['subtype'].split('-')
    main_type = subtype_parts[0]
    row['type'] = main_type


# Usage types, looked up by their 'subtype'.
usage_type_dim = CachedDimension(
    name='smas_water_Type',
    key='typeid',
    attributes=['type', 'subtype'],
    lookupatts=['subtype'],
)

# Customers, looked up by their account number.
customer_dim = CachedDimension(
    name='smas_water_customer',
    key='custid',
    attributes=[
        'accountno',
        'street',
        'city',
        'province',
        'postcode',
    ],
    lookupatts=['accountno'],
)

# Meters, looked up by their meter number.
meter_dim = CachedDimension(
    name='smas_water_meter',
    key='meterid',
    attributes=['meterno', 'latitude', 'longitude'],
    lookupatts=['meterno'],
)

hourlyreading_fact = BulkFactTable(
    name='smas_water_hourlyreading',
예제 #7
0
                                                "activate_date",
                                                "deactivate_date", "version",
                                                "valid_from", "valid_to"
                                            ],
                                            lookupatts=["name"],
                                            versionatt="version",
                                            fromatt="valid_from",
                                            toatt="valid_to",
                                            srcdateatt="lastmoddate",
                                            cachesize=-1)

# Time dimension: one member per (year, month, day, hour) combination.
time_dimension = CachedDimension(
    name='dim.time',
    key='time_id',
    attributes=[
        't_year',
        't_month',
        't_day',
        't_hour',
        'day_of_week',
        'is_fall_semester',
        'is_holiday',
        't_timestamp',
    ],
    lookupatts=['t_year', 't_month', 't_day', 't_hour']
    # rowexpander=time_rowexpander  (currently disabled)
)

# Store dimension: versioned members looked up by 'name'. The version number
# and validity interval live in 'version', 'valid_from' and 'valid_to'; the
# source row's 'lastmoddate' is used as the source date.
store_dimension = SlowlyChangingDimension(
    name="dim.store",
    key="store_id",
    attributes=[
        "name",
        "description",
        "version",
        "valid_from",
        "valid_to",
    ],
    lookupatts=["name"],
    versionatt="version",
    fromatt="valid_from",
    toatt="valid_to",
    srcdateatt="lastmoddate",
    cachesize=-1,
)
예제 #8
0
    def run(self):
        with self.output().open() as f:

"""

# The actual database connection is handled using a PEP 249 connection
pgconn = psycopg2.connect("""dbname='DwColegio' user='******'""")

# A reference to the ConnectionWrapper is kept so it can be reached easily
# later on.
# NOTE(review): no explicit setasdefault() call is made here; pygrametl is
# expected to make the first wrapper created the default one -- confirm.
conn = pygrametl.ConnectionWrapper(connection=pgconn)

# Date dimension: a member is identified by its (semestre, ano) pair.
dim_fecha = CachedDimension(name='DimFecha',
                            key='id',
                            attributes=['semestre', 'ano'],
                            lookupatts=['semestre', 'ano'],
                            prefill=True)

# School dimension. The lookup attributes must be columns of this dimension;
# the previous value ['semestre', 'ano'] was copied from dim_fecha and names
# columns that are not among this dimension's attributes.
# NOTE(review): all attributes are used for the lookup, mirroring dim_fecha;
# narrow this to ['nombre'] if the name alone identifies a school.
dim_colegio = CachedDimension(name='DimColegio',
                              key='id',
                              attributes=['nombre', 'financiamiento'],
                              lookupatts=['nombre', 'financiamiento'],
                              prefill=True)


#create fecha table
def duprange(f, l):
    """Yield every integer in ``range(f, l)`` twice in a row.

    For example ``duprange(0, 3)`` produces ``0, 0, 1, 1, 2, 2``.
    """
    for value in range(f, l):
        for _ in (0, 1):
            yield value
예제 #9
0
    'port': config.get('local_mssql', 'port'),
    'database': config.get('wob_ggz', 'database')
}

# Open the database connection and register it as pygrametl's default target.
cnx = sql.connect(**login)
cur = cnx.cursor()
connection = etl.ConnectionWrapper(cnx)
connection.setasdefault()

# Dimension objects used by the ETL.
# Naming convention:
# - the Python objects are called DIM_xxx / FCT_yyy
# - the MS SQL tables they map to are called DIM.xxx / FCT.yyy (schema.table)
# NOTE(review): per the pygrametl documentation a non-positive ``size`` should
# mean an unbounded cache -- confirm.
DIM_AFSLUITREDEN = CachedDimension(
    name='DIM.AFSLUITREDEN',
    key='afs_id',
    attributes=['afs_afsluitreden_code'],
    size=0,
    prefill=True,
)

DIM_CIRCUIT = CachedDimension(
    name='DIM.CIRCUIT',
    key='cct_id',
    attributes=['cct_circuit_code'],
    size=0,
    prefill=True,
)

DIM_DAG = CachedDimension(
    name='DIM.DAG',
    key='dag_id',
    attributes=['dag_datum'],
    size=0,
    prefill=True,
)
예제 #10
0
    # Take the 'www.domain.org' part from 'http://www.domain.org/page.html'
    # We also keep the host name ('www') in the domain in this example.
    domaininfo = row['url'].split('/')[-2]
    row['domain'] = domaininfo
    # Take the top level which is the last part of the domain
    row['topleveldomain'] = domaininfo.split('.')[-1]


def extractserverinfo(row):
    """Set ``row['server']`` to the server name found in ``row['serverversion']``.

    The server name is the text before the first '/' of a string such as
    "ServerName/Version". The row is updated in place; nothing is returned.
    """
    name_and_version = row['serverversion'].split('/')
    row['server'] = name_and_version[0]


# Dimension and fact table objects

# Top-level domains; no lookupatts are given for this dimension.
topleveldim = CachedDimension(
    name='topleveldomain',
    key='topleveldomainid',
    attributes=['topleveldomain'],
)

# Domains reference their top-level domain and are looked up by 'domain' only.
domaindim = CachedDimension(
    name='domain',
    key='domainid',
    attributes=['domain', 'topleveldomainid'],
    lookupatts=['domain'],
)

# Servers; no lookupatts are given for this dimension.
serverdim = CachedDimension(
    name='server',
    key='serverid',
    attributes=['server'],
)

# Server versions reference the server they belong to.
serverversiondim = CachedDimension(
    name='serverversion',
    key='serverversionid',
    attributes=['serverversion', 'serverid'],
)
# Connection to target DW:
import MySQLdb
# Open the MySQL connection and register its wrapper as pygrametl's default
# target connection.
# NOTE(review): credentials are hard-coded here; consider moving them to
# configuration.
myconn = MySQLdb.connect(
    user='******',
    passwd='hola',
    db='Estadisticas',
)
connection = pygrametl.ConnectionWrapper(myconn)
connection.setasdefault()


def loader(name, atts, fieldsep, rowsep, nullval, filehandle):
    """Bulk-load the contents of ``filehandle`` into table ``name``.

    The file is handed to the cursor's ``copy_from`` with ``fieldsep`` as the
    field separator, ``str(nullval)`` as the NULL marker and ``atts`` as the
    column list. ``rowsep`` is accepted but not used.

    NOTE(review): ``MySQLConnection`` is not defined in the visible part of
    this file, and ``copy_from`` appears to be a psycopg2 (PostgreSQL) cursor
    method, whereas the connection opened nearby is a MySQLdb one -- confirm
    that this loader can actually run against the intended database.
    """
    cursor = MySQLConnection.cursor()
    cursor.copy_from(
        file=filehandle,
        table=name,
        sep=fieldsep,
        null=str(nullval),
        columns=atts,
    )

# Database dimensions

# SGBSTDN: keyed and looked up by 'matricula'.
# NOTE(review): 'matricula' is both the key and the only lookup attribute but
# is not listed among the attributes -- confirm this is intended.
sgbstdn = CachedDimension(
    name='SGBSTDN',
    key='matricula',
    attributes=[
        'nombre',
        'paterno',
        'materno',
        'degc_code',
        'class_code',
    ],
    lookupatts=['matricula'],
)

# SCBCRSE: keyed and looked up by 'cvemat'.
scbcrse = CachedDimension(
    name='SCBCRSE',
    key='cvemat',
    attributes=[
        'nommat',
        'clase',
        'lab',
        'unidades',
    ],
    lookupatts=['cvemat'],
)


ssbsect_algo = CachedDimension(
	name = 'SSBSECT',
	key = 'crn',
	attributes = ['cvemat', 'grupo', 'levl_code', 'coll_code', 'dept_code']
예제 #12
0
def load_dimensions(output_conn):
    """Create the warehouse dimension objects on top of ``output_conn``.

    ``output_conn`` is wrapped in a pygrametl ``ConnectionWrapper`` and that
    wrapper is passed as ``targetconnection`` to every dimension.

    Returns a dict mapping each dimension table name (``'dim_datetime'``,
    ``'dim_location'``, ``'dim_employee'``, ``'dim_partner'``,
    ``'dim_company'``) to its dimension object.
    """
    wrapper = pygrametl.ConnectionWrapper(connection=output_conn)

    def type_one(table, surrogate_key, columns, natural_key):
        # Every Type-1 dimension below uses the same cache, prefill and
        # connection settings; only names and columns differ.
        return TypeOneSlowlyChangingDimension(
            name=table,
            key=surrogate_key,
            attributes=columns,
            lookupatts=[natural_key],
            cachesize=0,
            prefill=True,
            targetconnection=wrapper)

    dimensions = {}

    # The dimensions are created in the same order as the keys are listed in
    # the docstring.
    dimensions['dim_datetime'] = CachedDimension(
        name='dim_datetime',
        key='datetime_id',
        attributes=[
            'epoch', 'minute', 'minute_20', 'minute_30', 'hour',
            'day_of_week', 'day_of_month', 'week', 'month', 'year', 'period',
        ],
        lookupatts=['epoch'],
        size=0,
        prefill=True,
        targetconnection=wrapper)

    dimensions['dim_location'] = type_one(
        'dim_location',
        'location_id',
        [
            'lookup_location', 'initial_id', 'company_code', 'street', 'ward',
            'district', 'city', 'area', 'country', 'level1flag', 'level2flag',
            'level3flag', 'level4flag', 'level5flag', 'level6flag',
        ],
        'lookup_location')

    dimensions['dim_employee'] = type_one(
        'dim_employee',
        'employee_id',
        [
            'lookup_employee', 'initial_id', 'company_code', 'login', 'name',
            'active', 'mobile', 'email',
        ],
        'lookup_employee')

    dimensions['dim_partner'] = type_one(
        'dim_partner',
        'partner_id',
        [
            'lookup_partner', 'initial_id', 'company_code', 'name', 'ref',
            'is_company', 'active', 'customer', 'supplier', 'employee',
            'state', 'seq', 'seq_order', 'street_id', 'classify', 'total_sh',
        ],
        'lookup_partner')

    dimensions['dim_company'] = type_one(
        'dim_company',
        'company_id',
        ['company_code', 'company_name'],
        'company_code')

    return dimensions
예제 #13
0
    def __create_tables(self):
        """Build the pygrametl dimension and fact-table objects and store them on ``self``.

        Besides the cached dimensions, a non-cached ``Dimension`` is created
        for each of the two station tables; those use the same table names
        and columns as their cached counterparts.
        """
        # Column layouts shared by the cached and non-cached station
        # dimensions. Each dimension receives its own list copy.
        start_station_columns = (
            'system_id', 'start_station_short_name', 'start_station_name',
            'start_station_latitude', 'start_station_longitude',
            'start_station_capacity',
        )
        start_station_lookup = ('system_id', 'start_station_short_name')
        end_station_columns = (
            'system_id', 'end_station_short_name', 'end_station_name',
            'end_station_latitude', 'end_station_longitude',
            'end_station_capacity',
        )
        end_station_lookup = ('system_id', 'end_station_short_name')

        # --- Systems
        self.system_dimension = CachedDimension(
            name='system',
            key='system_id',
            attributes=['system_name'],
            lookupatts=['system_name'],
        )

        # --- Stations (cached)
        self.start_station_dimension = CachedDimension(
            name='start_station',
            key='start_station_id',
            attributes=list(start_station_columns),
            lookupatts=list(start_station_lookup),
            rowexpander=start_station_missing_data_expander,
        )
        self.end_station_dimension = CachedDimension(
            name='end_station',
            key='end_station_id',
            attributes=list(end_station_columns),
            lookupatts=list(end_station_lookup),
            rowexpander=end_station_missing_data_expander,
        )

        # --- Trip dates and times
        self.date_dimension = CachedDimension(
            name='bdate',
            key='date_id',
            attributes=['year', 'month', 'day', 'day_of_week', 'date_string'],
            lookupatts=['date_string'],
            rowexpander=date_row_expander,
        )
        self.time_dimension = CachedDimension(
            name='btime',
            key='time_id',
            attributes=['hour', 'minute', 'time_string', 'time_of_day'],
            lookupatts=['time_string'],
            rowexpander=time_row_expander,
        )

        # --- Trips
        self.trip_fact_table = FactTable(
            name='trips',
            measures=['duration_s'],
            keyrefs=[
                'system_id',
                'start_station_id',
                'end_station_id',
                'date_id',
                'time_id',
                'customer_birthyear_id',
                'customer_gender_id',
                'customer_type_id',
                'bike_id',
                'trip_category_id',
            ],
        )

        # --- Weather facts, referencing the system and date dimensions
        self.weather_fact_table = FactTable(
            name='weather',
            measures=[
                'precipitation_in',
                'snow_in',
                'temp_avg_f',
                'temp_min_f',
                'temp_max_f',
                'wind_mph',
            ],
            keyrefs=['system_id', 'date_id'],
        )

        # --- Trip categories and bikes
        self.trip_category = CachedDimension(
            name='trip_category',
            key='trip_category_id',
            attributes=['trip_category'],
        )
        self.bike_dimension = CachedDimension(
            name='bikes',
            key='bike_id',
            attributes=['system_id', 'bike_name'],
            lookupatts=['system_id', 'bike_name'],
            defaultidvalue=-1,
        )

        # --- Customer attributes
        self.customer_gender_dimension = CachedDimension(
            name='customer_gender',
            key='customer_gender_id',
            attributes=['customer_gender'],
            lookupatts=['customer_gender'],
        )
        self.customer_birthyear_dimension = CachedDimension(
            name='customer_birthyear',
            key='customer_birthyear_id',
            attributes=['customer_birthyear'],
            lookupatts=['customer_birthyear'],
        )
        self.customer_type_dimension = CachedDimension(
            name='customer_type',
            key='customer_type_id',
            attributes=['customer_type'],
            lookupatts=['customer_type'],
        )

        # --- Station status
        self.station_status_fact_table = FactTable(
            name='station_status',
            keyrefs=['system_id', 'start_station_id', 'date_id', 'time_id'],
            measures=['bikes_available', 'docks_available'],
        )

        # --- Non-cached versions of the station dimensions, for use only
        # when updating Indego stations.
        self.start_station_noncached_dimension = Dimension(
            name='start_station',
            key='start_station_id',
            attributes=list(start_station_columns),
            lookupatts=list(start_station_lookup),
            rowexpander=start_station_missing_data_expander,
            defaultidvalue=-1,
        )
        self.end_station_noncached_dimension = Dimension(
            name='end_station',
            key='end_station_id',
            attributes=list(end_station_columns),
            lookupatts=list(end_station_lookup),
            rowexpander=end_station_missing_data_expander,
            defaultidvalue=-1,
        )