DIM_ZORGVRAAG = CachedDimension(
    name='DIM.ZORGVRAAG',
    key='zgv_id',
    attributes=['zgv_dbc_specialisme_code', 'zgv_dbc_zorgvraag_code'],
    size=0,
    prefill=True)

FCT_SUBTRAJECT = BulkFactTable(
    name='FCT.SUBTRAJECT',
    keyrefs=['beh_id', 'dag_id_begindatum_zorgtraject',
             'dag_id_einddatum_zorgtraject', 'dag_id_begindatum_subtraject',
             'dag_id_einddatum_subtraject', 'dag_id_declaratiedatum',
             'dia_id', 'stn_id', 'zgt_id', 'zgv_id', 'zpr_id',
             'zvs_id_behandelend', 'zvs_id_verwijzend'],
    measures=['geslacht', 'heeft_oranje_zorgactiviteit',
              'heeft_zorgactiviteit_met_machtiging', 'is_hoofdtraject',
              'is_aanspraak_zvw', 'is_aanspraak_zvw_toegepast',
              'is_zorgactiviteitvertaling_toegepast',
              'fct_omzet_ziekenhuis', 'fct_omzet_honorarium_totaal'],
    nullsubst='',
    fieldsep='\t',
    rowsep='\r\n',
    usefilename=True,
    bulkloader=mssql_bulkloader)


def load_str_dot(file, config):
    """Load one subtraject file of WOB ZZ DOT.

    Main ETL method for WOB ZZ subtrajecten. Requires an active pygrametl
    connection as a global.
    """
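# mssql_bulkloader is referenced above but not defined in this excerpt; in
# the full script it would precede the tables that reference it. A minimal
# sketch, assuming a global pyodbc/pymssql connection named 'msconn'.
# pygrametl invokes the loader as
# bulkloader(name, attributes, fieldsep, rowsep, nullval, filehandle);
# because usefilename=True, the last argument is the temporary file's *name*,
# which is exactly what T-SQL BULK INSERT needs. The terminators are
# hardcoded to match fieldsep='\t' and rowsep='\r\n' above, and the file must
# be readable by the SQL Server instance itself.
def mssql_bulkloader(name, attributes, fieldsep, rowsep, nullval, filename):
    cursor = msconn.cursor()
    cursor.execute("BULK INSERT {} FROM '{}' WITH (FIELDTERMINATOR = '\\t', "
                   "ROWTERMINATOR = '\\r\\n')".format(name, filename))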
    key='product_skey',
    attributes=['product_key', 'product_name', 'category', 'energy',
                'carbohydrates', 'fat', 'protein', 'product_year'],
    lookupatts=['product_key'],
    rowexpander=producthandling)

datedim = CachedDimension(
    name='Date',
    key='date_key',
    attributes=['date', 'day_of_week', 'week_in_year', 'month', 'year',
                'weekend'],
    lookupatts=['date'])

facttbl = BulkFactTable(
    name='ProductPrice',
    keyrefs=['date_key', 'pp_key', 'product_key', 'location_key'],
    measures=['price'],  # TODO: determine if any other measures should be listed
    bulkloader=pgcopybulkloader,
    bulksize=5000000,
    nullsubst='N/A')  # TODO: replace this with a better NULL substitution value

# Data sources
# The buffer size is set to 16384 B, because this performs best, according to
# the pygrametl developers.
data_set = CSVSource(open(DATA_FILE, 'r', 16384), delimiter=',')
gdp_data_set = CSVSource(open(GDP_FILE, 'r', 16384), delimiter=',')
pop_data_set = CSVSource(open(POPULATION_FILE, 'r', 16384), delimiter=',')
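# pgcopybulkloader is referenced by the fact table above but is not part of
# this excerpt; in the full script it would precede it. A minimal sketch in
# the style of the pygrametl documentation, assuming a global psycopg2
# connection named 'connection': PostgreSQL's COPY ingests the buffered rows
# far faster than row-by-row INSERTs would.
def pgcopybulkloader(name, atts, fieldsep, rowsep, nullval, filehandle):
    curs = connection.cursor()
    curs.copy_from(file=filehandle, table=name, sep=fieldsep,
                   null=str(nullval), columns=atts)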
    attributes=['type', 'subtype'])

customer_dim = CachedDimension(
    name='smas_water_customer',
    key='custid',
    lookupatts=['accountno'],
    attributes=['accountno', 'street', 'city', 'province', 'postcode'])

meter_dim = CachedDimension(
    name='smas_water_meter',
    key='meterid',
    lookupatts=['meterno'],
    attributes=['meterno', 'latitude', 'longitude'])

hourlyreading_fact = BulkFactTable(
    name='smas_water_hourlyreading',
    keyrefs=['typeid', 'custid', 'meterid'],
    measures=['readtime', 'reading', 'temperature'],
    bulkloader=pgcopybulkloader,
    bulksize=500000)

dailyreading_fact = BulkFactTable(
    name='smas_water_dailyreading',
    keyrefs=['typeid', 'custid', 'meterid'],
    measures=['readtime', 'reading', 'temperature'],
    bulkloader=pgcopybulkloader,
    bulksize=500000)

# Data sources - change the path if you have your files somewhere else
#reading = SQLSource(connection,
#    'SELECT meterid, datavalue, endtime FROM water_meterreading_tmp',
#    names=['meterno', 'reading', 'readtimestamp'])
#
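# A hypothetical loading loop for the tables above, assuming the SQLSource is
# enabled and each source row also carries the customer and meter lookup
# fields. 'type_dim' is an assumed name for the dimension truncated at the
# top of this excerpt. ensure() returns the surrogate key, looking the member
# up first and inserting it only on a miss.
def load_hourly_readings(readings):
    for row in readings:
        row['typeid'] = type_dim.ensure(row)
        row['custid'] = customer_dim.ensure(row)
        row['meterid'] = meter_dim.ensure(row)
        hourlyreading_fact.insert(row)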
attributes=[ "gender", "is_active", "start_year", "version", "valid_from", "valid_to" ], lookupatts=["member_id"], versionatt="version", fromatt="valid_from", toatt="valid_to", srcdateatt="lastmoddate", cachesize=-1) # TODO: Use BulkFactTable or BatchFactTable fact_table = BulkFactTable( name="fct.sale", keyrefs=["fk_product_id", "fk_time_id", "fk_store_id", "fk_member_id"], measures=["price"], bulkloader=pgcopybulkloader) ### Dimension Filling ### def fill_product_dimension(): #id;name;price;active;deactivate_date;quantity;alcohol_content_ml;start_date for srcrow in product_source: try: dimrow = { 'product_id': srcrow['id'], 'name': re.sub('<[^<]+?>', '', srcrow['name']).strip() # Remove HTML tags
DIM_ZORGTYPE = CachedDimension(
    name='DIM.ZORGTYPE',
    key='zgt_id',
    attributes=['zgt_zorgtype_code'],
    size=0,
    prefill=True)

FCT_SUBTRAJECT = BulkFactTable(
    name='FCT.SUBTRAJECT',
    # NB: order matters for MS SQL bulk insert!
    keyrefs=['stn_id', 'dag_id_begindatum_zorgtraject',
             'dag_id_einddatum_zorgtraject', 'dag_id_begindatum_subtraject',
             'dag_id_einddatum_subtraject', 'dag_id_diagnose', 'zgt_id',
             'dia_id_primair', 'prg_id', 'psc_id', 'cct_id', 'afs_id',
             'lnd_id'],
    measures=['geslacht', 'diagnose_trekken_van', 'fct_verkoopprijs',
              'fct_tarief', 'fct_verrekenbedrag'],
    nullsubst='',
    fieldsep='\t',
    rowsep='\r\n',
    usefilename=True,
    bulkloader=mssql_bulkloader)


def load_subtraject(file, config):
    """Load one subtraject file of WOB GGZ.

    Main ETL method for WOB GGZ subtrajecten.
    """
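# A hypothetical sketch of the per-row work inside load_subtraject ('rows'
# stands for the records parsed from 'file'; the other keyrefs are assumed to
# be resolved the same way). Because the bulk file's columns are written in
# keyrefs + measures order and BULK INSERT maps fields to table columns
# positionally, that order must match the physical column order of
# FCT.SUBTRAJECT, hence the NB above. insert() only buffers the row;
# mssql_bulkloader runs when the buffer fills or on commit.
def _process_rows(rows):
    for row in rows:
        row['zgt_id'] = DIM_ZORGTYPE.ensure(row)
        FCT_SUBTRAJECT.insert(row)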
    key='testid',
    attributes=['testname', 'testauthor'],
    lookupatts=['testname'],
    prefill=True,
    defaultidvalue=-1)

datedim = CachedDimension(
    name='date',
    key='dateid',
    attributes=['date', 'day', 'month', 'year', 'week', 'weekyear'],
    lookupatts=['date'],
    rowexpander=datehandling)

facttbl = BulkFactTable(
    name='testresults',
    keyrefs=['pageid', 'testid', 'dateid'],
    measures=['errors'],
    bulkloader=pgcopybulkloader,
    bulksize=5000000)

# Data sources - change the path if you have your files somewhere else
downloadlog = CSVSource(open('../generator/DownloadLog.csv', 'r', 16384),
                        delimiter='\t')
testresults = CSVSource(open('../generator/TestResults.csv', 'r', 16384),
                        delimiter='\t')
inputdata = MergeJoiningSource(downloadlog, 'localfile',
                               testresults, 'localfile')


def main():
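    # Hypothetical sketch of the load loop (the original body is not part of
    # this excerpt): resolve each dimension key on the merged row, then
    # buffer the fact row. 'pagedim' is an assumed name for a page dimension
    # defined before this excerpt, 'testdim' for the dimension truncated at
    # the top. datehandling is called by datedim.ensure() on a lookup miss to
    # fill in the derived date attributes before the row is inserted.
    for row in inputdata:
        row['pageid'] = pagedim.ensure(row)
        row['testid'] = testdim.ensure(row)
        row['dateid'] = datedim.ensure(row)
        facttbl.insert(row)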
ssbsect_algo = CachedDimension(
    name='SSBSECT',
    key='crn',
    attributes=['cvemat', 'grupo', 'levl_code', 'coll_code', 'dept_code'])

# Merge the tables that are joined through the foreign key
ssbsect = SnowflakedDimension([(ssbsect_algo, scbcrse)])

facttabl = BulkFactTable(
    name='aaa',
    keyrefs=['matricula', 'cvemat', 'crn'],
    measures=['errors'],
    bulkloader=loader)

# Where the data comes from // change the paths to those of your database
resSGBSTDN = CSVSource(open('./vistaSGBSTDN.csv', 'r', 16384), delimiter='\t')
resSCBCRSE = CSVSource(open('./vistaSCBCRSE.csv', 'r', 16384), delimiter='\t')
resSSBSECT = CSVSource(open('./vistaSSBSECT.csv', 'r', 16384), delimiter='\t')
inputdata = MergeJoiningSource(resSCBCRSE, 'cvemat', resSSBSECT, 'cvemat')


def main():
    for row in inputdata:
        print(row)
        # ensure() with a namemapping: the dimension attribute 'nommat' is
        # read from the row field 'hola'
        row['cvemat'] = scbcrse.ensure(row, {'nommat': 'hola'})
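        # Hypothetical continuation sketch (the rest of the loop is not part
        # of this excerpt): resolve the snowflaked section dimension the same
        # way, then buffer the fact row.
        row['crn'] = ssbsect.ensure(row)
        facttabl.insert(row)


# Typical pygrametl teardown (sketch; 'conn' is the assumed ConnectionWrapper
# opened when the script connected to the target database). Committing also
# flushes the BulkFactTable buffer through the bulkloader.
if __name__ == '__main__':
    main()
    conn.commit()
    conn.close()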