def move_column(self, column, index):
    """Reposition a column within the table.

    `Args:`
        column: str
            The column name to move
        index:
            The new index for the column

    `Returns:`
        `Parsons Table` and also updates existing object.
    """
    # petl returns a new lazy view; keep it as the backing table and
    # return self so calls can be chained fluently.
    moved = petl.movefield(self.table, column, index)
    self.table = moved
    return self
# ETL script: extract customer records from JSON, CSV and SQL Server,
# normalise them in a staging area, then prepare a MySQL load.
import pyodbc
import petl as etl
import pymysql as mysql

# ---------- JSON extraction and mapping ----------
tableJ = etl.fromjson(
    'cust_data.json',
    header=['id', 'gender', 'first_name', 'last_name', 'email', 'ville'],
)
# Align the JSON column order with the other sources (move gender after email).
tableJ = etl.movefield(tableJ, 'gender', 4)

# ---------- CSV extraction and conversion ----------
tableCSV = etl.fromcsv('week_cust.csv')
tableCSV = etl.convert(tableCSV, 'id', int)

# ---------- SQL Server connection and extraction ----------
connectionSqlServer = pyodbc.connect(
    "Driver={SQL Server Native Client 11.0};"
    "Server=81_64_msdn;"
    "Database=BD4client;"
    "Trusted_Connection=yes;"
    "convert_unicode =True;"
)
# try/finally guarantees the cursor and connection are released even if
# the query fails (the original leaked both on error).
try:
    cursor = connectionSqlServer.cursor()
    try:
        cursor.execute(
            'SELECT id, first_name, last_name, email, gender, ville FROM client_DATA'
        )
        tableSqlServer = cursor.fetchall()
    finally:
        cursor.close()
finally:
    connectionSqlServer.close()

# Prepend a header row so petl can treat the result like the other tables.
tableSqlServer = [('id', 'first_name', 'last_name',
                   'email', 'gender', 'ville')] + tableSqlServer

# ---------- Staging area: concatenation and transformation ----------
StagingArea = etl.cat(tableCSV, tableJ, tableSqlServer)
# Normalise gender codes across all three sources; None becomes 'N'.
StagingArea = etl.convert(
    StagingArea, 'gender',
    {'Male': 'M', 'Female': 'F', 'male': 'M', 'female': 'F', None: 'N'},
)
StagingArea = etl.rename(StagingArea, 'ville', 'city')

# ---------- MySQL target connection ----------
connection = mysql.connect(host="localhost", user="******",
                           passwd="", db="customerdatabase")
curseur = connection.cursor()
# ANSI_QUOTES lets double-quoted identifiers be used in the load statements.
curseur.execute('SET SQL_MODE=ANSI_QUOTES')
# Load data, assuming table " CustomerData" already exists in the database.
def dimension_values():
    """Populate the ``fact_source`` fact table from KNMI and Luchtmeetnet data.

    Loads the component, location and time dimension tables from the
    ``voyager`` Postgres database, then pages through both source datasets
    in 1000-row chunks, reshapes each chunk into
    (id_component, id_location, id_time, value, ...) rows and appends them
    to ``fact_source`` via SQLAlchemy.

    Relies on module-level ``psycopg2``, ``create_engine``, ``etl`` (petl),
    ``pd`` (pandas) and the ``custom_to_datetime`` helper defined elsewhere
    in this project.
    """
    connection = psycopg2.connect(dbname='voyager', user='******',
                                  password='******', host='172.16.0.45')
    engine = create_engine('postgresql://*****:*****@172.16.0.45:5432/voyager')

    # --- dimension lookups -------------------------------------------------
    com = 'select id as id_component, name as component from dim_com'
    table_com = etl.fromdb(connection, com)
    loc = 'select id as id_location, name as name from dim_loc'
    table_loc = etl.fromdb(connection, loc)
    tim = 'select id as id_time, time as timestamp from dim_time'
    table_time = etl.fromdb(connection, tim)
    print(table_com)
    print(table_loc)
    print(table_time)

    # --- KNMI weather-station data, paged 1000 rows at a time --------------
    for ran in range(0, 65424, 1000):
        sql = "select * from KNMI_station_data kk " \
              "RIGHT JOIN weatherstations w ON " \
              " CAST (kk.weather_station_id AS INTEGER) = CAST (w.station_number AS INTEGER) " \
              "WHERE w.station_number NOT LIKE \'NL%%\' AND date > 20190901 LIMIT 1000 OFFSET %s" % ran
        print(sql)
        table = etl.fromdb(connection, sql)
        print('knmi')
        print(table)
        # NOTE(review): the table returned by log_progress() is discarded,
        # so this call has no effect on `table` — confirm it is intentional.
        table.log_progress()
        # date/hour become strings so they can be concatenated into a timestamp.
        table = etl.convert(table, 'date', str)
        table = etl.convert(table, 'hour', str)
        # Measurements arrive as tenths of a unit: cast to int, then scale.
        # (Replaces twelve duplicated convert calls with one loop.)
        for col in ('temperature', 'temperature_dew', 'temperature_min',
                    'wind_speed_avg', 'wind_speed', 'wind_speed_max'):
            table = etl.convert(table, col, int)
            table = etl.convert(table, col, lambda v: v / 10)
        df = pd.DataFrame(table)
        df.columns = df.iloc[0]  # first row carries the petl header
        df = df.drop(0)
        df['timestamp'] = df['date'] + df['hour']
        df['weather_station_id'] = df['weather_station_id'].astype(str)
        df['timestamp'] = df['timestamp'].apply(custom_to_datetime)
        df['timestamp'] = df['timestamp'].astype(str)
        # columns= already selects the axis; the redundant axis=1 is dropped.
        df = df.drop(columns=['date', 'hour'])
        final_knmi_table = etl.fromdataframe(df)
        # Melt measurement columns into (component, value) rows.
        final_knmi_table = etl.melt(final_knmi_table, key=[
            'weather_station_id', 'timestamp', 'id', 'latitude', 'longitude',
            'name', 'station_number', 'data_source_id', 'altitude'
        ])
        final_knmi_table = etl.rename(final_knmi_table, 'variable', 'component')
        print(final_knmi_table)
        # Join against the three dimension tables to pick up surrogate keys.
        final_knmi_table2 = etl.join(final_knmi_table, table_com, key='component')
        final_knmi_table2 = etl.join(final_knmi_table2, table_loc, key='name')
        final_knmi_table2 = etl.join(final_knmi_table2, table_time, key='timestamp')
        print('dos')
        print(final_knmi_table2)
        df = pd.DataFrame(final_knmi_table2)
        df.columns = df.iloc[0]
        df = df.drop(0)
        fact_source = df[['id_component', 'id_location', 'id_time', 'value',
                          'data_source_id', 'weather_station_id']]
        print(fact_source)
        fact_source.to_sql('fact_source', engine, if_exists='append',
                           index=False, method='multi')

    # --- Luchtmeetnet data, paged 1000 rows at a time ----------------------
    for rn in range(0, 1148, 1000):
        print('lmn')
        final_lmn_table = etl.fromdb(
            connection,
            "select ld.id, ld.station_number, ld.value, ld.timestamp, ls.name as component, "
            "ws.id as lid, ws.latitude, ws.longitude, ws.data_source_id, ws.altitude, ws.name as name"
            " from luchtmeetnet_data ld "
            "right join luchtmeetnet_sensors ls on ld.formula = ls.formula "
            " join weatherstations ws on ld.station_number = ws.station_number "
            "where ws.station_number like \'NL%%\' AND timestamp > '2019-09-01' "
            "LIMIT 1000 OFFSET %s" % rn)
        final_lmn_table = etl.rename(final_lmn_table,
                                     {'station_number': 'weather_station_id'})
        final_lmn_table = etl.movefield(final_lmn_table, 'timestamp', 1)
        final_lmn_table2 = etl.join(final_lmn_table, table_com, key='component')
        final_lmn_table2 = etl.join(final_lmn_table2, table_loc, key='name')
        df = pd.DataFrame(final_lmn_table2)
        df.columns = df.iloc[0]
        df = df.drop(0)
        # Drop the last six characters of the timestamp — presumably a UTC
        # offset suffix — so it matches dim_time keys; TODO confirm format.
        df['timestamp'] = df['timestamp'].str[:-6]
        final_lmn_table2 = etl.fromdataframe(df)
        final_lmn_table2 = etl.join(final_lmn_table2, table_time, key='timestamp')
        print(final_lmn_table2)
        final_lmn_df = pd.DataFrame(final_lmn_table2)
        final_lmn_df.columns = final_lmn_df.iloc[0]
        final_lmn_df = final_lmn_df.drop(0)
        fact_source = final_lmn_df[['id_component', 'id_location', 'id_time',
                                    'value', 'data_source_id',
                                    'weather_station_id']]
        print(fact_source)
        fact_source.to_sql('fact_source', engine, if_exists='append',
                           index=False, method='multi')
table_c['t'] = str(datetime.datetime.fromtimestamp(table_c['t'])) price = table_c if table_h == 'stock/insider-transactions': for transaction in table_c['data']: insiders.append(transaction) if table_h == 'news-sentiment': sentiment = table_c if table_h == 'stock/recommendation': for recommend in table_c: recommendation.append(recommend) if table_h == 'stock/metric': metrics = table_c['metric'] table_price = etl.fromdicts([price]) table_price = etl.addfield(table_price, 'symbol', stock) table_recommendation = etl.movefield(etl.fromdicts(recommendation), 'period', 6) table_insiders = etl.movefield(etl.fromdicts(insiders), 'transactionDate', 6) table_insiders = etl.addfield(table_insiders, 'symbol', stock) list_of_metrics = [ '10DayAverageTradingVolume', '13WeekPriceReturnDaily', '26WeekPriceReturnDaily', '3MonthAverageTradingVolume', '52WeekHigh', '52WeekHighDate', '52WeekLow', '52WeekLowDate', '52WeekPriceReturnDaily', '5DayPriceReturnDaily' ] important_metrics = {} for metric in list_of_metrics: important_metrics[metric] = metrics[metric] important_metrics['_date'] = str(datetime.datetime.now())[:19] table_metrics = etl.fromdicts([important_metrics]) table_financial_metrics = etl.rename(