Example #1
File: etl.py Project: kcym-3c/parsons
    def move_column(self, column, index):
        """
        Move a column to a new position.

        `Args:`
            column: str
                The column name to move.
            index: int
                The new index for the column.
        `Returns:`
            `Parsons Table` and also updates the existing object.
        """

        self.table = petl.movefield(self.table, column, index)

        return self
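A minimal usage sketch of move_column (the constructor data and column names here are illustrative, not from the source):

from parsons import Table

tbl = Table([{'a': 1, 'b': 2, 'c': 3}])
tbl.move_column('c', 0)   # 'c' becomes the first column; the object is updated in place
print(tbl.columns)        # ['c', 'a', 'b']
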
Example #2

import pyodbc
import petl as etl
import pymysql as mysql

########## JSON extraction and mapping
tableJ = etl.fromjson('cust_data.json', header=['id', 'gender', 'first_name', 'last_name', 'email', 'ville'])
tableJ = etl.movefield(tableJ, 'gender', 4)

########## CSV extraction and conversion
tableCSV = etl.fromcsv('week_cust.csv')
tableCSV = etl.convert(tableCSV, 'id', int)

########## SQL Server connection and extraction
connectionSqlServer = pyodbc.connect(
    "Driver={SQL Server Native Client 11.0};"
    "Server=81_64_msdn;"
    "Database=BD4client;"
    "Trusted_Connection=yes;"
)
cursor = connectionSqlServer.cursor()
cursor.execute('SELECT id, first_name, last_name, email, gender, ville FROM client_DATA')
tableSqlServer = cursor.fetchall()
# prepend a header row so the result can be used as a petl table
tableSqlServer = [('id', 'first_name', 'last_name', 'email', 'gender', 'ville')] + tableSqlServer
cursor.close()
connectionSqlServer.close()

######### Staging area: transformation and concatenation
StagingArea = etl.cat(tableCSV, tableJ, tableSqlServer)
StagingArea = etl.convert(StagingArea, 'gender', {'Male': 'M', 'Female': 'F', 'male': 'M', 'female': 'F', None: 'N'})
StagingArea = etl.rename(StagingArea, 'ville', 'city')

######## MySQL loading
connection = mysql.connect(host="localhost", user="******", passwd="", db="customerdatabase")
curseur = connection.cursor()
curseur.execute('SET SQL_MODE=ANSI_QUOTES')  # petl.todb requires ANSI quoting on MySQL
#### load data, assuming the table "CustomerData" already exists in the database
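The load step itself is missing from the listing; a minimal sketch of it, assuming the staged columns line up with the existing CustomerData table:

etl.todb(StagingArea, curseur, 'CustomerData', commit=True)
connection.close()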
Example #3
import pandas as pd
import petl as etl
import psycopg2
from sqlalchemy import create_engine


def dimension_values():
    connection = psycopg2.connect(dbname='voyager',
                                  user='******',
                                  password='******',
                                  host='172.16.0.45')
    engine = create_engine('postgresql://*****:*****@172.16.0.45:5432/voyager')

    com = 'select id as id_component, name as component from dim_com'
    table_com = etl.fromdb(connection, com)

    loc = 'select id as id_location, name as name from dim_loc'
    table_loc = etl.fromdb(connection, loc)

    tim = 'select id as id_time, time as timestamp from dim_time'
    table_time = etl.fromdb(connection, tim)

    print(table_com)
    print(table_loc)
    print(table_time)

    for ran in range(0, 65424, 1000):
        sql = "select * from KNMI_station_data kk " \
              "RIGHT JOIN weatherstations w ON " \
              " CAST (kk.weather_station_id AS INTEGER)  = CAST (w.station_number AS INTEGER) " \
              "WHERE w.station_number NOT LIKE \'NL%%\' AND date > 20190901 LIMIT 1000 OFFSET %s" % ran
        print(sql)
        table = etl.fromdb(connection, sql)

        print('knmi')
        print(table)
        table = table.log_progress()  # keep the wrapper; otherwise the progress report is discarded
        table = etl.convert(table, 'date', str)
        table = etl.convert(table, 'hour', str)

        # readings arrive in tenths of a unit: cast to int, then divide by 10
        for field in ('temperature', 'temperature_dew', 'temperature_min',
                      'wind_speed_avg', 'wind_speed', 'wind_speed_max'):
            table = etl.convert(table, field, int)
            table = etl.convert(table, field, lambda v: v / 10)

        # promote petl's header row to the DataFrame's column names
        df = pd.DataFrame(table)
        df.columns = df.iloc[0]
        df = df.drop(0)
        df['timestamp'] = df['date'] + df['hour']

        df['weather_station_id'] = df['weather_station_id'].astype(str)
        # custom_to_datetime is a project helper defined elsewhere
        df['timestamp'] = df['timestamp'].apply(custom_to_datetime)
        df['timestamp'] = df['timestamp'].astype(str)

        df = df.drop(columns=['date', 'hour'])

        final_knmi_table = etl.fromdataframe(df)

        final_knmi_table = etl.melt(final_knmi_table,
                                    key=[
                                        'weather_station_id', 'timestamp',
                                        'id', 'latitude', 'longitude', 'name',
                                        'station_number', 'data_source_id',
                                        'altitude'
                                    ])
        final_knmi_table = etl.rename(final_knmi_table, 'variable',
                                      'component')
        print(final_knmi_table)

        final_knmi_table2 = etl.join(final_knmi_table,
                                     table_com,
                                     key='component')
        final_knmi_table2 = etl.join(final_knmi_table2, table_loc, key='name')
        final_knmi_table2 = etl.join(final_knmi_table2,
                                     table_time,
                                     key='timestamp')
        print('dos')

        print(final_knmi_table2)
        df = pd.DataFrame(final_knmi_table2)
        df.columns = df.iloc[0]
        df = df.drop(0)
        fact_source = df[[
            'id_component', 'id_location', 'id_time', 'value',
            'data_source_id', 'weather_station_id'
        ]]

        print(fact_source)
        fact_source.to_sql('fact_source',
                           engine,
                           if_exists='append',
                           index=False,
                           method='multi')

    for rn in range(0, 1148, 1000):
        print('lmn')
        final_lmn_table = etl.fromdb(
            connection,
            "select ld.id, ld.station_number, ld.value, ld.timestamp, ls.name as component, "
            "ws.id as lid, ws.latitude, ws.longitude, ws.data_source_id, ws.altitude, ws.name as name"
            " from luchtmeetnet_data ld "
            "right join luchtmeetnet_sensors ls on ld.formula = ls.formula "
            " join weatherstations ws on ld.station_number = ws.station_number "
            "where ws.station_number like \'NL%%\' AND timestamp > '2019-09-01' "
            "LIMIT 1000 OFFSET %s" % rn)
        final_lmn_table = etl.rename(final_lmn_table,
                                     {'station_number': 'weather_station_id'})
        final_lmn_table = etl.movefield(final_lmn_table, 'timestamp', 1)
        # print(final_lmn_table)

        # print(table_com)
        final_lmn_table2 = etl.join(final_lmn_table,
                                    table_com,
                                    key='component')
        # print(final_lmn_table2)

        final_lmn_table2 = etl.join(final_lmn_table2, table_loc, key='name')
        # print(final_lmn_table2)
        df = pd.DataFrame(final_lmn_table2)
        df.columns = df.iloc[0]
        df = df.drop(0)
        df['timestamp'] = df['timestamp'].str[:-6]
        # print(df)

        final_lmn_table2 = etl.fromdataframe(df)

        final_lmn_table2 = etl.join(final_lmn_table2,
                                    table_time,
                                    key='timestamp')
        # print(final_lmn_table2)

        print(final_lmn_table2)
        final_lmn_df = pd.DataFrame(final_lmn_table2)
        final_lmn_df.columns = final_lmn_df.iloc[0]
        final_lmn_df = final_lmn_df.drop(0)
        fact_source = final_lmn_df[[
            'id_component', 'id_location', 'id_time', 'value',
            'data_source_id', 'weather_station_id'
        ]]
        print(fact_source)

        fact_source.to_sql('fact_source',
                           engine,
                           if_exists='append',
                           index=False,
                           method='multi')
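The DataFrame round-trip above (build a DataFrame, promote row 0 to the header, drop it) appears three times; petl ships pandas converters that do the same in one call each way. A minimal sketch, assuming a petl table `t` and pandas installed:

df = etl.todataframe(t)      # the header row becomes df.columns automatically
t2 = etl.fromdataframe(df)   # and back: column names become the header row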
Example #4
        # fragment from inside a response-handling loop: table_h is the endpoint name, table_c the parsed response
        table_c['t'] = str(datetime.datetime.fromtimestamp(table_c['t']))
        price = table_c
    if table_h == 'stock/insider-transactions':
        insiders.extend(table_c['data'])
    if table_h == 'news-sentiment':
        sentiment = table_c
    if table_h == 'stock/recommendation':
        recommendation.extend(table_c)
    if table_h == 'stock/metric':
        metrics = table_c['metric']

table_price = etl.fromdicts([price])
table_price = etl.addfield(table_price, 'symbol', stock)
table_recommendation = etl.movefield(etl.fromdicts(recommendation), 'period', 6)
table_insiders = etl.movefield(etl.fromdicts(insiders), 'transactionDate', 6)
table_insiders = etl.addfield(table_insiders, 'symbol', stock)

list_of_metrics = [
    '10DayAverageTradingVolume', '13WeekPriceReturnDaily',
    '26WeekPriceReturnDaily', '3MonthAverageTradingVolume', '52WeekHigh',
    '52WeekHighDate', '52WeekLow', '52WeekLowDate', '52WeekPriceReturnDaily',
    '5DayPriceReturnDaily'
]
important_metrics = {metric: metrics[metric] for metric in list_of_metrics}
important_metrics['_date'] = str(datetime.datetime.now())[:19]  # trim to 'YYYY-MM-DD HH:MM:SS'
table_metrics = etl.fromdicts([important_metrics])
table_financial_metrics = etl.rename(