Example #1
    def test_41_load_incremental_should_return_0_if_no_new_records(self):
        test_covid_nyt_data_latest = os.path.join(sys.path[0], 'testdata/nyt_data_latest.csv')

        df_nyt_data, df_jh_data = extract_covid_data(test_covid_nyt_data_latest, self._test_covid_jh_data)
        df_transformed_latest = transform(df_nyt_data, df_jh_data)

        df_size = load_incremental(df_transformed_latest, self.conn)
        self.assertEqual(df_size, 0)
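Note: load_incremental itself is not shown in these listings. A minimal sketch consistent with the test above (it returns the number of newly inserted rows, hence 0 when nothing is new) might look like the following; the table and column names are taken from Example #4 below, everything else is an assumption:

def load_incremental(df, conn):
    # Hypothetical sketch: insert only the rows newer than the latest
    # rep_date already stored in covid.daily_stats and report how many.
    cursor = conn.cursor()
    cursor.execute("SELECT max(rep_date) FROM covid.daily_stats")
    latest = cursor.fetchone()[0]
    new_rows = df if latest is None else df[df['date'] > latest]
    for row in new_rows.itertuples(index=False):
        cursor.execute(
            "INSERT INTO covid.daily_stats (rep_date, cases, deaths, recovered)"
            " VALUES (%s, %s, %s, %s)",
            (row.date, row.cases, row.deaths, row.recovered))
    conn.commit()
    cursor.close()
    return len(new_rows)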
Example #2
    @classmethod
    def setUpClass(cls):
        cls._test_covid_nyt_data = os.path.join(sys.path[0], 'testdata/nyt_data.csv')
        cls._test_covid_jh_data = os.path.join(sys.path[0], 'testdata/jh_data.csv')

        df_nyt_data, df_jh_data = extract_covid_data(cls._test_covid_nyt_data, cls._test_covid_jh_data)
        cls._df_transformed = transform(df_nyt_data, df_jh_data)

        cls.postgresql = testing.postgresql.Postgresql()
        cls.conn = psycopg2.connect(**cls.postgresql.dsn())
        cursor = cls.conn.cursor()
        cursor.execute("CREATE SCHEMA covid")
        cursor.close()
        cls.conn.commit()
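testing.postgresql spins up a throwaway PostgreSQL instance per test class; the matching teardown is not part of this listing. A minimal sketch, assuming the attributes created in setUpClass above:

    @classmethod
    def tearDownClass(cls):
        # Close the connection and stop (destroy) the temporary
        # PostgreSQL instance created in setUpClass.
        cls.conn.close()
        cls.postgresql.stop()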
Example #3
def load_data(event, context):
    try:
        data = yaml.load(open(CONFIG_FILE), Loader=yaml.BaseLoader)['data']
        url_covid_nyt_data = data['url_covid_nyt_data']
        url_covid_jh_data = data['url_covid_jh_data']
        dwh_conn = db_utils.get_dwh_conn('dwh')

        df_nyt_data, df_jh_data = extract_covid_data(url_covid_nyt_data,
                                                     url_covid_jh_data)
        df_transformed = transform(df_nyt_data, df_jh_data)
        load_to_dwh(df_transformed, dwh_conn)
    except (Exception, psycopg2.Error) as error:
        notify_etl_status(False, str(error))
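db_utils.get_dwh_conn and notify_etl_status are project helpers not shown in these listings. A plausible sketch of the connection helper, assuming (purely as an illustration) that the same YAML config file carries a 'dwh' section with standard psycopg2 keys:

import psycopg2
import yaml

def get_dwh_conn(section):
    # Hypothetical helper: read the connection parameters for the given
    # config section and open a psycopg2 connection with them.
    params = yaml.load(open(CONFIG_FILE), Loader=yaml.BaseLoader)[section]
    return psycopg2.connect(host=params['host'], port=params['port'],
                            dbname=params['dbname'], user=params['user'],
                            password=params['password'])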
Example #4
    def test_40_load_incremental(self):
        test_covid_nyt_data_latest = os.path.join(sys.path[0], 'testdata/nyt_data_latest.csv')

        df_nyt_data, df_jh_data = extract_covid_data(test_covid_nyt_data_latest, self._test_covid_jh_data)
        df_transformed_latest = transform(df_nyt_data, df_jh_data)

        load_to_dwh(df_transformed_latest, self.conn)
        df_result = db_utils.get_dwh_result_as_df(self.conn,
                                                  "select * from covid.daily_stats ds  order by ds.rep_date desc", "")

        exp_shape = (257, 4)
        exp_recent_record = list([datetime.date(2020, 10, 4), 7444705, 209603, 2911699])
        exp_columns = list(['rep_date', 'cases', 'deaths', 'recovered'])
        self.assertTupleEqual(df_result.shape, exp_shape)
        self.assertListEqual(list(df_result.columns), exp_columns)
        self.assertListEqual(list(df_result.iloc[0]), exp_recent_record)
Example #5
import logging

from config import configure_logging
from etl.extract import import_data
from etl.transform import transform
from etl.load import load

if __name__ == "__main__":

    configure_logging()
    logging.info("Launching ETL")

    # Extract and import data from the remote repo
    dataset = import_data(remote=True)
    # Transform data
    dataset = transform(dataset)
    # Save data to disk
    result = load(dataset)

    logging.info(f"ETL finished: {result}")
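The load step is not shown either; given that the script logs whatever load returns, a minimal sketch of the save-to-disk step, assuming dataset is a pandas DataFrame and with a purely hypothetical output path:

def load(dataset, path="output/dataset.csv"):
    # Hypothetical sketch: persist the transformed dataset to disk and
    # return the location so the caller can log it.
    dataset.to_csv(path, index=False)
    return path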
Example #6
import findspark

findspark.init()
from pyspark.sql import SparkSession
from etl.ingest import ingest_logs
from etl.transform import transform
from analytics.log_analytics import analysis

if __name__ == '__main__':
    spark = SparkSession.builder.appName('Whitehouse Logs').config(
        'spark.master', 'local').getOrCreate()

    print(spark.version)

    ingest_logs(spark)

    transform(spark)

    analysis(spark)
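All three stages share the one SparkSession instead of each creating its own. ingest_logs is not shown; a minimal sketch, assuming the raw visitor logs sit in a local CSV (the path and view name are hypothetical), could register them as a temp view for the later stages:

def ingest_logs(spark, path='data/whitehouse_logs.csv'):
    # Hypothetical sketch: read the raw log file and expose it to the
    # transform and analytics stages through a shared temp view.
    df = spark.read.csv(path, header=True, inferSchema=True)
    df.createOrReplaceTempView('raw_logs')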
Example #7
    # Let's get to work...
    downloaded_count = None
    data = None
    output = None
    
    if run_cmds.find('e') > -1:
        # Extract files from the FTP location and download them
        downloaded_count = extract.extract(source=source,
                                           extract_storage=extract_storage)
        print 'Downloaded %s files' % downloaded_count

    if run_cmds.find('t') > -1:
        # Transform the downloaded files
        data = transform.transform(extract_storage=extract_storage,
                                   db=db,
                                   csv_schema=csv_schema)
        print 'Processed and stored %s files in %s' % (len(data), db)

    if run_cmds.find('l') > -1:
        # Load the processed content to Google Fusion Tables
        # (the credential prompts and the load call were masked with
        # '******' in the original listing; reconstructed minimally here)
        username = raw_input('Google user: ')
        password = getpass.getpass('Google password: ')
        print 'Done'
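run_cmds is just a string of stage letters, so one script can run any subset of the pipeline. A small driver sketch in the same Python 2 style (the command-line handling is an assumption, not part of the original):

import sys

# Hypothetical driver: each letter in run_cmds enables one stage,
# e.g. 'etl' runs extract, transform and load; 'et' skips the load.
run_cmds = sys.argv[1] if len(sys.argv) > 1 else 'etl'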
Example #8
def go():
    return transform(export())
Example #9
    def test_transform(self):
        df_transformed = transform(self._df_nyt_data, self._df_jh_data)
        exp_shape = (253, 4)
        self.assertTupleEqual(df_transformed.shape, exp_shape)
        exp_columns = list(['date', 'cases', 'deaths', 'recovered'])
        self.assertListEqual(list(df_transformed.columns), exp_columns)
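transform itself never appears in these listings. A sketch consistent with the tests (the NYT feed supplies cases and deaths, the Johns Hopkins feed supplies recovered, joined by date into exactly the four asserted columns) could look like this; the input column names are assumptions:

def transform(df_nyt_data, df_jh_data):
    # Hypothetical sketch: aggregate each feed per date, then join so the
    # result carries the columns the tests assert on:
    # ['date', 'cases', 'deaths', 'recovered'].
    nyt = df_nyt_data.groupby('date', as_index=False)[['cases', 'deaths']].sum()
    jh = df_jh_data.groupby('date', as_index=False)[['recovered']].sum()
    return nyt.merge(jh, on='date')[['date', 'cases', 'deaths', 'recovered']]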