def transform_data():
    import os
    import pyspark
    from pyspark.sql import SQLContext

    dest = SqliteHook(sqlite_conn_id='sqlite2')
    dest_conn = dest.get_conn()
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))

    sc = pyspark.SparkContext()
    # Creating the SQLContext is what makes RDD.toDF() below available
    sqlContext = SQLContext(sc)
    lines = sc.textFile(os.path.join(BASE_DIR, "InputFile.txt"))
    # Strip the leftover bracket/quote characters, then split each line
    # into its comma-separated columns
    sparkDF = (lines
               .map(lambda x: str(x).translate({ord(c): None for c in '][""'}))
               .map(lambda w: w.split(','))
               .toDF())
    pdDF = sparkDF.toPandas()
    sqlUpdate = ('INSERT OR REPLACE INTO Currency'
                 '(USD, JPY, CAD, GBP, NZD, INR, Date_of_rate) '
                 'VALUES (?, ?, ?, ?, ?, ?, ?)')
    dest_conn.executemany(sqlUpdate, pdDF.values)
    dest_conn.commit()
    sc.stop()
def process_weather(station, field_to_process):
    db_hook = SqliteHook(sqlite_conn_id='sqlite_default')
    weather_select = (f'select record_date, {field_to_process} '
                      f'from weather where station_id={station} '
                      f'order by record_date;')
    data = db_hook.get_pandas_df(weather_select)
    average = data.rolling(3, center=True).mean().rename(
        columns={field_to_process: 'average'})
    data = data.merge(average, left_index=True, right_index=True)
    del average

    weather_update = """
        update weather set average = ?
        where station_id=? and record_date=?;
    """
    # DataFrame.apply is used here as a hack to avoid DataFrame.itertuples(),
    # iteritems() or iterrows(), which can become a bottleneck once the
    # number of rows grows beyond a few thousand
    data.apply(lambda row: db_hook.run(weather_update,
                                       parameters=(row['average'], station,
                                                   row['record_date'])),
               axis=1)
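# A sketch of the bulk alternative the comment above alludes to: the real
# cost is one db_hook.run() per row, so collect the parameter tuples and push
# them in a single executemany() instead. The function name and schema are
# assumptions carried over from process_weather, not part of the source.
def process_weather_bulk(station, field_to_process):
    db_hook = SqliteHook(sqlite_conn_id='sqlite_default')
    data = db_hook.get_pandas_df(
        f'select record_date, {field_to_process} '
        f'from weather where station_id={station} '
        f'order by record_date;')
    data['average'] = data[field_to_process].rolling(3, center=True).mean()
    # zip() builds the parameter tuples without row-wise DataFrame iteration
    rows = list(zip(data['average'], [station] * len(data),
                    data['record_date']))
    conn = db_hook.get_conn()
    conn.executemany(
        'update weather set average = ? '
        'where station_id = ? and record_date = ?;', rows)
    conn.commit()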
def save_data_into_db():
    sqlite_hook = SqliteHook(sqlite_conn_id='playgroundDB')
    with open('data.json') as f:
        data = json.load(f)
    insert = """
        INSERT INTO Covid19Report (
            confirmed, recovered, hospitalized, deaths,
            new_confirmed, new_recovered, new_hospitalized, new_deaths,
            update_date, source, dev_by, sever_by)
        VALUES (?,?,?,?,?,?,?,?,?,?,?,?);
    """
    sqlite_hook.run(insert,
                    parameters=(data['Confirmed'], data['Recovered'],
                                data['Hospitalized'], data['Deaths'],
                                data['NewConfirmed'], data['NewRecovered'],
                                data['NewHospitalized'], data['NewDeaths'],
                                datetime.strptime(data['UpdateDate'],
                                                  '%d/%m/%Y %H:%M'),
                                data['Source'], data['DevBy'],
                                data['SeverBy']))
def get_weather(url_params):
    db_hook = SqliteHook(sqlite_conn_id='sqlite_default')
    api_hook = HttpHook(http_conn_id='http_default', method='GET')
    url = add_or_replace_parameters('v1/history/daily', url_params)
    resp = api_hook.run(url)
    data = resp.json()['data']
    # usually I don't bother freeing this explicitly, but with a big
    # payload it may be worth releasing the response object early
    del resp
    weather_insert = """
        insert or ignore into weather
            (station_id, record_date, temperature, temperature_min,
             temperature_max, winddirection, windspeed, sunshine, pressure)
        values (?, ?, ?, ?, ?, ?, ?, ?, ?);
    """
    for day in data:
        db_hook.run(weather_insert,
                    parameters=(url_params['station'], day['date'],
                                day['temperature'], day['temperature_min'],
                                day['temperature_max'], day['winddirection'],
                                day['windspeed'], day['sunshine'],
                                day['pressure']))
def getdf():
    import os.path
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
    # conn id assumed to match the createdb() snippets below, which build
    # these tables on the 'sqlite2' connection
    h = SqliteHook(sqlite_conn_id='sqlite2')
    df = h.get_pandas_df(
        "SELECT A.DocumentNo, A.FullName, A.Device, A.Country, "
        "B.OrderId, B.DocumentNo, B.OrderDate, B.CatalogId, "
        "C.CatalogId, C.ProductId, C.CUID, "
        "D.ProductId, D.ProductName, D.CUID "
        "FROM CUSTOMERS as A "
        "inner join ORDERS as B on A.DocumentNo = B.DocumentNo "
        "inner join CATALOG as C on C.CatalogId = B.CatalogId "
        "inner join PRODUCTS AS D ON C.CUID = D.CUID")
    print(df)
    df.to_csv(os.path.join(BASE_DIR, "SourceData.txt"), index=False)
def createdb():
    try:
        dest = SqliteHook(sqlite_conn_id='sqlite2')
        dest_conn = dest.get_conn()
        dest_conn.execute(
            '''CREATE TABLE if not exists Currency(USD text, JPY text,
               CAD text, GBP text, NZD text, INR text, Date_of_rate date)''')
    except Exception as e:
        print("SQLite Connection Failed: {}".format(e))
def test_env_var_priority(self):
    c = SqliteHook.get_connection(conn_id='airflow_db')
    self.assertNotEqual('ec2.compute.com', c.host)

    with mock.patch.dict('os.environ', {
        'AIRFLOW_CONN_AIRFLOW_DB':
            'postgres://username:password@ec2.compute.com:5432/the_database',
    }):
        c = SqliteHook.get_connection(conn_id='airflow_db')
        self.assertEqual('ec2.compute.com', c.host)
        self.assertEqual('the_database', c.schema)
        self.assertEqual('username', c.login)
        self.assertEqual('password', c.password)
        self.assertEqual(5432, c.port)
def get_num_active_dagruns(dag_id, conn_id='sqlite_default'):
    # If you've opted for a different backend for airflow, you will need to
    # refactor the two lines below. For a Postgres example, please refer to
    # https://github.com/Nextdoor/airflow_examples/blob/master/dags/util.py#L8
    airflow_db = SqliteHook(sqlite_conn_id=conn_id)
    conn = airflow_db.get_conn()
    cursor = conn.cursor()
    # Bind dag_id as a parameter rather than interpolating it into the SQL
    sql = """
        select count(*)
        from dag_run
        where dag_id = ?
          and state in ('running', 'queued', 'up_for_retry')
    """
    cursor.execute(sql, (dag_id,))
    num_active_dagruns = cursor.fetchone()[0]
    return num_active_dagruns
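# A minimal sketch of how get_num_active_dagruns might gate a DAG so that
# only one run proceeds at a time. ShortCircuitOperator is standard Airflow
# 1.x API; the `dag` object and task names are assumptions for illustration.
from airflow.operators.python_operator import ShortCircuitOperator

def no_other_active_runs(**context):
    # The count includes the current run, so allow exactly one
    return get_num_active_dagruns(context['dag'].dag_id) <= 1

guard = ShortCircuitOperator(
    task_id='guard_single_active_run',
    python_callable=no_other_active_runs,
    provide_context=True,
    dag=dag,  # assumes a DAG object defined elsewhere
)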
def test_using_env_var(self):
    c = SqliteHook.get_connection(conn_id='test_uri')
    self.assertEqual('ec2.compute.com', c.host)
    self.assertEqual('the_database', c.schema)
    self.assertEqual('username', c.login)
    self.assertEqual('password', c.password)
    self.assertEqual(5432, c.port)
def test_using_unix_socket_env_var(self):
    c = SqliteHook.get_connection(conn_id='test_uri_no_creds')
    self.assertEqual('ec2.compute.com', c.host)
    self.assertEqual('the_database', c.schema)
    self.assertIsNone(c.login)
    self.assertIsNone(c.password)
    self.assertIsNone(c.port)
def load_csv_to_sqlite(csv_path, sqlite_table, **context):
    """
    Loads CSV data (i.e. research_papers and mct_talks) into the default
    Sqlite DB.
    """
    df = pd.read_csv(csv_path)
    with SqliteHook().get_conn() as conn:
        df.to_sql(sqlite_table, con=conn, if_exists='replace')
def test_get_connections_env_var(self):
    conns = SqliteHook.get_connections(conn_id='test_uri')
    assert len(conns) == 1
    assert conns[0].host == 'ec2.compute.com'
    assert conns[0].schema == 'the_database'
    assert conns[0].login == 'username'
    assert conns[0].password == 'password'
    assert conns[0].port == 5432
def create_covid19_report_table():
    sqlite_hook = SqliteHook(sqlite_conn_id='playgroundDB')
    create = """
        CREATE TABLE "Covid19Report" (
            "confirmed" INTEGER,
            "recovered" INTEGER,
            "hospitalized" INTEGER,
            "deaths" INTEGER,
            "new_confirmed" INTEGER,
            "new_recovered" INTEGER,
            "new_hospitalized" INTEGER,
            "new_deaths" INTEGER,
            "update_date" TEXT,
            "source" TEXT,
            "dev_by" TEXT,
            "sever_by" TEXT);
    """
    sqlite_hook.run(create)
def predict(classifier, **context):
    """
    Makes predictions with a model created by the given classifier and
    stores the results in the mct_talks table.
    """
    # Load model
    model = _load_model(classifier.__name__)

    # Load data
    db = SqliteHook()
    df = db.get_pandas_df('select * from mct_talks')

    # Make predictions
    df['Conference'] = model.predict(df['Title'].tolist())

    # Save predictions
    with db.get_conn() as conn:
        df.to_sql('mct_talks', con=conn, if_exists='replace')
def split_data(**context):
    """
    Splits the sample data (i.e. research_papers) into training and test
    sets and stores them in the Sqlite DB.
    """
    # Load full dataset
    db = SqliteHook()
    df = db.get_pandas_df('select * from research_papers')

    # Create train/test split
    train, _ = train_test_split(df.index,
                                test_size=0.33,
                                stratify=df['Conference'],
                                random_state=42)

    # Save training and test data in separate tables
    with db.get_conn() as conn:
        df.iloc[train].to_sql('training_data', con=conn, if_exists='replace')
        df.drop(train).to_sql('test_data', con=conn, if_exists='replace')
def createdb():
    dest = SqliteHook(sqlite_conn_id='sqlite2')
    dest_conn = dest.get_conn()
    dest_conn.execute(
        '''CREATE TABLE if not exists ORDERS(OrderId text, DocumentNo text,
           OrderDate text, CatalogId text)''')
    dest_conn.execute(
        '''CREATE TABLE if not exists CUSTOMERS(DocumentNo text,
           FullName text, Device text, Country text)''')
    dest_conn.execute(
        '''CREATE TABLE if not exists CATALOG(CatalogId text, ProductId text,
           CUID text)''')
    dest_conn.execute(
        '''CREATE TABLE if not exists PRODUCTS(ProductId text,
           ProductName text, CUID text)''')
    dest_conn.execute(
        '''CREATE TABLE if not exists LEADS(Id text, CustomerId text)''')
    dest_conn.execute(
        '''CREATE TABLE if not exists LOGS(ClientIp text, UserName text,
           Time text)''')
def etl_data(**kwargs):
    # Extract, Transform and Load the csv into the DB
    task_instance = kwargs['ti']
    filename = task_instance.xcom_pull(task_ids='get_file')
    conn_host = SqliteHook(sqlite_conn_id='sqlite_db',
                           supports_autocommit=True).get_conn()
    loader = ETLoader(filename, downloads_path_dir, conn_host,
                      'revenues', revenue_table)
    loader.orchestrate_etl()
    return filename
def get_hook(self):
    try:
        if self.conn_type == 'mysql':
            from airflow.hooks.mysql_hook import MySqlHook
            return MySqlHook(mysql_conn_id=self.conn_id)
        elif self.conn_type == 'google_cloud_platform':
            from airflow.contrib.hooks.bigquery_hook import BigQueryHook
            return BigQueryHook(bigquery_conn_id=self.conn_id)
        elif self.conn_type == 'postgres':
            from airflow.hooks.postgres_hook import PostgresHook
            return PostgresHook(postgres_conn_id=self.conn_id)
        elif self.conn_type == 'hive_cli':
            from airflow.hooks.hive_hooks import HiveCliHook
            return HiveCliHook(hive_cli_conn_id=self.conn_id)
        elif self.conn_type == 'presto':
            from airflow.hooks.presto_hook import PrestoHook
            return PrestoHook(presto_conn_id=self.conn_id)
        elif self.conn_type == 'hiveserver2':
            from airflow.hooks.hive_hooks import HiveServer2Hook
            return HiveServer2Hook(hiveserver2_conn_id=self.conn_id)
        elif self.conn_type == 'sqlite':
            from airflow.hooks.sqlite_hook import SqliteHook
            return SqliteHook(sqlite_conn_id=self.conn_id)
        elif self.conn_type == 'jdbc':
            from airflow.hooks.jdbc_hook import JdbcHook
            return JdbcHook(jdbc_conn_id=self.conn_id)
        elif self.conn_type == 'mssql':
            from airflow.hooks.mssql_hook import MsSqlHook
            return MsSqlHook(mssql_conn_id=self.conn_id)
        elif self.conn_type == 'oracle':
            from airflow.hooks.oracle_hook import OracleHook
            return OracleHook(oracle_conn_id=self.conn_id)
        elif self.conn_type == 'vertica':
            from airflow.contrib.hooks.vertica_hook import VerticaHook
            return VerticaHook(vertica_conn_id=self.conn_id)
        elif self.conn_type == 'cloudant':
            from airflow.contrib.hooks.cloudant_hook import CloudantHook
            return CloudantHook(cloudant_conn_id=self.conn_id)
        elif self.conn_type == 'jira':
            from airflow.contrib.hooks.jira_hook import JiraHook
            return JiraHook(jira_conn_id=self.conn_id)
        elif self.conn_type == 'redis':
            from airflow.contrib.hooks.redis_hook import RedisHook
            return RedisHook(redis_conn_id=self.conn_id)
        elif self.conn_type == 'wasb':
            from airflow.contrib.hooks.wasb_hook import WasbHook
            return WasbHook(wasb_conn_id=self.conn_id)
        elif self.conn_type == 'docker':
            from airflow.hooks.docker_hook import DockerHook
            return DockerHook(docker_conn_id=self.conn_id)
    except Exception:
        # Swallow import/instantiation errors and fall through to None,
        # but avoid a bare except so KeyboardInterrupt still propagates
        pass
def store_recording(**context):
    """Stores a random integer from 1 to 10 with timestamp into data table"""
    conn = SqliteHook.get_connection(
        context['params']['conn_id']).get_hook().get_conn()
    time = context['execution_date']
    values = (time, random.randint(1, 10))
    logging.info('Inserting: {0:%Y-%m-%d %H:%M:%S}, {1}'.format(*values))
    with conn:
        cur = conn.cursor()
        cur.execute("INSERT INTO data VALUES (?, ?)", values)
def fill_devices_table(**kwargs):
    ti = kwargs['ti']
    devices = ti.xcom_pull(task_ids='get_devices_dates')
    conn_host = SqliteHook(sqlite_conn_id='sqlite_devices').get_conn()
    sql_insert = f"""INSERT OR REPLACE INTO {DEVICES_TABLE}
        (device, last_reading, latest_postprocessing)
        VALUES (?, ?, ?);"""
    for device in devices:
        # Bind the values instead of interpolating them into the statement;
        # the original left the device id unquoted, which breaks for strings
        conn_host.execute(sql_insert,
                          (device,
                           devices[device]['last_reading'],
                           devices[device]['latest_postprocessing']))
    conn_host.commit()
    conn_host.close()
def train_model(classifier, **context):
    """
    Trains a model using the given classifier and stores it at MODEL_PATH.
    """
    # Load data
    df = SqliteHook().get_pandas_df('select * from training_data')

    # Create pipeline
    pipeline = Pipeline([('vectorizer', CountVectorizer(stop_words='english')),
                         ('classifier', classifier())])

    # Train model
    pipeline.fit(df['Title'].tolist(), df['Conference'].tolist())

    # Save model
    _save_model(pipeline, classifier.__name__)
def save_prediction(**kwargs):
    # Tasks can pass parameters to downstream tasks through the XCom space.
    # In this example the current task `save_prediction` takes the output of
    # the previous task `run_prediction`, "pulling" it from the XCom space.
    ti = kwargs['ti']
    prediction_dict = ti.xcom_pull(task_ids='run_prediction')

    # INSERT OR REPLACE guarantees a unique row for each date_to_predict:
    # it inserts a new prediction if one doesn't exist, or replaces the
    # existing one thanks to the index idx_date_to_predict on date_to_predict.
    sql_insert = f"""INSERT OR REPLACE INTO {PREDICTION_TABLE}
        (date_to_predict, run_date, yhat, yhat_upper, yhat_lower)
        VALUES ('{prediction_dict["date_to_predict"]}',
                '{prediction_dict["run_date"]}',
                {prediction_dict["yhat"]},
                {prediction_dict["yhat_upper"]},
                {prediction_dict["yhat_lower"]});"""

    # Hooks are interfaces to external platforms (e.g. Amazon S3)
    # and DBs (e.g. SQLite DB, PostgreSQL)
    conn_host = SqliteHook(sqlite_conn_id='sqlite_ml').get_conn()
    conn_host.execute(sql_insert)
    conn_host.commit()
    conn_host.close()
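# For context, a sketch of the wiring the comments above assume: a value
# returned by a PythonOperator callable is pushed to XCom automatically, and
# save_prediction pulls it by task id. `run_prediction` and `dag` are
# assumptions for illustration, not part of the source.
from airflow.operators.python_operator import PythonOperator

run_prediction_task = PythonOperator(
    task_id='run_prediction',
    python_callable=run_prediction,  # assumed to return the prediction dict
    dag=dag,
)
save_prediction_task = PythonOperator(
    task_id='save_prediction',
    python_callable=save_prediction,
    provide_context=True,  # so the callable receives `ti` in its kwargs
    dag=dag,
)
run_prediction_task >> save_prediction_task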
def calc_hourly_totals(**context):
    """Calculates sum of values from data table for the previous hour
    and puts it into the hourly table"""
    conn = SqliteHook.get_connection(
        context['params']['conn_id']).get_hook().get_conn()
    hour = context['execution_date']
    prev_hour = hour - timedelta(hours=1)
    with conn:
        cur = conn.cursor()
        cur.execute(
            "SELECT count(*) FROM data "
            "WHERE strftime('%Y-%m-%d %H', time) = strftime('%Y-%m-%d %H', ?)",
            (prev_hour,))
        data_count = cur.fetchone()[0]
        logging.info('Found {} values for {}'.format(data_count, hour))
        if data_count != 4:
            logging.warning('Need all 4 values to calculate hourly value.')
            raise AirflowException(
                "Need all 4 values to calculate hourly value.")
        cur.execute(
            "SELECT count(*) FROM hourly "
            "WHERE hour = strftime('%Y-%m-%d %H:00:00', ?)", (hour,))
        hour_count = cur.fetchone()[0]
        if hour_count:
            logging.warning(
                'There is an hourly value for {} already.'.format(hour))
            return
        cur.execute(
            "INSERT INTO hourly "
            "SELECT strftime('%Y-%m-%d %H:00:00', ?), sum(value) FROM data "
            "WHERE strftime('%Y-%m-%d %H', time) = strftime('%Y-%m-%d %H', ?)",
            (hour, prev_hour))
        logging.info('Inserted hourly value for {}.'.format(hour))
def test_model(classifier, **context):
    """
    Tests a model created by the given classifier and returns the classifier
    name and its weighted F1 score as an XCom.
    """
    # Simulate a task failure
    # raise RuntimeError('BOOM!')

    # Load model
    model = _load_model(classifier.__name__)

    # Load data
    df = SqliteHook().get_pandas_df('select * from test_data')

    # Make predictions
    X, y = df['Title'].tolist(), df['Conference'].tolist()
    y_pred = model.predict(X)

    # Log classification results
    print(classification_report(y, y_pred))

    # Push the score as an XCom
    return classifier.__name__, f1_score(y, y_pred, average='weighted')
def save_model_accuracy(**kwargs):
    # Tasks can pass parameters to downstream tasks through the XCom space.
    # XCom (cross-communication) allows communication between task instances.
    # In this example the current task `save_model_accuracy` takes the output
    # of the previous task `measure_accuracy`, "pulling" it from the XCom space.
    ti = kwargs['ti']
    accuracy = ti.xcom_pull(task_ids='measure_accuracy')

    sql_insert = f"""INSERT INTO {TRAINING_TABLE}
        (mape_test, rmse_test, days_in_test)
        VALUES ({accuracy['mape_test']},
                {accuracy['rmse_test']},
                {accuracy['days_in_test']});
    """

    # Hooks are interfaces to external platforms (e.g. Amazon S3)
    # and DBs (e.g. SQLite DB, PostgreSQL)
    conn_host = SqliteHook(sqlite_conn_id='sqlite_ml').get_conn()
    conn_host.execute(sql_insert)
    conn_host.commit()
def make_hook() -> Optional[BaseHook]:
    """ Makes a SqliteHook to test with. """
    return SqliteHook()
def create_tables(**context):
    hook = SqliteHook.get_connection(context['params']['conn_id']).get_hook()
    run_sql(hook,
            "CREATE TABLE IF NOT EXISTS data (time TIMESTAMP, value INT);")
    run_sql(hook,
            "CREATE TABLE IF NOT EXISTS hourly (hour TIMESTAMP, total INT);")
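# The run_sql helper used above is not shown in the source; a minimal sketch
# of what it presumably does, assuming it simply logs and delegates to the
# hook's run() method:
def run_sql(hook, sql):
    logging.info('Running: %s', sql)
    hook.run(sql)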
def execute(self, context):
    _log.info('Executing: ' + self.sql)
    hook = SqliteHook(sqlite_conn_id=self.sqlite_conn_id)
    hook.run(self.sql, parameters=self.parameters)
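# This execute() belongs to an operator along the lines of SqliteOperator;
# a minimal usage sketch, assuming a `dag` object defined elsewhere:
from airflow.operators.sqlite_operator import SqliteOperator

create_data_table = SqliteOperator(
    task_id='create_data_table',
    sqlite_conn_id='sqlite_default',
    sql="CREATE TABLE IF NOT EXISTS data (time TIMESTAMP, value INT);",
    dag=dag,
)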
def get_hook(self):
    if self.conn_type == 'mysql':
        from airflow.hooks.mysql_hook import MySqlHook
        return MySqlHook(mysql_conn_id=self.conn_id)
    elif self.conn_type == 'google_cloud_platform':
        from airflow.gcp.hooks.bigquery import BigQueryHook
        return BigQueryHook(bigquery_conn_id=self.conn_id)
    elif self.conn_type == 'postgres':
        from airflow.hooks.postgres_hook import PostgresHook
        return PostgresHook(postgres_conn_id=self.conn_id)
    elif self.conn_type == 'pig_cli':
        from airflow.hooks.pig_hook import PigCliHook
        return PigCliHook(pig_cli_conn_id=self.conn_id)
    elif self.conn_type == 'hive_cli':
        from airflow.hooks.hive_hooks import HiveCliHook
        return HiveCliHook(hive_cli_conn_id=self.conn_id)
    elif self.conn_type == 'presto':
        from airflow.hooks.presto_hook import PrestoHook
        return PrestoHook(presto_conn_id=self.conn_id)
    elif self.conn_type == 'hiveserver2':
        from airflow.hooks.hive_hooks import HiveServer2Hook
        return HiveServer2Hook(hiveserver2_conn_id=self.conn_id)
    elif self.conn_type == 'sqlite':
        from airflow.hooks.sqlite_hook import SqliteHook
        return SqliteHook(sqlite_conn_id=self.conn_id)
    elif self.conn_type == 'jdbc':
        from airflow.hooks.jdbc_hook import JdbcHook
        return JdbcHook(jdbc_conn_id=self.conn_id)
    elif self.conn_type == 'mssql':
        from airflow.hooks.mssql_hook import MsSqlHook
        return MsSqlHook(mssql_conn_id=self.conn_id)
    elif self.conn_type == 'oracle':
        from airflow.hooks.oracle_hook import OracleHook
        return OracleHook(oracle_conn_id=self.conn_id)
    elif self.conn_type == 'vertica':
        from airflow.contrib.hooks.vertica_hook import VerticaHook
        return VerticaHook(vertica_conn_id=self.conn_id)
    elif self.conn_type == 'cloudant':
        from airflow.contrib.hooks.cloudant_hook import CloudantHook
        return CloudantHook(cloudant_conn_id=self.conn_id)
    elif self.conn_type == 'jira':
        from airflow.contrib.hooks.jira_hook import JiraHook
        return JiraHook(jira_conn_id=self.conn_id)
    elif self.conn_type == 'redis':
        from airflow.contrib.hooks.redis_hook import RedisHook
        return RedisHook(redis_conn_id=self.conn_id)
    elif self.conn_type == 'wasb':
        from airflow.contrib.hooks.wasb_hook import WasbHook
        return WasbHook(wasb_conn_id=self.conn_id)
    elif self.conn_type == 'docker':
        from airflow.hooks.docker_hook import DockerHook
        return DockerHook(docker_conn_id=self.conn_id)
    elif self.conn_type == 'azure_data_lake':
        from airflow.contrib.hooks.azure_data_lake_hook import AzureDataLakeHook
        return AzureDataLakeHook(azure_data_lake_conn_id=self.conn_id)
    elif self.conn_type == 'azure_cosmos':
        from airflow.contrib.hooks.azure_cosmos_hook import AzureCosmosDBHook
        return AzureCosmosDBHook(azure_cosmos_conn_id=self.conn_id)
    elif self.conn_type == 'cassandra':
        from airflow.contrib.hooks.cassandra_hook import CassandraHook
        return CassandraHook(cassandra_conn_id=self.conn_id)
    elif self.conn_type == 'mongo':
        from airflow.contrib.hooks.mongo_hook import MongoHook
        return MongoHook(conn_id=self.conn_id)
    elif self.conn_type == 'gcpcloudsql':
        from airflow.gcp.hooks.cloud_sql import CloudSqlDatabaseHook
        return CloudSqlDatabaseHook(gcp_cloudsql_conn_id=self.conn_id)
    elif self.conn_type == 'grpc':
        from airflow.contrib.hooks.grpc_hook import GrpcHook
        return GrpcHook(grpc_conn_id=self.conn_id)
    raise AirflowException("Unknown hook type {}".format(self.conn_type))
def execute(self, context):
    self.log.info('Executing: %s', self.sql)
    hook = SqliteHook(sqlite_conn_id=self.sqlite_conn_id)
    return hook.get_records(self.sql, parameters=self.parameters)
def execute(self, context):
    logging.info('Executing: ' + self.sql)
    hook = SqliteHook(sqlite_conn_id=self.sqlite_conn_id)
    hook.run(self.sql, parameters=self.parameters)