import json

from airflow.exceptions import AirflowException
from airflow.providers.http.hooks.http import HttpHook
from airflow.providers.mongo.hooks.mongo import MongoHook


def extract(
    batch_id, method="GET", http_conn_id="default_api", mongo_conn_id="default_mongo"
):
    http = HttpHook(method, http_conn_id=http_conn_id)
    mongo_conn = MongoHook(mongo_conn_id)
    ids_to_update_coll = mongo_conn.get_collection("ids_to_update", "courts")
    results_to_transform_coll = mongo_conn.get_collection(
        "results_to_transform", "courts"
    )

    # Note/TODO: because we add endpoints back that we couldn't handle, we may
    # get stuck in an infinite loop. Another solution is exiting whenever an
    # exception occurs, but this isn't ideal either
    while ids_to_update_coll.find_one({"batch_id": str(batch_id)}) is not None:
        # find a job to work on
        result = ids_to_update_coll.find_one_and_delete({"batch_id": str(batch_id)})
        api_id = result["api_id"]
        try:
            # transform to get a valid link
            # TODO: this needs to be generalized to any website
            endpoint = f"opinions/{api_id}"
            # pull data in
            response = http.run(endpoint)
            result_data = response.json()
            if response.status_code == 200:
                # store our result into mongo
                results_to_transform_coll.insert_one(
                    {"batch_id": str(batch_id), "data": result_data}
                )
            else:
                # TODO: throw a more specific exception
                raise AirflowException(
                    f"Received {response.status_code} code from {endpoint}."
                )
        except json.JSONDecodeError as j_error:
            print(f"Failed to decode response with {j_error}:\n{response.text}")
            mongo_conn.insert_one(
                "ids_to_update",
                {"api_id": str(api_id), "batch_id": str(batch_id)},
                mongo_db="courts",
            )
        except Exception as error:
            # something went wrong. Log it and return this endpoint to mongoDB
            # so we can try again
            print(f"An exception occurred while processing batch {batch_id}:\n{error}")
            mongo_conn.insert_one(
                "ids_to_update",
                {"api_id": str(api_id), "batch_id": str(batch_id)},
                mongo_db="courts",
            )
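# A minimal sketch of how extract() might be wired into a DAG. The dag_id,
# schedule, and the choice of run_id as the batch key are illustrative
# assumptions, not taken from the original code.
from datetime import datetime

from airflow import DAG
from airflow.operators.python import PythonOperator

with DAG(
    dag_id="courts_etl",
    start_date=datetime(2021, 1, 1),
    schedule_interval=None,
    catchup=False,
) as dag:
    extract_task = PythonOperator(
        task_id="extract",
        python_callable=extract,
        # assumed: any unique, stringifiable batch key works here
        op_kwargs={"batch_id": "{{ run_id }}"},
    )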
def execute(self, context):
    mongoHook = MongoHook(conn_id=self.mongo_conn_id)
    self.mongo_db = mongoHook.connection.schema
    log.info('postgres_conn_id: %s', self.postgres_conn_id)
    log.info('mongo_conn_id: %s', self.mongo_conn_id)
    log.info('postgres_sql: %s', self.postgres_sql)
    # log.info('prev_exec_date: %s', self.prev_exec_date)
    log.info('mongo_db: %s', self.mongo_db)
    log.info('mongo_collection: %s', self.mongo_collection)

    well_data = self.get_data()
    most_recent_date = Variable.get("most_recent_date")
    print(most_recent_date)

    filter_query = None
    for index, well in well_data.iterrows():
        if well is not None and well['is_newly_added']:
            print('newly added')
            filter_query = {"Name": {"$eq": well['well_name']}}
        else:
            print('old well')
            filter_query = {
                "$and": [
                    {"Name": {"$eq": well['well_name']}},
                    {"Date": {"$gt": most_recent_date}},
                ]
            }
        # filter_query = { "Date" : { "$gt" : most_recent_date } }
        log.info('mongo filter query: %s', filter_query)

        mongo_well_list = self.transform(
            mongoHook.get_collection(self.mongo_collection).find(filter_query))
        print(len(mongo_well_list))
        if len(mongo_well_list) > 0:
            for doc in mongo_well_list:
                doc["water_cut_calc"] = utils.calc_watercut(
                    doc['OIL_bopd'], doc['WATER_bwpd'])
                doc["gor_calc"] = utils.calc_gor(
                    doc['OIL_bopd'], doc['GAS_mscfd'])
            self.update_records(mongoHook, filter_query, mongo_well_list)
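# The utils helpers above are not shown. A plausible shape for them, assuming
# the standard definitions (water cut = water / total liquids; GOR = gas/oil);
# these are sketches of what utils.calc_watercut / utils.calc_gor might look
# like, not the original implementations.
def calc_watercut(oil_bopd: float, water_bwpd: float) -> float:
    """Water cut as a fraction of total produced liquids."""
    total_liquids = oil_bopd + water_bwpd
    return water_bwpd / total_liquids if total_liquids else 0.0


def calc_gor(oil_bopd: float, gas_mscfd: float) -> float:
    """Gas-oil ratio in scf/bbl (gas arrives in Mscf/d, hence the * 1000)."""
    return (gas_mscfd * 1000) / oil_bopd if oil_bopd else 0.0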
def test_transform_load_operator(
    self, mocker, postgresql, ports_collection, test_dag
):
    """Test if transform_load_operator upserts data into master db."""
    # Create mocks
    mocker.patch.object(
        PostgresHook, "get_conn", return_value=postgresql
    )
    mocker.patch.object(
        MongoHook, "get_collection", return_value=ports_collection
    )

    # Check if the source collection has an item in it
    mongo_hook = MongoHook()
    collection = mongo_hook.get_collection()
    assert collection.count_documents({}) > 0

    # Check if the sink table is initially empty
    cursor = postgresql.cursor()
    cursor.execute("SELECT COUNT(*) FROM ports;")
    initial_result = cursor.fetchone()[0]
    assert initial_result == 0

    # Setup task
    mongo_staging_config = MongoConfig('mongo_default', 'ports')
    postgres_master_config = PostgresConfig('postgres_default')
    task = TransformAndLoadOperator(
        mongo_config=mongo_staging_config,
        postgres_config=postgres_master_config,
        task_id='test',
        processor=PortsItemProcessor(),
        query=SqlQueries.ports_table_insert,
        query_params={"updated_at": datetime.datetime.utcnow()},
        dag=test_dag
    )

    # Execute task and check if it inserted the data successfully
    task.execute(context={}, testing=True)
    cursor.execute("SELECT COUNT(*) FROM ports;")
    after_result = cursor.fetchone()[0]
    assert after_result > 0
def test_transform_load_operator_database_error(
    self, mocker, postgresql, ports_collection, test_dag
):
    """Test if transform_load_operator handles DB errors."""
    # Create mocks
    mocker.patch.object(
        PostgresHook, "get_conn", return_value=postgresql
    )
    mocker.patch.object(
        MongoHook, "get_collection", return_value=ports_collection
    )

    # Check if the source collection has an item in it
    mongo_hook = MongoHook()
    collection = mongo_hook.get_collection()
    assert collection.count_documents({}) > 0

    # Setup task, intentionally pointing it at an unknown table
    mongo_staging_config = MongoConfig('mongo_default', 'ports')
    postgres_master_config = PostgresConfig('postgres_default')
    task = TransformAndLoadOperator(
        mongo_config=mongo_staging_config,
        postgres_config=postgres_master_config,
        task_id='test',
        processor=PortsItemProcessor(),
        query=SqlQueries.ports_table_insert.replace(
            'ports', 'ports_wrong'
        ),
        query_params={"updated_at": datetime.datetime.utcnow()},
        dag=test_dag
    )

    # Execute the task and check that it raises an UndefinedTable error
    with raises((UndefinedTable, Exception, OperationalError)):
        # Set testing to False to implicitly close the database connection
        task.execute(context={}, testing=False)
        task.execute(context={}, testing=True)
def execute(self, context, testing=False):
    """
    Read all data from mongo db, process it and write to postgresql db.
    Uses UPSERT SQL query to write data.
    """
    self.log.info('LoadToMasterdbOperator Starting...')
    self.log.info("Initializing Mongo Staging DB Connection...")
    mongo_hook = MongoHook(conn_id=self._mongo_conn_id)
    ports_collection = mongo_hook.get_collection(self._mongo_collection)

    self.log.info("Initializing Postgres Master DB Connection...")
    psql_hook = PostgresHook(postgres_conn_id=self._postgres_conn_id)
    psql_conn = psql_hook.get_conn()
    psql_cursor = psql_conn.cursor()

    self.log.info("Loading Staging data to Master Database...")
    idx = -1  # so the record count below reads 0 when the collection is empty
    try:
        for idx, document in enumerate(ports_collection.find({})):
            document = self._processor.process_item(document)
            # keep the Mongo ObjectId around as a plain string
            document['staging_id'] = str(document.pop('_id'))
            psql_cursor.execute(self._sql_query, document)
            psql_conn.commit()
    except (OperationalError, UndefinedTable, OperationFailure):
        self.log.error("Writing to database FAILED.")
        self.log.error(traceback.format_exc())
        raise Exception("LoadToMasterdbOperator FAILED.")
    except Exception:
        self.log.error(traceback.format_exc())
        raise Exception("LoadToMasterdbOperator FAILED.")
    finally:
        if not testing:
            self.log.info('Closing database connections...')
            psql_conn.close()
            mongo_hook.close_conn()

    self.log.info(f'UPSERTED {idx+1} records into Postgres Database.')
    self.log.info('LoadToMasterdbOperator SUCCESS!')
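# The docstring above mentions an UPSERT query. A plausible shape for
# SqlQueries.ports_table_insert, assuming the columns seen in the tests and a
# unique constraint on staging_id; this is a guess at the query, not the
# project's actual SQL.
ports_table_insert = """
    INSERT INTO ports (countryName, portName, unlocode, coordinates,
                       staging_id, updated_at)
    VALUES (%(countryName)s, %(portName)s, %(unlocode)s, %(coordinates)s,
            %(staging_id)s, %(updated_at)s)
    ON CONFLICT (staging_id)
    DO UPDATE SET
        countryName = EXCLUDED.countryName,
        portName    = EXCLUDED.portName,
        unlocode    = EXCLUDED.unlocode,
        coordinates = EXCLUDED.coordinates,
        updated_at  = EXCLUDED.updated_at;
"""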
def execute(self, context):
    mongoHook = MongoHook(conn_id=self.mongo_conn_id)
    log.info('odbc_conn_id: %s', self.odbc_conn_id)
    log.info('postgres_conn_id: %s', self.postgres_conn_id)
    log.info('mongo_conn_id: %s', self.mongo_conn_id)
    log.info('mongo_db: %s', mongoHook.connection.schema)
    log.info('mongo_collection: %s', self.mongo_collection)
    log.info('odbc_sql: %s', self.odbc_sql)
    log.info('postgres_sql: %s', self.postgres_sql)
    log.info('postgres_insert_sql: %s', self.postgres_insert_sql)

    mongo_well_list = mongoHook.get_collection(
        self.mongo_collection).distinct("Name")
    log.info('mongo well list: %s', mongo_well_list)
    odbc_well_list = self.get_data()
    log.info('odbc well list: %s', odbc_well_list)

    final_well_list = []
    if not mongo_well_list:
        # nothing in mongo yet, so every ODBC well counts as new
        final_well_list = self.prepare_well_list(odbc_well_list, True)
    else:
        mongo_filtered_well_list = self.prepare_well_list(
            mongo_well_list, False)
        new_well_list = list(set(odbc_well_list) - set(mongo_well_list))
        log.info('new well list: %s', new_well_list)
        new_well_list = self.prepare_well_list(new_well_list, True)
        postgres_well_list = self.get_well_data()
        if postgres_well_list.empty:
            for item in new_well_list:
                final_well_list.append(item)
        else:
            final_well_list = new_well_list

    log.info('final well list for insert: %s', final_well_list)
    if final_well_list:
        self.insert_data(final_well_list)
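# prepare_well_list() is not shown. Based on how its output is consumed
# elsewhere (well['well_name'], well['is_newly_added']), it plausibly tags
# each bare well name with a "newly added" flag; a sketch under that
# assumption, not the original implementation:
def prepare_well_list(self, well_names, is_newly_added):
    """Wrap bare well names into records carrying the is_newly_added flag."""
    return [
        {"well_name": name, "is_newly_added": is_newly_added}
        for name in well_names
    ]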
def test_transform_load_operator_exception_error(
    self, mocker, postgresql, ports_collection, test_dag
):
    """Test if transform_load_operator handles a thrown Exception."""
    # Create mocks
    mocker.patch.object(
        PostgresHook, "get_conn", return_value=postgresql
    )
    mocker.patch.object(
        MongoHook, "get_collection", return_value=ports_collection
    )

    # Check if the source collection has an item in it
    mongo_hook = MongoHook()
    collection = mongo_hook.get_collection()
    assert collection.count_documents({}) > 0

    # Setup task
    mongo_staging_config = MongoConfig('mongo_default', 'ports')
    postgres_master_config = PostgresConfig('postgres_default')
    task = TransformAndLoadOperator(
        mongo_config=mongo_staging_config,
        postgres_config=postgres_master_config,
        task_id='test',
        processor=PortsItemProcessor(),
        query='Wrong SQL query',
        dag=test_dag
    )

    # Execute task and check that it raises an Exception
    with raises(Exception):
        task.execute(context={}, testing=True)
def test_save_to_json_operator(
    self, mocker, postgresql, ports_collection, test_dag, tmp_path: Path
):
    """Test if save_to_json_operator saves the file on a specified path."""
    # Create mocks
    mocker.patch.object(
        PostgresHook, "get_conn", return_value=postgresql
    )
    mocker.patch.object(
        MongoHook, "get_collection", return_value=ports_collection
    )

    # Check if the source collection has an item in it
    mongo_hook = MongoHook()
    collection = mongo_hook.get_collection()
    assert collection.count_documents({}) > 0

    # Setup some data: transfer staging data to master
    mongo_staging_config = MongoConfig('mongo_default', 'ports')
    postgres_master_config = PostgresConfig('postgres_default')
    transform_load = TransformAndLoadOperator(
        mongo_config=mongo_staging_config,
        postgres_config=postgres_master_config,
        task_id='test',
        processor=PortsItemProcessor(),
        query=SqlQueries.ports_table_insert,
        query_params={"updated_at": datetime.datetime.utcnow()},
        dag=test_dag
    )

    # Execute task and check if it inserted the data successfully
    transform_load.execute(context={}, testing=True)
    pg_hook = PostgresHook()
    cursor = pg_hook.get_conn().cursor()
    cursor.execute("SELECT COUNT(*) FROM ports;")
    after_result = cursor.fetchone()[0]
    assert after_result > 0

    # Extend tmp_path so the operator is forced to create the directory itself
    tmp_path = tmp_path / 'unknown-path'

    # Execute save_to_json to save the data into a json file on tmp_path
    save_to_json = LoadToJsonOperator(
        task_id='export_to_json',
        postgres_config=postgres_master_config,
        query=SqlQueries.select_all_query_to_json,
        path=tmp_path,
        tables=['ports'],
        dag=test_dag
    )
    save_to_json.execute(
        {'execution_date': datetime.datetime(2021, 1, 1)}
    )

    output_path = tmp_path / 'ports_20210101T000000.json'
    expected_data = {
        'ports': [{
            'id': 1,
            'countryName': 'Philippines',
            'portName': 'Aleran/Ozamis',
            'unlocode': 'PH ALE',
            'coordinates': '4234N 00135E'
        }]
    }

    # Read result
    with open(output_path, "r") as f:
        result = json.load(f)

    # Assert
    assert 'ports' in result
    assert result == expected_data
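# The ports_collection fixture these tests rely on is not shown. One way to
# build it is with mongomock, seeding the single document that expected_data
# above relies on; this is an assumed setup, not the project's actual fixture.
import mongomock
import pytest


@pytest.fixture
def ports_collection():
    # in-memory stand-in for the Mongo staging collection
    collection = mongomock.MongoClient().staging.ports
    collection.insert_one({
        'countryName': 'Philippines',
        'portName': 'Aleran/Ozamis',
        'unlocode': 'PH ALE',
        'coordinates': '4234N 00135E',
    })
    return collection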
def test_save_to_json_operator_database_error(
    self, mocker, postgresql, ports_collection, test_dag, tmp_path: Path
):
    """Test if save_to_json_operator can handle errors related to the db."""
    # Create mocks
    mocker.patch.object(
        PostgresHook, "get_conn", return_value=postgresql
    )
    mocker.patch.object(
        MongoHook, "get_collection", return_value=ports_collection
    )

    # Check if the source collection has an item in it
    mongo_hook = MongoHook()
    collection = mongo_hook.get_collection()
    assert collection.count_documents({}) > 0

    # Setup some data: transfer staging data to master
    mongo_staging_config = MongoConfig('mongo_default', 'ports')
    postgres_master_config = PostgresConfig('postgres_default')
    transform_load = TransformAndLoadOperator(
        mongo_config=mongo_staging_config,
        postgres_config=postgres_master_config,
        task_id='test',
        processor=PortsItemProcessor(),
        query=SqlQueries.ports_table_insert,
        query_params={"updated_at": datetime.datetime.utcnow()},
        dag=test_dag
    )

    # Execute task and check if it inserted the data successfully
    transform_load.execute(context={}, testing=True)
    pg_hook = PostgresHook()
    cursor = pg_hook.get_conn().cursor()
    cursor.execute("SELECT COUNT(*) FROM ports;")
    after_result = cursor.fetchone()[0]
    assert after_result > 0

    # Execute save_to_json against a table that does not exist
    save_to_json = LoadToJsonOperator(
        task_id='test2',
        postgres_config=postgres_master_config,
        query=SqlQueries.select_all_query_to_json,
        path=tmp_path,
        tables=['foo'],
        dag=test_dag
    )
    with raises((UndefinedTable, OperationalError, Exception)):
        # Set testing=False to implicitly close the database connection
        save_to_json.execute(
            {'execution_date': datetime.datetime(2021, 1, 1)},
            testing=False
        )
        save_to_json.execute(
            {'execution_date': datetime.datetime(2021, 1, 1)},
            testing=True
        )