def execute(self, context, testing=False):
    """
    Read all data from mongo db, process it and write to postgresql db.
    Uses UPSERT SQL query to write data.

    Every document of the staging collection is passed through
    ``self._processor.process_item``; its Mongo ``_id`` is moved to a
    string ``staging_id`` field and the document is used as the parameter
    mapping for ``self._sql_query``. Each row is committed individually.

    :param context: Execution context passed by the caller; unused here.
    :param testing: When True, the Postgres and Mongo connections are left
        open after the run instead of being closed.
    :type testing: bool
    :raises Exception: "LoadToMasterdbOperator FAILED." on any error while
        reading, processing or writing; the original error is chained as
        ``__cause__``.
    """
    self.log.info('LoadToMasterdbOperator Starting...')

    self.log.info("Initializing Mongo Staging DB Connection...")
    mongo_hook = MongoHook(conn_id=self._mongo_conn_id)
    ports_collection = mongo_hook.get_collection(self._mongo_collection)

    self.log.info("Initializing Postgres Master DB Connection...")
    psql_hook = PostgresHook(postgres_conn_id=self._postgres_conn_id)
    psql_conn = psql_hook.get_conn()
    psql_cursor = psql_conn.cursor()

    self.log.info("Loading Staging data to Master Database...")
    # Start at zero so an empty collection reports "0 records" rather than
    # failing on an unbound loop variable after the loop.
    upserted = 0
    try:
        for upserted, document in enumerate(ports_collection.find({}), start=1):
            document = self._processor.process_item(document)
            # Replace the Mongo ObjectId with its string form under a new key.
            document['staging_id'] = str(document.pop('_id'))
            psql_cursor.execute(self._sql_query, document)
            psql_conn.commit()
    except (OperationalError, UndefinedTable, OperationFailure) as err:
        self.log.error("Writting to database FAILED.")
        self.log.error(traceback.format_exc())
        raise Exception("LoadToMasterdbOperator FAILED.") from err
    except Exception as err:
        self.log.error(traceback.format_exc())
        raise Exception("LoadToMasterdbOperator FAILED.") from err
    finally:
        if not testing:
            self.log.info('Closing database connections...')
            psql_conn.close()
            mongo_hook.close_conn()

    self.log.info(f'UPSERTED {upserted} records into Postgres Database.')
    self.log.info('LoadToMasterdbOperator SUCCESS!')
class BaseAPIOperator(BaseOperator):
    """
    Base Operator for API Requests to a main endpoint that generates
    subendpoints for further requests.

    Subclasses implement ``_execute()``; ``execute()`` creates the HTTP and
    MongoDB hooks, builds the query, delegates to ``_execute()`` and closes
    the MongoDB connection afterwards.

    :param endpoint: The API endpoint to query.
    :type endpoint: str
    :param parser: Function that parses the endpoint response, into a list of
        sub-endpoints. Should return a list of strings.
    :type parser: function that takes a requests.Response object and returns
        a list of sub-endpoints
    :param response_count: Function that returns number of items in API
        response
    :type response_count: Callable[[requests.Response], int]
    :param number_of_batches: Number of batches used in the DAG Run.
    :type number_of_batches: int
    :param http_conn_id: Airflow Connection variable name for the base API
        URL.
    :type http_conn_id: str
    :param mongo_conn_id: Airflow Connection variable name for the MongoDB.
    :type mongo_conn_id: str
    :param batch_name: Batch name, stored as ``self.batch_name``; this base
        class does not read it itself.
    :type batch_name: str
    :param response_valid: Function that checks if status code is valid.
        Defaults to 200 status only.
    :type response_valid: Callable[[requests.Response], bool]
    :param query_builder: Function that returns a Dictionary of query
        parameters. Called with no arguments. Defaults to an empty dict.
    :type query_builder: Callable[[], Dict[str, str]]
    :param header: Headers to be added to API request.
    :type header: dict of string key-value pairs
    :param options: Optional keyword arguments for the Requests library get
        function.
    :type options: dict of string key-value pairs
    :param log_response: Flag to allow for logging Request response.
        Defaults to False.
    :type log_response: bool
    """

    @apply_defaults
    def __init__(
        self,
        endpoint: str,
        # Function that parses a response to gather specific endpoints
        parser: Callable[[requests.Response], list],
        # Determines the number of items from query
        response_count: Callable[[requests.Response], int],
        number_of_batches: int,
        http_conn_id: str,
        mongo_conn_id: str,
        batch_name: str,
        response_valid: Optional[Callable[[requests.Response], bool]] = None,
        query_builder: Optional[Callable[[], Dict[str, str]]] = None,
        header: Optional[Dict[str, str]] = None,
        options: Optional[Dict[str, Any]] = None,
        log_response: bool = False,
        **kwargs,
    ) -> None:
        # delegate to BaseOperator, we don't need to do anything else
        super().__init__(**kwargs)

        self.number_of_batches = number_of_batches

        # API endpoint information, we should only be making GET requests
        # from here. Header is most likely unnecessary.
        self.endpoint = endpoint
        self.method = "GET"
        self.query_builder = query_builder or self._default_query_builder
        self.header = header or {}
        self.http_conn_id = http_conn_id
        self.mongo_conn_id = mongo_conn_id
        self.batch_name = batch_name

        # Functions for operating on response data
        self.parser = parser
        self.response_count = response_count
        self.response_valid = response_valid or self._default_response_valid

        # Options is for Requests library functions
        self.options = options or {}
        self.log_response = log_response

        # these get instantiated on execute
        self.http = None
        self.mongo_conn = None

    # Override the execute method, we want any derived classes to override
    # _execute()
    def execute(self, context: Dict[str, Any]) -> Any:
        """
        Set up the hooks and query, run ``_execute()`` and return its result.

        The MongoDB connection is closed via ``_shutdown()`` whether
        ``_execute()`` returns or raises.

        :param context: Execution context, forwarded unchanged to
            ``_execute()``.
        :return: Whatever ``_execute()`` returns.
        """
        self.http = HttpHook(self.method, http_conn_id=self.http_conn_id)
        self.mongo_conn = MongoHook(self.mongo_conn_id)
        try:
            # generate query parameters
            self.query = self.query_builder()
            self.log.info(f"Connecting to: {self.http_conn_id}")
            return self._execute(context)
        finally:
            # Runs on failure too, so the Mongo connection is not leaked.
            self._shutdown()

    def _execute(self, context: Dict[str, Any]) -> Any:
        """Operator logic; must be implemented by subclasses."""
        raise NotImplementedError(
            "_execute() needs to be defined for subclasses.")

    def _call_once(self, use_query: bool = False) -> Optional[Any]:
        """
        Execute a single API call.

        :param use_query: If use_query is true, we use the internal query
            parameters built in ``execute()``; otherwise an empty dict is
            sent.
        :type use_query: bool (defaults to False)
        :return: The JSON-decoded response body, or None when the response
            fails ``response_valid`` or cannot be decoded as JSON.
        """
        response = self.http.run(
            self.endpoint,
            self.query if use_query else {},
            self.header,
            self.options,
        )
        if self.log_response:
            self.log.info(response.url)
        if not self.response_valid(response):
            return None
        return self._to_json(response)

    def _to_json(self, response: requests.Response) -> Optional[Any]:
        """Decode the response body as JSON; log and return None on failure."""
        try:
            return response.json()
        except JSONDecodeError:
            self.log.error(
                f"Failed to convert response to JSON: {response.url}")
            return None

    def _api_id_to_document(self, _id: str, name: str,
                            batch_id: int) -> Dict[str, str]:
        """Build a document with ``api_id`` and ``batch_id`` (name + id) keys."""
        return {"api_id": str(_id), "batch_id": f"{name}{batch_id}"}

    def _default_query_builder(self) -> dict:
        """Default query_builder() function. Returns no query parameters."""
        return {}

    def _default_response_valid(self, response: requests.Response) -> bool:
        """Default response_valid() function. Returns True only on 200."""
        return response.status_code == 200

    def _shutdown(self) -> None:
        """Explicitly close MongoDB connection"""
        if self.mongo_conn:
            self.mongo_conn.close_conn()