def _fetch_data(self, config, provider):
    url = config['url']
    api_key = config['api_key']
    last_update = provider.get('last_updated', utcfromtimestamp(0)).strftime('%Y-%m-%dT%H:%M:%S')

    # Results are paginated so we'll read this many at a time
    offset_jump = 10

    params = {'start': last_update, 'limit': offset_jump}
    headers = {'apikey': api_key}

    items = []
    offset = 0
    while True:
        params['offset'] = offset
        try:
            response = requests.get(url, params=params, headers=headers, timeout=30)
        except requests.exceptions.ConnectionError as err:
            raise IngestApiError.apiConnectionError(exception=err)

        if response.ok:
            # The total number of results is given to us in the JSON payload;
            # read it with a regex so we don't have to parse the whole
            # response as JSON. Guard against a missing field instead of
            # calling .group() on a possibly-None match.
            match = re.search(r'"total": *([0-9]+)', response.text)
            if match is None:
                raise IngestApiError.apiGeneralError(Exception(response.text), provider)
            num_results = int(match.group(1))
            if num_results > 0:
                items.append(response.text)
            if offset >= num_results:
                return items
            offset += offset_jump
        else:
            if re.match('Error: No API Key provided', response.text):
                raise IngestApiError.apiAuthError(Exception(response.text), provider)
            elif response.status_code == 404:
                raise IngestApiError.apiNotFoundError(Exception(response.reason), provider)
            else:
                raise IngestApiError.apiGeneralError(Exception(response.reason), provider)
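# Illustrative sketch (not from the codebase above): the offset-based
# pagination loop used by _fetch_data, in isolation. ``fetch_page`` is a
# hypothetical callable returning (page_items, total_count).
def paginate(fetch_page, page_size=10):
    collected = []
    offset = 0
    while True:
        page_items, total_count = fetch_page(offset=offset, limit=page_size)
        collected.extend(page_items)
        if offset >= total_count:
            return collected
        offset += page_size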
def _get_tree(self, endpoint, payload=None):
    """Get xml response for given API endpoint and payload.

    :param endpoint: API endpoint
    :type endpoint: str
    :param payload: query parameters
    :type payload: dict
    """
    if payload is None:
        payload = {}
    payload['token'] = self._get_auth_token(self.provider, update=True)
    url = self._get_absolute_url(endpoint)

    if not self.session:
        self.session = requests.Session()
    retries = 0
    while True:
        try:
            response = self.session.get(url, params=payload, timeout=(30, 15))
        except requests.exceptions.Timeout as ex:
            if retries < 3:
                logger.warning('Reuters API timeout, retrying (attempt %s)', retries)
                retries += 1
                continue
            raise IngestApiError.apiTimeoutError(ex, self.provider)
        except requests.exceptions.TooManyRedirects as ex:
            # Tell the user their URL was bad and try a different one
            raise IngestApiError.apiRedirectError(ex, self.provider)
        except requests.exceptions.RequestException as ex:
            # catastrophic error. bail.
            raise IngestApiError.apiRequestError(ex, self.provider)
        except Exception as error:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(error, self.provider)
        if response.status_code == 404:
            raise LookupError(_('Not found {payload}').format(payload=payload))
        break

    try:
        return etree.fromstring(response.content)  # workaround for http mock lib
    except UnicodeEncodeError as error:
        traceback.print_exc()
        raise IngestApiError.apiUnicodeError(error, self.provider)
    except ParseError as error:
        traceback.print_exc()
        raise IngestApiError.apiParseError(error, self.provider)
    except Exception as error:
        traceback.print_exc()
        raise IngestApiError.apiGeneralError(error, self.provider)
def _get_tree(self, endpoint, payload=None):
    """Get xml response for given API endpoint and payload.

    :param endpoint: API endpoint
    :type endpoint: str
    :param payload: query parameters
    :type payload: dict
    """
    if payload is None:
        payload = {}
    payload['token'] = self._get_auth_token(self.provider, update=True)
    url = self._get_absolute_url(endpoint)

    if not self.session:
        self.session = requests.Session()
    retries = 0
    while True:
        try:
            response = self.session.get(url, params=payload, timeout=(30, 15))
        except requests.exceptions.Timeout as ex:
            if retries < 3:
                logger.warning('Reuters API timeout, retrying (attempt %s)', retries)
                retries += 1
                continue
            raise IngestApiError.apiTimeoutError(ex, self.provider)
        except requests.exceptions.TooManyRedirects as ex:
            # Tell the user their URL was bad and try a different one
            raise IngestApiError.apiRedirectError(ex, self.provider)
        except requests.exceptions.RequestException as ex:
            # catastrophic error. bail.
            raise IngestApiError.apiRequestError(ex, self.provider)
        except Exception as error:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(error, self.provider)
        if response.status_code == 404:
            raise LookupError('Not found %s' % payload)
        break

    try:
        return etree.fromstring(response.content)  # workaround for http mock lib
    except UnicodeEncodeError as error:
        traceback.print_exc()
        raise IngestApiError.apiUnicodeError(error, self.provider)
    except ParseError as error:
        traceback.print_exc()
        raise IngestApiError.apiParseError(error, self.provider)
    except Exception as error:
        traceback.print_exc()
        raise IngestApiError.apiGeneralError(error, self.provider)
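# Illustrative sketch (an assumption, not Superdesk API): the bounded
# retry-on-timeout shape used by the session-based _get_tree above, with an
# explicit exponential backoff added for clarity.
import time
import requests

def get_with_retries(session, url, max_retries=3, timeout=(30, 15)):
    for attempt in range(max_retries + 1):
        try:
            return session.get(url, timeout=timeout)
        except requests.exceptions.Timeout:
            if attempt == max_retries:
                raise  # out of retries, surface the timeout to the caller
            time.sleep(2 ** attempt)  # back off: 1s, 2s, 4s, ...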
def _update(self, provider, update):
    # Each update run will retrieve the data for a single "market"
    market_index = provider.get('private', {}).get('market_index', 0)
    # default must be the string '[]', not a list, since we json-decode it
    markets = json.loads(
        provider.get('config', {}).get('market_definitions', '[]').replace('\'', '"'))
    market = markets[market_index]
    logger.info('Retrieving fuel data for the {} market'.format(market.get('market')))
    try:
        self.session_token = self._get_token(provider).get('id')
        prices = self._get_prices(provider, market)
        self._save(prices, market)
    except Exception as ex:
        raise IngestApiError.apiGeneralError(ex, self.provider)
    finally:
        # Save the next market to process
        market_index = (market_index + 1) % len(markets)
        get_resource_service('ingest_providers').system_update(
            provider.get('_id'),
            {'private': {'market_index': market_index}},
            provider)
    return None
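# Illustrative sketch: the round-robin rotation _update relies on, reduced to
# plain values. Each run handles one market and persists the index of the
# next one, so successive runs cycle through the whole list.
markets = ['Sydney', 'Melbourne', 'Brisbane']  # hypothetical market names
market_index = 0
for _run in range(4):
    market = markets[market_index]
    market_index = (market_index + 1) % len(markets)  # wraps back to 0
    print('processed', market, '- next index:', market_index)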
def _generate_auth_token(self, provider):
    """Generate an authentication token as per the configuration in the ingest provider.

    :param provider: Ingest provider details to which the current directory has been configured
    :type provider: dict :py:class:`superdesk.io.ingest_provider_model.IngestProviderResource`
    :return: token details if successfully authenticated
    :rtype: str
    :raises IngestApiError.apiGeneralError: if auth_url is missing in the ingest provider configuration
    """
    session = requests.Session()
    session.mount('https://', SSLAdapter())

    auth_url = provider.get('config', {}).get('auth_url', None)
    if not auth_url:
        raise IngestApiError.apiGeneralError(
            provider=provider,
            exception=KeyError(
                'Ingest Provider {} is missing Authentication URL. '
                'Please check the configuration.'.format(provider['name'])))

    payload = {
        'username': provider.get('config', {}).get('username', ''),
        'password': provider.get('config', {}).get('password', ''),
    }
    response = session.get(auth_url, params=payload, verify=False, timeout=30)
    if response.status_code < 200 or response.status_code >= 300:
        raise IngestApiError.apiAuthError(provider=provider)

    tree = etree.fromstring(response.content)  # workaround for http mock lib
    return tree.text
def _fetch_data(self, config, provider): """Fetch the latest feed data. :param dict config: RSS resource configuration :param provider: data provider instance, needed as an argument when raising ingest errors :return: fetched RSS data :rtype: str :raises IngestApiError: if fetching data fails for any reason (e.g. authentication error, resource not found, etc.) """ url = config['url'] if config.get('auth_required', False): auth = (config.get('username'), config.get('password')) else: auth = None response = requests.get(url, auth=auth) if response.ok: return response.content else: if response.status_code in (401, 403): raise IngestApiError.apiAuthError( Exception(response.reason), provider) elif response.status_code == 404: raise IngestApiError.apiNotFoundError( Exception(response.reason), provider) else: raise IngestApiError.apiGeneralError( Exception(response.reason), provider)
def _generate_auth_token(self, provider):
    """Generate an authentication token as per the configuration in the ingest provider.

    :param provider: Ingest provider details to which the current directory has been configured
    :type provider: dict :py:class:`superdesk.io.ingest_provider_model.IngestProviderResource`
    :return: token details if successfully authenticated
    :rtype: str
    :raises IngestApiError.apiGeneralError: if auth_url is missing in the ingest provider configuration
    """
    session = requests.Session()
    session.mount('https://', SSLAdapter())

    auth_url = provider.get('config', {}).get('auth_url', None)
    if not auth_url:
        raise IngestApiError.apiGeneralError(
            provider=provider,
            exception=KeyError(
                'Ingest Provider {} is missing Authentication URL. '
                'Please check the configuration.'.format(provider['name'])))

    payload = {
        'username': provider.get('config', {}).get('username', ''),
        'password': provider.get('config', {}).get('password', ''),
    }
    response = session.get(auth_url, params=payload, verify=False, timeout=30)

    tree = etree.fromstring(response.content)  # workaround for http mock lib
    return tree.text
def _test(self, provider):
    config = provider.get('config', {})
    url = config['url']
    api_key = config['api_key']

    # limit the data to a single article and filter out all article fields
    # to save bandwidth
    params = {'limit': 1, 'fields': 'id'}
    headers = {'apikey': api_key}

    try:
        response = requests.get(url, params=params, headers=headers, timeout=30)
    except requests.exceptions.ConnectionError as err:
        raise IngestApiError.apiConnectionError(exception=err)

    if not response.ok:
        if response.status_code == 404:
            raise IngestApiError.apiNotFoundError(Exception(response.reason), provider)
        else:
            raise IngestApiError.apiGeneralError(Exception(response.reason), provider)
def _fetch_data(self, config, provider): """Fetch the latest feed data. :param dict config: RSS resource configuration :param provider: data provider instance, needed as an argument when raising ingest errors :return: fetched RSS data :rtype: str :raises IngestApiError: if fetching data fails for any reason (e.g. authentication error, resource not found, etc.) """ url = config["url"] if config.get("auth_required", False): auth = (config.get("username"), config.get("password")) else: auth = None response = requests.get(url, auth=auth) if response.ok: return response.content else: if response.status_code in (401, 403): raise IngestApiError.apiAuthError(Exception(response.reason), provider) elif response.status_code == 404: raise IngestApiError.apiNotFoundError(Exception(response.reason), provider) else: raise IngestApiError.apiGeneralError(Exception(response.reason), provider)
class WufooFeedingService(FeedingService):
    """
    Feeding Service class which can read article(s) using Wufoo API.
    """

    NAME = "wufoo"
    ERRORS = [
        IngestApiError.apiTimeoutError().get_error_description(),
        IngestApiError.apiRedirectError().get_error_description(),
        IngestApiError.apiRequestError().get_error_description(),
        IngestApiError.apiGeneralError().get_error_description(),
    ]
    label = "Wufoo feed API"
    fields = [
        {
            "id": "wufoo_username",
            "type": "text",
            "label": "Login",
            "placeholder": "Wufoo login",
            "required": True,
        },
        {
            "id": "wufoo_api_key",
            "type": "password",
            "label": "API key",
            "placeholder": "Wufoo API Key",
            "required": True,
        },
    ]

    def __init__(self):
        super().__init__()
        self.fields_cache = {}

    def _update(self, provider, update):
        user = provider["config"]["wufoo_username"]
        wufoo_data = {
            "url": WUFOO_URL.format(subdomain=user),
            "user": user,
            "api_key": provider["config"]["wufoo_api_key"],
            "form_query_entries_tpl": WUFOO_QUERY_FORM + WUFOO_QUERY_ENTRIES,
            "update": update,
        }
        try:
            parser = self.get_feed_parser(provider, None)
        except requests.exceptions.Timeout as ex:
            raise IngestApiError.apiTimeoutError(ex, provider)
        except requests.exceptions.TooManyRedirects as ex:
            raise IngestApiError.apiRedirectError(ex, provider)
        except requests.exceptions.RequestException as ex:
            raise IngestApiError.apiRequestError(ex, provider)
        except Exception as error:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(error, self.provider)
        items = parser.parse(wufoo_data, provider)
        return [items]
def _get_tree(self, endpoint, payload=None): """ Get xml response for given API endpoint and payload. :param: endpoint :type endpoint: str :param: payload :type payload: str """ if payload is None: payload = {} payload['token'] = self._get_auth_token(self.provider, update=True) url = self._get_absolute_url(endpoint) try: response = requests.get(url, params=payload, timeout=15) except requests.exceptions.Timeout as ex: # Maybe set up for a retry, or continue in a retry loop raise IngestApiError.apiTimeoutError(ex, self.provider) except requests.exceptions.TooManyRedirects as ex: # Tell the user their URL was bad and try a different one raise IngestApiError.apiRedirectError(ex, self.provider) except requests.exceptions.RequestException as ex: # catastrophic error. bail. raise IngestApiError.apiRequestError(ex, self.provider) except Exception as error: traceback.print_exc() raise IngestApiError.apiGeneralError(error, self.provider) if response.status_code == 404: raise LookupError('Not found %s' % payload) try: return etree.fromstring( response.content) # workaround for http mock lib except UnicodeEncodeError as error: traceback.print_exc() raise IngestApiError.apiUnicodeError(error, self.provider) except ParseError as error: traceback.print_exc() raise IngestApiError.apiParseError(error, self.provider) except Exception as error: traceback.print_exc() raise IngestApiError.apiGeneralError(error, self.provider)
class WufooFeedingService(FeedingService):
    """
    Feeding Service class which can read article(s) using Wufoo API.
    """

    NAME = 'wufoo'
    ERRORS = [
        IngestApiError.apiTimeoutError().get_error_description(),
        IngestApiError.apiRedirectError().get_error_description(),
        IngestApiError.apiRequestError().get_error_description(),
        IngestApiError.apiGeneralError().get_error_description(),
    ]
    label = 'Wufoo feed API'
    fields = [{
        'id': 'wufoo_username',
        'type': 'text',
        'label': 'Login',
        'placeholder': 'Wufoo login',
        'required': True
    }, {
        'id': 'wufoo_api_key',
        'type': 'password',
        'label': 'API key',
        'placeholder': 'Wufoo API Key',
        'required': True
    }]
    parser_restricted_values = ['wufoo']

    def __init__(self):
        super().__init__()  # was missing; the base class initializer must run
        self.fields_cache = {}

    def _update(self, provider, update):
        user = provider['config']['wufoo_username']
        wufoo_data = {
            "url": WUFOO_URL.format(subdomain=user),
            "user": user,
            "api_key": provider['config']['wufoo_api_key'],
            "form_query_entries_tpl": WUFOO_QUERY_FORM + WUFOO_QUERY_ENTRIES,
            "update": update
        }
        try:
            parser = self.get_feed_parser(provider, None)
        except requests.exceptions.Timeout as ex:
            raise IngestApiError.apiTimeoutError(ex, provider)
        except requests.exceptions.TooManyRedirects as ex:
            raise IngestApiError.apiRedirectError(ex, provider)
        except requests.exceptions.RequestException as ex:
            raise IngestApiError.apiRequestError(ex, provider)
        except Exception as error:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(error, self.provider)
        items = parser.parse(wufoo_data, provider)
        return [items]
def _update(self, provider, update):
    updated = utcnow()
    last_updated = provider.get('last_updated')
    ttl_minutes = app.config['INGEST_EXPIRY_MINUTES']
    if not last_updated or last_updated < updated - datetime.timedelta(minutes=ttl_minutes):
        last_updated = updated - datetime.timedelta(minutes=ttl_minutes)

    self.provider = provider
    provider_config = provider.get('config')
    if not provider_config:
        provider_config = {}
        provider['config'] = provider_config

    self.URL = provider_config.get('url')
    payload = {}

    parser = self.get_feed_parser(provider)

    try:
        response = requests.get(self.URL, params=payload, timeout=15)
        # TODO: check if the file has been updated since the provider's
        # last_updated; some providers do not include 'Last-Modified' in the
        # headers, so it is unclear how to do this reliably.
        logger.info('Http Headers: %s', response.headers)
    except requests.exceptions.Timeout as ex:
        # Maybe set up for a retry, or continue in a retry loop
        raise IngestApiError.apiTimeoutError(ex, self.provider)
    except requests.exceptions.TooManyRedirects as ex:
        # Tell the user their URL was bad and try a different one
        raise IngestApiError.apiRedirectError(ex, self.provider)
    except requests.exceptions.RequestException as ex:
        # catastrophic error. bail.
        raise IngestApiError.apiRequestError(ex, self.provider)
    except Exception as error:
        traceback.print_exc()
        raise IngestApiError.apiGeneralError(error, self.provider)

    if response.status_code == 404:
        raise LookupError('Not found %s' % payload)

    logger.info('Ingesting: %s', str(response.content))

    if isinstance(parser, NTBEventXMLFeedParser):
        xml = ET.fromstring(response.content)
        items = parser.parse(xml, provider)
    elif isinstance(parser, IcsTwoFeedParser):
        cal = Calendar.from_ical(response.content)
        items = parser.parse(cal, provider)
    else:
        items = parser.parse(response.content)

    if isinstance(items, list):
        yield items
    else:
        yield [items]
def _get_tree(self, endpoint, payload=None): """ Get xml response for given API endpoint and payload. :param: endpoint :type endpoint: str :param: payload :type payload: str """ if payload is None: payload = {} payload['token'] = self._get_auth_token(self.provider, update=True) url = self._get_absolute_url(endpoint) try: response = requests.get(url, params=payload, timeout=15) except requests.exceptions.Timeout as ex: # Maybe set up for a retry, or continue in a retry loop raise IngestApiError.apiTimeoutError(ex, self.provider) except requests.exceptions.TooManyRedirects as ex: # Tell the user their URL was bad and try a different one raise IngestApiError.apiRedirectError(ex, self.provider) except requests.exceptions.RequestException as ex: # catastrophic error. bail. raise IngestApiError.apiRequestError(ex, self.provider) except Exception as error: traceback.print_exc() raise IngestApiError.apiGeneralError(error, self.provider) if response.status_code == 404: raise LookupError('Not found %s' % payload) try: return etree.fromstring(response.content) # workaround for http mock lib except UnicodeEncodeError as error: traceback.print_exc() raise IngestApiError.apiUnicodeError(error, self.provider) except ParseError as error: traceback.print_exc() raise IngestApiError.apiParseError(error, self.provider) except Exception as error: traceback.print_exc() raise IngestApiError.apiGeneralError(error, self.provider)
class EventHTTPFeedingService(HTTPFeedingServiceBase):
    """
    Feeding Service class which can read events using HTTP.
    """

    NAME = 'event_http'
    label = 'Event HTTP feed'
    service = 'events'
    fields = [
        {
            'id': 'url',
            'type': 'text',
            'label': 'Feed URL',
            'placeholder': 'Feed URL',
            'required': True
        }
    ]
    ERRORS = [
        IngestApiError.apiTimeoutError().get_error_description(),
        IngestApiError.apiRedirectError().get_error_description(),
        IngestApiError.apiRequestError().get_error_description(),
        IngestApiError.apiUnicodeError().get_error_description(),
        IngestApiError.apiParseError().get_error_description(),
        IngestApiError.apiGeneralError().get_error_description(),
    ]
    HTTP_AUTH = False

    def _update(self, provider, update):
        """
        Fetch events from external API.

        :param provider: Ingest Provider Details.
        :type provider: dict
        :param update: Any update that is required on provider.
        :type update: dict
        :return: a list of events which can be saved.
        """
        response = self.get_url(self.config['url'])
        parser = self.get_feed_parser(provider)
        logger.info('Ingesting events with {} parser'.format(parser.__class__.__name__))
        logger.info('Ingesting content: {} ...'.format(str(response.content)[:4000]))

        if hasattr(parser, 'parse_http'):
            items = parser.parse_http(response.content, provider)
        else:
            items = parser.parse(response.content)

        if isinstance(items, list):
            yield items
        else:
            yield [items]
def test_raise_apiGeneralError(self):
    with assert_raises(IngestApiError) as error_context:
        ex = Exception("Testing general API error")
        raise IngestApiError.apiGeneralError(ex, self.provider)
    exception = error_context.exception
    self.assertEqual(exception.code, 4000)
    self.assertEqual(exception.message, "Unknown API ingest error")
    self.assertEqual(exception.provider_name, "TestProvider")
    self.assertIsNotNone(exception.system_exception)
    self.assertEqual(exception.system_exception.args[0], "Testing general API error")
    self.assertEqual(len(self.mock_logger_handler.messages['error']), 1)
    self.assertEqual(
        self.mock_logger_handler.messages['error'][0],
        "IngestApiError Error 4000 - Unknown API ingest error: "
        "Testing general API error on channel TestProvider")
def _generate_auth_token(self, provider):
    """Generate an authentication token as per the configuration in the ingest provider.

    :param provider: Ingest provider details to which the current directory has been configured
    :type provider: dict :py:class:`superdesk.io.ingest_provider_model.IngestProviderResource`
    :return: token details if successfully authenticated
    :rtype: str
    :raises IngestApiError.apiGeneralError: if auth_url is missing in the ingest provider configuration
    """
    session = requests.Session()
    session.mount("https://", SSLAdapter())

    auth_url = provider.get("config", {}).get("auth_url", None)
    if not auth_url:
        raise IngestApiError.apiGeneralError(
            provider=provider,
            exception=KeyError(
                "Ingest Provider {} is missing Authentication URL. "
                "Please check the configuration.".format(provider["name"])),
        )

    payload = {
        "username": provider.get("config", {}).get("username", ""),
        "password": provider.get("config", {}).get("password", ""),
    }
    response = session.get(auth_url, params=payload, verify=False, timeout=30)
    if response.status_code < 200 or response.status_code >= 300:
        try:
            response.raise_for_status()
        except Exception:
            err = IngestApiError.apiAuthError(provider=provider)
            self.close_provider(provider, err, force=True)
            raise err

    tree = etree.fromstring(response.content)  # workaround for http mock lib
    return tree.text
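# Illustrative sketch: SSLAdapter above is project-specific. Mounting any
# transport adapter on a URL prefix works the same way with the stock
# requests HTTPAdapter, shown here with connection-level retries.
import requests
from requests.adapters import HTTPAdapter

session = requests.Session()
session.mount('https://', HTTPAdapter(max_retries=2))
# every session.get('https://...') now goes through the mounted adapter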
def _fetch_data(self, config, provider): """Fetch the latest feed data. :param dict config: RSS resource configuration :param provider: data provider instance, needed as an argument when raising ingest errors :return: fetched RSS data :rtype: str :raises IngestApiError: if fetching data fails for any reason (e.g. authentication error, resource not found, etc.) """ url = config['url'] if config.get('auth_required', False): auth = (config.get('username'), config.get('password')) self.auth_info = { 'username': config.get('username', ''), 'password': config.get('password', '') } else: auth = None try: response = requests.get(url, auth=auth, timeout=30) except requests.exceptions.ConnectionError as err: raise IngestApiError.apiConnectionError(exception=err, provider=provider) except requests.exceptions.RequestException as err: raise IngestApiError.apiURLError(exception=err, provider=provider) if response.ok: return response.content else: if response.status_code in (401, 403): raise IngestApiError.apiAuthError( Exception(response.reason), provider) elif response.status_code == 404: raise IngestApiError.apiNotFoundError( Exception(response.reason), provider) else: raise IngestApiError.apiGeneralError( Exception(response.reason), provider)
def _request(self, url):
    try:
        response = requests.get(url, params={}, timeout=120)
    except requests.exceptions.Timeout as ex:
        # Maybe set up for a retry, or continue in a retry loop
        raise IngestApiError.apiTimeoutError(ex, self.provider)
    except requests.exceptions.TooManyRedirects as ex:
        # Tell the user their URL was bad and try a different one
        raise IngestApiError.apiRedirectError(ex, self.provider)
    except requests.exceptions.RequestException as ex:
        # catastrophic error. bail.
        raise IngestApiError.apiRequestError(ex, self.provider)
    except Exception as error:
        traceback.print_exc()
        raise IngestApiError.apiGeneralError(error, self.provider)
    if response.status_code == 404:
        raise LookupError('Not found')
    return response.content
def get_url(self, url=None, **kwargs):
    """Do an HTTP GET on the URL and validate the response.

    :param string url: url to use (None to use self.HTTP_URL)
    :param **kwargs: extra parameters for requests
    :return dict: response content data
    """
    response = super().get_url(url=url, **kwargs)
    content = response.json()
    if content['hasError']:
        msg = "Error in GET: '{}'. ErrorCode: '{}'. Description: '{}'".format(
            url, content['data']['errorCode'], content['data']['description'])
        logger.error(msg)
        raise IngestApiError.apiGeneralError(Exception(msg), self.provider)
    return content['data']
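# Illustrative sketch of the response envelope get_url expects (a
# hypothetical payload shape, inferred from the checks above): the data is
# only usable when hasError is false.
content = {'hasError': False, 'data': [{'id': 1}, {'id': 2}]}
if content['hasError']:
    detail = content['data']  # here: {'errorCode': ..., 'description': ...}
    raise Exception('ErrorCode {errorCode}: {description}'.format(**detail))
items = content['data']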
def _fetch_data(self):
    url = self.config['url']
    api_key = self.config['api_key']
    last_update = self.provider.get('last_updated', utcfromtimestamp(0)).strftime('%Y-%m-%dT%H:%M:%S')

    # Results are paginated so we'll read this many at a time
    offset_jump = 10

    params = {'start': last_update, 'limit': offset_jump}
    headers = {'apikey': api_key}

    items = []
    offset = 0
    while True:
        params['offset'] = offset
        response = self.get_url(url, params=params, headers=headers)

        # The total number of results is given to us in the JSON payload;
        # read it with a regex so we don't have to parse the whole response
        # as JSON. Guard against a missing field instead of calling .group()
        # on a possibly-None match.
        match = re.search(r'"total": *([0-9]+)', response.text)
        if match is None:
            raise IngestApiError.apiGeneralError(Exception(response.text), self.provider)
        num_results = int(match.group(1))

        if num_results > 0:
            items.append(response.text)
        if offset >= num_results:
            return items
        offset += offset_jump
def _update(self, provider, update):
    def convert_date(epoch):
        dt = local_to_utc(config.DEFAULT_TIMEZONE, datetime.fromtimestamp(int(str(epoch)[:10])))
        return dt

    username = provider.get('config', {}).get('username')
    password = provider.get('config', {}).get('password')
    url = provider.get('config', {}).get('api_url')
    try:
        response = requests.get(url, auth=(username, password))
        response.raise_for_status()
    except Exception as ex:
        raise IngestApiError.apiGeneralError(ex, self.provider)
    data = json.loads(response.content.decode('UTF-8'))
    service = get_resource_service('traffic_incidents')
    incidents = []
    for feature in data.get('features', []):
        props = feature.get('properties', {})
        incident = {
            'guid': int(props.get('id')),
            'start_date': convert_date(props.get('startDate')),
            'end_date': convert_date(props.get('endDate')),
            'incident_type': props.get('type'),
            'incident_description': props.get('description'),
            'city': props.get('city'),
            'state': props.get('state'),
            'from_street_name': props.get('fromStreetName'),
            'from_cross_street_name': props.get('fromCrossStreetName'),
            'to_street_name': props.get('toStreetName'),
            'to_cross_street_name': props.get('toCrossStreetName'),
            'geometry': feature.get('geometry')
        }
        incident.get('geometry').pop('crs', None)  # drop the CRS if present
        incidents.append(incident)
    service.delete(lookup={})
    service.post(incidents)
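# Illustrative sketch: convert_date above truncates a 13-digit millisecond
# epoch to 10 digits (seconds) before building a datetime. Dividing by 1000
# gives the same second, shown here with timezone-aware UTC datetimes.
from datetime import datetime, timezone

epoch_ms = 1700000000000  # hypothetical startDate value
by_truncation = datetime.fromtimestamp(int(str(epoch_ms)[:10]), tz=timezone.utc)
by_division = datetime.fromtimestamp(epoch_ms / 1000, tz=timezone.utc)
assert by_truncation == by_division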
class AAPSportsHTTPFeedingService(HTTPFeedingService):
    label = 'AAP Sports Results Feed'
    NAME = 'aap_sports_http'
    ERRORS = [
        IngestApiError.apiTimeoutError().get_error_description(),
        IngestApiError.apiRedirectError().get_error_description(),
        IngestApiError.apiRequestError().get_error_description(),
        IngestApiError.apiUnicodeError().get_error_description(),
        IngestApiError.apiParseError().get_error_description(),
        IngestApiError.apiGeneralError().get_error_description(),
    ]

    """Defines the collection service to be used with this ingest feeding service."""
    service = 'events'

    fields = [
        {
            'id': 'login_url', 'type': 'text', 'label': 'Login Url',
            'placeholder': 'Login Url', 'required': True,
            'errors': {4006: 'Server not found.', 4000: 'Unexpected server response'}
        },
        {
            'id': 'fixtures_url', 'type': 'text', 'label': 'Fixtures Url',
            'placeholder': 'Fixtures Url', 'required': True
        },
        {
            'id': 'username', 'type': 'text', 'label': 'Username',
            'placeholder': 'Username', 'required': True
        },
        {
            'id': 'password', 'type': 'password', 'label': 'Password',
            'placeholder': 'Password', 'required': True,
            'errors': {4007: 'Authentication error.'}
        },
        {
            'id': 'sports', 'type': 'text', 'label': 'Sports',
            'placeholder': 'Comma separated list of sports ids',
            'required': True, 'default': '1,2,3,4,10'
        },
    ]

    def _update(self, provider, update):
        self.provider = provider
        parser = self.get_feed_parser(provider)
        # get the current year, it is used to filter fixtures for this year and the next
        year = int(utcnow().year) % 100
        config = provider.get('config', {})
        content = self._request(
            config.get('login_url').format(config.get('username'), config.get('password')))
        # get the configured sports
        configured_sports = config.get('sports').split(',')
        xml = ET.fromstring(content)
        if xml.attrib['Status_Code'] == 'OK':
            session = xml.attrib['Status_Session']
            content = self._request(config.get('fixtures_url').format(session, '', '', ''))
            xml = ET.fromstring(content)
            for s in xml.findall('.//Sports/Sport'):
                sport_id = s.attrib['SportID']
                if sport_id not in configured_sports:
                    continue
                sport_name = s.attrib['SportName']
                content = self._request(
                    config.get('fixtures_url').format(session, sport_id, '', ''))
                sport_xml = ET.fromstring(content)
                for c in sport_xml.findall('.//Competition'):
                    comp_id = c.attrib.get('Comp_ID')
                    comp_name = c.attrib.get('Comp_Name')
                    content = self._request(
                        config.get('fixtures_url').format(session, sport_id, comp_id, ''))
                    comp_xml = ET.fromstring(content)
                    for season in comp_xml.findall('.//Season'):
                        season_id = season.attrib.get('SeasonID')
                        if str(year) in season_id or str(year + 1) in season_id:
                            content = self._request(
                                config.get('fixtures_url').format(session, sport_id, comp_id, season_id))
                            fixture_xml = ET.fromstring(content)
                            logger.info('Parsing {}/{} {}/{}'.format(
                                sport_id, sport_name, comp_id, comp_name))
                            items = parser.parse({
                                'fixture_xml': fixture_xml,
                                'sport_id': sport_id,
                                'sport_name': sport_name,
                                'comp_name': comp_name,
                                'comp_id': comp_id
                            }, provider)
                            if len(items) > 0:
                                yield items

    def _request(self, url):
        try:
            response = requests.get(url, params={}, timeout=120)
        except requests.exceptions.Timeout as ex:
            # Maybe set up for a retry, or continue in a retry loop
            raise IngestApiError.apiTimeoutError(ex, self.provider)
        except requests.exceptions.TooManyRedirects as ex:
            # Tell the user their URL was bad and try a different one
            raise IngestApiError.apiRedirectError(ex, self.provider)
        except requests.exceptions.RequestException as ex:
            # catastrophic error. bail.
            raise IngestApiError.apiRequestError(ex, self.provider)
        except Exception as error:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(error, self.provider)
        if response.status_code == 404:
            raise LookupError('Not found')
        return response.content
from superdesk.errors import IngestApiError, ParserError
from superdesk.io import register_provider
from superdesk.io.ingest_service import IngestService
from superdesk.utils import merge_dicts
from urllib.parse import quote as urlquote, urlsplit, urlunsplit

PROVIDER = "rss"

utcfromtimestamp = datetime.utcfromtimestamp

errors = [
    IngestApiError.apiAuthError().get_error_description(),
    IngestApiError.apiNotFoundError().get_error_description(),
    IngestApiError.apiGeneralError().get_error_description(),
    ParserError.parseMessageError().get_error_description(),
]


class RssIngestService(IngestService):
    """Ingest service for providing feeds received in RSS 2.0 format.

    (NOTE: it should also work with other syndicated feed formats, too,
    since the underlying parser supports them, but for our needs RSS 2.0
    is assumed)
    """

    ItemField = namedtuple("ItemField", ["name", "name_in_data", "type"])

    item_fields = [
        ItemField("guid", "guid", str),
class HTTPFeedingServiceBase(FeedingService):
    """
    Base class for feeding services using HTTP.

    This class contains helpers to make the creation of HTTP based feeding
    services easier.

    There are a couple of class attributes you can use:

    =======================  ===========
    Attribute                Explanation
    =======================  ===========
    HTTP_URL                 Main URL of your service, will be used by default in get_url
    HTTP_TIMEOUT             Timeout of requests in seconds
    HTTP_DEFAULT_PARAMETERS  Parameters used in every ``get`` request.
                             Will be updated with params set in arguments
    HTTP_AUTH                Indicate if HTTP authentication is needed for your
                             service. If None, the authentication will be
                             determined by the existence of user and password.
                             Will be overridden by the auth_required config if
                             it exists.
    =======================  ===========

    In addition, you have some pre-filled fields:

    ===============  ===========
    Field            Explanation
    ===============  ===========
    AUTH_FIELDS      username and password fields
    AUTH_REQ_FIELDS  username and password fields + auth_required field to
                     indicate if they are needed
    ===============  ===========

    When ingest is updated, the provider is automatically saved to
    ``self.provider``.

    The ``config`` property gives easy access to the user configuration.

    The ``auth_info`` property returns a dictionary with ``username`` and
    ``password``.

    The ``get_url`` method does an HTTP GET request. The url can be omitted,
    in which case HTTP_URL will be used. Authentication parameters are set
    automatically, and errors are caught appropriately. Extra arguments are
    passed directly to the *requests* call.
    """

    ERRORS = [
        IngestApiError.apiTimeoutError().get_error_description(),
        IngestApiError.apiRequestError().get_error_description(),
        IngestApiError.apiGeneralError().get_error_description(),
        SuperdeskIngestError.notConfiguredError().get_error_description(),
    ]

    # override this parameter with the main URL to use
    HTTP_URL = None
    # timeout in seconds
    HTTP_TIMEOUT = 30
    # if some parameters are used in every request, put them here
    HTTP_DEFAULT_PARAMETERS = None
    # Set to True if authentication is mandatory, False if there is no
    # authentication, and None to add authentication if user and password
    # are defined.
    # If auth_required is defined in config fields, it will override this value.
    HTTP_AUTH = True

    # use this when auth is always required
    AUTH_FIELDS = [{
        'id': 'username',
        'type': 'text',
        'label': 'Username',
        'placeholder': 'Username',
        'required': True
    }, {
        'id': 'password',
        'type': 'password',
        'label': 'Password',
        'placeholder': 'Password',
        'required': True
    }]

    # use this when auth depends on an "auth_required" flag (set by user)
    AUTH_REQ_FIELDS = [{
        'id': 'auth_required',
        'type': 'boolean',
        'label': 'Requires Authentication',
        'placeholder': 'Requires Authentication',
        'required': False
    }, {
        'id': 'username',
        'type': 'text',
        'label': 'Username',
        'placeholder': 'Username',
        'required_expression': '{auth_required}',
        'show_expression': '{auth_required}'
    }, {
        'id': 'password',
        'type': 'password',
        'label': 'Password',
        'placeholder': 'Password',
        'required_expression': '{auth_required}',
        'show_expression': '{auth_required}'
    }]

    def __init__(self):
        self.token = None

    @property
    def auth_info(self):
        """Helper method to retrieve a dict with username and password when set."""
        username = self.config.get('username', '')
        password = self.config.get('password', '')
        if not username or not password:
            return None
        return {'username': username, 'password': password}

    @property
    def config(self):
        return self.provider.setdefault('config', {})

    def validate_config(self):
        """
        Validate provider config according to `cls.fields`.

        :param config: Ingest provider configuration
        :type config: dict
        :return:
        """
        # validate required config fields
        required_keys = [field['id'] for field in self.fields if field.get('required', False)]
        if not set(self.config.keys()).issuperset(required_keys):
            raise SuperdeskIngestError.notConfiguredError(
                Exception('{} are required.'.format(', '.join(required_keys))))

        # guard against a missing 'url' key before stripping
        url = (self.config.get('url') or '').strip()
        if not url:
            # next() needs an iterator, not a set; use a generator expression
            try:
                url_field = next(f for f in self.fields if f['id'] == 'url')
            except StopIteration:
                url_required = False
            else:
                url_required = url_field.get('required', False)
            if url_required:
                raise SuperdeskIngestError.notConfiguredError(
                    Exception('URL is a required field.'))
        else:
            # validate url
            if not url.startswith('http'):
                raise SuperdeskIngestError.notConfiguredError(
                    Exception('URL must be a valid HTTP link.'))

    def get_url(self, url=None, **kwargs):
        """Do an HTTP GET on URL.

        :param string url: url to use (None to use self.HTTP_URL)
        :param **kwargs: extra parameters for requests
        :return requests.Response: response
        """
        if not url:
            url = self.HTTP_URL
        config = self.config
        user = config.get('username')
        password = config.get('password')
        if user:
            user = user.strip()
        if password:
            password = password.strip()

        auth_required = config.get('auth_required', self.HTTP_AUTH)
        if auth_required is None:
            # auth_required may not be used by the feeding service; in this
            # case we use authentication only if user and password are set.
            auth_required = bool(user and password)

        if auth_required:
            if not user:
                raise SuperdeskIngestError.notConfiguredError("user is not configured")
            if not password:
                raise SuperdeskIngestError.notConfiguredError("password is not configured")
            kwargs.setdefault('auth', (user, password))

        params = kwargs.pop("params", {})
        if params or self.HTTP_DEFAULT_PARAMETERS:
            # if we have default parameters, we want them to be overridden
            # by conflicting params given in arguments, so the defaults go
            # in first and the arguments are applied on top
            if self.HTTP_DEFAULT_PARAMETERS:
                merged = dict(self.HTTP_DEFAULT_PARAMETERS)
                merged.update(params)
                params = merged
            kwargs["params"] = params

        try:
            response = requests.get(url, timeout=self.HTTP_TIMEOUT, **kwargs)
        except requests.exceptions.Timeout as exception:
            raise IngestApiError.apiTimeoutError(exception, self.provider)
        except requests.exceptions.ConnectionError as exception:
            raise IngestApiError.apiConnectionError(exception, self.provider)
        except requests.exceptions.RequestException as exception:
            raise IngestApiError.apiRequestError(exception, self.provider)
        except Exception as exception:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(exception, self.provider)

        if not response.ok:
            exception = Exception(response.reason)
            if response.status_code in (401, 403):
                raise IngestApiError.apiAuthError(exception, self.provider)
            elif response.status_code == 404:
                raise IngestApiError.apiNotFoundError(exception, self.provider)
            else:
                raise IngestApiError.apiGeneralError(exception, self.provider)

        return response

    def update(self, provider, update):
        self.provider = provider
        self.validate_config()
        return super().update(provider, update)
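# Illustrative sketch (a hypothetical service, not part of Superdesk): the
# minimal shape of an HTTPFeedingServiceBase subclass, leaning on get_url()
# for timeouts, auth and error mapping.
class ExampleHTTPFeedingService(HTTPFeedingServiceBase):
    NAME = 'example_http'
    label = 'Example HTTP feed'
    HTTP_URL = 'https://example.com/feed'  # hypothetical endpoint
    HTTP_AUTH = False  # no credentials needed for this feed
    fields = [{'id': 'url', 'type': 'text', 'label': 'Feed URL',
               'placeholder': 'Feed URL', 'required': True}]

    def _update(self, provider, update):
        response = self.get_url(self.config['url'])
        parser = self.get_feed_parser(provider)
        items = parser.parse(response.content)
        yield items if isinstance(items, list) else [items]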
class ReutersHTTPFeedingService(HTTPFeedingService):
    """
    Feeding Service class which can read article(s) using HTTP provided by Reuters.
    """

    NAME = 'reuters_http'
    ERRORS = [
        IngestApiError.apiTimeoutError().get_error_description(),
        IngestApiError.apiRedirectError().get_error_description(),
        IngestApiError.apiRequestError().get_error_description(),
        IngestApiError.apiUnicodeError().get_error_description(),
        IngestApiError.apiParseError().get_error_description(),
        IngestApiError.apiGeneralError().get_error_description(),
    ]

    DATE_FORMAT = '%Y.%m.%d.%H.%M'

    def _update(self, provider):
        updated = utcnow()
        last_updated = provider.get('last_updated')
        ttl_minutes = app.config['INGEST_EXPIRY_MINUTES']
        if not last_updated or last_updated < updated - datetime.timedelta(minutes=ttl_minutes):
            last_updated = updated - datetime.timedelta(minutes=ttl_minutes)

        self.provider = provider
        provider_config = provider.get('config')
        if not provider_config:
            provider_config = {}
            provider['config'] = provider_config

        if 'url' not in provider_config:
            provider_config['url'] = 'http://rmb.reuters.com/rmd/rest/xml'

        if 'auth_url' not in provider_config:
            provider_config['auth_url'] = 'https://commerce.reuters.com/rmd/rest/xml/login'

        self.URL = provider_config.get('url')

        for channel in self._get_channels():
            for guid in self._get_article_ids(channel, last_updated, updated):
                items = self.fetch_ingest(guid)
                if items:
                    yield items

    def _get_channels(self):
        """Get subscribed channels."""
        channels = []
        tree = self._get_tree('channels')
        for channel in tree.findall('channelInformation'):
            channels.append(channel.find('alias').text)
        return channels

    def _get_tree(self, endpoint, payload=None):
        """Get xml response for given API endpoint and payload.

        :param endpoint: API endpoint
        :type endpoint: str
        :param payload: query parameters
        :type payload: dict
        """
        if payload is None:
            payload = {}
        payload['token'] = self._get_auth_token(self.provider, update=True)
        url = self._get_absolute_url(endpoint)
        try:
            response = requests.get(url, params=payload, timeout=15)
        except requests.exceptions.Timeout as ex:
            # Maybe set up for a retry, or continue in a retry loop
            raise IngestApiError.apiTimeoutError(ex, self.provider)
        except requests.exceptions.TooManyRedirects as ex:
            # Tell the user their URL was bad and try a different one
            raise IngestApiError.apiRedirectError(ex, self.provider)
        except requests.exceptions.RequestException as ex:
            # catastrophic error. bail.
            raise IngestApiError.apiRequestError(ex, self.provider)
        except Exception as error:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(error, self.provider)

        if response.status_code == 404:
            raise LookupError('Not found %s' % payload)

        try:
            return etree.fromstring(response.content)  # workaround for http mock lib
        except UnicodeEncodeError as error:
            traceback.print_exc()
            raise IngestApiError.apiUnicodeError(error, self.provider)
        except ParseError as error:
            traceback.print_exc()
            raise IngestApiError.apiParseError(error, self.provider)
        except Exception as error:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(error, self.provider)

    def _get_absolute_url(self, endpoint):
        """Get absolute URL for given endpoint.

        :param endpoint: API endpoint
        :type endpoint: str
        """
        return '/'.join([self.URL, endpoint])

    def _get_article_ids(self, channel, last_updated, updated):
        """Get article ids which should be upserted."""
        ids = set()
        payload = {'channel': channel, 'fieldsRef': 'id',
                   'dateRange': "%s-%s" % (self._format_date(last_updated),
                                           self._format_date(updated))}
        logger.info('Reuters requesting Date Range |{}| for channel {}'.format(
            payload['dateRange'], channel))
        tree = self._get_tree('items', payload)
        for result in tree.findall('result'):
            ids.add(result.find('guid').text)
        return ids

    def _format_date(self, date):
        return date.strftime(self.DATE_FORMAT)

    def fetch_ingest(self, guid):
        items = self._parse_items(guid)
        result_items = []
        while items:
            item = items.pop()
            self.add_timestamps(item)
            try:
                items.extend(self._fetch_items_in_package(item))
                result_items.append(item)
            except LookupError as err:
                self.log_item_error(err, item, self.provider)
                return []
        return result_items

    def _parse_items(self, guid):
        """Parse item message and return given items."""
        payload = {'id': guid}
        tree = self._get_tree('item', payload)
        parser = self.get_feed_parser(self.provider, tree)
        items = parser.parse(tree, self.provider)
        return items

    def _fetch_items_in_package(self, item):
        """Fetch remote assets for given item."""
        items = []
        for group in item.get('groups', []):
            for ref in group.get('refs', []):
                if 'residRef' in ref:
                    items.extend(self._parse_items(ref.get('residRef')))
        return items
class HTTPFeedingService(FeedingService, metaclass=ABCMeta):
    """
    Feeding Service class which can read article(s) using HTTP.
    """

    ERRORS = [
        IngestApiError.apiTimeoutError().get_error_description(),
        IngestApiError.apiRedirectError().get_error_description(),
        IngestApiError.apiRequestError().get_error_description(),
        IngestApiError.apiUnicodeError().get_error_description(),
        IngestApiError.apiParseError().get_error_description(),
        IngestApiError.apiGeneralError().get_error_description(),
    ]

    label = 'HTTP'

    def __init__(self):
        super().__init__()
        self.token = None

    def _generate_token_and_update_provider(self, provider):
        """Generate an authentication token and update the given provider with it.

        :param provider: Ingest provider details to which the current directory has been configured
        :type provider: dict :py:class:`superdesk.io.ingest_provider_model.IngestProviderResource`
        :return: Authentication Token
        :rtype: str
        """
        token = {'auth_token': self._generate_auth_token(provider), 'created': utcnow()}
        get_resource_service('ingest_providers').system_update(
            provider[config.ID_FIELD], updates={'tokens': token}, original=provider)
        provider['tokens'] = token
        return token['auth_token']

    def _generate_auth_token(self, provider):
        """Generate an authentication token as per the configuration in the ingest provider.

        :param provider: Ingest provider details to which the current directory has been configured
        :type provider: dict :py:class:`superdesk.io.ingest_provider_model.IngestProviderResource`
        :return: token details if successfully authenticated
        :rtype: str
        :raises IngestApiError.apiGeneralError: if auth_url is missing in the ingest provider configuration
        """
        session = requests.Session()
        session.mount('https://', SSLAdapter())

        auth_url = provider.get('config', {}).get('auth_url', None)
        if not auth_url:
            raise IngestApiError.apiGeneralError(
                provider=provider,
                exception=KeyError(
                    'Ingest Provider {} is missing Authentication URL. '
                    'Please check the configuration.'.format(provider['name'])))

        payload = {
            'username': provider.get('config', {}).get('username', ''),
            'password': provider.get('config', {}).get('password', ''),
        }
        response = session.get(auth_url, params=payload, verify=False, timeout=30)
        if response.status_code < 200 or response.status_code >= 300:
            try:
                response.raise_for_status()
            except Exception:
                err = IngestApiError.apiAuthError(provider=provider)
                self.close_provider(provider, err, force=True)
                raise err

        tree = etree.fromstring(response.content)  # workaround for http mock lib
        return tree.text

    def _is_valid_token(self, token):
        """Check if the given token is still valid.

        Most authentication tokens issued by ingest providers are valid for
        12 hours.

        :param token: Token information
        :type token: dict
        :return: True if valid, False otherwise
        :rtype: bool
        """
        ttl = timedelta(hours=12)
        created = arrow.get(token.get('created')).datetime
        return created + ttl >= utcnow() and token.get('auth_token')

    def _get_auth_token(self, provider, update=False):
        """Get an authentication token for the given provider instance and save it
        in the db based on the given update flag.

        :param provider: Ingest provider details to which the current directory has been configured
        :type provider: dict :py:class:`superdesk.io.ingest_provider_model.IngestProviderResource`
        :param update: a flag which dictates whether to save the authentication token
            in the ingest provider record. Saves if the value is True, defaults to False.
        :type update: bool
        :return: Authentication Token
        :rtype: str
        """
        token = provider.get('tokens')
        if token and self._is_valid_token(token):
            return token.get('auth_token')
        return self._generate_token_and_update_provider(provider) if update else ''
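# Illustrative sketch: the 12-hour validity rule from _is_valid_token,
# reduced to plain datetimes (the real method goes through arrow and utcnow).
from datetime import datetime, timedelta

token = {'auth_token': 'abc123', 'created': datetime(2024, 1, 1, 8, 0)}
now = datetime(2024, 1, 1, 19, 0)  # 11 hours after token creation
is_valid = bool(token['auth_token']) and token['created'] + timedelta(hours=12) >= now
assert is_valid  # the same token checked after 20:00 would be rejected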
class RSSFeedingService(HTTPFeedingServiceBase): """ Feeding service for providing feeds received in RSS 2.0 format. (NOTE: it should also work with other syndicated feeds formats, too, since the underlying parser supports them, but for our needs RSS 2.0 is assumed) """ NAME = "rss" ERRORS = [ IngestApiError.apiAuthError().get_error_description(), IngestApiError.apiNotFoundError().get_error_description(), IngestApiError.apiGeneralError().get_error_description(), ParserError.parseMessageError().get_error_description(), ] label = "RSS/Atom" fields = ([{ "id": "url", "type": "text", "label": "Host", "placeholder": "RSS Feed URL", "required": True, "errors": { 4001: "Connection timed out.", 4006: "URL not found.", 4009: "Can't connect to host.", 1001: "Can't parse the RSS.", }, }] + HTTPFeedingServiceBase.AUTH_REQ_FIELDS + [{ "id": "field_aliases", "type": "mapping", "label": "Content Field Aliases", "add_mapping_label": "Add alias", "remove_mapping_label": "Remove", "empty_label": "No field aliases defined.", "first_field_options": { "label": "Content Field Name", "values": [ "body_text", "guid", "published_parsed", "summary", "title", "updated_parsed" ], }, "second_field_options": { "label": "Field Alias", "placeholder": "Enter field alias" }, }]) HTTP_AUTH = None field_groups = { "auth_data": { "label": "Authentication Info", "fields": ["username", "password"] } } ItemField = namedtuple("ItemField", ["name", "name_in_data", "type"]) item_fields = [ ItemField("guid", "guid", str), ItemField("uri", "guid", str), ItemField("firstcreated", "published_parsed", datetime), ItemField("versioncreated", "updated_parsed", datetime), ItemField("headline", "title", str), ItemField("abstract", "summary", str), ItemField("body_html", "body_text", str), ItemField("byline", "author", str), ] """A list of fields that items created from the ingest data should contain. Each list item is a named tuple with the following three attribues: * name - the name of the field (attribute) in the resulting ingest item * name_in_data - the expected name of the data field in the retrieved ingest data (this can be overriden by providing a field name alias) * type - field's data type """ IMG_MIME_TYPES = ( "image/gif", "image/jpeg", "image/png", "image/tiff", ) """ Supported MIME types for ingesting external images referenced by the RSS entries. """ IMG_FILE_SUFFIXES = (".gif", ".jpeg", ".jpg", ".png", ".tif", ".tiff") """ Supported image filename extensions for ingesting (used for the <media:thumbnail> tags - they lack the "type" attribute). """ def prepare_href(self, url, mimetype=None): """Prepare a link to an external resource (e.g. an image file). It can be directly used by the ingest machinery for fetching it. If provider requires authentication, basic HTTP authentication info is added to the given url, otherwise it is returned unmodified. 
        :param str url: the original URL as extracted from an RSS entry
        :return: prepared URL
        :rtype: str
        """
        if self.auth_info:
            userinfo_part = "{}:{}@".format(
                urlquote(self.auth_info["username"]),
                urlquote(self.auth_info["password"]))
            scheme, netloc, path, query, fragment = urlsplit(url)
            netloc = userinfo_part + netloc
            url = urlunsplit((scheme, netloc, path, query, fragment))
        return url

    def _test(self, provider):
        """Test connection."""
        self.provider = provider
        xml = self._fetch_data()
        data = feedparser.parse(xml)
        if data.bozo:
            raise ParserError.parseMessageError(data.bozo_exception, provider)

    def _update(self, provider, update):
        """Check the data provider for updates and return new items (if any).

        :param provider: data provider instance
        :return: a list containing a list of new content items
        :rtype: list
        :raises IngestApiError: if a data retrieval error occurs
        :raises ParserError: if the retrieved RSS data cannot be parsed
        """
        xml_data = self._fetch_data()

        try:
            data = feedparser.parse(xml_data)
        except Exception as ex:
            raise ParserError.parseMessageError(ex, provider, data=xml_data)

        # If the provider's last updated time is not available, set it to
        # 1970-01-01 so that it is recognized as "not up to date". Also
        # convert it to a naive datetime object (dropping tzinfo is fine,
        # because it is in UTC anyway).
        t_provider_updated = provider.get(LAST_ITEM_UPDATE, utcfromtimestamp(0))
        t_provider_updated = t_provider_updated.replace(tzinfo=None)

        new_items = []
        field_aliases = self.config.get("field_aliases")

        for entry in data.entries:
            try:
                t_entry_updated = utcfromtimestamp(timegm(entry.updated_parsed))
                if t_entry_updated <= t_provider_updated:
                    continue
            except (AttributeError, TypeError):
                # missing updated info, so better ingest it
                pass

            item = self._create_item(entry, field_aliases, provider.get("source", None))
            self.localize_timestamps(item)

            # If the RSS entry references any images, create picture items
            # from them and create a package referencing them and the entry
            # itself. If there are no image references, treat the entry as a
            # simple text item, even if it might reference other media types,
            # e.g. videos.
            image_urls = self._extract_image_links(entry)
            if image_urls:
                image_items = self._create_image_items(image_urls, item)
                new_items.extend(image_items)
                new_items.append(item)
                item = self._create_package(item, image_items)

            new_items.append(item)

        return [new_items]

    def _fetch_data(self):
        """Fetch the latest feed data.

        :return: fetched RSS data
        :rtype: str
        :raises IngestApiError: if fetching data fails for any reason
            (e.g. authentication error, resource not found, etc.)
        """
        url = self.config["url"]
        response = self.get_url(url)
        return response.content

    def _extract_image_links(self, rss_entry):
        """Extract URLs of all images referenced by the given RSS entry.

        Images can be referenced via `<enclosure>`, `<media:thumbnail>` or
        `<media:content>` RSS tags and must be listed among the allowed image
        types. All other links to external media are ignored.

        Duplicate URLs are omitted from the result.

        :param rss_entry: parsed RSS item (entry)
        :type rss_entry: :py:class:`feedparser.FeedParserDict`
        :return: a list of all unique image URLs found (as strings)
        """
        img_links = set()

        for link in getattr(rss_entry, "links", []):
            if link.get("type") in self.IMG_MIME_TYPES:
                img_links.add(link["href"])

        for item in getattr(rss_entry, "media_thumbnail", []):
            url = item.get("url", "")
            if url.endswith(self.IMG_FILE_SUFFIXES):
                img_links.add(url)

        for item in getattr(rss_entry, "media_content", []):
            if item.get("type") in self.IMG_MIME_TYPES:
                img_links.add(item["url"])

        return list(img_links)

    def _create_item(self, data, field_aliases=None, source="source"):
        """Create a new content item from RSS feed data.

        :param dict data: parsed data of a single feed entry
        :param field_aliases: (optional) field name aliases. Used for content
            fields that are named differently in the retrieved data.
        :type field_aliases: list of {field_name: alias} dictionaries or None
        :param str source: the source of the provider
        :return: created content item
        :rtype: dict
        """
        if field_aliases is None:
            field_aliases = {}
        else:
            field_aliases = merge_dicts(field_aliases)
        aliased_fields = set(field_aliases.values())

        item = dict(type=CONTENT_TYPE.TEXT)

        # Only consider fields that are not used as an alias (i.e. used to
        # populate another field) - unless those fields have their own
        # aliases, too.
        # The idea is that if e.g. the main text field is aliased to use the
        # parsed data's summary field, that summary should not be used to
        # populate the field it was originally meant for.
        fields_to_consider = (
            f for f in self.item_fields
            if (f.name_in_data not in aliased_fields) or
               (f.name_in_data in aliased_fields and
                f.name_in_data in field_aliases)
        )

        utc_now = datetime.utcnow()
        for field in fields_to_consider:
            data_field_name = field_aliases.get(field.name_in_data, field.name_in_data)
            field_value = data.get(data_field_name)

            if (field.type is datetime) and field_value:
                field_value = utcfromtimestamp(timegm(field_value))
                field_value = utc_now if field_value > utc_now else field_value

            item[field.name] = field_value

            # Some feeds use the <content:encoded> tag for storing the main
            # content, and that tag is parsed differently. If body_html has
            # not been found in its default data field and is not aliased,
            # try to populate it using the aforementioned content field as a
            # fallback.
            if (field.name == "body_html" and not field_value and
                    field.name_in_data not in field_aliases):
                try:
                    item["body_html"] = data.content[0].value
                except Exception:
                    pass  # content either non-existent or parsed differently

        if not data.get("guidislink") and data.get("link"):
            item["uri"] = data["link"]
            scheme, netloc, path, query, fragment = urlsplit(item["uri"])
            if data.get("guid"):
                item["guid"] = generate_tag(domain=netloc, id=data.get("guid"))
            else:
                item["guid"] = generate_tag_from_url(data["link"])

        if item.get("uri", None):
            if not item.get("body_html", None):
                item["body_html"] = ""
            item["body_html"] = '<p><a href="%s" target="_blank">%s</a></p>' % (
                item["uri"], source) + item["body_html"]

        item["dateline"] = {
            "source": source,
            "date": item.get("firstcreated", item.get("versioncreated")),
        }

        if not item.get("versioncreated") and item.get("firstcreated"):
            item["versioncreated"] = item["firstcreated"]

        return item

    def _create_image_items(self, image_links, text_item):
        """Create a list of picture items that represent the external images
        located at the given URLs.

        Each created item's `firstcreated` and `versioncreated` fields are
        set to the same values as these fields in `text_item`.

        :param iterable image_links: list of image URLs
        :param dict text_item: the "main" text item the images are related to
        :return: list of created image items (as dicts)
        """
        image_items = []

        for image_url in image_links:
            img_item = {
                "guid": generate_tag_from_url(image_url),
                ITEM_TYPE: CONTENT_TYPE.PICTURE,
                "firstcreated": text_item.get("firstcreated"),
                "versioncreated": text_item.get("versioncreated"),
                "renditions": {
                    "baseImage": {
                        "href": image_url,
                    }
                },
            }
            image_items.append(img_item)

        return image_items

    def _create_package(self, text_item, image_items):
        """Create a new content package from the given content items.

        The package's `main` group contains only the references to the given
        items, not the items themselves. In the list of references, the
        reference to the text item precedes the references to the image
        items.

        The package's `firstcreated` and `versioncreated` fields are set to
        the values of these fields in `text_item`, and the `headline` is
        copied as well.

        :param dict text_item: item representing the text content
        :param list image_items: list of items (dicts) representing the
            images related to the text content
        :return: the created content package
        :rtype: dict
        """
        package = {
            ITEM_TYPE: CONTENT_TYPE.COMPOSITE,
            "guid": "{}:pkg".format(text_item["guid"]),
            "firstcreated": text_item["firstcreated"],
            "versioncreated": text_item["versioncreated"],
            "headline": text_item.get("headline", ""),
            "groups": [
                {
                    "id": "root",
                    "role": "grpRole:NEP",
                    "refs": [{"idRef": "main"}],
                },
                {
                    "id": "main",
                    "role": "main",
                    "refs": [],
                },
            ],
        }

        item_references = package["groups"][1]["refs"]
        item_references.append({"residRef": text_item["guid"]})

        for image in image_items:
            item_references.append({"residRef": image["guid"]})

        return package
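
# A minimal, self-contained sketch (not part of the service code above) of
# the reference layout that _create_package builds: the "root" group points
# at the "main" group, and "main" lists the text item's guid before the
# image guids. The guids below are invented placeholders.

def _package_refs_sketch():
    text_guid = "tag:example.com,2018:item-1"  # hypothetical guid
    image_guids = ["tag:example.com,2018:img-1",
                   "tag:example.com,2018:img-2"]  # hypothetical guids
    groups = [
        {"id": "root", "role": "grpRole:NEP", "refs": [{"idRef": "main"}]},
        {"id": "main", "role": "main", "refs": []},
    ]
    groups[1]["refs"].append({"residRef": text_guid})
    for guid in image_guids:
        groups[1]["refs"].append({"residRef": guid})
    return groups
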
class RSSFeedingService(FeedingService):
    """
    Feeding service for providing feeds received in RSS 2.0 format.

    (NOTE: it should also work with other syndicated feed formats, too,
    since the underlying parser supports them, but for our needs RSS 2.0
    is assumed)
    """

    NAME = 'rss'

    ERRORS = [IngestApiError.apiAuthError().get_error_description(),
              IngestApiError.apiNotFoundError().get_error_description(),
              IngestApiError.apiGeneralError().get_error_description(),
              ParserError.parseMessageError().get_error_description()]

    ItemField = namedtuple('ItemField', ['name', 'name_in_data', 'type'])

    item_fields = [
        ItemField('guid', 'guid', str),
        ItemField('uri', 'guid', str),
        ItemField('firstcreated', 'published_parsed', datetime),
        ItemField('versioncreated', 'updated_parsed', datetime),
        ItemField('headline', 'title', str),
        ItemField('abstract', 'summary', str),
        ItemField('body_html', 'body_text', str),
        ItemField('timescalled', 'timescalled', int),
        ItemField('test', 'test', str),
        ItemField('testing', 'testing', str),
        ItemField('mobilecircle', 'mobilecircle', str),
        ItemField('audiofile', 'audiofile', str),
        ItemField('timesrecorded', 'timesrecorded', int),
        ItemField('timespublished', 'timespublished', int),
    ]
    """A list of fields that items created from the ingest data should
    contain.

    Each list item is a named tuple with the following three attributes:

    * name - the name of the field (attribute) in the resulting ingest item
    * name_in_data - the expected name of the data field in the retrieved
        ingest data (this can be overridden by providing a field name alias)
    * type - the field's data type
    """

    IMG_MIME_TYPES = (
        'image/gif',
        'image/jpeg',
        'image/png',
        'image/tiff',
    )
    """
    Supported MIME types for ingesting external images referenced by the
    RSS entries.
    """

    IMG_FILE_SUFFIXES = ('.gif', '.jpeg', '.jpg', '.png', '.tif', '.tiff')
    """
    Supported image filename extensions for ingesting (used for the
    <media:thumbnail> tags - they lack the "type" attribute).
    """

    def __init__(self):
        super().__init__()
        self.auth_info = None

    def prepare_href(self, url, mimetype=None):
        """
        Prepare a link to an external resource (e.g. an image file) so that
        it can be used directly by the ingest machinery for fetching it.

        If the provider requires authentication, basic HTTP authentication
        info is added to the given URL, otherwise it is returned unmodified.

        :param str url: the original URL as extracted from an RSS entry
        :return: prepared URL
        :rtype: str
        """
        if self.auth_info:
            userinfo_part = '{}:{}@'.format(
                urlquote(self.auth_info['username']),
                urlquote(self.auth_info['password'])
            )
            scheme, netloc, path, query, fragment = urlsplit(url)
            netloc = userinfo_part + netloc
            url = urlunsplit((scheme, netloc, path, query, fragment))
        return url

    def _update(self, provider):
        """Check the data provider for updates and return new items (if any).

        :param provider: data provider instance
        :return: a list containing a list of new content items
        :rtype: list
        :raises IngestApiError: if a data retrieval error occurs
        :raises ParserError: if the retrieved RSS data cannot be parsed
        """
        config = provider.get('config', {})

        if config.get('auth_required'):
            self.auth_info = {
                'username': config.get('username', ''),
                'password': config.get('password', '')
            }

        try:
            xml_data = self._fetch_data(config, provider)
            data = feedparser.parse(xml_data)
        except IngestApiError:
            raise
        except Exception as ex:
            raise ParserError.parseMessageError(ex, provider)

        # If the provider's last updated time is not available, set it to
        # 1970-01-01 so that it is recognized as "not up to date".
        # Also convert it to a naive datetime object (dropping tzinfo is
        # fine, because it is in UTC anyway).
        t_provider_updated = provider.get('last_updated', utcfromtimestamp(0))
        t_provider_updated = t_provider_updated.replace(tzinfo=None)

        new_items = []
        field_aliases = config.get('field_aliases')

        for entry in data.entries:
            t_entry_updated = utcfromtimestamp(timegm(entry.updated_parsed))

            if t_entry_updated <= t_provider_updated:
                continue

            item = self._create_item(entry, field_aliases, provider.get('source', None))
            self.add_timestamps(item)

            # If the RSS entry references any images, create picture items
            # from them and create a package referencing them and the entry
            # itself. If there are no image references, treat the entry as a
            # simple text item, even if it might reference other media
            # types, e.g. videos.
            image_urls = self._extract_image_links(entry)
            if image_urls:
                image_items = self._create_image_items(image_urls, item)
                new_items.extend(image_items)
                new_items.append(item)
                item = self._create_package(item, image_items)

            new_items.append(item)

        return [new_items]

    def _fetch_data(self, config, provider):
        """Fetch the latest feed data.

        :param dict config: RSS resource configuration
        :param provider: data provider instance, needed as an argument when
            raising ingest errors
        :return: fetched RSS data
        :rtype: str
        :raises IngestApiError: if fetching data fails for any reason
            (e.g. authentication error, resource not found, etc.)
        """
        url = config['url']

        if config.get('auth_required', False):
            auth = (config.get('username'), config.get('password'))
        else:
            auth = None

        response = requests.get(url, auth=auth, timeout=30)

        if response.ok:
            return response.content
        else:
            if response.status_code in (401, 403):
                raise IngestApiError.apiAuthError(
                    Exception(response.reason), provider)
            elif response.status_code == 404:
                raise IngestApiError.apiNotFoundError(
                    Exception(response.reason), provider)
            else:
                raise IngestApiError.apiGeneralError(
                    Exception(response.reason), provider)

    def _extract_image_links(self, rss_entry):
        """Extract URLs of all images referenced by the given RSS entry.

        Images can be referenced via `<enclosure>`, `<media:thumbnail>` or
        `<media:content>` RSS tags and must be listed among the allowed image
        types. All other links to external media are ignored.

        Duplicate URLs are omitted from the result.

        :param rss_entry: parsed RSS item (entry)
        :type rss_entry: :py:class:`feedparser.FeedParserDict`
        :return: a list of all unique image URLs found (as strings)
        """
        img_links = set()

        for link in getattr(rss_entry, 'links', []):
            if link.get('type') in self.IMG_MIME_TYPES:
                img_links.add(link['href'])

        for item in getattr(rss_entry, 'media_thumbnail', []):
            url = item.get('url', '')
            if url.endswith(self.IMG_FILE_SUFFIXES):
                img_links.add(url)

        for item in getattr(rss_entry, 'media_content', []):
            if item.get('type') in self.IMG_MIME_TYPES:
                img_links.add(item['url'])

        return list(img_links)

    def _create_item(self, data, field_aliases=None, source=None):
        """Create a new content item from RSS feed data.

        :param dict data: parsed data of a single feed entry
        :param field_aliases: (optional) field name aliases. Used for content
            fields that are named differently in the retrieved data.
        :type field_aliases: list of {field_name: alias} dictionaries or None
        :param str source: the source of the provider
        :return: created content item
        :rtype: dict
        """
        if field_aliases is None:
            field_aliases = {}
        else:
            field_aliases = merge_dicts(field_aliases)
        aliased_fields = set(field_aliases.values())

        item = dict(type=CONTENT_TYPE.TEXT)

        # Only consider fields that are not used as an alias (i.e. used to
        # populate another field) - unless those fields have their own
        # aliases, too.
        # The idea is that if e.g. the main text field is aliased to use the
        # parsed data's summary field, that summary should not be used to
        # populate the field it was originally meant for.
        fields_to_consider = (
            f for f in self.item_fields
            if (f.name_in_data not in aliased_fields) or
               (f.name_in_data in aliased_fields and
                f.name_in_data in field_aliases)
        )

        for field in fields_to_consider:
            data_field_name = field_aliases.get(
                field.name_in_data, field.name_in_data
            )
            field_value = data.get(data_field_name)

            if (field.type is datetime) and field_value:
                field_value = utcfromtimestamp(timegm(field_value))

            item[field.name] = field_value

            # Some feeds use the <content:encoded> tag for storing the main
            # content, and that tag is parsed differently. If body_html has
            # not been found in its default data field and is not aliased,
            # try to populate it using the aforementioned content field as a
            # fallback.
            if (
                field.name == 'body_html' and
                not field_value and
                field.name_in_data not in field_aliases
            ):
                try:
                    item['body_html'] = data.content[0].value
                except Exception:
                    pass  # content either non-existent or parsed differently

        if item.get('uri', None):
            if not item.get('body_html', None):
                item['body_html'] = ''
            source = source or 'source'
            item['body_html'] = '<p><a href="%s" target="_blank">%s</a></p>' % (
                item['uri'], source) + item['body_html']

        return item

    def _create_image_items(self, image_links, text_item):
        """Create a list of picture items that represent the external images
        located at the given URLs.

        Each created item's `firstcreated` and `versioncreated` fields are
        set to the same values as these fields in `text_item`.

        :param iterable image_links: list of image URLs
        :param dict text_item: the "main" text item the images are related to
        :return: list of created image items (as dicts)
        """
        image_items = []

        for image_url in image_links:
            img_item = {
                'guid': generate_guid(type=GUID_TAG),
                ITEM_TYPE: CONTENT_TYPE.PICTURE,
                'firstcreated': text_item.get('firstcreated'),
                'versioncreated': text_item.get('versioncreated'),
                'renditions': {
                    'baseImage': {
                        'href': image_url
                    }
                }
            }
            image_items.append(img_item)

        return image_items

    def _create_package(self, text_item, image_items):
        """Create a new content package from the given content items.

        The package's `main` group contains only the references to the given
        items, not the items themselves. In the list of references, the
        reference to the text item precedes the references to the image
        items.

        The package's `firstcreated` and `versioncreated` fields are set to
        the values of these fields in `text_item`, and the `headline` is
        copied as well.

        :param dict text_item: item representing the text content
        :param list image_items: list of items (dicts) representing the
            images related to the text content
        :return: the created content package
        :rtype: dict
        """
        package = {
            ITEM_TYPE: CONTENT_TYPE.COMPOSITE,
            'guid': generate_guid(type=GUID_TAG),
            'firstcreated': text_item['firstcreated'],
            'versioncreated': text_item['versioncreated'],
            'headline': text_item.get('headline', ''),
            'groups': [
                {
                    'id': 'root',
                    'role': 'grpRole:NEP',
                    'refs': [{'idRef': 'main'}],
                },
                {
                    'id': 'main',
                    'role': 'main',
                    'refs': [],
                }
            ]
        }

        item_references = package['groups'][1]['refs']
        item_references.append({'residRef': text_item['guid']})

        for image in image_items:
            item_references.append({'residRef': image['guid']})

        return package
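
# A standalone sketch (not part of RSSFeedingService) of the URL rewriting
# technique prepare_href uses: percent-encode the configured credentials and
# splice them into the netloc, yielding a basic-auth URL the fetcher can use
# directly. The credentials and host below are made up.

from urllib.parse import quote, urlsplit, urlunsplit

def add_basic_auth(url, username, password):
    userinfo_part = '{}:{}@'.format(quote(username), quote(password))
    scheme, netloc, path, query, fragment = urlsplit(url)
    return urlunsplit((scheme, userinfo_part + netloc, path, query, fragment))

# add_basic_auth('http://feeds.example.com/rss', 'user', 'p@ss')
# -> 'http://user:p%40ss@feeds.example.com/rss'
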
def get_url(self, url=None, **kwargs):
    """Do an HTTP GET on the given URL.

    :param string url: url to use (None to use self.HTTP_URL)
    :param **kwargs: extra parameters for requests
    :return requests.Response: response
    """
    if not url:
        url = self.HTTP_URL
    config = self.config
    user = config.get('username')
    password = config.get('password')
    if user:
        user = user.strip()
    if password:
        password = password.strip()
    auth_required = config.get('auth_required', self.HTTP_AUTH)
    if auth_required is None:
        # auth_required may not be used by the feeding service; in this
        # case we use authentication only if both user and password are
        # set.
        auth_required = bool(user and password)
    if auth_required:
        if not user:
            raise SuperdeskIngestError.notConfiguredError(
                "user is not configured")
        if not password:
            raise SuperdeskIngestError.notConfiguredError(
                "password is not configured")
        kwargs.setdefault('auth', (user, password))

    params = kwargs.pop("params", {})
    if params or self.HTTP_DEFAULT_PARAMETERS:
        # if we have default parameters, we want them to be overridden
        # by conflicting params given in arguments
        if self.HTTP_DEFAULT_PARAMETERS:
            merged = dict(self.HTTP_DEFAULT_PARAMETERS)
            merged.update(params)
            params = merged
        kwargs["params"] = params

    try:
        response = requests.get(url, timeout=self.HTTP_TIMEOUT, **kwargs)
    except requests.exceptions.Timeout as exception:
        raise IngestApiError.apiTimeoutError(exception, self.provider)
    except requests.exceptions.ConnectionError as exception:
        raise IngestApiError.apiConnectionError(exception, self.provider)
    except requests.exceptions.RequestException as exception:
        raise IngestApiError.apiRequestError(exception, self.provider)
    except Exception as exception:
        traceback.print_exc()
        raise IngestApiError.apiGeneralError(exception, self.provider)

    if not response.ok:
        exception = Exception(response.reason)
        if response.status_code in (401, 403):
            raise IngestApiError.apiAuthError(exception, self.provider)
        elif response.status_code == 404:
            raise IngestApiError.apiNotFoundError(exception, self.provider)
        else:
            raise IngestApiError.apiGeneralError(exception, self.provider)

    return response
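
# A small standalone sketch of the credential resolution get_url performs
# above: an explicit auth_required setting wins; when it is absent (None),
# authentication is used only if both username and password are configured.
# The config dicts in the usage notes are invented examples.

def resolve_auth(config, default_auth_required=None):
    user = (config.get('username') or '').strip()
    password = (config.get('password') or '').strip()
    auth_required = config.get('auth_required', default_auth_required)
    if auth_required is None:
        auth_required = bool(user and password)
    return (user, password) if auth_required else None

# resolve_auth({'username': 'u', 'password': 'p'})  -> ('u', 'p')
# resolve_auth({'auth_required': False, 'username': 'u', 'password': 'p'})  -> None
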
class EventHTTPFeedingService(HTTPFeedingService):
    """
    Feeding Service class which can read events using HTTP.
    """

    NAME = 'event_http'

    ERRORS = [
        IngestApiError.apiTimeoutError().get_error_description(),
        IngestApiError.apiRedirectError().get_error_description(),
        IngestApiError.apiRequestError().get_error_description(),
        IngestApiError.apiUnicodeError().get_error_description(),
        IngestApiError.apiParseError().get_error_description(),
        IngestApiError.apiGeneralError().get_error_description()
    ]

    label = 'Event HTTP feed'

    """
    Defines the collection service to be used with this ingest feeding
    service.
    """
    service = 'events'

    fields = [{
        'id': 'url',
        'type': 'text',
        'label': 'Feed URL',
        'placeholder': 'Feed URL',
        'required': True
    }]

    def _update(self, provider, update):
        updated = utcnow()
        last_updated = provider.get('last_updated')
        ttl_minutes = app.config['INGEST_EXPIRY_MINUTES']
        if not last_updated or last_updated < updated - datetime.timedelta(
                minutes=ttl_minutes):
            last_updated = updated - datetime.timedelta(minutes=ttl_minutes)

        self.provider = provider
        provider_config = provider.get('config')
        if not provider_config:
            provider_config = {}
            provider['config'] = provider_config

        self.URL = provider_config.get('url')
        payload = {}

        parser = self.get_feed_parser(provider)

        try:
            response = requests.get(self.URL, params=payload, timeout=15)
            # TODO: check if the file has been updated since the provider's
            # last_updated - although some providers do not include
            # 'Last-Modified' in the headers, so it is unclear how to do this
            logger.info('Http Headers: %s', response.headers)
        except requests.exceptions.Timeout as ex:
            # Maybe set up for a retry, or continue in a retry loop
            raise IngestApiError.apiTimeoutError(ex, self.provider)
        except requests.exceptions.TooManyRedirects as ex:
            # Tell the user their URL was bad and try a different one
            raise IngestApiError.apiRedirectError(ex, self.provider)
        except requests.exceptions.RequestException as ex:
            # catastrophic error. bail.
            raise IngestApiError.apiRequestError(ex, self.provider)
        except Exception as error:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(error, self.provider)

        if response.status_code == 404:
            raise LookupError('Not found %s' % payload)

        logger.info('Ingesting: %s', str(response.content))

        if isinstance(parser, NTBEventXMLFeedParser):
            xml = ET.fromstring(response.content)
            items = parser.parse(xml, provider)
        elif isinstance(parser, IcsTwoFeedParser):
            cal = Calendar.from_ical(response.content)
            items = parser.parse(cal, provider)
        else:
            items = parser.parse(response.content)

        if isinstance(items, list):
            yield items
        else:
            yield [items]
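
# Standalone sketch of the expiry-window clamp used in _update above: if the
# provider has never been updated, or its last update predates the ingest
# expiry window, the effective last_updated is pulled forward to the start
# of that window so only items from within the window are requested.

import datetime

def clamp_last_updated(last_updated, now, ttl_minutes):
    window_start = now - datetime.timedelta(minutes=ttl_minutes)
    if not last_updated or last_updated < window_start:
        return window_start
    return last_updated
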
class ReutersHTTPFeedingService(HTTPFeedingService):
    """
    Feeding Service class which can read article(s) using HTTP provided by
    Reuters.
    """

    NAME = 'reuters_http'

    ERRORS = [
        IngestApiError.apiTimeoutError().get_error_description(),
        IngestApiError.apiRedirectError().get_error_description(),
        IngestApiError.apiRequestError().get_error_description(),
        IngestApiError.apiUnicodeError().get_error_description(),
        IngestApiError.apiParseError().get_error_description(),
        IngestApiError.apiGeneralError().get_error_description()
    ]

    DATE_FORMAT = '%Y.%m.%d.%H.%M'

    def _update(self, provider):
        updated = utcnow()
        last_updated = provider.get('last_updated')
        ttl_minutes = app.config['INGEST_EXPIRY_MINUTES']
        if not last_updated or last_updated < updated - datetime.timedelta(
                minutes=ttl_minutes):
            last_updated = updated - datetime.timedelta(minutes=ttl_minutes)

        self.provider = provider
        provider_config = provider.get('config')
        if not provider_config:
            provider_config = {}
            provider['config'] = provider_config

        if 'url' not in provider_config:
            provider_config['url'] = 'http://rmb.reuters.com/rmd/rest/xml'

        if 'auth_url' not in provider_config:
            provider_config[
                'auth_url'] = 'https://commerce.reuters.com/rmd/rest/xml/login'

        self.URL = provider_config.get('url')

        for channel in self._get_channels():
            ids = self._get_article_ids(channel, last_updated, updated)
            for id in ids:
                try:
                    items = self.fetch_ingest(id)
                    if items:
                        yield items
                # if there was an exception processing one of the bunch,
                # log it and continue
                except Exception as ex:
                    logger.warn(
                        'Reuters item {} has not been retrieved'.format(id))
                    logger.exception(ex)

    def _get_channels(self):
        """Get subscribed channels."""
        channels = []
        tree = self._get_tree('channels')
        for channel in tree.findall('channelInformation'):
            channels.append(channel.find('alias').text)
        return channels

    def _get_tree(self, endpoint, payload=None):
        """Get xml response for given API endpoint and payload.

        :param endpoint: API endpoint
        :type endpoint: str
        :param payload: query parameters
        :type payload: dict
        """
        if payload is None:
            payload = {}
        payload['token'] = self._get_auth_token(self.provider, update=True)
        url = self._get_absolute_url(endpoint)

        try:
            response = requests.get(url, params=payload, timeout=15)
        except requests.exceptions.Timeout as ex:
            # Maybe set up for a retry, or continue in a retry loop
            raise IngestApiError.apiTimeoutError(ex, self.provider)
        except requests.exceptions.TooManyRedirects as ex:
            # Tell the user their URL was bad and try a different one
            raise IngestApiError.apiRedirectError(ex, self.provider)
        except requests.exceptions.RequestException as ex:
            # catastrophic error. bail.
            raise IngestApiError.apiRequestError(ex, self.provider)
        except Exception as error:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(error, self.provider)

        if response.status_code == 404:
            raise LookupError('Not found %s' % payload)

        try:
            return etree.fromstring(
                response.content)  # workaround for http mock lib
        except UnicodeEncodeError as error:
            traceback.print_exc()
            raise IngestApiError.apiUnicodeError(error, self.provider)
        except ParseError as error:
            traceback.print_exc()
            raise IngestApiError.apiParseError(error, self.provider)
        except Exception as error:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(error, self.provider)

    def _get_absolute_url(self, endpoint):
        """Get absolute URL for given endpoint.

        :param endpoint: API endpoint
        :type endpoint: str
        """
        return '/'.join([self.URL, endpoint])

    def _get_article_ids(self, channel, last_updated, updated):
        """Get article ids which should be upserted, and save the poll token
        that is returned.
        """
        ids = set()
        payload = {'channel': channel, 'fieldsRef': 'id'}

        # check if the channel has a pollToken; if not, fall back to
        # dateRange
        last_poll_token = self._get_poll_token(channel)
        if last_poll_token is not None:
            logger.info(
                "Reuters requesting channel {} with poll token {}".format(
                    channel, last_poll_token))
            payload['pollToken'] = last_poll_token
        else:
            payload['dateRange'] = "%s-%s" % (self._format_date(last_updated),
                                              self._format_date(updated))
            logger.info(
                "Reuters requesting channel {} with dateRange {}".format(
                    channel, payload['dateRange']))

        tree = self._get_tree('items', payload)
        status_code = tree.find('status').get(
            'code') if tree.tag == 'results' else tree.get('code')

        # check the returned status
        if status_code != '10':
            logger.warn(
                "Reuters channel request returned status code {}".format(
                    status_code))
            # status code 30 indicates failure
            if status_code == '30':
                # invalid token
                logger.warn("Reuters error on channel {} code {} {}".format(
                    channel,
                    tree.find('error').get('code'),
                    tree.find('error').text))
                if tree.find('error').get('code') == '2100':
                    self._save_poll_token(channel, None)
                    logger.warn(
                        "Reuters channel invalid token, resetting {}".format(
                            status_code))
                return ids

        # extract the returned poll token if there is one
        poll_token = tree.find('pollToken')
        if poll_token is not None:
            # a new token indicates new content
            if poll_token.text != last_poll_token:
                logger.info("Reuters channel {} new token {}".format(
                    channel, poll_token.text))
                self._save_poll_token(channel, poll_token.text)
            else:
                # the token has not changed, so nothing new
                logger.info("Reuters channel {} nothing new".format(channel))
                return ids
        else:
            logger.info(
                "Reuters channel {} retrieved no token".format(channel))
            return ids

        for result in tree.findall('result'):
            id = result.find('id').text
            ids.add(id)
            logger.info("Reuters id : {}".format(id))

        return ids

    def _save_poll_token(self, channel, poll_token):
        """Save the poll token for the passed channel in the config section
        of the ingest provider.

        :param channel:
        :param poll_token:
        :return:
        """
        # get the provider in case it has been updated by another channel
        ingest_provider_service = superdesk.get_resource_service(
            'ingest_providers')
        provider = ingest_provider_service.find_one(
            req=None, _id=self.provider[superdesk.config.ID_FIELD])
        provider_token = provider.get('tokens')
        if 'poll_tokens' not in provider_token:
            provider_token['poll_tokens'] = {channel: poll_token}
        else:
            provider_token['poll_tokens'][channel] = poll_token
        upd_provider = {'tokens': provider_token}
        ingest_provider_service.system_update(
            self.provider[superdesk.config.ID_FIELD], upd_provider,
            self.provider)

    def _get_poll_token(self, channel):
        """Get the poll token from provider config if it is available.

        :param channel:
        :return: token
        """
        if 'tokens' in self.provider and \
                'poll_tokens' in self.provider['tokens']:
            return self.provider.get('tokens').get('poll_tokens').get(
                channel, None)

    def _format_date(self, date):
        return date.strftime(self.DATE_FORMAT)

    def fetch_ingest(self, id):
        items = self._parse_items(id)
        result_items = []
        while items:
            item = items.pop()
            self.add_timestamps(item)
            try:
                items.extend(self._fetch_items_in_package(item))
                result_items.append(item)
            except LookupError as err:
                self.log_item_error(err, item, self.provider)
                return []
        return result_items

    def _parse_items(self, id):
        """Parse item message and return given items."""
        payload = {'id': id}
        tree = self._get_tree('item', payload)
        parser = self.get_feed_parser(self.provider, tree)
        items = parser.parse(tree, self.provider)
        return items

    def _fetch_items_in_package(self, item):
        """Fetch remote assets for given item."""
        items = []
        for group in item.get('groups', []):
            for ref in group.get('refs', []):
                if 'residRef' in ref:
                    items.extend(self._parse_items(ref.get('residRef')))
        return items

    def prepare_href(self, href, mimetype=None):
        (scheme, netloc, path, params, query, fragment) = urlparse(href)
        new_href = urlunparse((scheme, netloc, path, '', '', ''))
        return '%s?auth_token=%s' % (
            new_href, self._get_auth_token(self.provider, update=True))
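
# Standalone sketch of the dateRange fallback that _get_article_ids builds
# when no poll token is stored for a channel: both endpoints are rendered in
# the Reuters '%Y.%m.%d.%H.%M' format and joined with a hyphen. The dates in
# the usage note are arbitrary examples.

from datetime import datetime as _dt

def reuters_date_range(last_updated, updated, fmt='%Y.%m.%d.%H.%M'):
    return '%s-%s' % (last_updated.strftime(fmt), updated.strftime(fmt))

# reuters_date_range(_dt(2017, 1, 1, 8, 0), _dt(2017, 1, 1, 9, 30))
# -> '2017.01.01.08.00-2017.01.01.09.30'
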
class BBCLDRSFeedingService(FeedingService):
    """
    Feeding Service class for reading BBC's Local Democracy Reporting
    Service.
    """

    # Following the api spec at https://docs.ldrs.org.uk/

    NAME = 'bbc_ldrs'

    ERRORS = [
        IngestApiError.apiAuthError().get_error_description(),
        IngestApiError.apiNotFoundError().get_error_description(),
        IngestApiError.apiGeneralError().get_error_description(),
        ParserError.parseMessageError().get_error_description()
    ]

    label = 'BBC Local Democracy Reporter Service'

    fields = [{
        'id': 'url',
        'type': 'text',
        'label': 'LDRS URL',
        'placeholder': 'LDRS URL',
        'required': True,
        'default': 'https://api.ldrs.org.uk/v1/item'
    }, {
        'id': 'api_key',
        'type': 'text',
        'label': 'API Key',
        'placeholder': 'API Key',
        'required': True,
        'default': ''
    }]

    def __init__(self):
        super().__init__()

    def _test(self, provider):
        config = provider.get('config', {})
        url = config['url']
        api_key = config['api_key']

        # limit the data to a single article and filter out all article
        # fields to save bandwidth
        params = {'limit': 1, 'fields': 'id'}
        headers = {'apikey': api_key}

        try:
            response = requests.get(url, params=params, headers=headers,
                                    timeout=30)
        except requests.exceptions.ConnectionError as err:
            raise IngestApiError.apiConnectionError(exception=err)

        if not response.ok:
            if response.status_code == 404:
                raise IngestApiError.apiNotFoundError(
                    Exception(response.reason), provider)
            else:
                raise IngestApiError.apiGeneralError(
                    Exception(response.reason), provider)

    def _update(self, provider, update):
        config = provider.get('config', {})
        json_items = self._fetch_data(config, provider)
        parsed_items = []

        for item in json_items:
            try:
                parser = self.get_feed_parser(provider, item)
                parsed_items.append(parser.parse(item))
            except Exception as ex:
                raise ParserError.parseMessageError(ex, provider, data=item)

        return parsed_items

    def _fetch_data(self, config, provider):
        url = config['url']
        api_key = config['api_key']

        last_update = provider.get(
            'last_updated', utcfromtimestamp(0)).strftime('%Y-%m-%dT%H:%M:%S')

        # Results are paginated, so we read this many at a time
        offset_jump = 10

        params = {'start': last_update, 'limit': offset_jump}
        headers = {'apikey': api_key}

        items = []
        offset = 0
        while True:
            params['offset'] = offset

            try:
                response = requests.get(url, params=params, headers=headers,
                                        timeout=30)
            except requests.exceptions.ConnectionError as err:
                raise IngestApiError.apiConnectionError(exception=err)

            if response.ok:
                # The total number of results is given to us in the JSON
                # payload; read it via a regex so we don't have to decode
                # the whole thing pointlessly.
                match = re.search(r'"total": *([0-9]+)', response.text)
                if match is None:
                    raise IngestApiError.apiGeneralError(
                        Exception(response.text), provider)

                num_results = int(match.group(1))

                if num_results > 0:
                    items.append(response.text)

                if offset >= num_results:
                    return items

                offset += offset_jump
            else:
                if re.match('Error: No API Key provided', response.text):
                    raise IngestApiError.apiAuthError(
                        Exception(response.text), provider)
                elif response.status_code == 404:
                    raise IngestApiError.apiNotFoundError(
                        Exception(response.reason), provider)
                else:
                    raise IngestApiError.apiGeneralError(
                        Exception(response.reason), provider)
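
# Standalone sketch of the pagination bookkeeping in _fetch_data above: the
# "total" count is pulled out of the raw JSON text with a regex, so the full
# payload never needs to be decoded just to learn how many pages remain; the
# offset then advances by the page size until it passes that total. The
# sample payloads below are invented.

import re

def read_total(json_text):
    match = re.search(r'"total": *([0-9]+)', json_text)
    return int(match.group(1)) if match else None

# read_total('{"total": 42, "items": []}')  -> 42
# read_total('{"items": []}')               -> None
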
class ReutersHTTPFeedingService(HTTPFeedingService):
    """
    Feeding Service class which can read article(s) using HTTP provided by Reuters.
    """

    NAME = "reuters_http"

    ERRORS = [
        IngestApiError.apiTimeoutError().get_error_description(),
        IngestApiError.apiRedirectError().get_error_description(),
        IngestApiError.apiRequestError().get_error_description(),
        IngestApiError.apiUnicodeError().get_error_description(),
        IngestApiError.apiParseError().get_error_description(),
        IngestApiError.apiGeneralError().get_error_description(),
    ]

    DATE_FORMAT = "%Y.%m.%d.%H.%M"

    label = "Reuters feed API"

    fields = [
        {
            "id": "url",
            "type": "text",
            "label": "Feed URL",
            "placeholder": "Feed URL",
            "required": True,
            "default": "http://rmb.reuters.com/rmd/rest/xml",
        },
        {
            "id": "auth_url",
            "type": "text",
            "label": "URL for Authentication",
            "placeholder": "authentication url",
            "required": True,
            "default": "https://commerce.reuters.com/rmd/rest/xml/login",
        },
        {"id": "username", "type": "text", "label": "Username", "placeholder": "Username", "required": True},
        {"id": "password", "type": "password", "label": "Password", "placeholder": "Password", "required": True},
    ]

    session = None

    def _update(self, provider, update):
        updated = utcnow()
        last_updated = provider.get("last_updated")
        ttl_minutes = app.config["INGEST_EXPIRY_MINUTES"]
        if not last_updated or last_updated < updated - datetime.timedelta(minutes=ttl_minutes):
            last_updated = updated - datetime.timedelta(minutes=ttl_minutes)

        self.provider = provider
        provider_config = provider.get("config")
        if not provider_config:
            provider_config = {}
            provider["config"] = provider_config

        provider_config.setdefault("url", "http://rmb.reuters.com/rmd/rest/xml")
        provider_config.setdefault("auth_url", "https://commerce.reuters.com/rmd/rest/xml/login")
        self.URL = provider_config.get("url")

        for channel in self._get_channels():
            ids = self._get_article_ids(channel, last_updated, updated)
            for id in ids:
                try:
                    items = self.fetch_ingest(id)
                    if items:
                        yield items
                # if there was an exception processing one of the bunch, log it and continue
                except Exception as ex:
                    logger.warn("Reuters item {} has not been retrieved".format(id))
                    logger.exception(ex)

    def _get_channels(self):
        """Get subscribed channels."""
        channels = []
        tree = self._get_tree("channels")
        for channel in tree.findall("channelInformation"):
            channels.append(channel.find("alias").text)
        return channels

    def _get_tree(self, endpoint, payload=None):
        """Get xml response for given API endpoint and payload.

        :param endpoint: API endpoint
        :type endpoint: str
        :param payload: query parameters
        :type payload: dict
        """
        if payload is None:
            payload = {}
        payload["token"] = self._get_auth_token(self.provider, update=True)
        url = self._get_absolute_url(endpoint)

        if not self.session:
            self.session = requests.Session()
        retries = 0

        while True:
            try:
                response = self.session.get(url, params=payload, timeout=(30, 15))
            except requests.exceptions.Timeout as ex:
                if retries < 3:
                    logger.warn("Reuters API timeout retrying, retries {}".format(retries))
                    retries += 1
                    continue
                raise IngestApiError.apiTimeoutError(ex, self.provider)
            except requests.exceptions.TooManyRedirects as ex:
                # Tell the user their URL was bad and try a different one
                raise IngestApiError.apiRedirectError(ex, self.provider)
            except requests.exceptions.RequestException as ex:
                # catastrophic error. bail.
                raise IngestApiError.apiRequestError(ex, self.provider)
            except Exception as error:
                traceback.print_exc()
                raise IngestApiError.apiGeneralError(error, self.provider)
            if response.status_code == 404:
                raise LookupError(_("Not found {payload}").format(payload=payload))
            break

        try:
            return etree.fromstring(response.content)  # workaround for http mock lib
        except UnicodeEncodeError as error:
            traceback.print_exc()
            raise IngestApiError.apiUnicodeError(error, self.provider)
        except ParseError as error:
            traceback.print_exc()
            raise IngestApiError.apiParseError(error, self.provider)
        except Exception as error:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(error, self.provider)

    def _get_absolute_url(self, endpoint):
        """Get absolute URL for given endpoint.

        :param endpoint: API endpoint
        :type endpoint: str
        """
        return "/".join([self.URL, endpoint])

    def _get_article_ids(self, channel, last_updated, updated):
        """Get article ids which should be upserted, and save the poll token
        that is returned.
        """
        ids = set()
        payload = {"channel": channel, "fieldsRef": "id"}

        # check if the channel has a pollToken; if not, fall back to dateRange
        last_poll_token = self._get_poll_token(channel)
        if last_poll_token is not None:
            logger.info("Reuters requesting channel {} with poll token {}".format(channel, last_poll_token))
            payload["pollToken"] = last_poll_token
        else:
            payload["dateRange"] = "%s-%s" % (self._format_date(last_updated), self._format_date(updated))
            logger.info("Reuters requesting channel {} with dateRange {}".format(channel, payload["dateRange"]))

        tree = self._get_tree("items", payload)
        status_code = tree.find("status").get("code") if tree.tag == "results" else tree.get("code")

        # check the returned status
        if status_code != "10":
            logger.warn("Reuters channel request returned status code {}".format(status_code))
            # status code 30 indicates failure
            if status_code == "30":
                # invalid token
                logger.warn(
                    "Reuters error on channel {} code {} {}".format(
                        channel, tree.find("error").get("code"), tree.find("error").text
                    )
                )
                if tree.find("error").get("code") == "2100":
                    self._save_poll_token(channel, None)
                    logger.warn("Reuters channel invalid token, resetting {}".format(status_code))
                return ids

        # extract the returned poll token if there is one
        poll_token = tree.find("pollToken")
        if poll_token is not None:
            # a new token indicates new content
            if poll_token.text != last_poll_token:
                logger.info("Reuters channel {} new token {}".format(channel, poll_token.text))
                self._save_poll_token(channel, poll_token.text)
            else:
                # the token has not changed, so nothing new
                logger.info("Reuters channel {} nothing new".format(channel))
                return ids
        else:
            logger.info("Reuters channel {} retrieved no token".format(channel))
            return ids

        for result in tree.findall("result"):
            id = result.find("id").text
            ids.add(id)
            logger.info("Reuters id : {}".format(id))

        return ids

    def _save_poll_token(self, channel, poll_token):
        """Save the poll token for the passed channel in the config section of
        the ingest provider.

        :param channel:
        :param poll_token:
        :return:
        """
        # get the provider in case it has been updated by another channel
        ingest_provider_service = superdesk.get_resource_service("ingest_providers")
        provider = ingest_provider_service.find_one(req=None, _id=self.provider[superdesk.config.ID_FIELD])
        provider_token = provider.get("tokens")
        if "poll_tokens" not in provider_token:
            provider_token["poll_tokens"] = {channel: poll_token}
        else:
            provider_token["poll_tokens"][channel] = poll_token
        upd_provider = {"tokens": provider_token}
        ingest_provider_service.system_update(self.provider[superdesk.config.ID_FIELD], upd_provider, self.provider)

    def _get_poll_token(self, channel):
        """Get the poll token from provider config if it is available.

        :param channel:
        :return: token
        """
        if "tokens" in self.provider and "poll_tokens" in self.provider["tokens"]:
            return self.provider.get("tokens").get("poll_tokens").get(channel, None)

    def _format_date(self, date):
        return date.strftime(self.DATE_FORMAT)

    def fetch_ingest(self, id):
        items = self._parse_items(id)
        result_items = []
        while items:
            item = items.pop()
            self.localize_timestamps(item)
            try:
                items.extend(self._fetch_items_in_package(item))
                result_items.append(item)
            except LookupError as err:
                self.log_item_error(err, item, self.provider)
                return []
        return result_items

    def _parse_items(self, id):
        """Parse item message and return given items."""
        payload = {"id": id}
        tree = self._get_tree("item", payload)
        parser = self.get_feed_parser(self.provider, tree)
        items = parser.parse(tree, self.provider)
        return items

    def _fetch_items_in_package(self, item):
        """Fetch remote assets for given item."""
        items = []
        for group in item.get("groups", []):
            for ref in group.get("refs", []):
                if "residRef" in ref:
                    items.extend(self._parse_items(ref.get("residRef")))
        return items

    def prepare_href(self, href, mimetype=None):
        (scheme, netloc, path, params, query, fragment) = urlparse(href)
        new_href = urlunparse((scheme, netloc, path, "", "", ""))
        return "%s?auth_token=%s" % (new_href, self._get_auth_token(self.provider, update=True))
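
# Standalone sketch of the token layout that _save_poll_token and
# _get_poll_token maintain on the provider document; the channel alias and
# token value are invented placeholders:
#
#     provider["tokens"]["poll_tokens"] == {"my-channel": "opaque-token"}

def get_poll_token(provider, channel):
    return provider.get("tokens", {}).get("poll_tokens", {}).get(channel)
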
from collections import namedtuple
from datetime import datetime

from superdesk.errors import IngestApiError, ParserError
from superdesk.io import register_provider
from superdesk.io.ingest_service import IngestService
from superdesk.utils import merge_dicts

PROVIDER = 'rss'

utcfromtimestamp = datetime.utcfromtimestamp

errors = [IngestApiError.apiAuthError().get_error_description(),
          IngestApiError.apiNotFoundError().get_error_description(),
          IngestApiError.apiGeneralError().get_error_description(),
          ParserError.parseMessageError().get_error_description()]


class RssIngestService(IngestService):
    """Ingest service for providing feeds received in RSS 2.0 format.

    (NOTE: it should also work with other syndicated feed formats, too,
    since the underlying parser supports them, but for our needs RSS 2.0
    is assumed)
    """

    ItemField = namedtuple('ItemField', ['name', 'name_in_data', 'type'])

    item_fields = [
        ItemField('guid', 'guid', str),
        ItemField('uri', 'guid', str),