def delete(self, endpoint_name, lookup):
    """Delete method to delete by using mongo query syntax.

    Documents are first removed from the search backend (when one exists) so
    they stop showing up immediately, then removed from mongo in one bulk call.

    :param endpoint_name: Name of the endpoint
    :param lookup: User mongo query syntax. example 1. ``{'_id':123}``,
        2. ``{'item_id': {'$in': [123, 234]}}``
    :returns: Returns the mongo remove command response. {'n': 12, 'ok': 1}
    """
    backend = self._backend(endpoint_name)
    search_backend = self._lookup_backend(endpoint_name)
    docs = self.get_from_mongo(endpoint_name, lookup=lookup, req=ParsedRequest())
    ids = [doc[config.ID_FIELD] for doc in docs]
    removed_ids = ids
    logger.info("total documents to be removed {}".format(len(ids)))
    if search_backend and ids:
        removed_ids = []
        # first remove it from search backend, so it won't show up. when this is done - remove it from mongo
        for _id in ids:
            try:
                self.remove_from_search(endpoint_name, _id)
                removed_ids.append(_id)
            except NotFoundError:
                # already gone from elastic - still safe to remove from mongo
                logger.warning('item missing from elastic _id=%s' % (_id,))
                removed_ids.append(_id)
            except Exception:
                # keep the item in mongo if we could not purge it from elastic,
                # so the two stores do not silently diverge
                logger.exception('item can not be removed from elastic _id=%s' % (_id,))
    backend.remove(endpoint_name, {config.ID_FIELD: {'$in': removed_ids}})
    logger.info("Removed {} documents from {}.".format(len(ids), endpoint_name))
    if not ids:
        logger.warning("No documents for {} resource were deleted using lookup {}".format(endpoint_name, lookup))
def remove_media_files(doc):
    """Remove the media files of the given doc.

    A rendition's media file is only deleted when the 'media_references'
    resource shows no published reference to it.  Renditions come from the doc
    itself when present, otherwise from its associations.  Attachments are
    always handed to the attachments service for deletion.

    :param dict doc: document for which the media are being deleted
    :return: None -- deletion is best-effort; failures are logged, not raised
    """
    logger.info('Removing Media Files...')
    # fix: the original reused the name ``references`` both for the list of
    # rendition dicts being iterated and for the query cursor inside the loop
    renditions_list = None
    if doc.get('renditions'):
        renditions_list = [doc.get('renditions')]
    if not renditions_list:
        renditions_list = [assoc.get('renditions')
                           for assoc in (doc.get(ASSOCIATIONS) or {}).values()
                           if assoc and assoc.get('renditions')]
    for renditions in renditions_list:
        for rendition in renditions.values():
            # media ids may be stored as ObjectId or str; normalise to str
            media = rendition.get('media') if isinstance(rendition.get('media'), str) else str(rendition.get('media'))
            try:
                media_refs = get_resource_service('media_references').get(req=None, lookup={
                    'media_id': media,
                    'published': True
                })
                if media_refs.count() == 0:
                    logger.info('Deleting media:{}'.format(rendition.get('media')))
                    app.media.delete(media)
            except Exception:
                logger.exception('Failed to remove Media Id: {} from item: {}'.format(media,
                                                                                      doc.get(config.ID_FIELD)))
    for attachment in doc.get('attachments', []):
        lookup = {'_id': attachment['attachment']}
        get_resource_service('attachments').delete_action(lookup)
def post_process_item(self, item, provider):
    """Post-process an ingested RTV item.

    Converts newline-separated body text to paragraphs, resolves the place
    against the 'locators' vocabulary, sets the Broadcast Script genre,
    strips the 'AAP RTV' attribution and applies the RTV sign off.

    :param item: ingested item dict, mutated in place
    :param provider: ingest provider (unused here)
    :return: the (possibly partially processed) item; errors are logged
    """
    try:
        item['body_html'] = '<p>{}</p>'.format(
            re.sub('<p> ', '<p>', item.get('body_html', '').replace('\n\n', '\n').replace('\n', '</p><p>')))
        if self.ITEM_PLACE in item:
            if item[self.ITEM_PLACE]:
                item['headline'] = '{}: {}'.format(item[self.ITEM_PLACE], item.get(self.ITEM_HEADLINE, ''))
                locator_map = superdesk.get_resource_service('vocabularies').find_one(req=None, _id='locators')
                place = [x for x in locator_map.get('items', [])
                         if x['qcode'] == item.get(self.ITEM_PLACE, '').upper()]
                # fix: a list comprehension is never None, so the old
                # ``if place is not None`` always fired and could clobber the
                # place with an empty list when no locator matched
                if place:
                    item[self.ITEM_PLACE] = place
            else:
                item.pop(self.ITEM_PLACE)
        genre_map = superdesk.get_resource_service('vocabularies').find_one(req=None, _id='genre')
        item['genre'] = [x for x in genre_map.get('items', [])
                         if x['qcode'] == 'Broadcast Script' and x['is_active']]
        # Remove the attribution
        item['body_html'] = item.get('body_html', '').replace('<p>AAP RTV</p>', '')
        item['sign_off'] = 'RTV'
    except Exception as ex:
        logger.exception(ex)
    return item
def _update(self, provider, update):
    """Fetch new items from the Reuters feed since the last update.

    :param provider: ingest provider dict; its config is defaulted in place
    :param update: unused here; part of the provider update interface
    :yields: lists of ingested items, one list per fetched article id
    """
    updated = utcnow()
    last_updated = provider.get('last_updated')
    ttl_minutes = app.config['INGEST_EXPIRY_MINUTES']
    # never look back further than the ingest expiry window
    if not last_updated or last_updated < updated - datetime.timedelta(minutes=ttl_minutes):
        last_updated = updated - datetime.timedelta(minutes=ttl_minutes)
    self.provider = provider
    provider_config = provider.get('config')
    if not provider_config:
        provider_config = {}
        provider['config'] = provider_config
    provider_config.setdefault('url', 'http://rmb.reuters.com/rmd/rest/xml')
    provider_config.setdefault('auth_url', 'https://commerce.reuters.com/rmd/rest/xml/login')
    self.URL = provider_config.get('url')
    for channel in self._get_channels():
        # renamed from ``id`` to avoid shadowing the builtin
        for item_id in self._get_article_ids(channel, last_updated, updated):
            try:
                items = self.fetch_ingest(item_id)
                if items:
                    yield items
            # if there was an exception processing one of the bunch log it and continue
            except Exception as ex:
                logger.warning('Reuters item {} has not been retrieved'.format(item_id))
                logger.exception(ex)
def post_process_item(self, item, provider):
    """Categorise a racing/betting/AFL/finance item from the slugline.

    Mutates *item* in place: sets headline, take key, subject codes and the
    ANPA category based on keywords found in the slugline/headline.
    NOTE(review): the bare ``except:`` clauses intentionally catch the
    ValueError from strptime when the headline is not a date -- confirm no
    other exception types are expected there.
    """
    try:
        # is it a horse or dog racing item
        if item.get(self.ITEM_SLUGLINE, '').find('Grey') != -1 or item.get(self.ITEM_SLUGLINE, '').find(
                'Trot') != -1 or item.get(self.ITEM_SLUGLINE, '').find('Gallop') != -1:
            # Don't look for the date in the TAB Dividends
            if item.get(self.ITEM_HEADLINE, '').find('TAB DIVS') == -1:
                try:
                    # headline holds the race date, e.g. 25/12/2020
                    raceday = datetime.strptime(item.get(self.ITEM_HEADLINE, ''), '%d/%m/%Y')
                    item[self.ITEM_TAKE_KEY] = 'Fields ' + raceday.strftime('%A')
                except:
                    item[self.ITEM_TAKE_KEY] = 'Fields'
                # it's the dogs
                if item.get(self.ITEM_SLUGLINE, '').find('Grey') != -1:
                    item[self.ITEM_HEADLINE] = item.get(self.ITEM_SLUGLINE) + 'hound ' + item.get(
                        self.ITEM_TAKE_KEY, '')
                    item[self.ITEM_SUBJECT] = [{'qcode': '15082000', 'name': subject_codes['15082000']}]
                if item.get(self.ITEM_SLUGLINE, '').find('Trot') != -1:
                    item[self.ITEM_HEADLINE] = item.get(self.ITEM_SLUGLINE) + ' ' + item.get(self.ITEM_TAKE_KEY, '')
                    item[self.ITEM_SUBJECT] = [{'qcode': '15030003', 'name': subject_codes['15030003']}]
            else:
                # Dividends
                if item.get(self.ITEM_HEADLINE, '').find('TAB DIVS') != -1:
                    # strip the trailing weekday name from the headline
                    item[self.ITEM_TAKE_KEY] = re.sub(' Monday$| Tuesday$| Wednesday$| Thursday$| Friday$', '',
                                                      item[self.ITEM_HEADLINE])
                    item[self.ITEM_HEADLINE] = '{} {}'.format(item[self.ITEM_SLUGLINE], item[self.ITEM_HEADLINE])
                    if item.get(self.ITEM_SLUGLINE, '').find('Greyhound') != -1:
                        item[self.ITEM_SLUGLINE] = item.get(self.ITEM_SLUGLINE, '').replace('Greyhound', 'Greys')
                        item[self.ITEM_SUBJECT] = [{'qcode': '15082000', 'name': subject_codes['15082000']}]
                    if item.get(self.ITEM_SLUGLINE, '').find('Trot') != -1:
                        item[self.ITEM_SUBJECT] = [{'qcode': '15030003', 'name': subject_codes['15030003']}]
                    if item.get(self.ITEM_SLUGLINE, '').find('Gallop') != -1:
                        item[self.ITEM_SUBJECT] = [{'qcode': '15030001', 'name': subject_codes['15030001']}]
            item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 'r'}]
            self._set_results_genre(item)
        elif item.get(self.ITEM_SLUGLINE, '').find(' Betting') != -1:
            try:
                raceday = datetime.strptime(item.get(self.ITEM_HEADLINE, ''), '%d/%m/%Y')
                item[self.ITEM_TAKE_KEY] = raceday.strftime('%A')
            except:
                pass
            item[self.ITEM_SLUGLINE] = item.get(self.ITEM_SLUGLINE, '').replace(' Betting', ' Market')
            item[self.ITEM_HEADLINE] = '{} {}'.format(item[self.ITEM_SLUGLINE], item[self.ITEM_TAKE_KEY])
            item[self.ITEM_SUBJECT] = [{'qcode': '15030001', 'name': subject_codes['15030001']}]
            item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 'r'}]
        elif item.get(self.ITEM_SLUGLINE, '').find('AFL') != -1:
            item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 't'}]
            item[self.ITEM_SUBJECT] = [{'qcode': '15084000', 'name': subject_codes['15084000']}]
            self._set_results_genre(item)
        else:
            # anything else from this source is treated as finance
            item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 'f'}]
            item[self.ITEM_SUBJECT] = [{'qcode': '04000000', 'name': subject_codes['04000000']}]
        return item
    except Exception as ex:
        logger.exception(ex)
def _validate(self, doc, **kwargs):
    """Validate ``doc['validate']`` against the schema of the first matching validator.

    :param doc: dict containing the item under ``'validate'`` and the action
        under ``'act'``
    :param kwargs: when ``'headline'`` is present, each message is prefixed
        with the item's headline (or _id)
    :return: list of human readable error messages; empty list when no
        validator was found for the action
    """
    use_headline = kwargs and 'headline' in kwargs
    validators = self._get_validators(doc)
    # NOTE: returns inside the first iteration, so only the first validator
    # is ever applied; the for/else only fires when no validator matched
    for validator in validators:
        validation_schema = self._get_validator_schema(validator)
        self._sanitize_fields(doc['validate'], validator)
        self._set_default_subject_scheme(doc['validate'])
        self._process_media(doc['validate'], validation_schema)
        self._process_sms(doc['validate'], validation_schema)
        v = SchemaValidator()
        v.allow_unknown = True
        try:
            v.validate(doc['validate'], validation_schema)
        except TypeError as e:
            logger.exception('Invalid validator schema value "%s" for ' % str(e))
        error_list = v.errors
        response = []
        for e in error_list:
            messages = []
            # Ignore dateline if item is corrected because it can't be changed after the item is published
            if doc.get('act', None) == 'correct' and e == 'dateline':
                continue
            elif doc.get('act', None) == 'kill' and doc['validate'].get('profile', None) and \
                    e in ('headline', 'abstract', 'body_html'):
                continue
            elif e == 'extra':
                # custom vocabulary fields report their errors nested under 'extra'
                for field in error_list[e]:
                    display_name = self._get_vocabulary_display_name(field)
                    if 'required' in error_list[e][field]:
                        messages.append(REQUIRED_ERROR.format(display_name))
                    else:
                        messages.append('{} {}'.format(display_name, error_list[e][field]))
            elif error_list[e] == 'required field' or type(error_list[e]) is dict or \
                    type(error_list[e]) is list:
                messages.append(REQUIRED_ERROR.format(e.upper()))
            elif 'min length is 1' == error_list[e] or 'null value not allowed' in error_list[e]:
                messages.append(REQUIRED_ERROR.format(e.upper()))
            elif 'min length is' in error_list[e]:
                messages.append('{} is too short'.format(e.upper()))
            elif 'max length is' in error_list[e]:
                messages.append('{} is too long'.format(e.upper()))
            else:
                messages.append('{} {}'.format(e.upper(), error_list[e]))
            for message in messages:
                if use_headline:
                    headline = '{}: {}'.format(doc['validate'].get('headline', doc['validate'].get('_id')),
                                               message)
                    response.append(headline)
                else:
                    response.append(message)
        return response
    else:
        # logger.warn is a deprecated alias; use warning
        logger.warning('validator was not found for {}'.format(doc['act']))
        return []
def can_parse(self, file_path):
    """Return True when the file's first line starts with the start-of-message marker.

    The original implementation read the whole file and relied on the
    AttributeError raised by ``None.group`` (swallowed below) when the regex
    did not match; this version checks the match explicitly.

    :param file_path: path of the file to probe (decoded as windows-1252)
    :return: bool
    """
    try:
        with open(file_path, 'r', encoding='windows-1252') as f:
            # only the first line matters, no need to read the rest
            first_line = f.readline()
        m = re.match(self.START_OF_MESSAGE, first_line)
        return bool(m and m.group(0) == self.START_OF_MESSAGE)
    except Exception as ex:
        logger.exception(ex)
        return False
def can_parse(self, file_path):
    """Return True when any line of the latin-1 encoded file contains the marker.

    :param file_path: path of the candidate file
    :return: bool -- False on any read/decode error (logged)
    """
    try:
        with open(file_path, 'r', encoding='latin-1') as stream:
            return any(self.START_OF_MESSAGE in row for row in stream.readlines())
    except Exception as ex:
        logger.exception(ex)
        return False
def remove_expired(self, provider):
    """Purge expired ingest data for *provider* under a shared 'ingest:gc' lock.

    Silently returns when another worker already holds the lock.  On failure
    the error is logged and re-raised as a provider expired-content error;
    the lock is always released.
    """
    gc_lock = 'ingest:gc'
    if not lock(gc_lock, expire=300):
        # another worker is already doing garbage collection
        return
    try:
        remove_expired_data(provider)
        push_notification('ingest:cleaned')
    except Exception as exc:
        logger.exception(exc)
        raise ProviderError.expiredContentError(exc, provider)
    finally:
        unlock(gc_lock)
def validate_and_run_elastic_query(self, elastic_query, index):
    """Run *elastic_query* against *index*, validating it in the process.

    :param elastic_query: JSON query in ElasticSearch syntax
    :param index: name of the ElasticSearch index (resource) to query
    :return: the query result cursor
    :raises SuperdeskApiError: bad request when the query fails to execute
    """
    req = self.init_request(elastic_query)
    try:
        return get_resource_service(index).get(req=req, lookup={})
    except Exception as exc:
        logger.exception(exc)
        raise SuperdeskApiError.badRequestError('Fail to validate the filter against %s.' % index)
def post_process_item(self, item, provider):
    """Derive headline/slugline/take key from the raw body lines of a Pagemasters item.

    The body is a newline-separated text whose first few lines carry routing
    markers (':SPORT -', 'RACING : ', ':POTTED :', 'YY ', ...); slices like
    ``lines[1][9:]`` drop those fixed-width prefixes.  Slugline/headline are
    then truncated to the lengths defined on the publish validators.
    """
    try:
        # Pagemasters sourced content is Greyhound or Trot related, maybe AFL otherwise financial
        # It is from the Racing system
        item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 'h'}]
        item[self.ITEM_SUBJECT] = [{'qcode': '15030001', 'name': subject_codes['15030001']}]
        lines = item['body_html'].split('\n')
        if lines[2] and lines[2].find(':SPORT -') != -1:
            item[self.ITEM_HEADLINE] = lines[2][9:]
            if lines[1] and lines[1].find(':POTTED :') != -1:
                item[self.ITEM_SLUGLINE] = lines[1][9:]
        elif lines[1] and lines[1].find('RACING : ') != -1:
            item[self.ITEM_HEADLINE] = lines[1][8:]
            item[self.ITEM_SLUGLINE] = lines[1][8:]
        elif lines[1] and lines[1].find(':POTTED :') != -1:
            item[self.ITEM_HEADLINE] = lines[1][9:]
            item[self.ITEM_SLUGLINE] = lines[1][9:]
        elif lines[1] and lines[1].find(':PREMIERSHIP') != -1:
            self._scan_lines(item, lines)
        elif lines[1] and lines[1].find(' WEIGHTS ') != -1:
            self._scan_lines(item, lines)
        elif lines[0] and lines[0].find('YY ') != -1:
            item[self.ITEM_HEADLINE] = lines[1]
            item[self.ITEM_SLUGLINE] = lines[1]
            if lines[1].find(' Comment ') != -1:
                # split the line around ' Comment ': slug keeps the word, take key gets the rest
                item[self.ITEM_SLUGLINE] = lines[1][:(lines[1].find(' Comment ') + 8)]
                item[self.ITEM_TAKE_KEY] = lines[1][(lines[1].find(' Comment ') + 9):]
        else:
            self._scan_lines(item, lines)
        # Truncate the slugline and headline to the lengths defined on the validators if required
        lookup = {'act': ITEM_PUBLISH, 'type': CONTENT_TYPE.TEXT}
        validators = superdesk.get_resource_service('validators').get(req=None, lookup=lookup)
        if validators.count():
            max_slugline_len = validators[0]['schema']['slugline']['maxlength']
            max_headline_len = validators[0]['schema']['headline']['maxlength']
            if self.ITEM_SLUGLINE in item and len(item[self.ITEM_SLUGLINE]) > max_slugline_len:
                # the overflow of the slugline is dumped in the take key
                item[self.ITEM_TAKE_KEY] = item.get(self.ITEM_SLUGLINE)[max_slugline_len:]
                item[self.ITEM_SLUGLINE] = item[self.ITEM_SLUGLINE][:max_slugline_len]
            if self.ITEM_HEADLINE in item:
                item[self.ITEM_HEADLINE] = item[self.ITEM_HEADLINE][:max_headline_len] \
                    if len(item[self.ITEM_HEADLINE]) > max_headline_len else item[self.ITEM_HEADLINE]
        return item
    except Exception as ex:
        logger.exception(ex)
def _validate(self, doc, **kwargs):
    """Validate ``doc['validate']`` and translate cerberus errors to readable messages.

    Only the first matching validator is applied (the loop returns on its
    first iteration); the for/else fires when no validator matched the action.
    :return: list of error messages, empty when no validator was found.
    """
    use_headline = kwargs and "headline" in kwargs
    validators = self._get_validators(doc)
    for validator in validators:
        self._sanitize_fields(doc["validate"], validator)
        v = SchemaValidator()
        v.allow_unknown = True
        try:
            v.validate(doc["validate"], self._get_validator_schema(validator))
        except TypeError as e:
            logger.exception('Invalid validator schema value "%s" for ' % str(e))
        error_list = v.errors
        response = []
        for e in error_list:
            # Ignore dateline if item is corrected because it can't be changed after the item is published
            if doc.get("act", None) == "correct" and e == "dateline":
                continue
            elif (
                doc.get("act", None) == "kill"
                and doc["validate"].get("profile", None)
                and e in ("headline", "abstract", "body_html")
            ):
                continue
            elif error_list[e] == "required field" or type(error_list[e]) is dict or type(error_list[e]) is list:
                message = "{} is a required field".format(e.upper())
            elif "min length is 1" == error_list[e]:
                message = "{} is a required field".format(e.upper())
            elif "min length is" in error_list[e]:
                message = "{} is too short".format(e.upper())
            elif "max length is" in error_list[e]:
                message = "{} is too long".format(e.upper())
            else:
                message = "{} {}".format(e.upper(), error_list[e])
            if use_headline:
                response.append(
                    "{}: {}".format(doc["validate"].get("headline", doc["validate"].get("_id")), message)
                )
            else:
                response.append(message)
        return response
    else:
        # NOTE(review): logger.warn is a deprecated alias of logger.warning
        logger.warn("validator was not found for {}".format(doc["act"]))
        return []
def _change_request(self, endpoint_name, id, updates, original):
    """Apply *updates* to a document in mongo and mirror them to the search backend.

    :param endpoint_name: resource name
    :param id: document id
    :param updates: partial update dict
    :param original: the document as the caller last saw it (for etag check)
    :return: the updates that were applied
    :raises SuperdeskApiError: not found when the item exists only in elastic
    """
    backend = self._backend(endpoint_name)
    search_backend = self._lookup_backend(endpoint_name)
    try:
        backend.update(endpoint_name, id, updates, original)
    except eve.io.base.DataLayer.OriginalChangedError:
        if not backend.find_one(endpoint_name, req=None, _id=id):
            # item is in elastic, not in mongo - not good
            # fix: the old message used %s placeholders with str.format, which
            # never substituted the values; also logger.warn is deprecated
            logger.warning("Item is missing in mongo resource={} id={}".format(endpoint_name, id))
            self.remove_from_search(endpoint_name, id)
            raise SuperdeskApiError.notFoundError()
        else:
            # item is there, but no change was done - ok
            logger.exception('Item : {} not updated in collection {}. '
                             'Updates are : {}'.format(id, endpoint_name, updates))
            return updates
    if search_backend:
        doc = backend.find_one(endpoint_name, req=None, _id=id)
        search_backend.update(endpoint_name, id, doc)
    return updates
def _validate(self, doc, fields=False, **kwargs):
    """Validate ``doc['validate']`` and translate cerberus errors to readable messages.

    Only the first matching validator is applied (the loop returns on its
    first iteration); the for/else fires when no validator matched.
    :param fields: when True, also return the raw cerberus error dict
    :return: list of messages, or (messages, errors) when ``fields`` is True
    """
    item = deepcopy(
        doc["validate"]
    )  # make a copy for signal before validation processing
    use_headline = kwargs and "headline" in kwargs
    validators = self._get_validators(doc)
    for validator in validators:
        validation_schema = self._get_validator_schema(validator)
        self._sanitize_fields(doc["validate"], validator)
        self._set_default_subject_scheme(doc["validate"])
        self._process_media(doc["validate"], validation_schema)
        self._process_sms(doc["validate"], validation_schema)
        self._process_media_metadata(doc["validate"], validation_schema)
        v = SchemaValidator()
        v.allow_unknown = True
        try:
            v.validate(doc["validate"], validation_schema)
        except TypeError as ex:
            logger.exception('Invalid validator schema value "%s" for ' % str(ex))
        error_list = v.errors
        response = []
        for e in error_list:
            messages = []
            # Ignore dateline if item is corrected because it can't be changed after the item is published
            if doc.get("act", None) == "correct" and e == "dateline":
                continue
            elif (doc.get("act", None) == "kill" and doc["validate"].get("profile", None) and
                    e in ("headline", "abstract", "body_html")):
                continue
            elif e == "extra":
                # custom vocabulary fields report their errors nested under 'extra'
                for field in error_list[e]:
                    display_name = self._get_vocabulary_display_name(field)
                    if "required" in error_list[e][field]:
                        messages.append(
                            ERROR_MESSAGES[REQUIRED_ERROR].format(display_name))
                    else:
                        error_field = self.get_error_field_name(display_name)
                        messages.append("{} {}".format(error_field, error_list[e][field]))
            elif "required field" in error_list[e] or type(
                    error_list[e]) is dict or type(error_list[e]) is list:
                display_name = self._get_vocabulary_display_name(e)
                error_field = self.get_error_field_name(display_name)
                messages.append(ERROR_MESSAGES[REQUIRED_ERROR].format(error_field.upper()))
            elif "min length is 1" == error_list[e] or "null value not allowed" in error_list[e]:
                messages.append(ERROR_MESSAGES[REQUIRED_ERROR].format(e.upper()))
            elif "min length is" in error_list[e]:
                error_field = self.get_error_field_name(e)
                messages.append(ERROR_MESSAGES[TOO_SHORT].format(error_field.upper()))
            elif "max length is" in error_list[e]:
                error_field = self.get_error_field_name(e)
                messages.append(ERROR_MESSAGES[TOO_LONG].format(error_field.upper()))
            else:
                error_field = self.get_error_field_name(e)
                messages.append("{} {}".format(
                    error_field.upper(),
                    ERROR_MESSAGES[error_list[e]] if ERROR_MESSAGES.get(
                        error_list[e]) else error_list[e],
                ))
            for message in messages:
                if use_headline:
                    headline = "{}: {}".format(
                        doc["validate"].get("headline", doc["validate"].get("_id")),
                        message)
                    response.append(headline)
                else:
                    response.append(message)
        # let custom code do additional validation
        item_validate.send(self, item=item, response=response, error_fields=v.errors)
        if fields:
            return response, v.errors
        return response
    else:
        # NOTE(review): logger.warn is a deprecated alias of logger.warning
        logger.warn("validator was not found for {}".format(doc["act"]))
        if fields:
            return [], {}
        return []
def post_process_item(self, item, provider):
    """Derive headline/slugline/take key and routing keywords from the body lines.

    Marker lines ('YY ', 'HH ', ':SPORT -', 'RACING : ', ...) on the first
    few body lines drive the parsing; ``lines_to_remove`` counts how many of
    those marker lines are stripped from the published body.
    NOTE(review): which destinations match depends on ``self.destinations``
    declared elsewhere on the class -- confirm its format against the feed.
    """
    try:
        lines_to_remove = 1
        # Pagemasters sourced content is Greyhound or Trot related, maybe AFL otherwise financial
        # It is from the Racing system
        item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 'r'}]
        item[self.ITEM_SUBJECT] = [{
            'qcode': '15030001',
            'name': subject_codes['15030001']
        }]
        lines = item['body_html'].split('\n')
        # If the content is to be routed/auto published
        if lines[0].upper().find('YY ') != -1 or lines[0].upper().find(
                'HH ') != -1:
            for dest in self.destinations:
                if lines[0].upper().find(' ' + dest.upper()) != -1:
                    if (item.get('keywords')):
                        item.get('keywords', []).append(dest)
                    else:
                        item['keywords'] = [dest]
        if lines[2] and lines[2].find(':SPORT -') != -1:
            item[self.ITEM_HEADLINE] = lines[2][9:]
            if lines[1] and lines[1].find(':POTTED :') != -1:
                item[self.ITEM_SLUGLINE] = lines[1][9:]
                lines_to_remove = 3
        elif lines[1] and lines[1].find('RACING : ') != -1:
            item[self.ITEM_HEADLINE] = lines[1][8:]
            item[self.ITEM_SLUGLINE] = lines[1][8:]
            lines_to_remove = 2
        elif lines[1] and lines[1].find(':POTTED :') != -1:
            item[self.ITEM_HEADLINE] = lines[1][9:]
            item[self.ITEM_SLUGLINE] = lines[1][9:]
            lines_to_remove = 2
        elif lines[1] and lines[1].find(':Premierships') != -1:
            item[self.ITEM_HEADLINE] = lines[1][1:]
            item[self.ITEM_SLUGLINE] = item[self.ITEM_HEADLINE]
            # the overflow of the slugline is dumped in the take key
            item[self.ITEM_TAKE_KEY] = item.get(self.ITEM_SLUGLINE)[21:]
            item[self.ITEM_SLUGLINE] = item[self.ITEM_SLUGLINE][:21]
            lines_to_remove = 2
        elif lines[1] and lines[1].find(' WEIGHTS ') != -1:
            self._scan_lines(item, lines)
        elif lines[0] and lines[0].find('YY ') != -1 or lines[0].find(
                'HH ') != -1:
            item[self.ITEM_HEADLINE] = lines[1]
            item[self.ITEM_SLUGLINE] = lines[1]
            if lines[1].find(' Comment ') != -1:
                # need to split the line on the word Comment
                item[self.ITEM_SLUGLINE] = lines[1][:lines[1].find('Comment')] + 'Comment'
                item[self.ITEM_TAKE_KEY] = lines[1][lines[1].find('Comment') + 8:]
                item[self.ITEM_HEADLINE] = lines[1][:lines[1].find(
                    'Comment')] + 'Gallop Comment ' + item[self.ITEM_TAKE_KEY]
            lines_to_remove = 2
        else:
            self._scan_lines(item, lines)
        item['body_html'] = '<pre>' + '\n'.join(lines[lines_to_remove:])
        # if the concatenation of the slugline and take key contain the phrase 'Brief Form' change the category to
        # h
        if (item.get(self.ITEM_SLUGLINE, '') + item.get(
                self.ITEM_TAKE_KEY, '')).lower().find('brief form') >= 0:
            item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 'h'}]
        # Another exception
        if 'NZ/AUST FIELDS' in item.get('body_html', ''):
            item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 'h'}]
        # if the item has been marked as convert to HTML then we need to use the racing reformat macro
        # to convert it.
        if lines[0] and lines[0].find('HH ') != -1:
            racing_reformat_macro(item)
        return item
    except Exception as ex:
        logger.exception(ex)
def post_process_item(self, item, provider):
    """Categorise a racing/betting/AFL/finance item and set its genre.

    Mutates *item* in place based on slugline keywords, then truncates the
    slugline to the length defined on the 'auto_publish' validator schema.
    NOTE(review): the bare ``except:`` clauses guard strptime date parsing
    (ValueError expected) -- confirm nothing broader needs to escape.
    """
    try:
        # is it a horse or dog racing item
        if item.get(self.ITEM_SLUGLINE, '').find('Grey') != -1 or item.get(
                self.ITEM_SLUGLINE, '').find('Trot') != -1 or item.get(
                    self.ITEM_SLUGLINE, '').find('Gallop') != -1:
            # Don't look for the date in the TAB Dividends
            if item.get(self.ITEM_HEADLINE, '').find('TAB DIVS') == -1:
                try:
                    # headline carries the race date, e.g. 25/12/2020
                    raceday = datetime.strptime(
                        item.get(self.ITEM_HEADLINE, ''), '%d/%m/%Y')
                    item[
                        self.
                        ITEM_TAKE_KEY] = 'Fields ' + raceday.strftime('%A')
                except:
                    item[self.ITEM_TAKE_KEY] = 'Fields'
                # it's the dogs
                if item.get(self.ITEM_SLUGLINE, '').find('Grey') != -1:
                    item[self.ITEM_HEADLINE] = item.get(
                        self.ITEM_SLUGLINE) + 'hound ' + item.get(
                            self.ITEM_TAKE_KEY, '')
                    item[self.ITEM_SUBJECT] = [{
                        'qcode': '15082000',
                        'name': subject_codes['15082000']
                    }]
                if item.get(self.ITEM_SLUGLINE, '').find('Trot') != -1:
                    item[self.ITEM_HEADLINE] = item.get(
                        self.ITEM_SLUGLINE) + ' ' + item.get(
                            self.ITEM_TAKE_KEY, '')
                    item[self.ITEM_SUBJECT] = [{
                        'qcode': '15030003',
                        'name': subject_codes['15030003']
                    }]
                self._set_results_genre(item, self.racing_qcode)
            else:
                # Dividends
                if item.get(self.ITEM_HEADLINE, '').find('TAB DIVS') != -1:
                    # strip the trailing weekday name from the headline
                    item[self.ITEM_TAKE_KEY] = re.sub(
                        ' Monday$| Tuesday$| Wednesday$| Thursday$| Friday$',
                        '', item[self.ITEM_HEADLINE])
                    item[self.ITEM_HEADLINE] = '{} {}'.format(
                        item[self.ITEM_SLUGLINE], item[self.ITEM_HEADLINE])
                    if item.get(self.ITEM_SLUGLINE, '').find('Greyhound') != -1:
                        item[self.ITEM_SLUGLINE] = item.get(
                            self.ITEM_SLUGLINE, '').replace('Greyhound', 'Greys')
                        item[self.ITEM_SUBJECT] = [{
                            'qcode': '15082000',
                            'name': subject_codes['15082000']
                        }]
                    if item.get(self.ITEM_SLUGLINE, '').find('Trot') != -1:
                        item[self.ITEM_SUBJECT] = [{
                            'qcode': '15030003',
                            'name': subject_codes['15030003']
                        }]
                    if item.get(self.ITEM_SLUGLINE, '').find('Gallop') != -1:
                        item[self.ITEM_SUBJECT] = [{
                            'qcode': '15030001',
                            'name': subject_codes['15030001']
                        }]
                self._set_results_genre(item, self.sport_results_qcode)
            item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 'r'}]
        elif item.get(self.ITEM_SLUGLINE, '').find(' Betting') != -1:
            try:
                raceday = datetime.strptime(
                    item.get(self.ITEM_HEADLINE, ''), '%d/%m/%Y')
                item[self.ITEM_TAKE_KEY] = raceday.strftime('%A')
            except:
                pass
            item[self.ITEM_SLUGLINE] = item.get(self.ITEM_SLUGLINE, '').replace(
                ' Betting', ' Market')
            item[self.ITEM_HEADLINE] = '{} {}'.format(
                item[self.ITEM_SLUGLINE], item[self.ITEM_TAKE_KEY])
            item[self.ITEM_SUBJECT] = [{
                'qcode': '15030001',
                'name': subject_codes['15030001']
            }]
            item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 'r'}]
            self._set_results_genre(item, self.racing_qcode)
        elif item.get(self.ITEM_SLUGLINE, '').find('AFL') != -1:
            item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 't'}]
            item[self.ITEM_SUBJECT] = [{
                'qcode': '15084000',
                'name': subject_codes['15084000']
            }]
            self._set_results_genre(item, self.sport_results_qcode)
        else:
            # anything else from this source is treated as finance
            item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 'f'}]
            item[self.ITEM_SUBJECT] = [{
                'qcode': '04000000',
                'name': subject_codes['04000000']
            }]
            self._set_results_genre(item, self.finance_qcode)
        # truncate the slugline to the length defined in the validation schema
        lookup = {'act': 'auto_publish', 'type': CONTENT_TYPE.TEXT}
        validators = get_resource_service('validators').get(req=None, lookup=lookup)
        if validators.count():
            max_slugline_len = validators[0]['schema']['slugline'][
                'maxlength']
            if 'slugline' in item:
                item['slugline'] = item['slugline'][:max_slugline_len] \
                    if len(item['slugline']) > max_slugline_len else item['slugline']
        return item
    except Exception as ex:
        logger.exception(ex)
def handle_exception(exc):
    """Record *exc*, including its traceback, on the module logger.

    :param exc: the exception instance to log
    """
    logger.exception(exc)
def post_process_item(self, item, provider):
    """Categorise a racing/betting/AFL/finance item from slugline keywords.

    Mutates *item* in place, then truncates the slugline to the length
    defined on the 'auto_publish' validator schema.
    NOTE(review): the bare ``except:`` clauses guard strptime date parsing
    (ValueError expected) -- confirm nothing broader needs to escape.
    """
    try:
        # is it a horse or dog racing item
        if item.get(self.ITEM_SLUGLINE, '').find('Grey') != -1 or item.get(self.ITEM_SLUGLINE, '').find(
                'Trot') != -1 or item.get(self.ITEM_SLUGLINE, '').find('Gallop') != -1:
            # Don't look for the date in the TAB Dividends
            if item.get(self.ITEM_HEADLINE, '').find('TAB DIVS') == -1:
                try:
                    # headline carries the race date, e.g. 25/12/2020
                    raceday = datetime.strptime(item.get(self.ITEM_HEADLINE, ''), '%d/%m/%Y')
                    item[self.ITEM_TAKE_KEY] = 'Fields ' + raceday.strftime('%A')
                except:
                    item[self.ITEM_TAKE_KEY] = 'Fields'
                # it's the dogs
                if item.get(self.ITEM_SLUGLINE, '').find('Grey') != -1:
                    item[self.ITEM_HEADLINE] = item.get(self.ITEM_SLUGLINE) + 'hound ' + item.get(
                        self.ITEM_TAKE_KEY, '')
                    item[self.ITEM_SUBJECT] = [{'qcode': '15082000', 'name': subject_codes['15082000']}]
                if item.get(self.ITEM_SLUGLINE, '').find('Trot') != -1:
                    item[self.ITEM_HEADLINE] = item.get(self.ITEM_SLUGLINE) + ' ' + item.get(self.ITEM_TAKE_KEY, '')
                    item[self.ITEM_SUBJECT] = [{'qcode': '15030003', 'name': subject_codes['15030003']}]
            else:
                # Dividends
                if item.get(self.ITEM_HEADLINE, '').find('TAB DIVS') != -1:
                    # strip the trailing weekday name from the headline
                    item[self.ITEM_TAKE_KEY] = re.sub(' Monday$| Tuesday$| Wednesday$| Thursday$| Friday$', '',
                                                      item[self.ITEM_HEADLINE])
                    item[self.ITEM_HEADLINE] = '{} {}'.format(item[self.ITEM_SLUGLINE], item[self.ITEM_HEADLINE])
                    if item.get(self.ITEM_SLUGLINE, '').find('Greyhound') != -1:
                        item[self.ITEM_SLUGLINE] = item.get(self.ITEM_SLUGLINE, '').replace('Greyhound', 'Greys')
                        item[self.ITEM_SUBJECT] = [{'qcode': '15082000', 'name': subject_codes['15082000']}]
                    if item.get(self.ITEM_SLUGLINE, '').find('Trot') != -1:
                        item[self.ITEM_SUBJECT] = [{'qcode': '15030003', 'name': subject_codes['15030003']}]
                    if item.get(self.ITEM_SLUGLINE, '').find('Gallop') != -1:
                        item[self.ITEM_SUBJECT] = [{'qcode': '15030001', 'name': subject_codes['15030001']}]
            item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 'r'}]
            self._set_results_genre(item)
        elif item.get(self.ITEM_SLUGLINE, '').find(' Betting') != -1:
            try:
                raceday = datetime.strptime(item.get(self.ITEM_HEADLINE, ''), '%d/%m/%Y')
                item[self.ITEM_TAKE_KEY] = raceday.strftime('%A')
            except:
                pass
            item[self.ITEM_SLUGLINE] = item.get(self.ITEM_SLUGLINE, '').replace(' Betting', ' Market')
            item[self.ITEM_HEADLINE] = '{} {}'.format(item[self.ITEM_SLUGLINE], item[self.ITEM_TAKE_KEY])
            item[self.ITEM_SUBJECT] = [{'qcode': '15030001', 'name': subject_codes['15030001']}]
            item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 'r'}]
        elif item.get(self.ITEM_SLUGLINE, '').find('AFL') != -1:
            item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 't'}]
            item[self.ITEM_SUBJECT] = [{'qcode': '15084000', 'name': subject_codes['15084000']}]
            self._set_results_genre(item)
        else:
            # anything else from this source is treated as finance
            item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 'f'}]
            item[self.ITEM_SUBJECT] = [{'qcode': '04000000', 'name': subject_codes['04000000']}]
        # truncate the slugline to the length defined in the validation schema
        lookup = {'act': 'auto_publish', 'type': CONTENT_TYPE.TEXT}
        validators = get_resource_service('validators').get(req=None, lookup=lookup)
        if validators.count():
            max_slugline_len = validators[0]['schema']['slugline']['maxlength']
            if 'slugline' in item:
                item['slugline'] = item['slugline'][:max_slugline_len] \
                    if len(item['slugline']) > max_slugline_len else item['slugline']
        return item
    except Exception as ex:
        logger.exception(ex)
def _validate(self, doc, **kwargs):
    """Validate ``doc['validate']`` and translate cerberus errors to readable messages.

    Only the first matching validator is applied (the loop returns on its
    first iteration); the for/else fires when no validator matched the action.
    :return: list of error messages, empty when no validator was found.
    """
    use_headline = kwargs and 'headline' in kwargs
    validators = self._get_validators(doc)
    for validator in validators:
        validation_schema = self._get_validator_schema(validator)
        self._sanitize_fields(doc['validate'], validator)
        self._set_default_subject_scheme(doc['validate'])
        self._process_media(doc['validate'], validation_schema)
        self._process_sms(doc['validate'], validation_schema)
        v = SchemaValidator()
        v.allow_unknown = True
        try:
            v.validate(doc['validate'], validation_schema)
        except TypeError as e:
            logger.exception('Invalid validator schema value "%s" for ' % str(e))
        error_list = v.errors
        response = []
        for e in error_list:
            messages = []
            # Ignore dateline if item is corrected because it can't be changed after the item is published
            if doc.get('act', None) == 'correct' and e == 'dateline':
                continue
            elif doc.get('act', None) == 'kill' and doc['validate'].get('profile', None) and \
                    e in ('headline', 'abstract', 'body_html'):
                continue
            elif e == 'extra':
                # custom vocabulary fields report their errors nested under 'extra'
                for field in error_list[e]:
                    display_name = self._get_vocabulary_display_name(field)
                    if 'required' in error_list[e][field]:
                        messages.append(
                            REQUIRED_ERROR.format(display_name))
                    else:
                        messages.append('{} {}'.format(
                            display_name, error_list[e][field]))
            elif error_list[e] == 'required field' or type(error_list[e]) is dict or \
                    type(error_list[e]) is list:
                messages.append(REQUIRED_ERROR.format(e.upper()))
            elif 'min length is 1' == error_list[
                    e] or 'null value not allowed' in error_list[e]:
                messages.append(REQUIRED_ERROR.format(e.upper()))
            elif 'min length is' in error_list[e]:
                messages.append('{} is too short'.format(e.upper()))
            elif 'max length is' in error_list[e]:
                messages.append('{} is too long'.format(e.upper()))
            else:
                messages.append('{} {}'.format(e.upper(), error_list[e]))
            for message in messages:
                if use_headline:
                    headline = '{}: {}'.format(
                        doc['validate'].get('headline',
                                            doc['validate'].get('_id')),
                        message)
                    response.append(headline)
                else:
                    response.append(message)
        return response
    else:
        # NOTE(review): logger.warn is a deprecated alias of logger.warning
        logger.warn('validator was not found for {}'.format(doc['act']))
        return []
def process_timelines(self, items, failed_ids):
    """Generate and persist archive statistics for the given item timelines.

    New stat entries are batched into a single ``post``; existing ones are
    patched individually.  Items rewriting a previously published story get
    a ``time_to_next_update_publish`` computed on the original entry.

    :param items: dict of item_id -> {'item': ..., 'updates': ...}
    :param failed_ids: list mutated in place with the ids that failed
    """
    statistics_service = get_resource_service('archive_statistics')
    items_to_create = []
    rewrites = []
    for item_id, item in items.items():
        try:
            self.gen_stats_from_timeline(item)
        except Exception:
            logger.exception('Failed to generate stats for item {}'.format(item_id))
            failed_ids.append(item_id)
            continue
        if item['updates'].get('rewrite_of') and \
                (item['updates'].get('time_to_first_publish') or 0) > 0:
            rewrites.append(item_id)
        if not item['item'].get(config.ID_FIELD):
            # brand new stats entry - create later in one batch
            item['updates'][config.ID_FIELD] = item_id
            item['updates']['stats_type'] = 'archive'
            items_to_create.append(item['updates'])
        else:
            try:
                statistics_service.patch(item_id, item['updates'])
            except Exception:
                logger.exception('Failed to update stats for item {}. updates={}'.format(
                    item_id, item.get('updates')))
                failed_ids.append(item_id)
    if len(items_to_create) > 0:
        try:
            statistics_service.post(items_to_create)
        except Exception:
            item_ids = [item.get(config.ID_FIELD) for item in items_to_create]
            logger.exception('Failed to create stat entries for items {}'.format(', '.join(item_ids)))
            # fix: previously extended failed_ids with itself, duplicating the
            # already-failed ids instead of recording the failed creates
            failed_ids.extend(item_ids)
    for item_id in rewrites:
        item = items[item_id]
        updated_at = item['updates'].get('firstpublished')
        if not updated_at:
            logger.warning('Failed {}, updated_at not defined'.format(item_id))
            continue
        original_id = item['updates'].get('rewrite_of')
        if not original_id:
            logger.warning('Failed {}, original_id not defined'.format(item_id))
            continue
        original = statistics_service.find_one(req=None, _id=original_id)
        if not original:
            logger.warning('Failed {}, original not found'.format(item_id))
            continue
        published_at = original.get('firstpublished')
        if not published_at:
            logger.warning('Failed {}, published_at not defined'.format(original_id))
            continue
        statistics_service.patch(
            original_id,
            {'time_to_next_update_publish': (updated_at - published_at).total_seconds()})
def post_process_item(self, item, provider):
    """Derive headline/slugline/take key and classification for a Pagemasters racing item.

    Inspects the first few lines of ``body_html`` for routing markers ('YY '/'HH ')
    and known header patterns, strips the consumed header lines from the body,
    and sets ANPA category, subject and genre.

    NOTE(review): the branch logic depends on the exact layout of the incoming
    copy (line indexes 0-2 of the raw body) — confirm against sample feeds.

    :param dict item: ingested item, modified in place
    :param provider: ingest provider (unused in this method)
    :return dict: the modified item, or None if an unexpected error was logged
    """
    try:
        lines_to_remove = 1
        # Pagemasters sourced content is Greyhound or Trot related, maybe AFL otherwise financial
        # It is from the Racing system
        item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 'r'}]
        item[self.ITEM_SUBJECT] = [{'qcode': '15030001', 'name': subject_codes['15030001']}]
        lines = item['body_html'].split('\n')
        # If the content is to be routed/auto published
        if lines[0].upper().find('YY ') != -1 or lines[0].upper().find('HH ') != -1:
            # the remaining tokens of the first line are routing destinations,
            # accumulated as keywords
            destinations = lines[0].split(' ')
            for dest in destinations[1:]:
                if (item.get('keywords')):
                    item.get('keywords', []).append(dest)
                else:
                    item['keywords'] = [dest]
            if lines[2] and lines[2].find(':SPORT -') != -1:
                item[self.ITEM_HEADLINE] = lines[2][9:]
                if lines[1] and lines[1].find(':POTTED :') != -1:
                    item[self.ITEM_SLUGLINE] = lines[1][9:]
                lines_to_remove = 3
            elif lines[1] and lines[1].find('RACING : ') != -1:
                item[self.ITEM_HEADLINE] = lines[1][8:]
                item[self.ITEM_SLUGLINE] = lines[1][8:]
                lines_to_remove = 2
            elif lines[1] and lines[1].find(':POTTED :') != -1:
                item[self.ITEM_HEADLINE] = lines[1][9:]
                item[self.ITEM_SLUGLINE] = lines[1][9:]
                lines_to_remove = 2
            elif lines[1] and lines[1].find(':Premierships') != -1:
                item[self.ITEM_HEADLINE] = lines[1][1:]
                item[self.ITEM_SLUGLINE] = item[self.ITEM_HEADLINE]
                # the overflow of the slugline is dumped in the take key
                item[self.ITEM_TAKE_KEY] = item.get(self.ITEM_SLUGLINE)[21:]
                item[self.ITEM_SLUGLINE] = item[self.ITEM_SLUGLINE][:21]
                lines_to_remove = 2
            elif lines[1] and lines[1].find(' WEIGHTS ') != -1:
                self._scan_lines(item, lines)
            # fallback for routed items matching none of the header patterns
            elif lines[0] and lines[0].find('YY ') != -1 or lines[0].find('HH ') != -1:
                item[self.ITEM_HEADLINE] = lines[1]
                item[self.ITEM_SLUGLINE] = lines[1]
                if lines[1].find(' Comment ') != -1:
                    # need to split the line on the word Comment
                    item[self.ITEM_SLUGLINE] = lines[1][:lines[1].find('Comment')] + 'Comment'
                    item[self.ITEM_TAKE_KEY] = lines[1][lines[1].find('Comment') + 8:]
                    item[self.ITEM_HEADLINE] = \
                        lines[1][:lines[1].find('Comment')] + 'Gallop Comment ' + item[
                            self.ITEM_TAKE_KEY]
                lines_to_remove = 2
        else:
            self._scan_lines(item, lines)
        # drop the consumed header lines; <pre> preserves the racing copy layout
        item['body_html'] = '<pre>' + '\n'.join(lines[lines_to_remove:])
        # if the concatenation of the slugline and take key contain the phrase 'Brief Form' change the category to
        # h
        if (item.get(self.ITEM_SLUGLINE, '') + item.get(self.ITEM_TAKE_KEY, '')).lower().find('brief form') >= 0:
            item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 'h'}]
        # Another exception
        if 'NZ/AUST FIELDS' in item.get('body_html', ''):
            item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 'h'}]
        # if the item has been marked as convert to HTML then we need to use the racing reformat macro
        # to convert it.
        if lines[0] and lines[0].find('HH ') != -1:
            racing_reformat_macro(item)
        genre_map = get_resource_service('vocabularies').find_one(req=None, _id='genre')
        if genre_map:
            item['genre'] = [x for x in genre_map.get('items', [])
                             if x['qcode'] == 'Racing Data' and x['is_active']]
        return item
    except Exception as ex:
        # best-effort parser: failures are logged and the item is dropped (returns None)
        logger.exception(ex)
def gen_stats_from_timeline(self, item):
    """Recalculate an item's stats by folding over its history timeline.

    Sorts the timeline entries chronologically, walks each one updating
    metadata, desk-transition and featuremedia stats, records first-created /
    first-published timestamps and the resulting ``time_to_first_publish``,
    then stores the cleaned timeline back onto ``updates['stats']``.

    :param dict item: wrapper whose ``item['updates']['stats']`` holds the
        timeline; modified in place
    :raises Exception: re-raises any failure while sorting the timeline
    """
    item.setdefault('updates', {})
    updates = item['updates']
    updates.setdefault('stats', {})
    stats = updates['stats']
    # nothing to do without at least one timeline entry
    if len(stats.get(STAT_TYPE.TIMELINE) or []) < 1:
        return
    new_timeline = []
    desk_transitions.init(stats)
    featuremedia_updates.init(stats)
    try:
        # chronological order; history_id breaks ties between equal timestamps
        entries = sorted(stats[STAT_TYPE.TIMELINE],
                         key=lambda k: (k['operation_created'], k['history_id']))
    except Exception as e:
        logger.exception('Failed to sort timeline {}'.format(
            stats[STAT_TYPE.TIMELINE]))
        raise e
    # If the first history item has original_item_id attribute,
    # then this item is a duplicate of another item
    updates['_duplicate'] = entries[0].get('original_item_id')
    # Default the paragraph count to 0
    # We'll update this count while processing the timeline
    updates['par_count'] = 0
    for entry in entries:
        entry.setdefault('update', {})
        self.set_metadata_updates(item, entry)
        self.set_timeline_entry_task_details(entry, updates)
        if self.skip_timeline_entry(entry, updates):
            continue
        # Remove the update attribute before adding to the timeline
        update = entry.get('update') or {}
        self._store_update_fields(entry)
        # Update the paragraph count from this history entry
        self.update_par_count_from_timeline_entry(entry, updates, update)
        new_timeline.append(entry)
        # Use a copy of entry after adding to the timeline
        # So that any changes from here do not modify the existing timeline entry
        entry = deepcopy(entry)
        operation = entry.get('operation')
        operation_created = entry.get('operation_created')
        if operation == OPERATION.PUBLISH:
            updates['_published'] = True
            if not updates.get('firstpublished'):
                updates['firstpublished'] = operation_created
        elif operation in [OPERATION.CREATE, OPERATION.FETCH] and \
                not updates.get('firstcreated'):
            updates['firstcreated'] = operation_created
        desk_transitions.process(entry, new_timeline, updates, update, stats)
        featuremedia_updates.process(entry, new_timeline, updates, update, stats)
    desk_transitions.complete(stats, updates)
    featuremedia_updates.complete(stats, updates)
    if updates.get('firstpublished') and updates.get('firstcreated'):
        updates['time_to_first_publish'] = (
            updates['firstpublished'] - updates['firstcreated']).total_seconds()

    # strip the temporary processing marker before persisting the timeline
    def _remove_tmp_fields(entry):
        entry.pop('_processed', None)
        return entry

    stats[STAT_TYPE.TIMELINE] = [
        _remove_tmp_fields(entry) for entry in new_timeline
    ]
    # internal keys (leading underscore) are working state only - not persisted
    for key in list(updates.keys()):
        if key.startswith('_'):
            updates.pop(key)
def post_process_item(self, item, provider):
    """Classify a racing/results item and derive its headline and take key.

    Horse (Gallop), harness (Trot) and greyhound items get racing subject
    codes and ANPA category 'r'; AFL items get category 't'; anything else is
    treated as finance.

    :param dict item: ingested item, modified in place
    :param provider: ingest provider (unused in this method)
    :return dict: the modified item, or None if an unexpected error was logged
    """
    try:
        # is it a horse or dog racing item
        if item.get(self.ITEM_SLUGLINE, '').find('Grey') != -1 or \
                item.get(self.ITEM_SLUGLINE, '').find('Trot') != -1 or \
                item.get(self.ITEM_SLUGLINE, '').find('Gallop') != -1:
            # Don't look for the date in the TAB Dividends
            if item.get(self.ITEM_HEADLINE, '').find('TAB DIVS') == -1:
                try:
                    raceday = datetime.strptime(
                        item.get(self.ITEM_HEADLINE, ''), '%d/%m/%Y')
                    item[self.ITEM_TAKE_KEY] = 'Fields ' + raceday.strftime('%A')
                except ValueError:
                    # BUGFIX: narrowed from a bare `except:`; strptime raises
                    # ValueError when the headline is not a dd/mm/yyyy date
                    item[self.ITEM_TAKE_KEY] = 'Fields'
                # it's the dogs
                if item.get(self.ITEM_SLUGLINE, '').find('Grey') != -1:
                    item[self.ITEM_HEADLINE] = item.get(
                        self.ITEM_SLUGLINE) + 'hound ' + item.get(
                            self.ITEM_TAKE_KEY, '')
                    item[self.ITEM_SUBJECT] = [{
                        'qcode': '15082000',
                        'name': subject_codes['15082000']
                    }]
                if item.get(self.ITEM_SLUGLINE, '').find('Trot') != -1:
                    item[self.ITEM_HEADLINE] = item.get(
                        self.ITEM_SLUGLINE) + ' ' + item.get(
                            self.ITEM_TAKE_KEY, '')
                    item[self.ITEM_SUBJECT] = [{
                        'qcode': '15030003',
                        'name': subject_codes['15030003']
                    }]
            else:
                # Dividends
                if item.get(self.ITEM_HEADLINE, '').find('TAB DIVS') != -1:
                    # move the headline (minus the trailing weekday) into the take key
                    item[self.ITEM_TAKE_KEY] = re.sub(
                        ' Monday$| Tuesday$| Wednesday$| Thursday$| Friday$',
                        '', item[self.ITEM_HEADLINE])
                    item[self.ITEM_HEADLINE] = '{} {}'.format(
                        item[self.ITEM_SLUGLINE], item[self.ITEM_HEADLINE])
                if item.get(self.ITEM_SLUGLINE, '').find('Greyhound') != -1:
                    item[self.ITEM_SLUGLINE] = item.get(
                        self.ITEM_SLUGLINE, '').replace('Greyhound', 'Greys')
                    item[self.ITEM_SUBJECT] = [{
                        'qcode': '15082000',
                        'name': subject_codes['15082000']
                    }]
                if item.get(self.ITEM_SLUGLINE, '').find('Trot') != -1:
                    item[self.ITEM_SUBJECT] = [{
                        'qcode': '15030003',
                        'name': subject_codes['15030003']
                    }]
                if item.get(self.ITEM_SLUGLINE, '').find('Gallop') != -1:
                    item[self.ITEM_SUBJECT] = [{
                        'qcode': '15030001',
                        'name': subject_codes['15030001']
                    }]
            item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 'r'}]
            self._set_results_genre(item)
        elif item.get(self.ITEM_SLUGLINE, '').find('AFL') != -1:
            item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 't'}]
            item[self.ITEM_SUBJECT] = [{
                'qcode': '15084000',
                'name': subject_codes['15084000']
            }]
            self._set_results_genre(item)
        else:
            # neither racing nor AFL - treat as finance
            item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 'f'}]
            item[self.ITEM_SUBJECT] = [{
                'qcode': '04000000',
                'name': subject_codes['04000000']
            }]
        return item
    except Exception as ex:
        # best-effort parser: log and drop (returns None) rather than break ingest
        logger.exception(ex)