def set_dateline(item, city, source, set_date=False, text=None):
    """Set the dateline for item.

    Looks the city up via the app locators; when no match is found a minimal
    UTC-based located entry is synthesised from the city name.

    :param dict item: item to set the dateline on (mutated in place)
    :param str city: dateline city used for the locator lookup
    :param str source: source name written into the dateline
    :param bool set_date: when True also set dateline['date'] from item['firstcreated']
    :param str text: optional pre-formatted dateline text; when falsy the text
        is generated with format_dateline_to_locmmmddsrc
    """
    # Nothing to do without a city.
    if not city:
        return
    cities = app.locators.find_cities()
    # Case-insensitive match against the known cities.
    located = [c for c in cities if c['city'].lower() == city.lower()]
    item.setdefault('dateline', {})
    # Fall back to a synthetic UTC entry when the city is unknown.
    item['dateline']['located'] = located[0] if len(located) > 0 else {
        'city_code': city, 'city': city, 'tz': 'UTC', 'dateline': 'city'
    }
    if set_date:
        # Express the date in the located city's timezone.
        item['dateline']['date'] = datetime.fromtimestamp(
            get_date(item['firstcreated']).timestamp(),
            tz=timezone(item['dateline']['located']['tz']))
    item['dateline']['source'] = source
    if text:
        item['dateline']['text'] = text
    else:
        item['dateline']['text'] = format_dateline_to_locmmmddsrc(
            item['dateline']['located'], get_date(item['firstcreated']),
            source=source)
def run(self, url, start_date=None, end_date=None, page_size=None):
    """Run the export command.

    :param str url: endpoint/target to export to
    :param start_date: optional start of the window (anything get_date parses)
    :param end_date: optional end of the window; NOTE(review): only read when
        start_date is also given — an end_date alone is silently ignored, confirm intended
    :param page_size: optional page size, coerced to int
    """
    try:
        if start_date:
            self.default_start_date = get_date(start_date)
            self.default_end_date = get_date(end_date)
        if page_size:
            self.default_page_size = int(page_size)
        self.export(url)
    except Exception:
        # was a bare except: narrowed so KeyboardInterrupt/SystemExit still propagate
        logger.exception('Failed to run the command.')
def _fill_dates(self, document, item):
    """Populate item['dates'] with UTC start/end times parsed from the document.

    Times in the source document are treated as local to Europe/Oslo and
    converted to UTC via local_to_utc.

    :param document: parsed XML document (anything supporting .find(tag))
    :param dict item: item to fill (mutated in place)
    """
    tz = 'Europe/Oslo'
    item['dates'] = {'tz': tz}
    # First tag that is present wins for the start time.
    for tag in ('startDate', 'timeStart'):
        _time = document.find(tag)
        if _time is not None:
            item['dates']['start'] = local_to_utc(tz, get_date(_time.text))
            break
    # First tag that is present wins for the end time.
    for tag in ('stopDate', 'timeEnd'):
        _time = document.find(tag)
        if _time is not None:
            item['dates']['end'] = local_to_utc(tz, get_date(_time.text))
            break
def ap_derive_dateline(self, item):
    """This function looks for a dateline in the article body and uses that.

    Scans body paragraphs for the ' (AP) _ ' marker; the text before it is
    treated as the dateline city.

    :param item: ingest item (mutated in place)
    :return: item populated with a dateline, or None when parsing is aborted
    """
    try:
        html = item.get('body_html')
        if html:
            soup = BeautifulSoup(html, "html.parser")
            pars = soup.findAll('p')
            for par in pars:
                city, source, the_rest = par.get_text().partition(' (AP) _ ')
                if source:
                    # sometimes the city is followed by a comma and either a date or a state
                    city = city.split(',')[0]
                    # digits in the "city" mean we mis-parsed; abort without a dateline
                    if any(char.isdigit() for char in city):
                        return
                    cities = app.locators.find_cities()
                    located = [c for c in cities if c['city'].lower() == city.lower()]
                    item.setdefault('dateline', {})
                    # unknown city: synthesise a minimal UTC entry
                    item['dateline']['located'] = located[0] if len(located) > 0 else {
                        'city_code': city, 'city': city, 'tz': 'UTC', 'dateline': 'city'}
                    item['dateline']['source'] = item.get('original_source', 'AP')
                    item['dateline']['text'] = format_dateline_to_locmmmddsrc(
                        item['dateline']['located'], get_date(item['firstcreated']),
                        source=item.get('original_source', 'AP'))
                    break
        return item
    except Exception:
        # was a bare except: narrowed so interrupts/exits are not swallowed
        logging.exception('AP dateline extraction exception')
def set_default_source(doc):
    """Set the source for the item.

    If a desk level source is specified then use that source, otherwise fall
    back to the global default source.

    :param {dict} doc: doc where source is defined
    """
    # Start from the global default and let the desk override it.
    source = get_default_source()
    desk_id = doc.get('task', {}).get('desk')
    if desk_id:
        desk = get_resource_service('desks').find_one(req=None, _id=desk_id)
        source = desk.get('source') or source
    doc['source'] = source
    dateline = doc.get('dateline')
    if not dateline:
        return
    dateline['source'] = source
    # Regenerate the dateline text only when both located and date are present.
    if not (dateline.get('located') and dateline.get('date')):
        return
    # The date may arrive serialized as a string; normalise it first.
    if isinstance(dateline.get('date'), str):
        dateline['date'] = get_date(dateline.get('date'))
    dateline['text'] = format_dateline_to_locmmmddsrc(
        dateline.get('located'), dateline.get('date'), source)
def derive_dateline(self, item):
    """
    Attempt to derive a dateline using the place, only if there is exactly one
    match on the city can we be sure we have the correct country.

    :param item:
    :return:
    """
    try:
        # Only attempt when there is exactly one place; more is ambiguous.
        if len(item.get('place', [])) == 1:
            cities = app.locators.find_cities()
            city = item.get('place', '')[0].get('name', '')
            if city:
                located = [c for c in cities if c['city'].lower() == city.lower()]
                # A single locator match also pins down the country.
                if len(located) == 1:
                    item.setdefault('dateline', {})
                    item['dateline']['located'] = located[0]
                    item['dateline']['source'] = item.get('original_source', 'EFE')
                    item['dateline']['text'] = format_dateline_to_locmmmddsrc(item['dateline']['located'],
                                                                              get_date(item['firstcreated']),
                                                                              source=item.get('original_source',
                                                                                              'EFE'))
    except Exception as ex:
        logging.exception('EFE dateline extraction exception {}'.format(ex))
    finally:
        # Always drop 'place'; default avoids KeyError when it is absent.
        item.pop('place', None)
def set_default_source(doc): """Set the source for the item. If desk level source is specified then use that source else default from global settings. :param {dict} doc: doc where source is defined """ # set the source for the article as default source = get_default_source() desk_id = doc.get('task', {}).get('desk') if desk_id: # if desk level source is specified then use that instead of the default source desk = get_resource_service('desks').find_one(req=None, _id=desk_id) source = desk.get('source') or source doc['source'] = source if not doc.get('dateline'): return doc['dateline']['source'] = source if not (doc['dateline'].get('located') and doc['dateline'].get('date')): return if isinstance(doc['dateline'].get('date'), str): doc['dateline']['date'] = get_date(doc['dateline'].get('date')) doc['dateline']['text'] = format_dateline_to_locmmmddsrc(doc['dateline'].get('located'), doc['dateline'].get('date'), source)
def update_to_pass_validation(item, **kwargs):
    """
    This is a test macro that does what is required to ensure that a text item will pass publication validation.
    It is intended to be used to test auto publishing, that is publishing directly from ingest.

    At the moment virtually all content received from Reuters fails validation.

    :param item:
    :param kwargs:
    :return:
    """
    try:
        lookup = {'act': ITEM_PUBLISH, 'type': CONTENT_TYPE.TEXT}
        validators = superdesk.get_resource_service('validators').get(req=None, lookup=lookup)
        if validators.count():
            # Truncate slugline/headline to the maxima the publish validator allows.
            max_slugline_len = validators[0]['schema']['slugline']['maxlength']
            max_headline_len = validators[0]['schema']['headline']['maxlength']
            item['slugline'] = item['slugline'][:max_slugline_len] \
                if len(item['slugline']) > max_slugline_len else item['slugline']
            item['headline'] = item['headline'][:max_headline_len] \
                if len(item['headline']) > max_headline_len else item['headline']
            if 'dateline' not in item:
                # Default a missing dateline to Sydney.
                cities = app.locators.find_cities(country_code='AU', state_code='NSW')
                located = [c for c in cities if c['city'].lower() == 'sydney']
                if located:
                    item['dateline'] = {'date': item['firstcreated'], 'located': located[0]}
                    item['dateline']['source'] = item['source']
                    item['dateline']['text'] = format_dateline_to_locmmmddsrc(located[0],
                                                                              get_date(item['firstcreated']),
                                                                              source=item['source'])
        return item
    except Exception:
        # was a bare except: narrowed so interrupts/exits are not swallowed
        logging.exception('Test update to pass validation macro exception')
def format_datetime_filter(date_or_string, timezone_string=None, date_format=None):
    """Convert date or string to another timezone

    :param str date_or_string:
    :param str timezone_string:
    :param str date_format:
    :return str: returns string representation of the date format
    """
    try:
        date_time = get_date(date_or_string)
        # Fall back to the configured default timezone when none is given.
        timezone_string = timezone_string or config.DEFAULT_TIMEZONE
        tz = timezone(timezone_string)
        if tz:
            date_time = date_time.astimezone(tz)
        return date_time.strftime(date_format) if date_format else str(date_time)
    except Exception:
        logger.warning('Failed to convert datetime. Arguments: Date - {} Timezone - {} format - {}.'.format(
            date_or_string, timezone_string, date_format
        ))
        return ''
def format_datetime_filter(date_or_string, timezone_string=None, date_format=None):
    """
    Convert date or string to another timezone

    :param str date_or_string:
    :param str timezone_string:
    :param str date_format:
    :return str: returns string representation of the date format
    """
    try:
        date_time = get_date(date_or_string)
        # Fall back to the configured default timezone when none is given.
        timezone_string = timezone_string if timezone_string else config.DEFAULT_TIMEZONE
        tz = timezone(timezone_string)
        if tz:
            date_time = date_time.astimezone(tz)
        if date_format:
            return date_time.strftime(date_format)
        else:
            return str(date_time)
    except Exception:
        # was a bare except: narrowed so interrupts/exits are not swallowed
        logger.warning(
            'Failed to convert datetime. Arguments: Date - {} Timezone - {} format - {}.'
            .format(date_or_string, timezone_string, date_format))
        return ''
def noise11_derive_metadata(item, **kwargs):
    """
    By definition anything from NOISE11 will be entertainment so set the category, subject and dateline
    appropriately

    :param item:
    :param kwargs:
    :return:
    """
    try:
        if "anpa_category" not in item:
            category_map = superdesk.get_resource_service("vocabularies").find_one(req=None, _id="categories")
            if category_map:
                # Look up the active entertainment ('e') category entry.
                map_entry = next(
                    (code for code in category_map["items"] if code["qcode"] == "e" and code["is_active"]), None
                )
                # NOTE(review): map_entry may be None when no active 'e' entry
                # exists; the resulting TypeError is swallowed by the except
                # below — confirm this is acceptable.
                item["anpa_category"] = [{"qcode": "e", "name": map_entry["name"]}]
        if "subject" not in item:
            # 01000000 = arts/culture/entertainment subject code.
            qcode = "01000000"
            item["subject"] = [{"qcode": qcode, "name": subject_codes[qcode]}]
        cities = find_cities(country_code="AU", state_code="NSW")
        located = [c for c in cities if c["city"].lower() == "sydney"]
        # NOISE11 content is always datelined Sydney.
        if located and "dateline" not in item:
            item["dateline"] = {"date": item["firstcreated"], "located": located[0]}
            item["dateline"]["source"] = item["source"]
            item["dateline"]["text"] = format_dateline_to_locmmmddsrc(
                located[0], get_date(item["firstcreated"]), source=item["source"]
            )
        return item
    except Exception as ex:
        logger.exception(ex)
def derive_dateline(self, item):
    """
    Attempt to derive a dateline using the place, only if there is exactly
    one match on the city can we be sure we have the correct country.

    :param item:
    :return:
    """
    try:
        # Only attempt when there is exactly one place; more is ambiguous.
        if len(item.get("place", [])) == 1:
            cities = app.locators.find_cities()
            city = item.get("place", "")[0].get("name", "")
            if city:
                located = [
                    c for c in cities if c["city"].lower() == city.lower()
                ]
                # A single locator match also pins down the country.
                if len(located) == 1:
                    item.setdefault("dateline", {})
                    item["dateline"]["located"] = located[0]
                    item["dateline"]["source"] = item.get(
                        "original_source", "EFE")
                    item["dateline"][
                        "text"] = format_dateline_to_locmmmddsrc(
                        item["dateline"]["located"],
                        get_date(item["firstcreated"]),
                        source=item.get("original_source", "EFE"),
                    )
    except Exception as ex:
        logging.exception(
            "EFE dateline extraction exception {}".format(ex))
    finally:
        # Always drop 'place'; default avoids KeyError when it is absent.
        item.pop("place", None)
def derive_dateline(self, item):
    """
    Attempt to derive a dateline using the place, only if there is exactly one
    match on the city can we be sure we have the correct country.

    :param item:
    :return:
    """
    try:
        # Only attempt when there is exactly one place; more is ambiguous.
        if len(item.get('place', [])) == 1:
            cities = app.locators.find_cities()
            city = item.get('place', '')[0].get('name', '')
            located = [
                c for c in cities if c['city'].lower() == city.lower()
            ]
            # A single locator match also pins down the country.
            if len(located) == 1:
                item.setdefault('dateline', {})
                item['dateline']['located'] = located[0]
                item['dateline']['source'] = item.get(
                    'original_source', 'EFE')
                item['dateline']['text'] = format_dateline_to_locmmmddsrc(
                    item['dateline']['located'],
                    get_date(item['firstcreated']),
                    source=item.get('original_source', 'EFE'))
    except Exception:
        # was a bare except: narrowed so interrupts/exits are not swallowed
        logging.exception('EFE dateline extraction exception')
    finally:
        # Always drop 'place' (matches the sibling implementations); the
        # default prevents the KeyError the old item.pop('place') could raise.
        item.pop('place', None)
def test_utcnow(self):
    """utcnow() returns a datetime that matches the current UTC time field by field."""
    self.assertIsInstance(utcnow(), datetime)
    reference = get_date(datetime.now(tz=utc))
    value = utcnow()
    # Compare down to the second; sub-second drift between the two calls is expected.
    for field in ('year', 'month', 'day', 'hour', 'minute', 'second'):
        self.assertEqual(getattr(reference, field), getattr(value, field))
def set_dateline(item, city, source, set_date=False, text=None):
    """Set the dateline for item"""
    # Without a city there is nothing to derive.
    if not city:
        return
    matches = [c for c in app.locators.find_cities() if c['city'].lower() == city.lower()]
    dateline = item.setdefault('dateline', {})
    if matches:
        dateline['located'] = matches[0]
    else:
        # Unknown city: synthesise a minimal UTC-based located entry.
        dateline['located'] = {'city_code': city, 'city': city, 'tz': 'UTC', 'dateline': 'city'}
    if set_date:
        # Express the date in the located city's timezone.
        dateline['date'] = datetime.fromtimestamp(
            get_date(item['firstcreated']).timestamp(),
            tz=timezone(dateline['located']['tz']))
    dateline['source'] = source
    # Use the supplied text verbatim, otherwise generate the standard form.
    dateline['text'] = text if text else format_dateline_to_locmmmddsrc(
        dateline['located'], get_date(item['firstcreated']), source=source)
def reuters_derive_dateline(item, **kwargs):
    """
    It seems that most locations injected into the item by the parser are Bangalore,
    so this function looks for a dateline in the article body and uses that.

    :param item: ingest item (mutated in place)
    :param kwargs: unused, macro signature requirement
    :return: the item, or None when parsing is aborted
    """
    try:
        html = item.get('body_html')
        if html:
            soup = BeautifulSoup(html, "html.parser")
            pars = soup.findAll('p')
            if len(pars) >= 2:
                # Skip a leading byline paragraph if present.
                if BYLINE in item and item.get(BYLINE) in pars[0].get_text():
                    first = pars[1].get_text()
                else:
                    first = pars[0].get_text()
                city, source, the_rest = first.partition(' (Reuters) - ')
                if source:
                    # sometimes the city is followed by a comma and either a date or a state
                    city = city.split(',')[0]
                    if any(char.isdigit() for char in city):
                        return
                    cities = app.locators.find_cities()
                    located = [
                        c for c in cities if c['city'].lower() == city.lower()
                    ]
                    # if not dateline we create one
                    if 'dateline' not in item:
                        item['dateline'] = {}
                    # there is already a dateline that is not Bangalore don't do anything just return
                    elif 'located' in item['dateline'] and 'BANGALORE' != item[
                            'dateline']['located'].get('city').upper():
                        return
                    item['dateline']['located'] = located[0] if len(located) > 0 else {
                        'city_code': city, 'city': city, 'tz': 'UTC', 'dateline': 'city'}
                    item['dateline']['source'] = item.get('original_source', 'Reuters')
                    item['dateline']['text'] = format_dateline_to_locmmmddsrc(
                        item['dateline']['located'], get_date(item['firstcreated']),
                        source=item.get('original_source', 'Reuters'))
        return item
    except Exception:
        # was a bare except: narrowed so interrupts/exits are not swallowed
        logging.exception('Reuters dateline macro exception')
def reuters_derive_dateline(item, **kwargs):
    """
    It seems that most locations injected into the item by the parser are Bangalore,
    so this function looks for a dateline in the article body and uses that.

    :param item: ingest item (mutated in place)
    :param kwargs: unused, macro signature requirement
    :return: the item, or None when parsing is aborted
    """
    try:
        html = item.get("body_html")
        if html:
            soup = BeautifulSoup(html, "html.parser")
            pars = soup.findAll("p")
            if len(pars) >= 2:
                # Skip a leading byline paragraph if present.
                if BYLINE in item and item.get(BYLINE) in pars[0].get_text():
                    first = pars[1].get_text()
                else:
                    first = pars[0].get_text()
                city, source, the_rest = first.partition(" (Reuters) - ")
                if source:
                    # sometimes the city is followed by a comma and either a date or a state
                    city = city.split(",")[0]
                    if any(char.isdigit() for char in city):
                        return
                    cities = find_cities()
                    located = [c for c in cities if c["city"].lower() == city.lower()]
                    # if not dateline we create one
                    if "dateline" not in item:
                        item["dateline"] = {}
                    # there is already a dateline that is not Bangalore don't do anything just return
                    elif (
                        "located" in item["dateline"]
                        and "BANGALORE" != item["dateline"]["located"].get("city").upper()
                    ):
                        return
                    item["dateline"]["located"] = (
                        located[0]
                        if len(located) > 0
                        else {"city_code": city, "city": city, "tz": "UTC", "dateline": "city"}
                    )
                    item["dateline"]["source"] = item.get("original_source", "Reuters")
                    item["dateline"]["text"] = format_dateline_to_locmmmddsrc(
                        item["dateline"]["located"],
                        get_date(item["firstcreated"]),
                        source=item.get("original_source", "Reuters"),
                    )
        return item
    except Exception:
        # was a bare except: narrowed so interrupts/exits are not swallowed
        logging.exception("Reuters dateline macro exception")
def set_if_not_running(pipe):
    """Claim the running key unless another run is still within its schedule.

    Presumably ``pipe`` is a redis pipeline/client — it only needs get/set.
    Reads ``key``, ``now``, ``update_schedule``, ``name`` and ``id`` from the
    enclosing scope.

    :param pipe: object supporting get(key) / set(key, value)
    :return bool: True when the key was (re)claimed, False when already running
    """
    last_updated = pipe.get(key)
    if last_updated:
        last_updated = get_date(str(last_updated))
        delta = last_updated + update_schedule
        if delta < now:
            # Stale lock: the previous run exceeded its schedule, take over.
            # logger.warn() is deprecated; use warning().
            logger.warning('Overwriting running key for {}:{}'.format(name, id))
            pipe.set(key, date_to_str(now))
            return True
        else:
            logger.warning('Task {}:{} is already running. last_updated={}'.format(name, id, last_updated))
            return False
    else:
        pipe.set(key, date_to_str(now))
        return True
def set_dateline(updates, original):
    """Set the dateline for the item.

    :param {dict} updates: Updates related to the doc
    :param {dict} original: Original document.
    """
    # Nothing to do unless the updates carry a dateline with both located and date.
    if not ((updates.get('dateline') or {}).get('located') and (updates.get('dateline') or {}).get('date')):
        return
    # Prefer the updated source, then the original's, then the global default.
    source = updates.get('source', original.get('source')) or get_default_source()
    updates['dateline']['source'] = source
    # The date may be serialized as a string; normalise to a datetime first.
    if isinstance(updates['dateline'].get('date'), str):
        updates['dateline']['date'] = get_date(updates['dateline'].get('date'))
    updates['dateline']['text'] = format_dateline_to_locmmmddsrc(updates['dateline'].get('located'),
                                                                 updates['dateline'].get('date'),
                                                                 source)
def set_if_not_running(pipe):
    """Claim the ingest-update key unless another update is still within schedule.

    Presumably ``pipe`` is a redis pipeline/client — it only needs get/set.
    Reads ``key``, ``now``, ``update_schedule`` and ``provider`` from the
    enclosing scope.

    :param pipe: object supporting get(key) / set(key, value)
    :return bool: True when the key was (re)claimed, False when already running
    """
    last_updated = pipe.get(key)
    if last_updated:
        last_updated = get_date(str(last_updated))
        delta = last_updated + update_schedule
        if delta < now:
            # Stale lock: the previous update overran its schedule, take over.
            # logger.warn() is deprecated; use warning().
            logger.warning('Overwritting running key for provider {0}'.format(provider[superdesk.config.ID_FIELD]))
            pipe.set(key, date_to_str(now))
            return True
        else:
            logger.warning('Update ingest already running for provider {0}, last_updated={1}'.
                           format(provider[superdesk.config.ID_FIELD], last_updated))
            return False
    else:
        pipe.set(key, date_to_str(now))
        return True
def reuters_derive_dateline(item, **kwargs):
    """
    It seems that most locations injected into the item by the parser are Bangalore,
    so this function looks for a dateline in the article body and uses that.

    :param item: ingest item (mutated in place)
    :param kwargs: unused, macro signature requirement
    :return: the item, or None when parsing is aborted
    """
    try:
        html = item.get('body_html')
        if html:
            parsed = parse_html(html, content='xml')
            pars = parsed.xpath('//p')
            if len(pars) >= 2:
                # Skip a leading byline paragraph if present.
                if BYLINE in item and item.get(BYLINE) in ''.join(pars[0].itertext()):
                    first = ''.join(pars[1].itertext())
                else:
                    first = ''.join(pars[0].itertext())
                city, source, the_rest = first.partition(' (Reuters) - ')
                if source:
                    # sometimes the city is followed by a comma and either a date or a state
                    city = city.split(',')[0]
                    if any(char.isdigit() for char in city):
                        return
                    cities = app.locators.find_cities()
                    located = [c for c in cities if c['city'].lower() == city.lower()]
                    # if not dateline we create one
                    if 'dateline' not in item:
                        item['dateline'] = {}
                    # there is already a dateline that is not Bangalore don't do anything just return
                    elif 'located' in item['dateline'] and 'BANGALORE' != item['dateline']['located'].get(
                            'city').upper():
                        return
                    item['dateline']['located'] = located[0] if len(located) > 0 else {'city_code': city,
                                                                                      'city': city, 'tz': 'UTC',
                                                                                      'dateline': 'city'}
                    item['dateline']['source'] = item.get('original_source', 'Reuters')
                    item['dateline']['text'] = format_dateline_to_locmmmddsrc(item['dateline']['located'],
                                                                              get_date(item['firstcreated']),
                                                                              source=item.get('original_source',
                                                                                              'Reuters'))
        return item
    except Exception:
        # was a bare except: narrowed so interrupts/exits are not swallowed
        logging.exception('Reuters dateline macro exception')
def _yonhap_derive_dateline(item, **kwargs):
    """
    Looks for a Yonhap dateline in the article body and uses that.

    :param item: ingest item (mutated in place)
    :param kwargs: unused, macro signature requirement
    :return: the item, or None when parsing is aborted
    """
    try:
        html = item.get('body_html')
        if html:
            parsed = sd_etree.parse_html(html, content='xml')
            pars = parsed.xpath('//p')
            for par in pars:
                if not par.text:
                    continue
                city, source, the_rest = par.text.partition(' (Yonhap) -- ')
                if source:
                    # sometimes the city is followed by a comma and either a date or a state
                    city = city.split(',')[0]
                    if any(char.isdigit() for char in city):
                        return
                    cities = app.locators.find_cities()
                    located = [
                        c for c in cities if c['city'].lower() == city.lower()
                    ]
                    # if not dateline we create one
                    if 'dateline' not in item:
                        item['dateline'] = {}
                    # Require an unambiguous single match, else synthesise a UTC entry.
                    item['dateline']['located'] = located[0] if len(located) == 1 else {
                        'city_code': city, 'city': city, 'tz': 'UTC', 'dateline': 'city'}
                    item['dateline']['source'] = item.get('source', 'Yonhap')
                    item['dateline']['text'] = format_dateline_to_locmmmddsrc(
                        item['dateline']['located'], get_date(item['firstcreated']),
                        source='Yonhap')
                    break
        return item
    except Exception:
        # was a bare except: narrowed so interrupts/exits are not swallowed
        logging.exception('Yonhap dateline macro exception')
def set_if_not_running(pipe):
    """Claim the running key unless another run is still within its schedule.

    Presumably ``pipe`` is a redis pipeline/client — it only needs get/set.
    Reads ``key``, ``now``, ``update_schedule``, ``name`` and ``id`` from the
    enclosing scope.

    :param pipe: object supporting get(key) / set(key, value)
    :return bool: True when the key was (re)claimed, False when already running
    """
    last_updated = pipe.get(key)
    if last_updated:
        last_updated = get_date(str(last_updated))
        delta = last_updated + update_schedule
        if delta < now:
            # Stale lock: the previous run exceeded its schedule, take over.
            # logger.warn() is deprecated; use warning().
            logger.warning('Overwriting running key for {}:{}'.format(
                name, id))
            pipe.set(key, date_to_str(now))
            return True
        else:
            logger.warning(
                'Task {}:{} is already running. last_updated={}'.format(
                    name, id, last_updated))
            return False
    else:
        pipe.set(key, date_to_str(now))
        return True
def _get_date_range(self, input_date, days_to_process=1):
    """
    Calculate the date range to process

    :param datetime input_date:
    :param int days_to_process:
    :return:
    """
    # Resolve the end of the window from whatever form the caller supplied.
    if not input_date:
        end_date = utcnow()
    elif isinstance(input_date, str):
        end_date = get_date(input_date)
    elif isinstance(input_date, datetime):
        end_date = input_date
    else:
        raise ValueError("Invalid Input Date.")
    # Window start: N days back, truncated to midnight.
    span = timedelta(days=int(days_to_process))
    start_date = (end_date - span).replace(hour=0, minute=0, second=0, microsecond=0)
    return start_date, end_date
def test_compare_repos(self):
    """GenerateActivityCountReport aggregates create/update counts per user.

    NOTE(review): relies on fixture data set up elsewhere in the test class —
    two users with activity before the given cut-off date; verify against the
    class setUp.
    """
    with self.app.app_context():
        cmd = GenerateActivityCountReport()
        items = cmd.run(get_date("2017-05-10T23:59:59+0000"))
        self.assertEqual(len(items), 3)
        # Split the report rows by user.
        user1_items = [
            item for item in items if item['user_id'] == 'user1'
        ]
        user2_items = [
            item for item in items if item['user_id'] == 'user2'
        ]
        self.assertEqual(len(user1_items), 1)
        self.assertEqual(len(user2_items), 2)
        self.assertEqual(user1_items[0]['create_count'], 1)
        self.assertEqual(user1_items[0]['update_count'], 1)
        self.assertEqual(user2_items[0]['create_count'], 1)
        self.assertEqual(user2_items[0]['update_count'], 0)
        self.assertEqual(user2_items[1]['create_count'], 1)
        self.assertEqual(user2_items[1]['update_count'], 0)
def set_dateline(updates, original):
    """Set the dateline for the item.

    :param {dict} updates: Updates related to the doc
    :param {dict} original: Original document.
    """
    dateline = updates.get("dateline") or {}
    # Nothing to do unless the updates carry both a located entry and a date.
    if not (dateline.get("located") and dateline.get("date")):
        return
    # Prefer the updated source, then the original's, then the global default.
    source = updates.get("source", original.get("source")) or get_default_source()
    updates["dateline"]["source"] = source
    # The date may be serialized as a string; normalise it to a datetime first.
    if isinstance(updates["dateline"].get("date"), str):
        updates["dateline"]["date"] = get_date(updates["dateline"].get("date"))
    updates["dateline"]["text"] = format_dateline_to_locmmmddsrc(
        updates["dateline"].get("located"),
        updates["dateline"].get("date"), source)
def _get_date_range(self, input_date, days_to_process=1):
    """Calculate the date range to process

    :param datetime input_date: end of the range; may be falsy (defaults to
        now) or a date string parseable by get_date
    :param int days_to_process: number of days before input_date to include
    :return: (start_date, end_date); start_date is truncated to midnight
    :raises ValueError: when input_date is neither falsy, str nor datetime
    """
    if not input_date:
        input_date = utcnow()
    elif isinstance(input_date, str):
        input_date = get_date(input_date)
    elif not isinstance(input_date, datetime):
        raise ValueError("Invalid Input Date.")
    end_date = input_date
    # Start at midnight, N days before the end date.
    start_date = (end_date - timedelta(days=int(days_to_process))).replace(hour=0, minute=0, second=0,
                                                                           microsecond=0)
    return start_date, end_date
def _process_report(self, items):
    """To insert/update the activity report

    :param list items: report rows; each row has 'activity_date' and 'user_id'
    """
    service = superdesk.get_resource_service('activity_report')
    new_items = []
    for item in items:
        # Normalise the serialized date before using it as a lookup key.
        item['activity_date'] = get_date(item['activity_date'])
        existing_item = service.find_one(req=None, activity_date=item['activity_date'], user_id=item['user_id'])
        if existing_item:
            # A row already exists for this user/date: update it in place.
            service.patch(existing_item['_id'], item)
        else:
            new_items.append(item)
    if new_items:
        # Bulk-insert the rows that did not exist yet.
        service.post(new_items)
def noise11_derive_metadata(item, **kwargs):
    """
    By definition anything from NOISE11 will be entertainment so set the category, subject and dateline
    appropriately

    :param item:
    :param kwargs:
    :return:
    """
    try:
        if 'anpa_category' not in item:
            category_map = superdesk.get_resource_service(
                'vocabularies').find_one(req=None, _id='categories')
            if category_map:
                # Look up the active entertainment ('e') category entry.
                map_entry = next(
                    (code for code in category_map['items']
                     if code['qcode'] == 'e' and code['is_active']), None)
                # NOTE(review): map_entry may be None when no active 'e' entry
                # exists; the resulting TypeError would be swallowed by the
                # except below — confirm this is acceptable.
                item['anpa_category'] = [{
                    'qcode': 'e',
                    'name': map_entry['name']
                }]
        if 'subject' not in item:
            # 01000000 = arts/culture/entertainment subject code.
            qcode = '01000000'
            item['subject'] = [{'qcode': qcode, 'name': subject_codes[qcode]}]
        cities = find_cities(country_code='AU', state_code='NSW')
        located = [c for c in cities if c['city'].lower() == 'sydney']
        # NOISE11 content is always datelined Sydney.
        if located and 'dateline' not in item:
            item['dateline'] = {
                'date': item['firstcreated'],
                'located': located[0]
            }
            item['dateline']['source'] = item['source']
            item['dateline']['text'] = format_dateline_to_locmmmddsrc(
                located[0], get_date(item['firstcreated']),
                source=item['source'])
        return item
    except Exception as ex:
        logger.exception(ex)
def ap_derive_dateline(self, item):
    """This function looks for a dateline in the article body an uses that.

    :param item:
    :return: item populated with a dateline
    """
    try:
        html = item.get("body_html")
        if html:
            parsed = parse_html(html, content="html")
            # Scan the top-level elements for an AP dateline marker.
            for par in parsed.xpath("/div/child::*"):
                if not par.text:
                    continue
                city, source, the_rest = par.text.partition(" (AP) _ ")
                if source:
                    # sometimes the city is followed by a comma and either a date or a state
                    city = city.split(",")[0]
                    # Digits in the "city" mean we mis-parsed; abort without a dateline.
                    if any(char.isdigit() for char in city):
                        return
                    cities = app.locators.find_cities()
                    located = [c for c in cities if c["city"].lower() == city.lower()]
                    item.setdefault("dateline", {})
                    # Require an unambiguous single match; otherwise synthesise a UTC entry.
                    item["dateline"]["located"] = (
                        located[0]
                        if len(located) == 1
                        else {"city_code": city, "city": city, "tz": "UTC", "dateline": "city"}
                    )
                    item["dateline"]["source"] = item.get("original_source", "AP")
                    item["dateline"]["text"] = format_dateline_to_locmmmddsrc(
                        item["dateline"]["located"],
                        get_date(item["firstcreated"]),
                        source=item.get("original_source", "AP"),
                    )
                    break
        return item
    except Exception:
        logging.exception("AP dateline extraction exception")
def ap_weather_format(item, **kwargs):
    """Reformat an AP global weather table into a fixed-width world weather story.

    Validates that the item is an AP Celsius weather table, datelines it New
    York City, then rebuilds the tabular body with aligned columns and
    expanded condition abbreviations.

    :param item: the AP weather item (mutated in place)
    :param kwargs: unused, macro signature requirement
    :return: the reformatted item
    :raises SuperdeskApiError: when the item is not an AP Celsius weather table
    """
    if not item.get('slugline', '').startswith('WEA--GlobalWeather-Ce') or not item.get('source', '') == 'AP':
        raise SuperdeskApiError.badRequestError("Article should be an AP sourced weather table")
    item['slugline'] = 'WORLD WEATHER'
    text = get_text(item['body_html'], content='html')
    lines = text.splitlines()
    if not lines[0] == 'BC-WEA--Global Weather-Celsius,<':
        raise SuperdeskApiError.badRequestError("Table should be in Celsius only")
    # tabular column max lengths are extracted into this list
    columns = []
    # map of the columns to extract and the substitutions to apply to the column
    columnMap = ({'index': 0}, {'index': 1}, {'index': 2},
                 {'index': 3, 'substitute': [('COND', 'CONDITIONS'), ('pc', 'partly cloudy'),
                                             ('clr', 'clear'), ('cdy', 'cloudy'), ('rn', 'rain'),
                                             ('sn', 'snow')]})
    # story preamble
    preamble = 'Temperatures and conditions in world centres:\r\n'
    output = StringIO()
    output.write(preamble)
    # story is always datelined News York
    city = 'New York City'
    cities = app.locators.find_cities()
    located = [c for c in cities if c['city'].lower() == city.lower()]
    if 'dateline' not in item:
        item['dateline'] = {}
    item['dateline']['located'] = located[0] if len(located) > 0 else {'city_code': city, 'city': city,
                                                                      'tz': 'UTC', 'dateline': 'city'}
    # Date is expressed in the located city's timezone.
    item['dateline']['date'] = datetime.fromtimestamp(get_date(item['firstcreated']).timestamp(),
                                                      tz=timezone(item['dateline']['located']['tz']))
    item['dateline']['source'] = 'AP'
    item['dateline']['text'] = format_dateline_to_locmmmddsrc(item['dateline']['located'],
                                                              get_date(item['firstcreated']),
                                                              source=item.get('original_source', 'AP'))
    item['headline'] = 'World Weather for ' + item['dateline']['date'].strftime('%b %-d')
    item['subject'] = [{"name": "weather", "qcode": "17000000"}]
    locator_map = superdesk.get_resource_service('vocabularies').find_one(req=None, _id='locators')
    item['place'] = [x for x in locator_map.get('items', []) if x['qcode'] == 'US']
    if lines:
        # scan all the lines in the file for potential collimated lines and calculate the length
        # of the column
        for line in lines:
            row = re.split('[;\<]+', line)
            # only consider it if there are more than two rows
            if len(row) > 2:
                index = 0
                for col in row:
                    # check if the column is mapped
                    map = [me for me in columnMap if me['index'] == index]
                    if len(map):
                        for sub in map[0].get('substitute', ()):
                            col = col.replace(sub[0], sub[1])
                    # if it's a new column
                    if 0 <= index < len(columns):
                        # check the length
                        if len(col) > columns[index]:
                            columns[index] = len(col)
                    else:
                        columns.append(len(col))
                    index += 1
        # Second pass: emit each mapped column padded to its maximum width.
        for line in lines:
            row = re.split('[;\<]+', line)
            if len(row) > 2:
                index = 0
                for col in row:
                    map = [me for me in columnMap if me['index'] == index]
                    if len(map) > 0:
                        for sub in map[0].get('substitute', ()):
                            col = col.replace(sub[0], sub[1])
                        output.write(
                            '{}'.format(col.lstrip('\t').ljust(columns[map[0].get('index')] + 2)).rstrip('\r\n'))
                    index += 1
                output.write('\r\n')
    item['body_html'] = '<pre>' + output.getvalue() + '</pre>'
    return item
def test_get_date(self):
    """get_date parses ISO strings and datetimes, and passes None through."""
    for value in ('2012-12-12', datetime.now()):
        self.assertIsInstance(get_date(value), datetime)
    self.assertIsNone(get_date(None))
def _parse_formatted_email(self, data, provider):
    """Construct an item from an email that was constructed as a notification from a google form submission.

    The google form submits to a google sheet, this sheet creates the email as a notification

    :param data: iterable of email response parts; tuples carry the raw bytes
    :param provider: ingest provider (used for error reporting only)
    :return: A list of 1 item
    :raises IngestEmailError: wrapping any parsing failure
    """
    try:
        item = dict()
        item[ITEM_TYPE] = CONTENT_TYPE.TEXT
        item['versioncreated'] = utcnow()
        for response_part in data:
            if isinstance(response_part, tuple):
                msg = email.message_from_bytes(response_part[1])
                # Check that the subject line matches what we expect, ignore it if not
                if self.parse_header(
                        msg['subject']) != 'Formatted Editorial Story':
                    return []
                item['guid'] = msg['Message-ID']
                # Derive firstcreated from the email's Date header (UTC).
                date_tuple = email.utils.parsedate_tz(msg['Date'])
                if date_tuple:
                    dt = datetime.datetime.utcfromtimestamp(
                        email.utils.mktime_tz(date_tuple))
                    dt = dt.replace(tzinfo=timezone('utc'))
                    item['firstcreated'] = dt
                # The form payload is a JSON document in the text/plain part.
                for part in msg.walk():
                    if part.get_content_type() == "text/plain":
                        body = part.get_payload(decode=True)
                        # if we don't know the charset just have a go!
                        # NOTE(review): the second .replace(' ', ' ') below is a
                        # no-op as written — possibly a non-breaking space was
                        # lost in reformatting; verify against history.
                        if part.get_content_charset() is None:
                            json_str = body.decode().replace('\r\n', '').replace(' ', ' ')
                        else:
                            charset = part.get_content_charset()
                            json_str = body.decode(charset).replace('\r\n', '').replace(' ', ' ')
                        # Form values arrive as single-element lists; unwrap them.
                        mail_item = dict(
                            (k, v[0]) for k, v in json.loads(json_str).items())

                        self._expand_category(item, mail_item)

                        item['original_source'] = mail_item.get('Username', '')
                        item['headline'] = mail_item.get('Headline', '')
                        item['abstract'] = mail_item.get('Abstract', '')
                        item['slugline'] = mail_item.get('Slugline', '')
                        item['body_html'] = '<p>' + mail_item.get(
                            'Body', '').replace('\n', '</p><p>') + '</p>'

                        default_source = app.config.get(
                            'DEFAULT_SOURCE_VALUE_FOR_MANUAL_ARTICLES')
                        # Resolve the dateline city; fall back to a UTC entry.
                        city = mail_item.get('Dateline', '')
                        cities = app.locators.find_cities()
                        located = [
                            c for c in cities
                            if c['city'].lower() == city.lower()
                        ]
                        item.setdefault('dateline', {})
                        item['dateline']['located'] = located[0] if len(
                            located) > 0 else {
                                'city_code': city,
                                'city': city,
                                'tz': 'UTC',
                                'dateline': 'city'
                            }
                        item['dateline']['source'] = default_source
                        item['dateline'][
                            'text'] = format_dateline_to_locmmmddsrc(
                            item['dateline']['located'],
                            get_date(item['firstcreated']),
                            source=default_source)
                        # Priority may be numeric or a name from the vocabulary.
                        if mail_item.get('Priority') != '':
                            if mail_item.get('Priority', '3').isdigit():
                                item['priority'] = int(
                                    mail_item.get('Priority', '3'))
                            else:
                                priority_map = superdesk.get_resource_service(
                                    'vocabularies').find_one(
                                    req=None, _id='priority')
                                priorities = [
                                    x for x in priority_map.get('items', [])
                                    if x['name'].upper() == mail_item.get(
                                        'Priority', '').upper()
                                ]
                                if priorities is not None and len(
                                        priorities) > 0:
                                    item['priority'] = int(
                                        priorities[0].get('qcode', '3'))
                                else:
                                    item['priority'] = 3
                        if mail_item.get('News Value') != '':
                            item['urgency'] = int(
                                mail_item.get('News Value', '3'))

                        # We expect the username passed corresponds to a superdesk user
                        query = {
                            'email': re.compile(
                                '^{}$'.format(mail_item.get('Username')),
                                re.IGNORECASE)
                        }
                        user = superdesk.get_resource_service(
                            'users').find_one(req=None, **query)
                        if not user:
                            logger.error(
                                'Failed to find user for email {}'.format(
                                    mail_item.get('Username')))
                            raise UserNotRegisteredException()
                        item['original_creator'] = user.get('_id')
                        if BYLINE in user and user.get(BYLINE, ''):
                            item['byline'] = user.get(BYLINE)
                        item[SIGN_OFF] = user.get(SIGN_OFF)

                        # attempt to match the given desk name against the defined desks
                        query = {
                            'name': re.compile(
                                '^{}$'.format(mail_item.get('Desk', '')),
                                re.IGNORECASE)
                        }
                        desk = superdesk.get_resource_service(
                            'desks').find_one(req=None, **query)
                        if desk:
                            item['task'] = {
                                'desk': desk.get('_id'),
                                'stage': desk.get('incoming_stage')
                            }
                        # Optional place locator from the form's Place field.
                        if 'Place' in mail_item:
                            locator_map = superdesk.get_resource_service(
                                'vocabularies').find_one(req=None,
                                                         _id='locators')
                            place = [
                                x for x in locator_map.get('items', [])
                                if x['qcode'] == mail_item.get(
                                    'Place', '').upper()
                            ]
                            if place is not None:
                                item['place'] = place
                        if mail_item.get('Legal flag', '') == 'LEGAL':
                            item['flags'] = {'marked_for_legal': True}
                        # Only the first text/plain part is processed.
                        break
        return [item]
    except Exception as ex:
        raise IngestEmailError.emailParseError(ex, provider)
def process_victorian_harness_racing(item, **kwargs):
    """Split a Victorian harness racing story into a selections item and a comment item.

    The incoming item's body is expected to contain ``VENUE:``, ``DATE:``,
    ``Race N:``, ``SELECTIONS:``, ``EARLY SPEED:`` and ``OVERVIEW:`` paragraphs.
    A new "Selections" story is posted to the archive, and the passed item is
    turned into the "Comment" story.

    :param item: item being processed (mutated in place and returned)
    :param kwargs: unused
    :return: the item, updated with the comment content
    """
    # Lookup for spelling out race numbers (0-20 plus tens up to 90).
    number_words_map = {1: 'One', 2: 'Two', 3: 'Three', 4: 'Four', 5: 'Five', 6: 'Six',
                        7: 'Seven', 8: 'Eight', 9: 'Nine', 10: 'Ten', 11: 'Eleven',
                        12: 'Twelve', 13: 'Thirteen', 14: 'Fourteen', 15: 'Fifteen',
                        16: 'Sixteen', 17: 'Seventeen', 18: 'Eighteen', 19: 'Nineteen',
                        20: 'Twenty', 30: 'Thirty', 40: 'Forty', 50: 'Fifty', 60: 'Sixty',
                        70: 'Seventy', 80: 'Eighty', 90: 'Ninety', 0: 'Zero'}
    # House-style text substitutions applied to the comment body.
    # Ordered so that e.g. "2nd row" is restored after the blanket "second" -> "2nd".
    substitution_map = OrderedDict({"second": "2nd", "third": "3rd", "fourth": "4th",
                                    "fifth": "5th", "sixth": "6th", "seventh": "7th",
                                    "eighth": "8th", "ninth": "9th",
                                    "2nd row": "second row", "2nd up": "second up",
                                    "2nd line": "second line", "2nd run": "second run",
                                    "2nd pick": "second pick", "January": "Jan",
                                    "February": "Feb", "August": "Aug",
                                    "September": "Sept", "October": "Oct",
                                    "November": "Nov", "December": "Dec",
                                    "Harold Park": "HP", "Moonee Valley": "MV"})

    def race_number_to_words(race):
        """Turn a 'Race N:' heading into the number spelt out, e.g. 'Race 21:' -> 'Twentyone'."""
        n = int(race.replace('Race', '').replace(':', ''))
        try:
            return titlecase(number_words_map[n])
        except KeyError:
            # Compose tens + units for numbers not directly in the map.
            try:
                return titlecase(number_words_map[n - n % 10] + number_words_map[n % 10].lower())
            except KeyError:
                return str(n)

    content = item.get('body_html', '')
    comment_item = {
        "anpa_category": [
            {
                "qcode": "r",
                "name": "Racing (Turf)",
                "subject": "15030001"
            }
        ],
        "subject": [
            {
                "parent": "15000000",
                "name": "horse racing, harness racing",
                "qcode": "15030000"
            }
        ],
        "place": [
            {
                "state": "Victoria",
                "name": "VIC",
                "group": "Australia",
                "country": "Australia",
                "qcode": "VIC",
                "world_region": "Oceania"
            }
        ],
        FORMAT: FORMATS.HTML,
        ITEM_TYPE: CONTENT_TYPE.TEXT
    }
    selections_item = deepcopy(comment_item)
    # copy the genre of the item that we are operating on
    if 'genre' in item:
        selections_item['genre'] = deepcopy(item['genre'])

    parsed = parse_html(content, content='html')
    # First pass: pick up the venue and meeting date and seed both items.
    # NOTE(review): `venue`/`meeting_date` are unbound if no VENUE:/DATE:
    # paragraphs are present — input is presumably always well formed; verify.
    for tag in parsed.xpath('/html/div/child::*'):
        if tag.tag == 'p':
            if tag.text.startswith('VENUE: '):
                venue = tag.text.replace('VENUE: ', '')
            elif tag.text.startswith('DATE: '):
                # Try d/m/yy, then d/m/YYYY, then a generic parse, finally fall back to now.
                try:
                    meeting_date = datetime.strptime(tag.text.replace('DATE: ', '').replace(' ', ''), '%d/%m/%y')
                except Exception:
                    logger.warning('Date format exception for {}'.format(tag.text.replace('DATE: ', '')))
                    try:
                        meeting_date = datetime.strptime(tag.text.replace('DATE: ', '').replace(' ', ''), '%d/%m/%Y')
                    except Exception:
                        logger.warning('Date format exception 2 for {}'.format(tag.text.replace('DATE: ', '')))
                        try:
                            meeting_date = get_date(tag.text.replace('DATE: ', '').replace(' ', ''))
                        except Exception:
                            logger.warning('Date format exception 3 for {}'.format(tag.text.replace('DATE: ', '')))
                            meeting_date = utcnow()
                comment_item['slugline'] = venue + ' Comment'
                comment_item['anpa_take_key'] = meeting_date.strftime('%A')
                comment_item['headline'] = venue + ' Trot Comment ' + meeting_date.strftime('%A')
                comment_item['firstcreated'] = utcnow()
                set_dateline(comment_item, 'Melbourne', 'AAP')

                selections_item['slugline'] = venue + ' Selections'
                selections_item['anpa_take_key'] = meeting_date.strftime('%A')
                selections_item['headline'] = venue + ' Trot Selections ' + meeting_date.strftime('%A')
                selections_item['firstcreated'] = utcnow()
                set_dateline(selections_item, 'Melbourne', 'AAP')
                selections_item['body_html'] = '<p>{} Selections for {}\'s {} trots.-</p>'.format(
                    selections_item.get('dateline').get('text'), meeting_date.strftime('%A'), venue)
                selections_item['firstcreated'] = utcnow()
                break

    # Second pass: collect "Race N:" headings and the SELECTIONS lines for the selections story.
    regex = r"Race ([1-9][0-9]|[1-9]):"
    for tag in parsed.xpath('/html/div/child::*'):
        if tag.tag == 'p':
            m = re.match(regex, tag.text)
            if m:
                selections_item['body_html'] += '<p>{} '.format(tag.text)
            if tag.text.startswith('SELECTIONS: '):
                sels = titlecase(tag.text.replace('SELECTIONS: ', ''))
                # In some cases there is no comma between the selections, apparently there should be!
                sels = sels.replace(') ', '), ')
                # Strip the bracketed annotations from each selection.
                sels = re.sub(r'\s\(.*?\)', '', sels)
                # get rid of the trailing one
                sels = re.sub(r'(, $|,$)', ' ', sels)
                selections_item['body_html'] += '{}</p>'.format(sels)
    selections_item['body_html'] += '<p>AAP SELECTIONS</p>'

    # Third pass: build the comment story from OVERVIEW/EARLY SPEED paragraphs.
    comment_item['body_html'] = ''
    overview = ''
    regex = r"Race ([1-9][0-9]|[1-9]):"
    for tag in parsed.xpath('/html/div/child::*'):
        if tag.tag == 'p':
            m = re.match(regex, tag.text)
            if m:
                comment_item['body_html'] += '<p>Race {}:</p>'.format(race_number_to_words(tag.text))
            if tag.text.startswith('EARLY SPEED: '):
                # Flush the accumulated overview ahead of the early-speed line.
                comment_item['body_html'] += '<p>{}</p>'.format(overview.rstrip())
                overview = ''
                comment_item['body_html'] += '<p>{}</p>'.format(tag.text.rstrip())
            if tag.text.startswith('OVERVIEW: '):
                overview = tag.text
            elif overview:
                # Continuation paragraph of a multi-paragraph overview.
                overview += tag.text
    for i, j in substitution_map.items():
        comment_item['body_html'] = comment_item['body_html'].replace(i, j)
    comment_item['body_html'] += '<p>AAP COMMENT</p>'

    # Post the selections story to the archive; the comment content replaces this item.
    service = get_resource_service('archive')
    selections_item['task'] = item.get('task')
    selections_item['profile'] = item.get('profile')
    selections_item[ITEM_STATE] = CONTENT_STATE.PROGRESS
    service.post([selections_item])

    item.update(comment_item)
    return item
class ActivityReportTestCase(AAPTestCase):
    """Exercise GenerateActivityCountReport against a small archive_history fixture."""

    # Fixture: a day's worth of history entries for two users across five items.
    history = [
        dict(operation="create", item_id="item1", version=1, user_id="user1",
             _created=get_date("2017-05-09T01:55:47+0000"),
             _updated=get_date("2017-05-09T01:55:47+0000")),
        dict(operation="update", item_id="item1", version=2, user_id="user1",
             _created=get_date("2017-05-09T01:55:47+0000"),
             _updated=get_date("2017-05-09T01:55:47+0000")),
        dict(operation="create", item_id="item2", version=1, user_id="user2",
             _created=get_date("2017-05-09T01:55:47+0000"),
             _updated=get_date("2017-05-09T01:55:47+0000")),
        dict(operation="create", item_id="item3", version=1, user_id="user1",
             _created=get_date("2017-05-09T01:55:47+0000"),
             _updated=get_date("2017-05-09T01:55:47+0000")),
        dict(operation="update", item_id="item3", version=2, user_id="user1",
             _created=get_date("2017-05-09T01:55:47.000+0000"),
             _updated=get_date("2017-05-09T01:55:47+0000")),
        dict(operation="spike", item_id="item3", version=3, user_id="user2",
             _created=get_date("2017-05-09T01:55:47+0000"),
             _updated=get_date("2017-05-09T01:55:47+0000")),
        dict(operation="create", item_id="item4", version=1, user_id="user2",
             _created=get_date("2017-05-10T01:55:47+0000"),
             _updated=get_date("2017-05-10T01:55:47+0000")),
        dict(operation="create", item_id="item5", version=1, user_id="user2",
             original_item_id="item2",
             _created=get_date("2017-05-10T01:55:47+0000"),
             _updated=get_date("2017-05-10T01:55:47+0000")),
    ]

    def setUp(self):
        super().setUp()
        self.app.data.insert('archive_history', self.history)

    def test_compare_repos(self):
        with self.app.app_context():
            report = GenerateActivityCountReport()
            items = report.run(get_date("2017-05-10T23:59:59+0000"))
            self.assertEqual(len(items), 3)

            # Group the report rows by user for the assertions below.
            by_user = {}
            for row in items:
                by_user.setdefault(row['user_id'], []).append(row)
            user1_items = by_user.get('user1', [])
            user2_items = by_user.get('user2', [])
            self.assertEqual(len(user1_items), 1)
            self.assertEqual(len(user2_items), 2)

            # (row, expected create_count, expected update_count)
            expectations = [
                (user1_items[0], 1, 1),
                (user2_items[0], 1, 0),
                (user2_items[1], 1, 0),
            ]
            for row, creates, updates in expectations:
                self.assertEqual(row['create_count'], creates)
                self.assertEqual(row['update_count'], updates)
def _parse_formatted_email(self, data, provider):
    """Construct an item from an email that was constructed as a notification from a google form submission.

    The google form submits to a google sheet, this sheet creates the email as a
    notification. The first text/plain part of the message is expected to hold a
    JSON object mapping form field names to single-element value lists.

    :param data: sequence of (flags, raw-bytes) tuples as returned by imaplib fetch
    :param provider: ingest provider config, used only for error reporting
    :return: A list of 1 item (or an empty list when the subject does not match)
    :raises IngestEmailError: wrapping any parsing failure, including an
        unregistered sender (UserNotRegisteredException)
    """
    try:
        item = dict()
        item[ITEM_TYPE] = CONTENT_TYPE.TEXT
        item['versioncreated'] = utcnow()
        for response_part in data:
            if isinstance(response_part, tuple):
                msg = email.message_from_bytes(response_part[1])
                # Check that the subject line matches what we expect, ignore it if not
                if self.parse_header(msg['subject']) != 'Formatted Editorial Story':
                    return []
                item['guid'] = msg['Message-ID']
                # Use the email's Date header (converted to UTC) as firstcreated.
                date_tuple = email.utils.parsedate_tz(msg['Date'])
                if date_tuple:
                    dt = datetime.datetime.utcfromtimestamp(
                        email.utils.mktime_tz(date_tuple))
                    dt = dt.replace(tzinfo=timezone('utc'))
                    item['firstcreated'] = dt
                for part in msg.walk():
                    if part.get_content_type() == "text/plain":
                        body = part.get_payload(decode=True)
                        # if we don't know the charset just have a go!
                        # NOTE(review): the replace(' ', ' ') below looks like a
                        # whitespace-collapse casualty — presumably it was
                        # collapsing double spaces; verify against history.
                        if part.get_content_charset() is None:
                            json_str = body.decode().replace('\r\n', '').replace(' ', ' ')
                        else:
                            charset = part.get_content_charset()
                            json_str = body.decode(charset).replace('\r\n', '').replace(' ', ' ')
                        # Each form value arrives as a one-element list; unwrap it.
                        mail_item = dict((k, v[0]) for k, v in json.loads(json_str).items())

                        self._expand_category(item, mail_item)
                        item['original_source'] = mail_item.get('Username', '')
                        item['headline'] = mail_item.get('Headline', '')
                        item['abstract'] = mail_item.get('Abstract', '')
                        item['slugline'] = mail_item.get('Slugline', '')
                        item['body_html'] = '<p>' + mail_item.get('Body', '').replace('\n', '</p><p>') + '</p>'

                        # Build the dateline from the form's Dateline city, falling
                        # back to a minimal UTC stub when the city is not located.
                        default_source = app.config.get('DEFAULT_SOURCE_VALUE_FOR_MANUAL_ARTICLES')
                        city = mail_item.get('Dateline', '')
                        cities = app.locators.find_cities()
                        located = [c for c in cities if c['city'].lower() == city.lower()]
                        item.setdefault('dateline', {})
                        item['dateline']['located'] = located[0] if len(located) > 0 else {'city_code': city,
                                                                                          'city': city,
                                                                                          'tz': 'UTC',
                                                                                          'dateline': 'city'}
                        item['dateline']['source'] = default_source
                        item['dateline']['text'] = format_dateline_to_locmmmddsrc(item['dateline']['located'],
                                                                                  get_date(item['firstcreated']),
                                                                                  source=default_source)

                        # Priority: numeric value used directly, otherwise resolved
                        # by name against the priority vocabulary, default 3.
                        if mail_item.get('Priority') != '':
                            if mail_item.get('Priority', '3').isdigit():
                                item['priority'] = int(mail_item.get('Priority', '3'))
                            else:
                                priority_map = superdesk.get_resource_service('vocabularies').find_one(
                                    req=None, _id='priority')
                                priorities = [x for x in priority_map.get('items', [])
                                              if x['name'].upper() == mail_item.get('Priority', '').upper()]
                                if priorities is not None and len(priorities) > 0:
                                    item['priority'] = int(priorities[0].get('qcode', '3'))
                                else:
                                    item['priority'] = 3
                        if mail_item.get('News Value') != '':
                            item['urgency'] = int(mail_item.get('News Value', '3'))

                        # We expect the username passed corresponds to a superdesk user
                        query = {'email': re.compile('^{}$'.format(mail_item.get('Username')), re.IGNORECASE)}
                        user = superdesk.get_resource_service('users').find_one(req=None, **query)
                        if not user:
                            logger.error('Failed to find user for email {}'.format(mail_item.get('Username')))
                            raise UserNotRegisteredException()
                        item['original_creator'] = user.get('_id')
                        if BYLINE in user and user.get(BYLINE, ''):
                            item['byline'] = user.get(BYLINE)
                        item[SIGN_OFF] = user.get(SIGN_OFF)

                        # attempt to match the given desk name against the defined desks
                        query = {'name': re.compile('^{}$'.format(mail_item.get('Desk', '')), re.IGNORECASE)}
                        desk = superdesk.get_resource_service('desks').find_one(
                            req=None, **query)
                        if desk:
                            item['task'] = {'desk': desk.get('_id'), 'stage': desk.get('incoming_stage')}

                        # Resolve the Place form field against the locators vocabulary.
                        if 'Place' in mail_item:
                            locator_map = superdesk.get_resource_service('vocabularies').find_one(req=None,
                                                                                                  _id='locators')
                            place = [x for x in locator_map.get('items', [])
                                     if x['qcode'] == mail_item.get('Place', '').upper()]
                            # NOTE(review): `place` is a list and never None, so this
                            # guard also stores an empty list on no match — confirm intent.
                            if place is not None:
                                item['place'] = place

                        if mail_item.get('Legal flag', '') == 'LEGAL':
                            item['flags'] = {'marked_for_legal': True}
                        break
        return [item]
    except Exception as ex:
        raise IngestEmailError.emailParseError(ex, provider)
def get_dateline_date(self, ap_item): if ap_item.get("firstcreated"): dateline_date = get_date(ap_item["firstcreated"]).replace(tzinfo=pytz.UTC) else: dateline_date = utcnow() return dateline_date
def parse(self, s_json, provider=None):
    """Map an AP media API JSON payload onto a superdesk item.

    :param s_json: dict with the AP payload under ``data.item`` and the parsed
        NITF content under ``nitf``
    :param provider: ingest provider config; its ``source`` is used for the
        item source and dateline (defaults to 'AP' for the item source)
    :return: the mapped item dict
    """
    in_item = s_json.get('data', {}).get('item')
    nitf_item = s_json.get('nitf', {})
    # guid is the AP item id qualified with its version.
    item = {
        'guid': in_item.get('altids', {}).get('itemid') + ':' +
        str(in_item.get('version'))
    }
    item['source'] = provider.get('source') if provider else 'AP'

    for copy_property in self.direct_copy_properties:
        if in_item.get(copy_property) is not None:
            item[copy_property] = in_item[copy_property]

    if in_item.get('version'):
        item['version'] = in_item['version']
    if in_item.get('versioncreated'):
        item['versioncreated'] = self.datetime(
            in_item.get('versioncreated'))
    if in_item.get('firstcreated'):
        item['firstcreated'] = self.datetime(in_item.get('firstcreated'))
    if len(in_item.get('infosource', [])):
        item['original_source'] = ','.join(
            [n.get('name') for n in in_item.get('infosource', [])])

    if in_item.get('datelinelocation'):
        cities = app.locators.find_cities()
        # Try to find a single matching city either by city and country or city country and state
        located = [
            c for c in cities
            if c['city'] == in_item.get('datelinelocation').get('city') and
            c['country'] == in_item.get('datelinelocation').get(
                'countryname')
        ]
        if len(located) > 1:
            # Ambiguous: narrow further with the state/area name.
            located = [
                c for c in cities
                if c['city'] == in_item.get('datelinelocation').get('city') and
                c['country'] == in_item.get('datelinelocation').get(
                    'countryname') and c['state'] == in_item.get(
                        'datelinelocation').get('countryareaname')
            ]
        # Only set a dateline when exactly one candidate city remains.
        if len(located) == 1:
            item['dateline'] = dict()
            item['dateline']['located'] = located[0]
            item['dateline']['source'] = provider.get('source')
            item['dateline']['text'] = format_dateline_to_locmmmddsrc(
                item['dateline']['located'],
                get_date(item['firstcreated']),
                provider.get('source'))

    if len(in_item.get('bylines', [])):
        # Prefer each byline's 'name'; otherwise use 'by' plus an optional '(title)'.
        item['byline'] = ','.join([
            n.get('name') if n.get('name') else n.get('by', '') +
            (' ({})'.format(n.get('title')) if n.get('title') else '')
            for n in in_item.get('bylines', [])
        ])
        if item.get('byline').startswith('By '):
            item['byline'] = item['byline'][3:]
    if len(in_item.get('usageterms', [])):
        item['usageterms'] = ', '.join(
            [n for n in in_item.get('usageterms', [])])

    if in_item.get('type') == 'picture':
        if in_item.get('renditions'):
            self._parse_renditions(in_item['renditions'], item, provider)
        if in_item.get('description_caption'):
            item['description_text'] = in_item.get('description_caption')
            item['archive_description'] = in_item.get(
                'description_caption')
        if in_item.get('description_creditline'):
            item['credit'] = in_item.get('description_creditline')
        if in_item.get('photographer', {}).get('name'):
            item['byline'] = in_item.get('photographer', {}).get('name')

    if in_item.get('type') == 'text':
        # Peel off the take key if possible
        if ',' in item['slugline']:
            item['anpa_take_key'] = item['slugline'].split(',')[1]
            item['slugline'] = item['slugline'].split(',')[0]
        if item['slugline'].startswith('BC-'):
            item['slugline'] = item['slugline'][3:]
        if item.get('ednote', '').startswith('Eds:'):
            # Drops the 'Eds: ' prefix (5 chars including the trailing space).
            item['ednote'] = item['ednote'][5:]
        if in_item.get('headline_extended'):
            item['abstract'] = in_item.get('headline_extended')
        self.categorisation_mapping(in_item, item)
        # Map the urgency to urgency and priority
        if in_item.get('urgency'):
            item[ITEM_URGENCY] = int(
                in_item['urgency']) if in_item['urgency'] <= 5 else 5
            item[ITEM_PRIORITY] = self.priority_map.get(
                in_item['urgency'], 5)

    if nitf_item.get('body_html'):
        # item['body_html'] = sd_etree.clean_html_str(nitf_item.get('body_html'))
        # Strip the NITF Main block wrapper rather than cleaning the whole tree.
        item['body_html'] = nitf_item.get('body_html').replace(
            '<block id="Main">', '').replace('</block>', '')

    if s_json.get('associations'):
        self._parse_associations(s_json['associations'], item, provider)

    return item
def parse(self, s_json, provider=None):
    """Map an AP media API JSON payload onto a superdesk item.

    :param s_json: dict with the AP payload under ``data.item`` and the parsed
        NITF content under ``nitf``
    :param provider: ingest provider config; its ``source`` is used for the
        item source and dateline (defaults to "AP" for the item source)
    :return: the mapped item dict
    """
    in_item = s_json.get("data", {}).get("item")
    nitf_item = s_json.get("nitf", {})
    # guid is the AP item id qualified with its version.
    item = {
        "guid": in_item.get("altids", {}).get("itemid") + ":" +
        str(in_item.get("version"))
    }
    item["source"] = provider.get("source") if provider else "AP"

    for copy_property in self.direct_copy_properties:
        if in_item.get(copy_property) is not None:
            item[copy_property] = in_item[copy_property]

    if in_item.get("version"):
        item["version"] = in_item["version"]
    if in_item.get("versioncreated"):
        item["versioncreated"] = self.datetime(
            in_item.get("versioncreated"))
    if in_item.get("firstcreated"):
        item["firstcreated"] = self.datetime(in_item.get("firstcreated"))
    if len(in_item.get("infosource", [])):
        item["original_source"] = ",".join(
            [n.get("name") for n in in_item.get("infosource", [])])

    if in_item.get("datelinelocation"):
        cities = app.locators.find_cities()
        # Try to find a single matching city either by city and country or city country and state
        located = [
            c for c in cities
            if c["city"] == in_item.get("datelinelocation").get("city") and
            c["country"] == in_item.get("datelinelocation").get(
                "countryname")
        ]
        if len(located) > 1:
            # Ambiguous: narrow further with the state/area name.
            located = [
                c for c in cities
                if c["city"] == in_item.get("datelinelocation").get("city") and
                c["country"] == in_item.get("datelinelocation").get(
                    "countryname") and c["state"] == in_item.get(
                        "datelinelocation").get("countryareaname")
            ]
        # Only set a dateline when exactly one candidate city remains.
        if len(located) == 1:
            item["dateline"] = dict()
            item["dateline"]["located"] = located[0]
            item["dateline"]["source"] = provider.get("source")
            item["dateline"]["text"] = format_dateline_to_locmmmddsrc(
                item["dateline"]["located"],
                get_date(item["firstcreated"]),
                provider.get("source"))

    if len(in_item.get("bylines", [])):
        # Prefer each byline's "name"; otherwise use "by" plus an optional "(title)".
        item["byline"] = ",".join([
            n.get("name") if n.get("name") else n.get("by", "") +
            (" ({})".format(n.get("title")) if n.get("title") else "")
            for n in in_item.get("bylines", [])
        ])
        if item.get("byline").startswith("By "):
            item["byline"] = item["byline"][3:]
    if len(in_item.get("usageterms", [])):
        item["usageterms"] = ", ".join(
            [n for n in in_item.get("usageterms", [])])

    if in_item.get("type") == "picture":
        if in_item.get("renditions"):
            self._parse_renditions(in_item["renditions"], item, provider)
        if in_item.get("description_caption"):
            item["description_text"] = in_item.get("description_caption")
            item["archive_description"] = in_item.get(
                "description_caption")
        if in_item.get("description_creditline"):
            item["credit"] = in_item.get("description_creditline")
        if in_item.get("photographer", {}).get("name"):
            item["byline"] = in_item.get("photographer", {}).get("name")

    if in_item.get("type") == "text":
        # Peel off the take key if possible
        if "," in item["slugline"]:
            item["anpa_take_key"] = item["slugline"].split(",")[1]
            item["slugline"] = item["slugline"].split(",")[0]
        if item["slugline"].startswith("BC-"):
            item["slugline"] = item["slugline"][3:]
        if item.get("ednote", "").startswith("Eds:"):
            # Drops the "Eds: " prefix (5 chars including the trailing space).
            item["ednote"] = item["ednote"][5:]
        if in_item.get("headline_extended"):
            item["abstract"] = in_item.get("headline_extended")
        self.categorisation_mapping(in_item, item)
        # Map the urgency to urgency and priority
        if in_item.get("urgency"):
            item[ITEM_URGENCY] = int(
                in_item["urgency"]) if in_item["urgency"] <= 5 else 5
            item[ITEM_PRIORITY] = self.priority_map.get(
                in_item["urgency"], 5)

    if nitf_item.get("body_html"):
        # item['body_html'] = sd_etree.clean_html_str(nitf_item.get('body_html'))
        # Strip the NITF Main block wrapper rather than cleaning the whole tree.
        item["body_html"] = nitf_item.get("body_html").replace(
            '<block id="Main">', "").replace("</block>", "")

    if s_json.get("associations"):
        self._parse_associations(s_json["associations"], item, provider)

    return item
def _parse_doc(self, doc): new_doc = {} new_doc['_id'] = doc['refPtr'] new_doc['guid'] = doc['refPtr'] try: new_doc['description_text'] = doc['caption'] except KeyError: pass try: new_doc['headline'] = doc['headline'] except KeyError: pass try: new_doc['original_source'] = new_doc['source'] = doc['credit'] except KeyError: pass new_doc['versioncreated'] = new_doc['firstcreated'] = self._datetime( local_to_utc(SCANPIX_TZ, get_date(doc['archivedTime']))) new_doc['pubstatus'] = 'usable' # This must match the action new_doc['_type'] = 'externalsource' # entry that the client can use to identify the fetch endpoint new_doc['fetch_endpoint'] = 'scanpix' # mimetype is not directly found in Scanpix API # so we use original filename to guess it mimetype = mimetypes.guess_type("_{}".format( splitext(doc.get('originalFileName', ''))[1]))[0] if mimetype is None: # nothing found with filename, we try out luck with fileFormat try: format_ = doc['fileFormat'].split()[0] except (KeyError, IndexError): mimetype = None else: mimetype = mimetypes.guess_type('_.{}'.format(format_))[0] if mimetype is not None: new_doc['mimetype'] = mimetype main_group = doc['mainGroup'] if main_group == 'video': new_doc[ITEM_TYPE] = CONTENT_TYPE.VIDEO elif main_group == 'graphic': new_doc[ITEM_TYPE] = CONTENT_TYPE.GRAPHIC new_doc['mimetype'] = 'image/jpeg' else: new_doc[ITEM_TYPE] = CONTENT_TYPE.PICTURE try: doc_previews = doc['previews'] except KeyError: logger.warning('no preview found for item {}'.format( new_doc['_id'])) else: # we look for best available scanpix preview available_previews = [p['type'] for p in doc_previews] renditions = new_doc['renditions'] = {} for rend, previews in REND2PREV.items(): for prev in previews: if prev in available_previews: idx = available_previews.index(prev) renditions[rend] = {"href": doc_previews[idx]['url']} break new_doc['byline'] = doc['byline'] doc.clear() doc.update(new_doc)
def _get_local_time(self, time, tz=None): if time is None: time = utcnow() if not tz: tz = self.TIMEZONE return utc_to_local(tz, get_date(time))