def parse(self, file_path, provider=None):
    try:
        item = {
            'guid': '{}-{}'.format(file_path, uuid.uuid4()),
            'pubstatus': 'usable',
            'versioncreated': utcnow(),
            ITEM_TYPE: CONTENT_TYPE.TEXT,
            FORMAT: FORMATS.PRESERVED,
        }

        with open(file_path, 'r', encoding='windows-1252') as f:
            data = f.read().replace('\r', '')

        # the file is header, dateline and body, separated by blank lines
        header, dateline_data, body_data = data.split('\n\n', 2)
        self._process_header(item, header)

        # everything from the 'MEDIA RELEASE ' marker onwards is the body
        start_of_body = 'MEDIA RELEASE '
        source, data = data.split(start_of_body, 1)
        data = start_of_body + data

        item['anpa_category'] = [{'qcode': 'j'}]
        item['original_source'] = 'AsiaNet'
        item['word_count'] = get_text_word_count(data)
        item['body_html'] = '<pre>' + to_ascii(html.escape(data)) + '</pre>'

        return item
    except Exception as e:
        raise AAPParserError.AsiaNetParserError(file_path, e)
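
# For context, a minimal driver for the parser above might look like the
# following sketch. The module path and class name (AsiaNetFeedParser) are
# assumptions for illustration; the source only shows the parse() method.
from aap.io.feed_parsers.asianet import AsiaNetFeedParser  # assumed import path

parser = AsiaNetFeedParser()
item = parser.parse('/tmp/asianet_sample.tst')  # hypothetical sample file
print(item['guid'], item['word_count'])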
def broadcast_auto_publish(item, **kwargs):
    """Broadcast auto publish macro.

    :param item:
    :param kwargs:
    :return:
    """
    if item.get(ITEM_TYPE) != CONTENT_TYPE.TEXT or item.get(FORMAT) != FORMATS.HTML:
        return

    formatter = AAPBulletinBuilderFormatter()
    body_text = formatter.get_text_content(formatter.append_body_footer(item))
    word_count = get_text_word_count(body_text)
    max_word_count = config.MIN_BROADCAST_TEXT_WORD_COUNT
    item['genre'] = [{'name': 'Broadcast Script', 'qcode': 'Broadcast Script'}]

    if item[ITEM_STATE] not in {CONTENT_STATE.KILLED, CONTENT_STATE.RECALLED} and \
            not (item.get('flags') or {}).get('marked_for_legal'):
        if word_count > max_word_count and \
                not (item.get('flags') or {}).get('marked_for_legal'):
            # rebuild the body one paragraph at a time and truncate it at the
            # first paragraph that takes it over the broadcast word limit
            lines = body_text.splitlines()
            new_body_html = []
            for line in lines:
                para = line.strip()
                if not para:
                    continue
                new_body_html.append('<p>{}</p>'.format(para))
                word_count = get_text_word_count(''.join(new_body_html))
                if word_count > max_word_count:
                    if len(new_body_html):
                        item['body_html'] = ''.join(new_body_html)
                        item['word_count'] = word_count
                    break
    elif item[ITEM_STATE] in {CONTENT_STATE.KILLED, CONTENT_STATE.RECALLED}:
        lines = body_text.splitlines()
        lines = ['<p>{}</p>'.format(line.strip()) for line in lines if line.strip()]
        # remove the first line/paragraph of kill message
        lines = lines[1:]
        item['body_html'] = ''.join(lines)
        fields_to_remove = ['embargo', 'dateline', 'slugline', 'genre']
        for field in fields_to_remove:
            item.pop(field, None)

    internal_destination_auto_publish(item, **kwargs)
def broadcast_auto_publish(item, **kwargs):
    """Broadcast auto publish macro.

    :param item:
    :param kwargs:
    :return:
    """
    if item.get(ITEM_TYPE) != CONTENT_TYPE.TEXT or item.get(FORMAT) != FORMATS.HTML:
        return

    max_word_count = config.MIN_BROADCAST_TEXT_WORD_COUNT
    item['genre'] = [{'name': 'Broadcast Script', 'qcode': 'Broadcast Script'}]

    if item[ITEM_STATE] not in {CONTENT_STATE.KILLED, CONTENT_STATE.RECALLED} and \
            not (item.get('flags') or {}).get('marked_for_legal'):
        formatter = AAPBulletinBuilderFormatter()
        body_text = formatter.get_text_content(formatter.append_body_footer(item))
        word_count = get_text_word_count(body_text)
        if word_count > max_word_count and \
                not (item.get('flags') or {}).get('marked_for_legal'):
            lines = body_text.splitlines()
            new_body_html = []
            for line in lines:
                para = line.strip()
                if not para:
                    continue
                new_body_html.append('<p>{}</p>'.format(para))
                word_count = get_text_word_count(''.join(new_body_html))
                if word_count > max_word_count:
                    if len(new_body_html):
                        item['body_html'] = ''.join(new_body_html)
                        item['word_count'] = word_count
                    break

    internal_destination_auto_publish(item, **kwargs)
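
# The heart of both broadcast_auto_publish variants above is the loop that
# accumulates <p> paragraphs until the broadcast word limit is crossed. A
# self-contained sketch of just that loop, approximating get_text_word_count()
# with a plain whitespace split (an assumption; the real helper strips markup):
def truncate_to_word_limit(body_text, max_word_count):
    """Keep non-empty lines as <p> paragraphs up to and including the one
    that crosses the word limit, mirroring the loop above."""
    new_body_html = []
    for line in body_text.splitlines():
        para = line.strip()
        if not para:
            continue
        new_body_html.append('<p>{}</p>'.format(para))
        # stand-in word count: tokens inside the accumulated paragraphs
        word_count = sum(len(p[3:-4].split()) for p in new_body_html)
        if word_count > max_word_count:
            break
    return ''.join(new_body_html)

# e.g. truncate_to_word_limit('one two\n\nthree four five\nsix seven', 4)
# returns '<p>one two</p><p>three four five</p>': the paragraph that crossed
# the limit is kept, exactly as in the macro above.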
def _transform_to_ninjs(self, article, subscriber, recursive=True):
    ninjs = {
        'guid': article.get(GUID_FIELD, article.get('uri')),
        'version': str(article.get(config.VERSION, 1)),
        'type': self._get_type(article)
    }

    if article.get('byline'):
        ninjs['byline'] = article['byline']

    located = article.get('dateline', {}).get('located', {})
    if located:
        ninjs['located'] = located.get('city', '')

    for copy_property in self.direct_copy_properties:
        if article.get(copy_property) is not None:
            ninjs[copy_property] = article[copy_property]

    if 'body_text' not in article and 'alt_text' in article:
        ninjs['body_text'] = article['alt_text']

    if 'title' in article:
        ninjs['headline'] = article['title']

    if article.get('body_html'):
        ninjs['body_html'] = self.append_body_footer(article)

    if article.get('description'):
        ninjs['description_html'] = self.append_body_footer(article)

    if article.get('place'):
        ninjs['place'] = self._format_place(article)

    if article.get('profile'):
        ninjs['profile'] = self._format_profile(article['profile'])

    if recursive:
        if article[ITEM_TYPE] == CONTENT_TYPE.COMPOSITE:
            ninjs[ASSOCIATIONS] = self._get_associations(article, subscriber)
            if article.get(ASSOCIATIONS):
                ninjs[ASSOCIATIONS].update(self._format_related(article, subscriber))
        elif article.get(ASSOCIATIONS):
            ninjs[ASSOCIATIONS] = self._format_related(article, subscriber)
    elif article.get(ASSOCIATIONS):
        ninjs[ASSOCIATIONS] = self._format_related(article, subscriber)

    if article.get(EMBARGO):
        ninjs['embargoed'] = get_utc_schedule(article, EMBARGO).isoformat()

    if article.get('priority'):
        ninjs['priority'] = article['priority']
    else:
        ninjs['priority'] = 5

    if article.get('subject'):
        ninjs['subject'] = self._get_subject(article)

    if article.get('anpa_category'):
        ninjs['service'] = self._get_service(article)

    if article.get('renditions'):
        ninjs['renditions'] = self._get_renditions(article)
    elif 'url' in article:
        ninjs['renditions'] = self._generate_renditions(article)

    # SDPA-317
    if 'abstract' in article:
        abstract = article.get('abstract', '')
        ninjs['description_html'] = abstract
        ninjs['description_text'] = text_utils.get_text(abstract)
    elif article.get('description_text'):
        ninjs['description_text'] = article.get('description_text')

    if article.get('company_codes'):
        ninjs['organisation'] = [{'name': c.get('name', ''),
                                  'rel': 'Securities Identifier',
                                  'symbols': [{'ticker': c.get('qcode', ''),
                                               'exchange': c.get('security_exchange', '')}]}
                                 for c in article['company_codes']]
    elif 'company' in article:
        ninjs['organisation'] = [{'name': article['company']}]

    if article.get('rewrite_of'):
        ninjs['evolvedfrom'] = article['rewrite_of']

    if not ninjs.get('copyrightholder') and not ninjs.get('copyrightnotice') and not ninjs.get('usageterms'):
        ninjs.update(superdesk.get_resource_service('vocabularies').get_rightsinfo(article))

    if 'genre' in article:
        ninjs['genre'] = self._get_genre(article)

    if article.get('flags', {}).get('marked_for_legal'):
        ninjs['signal'] = self._format_signal_cwarn()

    if article.get('attachments'):
        ninjs['attachments'] = self._format_attachments(article)

    if ninjs['type'] == CONTENT_TYPE.TEXT and ('body_html' in ninjs or 'body_text' in ninjs):
        if 'body_html' in ninjs:
            body_html = ninjs['body_html']
            word_count = text_utils.get_word_count(body_html)
            char_count = text_utils.get_char_count(body_html)
            readtime = text_utils.get_reading_time(body_html, word_count, article.get('language'))
        else:
            body_text = ninjs['body_text']
            word_count = text_utils.get_text_word_count(body_text)
            char_count = len(body_text)
            readtime = text_utils.get_reading_time(body_text, word_count, article.get('language'))
        ninjs['charcount'] = char_count
        ninjs['wordcount'] = word_count
        ninjs['readtime'] = readtime

    if article.get('authors'):
        ninjs['authors'] = self._format_authors(article)

    return ninjs
def _transform_to_ninjs(self, article, subscriber, recursive=True):
    ninjs = {
        "guid": article.get(GUID_FIELD, article.get("uri")),
        "version": str(article.get(config.VERSION, 1)),
        "type": self._get_type(article),
    }

    if article.get("byline"):
        ninjs["byline"] = article["byline"]

    located = article.get("dateline", {}).get("located", {})
    if located:
        ninjs["located"] = located.get("city", "")

    for copy_property in self.direct_copy_properties:
        if article.get(copy_property) is not None:
            ninjs[copy_property] = article[copy_property]

    if "body_text" not in article and "alt_text" in article:
        ninjs["body_text"] = article["alt_text"]

    if "title" in article:
        ninjs["headline"] = article["title"]

    if article.get("body_html"):
        ninjs["body_html"] = self.append_body_footer(article)

    if article.get("description"):
        ninjs["description_html"] = self.append_body_footer(article)

    if article.get("place"):
        ninjs["place"] = self._format_place(article)

    if article.get("profile"):
        ninjs["profile"] = self._format_profile(article["profile"])

    extra_items = None
    if recursive:
        if article[ITEM_TYPE] == CONTENT_TYPE.COMPOSITE:
            ninjs[ASSOCIATIONS] = self._get_associations(article, subscriber)
            if article.get(ASSOCIATIONS):
                associations, extra_items = self._format_related(article, subscriber)
                ninjs[ASSOCIATIONS].update(associations)
        elif article.get(ASSOCIATIONS):
            ninjs[ASSOCIATIONS], extra_items = self._format_related(article, subscriber)
    elif article.get(ASSOCIATIONS):
        ninjs[ASSOCIATIONS], extra_items = self._format_related(article, subscriber)

    if extra_items:
        ninjs.setdefault(EXTRA_ITEMS, {}).update(extra_items)

    if article.get("embargoed"):
        ninjs["embargoed"] = article["embargoed"].isoformat()
    if article.get(EMBARGO):  # embargo set in superdesk overrides ingested one
        ninjs["embargoed"] = get_utc_schedule(article, EMBARGO).isoformat()

    if article.get("priority"):
        ninjs["priority"] = article["priority"]
    else:
        ninjs["priority"] = 5

    if article.get("subject"):
        ninjs["subject"] = self._get_subject(article)

    if article.get("anpa_category"):
        ninjs["service"] = self._get_service(article)

    if article.get("renditions"):
        ninjs["renditions"] = self._get_renditions(article)
    elif "url" in article:
        ninjs["renditions"] = self._generate_renditions(article)

    if "order" in article:
        ninjs["order"] = article["order"]

    # SDPA-317
    if "abstract" in article:
        abstract = article.get("abstract", "")
        ninjs["description_html"] = abstract
        ninjs["description_text"] = text_utils.get_text(abstract)
    elif article.get("description_text"):
        ninjs["description_text"] = article.get("description_text")

    if article.get("company_codes"):
        ninjs["organisation"] = [
            {
                "name": c.get("name", ""),
                "rel": "Securities Identifier",
                "symbols": [{"ticker": c.get("qcode", ""), "exchange": c.get("security_exchange", "")}],
            }
            for c in article["company_codes"]
        ]
    elif "company" in article:
        ninjs["organisation"] = [{"name": article["company"]}]

    if article.get("rewrite_of"):
        ninjs["evolvedfrom"] = article["rewrite_of"]

    if not ninjs.get("copyrightholder") and not ninjs.get("copyrightnotice") and not ninjs.get("usageterms"):
        ninjs.update(superdesk.get_resource_service("vocabularies").get_rightsinfo(article))

    if article.get("genre"):
        ninjs["genre"] = self._get_genre(article)

    if article.get("flags", {}).get("marked_for_legal"):
        ninjs["signal"] = self._format_signal_cwarn()

    if article.get("signal"):
        ninjs.setdefault("signal", []).extend([self._format_signal(signal) for signal in article["signal"]])

    if article.get("attachments"):
        ninjs["attachments"] = self._format_attachments(article)

    if ninjs["type"] == CONTENT_TYPE.TEXT and ("body_html" in ninjs or "body_text" in ninjs):
        if "body_html" in ninjs:
            body_html = ninjs["body_html"]
            word_count = text_utils.get_word_count(body_html)
            char_count = text_utils.get_char_count(body_html)
            readtime = text_utils.get_reading_time(body_html, word_count, article.get("language"))
        else:
            body_text = ninjs["body_text"]
            word_count = text_utils.get_text_word_count(body_text)
            char_count = len(body_text)
            readtime = text_utils.get_reading_time(body_text, word_count, article.get("language"))
        ninjs["charcount"] = char_count
        ninjs["wordcount"] = word_count
        ninjs["readtime"] = readtime

    if article.get("authors"):
        ninjs["authors"] = self._format_authors(article)

    if (article.get("schedule_settings") or {}).get("utc_publish_schedule"):
        ninjs["publish_schedule"] = article["schedule_settings"]["utc_publish_schedule"]

    # set description for custom embed field
    if article.get("extra"):
        ninjs["extra"] = article["extra"]
        for key, value in ninjs["extra"].items():
            if isinstance(value, dict) and "embed" in value:
                value.setdefault("description", "")

    return ninjs
def _transform_to_ninjs(self, article, subscriber, recursive=True):
    ninjs = {
        'guid': article.get(GUID_FIELD, article.get('uri')),
        'version': str(article.get(config.VERSION, 1)),
        'type': self._get_type(article)
    }

    if article.get('byline'):
        ninjs['byline'] = article['byline']

    located = article.get('dateline', {}).get('located', {})
    if located:
        ninjs['located'] = located.get('city', '')

    for copy_property in self.direct_copy_properties:
        if article.get(copy_property) is not None:
            ninjs[copy_property] = article[copy_property]

    if 'body_text' not in article and 'alt_text' in article:
        ninjs['body_text'] = article['alt_text']

    if 'title' in article:
        ninjs['headline'] = article['title']

    if article.get('body_html'):
        ninjs['body_html'] = self.append_body_footer(article)

    if article.get('description'):
        ninjs['description_html'] = self.append_body_footer(article)

    if article.get('place'):
        ninjs['place'] = self._format_place(article)

    if article.get('profile'):
        ninjs['profile'] = self._format_profile(article['profile'])

    extra_items = None
    if recursive:
        if article[ITEM_TYPE] == CONTENT_TYPE.COMPOSITE:
            ninjs[ASSOCIATIONS] = self._get_associations(article, subscriber)
            if article.get(ASSOCIATIONS):
                associations, extra_items = self._format_related(article, subscriber)
                ninjs[ASSOCIATIONS].update(associations)
        elif article.get(ASSOCIATIONS):
            ninjs[ASSOCIATIONS], extra_items = self._format_related(article, subscriber)
    elif article.get(ASSOCIATIONS):
        ninjs[ASSOCIATIONS], extra_items = self._format_related(article, subscriber)

    if extra_items:
        ninjs.setdefault(EXTRA_ITEMS, {}).update(extra_items)

    if article.get(EMBARGO):
        ninjs['embargoed'] = get_utc_schedule(article, EMBARGO).isoformat()

    if article.get('priority'):
        ninjs['priority'] = article['priority']
    else:
        ninjs['priority'] = 5

    if article.get('subject'):
        ninjs['subject'] = self._get_subject(article)

    if article.get('anpa_category'):
        ninjs['service'] = self._get_service(article)

    if article.get('renditions'):
        ninjs['renditions'] = self._get_renditions(article)
    elif 'url' in article:
        ninjs['renditions'] = self._generate_renditions(article)

    # SDPA-317
    if 'abstract' in article:
        abstract = article.get('abstract', '')
        ninjs['description_html'] = abstract
        ninjs['description_text'] = text_utils.get_text(abstract)
    elif article.get('description_text'):
        ninjs['description_text'] = article.get('description_text')

    if article.get('company_codes'):
        ninjs['organisation'] = [{'name': c.get('name', ''),
                                  'rel': 'Securities Identifier',
                                  'symbols': [{'ticker': c.get('qcode', ''),
                                               'exchange': c.get('security_exchange', '')}]}
                                 for c in article['company_codes']]
    elif 'company' in article:
        ninjs['organisation'] = [{'name': article['company']}]

    if article.get('rewrite_of'):
        ninjs['evolvedfrom'] = article['rewrite_of']

    if not ninjs.get('copyrightholder') and not ninjs.get('copyrightnotice') and not ninjs.get('usageterms'):
        ninjs.update(superdesk.get_resource_service('vocabularies').get_rightsinfo(article))

    if 'genre' in article:
        ninjs['genre'] = self._get_genre(article)

    if article.get('flags', {}).get('marked_for_legal'):
        ninjs['signal'] = self._format_signal_cwarn()

    if article.get('attachments'):
        ninjs['attachments'] = self._format_attachments(article)

    if ninjs['type'] == CONTENT_TYPE.TEXT and ('body_html' in ninjs or 'body_text' in ninjs):
        if 'body_html' in ninjs:
            body_html = ninjs['body_html']
            word_count = text_utils.get_word_count(body_html)
            char_count = text_utils.get_char_count(body_html)
            readtime = text_utils.get_reading_time(body_html, word_count, article.get('language'))
        else:
            body_text = ninjs['body_text']
            word_count = text_utils.get_text_word_count(body_text)
            char_count = len(body_text)
            readtime = text_utils.get_reading_time(body_text, word_count, article.get('language'))
        ninjs['charcount'] = char_count
        ninjs['wordcount'] = word_count
        ninjs['readtime'] = readtime

    if article.get('authors'):
        ninjs['authors'] = self._format_authors(article)

    return ninjs
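
# For reference, the shape of the NinJS document these transforms emit for a
# plain text article looks roughly like this; all field values below are
# illustrative, not taken from the source.
ninjs_example = {
    'guid': 'urn:newsml:localhost:2020-01-01T00:00:00.0:abc123',  # illustrative
    'version': '2',
    'type': 'text',
    'headline': 'Example headline',
    'body_html': '<p>Example body</p>',
    'priority': 5,      # the default applied when the article carries none
    'wordcount': 2,
    'charcount': 12,
    'readtime': 0,
}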
def _process_bunch(self, x):
    # x.findall('dc_rest_docs/dc_rest_doc')[0].get('href')
    items = []
    for doc in x.findall('dc_rest_docs/dc_rest_doc'):
        try:
            # print(doc.get('href'))
            id = doc.find('dcdossier').get('id')
            # track the highest/lowest id seen, depending on scan direction
            if self._direction:
                if int(id) > self._id:
                    self._id = int(id)
            else:
                if int(id) < self._id:
                    self._id = int(id)

            item = {}
            item['guid'] = doc.find('dcdossier').get('guid')
            item[ITEM_TYPE] = CONTENT_TYPE.TEXT
            format = self._get_head_value(doc, 'Format')
            if format == 't':
                item[FORMAT] = FORMATS.PRESERVED
            else:
                item[FORMAT] = FORMATS.HTML

            # if the item has been modified in the archive then it is due to a kill;
            # there is an argument that this item should not be imported at all
            if doc.find('dcdossier').get('created') != doc.find('dcdossier').get('modified'):
                # item[ITEM_STATE] = CONTENT_STATE.KILLED
                continue
            else:
                item[ITEM_STATE] = CONTENT_STATE.PUBLISHED

            value = datetime.strptime(self._get_head_value(doc, 'PublicationDate'), '%Y%m%d%H%M%S')
            local_tz = pytz.timezone('Australia/Sydney')
            try:
                aus_dt = local_tz.localize(value, is_dst=None)
            except NonExistentTimeError:
                aus_dt = local_tz.localize(value, is_dst=True)
            except AmbiguousTimeError:
                aus_dt = local_tz.localize(value, is_dst=False)
            item['firstcreated'] = aus_dt.astimezone(pytz.utc)
            item['versioncreated'] = item['firstcreated']

            generate_unique_id_and_name(item)
            item['ingest_id'] = id

            last_line = None
            el = doc.find('dcdossier/document/body/BodyText')
            if el is not None:
                story = el.text
                lines = story.split('\n')
                if len(lines) > 0:
                    last_line = lines[-1]
                if item.get(FORMAT) == FORMATS.HTML:
                    story = story.replace('\n ', '<p></p>')
                    story = story.replace('\n', '<br>')
                    item['body_html'] = '<p>' + story + '</p>'
                else:
                    item['body_html'] = '<pre>' + story + '</pre>'
                try:
                    item['word_count'] = get_text_word_count(item['body_html'])
                except Exception:
                    pass
            else:
                # items with no body are ignored
                continue

            item['source'] = self._get_head_value(doc, 'Agency')
            # if the source document contains no agency then by definition it is unknown
            if item['source'] is None:
                item['source'] = 'UNKNOWN'
            else:
                # check if the source of the document was Newscentre
                dc_unique = doc.find('dcdossier').get('unique')
                if dc_unique.startswith('NC.') and last_line is not None:
                    # the AFR summary articles all have agency values 25 chars long
                    if len(item['source']) == 25:
                        item['source'] = 'AAP'
                    # is it a numeric agency
                    elif self._get_head_value(doc, 'Agency').isdigit():
                        sign_off = last_line.split(' ')
                        if len(sign_off) > 0:
                            item['source'] = sign_off[0].upper()
                        else:
                            item['source'] = last_line.upper()
                        # clean up what we have extracted
                        if item['source'].startswith('AAP'):
                            item['source'] = 'AAP'
                        else:
                            # make sure it is one of the known values
                            if item['source'] not in {'AAP', 'AP', 'REUT', 'Asia Pulse', 'DPA', 'AFP', 'RAW',
                                                      'NZA', 'NZPA', 'KRT', 'PA', 'PAA', 'SNI', 'REUTERS'}:
                                print('Source : {}'.format(item['source']))
                                item['source'] = 'UNKNOWN'

            # self._addkeywords('AsiaPulseCodes', doc, item)

            byline = self._get_head_value(doc, 'Byline')
            if byline:
                item['byline'] = byline

            # item['service'] = self._get_head_value(doc, 'Service')

            category = self._get_head_value(doc, 'Category')
            if not category:
                publication_name = self._get_head_value(doc, 'PublicationName')
                if publication_name in pubnames:
                    category = pubnames[publication_name]
            if category:
                anpacategory = {}
                anpacategory['qcode'] = category
                for anpa_category in self._anpa_categories['items']:
                    if anpacategory['qcode'].lower() == anpa_category['qcode'].lower():
                        anpacategory = {'qcode': anpacategory['qcode'], 'name': anpa_category['name']}
                        break
                item['anpa_category'] = [anpacategory]

            self._addkeywords('CompanyCodes', doc, item)

            item['keyword'] = self._get_head_value(doc, 'Keyword')
            item['ingest_provider_sequence'] = self._get_head_value(doc, 'Sequence')

            original_source = self._get_head_value(doc, 'Author')
            if original_source:
                item['original_source'] = original_source

            item['headline'] = self._get_head_value(doc, 'Headline')

            code = self._get_head_value(doc, 'SubjectRefNum')
            if code and len(code) == 7:
                code = '0' + code
            if code and code in subject_codes:
                item['subject'] = []
                item['subject'].append({'qcode': code, 'name': subject_codes[code]})
                try:
                    process_iptc_codes(item, None)
                except Exception:
                    pass

            slug = self._get_head_value(doc, 'SLUG')
            if slug:
                item['slugline'] = slug
            else:
                item['slugline'] = self._get_head_value(doc, 'Keyword')

            take_key = self._get_head_value(doc, 'Takekey')
            if take_key:
                item['anpa_take_key'] = take_key

            self._addkeywords('Topic', doc, item)
            # self._addkeywords('Selectors', doc, item)

            item['pubstatus'] = 'usable'
            # this is required for the archived service additional lookup
            item['item_id'] = item['guid']
            item[config.VERSION] = 1
            item['flags'] = {'marked_archived_only': True}

            # item['_id'] = ObjectId(id.rjust(24, '0'))
            item['_id'] = ObjectId()
            items.append(item)

            if self._limit:
                self._limit -= 1
            # print(item)
        except Exception:
            print('Exception parsing DC document {}'.format(id))

    try:
        res = superdesk.get_resource_service('archived')
        s = time.time()
        res.post(items)
        print('Posting batch to Superdesk took {:.2f} seconds'.format(time.time() - s))
    except Exception as ex:
        if ex.code == 409:
            print('Key clash exception detected')
            # create a list of the guids we tried to post
            guids = [g['guid'] for g in items]
            # create a query for all those ids; the terms filter takes the flat list of guids
            query = {
                'size': self.BATCH_SIZE,
                'query': {
                    'filtered': {
                        'filter': {
                            'terms': {
                                'guid': guids
                            }
                        }
                    }
                }
            }
            req = ParsedRequest()
            repos = 'archived'
            req.args = {'source': json.dumps(query), 'repo': repos}
            search_res = superdesk.get_resource_service('search')
            existing = search_res.get(req=req, lookup=None)
            existing_guids = [e['guid'] for e in existing]
            not_existing = [g for g in guids if g not in existing_guids]
            for missing_guid in not_existing:
                i = [m for m in items if m['guid'] == missing_guid]
                original = res.find_one(req=None, guid=i[0]['guid'])
                if not original:
                    try:
                        s = time.time()
                        res.post(i)
                        print('Posted single item to Superdesk in {:.2f} seconds'.format(time.time() - s))
                    except Exception:
                        print('Exception posting single item')
        else:
            print('Exception posting batch')
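
# The duplicate-recovery path above builds an elasticsearch filtered query over
# the posted guids. A standalone sketch of that construction (the 'filtered'
# syntax matches the legacy ES 1.x/2.x DSL used here; note the terms filter
# takes the flat list of guids):
import json

def build_guid_query(guids, size):
    return {
        'size': size,
        'query': {
            'filtered': {
                'filter': {
                    'terms': {'guid': guids}  # flat list, not [guids]
                }
            }
        }
    }

# print(json.dumps(build_guid_query(['guid-1', 'guid-2'], size=200), indent=2))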