def parse(self, file_path, provider=None):
    """Parse an AsiaNet media release file into a text item with an HTML body.

    The file is read as windows-1252 with carriage returns removed. It must
    contain at least three sections separated by blank lines; the first
    section is handed to ``_process_header``. Everything from the first
    ``'MEDIA RELEASE '`` marker onwards becomes the body: it is HTML-escaped,
    passed through ``to_ascii`` and wrapped in ``<p>`` elements, with blank
    lines acting as paragraph breaks and remaining newlines replaced by
    spaces.

    :param str file_path: path of the file to parse
    :param provider: not used by this method
    :return: the populated item
    :rtype: dict
    :raises AAPParserError.AsiaNetParserError: if any step fails, including
        an unreadable file, fewer than three sections, or a missing
        ``'MEDIA RELEASE '`` marker
    """
    try:
        item = {
            'guid': '{}-{}'.format(file_path, uuid.uuid4()),
            'pubstatus': 'usable',
            'versioncreated': utcnow(),
            ITEM_TYPE: CONTENT_TYPE.TEXT,
            FORMAT: FORMATS.HTML,
        }

        with open(file_path, 'r', encoding='windows-1252') as f:
            data = f.read().replace('\r', '')

        # Only the header section is used here; the three-way unpack is kept
        # because it rejects files with fewer than three sections.
        header, _, _ = data.split('\n\n', 2)
        self._process_header(item, header)

        # The body starts at the marker itself. str.index raises ValueError
        # when the marker is absent, which is reported as a parser error below.
        start_of_body = 'MEDIA RELEASE '
        data = data[data.index(start_of_body):]

        item['anpa_category'] = [{'qcode': 'j'}]
        item['original_source'] = 'AsiaNet'

        body_html = to_ascii(html.escape(data)).replace('\n\n', '</p><p>').replace('\n', ' ')
        item['body_html'] = '<p>' + body_html + '</p>'
        item['word_count'] = get_word_count(item['body_html'])

        return item
    except Exception as e:
        raise AAPParserError.AsiaNetParserError(file_path, e)
def parse(self, file_path, provider=None):
    """Build a preformatted text item from an AsiaNet file.

    The file is read as windows-1252 and stripped of carriage returns, then
    split on blank lines into a header, a dateline and the remaining body.
    The header goes to ``_process_header`` and the dateline to
    ``_process_dateline``; the body is HTML-escaped and stored inside a
    ``<pre>`` element, with its word count taken from the unescaped text.

    :param str file_path: path of the file to parse
    :param provider: not used by this method
    :return: the populated item
    :rtype: dict
    :raises AAPParserError.AsiaNetParserError: if any step fails, including
        an unreadable file or fewer than three blank-line separated sections
    """
    try:
        guid = '{path}-{uid}'.format(path=file_path, uid=uuid.uuid4())
        item = {
            'guid': guid,
            'pubstatus': 'usable',
            'versioncreated': utcnow(),
            ITEM_TYPE: CONTENT_TYPE.TEXT,
            FORMAT: FORMATS.PRESERVED,
        }

        with open(file_path, 'r', encoding='windows-1252') as source_file:
            raw_text = source_file.read()

        # Exactly three sections are expected; the unpack fails otherwise.
        sections = raw_text.replace('\r', '').split('\n\n', 2)
        header, dateline, body = sections

        self._process_header(item, header)
        self._process_dateline(item, dateline)

        item['original_source'] = 'AsiaNet'
        item['word_count'] = get_text_word_count(body)
        item['body_html'] = '<pre>{}</pre>'.format(html.escape(body))
        return item
    except Exception as error:
        raise AAPParserError.AsiaNetParserError(file_path, error)
:param dict item: The item where the data will be stored :param str header: The header of the file """ source = 'anpa_take_key' for line in header.split('\n'): if line.lower().startswith('media release'): break if source not in item: item[source] = line else: item[source] += line # Clean up the header entries item['anpa_take_key'] = item['anpa_take_key'][8:].replace('\n', '').strip() item['headline'] = 'Media Release: ' + item.get('anpa_take_key', '') item['slugline'] = 'AAP Medianet' self._truncate_headers(item) try: register_feed_parser(AsiaNetFeedParser.NAME, AsiaNetFeedParser()) except AlreadyExistsError: pass register_feeding_service_error( 'file', AAPParserError.AsiaNetParserError().get_error_description())