示例#1
0
    def parse(self, file_path, provider=None):
        """Parse an AsiaNet media release file into an HTML text item.

        The file is read as windows-1252 with carriage returns removed. Its
        first blank-line-delimited section is handed to ``_process_header``.
        The body is everything from the first ``'MEDIA RELEASE '`` marker
        onwards: it is HTML-escaped, passed through ``to_ascii``, and split
        into ``<p>`` paragraphs on blank lines.

        :param str file_path: path of the file to parse
        :param provider: not used by this method
        :return: the populated item
        :rtype: dict
        :raises: the error built by ``AAPParserError.AsiaNetParserError``,
            chained to whatever exception interrupted parsing
        """
        try:
            item = {
                'guid': '{}-{}'.format(file_path, uuid.uuid4()),
                'pubstatus': 'usable',
                'versioncreated': utcnow(),
                ITEM_TYPE: CONTENT_TYPE.TEXT,
                FORMAT: FORMATS.HTML,
            }

            with open(file_path, 'r', encoding='windows-1252') as f:
                data = f.read().replace('\r', '')

            # Only the header section is used here, but the three-way
            # unpacking is kept so that a file with fewer than three
            # sections is still rejected.
            header, _, _ = data.split('\n\n', 2)
            self._process_header(item, header)

            # Drop everything before the body marker; ``str.index`` raises
            # ValueError when the marker is missing, which is reported as a
            # parser error below.
            start_of_body = 'MEDIA RELEASE '
            data = data[data.index(start_of_body):]

            item['anpa_category'] = [{'qcode': 'j'}]
            item['original_source'] = 'AsiaNet'

            # Blank lines become paragraph breaks, remaining newlines spaces.
            escaped = to_ascii(html.escape(data))
            paragraphs = escaped.replace('\n\n', '</p><p>').replace('\n', ' ')
            item['body_html'] = '<p>' + paragraphs + '</p>'
            item['word_count'] = get_word_count(item['body_html'])

            return item
        except Exception as e:
            raise AAPParserError.AsiaNetParserError(file_path, e) from e
示例#2
0
    def parse(self, file_path, provider=None):
        """Parse an AsiaNet file into a preformatted text item.

        The file is read as windows-1252 with carriage returns removed and
        must contain three blank-line-delimited sections: header, dateline
        and body. The first two are handed to ``_process_header`` and
        ``_process_dateline``; the body is HTML-escaped and wrapped in
        ``<pre>`` tags.

        :param str file_path: path of the file to parse
        :param provider: not used by this method
        :return: the populated item
        :rtype: dict
        :raises: the error built by ``AAPParserError.AsiaNetParserError``
            for any exception raised while parsing
        """
        try:
            item = {}
            item['guid'] = str(file_path) + '-' + str(uuid.uuid4())
            item['pubstatus'] = 'usable'
            item['versioncreated'] = utcnow()
            item[ITEM_TYPE] = CONTENT_TYPE.TEXT
            item[FORMAT] = FORMATS.PRESERVED

            with open(file_path, 'r', encoding='windows-1252') as stream:
                raw_text = stream.read()
            text = raw_text.replace('\r', '')

            # Unpacking fails unless exactly three sections are present.
            sections = text.split('\n\n', 2)
            header, dateline_data, body = sections

            self._process_header(item, header)
            self._process_dateline(item, dateline_data)

            item['original_source'] = 'AsiaNet'
            item['word_count'] = get_text_word_count(body)
            item['body_html'] = '<pre>{}</pre>'.format(html.escape(body))

            return item
        except Exception as err:
            raise AAPParserError.AsiaNetParserError(file_path, err)
示例#3
0
        :param dict item: The item where the data will be stored
        :param str header: The header of the file
        """
        source = 'anpa_take_key'
        for line in header.split('\n'):
            if line.lower().startswith('media release'):
                break

            if source not in item:
                item[source] = line
            else:
                item[source] += line

        # Clean up the header entries
        item['anpa_take_key'] = item['anpa_take_key'][8:].replace('\n',
                                                                  '').strip()
        item['headline'] = 'Media Release: ' + item.get('anpa_take_key', '')
        item['slugline'] = 'AAP Medianet'
        self._truncate_headers(item)


# Register a parser instance under the class's NAME. An AlreadyExistsError
# from the registry is deliberately ignored (presumably raised when this
# module is loaded more than once -- confirm against the registry).
try:
    register_feed_parser(AsiaNetFeedParser.NAME, AsiaNetFeedParser())
except AlreadyExistsError:
    pass

# Register the AsiaNet parser error's description under the 'file' key
# (presumably the file feeding service -- confirm against the registry).
register_feeding_service_error(
    'file',
    AAPParserError.AsiaNetParserError().get_error_description())