示例#1
0
    def parse(self, filename, provider=None):
        """
        Parse a News Bites text file into an item dictionary.

        The whole file is read and matched against a regular expression assembled from
        ``self.field_list``: a leading capture line followed by one "<label><capture>" line per
        entry, where ``entry[0]`` is the label. For every entry whose ``entry[1]`` is not None the
        regex group ``entry[2]`` is stored in the item under the key ``entry[1]``.

        :param filename: path of the file to parse (read as windows-1252)
        :param provider: only passed through to the raised parser error
        :return: the populated item dictionary
        :raises AAPParserError.NewsBitesParserError: wraps any exception raised while parsing
        """
        try:
            item = {}
            self.set_item_defaults(item, filename)

            # read the whole file into a single string; the handle is released before matching
            with open(filename, 'r', encoding='windows-1252') as source_file:
                content = source_file.read()

            # Construct pattern for the regular expression
            pattern = '(.*)\n' + ''.join(field[0] + '(.*)\n' for field in self.field_list)
            match = re.match(pattern, content, re.MULTILINE | re.DOTALL)
            # NOTE(review): when the pattern does not match, no parsed field is copied and the
            # formatting below works on the defaults only - confirm that this path is meant to
            # end in the parser error raised by the handler at the bottom.
            if match:
                for field in self.field_list:
                    if field[1] is not None:
                        item[field[1]] = match.group(field[2])

            # fix the formatting
            item[self.ITEM_VERSION_CREATED] = self.datetime(item[self.ITEM_VERSION_CREATED])
            # escape the text first, then turn every newline into a paragraph boundary
            escaped_body = html.escape(item[self.ITEM_BODY_HTML].strip())
            item[self.ITEM_BODY_HTML] = '<p>' + escaped_body.replace('\n', '</p><p>') + '</p>'
            item.setdefault('word_count', get_word_count(item['body_html']))

            return item
        except Exception as ex:
            # Chain explicitly so the original failure stays attached to the parser error.
            raise AAPParserError.NewsBitesParserError(exception=ex,
                                                      provider=provider) from ex
示例#2
0
            'qcode': '04000000',
            'name': subject_codes['04000000']
        }]
        item[FORMAT] = FORMATS.HTML

    def datetime(self, string):
        """
        Turn the date string read from the source file into a UTC datetime. The value is
        interpreted as local time in Sydney, Australia.

        The month may be spelled out in full ("06 June 2016 14:00:00") or abbreviated
        ("06 Jun 2016 14:00:00"); the full form is tried first.

        :param string: date/time text in "day month year HH:MM:SS" form
        :return: timezone-aware datetime converted to UTC
        """
        strptime = datetime.datetime.strptime
        try:
            naive_local = strptime(string, '%d %B %Y %H:%M:%S')
        except ValueError:
            # fall back to the abbreviated month name; a second failure propagates
            naive_local = strptime(string, '%d %b %Y %H:%M:%S')

        sydney = pytz.timezone('Australia/Sydney')
        # is_dst=None makes pytz raise instead of guessing for ambiguous or non-existent
        # local times around daylight-saving transitions
        return sydney.localize(naive_local, is_dst=None).astimezone(pytz.utc)


# Register a parser instance under NewsBitesFeedParser.NAME at import time. An
# AlreadyExistsError from the registry is deliberately ignored (presumably so the module
# can be imported more than once - TODO confirm).
try:
    register_feed_parser(NewsBitesFeedParser.NAME, NewsBitesFeedParser())
except AlreadyExistsError:
    pass
# Register the description of the News Bites parser error for the 'file' feeding service.
register_feeding_service_error(
    'file',
    AAPParserError.NewsBitesParserError().get_error_description())
示例#3
0
    def set_item_defaults(self, item, filename):
        """
        Fill ``item`` in place with the default values used for every parsed file.

        :param item: dictionary that receives the defaults
        :param filename: name of the source file; used as the prefix of the generated guid
        :return: None
        """
        default_subject_code = '04000000'
        item.update({
            # the random uuid suffix gives every call a distinct guid for the same file
            'guid': ':'.join((filename, str(uuid.uuid4()))),
            'urgency': 5,
            'pubstatus': 'usable',
            'versioncreated': utcnow(),
            ITEM_TYPE: CONTENT_TYPE.TEXT,
            'anpa_category': [{'qcode': 'f'}],
            'subject': [{
                'qcode': default_subject_code,
                'name': subject_codes[default_subject_code],
            }],
            FORMAT: FORMATS.HTML,
        })

    def datetime(self, string):
        """
        Convert the date string parsed from the source file to a datetime, assumes that the
        time is local to Sydney Australia.

        Both the full month name ("06 June 2016 14:00:00") and the abbreviated one
        ("06 Jun 2016 14:00:00") are accepted; the full form is tried first.

        :param string: date/time text in "day month year HH:MM:SS" form
        :return: timezone-aware datetime converted to UTC
        :raises ValueError: if the string matches neither format
        """
        try:
            local_dt = datetime.datetime.strptime(string, '%d %B %Y %H:%M:%S')
        except ValueError:
            # Some files carry an abbreviated month name; retry before giving up.
            local_dt = datetime.datetime.strptime(string, '%d %b %Y %H:%M:%S')

        local_tz = pytz.timezone('Australia/Sydney')
        # is_dst=None makes pytz raise instead of guessing for ambiguous or non-existent
        # local times around daylight-saving transitions
        aus_dt = local_tz.localize(local_dt, is_dst=None)
        return aus_dt.astimezone(pytz.utc)


# Register a parser instance under NewsBitesFeedParser.NAME at import time. An
# AlreadyExistsError from the registry is deliberately ignored (presumably so the module
# can be imported more than once - TODO confirm).
try:
    register_feed_parser(NewsBitesFeedParser.NAME, NewsBitesFeedParser())
except AlreadyExistsError:
    pass
# Register the description of the News Bites parser error for the 'file' feeding service.
register_feeding_service_error(
    'file',
    AAPParserError.NewsBitesParserError().get_error_description())