def parse(self, filename, provider=None):
    """
    Build a news item from the file at ``filename``.

    The file is read as windows-1252 text and matched against a regular
    expression assembled from ``self.field_list``. Each entry of that list
    is indexed as ``[0]`` text placed in the pattern ahead of a capture
    group, ``[1]`` the item key to fill (``None`` to ignore the capture)
    and ``[2]`` the number of the capture group to copy.

    :param filename: path of the file to parse
    :param provider: handed to the error raised on failure
    :return: the populated item dictionary
    :raises AAPParserError.NewsBitesParserError: wraps any exception raised
        while reading or converting the file
    """
    try:
        item = {}
        self.set_item_defaults(item, filename)

        with open(filename, 'r', encoding='windows-1252') as source:
            # the expression spans several lines, so take the file as one string
            text = source.read()

        # one leading catch-all group, then "<label>(.*)\n" for every field
        pattern = '(.*)\n' + ''.join(
            field[0] + '(.*)\n' for field in self.field_list)
        matched = re.match(pattern, text, re.MULTILINE | re.DOTALL)
        if matched:
            item.update({
                field[1]: matched.group(field[2])
                for field in self.field_list
                if field[1] is not None
            })

        # fix the formatting: parse the date string and wrap the escaped
        # body text in paragraphs, one per source line
        created_key = self.ITEM_VERSION_CREATED
        item[created_key] = self.datetime(item[created_key])

        body_key = self.ITEM_BODY_HTML
        escaped_body = html.escape(item[body_key].strip())
        item[body_key] = '<p>' + escaped_body.replace('\n', '</p><p>') + '</p>'

        # the count is always computed, but only stored when none is present
        words = get_word_count(item['body_html'])
        item.setdefault('word_count', words)

        return item
    except Exception as ex:
        raise AAPParserError.NewsBitesParserError(exception=ex, provider=provider)
'qcode': '04000000', 'name': subject_codes['04000000'] }]
    item[FORMAT] = FORMATS.HTML

def datetime(self, string):
    """
    Convert the date string parsed from the source file to a datetime.

    The string is assumed to hold a time local to Sydney, Australia; the
    value returned is that moment converted to UTC. The month may be given
    either as a full name (``%B``) or, failing that, as an abbreviated
    name (``%b``).

    :param string: date text laid out as ``06 June 2016 14:00:00``
    :return: timezone-aware datetime in UTC
    :raises ValueError: if the string matches neither layout
    """
    # 06 June 2016 14:00:00
    try:
        local_dt = datetime.datetime.strptime(string, '%d %B %Y %H:%M:%S')
    except ValueError:
        # the full month name did not match; retry with the abbreviated name
        local_dt = datetime.datetime.strptime(string, '%d %b %Y %H:%M:%S')
    local_tz = pytz.timezone('Australia/Sydney')
    # NOTE(review): per the pytz documentation is_dst=None makes localize()
    # raise for ambiguous or non-existent local times instead of guessing
    # -- confirm that callers expect this.
    aus_dt = local_tz.localize(local_dt, is_dst=None)
    return aus_dt.astimezone(pytz.utc)


# Register the parser under its NAME; an AlreadyExistsError raised by the
# registration call is deliberately ignored.
try:
    register_feed_parser(NewsBitesFeedParser.NAME, NewsBitesFeedParser())
except AlreadyExistsError:
    pass

# Register the description of the NewsBites parser error for the 'file'
# feeding service.
register_feeding_service_error(
    'file', AAPParserError.NewsBitesParserError().get_error_description())
def set_item_defaults(self, item, filename):
    """
    Fill ``item`` in place with the values every parsed item starts with.

    :param item: dictionary to populate
    :param filename: source file name, used as the prefix of the guid
    """
    # guid is the file name followed by a freshly generated uuid4
    item['guid'] = filename + ':' + str(uuid.uuid4())
    item['urgency'] = 5
    item['pubstatus'] = 'usable'
    item['versioncreated'] = utcnow()
    item[ITEM_TYPE] = CONTENT_TYPE.TEXT
    item['anpa_category'] = [{'qcode': 'f'}]
    item['subject'] = [{'qcode': '04000000', 'name': subject_codes['04000000']}]
    item[FORMAT] = FORMATS.HTML


def datetime(self, string):
    """
    Convert the date string parsed from the source file to a datetime.

    The string is assumed to hold a time local to Sydney, Australia; the
    value returned is that moment converted to UTC. The month may be given
    either as a full name (``%B``) or, failing that, as an abbreviated
    name (``%b``).

    :param string: date text laid out as ``06 June 2016 14:00:00``
    :return: timezone-aware datetime in UTC
    :raises ValueError: if the string matches neither layout
    """
    # 06 June 2016 14:00:00
    try:
        local_dt = datetime.datetime.strptime(string, '%d %B %Y %H:%M:%S')
    except ValueError:
        # e.g. "06 Jun 2016 14:00:00": %B only accepts the full month name,
        # so retry with the abbreviated one before giving up
        local_dt = datetime.datetime.strptime(string, '%d %b %Y %H:%M:%S')
    local_tz = pytz.timezone('Australia/Sydney')
    # NOTE(review): per the pytz documentation is_dst=None makes localize()
    # raise for ambiguous or non-existent local times instead of guessing
    # -- confirm that callers expect this.
    aus_dt = local_tz.localize(local_dt, is_dst=None)
    return aus_dt.astimezone(pytz.utc)


# Register the parser under its NAME; an AlreadyExistsError raised by the
# registration call is deliberately ignored.
try:
    register_feed_parser(NewsBitesFeedParser.NAME, NewsBitesFeedParser())
except AlreadyExistsError:
    pass

# Register the description of the NewsBites parser error for the 'file'
# feeding service.
register_feeding_service_error('file', AAPParserError.NewsBitesParserError().get_error_description())