def parse(self, file_path, provider=None):
    """Read the ANPA file at ``file_path`` and build an item dictionary.

    The first two lines are matched as headers, the fourth line from the
    end is searched for a GMT timestamp, and the text enclosed between
    the 0x02 and 0x03 bytes supplies the body and the content metadata.

    :param file_path: path of the file, opened in binary mode
    :param provider: accepted for interface compatibility; not used here
    :return: the populated item dictionary
    :raises: the error produced by ``ParserError.anpaParseFileError`` for
        any exception raised while reading or parsing
    """
    try:
        item = {ITEM_TYPE: CONTENT_TYPE.TEXT}

        with open(file_path, 'rb') as anpa_file:
            lines = anpa_file.readlines()

        # first header line: only the four-digit sequence is kept
        sequence = re.match(b'\x16\x16\x01([a-z])([0-9]{4})\x1f([a-z-]+)', lines[0], flags=re.I)
        if sequence:
            item['provider_sequence'] = sequence.group(2).decode()

        # second header line: priority, category, layout flag and word count
        header = re.match(
            b'([a-z]) ([a-z])(\x13|\x14)(\x11|\x12) (am-|pm-|bc-)([a-z-]+)(.*) '
            b'([0-9]{1,2})-([0-9]{1,2}) ([0-9]{4})',
            lines[1],
            flags=re.I)
        if header:
            priority, category, _, layout, _, _, _, _, _, words = header.groups()
            item['priority'] = self.map_priority(priority.decode())
            item['anpa_category'] = [{'qcode': category.decode()}]
            item['word_count'] = int(words.decode())
            if layout == b'\x12':
                item[ITEM_TYPE] = CONTENT_TYPE.PREFORMATTED

        # creation timestamp, looked up on the fourth line from the end
        stamp = re.search(
            b'\x03([a-z]+)-([a-z]+)-([0-9]+-[0-9]+-[0-9]+ [0-9]{2}[0-9]{2})GMT',
            lines[-4],
            flags=re.I)
        if stamp:
            created = datetime.strptime(stamp.group(3).decode(), '%m-%d-%y %H%M')
            item['firstcreated'] = created.replace(tzinfo=utc)

        # everything after the two header lines is the content envelope
        content = re.match(b'\x02(.*)\x03', b''.join(lines[2:]), flags=re.M | re.S)
        if not content:
            return item

        text = content.group(1).decode().split('\n')

        # tab-prefixed rows make up the body text
        item['body_text'] = '\n'.join([row.strip() for row in text if row.startswith('\t')])

        # caret-prefixed rows carry the content metadata
        header_lines = [row.strip('^<= ') for row in text if row.startswith('^')]
        header_count = len(header_lines)
        if header_count > 3:
            item['headline'] = header_lines[1]
            item['byline'] = header_lines[-2]

        if header_count > 1:
            slug = re.match('[A-Z]{2}-[A-Z]{2}--([a-z-0-9]+)', header_lines[0], flags=re.I)
            if slug:
                item['slugline'] = slug.group(1)

        # editor's note: the last matching header row wins
        for row in header_lines:
            note = re.search("EDITOR'S NOTE _(.*)", row)
            if note is None:
                continue
            item['ednote'] = note.group(1).strip()

        return item
    except Exception as ex:
        raise ParserError.anpaParseFileError(file_path, ex)
def parse(self, file_path, provider=None):
    """Read the ANPA file at ``file_path`` and build an item dictionary.

    The first two lines are matched as headers, the fourth line from the
    end is searched for a GMT timestamp, and the text enclosed between
    the 0x02 and 0x03 bytes supplies the HTML body and content metadata.

    :param file_path: path of the file, opened in binary mode
    :param provider: accepted for interface compatibility; not used here
    :return: the populated item dictionary
    :raises: the error produced by ``ParserError.anpaParseFileError`` for
        any exception raised while reading or parsing
    """
    try:
        item = {
            ITEM_TYPE: CONTENT_TYPE.TEXT,
            GUID_FIELD: generate_guid(type=GUID_TAG),
            FORMAT: FORMATS.HTML,
        }

        with open(file_path, "rb") as anpa_file:
            lines = anpa_file.readlines()

        # first header line: only the four-digit sequence is kept
        sequence = re.match(b"\x16\x16\x01([a-z])([0-9]{4})\x1f([a-z-]+)", lines[0], flags=re.I)
        if sequence:
            item["provider_sequence"] = sequence.group(2).decode()

        # second header line: priority, category, layout flag, slug, take key, word count
        header = re.match(
            b"([a-z]) ([a-z])(\x13|\x14)(\x11|\x12) (am-|pm-|bc-|ap-)([a-z-.0-9]+)(.*) "
            b"([0-9]{1,2})-([0-9]{1,2}) ([0-9]{0,4})",
            lines[1],
            flags=re.I,
        )
        if header:
            priority, category, _, layout, _, slug, take_key, _, _, words = header.groups()
            item["priority"] = self.map_priority(priority.decode())
            item["anpa_category"] = [{"qcode": category.decode()}]
            item["slugline"] = slug.decode("latin-1", "replace")
            item["anpa_take_key"] = take_key.decode("latin-1", "replace").strip()
            # the word count group may be empty ({0,4}), so only convert when present
            word_text = words.decode()
            if word_text:
                item["word_count"] = int(word_text)
            if layout == b"\x12":
                item[FORMAT] = FORMATS.PRESERVED

        # creation timestamp, looked up on the fourth line from the end
        stamp = re.search(
            b"\x03([a-z]+)-([a-z]+)-([0-9]+-[0-9]+-[0-9]+ [0-9]{2}[0-9]{2})GMT",
            lines[-4],
            flags=re.I,
        )
        if stamp:
            created = datetime.strptime(stamp.group(3).decode(), "%m-%d-%y %H%M")
            item["firstcreated"] = created.replace(tzinfo=utc)
            item["versioncreated"] = item["firstcreated"]

        # everything after the two header lines is the content envelope
        content = re.match(b"\x02(.*)\x03", b"".join(lines[2:]), flags=re.M | re.S)
        if not content:
            return item

        text = content.group(1).decode("latin-1", "replace").split("\n")

        if item.get(FORMAT) == FORMATS.PRESERVED:
            # ANPA defines a number of special characters, e.g. TLI
            # (Tab Line Indicator, hex 08) and TTS Space Band (hex 10).
            # These are replaced here; there will likely be others.
            body_lines = []
            for row in text:
                if row.startswith(("\t", "^", "\b")):
                    body_lines.append(row.strip("^").replace("\b", "%08").replace("\x10", "%10"))
            preformatted = "\n".join(body_lines)
            item["body_html"] = "<pre>" + preformatted + "</pre>"
        else:
            body_lines = [row.strip() for row in text if row.startswith("\t")]
            paragraphs = "</p><p>".join(body_lines)
            item["body_html"] = "<p>" + paragraphs + "</p>"

        # caret-prefixed rows carry the content metadata
        header_lines = [row.strip("^<= ") for row in text if row.startswith("^")]
        header_count = len(header_lines)
        if header_count > 1:
            item["headline"] = header_lines[1].rstrip("\r\n^<= ")
        if header_count > 3:
            item["byline"] = header_lines[-2].rstrip("\r\n^<= ")

        # a single empty body row means there is no body: use the header rows
        if body_lines == [""]:
            item["body_html"] = "<p>" + "</p><p>".join(header_lines[2:]) + "</p>"

        # a slugline found in the first header row replaces the one from line two
        if header_count > 1:
            slug_match = re.match("[A-Z]{2}-[A-Z]{2}--([a-z-0-9.]+)", header_lines[0], flags=re.I)
            if slug_match:
                item["slugline"] = slug_match.group(1)

        self._parse_ednote(header_lines, item)

        return item
    except Exception as ex:
        raise ParserError.anpaParseFileError(file_path, ex)
def parse(self, file_path, provider=None):
    """Read the ANPA file at ``file_path`` and build an item dictionary.

    The first two lines are matched as headers, the fourth line from the
    end is searched for a GMT timestamp, and the text enclosed between
    the 0x02 and 0x03 bytes supplies the HTML body and content metadata.

    :param file_path: path of the file, opened in binary mode
    :param provider: accepted for interface compatibility; not used here
    :return: the populated item dictionary
    :raises: the error produced by ``ParserError.anpaParseFileError`` for
        any exception raised while reading or parsing
    """
    try:
        item = {
            ITEM_TYPE: CONTENT_TYPE.TEXT,
            GUID_FIELD: generate_guid(type=GUID_TAG),
            FORMAT: FORMATS.HTML,
        }

        with open(file_path, 'rb') as anpa_file:
            lines = anpa_file.readlines()

        # first header line: only the four-digit sequence is kept
        sequence = re.match(b'\x16\x16\x01([a-z])([0-9]{4})\x1f([a-z-]+)', lines[0], flags=re.I)
        if sequence:
            item['provider_sequence'] = sequence.group(2).decode()

        # second header line: priority, category, layout flag, slug, take key, word count
        header = re.match(
            b'([a-z]) ([a-z])(\x13|\x14)(\x11|\x12) (am-|pm-|bc-|ap-)([a-z-.]+)(.*) '
            b'([0-9]{1,2})-([0-9]{1,2}) ([0-9]{0,4})',
            lines[1],
            flags=re.I)
        if header:
            priority, category, _, layout, _, slug, take_key, _, _, words = header.groups()
            item['priority'] = self.map_priority(priority.decode())
            item['anpa_category'] = [{'qcode': category.decode()}]
            item['slugline'] = slug.decode('latin-1', 'replace')
            item['anpa_take_key'] = take_key.decode('latin-1', 'replace').strip()
            # the word count group may be empty ({0,4}), so only convert when present
            word_text = words.decode()
            if word_text:
                item['word_count'] = int(word_text)
            if layout == b'\x12':
                item[FORMAT] = FORMATS.PRESERVED

        # creation timestamp, looked up on the fourth line from the end
        stamp = re.search(
            b'\x03([a-z]+)-([a-z]+)-([0-9]+-[0-9]+-[0-9]+ [0-9]{2}[0-9]{2})GMT',
            lines[-4],
            flags=re.I)
        if stamp:
            created = datetime.strptime(stamp.group(3).decode(), '%m-%d-%y %H%M')
            item['firstcreated'] = created.replace(tzinfo=utc)
            item['versioncreated'] = item['firstcreated']

        # everything after the two header lines is the content envelope
        content = re.match(b'\x02(.*)\x03', b''.join(lines[2:]), flags=re.M | re.S)
        if not content:
            return item

        text = content.group(1).decode('latin-1', 'replace').split('\n')

        if item.get(FORMAT) == FORMATS.PRESERVED:
            # ANPA defines a number of special characters, e.g. TLI
            # (Tab Line Indicator, hex 08) and TTS Space Band (hex 10).
            # These are replaced here; there will likely be others.
            body_lines = []
            for row in text:
                if row.startswith(('\t', '^', '\b')):
                    body_lines.append(row.strip('^').replace('\b', '%08').replace('\x10', '%10'))
            preformatted = '\n'.join(body_lines)
            item['body_html'] = '<pre>' + preformatted + '</pre>'
        else:
            body_lines = [row.strip() for row in text if row.startswith('\t')]
            paragraphs = '</p><p>'.join(body_lines)
            item['body_html'] = '<p>' + paragraphs + '</p>'

        # caret-prefixed rows carry the content metadata
        header_lines = [row.strip('^<= ') for row in text if row.startswith('^')]
        header_count = len(header_lines)
        if header_count > 1:
            item['headline'] = header_lines[1].rstrip('\r\n^<= ')
        if header_count > 3:
            item['byline'] = header_lines[-2].rstrip('\r\n^<= ')

        # a single empty body row means there is no body: use the header rows
        if body_lines == ['']:
            item['body_html'] = '<p>' + '</p><p>'.join(header_lines[2:]) + '</p>'

        # a slugline found in the first header row replaces the one from line two
        if header_count > 1:
            slug_match = re.match('[A-Z]{2}-[A-Z]{2}--([a-z-0-9.]+)', header_lines[0], flags=re.I)
            if slug_match:
                item['slugline'] = slug_match.group(1)

        self._parse_ednote(header_lines, item)

        return item
    except Exception as ex:
        raise ParserError.anpaParseFileError(file_path, ex)
def parse(self, file_path, provider=None):
    """Read the ANPA file at ``file_path`` and build an item dictionary.

    The first two lines are matched as headers, the fourth line from the
    end is searched for a GMT timestamp, and the text enclosed between
    the 0x02 and 0x03 bytes supplies the HTML body and content metadata.

    :param file_path: path of the file, opened in binary mode
    :param provider: accepted for interface compatibility; not used here
    :return: the populated item dictionary
    :raises: the error produced by ``ParserError.anpaParseFileError`` for
        any exception raised while reading or parsing
    """
    try:
        item = {
            ITEM_TYPE: CONTENT_TYPE.TEXT,
            GUID_FIELD: generate_guid(type=GUID_TAG),
            FORMAT: FORMATS.HTML,
        }

        with open(file_path, 'rb') as anpa_file:
            lines = anpa_file.readlines()

        # first header line: only the four-digit sequence is kept
        sequence = re.match(b'\x16\x16\x01([a-z])([0-9]{4})\x1f([a-z-]+)', lines[0], flags=re.I)
        if sequence:
            item['provider_sequence'] = sequence.group(2).decode()

        # second header line: priority, category, layout flag, slug, take key, word count
        header = re.match(
            b'([a-z]) ([a-z])(\x13|\x14)(\x11|\x12) (am-|pm-|bc-|ap-)([a-z-.]+)(.*) '
            b'([0-9]{1,2})-([0-9]{1,2}) ([0-9]{4})',
            lines[1],
            flags=re.I)
        if header:
            priority, category, _, layout, _, slug, take_key, _, _, words = header.groups()
            item['priority'] = self.map_priority(priority.decode())
            item['anpa_category'] = [{'qcode': category.decode()}]
            item['slugline'] = slug.decode('latin-1', 'replace')
            item['anpa_take_key'] = take_key.decode('latin-1', 'replace').strip()
            item['word_count'] = int(words.decode())
            if layout == b'\x12':
                item[FORMAT] = FORMATS.PRESERVED

        # creation timestamp, looked up on the fourth line from the end
        stamp = re.search(
            b'\x03([a-z]+)-([a-z]+)-([0-9]+-[0-9]+-[0-9]+ [0-9]{2}[0-9]{2})GMT',
            lines[-4],
            flags=re.I)
        if stamp:
            created = datetime.strptime(stamp.group(3).decode(), '%m-%d-%y %H%M')
            item['firstcreated'] = created.replace(tzinfo=utc)
            item['versioncreated'] = item['firstcreated']

        # everything after the two header lines is the content envelope
        content = re.match(b'\x02(.*)\x03', b''.join(lines[2:]), flags=re.M | re.S)
        if not content:
            return item

        text = content.group(1).decode('latin-1', 'replace').split('\n')

        if item.get(FORMAT) == FORMATS.PRESERVED:
            # ANPA defines a number of special characters, e.g. TLI
            # (Tab Line Indicator, hex 08) and TTS Space Band (hex 10).
            # These are replaced here; there will likely be others.
            body_lines = []
            for row in text:
                if row.startswith(('\t', '^', '\b')):
                    body_lines.append(row.strip('^').replace('\b', '%08').replace('\x10', '%10'))
            preformatted = '\n'.join(body_lines)
            item['body_html'] = '<pre>' + preformatted + '</pre>'
        else:
            body_lines = [row.strip() for row in text if row.startswith('\t')]
            paragraphs = '</p><p>'.join(body_lines)
            item['body_html'] = '<p>' + paragraphs + '</p>'

        # caret-prefixed rows carry the content metadata
        header_lines = [row.strip('^<= ') for row in text if row.startswith('^')]
        header_count = len(header_lines)
        if header_count > 1:
            item['headline'] = header_lines[1].rstrip('\r\n^<= ')
        if header_count > 3:
            item['byline'] = header_lines[-2].rstrip('\r\n^<= ')

        # a single empty body row means there is no body: use the header rows
        if body_lines == ['']:
            item['body_html'] = '<p>' + '</p><p>'.join(header_lines[2:]) + '</p>'

        # a slugline found in the first header row replaces the one from line two
        if header_count > 1:
            slug_match = re.match('[A-Z]{2}-[A-Z]{2}--([a-z-0-9.]+)', header_lines[0], flags=re.I)
            if slug_match:
                item['slugline'] = slug_match.group(1)

        self._parse_ednote(header_lines, item)

        return item
    except Exception as ex:
        raise ParserError.anpaParseFileError(file_path, ex)
def parse(self, file_path, provider=None):
    """Read the ANPA file at ``file_path`` and build an item dictionary.

    The first two lines are matched as headers, the fourth line from the
    end is searched for a GMT timestamp, and the text enclosed between
    the 0x02 and 0x03 bytes supplies the HTML body and content metadata.

    :param file_path: path of the file, opened in binary mode
    :param provider: accepted for interface compatibility; not used here
    :return: the populated item dictionary
    :raises: the error produced by ``ParserError.anpaParseFileError`` for
        any exception raised while reading or parsing
    """
    try:
        item = {
            ITEM_TYPE: CONTENT_TYPE.TEXT,
            GUID_FIELD: generate_guid(type=GUID_TAG),
            FORMAT: FORMATS.HTML,
        }

        with open(file_path, "rb") as anpa_file:
            lines = anpa_file.readlines()

        # first header line: only the four-digit sequence is kept
        sequence = re.match(b"\x16\x16\x01([a-z])([0-9]{4})\x1f([a-z-]+)", lines[0], flags=re.I)
        if sequence:
            item["provider_sequence"] = sequence.group(2).decode()

        # second header line: priority, category, layout flag, slug, take key, word count
        header = re.match(
            b"([a-z]) ([a-z])(\x13|\x14)(\x11|\x12) (am-|pm-|bc-|ap-)([a-z-.]+)(.*) "
            b"([0-9]{1,2})-([0-9]{1,2}) ([0-9]{4})",
            lines[1],
            flags=re.I,
        )
        if header:
            priority, category, _, layout, _, slug, take_key, _, _, words = header.groups()
            item["priority"] = self.map_priority(priority.decode())
            item["anpa_category"] = [{"qcode": category.decode()}]
            item["slugline"] = slug.decode("latin-1", "replace")
            item["anpa_take_key"] = take_key.decode("latin-1", "replace").strip()
            item["word_count"] = int(words.decode())
            if layout == b"\x12":
                item[FORMAT] = FORMATS.PRESERVED

        # creation timestamp, looked up on the fourth line from the end
        stamp = re.search(
            b"\x03([a-z]+)-([a-z]+)-([0-9]+-[0-9]+-[0-9]+ [0-9]{2}[0-9]{2})GMT",
            lines[-4],
            flags=re.I,
        )
        if stamp:
            created = datetime.strptime(stamp.group(3).decode(), "%m-%d-%y %H%M")
            item["firstcreated"] = created.replace(tzinfo=utc)
            item["versioncreated"] = item["firstcreated"]

        # everything after the two header lines is the content envelope
        content = re.match(b"\x02(.*)\x03", b"".join(lines[2:]), flags=re.M | re.S)
        if not content:
            return item

        text = content.group(1).decode("latin-1", "replace").split("\n")

        if item.get(FORMAT) == FORMATS.PRESERVED:
            # ANPA defines a number of special characters, e.g. TLI
            # (Tab Line Indicator, hex 08) and TTS Space Band (hex 10).
            # These are replaced here; there will likely be others.
            body_lines = []
            for row in text:
                if row.startswith(("\t", "^", "\b")):
                    body_lines.append(row.strip("^").replace("\b", "%08").replace("\x10", "%10"))
            preformatted = "\n".join(body_lines)
            item["body_html"] = "<pre>" + preformatted + "</pre>"
        else:
            body_lines = [row.strip() for row in text if row.startswith("\t")]
            paragraphs = "</p><p>".join(body_lines)
            item["body_html"] = "<p>" + paragraphs + "</p>"

        # caret-prefixed rows carry the content metadata
        header_lines = [row.strip("^<= ") for row in text if row.startswith("^")]
        header_count = len(header_lines)
        if header_count > 1:
            item["headline"] = header_lines[1].rstrip("\r\n^<= ")
        if header_count > 3:
            item["byline"] = header_lines[-2].rstrip("\r\n^<= ")

        # a single empty body row means there is no body: use the header rows
        if body_lines == [""]:
            item["body_html"] = "<p>" + "</p><p>".join(header_lines[2:]) + "</p>"

        # a slugline found in the first header row replaces the one from line two
        if header_count > 1:
            slug_match = re.match("[A-Z]{2}-[A-Z]{2}--([a-z-0-9.]+)", header_lines[0], flags=re.I)
            if slug_match:
                item["slugline"] = slug_match.group(1)

        # editor's note: the last matching header row wins
        for row in header_lines:
            note = re.search("EDITOR'S NOTE _(.*)", row)
            if note is None:
                continue
            item["ednote"] = note.group(1).strip()

        return item
    except Exception as ex:
        raise ParserError.anpaParseFileError(file_path, ex)
def parse(self, file_path, provider=None):
    """Read the ANPA file at ``file_path`` and build an item dictionary.

    The first two lines are matched as headers, the fourth line from the
    end is searched for a GMT timestamp, and the text enclosed between
    the 0x02 and 0x03 bytes supplies the body and the content metadata.
    Preformatted items get ``body_text``; all others get ``body_html``.

    :param file_path: path of the file, opened in binary mode
    :param provider: accepted for interface compatibility; not used here
    :return: the populated item dictionary
    :raises: the error produced by ``ParserError.anpaParseFileError`` for
        any exception raised while reading or parsing
    """
    try:
        item = {
            ITEM_TYPE: CONTENT_TYPE.TEXT,
            GUID_FIELD: generate_guid(type=GUID_TAG),
        }

        with open(file_path, 'rb') as anpa_file:
            lines = anpa_file.readlines()

        # first header line: only the four-digit sequence is kept
        sequence = re.match(b'\x16\x16\x01([a-z])([0-9]{4})\x1f([a-z-]+)', lines[0], flags=re.I)
        if sequence:
            item['provider_sequence'] = sequence.group(2).decode()

        # second header line: priority, category, layout flag, slug, take key, word count
        header = re.match(
            b'([a-z]) ([a-z])(\x13|\x14)(\x11|\x12) (am-|pm-|bc-|ap-)([a-z-.]+)(.*) '
            b'([0-9]{1,2})-([0-9]{1,2}) ([0-9]{4})',
            lines[1],
            flags=re.I)
        if header:
            priority, category, _, layout, _, slug, take_key, _, _, words = header.groups()
            item['priority'] = self.map_priority(priority.decode())
            item['anpa_category'] = [{'qcode': category.decode()}]
            item['slugline'] = slug.decode('latin-1', 'replace')
            item['anpa_take_key'] = take_key.decode('latin-1', 'replace').strip()
            item['word_count'] = int(words.decode())
            if layout == b'\x12':
                item[ITEM_TYPE] = CONTENT_TYPE.PREFORMATTED

        # creation timestamp, looked up on the fourth line from the end
        stamp = re.search(
            b'\x03([a-z]+)-([a-z]+)-([0-9]+-[0-9]+-[0-9]+ [0-9]{2}[0-9]{2})GMT',
            lines[-4],
            flags=re.I)
        if stamp:
            created = datetime.strptime(stamp.group(3).decode(), '%m-%d-%y %H%M')
            item['firstcreated'] = created.replace(tzinfo=utc)
            item['versioncreated'] = item['firstcreated']

        # everything after the two header lines is the content envelope
        content = re.match(b'\x02(.*)\x03', b''.join(lines[2:]), flags=re.M | re.S)
        if not content:
            return item

        text = content.group(1).decode('latin-1', 'replace').split('\n')

        # tab-prefixed rows make up the body
        body_lines = [row.strip() for row in text if row.startswith('\t')]
        if item[ITEM_TYPE] == CONTENT_TYPE.PREFORMATTED:
            item['body_text'] = '\n'.join(body_lines)
        else:
            paragraphs = '</p><p>'.join(body_lines)
            item['body_html'] = '<p>' + paragraphs + '</p>'

        # caret-prefixed rows carry the content metadata
        header_lines = [row.strip('^<= ') for row in text if row.startswith('^')]
        header_count = len(header_lines)
        if header_count > 1:
            item['headline'] = header_lines[1].rstrip('\r\n^<= ')
        if header_count > 3:
            item['byline'] = header_lines[-2].rstrip('\r\n^<= ')

        # a single empty body row means there is no body: use the header rows
        if body_lines == ['']:
            item['body_html'] = '<p>' + '</p><p>'.join(header_lines[2:]) + '</p>'

        # a slugline found in the first header row replaces the one from line two
        if header_count > 1:
            slug_match = re.match('[A-Z]{2}-[A-Z]{2}--([a-z-0-9.]+)', header_lines[0], flags=re.I)
            if slug_match:
                item['slugline'] = slug_match.group(1)

        # editor's note: the last matching header row wins
        for row in header_lines:
            note = re.search("EDITOR'S NOTE _(.*)", row)
            if note is None:
                continue
            item['ednote'] = note.group(1).strip()

        return item
    except Exception as ex:
        raise ParserError.anpaParseFileError(file_path, ex)
def parse_file(self, filename):
    """Parse the ANPA file with the given filename into an item dictionary.

    The first two lines are matched as headers, the fourth line from the
    end is searched for a GMT timestamp, and the text enclosed between
    the 0x02 and 0x03 bytes supplies the body text and content metadata.

    :param filename: path of the file, opened in binary mode
    :return: the populated item dictionary
    :raises: the error produced by ``ParserError.anpaParseFileError`` for
        any exception raised while reading or parsing
    """
    try:
        item = {'type': 'text'}

        with open(filename, 'rb') as anpa_file:
            lines = anpa_file.readlines()

        # first header line: only the four-digit sequence is kept
        sequence = re.match(b'\x16\x16\x01([a-z])([0-9]{4})\x1f([a-z-]+)', lines[0], flags=re.I)
        if sequence:
            item['provider_sequence'] = sequence.group(2).decode()

        # second header line: priority, category, layout flag and word count
        header = re.match(
            b'([a-z]) ([a-z])(\x13|\x14)(\x11|\x12) (am-|pm-|bc-)([a-z-]+)(.*) '
            b'([0-9]{1,2})-([0-9]{1,2}) ([0-9]{4})',
            lines[1],
            flags=re.I)
        if header:
            priority, category, _, layout, _, _, _, _, _, words = header.groups()
            item['priority'] = priority.decode()
            item['anpa-category'] = {'qcode': category.decode()}
            item['word_count'] = int(words.decode())
            if layout == b'\x12':
                item['type'] = 'preformatted'

        # creation timestamp, looked up on the fourth line from the end
        stamp = re.search(
            b'\x03([a-z]+)-([a-z]+)-([0-9]+-[0-9]+-[0-9]+ [0-9]{2}[0-9]{2})GMT',
            lines[-4],
            flags=re.I)
        if stamp:
            created = datetime.strptime(stamp.group(3).decode(), '%m-%d-%y %H%M')
            item['firstcreated'] = created.replace(tzinfo=utc)

        # everything after the two header lines is the content envelope
        content = re.match(b'\x02(.*)\x03', b''.join(lines[2:]), flags=re.M | re.S)
        if not content:
            return item

        text = content.group(1).decode().split('\n')

        # tab-prefixed rows make up the body text
        item['body_text'] = '\n'.join([row.strip() for row in text if row.startswith('\t')])

        # caret-prefixed rows carry the content metadata
        header_lines = [row.strip('^<= ') for row in text if row.startswith('^')]
        header_count = len(header_lines)
        if header_count > 3:
            item['headline'] = super().trim_headline(header_lines[1])
            item['byline'] = header_lines[-2]

        if header_count > 1:
            slug = re.match('[A-Z]{2}-[A-Z]{2}--([a-z-0-9]+)', header_lines[0], flags=re.I)
            if slug:
                item['slugline'] = super().trim_slugline(slug.group(1))

        # editor's note: the last matching header row wins
        for row in header_lines:
            note = re.search("EDITOR'S NOTE _(.*)", row)
            if note is None:
                continue
            item['ednote'] = note.group(1).strip()

        return item
    except Exception as ex:
        raise ParserError.anpaParseFileError(filename, ex)
def parse(self, file_path, provider=None):
    """Parse an ANPA-style file carrying the KYODO marker into an item.

    The first two lines are matched as headers, the last line is searched
    for a timestamp prefixed by a three-letter zone code, and the text
    enclosed between the 0x02 and 0x03 bytes is split into slugline,
    headline and HTML body.

    :param file_path: path of the file, opened in binary mode
    :param provider: accepted for interface compatibility; not used here
    :return: the populated item dictionary
    :raises: the error produced by ``ParserError.anpaParseFileError`` for
        any exception raised while reading or parsing
    """
    try:
        item = {
            ITEM_TYPE: CONTENT_TYPE.TEXT,
            GUID_FIELD: generate_guid(type=GUID_TAG),
            FORMAT: FORMATS.HTML,
        }

        with open(file_path, 'rb') as f:
            lines = f.readlines()

        # parse first header line
        # NOTE(review): this pattern starts with the literal characters
        # "x01", not the 0x01 control byte -- confirm that is intended.
        m = re.match(b'x01([a-z])([0-9]{4})KYODO\x1f([a-z0-9-]+)', lines[0], flags=re.I)
        if m:
            item['provider_sequence'] = m.group(2).decode()

        # parse second header line
        m = re.match(
            b'([a-z]) ([a-z])(\x13|\x14)(\x11|\x12) (am-|pm-|bc-|ap-)([a-z-.]+)(.*) '
            b'([0-9]{1,2})-([0-9]{1,2}) ([0-9]{4})',
            lines[1],
            flags=re.I)
        if m:
            item['priority'] = self.map_priority(m.group(1).decode())
            item['anpa_category'] = [{'qcode': m.group(2).decode()}]
            item['slugline'] = m.group(6).decode('latin-1', 'replace')
            item['anpa_take_key'] = m.group(7).decode('latin-1', 'replace').strip()
            item['word_count'] = int(m.group(10).decode())
            if m.group(4) == b'\x12':
                item[FORMAT] = FORMATS.PRESERVED

        # parse created date at the end of file
        m = re.search(
            b'\x03([A-Z]{3})-([0-9]{2}:[0-9]{2}-[0-9]{2}-[0-9]{2}-[0-9]{2})',
            lines[-1],
            flags=re.I)
        if m:
            tz = pytz.timezone(config.TIMEZONE_CODE[m.group(1).decode().lower()])
            naive = datetime.strptime(m.group(2).decode(), '%H:%M-%d-%m-%y')
            # A pytz zone has to be attached with localize(); passing it via
            # replace(tzinfo=...) picks the wrong offset for many zones.
            item['firstcreated'] = tz.localize(naive).astimezone(pytz.utc)
            item['versioncreated'] = item['firstcreated']

        # parse anpa content
        body = b''.join(lines[2:])
        m = re.match(b'\x02(.*)\x03', body, flags=re.M | re.S)
        if m:
            text = m.group(1).decode('latin-1', 'replace').split('\n')
            headline_parts = []
            body_parts = []
            is_header = True
            for line in text:
                # any line equal to the first one is treated as the slug line
                if line == text[0]:
                    slug = re.match('BC-(.*)', line, flags=re.I)
                    if slug:
                        item['slugline'] = slug.group(1).rstrip('\r')
                    continue
                if is_header:
                    # the headline runs until (and including) a line ending in "+\r"
                    if line.endswith('+\r'):
                        is_header = False
                        headline_parts.append(line.rstrip('+\r'))
                    else:
                        headline_parts.append(line.rstrip('\r'))
                    continue
                if line == '==Kyodo\r':
                    break
                body_parts.append('<p>' + line.rstrip('\r') + '</p>')

            if headline_parts:
                item['headline'] = ''.join(headline_parts)
            if body_parts:
                item['body_html'] = ''.join(body_parts)

            # NOTE(review): the headline string (not a list of header lines)
            # is what gets passed here; a missing headline raises and is
            # reported as a parse error below.
            self._parse_ednote(item['headline'], item)

        return item
    except Exception as ex:
        raise ParserError.anpaParseFileError(file_path, ex) from ex
def parse(self, file_path, provider=None):
    """Parse an ANPA-style file carrying the KYODO marker into an item.

    The first two lines are matched as headers, the last line is searched
    for a timestamp prefixed by a three-letter zone code, and the text
    enclosed between the 0x02 and 0x03 bytes is split into headline,
    abstract, city and HTML body. The slugline and keywords are cleared
    before the item is returned.

    :param file_path: path of the file, opened in binary mode
    :param provider: accepted for interface compatibility; not used here
    :return: the populated item dictionary
    :raises: the error produced by ``ParserError.anpaParseFileError`` for
        any exception raised while reading or parsing
    """
    try:
        item = {
            ITEM_TYPE: CONTENT_TYPE.TEXT,
            GUID_FIELD: generate_guid(type=GUID_TAG),
            FORMAT: FORMATS.HTML,
        }

        with open(file_path, 'rb') as f:
            lines = f.readlines()

        # parse first header line
        # NOTE(review): this pattern starts with the literal characters
        # "x01", not the 0x01 control byte -- confirm that is intended.
        m = re.match(b'x01([a-z])([0-9]{4})KYODO\x1f([a-z0-9-]+)', lines[0], flags=re.I)
        if m:
            item['provider_sequence'] = m.group(2).decode()

        # parse second header line
        m = re.match(
            b'([a-z]) ([a-z])(\x13|\x14)(\x11|\x12) (am-|pm-|bc-|ap-)([a-z-.]+)(.*) '
            b'([0-9]{1,2})-([0-9]{1,2}) ([0-9]{4})',
            lines[1],
            flags=re.I)
        if m:
            item['language'] = 'en'
            item['priority'] = 2 if m.group(1).decode() == 'u' else 3
            qcode = m.group(2).decode().upper()
            item['anpa_category'] = [{'qcode': qcode}]
            # Mapping product: unknown categories fall back to NEWS/GENERAL
            qcode = self.MAPPING_PRODUCTS.get(qcode, 'NEWS/GENERAL')
            item.setdefault('subject', []).extend([
                {
                    'name': qcode,
                    'qcode': qcode,
                    'parent': 'NEWS',
                    'scheme': 'services-products',
                },
                {
                    'name': 'KYODO',
                    'qcode': 'KYODO',
                    'scheme': 'sources',
                },
                {
                    'name': 'default',
                    'qcode': 'default',
                    'scheme': 'distribution',
                },
            ])
            item['slugline'] = m.group(6).decode('latin-1', 'replace')
            item['anpa_take_key'] = m.group(7).decode('latin-1', 'replace').strip()
            item['word_count'] = int(m.group(10).decode())
            if m.group(4) == b'\x12':
                item[FORMAT] = FORMATS.PRESERVED

        # parse created date at the end of file
        m = re.search(
            b'\x03([A-Z]{3})-([0-9]{2}:[0-9]{2}-[0-9]{2}-[0-9]{2}-[0-9]{2})',
            lines[-1],
            flags=re.I)
        if m:
            tz = pytz.timezone(config.TIMEZONE_CODE[m.group(1).decode().lower()])
            naive = datetime.strptime(m.group(2).decode(), '%H:%M-%d-%m-%y')
            # A pytz zone has to be attached with localize(); passing it via
            # replace(tzinfo=...) picks the wrong offset for many zones.
            item['firstcreated'] = tz.localize(naive).astimezone(pytz.utc)
            item['versioncreated'] = item['firstcreated']

        # parse anpa content
        body = b''.join(lines[2:])
        m = re.match(b'\x02(.*)\x03', body, flags=re.M | re.S)
        if m:
            text = m.group(1).decode('latin-1', 'replace').split('\n')
            item['keywords'] = text[0].strip('\r').split("-")
            # abstract: the joined text from the third line on, cut at the first full stop
            joined = "".join(line.strip() for line in text[2:-1])
            item['abstract'] = re.split("\\..?", joined)[0] + '.'
            # city: whatever precedes the first comma of the abstract
            item.setdefault('extra', {})['city'] = item.get('abstract', '').split(',')[0]

            headline_parts = []
            body_parts = []
            is_header = True
            for line in text:
                # any line equal to the first one is treated as the slug line
                if line == text[0]:
                    slug = re.match('BC-(.*)', line, flags=re.I)
                    if slug:
                        item['slugline'] = slug.group(1).rstrip('\r')
                    continue
                if is_header:
                    # the headline runs until (and including) a line ending in "+\r"
                    if line.endswith('+\r'):
                        is_header = False
                        headline_parts.append(line.rstrip('+\r'))
                    else:
                        headline_parts.append(line.rstrip('\r'))
                    continue
                if line == '==Kyodo\r':
                    break
                body_parts.append('<p>' + line.rstrip('\r') + '</p>')

            if headline_parts:
                item['headline'] = ''.join(headline_parts)
            if body_parts:
                item['body_html'] = ''.join(body_parts)

            # NOTE(review): the headline string (not a list of header lines)
            # is what gets passed here; a missing headline raises and is
            # reported as a parse error below.
            self._parse_ednote(item['headline'], item)

        # Slugline and keywords are always returned empty
        item['slugline'] = None
        item['keywords'] = []
        return item
    except Exception as ex:
        raise ParserError.anpaParseFileError(file_path, ex) from ex