Exemplo n.º 1
0
    def parse(self, file_path, provider=None):
        """Parse an ANPA file into an item dict.

        The file is read as bytes. Metadata is taken from the first two lines
        and from the fourth-from-last line; the story content is whatever sits
        between the 0x02 and 0x03 bytes that follow the two header lines.

        :param file_path: path of the file to parse
        :param provider: not used by this method
        :return: dict holding the fields that could be extracted
        :raises: the error built by ``ParserError.anpaParseFileError`` when
            anything goes wrong while reading or parsing the file
        """
        try:
            item = {ITEM_TYPE: CONTENT_TYPE.TEXT}

            with open(file_path, 'rb') as f:
                lines = f.readlines()

            # parse first header line (only the four-digit group is kept)
            m = re.match(b'\x16\x16\x01([a-z])([0-9]{4})\x1f([a-z-]+)', lines[0], flags=re.I)
            if m:
                item['provider_sequence'] = m.group(2).decode()

            # parse second header line
            m = re.match(
                b'([a-z]) ([a-z])(\x13|\x14)(\x11|\x12) (am-|pm-|bc-)([a-z-]+)(.*) '
                b'([0-9]{1,2})-([0-9]{1,2}) ([0-9]{4})',
                lines[1], flags=re.I)
            if m:
                item['priority'] = self.map_priority(m.group(1).decode())
                item['anpa_category'] = [{'qcode': m.group(2).decode()}]
                item['word_count'] = int(m.group(10).decode())
                # a \x12 byte in the fourth group switches the item to preformatted
                if m.group(4) == b'\x12':
                    item[ITEM_TYPE] = CONTENT_TYPE.PREFORMATTED

            # parse created date at the end of file (fourth line from the end)
            m = re.search(b'\x03([a-z]+)-([a-z]+)-([0-9]+-[0-9]+-[0-9]+ [0-9]{2}[0-9]{2})GMT', lines[-4], flags=re.I)
            if m:
                item['firstcreated'] = datetime.strptime(m.group(3).decode(), '%m-%d-%y %H%M').replace(tzinfo=utc)

            # parse anpa content
            body = b''.join(lines[2:])
            m = re.match(b'\x02(.*)\x03', body, flags=re.M | re.S)
            if m:
                raw_text = m.group(1)
                try:
                    decoded = raw_text.decode()
                except UnicodeDecodeError:
                    # A single non-UTF-8 byte used to abort the whole parse;
                    # latin-1 can represent every byte, so fall back to it.
                    decoded = raw_text.decode('latin-1', 'replace')
                text = decoded.split('\n')

                # text: lines that start with a tab
                body_lines = [line.strip() for line in text if line.startswith('\t')]
                item['body_text'] = '\n'.join(body_lines)

                # content metadata: lines that start with a caret
                header_lines = [line.strip('^<= ') for line in text if line.startswith('^')]
                if len(header_lines) > 3:
                    item['headline'] = header_lines[1]
                    item['byline'] = header_lines[-2]

                # slugline
                if len(header_lines) > 1:
                    m = re.match('[A-Z]{2}-[A-Z]{2}--([a-z-0-9]+)', header_lines[0], flags=re.I)
                    if m:
                        item['slugline'] = m.group(1)

                # ednote: when several header lines match, the last one wins
                for line in header_lines:
                    m = re.search("EDITOR'S NOTE _(.*)", line)
                    if m:
                        item['ednote'] = m.group(1).strip()

            return item
        except Exception as ex:
            raise ParserError.anpaParseFileError(file_path, ex) from ex
Exemplo n.º 2
0
    def parse(self, file_path, provider=None):
        """Build an item dict from the ANPA file at ``file_path``.

        Header fields come from the first two lines, the creation date from the
        fourth-from-last line, and the body from the bytes enclosed by 0x02 and
        0x03 after the header lines. ``provider`` is not used. Any exception is
        re-raised as the error produced by ``ParserError.anpaParseFileError``.
        """
        try:
            item = {
                ITEM_TYPE: CONTENT_TYPE.TEXT,
                GUID_FIELD: generate_guid(type=GUID_TAG),
                FORMAT: FORMATS.HTML,
            }

            with open(file_path, "rb") as stream:
                lines = stream.readlines()

            # first header line: only the four-digit group is kept
            sequence_match = re.match(
                b"\x16\x16\x01([a-z])([0-9]{4})\x1f([a-z-]+)", lines[0], flags=re.I)
            if sequence_match is not None:
                item["provider_sequence"] = sequence_match.group(2).decode()

            # second header line
            header_match = re.match(
                b"([a-z]) ([a-z])(\x13|\x14)(\x11|\x12) (am-|pm-|bc-|ap-)([a-z-.0-9]+)(.*) "
                b"([0-9]{1,2})-([0-9]{1,2}) ([0-9]{0,4})",
                lines[1],
                flags=re.I,
            )
            if header_match is not None:
                (priority_code, category, _, format_flag, _,
                 slug, take_key, _, _, word_count) = header_match.groups()
                item["priority"] = self.map_priority(priority_code.decode())
                item["anpa_category"] = [{"qcode": category.decode()}]
                item["slugline"] = slug.decode("latin-1", "replace")
                item["anpa_take_key"] = take_key.decode("latin-1", "replace").strip()
                # the word-count group may be empty, in which case it is skipped
                word_count_text = word_count.decode()
                if word_count_text:
                    item["word_count"] = int(word_count_text)
                if format_flag == b"\x12":
                    item[FORMAT] = FORMATS.PRESERVED

            # created date, looked up on the fourth line from the end
            date_match = re.search(
                b"\x03([a-z]+)-([a-z]+)-([0-9]+-[0-9]+-[0-9]+ [0-9]{2}[0-9]{2})GMT",
                lines[-4],
                flags=re.I)
            if date_match is not None:
                created = datetime.strptime(
                    date_match.group(3).decode(), "%m-%d-%y %H%M").replace(tzinfo=utc)
                item["firstcreated"] = created
                item["versioncreated"] = created

            # anpa content
            content = re.match(b"\x02(.*)\x03", b"".join(lines[2:]), flags=re.M | re.S)
            if content is None:
                return item

            text = content.group(1).decode("latin-1", "replace").split("\n")

            if item.get(FORMAT) == FORMATS.PRESERVED:
                # ANPA defines a number of special characters, e.g. TLI (Tab Line
                # Indicator, hex 08) and TTS Space Band (hex 10); these two are
                # replaced here, there will likely be others
                body_lines = []
                for raw in text:
                    if raw.startswith(("\t", "^", "\b")):
                        body_lines.append(
                            raw.strip("^").replace("\b", "%08").replace("\x10", "%10"))
                preserved = "\n".join(body_lines)
                item["body_html"] = "<pre>" + preserved + "</pre>"
            else:
                body_lines = [raw.strip() for raw in text if raw.startswith("\t")]
                paragraphs = "</p><p>".join(body_lines)
                item["body_html"] = "<p>" + paragraphs + "</p>"

            # content metadata: lines that start with a caret
            header_lines = [raw.strip("^<= ") for raw in text if raw.startswith("^")]
            header_count = len(header_lines)
            if header_count > 1:
                item["headline"] = header_lines[1].rstrip("\r\n^<= ")
            if header_count > 3:
                item["byline"] = header_lines[-2].rstrip("\r\n^<= ")

                # a body consisting of one empty line counts as "no body":
                # use the header lines from the third one on instead
                if body_lines == [""]:
                    fallback = "</p><p>".join(header_lines[2:])
                    item["body_html"] = "<p>" + fallback + "</p>"

            # slugline from the first header line overrides the one set above
            if header_count > 1:
                slug_match = re.match(
                    "[A-Z]{2}-[A-Z]{2}--([a-z-0-9.]+)", header_lines[0], flags=re.I)
                if slug_match is not None:
                    item["slugline"] = slug_match.group(1)

            # ednote
            self._parse_ednote(header_lines, item)

            return item
        except Exception as ex:
            raise ParserError.anpaParseFileError(file_path, ex)
Exemplo n.º 3
0
    def parse(self, file_path, provider=None):
        """Turn the ANPA file at ``file_path`` into an item dict.

        Reads the file as bytes, pulls metadata out of the first two lines and
        the fourth-from-last line, then converts the content found between the
        0x02 and 0x03 bytes into ``body_html``. ``provider`` is not used.
        Every exception is wrapped via ``ParserError.anpaParseFileError``.
        """
        try:
            item = {
                ITEM_TYPE: CONTENT_TYPE.TEXT,
                GUID_FIELD: generate_guid(type=GUID_TAG),
                FORMAT: FORMATS.HTML,
            }

            with open(file_path, 'rb') as handle:
                lines = handle.readlines()

            # first header line
            first = re.match(
                b'\x16\x16\x01([a-z])([0-9]{4})\x1f([a-z-]+)', lines[0], flags=re.I)
            if first is not None:
                item['provider_sequence'] = first.group(2).decode()

            # second header line
            second = re.match(
                b'([a-z]) ([a-z])(\x13|\x14)(\x11|\x12) (am-|pm-|bc-|ap-)([a-z-.]+)(.*) '
                b'([0-9]{1,2})-([0-9]{1,2}) ([0-9]{0,4})',
                lines[1],
                flags=re.I)
            if second is not None:
                (priority_code, category, _, format_flag, _,
                 slug, take_key, _, _, word_count) = second.groups()
                item['priority'] = self.map_priority(priority_code.decode())
                item['anpa_category'] = [{'qcode': category.decode()}]
                item['slugline'] = slug.decode('latin-1', 'replace')
                item['anpa_take_key'] = take_key.decode('latin-1', 'replace').strip()
                # an empty word-count group leaves the field unset
                count_text = word_count.decode()
                if count_text:
                    item['word_count'] = int(count_text)
                if format_flag == b'\x12':
                    item[FORMAT] = FORMATS.PRESERVED

            # created date on the fourth line from the end
            stamp = re.search(
                b'\x03([a-z]+)-([a-z]+)-([0-9]+-[0-9]+-[0-9]+ [0-9]{2}[0-9]{2})GMT',
                lines[-4],
                flags=re.I)
            if stamp is not None:
                created = datetime.strptime(
                    stamp.group(3).decode(), '%m-%d-%y %H%M').replace(tzinfo=utc)
                item['firstcreated'] = created
                item['versioncreated'] = created

            # anpa content
            content = re.match(b'\x02(.*)\x03', b''.join(lines[2:]), flags=re.M | re.S)
            if content is None:
                return item

            text = content.group(1).decode('latin-1', 'replace').split('\n')

            if item.get(FORMAT) == FORMATS.PRESERVED:
                # ANPA defines a number of special characters, e.g. TLI (Tab Line
                # Indicator, hex 08) and TTS Space Band (hex 10); these two are
                # replaced here, there will likely be others
                body_lines = []
                for row in text:
                    if not row.startswith(('\t', '^', '\b')):
                        continue
                    body_lines.append(
                        row.strip('^').replace('\b', '%08').replace('\x10', '%10'))
                preserved = '\n'.join(body_lines)
                item['body_html'] = '<pre>' + preserved + '</pre>'
            else:
                body_lines = [row.strip() for row in text if row.startswith('\t')]
                joined = '</p><p>'.join(body_lines)
                item['body_html'] = '<p>' + joined + '</p>'

            # content metadata: lines that start with a caret
            header_lines = [row.strip('^<= ') for row in text if row.startswith('^')]
            has_headline = len(header_lines) > 1
            if has_headline:
                item['headline'] = header_lines[1].rstrip('\r\n^<= ')
            if len(header_lines) > 3:
                item['byline'] = header_lines[-2].rstrip('\r\n^<= ')

                # a single empty body line means there is no body:
                # fall back to the header lines from the third one on
                if body_lines == ['']:
                    fallback = '</p><p>'.join(header_lines[2:])
                    item['body_html'] = '<p>' + fallback + '</p>'

            # slugline from the first header line replaces the earlier value
            if has_headline:
                slug_match = re.match(
                    '[A-Z]{2}-[A-Z]{2}--([a-z-0-9.]+)', header_lines[0], flags=re.I)
                if slug_match is not None:
                    item['slugline'] = slug_match.group(1)

            # ednote
            self._parse_ednote(header_lines, item)

            return item
        except Exception as ex:
            raise ParserError.anpaParseFileError(file_path, ex)
Exemplo n.º 4
0
    def parse(self, file_path, provider=None):
        """Parse the ANPA file at ``file_path`` and return the item dict.

        Metadata is read from the first two lines and the fourth-from-last
        line; the content between the 0x02 and 0x03 bytes becomes
        ``body_html``. ``provider`` is not used. Any exception is re-raised
        through ``ParserError.anpaParseFileError``.
        """
        try:
            item = {
                ITEM_TYPE: CONTENT_TYPE.TEXT,
                GUID_FIELD: generate_guid(type=GUID_TAG),
                FORMAT: FORMATS.HTML,
            }

            with open(file_path, 'rb') as source:
                lines = source.readlines()

            # first header line
            sequence = re.match(
                b'\x16\x16\x01([a-z])([0-9]{4})\x1f([a-z-]+)', lines[0], flags=re.I)
            if sequence is not None:
                item['provider_sequence'] = sequence.group(2).decode()

            # second header line
            header = re.match(
                b'([a-z]) ([a-z])(\x13|\x14)(\x11|\x12) (am-|pm-|bc-|ap-)([a-z-.]+)(.*) '
                b'([0-9]{1,2})-([0-9]{1,2}) ([0-9]{4})',
                lines[1], flags=re.I)
            if header is not None:
                (priority_code, category, _, format_flag, _,
                 slug, take_key, _, _, word_count) = header.groups()
                item['priority'] = self.map_priority(priority_code.decode())
                item['anpa_category'] = [{'qcode': category.decode()}]
                item['slugline'] = slug.decode('latin-1', 'replace')
                item['anpa_take_key'] = take_key.decode('latin-1', 'replace').strip()
                item['word_count'] = int(word_count.decode())
                if format_flag == b'\x12':
                    item[FORMAT] = FORMATS.PRESERVED

            # created date on the fourth line from the end
            trailer = re.search(
                b'\x03([a-z]+)-([a-z]+)-([0-9]+-[0-9]+-[0-9]+ [0-9]{2}[0-9]{2})GMT',
                lines[-4], flags=re.I)
            if trailer is not None:
                created = datetime.strptime(trailer.group(3).decode(), '%m-%d-%y %H%M')
                created = created.replace(tzinfo=utc)
                item['firstcreated'] = created
                item['versioncreated'] = created

            # anpa content
            content = re.match(b'\x02(.*)\x03', b''.join(lines[2:]), flags=re.M | re.S)
            if content is None:
                return item

            text = content.group(1).decode('latin-1', 'replace').split('\n')

            if item.get(FORMAT) == FORMATS.PRESERVED:
                # ANPA defines a number of special characters, e.g. TLI (Tab Line
                # Indicator, hex 08) and TTS Space Band (hex 10); these two are
                # replaced here, there will likely be others
                kept = (entry for entry in text if entry.startswith(('\t', '^', '\b')))
                body_lines = [
                    entry.strip('^').replace('\b', '%08').replace('\x10', '%10')
                    for entry in kept
                ]
                item['body_html'] = '<pre>' + '\n'.join(body_lines) + '</pre>'
            else:
                body_lines = []
                for entry in text:
                    if entry.startswith('\t'):
                        body_lines.append(entry.strip())
                item['body_html'] = '<p>' + '</p><p>'.join(body_lines) + '</p>'

            # content metadata: lines that start with a caret
            header_lines = [entry.strip('^<= ') for entry in text if entry.startswith('^')]
            header_count = len(header_lines)
            if header_count > 1:
                item['headline'] = header_lines[1].rstrip('\r\n^<= ')
            if header_count > 3:
                item['byline'] = header_lines[-2].rstrip('\r\n^<= ')

                # one empty body line means "no body": use the header lines
                # from the third one on
                if body_lines == ['']:
                    item['body_html'] = '<p>' + '</p><p>'.join(header_lines[2:]) + '</p>'

            # slugline from the first header line replaces the earlier value
            if header_count > 1:
                slug_match = re.match(
                    '[A-Z]{2}-[A-Z]{2}--([a-z-0-9.]+)', header_lines[0], flags=re.I)
                if slug_match is not None:
                    item['slugline'] = slug_match.group(1)

            # ednote
            self._parse_ednote(header_lines, item)

            return item
        except Exception as ex:
            raise ParserError.anpaParseFileError(file_path, ex)
Exemplo n.º 5
0
    def parse(self, file_path, provider=None):
        """Read the ANPA file at ``file_path`` and return it as an item dict.

        The first two lines and the fourth-from-last line supply metadata; the
        content between the 0x02 and 0x03 bytes supplies ``body_html``, the
        headline, byline, slugline and editor's note. ``provider`` is not
        used. Exceptions are wrapped via ``ParserError.anpaParseFileError``.
        """
        try:
            item = {
                ITEM_TYPE: CONTENT_TYPE.TEXT,
                GUID_FIELD: generate_guid(type=GUID_TAG),
                FORMAT: FORMATS.HTML,
            }

            with open(file_path, "rb") as wire_file:
                lines = wire_file.readlines()

            # first header line
            sequence = re.match(
                b"\x16\x16\x01([a-z])([0-9]{4})\x1f([a-z-]+)", lines[0], flags=re.I)
            if sequence is not None:
                item["provider_sequence"] = sequence.group(2).decode()

            # second header line
            header = re.match(
                b"([a-z]) ([a-z])(\x13|\x14)(\x11|\x12) (am-|pm-|bc-|ap-)([a-z-.]+)(.*) "
                b"([0-9]{1,2})-([0-9]{1,2}) ([0-9]{4})",
                lines[1],
                flags=re.I,
            )
            if header is not None:
                (priority_code, category, _, format_flag, _,
                 slug, take_key, _, _, word_count) = header.groups()
                item["priority"] = self.map_priority(priority_code.decode())
                item["anpa_category"] = [{"qcode": category.decode()}]
                item["slugline"] = slug.decode("latin-1", "replace")
                item["anpa_take_key"] = take_key.decode("latin-1", "replace").strip()
                item["word_count"] = int(word_count.decode())
                if format_flag == b"\x12":
                    item[FORMAT] = FORMATS.PRESERVED

            # created date on the fourth line from the end
            trailer = re.search(
                b"\x03([a-z]+)-([a-z]+)-([0-9]+-[0-9]+-[0-9]+ [0-9]{2}[0-9]{2})GMT",
                lines[-4],
                flags=re.I,
            )
            if trailer is not None:
                created = datetime.strptime(trailer.group(3).decode(), "%m-%d-%y %H%M")
                created = created.replace(tzinfo=utc)
                item["firstcreated"] = created
                item["versioncreated"] = created

            # anpa content
            content = re.match(b"\x02(.*)\x03", b"".join(lines[2:]), flags=re.M | re.S)
            if content is None:
                return item

            text = content.group(1).decode("latin-1", "replace").split("\n")

            if item.get(FORMAT) == FORMATS.PRESERVED:
                # ANPA defines a number of special characters, e.g. TLI (Tab Line
                # Indicator, hex 08) and TTS Space Band (hex 10); these two are
                # replaced here, there will likely be others
                body_lines = []
                for piece in text:
                    if piece.startswith(("\t", "^", "\b")):
                        cleaned = piece.strip("^").replace("\b", "%08")
                        body_lines.append(cleaned.replace("\x10", "%10"))
                item["body_html"] = "<pre>" + "\n".join(body_lines) + "</pre>"
            else:
                body_lines = [piece.strip() for piece in text if piece.startswith("\t")]
                item["body_html"] = "<p>" + "</p><p>".join(body_lines) + "</p>"

            # content metadata: lines that start with a caret
            header_lines = [piece.strip("^<= ") for piece in text if piece.startswith("^")]
            header_count = len(header_lines)
            if header_count > 1:
                item["headline"] = header_lines[1].rstrip("\r\n^<= ")
            if header_count > 3:
                item["byline"] = header_lines[-2].rstrip("\r\n^<= ")

                # one empty body line means "no body": use the header lines
                # from the third one on
                if body_lines == [""]:
                    item["body_html"] = "<p>" + "</p><p>".join(header_lines[2:]) + "</p>"

            # slugline from the first header line replaces the earlier value
            if header_count > 1:
                slug_match = re.match(
                    "[A-Z]{2}-[A-Z]{2}--([a-z-0-9.]+)", header_lines[0], flags=re.I)
                if slug_match is not None:
                    item["slugline"] = slug_match.group(1)

            # ednote: when several header lines match, the last one is kept
            notes = []
            for header_line in header_lines:
                found = re.search("EDITOR'S NOTE _(.*)", header_line)
                if found is not None:
                    notes.append(found.group(1).strip())
            if notes:
                item["ednote"] = notes[-1]

            return item
        except Exception as ex:
            raise ParserError.anpaParseFileError(file_path, ex)
Exemplo n.º 6
0
    def parse(self, file_path, provider=None):
        """Parse the ANPA file at ``file_path`` into an item dict.

        Metadata comes from the first two lines and the fourth-from-last line.
        The tab-prefixed lines between the 0x02 and 0x03 bytes are stored as
        ``body_text`` for preformatted items and as ``body_html`` otherwise.
        ``provider`` is not used. Exceptions are wrapped via
        ``ParserError.anpaParseFileError``.
        """
        try:
            item = {
                ITEM_TYPE: CONTENT_TYPE.TEXT,
                GUID_FIELD: generate_guid(type=GUID_TAG),
            }

            with open(file_path, 'rb') as wire_file:
                lines = wire_file.readlines()

            # first header line
            sequence = re.match(
                b'\x16\x16\x01([a-z])([0-9]{4})\x1f([a-z-]+)', lines[0], flags=re.I)
            if sequence is not None:
                item['provider_sequence'] = sequence.group(2).decode()

            # second header line
            header = re.match(
                b'([a-z]) ([a-z])(\x13|\x14)(\x11|\x12) (am-|pm-|bc-|ap-)([a-z-.]+)(.*) '
                b'([0-9]{1,2})-([0-9]{1,2}) ([0-9]{4})',
                lines[1],
                flags=re.I)
            if header is not None:
                (priority_code, category, _, format_flag, _,
                 slug, take_key, _, _, word_count) = header.groups()
                item['priority'] = self.map_priority(priority_code.decode())
                item['anpa_category'] = [{'qcode': category.decode()}]
                item['slugline'] = slug.decode('latin-1', 'replace')
                item['anpa_take_key'] = take_key.decode('latin-1', 'replace').strip()
                item['word_count'] = int(word_count.decode())
                if format_flag == b'\x12':
                    item[ITEM_TYPE] = CONTENT_TYPE.PREFORMATTED

            # created date on the fourth line from the end
            trailer = re.search(
                b'\x03([a-z]+)-([a-z]+)-([0-9]+-[0-9]+-[0-9]+ [0-9]{2}[0-9]{2})GMT',
                lines[-4],
                flags=re.I)
            if trailer is not None:
                created = datetime.strptime(trailer.group(3).decode(), '%m-%d-%y %H%M')
                created = created.replace(tzinfo=utc)
                item['firstcreated'] = created
                item['versioncreated'] = created

            # anpa content
            content = re.match(b'\x02(.*)\x03', b''.join(lines[2:]), flags=re.M | re.S)
            if content is None:
                return item

            text = content.group(1).decode('latin-1', 'replace').split('\n')

            # text: tab-prefixed lines
            body_lines = []
            for row in text:
                if row.startswith('\t'):
                    body_lines.append(row.strip())
            is_preformatted = item[ITEM_TYPE] == CONTENT_TYPE.PREFORMATTED
            if is_preformatted:
                item['body_text'] = '\n'.join(body_lines)
            else:
                item['body_html'] = '<p>' + '</p><p>'.join(body_lines) + '</p>'

            # content metadata: caret-prefixed lines
            header_lines = [row.strip('^<= ') for row in text if row.startswith('^')]
            header_count = len(header_lines)
            if header_count > 1:
                item['headline'] = header_lines[1].rstrip('\r\n^<= ')
            if header_count > 3:
                item['byline'] = header_lines[-2].rstrip('\r\n^<= ')

                # one empty body line means "no body": use the header lines
                # from the third one on (written to body_html in either mode)
                if body_lines == ['']:
                    item['body_html'] = '<p>' + '</p><p>'.join(header_lines[2:]) + '</p>'

            # slugline from the first header line replaces the earlier value
            if header_count > 1:
                slug_match = re.match(
                    '[A-Z]{2}-[A-Z]{2}--([a-z-0-9.]+)', header_lines[0], flags=re.I)
                if slug_match is not None:
                    item['slugline'] = slug_match.group(1)

            # ednote: when several header lines match, the last one is kept
            notes = [
                found.group(1).strip()
                for found in (re.search("EDITOR'S NOTE _(.*)", row) for row in header_lines)
                if found is not None
            ]
            if notes:
                item['ednote'] = notes[-1]

            return item
        except Exception as ex:
            raise ParserError.anpaParseFileError(file_path, ex)
Exemplo n.º 7
0
    def parse_file(self, filename):
        """Parse the ANPA file with the given filename into an item dict.

        The file is read as bytes. Metadata is taken from the first two lines
        and from the fourth-from-last line; the story content is whatever sits
        between the 0x02 and 0x03 bytes that follow the two header lines.

        :param filename: path of the file to parse
        :return: dict holding the fields that could be extracted
        :raises: the error built by ``ParserError.anpaParseFileError`` when
            anything goes wrong while reading or parsing the file
        """
        try:
            item = {'type': 'text'}

            with open(filename, 'rb') as f:
                lines = f.readlines()

            # parse first header line (only the four-digit group is kept)
            m = re.match(b'\x16\x16\x01([a-z])([0-9]{4})\x1f([a-z-]+)',
                         lines[0],
                         flags=re.I)
            if m:
                item['provider_sequence'] = m.group(2).decode()

            # parse second header line
            m = re.match(
                b'([a-z]) ([a-z])(\x13|\x14)(\x11|\x12) (am-|pm-|bc-)([a-z-]+)(.*) '
                b'([0-9]{1,2})-([0-9]{1,2}) ([0-9]{4})',
                lines[1],
                flags=re.I)
            if m:
                item['priority'] = m.group(1).decode()
                item['anpa-category'] = {'qcode': m.group(2).decode()}
                item['word_count'] = int(m.group(10).decode())
                # a \x12 byte in the fourth group switches the item to preformatted
                if m.group(4) == b'\x12':
                    item['type'] = 'preformatted'

            # parse created date at the end of file (fourth line from the end)
            m = re.search(
                b'\x03([a-z]+)-([a-z]+)-([0-9]+-[0-9]+-[0-9]+ [0-9]{2}[0-9]{2})GMT',
                lines[-4],
                flags=re.I)
            if m:
                item['firstcreated'] = datetime.strptime(
                    m.group(3).decode(), '%m-%d-%y %H%M').replace(tzinfo=utc)

            # parse anpa content
            body = b''.join(lines[2:])
            m = re.match(b'\x02(.*)\x03', body, flags=re.M | re.S)
            if m:
                raw_text = m.group(1)
                try:
                    decoded = raw_text.decode()
                except UnicodeDecodeError:
                    # A single non-UTF-8 byte used to abort the whole parse;
                    # latin-1 can represent every byte, so fall back to it.
                    decoded = raw_text.decode('latin-1', 'replace')
                text = decoded.split('\n')

                # text: lines that start with a tab
                body_lines = [line.strip() for line in text if line.startswith('\t')]
                item['body_text'] = '\n'.join(body_lines)

                # content metadata: lines that start with a caret
                header_lines = [
                    line.strip('^<= ') for line in text if line.startswith('^')
                ]
                if len(header_lines) > 3:
                    item['headline'] = super().trim_headline(header_lines[1])
                    item['byline'] = header_lines[-2]

                # slugline
                if len(header_lines) > 1:
                    m = re.match('[A-Z]{2}-[A-Z]{2}--([a-z-0-9]+)',
                                 header_lines[0],
                                 flags=re.I)
                    if m:
                        item['slugline'] = super().trim_slugline(m.group(1))

                # ednote: when several header lines match, the last one wins
                for line in header_lines:
                    m = re.search("EDITOR'S NOTE _(.*)", line)
                    if m:
                        item['ednote'] = m.group(1).strip()

            return item
        except Exception as ex:
            raise ParserError.anpaParseFileError(filename, ex) from ex
Exemplo n.º 8
0
    def parse(self, file_path, provider=None):
        """Parse a Kyodo wire file into an item dict.

        The file is read as bytes. Metadata is taken from the first two lines
        and from the last line. In the content between the 0x02 and 0x03
        bytes, a first line starting with "BC-" gives the slugline, the
        following lines up to and including the first one ending in "+" are
        concatenated into the headline, and the remaining lines become
        ``<p>`` paragraphs of ``body_html`` until a "==Kyodo" line is reached.

        :param file_path: path of the file to parse
        :param provider: not used by this method
        :return: dict holding the fields that could be extracted
        :raises: the error built by ``ParserError.anpaParseFileError`` when
            anything goes wrong while reading or parsing the file
        """
        try:
            item = {
                ITEM_TYPE: CONTENT_TYPE.TEXT,
                GUID_FIELD: generate_guid(type=GUID_TAG),
                FORMAT: FORMATS.HTML
            }

            with open(file_path, 'rb') as f:
                lines = f.readlines()

            # parse first header line
            # NOTE(review): the pattern starts with the literal text "x01", not
            # the byte \x01 -- confirm against a sample file before changing it.
            m = re.match(b'x01([a-z])([0-9]{4})KYODO\x1f([a-z0-9-]+)',
                         lines[0],
                         flags=re.I)
            if m:
                item['provider_sequence'] = m.group(2).decode()

            # parse second header line
            m = re.match(
                b'([a-z]) ([a-z])(\x13|\x14)(\x11|\x12) (am-|pm-|bc-|ap-)([a-z-.]+)(.*) '
                b'([0-9]{1,2})-([0-9]{1,2}) ([0-9]{4})',
                lines[1],
                flags=re.I)
            if m:
                item['priority'] = self.map_priority(m.group(1).decode())
                item['anpa_category'] = [{'qcode': m.group(2).decode()}]
                item['slugline'] = m.group(6).decode('latin-1', 'replace')
                item['anpa_take_key'] = m.group(7).decode(
                    'latin-1', 'replace').strip()
                item['word_count'] = int(m.group(10).decode())
                if m.group(4) == b'\x12':
                    item[FORMAT] = FORMATS.PRESERVED

            # parse created date at the end of file (last line)
            m = re.search(
                b'\x03([A-Z]{3})-([0-9]{2}:[0-9]{2}-[0-9]{2}-[0-9]{2}-[0-9]{2})',
                lines[-1],
                flags=re.I)
            if m:
                tz = pytz.timezone(
                    config.TIMEZONE_CODE[m.group(1).decode().lower()])
                naive = datetime.strptime(m.group(2).decode(), '%H:%M-%d-%m-%y')
                # pytz zones must be attached with localize(); passing them to
                # datetime.replace(tzinfo=...) picks the zone's first (LMT)
                # offset and shifts the resulting UTC time.
                item['firstcreated'] = tz.localize(naive).astimezone(pytz.utc)
                item['versioncreated'] = item['firstcreated']

            # parse anpa content
            body = b''.join(lines[2:])
            m = re.match(b'\x02(.*)\x03', body, flags=re.M | re.S)
            if m:
                text = m.group(1).decode('latin-1', 'replace').split('\n')
                headline_parts = []
                paragraphs = []
                in_header = True
                # enumerate() so that only the real first line is treated as
                # the slugline line, not any later line with the same text
                for index, line in enumerate(text):
                    if index == 0:
                        slug = re.match('BC-(.*)', line, flags=re.I)
                        if slug:
                            item['slugline'] = slug.group(1).rstrip('\r')
                            continue
                    if in_header:
                        # a trailing "+" closes the headline section
                        if line.endswith('+\r'):
                            in_header = False
                            line = line.rstrip('+\r')
                        else:
                            line = line.rstrip('\r')
                        headline_parts.append(line)
                        continue

                    if line == '==Kyodo\r':
                        break
                    paragraphs.append('<p>' + line.rstrip('\r') + '</p>')

                if headline_parts:
                    item['headline'] = ''.join(headline_parts)
                if paragraphs:
                    item['body_html'] = ''.join(paragraphs)

                # Content without any headline line used to fail with a
                # KeyError here; skip the ednote lookup instead.
                # NOTE(review): the headline is passed as a single string;
                # confirm that is what _parse_ednote expects.
                if 'headline' in item:
                    self._parse_ednote(item['headline'], item)
            return item
        except Exception as ex:
            raise ParserError.anpaParseFileError(file_path, ex) from ex
Exemplo n.º 9
0
    def parse(self, file_path, provider=None):
        """Parse a Kyodo wire file into an item dict with product mapping.

        The file is read as bytes. Metadata is taken from the first two lines
        and from the last line; the category letter of the second line is
        looked up in ``self.MAPPING_PRODUCTS`` to build the ``subject`` list.
        In the content between the 0x02 and 0x03 bytes, the lines up to and
        including the first one ending in "+" are concatenated into the
        headline and the remaining lines become ``<p>`` paragraphs of
        ``body_html`` until a "==Kyodo" line is reached. ``slugline`` and
        ``keywords`` are always reset to ``None`` and ``[]`` before returning.

        :param file_path: path of the file to parse
        :param provider: not used by this method
        :return: dict holding the fields that could be extracted
        :raises: the error built by ``ParserError.anpaParseFileError`` when
            anything goes wrong while reading or parsing the file
        """
        try:
            item = {
                ITEM_TYPE: CONTENT_TYPE.TEXT,
                GUID_FIELD: generate_guid(type=GUID_TAG),
                FORMAT: FORMATS.HTML
            }

            with open(file_path, 'rb') as f:
                lines = f.readlines()

            # parse first header line
            # NOTE(review): the pattern starts with the literal text "x01", not
            # the byte \x01 -- confirm against a sample file before changing it.
            m = re.match(b'x01([a-z])([0-9]{4})KYODO\x1f([a-z0-9-]+)',
                         lines[0],
                         flags=re.I)
            if m:
                item['provider_sequence'] = m.group(2).decode()

            # parse second header line
            m = re.match(
                b'([a-z]) ([a-z])(\x13|\x14)(\x11|\x12) (am-|pm-|bc-|ap-)([a-z-.]+)(.*) '
                b'([0-9]{1,2})-([0-9]{1,2}) ([0-9]{4})',
                lines[1],
                flags=re.I)
            if m:
                item['language'] = 'en'
                item['priority'] = 2 if m.group(1).decode() == 'u' else 3
                qcode = m.group(2).decode().upper()
                item['anpa_category'] = [{'qcode': qcode}]
                # Mapping product; unknown categories map to 'NEWS/GENERAL'
                qcode = self.MAPPING_PRODUCTS.get(qcode, 'NEWS/GENERAL')
                item.setdefault('subject', []).extend([
                    {
                        'name': qcode,
                        'qcode': qcode,
                        'parent': 'NEWS',
                        'scheme': 'services-products'
                    },
                    {
                        'name': 'KYODO',
                        'qcode': 'KYODO',
                        'scheme': 'sources'
                    },
                    {
                        'name': 'default',
                        'qcode': 'default',
                        'scheme': 'distribution'
                    },
                ])
                item['slugline'] = m.group(6).decode('latin-1', 'replace')
                item['anpa_take_key'] = m.group(7).decode(
                    'latin-1', 'replace').strip()
                item['word_count'] = int(m.group(10).decode())
                if m.group(4) == b'\x12':
                    item[FORMAT] = FORMATS.PRESERVED

            # parse created date at the end of file (last line)
            m = re.search(
                b'\x03([A-Z]{3})-([0-9]{2}:[0-9]{2}-[0-9]{2}-[0-9]{2}-[0-9]{2})',
                lines[-1],
                flags=re.I)
            if m:
                tz = pytz.timezone(
                    config.TIMEZONE_CODE[m.group(1).decode().lower()])
                naive = datetime.strptime(m.group(2).decode(), '%H:%M-%d-%m-%y')
                # pytz zones must be attached with localize(); passing them to
                # datetime.replace(tzinfo=...) picks the zone's first (LMT)
                # offset and shifts the resulting UTC time.
                item['firstcreated'] = tz.localize(naive).astimezone(pytz.utc)
                item['versioncreated'] = item['firstcreated']

            # parse anpa content
            body = b''.join(lines[2:])
            m = re.match(b'\x02(.*)\x03', body, flags=re.M | re.S)
            if m:
                text = m.group(1).decode('latin-1', 'replace').split('\n')
                item['keywords'] = text[0].strip('\r').split("-")
                # abstract: text of lines 3..n-1 up to the first full stop
                item['abstract'] = re.split(
                    "\\..?",
                    ("".join(line.strip() for line in text[2:-1])))[0] + '.'
                # city: the part of the abstract before its first comma
                item.setdefault('extra',
                                {})['city'] = item.get('abstract',
                                                       '').split(',')[0]
                headline_parts = []
                paragraphs = []
                in_header = True
                # enumerate() so that only the real first line is treated as
                # the slugline line, not any later line with the same text
                for index, line in enumerate(text):
                    if index == 0:
                        slug = re.match('BC-(.*)', line, flags=re.I)
                        if slug:
                            item['slugline'] = slug.group(1).rstrip('\r')
                            continue
                    if in_header:
                        # a trailing "+" closes the headline section
                        if line.endswith('+\r'):
                            in_header = False
                            line = line.rstrip('+\r')
                        else:
                            line = line.rstrip('\r')
                        headline_parts.append(line)
                        continue

                    if line == '==Kyodo\r':
                        break
                    paragraphs.append('<p>' + line.rstrip('\r') + '</p>')

                if headline_parts:
                    item['headline'] = ''.join(headline_parts)
                if paragraphs:
                    item['body_html'] = ''.join(paragraphs)

                # Content without any headline line used to fail with a
                # KeyError here; skip the ednote lookup instead.
                # NOTE(review): the headline is passed as a single string;
                # confirm that is what _parse_ednote expects.
                if 'headline' in item:
                    self._parse_ednote(item['headline'], item)
            # Slugline and keywords are always returned empty
            item['slugline'] = None
            item['keywords'] = []
            return item
        except Exception as ex:
            raise ParserError.anpaParseFileError(file_path, ex) from ex