예제 #1
0
    def add_indent_info_for_dashed_lines(self, xml):
        """Add plaintext indentation prefix to all lines that start with a dash representing
        either a new logical division unit, or a dashed explanatory section for a list preceding
        it (see below).

        "Dashed explanatory section" is easiest understood by example:
        Art. 123 Sausages may be consumed:
        1) cooked, or
        2) grilled, or
        3) fried
        - unless they have been pre-smoked.

        The plaintext info has the form of a prefix "@@INDENT{X}@@", where the bit "{X}" is a
        number 0, 1, 2, 3, .., indicating which indentation level the dashed line starts at.

        The method works in three passes over the <text> nodes:
        1. Every node whose indentation level can be determined gets an "indent" attribute and
           the "@@INDENT{X}@@" text prefix.
        2. A dash at the start of a line is moved to the end of the previous node whenever
           self.should_join_dash_line() says the dash merely continues running text.
        3. The prefix is stripped again from every node that does not match
           self.DASH_PREFIX_WITH_INDENT, so only dash-led lines keep it.

        Args:
            xml: The parsed XML to operate on; it must expose find_all() (presumably a
                Beautiful Soup tree - confirm against the caller). It is modified in place.
        """

        # Pass 1: for each line, add its indent level (when one can be determined).
        indent_levels = self.get_all_indent_levels(xml)
        for node in xml.find_all("text"):
            indent_level = self.get_indent_level(int(node["left"]), indent_levels)
            if indent_level is not None:
                node["indent"] = indent_level
                node.string = "@@INDENT" + str(indent_level) + "@@" + node.get_text()

        # Pass 2: check all line starts. If they begin with a dash, and the dash is just in a
        # running piece of text (as opposed to e.g. a list of tirets, or explanatory section at
        # the end of a list of points) - then move the dash to the line above.
        # NOTE(review): a node counts as a new line only when its "top" exceeds every "top" seen
        # so far, i.e. this assumes "top" never decreases across the document - confirm for
        # multi-page input.
        last_seen_top = 0
        last_line_start = None
        last_node = None
        for node in xml.find_all("text"):
            top = int(node["top"])
            if top > last_seen_top:
                if (last_node is not None and last_line_start is not None
                        and self.should_join_dash_line(node, last_node, last_line_start)):
                    # Moving the dash to the line above. Note that one trailing whitespace will
                    # be added after the dash when newlines are removed.
                    last_node.string = last_node.get_text().strip() + u" –"
                    # The flag must be passed by keyword: the fourth positional parameter of
                    # re.sub() is `count`, not `flags`.
                    node.string = re.sub(
                        u"@@– ", "@@", node.get_text().strip(), flags=re.UNICODE)
                last_line_start = node
                last_seen_top = top
            last_node = node

        # Pass 3: remove indent info for all lines except ones still starting with dash.
        # "\d+" rather than "\d" so that two-digit indentation levels are stripped as well.
        for node in xml.find_all("text"):
            if not re.match(self.DASH_PREFIX_WITH_INDENT, node.get_text()):
                node.string = re.sub(r"^@@INDENT\d+@@", "", node.get_text().strip())
예제 #2
0
 def remove_empty_text_nodes(self, xml):
     """Remove the <text> nodes containing nothing or only whitespace.

     Each matching node is detached from the tree with extract(); all other nodes are left
     untouched.

     Args:
         xml: The parsed XML to operate on; it must expose find_all() (presumably a
             Beautiful Soup tree - confirm against the caller). It is modified in place.
     """
     # Raw string: "\s" in a plain literal is an invalid escape sequence on modern Python.
     blank_pattern = re.compile(r"^\s*$")
     for node in xml.find_all("text"):
         if blank_pattern.match(node.get_text()):
             node.extract()
예제 #3
0
    def should_join_dash_line(self, node, last_node, last_line_start):
        """Returns whether text in param node starts with a dash, and if so, whether it should be
        combined with the previous node. This concatenation should take place if the dash doesn't
        represent a new logical division (e.g. tiret), or a dashed section holding a summary for
        a list of items above it, but is simply the continuation of text from previous line.

        The decision depends on the "indent" attribute of node and of last_line_start, and on
        the self.*_PREFIX_WITH_INDENT patterns matched against the line texts.

        Args:
            node: The XML node whose text may need to be joined to previous line. It must carry
                an "indent" attribute whenever its text matches self.DASH_PREFIX_WITH_INDENT.
            last_node: The XML node directly preceding this node.
            last_line_start: The node holding the beginning of previous line (which may or
                may not be the same as last_node). It must carry an "indent" attribute.

        Returns:
            bool: True if the text of node starts with a dash and should be glued together with
                the previous line; False otherwise (including when there is no leading dash).
        """
        text = node.get_text()
        if not re.match(self.DASH_PREFIX_WITH_INDENT, text):
            return False

        current_indent_level = int(node["indent"])
        last_indent_level = int(last_line_start["indent"])
        last_line = last_line_start.get_text()

        if current_indent_level == 0:
            # Join unless the previous line opened a point, or (when the previous line is one
            # level deeper) only if that line matches the level 0 prefix pattern.
            if last_indent_level == 0:
                return not re.match(self.POINT_PREFIX_WITH_INDENT, last_line)
            if last_indent_level == 1:
                return bool(re.match(self.LEVEL0_PREFIX_WITH_INDENT, last_line))
            return False

        if current_indent_level == 1:
            return (last_indent_level < 2
                    and not re.match(self.LETTER_PREFIX_WITH_INDENT, last_line))

        if current_indent_level == 2:
            # Do not join when the preceding node ends with a colon, when the previous line is
            # indented one level deeper, or when the previous line is a dashed line at this
            # same level.
            if re.match(r'.*:\s*$', last_node.get_text()):
                return False
            if last_indent_level == 3:
                return False
            if last_indent_level == 2 and re.match(self.DASH_PREFIX_WITH_INDENT, last_line):
                return False
            return True

        # At the deepest handled levels the only question is whether the line matches the
        # double/triple tiret prefix pattern; if it does not, it is treated as running text.
        if current_indent_level == 3:
            return not re.match(self.DOUBLE_TIRET_PREFIX_WITH_INDENT, text)
        if current_indent_level == 4:
            return not re.match(self.TRIPLE_TIRET_PREFIX_WITH_INDENT, text)
        return False
예제 #4
0
    def undecorate_outgoing_and_upcoming_sections(self, xml):
        """Strip the '[ ... ]' and '< ... >' markers that wrap outgoing and upcoming sections.

        In ISAP unified texts, when a given article, etc is changing on some future date, they
        first print the current version in italics and inside '[ ... ]' markers, and immediately
        after, they print the upcoming version in bold and inside '< ... >' markers.

        This function removes the '[', ']', '<', '>' markers. We rely on the person doing manual
        post-processing to remove the section that's currently not effective and leave the
        section that's currently in force.

        Note that we don't currently catch cases when the outgoing/upcoming sections happen
        inline, for example like this: "2) w okresie między dniem 1 stycznia następnego roku
        a [terminem określonym dla złożenia] <upływem terminu określonego na złożenie>
        zeznania..".

        Args:
            xml: The parsed XML to operate on; it must expose find_all() (presumably a
                Beautiful Soup tree - confirm against the caller). It is modified in place.

        Raises:
            Exception: If a line inside an outgoing section is not entirely italic, if a line
                inside an upcoming section is not entirely bold, or if both section flags are
                set at once.
        """
        # Placeholder used when a node does not hold exactly one <i> / <b> child, so that the
        # equality checks against the node text below cannot succeed for ordinary text.
        garbage = "!@#$%^&*()"

        in_outgoing = False
        in_upcoming = False
        for node in xml.find_all('text'):
            italic_tags = node.find_all('i')
            bold_tags = node.find_all('b')
            text = node.get_text().strip()
            if len(bold_tags) == 1:
                btext = bold_tags[0].get_text().strip()
            else:
                btext = garbage
            if len(italic_tags) == 1:
                itext = italic_tags[0].get_text().strip()
            else:
                itext = garbage

            if in_outgoing and in_upcoming:
                raise Exception("Impossible to be in outgoing and upcoming section at same time.")

            if in_outgoing:
                # Continuation of a '[ ... ]' section: the whole line has to be italic.
                if itext != text:
                    raise Exception("Expected italics while being in outgoing section.")
                if text.endswith("]"):
                    in_outgoing = False
                    node.string = text.rstrip("]")
                continue

            if in_upcoming:
                # Continuation of a '< ... >' section: the whole line has to be bold.
                if btext != text:
                    raise Exception("Expected bold while being in upcoming section.")
                if text.endswith(">"):
                    in_upcoming = False
                    node.string = text.rstrip(">")
                continue

            # Outside of any section: look for a line that opens one.
            if itext == text and text.startswith("["):
                node.string = text.lstrip("[").rstrip("]")
                # The section stays open unless it also closes on this very line.
                in_outgoing = not text.endswith("]")
            elif btext == text and text.startswith("<"):
                node.string = text.lstrip("<").rstrip(">")
                in_upcoming = not text.endswith(">")
            elif (btext + u" " + itext == text
                    and btext.startswith("Art.") and itext.startswith("[")):
                # For cases like: "<text ...><b>Art. 22c.</b> <i>[1. Some text </i></text>."
                node.string = btext + u" " + itext.lstrip("[").rstrip("]")
                in_outgoing = not itext.endswith("]")
예제 #5
0
    def xml_to_text(self, xml):
        """Convert the Beautiful Soup XML into plain text. I'm not using xml.get_text() because
        it can glue <text> tags together without either whitespace or newline between them.

        Every <text> node contributes its stripped text followed by a single newline, in
        document order.

        Args:
            xml: The parsed XML to read; it must expose find_all().

        Returns:
            str: The law plain text (an empty string when there are no <text> nodes).
        """
        # join() builds the result in one pass instead of re-copying a growing string for
        # every node.
        return "".join(node.get_text().strip() + "\n" for node in xml.find_all("text"))
예제 #6
0
    def join_text_nodes_on_same_lines(self, xml):
        """Merge <text> nodes that sit on the same line into the first node of that line.

        Only nodes carrying a "line" attribute take part. A node is merged into the current
        line's leading node when its "left" lies strictly beyond that node's "left" + "width";
        the texts are joined with a single space, the leading node's "width" grows by the
        merged node's "width", and the merged node is extracted from the tree. Note that this
        may remove HTML formatting inside nodes.

        Args:
            xml: The parsed XML to operate on; it must expose find_all() (presumably a
                Beautiful Soup tree - confirm against the caller). It is modified in place.
        """
        positioned = [tag for tag in xml.find_all("text") if tag.has_attr("line")]
        if not positioned:
            return

        # The node that opened the line currently being assembled.
        anchor = positioned[0]
        for tag in positioned:
            if int(tag["line"]) > int(anchor["line"]):
                # A later line starts here; this node becomes the new merge target.
                anchor = tag
                continue
            if int(tag["left"]) > int(anchor["left"]) + int(anchor["width"]):
                anchor.string = anchor.get_text().strip() + " " + tag.get_text().strip()
                anchor["width"] = int(anchor["width"]) + int(tag["width"])
                tag.extract()
예제 #7
0
    def add_newline_if_level0_unit_starts_with_level1_unit(self, xml):
        r"""Insert a newline between a level 0 heading and a level 1 unit on the same line.

        In Polish law, the main logical unit for normative laws is denoted "Art." and
        for executive laws "§". The logical unit one level below, is denoted usually by
        strings matching the regex "\d+[a-z]*\.", and occasionally with "§ " prepended to that.
        (Note that there may also be a superscript before the ending dot.)

        Unlike pretty much all other logical units, if a main unit (which we call level 0)
        immediately starts with a unit one level below (which we call level 1), then the level 1
        unit starts NOT on the next line, but on the same line as the level 0 heading. For
        example, we might get the following plain text after parsing the PDF (note the lack of
        "\n" between "Art. 123." and following "1."): "Art. 123. 1. Some text. \n 2. Some more
        text."

        Such cases are difficult for the parser to tell apart from level 0 units which only
        have a single body of text (not divided into level 1 units) - they start on the same line
        too. To make it easier to distinguish them, we detect here the cases of a level 1 unit
        starting on the same line as level 0 unit, and add a newline between them.

        Args:
            xml: The parsed XML to operate on; it must expose find_all() (presumably a
                Beautiful Soup tree - confirm against the caller). It is modified in place.
        """

        # Plain u"" literals with escaped backslashes: the ur"" prefix is a syntax error on
        # Python 3. Named groups keep the substitution correct even if one of the prefix
        # regexes contains capturing groups of its own.
        pattern = re.compile(
            u"^(?P<level0_prefix>" + self.LEVEL0_PREFIX_REGEX + u")" + u"\\s+"
            + u"(?P<level1_prefix>" + self.LEVEL1_PREFIX_REGEX + u")")
        replacement = u"\\g<level0_prefix>\\n\\g<level1_prefix>"
        for node in xml.find_all('text'):
            # The text is reassigned for every node, whether or not the pattern matched.
            node.string = pattern.sub(replacement, node.get_text())
예제 #8
0
    def process_superscripts(self, xml):
        """Modify the passed in XML by searching for tags which represent superscript numbering
        and combining them with neighboring tags in such a way that superscripts are no longer
        indicated by XML positional info (lower font height and lower offset from page top than
        the rest of line), but by special surrounding text (self.SUPERSCRIPT_START before and
        self.SUPERSCRIPT_END after).

        Three consecutive <text> nodes (base, superscript, continuation) are merged into the
        base node when all of the following hold:
        - the superscript text consists only of lowercase ASCII letters and digits,
        - the continuation text starts with ". ",
        - base and continuation have the same "height" and the same "top",
        - the superscript has a smaller "height" than the base,
        - the superscript's "top" is not greater than the base's "top".
        The superscript and continuation nodes are then extracted from the tree. After a merge
        the base node is examined again against the nodes that follow, so several superscripts
        on one line are all folded into the same base node.

        Args:
            xml: The parsed XML to operate on; it must expose find_all() (presumably a
                Beautiful Soup tree - confirm against the caller). It is modified in place.
        """
        text_nodes = list(xml.find_all('text'))
        if len(text_nodes) < 3:
            return

        superscript_pattern = re.compile(r"^[a-z0-9]+$")
        # TODO: We may relax the node_plus_two_pattern a bit, particularly removing the
        # requirement of a period at the beginning. When a division number is mentioned from
        # another place, the period is not always there.
        node_plus_two_pattern = re.compile(r"^\. .*")

        nodes_to_remove = []
        index = 0
        # "index + 2 < len" visits every triple, including the one ending at the last node.
        while index + 2 < len(text_nodes):
            node = text_nodes[index]
            node_plus_one = text_nodes[index + 1]
            node_plus_two = text_nodes[index + 2]

            node_plus_one_txt = node_plus_one.get_text().strip()
            node_plus_two_txt = node_plus_two.get_text().strip()

            is_superscript_triple = (
                superscript_pattern.match(node_plus_one_txt)
                and node_plus_two_pattern.match(node_plus_two_txt)
                # node and node_plus_two must have the same height.
                and int(node["height"]) == int(node_plus_two["height"])
                # node_plus_one must have lower height than node/node_plus_two (smaller font).
                and int(node["height"]) > int(node_plus_one["height"])
                # node and node_plus_two must be in same line.
                and int(node["top"]) == int(node_plus_two["top"])
                # node_plus_one must not be below the line of node/node_plus_two.
                and int(node["top"]) >= int(node_plus_one["top"]))
            if not is_superscript_triple:
                index += 1
                continue

            # Concat all three nodes, surrounding text of node_plus_one with special labels.
            # Put concatenated text in node, remove node_plus_one & node_plus_two.
            node.string = (node.get_text().strip() + self.SUPERSCRIPT_START + node_plus_one_txt
                + self.SUPERSCRIPT_END + node_plus_two_txt)
            nodes_to_remove.append(node_plus_one)
            nodes_to_remove.append(node_plus_two)
            # Drop the consumed nodes from the work list and stay on the merged node. A node
            # that is about to be removed must never become the base of a later merge, or the
            # text merged into it would be lost.
            del text_nodes[index + 1:index + 3]

        for node in nodes_to_remove:
            node.extract()