Exemplo n.º 1
0
 def test_numeric_exception(self):
     """
     Check that NUMERIC sorting returns the ids in their input order
     when some of them (here ``d`` and ``c``) contain no digits.
     """
     bad_ids = [u"b002", u"d", u"c", u"a1"]
     idsa = IDSortingAlgorithm(IDSortingAlgorithm.NUMERIC)
     sids = idsa.sort(bad_ids)
     # assertEqual reports both lists on failure, unlike assertTrue(a == b)
     self.assertEqual(sids, bad_ids)
Exemplo n.º 2
0
    def _read_unparsed(self, lines, parameters):
        """
        Read text fragments from an unparsed format text file.

        The lines are joined and parsed into a soup object; the elements
        matching the optional ``class``/``id`` regex filters given in
        ``parameters`` are collected by their ``id`` attribute, the ids are
        ordered with the requested sorting algorithm, and the resulting
        ``[id, [text]]`` pairs are passed to ``_create_text_fragments``.

        :param lines: the lines of the unparsed text file
        :type  lines: list of strings
        :param parameters: additional parameters for parsing
                           (e.g., class/id regex strings)
        :type  parameters: dict
        """
        #
        # TODO better and/or parametric parsing,
        #      for example, removing tags but keeping text, etc.
        #
        self._log("Parsing fragments from unparsed text format")

        # build the attribute filters: class first, then id
        filter_specs = (
            (
                'class',
                gc.PPN_JOB_IS_TEXT_UNPARSED_CLASS_REGEX,
                "Regex for class: '%s'",
            ),
            (
                'id',
                gc.PPN_JOB_IS_TEXT_UNPARSED_ID_REGEX,
                "Regex for id: '%s'",
            ),
        )
        node_filters = {}
        for attr_name, param_key, log_format in filter_specs:
            if param_key not in parameters:
                continue
            pattern = parameters[param_key]
            if pattern is None:
                continue
            self._log([log_format, pattern])
            # the user regex must match a whole word inside the attribute
            node_filters[attr_name] = re.compile(r".*\b" + pattern + r"\b.*")

        # pick the id sorting algorithm, falling back to no sorting
        if gc.PPN_JOB_IS_TEXT_UNPARSED_ID_SORT in parameters:
            sort_algorithm = parameters[gc.PPN_JOB_IS_TEXT_UNPARSED_ID_SORT]
        else:
            sort_algorithm = IDSortingAlgorithm.UNSORTED
        self._log(["Sorting text fragments using '%s'", sort_algorithm])

        # parse the whole text into a soup object
        self._log("Creating soup")
        markup = "\n".join(lines)
        soup = BeautifulSoup.BeautifulSoup(markup)

        # collect the text of every matching element, keyed by its id
        text_by_id = {}
        found_ids = []
        self._log(["Finding elements matching attributes '%s'", node_filters])
        for node in soup.findAll(attrs=node_filters):
            try:
                fragment_id, fragment_text = node['id'], node.text
            except KeyError:
                self._log("KeyError while parsing a node", Logger.WARNING)
                continue
            text_by_id[fragment_id] = fragment_text
            found_ids.append(fragment_id)

        # order the ids as requested
        self._log("Sorting text fragments")
        ordered_ids = IDSortingAlgorithm(sort_algorithm).sort(found_ids)

        # hand over one [id, [text]] pair per id
        self._log("Appending fragments")
        pairs = [[key, [text_by_id[key]]] for key in ordered_ids]
        self._create_text_fragments(pairs)
Exemplo n.º 3
0
 def test_numeric(self):
     """Check the order produced by NUMERIC sorting of ``self.IDS``."""
     expected = [u"b001", u"a2", u"c03", u"d4"]
     idsa = IDSortingAlgorithm(IDSortingAlgorithm.NUMERIC)
     sids = idsa.sort(self.IDS)
     # assertEqual reports both lists on failure, unlike assertTrue(a == b)
     self.assertEqual(sids, expected)
Exemplo n.º 4
0
 def test_lexicographic(self):
     """Check the order produced by LEXICOGRAPHIC sorting of ``self.IDS``."""
     expected = [u"a2", u"b001", u"c03", u"d4"]
     idsa = IDSortingAlgorithm(IDSortingAlgorithm.LEXICOGRAPHIC)
     sids = idsa.sort(self.IDS)
     # assertEqual reports both lists on failure, unlike assertTrue(a == b)
     self.assertEqual(sids, expected)
Exemplo n.º 5
0
 def test_unsorted(self):
     """Check the order produced by the UNSORTED algorithm on ``self.IDS``."""
     expected = [u"b001", u"c03", u"d4", u"a2"]
     idsa = IDSortingAlgorithm(IDSortingAlgorithm.UNSORTED)
     sids = idsa.sort(self.IDS)
     # assertEqual reports both lists on failure, unlike assertTrue(a == b)
     self.assertEqual(sids, expected)
Exemplo n.º 6
0
 def test_invalid_algorithm(self):
     """Check that an unknown algorithm name raises ValueError."""
     with self.assertRaises(ValueError):
         # only the constructor call matters; the instance is not needed
         IDSortingAlgorithm(u"foo")
Exemplo n.º 7
0
    def _read_unparsed(self, lines):
        """
        Read text fragments from an unparsed format text file.

        The lines are joined and parsed with BeautifulSoup (``lxml`` parser).
        Elements matching the optional ``class``/``id`` regex filters found in
        ``self.parameters`` are collected by their ``id`` attribute, the ids
        are sorted with the configured ``IDSortingAlgorithm`` (``UNSORTED``
        when none is given), and the resulting ``(id, [text])`` pairs are
        passed to ``_create_text_fragments``.

        :param list lines: the lines of the unparsed text file
        """
        from bs4 import BeautifulSoup

        def build_filter_attributes():
            """ Return a dict with the bs4 filter parameters """
            attributes = {}
            for attribute_name, filter_name in [
                ("class", gc.PPN_TASK_IS_TEXT_UNPARSED_CLASS_REGEX),
                ("id", gc.PPN_TASK_IS_TEXT_UNPARSED_ID_REGEX)
            ]:
                if filter_name not in self.parameters:
                    continue
                regex_string = self.parameters[filter_name]
                if regex_string is None:
                    continue
                self.log([
                    u"Regex for %s: '%s'", attribute_name, regex_string
                ])
                # the user regex must match a whole word inside the attribute
                regex = re.compile(r".*\b" + regex_string + r"\b.*")
                attributes[attribute_name] = regex
            return attributes

        #
        # TODO better and/or parametric parsing,
        #      for example, removing tags but keeping text, etc.
        #
        self.log(u"Parsing fragments from unparsed text format")

        # transform text in a soup object
        self.log(u"Creating soup")
        soup = BeautifulSoup("\n".join(lines), "lxml")

        # extract according to class_regex and id_regex
        text_from_id = {}
        ids = []
        filter_attributes = build_filter_attributes()
        self.log(
            [u"Finding elements matching attributes '%s'", filter_attributes])
        # find_all is the bs4 name; findAll is only a deprecated alias
        for node in soup.find_all(attrs=filter_attributes):
            try:
                f_id = gf.safe_unicode(node["id"])
                f_text = gf.safe_unicode(node.text)
            except KeyError:
                # e.g. a node matched by class only, without an id attribute
                self.log_warn(u"KeyError while parsing a node")
                continue
            # NOTE(review): a repeated id overwrites the earlier text here
            # but is still appended to ids again -- confirm this is intended
            text_from_id[f_id] = f_text
            ids.append(f_id)

        # sort by ID as requested
        id_sort = gf.safe_get(dictionary=self.parameters,
                              key=gc.PPN_TASK_IS_TEXT_UNPARSED_ID_SORT,
                              default_value=IDSortingAlgorithm.UNSORTED,
                              can_return_none=False)
        self.log([u"Sorting text fragments using '%s'", id_sort])
        sorted_ids = IDSortingAlgorithm(id_sort).sort(ids)

        # append to fragments
        self.log(u"Appending fragments")
        self._create_text_fragments([(key, [text_from_id[key]])
                                     for key in sorted_ids])