Example #1
    def test_to_pagina(self):
        test_paths = (
            u"*/",
            u"Anexo:*/",
            u'//:Tr3s.Jeans',
        )

        for s in test_paths:
            self.assertEqual(to_pagina(to_filename(s)), s)
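
The test above only pins down the round-trip invariant to_pagina(to_filename(s)) == s. Below is a minimal, hypothetical pair that satisfies the same invariant using percent-encoding; it is only a sketch to illustrate the contract, not the actual to3dirs implementation.

import urllib.parse

def to_filename(page):
    # Hypothetical stand-in: percent-encode everything that is not
    # filesystem-safe, including "/" and "%", so the name survives on disk.
    return urllib.parse.quote(page, safe="")

def to_pagina(filename):
    # Inverse of the stand-in above.
    return urllib.parse.unquote(filename)

assert to_pagina(to_filename("Anexo:*/")) == "Anexo:*/"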
Example #2
    def process(self):
        """Process all pages under a root directory."""
        # let's see what was processed from before, and open the log file to keep adding
        if os.path.exists(config.LOG_PREPROCESADO):
            with codecs.open(config.LOG_PREPROCESADO, "rt", "utf8") as fh:
                processed_before_set = set(x.strip() for x in fh)
        else:
            processed_before_set = set()
        processed_before_log = codecs.open(config.LOG_PREPROCESADO, "at", "utf8")

        # get the total of directories to parse
        logger.info("Getting how many pages under root dir")
        total_pages = sum(len(filenames) for _, _, filenames in os.walk(self.origen))
        logger.info("Quantity of pages to process: %d", total_pages)

        # open the scores file to keep adding
        scores_log = codecs.open(LOG_SCORES_ACCUM, "at", "utf8")

        count_processed = count_new_ok = count_new_discarded = count_old_before = 0
        tl = utiles.TimingLogger(30, logger.debug)
        for cwd, _, filenames in os.walk(self.origen):
            parts_dir = cwd.split(os.path.sep)
            last3dirs = join(*parts_dir[-3:])

            if len(last3dirs) != 5:  # e.g. u"M/a/n"
                # we're not in a leaf, we shouldn't have any files
                if filenames:
                    logger.warning("We have content in a non-leaf directory: %s %s",
                                   last3dirs, filenames)
                continue

            for page_path in filenames:
                count_processed += 1
                tl.log("Processing %s (%d/%d)", last3dirs, count_processed, total_pages)

                if " " in page_path:
                    logger.warning("Have names with spaces! %s %s", last3dirs, page_path)

                # check if the page was processed or discarded before
                if page_path in processed_before_set:
                    count_old_before += 1
                    continue

                wikipage = WikiArchivo(cwd, last3dirs, page_path)

                this_total_score = 0
                other_pages_scores = []
                for procesador in self.preprocesadores:
                    tini = time.time()
                    try:
                        (this_score, other_scores) = procesador(wikipage)
                    except Exception:
                        logger.error("Processor %s crashed on page %r", procesador, page_path)
                        raise
                    self.prof_times[procesador] += time.time() - tini
                    self.prof_quant[procesador] += 1

                    # keep the score for other pages (check before to avoid a bogus function call)
                    if other_scores:
                        other_pages_scores.extend(other_scores)

                    if this_score is None:
                        # the processor indicated to discard this page
                        count_new_discarded += 1
                        break

                    # keep the score for this page
                    this_total_score += this_score

                else:
                    # all processors done, page not discarded
                    count_new_ok += 1
                    wikipage.guardar()

                    # save the real page score
                    scores_log.write("{}|R|{:d}\n".format(
                        to3dirs.to_pagina(page_path), this_total_score))

                    # save the extra pages score (that may exist or not in the dump)
                    for extra_page, extra_score in other_pages_scores:
                        scores_log.write("{}|E|{:d}\n".format(extra_page, extra_score))

                # with score or discarded, log it as processed
                processed_before_log.write(page_path + "\n")

        # all processing done for all the pages
        logger.info("Processed pages: %d new ok, %d discarded, %d already processed before",
                    count_new_ok, count_new_discarded, count_old_before)
        scores_log.close()
        processed_before_log.close()
        for procesador in self.preprocesadores:
            procesador.close()
            logger.debug("Preprocessor %17s usage stats: %s", procesador.nombre, procesador.stats)
Example #3
def get_orig_link(path):
    """A partir del path devuelve el link original externo."""
    orig_link = config.URL_WIKIPEDIA + u"wiki/" + \
                to3dirs.to_pagina(path)
    return orig_link
Example #4
def get_orig_link(path):
    """Gets the original external link of a path."""
    orig_link = (config.URL_WIKIPEDIA + "wiki/" +
                 urllib.parse.quote(to3dirs.to_pagina(path)))
    return orig_link
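
Compared to the previous variant, this one passes the page name through urllib.parse.quote so that characters outside the URL-safe set are percent-encoded before the link is built. A quick illustration (the page name below is made up):

import urllib.parse

# Non-ASCII characters are percent-encoded as UTF-8; "/" is kept by default.
print(urllib.parse.quote("Señal/ruido"))  # -> "Se%C3%B1al/ruido"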
Example #5
    def test_roundtrip_crazy(self):
        word = "foo . bar / baz % more"
        r = to3dirs.to_pagina(to3dirs.to_filename(word))
        self.assertEqual(r, word)
Example #6
    def test_roundtrip_simple(self):
        for word in ("moño", "foo/bar", "foo.bar"):
            r = to3dirs.to_pagina(to3dirs.to_filename(word))
            self.assertEqual(r, word)