def test_to_pagina(self):
    """Round-trip check: to_filename followed by to_pagina restores the path."""
    cases = (
        u"*/",
        u"Anexo:*/",
        u'//:Tr3s.Jeans',
    )
    for case in cases:
        self.assertEqual(to_pagina(to_filename(case)), case)
def process(self):
    """Process all pages under a root directory.

    Walks every leaf directory under ``self.origen``, runs each page through
    all preprocessors, accumulates scores, and appends results to the scores
    log. Pages listed in the "processed before" log are skipped, and every
    page handled in this run is appended to that same log, so the whole run
    is resumable.
    """
    # Load the set of pages already processed (or discarded) in previous
    # runs, so they can be skipped below.
    if os.path.exists(config.LOG_PREPROCESADO):
        with codecs.open(config.LOG_PREPROCESADO, "rt", "utf8") as fh:
            processed_before_set = set(x.strip() for x in fh)
    else:
        processed_before_set = set()

    # get the total of pages to parse (used only for progress reporting)
    logger.info("Getting how many pages under root dir")
    total_pages = sum(len(filenames) for _, _, filenames in os.walk(self.origen))
    logger.info("Quantity of pages to process: %d", total_pages)

    # Open both append-only logs in a context manager so they are closed
    # even if a processor raises (previously they leaked on the error path,
    # as the exception skipped the explicit .close() calls at the end).
    with codecs.open(config.LOG_PREPROCESADO, "at", "utf8") as processed_before_log, \
            codecs.open(LOG_SCORES_ACCUM, "at", "utf8") as scores_log:
        count_processed = count_new_ok = count_new_discarded = count_old_before = 0
        tl = utiles.TimingLogger(30, logger.debug)
        for cwd, _, filenames in os.walk(self.origen):
            parts_dir = cwd.split(os.path.sep)
            last3dirs = join(*parts_dir[-3:])

            if len(last3dirs) != 5:  # e.g.: u"M/a/n"
                # we're not in a leaf, we shouldn't have any files
                if filenames:
                    logger.warning("We have content in a non-leaf directory: %s %s",
                                   last3dirs, filenames)
                continue

            for page_path in filenames:
                count_processed += 1
                tl.log("Processing %s (%d/%d)", last3dirs, count_processed, total_pages)
                if " " in page_path:
                    logger.warning("Have names with spaces! %s %s", last3dirs, page_path)

                # check if the page was processed or discarded before
                if page_path in processed_before_set:
                    count_old_before += 1
                    continue

                wikipage = WikiArchivo(cwd, last3dirs, page_path)
                this_total_score = 0
                other_pages_scores = []
                for procesador in self.preprocesadores:
                    tini = time.time()
                    try:
                        (this_score, other_scores) = procesador(wikipage)
                    except Exception:
                        # Log which processor/page failed, then let the
                        # exception propagate (don't continue on bad data).
                        logger.error("Processor %s crashed on page %r",
                                     procesador, page_path)
                        raise
                    # per-processor profiling: accumulated time and call count
                    self.prof_times[procesador] += time.time() - tini
                    self.prof_quant[procesador] += 1

                    # keep the score for other pages (check before to avoid
                    # a bogus function call)
                    if other_scores:
                        other_pages_scores.extend(other_scores)

                    if this_score is None:
                        # the processor indicated to discard this page
                        count_new_discarded += 1
                        break

                    # keep the score for this page
                    this_total_score += this_score
                else:
                    # all processors done, page not discarded
                    count_new_ok += 1
                    wikipage.guardar()

                    # save the real page score ("R" record)
                    scores_log.write("{}|R|{:d}\n".format(
                        to3dirs.to_pagina(page_path), this_total_score))

                    # save the extra pages score ("E" records; those pages
                    # may exist or not in the dump)
                    for extra_page, extra_score in other_pages_scores:
                        scores_log.write("{}|E|{:d}\n".format(extra_page, extra_score))

                # with score or discarded, log it as processed
                processed_before_log.write(page_path + "\n")

        # all processing done for all the pages
        logger.info("Processed pages: %d new ok, %d discarded, %d already processed before",
                    count_new_ok, count_new_discarded, count_old_before)

    # give each preprocessor the chance to release its resources, and report
    # its accumulated usage statistics
    for procesador in self.preprocesadores:
        procesador.close()
        logger.debug("Preprocessor %17s usage stats: %s",
                     procesador.nombre, procesador.stats)
def get_orig_link(path):
    """Return the original external link for the given path."""
    page = to3dirs.to_pagina(path)
    return config.URL_WIKIPEDIA + u"wiki/" + page
def get_orig_link(path):
    """Gets the original external link of a path."""
    quoted_page = urllib.parse.quote(to3dirs.to_pagina(path))
    return config.URL_WIKIPEDIA + "wiki/" + quoted_page
def test_roundtrip_crazy(self):
    """A word mixing dots, slashes, percents and spaces survives the round trip."""
    original = "foo . bar / baz % more"
    restored = to3dirs.to_pagina(to3dirs.to_filename(original))
    self.assertEqual(restored, original)
def test_roundtrip_simple(self):
    """Simple words (accents, slashes, dots) survive the filename round trip."""
    samples = ("moño", "foo/bar", "foo.bar")
    for original in samples:
        restored = to3dirs.to_pagina(to3dirs.to_filename(original))
        self.assertEqual(restored, original)