def run(): """Extract the images from htmls, and also do extra work on those pages.""" preprocessed = preprocess.pages_selector.top_pages pi = ImageParser() total = len(preprocessed) logger.info("Image parser inited") logger.info("Normal pages: %d pages to process", total) done = 0 tl = utiles.TimingLogger(30, logger.debug) for dir3, fname, _ in preprocessed: try: pi.parse(dir3, fname) except Exception: logger.exception("Parsing crashed in dir3=%r fname=%r", dir3, fname) raise done += 1 tl.log("Parsing found %d images so far (%d of %d pages)", pi.quant, done, total) pi.dump() return pi.imgs_ok, pi.quant
def run(): """Extract the images from htmls, and also do extra work on those pages.""" preprocesados = preprocesar.pages_selector.top_pages pi = ImageParser() total = len(preprocesados) logger.info("Image parser inited") logger.info("Extract images from special resources.") pi.process_dynamics( 'portals', os.path.join(config.DIR_ASSETS, 'dynamic', 'portals.html')) logger.info("Normal pages: %d pages to process", total) done = 0 tl = utiles.TimingLogger(30, logger.debug) for dir3, fname, _ in preprocesados: try: pi.parse(dir3, fname) except: logger.exception("Parsing crashed in dir3=%r fname=%r", dir3, fname) raise done += 1 tl.log("Parsing found %d images so far (%d of %d pages)", pi.cant, done, total) pi.dump() return pi.imgs_ok, pi.cant
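# 'utiles.TimingLogger' is used by both variants above but not shown in this
# section; a minimal sketch of a rate-limited logger, assuming the constructor
# takes the minimum interval in seconds and the logging callable to use (both
# names and behavior are assumptions, not the project's actual implementation).
import time


class TimingLogger:
    """Rate-limited logger sketch: emit at most one message per interval."""

    def __init__(self, interval, log_func):
        self.interval = interval    # minimum seconds between emitted messages
        self.log_func = log_func    # e.g. logger.debug
        self.last = 0               # timestamp of the last emitted message

    def log(self, template, *args):
        """Forward to log_func only if enough time passed since the last emission."""
        now = time.time()
        if now - self.last >= self.interval:
            self.log_func(template, *args)
            self.last = now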
def retrieve():
    """Download the images from the net."""
    lista_descargar = []

    # check which images had download problems before
    log_errores = os.path.join(config.DIR_TEMP, "imagenes_neterror.txt")
    if os.path.exists(log_errores):
        with codecs.open(log_errores, "r", "utf8") as fh:
            imgs_problemas = set(x.strip() for x in fh)
    else:
        imgs_problemas = set()

    with codecs.open(config.LOG_REDUCCION, "r", "utf8") as fh:
        for linea in fh:
            linea = linea.strip()
            if not linea:
                continue
            _, arch, url = linea.split(config.SEPARADOR_COLUMNAS)
            fullpath = os.path.join(config.DIR_TEMP, "images", arch)
            if url not in imgs_problemas and not os.path.exists(fullpath):
                lista_descargar.append((url, fullpath))

    tot = len(lista_descargar)
    p = repartidor.Pool(descargar, 5)
    tl = utiles.TimingLogger(30, logger.debug)
    errores = collections.Counter()
    c_ok = 0
    c_err = 0
    for i, result in enumerate(p.procesa(lista_descargar), 1):
        (url, fullpath), stt = result
        if stt is None:
            c_ok += 1
        else:
            errores[stt] += 1
            c_err += 1
            with codecs.open(log_errores, "a", "utf8") as fh:
                fh.write(url + "\n")
        tl.log("Downloaded image %d/%d (ok=%d, err=%d)", i, tot, c_ok, c_err)

    for code, cant in errores.most_common():
        logger.warning("Had errors: code=%r quant=%d", code, cant)
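# The 'descargar' worker and 'repartidor.Pool' are referenced above but not
# shown; a minimal Python 3 sketch of what the worker could look like, under
# the assumption that the pool calls it with each (url, fullpath) item and
# yields (item, return_value) pairs. Returning None signals success; any other
# value is an error marker counted in 'errores' (markers here are hypothetical).
import socket
import urllib.error
import urllib.request


def descargar(item):
    """Download one image to disk; return None on success or an error marker."""
    url, fullpath = item
    try:
        urllib.request.urlretrieve(url, fullpath)
    except urllib.error.HTTPError as err:
        return err.code        # e.g. 404, ends up as a key in 'errores'
    except (urllib.error.URLError, socket.timeout):
        return 'ENETWORK'      # generic network failure marker (assumption)
    return None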
def procesar(self):
    resultados = self.resultados
    puntaje_extra = {}
    de_antes = 0

    # get the total of directories to parse
    total_dirs = sum(1 for _ in os.walk(self.origen))
    logger.info("Quantity of directories to process: %d", total_dirs)

    count = 0
    tl = utiles.TimingLogger(30, logger.debug)
    for cwd, directorios, archivos in os.walk(self.origen):
        partes_dir = cwd.split(os.path.sep)
        ult3dirs = join(*partes_dir[-3:])
        count += 1

        if len(ult3dirs) != 5:  # e.g. u"M/a/n"
            # we're not in a leaf, we shouldn't have any files
            if archivos:
                logger.warning(
                    "We have content in a non-leaf directory: %s", archivos)
            continue

        tl.log("Processing %s (%d/%d)", ult3dirs, count, total_dirs)
        for pag in archivos:
            if " " in pag:
                logger.warning("Have names with spaces! %s %s", ult3dirs, pag)

            # check if we already had it from a previous run
            if pag in resultados:
                de_antes += 1
                continue
            if pag in self.descartados_antes:
                continue

            wikiarchivo = WikiArchivo(cwd, ult3dirs, pag)
            resultados[pag] = {}
            resultados[pag]["dir3"] = ult3dirs

            for procesador in self.preprocesadores:
                (puntaje, otras_pags) = procesador(wikiarchivo)

                # add the extra scores to the other pages
                for extra_pag, extra_ptje in otras_pags:
                    if extra_pag in resultados:
                        prev = resultados[extra_pag].get(procesador, 0)
                        resultados[extra_pag][procesador] = prev + extra_ptje
                    else:
                        ant = puntaje_extra.setdefault(extra_pag, {})
                        ant[procesador] = ant.get(procesador, 0) + extra_ptje

                # None means the processor marked the page to be discarded
                if puntaje is None:
                    del resultados[pag]
                    self.descartados_file.write("%s\n" % pag)
                    break

                # store the score
                if puntaje != 0:
                    resultados[pag][procesador] = puntaje
            else:
                # save it only if it was not discarded
                wikiarchivo.guardar()

    for procesador in self.preprocesadores:
        procesador.close()
        logger.debug("Preprocessor %17s usage stats: %s",
                     procesador.nombre, procesador.stats)

    # load the redirects to take them into account
    redirects = {}
    sepcol = config.SEPARADOR_COLUMNAS
    with codecs.open(config.LOG_REDIRECTS, "r", "utf-8") as fh:
        for linea in fh:
            r_from, r_to = linea.strip().split(sepcol)
            redirects[r_from] = r_to

    # distribute the extra scores, but only to pages we already have
    logger.debug("Distributing extra scores: %d", len(puntaje_extra))
    perdidos = []
    for (pag, puntajes) in puntaje_extra.items():
        # dereference the redirect, emptying the dict to avoid loops
        while pag in redirects:
            pag = redirects.pop(pag)

        # assign the scores for the pages that are present
        if pag in resultados:
            for (proc, ptje) in puntajes.items():
                resultados[pag][proc] = resultados[pag].get(proc, 0) + ptje
        else:
            perdidos.append((pag, puntajes))

    if perdidos:
        logger.warning("Lost %d scores!", len(perdidos))
        fname = join(config.DIR_TEMP, 'perdidos.txt')
        with codecs.open(fname, 'w', 'utf8') as fh:
            for pag, puntajes in perdidos:
                fh.write(u"%s\n" % pag)

    return len(resultados) - de_antes, de_antes
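# Why the redirect dereference above uses pop() instead of a plain lookup: a
# redirect cycle such as A -> B -> A would make the while loop spin forever,
# but popping each visited entry shrinks the dict on every step, so the loop
# always terminates. A tiny self-contained illustration with hypothetical pages:
redirects_demo = {'A': 'B', 'B': 'A'}    # a redirect cycle
pag_demo = 'A'
while pag_demo in redirects_demo:
    pag_demo = redirects_demo.pop(pag_demo)
# the loop ran exactly twice and stopped; pag_demo is back to 'A'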
def process(self):
    """Process all pages under a root directory."""
    # let's see what was processed before, and open the log file to keep appending
    if os.path.exists(config.LOG_PREPROCESADO):
        with codecs.open(config.LOG_PREPROCESADO, "rt", "utf8") as fh:
            processed_before_set = set(x.strip() for x in fh)
    else:
        processed_before_set = set()
    processed_before_log = codecs.open(config.LOG_PREPROCESADO, "at", "utf8")

    # get the total of pages to parse
    logger.info("Getting how many pages under root dir")
    total_pages = sum(len(filenames) for _, _, filenames in os.walk(self.origen))
    logger.info("Quantity of pages to process: %d", total_pages)

    # open the scores file to keep appending
    scores_log = codecs.open(LOG_SCORES_ACCUM, "at", "utf8")

    count_processed = count_new_ok = count_new_discarded = count_old_before = 0
    tl = utiles.TimingLogger(30, logger.debug)
    for cwd, _, filenames in os.walk(self.origen):
        parts_dir = cwd.split(os.path.sep)
        last3dirs = join(*parts_dir[-3:])

        if len(last3dirs) != 5:  # e.g. u"M/a/n"
            # we're not in a leaf, we shouldn't have any files
            if filenames:
                logger.warning("We have content in a non-leaf directory: %s %s",
                               last3dirs, filenames)
            continue

        for page_path in filenames:
            count_processed += 1
            tl.log("Processing %s (%d/%d)", last3dirs, count_processed, total_pages)
            if " " in page_path:
                logger.warning("Have names with spaces! %s %s", last3dirs, page_path)

            # check if the page was processed or discarded before
            if page_path in processed_before_set:
                count_old_before += 1
                continue

            wikipage = WikiArchivo(cwd, last3dirs, page_path)

            this_total_score = 0
            other_pages_scores = []
            for procesador in self.preprocesadores:
                tini = time.time()
                try:
                    (this_score, other_scores) = procesador(wikipage)
                except Exception:
                    logger.error("Processor %s crashed on page %r",
                                 procesador, page_path)
                    raise
                self.prof_times[procesador] += time.time() - tini
                self.prof_quant[procesador] += 1

                # keep the scores for other pages (check first to avoid a
                # pointless extend call)
                if other_scores:
                    other_pages_scores.extend(other_scores)

                if this_score is None:
                    # the processor indicated to discard this page
                    count_new_discarded += 1
                    break

                # keep the score for this page
                this_total_score += this_score
            else:
                # all processors done, page not discarded
                count_new_ok += 1
                wikipage.guardar()

                # save the real page score
                scores_log.write("{}|R|{:d}\n".format(
                    to3dirs.to_pagina(page_path), this_total_score))

                # save the extra pages scores (those pages may or may not
                # exist in the dump)
                for extra_page, extra_score in other_pages_scores:
                    scores_log.write("{}|E|{:d}\n".format(extra_page, extra_score))

            # with score or discarded, log it as processed
            processed_before_log.write(page_path + "\n")

    # all processing done for all the pages
    logger.info("Processed pages: %d new ok, %d discarded, %d already processed before",
                count_new_ok, count_new_discarded, count_old_before)
    scores_log.close()
    processed_before_log.close()

    for procesador in self.preprocesadores:
        procesador.close()
        logger.debug("Preprocessor %17s usage stats: %s",
                     procesador.nombre, procesador.stats)
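# The preprocessor interface is implicit in both processing loops above: a
# callable that takes a page and returns (score_or_None, extra_scores), plus
# the 'nombre', 'stats' and 'close()' attributes the bookkeeping code uses.
# A hypothetical minimal processor following that contract (the 'html'
# attribute on the page object is an assumption for illustration only):
import collections


class LengthProcessor:
    """Sketch: score a page by its content length; discard empty pages."""

    def __init__(self):
        self.nombre = "Length"                 # shown in the usage stats log line
        self.stats = collections.Counter()

    def __call__(self, wikipage):
        html = wikipage.html                   # assumed attribute, see note above
        if not html:
            self.stats['discarded'] += 1
            return None, []                    # None tells the loop to discard the page
        self.stats['scored'] += 1
        return len(html), []                   # no extra scores for other pages here

    def close(self):
        """Nothing to flush in this sketch."""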