def generar_bloques(self, lang, verbose):
    self._prep_archive_dir(lang)

    # we import it here because it's not needed in production
    from src.preproceso import preprocesar

    # get all the articles, and group them in a dict by
    # their block number, according to the hash
    fileNames = preprocesar.pages_selector.top_pages
    if verbose:
        print "Procesando", len(fileNames), "articulos"

    numBloques = len(fileNames) // self.items_per_block + 1
    self.guardarNumBloques(numBloques)
    bloques = {}
    all_filenames = set()
    for dir3, fileName, _ in fileNames:
        all_filenames.add(fileName)
        bloqNum = utiles.coherent_hash(fileName.encode('utf8')) % numBloques
        bloques.setdefault(bloqNum, []).append((dir3, fileName))
        if verbose:
            print " archs:", bloqNum, repr(dir3), repr(fileName)

    # build the redirects dict, also split by block so we
    # know where to look them up
    redirects = {}
    for linea in codecs.open(config.LOG_REDIRECTS, "r", "utf-8"):
        orig, dest = linea.strip().split(config.SEPARADOR_COLUMNAS)

        # only keep this redirect if it really points to a useful
        # article (discarding the 'fragment' if there is one)
        only_name = dest.split("#")[0]
        if only_name not in all_filenames:
            continue

        # put it in its block
        bloqNum = utiles.coherent_hash(orig.encode('utf8')) % numBloques
        redirects.setdefault(bloqNum, []).append((orig, dest))
        if verbose:
            print " redirs:", bloqNum, repr(orig), repr(dest)

    # build each of the compressed blocks
    tot_archs = 0
    tot_redirs = 0
    for bloqNum, fileNames in bloques.items():
        tot_archs += len(fileNames)
        redirs_thisblock = redirects.get(bloqNum, [])
        tot_redirs += len(redirs_thisblock)
        Comprimido.crear(redirs_thisblock, bloqNum, fileNames, verbose)

    return (len(bloques), tot_archs, tot_redirs)
def generar_bloques(self, lang, verbose):
    self._prep_archive_dir(lang)

    # we import it here because it's not needed in production
    from src.preproceso import preprocesar

    # get all the articles, and group them in a dict by
    # their block number, according to the hash
    top_pages = preprocesar.pages_selector.top_pages
    if verbose:
        print "Procesando", len(top_pages), "articulos"

    numBloques = len(top_pages) // self.items_per_block + 1
    self.guardarNumBloques(numBloques)
    bloques = {}
    all_filenames = set()
    for dir3, filename, _, _ in top_pages:
        all_filenames.add(filename)
        bloqNum = utiles.coherent_hash(filename.encode('utf8')) % numBloques
        bloques.setdefault(bloqNum, []).append((dir3, filename))
        if verbose:
            print " archs:", bloqNum, repr(dir3), repr(filename)

    # build the redirects dict, also split by block so we
    # know where to look them up
    redirects = {}
    for linea in codecs.open(config.LOG_REDIRECTS, "r", "utf-8"):
        orig, dest = linea.strip().split(config.SEPARADOR_COLUMNAS)

        # only keep this redirect if it really points to a useful
        # article (discarding the 'fragment' if there is one)
        only_name = dest.split("#")[0]
        if only_name not in all_filenames:
            continue

        # put it in its block
        bloqNum = utiles.coherent_hash(orig.encode('utf8')) % numBloques
        redirects.setdefault(bloqNum, []).append((orig, dest))
        if verbose:
            print " redirs:", bloqNum, repr(orig), repr(dest)

    # build each of the compressed blocks
    tot_archs = 0
    tot_redirs = 0
    for bloqNum, fileNames in bloques.items():
        tot_archs += len(fileNames)
        redirs_thisblock = redirects.get(bloqNum, [])
        tot_redirs += len(redirs_thisblock)
        Comprimido.crear(redirs_thisblock, bloqNum, fileNames, verbose)

    return (len(bloques), tot_archs, tot_redirs)
def generar_bloques(cls, lang, verbose):
    cls._prep_archive_dir(lang)

    # import this here as it's not needed in production
    from src.preprocessing import preprocess

    # get all the articles, and store them in a dict using their block number,
    # calculated with a hash of the name
    top_pages = preprocess.pages_selector.top_pages
    logger.debug("Processing %d articles", len(top_pages))

    numBloques = len(top_pages) // cls.items_per_block + 1
    cls.guardarNumBloques(numBloques)
    bloques = {}
    all_filenames = set()
    for dir3, filename, _ in top_pages:
        # unquote special filesystem chars
        filename_orig = urllib.parse.unquote(filename)
        all_filenames.add(filename_orig)
        bloqNum = utiles.coherent_hash(filename.encode('utf8')) % numBloques
        bloques.setdefault(bloqNum, []).append((dir3, filename))
        logger.debug(" files: %s %r %r", bloqNum, dir3, filename)

    # build the redirect dict, also separated by blocks to know where to find them
    redirects = {}
    for line in open(config.LOG_REDIRECTS, "rt", encoding="utf-8"):
        orig, dest = line.strip().split(config.SEPARADOR_COLUMNAS)

        # only keep this redirect if it really points to a useful article
        # (discarding any possible 'fragment')
        only_name = dest.split("#")[0]
        if only_name not in all_filenames:
            continue

        # put it in a block
        bloqNum = utiles.coherent_hash(orig.encode('utf8')) % numBloques

        # the target must be a disk filename
        dest_filename = to3dirs.to_filename(dest)
        redirects.setdefault(bloqNum, []).append((orig, dest_filename))
        logger.debug(" redirs: %s %r %r", bloqNum, orig, dest_filename)

    # build each of the compressed blocks
    tot_archs = 0
    tot_redirs = 0
    for bloqNum, fileNames in bloques.items():
        tot_archs += len(fileNames)
        redirs_thisblock = redirects.get(bloqNum, [])
        tot_redirs += len(redirs_thisblock)
        Comprimido.crear(redirs_thisblock, bloqNum, fileNames, verbose)

    return (len(bloques), tot_archs, tot_redirs)
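# The generar_bloques variants above all rely on the same trick: an item's block number
# is a pure function of its name (coherent_hash(name) % numBloques), so a reader can
# later recompute it without any extra lookup table. A minimal, self-contained sketch of
# that partitioning, with hashlib.md5 standing in for utiles.coherent_hash (an
# assumption: the real helper only needs to be stable across runs, unlike the built-in
# hash(); the names below are illustrative, not part of the real code).
import hashlib


def stable_hash(name):
    return int(hashlib.md5(name.encode("utf8")).hexdigest(), 16)


def partition(names, items_per_block):
    """Group names into blocks; block number = stable_hash(name) % num_blocks."""
    num_blocks = len(names) // items_per_block + 1
    blocks = {}
    for name in names:
        blocks.setdefault(stable_hash(name) % num_blocks, []).append(name)
    return num_blocks, blocks


num_blocks, blocks = partition(["Argentina", "Python", "Wikipedia"], items_per_block=2)
assert sum(len(v) for v in blocks.values()) == 3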
def generar_bloques(self, verbose):
    self._prep_archive_dir()

    # get all the images, and group them in a dict by
    # their block number, according to the hash
    fileNames = []
    for dirname, subdirs, files in os.walk(config.DIR_IMGSLISTAS):
        for f in files:
            name = os.path.join(dirname, f)[len(config.DIR_IMGSLISTAS) + 1:]
            fileNames.append(name)

    if verbose:
        print "Procesando", len(fileNames), "imágenes"

    numBloques = len(fileNames) // self.items_per_block + 1
    self.guardarNumBloques(numBloques)
    bloques = {}
    for fileName in fileNames:
        bloqNum = utiles.coherent_hash(fileName.encode('utf8')) % numBloques
        bloques.setdefault(bloqNum, []).append(fileName)
        if verbose:
            print " archs:", bloqNum, repr(fileName)

    tot = 0
    for bloqNum, fileNames in bloques.items():
        tot += len(fileNames)
        BloqueImagenes.crear(bloqNum, fileNames, verbose)

    return (len(bloques), tot)
def generar_bloques(cls, verbose):
    cls._prep_archive_dir()

    # get all the images, and store them in a dict using their block number,
    # calculated with a hash of the name
    fileNames = []
    for dirname, subdirs, files in os.walk(config.DIR_IMGSLISTAS):
        for f in files:
            name = os.path.join(dirname, f)[len(config.DIR_IMGSLISTAS) + 1:]
            fileNames.append(name)
    logger.debug("Processing %d images", len(fileNames))

    numBloques = len(fileNames) // cls.items_per_block + 1
    cls.guardarNumBloques(numBloques)
    bloques = {}
    for fileName in fileNames:
        bloqNum = utiles.coherent_hash(fileName.encode('utf8')) % numBloques
        bloques.setdefault(bloqNum, []).append(fileName)
        logger.debug(" files: %s %r", bloqNum, fileName)

    tot = 0
    for bloqNum, fileNames in bloques.items():
        tot += len(fileNames)
        BloqueImagenes.crear(bloqNum, fileNames, verbose)

    return (len(bloques), tot)
def create(cls, directory, source):
    '''Creates the index in the directory.

    The "source" generates pairs (key, value) to store in the index. The
    key must be a string, the value can be any hashable Python object.

    It must return the quantity of pairs indexed.
    '''
    ids_shelf = {}
    key_shelf = {}
    ids_cnter = 0
    tmp_reverse_id = {}
    indexed_counter = 0

    # fill them
    for key, value in source:
        indexed_counter += 1

        # process the key
        if not isinstance(key, basestring):
            raise TypeError("The key must be string or unicode")

        # docid -> final info
        if value in tmp_reverse_id:
            docid = tmp_reverse_id[value]
        else:
            docid = ids_cnter
            tmp_reverse_id[value] = docid
            ids_cnter += 1
        ids_shelf[docid] = value

        # keys -> docid
        key_shelf.setdefault(key, set()).add(docid)

    # save the keys
    keyfilename = os.path.join(directory, "easyindex.key.bz2")
    fh = CompressedFile(keyfilename, "wb")
    cPickle.dump(key_shelf, fh, 2)
    fh.close()

    # split ids_shelf into N dicts of ~5k entries each
    N = int(round(len(ids_shelf) / 5000.0))
    if not N:
        N = 1
    all_idshelves = [{} for i in range(N)]
    for k, v in ids_shelf.iteritems():
        cual = utiles.coherent_hash(k) % N
        all_idshelves[cual][k] = v

    # save each dict where it corresponds
    for cual, shelf in enumerate(all_idshelves):
        fname = "easyindex-%03d.ids.bz2" % cual
        idsfilename = os.path.join(directory, fname)
        fh = CompressedFile(idsfilename, "wb")
        cPickle.dump(shelf, fh, 2)
        fh.close()

    return indexed_counter
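# create() above shards the docid -> value mapping into compressed pickles of roughly
# 5000 entries each, so a later lookup only needs to open one small file. A hedged,
# self-contained sketch of that round trip follows (bz2 + pickle stand in for the
# CompressedFile/cPickle pair used above; all names here are illustrative, not the
# library's API).
import bz2
import os
import pickle
import tempfile


def save_id_shards(ids_shelf, directory, per_shard=5000):
    """Split a docid -> value dict into ~per_shard-sized bz2-compressed pickles."""
    n = int(round(len(ids_shelf) / float(per_shard))) or 1
    shards = [{} for _ in range(n)]
    for docid, value in ids_shelf.items():
        # the real create() runs the docid through coherent_hash first;
        # plain modulo keeps the sketch short
        shards[docid % n][docid] = value
    for i, shard in enumerate(shards):
        with bz2.open(os.path.join(directory, "ids-%03d.bz2" % i), "wb") as fh:
            pickle.dump(shard, fh, 2)
    return n


def load_value(docid, directory, n):
    """Open only the shard that can contain this docid, then look it up."""
    with bz2.open(os.path.join(directory, "ids-%03d.bz2" % (docid % n)), "rb") as fh:
        return pickle.load(fh)[docid]


tmp = tempfile.mkdtemp()
n = save_id_shards({i: "value-%d" % i for i in range(12000)}, tmp)  # round(12000/5000) -> 2 shards
assert n == 2 and load_value(7, tmp, n) == "value-7"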
def get_item(self, fileName):
    """Get the item from inside a block."""
    bloqNum = utiles.coherent_hash(fileName.encode('utf8')) % self.num_bloques
    bloqName = "%08x%s" % (bloqNum, self.archive_extension)
    logger.debug("block: %s", bloqName)
    comp = self.getBloque(bloqName)
    item = comp.get_item(fileName)
    logger.debug("len item: %s", None if item is None else len(item))
    return item
def get_item(self, fileName):
    bloqNum = utiles.coherent_hash(fileName.encode('utf8')) % self.num_bloques
    bloqName = "%08x%s" % (bloqNum, self.archive_extension)
    if self.verbose:
        print "block:", bloqName
    comp = self.getBloque(bloqName)
    item = comp.get_item(fileName)
    if self.verbose and item is not None:
        print "len item:", len(item)
    return item
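# The read side in get_item() just redoes the writer's arithmetic: hash the name, take
# it modulo the number of blocks, and format the result as eight hex digits plus the
# archive extension to get the block's file name. For example (the extension here is
# made up for the illustration):
bloqNum = 11  # e.g. coherent_hash(fileName) % num_bloques
assert "%08x%s" % (bloqNum, ".cdp") == "0000000b.cdp"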
def _get_info_id(self, allids):
    '''Returns the values for the given ids.

    As it groups the ids according to the file, it is much faster
    than retrieving them one by one.
    '''
    # group the ids per file
    cuales = {}
    for i in allids:
        cual = utiles.coherent_hash(i) % self.idfiles_count
        cuales.setdefault(cual, []).append(i)

    # get the info from each file
    for cual, ids in cuales.items():
        idx = self._get_ids_shelve(cual)
        for i in ids:
            yield idx[i]
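# _get_info_id() above batches its lookups: the requested ids are grouped by shard
# number first, so each ids file is loaded at most once no matter how many ids fall
# into it, and results come back grouped by file rather than in input order. A toy
# illustration (load_shard is a hypothetical stand-in for self._get_ids_shelve; plain
# modulo replaces the coherent hash):
def fetch_all(allids, shard_count, load_shard):
    grouped = {}
    for i in allids:
        grouped.setdefault(i % shard_count, []).append(i)
    for shard_num, ids in grouped.items():
        shard = load_shard(shard_num)  # one load per shard, not one per id
        for i in ids:
            yield shard[i]


shards = {0: {0: "a", 3: "b"}, 1: {1: "c", 4: "d"}, 2: {2: "e"}}
assert list(fetch_all([3, 4, 0], 3, shards.__getitem__)) == ["b", "a", "d"]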