def copy_rec_files(self, user, collection, recording, warc_files):
    if self.dry_run:
        target_dirname = os.path.join('/tmp/migrate4.0', collection.my_id)
    else:
        target_dirname = user.get_user_temp_warc_path()

    os.makedirs(target_dirname, exist_ok=True)
    print('Writing to dir: ' + target_dirname)

    coll_warc_key = recording.COLL_WARC_KEY.format(coll=collection.my_id)
    rec_warc_key = recording.REC_WARC_KEY.format(rec=recording.my_id)

    # Copy WARCs
    loader = BlockLoader()
    total_size = 0

    for n, url in warc_files.items():
        if not url.startswith('s3://'):
            print('FILE ERR: Skipping local file: ' + url)
            continue

        local_filename = n if n != recording.INDEX_FILE_KEY else os.path.basename(url)
        target_file = os.path.join(target_dirname, local_filename)

        src = loader.load(url)

        try:
            with open(target_file, 'wb') as dest:
                print('Copying {0} -> {1}'.format(url, target_file))
                shutil.copyfileobj(src, dest)
                size = dest.tell()

            target_file = add_local_store_prefix(target_file)

            if n != recording.INDEX_FILE_KEY:
                self.redis.hset(coll_warc_key, n, target_file)
                self.redis.sadd(rec_warc_key, n)
                total_size += size
            else:
                recording.set_prop(n, target_file, update_ts=False)

            if self.dry_run:
                os.remove(strip_prefix(target_file))

        except:
            import traceback
            traceback.print_exc()

    # commit from temp dir to storage
    if not self.dry_run:
        recording.commit_to_storage()

    return total_size
class StaticHandler(BaseHandler):
    def __init__(self, static_path):
        mimetypes.init()

        self.static_path = static_path
        self.block_loader = BlockLoader()

    def __call__(self, wbrequest):
        url = wbrequest.wb_url_str.split('?')[0]
        full_path = self.static_path + url

        try:
            data = self.block_loader.load(full_path)

            try:
                data.seek(0, 2)
                size = data.tell()
                data.seek(0)
                headers = [('Content-Length', str(size))]
            except IOError:
                headers = None

            if 'wsgi.file_wrapper' in wbrequest.env:
                reader = wbrequest.env['wsgi.file_wrapper'](data)
            else:
                reader = iter(lambda: data.read(), '')

            content_type = 'application/octet-stream'

            guessed = mimetypes.guess_type(full_path)
            if guessed[0]:
                content_type = guessed[0]

            return WbResponse.text_stream(data,
                                          content_type=content_type,
                                          headers=headers)

        except IOError:
            raise NotFoundException('Static File Not Found: ' +
                                    wbrequest.wb_url_str)

    def __str__(self):  # pragma: no cover
        return 'Static files from ' + self.static_path
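# For reference, the content-type fallback in StaticHandler above relies on the
# stdlib mimetypes module; a minimal standalone sketch (the paths are hypothetical):
import mimetypes

mimetypes.init()
print(mimetypes.guess_type('/static/default/wb.css'))   # ('text/css', None)
print(mimetypes.guess_type('/static/unknown.blob'))     # (None, None) -> handler falls back to 'application/octet-stream'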
def wasapi_download(self, username, coll_name, filename):
    user = self._get_wasapi_user(username)

    if not user:
        self._raise_error(404, 'no_such_user')

    collection = user.get_collection_by_name(coll_name)

    if not collection:
        self._raise_error(404, 'no_such_collection')

    #self.access.assert_is_curr_user(user)
    # only users with write access can use wasapi
    self.access.assert_can_write_coll(collection)

    warc_key = collection.get_warc_key()
    warc_path = self.redis.hget(warc_key, filename)

    if not warc_path:
        self._raise_error(404, 'file_not_found')

    response.headers['Content-Type'] = 'application/octet-stream'
    response.headers['Content-Disposition'] = "attachment; filename*=UTF-8''" + filename
    response.headers['Transfer-Encoding'] = 'chunked'

    loader = BlockLoader()
    fh = None
    try:
        fh = loader.load(warc_path)
    except Exception:
        self._raise_error(400, 'file_load_error')

    def read_all(fh):
        for chunk in StreamIter(fh):
            yield chunk

    return read_all(fh)
class ZipNumIndexSource(BaseIndexSource):
    DEFAULT_RELOAD_INTERVAL = 10  # in minutes
    DEFAULT_MAX_BLOCKS = 10

    IDX_EXT = ('.idx', '.summary')

    def __init__(self, summary, config=None):
        self.max_blocks = self.DEFAULT_MAX_BLOCKS
        self.loc_resolver = None
        self.config = config or {}

        loc = None
        cookie_maker = None
        reload_ival = self.DEFAULT_RELOAD_INTERVAL

        if config:
            loc = config.get('shard_index_loc')
            cookie_maker = config.get('cookie_maker')
            self.max_blocks = config.get('max_blocks', self.max_blocks)
            reload_ival = config.get('reload_interval', reload_ival)

        if isinstance(loc, dict):
            self.loc_resolver = LocPrefixResolver(summary, loc)
        else:
            self.loc_resolver = LocMapResolver(summary, loc)

        self.summary = summary

        # reload interval
        self.loc_update_time = datetime.datetime.now()
        self.reload_interval = datetime.timedelta(minutes=reload_ival)

        self.blk_loader = BlockLoader(cookie_maker=cookie_maker)

    def load_index(self, params):
        self.loc_resolver.load_loc()
        return self._do_load_cdx(self.summary, CDXQuery(params))

    def _do_load_cdx(self, filename, query):
        reader = open(filename, 'rb')

        idx_iter = self.compute_page_range(reader, query)

        if query.secondary_index_only:
            def gen_idx():
                for idx in idx_iter:
                    yield IDXObject(idx)

            return gen_idx()

        if query.page_count:
            return idx_iter

        blocks = self.idx_to_cdx(idx_iter, query)

        def gen_cdx():
            for blk in blocks:
                for cdx in blk:
                    yield CDXObject(cdx)

        return gen_cdx()

    def _page_info(self, pages, pagesize, blocks):
        info = AlwaysJsonResponse(pages=pages,
                                  pageSize=pagesize,
                                  blocks=blocks)

        return info

    def compute_page_range(self, reader, query):
        pagesize = query.page_size
        if not pagesize:
            pagesize = self.max_blocks
        else:
            pagesize = int(pagesize)

        last_line = None

        # Get End
        end_iter = search(reader, query.end_key, prev_size=1)

        try:
            end_line = six.next(end_iter)
        except StopIteration:
            last_line = read_last_line(reader)
            end_line = last_line

        # Get Start
        first_iter = iter_range(reader,
                                query.key,
                                query.end_key,
                                prev_size=1)

        try:
            first_line = six.next(first_iter)
        except StopIteration:
            if end_line == last_line and query.key >= last_line:
                first_line = last_line
            else:
                reader.close()
                if query.page_count:
                    yield self._page_info(0, pagesize, 0)
                return

        first = IDXObject(first_line)
        end = IDXObject(end_line)

        try:
            blocks = end['lineno'] - first['lineno']
            total_pages = int(blocks / pagesize) + 1
        except:
            blocks = -1
            total_pages = 1

        if query.page_count:
            # same line, so actually need to look at cdx
            # to determine if it exists
            if blocks == 0:
                try:
                    block_cdx_iter = self.idx_to_cdx([first_line], query)
                    block = six.next(block_cdx_iter)
                    cdx = six.next(block)
                except StopIteration:
                    total_pages = 0
                    blocks = -1

            yield self._page_info(total_pages, pagesize, blocks + 1)
            reader.close()
            return

        curr_page = query.page
        if curr_page >= total_pages or curr_page < 0:
            msg = 'Page {0} invalid: First Page is 0, Last Page is {1}'
            reader.close()
            raise CDXException(msg.format(curr_page, total_pages - 1))

        startline = curr_page * pagesize
        endline = startline + pagesize - 1
        if blocks >= 0:
            endline = min(endline, blocks)

        if curr_page == 0:
            yield first_line
        else:
            startline -= 1

        idxiter = itertools.islice(first_iter, startline, endline)
        for idx in idxiter:
            yield idx

        reader.close()

    def search_by_line_num(self, reader, line):  # pragma: no cover
        def line_cmp(line1, line2):
            line1_no = int(line1.rsplit(b'\t', 1)[-1])
            line2_no = int(line2.rsplit(b'\t', 1)[-1])
            return cmp(line1_no, line2_no)

        line_iter = search(reader, line, compare_func=line_cmp)
        yield six.next(line_iter)

    def idx_to_cdx(self, idx_iter, query):
        blocks = None
        ranges = []

        for idx in idx_iter:
            idx = IDXObject(idx)

            if (blocks and blocks.part == idx['part'] and
                blocks.offset + blocks.length == idx['offset'] and
                blocks.count < self.max_blocks):

                blocks.length += idx['length']
                blocks.count += 1
                ranges.append(idx['length'])

            else:
                if blocks:
                    yield self.block_to_cdx_iter(blocks, ranges, query)

                blocks = ZipBlocks(idx['part'],
                                   idx['offset'],
                                   idx['length'],
                                   1)

                ranges = [blocks.length]

        if blocks:
            yield self.block_to_cdx_iter(blocks, ranges, query)

    def block_to_cdx_iter(self, blocks, ranges, query):
        last_exc = None
        last_traceback = None

        try:
            locations = self.loc_resolver(blocks.part, query)
        except:
            raise Exception('No Locations Found for: ' + blocks.part)

        for location in self.loc_resolver(blocks.part, query):
            try:
                return self.load_blocks(location, blocks, ranges, query)
            except Exception as exc:
                last_exc = exc
                import sys
                last_traceback = sys.exc_info()[2]

        if last_exc:
            six.reraise(Exception, last_exc, last_traceback)
            #raise last_exc
        else:
            raise Exception('No Locations Found for: ' + blocks.part)

    def load_blocks(self, location, blocks, ranges, query):
        """ Load one or more blocks of compressed cdx lines, return
        a line iterator which decompresses and returns one line at a time,
        bounded by query.key and query.end_key
        """
        if (logging.getLogger().getEffectiveLevel() <= logging.DEBUG):
            msg = 'Loading {b.count} blocks from {loc}:{b.offset}+{b.length}'
            logging.debug(msg.format(b=blocks, loc=location))

        reader = self.blk_loader.load(location, blocks.offset, blocks.length)

        def decompress_block(range_):
            decomp = gzip_decompressor()
            buff = decomp.decompress(reader.read(range_))
            for line in BytesIO(buff):
                yield line

        def iter_blocks(reader):
            try:
                for r in ranges:
                    yield decompress_block(r)
            finally:
                reader.close()

        # iterate over all blocks
        iter_ = itertools.chain.from_iterable(iter_blocks(reader))

        # start bound
        iter_ = linearsearch(iter_, query.key)

        # end bound
        iter_ = itertools.takewhile(lambda line: line < query.end_key, iter_)

        return iter_

    def __repr__(self):
        return 'ZipNumIndexSource({0}, {1})'.format(self.summary, self.config)

    def __str__(self):
        return 'zipnum'

    def __eq__(self, other):
        if not isinstance(other, self.__class__):
            return False

        return (self.summary == other.summary and
                self.config == other.config)

    @classmethod
    def init_from_string(cls, value):
        is_zipnum = False
        if value.startswith('zipnum+'):
            value = value[7:]
            is_zipnum = True

        if value.startswith('file://'):
            value = value[7:]

        if is_zipnum or value.endswith(cls.IDX_EXT):
            return cls(value, None)

    @classmethod
    def init_from_config(cls, config):
        if config['type'] != 'zipnum':
            return

        return cls(config['path'], config)
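# A minimal usage sketch for ZipNumIndexSource above (not part of the original
# module); the summary path and config values are hypothetical and only use the
# keys read in __init__ / init_from_config / init_from_string.
zipnum_source = ZipNumIndexSource.init_from_config({
    'type': 'zipnum',
    'path': '/data/index/cluster.summary',   # hypothetical .summary file
    'shard_index_loc': None,                 # or a dict to select LocPrefixResolver
    'max_blocks': 10,
    'reload_interval': 10,
})

# Equivalent construction from the string form handled by init_from_string():
zipnum_source = ZipNumIndexSource.init_from_string(
    'zipnum+file:///data/index/cluster.summary')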
class ZipNumCluster(CDXSource):
    DEFAULT_RELOAD_INTERVAL = 10  # in minutes
    DEFAULT_MAX_BLOCKS = 50

    def __init__(self, summary, config=None):
        loc = None
        cookie_maker = None
        self.max_blocks = self.DEFAULT_MAX_BLOCKS
        reload_ival = self.DEFAULT_RELOAD_INTERVAL

        if config:
            loc = config.get('zipnum_loc')
            cookie_maker = config.get('cookie_maker')
            self.max_blocks = config.get('max_blocks', self.max_blocks)
            reload_ival = config.get('reload_interval', reload_ival)

        if not loc:
            splits = os.path.splitext(summary)
            loc = splits[0] + '.loc'

        self.summary = summary
        self.loc_filename = loc

        # initial loc map
        self.loc_map = {}
        self.loc_mtime = 0
        self.load_loc()

        # reload interval
        self.loc_update_time = datetime.datetime.now()
        self.reload_interval = datetime.timedelta(minutes=reload_ival)

        self.blk_loader = BlockLoader(cookie_maker=cookie_maker)

    def load_loc(self):
        # check modified time of current file before loading
        new_mtime = os.path.getmtime(self.loc_filename)
        if (new_mtime == self.loc_mtime):
            return

        # update loc file mtime
        self.loc_mtime = new_mtime

        logging.debug('Loading loc from: ' + self.loc_filename)
        with open(self.loc_filename) as fh:
            for line in fh:
                parts = line.rstrip().split('\t')
                self.loc_map[parts[0]] = parts[1:]

#    @staticmethod
#    def reload_timed(timestamp, val, delta, func):
#        now = datetime.datetime.now()
#        if now - timestamp >= delta:
#            func()
#            return now
#        return None
#
#    def reload_loc(self):
#        reload_time = self.reload_timed(self.loc_update_time,
#                                        self.loc_map,
#                                        self.reload_interval,
#                                        self.load_loc)
#
#        if reload_time:
#            self.loc_update_time = reload_time

    def lookup_loc(self, part):
        return self.loc_map[part]

    def load_cdx(self, query):
        self.load_loc()

        reader = open(self.summary)

        idx_iter = iter_range(reader,
                              query.key,
                              query.end_key,
                              prev_size=1)

        if query.secondary_index_only:
            return idx_iter
        else:
            blocks = self.idx_to_cdx(idx_iter, query)

            def gen_cdx():
                for blk in blocks:
                    for cdx in blk:
                        yield cdx

            return gen_cdx()

    def idx_to_cdx(self, idx_iter, query):
        blocks = None
        ranges = []

        for idx in idx_iter:
            idx = IDXObject(idx)

            if (blocks and blocks.part == idx['part'] and
                blocks.offset + blocks.length == idx['offset'] and
                blocks.count < self.max_blocks):

                blocks.length += idx['length']
                blocks.count += 1
                ranges.append(idx['length'])

            else:
                if blocks:
                    yield self.block_to_cdx_iter(blocks, ranges, query)

                blocks = ZipBlocks(idx['part'],
                                   idx['offset'],
                                   idx['length'],
                                   1)

                ranges = [blocks.length]

        if blocks:
            yield self.block_to_cdx_iter(blocks, ranges, query)

    def block_to_cdx_iter(self, blocks, ranges, query):
        last_exc = None
        last_traceback = None

        for location in self.lookup_loc(blocks.part):
            try:
                return self.load_blocks(location, blocks, ranges, query)
            except Exception as exc:
                last_exc = exc
                import sys
                last_traceback = sys.exc_info()[2]

        if last_exc:
            raise last_exc, None, last_traceback
        else:
            raise Exception('No Locations Found for: ' + blocks.part)

    def load_blocks(self, location, blocks, ranges, query):
        if (logging.getLogger().getEffectiveLevel() <= logging.DEBUG):
            msg = 'Loading {b.count} blocks from {loc}:{b.offset}+{b.length}'
            logging.debug(msg.format(b=blocks, loc=location))

        reader = self.blk_loader.load(location, blocks.offset, blocks.length)

        def decompress_block(range_):
            decomp = gzip_decompressor()
            buff = decomp.decompress(reader.read(range_))
            return readline_to_iter(BytesIO(buff))

        iter_ = itertools.chain(*itertools.imap(decompress_block, ranges))

        # start bound
        iter_ = linearsearch(iter_, query.key)

        # end bound
        end = query.end_key
        iter_ = itertools.takewhile(lambda line: line < end, iter_)

        return iter_

    def __str__(self):
        return 'ZipNum Cluster: {0}, {1}'.format(self.summary,
                                                 self.loc_filename)
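# For reference, load_loc() above parses a tab-separated .loc file mapping each
# shard (part) name to one or more locations, which block_to_cdx_iter() tries in
# order. A minimal, purely hypothetical sketch of writing such a file:
with open('/tmp/example-cluster.loc', 'w') as fh:
    fh.write('cluster-00\t/data/shards/cluster-00.cdx.gz\thttp://mirror.example.com/cluster-00.cdx.gz\n')
    fh.write('cluster-01\t/data/shards/cluster-01.cdx.gz\n')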
def copy_data_from_recording(self, source, delete_source=False):
    """Copy given recording building block entries.

    :param RedisUniqueComponent source: building block
    :param bool delete_source: whether to delete source building block

    :returns: whether successful or not
    :rtype: bool
    """
    if self == source:
        return False

    if not self.is_open():
        return False

    errored = False

    self._copy_prop(source, 'title')
    self._copy_prop(source, 'desc')
    self._copy_prop(source, 'rec_type')
    self._copy_prop(source, 'recorded_at')
    #self._copy_prop(source, 'patch_rec')

    collection = self.get_owner()
    user = collection.get_owner()

    target_dirname = user.get_user_temp_warc_path()
    target_warc_key = self.COLL_WARC_KEY.format(coll=collection.my_id)

    # Copy WARCs
    loader = BlockLoader()

    for n, url in source.iter_all_files(include_index=True):
        local_filename = n + '.' + timestamp20_now()
        target_file = os.path.join(target_dirname, local_filename)

        src = loader.load(url)

        try:
            with open(target_file, 'wb') as dest:
                logger.debug('Copying {0} -> {1}'.format(url, target_file))
                shutil.copyfileobj(src, dest)
                size = dest.tell()

            if n != self.INDEX_FILE_KEY:
                self.incr_size(size)
                self.redis.hset(target_warc_key, n, add_local_store_prefix(target_file))
            else:
                self.set_prop(n, target_file)

        except:
            traceback.print_exc()
            errored = True

    # COPY cdxj, if exists
    source_key = self.CDXJ_KEY.format(rec=source.my_id)
    target_key = self.CDXJ_KEY.format(rec=self.my_id)

    self.redis.zunionstore(target_key, [source_key])

    # recreate pages, if any, in new recording
    source_coll = source.get_owner()
    source_pages = source_coll.list_rec_pages(source)
    collection.import_pages(source_pages, self)

    # COPY remote archives, if any
    self.redis.sunionstore(self.RA_KEY.format(rec=self.my_id),
                           self.RA_KEY.format(rec=source.my_id))

    # COPY recording warc keys
    self.redis.sunionstore(self.REC_WARC_KEY.format(rec=self.my_id),
                           self.REC_WARC_KEY.format(rec=source.my_id))

    # sync collection cdxj, if exists
    collection.sync_coll_index(exists=True, do_async=True)

    if not errored and delete_source:
        collection = source.get_owner()
        collection.remove_recording(source, delete=True)

    return not errored
class ZipNumCluster(CDXSource):
    DEFAULT_RELOAD_INTERVAL = 10  # in minutes
    DEFAULT_MAX_BLOCKS = 10

    def __init__(self, summary, config=None):
        self.max_blocks = self.DEFAULT_MAX_BLOCKS
        self.loc_resolver = None

        loc = None
        cookie_maker = None
        reload_ival = self.DEFAULT_RELOAD_INTERVAL

        if config:
            loc = config.get('shard_index_loc')
            cookie_maker = config.get('cookie_maker')
            self.max_blocks = config.get('max_blocks', self.max_blocks)
            reload_ival = config.get('reload_interval', reload_ival)

        if isinstance(loc, dict):
            self.loc_resolver = LocPrefixResolver(summary, loc)
        else:
            self.loc_resolver = LocMapResolver(summary, loc)

        self.summary = summary

        # reload interval
        self.loc_update_time = datetime.datetime.now()
        self.reload_interval = datetime.timedelta(minutes=reload_ival)

        self.blk_loader = BlockLoader(cookie_maker=cookie_maker)

#    @staticmethod
#    def reload_timed(timestamp, val, delta, func):
#        now = datetime.datetime.now()
#        if now - timestamp >= delta:
#            func()
#            return now
#        return None
#
#    def reload_loc(self):
#        reload_time = self.reload_timed(self.loc_update_time,
#                                        self.loc_map,
#                                        self.reload_interval,
#                                        self.load_loc)
#
#        if reload_time:
#            self.loc_update_time = reload_time

    def load_cdx(self, query):
        self.loc_resolver.load_loc()
        return self._do_load_cdx(self.summary, query)

    def _do_load_cdx(self, filename, query):
        reader = open(filename, 'rb')

        idx_iter = self.compute_page_range(reader, query)

        if query.secondary_index_only or query.page_count:
            return idx_iter

        blocks = self.idx_to_cdx(idx_iter, query)

        def gen_cdx():
            for blk in blocks:
                for cdx in blk:
                    yield cdx

        return gen_cdx()

    def _page_info(self, pages, pagesize, blocks):
        info = dict(pages=pages,
                    pageSize=pagesize,
                    blocks=blocks)
        return json.dumps(info) + '\n'

    def compute_page_range(self, reader, query):
        pagesize = query.page_size
        if not pagesize:
            pagesize = self.max_blocks
        else:
            pagesize = int(pagesize)

        last_line = None

        # Get End
        end_iter = search(reader, query.end_key, prev_size=1)

        try:
            end_line = end_iter.next()
        except StopIteration:
            last_line = read_last_line(reader)
            end_line = last_line

        # Get Start
        first_iter = iter_range(reader,
                                query.key,
                                query.end_key,
                                prev_size=1)

        try:
            first_line = first_iter.next()
        except StopIteration:
            if end_line == last_line and query.key >= last_line:
                first_line = last_line
            else:
                reader.close()
                if query.page_count:
                    yield self._page_info(0, pagesize, 0)
                    return
                else:
                    raise

        first = IDXObject(first_line)
        end = IDXObject(end_line)

        try:
            blocks = end['lineno'] - first['lineno']
            total_pages = blocks / pagesize + 1
        except:
            blocks = -1
            total_pages = 1

        if query.page_count:
            # same line, so actually need to look at cdx
            # to determine if it exists
            if blocks == 0:
                try:
                    block_cdx_iter = self.idx_to_cdx([first_line], query)
                    block = block_cdx_iter.next()
                    cdx = block.next()
                except StopIteration:
                    total_pages = 0
                    blocks = -1

            yield self._page_info(total_pages, pagesize, blocks + 1)
            reader.close()
            return

        curr_page = query.page
        if curr_page >= total_pages or curr_page < 0:
            msg = 'Page {0} invalid: First Page is 0, Last Page is {1}'
            reader.close()
            raise CDXException(msg.format(curr_page, total_pages - 1))

        startline = curr_page * pagesize
        endline = startline + pagesize - 1
        if blocks >= 0:
            endline = min(endline, blocks)

        if curr_page == 0:
            yield first_line
        else:
            startline -= 1

        idxiter = itertools.islice(first_iter, startline, endline)
        for idx in idxiter:
            yield idx

        reader.close()

    def search_by_line_num(self, reader, line):  # pragma: no cover
        def line_cmp(line1, line2):
            line1_no = int(line1.rsplit('\t', 1)[-1])
            line2_no = int(line2.rsplit('\t', 1)[-1])
            return cmp(line1_no, line2_no)

        line_iter = search(reader, line, compare_func=line_cmp)
        yield line_iter.next()

    def idx_to_cdx(self, idx_iter, query):
        blocks = None
        ranges = []

        for idx in idx_iter:
            idx = IDXObject(idx)

            if (blocks and blocks.part == idx['part'] and
                blocks.offset + blocks.length == idx['offset'] and
                blocks.count < self.max_blocks):

                blocks.length += idx['length']
                blocks.count += 1
                ranges.append(idx['length'])

            else:
                if blocks:
                    yield self.block_to_cdx_iter(blocks, ranges, query)

                blocks = ZipBlocks(idx['part'],
                                   idx['offset'],
                                   idx['length'],
                                   1)

                ranges = [blocks.length]

        if blocks:
            yield self.block_to_cdx_iter(blocks, ranges, query)

    def block_to_cdx_iter(self, blocks, ranges, query):
        last_exc = None
        last_traceback = None

        try:
            locations = self.loc_resolver(blocks.part, query)
        except:
            raise Exception('No Locations Found for: ' + blocks.part)

        for location in self.loc_resolver(blocks.part, query):
            try:
                return self.load_blocks(location, blocks, ranges, query)
            except Exception as exc:
                last_exc = exc
                import sys
                last_traceback = sys.exc_info()[2]

        if last_exc:
            raise last_exc, None, last_traceback
        else:
            raise Exception('No Locations Found for: ' + blocks.part)

    def load_blocks(self, location, blocks, ranges, query):
        """ Load one or more blocks of compressed cdx lines, return
        a line iterator which decompresses and returns one line at a time,
        bounded by query.key and query.end_key
        """
        if (logging.getLogger().getEffectiveLevel() <= logging.DEBUG):
            msg = 'Loading {b.count} blocks from {loc}:{b.offset}+{b.length}'
            logging.debug(msg.format(b=blocks, loc=location))

        reader = self.blk_loader.load(location, blocks.offset, blocks.length)

        def decompress_block(range_):
            decomp = gzip_decompressor()
            buff = decomp.decompress(reader.read(range_))
            for line in BytesIO(buff):
                yield line

        iter_ = itertools.chain(*itertools.imap(decompress_block, ranges))

        # start bound
        iter_ = linearsearch(iter_, query.key)

        # end bound
        end = query.end_key
        iter_ = itertools.takewhile(lambda line: line < end, iter_)

        return iter_

    def __str__(self):
        return 'ZipNum Cluster: {0}, {1}'.format(self.summary,
                                                 self.loc_resolver)
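# A minimal sketch (not part of the original module) of consuming the page-count
# response from load_cdx() above: when query.page_count is set, the iterator
# yields a single JSON line shaped by _page_info(); the cdx_source and query
# objects passed in here are hypothetical.
import json

def print_page_info(cdx_source, query):
    for line in cdx_source.load_cdx(query):
        info = json.loads(line)
        print('pages={0} pageSize={1} blocks={2}'.format(
            info['pages'], info['pageSize'], info['blocks']))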
def copy_data_from_recording(self, source, delete_source=False):
    """Copy given recording building block entries.

    :param RedisUniqueComponent source: building block
    :param bool delete_source: whether to delete source building block

    :returns: whether successful or not
    :rtype: bool
    """
    if self == source:
        return False

    if not self.is_open():
        return False

    errored = False

    self._copy_prop(source, 'title')
    self._copy_prop(source, 'desc')
    self._copy_prop(source, 'rec_type')
    self._copy_prop(source, 'recorded_at')
    #self._copy_prop(source, 'patch_rec')

    collection = self.get_owner()
    user = collection.get_owner()

    target_dirname = user.get_user_temp_warc_path()
    target_warc_key = self.COLL_WARC_KEY.format(coll=collection.my_id)

    # Copy WARCs
    loader = BlockLoader()

    for n, url in source.iter_all_files(include_index=True):
        local_filename = n + '.' + timestamp20_now()
        target_file = os.path.join(target_dirname, local_filename)

        src = loader.load(url)

        try:
            with open(target_file, 'wb') as dest:
                print('Copying {0} -> {1}'.format(url, target_file))
                shutil.copyfileobj(src, dest)
                size = dest.tell()

            if n != self.INDEX_FILE_KEY:
                self.incr_size(size)
                self.redis.hset(target_warc_key, n, add_local_store_prefix(target_file))
            else:
                self.set_prop(n, target_file)

        except:
            import traceback
            traceback.print_exc()
            errored = True

    # COPY cdxj, if exists
    source_key = self.CDXJ_KEY.format(rec=source.my_id)
    target_key = self.CDXJ_KEY.format(rec=self.my_id)

    self.redis.zunionstore(target_key, [source_key])

    # recreate pages, if any, in new recording
    source_coll = source.get_owner()
    source_pages = source_coll.list_rec_pages(source)
    collection.import_pages(source_pages, self)

    # COPY remote archives, if any
    self.redis.sunionstore(self.RA_KEY.format(rec=self.my_id),
                           self.RA_KEY.format(rec=source.my_id))

    # COPY recording warc keys
    self.redis.sunionstore(self.REC_WARC_KEY.format(rec=self.my_id),
                           self.REC_WARC_KEY.format(rec=source.my_id))

    # sync collection cdxj, if exists
    collection.sync_coll_index(exists=True, do_async=True)

    if not errored and delete_source:
        collection = source.get_owner()
        collection.remove_recording(source, delete=True)

    return not errored
class ZipNumCluster(CDXSource):
    DEFAULT_RELOAD_INTERVAL = 10  # in minutes
    DEFAULT_MAX_BLOCKS = 10

    def __init__(self, summary, config=None):
        self.max_blocks = self.DEFAULT_MAX_BLOCKS
        self.loc_resolver = None

        loc = None
        cookie_maker = None
        reload_ival = self.DEFAULT_RELOAD_INTERVAL

        if config:
            loc = config.get('shard_index_loc')
            cookie_maker = config.get('cookie_maker')
            self.max_blocks = config.get('max_blocks', self.max_blocks)
            reload_ival = config.get('reload_interval', reload_ival)

        if isinstance(loc, dict):
            self.loc_resolver = LocPrefixResolver(summary, loc)
        else:
            self.loc_resolver = LocMapResolver(summary, loc)

        self.summary = summary

        # reload interval
        self.loc_update_time = datetime.datetime.now()
        self.reload_interval = datetime.timedelta(minutes=reload_ival)

        self.blk_loader = BlockLoader(cookie_maker=cookie_maker)

#    @staticmethod
#    def reload_timed(timestamp, val, delta, func):
#        now = datetime.datetime.now()
#        if now - timestamp >= delta:
#            func()
#            return now
#        return None
#
#    def reload_loc(self):
#        reload_time = self.reload_timed(self.loc_update_time,
#                                        self.loc_map,
#                                        self.reload_interval,
#                                        self.load_loc)
#
#        if reload_time:
#            self.loc_update_time = reload_time

    def load_cdx(self, query):
        self.loc_resolver.load_loc()

        reader = open(self.summary, 'rb')

        idx_iter = self.compute_page_range(reader, query)

        if query.secondary_index_only or query.page_count:
            return idx_iter

        blocks = self.idx_to_cdx(idx_iter, query)

        def gen_cdx():
            for blk in blocks:
                for cdx in blk:
                    yield cdx

        return gen_cdx()

    def compute_page_range(self, reader, query):
        # Get End
        end_iter = search(reader, query.end_key, prev_size=1)

        try:
            end_line = end_iter.next()
        except StopIteration:
            end_line = read_last_line(reader)

        # Get Start
        first_iter = iter_range(reader,
                                query.key,
                                query.end_key,
                                prev_size=1)

        try:
            first_line = first_iter.next()
        except StopIteration:
            reader.close()
            raise

        first = IDXObject(first_line)
        end = IDXObject(end_line)

        diff = end['lineno'] - first['lineno']

        pagesize = query.page_size
        if not pagesize:
            pagesize = self.max_blocks

        total_pages = diff / pagesize + 1

        if query.page_count:
            info = dict(pages=total_pages,
                        pageSize=pagesize,
                        blocks=diff + 1)
            yield json.dumps(info)
            reader.close()
            return

        curr_page = query.page
        if curr_page >= total_pages or curr_page < 0:
            msg = 'Page {0} invalid: First Page is 0, Last Page is {1}'
            reader.close()
            raise CDXException(msg.format(curr_page, total_pages - 1))

        startline = curr_page * pagesize
        endline = min(startline + pagesize - 1, diff)

        if curr_page == 0:
            yield first_line
        else:
            startline -= 1

        idxiter = itertools.islice(first_iter, startline, endline)
        for idx in idxiter:
            yield idx

        reader.close()

    def search_by_line_num(self, reader, line):  # pragma: no cover
        def line_cmp(line1, line2):
            line1_no = int(line1.rsplit('\t', 1)[-1])
            line2_no = int(line2.rsplit('\t', 1)[-1])
            return cmp(line1_no, line2_no)

        line_iter = search(reader, line, compare_func=line_cmp)
        yield line_iter.next()

    def idx_to_cdx(self, idx_iter, query):
        blocks = None
        ranges = []

        for idx in idx_iter:
            idx = IDXObject(idx)

            if (blocks and blocks.part == idx['part'] and
                blocks.offset + blocks.length == idx['offset'] and
                blocks.count < self.max_blocks):

                blocks.length += idx['length']
                blocks.count += 1
                ranges.append(idx['length'])

            else:
                if blocks:
                    yield self.block_to_cdx_iter(blocks, ranges, query)

                blocks = ZipBlocks(idx['part'],
                                   idx['offset'],
                                   idx['length'],
                                   1)

                ranges = [blocks.length]

        if blocks:
            yield self.block_to_cdx_iter(blocks, ranges, query)

    def block_to_cdx_iter(self, blocks, ranges, query):
        last_exc = None
        last_traceback = None

        for location in self.loc_resolver(blocks.part):
            try:
                return self.load_blocks(location, blocks, ranges, query)
            except Exception as exc:
                last_exc = exc
                import sys
                last_traceback = sys.exc_info()[2]

        if last_exc:
            raise last_exc, None, last_traceback
        else:
            raise Exception('No Locations Found for: ' + blocks.part)

    def load_blocks(self, location, blocks, ranges, query):
        """ Load one or more blocks of compressed cdx lines, return
        a line iterator which decompresses and returns one line at a time,
        bounded by query.key and query.end_key
        """
        if (logging.getLogger().getEffectiveLevel() <= logging.DEBUG):
            msg = 'Loading {b.count} blocks from {loc}:{b.offset}+{b.length}'
            logging.debug(msg.format(b=blocks, loc=location))

        reader = self.blk_loader.load(location, blocks.offset, blocks.length)

        def decompress_block(range_):
            decomp = gzip_decompressor()
            buff = decomp.decompress(reader.read(range_))
            for line in BytesIO(buff):
                yield line

        iter_ = itertools.chain(*itertools.imap(decompress_block, ranges))

        # start bound
        iter_ = linearsearch(iter_, query.key)

        # end bound
        end = query.end_key
        iter_ = itertools.takewhile(lambda line: line < end, iter_)

        return iter_

    def __str__(self):
        return 'ZipNum Cluster: {0}, {1}'.format(self.summary,
                                                 self.loc_resolver)
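# A toy illustration (not part of the original module) of the end-key bound used
# in load_blocks() above: itertools.takewhile stops at the first line that is not
# strictly less than end_key. The sample lines and key are hypothetical.
import itertools

sample_lines = [b'com,example)/a 2014', b'com,example)/b 2015', b'com,example)/c 2016']
end_key = b'com,example)/c'

bounded = itertools.takewhile(lambda line: line < end_key, iter(sample_lines))
print(list(bounded))   # [b'com,example)/a 2014', b'com,example)/b 2015']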