def write_cdxj(self, user, cdxj_key):
    """Write the CDX index lines for this recording to a temp file.

    If an index file is already recorded under INDEX_FILE_KEY, reuse it
    and return without rewriting.

    :param user: user object providing the temp WARC directory
    :param str cdxj_key: Redis sorted-set key holding the CDXJ lines
    :returns: CDX file filename and full path/URL
    :rtype: str and str
    """
    #full_filename = self.redis.hget(warc_key, self.INDEX_FILE_KEY)
    full_filename = self.get_prop(self.INDEX_FILE_KEY)
    # Already written: derive the bare filename from the stored, prefixed path
    if full_filename:
        cdxj_filename = os.path.basename(strip_prefix(full_filename))
        return cdxj_filename, full_filename

    dirname = user.get_user_temp_warc_path()

    # random suffix avoids collisions for indexes created at the same timestamp
    randstr = base64.b32encode(os.urandom(5)).decode('utf-8')

    timestamp = timestamp_now()

    cdxj_filename = self.INDEX_NAME_TEMPL.format(timestamp=timestamp,
                                                 random=randstr)

    os.makedirs(dirname, exist_ok=True)

    full_filename = os.path.join(dirname, cdxj_filename)

    # dump every CDXJ line (kept sorted by the zset) into the file
    cdxj_list = self.redis.zrange(cdxj_key, 0, -1)

    with open(full_filename, 'wt') as out:
        for cdxj in cdxj_list:
            out.write(cdxj + '\n')
        out.flush()

    # store a store-prefixed URL ('/'-normalized) so loaders can resolve it
    full_url = add_local_store_prefix(
        full_filename.replace(os.path.sep, '/'))
    #self.redis.hset(warc_key, self.INDEX_FILE_KEY, full_url)
    self.set_prop(self.INDEX_FILE_KEY, full_url)

    return cdxj_filename, full_filename
def get_client_url(self, target_url):
    """Get client URL.

    Normalizes path separators to '/' before prepending the local
    store prefix.

    :param str target_url: target URL
    :returns: client URL
    :rtype: str
    """
    normalized = target_url.replace(os.path.sep, '/')
    return add_local_store_prefix(normalized)
def get_client_url(self, target_url):
    """Get client URL.

    Converts OS-specific path separators to '/' and adds the local
    store prefix.

    :param str target_url: target URL
    :returns: client URL
    :rtype: str
    """
    url = '/'.join(target_url.split(os.path.sep))
    return add_local_store_prefix(url)
def add_warc_file(self, full_filename, params):
    """Register a WARC file in Redis for the current recording.

    Stores the file's prefixed load path in the file hash and adds its
    base name to the recording's WARC-file set.

    :param str full_filename: full path to the WARC file
    :param dict params: template params used to resolve the Redis keys
    """
    base_filename = self._get_rel_or_base_name(full_filename, params)

    file_key = res_template(self.file_key_template, params)
    rec_key = res_template(self.rec_file_key_template, params)

    # prefix marks the path as a local-store file for later loading
    full_load_path = storagepaths.add_local_store_prefix(full_filename)

    self.redis.hset(file_key, base_filename, full_load_path)
    self.redis.sadd(rec_key, base_filename)
def copy_rec_files(self, user, collection, recording, warc_files):
    """Copy a recording's WARC and index files from s3 to local temp storage.

    Non-s3 (local) entries in *warc_files* are skipped. Copied WARC
    entries are registered in the collection/recording Redis keys; the
    index entry is stored via ``set_prop``. In dry-run mode files are
    written under /tmp/migrate4.0 and removed again after copying.

    Fixes over the original: the bare ``except:`` is narrowed to
    ``except Exception:`` so SystemExit/KeyboardInterrupt propagate, and
    the source stream from ``loader.load()`` is now closed (it leaked).

    :param user: user owning the target temp WARC dir
    :param collection: target collection
    :param recording: target recording
    :param dict warc_files: mapping of entry name -> source URL
    :returns: total size in bytes of copied WARC files (index excluded)
    :rtype: int
    """
    if self.dry_run:
        target_dirname = os.path.join('/tmp/migrate4.0', collection.my_id)
    else:
        target_dirname = user.get_user_temp_warc_path()

    os.makedirs(target_dirname, exist_ok=True)
    print('Writing to dir: ' + target_dirname)

    coll_warc_key = recording.COLL_WARC_KEY.format(coll=collection.my_id)
    rec_warc_key = recording.REC_WARC_KEY.format(rec=recording.my_id)

    # Copy WARCs
    loader = BlockLoader()
    total_size = 0

    for n, url in warc_files.items():
        if not url.startswith('s3://'):
            print('FILE ERR: Skipping local file: ' + url)
            continue

        # index entry keeps its remote basename; WARCs keep their entry name
        local_filename = n if n != recording.INDEX_FILE_KEY else os.path.basename(url)

        target_file = os.path.join(target_dirname, local_filename)

        src = loader.load(url)

        try:
            with open(target_file, 'wb') as dest:
                print('Copying {0} -> {1}'.format(url, target_file))
                shutil.copyfileobj(src, dest)
                size = dest.tell()

            target_file = add_local_store_prefix(target_file)

            if n != recording.INDEX_FILE_KEY:
                self.redis.hset(coll_warc_key, n, target_file)
                self.redis.sadd(rec_warc_key, n)
                total_size += size
            else:
                # update_ts=False keeps the recording's timestamp unchanged
                recording.set_prop(n, target_file, update_ts=False)

            if self.dry_run:
                os.remove(strip_prefix(target_file))

        except Exception:
            # best-effort copy: log and continue with the next file
            import traceback
            traceback.print_exc()
        finally:
            # assumes loader.load() returns a closeable stream -- TODO confirm
            src.close()

    # commit from temp dir to storage
    if not self.dry_run:
        recording.commit_to_storage()

    return total_size
def copy_rec_files(self, user, collection, recording, warc_files):
    """Copy a recording's WARC and index files from s3 to local temp storage.

    Non-s3 (local) entries in *warc_files* are skipped. Copied WARC
    entries are registered in the collection/recording Redis keys; the
    index entry is stored via ``set_prop``. In dry-run mode files are
    written under /tmp/migrate4.0 and removed again after copying.

    :param user: user owning the target temp WARC dir
    :param collection: target collection
    :param recording: target recording
    :param dict warc_files: mapping of entry name -> source URL
    :returns: total size in bytes of copied WARC files (index excluded)
    :rtype: int
    """
    if self.dry_run:
        target_dirname = os.path.join('/tmp/migrate4.0', collection.my_id)
    else:
        target_dirname = user.get_user_temp_warc_path()

    os.makedirs(target_dirname, exist_ok=True)
    print('Writing to dir: ' + target_dirname)

    coll_warc_key = recording.COLL_WARC_KEY.format(coll=collection.my_id)
    rec_warc_key = recording.REC_WARC_KEY.format(rec=recording.my_id)

    # Copy WARCs
    loader = BlockLoader()
    total_size = 0

    for n, url in warc_files.items():
        if not url.startswith('s3://'):
            print('FILE ERR: Skipping local file: ' + url)
            continue

        # index entry keeps its remote basename; WARCs keep their entry name
        local_filename = n if n != recording.INDEX_FILE_KEY else os.path.basename(url)

        target_file = os.path.join(target_dirname, local_filename)

        src = loader.load(url)

        try:
            with open(target_file, 'wb') as dest:
                print('Copying {0} -> {1}'.format(url, target_file))
                shutil.copyfileobj(src, dest)
                size = dest.tell()

            target_file = add_local_store_prefix(target_file)

            if n != recording.INDEX_FILE_KEY:
                self.redis.hset(coll_warc_key, n, target_file)
                self.redis.sadd(rec_warc_key, n)
                total_size += size
            else:
                # update_ts=False keeps the recording's timestamp unchanged
                recording.set_prop(n, target_file, update_ts=False)

            if self.dry_run:
                os.remove(strip_prefix(target_file))

        except:
            # best-effort copy: log the failure and continue with next file
            import traceback
            traceback.print_exc()

    # commit from temp dir to storage
    if not self.dry_run:
        recording.commit_to_storage()

    return total_size
def write_cdxj(self, user, cdxj_key):
    """Write CDX index lines to file.

    Reuses a previously written index when one is already stored under
    INDEX_FILE_KEY; otherwise dumps the index entries from the Redis
    sorted set into a freshly named file in the user's temp WARC dir.

    :param RedisUniqueComponent user: user
    :param str cdxj_key: CDX index file Redis key
    :returns: CDX file filename and path
    :rtype: str and str
    """
    existing = self.get_prop(self.INDEX_FILE_KEY)
    if existing:
        # already written: recover the bare filename from the stored path
        return os.path.basename(strip_prefix(existing)), existing

    temp_dir = user.get_user_temp_warc_path()

    # timestamp plus random suffix makes the index filename unique
    name = self.INDEX_NAME_TEMPL.format(
        timestamp=timestamp_now(),
        random=base64.b32encode(os.urandom(5)).decode('utf-8'))

    os.makedirs(temp_dir, exist_ok=True)
    path = os.path.join(temp_dir, name)

    with open(path, 'wt') as fh:
        for line in self.redis.zrange(cdxj_key, 0, -1):
            fh.write(line + '\n')
        fh.flush()

    self.set_prop(self.INDEX_FILE_KEY,
                  add_local_store_prefix(path.replace(os.path.sep, '/')))

    return name, path
def copy_data_from_recording(self, source, delete_source=False):
    """Copy given recording building block entries.

    Copies props, WARC/index files, the CDXJ index, pages, remote-archive
    entries and WARC keys from *source* into this (open) recording.

    :param RedisUniqueComponent source: building block
    :param bool delete_source: whether to delete source building block
    :returns: whether successful or not
    :rtype: bool
    """
    # no-op when copying onto itself
    if self == source:
        return False

    # only an open recording may receive copied data
    if not self.is_open():
        return False

    errored = False

    self._copy_prop(source, 'title')
    self._copy_prop(source, 'desc')
    self._copy_prop(source, 'rec_type')
    self._copy_prop(source, 'recorded_at')
    #self._copy_prop(source, 'patch_rec')

    collection = self.get_owner()
    user = collection.get_owner()

    target_dirname = user.get_user_temp_warc_path()
    target_warc_key = self.COLL_WARC_KEY.format(coll=collection.my_id)

    # Copy WARCs
    loader = BlockLoader()

    for n, url in source.iter_all_files(include_index=True):
        # timestamp suffix keeps copied filenames unique in the temp dir
        local_filename = n + '.' + timestamp20_now()

        target_file = os.path.join(target_dirname, local_filename)

        src = loader.load(url)

        try:
            with open(target_file, 'wb') as dest:
                logger.debug('Copying {0} -> {1}'.format(url, target_file))
                shutil.copyfileobj(src, dest)
                size = dest.tell()

            if n != self.INDEX_FILE_KEY:
                # WARC file: count its size against this recording
                self.incr_size(size)
                self.redis.hset(target_warc_key, n, add_local_store_prefix(target_file))
            else:
                self.set_prop(n, target_file)

        except:
            # best-effort: log and mark failure, keep copying remaining files
            traceback.print_exc()
            errored = True

    # COPY cdxj, if exists
    source_key = self.CDXJ_KEY.format(rec=source.my_id)
    target_key = self.CDXJ_KEY.format(rec=self.my_id)

    self.redis.zunionstore(target_key, [source_key])

    # recreate pages, if any, in new recording
    source_coll = source.get_owner()
    source_pages = source_coll.list_rec_pages(source)
    collection.import_pages(source_pages, self)

    # COPY remote archives, if any
    self.redis.sunionstore(self.RA_KEY.format(rec=self.my_id),
                           self.RA_KEY.format(rec=source.my_id))

    # COPY recording warc keys
    self.redis.sunionstore(self.REC_WARC_KEY.format(rec=self.my_id),
                           self.REC_WARC_KEY.format(rec=source.my_id))

    # sync collection cdxj, if exists
    collection.sync_coll_index(exists=True, do_async=True)

    # only remove the source once every copy above succeeded
    if not errored and delete_source:
        collection = source.get_owner()
        collection.remove_recording(source, delete=True)

    return not errored
def copy_data_from_recording(self, source, delete_source=False):
    """Clone the contents of *source* into this recording.

    Props, WARC and index files, the CDXJ index, pages, remote-archive
    entries and WARC keys are all copied; the source recording may be
    removed afterwards when every copy succeeded.

    :param RedisUniqueComponent source: building block
    :param bool delete_source: whether to delete source building block
    :returns: whether successful or not
    :rtype: bool
    """
    if self == source:
        return False

    if not self.is_open():
        return False

    failed = False

    for prop in ('title', 'desc', 'rec_type', 'recorded_at'):
        self._copy_prop(source, prop)
    #self._copy_prop(source, 'patch_rec')

    collection = self.get_owner()
    user = collection.get_owner()

    dest_dir = user.get_user_temp_warc_path()
    warc_hash_key = self.COLL_WARC_KEY.format(coll=collection.my_id)

    # Copy WARCs
    loader = BlockLoader()

    for name, src_url in source.iter_all_files(include_index=True):
        # timestamp suffix keeps copied filenames unique in the temp dir
        dest_path = os.path.join(dest_dir, name + '.' + timestamp20_now())

        stream = loader.load(src_url)

        try:
            with open(dest_path, 'wb') as out:
                print('Copying {0} -> {1}'.format(src_url, dest_path))
                shutil.copyfileobj(stream, out)
                num_bytes = out.tell()

            if name == self.INDEX_FILE_KEY:
                self.set_prop(name, dest_path)
            else:
                # WARC file: count its size against this recording
                self.incr_size(num_bytes)
                self.redis.hset(warc_hash_key, name,
                                add_local_store_prefix(dest_path))

        except:
            import traceback
            traceback.print_exc()
            failed = True

    # COPY cdxj, if exists
    self.redis.zunionstore(self.CDXJ_KEY.format(rec=self.my_id),
                           [self.CDXJ_KEY.format(rec=source.my_id)])

    # recreate pages, if any, in new recording
    source_coll = source.get_owner()
    collection.import_pages(source_coll.list_rec_pages(source), self)

    # COPY remote archives, if any
    self.redis.sunionstore(self.RA_KEY.format(rec=self.my_id),
                           self.RA_KEY.format(rec=source.my_id))

    # COPY recording warc keys
    self.redis.sunionstore(self.REC_WARC_KEY.format(rec=self.my_id),
                           self.REC_WARC_KEY.format(rec=source.my_id))

    # sync collection cdxj, if exists
    collection.sync_coll_index(exists=True, do_async=True)

    if not failed and delete_source:
        collection = source.get_owner()
        collection.remove_recording(source, delete=True)

    return not failed
def get_client_url(self, target_url):
    """Get client URL.

    Normalizes path separators to '/' and prepends the local store prefix.

    :param str target_url: target URL
    :returns: client URL
    :rtype: str
    """
    return add_local_store_prefix(target_url.replace(os.path.sep, '/'))