def ingest_files(self):
    """Copy self.files into the user's ingest directory on the CRDS server.

    When --keep-existing-files is specified, files already present on the
    server with the correct size are skipped.  Logs per-file progress and
    aggregate timing statistics.
    """
    stats = self._start_stats()
    destination = self.submission_info.ingest_dir
    # Only the path component is needed locally;  `destination` keeps host:path.
    _host, path = destination.split(":")
    total_size = utils.total_size(self.files)
    ingest_info = self.get_ingested_files()
    self.scan_for_nonsubmitted_ingests(ingest_info)
    remaining_files = self.keep_existing_files(ingest_info, self.files) \
        if self.args.keep_existing_files else self.files
    # Fix: progress denominator counts the files actually being copied,
    # not the full submission list (some may have been kept/skipped).
    for i, filename in enumerate(remaining_files):
        file_size = utils.file_size(filename)
        log.info("Copy started", repr(filename),
                 "[", i + 1, "/", len(remaining_files), " files ]",
                 "[", utils.human_format_number(file_size), "/",
                 utils.human_format_number(total_size), " bytes ]")
        self.copy_file(filename, path, destination)
        stats.increment("bytes", file_size)
        stats.increment("files", 1)
    stats.log_status("files", "Copy complete", len(self.files))
    stats.log_status("bytes", "Copy complete", total_size)
    log.divider(func=log.verbose)
    stats.report()
    log.divider(char="=")
def ingest_files(self):
    """Copy self.files into the user's ingest directory on the CRDS server."""
    stats = self._start_stats()
    destination = self.submission_info.ingest_dir
    host, path = destination.split(":")
    total_size = utils.total_size(self.files)
    ingest_info = self.get_ingested_files()
    self.scan_for_nonsubmitted_ingests(ingest_info)
    if self.args.keep_existing_files:
        to_copy = self.keep_existing_files(ingest_info, self.files)
    else:
        to_copy = self.files
    file_count = len(self.files)
    for index, name in enumerate(to_copy, start=1):
        nbytes = utils.file_size(name)
        log.info("Copy started", repr(name),
                 "[", index, "/", file_count, " files ]",
                 "[", utils.human_format_number(nbytes), "/",
                 utils.human_format_number(total_size), " bytes ]")
        self.copy_file(name, path, destination)
        stats.increment("bytes", nbytes)
        stats.increment("files", 1)
    stats.log_status("files", "Copy complete", file_count)
    stats.log_status("bytes", "Copy complete", total_size)
    log.divider(func=log.verbose)
    stats.report()
    log.divider(char="=")
def file_progress(activity, name, path, bytes, bytes_so_far, total_bytes, nth_file, total_files):
    """Output progress information for `activity` on file `name` at `path`."""
    # Assembled message is identical to the previous .format() version.
    return (
        f"{activity} {path!s:<55} "
        f"{utils.human_format_number(bytes)} bytes "
        f"({nth_file + 1} / {total_files} files) "
        f"({utils.human_format_number(bytes_so_far).strip()} / "
        f"{utils.human_format_number(total_bytes).strip()} bytes)"
    )
def keep_existing_files(self, ingest_info, files):
    """Keep files which have already been copied and have the correct server side
    length.  This can save *hours* of copy time for repeat submissions.

    Parameters
    ----------
    ingest_info : dict
        Maps basename -> server-side metadata;  entries are expected to carry
        "size" and "delete_url" keys (see get_ingested_files).
    files : list of str
        Local file paths;  mutated in place to drop already-ingested files.

    Returns
    -------
    list of str
        The (mutated) `files` list containing only files still to be copied.
    """
    # Iterate a copy because `files` is mutated inside the loop.
    for filename in files[:]:
        local_size = utils.file_size(filename)
        basename = os.path.basename(filename)
        try:
            existing_size = int(ingest_info[basename]["size"])
        except (KeyError, TypeError, ValueError):
            # Narrowed from a bare `except:` which also trapped
            # KeyboardInterrupt/SystemExit.  A missing entry or malformed
            # size means the file is not correctly ingested;  copy it.
            log.info(
                "File", repr(filename),
                "does not exist in ingest directory and will be copied to CRDS server."
            )
            continue
        if local_size == existing_size:
            log.info(
                "File", repr(filename),
                "has already been copied and has correct length on CRDS server",
                utils.human_format_number(existing_size))
            files.remove(filename)
        else:
            log.info(
                "File", repr(filename),
                "exists but has incorrect size and must be recopied. Deleting old ingest."
            )
            self.connection.get(ingest_info[basename]["delete_url"])
    return files
def _start_stats(self):
    """Helper method to initialize stats keeping for ingest."""
    stats = utils.TimingStats(output=log.verbose)
    stats.start()
    log.divider(name="ingest files", char="=")
    byte_count = utils.total_size(self.files)
    log.info("Copying", len(self.files), "file(s) totalling",
             utils.human_format_number(byte_count), "bytes")
    log.divider(func=log.verbose)
    return stats
def fetch_references(self, references):
    """Gets all references required to support `only_contexts`.  Removes
    all references from the CRDS reference cache which are not required for
    `only_contexts`.

    In --readonly-cache mode the cache is not modified;  instead the files
    and total bytes that *would* be downloaded are reported.
    """
    if not self.contexts:
        return
    if self.args.readonly_cache:
        already_have = set(rmap.list_references("*", self.observatory))
        # `already_have` is already a set;  the redundant set() wrap is gone.
        fetched = [
            x for x in sorted(set(references) - already_have)
            if not x.startswith("NOT FOUND")
        ]
        if fetched:
            log.info("READONLY CACHE would fetch references:", repr(fetched))
            # Size reporting is best-effort;  failures are logged, not fatal.
            with log.info_on_exception("Reference size information not available."):
                info_map = api.get_file_info_map(self.observatory, fetched, fields=["size"])
                total_bytes = api.get_total_bytes(info_map)
                log.info("READONLY CACHE would download", len(fetched),
                         "references totaling",
                         utils.human_format_number(total_bytes).strip(), "bytes.")
    else:
        self.dump_files(self.contexts[0], references)
def get_data_http(self, filename):
    """Yield the data returned from `filename` of `pipeline_context` in manageable chunks.

    Streams the file from its CRDS server URL in CRDS_DATA_CHUNK_SIZE pieces,
    logging transfer progress.  Raises CrdsDownloadError, chained to the
    underlying exception, on any failure.
    """
    url = self.get_url(filename)
    infile = None  # sentinel so `finally` knows whether the open succeeded
    try:
        infile = urlopen(url)
        file_size = utils.human_format_number(self.catalog_file_size(filename)).strip()
        stats = utils.TimingStats()
        data = infile.read(config.CRDS_DATA_CHUNK_SIZE)
        while data:
            stats.increment("bytes", len(data))
            status = stats.status("bytes")
            # status[0] ends with a unit word;  drop it to get the bare count.
            bytes_so_far = " ".join(status[0].split()[:-1])
            log.verbose("Transferred HTTP", repr(url), bytes_so_far, "/",
                        file_size, "bytes at", status[1], verbosity=20)
            yield data
            data = infile.read(config.CRDS_DATA_CHUNK_SIZE)
    except Exception as exc:
        # Chain explicitly so the original traceback is preserved as __cause__.
        raise CrdsDownloadError("Failed downloading", srepr(filename),
                                "from url", srepr(url), ":", str(exc)) from exc
    finally:
        # Replaces the old try/except UnboundLocalError cleanup, which could
        # also have masked a genuine UnboundLocalError raised by close().
        if infile is not None:
            infile.close()
def keep_existing_files(self, ingest_info, files):
    """Keep files which have already been copied and have the correct server side
    length.  This can save *hours* of copy time for repeat submissions.

    `ingest_info` maps basename -> server-side metadata with "size" and
    "delete_url" keys.  `files` (list of local paths) is mutated in place to
    drop files already correctly ingested and is also returned.
    """
    # Iterate a copy because `files` is mutated inside the loop.
    for filename in files[:]:
        local_size = utils.file_size(filename)
        basename = os.path.basename(filename)
        try:
            existing_size = int(ingest_info[basename]["size"])
        except (KeyError, TypeError, ValueError):
            # Narrowed from a bare `except:` which also trapped
            # KeyboardInterrupt/SystemExit.  Missing or malformed metadata
            # means the file must be copied.
            log.info("File", repr(filename),
                     "does not exist in ingest directory and will be copied to CRDS server.")
            continue
        if local_size == existing_size:
            log.info("File", repr(filename),
                     "has already been copied and has correct length on CRDS server",
                     utils.human_format_number(existing_size))
            files.remove(filename)
        else:
            log.info("File", repr(filename),
                     "exists but has incorrect size and must be recopied. Deleting old ingest.")
            self.connection.get(ingest_info[basename]["delete_url"])
    return files