def _test():
    import sys
    import os
    # Act like gzip; with -d, act like gunzip.
    # The input file is not deleted, however, nor are any other gzip
    # options or features supported.
    args = sys.argv[1:]
    decompress = args and args[0] == "-d"
    if decompress:
        arg = args[1]
    else:
        arg = args[0]
    # if not args:
    #     args = ["-"]
    if decompress:
        tsize = 0
        if arg != "-":
            # outf = arg + ".dcp"
            outf = "/dev/null"
            fh = open(outf, "wb")
            gh = mgzip.open(arg, "rb")
            t0 = time.time()
            # gh.show_index()
            # data = b"AAA"
            chunk_size = 10**7
            while True:
                data = gh.read(chunk_size)
                # data = gh.readline()
                if not data:
                    break
                fh.write(data)
                tsize += len(data)
                # data = gh.readline()
            t1 = time.time()
            fh.close()
            gh.close()
            size = tsize / (1024**2)
            seconds = t1 - t0
            speed = size / seconds
            nsize = os.stat(arg).st_size
            print(
                "Decompressed {:.2f} MB data in {:.2f} S, Speed: {:.2f} MB/s, Rate: {:.2f} %"
                .format(size, seconds, speed, nsize / tsize * 100))
    else:
        if arg != "-":
            outf = arg + ".gz"
            fh = open(arg, "rb")
            gh = mgzip.open(outf, "wb", compresslevel=6)
            data = fh.read()
            t0 = time.time()
            gh.write(data)
            gh.close()
            t1 = time.time()
            size = len(data) / (1024**2)
            seconds = t1 - t0
            speed = size / seconds
            nsize = os.stat(outf).st_size
            print(
                "Compressed {:.2f} MB data in {:.2f} S, Speed: {:.2f} MB/s, Rate: {:.2f} %"
                .format(size, seconds, speed, nsize / len(data) * 100))
def compress_gzip(target_fullpath_w_filename):
    output_fullpath_w_filename = target_fullpath_w_filename + ".gz"
    target_filestream = open(target_fullpath_w_filename, "rb")
    output_gz = mgzip.open(output_fullpath_w_filename, "wb", compresslevel=9)
    data = target_filestream.read()
    output_gz.write(data)
    output_gz.close()
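# A minimal variant of compress_gzip above (hypothetical helper, not part of the
# original source): the same mgzip call wrapped in context managers so that both
# the input stream and the gzip output are closed even if an error occurs.
def compress_gzip_ctx(target_fullpath_w_filename):
    output_fullpath_w_filename = target_fullpath_w_filename + ".gz"
    with open(target_fullpath_w_filename, "rb") as target_filestream, \
            mgzip.open(output_fullpath_w_filename, "wb", compresslevel=9) as output_gz:
        output_gz.write(target_filestream.read())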
def writeOutPredictionDict(self, dumping_data, outfilename):
    if not str(outfilename).endswith('.bin.gz'):
        outfilename = os.path.splitext(outfilename)[0] + '.bin.gz'
    with mgzip.open(outfilename, 'wb', thread=8, blocksize=2 * 10**7) as f2:
        pickle.dump(dumping_data, f2)
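# Sketch of a matching reader for the .bin.gz files written above (an assumed
# counterpart, not taken from the original class): mgzip reads multi-member gzip
# files transparently, so a plain 'rb' open plus pickle.load is enough.
def readPredictionDict(infilename):
    with mgzip.open(infilename, 'rb') as f:
        return pickle.load(f)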
def test_read_rb(tmpdir):
    filename = os.path.join(tmpdir, "test.gz")
    with gzip.open(filename, "wb") as f1:
        f1.write(DATA1 * 500)
    with mgzip.open(filename, "rb") as f2:
        file_content = f2.read()
    assert file_content == DATA1 * 500
def _cmp(fnm, lvl):
    # print('>>>' + fnm)
    fh = open(fnm, "rb")
    # gh = mgzip.open(fnm + ".gz", "wb", compresslevel=lvl)  # thread=8, blocksize=2*10**8
    # gh = gzip.open(fnm + ".gz", "wb", compresslevel=lvl)
    gh = mgzip.open(fnm + ".gz", "wb", compresslevel=lvl, blocksize=10**6)
    data = fh.read()
    gh.write(data)
    gh.close()
def main(files, pdfpath, dumppath, soft, database_table_prefix, run_for=-1):
    global dataset_analysis_dict, fake_max_iou_values
    file_index = 0
    t1 = time.time()
    for file in files:
        print("\nFILE\n", file_index)
        with gzip.open(file, 'rb') as f:
            data_loaded = pickle.load(f)
            # print("XYZ", len(data_dict['features']), len(data_dict['predicted']), len(data_dict['truth']))
            file_results = analyse_multiple_endcaps_multi_cpu(
                data_loaded,
                soft=soft,
                beta_threshold=beta_threshold,
                distance_threshold=distance_threshold,
                iou_threshold=iou_threshold)
            for r in file_results:
                append_endcap_dict_to_dataset_dict(dataset_analysis_dict, r)
            # analyse_one_file(data_loaded, soft=soft)
        if file_index == run_for - 1:
            break
        file_index += 1

    print("It took", time.time() - t1, "seconds")

    if len(dumppath) > 0:
        print("Dumping analysis to bin file", dumppath)
        with mgzip.open(dumppath, 'wb', thread=8, blocksize=2 * 10**7) as f:
            pickle.dump(dataset_analysis_dict, f)
    else:
        print(
            "WARNING: No analysis output path specified. Skipped dumping of analysis."
        )

    # print("Number of total fakes is ", num_total_fakes)
    # np.savetxt('max_fake_iou.txt', fake_max_iou_values, delimiter=',')
    # 0/0

    plotter = HGCalAnalysisPlotter()
    plotter.add_data_from_analysis_dict(dataset_analysis_dict)
    if len(pdfpath) != 0:
        plotter.write_to_pdf(pdfpath)

    if len(database_table_prefix) != 0:
        print("Will write plots to database")
        database_manager = ExperimentDatabaseManager(
            mysql_credentials=sql_credentials.credentials, cache_size=40)
        database_manager.set_experiment('analysis_plotting_experiments')
        plotter.write_data_to_database(database_manager, database_table_prefix)
        database_manager.close()
def compress(file_path, np=1):
    """Compress a file in .gz."""
    # Note: this will open the whole file in memory
    # this might not be the best idea
    with open(file_path, 'r') as fr:
        file_string = ''.join(fr.readlines())
        fr.close()
    with mgzip.open(f'{file_path}.gz', "wt", thread=np) as fw:
        fw.write(file_string)
        fw.close()
    os.remove(file_path)
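# The comment in compress() above notes that the whole file is read into memory.
# A minimal sketch of a chunked alternative (hypothetical, assuming the same
# mgzip "wt" text mode): data is copied to the compressor in fixed-size pieces.
def compress_streaming(file_path, np=1, chunk_size=2**20):
    with open(file_path, 'r') as fr, mgzip.open(f'{file_path}.gz', "wt", thread=np) as fw:
        while True:
            chunk = fr.read(chunk_size)
            if not chunk:
                break
            fw.write(chunk)
    os.remove(file_path)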
def gunzip_file(
    input_filename: str,
    output_filename: str,
    blocksize: int = 5 * 1024 * 1024,
    threads: Optional[int] = None,
) -> str:
    """Gunzip a file using mgzip for multithreading."""
    with open(output_filename, mode="wb") as f_out:
        with mgzip.open(input_filename, mode="rb", blocksize=blocksize,
                        thread=threads) as f_in:
            shutil.copyfileobj(f_in, f_out, length=blocksize // 2)
    return output_filename
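# Illustrative call for gunzip_file above; the file names are made up.
# It decompresses dump.jsonl.gz into dump.jsonl with 4 mgzip threads and
# returns the output path.
if __name__ == "__main__":
    decompressed_path = gunzip_file("dump.jsonl.gz", "dump.jsonl", threads=4)
    print(decompressed_path)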
def test_write_wb(tmpdir):
    filename = os.path.join(tmpdir, "test.gz")
    with mgzip.open(filename, "wb", compresslevel=6) as f1:
        f1.write(DATA1 * 50)
        # Try flush and fileno.
        f1.flush()
        f1.fileno()
        if hasattr(os, "fsync"):
            os.fsync(f1.fileno())
        f1.close()
    f1.close()
    assert os.path.exists(filename)
    with gzip.open(filename, "rb") as f2:
        file_content = f2.read()
    assert file_content == DATA1 * 50
def make_map(self, file_):
    """construct a dictionary of pid - (batchnum, line#)"""
    file_, allowed_pids = file_
    use_allowed_pids = len(allowed_pids) > 0
    batchnum = int(file_.name.replace(".jsonl.gz", ""))
    pid2idx = {}
    result = set()
    with gzip.open(str(file_), "r") as f:
        for idx, line in enumerate(f):
            pid = re.search(PID_PAT, line.decode("utf8")).group(1)
            result.add(pid)
            pid2idx[pid] = idx
    if use_allowed_pids:
        result = list(result.intersection(allowed_pids))
    for pid in result:
        self.results[pid] = (batchnum, pid2idx[pid])
def make_map(self, file_):
    """construct a dictionary of pid - (batchnum, line#)"""
    file_, allowed_pids = file_
    use_allowed_pids = len(allowed_pids) > 0
    batchnum = int(
        file_.name.replace(".jsonl.gz", "").replace("pdf_parses_", ""))
    pid2idx = {}
    result = set()
    with gzip.open(str(file_), "r") as f:
        for idx, line in enumerate(f):
            obj = json.loads(line)
            result.add(obj["paper_id"])
            pid2idx[obj["paper_id"]] = idx
    if use_allowed_pids:
        result = list(result.intersection(allowed_pids))
    for pid in result:
        self.results[pid] = (batchnum, pid2idx[pid])
def backup(path: str, callback=None, block_size=1024 * 1024):
    """
    Performs a gzipped copy of the device containing / and /boot to an external drive.

    :param path: Path to the destination
    :param callback: Optional callback function for progress reporting
    :param block_size: amount of data in bytes to read/write at a time
    :return: Tuple of 2 strings: destination-path/file and elapsed backup time (HH:MM:SS)
    """
    device_size = shutil.disk_usage('/')[0] + shutil.disk_usage('/boot')[0]
    # device_size = 100*1024*1024
    now = datetime.datetime.now().strftime('%Y%m%dT%H%M%S')
    zip_file = f"{os.uname()[1]}_{round(device_size / 1000000000)}GB_{now}.gz"
    copied = 0
    start = int(time.time())
    try:
        with open(disk, 'rb') as file_in, \
                mgzip.open(f"{path}/{zip_file}", 'wb', blocksize=block_size) as file_out:
            while run_backup:
                block = file_in.read(block_size)
                # if not block:
                if not block or copied >= device_size:
                    break
                file_out.write(block)
                copied += block_size
                if callback:
                    # Make sure copied doesn't exceed device_size,
                    # which will likely happen on the last block.
                    callback(min(max(copied, 0), device_size), total=device_size)
    except IOError:
        # print("I/O ERROR({0}): {1}".format(e.errno, e.strerror),
        #       file=sys.stderr)
        return f"{path}/{zip_file}", None
    # except:  # handle other exceptions such as attribute errors
    #     print("Unexpected error:", sys.exc_info()[0])
    if os.path.isfile(f"{path}/{zip_file}") and not run_backup:
        os.remove(f"{path}/{zip_file}")
        return f"{path}/{zip_file}", None
    end = int(time.time())
    elapsed = end - start
    return f"{path}/{zip_file}", f"{datetime.timedelta(seconds=elapsed)}"
def __createTensileBenchmarkContainer(baseImage, dockerFilePath, tag, outDir,
                                      logDir, tensileFork, tensileBranch,
                                      tensileCommit):
    """
    Build a docker container with a specific ROCm image, Tensile branch and tag.
    Docker will pre-build the Tensile Client and configure the container to run the benchmark.
    """
    # Save stdout and stderr to file
    with open(os.path.join(logDir, "dockerBuildLog.log"), 'w') as logFile:

        # Docker build command
        buildCmd = str("\
            docker build \
            -t {0} \
            --pull -f {1} \
            --build-arg user_uid=$UID \
            --build-arg base_image={2} \
            --build-arg tensile_fork={3} \
            --build-arg tensile_branch={4} \
            --build-arg tensile_commit={5} \
            . ").format(tag, dockerFilePath, baseImage, tensileFork,
                        tensileBranch, tensileCommit)

        # Build container and save output streams
        print("Building docker image: {0} ...".format(tag))
        print(buildCmd)
        subprocess.check_call(shlex.split(buildCmd), stdout=logFile, stderr=logFile)
        print("Done building docker image!")

        # Docker save command
        imageBaseName = tag.split(':')[0]
        archivePath = os.path.join(outDir, imageBaseName + str(".tar.gz"))
        saveCmd = str("docker save {0}").format(tag)

        # Docker will save .tar binary to stdout as long as it's not attached to console.
        # Pipe stdout into gzip to get smaller .tar.gz archive
        print("Saving docker image: {0} to {1} ...".format(tag, archivePath))
        with gzip.open(archivePath, 'wb') as zipFile:
            with subprocess.Popen(shlex.split(saveCmd), stdout=subprocess.PIPE,
                                  stderr=logFile) as proc:
                zipFile.write(proc.stdout.read())
        print("Done saving docker image!")
def filter_ids_complete(self, file_: Path):
    """go over the metadata and get the in/out-bound citation data for each
    VALID paper. Validity is determined by whether the paper has a proper
    grobid parse, plus the existence of at least one of in/out citations.

    This method accumulates data into the state, as it won't take so much space.
    Used by filtering/filter_ids.py.
    """
    file_, valid_pids, min_cite, max_cite, seed = file_
    # make sure
    random.seed(seed)
    with gzip.open(str(file_), "r") as f:
        # metadata_ID.jsonl.gz
        fname = int(
            file_.name.replace(".jsonl.gz", "").replace("metadata_", ""))
        self.results[fname] = []
        for line in f:
            obj = json.loads(line)
            # has_pdf_parse, has_pdf_parsed_(bib_entries, abstract, body_text)
            if (valid_pids.get(obj["paper_id"], False)
                    and obj["has_inbound_citations"]
                    and obj["has_outbound_citations"]
                    and len(obj["inbound_citations"]) > min_cite
                    and len(obj["outbound_citations"]) > min_cite):
                # filter valid citations
                ibc = [
                    pid for pid in obj["inbound_citations"]
                    if valid_pids.get(pid, False)
                ]
                obc = [
                    pid for pid in obj["outbound_citations"]
                    if valid_pids.get(pid, False)
                ]
                random.shuffle(ibc)
                random.shuffle(obc)
                if len(ibc) > 0 and len(obc) > 0:
                    self.results[fname].append(
                        (obj["paper_id"], ibc[:max_cite], obc[:max_cite]))
def filter_ids_text(self, file_: Path):
    """go over the metadata and get the list of paper_ids for each paper with
    pdf_parse text. Validity is determined by whether the paper has a proper
    grobid parse. Used to create a list of parseable papers.

    This method accumulates data into the state, as it won't take so much space.
    Used by filtering/filter_ids.py.
    """
    # ignore irrelevant entry (cf. filter_ids.py)
    file_, _ = file_
    with gzip.open(str(file_), "r") as f:
        # metadata_ID.jsonl.gz
        fname = int(
            file_.name.replace(".jsonl.gz", "").replace("metadata_", ""))
        self.results[fname] = []
        for line in f:
            obj = json.loads(line)
            if (obj["has_pdf_parse"] and obj["has_pdf_parsed_abstract"]
                    and obj["has_pdf_parsed_bib_entries"]
                    and obj["has_pdf_parsed_body_text"]):
                self.results[fname].append(obj["paper_id"])
def test_pool_close(tmpdir):
    filename = os.path.join(tmpdir, "test.gz")
    fh = mgzip.open(filename, "wb", compresslevel=6, thread=4, blocksize=128)
    fh.write(DATA1 * 500)
    if sys.version_info >= (3, 8):
        assert (repr(fh.pool) ==
                "<multiprocessing.pool.ThreadPool state=RUN pool_size=4>")
    fh.close()
    assert fh.fileobj is None
    assert fh.myfileobj is None
    assert fh.pool_result == []
    if sys.version_info >= (3, 8):
        assert (repr(fh.pool) ==
                "<multiprocessing.pool.ThreadPool state=CLOSE pool_size=4>")
    if sys.version_info >= (3, 7):
        with pytest.raises(ValueError) as excinfo:
            fh.pool.apply(print, ("x", ))
        assert "Pool not running" in str(excinfo.value)
    else:
        with pytest.raises(AssertionError) as excinfo:
            fh.pool.apply(print, ("x", ))
        assert "" == str(excinfo.value)
def open(cls,
         column_names: typing.List[str],
         file_path: typing.Optional[Path],
         who: str = "output",
         require_all_columns: bool = True,
         prohibit_extra_columns: bool = True,
         fill_missing_columns: bool = False,
         error_file: typing.TextIO = sys.stderr,
         header_error_action: ValidationAction = ValidationAction.EXIT,
         use_mgzip: bool = False,
         mgzip_threads: int = MGZIP_THREAD_COUNT_DEFAULT,
         gzip_in_parallel: bool = False,
         gzip_queue_size: int = GZIP_QUEUE_SIZE_DEFAULT,
         column_separator: str = KgtkFormat.COLUMN_SEPARATOR,
         mode: Mode = Mode.AUTO,
         output_format: typing.Optional[str] = None,
         output_column_names: typing.Optional[typing.List[str]] = None,
         old_column_names: typing.Optional[typing.List[str]] = None,
         new_column_names: typing.Optional[typing.List[str]] = None,
         verbose: bool = False,
         very_verbose: bool = False) -> "KgtkWriter":

    if file_path is None or str(file_path) == "-":
        if verbose:
            print("KgtkWriter: writing stdout", file=error_file, flush=True)
        if output_format is None:
            output_format = cls.OUTPUT_FORMAT_DEFAULT
        return cls._setup(
            column_names=column_names, file_path=None, who=who,
            file_out=sys.stdout,
            require_all_columns=require_all_columns,
            prohibit_extra_columns=prohibit_extra_columns,
            fill_missing_columns=fill_missing_columns,
            error_file=error_file, header_error_action=header_error_action,
            use_mgzip=use_mgzip, mgzip_threads=mgzip_threads,
            gzip_in_parallel=gzip_in_parallel, gzip_queue_size=gzip_queue_size,
            column_separator=column_separator, mode=mode,
            output_format=output_format,
            output_column_names=output_column_names,
            old_column_names=old_column_names,
            new_column_names=new_column_names,
            verbose=verbose, very_verbose=very_verbose,
        )

    if str(file_path).startswith(">"):
        fd: int = int(str(file_path)[1:])
        if verbose:
            print("%s: writing file descriptor %d" % (who, fd),
                  file=error_file, flush=True)
        if output_format is None:
            output_format = cls.OUTPUT_FORMAT_DEFAULT
        return cls._setup(
            column_names=column_names, file_path=file_path, who=who,
            file_out=open(fd, "w"),
            require_all_columns=require_all_columns,
            prohibit_extra_columns=prohibit_extra_columns,
            fill_missing_columns=fill_missing_columns,
            error_file=error_file, header_error_action=header_error_action,
            use_mgzip=use_mgzip, mgzip_threads=mgzip_threads,
            gzip_in_parallel=gzip_in_parallel, gzip_queue_size=gzip_queue_size,
            column_separator=column_separator, mode=mode,
            output_format=output_format,
            output_column_names=output_column_names,
            old_column_names=old_column_names,
            new_column_names=new_column_names,
            verbose=verbose, very_verbose=very_verbose,
        )

    if verbose:
        print("File_path.suffix: %s" % file_path.suffix, file=error_file, flush=True)

    if file_path.suffix in [".gz", ".bz2", ".xz", ".lz4"]:
        # TODO: find a better way to coerce typing.IO[Any] to typing.TextIO
        gzip_file: typing.TextIO
        if file_path.suffix == ".gz":
            if use_mgzip:
                if verbose:
                    print("KgtkWriter: writing gzip with %d threads: %s" % (mgzip_threads, str(file_path)),
                          file=error_file, flush=True)
                import mgzip
                gzip_file = mgzip.open(str(file_path), mode="wt", thread=mgzip_threads)  # type: ignore
            else:
                if verbose:
                    print("KgtkWriter: writing gzip %s" % str(file_path), file=error_file, flush=True)
                import gzip
                gzip_file = gzip.open(file_path, mode="wt")  # type: ignore

        elif file_path.suffix == ".bz2":
            if verbose:
                print("KgtkWriter: writing bz2 %s" % str(file_path), file=error_file, flush=True)
            import bz2
            gzip_file = bz2.open(file_path, mode="wt")  # type: ignore

        elif file_path.suffix == ".xz":
            if verbose:
                print("KgtkWriter: writing lzma %s" % str(file_path), file=error_file, flush=True)
            import lzma
            gzip_file = lzma.open(file_path, mode="wt")  # type: ignore

        elif file_path.suffix == ".lz4":
            if verbose:
                print("KgtkWriter: writing lz4 %s" % str(file_path), file=error_file, flush=True)
            import lz4  # type: ignore
            gzip_file = lz4.frame.open(file_path, mode="wt")  # type: ignore

        else:
            # TODO: throw a better exception.
            raise ValueError("Unexpected file_path.suffix = '%s'" % file_path.suffix)

        if output_format is None:
            if len(file_path.suffixes) < 2:
                output_format = cls.OUTPUT_FORMAT_DEFAULT
            else:
                format_suffix: str = file_path.suffixes[-2]
                if format_suffix == ".md":
                    output_format = cls.OUTPUT_FORMAT_MD
                elif format_suffix == ".csv":
                    output_format = cls.OUTPUT_FORMAT_CSV
                elif format_suffix == ".json":
                    output_format = cls.OUTPUT_FORMAT_JSON
                elif format_suffix == ".jsonl":
                    output_format = cls.OUTPUT_FORMAT_JSONL
                else:
                    output_format = cls.OUTPUT_FORMAT_DEFAULT

        return cls._setup(
            column_names=column_names, file_path=file_path, who=who,
            file_out=gzip_file,
            require_all_columns=require_all_columns,
            prohibit_extra_columns=prohibit_extra_columns,
            fill_missing_columns=fill_missing_columns,
            error_file=error_file, header_error_action=header_error_action,
            use_mgzip=use_mgzip, mgzip_threads=mgzip_threads,
            gzip_in_parallel=gzip_in_parallel, gzip_queue_size=gzip_queue_size,
            column_separator=column_separator, mode=mode,
            output_format=output_format,
            output_column_names=output_column_names,
            old_column_names=old_column_names,
            new_column_names=new_column_names,
            verbose=verbose, very_verbose=very_verbose,
        )

    else:
        if output_format is None:
            if file_path.suffix == ".md":
                output_format = cls.OUTPUT_FORMAT_MD
            elif file_path.suffix == ".csv":
                output_format = cls.OUTPUT_FORMAT_CSV
            elif file_path.suffix == ".json":
                output_format = cls.OUTPUT_FORMAT_JSON
            elif file_path.suffix == ".jsonl":
                output_format = cls.OUTPUT_FORMAT_JSONL
            else:
                output_format = cls.OUTPUT_FORMAT_DEFAULT

        if verbose:
            print("KgtkWriter: writing file %s" % str(file_path), file=error_file, flush=True)
        return cls._setup(
            column_names=column_names, file_path=file_path, who=who,
            file_out=open(file_path, "w"),
            require_all_columns=require_all_columns,
            prohibit_extra_columns=prohibit_extra_columns,
            fill_missing_columns=fill_missing_columns,
            error_file=error_file, header_error_action=header_error_action,
            use_mgzip=use_mgzip, mgzip_threads=mgzip_threads,
            gzip_in_parallel=gzip_in_parallel, gzip_queue_size=gzip_queue_size,
            column_separator=column_separator, mode=mode,
            output_format=output_format,
            output_column_names=output_column_names,
            old_column_names=old_column_names,
            new_column_names=new_column_names,
            verbose=verbose, very_verbose=very_verbose,
        )
def analyse(preddir, pdfpath, beta_threshold, distance_threshold, iou_threshold,
            matching_mode, analysisoutpath, nfiles, local_distance_scaling,
            is_soft, op, de_e_cut, angle_cut, kill_pu=True):
    hits2showers = OCHits2Showers(beta_threshold, distance_threshold, is_soft,
                                  local_distance_scaling, op=op)
    showers_matcher = ShowersMatcher(matching_mode, iou_threshold, de_e_cut, angle_cut)

    files_to_be_tested = [os.path.join(preddir, x) for x in os.listdir(preddir)
                          if x.endswith('.bin.gz')]
    if nfiles != -1:
        files_to_be_tested = files_to_be_tested[0:min(nfiles, len(files_to_be_tested))]

    showers_dataframe = pd.DataFrame()
    event_id = 0

    for i, file in enumerate(files_to_be_tested):
        print("Analysing file", i, file)
        with mgzip.open(file, 'rb') as f:
            file_data = pickle.load(f)
        for j, endcap_data in enumerate(file_data):
            print("Analysing endcap", j)
            stopwatch = time.time()
            features_dict, truth_dict, predictions_dict = endcap_data
            processed_pred_dict, pred_shower_alpha_idx = hits2showers.call(
                features_dict, predictions_dict)
            print('took', time.time() - stopwatch, 's for inference clustering')
            stopwatch = time.time()
            showers_matcher.set_inputs(
                features_dict=features_dict,
                truth_dict=truth_dict,
                predictions_dict=processed_pred_dict,
                pred_alpha_idx=pred_shower_alpha_idx
            )
            showers_matcher.process()
            print('took', time.time() - stopwatch, 's to match')
            stopwatch = time.time()
            dataframe = showers_matcher.get_result_as_dataframe()
            print('took', time.time() - stopwatch, 's to make data frame')
            dataframe['event_id'] = event_id
            event_id += 1
            if kill_pu:
                from globals import pu
                if len(dataframe[dataframe['truthHitAssignementIdx'] >= pu.t_idx_offset]):
                    print('\nWARNING REMOVING PU TRUTH MATCHED SHOWERS, HACK.\n')
                    dataframe = dataframe[dataframe['truthHitAssignementIdx'] < pu.t_idx_offset]
            showers_dataframe = pd.concat((showers_dataframe, dataframe))

    # This is only to write to pdf files
    scalar_variables = {
        'beta_threshold': str(beta_threshold),
        'distance_threshold': str(distance_threshold),
        'iou_threshold': str(iou_threshold),
        'matching_mode': str(matching_mode),
        'is_soft': str(is_soft),
        'de_e_cut': str(de_e_cut),
        'angle_cut': str(angle_cut),
    }

    if len(analysisoutpath) > 0:
        analysis_data = {
            'showers_dataframe': showers_dataframe,
            'events_dataframe': None,
            'scalar_variables': scalar_variables,
        }
        with gzip.open(analysisoutpath, 'wb') as f:
            print("Writing dataframes to pickled file", analysisoutpath)
            pickle.dump(analysis_data, f)

    if len(pdfpath) > 0:
        plotter = HGCalAnalysisPlotter()
        plotter.set_data(showers_dataframe, None, '', pdfpath,
                         scalar_variables=scalar_variables)
        plotter.process()
def _fetch(self):
    training_performance_metrics = None
    if not self.ignore_cache:
        if os.path.exists(self.cache_path):
            with mgzip.open(self.cache_path, 'rb') as f:
                dumping_data = pickle.load(f)
                print(dumping_data['experiment_name'])
                if dumping_data['experiment_name'] == self.experiment_name:
                    training_performance_metrics = dumping_data['data']
                    print("Loaded data from cache...")
                else:
                    print(
                        "Cache doesn't contain this experiment, will have to re-fetch."
                    )

    condition_string = None
    if training_performance_metrics is not None:
        old_exp_names = np.unique(
            training_performance_metrics['experiment_name']).tolist()
        old_max_iterations = [
            np.max(
                np.array(training_performance_metrics['iteration'])[
                    np.char.equal(
                        training_performance_metrics['experiment_name'],
                        expn)]) for expn in old_exp_names
        ]
        condition_string = '(%s)' % ' OR '.join([
            "(experiment_name='%s' and iteration > '%d')" % (exp_n, iteration)
            for exp_n, iteration in zip(old_exp_names, old_max_iterations)
        ])
        # condition_string = '(%s)' % condition_string

    if self.experiment_name is not None:
        experiment_name = str(self.experiment_name).split(',')
        if len(experiment_name) == 1:
            experiment_name = experiment_name[0]
    else:
        experiment_name = self.experiment_name

    if _debug:
        print("Going to fetch from server")
    new_data = self.reading_manager.get_data(
        '%s' % self.database_table_name,
        experiment_names=experiment_name,
        condition_string=condition_string)
    if _debug:
        print("Fetch from server complete")

    if new_data is not None and training_performance_metrics is not None:
        training_performance_metrics = self._combine(
            training_performance_metrics, new_data)
    elif new_data is not None and training_performance_metrics is None:
        training_performance_metrics = new_data

    if not self.ignore_cache:
        with mgzip.open(self.cache_path, 'wb') as f:
            dumping_data = {
                'experiment_name': self.experiment_name,
                'data': training_performance_metrics
            }
            pickle.dump(dumping_data, f)

    if training_performance_metrics is None:
        print(
            "Experiment not found, in your configured database, the following experiments were found:"
        )
        available_experiment_names = self.reading_manager.get_data_from_query(
            'SELECT DISTINCT(experiment_name) FROM %s' % self.database_table_name)
        available_experiment_names = [
            x[0] for x in available_experiment_names
        ]
        print(available_experiment_names)
        raise TrainingMetricPlots.ExperimentNotFoundError(
            "Experiment not found in your configured database")

    self.training_performance_metrics = training_performance_metrics
def bundle(self,
           firmware_hash: str,
           datasets: List[Dataset],
           *,
           file: Union[str, BinaryIO, IO[bytes]],
           shard_spec: ShardSpec = None,
           delta_to: Dict[str, str] = None,
           overwrite: bool = False) -> List[ObjectInfo]:
    """
    Builds a data bundle (*.tar.gz) for a firmware hash, including content from the
    specified datasets (FWAN plugin output locations)

    :param firmware_hash: The firmware hash to bundle
    :param datasets: The datasets to include in the bundle
    :param file: The output path or file-like-object to which the *.tar.gz output should be written
    :param shard_spec: If provided, only the specified shard of file hashes will appear in the bundle
    :param delta_to: A dictionary of path->etag values, which if supplied will cause the bundle to be
                     built as a delta to that set, meaning only new objects or objects with modified
                     etags will appear in the bundle.
    :param overwrite: The output path will not be overwritten, to prevent accidental data loss,
                      unless this is set
    :return: A list of the object included in the bundle
    """
    if not firmware_hash:
        raise ValueError('firmware_hash must be specified')

    if not datasets:
        raise ValueError('datasets must be specified, and non-empty')

    if delta_to is None:
        delta_to = {}

    logger.info(
        f"Building {'delta' if delta_to else ''} bundle for {firmware_hash}"
    )

    contents: List[ObjectInfo] = []

    with mgzip.open(filename=file, mode='w' if overwrite else 'x') as gz, tarfile.open(
            fileobj=gz, mode='w', bufsize=tarfile.RECORDSIZE * 4) as tar:

        # Fetch and process the file tree, using that as the basis for all other
        # paths that need to be bundled.
        file_tree_path = f'file_tree/{firmware_hash}.jsonl'

        with CodeTimer('Read firmware file tree from object storage'):
            try:
                file_tree_result = fetch_object(
                    bucket=self.firmware_metadata_bucket,
                    key=file_tree_path)
            except ClientError as e:
                raise Exception(
                    'Firmware file tree could not be read') from e

        with CodeTimer('Extract file hashes from file tree'):
            try:
                file_hashes = extract_file_hashes(file_tree_result.payload)
            except json.JSONDecodeError as e:
                raise Exception(
                    'Firmware file tree could not be parsed') from e
            if not file_hashes:
                raise Exception('Firmware file tree is empty')

        file_tree_in_bundle = False
        if is_dataset_in_shard(
                dataset=FILE_TREE_DATASET, shard_spec=shard_spec
        ) and file_tree_result.info.etag != delta_to.get(file_tree_path):
            with CodeTimer('Add file tree to bundle'):
                add_to_tarfile(tar, file_tree_result)
            contents.append(file_tree_result.info)
            file_tree_in_bundle = True
        else:
            logger.info(
                'File tree is unchanged or not part of this shard and will not be included in the bundle'
            )

        file_tree_size = file_tree_result.info.size
        logger.info(
            'File tree num distinct file hashes = {hash_count}; size = {size}'
            .format(hash_count=len(file_hashes),
                    size=naturalsize(file_tree_size)))

        if shard_spec:
            with CodeTimer(f'Limiting file hashes to shard {shard_spec.index}'):

                def is_in_shard(file_hash: str) -> bool:
                    return int(file_hash, 16) % shard_spec.count == shard_spec.index

                file_hashes = [
                    file_hash for file_hash in file_hashes
                    if is_in_shard(file_hash)
                ]
            logger.info(f'Sharded num file hashes = {len(file_hashes)}')

        file_tree_result = None

        # Build paths to be bundled
        with CodeTimer('Build paths for bundle'):
            bundle_datasets = [
                ds for ds in datasets
                if is_dataset_in_shard(dataset=ds, shard_spec=shard_spec)
            ]
            paths = self.build_paths(firmware_hash=firmware_hash,
                                     datasets=bundle_datasets,
                                     file_hashes=file_hashes,
                                     delta_to=delta_to) or []
            path_count = len(paths)

        # Validate the paths (check for duplicates)
        with CodeTimer('Validate paths'):
            duplicates = [
                path for path, count in collections.Counter(paths).items()
                if count > 1
            ]
            if duplicates:
                raise Exception(
                    f'Bundle paths contained {len(duplicates)} duplicates: {duplicates}'
                )

        total_path_count = path_count + 1 if file_tree_in_bundle else 0
        logger.info(
            f'Bundle will include at most {total_path_count} paths from object storage'
        )

        fetch_count = 0
        miss_count = 0
        skip_count = 0
        fetch_bytes = 0

        with CodeTimer('Bundle objects'):
            with concurrent.futures.ThreadPoolExecutor(
                    max_workers=self.max_workers) as executor:
                fetch_start = datetime.datetime.now()
                with CodeTimer('Submit object storage path retrieval tasks'):
                    # Randomize path ordering to improve the performance of fetches from object storage,
                    # so that a diversity of object key prefixes is being fetched at any one time.
                    random.shuffle(paths)
                    futures = [
                        executor.submit(
                            fetch_object,
                            bucket=self.firmware_metadata_bucket,
                            key=path,
                            compare_etag=delta_to.get(path),
                        ) for path in paths
                    ]

                for future in concurrent.futures.as_completed(futures):
                    try:
                        result = future.result()
                        if result:
                            add_to_tarfile(tar, result)
                            result.payload = None
                            fetch_count += 1
                            fetch_bytes += result.info.size
                            contents.append(result.info)
                        else:
                            skip_count += 1
                        if fetch_count % 1000 == 0:
                            logger.info(
                                'Bundled {} objects ({}) in {}'.format(
                                    fetch_count, naturalsize(fetch_bytes),
                                    naturaldelta(datetime.datetime.now() -
                                                 fetch_start)))
                    except ClientError as e:
                        error_code = e.response.get('Error', {}).get('Code')
                        if 'NoSuchKey' in error_code:
                            miss_count += 1
                        elif '304' == error_code:
                            # The ETag on this object was not modified, so it was not returned
                            skip_count += 1
                        else:
                            raise e

        if skip_count:
            logger.info(f'Skipped {skip_count} unmodified objects')

        if miss_count:
            logger.info(
                f"Made {miss_count} attempts to access object storage paths that didn't exist"
            )

        logger.info('Bundled {} objects ({})'.format(
            fetch_count + (1 if file_tree_in_bundle else 0),
            naturalsize(fetch_bytes +
                        (file_tree_size if file_tree_in_bundle else 0))))

        # Validate fetched paths (check that each path was uniquely processed)
        with CodeTimer('Validate fetched paths'):
            fetched_path_counter = collections.Counter(
                [obj.path for obj in contents])
            duplicates = [
                path for path, count in fetched_path_counter.items()
                if count > 1
            ]
            if duplicates:
                raise Exception(
                    f'Bundle paths contained {len(duplicates)} duplicates: {duplicates}'
                )

    with CodeTimer('Finalize output'):
        contents = sorted(contents, key=lambda obj: obj.path)

    return contents
def get_vasp_dirs():
    ctx = click.get_current_context()
    run = ctx.parent.parent.params["run"]
    nmax = ctx.parent.params["nmax"]
    pattern = ctx.parent.params["pattern"]
    reorg = ctx.parent.params["reorg"]

    base_path = ctx.parent.params["directory"].rstrip(os.sep)
    base_path_index = len(base_path.split(os.sep))
    if pattern:
        pattern_split = pattern.split(os.sep)
        pattern_split_len = len(pattern_split)

    counter = 0
    for root, dirs, files in os.walk(base_path, topdown=True):
        if counter == nmax:
            break

        level = len(root.split(os.sep)) - base_path_index
        if pattern and dirs and pattern_split_len > level:
            p = pattern_split[level]
            dirs[:] = [d for d in dirs if fnmatch(d, p)]

        for d in dirs:
            dn = os.path.join(root, d)
            st = os.stat(dn)
            if not bool(st.st_mode & perms):
                raise EmmetCliError(
                    f"Insufficient permissions {st.st_mode} for {dn}.")

        if is_vasp_dir(files):
            gzipped = False
            for f in files:
                fn = os.path.join(root, f)
                if os.path.islink(fn):
                    if run:
                        os.unlink(fn)
                        logger.warning(f"Unlinked {fn}.")
                    else:
                        logger.warning(f"Would unlink {fn}.")
                    continue

                st = os.stat(fn)
                if not bool(st.st_mode & perms):
                    raise EmmetCliError(
                        f"Insufficient permissions {st.st_mode} for {fn}.")

                if run and not f.endswith(".gz"):
                    fn_gz = fn + ".gz"
                    if os.path.exists(fn_gz):
                        os.remove(fn_gz)  # remove left-over gz (cancelled job)

                    with open(fn, "rb") as fo, mgzip.open(fn_gz, "wb", thread=0) as fw:
                        fw.write(fo.read())

                    os.remove(fn)  # remove original
                    shutil.chown(fn_gz, group="matgen")
                    gzipped = True

            # NOTE skip symlink'ing on MP calculations from the early days
            vasp_dir = get_symlinked_path(root, base_path_index) if reorg else root
            create_orig_inputs(vasp_dir)
            dirs[:] = []  # don't descend further (i.e. ignore relax1/2)
            logger.log(logging.INFO if gzipped else logging.DEBUG, vasp_dir)
            yield vasp_dir
            counter += 1

    return counter
def run(
    input_file: KGTKFiles,
    output_file: KGTKFiles,
    entity_ids: typing.List[str],
    input_limit: int,
    output_limit: int,
    use_mgzip_for_input: bool,
    use_mgzip_for_output: bool,
    mgzip_threads_for_input: int,
    mgzip_threads_for_output: int,
):
    import simplejson as json
    import sys

    in_path = KGTKArgumentParser.get_input_file(input_file)
    out_path = KGTKArgumentParser.get_output_file(output_file)

    from gzip import GzipFile
    print("Processing.", file=sys.stderr, flush=True)

    # Open the input file first to make it easier to monitor with "pv".
    input_f: typing.Union[GzipFile, typing.IO[typing.Any]]
    if str(in_path) == "-":
        print('Processing wikidata from standard input', file=sys.stderr, flush=True)
        # It is not well documented, but this is how you read binary data
        # from stdin in Python 3.
        input_f = sys.stdin.buffer
    else:
        print('Processing wikidata file %s' % str(in_path), file=sys.stderr, flush=True)
        input_f = open(in_path, mode='rb')

        if str(in_path).endswith(".bz2"):
            import bz2
            print('Decompressing (bz2)', file=sys.stderr, flush=True)
            # TODO: Optionally use a system decompression program.
            input_f = bz2.open(input_f)

        elif str(in_path).endswith(".gz"):
            # TODO: Optionally use a system decompression program.
            if use_mgzip_for_input:
                import mgzip
                print('Decompressing (mgzip)', file=sys.stderr, flush=True)
                input_f = mgzip.open(input_f, thread=mgzip_threads_for_input)
            else:
                import gzip
                print('Decompressing (gzip)', file=sys.stderr, flush=True)
                input_f = gzip.open(input_f)

    # Open the output file next to make it easier to monitor with "pv".
    output_f: typing.Union[GzipFile, typing.IO[typing.Any]]
    if str(out_path) == "-":
        print('Sending wikidata JSON to standard output', file=sys.stderr, flush=True)
        # It is not well documented, but this is how you write binary data
        # to stdout in Python 3.
        output_f = sys.stdout.buffer
    else:
        print('Writing wikidata file %s' % str(out_path), file=sys.stderr, flush=True)
        output_f = open(out_path, mode='wb')

        if str(out_path).endswith(".bz2"):
            import bz2
            print('Compressing (bz2)', file=sys.stderr, flush=True)
            # TODO: Optionally use a system compression program.
            output_f = bz2.open(output_f, "wb")

        elif str(out_path).endswith(".gz"):
            # TODO: Optionally use a compression program.
            if use_mgzip_for_output:
                import mgzip
                print('Compressing (mgzip)', file=sys.stderr, flush=True)
                output_f = mgzip.open(output_f, "wb", thread=mgzip_threads_for_output)
            else:
                import gzip
                print('Compressing (gzip)', file=sys.stderr, flush=True)
                output_f = gzip.open(output_f, "wb")

    entity_id_set: typing.Set[str] = set(entity_ids)

    output_count: int = 0
    input_count: int
    line: bytes
    for input_count, line in enumerate(input_f):
        if input_limit and input_count >= input_limit:
            break
        clean_line = line.strip()
        if clean_line.endswith(b","):
            clean_line = clean_line[:-1]
        if len(clean_line) > 1:
            obj = json.loads(clean_line)
            entity = obj["id"]
            if entity in entity_id_set:
                if output_count == 0:
                    output_f.write(b"[\n")
                else:
                    output_f.write(b",\n")
                output_f.write(clean_line)
                output_count += 1
                if output_limit is not None and output_count >= output_limit:
                    break

    print('Done processing {}'.format(str(in_path)), file=sys.stderr, flush=True)
    input_f.close()

    if output_count > 0:
        output_f.write(b"\n]\n")
    output_f.close()

    print('Wrote {} records'.format(output_count), file=sys.stderr, flush=True)
    rate = orig_size / duration
    logger.info("Archived %s files (%sB) in %.3fs: %sB/s" %
                (file_count, numToReadable(orig_size), duration, numToReadable(rate)))

    write_manifest_file(out_manifest_file, manifest)

    start_time = time.time()
    blocksize = int(min(100 * 2**20, orig_size / threads))
    logger.info(
        "Compressing (%s threads, blocksize %sB) %s to %s" %
        (threads, numToReadable(blocksize), tarfile, destination_file))
    with mgzip.open(destination_file, "wb", thread=threads,
                    blocksize=blocksize) as my_gzip:
        with open(tarfile, "rb") as my_tar:
            my_gzip.write(my_tar.read())
    end_time = time.time()
    duration = end_time - start_time
    out_size = os.path.getsize(destination_file)
    rate = orig_size / duration
    logger.info("Compressed %sB into %sB in %.3fs: %sB/s" %
                (numToReadable(orig_size), numToReadable(out_size), duration,
                 numToReadable(rate)))
    os.unlink(tarfile)
def gather_abstracts(self, file_):
    file_, paper_ids, output_dir, valid_lines, pid2loc = file_
    target_batchnum = int(
        file_.name.replace(".jsonl.gz", "").replace("pdf_parses_", ""))

    # load citations
    bnum_bucketed_citations = {i: [] for i in range(100)}
    for pid, (bnum, lnum) in pid2loc.items():
        bnum_bucketed_citations[bnum].append((lnum, pid))

    # batch load citations instead of reading files on the fly for each citations
    pat = re.compile(r"paper_id\":\s+\"(\d+)\"")
    pid2citation = {}
    for bnum in range(100):
        cite_valid_lines = sorted(bnum_bucketed_citations[bnum], key=lambda x: x[0])
        pdf_parse = file_.parent / file_.name.replace(
            f"{target_batchnum}", f"{bnum}")
        f_idx, l_idx = 0, 0
        with gzip.open(str(pdf_parse), "rb", thread=8) as f:
            for f_idx, line in enumerate(f):
                if f_idx > cite_valid_lines[-1][0]:
                    break
                if f_idx != cite_valid_lines[l_idx][0]:
                    continue
                else:
                    l_idx += 1
                pid = re.search(pat, line.decode("utf8")).group(1)
                pid2citation[pid] = line

    result = []
    f_idx, l_idx = 0, 0
    with gzip.open(str(file_), "r", thread=8) as f:
        for f_idx, line in enumerate(f):
            if f_idx > valid_lines[-1]:
                break
            if f_idx != valid_lines[l_idx]:
                continue
            else:
                l_idx += 1

            # json parse takes a lot of time
            obj = json.loads(line)
            pid = obj["paper_id"]
            inb_pids, outb_pids = paper_ids[pid]

            # random pick strategy
            out_data = []
            for out_pid in outb_pids:
                out_obj = pid2citation.get(out_pid, None)
                if out_obj is not None:
                    out_obj = json.loads(out_obj)
                    out_abs = load_bg_info(out_pid, out_obj, content="abstract")
                    # target is the paper itself, not the citation
                    bginfo = load_bg_info(out_pid, obj, content="cite_context")
                    if bginfo is None:
                        bginfo = []
                    out_data.append((out_pid, bginfo, out_abs))
            # sort by number of times a paper is cited (i.e., number of matches.)
            out_data = sorted(out_data, key=lambda x: len(x[1]), reverse=True)

            # max 10, look up 20 in case there are bunch of None's
            in_data = []
            # TODO: better inbound citation selection strategy
            for in_pid in inb_pids:
                in_obj = pid2citation.get(in_pid, None)
                if in_obj is not None:
                    in_obj = json.loads(in_obj)
                    cite_context = load_bg_info(pid, in_obj, content="cite_context")
                    if cite_context != []:
                        in_data.append((in_pid, cite_context))

            result.append((in_data, out_data, obj))

    with (output_dir / f"{target_batchnum}.pkl").open("wb") as f:
        pickle.dump(result, f)
else:
    raise Exception('Error: couldn\'t locate output folder/file')

print(files_to_be_tested)

pdfpath = ''
if len(args.p) != 0:
    pdfpath = args.p

# TODO: Remove this
files_to_be_tested = files_to_be_tested[0:100]

if False:
    all_data = []
    for file in files_to_be_tested:
        print("Reading", file)
        with mgzip.open(file, 'rb') as f:
            data_loaded = pickle.load(f)
            all_data.append(data_loaded)

    analysed_graphs, metadata = matching_and_analysis.OCAnlayzerWrapper(
        metadata).analyse_from_data(all_data)
else:
    analysed_graphs, metadata = matching_and_analysis.OCAnlayzerWrapper(
        metadata).analyse_from_files(files_to_be_tested)

plotter = hp.TrackMLPlotter()
plotter.add_data_from_analysed_graph_list(analysed_graphs, metadata)
if len(pdfpath) > 0:
    plotter.write_to_pdf(pdfpath=pdfpath)

if len(args.analysisoutpath) != 0:
    with gzip.open(args.analysisoutpath, 'wb') as f:
        pickle.dump((analysed_graphs, metadata), f)
def ff():
    with open(FN, 'rb') as f_in:
        with mgzip.open(FN + '.mpgz', 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
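# Sketch of the reverse operation for ff() above (an assumed helper, not in the
# original snippet; the '.out' suffix is made up): stream the compressed .mpgz
# member back out to a plain file with shutil.copyfileobj.
def gg():
    with mgzip.open(FN + '.mpgz', 'rb') as f_in:
        with open(FN + '.out', 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)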