def _get_data_list(self, data_dir: PosixPath) -> list:
    """Pair every image in ``data_dir`` with an annotation file.

    Images (``*.jpg``) and annotations (``*.json``) are each sorted by path
    and then paired positionally: the i-th image goes with the i-th
    annotation. ``zip`` stops at the shorter list, so surplus files on either
    side are silently ignored.

    Args:
        data_dir (PosixPath): data directory path

    Returns:
        list: [(image_path, anno_path), ...] where both entries are resolved,
        POSIX-style path strings
    """
    images = sorted(data_dir.glob('*.jpg'))
    annotations = sorted(data_dir.glob('*.json'))

    pairs = []
    for image, annotation in zip(images, annotations):
        pairs.append((image.resolve().as_posix(),
                      annotation.resolve().as_posix()))
    return pairs
def main(keyword: str, directory: pathlib.PosixPath, backup: bool,
         lowercase: bool):
    """Rename all pictures in a directory with a common keyword.

    The date from the metadata of the pictures is retrieved and concatenated
    to the keyword, followed by a counter to distinguish pictures taken the
    same day.

    Parameters
    ----------
    keyword : str
        Common keyword to use when renaming the pictures.
        The default value is the name of the current folder.
    directory : pathlib.PosixPath, default './'
        Directory containing the pictures to rename, default is the current
        directory.
    backup : bool, default False
        If flag is present, copy the pictures instead of renaming them.
    lowercase : bool
        Forwarded unchanged to ``RhinoBuilder``.
        NOTE(review): presumably lowercases the generated names - confirm
        against ``RhinoBuilder``.

    Examples
    --------
    $ rhinopics mykeyword
    -> mykeyword_20190621_001
    """
    # Process entries from the oldest modification time to the newest.
    entries = sorted(directory.glob('*'), key=os.path.getmtime)
    total = len(entries)

    # The counter is padded to as many digits as the entry count needs.
    builder = RhinoBuilder(len(str(total)), keyword, backup, lowercase)

    with tqdm(total=total) as progress:
        for entry in entries:
            renamer = builder.factory(entry)
            # The factory returns None for entries it does not handle.
            if renamer is not None:
                renamer.rename()
            progress.update()
def eval_benchmark_models(netcdf_folder: PosixPath, func: Callable) -> dict:
    """Evaluate benchmark models on specific metric function.

    Every ``*.nc`` file in ``netcdf_folder`` is opened, and each variable
    other than ``'QObs'`` is scored against ``'QObs'``. Time steps whose
    observation is negative are removed from both series before ``func`` is
    called. NaN metric values are printed (variable name and file) instead of
    being stored.

    Parameters
    ----------
    netcdf_folder : PosixPath
        Directory, containing basin-wise netcdf files, which contain the
        benchmark model simulations
    func : Callable
        The metric function to evaluate. Must satisfy the func(obs, sim)
        convention.

    Returns
    -------
    dict
        Dictionary, containing the metric values of each basin and each
        benchmark model, laid out as ``{model: {basin: value}}``. The basin
        key is the first 8 characters of the file name.
    """
    benchmark_models = defaultdict(dict)

    # Materialise the file list so tqdm can show a total.
    for nc_file in tqdm.tqdm(list(netcdf_folder.glob('*.nc'))):
        basin = nc_file.name[:8]

        # The context manager closes the file handle once the basin is done;
        # `.values` has already copied the data into memory by then.
        with xarray.open_dataset(nc_file) as dataset:
            for key in dataset.keys():
                if key == 'QObs':
                    continue

                obs = dataset['QObs'].values
                valid = obs >= 0  # negative observations are dropped
                value = func(obs[valid], dataset[key].values[valid])

                if np.isnan(value):
                    print(f"{key}: {nc_file}")
                else:
                    benchmark_models[key][basin] = value

    return benchmark_models
def check_file_exists(folder, fileglob="*"):
    """Pre-check if file(s) exist(s) for links in view templates.

    Args:
        folder: directory to search (anything ``Path`` accepts).
        fileglob: glob pattern evaluated inside ``folder``.

    Returns:
        bool: True as soon as one match is a regular file; False when nothing
        matches, when only non-files match, or when the lookup itself fails.
    """
    fldr = Path(folder)
    try:
        for f in fldr.glob(fileglob):
            current_app.logger.info('checking found fname: %s', f)
            if f.is_file():
                return True
    except Exception:
        # Best-effort check: any lookup failure counts as "not there".
        # KeyboardInterrupt / SystemExit are deliberately not swallowed.
        return False
    # Explicit False rather than falling off the end with None.
    return False
def get_run_qcreport_data(project_name,
                          run_dir,
                          qcreport_glob='*_QCreport*.csv',
                          qcreport_data_suffix='.data.csv',
                          data_header='GT_QC_Sample_ID,'):
    """Return the name of the run's QCreport data file, creating it if needed.

    If a file matching ``qcreport_glob`` already ends with
    ``qcreport_data_suffix`` its name is returned as is. Otherwise the
    QCreport file is read, everything from the first line containing
    ``data_header`` onwards is written to ``<report name minus last 4
    chars><qcreport_data_suffix>`` inside ``run_dir``, and that file name is
    returned.

    Args:
        project_name: not used by this function.
        run_dir: directory holding the QCreport file(s).
        qcreport_glob: glob selecting candidate QCreport files.
        qcreport_data_suffix: suffix marking an already-extracted data file.
        data_header: text that identifies the first data line.

    Returns:
        str: file name (not a full path) of the data file; ``""`` when no
        QCreport file was found.

    Raises:
        Exception: whatever locating or reading the QCreport file raises
        (logged, then re-raised). Failures while extracting or writing the
        data rows are logged only and the intended file name is still
        returned, as before.
    """
    current_app.logger.info('Getting QCreport data: %s', run_dir)
    qcr_lines = []
    qcr_data_file = ""
    qcr_data_path = ""

    try:
        run_path = Path(run_dir)
        for f in run_path.glob(qcreport_glob):
            if f.match('*' + qcreport_data_suffix):
                current_app.logger.info('QCreport data file found: %s', f.name)
                return f.name
            elif f.is_file():
                qcr_lines = read_file_text(f)
                # Drop the 4-character extension (".csv") before appending
                # the data suffix.
                qcr_data_file = f.name[:-4] + qcreport_data_suffix
                qcr_data_path = os.path.join(run_dir, qcr_data_file)
    except Exception:
        current_app.logger.exception('QCreport data file issues: %s', run_dir)
        raise

    try:
        qcr_text = ""
        for index, row in enumerate(qcr_lines):
            if data_header in row:
                qcr_text = os.linesep.join(qcr_lines[index:])
                break

        if qcr_text:
            current_app.logger.info(
                'QCreport data will be written to file: %s', qcr_data_path)
            with open(qcr_data_path, 'w') as qcd:
                qcd.write(qcr_text)

            # Read the file back and compare line counts with what was
            # written (lines against lines, not lines against characters).
            with open(qcr_data_path, 'r') as qcd:
                written = qcd.read()
            if len(written.splitlines()) == len(qcr_text.splitlines()):
                current_app.logger.info(
                    'QCreport data successfully read-checked.')
            else:
                current_app.logger.error(
                    'QCreport data not read successfully??')
    except Exception:
        # Kept best-effort: the previous `return` inside `finally` already
        # discarded this error, so callers never saw it. Log and carry on.
        current_app.logger.exception(
            'QCreport data row reading issues: %s', run_dir)

    current_app.logger.debug('QCreport data file: %s', qcr_data_file)
    return qcr_data_file
def find_empty(source, parser=nafparser.parse_coach):
    """Yield the ``*.html`` files in ``source`` that ``parser`` rejects.

    Each file's text is handed to ``parser``; the file's path is yielded when
    the parse result is falsy. If ``source`` is not a directory an error is
    logged and the generator ends without yielding anything.
    """
    LOG.debug("find invalid coaches in %s", source)
    path = PosixPath(source)

    if not path.is_dir():
        LOG.error("No such path %s", path)
        # Inside a generator this only ends the iteration.
        return []

    for html_path in path.glob("*.html"):
        if not parser(html_path.read_text()):
            yield html_path
def iter_target_dir(collection_path):
    """Yield ``(target_dir, alias_lines)`` for each integration target.

    Looks at every entry of ``<collection_path>/tests/integration/targets``
    and keeps the directories that contain an ``aliases`` file. The file's
    text is split on ``"\\n"``, so a trailing newline produces a final empty
    string.
    """
    targets_root = PosixPath(collection_path) / "tests" / "integration" / "targets"

    for entry in targets_root.glob("*"):
        alias_file = entry / "aliases"
        if entry.is_dir() and alias_file.is_file():
            yield (entry, alias_file.read_text().split("\n"))
def _read_dir(dir: PosixPath):
    """Collect ``*.wav`` paths from ``dir`` that pass the duration check.

    A file is kept when ``DataLoader.check_sec(path, min_sec)`` is truthy.
    Collection stops as soon as ``n_max`` files have been gathered.
    ``min_sec`` and ``n_max`` are taken from the surrounding scope, which is
    not visible in this block.

    Returns:
        list of str: the accepted file paths, in glob order.
    """
    accepted = []
    for wav_path in dir.glob('*.wav'):
        candidate = str(wav_path)
        if DataLoader.check_sec(candidate, min_sec):
            accepted.append(candidate)
            if len(accepted) == n_max:
                break
    return accepted
def zip_last_n_files(directory: PosixPath = LOG_DIR,
                     zip_file: str = ZIP_FILE,
                     n: int = 3):
    """Zip the ``n`` most recently changed ``*.log`` files of ``directory``.

    Files are ordered by ``os.path.getctime`` and the last ``n`` are written
    to ``zip_file`` (created or overwritten), oldest of the selection first.
    Inside the archive each file is named ``<stem>_<YYYY-MM-DD>.log`` where
    the date is the file's ctime in local time.

    Args:
        directory: folder to scan for ``*.log`` files.
        zip_file: path of the archive to write.
        n: number of files to include; ``n <= 0`` produces an empty archive.
    """
    logs = sorted(directory.glob('*.log'), key=os.path.getctime)
    # Guard n == 0: a plain `logs[-0:]` would select every file.
    newest = logs[-n:] if n > 0 else []

    with ZipFile(zip_file, 'w') as new_zip:
        for log in newest:
            # Build the date straight from the timestamp; no detour through
            # a formatted string that then has to be parsed again.
            created = datetime.fromtimestamp(os.path.getctime(log))
            arcname = log.stem + '_' + created.strftime('%Y-%m-%d') + '.log'
            new_zip.write(str(log), arcname=arcname)
def get_all_targets(collection_path):
    """Return all the targets from a directory.

    Scans ``<collection_path>/tests/integration/targets`` and keeps every
    sub-directory that contains an ``aliases`` file.

    Returns:
        dict: target directory name -> content of its ``aliases`` file split
        on ``"\\n"`` (a trailing newline yields a final empty string).
    """
    root = PosixPath(os.path.join(collection_path, "tests/integration/targets/"))
    return {
        entry.name: (entry / "aliases").read_text().split("\n")
        for entry in root.glob("*")
        if entry.is_dir() and (entry / "aliases").is_file()
    }
def zip_last_n_files(directory: PosixPath = LOG_DIR,
                     zip_file: str = ZIP_FILE,
                     n: int = 3):
    """Zip the ``n`` most recently changed ``*.log`` files of ``directory``.

    Files are ordered by ``st_ctime`` and the last ``n`` are written to
    ``zip_file`` (created or overwritten). Inside the archive each file is
    named ``<stem>_<YYYY-MM-DD>.log`` using the ctime in local time.

    Args:
        directory: folder to scan for ``*.log`` files.
        zip_file: path of the archive to write.
        n: number of files to include; ``n <= 0`` produces an empty archive.
    """
    # A list of pairs, not a dict keyed by timestamp: two files sharing a
    # ctime must both survive.
    stamped = [(f.stat().st_ctime, f) for f in directory.glob('*.log')]
    stamped.sort(key=lambda item: item[0])

    # Guard n == 0: a plain `stamped[-0:]` would select every file.
    newest = stamped[-n:] if n > 0 else []

    with ZipFile(zip_file, 'w') as zippery:
        for t, f in newest:
            zippery.write(
                f,
                f'{f.stem}_{datetime.fromtimestamp(t).strftime("%Y-%m-%d")}.log'
            )
def zip_last_n_files(directory: PosixPath = LOG_DIR,
                     zip_file: str = ZIP_FILE,
                     n: int = 3):
    """Zip the first ``n`` ``*.log`` files of ``directory``, newest first.

    Files are ranked by ``st_ctime`` (converted to a local ``datetime``) in
    descending order and the first ``n`` are written to ``zip_file``. Each
    archive member is named ``<stem>_<YYYY-MM-DD><suffix>``.
    """
    stamped = [
        (candidate, datetime.fromtimestamp(candidate.stat().st_ctime))
        for candidate in directory.glob('*.log')
    ]
    newest_first = sorted(stamped, key=lambda pair: pair[1], reverse=True)

    with ZipFile(zip_file, 'w') as archive:
        for log_path, created in newest_first[:n]:
            day = created.strftime("%Y-%m-%d")
            archive.write(log_path,
                          arcname=f"{log_path.stem}_{day}{log_path.suffix}")
def infer_settings(opt_root, opt_pattern="**/optimizer.py"):
    """Build a settings mapping from the optimizer sources under ``opt_root``.

    Every file matching ``opt_pattern`` contributes one entry: the key is the
    sanitized (``_cleanup``) parent directory of the file relative to
    ``opt_root``, and the value is ``[relative file path, {}]``.

    Assertions check that ``opt_root`` is an existing absolute directory,
    that every key passes ``joinable``, that sanitization produced no
    duplicate keys, and that no key clashes with ``CONFIG``.
    """
    root = PosixPath(opt_root)
    assert root.is_dir(), "Opt root directory doesn't exist: %s" % root
    assert root.is_absolute(), "Only absolute path should have even gotten this far."

    # Sorted so the result does not depend on filesystem listing order.
    relative_sources = [
        found.relative_to(root) for found in sorted(root.glob(opt_pattern))
    ]

    settings = {}
    for rel_path in relative_sources:
        settings[_cleanup(str(rel_path.parent))] = [str(rel_path), {}]

    assert all(joinable(name) for name in settings), "Something went wrong in name sanitization."
    assert len(settings) == len(relative_sources), "Name collision after sanitization of %s" % repr(relative_sources)
    assert not (set(CONFIG.keys()) & set(settings.keys())), "Name collision with builtin optimizers."
    return settings
def get_latest_checkpoint(directory: pathlib.PosixPath,
                          args: argparse.Namespace):
    """Return the highest checkpoint epoch found in ``directory``.

    Files whose name contains ``args.checkpoint_id_pattern`` are candidates.
    The epoch is the first ``re.findall`` hit of
    ``args.checkpoint_extract_pattern`` on the file name, converted to int.
    Candidates whose name does not match the extract pattern are skipped
    instead of raising ``IndexError``.

    Returns:
        int or None: the largest epoch (never below 0), or None, after
        printing a message, when no epoch could be extracted.
    """
    epochs = []
    for checkpoint_file in directory.glob(f"*{args.checkpoint_id_pattern}*"):
        found = re.findall(args.checkpoint_extract_pattern,
                           checkpoint_file.name)
        if not found:
            # Name contains the id pattern but carries no epoch number.
            continue
        epochs.append(int(found[0]))

    if not epochs:
        print("Unable to parse latest checkpoint information")
        return None

    # The running maximum used to start at 0, so keep 0 as the floor.
    return max(0, max(epochs))
def get_file_paths(folder, fileglob="*", name_only=False):
    """Pre-check if file(s) exist(s) and return the matches.

    Args:
        folder: directory to search (anything ``Path`` accepts).
        fileglob: glob pattern evaluated inside ``folder``.
        name_only: return bare file names instead of ``Path`` objects.

    Returns:
        list: the matching regular files, as ``Path`` objects or as names
        when ``name_only`` is true; empty when nothing matches.
        bool: ``False`` when the lookup itself fails (logged as an error).
    """
    fldr = Path(folder)
    files = []
    try:
        for f in fldr.glob(fileglob):
            current_app.logger.info('get_file_paths dir glob name: %s', f)
            if f.is_file():
                files.append(f.name if name_only else f)
        return files
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt / SystemExit
        # are no longer turned into a False return.
        msg = f'!! Error file "{fileglob}" not found!'
        current_app.logger.error(msg)
        return False
def __init__(self, path: pathlib.PosixPath):
    """Collect the images in ``path`` and derive a grid layout for them.

    Regular files directly inside ``path`` ending in ``.png``, ``.jpg`` or
    ``.jpeg`` (case-sensitive) are visited in sorted order. A file whose name
    starts with ``preview`` becomes ``preview_path``; all others are appended
    to ``image_paths``. Without such a file ``preview_path`` stays at
    ``<path.parent>/preview.jpg``.

    The grid has ``ceil(sqrt(num_items))`` images per row and as many columns
    entries as needed to hold every image. With no images both are 0 (this
    used to raise ``ZeroDivisionError``).
    """
    image_suffixes = (".png", ".jpg", ".jpeg")

    self.name = path.name
    self.image_paths = []
    self.preview_path = path.parent / "preview.jpg"

    candidates = sorted(
        x for x in path.glob("./*")
        if x.is_file() and str(x).endswith(image_suffixes))
    for image_path in candidates:
        if image_path.name.startswith("preview"):
            self.preview_path = image_path
        else:
            self.image_paths.append(image_path)

    self.num_items = len(self.image_paths)
    self.image_width, self.image_height = self._validate_image_dimensions()

    self.images_per_row = math.ceil(math.sqrt(self.num_items))
    if self.images_per_row:
        # ceil() already equals the exact quotient when it divides evenly.
        self.images_per_column = math.ceil(
            self.num_items / self.images_per_row)
    else:
        self.images_per_column = 0
def select_model_by_facet_value(
        facet_value: str,
        root: PosixPath = Path("/kbdata/Processed/Models/")) -> dict:
    """Select models over time by a specific facet value.

    Model files are matched with ``*-<facet_value>.w2v.model``. The year key
    is the text before the first ``-`` of the file stem, after leading
    ``F``, ``T`` and ``-`` characters have been stripped (``lstrip`` removes
    a character set, not the literal prefix ``"FT-"``).

    Arguments:
        facet_value (str): selected facet value, e.g. 'Katholiek'
        root (PosixPath): the folder where all models are stored

    Returns:
        a dictionary that maps year to a path
    """
    pattern = f"*-{facet_value}.w2v.model"
    return {
        int(model_path.stem.lstrip("FT-").split('-')[0]): model_path
        for model_path in root.glob(pattern)
    }
def qc_report_run_info(run_path: Path, qc_report_glob='*_QCreport*.csv'):
    """Get general run info from GT's QCreport file.

    The first file matching ``qc_report_glob`` is read with
    ``read_file_text``. For every line whose first comma-separated cell
    mentions one of the known field names, the cell is split at the first
    ``": "`` into a name and a value (line format:
    ``"Project: 18-weinstock-005,,,,,"``). The ``'Project'`` entry is finally
    renamed to ``'GT Project'``.

    Args:
        run_path: directory containing the QCreport file.
        qc_report_glob: glob used to locate the QCreport file.

    Returns:
        dict: field name -> value, with ``'GT Project'`` first. Any failure
        is logged and whatever was collected so far is returned (``{}`` when
        no report could be read).
    """
    qc_info = {}
    qc_report_fieldnames = (
        'Project',
        # 'Application',
        'Sequence Protocol',
        'Sample Size',
        'Fastq Files',
        'Date Report',
    )
    try:
        current_app.logger.info('Parsing QCreport file')
        qc_report_list = list(run_path.glob(qc_report_glob))
        qc_report_csv = qc_report_list[0]
        qcr_lines = read_file_text(qc_report_csv)

        qc_info = {'GT Project': None}  # first item in display
        for line in qcr_lines:
            first_cell = line.split(',')[0]
            if not any(fld in first_cell for fld in qc_report_fieldnames):
                continue
            # Split on the first separator only, so a value that itself
            # contains ": " no longer aborts the whole parse.
            name, separator, value = first_cell.partition(': ')
            if separator:
                qc_info[name] = value

        qc_info['GT Project'] = qc_info.pop('Project', None)
        current_app.logger.debug('qc_info: %s', qc_info)
    except Exception:
        current_app.logger.exception('reading from run' 's QCreport csv file!')
    # Returned after the handler rather than from `finally`, so that
    # KeyboardInterrupt / SystemExit are not swallowed.
    return qc_info
def get_matching_files(directory: PosixPath, filter_str: str) -> list:
    """Get all file names in "directory" and (case insensitive) match the ones
       that exactly match "filter_str"

       In case there is no exact match, return closely matching files.
       If there are no closely matching files either, return an empty list.
       (Return file names, not full paths).

       Both steps ignore case; the names are returned as they appear on disk.
       Closeness is decided by ``difflib.get_close_matches`` with its default
       limits.

       For example:

       d = Path('.')
       files in dir: bite1 test output

       get_matching_files(d, 'bite1') => ['bite1']
       get_matching_files(d, 'Bite') => ['bite1']
       get_matching_files(d, 'pybites') => ['bite1']
       get_matching_files(d, 'test') => ['test']
       get_matching_files(d, 'test2') => ['test']
       get_matching_files(d, 'output') => ['output']
       get_matching_files(d, 'o$tput') => ['output']
       get_matching_files(d, 'nonsense') => []
    """
    # Sorted so the result does not depend on directory listing order.
    names = sorted(entry.name for entry in directory.glob('*'))
    wanted = filter_str.lower()

    exact = [name for name in names if name.lower() == wanted]
    if exact:
        return exact

    # Compare in lower case, then map the hits back to the on-disk spelling.
    by_lower = {}
    for name in names:
        by_lower.setdefault(name.lower(), []).append(name)

    close = difflib.get_close_matches(wanted, list(by_lower))
    return [name for key in close for name in by_lower[key]]
def _get_next_version(path: pathlib.PosixPath) -> pathlib.PosixPath:
    """Return ``path/<next version number>``.

    The next number is one more than the largest all-digit directory name
    found anywhere below ``path`` (the search is recursive), or 0 when there
    is none. The returned directory is not created here.
    """
    highest = -1
    for entry in path.glob('**/*'):
        if entry.is_dir() and entry.name.isdigit():
            highest = max(highest, int(entry.name))
    return path / str(highest + 1)
def __init__(self, data_path: PosixPath, transform):
    """Index every ``*.jpg`` below ``data_path`` and remember ``transform``.

    Args:
        data_path: root folder, searched recursively for ``*.jpg`` files.
        transform: stored as ``self.transform`` without being used here.
    """
    # Unpack the glob generator into a list straight away.
    self.file_path_list = [*data_path.glob('**/*.jpg')]
    self.transform = transform
def run_metrics_run_info(run_path: Path, run_metrics_glob='Run_Metric_*.csv'):
    """Get general run info from GT's Run_Metrics file.

    The first file matching ``run_metrics_glob`` is read. Its first line
    provides column headers and its second line the matching values. Later
    lines are read until one starts with ``'Total'``: the ``'Level'`` line
    provides headers for the per-read rows, and the ``'Read 1'`` and
    ``'Read 4'`` rows are stored under ``'<header>: Read 1'`` and
    ``'<header>: Read 2'``. All other rows are ignored.

    Args:
        run_path: directory containing the Run_Metrics file.
        run_metrics_glob: glob used to locate the Run_Metrics file.

    Returns:
        dict: display label -> value for every known field present in the
        file. Missing fields are logged and left out. Any other failure is
        logged and whatever was collected so far (possibly ``{}``) is
        returned.
    """
    metric_info = {}
    metrics_fields = {
        # header string in file: display_on_page,
        'LIMSProjectID': 'GT Project',
        'FlowCellID': 'FlowCell ID',
        'RunDate': 'Run Date',
        # 'ProjectSeqRequestDate': 'Run Request Date',
        'MachineID': 'Machine ID',
        'Reads(M)': 'Reads (M)',
        'ReadsPF (M)': 'Reads PF (M)',
        'TotalYield(Gb)': 'Total Yield (Gb)',
        'LoadingConc.(pM)': 'Loading Conc. (pM)',
        'Density': 'Cluster Density',
        'PhiXAligned%': 'PhiX % Aligned',
        'Q30': 'Overall % Q30',
        '% >= Q30: Read 1': '%>=Q30: Read 1',
        '% >= Q30: Read 2': '%>=Q30: Read 2',
        'Error Rate (%): Read 1': 'Error Rate: Read 1',
        'Error Rate (%): Read 2': 'Error Rate: Read 2',
        '%>=Q30: Read 1': '%>=Q30: Read 1',
        '%>=Q30: Read 2': '%>=Q30: Read 2',
        'Error Rate: Read 1': 'Error Rate: Read 1',
        'Error Rate: Read 2': 'Error Rate: Read 2',
    }
    try:
        current_app.logger.info('Parsing RunMetrics file')
        run_metrics_csv = list(run_path.glob(run_metrics_glob))[0]

        # Headers for the per-read rows; filled in by the 'Level' line.
        qrm_head_1 = []
        qrm_head_4 = []

        with open(run_metrics_csv) as qrm:
            # Strip the line ending first so the last header and value do
            # not carry a trailing newline.
            qrm_head = qrm.readline().rstrip('\r\n').split(',')
            qrm_data = qrm.readline().rstrip('\r\n').split(',')
            run_metrics_dict = dict(zip(qrm_head, qrm_data))

            for line in qrm:
                metricline = [f.strip() for f in line.split(',')]
                label = metricline[0]
                if label == 'Level':
                    qrm_head_1 = [d + ': Read 1' for d in metricline if d]
                    qrm_head_4 = [d + ': Read 2' for d in metricline if d]
                elif label == 'Read 1':
                    run_metrics_dict.update(dict(zip(qrm_head_1, metricline)))
                elif label == 'Read 4':
                    # The fourth read is reported as "Read 2"; the
                    # 'Read 2 (I)' and 'Read 3 (I)' rows are skipped.
                    run_metrics_dict.update(dict(zip(qrm_head_4, metricline)))
                elif label == 'Total':
                    break  # ignore remainder of file lines

        for k, v in metrics_fields.items():
            if not v:
                continue
            try:
                metric_info[v] = run_metrics_dict[k]
            except KeyError:
                msg = f'!! KeyError file "{k}" not found!'
                current_app.logger.error(msg)
    except Exception:
        current_app.logger.exception('reading from run' 's Run Metrics csv file!')
    # Returned after the handler rather than from `finally`, so that
    # KeyboardInterrupt / SystemExit are not swallowed.
    return metric_info
description= 'Split all images in train and test set (strasfied by source image name)') parser.add_argument('source_folder') parser.add_argument('train_folder') parser.add_argument('test_folder') parser.add_argument('test_fraction', type=float, default=0.2, nargs='?') args = parser.parse_args() if args.test_fraction < 0.0 or args.test_fraction >= 1.0: raise ValueError src_dir = PosixPath(args.source_folder) train_dir = PosixPath(args.train_folder) test_dir = PosixPath(args.test_folder) base_files = [fn.name for fn in src_dir.glob('000/*.npy')] train_files, test_files = train_test_split(base_files, test_size=args.test_fraction) for base_name in train_files: for fn in src_dir.glob(f'[0-9][0-9][0-9]/{base_name}'): dest_dir = train_dir / fn.parts[-2] dest_dir.mkdir(parents=True, exist_ok=True) fn.rename(dest_dir / fn.name) for base_name in test_files: for fn in src_dir.glob(f'[0-9][0-9][0-9]/{base_name}'): dest_dir = test_dir / fn.parts[-2] dest_dir.mkdir(parents=True, exist_ok=True) fn.rename(dest_dir / fn.name)
def get_run_dirs(root_dir: PosixPath, model: str, loss: str) -> List:
    """Get all folders that are trained for a specific model configuration

    A run folder is classified from its ``cfg.json``:

    * ``ealstm``: neither ``concat_static`` nor ``no_static`` is set
    * ``lstm``: ``concat_static`` is set and ``no_static`` is not
    * ``lstm_no_static``: ``no_static`` is set

    and it was trained with ``MSELoss`` exactly when ``use_mse`` is set.

    Parameters
    ----------
    root_dir : PosixPath
        Path to the folder containing all model runs.
    model : str
        One of ['ealstm', 'lstm', 'lstm_no_static'], defining the model type
        to find.
    loss : str
        One of ['NSELoss', 'MSELoss'], defining the loss function that the
        model was trained for.

    Returns
    -------
    List
        List of PosixPaths, where each path points to the folder of one model
        run.

    Raises
    ------
    ValueError
        If an invalid model type was passed.
    ValueError
        If an invalid loss type was passed.
    RuntimeError
        If root directory contains no subfolder.
    """
    valid_models = ["ealstm", "lstm", "lstm_no_static"]
    if model not in valid_models:
        raise ValueError(f"`model` must be one of {valid_models}")

    valid_loss = ['MSELoss', 'NSELoss']
    if loss not in valid_loss:
        raise ValueError(f"`loss` must be one of {valid_loss}")

    folders = list(root_dir.glob('*/'))
    if not folders:
        raise RuntimeError(f"No subfolders found in {root_dir}")

    # One predicate per model type; each reads only the cfg keys it needs.
    model_checks = {
        "ealstm": lambda cfg: (not cfg["concat_static"]) and (not cfg["no_static"]),
        "lstm": lambda cfg: cfg["concat_static"] and (not cfg["no_static"]),
        "lstm_no_static": lambda cfg: cfg["no_static"],
    }
    is_model = model_checks[model]
    wants_mse = loss == "MSELoss"

    run_dirs = []
    for folder in folders:
        if not folder.is_dir():
            continue
        with open(folder / "cfg.json", "r") as fp:
            cfg = json.load(fp)
        if is_model(cfg) and bool(cfg["use_mse"]) == wants_mse:
            run_dirs.append(folder)

    return run_dirs
from matplotlib.colors import Normalize
from sys import argv
import numpy as np
from pathlib import PosixPath
from multiprocessing import Pool
import csv

if len(argv) != 2:
    print("Usage: {} <output dir>".format(argv[0]))
    exit()

output_dir = PosixPath(argv[1])
assert (output_dir.is_dir())


def plot_frame(path):
    """Render one CSV frame as a PNG saved next to it.

    Every cell of the CSV is read as a float and its absolute value is
    plotted with ``pcolormesh``. The y-axis is reversed and the x ticks are
    moved to the top. The image is saved under the same path with a ``.png``
    suffix.
    """
    with open(path) as frame:
        array = np.asarray([[abs(float(cell)) for cell in row]
                            for row in csv.reader(frame)],
                           dtype=np.float32)

    # A pool worker handles many frames in one process, so start from an
    # empty figure. Otherwise meshes pile up on the same axes and the y-axis
    # reversal below toggles on every other frame.
    pyplot.clf()
    pyplot.pcolormesh(array)
    ax = pyplot.gca()
    ax.set_ylim(ax.get_ylim()[::-1])
    ax.xaxis.tick_top()
    path = path.with_suffix(".png")
    pyplot.savefig(path, format="png")


# Guarded so worker processes that re-import this module (spawn start
# method) do not try to start pools of their own.
if __name__ == "__main__":
    with Pool() as pool:
        pool.map(plot_frame, output_dir.glob("*.csv"))
def get_run_dirs_gridEvaluation(root_dir: PosixPath, model: str,
                                basin_subset: int,
                                training_years: int) -> List:
    """Get all folders that are trained for a specific training configuration
    in the grid evaluation

    The folder name is split on ``'_'``: element 1 is the model tag
    (``'ealstm'`` or ``'xgb'``), the fourth-from-last element is a date
    string that determines the number of training years (``'30092002'`` -> 3,
    ``'30092005'`` -> 6, ``'30092008'`` -> 9) and the second-from-last
    element is the basin subset number. Folders whose name contains
    ``'param_search'`` are skipped. Only runs whose ``cfg.json`` has
    ``use_mse`` unset are returned; ``ealstm`` runs additionally need
    ``concat_static`` and ``no_static`` unset.

    Parameters
    ----------
    root_dir : PosixPath
        Path to the folder containing all model runs.
    model : str
        One of ['ealstm', 'xgboost'], defining the model type to find.
    basin_subset : int
        Number identifying the random basin subset; values 0-20 are accepted.
    training_years: int
        Number of training years used to train the model (3, 6 or 9).

    Returns
    -------
    List
        List of PosixPaths, where each path points to the folder of one model
        run.

    Raises
    ------
    ValueError
        If an invalid model, basin subset or number of training years was
        passed.
    RuntimeError
        If root directory contains no subfolder.
    """
    valid_models = ["xgboost", "ealstm"]
    valid_basin_subsets = list(range(21))
    valid_training_years = [3, 6, 9]
    arguments_ok = (model in valid_models
                    and basin_subset in valid_basin_subsets
                    and training_years in valid_training_years)
    if not arguments_ok:
        raise ValueError(
            f"`model` must be one of {valid_models} and `basin_subset` one of {valid_basin_subsets} and `training_years` one of {valid_training_years}"
        )

    folders = list(root_dir.glob('*/'))
    if not folders:
        raise RuntimeError(f"No subfolders found in {root_dir}")

    # Date string found in the folder name -> number of training years.
    years_by_date = {'30092002': 3, '30092005': 6, '30092008': 9}

    run_dirs = []
    for folder in folders:
        if not folder.is_dir():
            continue
        with open(folder / "cfg.json", "r") as fp:
            cfg = json.load(fp)

        if 'param_search' in folder.name:
            continue
        parts = folder.name.split('_')
        if int(parts[-2]) != basin_subset:
            continue
        if years_by_date.get(parts[-4]) != training_years:
            continue

        model_tag = parts[1]
        if model_tag == 'ealstm':
            selected = ((model == "ealstm") and (not cfg["concat_static"])
                        and (not cfg["no_static"]) and (not cfg["use_mse"]))
        elif model_tag == 'xgb':
            selected = (model == "xgboost") and not cfg["use_mse"]
        else:
            selected = False

        if selected:
            run_dirs.append(folder)

    return run_dirs