def run(self, progress_callback):
    """Kill (unload) running Jupyter notebook sessions that exceed the
    configured age/idle thresholds, honoring the busy/connected guards.

    Config keys: simulate, maxIdleTimeHours, maxSessionAgeHours,
    dontKillBusyKernels, dontKillConnectedKernels (a 0 threshold disables
    that check).

    Returns a ResultTable listing the sessions killed (or that would be
    killed in simulate mode).
    """
    client = DSSClient('http://localhost:%s' % os.environ.get('DKU_BACKEND_PORT'),
                       internal_ticket=os.environ.get('DKU_API_TICKET'))
    rt = ResultTable()
    rt.set_name("Killed sessions")
    rt.add_column("session_id", "Session id", "STRING")
    rt.add_column("notebook_project", "Notebook project key", "STRING")
    # BUG FIX: this column previously reused the "notebook_project" id,
    # clashing with the column above.
    rt.add_column("notebook_name", "Notebook name", "STRING")
    simulate = self.config.get('simulate', True)
    max_idle = float(self.config.get('maxIdleTimeHours', 0))
    max_age = float(self.config.get('maxSessionAgeHours', 0))
    dont_kill_busy = self.config.get('dontKillBusyKernels', True)
    dont_kill_connected = self.config.get('dontKillConnectedKernels', True)
    now = get_epochtime_ms()
    logging.info("Listing notebooks max_age_ms=%s max_idle_ms=%s" % (max_age * 1000 * 3600, max_idle * 1000 * 3600))
    for nbk in client.list_running_notebooks():
        state = nbk.get_state()
        for session in state["activeSessions"]:
            # Timestamps from the backend are epoch milliseconds, hence the
            # hours -> ms conversion on the thresholds.
            logging.info("Check kill of %s session_age=%s kernel_idle=%s" % (
                session,
                (now - session["sessionStartTime"]),
                (now - session["kernelLastActivityTime"])))
            kill = False
            if max_age > 0 and (now - session["sessionStartTime"]) > max_age * 1000 * 3600:
                logging.info(" -> Will kill on max_age")
                kill = True
            if max_idle > 0 and (now - session["kernelLastActivityTime"]) > max_idle * 1000 * 3600:
                logging.info(" -> Will kill on max_idle")
                kill = True
            # Safety guards override the kill decision.
            if dont_kill_busy and session["kernelExecutionState"] == "busy":
                logging.info(" -> Don't kill (busy)")
                kill = False
            if dont_kill_connected and session["kernelConnections"] > 0:
                logging.info(" -> Don't kill (connected)")
                kill = False
            if kill:
                logging.info("Unloading session %s" % session["sessionId"])
                rt.add_record([session["sessionId"],
                               session.get("projectKey", "?"),
                               session.get("notebookName", "?")])
                if not simulate:
                    nbk.unload(session["sessionId"])
            else:
                logging.info("Don't kill %s" % session["sessionId"])
    return rt
def run(self, progress_callback):
    """Report disk usage of saved models: total size, splits size and
    number of versions for each saved model in the selected project(s).

    Returns a ResultTable with one row per saved model.
    """
    dip_home = os.environ['DIP_HOME']
    saved_models = osp.join(dip_home, 'saved_models')
    rt = ResultTable()
    rt.set_name("Analysis data used space")
    rt.add_column("project", "Project key", "STRING")
    rt.add_column("saved_model_id", "Saved model id", "STRING")
    rt.add_column("saved_model_name", "Saved model name", "STRING")
    rt.add_column("total", "Total space (MB)", "STRING")
    rt.add_column("splits", "Splits space (MB)", "STRING")
    rt.add_column("versions", "Number of versions", "STRING")
    if self.config.get('allProjects', False):
        # Idiom fix: the original used a no-op list comprehension here.
        projects = list(os.listdir(saved_models))
    else:
        projects = [self.project_key]
    for project in projects:
        project_sm = osp.join(saved_models, project)
        if not osp.isdir(project_sm):
            continue
        for saved_model in os.listdir(project_sm):
            sm_dir = osp.join(project_sm, saved_model)
            versions_dir = osp.join(sm_dir, "versions")
            if not osp.isdir(versions_dir):
                continue
            versions = 0
            total_splits = 0
            # cleanup.du presumably returns KB (we divide by 1024 for MB) — TODO confirm
            total = cleanup.du(sm_dir)
            for version in os.listdir(versions_dir):
                split_dir = osp.join(versions_dir, version, "split")
                if osp.isdir(split_dir):
                    total_splits += cleanup.du(split_dir)
                versions += 1
            rt.add_record([project, saved_model, saved_model,
                           total / 1024, total_splits / 1024, versions])
    return rt
def run(self, progress_callback):
    """Remove obsolete Docker images built for DSS and/or dangling images.

    Config keys: rm_dss_images, base_image_name, rm_none_images,
    perform_deletion, force_rm.

    Returns a ResultTable of the images deleted (or that would be deleted).
    """
    image_format = '{{.Repository}}\t{{.Tag}}\t{{.ID}}\t{{.CreatedAt}}'

    def list_images(*extra_args):
        # Run `docker images <extra_args> --format ...` and return one dict
        # per parsed line. BUG FIX: the original iterated raw stdout lines,
        # leaving a trailing newline in the createdAt field (and the
        # iter(readline, '') sentinel never matches on byte streams);
        # communicate() + splitlines() handles both and reaps the process.
        cmd = self._get_docker_cmd('images', *(list(extra_args) + ['--format', image_format]))
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                             universal_newlines=True)
        out, _ = p.communicate()
        rows = []
        for line in out.splitlines():
            elements = line.split('\t')
            if len(elements) != 4:
                continue
            rows.append({'repo': elements[0], 'tag': elements[1],
                         'id': elements[2], 'createdAt': elements[3]})
        return rows

    to_delete = []
    # As `docker images` sorts images by creation date, we only have to keep
    # the most recent one built for DSS.
    if self.config['rm_dss_images']:
        to_delete.extend(list_images(self.config['base_image_name'])[1:])
    # Dangling images, that could be wiped with `docker image prune`
    # (but that would need the docker daemon to be up-to-date).
    if self.config['rm_none_images']:
        to_delete.extend(list_images('-f', 'dangling=true'))
    if self.config['perform_deletion']:
        rmi_args = [elt['id'] for elt in to_delete]
        print('Will delete these images: ' + str(rmi_args))
        if self.config['force_rm']:
            rmi_args.insert(0, '--force')
        cmd = self._get_docker_cmd('rmi', *rmi_args)
        # BUG FIX: wait for the deletion to complete instead of leaving the
        # child process running unattended.
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        p.communicate()
    rt = ResultTable()
    rt.set_name("Removed containers")
    rt.add_column("repo", "Repository", "STRING")
    rt.add_column("tag", "Tag", "STRING")
    rt.add_column("id", "Identifier", "STRING")
    rt.add_column("createdAt", "Created at", "STRING")
    for elt in to_delete:
        rt.add_record([elt['repo'], elt['tag'], elt['id'], elt['createdAt']])
    return rt
def run(self, progress_callback):
    """Delete export folders older than the configured age (in days).

    Config keys: simulate (dry-run), age (days).
    Returns a ResultTable with one row per removed export.
    """
    exports_dir = os.path.join(os.environ['DIP_HOME'], 'exports')
    simulate = bool(self.config.get("simulate", False))
    max_age_days = int(self.config.get("age", 15))
    cutoff = int(time.mktime(
        (datetime.datetime.now() - datetime.timedelta(days=max_age_days)).timetuple()))

    def folder_size(root):
        # Sum of all file sizes under `root`, recursively.
        total = 0
        for dirpath, _dirnames, filenames in os.walk(root):
            for name in filenames:
                total += os.path.getsize(os.path.join(dirpath, name))
        return total

    # Exports whose folder mtime predates the cutoff are candidates.
    stale = [export_id for export_id in os.listdir(exports_dir)
             if os.stat(os.path.join(exports_dir, export_id)).st_mtime < cutoff]

    rt = ResultTable()
    rt.set_name("Removed exports")
    rt.add_column("id", "Export identifier", "STRING")
    rt.add_column("age", "Age (days)", "STRING")
    rt.add_column("size", "Size (KB)", "STRING")
    for export_id in stale:
        export_path = os.path.join(exports_dir, export_id)
        size = folder_size(export_path)
        mtime = os.stat(export_path).st_mtime
        age_days = (time.mktime(datetime.datetime.now().timetuple()) - mtime) / 86400
        if not simulate:
            shutil.rmtree(export_path)
        rt.add_record([export_id, int(age_days), size / 1024])
    return rt
def run(self, progress_callback):
    """List datasets on a connection, for one project or for all projects.

    Returns a ResultTable; column layout depends on the all-projects mode.
    """
    rt = ResultTable()
    rt.set_name("List datasets on connection")
    # NOTE(review): the column type is driven by self.all_projects while the
    # loop below reads the "all_projects" config key — confirm these always
    # agree.
    if self.all_projects:
        rt.add_column("dataset", "Dataset", "FQ_DATASET_WITH_TYPE")
    else:
        rt.add_column("dataset", "Dataset", "LOCAL_DATASET_WITH_TYPE")
    rt.add_column("table", "Table (SQL only)", "STRING")
    rt.add_column("schema", "Schema (SQL only)", "STRING")
    rt.add_column("tags", "Tags", "STRING_LIST")
    # Idiom fix: truthiness check instead of `== True`.
    if self.config.get("all_projects", False):
        for project_key in self.client.list_project_keys():
            project = self.client.get_project(project_key)
            self.run_for_project(rt, project)
    else:
        project = self.client.get_project(self.project_key)
        self.run_for_project(rt, project)
    return rt
def run(self, progress_callback):
    """Delete (or, in simulate mode, just list) the project's datasets whose
    tags match the include filter and not the exclude filter.

    Config keys: includedTags / excludedTags (comma-separated),
    includedTagsCombine / excludedTagsCombine ('OR' or 'AND'),
    simulate, drop_data.
    """
    included = self.config.get('includedTags', '')
    excluded = self.config.get('excludedTags', '')
    included_or = self.config.get('includedTagsCombine', 'OR') == 'OR'
    excluded_or = self.config.get('excludedTagsCombine', 'OR') == 'OR'
    included_set = set(included.split(','))
    excluded_set = set(excluded.split(','))
    project = self.client.get_project(self.project_key)
    to_delete = []
    for dataset in project.list_datasets():
        tags = dataset.get('tags', [])
        # `apply` is a module-level helper combining the dataset's tags
        # against a filter set with OR/AND semantics (defined elsewhere in
        # this file; it shadows the Python 2 builtin of the same name).
        if apply(tags, included_set, included_or) and not apply(tags, excluded_set, excluded_or):
            to_delete.append(dataset)
    rt = ResultTable()
    rt.set_name("Delete datasets by tag")
    simulate = self.config.get('simulate', True)
    # Idiom fix: conditional expression instead of the `and/or` trick.
    rt.add_column("dataset",
                  "Dataset to delete" if simulate else "Deleted dataset",
                  "LOCAL_DATASET_WITH_TYPE")
    if simulate:
        for dataset in to_delete:
            rt.add_record(["%s:%s" % (dataset.get("type"), dataset.get("name"))])
        return rt
    rt.add_column("result", "Result", "STRING")
    for dataset in to_delete:
        try:
            project.get_dataset(dataset.get('name')).delete(drop_data=self.config.get("drop_data", True))
            rt.add_record(["%s:%s" % (dataset.get("type"), dataset.get("name")), "SUCCESS"])
        except Exception as e:
            rt.add_record(["%s:%s" % (dataset.get("type"), dataset.get("name")), "FAILED: %s" % str(e)])
    return rt
def run(self, progress_callback):
    """Reclaim analysis-data disk space by truncating or deleting ML split
    files that are orphaned or older than the configured age.

    Config keys: age (days), lines (rows kept when truncating),
    orphansOnly, performDeletion (dry-run when False).

    Returns a ResultTable with one summary row per analysis followed by one
    row per model.
    """
    maximum_age = int(self.config.get("age", 15))
    # Epoch-seconds cutoff: anything last used before this is stale.
    maximum_timestamp = int(time.mktime((datetime.datetime.now() - datetime.timedelta(days=maximum_age)).timetuple()))
    lines = int(self.config.get("lines", 5))
    orphans_only = bool(self.config.get("orphansOnly", False))
    do_it = bool(self.config.get("performDeletion", False))
    dip_home = os.environ['DIP_HOME']
    config_home = os.path.join(dip_home, "config")
    analysis_data_folder = osp.join(dip_home, "analysis-data")

    def truncate_file(path, rows):
        # Rewrite a CSV in place keeping only its first `rows` data rows.
        # NOTE(review): defined but not called in this block — the split
        # truncation below re-implements the same logic inline.
        yourfile = pd.read_csv(path, nrows = rows)
        yourfile.to_csv(path, index = False)

    rt = ResultTable()
    rt.set_name("Saved models cleanup")
    rt.add_column("project", "Project key", "STRING")
    rt.add_column("analysis", "Analysis", "STRING")
    rt.add_column("dataset", "Dataset", "STRING")
    rt.add_column("model", "Model", "STRING")
    rt.add_column("total_size_before", "Total space before", "STRING")
    rt.add_column("total_size_after", "Total space after", "STRING")
    rt.add_column("kept_splits", "Kept splits", "STRING")
    rt.add_column("truncated_splits", "Truncated splits", "STRING")
    rt.add_column("reclaimed_size", "Reclaimed size", "STRING")
    grand_total_used = 0
    grand_total_reclaimed = 0
    grand_total_kept = 0
    grand_total_deleted = 0
    for project_key in cleanup.get_projects_to_consider(self.project_key, self.config):
        analysis_data_project_folder = osp.join(analysis_data_folder, project_key)
        if not osp.isdir(analysis_data_project_folder):
            continue
        project_folder = os.path.join(config_home, "projects", project_key)
        for analysis_id in os.listdir(analysis_data_project_folder):
            analysis_data_analysis_folder = osp.join(analysis_data_project_folder,analysis_id)
            if not osp.isdir(analysis_data_analysis_folder):
                continue
            analysis_folder = os.path.join(project_folder, "analysis", analysis_id)
            # Per-analysis accumulators, summed into the grand totals below.
            total_used = 0
            total_reclaimed = 0
            total_kept = 0
            total_deleted = 0
            model_records = []
            dataset_name = None
            analysis_name = None
            # Best-effort: read human-readable names from the analysis config.
            try:
                core_params_file = os.path.join(analysis_folder, "core_params.json")
                if os.path.isfile(core_params_file):
                    with open(core_params_file, 'r') as f:
                        core_params = json.load(f)
                    dataset_name = core_params.get('inputDatasetSmartName', None)
                    analysis_name = core_params.get('name', None)
            except Exception:
                pass
            model_ids = []  # NOTE(review): never populated or read.
            for model_id in os.listdir(analysis_data_analysis_folder):
                analysis_data_model_folder = os.path.join(analysis_data_analysis_folder, model_id)
                if not osp.isdir(analysis_data_model_folder):
                    continue
                model_folder = os.path.join(analysis_folder, "ml", model_id)
                used = 0
                reclaimed = 0
                kept = 0
                deleted = 0
                try:
                    used = cleanup.du(analysis_data_model_folder,size_unit="b")
                except Exception:
                    pass
                model_name = None
                # Best-effort: read the model's display name.
                try:
                    model_params_file = os.path.join(model_folder, "params.json")
                    if os.path.isfile(model_params_file):
                        with open(model_params_file, 'r') as f:
                            model_params = json.load(f)
                        model_name = model_params.get('name', None)
                except:
                    pass
                splits_dates = {}
                # Scan sessions to find out split usage: remember, per split
                # id, the most recent session mtime that references it.
                sessions_folder = os.path.join(analysis_data_model_folder,"sessions")
                if osp.isdir(sessions_folder):
                    for session in os.listdir(sessions_folder):
                        split_ref_file = osp.join(sessions_folder, session, "split_ref.json")
                        if not osp.isfile(split_ref_file):
                            continue
                        session_timestamp = os.stat(osp.join(sessions_folder, session)).st_mtime
                        split_ref = None
                        with open(split_ref_file, 'r') as f:
                            split_ref = json.load(f).get("splitInstanceId",None)
                        if split_ref is not None and splits_dates.get(split_ref,0) < session_timestamp:
                            splits_dates[split_ref] = session_timestamp
                # Check the usage map against the actual splits on disk.
                splits_folder = os.path.join(analysis_data_model_folder,"splits")
                if osp.isdir(splits_folder):
                    for split in glob.glob(osp.join(splits_folder,"*.json")):
                        split_name, _ = osp.splitext(split)
                        split_short_name = osp.basename(split_name)
                        # None => orphan (no session references this split).
                        split_date = splits_dates.get(split_short_name, None)
                        if split_date is None or (split_date < maximum_timestamp and not orphans_only):
                            deleted += 1
                            split_data = {}
                            with open(split, 'r') as f:
                                split_data = json.load(f)
                            # The split descriptor points at its data files.
                            for split_data_filename in [split_data.get("testPath",None), split_data.get("trainPath",None)]:
                                if split_data_filename is None:
                                    continue
                                split_data_file = osp.join(splits_folder,split_data_filename)
                                _, split_data_extension = osp.splitext(split_data_filename)
                                if osp.isfile(split_data_file):
                                    if do_it:
                                        if split_date is None:
                                            # Orphan: remove the data file outright.
                                            reclaimed = os.stat(split_data_file).st_size
                                            os.unlink(split_data_file)
                                        else:
                                            # Stale but referenced: truncate in
                                            # place to the first `lines` rows.
                                            size_before = os.stat(split_data_file).st_size
                                            try:
                                                data_file = pd.read_csv(split_data_file, nrows = lines)
                                                data_file.to_csv(split_data_file, index = False, compression="gzip" if split_data_extension == ".gz" else None)
                                            except Exception as e:
                                                logging.getLogger().error("{}: {}".format(split_data_file,str(e)))
                                            reclaimed = size_before - os.stat(split_data_file).st_size
                                            pass
                                    else:
                                        # Dry run: count the whole file as reclaimable.
                                        reclaimed = os.stat(split_data_file).st_size
                            # Orphan splits also lose their JSON descriptor.
                            if do_it and split_date is None:
                                os.unlink(split)
                            pass
                        else:
                            kept += 1
                # NOTE(review): `reclaimed` is assigned (not accumulated) per
                # data file above, so only the last file's size is counted here.
                total_reclaimed += reclaimed
                total_used += used
                total_kept += kept
                total_deleted += deleted
                model_records.append([
                    project_key, analysis_name, dataset_name, model_name,
                    cleanup.format_size(used),
                    cleanup.format_size(used-reclaimed),
                    kept, deleted,
                    cleanup.format_size(reclaimed)
                ])
            # Summary row for the analysis, then its per-model rows.
            rt.add_record([
                project_key, analysis_name, dataset_name, "Total all models",
                cleanup.format_size(total_used),
                cleanup.format_size(total_used-total_reclaimed),
                total_kept, total_deleted,
                cleanup.format_size(total_reclaimed)
            ])
            for record in model_records:
                rt.add_record(record)
            grand_total_reclaimed += total_reclaimed
            grand_total_used += total_used
            grand_total_kept += total_kept
            grand_total_deleted += total_deleted
    rt.add_record([
        "Total used", "-", "-", "-",
        cleanup.format_size(grand_total_used),
        cleanup.format_size(grand_total_used-grand_total_reclaimed),
        grand_total_kept, grand_total_deleted,
        cleanup.format_size(grand_total_reclaimed)
    ])
    return rt
def run(self, progress_callback):
    """Report disk usage of analysis-data: one bold summary row per analysis
    followed by one row per model, rendered as an HTML table whose Path
    column toggles an expandable full-path row.

    Returns an HTML string (not a ResultTable — the table is only used as a
    record container).
    """
    dip_home = os.environ["DIP_HOME"]
    config_home = os.path.join(dip_home, "config")
    analysis_data_folder = os.path.join(dip_home, "analysis-data")

    def fmt_size(size):
        # Human-readable byte count (b/Kb/Mb/Gb); 'N/A' when unknown.
        if size is None:
            return 'N/A'
        if size < 1024:
            return '%s b' % size
        if size < 1024 * 1024:
            return '%s Kb' % int(size / 1024)
        if size < 1024 * 1024 * 1024:
            return '%s Mb' % int(size / (1024 * 1024))
        return '%s Gb' % int(size / (1024 * 1024 * 1024))

    rt = ResultTable()
    rt.set_name("Analysis data")
    rt.add_column("projectKey", "Project", "STRING")
    rt.add_column("dataset", "Dataset", "STRING")
    rt.add_column("analysis", "Analysis", "STRING")
    rt.add_column("model", "Model", "STRING")
    rt.add_column("used", "Disk space used", "STRING")
    rt.add_column("path", "Path", "STRING")
    for project_key in os.listdir(analysis_data_folder):
        analysis_data_project_folder = os.path.join(analysis_data_folder, project_key)
        if not os.path.isdir(analysis_data_project_folder):
            continue
        project_folder = os.path.join(config_home, "projects", project_key)
        # Orphaned = data exists but the matching config folder is gone.
        orphaned_project = not os.path.isdir(project_folder)
        for analysis_id in os.listdir(analysis_data_project_folder):
            analysis_data_analysis_folder = os.path.join(analysis_data_project_folder, analysis_id)
            if not os.path.isdir(analysis_data_analysis_folder):
                continue
            analysis_folder = os.path.join(project_folder, "analysis", analysis_id)
            orphaned_analysis = not os.path.isdir(analysis_folder) if not orphaned_project else None
            total_used = 0
            model_records = []
            # BUG FIX: initialize before the model loop so an analysis with
            # zero model folders doesn't raise NameError in the summary row.
            dataset_name = None
            analysis_name = None
            for model_id in os.listdir(analysis_data_analysis_folder):
                analysis_data_model_folder = os.path.join(analysis_data_analysis_folder, model_id)
                if not os.path.isdir(analysis_data_model_folder):
                    continue
                model_folder = os.path.join(analysis_folder, "ml", model_id)
                orphaned_model = not os.path.isdir(model_folder) if not orphaned_project and not orphaned_analysis else None
                try:
                    used = self.disk_space_used(analysis_data_model_folder)
                    total_used += used
                except Exception:
                    used = None
                # Best-effort: read display names from the analysis config.
                try:
                    core_params_file = os.path.join(analysis_folder, "core_params.json")
                    if os.path.isfile(core_params_file):
                        with open(core_params_file, 'r') as f:
                            core_params = json.load(f)
                        dataset_name = core_params.get('inputDatasetSmartName', None)
                        analysis_name = core_params.get('name', None)
                    else:
                        dataset_name = None
                        analysis_name = None
                except Exception:
                    dataset_name = None
                    analysis_name = None
                # Best-effort: read the model's display name.
                try:
                    model_params_file = os.path.join(model_folder, "params.json")
                    if os.path.isfile(model_params_file):
                        with open(model_params_file, 'r') as f:
                            model_params = json.load(f)
                        model_name = model_params.get('name', None)
                    else:
                        model_name = None
                except Exception:
                    model_name = None
                model_records.append([
                    '(orphaned)' if orphaned_project else project_key,                               # 0 project
                    dataset_name,                                                                     # 1 dataset
                    '(orphaned)' if orphaned_analysis else (analysis_name if analysis_name is not None else analysis_id),  # 2 analysis
                    '(orphaned)' if orphaned_model else (model_name if model_name is not None else model_id),              # 3 model
                    fmt_size(used),                                                                   # 4 size
                    analysis_data_model_folder,                                                       # 5 path
                ])
            # Summary row: model column is None, which marks it as bold below.
            # BUG FIX: the original formatted `used` (last model's size) here
            # instead of `total_used` for the sub-Kb branch.
            rt.add_record([
                '(orphaned)' if orphaned_project else project_key,
                dataset_name,
                '(orphaned)' if orphaned_analysis else (analysis_name if analysis_name is not None else analysis_id),
                None,
                fmt_size(total_used),
                analysis_data_analysis_folder,
            ])
            for model_record in model_records:
                rt.add_record(model_record)
    # Render the collected records as an HTML table.
    table_rows = []
    idx = 0
    for record in rt.records:
        is_analysis_row = record[3] is None
        show_path_var = "showPath%s" % idx  # Angular toggle for the path row
        row_cells = []
        for i in range(0, 6):
            if is_analysis_row and i == 3:
                continue  # summary rows span the analysis cell over the model column
            value = record[i]
            if value is not None:
                if i == 5:
                    row_cells.append('<td class="mx-textellipsis" title="%s"><a class="mx-link-nodecoration" href="" ng-click="%s = !%s"><i class="icon-eye"></i></a></td>' % (value, show_path_var, show_path_var))
                else:
                    row_cells.append('<td class="mx-textellipsis" title="%s" %s>%s</td>' % (value, (' colspan="2"' if is_analysis_row and i == 2 else ''), value))
            else:
                row_cells.append('<td></td>')
        if is_analysis_row:
            # analysis row
            table_rows.append('<tr style="font-weight: bold;">%s</tr>' % (''.join(row_cells)))
        else:
            # model row
            table_rows.append('<tr>%s</tr>' % (''.join(row_cells)))
        path_cell_style = 'white-space: nowrap; padding-left: 20px; font-family: monospace; font-size: 11px;'
        if is_analysis_row:
            path_cell_style = path_cell_style + '; font-weight: bold'
        table_rows.append('<tr ng-if="%s"><td colspan="6" title="%s" style="%s">%s</td></tr>' % (show_path_var, record[5], path_cell_style, record[5]))
        idx += 1
    html = '<div>'
    table_header = '<th>%s</th>' % ('</th><th>'.join(['Project', 'Dataset', 'Analysis', 'Model', 'Disk usage', 'Path']))
    html += '<table class="table table-striped" style="table-layout: fixed;">%s%s</table>' % (table_header, ''.join(table_rows))
    html += '</div>'
    return html
def run(self, progress_callback):
    """Measure analysis-data disk usage for the selected project(s),
    reported per project or per analysis depending on the 'granularity'
    config key. Sizes are divided by 1024 before being reported as MB.
    """
    analysis_data_root = osp.join(os.environ['DIP_HOME'], 'analysis-data')
    if self.config.get('allProjects', False):
        project_keys = list(os.listdir(analysis_data_root))
    else:
        project_keys = [self.project_key]

    # Accumulators keyed by project and by (project, analysis).
    sessions_by_project = {}
    splits_by_project = {}
    sessions_by_analysis = {}
    splits_by_analysis = {}
    analyses_by_project = {}

    for project_key in project_keys:
        project_dir = osp.join(analysis_data_root, project_key)
        analyses_by_project[project_key] = []
        if not osp.isdir(project_dir):
            sessions_by_project[project_key] = 0
            splits_by_project[project_key] = 0
            continue
        proj_sessions = 0
        proj_splits = 0
        for analysis_id in os.listdir(project_dir):
            analysis_dir = osp.join(project_dir, analysis_id)
            analyses_by_project[project_key].append(analysis_id)
            sess_total = 0
            split_total = 0
            for mltask_id in os.listdir(analysis_dir):
                mltask_dir = osp.join(analysis_dir, mltask_id)
                sessions_dir = osp.join(mltask_dir, "sessions")
                if osp.isdir(sessions_dir):
                    sess_total += cleanup.du(sessions_dir)
                splits_dir = osp.join(mltask_dir, "splits")
                if osp.isdir(splits_dir):
                    split_total += cleanup.du(splits_dir)
            sessions_by_analysis[(project_key, analysis_id)] = sess_total
            splits_by_analysis[(project_key, analysis_id)] = split_total
            proj_sessions += sess_total
            proj_splits += split_total
        sessions_by_project[project_key] = proj_sessions
        splits_by_project[project_key] = proj_splits

    rt = ResultTable()
    rt.set_name("Analysis data used space")
    if self.config["granularity"] == "project":
        rt.add_column("project", "Project key", "STRING")
        rt.add_column("total", "Total space (MB)", "STRING")
        rt.add_column("sessions", "Sessions space (MB)", "STRING")
        rt.add_column("splits", "Splits space (MB)", "STRING")
        for project_key in project_keys:
            combined = sessions_by_project[project_key] + splits_by_project[project_key]
            # Skip projects with no measured data.
            if len(project_keys) > 0 and combined == 0:
                continue
            rt.add_record([project_key,
                           combined / 1024,
                           sessions_by_project[project_key] / 1024,
                           splits_by_project[project_key] / 1024])
    else:
        rt.add_column("project", "Project key", "STRING")
        rt.add_column("analysis", "Analysis id", "STRING")
        rt.add_column("total", "Total space (MB)", "STRING")
        rt.add_column("sessions", "Sessions space (MB)", "STRING")
        rt.add_column("splits", "Splits space (MB)", "STRING")
        for project_key in project_keys:
            for analysis_id in analyses_by_project[project_key]:
                key = (project_key, analysis_id)
                rt.add_record([project_key,
                               analysis_id,
                               (sessions_by_analysis[key] + splits_by_analysis[key]) / 1024,
                               sessions_by_analysis[key] / 1024,
                               splits_by_analysis[key] / 1024])
    return rt
def run(self, progress_callback):
    """Reclaim saved-model disk space by truncating the CSV split files of
    model versions older than the configured age.

    Config keys: age (days), lines (rows kept when truncating),
    performDeletion (dry-run when False).

    Returns a ResultTable with one row per saved model plus a grand-total row.
    """
    maximum_age = int(self.config.get("age", 15))
    # Epoch-seconds cutoff: versions last modified before this are stale.
    maximum_timestamp = int(time.mktime((datetime.datetime.now() - datetime.timedelta(days=maximum_age)).timetuple()))
    lines = int(self.config.get("lines", 5))
    do_it = bool(self.config.get("performDeletion", False))
    dip_home = os.environ['DIP_HOME']
    saved_models = osp.join(dip_home, 'saved_models')

    def truncate_file(path, rows):
        # Rewrite a CSV in place keeping only its first `rows` data rows.
        yourfile = pd.read_csv(path, nrows=rows)
        yourfile.to_csv(path, index=False)

    rt = ResultTable()
    rt.set_name("Saved models cleanup")
    rt.add_column("project", "Project key", "STRING")
    rt.add_column("saved_model_id", "Saved model id", "STRING")
    rt.add_column("saved_model_name", "Saved model name", "STRING")
    rt.add_column("total_size_before", "Total space before (MB)", "STRING")
    rt.add_column("total_size_after", "Total space after (MB)", "STRING")
    rt.add_column("kept_splits", "Kept splits", "STRING")
    rt.add_column("truncated_splits", "Truncated splits", "STRING")
    rt.add_column("reclaimed_size", "Reclaimed size", "STRING")
    grand_total_used = 0
    grand_total_reclaimed = 0
    grand_total_kept = 0
    grand_total_deleted = 0
    for project in cleanup.get_projects_to_consider(self.project_key, self.config):
        project_sm = osp.join(saved_models, project)
        if not osp.isdir(project_sm):
            continue
        for saved_model in os.listdir(project_sm):
            sm_dir = osp.join(project_sm, saved_model)
            versions_dir = osp.join(sm_dir, "versions")
            if not osp.isdir(versions_dir):
                continue
            kept_versions = 0
            deleted_versions = 0
            size_reclaimed = 0
            total_size_before = cleanup.du(sm_dir, size_unit="b")
            for version in os.listdir(versions_dir):
                version_dir = osp.join(versions_dir, version)
                if os.stat(version_dir).st_mtime < maximum_timestamp:
                    # Need to clean this version.
                    deleted_versions += 1
                    split_dir = osp.join(version_dir, "split")
                    if osp.isdir(split_dir):
                        for name in os.listdir(split_dir):
                            path = osp.join(split_dir, name)
                            ext = osp.splitext(path)[-1].lower()
                            if ext == ".csv":
                                if do_it:
                                    try:
                                        initial = os.stat(path).st_size
                                        truncate_file(path, lines)
                                        size_reclaimed += initial - os.stat(path).st_size
                                    except Exception as e:
                                        logging.getLogger().error("{}: {}".format(path, str(e)))
                                else:
                                    # Dry run: count the whole file as reclaimable.
                                    size_reclaimed += os.stat(path).st_size
                else:
                    kept_versions += 1
            total_size_after = cleanup.du(sm_dir, size_unit="b")
            record = []
            record.append(project)
            record.append(saved_model)
            # NOTE(review): the model id is reused for the "name" column too.
            record.append(saved_model)
            record.append(cleanup.format_size(total_size_before))
            record.append(cleanup.format_size(total_size_after))
            record.append(kept_versions)
            record.append(deleted_versions)
            record.append(cleanup.format_size(size_reclaimed))
            rt.add_record(record)
            grand_total_reclaimed += size_reclaimed
            grand_total_used += total_size_before
            grand_total_kept += kept_versions
            grand_total_deleted += deleted_versions
    # Grand-total summary row across all saved models.
    rt.add_record([
        "Total", "-", "-",
        cleanup.format_size(grand_total_used),
        cleanup.format_size(grand_total_used - grand_total_reclaimed),
        grand_total_kept, grand_total_deleted,
        cleanup.format_size(grand_total_reclaimed)
    ])
    return rt