def create_resulttable(self):
    """Transform the log dataframe into a ResultTable."""
    result_table = ResultTable()
    for column_name in self.log_df.keys():
        result_table.add_column(column_name, str.capitalize(column_name), "STRING")
    for log_row in self.log_df.itertuples():
        result_table.add_record(list(log_row)[1:])
    return result_table
def search(self, search_term):
    """Search the installed Helm chart repositories for charts matching search_term."""
    rt = ResultTable()
    cmd = [self.helm, "search", "repo"]
    if search_term:
        cmd.append(search_term)
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = process.communicate()
    if stderr:
        raise Exception("Exception searching repos: {}".format(stderr))

    rt.add_column("name", "Name", "STRING")
    rt.add_column("chartVersion", "Chart Version", "STRING")
    rt.add_column("appVersion", "App Version", "STRING")
    rt.add_column("description", "Description", "STRING")

    # communicate() returns bytes; decode before splitting into lines,
    # then skip the header row and any trailing empty line.
    rows = stdout.decode("utf-8").split("\n")
    for row in rows[1:]:
        if not row.strip():
            continue
        record = [r.strip() for r in row.split("\t")]
        rt.add_record(record)
    return rt
def run(self, progress_callback):
    dip_home = os.environ['DIP_HOME']
    exports_folder = os.path.join(dip_home, 'exports')
    simulate = bool(self.config.get("simulate", False))
    maximum_age = int(self.config.get("age", 15))
    maximum_timestamp = int(time.mktime(
        (datetime.datetime.now() - datetime.timedelta(days=maximum_age)).timetuple()))

    to_delete = []
    for export_id in os.listdir(exports_folder):
        if os.stat(os.path.join(exports_folder, export_id)).st_mtime < maximum_timestamp:
            to_delete.append(export_id)

    def folder_size(folder):
        total_size = 0
        for dirpath, dirnames, filenames in os.walk(folder):
            for f in filenames:
                fp = os.path.join(dirpath, f)
                total_size += os.path.getsize(fp)
        return total_size

    rt = ResultTable()
    rt.set_name("Removed exports")
    rt.add_column("id", "Export identifier", "STRING")
    rt.add_column("age", "Age (days)", "STRING")
    rt.add_column("size", "Size (KB)", "STRING")

    for export_id in to_delete:
        export_folder = os.path.join(exports_folder, export_id)
        size = folder_size(export_folder)
        mtime = os.stat(export_folder).st_mtime
        age = (time.mktime(datetime.datetime.now().timetuple()) - mtime) / 86400
        if not simulate:
            shutil.rmtree(export_folder)
        rt.add_record([export_id, int(age), size / 1024])
    return rt
def run(self, progress_callback):
    to_delete = []

    # As `docker images` sorts images by creation date, we only have to keep the most recent one built for DSS.
    # Sample cmd: $ docker images 'dku-exec-base-notattributed' --format '{{.Repository}}\t{{.Tag}}\t{{.ID}}\t{{.CreatedAt}}'
    if self.config['rm_dss_images']:
        cmd = self._get_docker_cmd('images', self.config['base_image_name'],
                                   '--format', '{{.Repository}}\t{{.Tag}}\t{{.ID}}\t{{.CreatedAt}}')
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        is_most_recent = True
        for line in iter(p.stdout.readline, ''):
            elements = line.split('\t')
            if len(elements) != 4:
                continue
            if is_most_recent:
                is_most_recent = False
            else:
                to_delete.append({'repo': elements[0], 'tag': elements[1],
                                  'id': elements[2], 'createdAt': elements[3]})

    # Dangling images, which could be wiped with `docker image prune` (but that would need an up-to-date docker daemon)
    # Sample cmd: $ docker images -f 'dangling=true' --format '{{.Repository}}\t{{.Tag}}\t{{.ID}}\t{{.CreatedAt}}'
    if self.config['rm_none_images']:
        cmd = self._get_docker_cmd('images', '-f', 'dangling=true',
                                   '--format', '{{.Repository}}\t{{.Tag}}\t{{.ID}}\t{{.CreatedAt}}')
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        for line in iter(p.stdout.readline, ''):
            elements = line.split('\t')
            if len(elements) != 4:
                continue
            to_delete.append({'repo': elements[0], 'tag': elements[1],
                              'id': elements[2], 'createdAt': elements[3]})

    if self.config['perform_deletion']:
        rmi_args = [elt['id'] for elt in to_delete]
        print('Will delete these images: ' + str(rmi_args))
        if self.config['force_rm']:
            rmi_args.insert(0, '--force')
        cmd = self._get_docker_cmd('rmi', *rmi_args)
        subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

    rt = ResultTable()
    rt.set_name("Removed containers")
    rt.add_column("repo", "Repository", "STRING")
    rt.add_column("tag", "Tag", "STRING")
    rt.add_column("id", "Identifier", "STRING")
    rt.add_column("createdAt", "Created at", "STRING")
    for elt in to_delete:
        rt.add_record([elt['repo'], elt['tag'], elt['id'], elt['createdAt']])
    return rt
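# NOTE: `_get_docker_cmd` is referenced above but not shown in this snippet.
# A minimal sketch of what it presumably does, assuming the plugin config may
# carry a custom docker binary path under a hypothetical "docker_bin" key:
def _get_docker_cmd(self, *args):
    docker_bin = self.config.get("docker_bin", "docker")  # key name is an assumption
    return [docker_bin] + list(args)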
def run(self, progress_callback):
    rt = ResultTable()
    rt.set_name("List datasets on connection")
    if self.all_projects:
        rt.add_column("dataset", "Dataset", "FQ_DATASET_WITH_TYPE")
    else:
        rt.add_column("dataset", "Dataset", "LOCAL_DATASET_WITH_TYPE")
    rt.add_column("table", "Table (SQL only)", "STRING")
    rt.add_column("schema", "Schema (SQL only)", "STRING")
    rt.add_column("tags", "Tags", "STRING_LIST")

    if self.config.get("all_projects", False):
        for project_key in self.client.list_project_keys():
            project = self.client.get_project(project_key)
            self.run_for_project(rt, project)
    else:
        project = self.client.get_project(self.project_key)
        self.run_for_project(rt, project)
    return rt
def run(self, progress_callback):
    included = self.config.get('includedTags', '')
    excluded = self.config.get('excludedTags', '')
    included_or = self.config.get('includedTagsCombine', 'OR') == 'OR'
    excluded_or = self.config.get('excludedTagsCombine', 'OR') == 'OR'
    included_set = set(included.split(','))
    excluded_set = set(excluded.split(','))

    project = self.client.get_project(self.project_key)

    to_delete = []
    for dataset in project.list_datasets():
        tags = dataset.get('tags', [])
        is_included = apply(tags, included_set, included_or)
        is_excluded = apply(tags, excluded_set, excluded_or)
        if is_included and not is_excluded:
            to_delete.append(dataset)

    rt = ResultTable()
    rt.set_name("Delete datasets by tag")
    simulate = self.config.get('simulate', True)
    rt.add_column("dataset", "Dataset to delete" if simulate else "Deleted dataset",
                  "LOCAL_DATASET_WITH_TYPE")
    if not simulate:
        rt.add_column("result", "Result", "STRING")

    if not simulate:
        for dataset in to_delete:
            try:
                project.get_dataset(dataset.get('name')).delete(
                    drop_data=self.config.get("drop_data", True))
                rt.add_record(["%s:%s" % (dataset.get("type"), dataset.get("name")), "SUCCESS"])
            except Exception as e:
                rt.add_record(["%s:%s" % (dataset.get("type"), dataset.get("name")),
                               "FAILED: %s" % str(e)])
    else:
        for dataset in to_delete:
            rt.add_record(["%s:%s" % (dataset.get("type"), dataset.get("name"))])
    return rt
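# NOTE: the tag-matching helper `apply` used above is not defined in this snippet
# (and shadows the Python 2 built-in of the same name). A minimal sketch of the
# behaviour the macro appears to rely on, assuming OR means "any selected tag is
# present" and AND means "all selected tags are present":
def apply(tags, tag_set, combine_or):
    wanted = {t for t in tag_set if t}  # drop empty entries from ''.split(',')
    if not wanted:
        return False
    if combine_or:
        return any(t in tags for t in wanted)
    return all(t in tags for t in wanted)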
def run(self, progress_callback):
    client = DSSClient('http://localhost:%s' % os.environ.get('DKU_BACKEND_PORT'),
                       internal_ticket=os.environ.get('DKU_API_TICKET'))

    rt = ResultTable()
    rt.set_name("Killed sessions")
    rt.add_column("session_id", "Session id", "STRING")
    rt.add_column("notebook_project", "Notebook project key", "STRING")
    rt.add_column("notebook_name", "Notebook name", "STRING")

    simulate = self.config.get('simulate', True)
    max_idle = float(self.config.get('maxIdleTimeHours', 0))
    max_age = float(self.config.get('maxSessionAgeHours', 0))
    dont_kill_busy = self.config.get('dontKillBusyKernels', True)
    dont_kill_connected = self.config.get('dontKillConnectedKernels', True)

    now = get_epochtime_ms()
    logging.info("Listing notebooks max_age_ms=%s max_idle_ms=%s" %
                 (max_age * 1000 * 3600, max_idle * 1000 * 3600))

    for nbk in client.list_running_notebooks():
        state = nbk.get_state()
        for session in state["activeSessions"]:
            logging.info("Check kill of %s session_age=%s kernel_idle=%s" % (
                session,
                (now - session["sessionStartTime"]),
                (now - session["kernelLastActivityTime"])))

            kill = False
            if max_age > 0 and (now - session["sessionStartTime"]) > max_age * 1000 * 3600:
                logging.info(" -> Will kill on max_age")
                kill = True
            if max_idle > 0 and (now - session["kernelLastActivityTime"]) > max_idle * 1000 * 3600:
                logging.info(" -> Will kill on max_idle")
                kill = True
            if dont_kill_busy and session["kernelExecutionState"] == "busy":
                logging.info(" -> Don't kill (busy)")
                kill = False
            if dont_kill_connected and session["kernelConnections"] > 0:
                logging.info(" -> Don't kill (connected)")
                kill = False

            if kill:
                logging.info("Unloading session %s" % session["sessionId"])
                rt.add_record([session["sessionId"],
                               session.get("projectKey", "?"),
                               session.get("notebookName", "?")])
                if not simulate:
                    nbk.unload(session["sessionId"])
            else:
                logging.info("Don't kill %s" % session["sessionId"])
    return rt
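# NOTE: `get_epochtime_ms` is used above but not shown here. A minimal sketch,
# assuming it simply returns the current epoch time in milliseconds (the unit
# of sessionStartTime and kernelLastActivityTime):
import time

def get_epochtime_ms():
    return int(round(time.time() * 1000))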
import dataiku, logging, dku_dataproc
from dataiku.runnables import Runnable, ResultTable
from gce_client import DataProcClient

rt = ResultTable()
rt.add_column("node_type", "Node type", "STRING")
rt.add_column("machine_type", "Machine type", "STRING")
rt.add_column("machine_private_ip", "Private IP", "STRING")
rt.add_column("is_preemptible", "Pre-emptible VM?", "STRING")
rt.add_column("status", "Status", "STRING")

logging.basicConfig(format='%(asctime)s [%(levelname)s] %(message)s', level=logging.INFO)
logging.getLogger().setLevel(logging.INFO)


class MyRunnable(Runnable):
    def __init__(self, project_key, config, plugin_config):
        self.project_key = project_key
        self.config = config
        self.plugin_config = plugin_config

    def get_progress_target(self):
        return None

    def run(self, progress_callback):
        dss_cluster = dataiku.api_client().get_cluster(self.config["dss_cluster_id"])
        settings = dss_cluster.get_settings()
        (client, cluster_name) = dku_dataproc.get_client_and_wait(settings)
        computeClient = client.forkComputeClient()
        clusterBody = client.getDataprocClusterByName(cluster_name)

        logging.info("retrieving master instance")
def run(self, progress_callback):
    maximum_age = int(self.config.get("age", 15))
    maximum_timestamp = int(time.mktime(
        (datetime.datetime.now() - datetime.timedelta(days=maximum_age)).timetuple()))
    lines = int(self.config.get("lines", 5))
    orphans_only = bool(self.config.get("orphansOnly", False))
    do_it = bool(self.config.get("performDeletion", False))
    dip_home = os.environ['DIP_HOME']
    config_home = os.path.join(dip_home, "config")
    analysis_data_folder = osp.join(dip_home, "analysis-data")

    def truncate_file(path, rows):
        yourfile = pd.read_csv(path, nrows=rows)
        yourfile.to_csv(path, index=False)

    rt = ResultTable()
    rt.set_name("Saved models cleanup")
    rt.add_column("project", "Project key", "STRING")
    rt.add_column("analysis", "Analysis", "STRING")
    rt.add_column("dataset", "Dataset", "STRING")
    rt.add_column("model", "Model", "STRING")
    rt.add_column("total_size_before", "Total space before", "STRING")
    rt.add_column("total_size_after", "Total space after", "STRING")
    rt.add_column("kept_splits", "Kept splits", "STRING")
    rt.add_column("truncated_splits", "Truncated splits", "STRING")
    rt.add_column("reclaimed_size", "Reclaimed size", "STRING")

    grand_total_used = 0
    grand_total_reclaimed = 0
    grand_total_kept = 0
    grand_total_deleted = 0

    for project_key in cleanup.get_projects_to_consider(self.project_key, self.config):
        analysis_data_project_folder = osp.join(analysis_data_folder, project_key)
        if not osp.isdir(analysis_data_project_folder):
            continue
        project_folder = os.path.join(config_home, "projects", project_key)

        for analysis_id in os.listdir(analysis_data_project_folder):
            analysis_data_analysis_folder = osp.join(analysis_data_project_folder, analysis_id)
            if not osp.isdir(analysis_data_analysis_folder):
                continue
            analysis_folder = os.path.join(project_folder, "analysis", analysis_id)

            total_used = 0
            total_reclaimed = 0
            total_kept = 0
            total_deleted = 0
            model_records = []

            dataset_name = None
            analysis_name = None
            try:
                core_params_file = os.path.join(analysis_folder, "core_params.json")
                if os.path.isfile(core_params_file):
                    with open(core_params_file, 'r') as f:
                        core_params = json.load(f)
                    dataset_name = core_params.get('inputDatasetSmartName', None)
                    analysis_name = core_params.get('name', None)
            except Exception:
                pass

            for model_id in os.listdir(analysis_data_analysis_folder):
                analysis_data_model_folder = os.path.join(analysis_data_analysis_folder, model_id)
                if not osp.isdir(analysis_data_model_folder):
                    continue
                model_folder = os.path.join(analysis_folder, "ml", model_id)

                used = 0
                reclaimed = 0
                kept = 0
                deleted = 0
                try:
                    used = cleanup.du(analysis_data_model_folder, size_unit="b")
                except Exception:
                    pass

                model_name = None
                try:
                    model_params_file = os.path.join(model_folder, "params.json")
                    if os.path.isfile(model_params_file):
                        with open(model_params_file, 'r') as f:
                            model_params = json.load(f)
                        model_name = model_params.get('name', None)
                except:
                    pass

                # Scan sessions to find out split usage
                splits_dates = {}
                sessions_folder = os.path.join(analysis_data_model_folder, "sessions")
                if osp.isdir(sessions_folder):
                    for session in os.listdir(sessions_folder):
                        split_ref_file = osp.join(sessions_folder, session, "split_ref.json")
                        if not osp.isfile(split_ref_file):
                            continue
                        session_timestamp = os.stat(osp.join(sessions_folder, session)).st_mtime
                        with open(split_ref_file, 'r') as f:
                            split_ref = json.load(f).get("splitInstanceId", None)
                        if split_ref is not None and splits_dates.get(split_ref, 0) < session_timestamp:
                            splits_dates[split_ref] = session_timestamp

                # Check it against the actual splits
                splits_folder = os.path.join(analysis_data_model_folder, "splits")
                if osp.isdir(splits_folder):
                    for split in glob.glob(osp.join(splits_folder, "*.json")):
                        split_name, _ = osp.splitext(split)
                        split_short_name = osp.basename(split_name)
                        split_date = splits_dates.get(split_short_name, None)
                        if split_date is None or (split_date < maximum_timestamp and not orphans_only):
                            deleted += 1
                            with open(split, 'r') as f:
                                split_data = json.load(f)
                            for split_data_filename in [split_data.get("testPath", None),
                                                        split_data.get("trainPath", None)]:
                                if split_data_filename is None:
                                    continue
                                split_data_file = osp.join(splits_folder, split_data_filename)
                                _, split_data_extension = osp.splitext(split_data_filename)
                                if osp.isfile(split_data_file):
                                    if do_it:
                                        if split_date is None:
                                            # Orphan split: remove the data file entirely
                                            # (accumulate reclaimed size over both test and train files)
                                            reclaimed += os.stat(split_data_file).st_size
                                            os.unlink(split_data_file)
                                        else:
                                            # Old split: truncate the data file to `lines` rows
                                            size_before = os.stat(split_data_file).st_size
                                            try:
                                                data_file = pd.read_csv(split_data_file, nrows=lines)
                                                data_file.to_csv(split_data_file, index=False,
                                                                 compression="gzip" if split_data_extension == ".gz" else None)
                                            except Exception as e:
                                                logging.getLogger().error("{}: {}".format(split_data_file, str(e)))
                                            reclaimed += size_before - os.stat(split_data_file).st_size
                                    else:
                                        reclaimed += os.stat(split_data_file).st_size
                            if do_it and split_date is None:
                                os.unlink(split)
                        else:
                            kept += 1

                total_reclaimed += reclaimed
                total_used += used
                total_kept += kept
                total_deleted += deleted
                model_records.append([
                    project_key, analysis_name, dataset_name, model_name,
                    cleanup.format_size(used), cleanup.format_size(used - reclaimed),
                    kept, deleted, cleanup.format_size(reclaimed)
                ])

            rt.add_record([
                project_key, analysis_name, dataset_name, "Total all models",
                cleanup.format_size(total_used), cleanup.format_size(total_used - total_reclaimed),
                total_kept, total_deleted, cleanup.format_size(total_reclaimed)
            ])
            for record in model_records:
                rt.add_record(record)
            grand_total_reclaimed += total_reclaimed
            grand_total_used += total_used
            grand_total_kept += total_kept
            grand_total_deleted += total_deleted

    rt.add_record([
        "Total used", "-", "-", "-",
        cleanup.format_size(grand_total_used),
        cleanup.format_size(grand_total_used - grand_total_reclaimed),
        grand_total_kept, grand_total_deleted,
        cleanup.format_size(grand_total_reclaimed)
    ])
    return rt
def run(self, progress_callback):
    def update_percent(percent, last_update_time):
        new_time = time.time()
        if (new_time - last_update_time) > 3:
            progress_callback(percent)
            return new_time
        else:
            return last_update_time

    # Get project and folder containing the Excel files
    client = dataiku.api_client()
    project = client.get_project(self.project_key)
    folder_id = self.config.get("model_folder_id")
    overwrite = self.config.get("overwrite", False)
    folder = dataiku.Folder(folder_id, project_key=self.project_key)
    folder_path = folder.get_path()
    # Used to provide an informative message to the user when the macro creates a dataset
    macro_creates_dataset = False

    # List files in folder
    files_list = os.listdir(folder_path)

    # List the datasets in the project
    datasets_in_project = [dataset['name'] for dataset in project.list_datasets()]

    # Actions performed
    actions_performed = dict()

    num_files = len(files_list)
    update_time = time.time()
    for file_index, my_file in enumerate(files_list):
        ## Get file path
        file_path = os.path.join(folder_path, my_file)
        ## Get Excel file and list its sheets
        sheets_names = pd.ExcelFile(file_path).sheet_names
        for sheet in sheets_names:
            ### Build a dataset title of the form "file_sheet"
            ss = openpyxl.load_workbook(file_path)
            ss_sheet = ss.get_sheet_by_name(sheet)
            title = ss_sheet.title
            if not my_file.split(".")[0] in title:
                title = '_'.join((my_file.split(".")[0] + "_" + sheet).split())
            title = '_'.join(title.split())
            title = title.replace(')', '')
            title = title.replace('(', '')

            create_dataset = True
            if title in datasets_in_project:
                if overwrite:
                    project.get_dataset(title).delete()
                    actions_performed[title] = "replaced"
                else:
                    create_dataset = False
                    actions_performed[title] = "skipped (already exists)"
            else:
                actions_performed[title] = "created"
                macro_creates_dataset = True

            if create_dataset:
                dataset = project.create_dataset(
                    title,
                    'FilesInFolder',
                    params={'folderSmartId': folder_id,
                            'filesSelectionRules': {'mode': 'EXPLICIT_SELECT_FILES',
                                                    'explicitFiles': [my_file]}},
                    formatType='excel',
                    formatParams={"xlsx": True,
                                  "sheets": "*" + ss_sheet.title,
                                  'parseHeaderRow': True})
                df = pd.read_excel(file_path, sheet_name=ss_sheet.title, nrows=1000)
                dataset.set_schema({'columns': [{'name': column, 'type': 'string'}
                                                for column, column_type in df.dtypes.items()]})

        percent = 100 * float(file_index + 1) / num_files
        update_time = update_percent(percent, update_time)

    # Output table
    rt = ResultTable()
    rt.add_column("actions", "Actions", "STRING")

    # Actions: "dataset" has been created, replaced or skipped
    for name, action in actions_performed.items():
        rt.add_record([name + " has been " + action])
    if macro_creates_dataset:
        rt.add_record(["Please refresh this page to see new datasets."])
    return rt
def run(self, progress_callback):
    # Retrieve macro parameters:
    is_dry_run = self.config.get("is_dry_run")
    keep_partitioned = self.config.get("keep_partitioned")
    keep_shared = self.config.get("keep_shared")
    logging.info("DRY RUN is set to {}".format(is_dry_run))
    logging.info("KEEP PARTITIONED is set to {}".format(keep_partitioned))
    logging.info("KEEP SHARED is set to {}".format(keep_shared))

    # Initialize macro result table:
    result_table = ResultTable()
    result_table.add_column("dataset", "dataset", "STRING")
    result_table.add_column("status", "status", "STRING")

    client = dataiku.api_client()
    if self.config.get("project_key", None):
        project = client.get_project(self.config.get("project_key"))
    else:
        project = client.get_project(self.project_key)

    all_datasets = project.list_datasets()
    all_recipes = project.list_recipes()

    # Build deduplicated lists of input/output datasets:
    input_datasets = []
    output_datasets = []
    for rcp in all_recipes:
        rcp_inputs_dict = rcp["inputs"]
        rcp_outputs_dict = rcp["outputs"]
        # CASE: no input dataset
        if rcp_inputs_dict:
            input_key = list(rcp_inputs_dict.keys())[0]
            rcp_inputs_list = [x["ref"] for x in rcp_inputs_dict[input_key]["items"]]
            input_datasets += rcp_inputs_list
        output_key = list(rcp_outputs_dict.keys())[0]
        rcp_outputs_list = [x["ref"] for x in rcp_outputs_dict[output_key]["items"]]
        # Append them to the overall output list:
        output_datasets += rcp_outputs_list

    # Deduplicate input/output lists:
    input_datasets = list(set(input_datasets))
    output_datasets = list(set(output_datasets))

    # Identify Flow inputs/outputs & add them to the result table:
    flow_inputs = [x for x in input_datasets if x not in output_datasets]
    for obj in flow_inputs:
        result_table.add_record([obj, "KEEP(INPUT)"])
    flow_outputs = [x for x in output_datasets if x not in input_datasets]
    for obj in flow_outputs:
        result_table.add_record([obj, "KEEP(OUTPUT)"])
    logging.info("Found {} FLOW INPUT datasets: {}".format(len(flow_inputs), flow_inputs))
    logging.info("Found {} FLOW OUTPUT datasets: {}".format(len(flow_outputs), flow_outputs))

    # Identify shared datasets:
    shared_objs = project.get_settings().settings["exposedObjects"]["objects"]
    shared_datasets = [x["localName"] for x in shared_objs if x["type"] == "DATASET"]
    logging.info("Found {} SHARED datasets: {}".format(len(shared_datasets), shared_datasets))

    # Identify partitioned (partd) datasets:
    is_partd = lambda x: len(x["partitioning"]["dimensions"]) > 0
    partd_datasets = [x["name"] for x in all_datasets if is_partd(x)]
    logging.info("Found {} PARTITIONED datasets: {}".format(len(partd_datasets), partd_datasets))

    # List all datasets to keep, potentially including shared & partd ones:
    to_keep = flow_inputs + flow_outputs
    if keep_partitioned:
        to_keep += partd_datasets
        # Add them to the result table:
        for obj in partd_datasets:
            result_table.add_record([obj, "KEEP(PARTITIONED)"])
    if keep_shared:
        to_keep += shared_datasets
        # Add them to the result table:
        for obj in shared_datasets:
            result_table.add_record([obj, "KEEP(SHARED)"])
    logging.info("Total of {} datasets to KEEP: {}".format(len(to_keep), to_keep))

    # Perform cleanup or simulate it (dry run):
    if not is_dry_run:
        for ds in all_datasets:
            ds_name = ds["name"]
            if ds_name not in to_keep:
                dataset = project.get_dataset(ds_name)
                logging.info("Clearing {}...".format(ds_name))
                dataset.clear()
    return result_table
def run(self, progress_callback):
    dip_home = os.environ['DIP_HOME']
    analysis_data = osp.join(dip_home, 'analysis-data')

    projects_sessions = {}
    projects_splits = {}
    analyses_sessions = {}
    analyses_splits = {}
    projects_analyses = {}

    if self.config.get('allProjects', False):
        projects = [project_key for project_key in os.listdir(analysis_data)]
    else:
        projects = [self.project_key]

    for project in projects:
        project_analysis_data = osp.join(analysis_data, project)
        project_sessions = 0
        project_splits = 0
        projects_analyses[project] = []
        if not osp.isdir(project_analysis_data):
            projects_sessions[project] = 0
            projects_splits[project] = 0
            continue
        for analysis in os.listdir(project_analysis_data):
            analysis_dir = osp.join(project_analysis_data, analysis)
            analysis_sessions = 0
            analysis_splits = 0
            projects_analyses[project].append(analysis)
            for mltask in os.listdir(analysis_dir):
                mltask_dir = osp.join(analysis_dir, mltask)
                sessions_dir = osp.join(mltask_dir, "sessions")
                splits_dir = osp.join(mltask_dir, "splits")
                if osp.isdir(sessions_dir):
                    analysis_sessions += cleanup.du(sessions_dir)
                if osp.isdir(splits_dir):
                    analysis_splits += cleanup.du(splits_dir)
            project_sessions += analysis_sessions
            project_splits += analysis_splits
            analyses_splits[(project, analysis)] = analysis_splits
            analyses_sessions[(project, analysis)] = analysis_sessions
        projects_sessions[project] = project_sessions
        projects_splits[project] = project_splits

    rt = ResultTable()
    rt.set_name("Analysis data used space")

    if self.config["granularity"] == "project":
        rt.add_column("project", "Project key", "STRING")
        rt.add_column("total", "Total space (MB)", "STRING")
        rt.add_column("sessions", "Sessions space (MB)", "STRING")
        rt.add_column("splits", "Splits space (MB)", "STRING")
        for project in projects:
            total = projects_sessions[project] + projects_splits[project]
            if total == 0:
                # Skip projects with no analysis data
                continue
            record = []
            record.append(project)
            record.append(total / 1024)
            record.append(projects_sessions[project] / 1024)
            record.append(projects_splits[project] / 1024)
            rt.add_record(record)
    else:
        rt.add_column("project", "Project key", "STRING")
        rt.add_column("analysis", "Analysis id", "STRING")
        rt.add_column("total", "Total space (MB)", "STRING")
        rt.add_column("sessions", "Sessions space (MB)", "STRING")
        rt.add_column("splits", "Splits space (MB)", "STRING")
        for project in projects:
            for analysis in projects_analyses[project]:
                record = []
                record.append(project)
                record.append(analysis)
                record.append((analyses_sessions[(project, analysis)] +
                               analyses_splits[(project, analysis)]) / 1024)
                record.append(analyses_sessions[(project, analysis)] / 1024)
                record.append(analyses_splits[(project, analysis)] / 1024)
                rt.add_record(record)
    return rt
def run(self, progress_callback):
    # Retrieve macro parameters:
    is_dry_run = self.config.get("is_dry_run")
    keep_partitioned = self.config.get("keep_partitioned")
    keep_shared = self.config.get("keep_shared")
    logging.info("DRY RUN is set to {}".format(is_dry_run))
    logging.info("KEEP PARTITIONED is set to {}".format(keep_partitioned))
    logging.info("KEEP SHARED is set to {}".format(keep_shared))

    # Initialize macro result table:
    result_table = ResultTable()
    result_table.add_column("dataset", "Dataset", "STRING")
    result_table.add_column("type", "Type", "STRING")
    result_table.add_column("action", "Action", "STRING")
    result_table.add_column("action_status", "Action Status", "STRING")
    action_status = "Not done (Dry run)" if is_dry_run else "Done"

    client = dataiku.api_client()
    if self.config.get("project_key", None):
        project = client.get_project(self.config.get("project_key"))
    else:
        project = client.get_project(self.project_key)

    manually_selected_datasets = self.config.get("datasets_to_exclude")
    all_datasets = project.list_datasets()
    all_recipes = project.list_recipes()

    # Build deduplicated lists of input/output datasets:
    input_datasets = []
    output_datasets = []
    for recipe in all_recipes:
        recipe_inputs_dict = recipe["inputs"]
        recipe_outputs_dict = recipe["outputs"]
        # CASE: no input dataset
        if recipe_inputs_dict:
            append_datasets_to_list(recipe_inputs_dict, input_datasets)
        append_datasets_to_list(recipe_outputs_dict, output_datasets)

    # Identify Flow inputs/outputs:
    flow_inputs = [dataset for dataset in input_datasets if dataset not in output_datasets]
    flow_outputs = [dataset for dataset in output_datasets if dataset not in input_datasets]
    logging.info("Found {} FLOW INPUT datasets: {}".format(len(flow_inputs), flow_inputs))
    logging.info("Found {} FLOW OUTPUT datasets: {}".format(len(flow_outputs), flow_outputs))

    # Identify excluded, standalone, intermediate, and partitioned datasets:
    excluded_datasets = []
    standalone_datasets = []
    intermediate_datasets = []
    partitioned_datasets = []
    is_partitioned = lambda dataset: len(dataset["partitioning"]["dimensions"]) > 0
    for dataset in all_datasets:
        if dataset["name"] in manually_selected_datasets:
            excluded_datasets.append(dataset["name"])
        if dataset["name"] not in input_datasets + output_datasets:
            standalone_datasets.append(dataset["name"])
        if dataset["name"] not in flow_inputs + flow_outputs + standalone_datasets:
            intermediate_datasets.append(dataset["name"])
        if is_partitioned(dataset):
            partitioned_datasets.append(dataset["name"])
    logging.info("Found {} EXCLUDED datasets: {}".format(len(excluded_datasets), excluded_datasets))
    logging.info("Found {} STANDALONE datasets: {}".format(len(standalone_datasets), standalone_datasets))
    logging.info("Found {} INTERMEDIATE datasets: {}".format(len(intermediate_datasets), intermediate_datasets))
    logging.info("Found {} PARTITIONED datasets: {}".format(len(partitioned_datasets), partitioned_datasets))

    # Identify shared datasets:
    shared_objects = project.get_settings().settings["exposedObjects"]["objects"]
    shared_datasets = [obj["localName"] for obj in shared_objects if obj["type"] == "DATASET"]
    logging.info("Found {} SHARED datasets: {}".format(len(shared_datasets), shared_datasets))

    # Add dataset types to the results list:
    results = []
    datasets = {
        "EXCLUDED": excluded_datasets,
        "STANDALONE": standalone_datasets,
        "INPUT": flow_inputs,
        "OUTPUT": flow_outputs,
        "INTERMEDIATE": intermediate_datasets,
        "SHARED": shared_datasets,
        "PARTITIONED": partitioned_datasets
    }
    for dataset_type, dataset_type_list in datasets.items():
        for dataset in dataset_type_list:
            results.append([dataset, dataset_type])

    # Identify which datasets should be kept:
    to_keep = excluded_datasets + standalone_datasets + flow_inputs + flow_outputs
    if keep_partitioned:
        to_keep += partitioned_datasets
    if keep_shared:
        to_keep += shared_datasets
    logging.info("Total of {} datasets to KEEP: {}".format(len(to_keep), to_keep))

    # Create a dataframe with all results:
    results_df = pd.DataFrame(results, columns=["Dataset", "Type"])
    results_grouped = results_df.groupby(["Dataset"])['Type'].apply(lambda x: ', '.join(x)).reset_index()
    results_grouped["Action"] = results_grouped["Dataset"].apply(lambda x: "KEEP" if x in to_keep else "CLEAR")
    results_grouped["Status"] = action_status
    results_grouped = results_grouped.sort_values(by=['Action', 'Type'])

    # Perform cleanup:
    to_clear = list(results_grouped["Dataset"][results_grouped['Action'] == "CLEAR"])
    logging.info("Total of {} datasets to CLEAR: {}".format(len(to_clear), to_clear))
    if not is_dry_run:
        for ds in to_clear:
            dataset = project.get_dataset(ds)
            logging.info("Clearing {}...".format(ds))
            dataset.clear()
        logging.info("Clearing {} datasets: done.".format(len(to_clear)))

    # Pass results to the result table:
    for index, row in results_grouped.iterrows():
        result_table.add_record(list(row))
    return result_table
def run(self, progress_callback):
    maximum_age = int(self.config.get("age", 15))
    maximum_timestamp = int(time.mktime(
        (datetime.datetime.now() - datetime.timedelta(days=maximum_age)).timetuple()))
    lines = int(self.config.get("lines", 5))
    do_it = bool(self.config.get("performDeletion", False))
    dip_home = os.environ['DIP_HOME']
    saved_models = osp.join(dip_home, 'saved_models')

    def truncate_file(path, rows):
        yourfile = pd.read_csv(path, nrows=rows)
        yourfile.to_csv(path, index=False)

    rt = ResultTable()
    rt.set_name("Saved models cleanup")
    rt.add_column("project", "Project key", "STRING")
    rt.add_column("saved_model_id", "Saved model id", "STRING")
    rt.add_column("saved_model_name", "Saved model name", "STRING")
    rt.add_column("total_size_before", "Total space before", "STRING")
    rt.add_column("total_size_after", "Total space after", "STRING")
    rt.add_column("kept_splits", "Kept splits", "STRING")
    rt.add_column("truncated_splits", "Truncated splits", "STRING")
    rt.add_column("reclaimed_size", "Reclaimed size", "STRING")

    grand_total_used = 0
    grand_total_reclaimed = 0
    grand_total_kept = 0
    grand_total_deleted = 0

    for project in cleanup.get_projects_to_consider(self.project_key, self.config):
        project_sm = osp.join(saved_models, project)
        if not osp.isdir(project_sm):
            continue
        for saved_model in os.listdir(project_sm):
            sm_dir = osp.join(project_sm, saved_model)
            versions_dir = osp.join(sm_dir, "versions")
            if not osp.isdir(versions_dir):
                continue

            kept_versions = 0
            deleted_versions = 0
            size_reclaimed = 0
            total_size_before = cleanup.du(sm_dir, size_unit="b")

            for version in os.listdir(versions_dir):
                version_dir = osp.join(versions_dir, version)
                if os.stat(version_dir).st_mtime < maximum_timestamp:
                    # This version needs to be cleaned
                    deleted_versions += 1
                    split_dir = osp.join(version_dir, "split")
                    if osp.isdir(split_dir):
                        for name in os.listdir(split_dir):
                            path = osp.join(split_dir, name)
                            ext = osp.splitext(path)[-1].lower()
                            if ext == ".csv":
                                if do_it:
                                    try:
                                        initial = os.stat(path).st_size
                                        truncate_file(path, lines)
                                        size_reclaimed += initial - os.stat(path).st_size
                                    except Exception as e:
                                        logging.getLogger().error("{}: {}".format(path, str(e)))
                                else:
                                    size_reclaimed += os.stat(path).st_size
                else:
                    kept_versions += 1

            total_size_after = cleanup.du(sm_dir, size_unit="b")

            record = []
            record.append(project)
            record.append(saved_model)
            record.append(saved_model)
            record.append(cleanup.format_size(total_size_before))
            record.append(cleanup.format_size(total_size_after))
            record.append(kept_versions)
            record.append(deleted_versions)
            record.append(cleanup.format_size(size_reclaimed))
            rt.add_record(record)

            grand_total_reclaimed += size_reclaimed
            grand_total_used += total_size_before
            grand_total_kept += kept_versions
            grand_total_deleted += deleted_versions

    rt.add_record([
        "Total", "-", "-",
        cleanup.format_size(grand_total_used),
        cleanup.format_size(grand_total_used - grand_total_reclaimed),
        grand_total_kept,
        grand_total_deleted,
        cleanup.format_size(grand_total_reclaimed)
    ])
    return rt
def run(self, progress_callback):
    dip_home = os.environ['DIP_HOME']
    saved_models = osp.join(dip_home, 'saved_models')

    rt = ResultTable()
    rt.set_name("Saved models used space")
    rt.add_column("project", "Project key", "STRING")
    rt.add_column("saved_model_id", "Saved model id", "STRING")
    rt.add_column("saved_model_name", "Saved model name", "STRING")
    rt.add_column("total", "Total space (MB)", "STRING")
    rt.add_column("splits", "Splits space (MB)", "STRING")
    rt.add_column("versions", "Number of versions", "STRING")

    if self.config.get('allProjects', False):
        projects = [project_key for project_key in os.listdir(saved_models)]
    else:
        projects = [self.project_key]

    for project in projects:
        project_sm = osp.join(saved_models, project)
        if not osp.isdir(project_sm):
            continue
        for saved_model in os.listdir(project_sm):
            sm_dir = osp.join(project_sm, saved_model)
            versions_dir = osp.join(sm_dir, "versions")
            if not osp.isdir(versions_dir):
                continue
            versions = 0
            total_splits = 0
            total = cleanup.du(sm_dir)
            for version in os.listdir(versions_dir):
                version_dir = osp.join(versions_dir, version)
                split_dir = osp.join(version_dir, "split")
                if osp.isdir(split_dir):
                    total_splits += cleanup.du(split_dir)
                versions += 1
            record = []
            record.append(project)
            record.append(saved_model)
            record.append(saved_model)
            record.append(total / 1024)
            record.append(total_splits / 1024)
            record.append(versions)
            rt.add_record(record)
    return rt
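# NOTE: several macros in this section rely on a shared `cleanup` module
# (du, format_size, get_projects_to_consider) that is not included here.
# A rough sketch of the disk-usage helpers, assuming `du` defaults to
# kilobytes and accepts size_unit="b" for bytes (signatures inferred from
# the call sites, not taken from the actual module):
import os
import os.path as osp

def du(folder, size_unit="kb"):
    total = 0
    for dirpath, dirnames, filenames in os.walk(folder):
        for name in filenames:
            total += os.path.getsize(osp.join(dirpath, name))
    return total if size_unit == "b" else total // 1024

def format_size(size_bytes):
    for unit in ["b", "KB", "MB", "GB", "TB"]:
        if size_bytes < 1024:
            return "%.1f %s" % (size_bytes, unit)
        size_bytes = size_bytes / 1024.0
    return "%.1f PB" % size_bytes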
def run(self, progress_callback):
    dip_home = os.environ["DIP_HOME"]
    config_home = os.path.join(dip_home, "config")
    analysis_data_folder = os.path.join(dip_home, "analysis-data")

    rt = ResultTable()
    rt.set_name("Analysis data")
    rt.add_column("projectKey", "Project", "STRING")
    rt.add_column("dataset", "Dataset", "STRING")
    rt.add_column("analysis", "Analysis", "STRING")
    rt.add_column("model", "Model", "STRING")
    rt.add_column("used", "Disk space used", "STRING")
    rt.add_column("path", "Path", "STRING")

    for project_key in os.listdir(analysis_data_folder):
        analysis_data_project_folder = os.path.join(analysis_data_folder, project_key)
        if not os.path.isdir(analysis_data_project_folder):
            continue
        project_folder = os.path.join(config_home, "projects", project_key)
        orphaned_project = not os.path.isdir(project_folder)

        for analysis_id in os.listdir(analysis_data_project_folder):
            analysis_data_analysis_folder = os.path.join(analysis_data_project_folder, analysis_id)
            if not os.path.isdir(analysis_data_analysis_folder):
                continue
            analysis_folder = os.path.join(project_folder, "analysis", analysis_id)
            orphaned_analysis = not os.path.isdir(analysis_folder) if not orphaned_project else None

            total_used = 0
            model_records = []
            for model_id in os.listdir(analysis_data_analysis_folder):
                analysis_data_model_folder = os.path.join(analysis_data_analysis_folder, model_id)
                if not os.path.isdir(analysis_data_model_folder):
                    continue
                model_folder = os.path.join(analysis_folder, "ml", model_id)
                orphaned_model = not os.path.isdir(model_folder) if not orphaned_project and not orphaned_analysis else None

                try:
                    used = self.disk_space_used(analysis_data_model_folder)
                    total_used += used
                except:
                    used = None

                try:
                    core_params_file = os.path.join(analysis_folder, "core_params.json")
                    if os.path.isfile(core_params_file):
                        with open(core_params_file, 'r') as f:
                            core_params = json.load(f)
                        dataset_name = core_params.get('inputDatasetSmartName', None)
                        analysis_name = core_params.get('name', None)
                    else:
                        dataset_name = None
                        analysis_name = None
                except:
                    dataset_name = None
                    analysis_name = None

                try:
                    model_params_file = os.path.join(model_folder, "params.json")
                    if os.path.isfile(model_params_file):
                        with open(model_params_file, 'r') as f:
                            model_params = json.load(f)
                        model_name = model_params.get('name', None)
                    else:
                        model_name = None
                except:
                    model_name = None

                record = []
                # 0: project
                if orphaned_project:
                    record.append('(orphaned)')
                else:
                    record.append(project_key)
                # 1: dataset
                record.append(dataset_name)
                # 2: analysis
                if orphaned_analysis:
                    record.append('(orphaned)')
                elif analysis_name is not None:
                    record.append(analysis_name)
                else:
                    record.append(analysis_id)
                # 3: model
                if orphaned_model:
                    record.append('(orphaned)')
                elif model_name is not None:
                    record.append(model_name)
                else:
                    record.append(model_id)
                # 4: disk space used
                if used is None:
                    record.append('N/A')
                elif used < 1024:
                    record.append('%s b' % used)
                elif used < 1024 * 1024:
                    record.append('%s Kb' % int(used / 1024))
                elif used < 1024 * 1024 * 1024:
                    record.append('%s Mb' % int(used / (1024 * 1024)))
                else:
                    record.append('%s Gb' % int(used / (1024 * 1024 * 1024)))
                # 5: path
                record.append(analysis_data_model_folder)
                model_records.append(record)

            # Summary record for the analysis
            record = []
            # 0: project
            if orphaned_project:
                record.append('(orphaned)')
            else:
                record.append(project_key)
            # 1: dataset
            record.append(dataset_name)
            # 2: analysis
            if orphaned_analysis:
                record.append('(orphaned)')
            elif analysis_name is not None:
                record.append(analysis_name)
            else:
                record.append(analysis_id)
            # 3: model (None marks this as an analysis-level row)
            record.append(None)
            # 4: total disk space used
            if total_used is None:
                record.append('N/A')
            elif total_used < 1024:
                record.append('%s b' % total_used)
            elif total_used < 1024 * 1024:
                record.append('%s Kb' % int(total_used / 1024))
            elif total_used < 1024 * 1024 * 1024:
                record.append('%s Mb' % int(total_used / (1024 * 1024)))
            else:
                record.append('%s Gb' % int(total_used / (1024 * 1024 * 1024)))
            # 5: path
            record.append(analysis_data_analysis_folder)

            rt.add_record(record)
            for model_record in model_records:
                rt.add_record(model_record)

    table_rows = []
    idx = 0
    for record in rt.records:
        analysis_row = record[3] is None
        row_cells = []
        for i in range(0, 6):
            if analysis_row and i == 3:
                continue
            value = record[i]
            if value is not None:
                if i == 5:
                    show_path_var = "showPath%s" % idx
                    row_cells.append('<td class="mx-textellipsis" title="%s"><a class="mx-link-nodecoration" href="" ng-click="%s = !%s"><i class="icon-eye"></i></a></td>' % (value, show_path_var, show_path_var))
                else:
                    row_cells.append('<td class="mx-textellipsis" title="%s" %s>%s</td>' % (value, (' colspan="2"' if analysis_row and i == 2 else ''), value))
            else:
                row_cells.append('<td></td>')
        if analysis_row:
            # analysis row
            table_rows.append('<tr style="font-weight: bold;">%s</tr>' % (''.join(row_cells)))
        else:
            # model row
            table_rows.append('<tr>%s</tr>' % (''.join(row_cells)))
        path_cell_style = 'white-space: nowrap; padding-left: 20px; font-family: monospace; font-size: 11px;'
        if analysis_row:
            path_cell_style = path_cell_style + '; font-weight: bold'
        table_rows.append('<tr ng-if="%s"><td colspan="6" title="%s" style="%s">%s</td></tr>' % (show_path_var, record[5], path_cell_style, record[5]))
        idx += 1

    html = '<div>'
    table_header = '<th>%s</th>' % ('</th><th>'.join(['Project', 'Dataset', 'Analysis', 'Model', 'Disk usage', 'Path']))
    html += '<table class="table table-striped" style="table-layout: fixed;">%s%s</table>' % (table_header, ''.join(table_rows))
    html += '</div>'
    return html
    # return rt
def run(self, progress_callback):
    # Get project and folder containing the Excel files
    client = dataiku.api_client()
    project = client.get_project(self.project_key)
    folder_id = self.config.get("model_folder_id")
    folder = dataiku.Folder(folder_id, project_key=self.project_key)
    folder_path = folder.get_path()

    # List files in folder
    files_list = os.listdir(folder_path)

    # List the datasets in the project
    datasets_in_project = [dataset['name'] for dataset in project.list_datasets()]

    # Actions performed
    actions_performed = dict()

    for my_file in files_list:
        ## Get file path
        file_path = os.path.join(folder_path, my_file)
        ## Get Excel file and list its sheets
        sheets_names = pd.ExcelFile(file_path).sheet_names
        for sheet in sheets_names:
            ### Rename sheet to "file_sheet"
            ss = openpyxl.load_workbook(file_path)
            ss_sheet = ss.get_sheet_by_name(sheet)
            if not my_file.split(".")[0] in ss_sheet.title:
                ss_sheet.title = my_file.split(".")[0] + "_" + sheet
                ss.save(file_path)

            ## If the dataset already exists, delete and replace it
            actions_performed[ss_sheet.title] = "created"
            if ss_sheet.title in datasets_in_project:
                project.get_dataset(ss_sheet.title).delete()
                actions_performed[ss_sheet.title] = "replaced"

            ### Create dataset from Excel sheet
            project.create_dataset(ss_sheet.title,
                                   'FilesInFolder',
                                   params={'folderSmartId': folder_id, 'path': my_file},
                                   formatType='excel',
                                   formatParams={"xlsx": True,
                                                 "sheets": "*" + ss_sheet.title,
                                                 'parseHeaderRow': True})

    # Output table
    from dataiku.runnables import Runnable, ResultTable
    rt = ResultTable()
    rt.add_column("actions", "Actions", "STRING")

    # Actions: "dataset" has been created or replaced
    for name, action in actions_performed.items():
        rt.add_record([name + " has been " + action])
    return rt