def create_resulttable(self):
        """
        Transforms the log dataframe into a ResultTable.
        """
        result_table = ResultTable()

        for column_name in self.log_df.keys():
            result_table.add_column(column_name, str.capitalize(column_name), "STRING")
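        # itertuples() yields the dataframe index as the first element of each tuple; drop it so the record lines up with the declared columns.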
        for log_row in self.log_df.itertuples():
            result_table.add_record(list(log_row)[1:])
        return result_table

    def search(self, search_term):
        """
        Search the installed Helm chart repositories for charts that can be installed.
        """
        rt = ResultTable()
        cmd = [self.helm, "search", "repo"]
        
        if search_term:
            cmd.append(search_term)
            
        process = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True
        )
        stdout, stderr = process.communicate()
        
        if stderr:
            raise Exception("Exception searching repos: {}".format(stderr))
            
        rows = stdout.split("\n")
        result = []
        
        rt.add_column("name", "Name", "STRING")
        rt.add_column("chartVersion", "Chart Version", "STRING")
        rt.add_column("appVersion", "App Version", "STRING")
        rt.add_column("description", "Description", "STRING")

        # Skip the header row; each remaining non-empty line holds tab-separated fields.
        for row in rows[1:]:
            if not row.strip():
                continue
            record = [field.strip() for field in row.split("\t")]
            rt.add_record(record)

        return rt
Example #3
    def run(self, progress_callback):
        dip_home = os.environ['DIP_HOME']
        exports_folder = os.path.join(dip_home, 'exports')

        simulate = bool(self.config.get("simulate", False))

        maximum_age = int(self.config.get("age", 15))
        maximum_timestamp = int(
            time.mktime((datetime.datetime.now() -
                         datetime.timedelta(days=maximum_age)).timetuple()))
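        # Exports whose mtime is older than this epoch timestamp (in seconds) are flagged for deletion.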

        to_delete = []
        for export_id in os.listdir(exports_folder):
            if os.stat(os.path.join(exports_folder,
                                    export_id)).st_mtime < maximum_timestamp:
                to_delete.append(export_id)

        def folder_size(folder):
            total_size = 0
            for dirpath, dirnames, filenames in os.walk(folder):
                for f in filenames:
                    fp = os.path.join(dirpath, f)
                    total_size += os.path.getsize(fp)
            return total_size

        rt = ResultTable()
        rt.set_name("Removed exports")

        rt.add_column("id", "Export identifier", "STRING")
        rt.add_column("age", "Age (days)", "STRING")
        rt.add_column("size", "Size (KB)", "STRING")

        for export_id in to_delete:
            export_folder = os.path.join(exports_folder, export_id)
            size = folder_size(export_folder)

            mtime = os.stat(export_folder).st_mtime
            age = (time.mktime(datetime.datetime.now().timetuple()) -
                   mtime) / 86400

            if not simulate:
                shutil.rmtree(export_folder)

            rt.add_record([export_id, int(age), size / 1024])

        return rt
Example #4
    def run(self, progress_callback):
        to_delete = []

        # As `docker images` sorts images by creation date, we only have to keep the most recent one built for DSS.
        # Sample cmd: $ docker images 'dku-exec-base-notattributed' --format '{{.Repository}}\t{{.Tag}}\t{{.ID}}\t{{.CreatedAt}}'
        if self.config['rm_dss_images']:
            cmd = self._get_docker_cmd('images', self.config['base_image_name'], '--format', '{{.Repository}}\t{{.Tag}}\t{{.ID}}\t{{.CreatedAt}}')
            p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True)
            is_most_recent = True
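            # Read the command output line by line; each line carries four tab-separated fields (repository, tag, id, creation date).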
            for line in iter(p.stdout.readline, ''):
                elements = line.split('\t')
                if len(elements) != 4:
                    continue

                if is_most_recent:
                    is_most_recent = False
                else:
                    to_delete.append({'repo': elements[0], 'tag': elements[1], 'id': elements[2], 'createdAt': elements[3]})

        # Dangling images, that could be wiped with `docker image prune` (but would need the docker daemon to be up-to-date)
        # Sample cmd: $ docker images -f 'dangling=true' --format '{{.Repository}}\t{{.Tag}}\t{{.ID}}\t{{.CreatedAt}}'
        if self.config['rm_none_images']:
            cmd = self._get_docker_cmd('images', '-f', 'dangling=true', '--format', '{{.Repository}}\t{{.Tag}}\t{{.ID}}\t{{.CreatedAt}}')
            p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True)
            for line in iter(p.stdout.readline, ''):
                elements = line.split('\t')
                if len(elements) != 4:
                    continue

                to_delete.append({'repo': elements[0], 'tag': elements[1], 'id': elements[2], 'createdAt': elements[3]})

        if self.config['perform_deletion']:
            rmi_args = [elt['id'] for elt in to_delete]
            print('Will delete these images: ' + str(rmi_args))
            if self.config['force_rm']:
                rmi_args.insert(0, '--force')
            cmd = self._get_docker_cmd('rmi', *rmi_args)
            subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

        rt = ResultTable()
        rt.set_name("Removed containers")

        rt.add_column("repo", "Repository", "STRING")
        rt.add_column("tag", "Tag", "STRING")
        rt.add_column("id", "Identifier", "STRING")
        rt.add_column("createdAt", "Created at", "STRING")

        for elt in to_delete:
            rt.add_record([elt['repo'], elt['tag'], elt['id'], elt['createdAt']])

        return rt
Example #5
    def run(self, progress_callback):
        rt = ResultTable()
        rt.set_name("List datasets on connection")

        if self.all_projects:
            rt.add_column("dataset", "Dataset", "FQ_DATASET_WITH_TYPE")
        else:
            rt.add_column("dataset", "Dataset", "LOCAL_DATASET_WITH_TYPE")

        rt.add_column("table", "Table (SQL only)", "STRING")
        rt.add_column("schema", "Schema (SQL only)", "STRING")
        rt.add_column("tags", "Tags", "STRING_LIST")

        if self.config.get("all_projects", False) == True:
            for project_key in self.client.list_project_keys():
                project = self.client.get_project(project_key)
                self.run_for_project(rt, project)
        else:
            project = self.client.get_project(self.project_key)
            self.run_for_project(rt, project)
        return rt
Example #6
    def run(self, progress_callback):
        included = self.config.get('includedTags', '')
        excluded = self.config.get('excludedTags', '')
        included_or = self.config.get('includedTagsCombine', 'OR') == 'OR'
        excluded_or = self.config.get('excludedTagsCombine', 'OR') == 'OR'
        included_set = set(included.split(','))
        excluded_set = set(excluded.split(','))

        project = self.client.get_project(self.project_key)
        to_delete = []
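        # apply() is assumed to be a helper defined elsewhere in this plugin: it checks whether the dataset's tags match the given tag set, combining matches with OR or AND.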
        for dataset in project.list_datasets():
            tags = dataset.get('tags', [])
            included = apply(tags, included_set, included_or)
            excluded = apply(tags, excluded_set, excluded_or)
            if included and not excluded:
                to_delete.append(dataset)

        rt = ResultTable()
        rt.set_name("Delete datasets by tag")

        simulate = self.config.get('simulate', True)

        rt.add_column("dataset", simulate and "Dataset to delete" or "Deleted dataset", "LOCAL_DATASET_WITH_TYPE")
        if not simulate:
            rt.add_column("result", "Result", "STRING")

        if not simulate:
            for dataset in to_delete:
                try:
                    project.get_dataset(dataset.get('name')).delete(drop_data=self.config.get("drop_data", True))
                    rt.add_record(["%s:%s" % (dataset.get("type"), dataset.get("name")), "SUCCESS"])
                except Exception as e:
                    rt.add_record(["%s:%s" % (dataset.get("type"), dataset.get("name")), "FAILED: %s" % str(e)])
            return rt
        else:
            for dataset in to_delete:
                rt.add_record(["%s:%s" % (dataset.get("type"), dataset.get("name"))])
            return rt
Example #7
    def run(self, progress_callback):
        client = DSSClient('http://localhost:%s' % os.environ.get('DKU_BACKEND_PORT'), internal_ticket = os.environ.get('DKU_API_TICKET'))

        rt = ResultTable()
        rt.set_name("Killed sessions")

        rt.add_column("session_id", "Session id", "STRING")
        rt.add_column("notebook_project", "Notebook project key", "STRING")
        rt.add_column("notebook_project", "Notebook name", "STRING")

        simulate = self.config.get('simulate', True)

        max_idle = float(self.config.get('maxIdleTimeHours', 0))
        max_age = float(self.config.get('maxSessionAgeHours', 0))

        dont_kill_busy = self.config.get('dontKillBusyKernels', True)
        dont_kill_connected = self.config.get('dontKillConnectedKernels', True)

        now = get_epochtime_ms()
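        # Timestamps are in milliseconds, hence the * 1000 * 3600 factors when comparing against the hour-based thresholds.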

        logging.info("Listing notebooks max_age_ms=%s max_idle_ms=%s" % (max_age * 1000 * 3600, max_idle * 1000 * 3600))

        for nbk in client.list_running_notebooks():
            state = nbk.get_state()

            for session in state["activeSessions"]:
                logging.info("Check kill of %s session_age=%s kernel_idle=%s" % (
                    session, (now - session["sessionStartTime"]), (now - session["kernelLastActivityTime"])))

                kill = False

                if max_age > 0 and (now - session["sessionStartTime"]) > max_age * 1000 * 3600:
                    logging.info( " -> Will kill on max_age")
                    kill = True

                if max_idle > 0 and (now - session["kernelLastActivityTime"]) > max_idle * 1000 * 3600:
                    logging.info( " -> Will kill on max_idle")
                    kill = True

                if dont_kill_busy and session["kernelExecutionState"] == "busy":
                    logging.info(" -> Don't kill (busy)")
                    kill = False

                if dont_kill_connected and session["kernelConnections"] > 0:
                    logging.info(" -> Don't kill (connected)")
                    kill = False

                if kill:
                    logging.info("Unloading session %s" % session["sessionId"])
                    rt.add_record([session["sessionId"], session.get("projectKey", "?"), session.get("notebookName", "?")])

                    if not simulate:
                        nbk.unload(session["sessionId"])
                else:
                    logging.info("Don't kill %s" % session["sessionId"])
        return rt
Example #8
import dataiku, logging, dku_dataproc
from dataiku.runnables import Runnable, ResultTable
from gce_client import DataProcClient

rt = ResultTable()
rt.add_column("node_type", "Node type", "STRING")
rt.add_column("machine_type", "Machine type", "STRING")
rt.add_column("machine_private_ip", "Private IP", "STRING")
rt.add_column("is_preemptible", "Pre-emptible VM?", "STRING")
rt.add_column("status", "Status", "STRING")

logging.basicConfig(format='%(asctime)s [%(levelname)s] %(message)s', level=logging.INFO)
logging.getLogger().setLevel(logging.INFO)

class MyRunnable(Runnable):
    def __init__(self, project_key, config, plugin_config):
        self.project_key = project_key
        self.config = config
        self.plugin_config = plugin_config
        
    def get_progress_target(self):
        return None

    def run(self, progress_callback):
        dss_cluster = dataiku.api_client().get_cluster(self.config["dss_cluster_id"])
        settings = dss_cluster.get_settings()
        (client, cluster_name) = dku_dataproc.get_client_and_wait(settings)
        computeClient = client.forkComputeClient()
        clusterBody = client.getDataprocClusterByName(cluster_name)

        logging.info("retrieving master instance")
Example #9
    def run(self, progress_callback):
        maximum_age = int(self.config.get("age",  15))  
        maximum_timestamp = int(time.mktime((datetime.datetime.now() - datetime.timedelta(days=maximum_age)).timetuple()))
        lines       = int(self.config.get("lines", 5))
        orphans_only = bool(self.config.get("orphansOnly", False))
        do_it = bool(self.config.get("performDeletion", False))
        dip_home = os.environ['DIP_HOME']
        config_home = os.path.join(dip_home, "config")
        analysis_data_folder = osp.join(dip_home, "analysis-data")

        def truncate_file(path, rows):
            yourfile = pd.read_csv(path, nrows = rows)
            yourfile.to_csv(path, index = False)

        rt = ResultTable()
        rt.set_name("Saved models cleanup")
        rt.add_column("project", "Project key", "STRING")
        rt.add_column("analysis", "Analysis", "STRING")
        rt.add_column("dataset", "Dataset", "STRING")
        rt.add_column("model", "Model", "STRING")
        rt.add_column("total_size_before", "Total space before", "STRING")
        rt.add_column("total_size_after", "Total space after", "STRING")
        rt.add_column("kept_splits", "Kept splits", "STRING")
        rt.add_column("truncated_splits", "Truncated splits", "STRING")
        rt.add_column("reclaimed_size", "Reclaimed size", "STRING")
        
        grand_total_used = 0
        grand_total_reclaimed = 0
        grand_total_kept = 0
        grand_total_deleted = 0
        for project_key in cleanup.get_projects_to_consider(self.project_key, self.config):
            analysis_data_project_folder = osp.join(analysis_data_folder, project_key)
            if not osp.isdir(analysis_data_project_folder):
                continue
            project_folder = os.path.join(config_home, "projects", project_key)
            
            for analysis_id in os.listdir(analysis_data_project_folder):
                analysis_data_analysis_folder = osp.join(analysis_data_project_folder,analysis_id)
                if not osp.isdir(analysis_data_analysis_folder):
                    continue

                analysis_folder = os.path.join(project_folder, "analysis", analysis_id)
                total_used = 0
                total_reclaimed = 0
                total_kept = 0
                total_deleted = 0
                model_records = []
                dataset_name = None
                analysis_name = None
                try:
                    core_params_file = os.path.join(analysis_folder, "core_params.json")
                    if os.path.isfile(core_params_file):
                        with open(core_params_file, 'r') as f:
                            core_params = json.load(f)
                            dataset_name = core_params.get('inputDatasetSmartName', None)
                            analysis_name = core_params.get('name', None)
                except Exception:
                    pass

                model_ids = []
                for model_id in os.listdir(analysis_data_analysis_folder):
                    analysis_data_model_folder = os.path.join(analysis_data_analysis_folder, model_id)
                    if not osp.isdir(analysis_data_model_folder):
                        continue

                    model_folder = os.path.join(analysis_folder, "ml", model_id)
                    used = 0
                    reclaimed = 0
                    kept = 0
                    deleted = 0
                    try:
                        used = cleanup.du(analysis_data_model_folder,size_unit="b")
                    except Exception:
                        pass

                    model_name = None
                    try:
                        model_params_file = os.path.join(model_folder, "params.json")
                        if os.path.isfile(model_params_file):
                            with open(model_params_file, 'r') as f:
                                model_params = json.load(f)
                            model_name = model_params.get('name', None)
                    except:
                        pass
                    
                    splits_dates = {}
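                    # splits_dates maps each split id to the mtime of the most recent session that referenced it.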

                    # Scan sessions to find out split usage
                    sessions_folder = os.path.join(analysis_data_model_folder,"sessions")
                    if osp.isdir(sessions_folder):
                        for session in os.listdir(sessions_folder):
                            split_ref_file = osp.join(sessions_folder, session, "split_ref.json")
                            if not osp.isfile(split_ref_file):
                                continue
                            session_timestamp = os.stat(osp.join(sessions_folder, session)).st_mtime
                            split_ref = None
                            with open(split_ref_file, 'r') as f:
                                split_ref = json.load(f).get("splitInstanceId",None)
                            if split_ref is not None and splits_dates.get(split_ref,0) < session_timestamp:
                                splits_dates[split_ref] = session_timestamp
                          
                    # Check it against the actual splits
                    splits_folder = os.path.join(analysis_data_model_folder,"splits")
                    if osp.isdir(splits_folder):
                        for split in glob.glob(osp.join(splits_folder,"*.json")):
                            split_name, _ = osp.splitext(split)
                            split_short_name = osp.basename(split_name)
                            split_date = splits_dates.get(split_short_name, None)
                            if split_date is None or (split_date < maximum_timestamp and not orphans_only):
                                deleted += 1
                                split_data = {}
                                with open(split, 'r') as f:
                                    split_data = json.load(f)
                                for split_data_filename in [split_data.get("testPath",None), split_data.get("trainPath",None)]:
                                    if split_data_filename is None:
                                        continue
                                    split_data_file = osp.join(splits_folder,split_data_filename)
                                    _, split_data_extension = osp.splitext(split_data_filename)
                                    if osp.isfile(split_data_file):
                                        if do_it:
                                            if split_date is None:
                                                reclaimed = os.stat(split_data_file).st_size 
                                                os.unlink(split_data_file)
                                            else:
                                                size_before = os.stat(split_data_file).st_size 
                                                try:
                                                    data_file = pd.read_csv(split_data_file, nrows = lines)
                                                    data_file.to_csv(split_data_file, index = False, compression="gzip" if split_data_extension == ".gz" else None)
                                                except Exception as e:
                                                    logging.getLogger().error("{}: {}".format(split_data_file,str(e)))
                                                reclaimed = size_before - os.stat(split_data_file).st_size 
                                            pass
                                        else:
                                            # Simulation: report the size that would be reclaimed
                                            reclaimed = os.stat(split_data_file).st_size
                                if do_it and split_date is None:
                                    os.unlink(split)
                                    pass
                            else:
                                kept += 1

                    total_reclaimed += reclaimed
                    total_used += used
                    total_kept += kept
                    total_deleted += deleted
                    
                    model_records.append([
                        project_key,
                        analysis_name,
                        dataset_name,
                        model_name,
                        cleanup.format_size(used), 
                        cleanup.format_size(used-reclaimed),
                        kept,
                        deleted,
                        cleanup.format_size(reclaimed)
                        ])

                rt.add_record([
                    project_key,
                    analysis_name,
                    dataset_name,
                    "Total all models",
                    cleanup.format_size(total_used), 
                    cleanup.format_size(total_used-total_reclaimed),
                    total_kept,
                    total_deleted,
                    cleanup.format_size(total_reclaimed)
                    ])
                for record in model_records:
                    rt.add_record(record)

                grand_total_reclaimed += total_reclaimed
                grand_total_used += total_used
                grand_total_kept += total_kept
                grand_total_deleted += total_deleted

        rt.add_record([
            "Total used",
            "-",
            "-",
            "-",
            cleanup.format_size(grand_total_used), 
            cleanup.format_size(grand_total_used-grand_total_reclaimed),
            grand_total_kept,
            grand_total_deleted,
            cleanup.format_size(grand_total_reclaimed)
            ])

        return rt
Example #10
    def run(self, progress_callback):

        def update_percent(percent, last_update_time):
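            # Throttle progress reporting: only invoke progress_callback if at least 3 seconds have passed since the last update.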
            new_time = time.time()
            if (new_time - last_update_time) > 3:
                progress_callback(percent)
                return new_time
            else:
                return last_update_time

        # Get project and folder containing the Excel files
        client = dataiku.api_client()
        project = client.get_project(self.project_key)

        folder_id = self.config.get("model_folder_id")
        overwrite = self.config.get("overwrite", False)

        folder = dataiku.Folder(folder_id, project_key=self.project_key)
        folder_path = folder.get_path()

        macro_creates_dataset = False # A boolean used to provide an informative message to the user when the macro creates a dataset
        # List files in folder and get path
        files_list = os.listdir(folder_path)

        # List the datasets in the project (fetch the list once)
        datasets_in_project = [dataset['name'] for dataset in project.list_datasets()]
        
        # Actions performed
        actions_performed = dict()
        num_files = len(files_list)

        update_time = time.time()
        for file_index, my_file in enumerate(files_list):
            
            ## Get file path
            file_path = os.path.join(folder_path, my_file)
            
            ## Get Excel file and load in a pandas dataframe
            sheets_names = pd.ExcelFile(file_path).sheet_names
            for sheet in sheets_names:
                ### Rename sheets by "file_sheet"
                ss=openpyxl.load_workbook(file_path)
                ss_sheet = ss.get_sheet_by_name(sheet)
                title = ss_sheet.title
                
                if not my_file.split(".")[0] in title:
                    title = '_'.join((my_file.split(".")[0] + "_" + sheet).split())
                
                title = '_'.join(title.split())
                title = title.replace(')','')
                title = title.replace('(','')
                
                create_dataset = True
                if title in datasets_in_project:
                    if overwrite:
                        project.get_dataset(title).delete()
                        actions_performed[title] = "replaced"
                    else:
                        create_dataset = False
                        actions_performed[title] = "skipped (already exists)"
                else:
                    actions_performed[title] = "created"
                    macro_creates_dataset = True
                if create_dataset:
                    dataset = project.create_dataset(title
                                    ,'FilesInFolder'
                                    , params={'folderSmartId': folder_id,
                                              'filesSelectionRules': {'mode': 'EXPLICIT_SELECT_FILES', 
                                                                     'explicitFiles': [my_file]}}
                                    , formatType='excel'
                                    , formatParams={"xlsx":True, "sheets":"*"+ss_sheet.title,'parseHeaderRow': True})
                    
                    df = pd.read_excel(file_path, sheet_name=ss_sheet.title, nrows=1000)
                    dataset.set_schema({'columns': [{'name': column, 'type': 'string'} for column, column_type in df.dtypes.items()]})

                percent = 100*float(file_index+1)/num_files
                update_time = update_percent(percent, update_time)

        # Output table
        rt = ResultTable()
        rt.add_column("actions", "Actions", "STRING")

        # Actions : "dataset" has been created or replaced
        for dataset_name, action in actions_performed.items():
            rt.add_record([dataset_name + " has been " + action])
        
        if macro_creates_dataset:
            rt.add_record(["Please refresh this page to see new datasets."])

        return rt
Example #11
    def run(self, progress_callback):

        # Retrieve macro parameters:
        is_dry_run = self.config.get("is_dry_run")
        keep_partitioned = self.config.get("keep_partitioned")
        keep_shared = self.config.get("keep_shared")
        logging.info("DRY RUN is set to {}".format(str(is_dry_run)))
        logging.info("KEEP PARTITIONED is set to {}".format(
            str(keep_partitioned)))
        logging.info("KEEP SHARED is set to {}".format(str(keep_shared)))

        # Initialize macro result table:
        result_table = ResultTable()
        result_table.add_column("dataset", "dataset", "STRING")
        result_table.add_column("status", "status", "STRING")

        client = dataiku.api_client()
        if self.config.get("project_key", None):
            project = client.get_project(self.config.get("project_key"))
        else:
            project = client.get_project(self.project_key)

        all_datasets = project.list_datasets()
        all_recipes = project.list_recipes()

        # Build deduplicated lists of input/output datasets:
        input_datasets = []
        output_datasets = []
        for rcp in all_recipes:
            rcp_inputs_dict = rcp["inputs"]
            rcp_outputs_dict = rcp["outputs"]
            # CASE: no input dataset
            if rcp_inputs_dict:
                input_key = list(rcp_inputs_dict.keys())[0]
                rcp_inputs_list = [
                    x["ref"] for x in rcp_inputs_dict[input_key]["items"]
                ]
                input_datasets += rcp_inputs_list
            output_key = list(rcp_outputs_dict.keys())[0]
            rcp_outputs_list = [
                x["ref"] for x in rcp_outputs_dict[output_key]["items"]
            ]
            # Append them to the overall output list:
            output_datasets += rcp_outputs_list
        # Deduplicate input/output lists:
        input_datasets = list(set(input_datasets))
        output_datasets = list(set(output_datasets))

        # Identify Flow input/outputs & add them to result table:
        flow_inputs = [x for x in input_datasets if x not in output_datasets]
        for obj in flow_inputs:
            result_table.add_record([obj, "KEEP(INPUT)"])
        flow_outputs = [x for x in output_datasets if x not in input_datasets]
        for obj in flow_outputs:
            result_table.add_record([obj, "KEEP(OUTPUT)"])
        logging.info("Found {} FLOW INPUT datasets: {}".format(
            str(len(flow_inputs)), str(flow_inputs)))
        logging.info("Found {} FLOW OUTPUT datasets: {}".format(
            str(len(flow_outputs)), str(flow_outputs)))

        # Identify shared datasets:
        shared_objs = project.get_settings(
        ).settings["exposedObjects"]["objects"]
        shared_datasets = [
            x["localName"] for x in shared_objs if x["type"] == "DATASET"
        ]
        logging.info("Found {} SHARED datasets: {}".format(
            str(len(shared_datasets)), str(shared_datasets)))

        # Identify partitioned (partd) datasets:
        is_partd = lambda x: len(x["partitioning"]["dimensions"]) > 0
        partd_datasets = [x["name"] for x in all_datasets if is_partd(x)]
        logging.info("Found {} PARTITIONED datasets: {}".format(
            str(len(partd_datasets)), str(partd_datasets)))

        # List all datasets to keep, potentially including shared & partd ones:
        to_keep = flow_inputs + flow_outputs
        if keep_partitioned:
            to_keep += partd_datasets
            # Add them to result table:
            for obj in partd_datasets:
                result_table.add_record([obj, "KEEP(PARTITIONED)"])
        if keep_shared:
            to_keep += shared_datasets
            # Add them to result table:
            for obj in shared_datasets:
                result_table.add_record([obj, "KEEP(SHARED)"])
        logging.info("Total of {} datasets to KEEP: {}".format(
            str(len(to_keep)), str(to_keep)))

        # Perform cleanup or simulate it (dry run):
        if not is_dry_run:
            for ds in all_datasets:
                ds_name = ds["name"]
                if ds_name not in to_keep:
                    dataset = project.get_dataset(ds_name)
                    logging.info("Clearing {}...".format(ds_name))
                    dataset.clear()

        return result_table
Example #12
    def run(self, progress_callback):
        dip_home = os.environ['DIP_HOME']
        analysis_data = osp.join(dip_home, 'analysis-data')

        projects_sessions = {}
        projects_splits = {}
        analyses_sessions = {}
        analyses_splits = {}
        projects_analyses = {}

        if self.config.get('allProjects', False):
            projects = os.listdir(analysis_data)
        else:
            projects = [self.project_key]

        for project in projects:
            project_analysis_data = osp.join(analysis_data, project)
            project_sessions = 0
            project_splits = 0
            projects_analyses[project] = []

            if not osp.isdir(project_analysis_data):
                projects_sessions[project] = 0
                projects_splits[project] = 0
                continue

            for analysis in os.listdir(project_analysis_data):
                analysis_dir = osp.join(project_analysis_data, analysis)
                analysis_sessions = 0
                analysis_splits = 0
                projects_analyses[project].append(analysis)

                for mltask in os.listdir(analysis_dir):
                    mltask_dir = osp.join(analysis_dir, mltask)
                    sessions_dir = osp.join(mltask_dir, "sessions")
                    splits_dir = osp.join(mltask_dir, "splits")

                    if osp.isdir(sessions_dir):
                        analysis_sessions += cleanup.du(sessions_dir)
                    if osp.isdir(splits_dir):
                        analysis_splits += cleanup.du(splits_dir)

                project_sessions += analysis_sessions
                project_splits += analysis_splits

                analyses_splits[(project, analysis)] = analysis_splits
                analyses_sessions[(project, analysis)] = analysis_sessions

            projects_sessions[project] = project_sessions
            projects_splits[project] = project_splits

        rt = ResultTable()
        rt.set_name("Analysis data used space")

        if self.config["granularity"] == "project":
            rt.add_column("project", "Project key", "STRING")
            rt.add_column("total", "Total space (MB)", "STRING")
            rt.add_column("sessions", "Sessions space (MB)", "STRING")
            rt.add_column("splits", "Splits space (MB)", "STRING")

            for project in projects:
                total = (projects_sessions[project] + projects_splits[project])
                if len(projects) > 0 and total == 0:
                    continue
                record = []
                record.append(project)
                record.append(total / 1024)
                record.append(projects_sessions[project] / 1024)
                record.append(projects_splits[project] / 1024)
                rt.add_record(record)
        else:
            rt.add_column("project", "Project key", "STRING")
            rt.add_column("analysis", "Analysis id", "STRING")
            rt.add_column("total", "Total space (MB)", "STRING")
            rt.add_column("sessions", "Sessions space (MB)", "STRING")
            rt.add_column("splits", "Splits space (MB)", "STRING")

            for project in projects:
                for analysis in projects_analyses[project]:
                    record = []
                    record.append(project)
                    record.append(analysis)
                    record.append(
                        (analyses_sessions[(project, analysis)] +
                         analyses_splits[(project, analysis)]) / 1024)
                    record.append(analyses_sessions[(project, analysis)] /
                                  1024)
                    record.append(analyses_splits[(project, analysis)] / 1024)
                    rt.add_record(record)

        return rt
Example #13
    def run(self, progress_callback):

        # Retrieve macro parameters:
        is_dry_run = self.config.get("is_dry_run")
        keep_partitioned = self.config.get("keep_partitioned")
        keep_shared = self.config.get("keep_shared")
        logging.info("DRY RUN is set to {}".format(str(is_dry_run)))
        logging.info("KEEP PARTITIONED is set to {}".format(
            str(keep_partitioned)))
        logging.info("KEEP SHARED is set to {}".format(str(keep_shared)))

        # Initialize macro result table:
        result_table = ResultTable()
        result_table.add_column("dataset", "Dataset", "STRING")
        result_table.add_column("type", "Type", "STRING")
        result_table.add_column("action", "Action", "STRING")
        result_table.add_column("action_status", "Action Status", "STRING")

        action_status = "Not done (Dry run)" if is_dry_run else "Done"

        client = dataiku.api_client()
        if self.config.get("project_key", None):
            project = client.get_project(self.config.get("project_key"))
        else:
            project = client.get_project(self.project_key)

        manually_selected_datasets = self.config.get("datasets_to_exclude")
        all_datasets = project.list_datasets()
        all_recipes = project.list_recipes()

        # Build deduplicated lists of input/output datasets:
        input_datasets = []
        output_datasets = []
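        # append_datasets_to_list() is assumed to be a helper defined elsewhere in this plugin that extracts the dataset refs from a recipe's inputs/outputs dict and appends them to the given list.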
        for recipe in all_recipes:
            recipe_inputs_dict = recipe["inputs"]
            recipe_outputs_dict = recipe["outputs"]
            # CASE: no input dataset
            if recipe_inputs_dict:
                append_datasets_to_list(recipe_inputs_dict, input_datasets)
            append_datasets_to_list(recipe_outputs_dict, output_datasets)

        # Identify Flow input/outputs:
        flow_inputs = [
            dataset for dataset in input_datasets
            if dataset not in output_datasets
        ]
        flow_outputs = [
            dataset for dataset in output_datasets
            if dataset not in input_datasets
        ]
        logging.info("Found {} FLOW INPUT datasets: {}".format(
            str(len(flow_inputs)), str(flow_inputs)))
        logging.info("Found {} FLOW OUTPUT datasets: {}".format(
            str(len(flow_outputs)), str(flow_outputs)))

        # Identify standalone, intermediate, and partitioned datasets
        excluded_datasets = []
        standalone_datasets = []
        intermediate_datasets = []
        partitioned_datasets = []

        is_partitioned = lambda dataset: len(dataset["partitioning"]["dimensions"]) > 0
        for dataset in all_datasets:
            if dataset["name"] in manually_selected_datasets:
                excluded_datasets.append(dataset["name"])
            if dataset["name"] not in input_datasets + output_datasets:
                standalone_datasets.append(dataset["name"])
            if dataset["name"] not in flow_inputs + flow_outputs + standalone_datasets:
                intermediate_datasets.append(dataset["name"])
            if is_partitioned(dataset):
                partitioned_datasets.append(dataset["name"])

        logging.info("Found {} EXCLUDED datasets: {}".format(
            str(len(excluded_datasets)), str(excluded_datasets)))
        logging.info("Found {} STANDALONE datasets: {}".format(
            str(len(standalone_datasets)), str(standalone_datasets)))
        logging.info("Found {} INTERMEDIATE datasets: {}".format(
            str(len(intermediate_datasets)), str(intermediate_datasets)))
        logging.info("Found {} PARTITIONED datasets: {}".format(
            str(len(partitioned_datasets)), str(partitioned_datasets)))

        # Identify shared datasets:
        shared_objects = project.get_settings(
        ).settings["exposedObjects"]["objects"]
        shared_datasets = [
            object["localName"] for object in shared_objects
            if object["type"] == "DATASET"
        ]
        logging.info("Found {} SHARED datasets: {}".format(
            str(len(shared_datasets)), str(shared_datasets)))

        # Add dataset types to results list
        results = []

        datasets = {
            "EXCLUDED": excluded_datasets,
            "STANDALONE": standalone_datasets,
            "INPUT": flow_inputs,
            "OUTPUT": flow_outputs,
            "INTERMEDIATE": intermediate_datasets,
            "SHARED": shared_datasets,
            "PARTITIONED": partitioned_datasets
        }

        for dataset_type, dataset_type_list in datasets.items():
            for dataset in dataset_type_list:
                results.append([dataset, dataset_type])

        # Identify which datasets should be kept
        to_keep = excluded_datasets + standalone_datasets + flow_inputs + flow_outputs
        if keep_partitioned:
            to_keep += partitioned_datasets
        if keep_shared:
            to_keep += shared_datasets
        logging.info("Total of {} datasets to KEEP: {}".format(
            str(len(to_keep)), str(to_keep)))

        # Create df with all results
        results_df = pd.DataFrame(results, columns=["Dataset", "Type"])
        results_grouped = results_df.groupby(
            ["Dataset"])['Type'].apply(lambda x: ', '.join(x)).reset_index()
        results_grouped["Action"] = results_grouped["Dataset"].apply(
            lambda x: "KEEP" if x in to_keep else "CLEAR")
        results_grouped["Status"] = action_status
        results_grouped = results_grouped.sort_values(by=['Action', 'Type'])

        # Perform cleanup
        to_clear = list(
            results_grouped["Dataset"][results_grouped['Action'] == "CLEAR"])
        logging.info("Total of {} datasets to CLEAR: {}".format(
            str(len(to_clear)), str(to_clear)))

        if not is_dry_run:
            for ds in to_clear:
                dataset = project.get_dataset(ds)
                logging.info("Clearing {}...".format(ds))
                dataset.clear()
            logging.info("Clearing {} datasets: done.".format(
                str(len(to_clear))))

        # Pass results to result table
        for index, row in results_grouped.iterrows():
            result_table.add_record(list(row))

        return result_table
Example #14
    def run(self, progress_callback):
        maximum_age = int(self.config.get("age", 15))
        maximum_timestamp = int(
            time.mktime((datetime.datetime.now() -
                         datetime.timedelta(days=maximum_age)).timetuple()))
        lines = int(self.config.get("lines", 5))

        do_it = bool(self.config.get("performDeletion", False))

        dip_home = os.environ['DIP_HOME']
        saved_models = osp.join(dip_home, 'saved_models')

        def truncate_file(path, rows):
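            # Keep only the first `rows` data lines of the CSV by round-tripping it through pandas.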
            yourfile = pd.read_csv(path, nrows=rows)
            yourfile.to_csv(path, index=False)

        rt = ResultTable()
        rt.set_name("Saved models cleanup")
        rt.add_column("project", "Project key", "STRING")
        rt.add_column("saved_model_id", "Saved model id", "STRING")
        rt.add_column("saved_model_name", "Saved model name", "STRING")
        rt.add_column("total_size_before", "Total space before (MB)", "STRING")
        rt.add_column("total_size_after", "Total space after (MB)", "STRING")
        rt.add_column("kept_splits", "Kept splits", "STRING")
        rt.add_column("truncated_splits", "Truncated splits", "STRING")
        rt.add_column("reclaimed_size", "Reclaimed size", "STRING")

        grand_total_used = 0
        grand_total_reclaimed = 0
        grand_total_kept = 0
        grand_total_deleted = 0

        for project in cleanup.get_projects_to_consider(
                self.project_key, self.config):
            project_sm = osp.join(saved_models, project)

            if not osp.isdir(project_sm):
                continue

            for saved_model in os.listdir(project_sm):
                sm_dir = osp.join(project_sm, saved_model)
                versions_dir = osp.join(sm_dir, "versions")

                if not osp.isdir(versions_dir):
                    continue

                kept_versions = 0
                deleted_versions = 0
                size_reclaimed = 0
                total_size_before = cleanup.du(sm_dir, size_unit="b")

                for version in os.listdir(versions_dir):
                    version_dir = osp.join(versions_dir, version)

                    if os.stat(version_dir).st_mtime < maximum_timestamp:
                        # Need to clean this version
                        deleted_versions += 1
                        split_dir = osp.join(version_dir, "split")
                        if osp.isdir(split_dir):
                            for name in os.listdir(split_dir):
                                path = osp.join(split_dir, name)
                                ext = osp.splitext(path)[-1].lower()
                                if ext == ".csv":
                                    if do_it:
                                        try:
                                            initial = os.stat(path).st_size
                                            truncate_file(path, lines)
                                            size_reclaimed += initial - os.stat(
                                                path).st_size
                                        except Exception as e:
                                            logging.getLogger().error(
                                                "{}: {}".format(path, str(e)))
                                    else:
                                        size_reclaimed += os.stat(path).st_size
                    else:
                        kept_versions += 1

                total_size_after = cleanup.du(sm_dir, size_unit="b")
                record = []
                record.append(project)
                record.append(saved_model)
                record.append(saved_model)
                record.append(cleanup.format_size(total_size_before))
                record.append(cleanup.format_size(total_size_after))
                record.append(kept_versions)
                record.append(deleted_versions)
                record.append(cleanup.format_size(size_reclaimed))
                rt.add_record(record)

                grand_total_reclaimed += size_reclaimed
                grand_total_used += total_size_before
                grand_total_kept += kept_versions
                grand_total_deleted += deleted_versions

        rt.add_record([
            "Total", "-", "-",
            cleanup.format_size(grand_total_used),
            cleanup.format_size(grand_total_used - grand_total_reclaimed),
            grand_total_kept, grand_total_deleted,
            cleanup.format_size(grand_total_reclaimed)
        ])
        return rt
Example #15
    def run(self, progress_callback):
        dip_home = os.environ['DIP_HOME']
        saved_models = osp.join(dip_home, 'saved_models')

        rt = ResultTable()
        rt.set_name("Analysis data used space")
        rt.add_column("project", "Project key", "STRING")
        rt.add_column("saved_model_id", "Saved model id", "STRING")
        rt.add_column("saved_model_name", "Saved model name", "STRING")
        rt.add_column("total", "Total space (MB)", "STRING")
        rt.add_column("splits", "Splits space (MB)", "STRING")
        rt.add_column("versions", "Number of versions", "STRING")

        if self.config.get('allProjects', False):
            projects = os.listdir(saved_models)
        else:
            projects = [self.project_key]

        for project in projects:
            project_sm = osp.join(saved_models, project)

            if not osp.isdir(project_sm):
                continue

            for saved_model in os.listdir(project_sm):
                sm_dir = osp.join(project_sm, saved_model)
                versions_dir = osp.join(sm_dir, "versions")

                if not osp.isdir(versions_dir):
                    continue

                versions = 0
                total_splits = 0
                total = cleanup.du(sm_dir)

                for version in os.listdir(versions_dir):
                    version_dir = osp.join(versions_dir, version)
                    split_dir = osp.join(version_dir, "split")
                    if osp.isdir(split_dir):
                        total_splits += cleanup.du(split_dir)
                    versions += 1

                record = []
                record.append(project)
                record.append(saved_model)
                record.append(saved_model)
                record.append(total / 1024)
                record.append(total_splits / 1024)
                record.append(versions)
                rt.add_record(record)

        return rt
Example #16
    def run(self, progress_callback):
        dip_home = os.environ["DIP_HOME"]
        config_home = os.path.join(dip_home, "config")
        
        analysis_data_folder = os.path.join(dip_home, "analysis-data")

        rt = ResultTable()
        rt.set_name("Analysis data")
        rt.add_column("projectKey", "Project", "STRING")
        rt.add_column("dataset", "Dataset", "STRING")
        rt.add_column("analysis", "Analysis", "STRING")
        rt.add_column("model", "Model", "STRING")
        rt.add_column("used", "Disk space used", "STRING")
        rt.add_column("path", "Path", "STRING")
 
        for project_key in os.listdir(analysis_data_folder):
            analysis_data_project_folder = os.path.join(analysis_data_folder, project_key)
            if not os.path.isdir(analysis_data_project_folder):
                continue
                
            project_folder = os.path.join(config_home, "projects", project_key)
            orphaned_project = not os.path.isdir(project_folder)
            
            for analysis_id in os.listdir(analysis_data_project_folder):
                analysis_data_analysis_folder = os.path.join(analysis_data_project_folder, analysis_id)
                if not os.path.isdir(analysis_data_analysis_folder):
                    continue
                
                analysis_folder = os.path.join(project_folder, "analysis", analysis_id)
                orphaned_analysis = not os.path.isdir(analysis_folder) if not orphaned_project else None

                total_used = 0
                dataset_name = None
                analysis_name = None

                model_records = []
                for model_id in os.listdir(analysis_data_analysis_folder):
                    analysis_data_model_folder = os.path.join(analysis_data_analysis_folder, model_id)
                    if not os.path.isdir(analysis_data_model_folder):
                        continue

                    model_folder = os.path.join(analysis_folder, "ml", model_id)
                    orphaned_model = not os.path.isdir(model_folder) if not orphaned_project and not orphaned_analysis else None

                    try:
                        used = self.disk_space_used(analysis_data_model_folder)
                        total_used += used
                    except:
                        used = None
                        
                    try:
                        core_params_file = os.path.join(analysis_folder, "core_params.json")
                        if os.path.isfile(core_params_file):
                            with open(core_params_file, 'r') as f:
                                core_params = json.load(f)
                            dataset_name = core_params.get('inputDatasetSmartName', None)
                            analysis_name = core_params.get('name', None)
                        else:
                            dataset_name = None
                            analysis_name = None
                    except:
                        dataset_name = None
                        analysis_name = None
                    
                    try:
                        model_params_file = os.path.join(model_folder, "params.json")
                        if os.path.isfile(model_params_file):
                            with open(model_params_file, 'r') as f:
                                model_params = json.load(f)
                            model_name = model_params.get('name', None)
                        else:
                            model_name = None
                    except:
                        model_name = None
                    
                    record = []
                    
                    # 0
                    if orphaned_project:
                        record.append('(orphaned)')
                    else:
                        record.append(project_key)
                    
                    # 1
                    record.append(dataset_name)
                    
                    # 2
                    if orphaned_analysis:
                        record.append('(orphaned)')
                    elif analysis_name is not None:
                        record.append(analysis_name)
                    else:
                        record.append(analysis_id)

                    # 3
                    if orphaned_model:
                        record.append('(orphaned)')
                    elif model_name is not None:
                        record.append(model_name)
                    else:
                        record.append(model_id)
                    
                    # 4
                    if used is None:
                        record.append('N/A')
                    elif used < 1024:
                        record.append('%s b' % used)
                    elif used < 1024 * 1024:
                        record.append('%s Kb' % int(used/1024))
                    elif used < 1024 * 1024 * 1024:
                        record.append('%s Mb' % int(used/(1024*1024)))
                    else:
                        record.append('%s Gb' % int(used/(1024*1024*1024)))
                    
                    # 5
                    record.append(analysis_data_model_folder)
                    
                    model_records.append(record)
                    
                record = []

                # 0
                if orphaned_project:
                    record.append('(orphaned)')
                else:
                    record.append(project_key)

                # 1
                record.append(dataset_name)

                # 2
                if orphaned_analysis:
                    record.append('(orphaned)')
                elif analysis_name is not None:
                    record.append(analysis_name)
                else:
                    record.append(analysis_id)
                
                # 3
                record.append(None)

                # 4
                if total_used is None:
                    record.append('N/A')
                elif total_used < 1024:
                    record.append('%s b' % total_used)
                elif total_used < 1024 * 1024:
                    record.append('%s Kb' % int(total_used/1024))
                elif total_used < 1024 * 1024 * 1024:
                    record.append('%s Mb' % int(total_used/(1024*1024)))
                else:
                    record.append('%s Gb' % int(total_used/(1024*1024*1024)))

                # 5
                record.append(analysis_data_analysis_folder)

                rt.add_record(record)
                for model_record in model_records:
                    rt.add_record(model_record)
                    
                    
        table_rows = []
        idx = 0
        for record in rt.records:
            analysis_row = record[3] is None

            row_cells = []
            for i in range(0, 6):
                if analysis_row and i == 3:
                    continue
                value = record[i]
                if value is not None:
                    if i == 5:
                        show_path_var = "showPath%s" % idx
                        row_cells.append('<td class="mx-textellipsis" title="%s"><a class="mx-link-nodecoration" href="" ng-click="%s = !%s"><i class="icon-eye"></i></a></td>' % (value, show_path_var, show_path_var))
                    else:
                        row_cells.append('<td class="mx-textellipsis" title="%s" %s>%s</td>' % (value, (' colspan="2"' if analysis_row and i == 2 else ''), value))
                else:
                    row_cells.append('<td></td>')
                    
            if analysis_row:
                # analysis row
                table_rows.append('<tr style="font-weight: bold;">%s</tr>' % (''.join(row_cells)))
            else:
                # model row
                table_rows.append('<tr>%s</tr>' % (''.join(row_cells)))
            path_cell_style = 'white-space: nowrap; padding-left: 20px; font-family: monospace; font-size: 11px;'
            if analysis_row:
                path_cell_style = path_cell_style + '; font-weight: bold'
            table_rows.append('<tr ng-if="%s"><td colspan="6" title="%s" style="%s">%s</td></tr>' % (show_path_var, record[5], path_cell_style, record[5]))
            idx += 1
                
        html = '<div>'
        table_header = '<th>%s</th>' % ('</th><th>'.join(['Project', 'Dataset', 'Analysis', 'Model', 'Disk usage', 'Path']))
        html += '<table class="table table-striped" style="table-layout: fixed;">%s%s</table>' % (table_header, ''.join(table_rows))
        html += '</div>'
        return html
        # return rt
Example #17
    def run(self, progress_callback):

        # Get project and folder containing the Excel files
        client = dataiku.api_client()
        project = client.get_project(self.project_key)

        folder_id = self.config.get("model_folder_id")
        folder = dataiku.Folder(folder_id, project_key=self.project_key)
        folder_path = folder.get_path()

        # List files in folder and get path
        files_list = os.listdir(folder_path)

        # List the datasets in the project (fetch the list once)
        datasets_in_project = [dataset['name'] for dataset in project.list_datasets()]

        # Actions performed
        actions_performed = dict()

        for my_file in files_list:
            ## Get file path
            file_path = os.path.join(folder_path, my_file)

            ## Get Excel file and load in a pandas dataframe
            sheets_names = pd.ExcelFile(file_path).sheet_names
            for sheet in sheets_names:

                ### Rename sheets by "file_sheet"
                ss = openpyxl.load_workbook(file_path)
                ss_sheet = ss.get_sheet_by_name(sheet)
                if not my_file.split(".")[0] in ss_sheet.title:
                    ss_sheet.title = my_file.split(".")[0] + "_" + sheet
                    ss.save(file_path)

                ## If the dataset already exists, delete and replace it
                actions_performed[ss_sheet.title] = "created"
                if ss_sheet.title in datasets_in_project:
                    project.get_dataset(ss_sheet.title).delete()
                    actions_performed[ss_sheet.title] = "replaced"

                ### Create dataset from Excel sheet
                project.create_dataset(ss_sheet.title,
                                       'FilesInFolder',
                                       params={
                                           'folderSmartId': folder_id,
                                           'path': my_file
                                       },
                                       formatType='excel',
                                       formatParams={
                                           "xlsx": True,
                                           "sheets": "*" + ss_sheet.title,
                                           'parseHeaderRow': True
                                       })

        # Output table
        from dataiku.runnables import Runnable, ResultTable
        rt = ResultTable()
        rt.add_column("actions", "Actions", "STRING")

        # Actions : "dataset" has been created or replaced
        for dataset_name, action in actions_performed.items():
            rt.add_record([dataset_name + " has been " + action])

        return rt