def increment_queries_session(queries_ds_name):
    """Bump and persist the session counter of a queries dataset.

    The counter is stored in the project's 'standard' variables under
    ``ML-ASSISTED-LABELING__<queries_ds_name>__session``; a missing entry
    counts as 0.

    Args:
        queries_ds_name: Name used to build the session variable key.

    Returns:
        The new (incremented) session id.
    """
    session_key = f'ML-ASSISTED-LABELING__{queries_ds_name}__session'
    project_vars = dataiku.Project().get_variables()
    standard_vars = project_vars['standard']

    next_session_id = standard_vars.get(session_key, 0) + 1
    standard_vars[session_key] = next_session_id

    # Write the whole variables payload back so the new counter is saved.
    dataiku.Project().set_variables(project_vars)
    return next_session_id
def add_perf_metrics(metadata_name, model_version, n_samples, contradictions, auc):
    """Append one performance record to the metrics list of a metadata dataset.

    Records are kept in the project's 'standard' variables under
    ``ML-ASSISTED-LABELING__<metadata_name>__metrics``; the list is created
    when it does not exist yet.

    Args:
        metadata_name: Name used to build the metrics variable key.
        model_version: Stored in the record as ``versionId``.
        n_samples: Stored in the record as ``n_samples``.
        contradictions: Stored in the record as ``contradictions``.
        auc: Stored in the record as ``auc``.
    """
    metrics_key = f'ML-ASSISTED-LABELING__{metadata_name}__metrics'
    project_vars = dataiku.Project().get_variables()
    standard_vars = project_vars['standard']

    record = {
        "n_samples": n_samples,
        "contradictions": contradictions,
        "auc": auc,
        "versionId": model_version,
    }
    history = standard_vars.get(metrics_key, [])
    history.append(record)

    standard_vars[metrics_key] = history
    project_vars['standard'] = standard_vars
    dataiku.Project().set_variables(project_vars)
# `logging` is used below; import it explicitly instead of relying on a
# star import happening to expose it.
import logging

import dataiku
import numpy as np
from dataiku.customstep import *
from cardinal import uncertainty

from lal import utils, gpu_utils
from lal.classifiers.base_classifier import FolderBasedDataClassifier, TableBasedDataClassifier

logging.basicConfig(level=logging.INFO,
                    format='%(name)s %(levelname)s - %(message)s')

# settings at the step instance level (set by the user creating a scenario step)
step_config = get_step_config()

client = dataiku.api_client()
project = client.get_project(dataiku.Project().project_key)

# GPU set up
gpu_opts = gpu_utils.load_gpu_options(
    step_config.get('should_use_gpu', False),
    step_config.get('list_gpu', ''),
    float(step_config.get('gpu_allocation', 0.)))

# Use the configured saved model when its id exists in this project,
# otherwise fall back to the model named "Classifier".
if step_config['model'] in [m['id'] for m in project.list_saved_models()]:
    model = dataiku.Model(step_config['model'])
else:
    # model_id could be set in a master project of a DKU APP, but the saved model was then recreated in an App
    logging.info(
        'Model {} was not found in project, trying to find a model by "Classifier" name'
        .format(step_config['model']))
    model = dataiku.Model(
        'Classifier')  # default name for ML Assisted labeling plugin DKU Apps
def get_local_var(var_name):
    """Look up a prefixed entry in the project's 'local' variables.

    Args:
        var_name: Variable name without the ``LOCAL_VAR_PREFIX`` prefix.

    Returns:
        The value stored under ``<LOCAL_VAR_PREFIX>__<var_name>``, or None
        when no such entry exists.
    """
    local_vars = dataiku.Project().get_variables()['local']
    prefixed_name = f'{LOCAL_VAR_PREFIX}__{var_name}'
    return local_vars.get(prefixed_name)
def get_current_session_id(queries_ds_name=None):
    """Return the stored session counter of a queries dataset.

    Args:
        queries_ds_name: Name used to build the session variable key. When
            None, project variables are not read at all.

    Returns:
        The value found under
        ``ML-ASSISTED-LABELING__<queries_ds_name>__session`` in the project's
        'standard' variables, or 0 when the name is None or the entry is
        missing.
    """
    if queries_ds_name is not None:
        standard_vars = dataiku.Project().get_variables()['standard']
        session_key = f'ML-ASSISTED-LABELING__{queries_ds_name}__session'
        return standard_vars.get(session_key, 0)
    return 0
def get_perf_metrics(metadata_name):
    """Return the performance records stored for a metadata dataset.

    Args:
        metadata_name: Name used to build the metrics variable key.

    Returns:
        The list found under ``ML-ASSISTED-LABELING__<metadata_name>__metrics``
        in the project's 'standard' variables, or an empty list when the
        entry is missing.
    """
    metrics_key = f'ML-ASSISTED-LABELING__{metadata_name}__metrics'
    standard_vars = dataiku.Project().get_variables()['standard']
    return standard_vars.get(metrics_key, [])
def create_dku_config(config):
    """Build a ``DkuConfig`` from the raw ``config`` mapping.

    Every setting is read with ``config.get`` and registered through
    ``DkuConfig.add_param`` together with its ``required`` flag and optional
    ``checks``; how those are enforced is up to ``DkuConfig`` (defined
    elsewhere).

    Args:
        config: Mapping with the raw settings ('unlabeled', 'text_column',
            'categories', 'labels_ds', 'metadata_ds', 'label_col_name',
            'use_prelabeling', 'language', 'language_column',
            'text_direction', 'tokenization_engine').

    Returns:
        The populated ``DkuConfig`` instance.
    """
    dku_config = DkuConfig(
        local_vars=dataiku.Project().get_variables()['local'],
        local_prefix=LOCAL_VAR_PREFIX
    )

    dku_config.add_param(
        name='unlabeled',
        value=config.get('unlabeled'),
        required=True
    )

    dku_config.add_param(
        name='text_column',
        value=config.get('text_column'),
        required=True
    )

    categories = config.get('categories')
    # Iterate over an empty list when 'categories' is missing so that the
    # untouched value (None) still reaches add_param with required=True,
    # instead of failing right here with a TypeError on iterating None.
    # NOTE(review): presumably add_param reports the missing required value.
    categories_key = [c.get('from') for c in categories or []]
    dku_config.add_param(
        name='categories',
        value=categories,
        checks=[
            {
                'type': 'custom',
                'cond': all(categories_key),
                'err_msg': "All the categories must have a key. Aborting."
            },
            {
                'type': 'custom',
                'cond': len(categories_key) == len(set(categories_key)),
                'err_msg': "Categories key must be unique. Aborting."
            }],
        required=True
    )

    dku_config.add_param(
        name='labels_ds',
        value=config.get('labels_ds'),
        required=True
    )

    dku_config.add_param(
        name='metadata_ds',
        value=config.get('metadata_ds'),
        required=True
    )

    dku_config.add_param(
        name='label_col_name',
        value=config.get('label_col_name'),
        required=True
    )

    dku_config.add_param(
        name='use_prelabeling',
        value=config.get('use_prelabeling'),
        required=True
    )

    dku_config.add_param(
        name='language',
        value=config.get('language'),
        checks=[
            {
                'type': 'exists',
                'err_msg': 'You must select one of the language.\n'
                           'If the language is not in the list, please use the "Custom..." parameter to define a custom way to tokenize your text\n'
                           'If your dataset contains samples of several languages, you can use "Detected language column" and create a column in your dataset containing the language of the sample.'
            },
            {
                'type': 'in',
                'op': list(SUPPORTED_LANGUAGES_SPACY.keys()) + ['language_column', 'none']
            },
        ],
        required=True
    )

    # The language column is only required when the language is read from a column.
    dku_config.add_param(
        name='language_column',
        value=config.get('language_column'),
        required=(dku_config.language == "language_column")
    )

    # Text direction and tokenization engine are only required when
    # 'language' is "none".
    dku_config.add_param(
        name='text_direction',
        value=config.get('text_direction'),
        checks=[{
            'type': 'in',
            'op': ['rtl', 'ltr']
        }],
        required=(dku_config.language == "none")
    )

    dku_config.add_param(
        name='tokenization_engine',
        value=config.get('tokenization_engine'),
        checks=[{
            'type': 'in',
            'op': ['white_space', 'char']
        }],
        required=(dku_config.language == "none")
    )

    return dku_config
def get_local_categories():
    """Return the categories stored in the project's 'local' variables.

    Returns:
        The value found under ``ML-ASSISTED-LABELING__categories``, or an
        empty list when the entry is missing.
    """
    local_vars = dataiku.Project().get_variables()['local']
    return local_vars.get('ML-ASSISTED-LABELING__categories', [])