def increment_queries_session(queries_ds_name):
    """Bump and persist the session counter attached to a queries dataset.

    The counter is stored in the project's ``standard`` variables under
    ``ML-ASSISTED-LABELING__<queries_ds_name>__session``. A missing entry is
    treated as 0, so the first call stores and returns 1.

    Args:
        queries_ds_name: Name used to build the variable key.

    Returns:
        The incremented session id that was written back to the project.
    """
    key = f'ML-ASSISTED-LABELING__{queries_ds_name}__session'
    project_vars = dataiku.Project().get_variables()
    standard_vars = project_vars['standard']

    next_session = standard_vars.get(key, 0) + 1
    standard_vars[key] = next_session

    # The whole variables payload is written back, not only the changed key.
    dataiku.Project().set_variables(project_vars)
    return next_session
def add_perf_metrics(metadata_name, model_version, n_samples, contradictions,
                     auc):
    """Append one performance record to the metrics history of a metadata name.

    The history is a list stored in the project's ``standard`` variables under
    ``ML-ASSISTED-LABELING__<metadata_name>__metrics``; it is created when the
    key does not exist yet.

    Args:
        metadata_name: Name used to build the variable key.
        model_version: Stored in the record under ``"versionId"``.
        n_samples: Stored in the record under ``"n_samples"``.
        contradictions: Stored in the record under ``"contradictions"``.
        auc: Stored in the record under ``"auc"``.
    """
    key = f'ML-ASSISTED-LABELING__{metadata_name}__metrics'
    record = {
        "n_samples": n_samples,
        "contradictions": contradictions,
        "auc": auc,
        "versionId": model_version
    }

    project_vars = dataiku.Project().get_variables()
    standard_vars = project_vars['standard']

    # Reuse the stored history when present, otherwise start a new one.
    history = standard_vars.get(key, [])
    history.append(record)
    standard_vars[key] = history
    project_vars['standard'] = standard_vars

    dataiku.Project().set_variables(project_vars)
import dataiku
import numpy as np
from dataiku.customstep import *

from cardinal import uncertainty
from lal import utils, gpu_utils
# settings at the step instance level (set by the user creating a scenario step)
from lal.classifiers.base_classifier import FolderBasedDataClassifier, TableBasedDataClassifier

# Root logger: INFO and above, formatted as "<logger> <LEVEL> - <message>".
logging.basicConfig(format='%(name)s %(levelname)s - %(message)s',
                    level=logging.INFO)

# Settings at the step instance level (set by the user creating a scenario step).
step_config = get_step_config()

client = dataiku.api_client()
project = client.get_project(dataiku.Project().project_key)

# GPU set up
gpu_opts = gpu_utils.load_gpu_options(
    step_config.get('should_use_gpu', False),
    step_config.get('list_gpu', ''),
    float(step_config.get('gpu_allocation', 0.)),
)

# Resolve the saved model: use the configured id when the project has it,
# otherwise fall back to the model named "Classifier".
_requested_model_id = step_config['model']
_saved_model_ids = [saved['id'] for saved in project.list_saved_models()]
if _requested_model_id not in _saved_model_ids:
    # model_id could be set in a master project of a DKU APP, but the saved model was then recreated in an App
    logging.info(
        'Model {} was not found in project, trying to find a model by "Classifier" name'
        .format(_requested_model_id))
    # default name for ML Assisted labeling plugin DKU Apps
    model = dataiku.Model('Classifier')
else:
    model = dataiku.Model(_requested_model_id)
def get_local_var(var_name):
    """Return a prefixed local project variable, or ``None`` when it is unset.

    The looked-up key is ``<LOCAL_VAR_PREFIX>__<var_name>`` inside the
    project's ``local`` variables.
    """
    local_vars = dataiku.Project().get_variables()['local']
    key = f'{LOCAL_VAR_PREFIX}__{var_name}'
    return local_vars.get(key)
def get_current_session_id(queries_ds_name=None):
    """Return the stored session id of a queries dataset.

    Args:
        queries_ds_name: Name used to build the variable key. When ``None``,
            no project lookup is made.

    Returns:
        The value stored under
        ``ML-ASSISTED-LABELING__<queries_ds_name>__session`` in the project's
        ``standard`` variables, or 0 when the name is ``None`` or the key is
        missing.
    """
    if queries_ds_name is not None:
        standard_vars = dataiku.Project().get_variables()['standard']
        key = f'ML-ASSISTED-LABELING__{queries_ds_name}__session'
        return standard_vars.get(key, 0)
    return 0
def get_perf_metrics(metadata_name):
    """Return the metrics history stored for ``metadata_name``.

    Reads ``ML-ASSISTED-LABELING__<metadata_name>__metrics`` from the
    project's ``standard`` variables; an empty list is returned when the key
    is missing.
    """
    key = f'ML-ASSISTED-LABELING__{metadata_name}__metrics'
    standard_vars = dataiku.Project().get_variables()['standard']
    return standard_vars.get(key, [])
def create_dku_config(config):
    """Build a ``DkuConfig`` from the raw ``config`` mapping.

    Parameters are registered in a fixed order because later ones depend on
    earlier ones: ``language_column``, ``text_direction`` and
    ``tokenization_engine`` are only required for specific values of the
    already-registered ``language`` parameter.

    Args:
        config: Mapping of raw settings; values are read with ``config.get``.

    Returns:
        The populated ``DkuConfig`` instance.
    """
    dku_config = DkuConfig(
        local_vars=dataiku.Project().get_variables()['local'],
        local_prefix=LOCAL_VAR_PREFIX
    )

    for name in ('unlabeled', 'text_column'):
        dku_config.add_param(name=name, value=config.get(name), required=True)

    categories = config.get('categories')
    # A missing 'categories' entry used to raise a bare TypeError here, before
    # add_param was reached. Iterate over an empty list instead and still pass
    # the raw value on. Presumably `required=True` then reports the missing
    # parameter -- confirm against DkuConfig.
    categories_key = [c.get('from') for c in categories or []]
    dku_config.add_param(
        name='categories',
        value=categories,
        checks=[
            {
                'type': 'custom',
                'cond': all(categories_key),
                'err_msg': "All the categories must have a key. Aborting."
            },
            {
                'type': 'custom',
                'cond': len(categories_key) == len(set(categories_key)),
                'err_msg': "Categories key must be unique. Aborting."
            }],
        required=True
    )

    for name in ('labels_ds', 'metadata_ds', 'label_col_name',
                 'use_prelabeling'):
        dku_config.add_param(name=name, value=config.get(name), required=True)

    dku_config.add_param(
        name='language',
        value=config.get('language'),
        checks=[
            {
                'type': 'exists',
                'err_msg': 'You must select one of the language.\n'
                           'If the language is not in the list, please use the "Custom..." parameter to define a custom way to tokenize your text\n'
                           'If your dataset contains samples of several languages, you can use "Detected language column" and create a column in your dataset containing the language of the sample.'
            },
            {
                'type': 'in',
                'op': list(SUPPORTED_LANGUAGES_SPACY.keys()) + ['language_column', 'none']
            },
        ],
        required=True
    )

    # Only required when the language is read from a dataset column.
    dku_config.add_param(
        name='language_column',
        value=config.get('language_column'),
        required=(dku_config.language == "language_column")
    )

    # The two parameters below are only required when language is "none".
    dku_config.add_param(
        name='text_direction',
        value=config.get('text_direction'),
        checks=[{
            'type': 'in',
            'op': ['rtl', 'ltr']
        }],
        required=(dku_config.language == "none")
    )
    dku_config.add_param(
        name='tokenization_engine',
        value=config.get('tokenization_engine'),
        checks=[{
            'type': 'in',
            'op': ['white_space', 'char']
        }],
        required=(dku_config.language == "none")
    )
    return dku_config
# Example #8
# 0
def get_local_categories():
    """Return the categories stored in the project's ``local`` variables.

    Reads the ``ML-ASSISTED-LABELING__categories`` key; an empty list is
    returned when it is missing.
    """
    local_vars = dataiku.Project().get_variables()['local']
    return local_vars.get('ML-ASSISTED-LABELING__categories', [])