Пример #1
0
def kfold(fold_num, out_dir, intent_train_file, workspace_base_file,
          figure_path, keep_workspace, username, password, iam_apikey, url, version, weight_mode,
          conf_thres, partial_credit_table):
    """Run a k-fold train/test cycle and post-process the combined results.

    Steps, in order:

    1. Split ``intent_train_file`` into ``fold_num`` folds with the
       fold-creation script; per-fold files live under ``out_dir/KFOLD/<idx>``.
    2. Start one training subprocess per fold (all started before any is
       awaited); each writes its stdout to that fold's workspace spec file.
    3. Start one test subprocess per fold against the workspace id read from
       the spec file, then wait for all of them.
    4. Add a ``Fold Index`` column to every per-fold result file and
       concatenate them into ``KFOLD_UNION_FILE`` under ``out_dir``.
    5. Run the precision-curve, intent-metrics and confusion-matrix scripts.

    Unless ``keep_workspace`` is truthy, the workspaces of every fold whose
    training succeeded are deleted at the end, even if a later step raised.

    Args:
        fold_num: Number of folds (int).
        out_dir: Output directory; fold working files go in ``out_dir/KFOLD``.
        intent_train_file: Input passed as ``-i`` to the fold-creation script.
        workspace_base_file: Passed as ``-w`` to the training script.
        figure_path: Passed as ``-o`` to the precision-curve script.
        keep_workspace: When falsy, trained workspaces are deleted afterwards.
        username: Forwarded to the train/test scripts and workspace deletion.
        password: Forwarded to the train/test scripts and workspace deletion.
        iam_apikey: Forwarded to the train/test scripts and workspace deletion.
        url: Forwarded to the train/test scripts and workspace deletion.
        version: Forwarded to the train/test scripts and workspace deletion.
        weight_mode: Passed as ``-w`` to the precision-curve script.
        conf_thres: Passed (stringified) as ``--tau`` to the precision-curve
            script.
        partial_credit_table: Optional; when not ``None`` it is forwarded to
            the test script and switches ``--partial_credit_on`` to ``True``
            for the metrics script.

    Raises:
        RuntimeError: If fold creation, any training or test subprocess, or
            any of the post-processing scripts exits with a non-zero status.
    """
    FOLD_TRAIN = 'fold_train'
    FOLD_TEST = 'fold_test'
    WORKSPACE_SPEC = 'fold_workspace'
    WORKSPACE_NAME = 'workspace_name'
    TEST_OUT = 'test_out'

    print('Begin {} with following details:'.format(KFOLD.upper()))
    print('{}={}'.format(INTENT_FILE_ITEM, intent_train_file))
    print('{}={}'.format(WORKSPACE_BASE_ITEM, workspace_base_file))
    print('{}={}'.format(FIGURE_PATH_ITEM, figure_path))
    print('{}={}'.format(OUT_DIR_ITEM, out_dir))
    print('{}={}'.format(FOLD_NUM_ITEM, fold_num))
    print('{}={}'.format(DO_KEEP_WORKSPACE_ITEM, BOOL_MAP[keep_workspace]))
    print('{}={}'.format(WEIGHT_MODE_ITEM, weight_mode))
    print('{}={}'.format(CONF_THRES_ITEM, conf_thres))
    print('{}={}'.format(WCS_USERNAME_ITEM, username))
    print('{}={}'.format(WCS_BASEURL_ITEM, url))
    print('{}={}'.format(WA_API_VERSION_ITEM, version))
    print('{}={}'.format(PARTIAL_CREDIT_TABLE_ITEM, partial_credit_table))

    working_dir = os.path.join(out_dir, KFOLD)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(working_dir, exist_ok=True)

    # Prepare folds
    if subprocess.run([sys.executable, CREATE_TEST_TRAIN_FOLDS_PATH,
                       '-i', intent_train_file, '-o', working_dir,
                       '-k', str(fold_num)],
                      stdout=subprocess.PIPE).returncode == 0:
        print('Created {} folds'.format(str(fold_num)))
    else:
        raise RuntimeError('Failure in folds creation')

    # Construct fold params
    fold_params = [{FOLD_TRAIN: os.path.join(working_dir, str(idx),
                                             TRAIN_FILENAME),
                    FOLD_TEST: os.path.join(working_dir, str(idx),
                                            TEST_FILENAME),
                    TEST_OUT: os.path.join(working_dir, str(idx),
                                           TEST_OUT_FILENAME),
                    WORKSPACE_SPEC: os.path.join(working_dir,
                                                 str(idx), SPEC_FILENAME),
                    WORKSPACE_NAME: '{}_{}'.format(KFOLD, str(idx))}
                   for idx in range(fold_num)]

    # Begin training: launch every fold first so they run side by side.
    train_runs = []
    for fold_param in fold_params:
        spec_file = open(fold_param[WORKSPACE_SPEC], 'w')
        train_args = [sys.executable, TRAIN_CONVERSATION_PATH,
                      '-i', fold_param[FOLD_TRAIN],
                      '-n', fold_param[WORKSPACE_NAME],
                      '-u', username, '-p', password,
                      '-a', iam_apikey,
                      '-l', url, '-v', version,
                      '-w', workspace_base_file]
        train_runs.append(
            (subprocess.Popen(train_args, stdout=spec_file), spec_file))

    train_failure_idx = []
    for idx, (process, spec_file) in enumerate(train_runs):
        try:
            if process.wait() != 0:
                train_failure_idx.append(idx)
        finally:
            # Close the spec file whether or not training succeeded so the
            # handle is never leaked.
            spec_file.close()

    try:
        if train_failure_idx:
            # Join the individual indices; str() of the whole list would be
            # split into single characters by join().
            raise RuntimeError(
                'Fail to train {} fold workspace'.format(','.join(
                    str(idx) for idx in train_failure_idx)))

        print('Trained {} workspaces'.format(str(fold_num)))

        # Begin testing
        test_processes = []
        FOLD_TEST_RATE = int(MAX_TEST_RATE / fold_num)
        for fold_param in fold_params:
            with open(fold_param[WORKSPACE_SPEC]) as f:
                workspace_id = json.load(f)[WORKSPACE_ID_TAG]
            test_args = [sys.executable, TEST_CONVERSATION_PATH,
                         '-i', fold_param[FOLD_TEST],
                         '-o', fold_param[TEST_OUT],
                         '-u', username, '-p', password,
                         '-a', iam_apikey, '-l', url, '-v', version,
                         '-t', UTTERANCE_COLUMN, '-g', GOLDEN_INTENT_COLUMN,
                         '-w', workspace_id, '-r', str(FOLD_TEST_RATE),
                         '-m']
            if partial_credit_table is not None:
                test_args += ['--partial_credit_table', partial_credit_table]
            test_processes.append(subprocess.Popen(test_args))

        test_failure_idx_str = [str(idx)
                                for idx, process in enumerate(test_processes)
                                if process.wait() != 0]

        if test_failure_idx_str:
            raise RuntimeError('Fail to test {} fold workspace'.format(
                ','.join(test_failure_idx_str)))

        print('Tested {} workspaces'.format(str(fold_num)))

        test_out_files = [fold_param[TEST_OUT] for fold_param in fold_params]

        # Add a column for the fold number
        for idx, this_file in enumerate(test_out_files):
            this_df = pd.read_csv(this_file, quoting=csv.QUOTE_ALL,
                                  encoding='utf-8', keep_default_na=False)
            this_df['Fold Index'] = idx
            this_df.to_csv(this_file, encoding='utf-8',
                           quoting=csv.QUOTE_ALL, index=False)

        # Union test out
        kfold_result_file = os.path.join(out_dir, KFOLD_UNION_FILE)
        pd.concat([pd.read_csv(file, quoting=csv.QUOTE_ALL, encoding=UTF_8,
                               keep_default_na=False)
                   for file in test_out_files]) \
          .to_csv(kfold_result_file,
                  encoding='utf-8', quoting=csv.QUOTE_ALL, index=False)
        print("Wrote k-fold result file to {}".format(kfold_result_file))

        classfier_names = ['Fold {}'.format(idx) for idx in range(fold_num)]

        # Every argv entry must be a string; conf_thres may arrive as a number.
        plot_args = [sys.executable, CREATE_PRECISION_CURVE_PATH,
                     '-t', '{} Fold Test'.format(str(fold_num)),
                     '-o', figure_path, '-w', weight_mode,
                     '--tau', str(conf_thres), '-n'] + \
            classfier_names + ['-i'] + test_out_files

        if subprocess.run(plot_args).returncode != 0:
            raise RuntimeError('Failure in plotting curves')

        # Strip the extension properly instead of assuming it is 4 chars long.
        kfold_result_file_base = os.path.splitext(kfold_result_file)[0]
        metrics_args = [sys.executable, INTENT_METRICS_PATH,
                        '-i', kfold_result_file,
                        '-o', kfold_result_file_base + ".metrics.csv",
                        '--partial_credit_on',
                        str(partial_credit_table is not None)]
        if subprocess.run(metrics_args).returncode != 0:
            raise RuntimeError('Failure in generating intent metrics')

        confusion_args = [sys.executable, CONFUSION_MATRIX_PATH,
                          '-i', kfold_result_file,
                          '-o', kfold_result_file_base + ".confusion_args.csv"]
        if subprocess.run(confusion_args).returncode != 0:
            raise RuntimeError('Failure in generating confusion matrix')

    finally:
        if not keep_workspace:
            # Only folds whose training succeeded have a usable spec file.
            workspace_ids = []
            for idx in range(fold_num):
                if idx not in train_failure_idx:
                    with open(fold_params[idx][WORKSPACE_SPEC]) as f:
                        workspace_id = json.load(f)[WORKSPACE_ID_TAG]
                        workspace_ids.append(workspace_id)

            delete_workspaces(username, password, iam_apikey, url, version, workspace_ids)
Пример #2
0
def kfold(fold_num, temp_dir, intent_train_file, workspace_base_file,
          figure_path, keep_workspace, username, password, weight_mode,
          conf_thres):
    """Run a k-fold train/test cycle and plot per-fold precision curves.

    Steps, in order:

    1. Split ``intent_train_file`` into ``fold_num`` folds with the
       fold-creation script; per-fold files live under
       ``temp_dir/KFOLD/<idx>``.
    2. Start one training subprocess per fold (all started before any is
       awaited); each writes its stdout to that fold's workspace spec file.
    3. Start one test subprocess per fold against the workspace id read from
       the spec file, then wait for all of them.
    4. Concatenate the per-fold result files into ``KFOLD_UNION_FILE`` inside
       the working directory.
    5. Run the precision-curve script over the per-fold result files.

    Unless ``keep_workspace`` is truthy, the workspaces of every fold whose
    training succeeded are deleted at the end, even if a later step raised.

    Args:
        fold_num: Number of folds (int).
        temp_dir: Base directory; fold working files go in ``temp_dir/KFOLD``.
        intent_train_file: Input passed as ``-i`` to the fold-creation script.
        workspace_base_file: Passed as ``-w`` to the training script.
        figure_path: Passed as ``-o`` to the precision-curve script.
        keep_workspace: When falsy, trained workspaces are deleted afterwards.
        username: Forwarded to the train/test scripts and workspace deletion.
        password: Forwarded to the train/test scripts and workspace deletion.
        weight_mode: Passed as ``-w`` to the precision-curve script.
        conf_thres: Passed (stringified) as ``--tau`` to the precision-curve
            script.

    Raises:
        RuntimeError: If fold creation, any training or test subprocess, or
            the plotting script exits with a non-zero status.
    """
    FOLD_TRAIN = 'fold_train'
    FOLD_TEST = 'fold_test'
    WORKSPACE_SPEC = 'fold_workspace'
    WORKSPACE_NAME = 'workspace_name'
    TEST_OUT = 'test_out'

    print('Begin {} with following details:'.format(KFOLD.upper()))
    print('{}={}'.format(INTENT_FILE_ITEM, intent_train_file))
    print('{}={}'.format(WORKSPACE_BASE_ITEM, workspace_base_file))
    print('{}={}'.format(FIGURE_PATH_ITEM, figure_path))
    print('{}={}'.format(TEMP_DIR_ITEM, temp_dir))
    print('{}={}'.format(FOLD_NUM_ITEM, fold_num))
    print('{}={}'.format(DO_KEEP_WORKSPACE_ITEM, BOOL_MAP[keep_workspace]))
    print('{}={}'.format(WEIGHT_MODE_ITEM, weight_mode))
    print('{}={}'.format(CONF_THRES_ITEM, conf_thres))
    print('{}={}'.format(WCS_USERNAME_ITEM, username))

    working_dir = os.path.join(temp_dir, KFOLD)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(working_dir, exist_ok=True)

    # Prepare folds
    if subprocess.run([
            sys.executable, CREATE_TEST_TRAIN_FOLDS_PATH, '-i',
            intent_train_file, '-o', working_dir, '-k',
            str(fold_num)
    ],
                      stdout=subprocess.PIPE).returncode == 0:
        print('Created {} folds'.format(str(fold_num)))
    else:
        raise RuntimeError('Failure in folds creation')

    # Construct fold params
    fold_params = [{
        FOLD_TRAIN:
        os.path.join(working_dir, str(idx), TRAIN_FILENAME),
        FOLD_TEST:
        os.path.join(working_dir, str(idx), TEST_FILENAME),
        TEST_OUT:
        os.path.join(working_dir, str(idx), TEST_OUT_FILENAME),
        WORKSPACE_SPEC:
        os.path.join(working_dir, str(idx), SPEC_FILENAME),
        WORKSPACE_NAME:
        '{}_{}'.format(KFOLD, str(idx))
    } for idx in range(fold_num)]

    # Begin training: launch every fold first so they run side by side.
    train_runs = []
    for fold_param in fold_params:
        spec_file = open(fold_param[WORKSPACE_SPEC], 'w')
        train_args = [
            sys.executable, TRAIN_CONVERSATION_PATH, '-i',
            fold_param[FOLD_TRAIN], '-n', fold_param[WORKSPACE_NAME], '-u',
            username, '-p', password, '-w', workspace_base_file
        ]
        train_runs.append(
            (subprocess.Popen(train_args, stdout=spec_file), spec_file))

    train_failure_idx = []
    for idx, (process, spec_file) in enumerate(train_runs):
        try:
            if process.wait() != 0:
                train_failure_idx.append(idx)
        finally:
            # Close the spec file whether or not training succeeded so the
            # handle is never leaked.
            spec_file.close()

    try:
        if train_failure_idx:
            # Join the individual indices; str() of the whole list would be
            # split into single characters by join().
            raise RuntimeError('Fail to train {} fold workspace'.format(
                ','.join(str(idx) for idx in train_failure_idx)))

        print('Trained {} workspaces'.format(str(fold_num)))

        # Begin testing
        test_processes = []
        FOLD_TEST_RATE = int(MAX_TEST_RATE / fold_num)
        for fold_param in fold_params:
            with open(fold_param[WORKSPACE_SPEC]) as f:
                workspace_id = json.load(f)[WORKSPACE_ID_TAG]
            test_args = [
                sys.executable, TEST_CONVERSATION_PATH, '-i',
                fold_param[FOLD_TEST], '-o', fold_param[TEST_OUT], '-u',
                username, '-p', password, '-t', UTTERANCE_COLUMN, '-g',
                GOLDEN_INTENT_COLUMN, '-w', workspace_id, '-r',
                str(FOLD_TEST_RATE), '-m'
            ]
            test_processes.append(subprocess.Popen(test_args))

        test_failure_idx_str = [str(idx)
                                for idx, process in enumerate(test_processes)
                                if process.wait() != 0]

        if test_failure_idx_str:
            raise RuntimeError('Fail to test {} fold workspace'.format(
                ','.join(test_failure_idx_str)))

        print('Tested {} workspaces'.format(str(fold_num)))

        test_out_files = [fold_param[TEST_OUT] for fold_param in fold_params]

        # Union test out
        pd.concat([pd.read_csv(file, quoting=csv.QUOTE_ALL, encoding=UTF_8,
                               keep_default_na=False)
                   for file in test_out_files]) \
          .to_csv(os.path.join(working_dir, KFOLD_UNION_FILE),
                  encoding='utf-8', quoting=csv.QUOTE_ALL, index=False)

        classfier_names = ['Fold {}'.format(idx) for idx in range(fold_num)]

        # Every argv entry must be a string; conf_thres may arrive as a number.
        plot_args = [sys.executable, CREATE_PRECISION_CURVE_PATH,
                     '-t', '{} Fold Test'.format(str(fold_num)),
                     '-o', figure_path, '-w', weight_mode,
                     '--tau', str(conf_thres), '-n'] + \
            classfier_names + ['-i'] + test_out_files

        if subprocess.run(plot_args).returncode == 0:
            print('Generated precision curves for {} folds'.format(
                str(fold_num)))
        else:
            raise RuntimeError('Failure in plotting curves')
    finally:
        if not keep_workspace:
            # Only folds whose training succeeded have a usable spec file.
            workspace_ids = []
            for idx in range(fold_num):
                if idx not in train_failure_idx:
                    with open(fold_params[idx][WORKSPACE_SPEC]) as f:
                        workspace_id = json.load(f)[WORKSPACE_ID_TAG]
                        workspace_ids.append(workspace_id)

            delete_workspaces(username, password, workspace_ids)