Example #1
    def test_split_kfold_classwise(self):
        nclasses = 10
        k = 10
        max_ninstances = 100
        min_ninstances = k

        labels = []
        for i in range(nclasses):
            ninstances = min_ninstances + one_randint(max_ninstances -
                                                      min_ninstances)
            labels += [i] * ninstances

        labels = np.array(labels, dtype=int)
        np.random.shuffle(labels)
        folds_iter1 = split_classwise(labels, k)
        folds_iter2 = split_classwise(labels, k)

        sorted_indices = np.arange(len(labels))
        self.assertEqual(len(folds_iter1), k)
        self.assertEqual(len(folds_iter2), k)

        for i in range(k):
            fold1 = folds_iter1[i]
            fold2 = folds_iter2[i]

            test1 = fold1['test']
            train1 = fold1['train']

            test2 = fold2['test']
            train2 = fold2['train']

            all1 = np.concatenate((test1, train1))
            all1.sort()

            all2 = np.concatenate((test2, train2))
            all2.sort()

            self.assertEqual(len(np.intersect1d(test1, train1)), 0)
            self.assertTrue(np.all(sorted_indices == all1))

            self.assertEqual(len(np.intersect1d(test2, train2)), 0)
            self.assertTrue(np.all(sorted_indices == all2))

            # Both runs must cover the same index set, but since the split is
            # randomised, the folds themselves should differ between runs
            self.assertTrue(np.all(all1 == all2))
            self.assertFalse(
                len(test1) != len(test2) or np.all(test1 == test2))
            self.assertFalse(
                len(train1) != len(train2) or np.all(train1 == train2))
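
For reference, a minimal usage sketch of split_classwise based on the contract the test above verifies: a list of k folds, each a dict holding disjoint 'train' and 'test' index arrays that together cover every instance. The label layout below is hypothetical and the import path for split_classwise is omitted.

import numpy as np

# Hypothetical labels: 3 classes with 30 instances each
labels = np.repeat(np.arange(3), 30)
folds = split_classwise(labels, 5)

for fold in folds:
    train, test = fold['train'], fold['test']
    # train and test are disjoint and together cover all indices
    assert len(np.intersect1d(train, test)) == 0
    assert len(train) + len(test) == len(labels)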
Example #2
def run_nfolds(data, nsyls, nfolds, niters, enum_labels, nlabels, classifier,
               bar):
    ntrials = nfolds * niters
    if bar:
        bar.max = ntrials

    label_prediction_scores = [0] * ntrials
    label_hitss = [0] * ntrials
    label_missess = [0] * ntrials
    label_hitrates = np.empty((ntrials, nlabels))
    label_hitrates[:] = np.nan
    importancess = np.empty((ntrials, data.shape[1]))

    ind = 0

    for i in range(niters):
        folds = split_classwise(enum_labels, nfolds)

        for fold in folds:
            test_syl_idx = fold['test']
            train_syl_idx = fold['train']

            train_y = enum_labels[train_syl_idx]
            test_y = enum_labels[test_syl_idx]

            train_x = data[train_syl_idx, :]
            test_x = data[test_syl_idx, :]

            score, label_hits, label_misses, importances = classifier(
                train_x, train_y, test_x, test_y, nlabels)

            label_prediction_scores[ind] = score
            label_hitss[ind] = label_hits
            label_missess[ind] = label_misses

            # cast to float so the per-label hit rate is a true division
            label_hitrate = label_hits / (
                label_hits + label_misses).astype(float)

            label_hitrates[ind, :] = label_hitrate
            importancess[ind, :] = importances

            ind += 1

            if bar:
                bar.next()
    if bar:
        bar.finish()

    return label_prediction_scores, label_hitrates, importancess
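
A hedged driver sketch for run_nfolds. The dummy_classifier below is hypothetical; it only mirrors the return signature the function expects (score, per-label hits, per-label misses, feature importances):

import numpy as np

def dummy_classifier(train_x, train_y, test_x, test_y, nlabels):
    # Hypothetical stand-in: always predicts the majority training label
    majority = np.bincount(train_y, minlength=nlabels).argmax()
    hits = np.zeros(nlabels)
    misses = np.zeros(nlabels)
    for y in test_y:
        if y == majority:
            hits[y] += 1
        else:
            misses[y] += 1
    score = hits.sum() / len(test_y)
    importances = np.zeros(train_x.shape[1])
    return score, hits, misses, importances

data = np.random.randn(200, 16)
enum_labels = np.repeat(np.arange(4), 50)
scores, hitrates, importances = run_nfolds(
    data, len(enum_labels), nfolds=5, niters=2, enum_labels=enum_labels,
    nlabels=4, classifier=dummy_classifier, bar=None)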
Example #3
    def split(self, ratio, limits=None):
        fold = split_classwise(self.enum_labels,
                               ratio,
                               nfolds=1,
                               balanced=self.balanced,
                               limits=limits)
        train = fold[0]['train']
        test = fold[0]['test']

        trainable_enum_labels = self.enum_labels[train]

        trainset = inds2dataset(
            train, self.data, self.labels,
            self.lens).make_trainable(trainable_enum_labels)
        testset = inds2dataset(test, self.data, self.labels, self.lens)

        return trainset, testset
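
A brief usage sketch, assuming this split method belongs to a data-provider class such as EnumDataProvider (as Example #4 suggests); the data, labels, ratio and limits here are illustrative:

import numpy as np

# Hypothetical feature matrix and labels
data = np.random.randn(120, 8)
labels = np.repeat(['a', 'b', 'c'], 40)

dp = EnumDataProvider(data, labels, balanced=True)
trainset, testset = dp.split(0.2, limits=(10, 15))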
Example #4
    def handle(self, *args, **options):
        clsf_type = options['clsf_type']
        database_name = options['database_name']
        source = options['source']
        annotator_name = options['annotator_name']
        label_level = options['label_level']
        min_occur = options['min_occur']
        ipc = options['ipc']
        ratio_ = options['ratio']
        profile = options.get('profile', None)
        load_dir = options['load_dir']

        tsv_file = profile + '.tsv'
        trials_file = profile + '.trials'

        if ipc is not None:
            assert ipc <= min_occur, 'Instances per class cannot exceed min-occur'
            ipc_min = ipc
            ipc_max = ipc
        else:
            ipc_min = min_occur
            ipc_max = int(np.floor(min_occur * 1.5))

        train_ratio, valid_ratio = get_ratios(ratio_, 2)

        open_mode = 'w'

        assert clsf_type in classifiers.keys(), 'Unknown classifier type: {}'.format(
            clsf_type)
        classifier = classifiers[clsf_type]

        database = get_or_error(Database, dict(name__iexact=database_name))
        annotator = get_or_error(User, dict(username__iexact=annotator_name))
        aggregations = Aggregation.objects.filter(enabled=True).order_by('id')
        aggregators = [aggregator_map[x.name] for x in aggregations]

        _sids, _tids = get_sids_tids(database)
        _labels, no_label_ids = get_labels_by_sids(_sids, label_level,
                                                   annotator, min_occur)
        if len(no_label_ids) > 0:
            _sids, _tids, _labels = exclude_no_labels(_sids, _tids, _labels,
                                                      no_label_ids)

        unique_labels, enum_labels = np.unique(_labels, return_inverse=True)
        fold = split_classwise(enum_labels,
                               ratio=valid_ratio,
                               limits=(min_occur,
                                       int(np.floor(min_occur * 1.5))),
                               nfolds=1,
                               balanced=True)
        train = fold[0]['train']
        test = fold[0]['test']
        all_indices = np.concatenate((train, test))

        tids = _tids[all_indices]
        labels = _labels[all_indices]

        with open('/tmp/hyperopt.pkl', 'rb') as f:
            saved = pickle.load(f)

        performance_data = saved[clsf_type]
        accuracies = performance_data['accuracies']
        groups = performance_data['groups']
        params = performance_data['params']

        group_name = '{}-{}'.format('mfcc', source)
        group_member_inds = np.where(groups == group_name)
        group_accuracies = accuracies[group_member_inds]

        best_acc_idx = np.argmax(group_accuracies)

        group_params = {}
        best_params = {}
        for param_name in params:
            param_values = np.array(params[param_name])
            group_param_values = param_values[group_member_inds]
            group_params[param_name] = group_param_values

            converter = converters[clsf_type][param_name]
            best_params[param_name] = converter(
                group_param_values[best_acc_idx])

        params_names = []
        params_converters = []
        params_count = 0

        # Derive an equivalent fold count from the validation share, e.g. a
        # validation ratio of 0.2 of the train+valid pool yields 5 folds
        v2t_ratio = valid_ratio / (train_ratio + valid_ratio)
        nfolds = int(np.floor(1. / v2t_ratio + 0.01))

        # Objective minimised by hyperopt: extract MFCC features for the
        # candidate (ncep, fmin, fmax), aggregate them per unit, z-score,
        # then cross-validate the classifier and return 1 - accuracy
        def loss(params):
            mfcc_args = {}
            for i in range(params_count):
                param_name = params_names[i]
                param_converter = params_converters[i]
                param_value = params[i]
                mfcc_args[param_name] = param_converter(param_value)

            _fmin = mfcc_args['fmin']
            _fmax = mfcc_args['fmax']
            _ncep = mfcc_args['ncep']

            extract_mfcc_multiparams(database_name, load_dir, _ncep, _fmin,
                                     _fmax)

            data = []
            tid2rows = {tid: [] for tid in tids}

            for aggregator in aggregators:
                agg_saved_file = 'database={}-feature=mfcc-aggregator={}-fmin={}-fmax={}-ncep={}.pkl'\
                    .format(database_name, aggregator.get_name(), _fmin, _fmax, _ncep)
                agg_saved_file_loc = os.path.join(load_dir, agg_saved_file)

                with open(agg_saved_file_loc, 'rb') as f:
                    tid2aval = pickle.load(f)
                    for tid in tids:
                        val = tid2aval[tid]
                        row = tid2rows[tid]
                        row.append(val)

            for tid in tids:
                row = tid2rows[tid]
                row = np.hstack(row).T
                data.append(row)
            data = np.array(data)
            data = zscore(data)
            data[np.isnan(data)] = 0
            data[np.isinf(data)] = 0

            unique_labels = np.unique(labels)
            nlabels = len(unique_labels)

            dp = EnumDataProvider(data, labels, balanced=True)
            trainvalidset, _ = dp.split(0, limits=(ipc_min, ipc_max))

            score = perform_k_fold(classifier, trainvalidset, nfolds,
                                   v2t_ratio, nlabels, **best_params)
            return 1. - score

        # Search space: ncep is later rounded to an integer; fmin/fmax are
        # sampled in units of 100 Hz and 1 kHz respectively and scaled back
        # to Hz by the converters below
        ncep_choices = hp.uniform('ncep', 13, 48)
        fmin_choices = hp.uniform('fmin', 0, 5)
        fmax_choices = hp.uniform('fmax', 8, 24)
        mfcc_params = {
            'ncep': (lambda x: int(np.round(x)), ncep_choices),
            'fmin': (lambda x: int(np.round(x) * 100), fmin_choices),
            'fmax': (lambda x: int(np.round(x) * 1000), fmax_choices),
        }

        space = []

        for arg_name, (converter, arg_values) in mfcc_params.items():
            space.append(arg_values)
            params_names.append(arg_name)
            params_converters.append(converter)
            params_count += 1

        trials = Trials()
        best = fmin(fn=loss,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=100,
                    trials=trials)
        print(best)

        with open(trials_file, 'wb') as f:
            pickle.dump(trials, f)

        best_trial = trials.best_trial
        best_trial_args_values_ = best_trial['misc']['vals']
        best_trial_args_values = {}
        for arg_name, arg_values in best_trial_args_values_.items():
            converter = mfcc_params[arg_name][0]
            arg_value = converter(arg_values[0])
            best_trial_args_values[arg_name] = arg_value

        model_args = ['id'] + list(
            best_trial_args_values.keys()) + ['accuracy']

        model_args_values = {x: [] for x in model_args}
        for idx, trial in enumerate(trials.trials):
            if trial == best_trial:
                idx = 'Best'
            trial_args_values = trial['misc']['vals']
            for arg_name in model_args:
                if arg_name == 'id':
                    model_args_values['id'].append(idx)
                elif arg_name == 'accuracy':
                    trial_accuracy = 1. - trial['result']['loss']
                    model_args_values['accuracy'].append(trial_accuracy)
                else:
                    converter = mfcc_params[arg_name][0]
                    val = converter(trial_args_values[arg_name][0])
                    model_args_values[arg_name].append(val)

        with open(tsv_file, open_mode, encoding='utf-8') as f:
            for arg in model_args:
                values = model_args_values[arg]
                f.write('{}\t'.format(arg))
                f.write('\t'.join(map(str, values)))
                f.write('\n')
            open_mode = 'a'
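
The hyperopt pattern this command relies on, reduced to a minimal self-contained sketch; the quadratic objective is a placeholder for the MFCC cross-validation loss above:

from hyperopt import Trials, fmin, hp, tpe

def objective(args):
    x, y = args
    # Placeholder loss with a known minimum at (3, -1)
    return (x - 3) ** 2 + (y + 1) ** 2

space = [hp.uniform('x', -10, 10), hp.uniform('y', -10, 10)]
trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest,
            max_evals=100, trials=trials)
print(best)                                    # e.g. {'x': 2.98, 'y': -1.01}
print(trials.best_trial['result']['loss'])     # smallest loss seen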
Example #5
    def make_folds(self, nfolds, ratio=None):
        if ratio is None:
            ratio = 1. / nfolds
        self.folds = split_classwise(self.enum_labels, ratio, nfolds)
        return self.folds
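
And a matching sketch for make_folds, under the same assumption that it lives on a provider class like EnumDataProvider; each returned fold is a dict of 'train' and 'test' index arrays:

dp = EnumDataProvider(data, labels, balanced=True)
folds = dp.make_folds(10)
for fold in folds:
    train_idx, test_idx = fold['train'], fold['test']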