Example #1
    def handle(self, *args, **options):
        database_name = options['database_name']
        annotator_name = options['annotator_name']
        label_level = options['label_level']
        min_occur = options['min_occur']
        ipc = options['ipc']
        ratio_ = options['ratio']
        niters = options['niters']
        profile = options.get('profile', None)
        tsv_file = profile + '.tsv'

        if ipc is not None:
            assert ipc <= min_occur, 'Instances per class cannot exceed min-occur'
            ipc_min = ipc
            ipc_max = ipc
        else:
            ipc_min = min_occur
            ipc_max = int(np.floor(min_occur * 1.5))

        train_ratio, valid_ratio, test_ratio = get_ratios(ratio_)

        open_mode = 'w'

        database = get_or_error(Database, dict(name__iexact=database_name))
        annotator = get_or_error(User, dict(username__iexact=annotator_name))
        features = Feature.objects.all().order_by('id').filter(name='spectrum')

        sids, tids = get_sids_tids(database)
        labels, no_label_ids = get_labels_by_sids(sids, label_level, annotator,
                                                  min_occur)

        if len(no_label_ids) > 0:
            sids, tids, labels = exclude_no_labels(sids, tids, labels,
                                                   no_label_ids)

        full_data = extract_rawdata(tids, features)
        data = [x[0].T for x in full_data]

        unique_labels = np.unique(labels)
        nlabels = len(unique_labels)

        dp = OneHotSequenceProvider(data, labels, balanced=True)
        trainvalidset, testset = dp.split(
            test_ratio, limits=(min_occur, int(np.floor(min_occur * 1.5))))

        v2t_ratio = valid_ratio / (train_ratio + valid_ratio)
        nfolds = int(np.floor(1. / v2t_ratio + 0.01))

        hidden_layer_sizes_choices = [
            (100, ),
            (200, ),
            (400, ),
            (100, 100),
            (100, 200),
            (100, 400),
            (200, 100),
            (200, 200),
            (200, 400),
            (400, 100),
            (400, 200),
            (400, 400),
        ]

        choices = {'cnn': {'hidden_layer_sizes': hidden_layer_sizes_choices}}

        best_trial_args_values = {}

        for arg_name, arg_values in choices['cnn'].items():
            losses = []
            ids = []

            def loss_func(params):
                arg_value = params[0]
                classifier_args = best_trial_args_values.copy()
                classifier_args[arg_name] = arg_value
                print('classifier_args = {}'.format(classifier_args))
                score = perform_k_fold(cnn, trainvalidset, nfolds, v2t_ratio,
                                       nlabels, **classifier_args)
                return 1. - score

            for idx, arg_value in enumerate(arg_values):
                loss = loss_func((arg_value, ))
                ids.append(idx)
                losses.append(loss)

            best_loss_idx = np.argmin(losses)
            best_arg_value = arg_values[best_loss_idx]
            best_trial_args_values[arg_name] = best_arg_value

            model_args = ['id'] + list(
                best_trial_args_values.keys()) + ['accuracy']

            model_args_values = {x: [] for x in model_args}
            for idx, loss in enumerate(losses):
                if idx == best_loss_idx:
                    idx_str = 'Best'
                else:
                    idx_str = str(idx)
                # trial_args_values = trial['misc']['vals']
                for arg_name_ in model_args:
                    if arg_name_ == 'id':
                        model_args_values['id'].append(idx_str)
                    elif arg_name_ == 'accuracy':
                        trial_accuracy = 1. - loss
                        model_args_values['accuracy'].append(trial_accuracy)
                    else:
                        if arg_name_ == arg_name:
                            val = arg_values[idx]
                        else:
                            val = best_trial_args_values[arg_name_]
                        model_args_values[arg_name_].append(val)

            with open(tsv_file, open_mode, encoding='utf-8') as f:
                for arg in model_args:
                    values = model_args_values[arg]
                    f.write('{}\t'.format(arg))
                    f.write('\t'.join(map(str, values)))
                    f.write('\n')
                open_mode = 'a'

        # Perform classification on the test set
        nfolds = int(np.floor(1 / test_ratio + 0.01))
        ntrials = nfolds * niters
        label_prediction_scores = [0] * ntrials
        label_hitss = [0] * ntrials
        label_missess = [0] * ntrials
        label_hitrates = np.empty((ntrials, nlabels))
        label_hitrates[:] = np.nan
        importancess = np.empty((ntrials, data.shape[1]))
        cfmats = np.ndarray((ntrials, nlabels, nlabels))
        ind = 0

        bar = Bar('Running CNN', max=ntrials)

        for iter in range(niters):
            traintetset, _ = dp.split(0, limits=(ipc_min, ipc_max))
            traintetset.make_folds(nfolds, test_ratio)

            for k in range(nfolds):
                trainset, testset = traintetset.get_fold(k)
                train_x = np.array(trainset.data)
                train_y = np.array(trainset.labels, dtype=np.int32)
                test_x = np.array(testset.data)
                test_y = np.array(testset.labels, dtype=np.int32)

                score, label_hits, label_misses, cfmat, importances =\
                    cnn(train_x, train_y, test_x, test_y, nlabels, True, **best_trial_args_values)

                label_prediction_scores[ind] = score
                label_hitss[ind] = label_hits
                label_missess[ind] = label_misses

                label_hitrate = label_hits / (label_hits + label_misses).astype(float)

                label_hitrates[ind, :] = label_hitrate
                importancess[ind, :] = importances
                cfmats[ind, :, :] = cfmat

                bar.next()
                ind += 1
        bar.finish()

        mean_label_prediction_scores = np.nanmean(label_prediction_scores)
        std_label_prediction_scores = np.nanstd(label_prediction_scores)
        sum_cfmat = np.nansum(cfmats, axis=0)

        with open(tsv_file, open_mode, encoding='utf-8') as f:
            f.write('Results using best-model\'s parameters on testset\n')
            f.write('Feature group\tLabel prediction mean\tstdev\t{}\n'.format(
                '\t'.join(unique_labels)))
            f.write('{}\t{}\t{}\t{}\n'.format(
                'Spectrum', mean_label_prediction_scores,
                std_label_prediction_scores,
                '\t'.join(map(str, np.nanmean(label_hitrates, 0)))))

            f.write('\t')
            f.write('\t'.join(unique_labels))
            f.write('\n')
            for i in range(nlabels):
                label = unique_labels[i]
                cfrow = sum_cfmat[:, i]
                f.write(label)
                f.write('\t')
                f.write('\t'.join(map(str, cfrow)))
                f.write('\n')
            f.write('\n')
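The hyper-parameter sweep above scores every hidden_layer_sizes candidate with perform_k_fold(), which is imported from elsewhere in the project and not shown in these listings. The sketch below is only a guess at its behaviour, assuming the fold API mirrors the make_folds()/get_fold() calls used later in this example and that the classifier callable returns the prediction score as its first value:

import numpy as np

def perform_k_fold(classifier, trainvalidset, nfolds, v2t_ratio, nlabels, **classifier_args):
    """Hypothetical k-fold scorer: average the classifier's validation accuracy over nfolds folds."""
    trainvalidset.make_folds(nfolds, v2t_ratio)
    scores = []
    for k in range(nfolds):
        trainset, validset = trainvalidset.get_fold(k)
        train_x = np.array(trainset.data)
        train_y = np.array(trainset.labels, dtype=np.int32)
        valid_x = np.array(validset.data)
        valid_y = np.array(validset.labels, dtype=np.int32)
        # Assumed return convention: (score, hits, misses, cfmat, importances)
        score = classifier(train_x, train_y, valid_x, valid_y, nlabels, True, **classifier_args)[0]
        scores.append(score)
    return float(np.mean(scores))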
Example #2
    def handle(self, *args, **options):
        clsf_type = options['clsf_type']
        database_name = options['database_name']
        source = options['source']
        annotator_name = options['annotator_name']
        label_level = options['label_level']
        min_occur = options['min_occur']
        ipc = options['ipc']
        ratio_ = options['ratio']
        niters = options['niters']
        profile = options.get('profile', None)
        tsv_file = profile + '.tsv'
        if ipc is not None:
            assert ipc <= min_occur, 'Instances per class cannot exceed min-occur'
            ipc_min = ipc
            ipc_max = ipc
        else:
            ipc_min = min_occur
            ipc_max = int(np.floor(min_occur * 1.5))

        train_ratio, valid_ratio = get_ratios(ratio_, 2)

        open_mode = 'w'

        assert clsf_type in classifiers.keys(), 'Unknown _classify: {}'.format(
            clsf_type)
        classifier = classifiers[clsf_type]

        database = get_or_error(Database, dict(name__iexact=database_name))
        annotator = get_or_error(User, dict(username__iexact=annotator_name))

        features = Feature.objects.all().order_by('id')
        aggregations = Aggregation.objects.filter(enabled=True).order_by('id')
        aggregators = [aggregator_map[x.name] for x in aggregations]

        enabled_features = []
        for f in features:
            if f.name in feature_map:
                enabled_features.append(f)

        features_hash = '-'.join(
            list(map(str, [x.id for x in enabled_features])))
        aggregations_hash = '-'.join(
            list(map(str, aggregations.values_list('id', flat=True))))

        dm = DataMatrix.objects.filter(
            database=database,
            features_hash=features_hash,
            aggregations_hash=aggregations_hash).last()
        if dm is None:
            raise Exception(
                'No full data matrix for database {}'.format(database_name))

        dm_sids_path = dm.get_sids_path()
        dm_tids_path = dm.get_tids_path()
        dm_bytes_path = dm.get_bytes_path()
        feature_cols = dm.get_cols_path()
        with open(feature_cols, 'r', encoding='utf-8') as f:
            col_inds = json.load(f)

        _sids = bytes_to_ndarray(dm_sids_path, np.int32)
        _sids, sort_order = np.unique(_sids, return_index=True)

        try:
            _tids = bytes_to_ndarray(dm_tids_path, np.int32)
            _tids = _tids[sort_order]
        except FileNotFoundError:
            _tids = get_tids(_sids)

        full_data = get_rawdata_from_binary(dm_bytes_path, len(_sids))
        full_data = full_data[sort_order, :]

        labels, no_label_ids = get_labels_by_sids(_sids, label_level,
                                                  annotator, min_occur)

        if len(no_label_ids) > 0:
            sids, tids, labels = exclude_no_labels(_sids, _tids, labels,
                                                   no_label_ids)
            lookup_ids_rows = np.searchsorted(_sids, sids)
            full_data = full_data[lookup_ids_rows, :]

        full_data = zscore(full_data)
        full_data[np.where(np.isnan(full_data))] = 0
        full_data[np.where(np.isinf(full_data))] = 0

        unique_labels = np.unique(labels)
        nlabels = len(unique_labels)

        for ftgroup_name, feature_names in ftgroup_names.items():
            if ftgroup_name == 'all':
                features = list(feature_map.values())
            else:
                features = [feature_map[x] for x in feature_names]
            ft_col_inds = []
            for feature in features:
                if feature.is_fixed_length:
                    col_name = feature.name
                    col_range = col_inds[col_name]
                    ft_col_inds += range(col_range[0], col_range[1])
                else:
                    for aggregator in aggregators:
                        col_name = '{}_{}'.format(feature.name,
                                                  aggregator.get_name())
                        col_range = col_inds[col_name]
                        ft_col_inds += range(col_range[0], col_range[1])

            ft_col_inds = np.array(ft_col_inds, dtype=np.int32)
            ndims = len(ft_col_inds)
            data = full_data[:, ft_col_inds]

            if source == 'pca':
                explained, data = pca_optimal(data, ndims, 0.9)
                pca_dims = data.shape[1]

            with open('/tmp/hyperopt.pkl', 'rb') as f:
                saved = pickle.load(f)

            performance_data = saved[clsf_type]
            accuracies = performance_data['accuracies']
            groups = performance_data['groups']
            params = performance_data['params']

            group_name = '{}-{}'.format(ftgroup_name, source)
            group_member_inds = np.where(groups == group_name)
            group_accuracies = accuracies[group_member_inds]

            best_acc_idx = np.argmax(group_accuracies)

            group_params = {}
            best_params = {}
            for param_name in params:
                param_values = np.array(params[param_name])
                group_param_values = param_values[group_member_inds]
                group_params[param_name] = group_param_values

                converter = converters[clsf_type][param_name]
                best_params[param_name] = converter(
                    group_param_values[best_acc_idx])

            dp = EnumDataProvider(data, labels, balanced=True)

            nfolds = int(np.floor(1 / valid_ratio + 0.01))
            ntrials = nfolds * niters
            label_prediction_scores = [0] * ntrials
            label_hitss = [0] * ntrials
            label_missess = [0] * ntrials
            label_hitrates = np.empty((ntrials, nlabels))
            label_hitrates[:] = np.nan
            importancess = np.empty((ntrials, data.shape[1]))
            cfmats = np.ndarray((ntrials, nlabels, nlabels))

            ind = 0

            bar = Bar('Features: {}. Classifier: {} Data type: {}...'.format(
                ftgroup_name, clsf_type, source),
                      max=ntrials)

            for iter in range(niters):
                traintetset, _ = dp.split(0, limits=(ipc_min, ipc_max))
                traintetset.make_folds(nfolds, valid_ratio)
                for k in range(nfolds):
                    trainset, testset = traintetset.get_fold(k)
                    train_x = np.array(trainset.data)
                    train_y = np.array(trainset.labels, dtype=np.int32)
                    test_x = np.array(testset.data)
                    test_y = np.array(testset.labels, dtype=np.int32)

                    score, label_hits, label_misses, cfmat, importances = \
                        classifier(train_x, train_y, test_x, test_y, nlabels, True, **best_params)

                    label_prediction_scores[ind] = score
                    label_hitss[ind] = label_hits
                    label_missess[ind] = label_misses

                    label_hitrate = label_hits / (label_hits + label_misses).astype(float)

                    label_hitrates[ind, :] = label_hitrate
                    importancess[ind, :] = importances
                    cfmats[ind, :, :] = cfmat

                    bar.next()
                    ind += 1
            bar.finish()

            mean_label_prediction_scores = np.nanmean(label_prediction_scores)
            std_label_prediction_scores = np.nanstd(label_prediction_scores)
            sum_cfmat = np.nansum(cfmats, axis=0)

            with open(tsv_file, open_mode, encoding='utf-8') as f:
                if source == 'full':
                    f.write('{}\t{}\t{}\t{}\t{}\n'.format(
                        ftgroup_name, ndims, mean_label_prediction_scores,
                        std_label_prediction_scores,
                        '\t'.join(map(str, np.nanmean(label_hitrates, 0)))))
                else:
                    f.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                        ftgroup_name, ndims, explained, pca_dims,
                        mean_label_prediction_scores,
                        std_label_prediction_scores,
                        '\t'.join(map(str, np.nanmean(label_hitrates, 0)))))

                f.write('Accuracy: \n')
                f.write('\t'.join(list(map(str, label_prediction_scores))))
                f.write('\n')
                f.write('\t')
                f.write('\t'.join(unique_labels))
                f.write('\n')
                for i in range(nlabels):
                    label = unique_labels[i]
                    cfrow = sum_cfmat[:, i]
                    f.write(label)
                    f.write('\t')
                    f.write('\t'.join(map(str, cfrow)))
                    f.write('\n')
                f.write('\n')
                open_mode = 'a'
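Both examples split the --ratio option with get_ratios(), another helper that is not listed here. Judging from the two call sites, get_ratios(ratio_) yields three ratios and get_ratios(ratio_, 2) yields two; a minimal sketch, assuming the option is a colon-separated list of percentages (e.g. '80:10:10'):

def get_ratios(ratio_str, nparts=3):
    """Hypothetical parser: '80:10:10' -> (0.8, 0.1, 0.1)."""
    parts = [float(x) for x in ratio_str.split(':')]
    assert len(parts) == nparts, 'Expected {} parts in ratio "{}"'.format(nparts, ratio_str)
    total = sum(parts)
    return tuple(p / total for p in parts)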
Example #3
    def handle(self, *args, **options):
        clsf_type = options['clsf_type']
        database_name = options['database_name']
        source = options['source']
        annotator_name = options['annotator_name']
        label_level = options['label_level']
        min_occur = options['min_occur']
        ipc = options['ipc']
        ratio_ = options['ratio']
        profile = options.get('profile', None)
        load_dir = options['load_dir']

        tsv_file = profile + '.tsv'
        trials_file = profile + '.trials'

        if ipc is not None:
            assert ipc <= min_occur, 'Instances per class cannot exceed min-occur'
            ipc_min = ipc
            ipc_max = ipc
        else:
            ipc_min = min_occur
            ipc_max = int(np.floor(min_occur * 1.5))

        train_ratio, valid_ratio = get_ratios(ratio_, 2)

        open_mode = 'w'

        assert clsf_type in classifiers.keys(), 'Unknown _classify: {}'.format(
            clsf_type)
        classifier = classifiers[clsf_type]

        database = get_or_error(Database, dict(name__iexact=database_name))
        annotator = get_or_error(User, dict(username__iexact=annotator_name))
        aggregations = Aggregation.objects.filter(enabled=True).order_by('id')
        aggregators = [aggregator_map[x.name] for x in aggregations]

        _sids, _tids = get_sids_tids(database)
        _labels, no_label_ids = get_labels_by_sids(_sids, label_level,
                                                   annotator, min_occur)
        if len(no_label_ids) > 0:
            _sids, _tids, _labels = exclude_no_labels(_sids, _tids, _labels,
                                                      no_label_ids)

        unique_labels, enum_labels = np.unique(_labels, return_inverse=True)
        fold = split_classwise(enum_labels,
                               ratio=valid_ratio,
                               limits=(min_occur,
                                       int(np.floor(min_occur * 1.5))),
                               nfolds=1,
                               balanced=True)
        train = fold[0]['train']
        test = fold[0]['test']
        all_indices = np.concatenate((train, test))

        tids = _tids[all_indices]
        labels = _labels[all_indices]

        with open('/tmp/hyperopt.pkl', 'rb') as f:
            saved = pickle.load(f)

        performance_data = saved[clsf_type]
        accuracies = performance_data['accuracies']
        groups = performance_data['groups']
        params = performance_data['params']

        group_name = '{}-{}'.format('mfcc', source)
        group_member_inds = np.where(groups == group_name)
        group_accuracies = accuracies[group_member_inds]

        best_acc_idx = np.argmax(group_accuracies)

        group_params = {}
        best_params = {}
        for param_name in params:
            param_values = np.array(params[param_name])
            group_param_values = param_values[group_member_inds]
            group_params[param_name] = group_param_values

            converter = converters[clsf_type][param_name]
            best_params[param_name] = converter(
                group_param_values[best_acc_idx])

        params_names = []
        params_converters = []
        params_count = 0

        v2t_ratio = valid_ratio / (train_ratio + valid_ratio)
        nfolds = int(np.floor(1. / v2t_ratio + 0.01))

        def loss(params):
            mfcc_args = {}
            for i in range(params_count):
                param_name = params_names[i]
                param_converter = params_converters[i]
                param_value = params[i]
                mfcc_args[param_name] = param_converter(param_value)

            _fmin = mfcc_args['fmin']
            _fmax = mfcc_args['fmax']
            _ncep = mfcc_args['ncep']

            extract_mfcc_multiparams(database_name, load_dir, _ncep, _fmin,
                                     _fmax)

            data = []
            tid2rows = {tid: [] for tid in tids}

            for aggregator in aggregators:
                agg_saved_file = 'database={}-feature=mfcc-aggregator={}-fmin={}-fmax={}-ncep={}.pkl'\
                    .format(database_name, aggregator.get_name(), _fmin, _fmax, _ncep)
                agg_saved_file_loc = os.path.join(load_dir, agg_saved_file)

                with open(agg_saved_file_loc, 'rb') as f:
                    tid2aval = pickle.load(f)
                    for tid in tids:
                        val = tid2aval[tid]
                        row = tid2rows[tid]
                        row.append(val)

            for tid in tids:
                row = tid2rows[tid]
                row = np.hstack(row).T
                data.append(row)
            data = np.array(data)
            data = zscore(data)
            data[np.where(np.isnan(data))] = 0
            data[np.where(np.isinf(data))] = 0

            unique_labels = np.unique(labels)
            nlabels = len(unique_labels)

            dp = EnumDataProvider(data, labels, balanced=True)
            trainvalidset, _ = dp.split(0, limits=(ipc_min, ipc_max))

            score = perform_k_fold(classifier, trainvalidset, nfolds,
                                   v2t_ratio, nlabels, **best_params)
            return 1. - score

        ncep_choices = hp.uniform('ncep', 13, 48)
        fmin_choices = hp.uniform('fmin', 0, 5)
        fmax_choices = hp.uniform('fmax', 8, 24)
        mfcc_params = {
            'ncep': (lambda x: int(np.round(x)), ncep_choices),
            'fmin': (lambda x: int(np.round(x) * 100), fmin_choices),
            'fmax': (lambda x: int(np.round(x) * 1000), fmax_choices),
        }

        space = []

        for arg_name, (converter, arg_values) in mfcc_params.items():
            space.append(arg_values)
            params_names.append(arg_name)
            params_converters.append(converter)
            params_count += 1

        trials = Trials()
        best = fmin(fn=loss,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=100,
                    trials=trials)
        print(best)

        with open(trials_file, 'wb') as f:
            pickle.dump(trials, f)

        best_trial = trials.best_trial
        best_trial_args_values_ = best_trial['misc']['vals']
        best_trial_args_values = {}
        for arg_name, arg_values in best_trial_args_values_.items():
            converter = mfcc_params[arg_name][0]
            arg_value = converter(arg_values[0])
            best_trial_args_values[arg_name] = arg_value

        model_args = ['id'] + list(
            best_trial_args_values.keys()) + ['accuracy']

        model_args_values = {x: [] for x in model_args}
        for idx, trial in enumerate(trials.trials):
            if trial == best_trial:
                idx = 'Best'
            trial_args_values = trial['misc']['vals']
            for arg_name in model_args:
                if arg_name == 'id':
                    model_args_values['id'].append(idx)
                elif arg_name == 'accuracy':
                    trial_accuracy = 1. - trial['result']['loss']
                    model_args_values['accuracy'].append(trial_accuracy)
                else:
                    converter = mfcc_params[arg_name][0]
                    val = converter(trial_args_values[arg_name][0])
                    model_args_values[arg_name].append(val)

        with open(tsv_file, open_mode, encoding='utf-8') as f:
            for arg in model_args:
                values = model_args_values[arg]
                f.write('{}\t'.format(arg))
                f.write('\t'.join(map(str, values)))
                f.write('\n')
            open_mode = 'a'
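Example #3 pickles the whole hyperopt Trials object to <profile>.trials. To recover the tuned MFCC settings later without rerunning fmin(), the file can be reloaded and decoded with the same converter lambdas; a minimal sketch, assuming the mfcc_params mapping above and a placeholder profile name:

import pickle
import numpy as np

# Converters mirroring mfcc_params in the example above
converters = {
    'ncep': lambda x: int(np.round(x)),
    'fmin': lambda x: int(np.round(x) * 100),
    'fmax': lambda x: int(np.round(x) * 1000),
}

with open('myprofile.trials', 'rb') as f:  # 'myprofile' is a placeholder
    trials = pickle.load(f)

raw_vals = trials.best_trial['misc']['vals']  # hyperopt keeps the raw (unconverted) samples
best_mfcc_args = {name: conv(raw_vals[name][0]) for name, conv in converters.items()}
print(best_mfcc_args)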
Example #4
    def handle(self, *args, **options):
        clsf_type = options['clsf_type']
        database_name = options['database_name']
        source = options['source']
        annotator_name = options['annotator_name']
        label_level = options['label_level']
        min_occur = options['min_occur']
        ipc = options['ipc']
        ratio_ = options['ratio']
        profile = options['profile']
        agg = options['agg']

        tsv_file = profile + '.tsv'
        trials_file = profile + '.trials'
        if ipc is not None:
            assert ipc <= min_occur, 'Instances per class cannot exceed min-occur'
            ipc_min = ipc
            ipc_max = ipc
        else:
            ipc_min = min_occur
            ipc_max = int(np.floor(min_occur * 1.5))

        train_ratio, valid_ratio, test_ratio = get_ratios(ratio_)

        open_mode = 'w'

        assert clsf_type in classifiers.keys(), 'Unknown _classify: {}'.format(
            clsf_type)
        classifier = classifiers[clsf_type]

        database = get_or_error(Database, dict(name__iexact=database_name))
        annotator = get_or_error(User, dict(username__iexact=annotator_name))

        features = list(feature_map.values())
        aggregations = Aggregation.objects.filter(enabled=True).order_by('id')

        if agg == 'all':
            aggregators = [aggregator_map[x.name] for x in aggregations]
        else:
            aggregators = enabled_aggregators[agg]

        _sids, _tids = get_sids_tids(database)

        full_data, col_inds = extract_rawdata(_tids, features, aggregators)

        labels, no_label_ids = get_labels_by_sids(_sids, label_level,
                                                  annotator, min_occur)

        if len(no_label_ids) > 0:
            sids, tids, labels = exclude_no_labels(_sids, _tids, labels,
                                                   no_label_ids)
            lookup_ids_rows = np.searchsorted(_sids, sids)
            full_data = full_data[lookup_ids_rows, :]

        full_data = zscore(full_data)
        full_data[np.where(np.isnan(full_data))] = 0
        full_data[np.where(np.isinf(full_data))] = 0

        unique_labels = np.unique(labels)
        nlabels = len(unique_labels)

        for ftgroup_name, feature_names in ftgroup_names.items():
            if ftgroup_name == 'all':
                features = list(feature_map.values())
            else:
                features = [feature_map[x] for x in feature_names]
            ft_col_inds = []
            for feature in features:
                if feature.is_fixed_length:
                    col_name = feature.name
                    col_range = col_inds[col_name]
                    ft_col_inds += range(col_range[0], col_range[1])
                else:
                    for aggregator in aggregators:
                        col_name = '{}_{}'.format(feature.name,
                                                  aggregator.get_name())
                        col_range = col_inds[col_name]
                        ft_col_inds += range(col_range[0], col_range[1])

            ft_col_inds = np.array(ft_col_inds, dtype=np.int32)
            ndims = len(ft_col_inds)
            data = full_data[:, ft_col_inds]

            if source == 'pca':
                explained, data = pca_optimal(data, ndims, 0.9)
                pca_dims = data.shape[1]

            dp = EnumDataProvider(data, labels, balanced=True)
            trainvalidset, testset = dp.split(test_ratio,
                                              limits=(ipc_min, ipc_max))

            v2t_ratio = valid_ratio / (train_ratio + valid_ratio)
            nfolds = int(np.floor(1. / v2t_ratio + 0.01))

            params_names = []
            params_converters = []
            params_count = 0

            def loss(params):
                classifier_args = {}
                for i in range(params_count):
                    param_name = params_names[i]
                    param_converter = params_converters[i]
                    param_value = params[i]
                    classifier_args[param_name] = param_converter(param_value)

                print(classifier_args)
                score = perform_k_fold(classifier, trainvalidset, nfolds,
                                       v2t_ratio, nlabels, **classifier_args)
                return 1. - score

            n_estimators_choices = hp.uniform('n_estimators', 40, 100)
            min_samples_split_choices = hp.uniform('min_samples_split', 2, 21)
            min_samples_leaf_choices = hp.uniform('min_samples_leaf', 1, 20)

            n_features = data.shape[1]
            auto_gamma = 1 / n_features
            gamma_choices = hp.uniform('gamma', auto_gamma / 10,
                                       auto_gamma * 10)
            c_choices = hp.uniform('C', -1, 2)
            hidden_layer_size_choices = hp.uniform('hidden_layer_sizes', 100,
                                                   5000)
            n_neighbors_choices = hp.uniform('n_neighbors', 1, 10)

            choices = {
                'rf': {
                    'n_estimators':
                    (lambda x: int(np.round(x)), n_estimators_choices),
                    'min_samples_split':
                    (lambda x: int(np.round(x)), min_samples_split_choices),
                    'min_samples_leaf':
                    (lambda x: int(np.round(x)), min_samples_leaf_choices),
                },
                'svm_rbf': {
                    'gamma': (float, gamma_choices),
                    'C': (lambda x: 10**x, c_choices),
                },
                'svm_linear': {
                    'C': (lambda x: 10**x, c_choices),
                },
                'nnet': {
                    'hidden_layer_sizes':
                    (lambda x: (int(np.round(x)), ), hidden_layer_size_choices)
                },
                'knn': {
                    'n_neighbors':
                    (lambda x: int(np.round(x)), n_neighbors_choices)
                }
            }

            space = []
            for arg_name, (converter,
                           arg_values) in choices[clsf_type].items():
                space.append(arg_values)
                params_names.append(arg_name)
                params_converters.append(converter)
                params_count += 1

            trials = Trials()
            max_evals = params_count * 10
            best = fmin(fn=loss,
                        space=space,
                        algo=tpe.suggest,
                        max_evals=max_evals,
                        trials=trials)
            print(best)

            with open(trials_file, 'wb') as f:
                pickle.dump(trials, f)

            best_trial = trials.best_trial
            best_trial_args_values_ = best_trial['misc']['vals']
            best_trial_args_values = {}
            for arg_name, arg_values in best_trial_args_values_.items():
                converter = choices[clsf_type][arg_name][0]
                arg_value = converter(arg_values[0])
                best_trial_args_values[arg_name] = arg_value

            model_args = ['id'] + list(
                best_trial_args_values.keys()) + ['accuracy']

            model_args_values = {x: [] for x in model_args}
            for idx, trial in enumerate(trials.trials):
                if trial == best_trial:
                    idx = 'Best'
                trial_args_values = trial['misc']['vals']
                for arg_name in model_args:
                    if arg_name == 'id':
                        model_args_values['id'].append(idx)
                    elif arg_name == 'accuracy':
                        trial_accuracy = 1. - trial['result']['loss']
                        model_args_values['accuracy'].append(trial_accuracy)
                    else:
                        # choice = choices[clsf_type][arg_name]
                        converter = choices[clsf_type][arg_name][0]
                        val = converter(trial_args_values[arg_name][0])
                        # val = choice[choice_idx]
                        model_args_values[arg_name].append(val)

            # Perform classification on the test set
            train_x = np.array(trainvalidset.data)
            train_y = np.array(trainvalidset.labels, dtype=np.int32)
            test_x = np.array(testset.data)
            test_y = np.array(testset.labels, dtype=np.int32)

            score, label_hits, label_misses, cfmat, importances =\
                classifier(train_x, train_y, test_x, test_y, nlabels, True, **best_trial_args_values)
            lb_hitrates = label_hits / (label_hits + label_misses).astype(float)

            with open(tsv_file, open_mode, encoding='utf-8') as f:
                for arg in model_args:
                    values = model_args_values[arg]
                    f.write('{}\t'.format(arg))
                    f.write('\t'.join(map(str, values)))
                    f.write('\n')

                f.write('Results using best-model\'s parameters on testset\n')

                if source == 'full':
                    f.write(
                        'Feature group\tNdims\tLabel prediction score\t{}\n'.
                        format('\t'.join(unique_labels)))
                    f.write('{}\t{}\t{}\t{}\n'.format(
                        ftgroup_name, ndims, score,
                        '\t'.join(map(str, lb_hitrates))))
                else:
                    f.write(
                        'Feature group\tNdims\tPCA explained\tPCA Dims\tLabel prediction score\t{}\n'
                        .format('\t'.join(unique_labels)))
                    f.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                        ftgroup_name, ndims, explained, pca_dims, score,
                        '\t'.join(map(str, lb_hitrates))))
                f.write('\n')
                open_mode = 'a'
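Examples #2 through #6 z-score the feature matrix before classification and then zero out any NaN/Inf cells the scaling produces. zscore() itself is not shown; presumably it standardises each feature column, roughly as in this sketch:

import numpy as np

def zscore(matrix):
    """Hypothetical column-wise standardisation: (x - mean) / std per feature."""
    mean = np.nanmean(matrix, axis=0)
    std = np.nanstd(matrix, axis=0)
    # Zero-variance columns become inf/nan here; the callers zero those cells afterwards
    return (matrix - mean) / std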
Example #5
    def handle(self, *args, **options):
        clsf_type = options['clsf_type']
        database_name = options['database_name']
        source = options['source']
        annotator_name = options['annotator_name']
        label_level = options['label_level']
        min_occur = options['min_occur']
        ratio_ = options['ratio']
        niters = options['niters']
        csv_filename = options.get('csv_filename', None)

        train_ratio, valid_ratio = get_ratios(ratio_, 2)

        assert clsf_type in classifiers.keys(), 'Unknown _classify: {}'.format(clsf_type)
        classifier = classifiers[clsf_type]

        database = get_or_error(Database, dict(name__iexact=database_name))
        annotator = get_or_error(User, dict(username__iexact=annotator_name))

        features = Feature.objects.all().order_by('id')
        aggregations = Aggregation.objects.filter(enabled=True).order_by('id')

        enabled_features = []
        for f in features:
            if f.name in feature_map:
                enabled_features.append(f)

        features_hash = '-'.join(list(map(str, [x.id for x in enabled_features])))
        aggregations_hash = '-'.join(list(map(str, aggregations.values_list('id', flat=True))))

        dm = DataMatrix.objects.filter(database=database, features_hash=features_hash,
                                       aggregations_hash=aggregations_hash).last()
        if dm is None:
            raise Exception('No full data matrix for database {}'.format(database_name))

        dm_sids_path = dm.get_sids_path()
        dm_tids_path = dm.get_tids_path()
        dm_bytes_path = dm.get_bytes_path()
        feature_cols = dm.get_cols_path()
        with open(feature_cols, 'r', encoding='utf-8') as f:
            col_inds = json.load(f)

        _sids = bytes_to_ndarray(dm_sids_path, np.int32)
        _sids, sort_order = np.unique(_sids, return_index=True)

        try:
            _tids = bytes_to_ndarray(dm_tids_path, np.int32)
            _tids = _tids[sort_order]
        except FileNotFoundError:
            _tids = get_tids(_sids)

        full_data = get_rawdata_from_binary(dm_bytes_path, len(_sids))
        full_data = full_data[sort_order, :]

        labels, no_label_ids = get_labels_by_sids(_sids, label_level, annotator, min_occur)

        if len(no_label_ids) > 0:
            sids, tids, labels = exclude_no_labels(_sids, _tids, labels, no_label_ids)
            lookup_ids_rows = np.searchsorted(_sids, sids)
            full_data = full_data[lookup_ids_rows, :]

        full_data = zscore(full_data)
        full_data[np.where(np.isnan(full_data))] = 0
        full_data[np.where(np.isinf(full_data))] = 0

        unique_labels = np.unique(labels)
        nlabels = len(unique_labels)

        if csv_filename:
            with open(csv_filename, 'w', encoding='utf-8') as f:
                if source == 'pca':
                    f.write('Feature group\tAggregators\tNdims\tPCA explained\tPCA Dims\tLabel prediction mean\tstdev'
                            '\t{}\n'.format('\t'.join(unique_labels)))
                else:
                    f.write('Feature group\tAggregators\tNdims\tLabel prediction mean\tstdev\t{}\n'
                            .format('\t'.join(unique_labels)))

        for ftgroup_name, feature_names in ftgroup_names.items():
            for agggroup_name, aggs in list(enabled_aggregators.items()) + [('all', None)]:
                if agggroup_name == 'all':
                    aggs = [aggregator_map[x.name] for x in aggregations]
                if ftgroup_name == 'all':
                    features = list(feature_map.values())
                else:
                    features = [feature_map[x] for x in feature_names]
                ft_col_inds = []
                for feature in features:
                    if feature.is_fixed_length:
                        col_name = feature.name
                        col_range = col_inds[col_name]
                        ft_col_inds += range(col_range[0], col_range[1])
                    else:
                        for aggregator in aggs:
                            col_name = '{}_{}'.format(feature.name, aggregator.get_name())
                            col_range = col_inds[col_name]
                            ft_col_inds += range(col_range[0], col_range[1])

                ft_col_inds = np.array(ft_col_inds, dtype=np.int32)
                ndims = len(ft_col_inds)
                data = full_data[:, ft_col_inds]

                if source == 'pca':
                    explained, data = pca_optimal(data, ndims, 0.9)
                    pca_dims = data.shape[1]

                dp = EnumDataProvider(data, labels, balanced=True)

                nfolds = int(np.floor(1 / valid_ratio + 0.01))
                ntrials = nfolds * niters
                label_prediction_scores = [0] * ntrials
                label_hitss = [0] * ntrials
                label_missess = [0] * ntrials
                label_hitrates = np.empty((ntrials, nlabels))
                label_hitrates[:] = np.nan
                importancess = np.empty((ntrials, data.shape[1]))
                cfmats = np.ndarray((ntrials, nlabels, nlabels))

                ind = 0

                bar = Bar('Features: {}. Aggregator: {}. Classifier: {} Data type: {}...'
                          .format(ftgroup_name, agggroup_name, clsf_type, source), max=ntrials)

                for iter in range(niters):
                    traintetset, _ = dp.split(0, limits=(min_occur, int(np.floor(min_occur * 1.5))))
                    traintetset.make_folds(nfolds, valid_ratio)
                    for k in range(nfolds):
                        trainset, testset = traintetset.get_fold(k)
                        train_x = np.array(trainset.data)
                        train_y = np.array(trainset.labels, dtype=np.int32)
                        test_x = np.array(testset.data)
                        test_y = np.array(testset.labels, dtype=np.int32)

                        score, label_hits, label_misses, cfmat, importances = \
                            classifier(train_x, train_y, test_x, test_y, nlabels, True)

                        label_prediction_scores[ind] = score
                        label_hitss[ind] = label_hits
                        label_missess[ind] = label_misses

                        label_hitrate = label_hits / (label_hits + label_misses).astype(float)

                        label_hitrates[ind, :] = label_hitrate
                        importancess[ind, :] = importances
                        cfmats[ind, :, :] = cfmat

                        bar.next()
                        ind += 1
                bar.finish()

                mean_label_prediction_scores = np.nanmean(label_prediction_scores)
                std_label_prediction_scores = np.nanstd(label_prediction_scores)
                sum_cfmat = np.nansum(cfmats, axis=0)

                if csv_filename:
                    with open(csv_filename, 'a', encoding='utf-8') as f:
                        if source == 'full':
                            f.write('{}\t{}\t{}\t{}\t{}\t{}\n'
                                    .format(ftgroup_name, agggroup_name, ndims, mean_label_prediction_scores,
                                            std_label_prediction_scores,
                                            '\t'.join(map(str, np.nanmean(label_hitrates, 0)))))
                        else:
                            f.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'
                                    .format(ftgroup_name, agggroup_name, ndims, explained, pca_dims,
                                            mean_label_prediction_scores, std_label_prediction_scores,
                                            '\t'.join(map(str, np.nanmean(label_hitrates, 0)))))
                        f.write('\t')
                        f.write('\t'.join(unique_labels))
                        f.write('\n')
                        for i in range(nlabels):
                            label = unique_labels[i]
                            cfrow = sum_cfmat[:, i]
                            f.write(label)
                            f.write('\t')
                            f.write('\t'.join(map(str, cfrow)))
                            f.write('\n')
                        f.write('\n')
                else:
                    print('{}/{}: {} by {}: mean = {} std = {}'
                          .format(ftgroup_name, agggroup_name, clsf_type, source, mean_label_prediction_scores,
                                  std_label_prediction_scores))
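Most of the examples call exclude_no_labels() to drop segments whose label is missing or too rare at the chosen label level. The helper is not reproduced here; one plausible reading, assuming no_label_ids holds the segment ids (sids) to discard and that tids may be None (as in Example #6):

import numpy as np

def exclude_no_labels(sids, tids, labels, no_label_ids):
    """Hypothetical filter: keep only entries whose sid is not in no_label_ids."""
    keep = ~np.in1d(sids, no_label_ids)
    filtered_tids = tids[keep] if tids is not None else None
    return sids[keep], filtered_tids, labels[keep]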
Example #6
    def handle(self, *args, **options):
        clsf_type = options['clsf_type']
        database_name = options['database_name']
        annotator_name = options['annotator_name']
        label_level = options['label_level']
        min_occur = options['min_occur']
        ipc = options['ipc']
        ratio_ = options['ratio']
        profile = options['profile']

        load_from = options['load_from']
        format = options['format']
        min_max_loc = options['min_max_loc']
        denormalised = options['denormalised']
        kernel_only = options['kernel_only']

        extractor = extractors[format]

        tsv_file = profile + '.tsv'
        trials_file = profile + '.trials'
        if ipc is not None:
            assert ipc <= min_occur, 'Instances per class cannot exceed min-occur'
            ipc_min = ipc
            ipc_max = ipc
        else:
            ipc_min = min_occur
            ipc_max = int(np.floor(min_occur * 1.5))

        train_ratio, valid_ratio, test_ratio = get_ratios(ratio_)

        open_mode = 'w'

        assert clsf_type in classifiers.keys(), 'Unknown _classify: {}'.format(
            clsf_type)
        classifier = classifiers[clsf_type]

        annotator = get_or_error(User, dict(username__iexact=annotator_name))

        if not load_from.lower().endswith('.zip'):
            load_from += '.zip'

        variables = read_variables(load_from)
        variables['extractor'] = extractor
        variables['denormalised'] = denormalised

        if denormalised:
            global_min, global_max = load_global_min_max(min_max_loc)
            variables['global_min'] = global_min
            variables['global_max'] = global_max

        variables['is_log_psd'] = format.startswith('log_')

        factory = NDS2SAEFactory()
        factory.set_output(load_from)
        factory.learning_rate = None
        factory.learning_rate_func = None
        encoder = factory.build()
        session = encoder.recreate_session()

        _sids, full_data = encode_into_data(variables, encoder, session,
                                            database_name, kernel_only)

        labels, no_label_ids = get_labels_by_sids(_sids, label_level,
                                                  annotator, min_occur)

        if len(no_label_ids) > 0:
            sids, _, labels = exclude_no_labels(_sids, None, labels,
                                                no_label_ids)
            lookup_ids_rows = np.searchsorted(_sids, sids)
            full_data = full_data[lookup_ids_rows, :]

        full_data = zscore(full_data)
        full_data[np.where(np.isnan(full_data))] = 0
        full_data[np.where(np.isinf(full_data))] = 0

        ndims = full_data.shape[1]

        unique_labels = np.unique(labels)
        nlabels = len(unique_labels)

        dp = EnumDataProvider(full_data, labels, balanced=True)
        trainvalidset, testset = dp.split(test_ratio,
                                          limits=(ipc_min, ipc_max))

        v2t_ratio = valid_ratio / (train_ratio + valid_ratio)
        nfolds = int(np.floor(1. / v2t_ratio + 0.01))

        params_names = []
        params_converters = []
        params_count = 0

        def loss(params):
            classifier_args = {}
            for i in range(params_count):
                param_name = params_names[i]
                param_converter = params_converters[i]
                param_value = params[i]
                classifier_args[param_name] = param_converter(param_value)

            print(classifier_args)
            score = perform_k_fold(classifier, trainvalidset, nfolds,
                                   v2t_ratio, nlabels, **classifier_args)
            return 1. - score

        n_estimators_choices = hp.uniform('n_estimators', 40, 100)
        min_samples_split_choices = hp.uniform('min_samples_split', 2, 21)
        min_samples_leaf_choices = hp.uniform('min_samples_leaf', 1, 20)

        n_features = full_data.shape[1]
        auto_gamma = 1 / n_features
        gamma_choices = hp.uniform('gamma', auto_gamma / 10, auto_gamma * 10)
        c_choices = hp.uniform('C', -1, 2)
        hidden_layer_size_choices = hp.uniform('hidden_layer_sizes', 100, 5000)
        n_neighbors_choices = hp.uniform('n_neighbors', 1, 10)

        choices = {
            'rf': {
                'n_estimators':
                (lambda x: int(np.round(x)), n_estimators_choices),
                'min_samples_split':
                (lambda x: int(np.round(x)), min_samples_split_choices),
                'min_samples_leaf':
                (lambda x: int(np.round(x)), min_samples_leaf_choices),
            },
            'svm_rbf': {
                'gamma': (float, gamma_choices),
                'C': (lambda x: 10**x, c_choices),
            },
            'svm_linear': {
                'C': (lambda x: 10**x, c_choices),
            },
            'nnet': {
                'hidden_layer_sizes':
                (lambda x: (int(np.round(x)), ), hidden_layer_size_choices)
            },
            'knn': {
                'n_neighbors':
                (lambda x: int(np.round(x)), n_neighbors_choices)
            }
        }

        space = []
        for arg_name, (converter, arg_values) in choices[clsf_type].items():
            space.append(arg_values)
            params_names.append(arg_name)
            params_converters.append(converter)
            params_count += 1

        trials = Trials()
        max_evals = params_count * 10
        best = fmin(fn=loss,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=max_evals,
                    trials=trials)
        print(best)

        with open(trials_file, 'wb') as f:
            pickle.dump(trials, f)

        best_trial = trials.best_trial
        best_trial_args_values_ = best_trial['misc']['vals']
        best_trial_args_values = {}
        for arg_name, arg_values in best_trial_args_values_.items():
            converter = choices[clsf_type][arg_name][0]
            arg_value = converter(arg_values[0])
            best_trial_args_values[arg_name] = arg_value

        model_args = ['id'] + list(
            best_trial_args_values.keys()) + ['accuracy']

        model_args_values = {x: [] for x in model_args}
        for idx, trial in enumerate(trials.trials):
            if trial == best_trial:
                idx = 'Best'
            trial_args_values = trial['misc']['vals']
            for arg_name in model_args:
                if arg_name == 'id':
                    model_args_values['id'].append(idx)
                elif arg_name == 'accuracy':
                    trial_accuracy = 1. - trial['result']['loss']
                    model_args_values['accuracy'].append(trial_accuracy)
                else:
                    # choice = choices[clsf_type][arg_name]
                    converter = choices[clsf_type][arg_name][0]
                    val = converter(trial_args_values[arg_name][0])
                    # val = choice[choice_idx]
                    model_args_values[arg_name].append(val)

        # Perform classification on the test set
        train_x = np.array(trainvalidset.data)
        train_y = np.array(trainvalidset.labels, dtype=np.int32)
        test_x = np.array(testset.data)
        test_y = np.array(testset.labels, dtype=np.int32)

        score, label_hits, label_misses, cfmat, importances =\
            classifier(train_x, train_y, test_x, test_y, nlabels, True, **best_trial_args_values)
        lb_hitrates = label_hits / (label_hits + label_misses).astype(float)

        with open(tsv_file, open_mode, encoding='utf-8') as f:
            for arg in model_args:
                values = model_args_values[arg]
                f.write('{}\t'.format(arg))
                f.write('\t'.join(map(str, values)))
                f.write('\n')

            f.write('Results using best-model\'s parameters on testset\n')
            f.write(
                'Feature group\tNdims\tLabel prediction score\t{}\n'.format(
                    '\t'.join(unique_labels)))
            f.write('{}\t{}\t{}\t{}\n'.format('s2senc', ndims, score,
                                              '\t'.join(map(str,
                                                            lb_hitrates))))

            f.write('\n')
            open_mode = 'a'
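get_or_error() is used throughout to fetch the Database and User records by name. It is not part of these listings; a minimal sketch of the assumed behaviour (return the single matching object or fail with a readable message):

def get_or_error(model, criteria):
    """Hypothetical lookup helper: return the matching object or raise if none exists."""
    obj = model.objects.filter(**criteria).first()
    if obj is None:
        raise Exception('No {} matching {}'.format(model.__name__, criteria))
    return obj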