예제 #1
0
 def process_class_measures(self, original_measures):
     """Return ``pdist`` output for the measures, optionally after PCA.

     When ``self.feature_grouper`` is ``'pca'`` the measures are first passed
     through ``pca_optimal`` (using ``self.pca_dimension`` and
     ``self.pca_explained``) and then through ``zscore``; the explained value
     and the resulting shape are printed. Any other grouper leaves the
     measures untouched.

     Returns:
         A ``(distances, measures)`` tuple where ``measures`` is the array
         that was handed to ``pdist`` (transformed or original).
     """
     # Fast path: no dimensionality reduction requested.
     if self.feature_grouper != 'pca':
         return pdist(original_measures), original_measures

     explained, reduced = pca_optimal(
         original_measures,
         self.pca_dimension * 2,
         self.pca_explained,
         self.pca_dimension,
     )
     reduced = zscore(reduced)
     print('explained = {}, data.shape= {}'.format(explained, reduced.shape))
     return pdist(reduced), reduced
예제 #2
0
    def handle(self, *args, **options):
        """Evaluate a classifier per feature group using previously saved parameters.

        For every entry of ``ftgroup_names`` the matching columns of the stored
        data matrix are selected (and reduced with ``pca_optimal`` when
        ``source == 'pca'``). The parameters with the highest recorded accuracy
        for the ``'<group>-<source>'`` key in ``/tmp/hyperopt.pkl`` are looked
        up, and the classifier is run ``niters`` times over ``nfolds`` folds.
        The mean/std score, mean per-label hit rates, the individual scores and
        the summed confusion matrix are written to ``<profile>.tsv``.

        Options read from ``options``: ``clsf_type``, ``database_name``,
        ``source``, ``annotator_name``, ``label_level``, ``min_occur``,
        ``ipc``, ``ratio``, ``niters`` and ``profile``.

        Raises:
            AssertionError: if ``ipc`` exceeds ``min_occur`` or ``clsf_type``
                is not a key of ``classifiers``.
            Exception: if no ``DataMatrix`` matches the database and the
                computed feature/aggregation hashes.
        """
        clsf_type = options['clsf_type']
        database_name = options['database_name']
        source = options['source']
        annotator_name = options['annotator_name']
        label_level = options['label_level']
        min_occur = options['min_occur']
        ipc = options['ipc']
        ratio_ = options['ratio']
        niters = options['niters']
        # NOTE(review): 'profile' is fetched with a None default, yet the
        # concatenation below raises TypeError if it really is None.
        profile = options.get('profile', None)
        tsv_file = profile + '.tsv'
        if ipc is not None:
            assert ipc <= min_occur, 'Instances per class cannot exceed as min-occur'
            ipc_min = ipc
            ipc_max = ipc
        else:
            ipc_min = min_occur
            ipc_max = int(np.floor(min_occur * 1.5))

        train_ratio, valid_ratio = get_ratios(ratio_, 2)

        # The first feature group truncates the output file; later ones append.
        open_mode = 'w'

        assert clsf_type in classifiers.keys(), 'Unknown _classify: {}'.format(
            clsf_type)
        classifier = classifiers[clsf_type]

        database = get_or_error(Database, dict(name__iexact=database_name))
        annotator = get_or_error(User, dict(username__iexact=annotator_name))

        features = Feature.objects.all().order_by('id')
        aggregations = Aggregation.objects.filter(enabled=True).order_by('id')
        aggregators = [aggregator_map[x.name] for x in aggregations]

        # Only features that are also present in feature_map contribute to the
        # hash used to locate the stored data matrix.
        enabled_features = []
        for f in features:
            if f.name in feature_map:
                enabled_features.append(f)

        features_hash = '-'.join(
            list(map(str, [x.id for x in enabled_features])))
        aggregations_hash = '-'.join(
            list(map(str, aggregations.values_list('id', flat=True))))

        dm = DataMatrix.objects.filter(
            database=database,
            features_hash=features_hash,
            aggregations_hash=aggregations_hash).last()
        if dm is None:
            raise Exception(
                'No full data matrix for database {}'.format(database_name))

        dm_sids_path = dm.get_sids_path()
        dm_tids_path = dm.get_tids_path()
        dm_bytes_path = dm.get_bytes_path()
        feature_cols = dm.get_cols_path()
        with open(feature_cols, 'r', encoding='utf-8') as f:
            col_inds = json.load(f)

        # np.unique sorts the ids; sort_order re-orders the companion arrays
        # (tids and data rows) so they stay aligned with the sorted ids.
        _sids = bytes_to_ndarray(dm_sids_path, np.int32)
        _sids, sort_order = np.unique(_sids, return_index=True)

        try:
            _tids = bytes_to_ndarray(dm_tids_path, np.int32)
            _tids = _tids[sort_order]
        except FileNotFoundError:
            # No stored tids file: derive them from the sids instead.
            _tids = get_tids(_sids)

        full_data = get_rawdata_from_binary(dm_bytes_path, len(_sids))
        full_data = full_data[sort_order, :]

        labels, no_label_ids = get_labels_by_sids(_sids, label_level,
                                                  annotator, min_occur)

        if len(no_label_ids) > 0:
            sids, tids, labels = exclude_no_labels(_sids, _tids, labels,
                                                   no_label_ids)
            # _sids is sorted, so searchsorted yields the row of each kept id.
            lookup_ids_rows = np.searchsorted(_sids, sids)
            full_data = full_data[lookup_ids_rows, :]

        # Normalise, then zero out NaN/inf values left by the normalisation.
        full_data = zscore(full_data)
        full_data[np.where(np.isnan(full_data))] = 0
        full_data[np.where(np.isinf(full_data))] = 0

        unique_labels = np.unique(labels)
        nlabels = len(unique_labels)

        # The saved tuning results do not depend on the feature group, so they
        # are loaded once rather than on every loop iteration.
        # SECURITY NOTE: unpickling can execute arbitrary code and this file
        # lives at a fixed path under /tmp; only run this where that file is
        # trusted.
        with open('/tmp/hyperopt.pkl', 'rb') as f:
            saved = pickle.load(f)

        performance_data = saved[clsf_type]
        accuracies = performance_data['accuracies']
        groups = performance_data['groups']
        params = performance_data['params']

        for ftgroup_name, feature_names in ftgroup_names.items():
            if ftgroup_name == 'all':
                features = list(feature_map.values())
            else:
                features = [feature_map[x] for x in feature_names]

            # Collect the data-matrix column indices belonging to this group.
            ft_col_inds = []
            for feature in features:
                if feature.is_fixed_length:
                    col_name = feature.name
                    col_range = col_inds[col_name]
                    ft_col_inds += range(col_range[0], col_range[1])
                else:
                    # Other features have one column range per aggregator.
                    for aggregator in aggregators:
                        col_name = '{}_{}'.format(feature.name,
                                                  aggregator.get_name())
                        col_range = col_inds[col_name]
                        ft_col_inds += range(col_range[0], col_range[1])

            ft_col_inds = np.array(ft_col_inds, dtype=np.int32)
            ndims = len(ft_col_inds)
            data = full_data[:, ft_col_inds]

            if source == 'pca':
                explained, data = pca_optimal(data, ndims, 0.9)
                pca_dims = data.shape[1]

            # Pick the saved trial with the highest accuracy for this
            # feature-group / source combination.
            group_name = '{}-{}'.format(ftgroup_name, source)
            group_member_inds = np.where(groups == group_name)
            group_accuracies = accuracies[group_member_inds]

            best_acc_idx = np.argmax(group_accuracies)

            best_params = {}
            for param_name in params:
                param_values = np.array(params[param_name])
                group_param_values = param_values[group_member_inds]

                converter = converters[clsf_type][param_name]
                best_params[param_name] = converter(
                    group_param_values[best_acc_idx])

            dp = EnumDataProvider(data, labels, balanced=True)

            # The small epsilon guards against float error in 1 / valid_ratio.
            nfolds = int(np.floor(1 / valid_ratio + 0.01))
            ntrials = nfolds * niters
            label_prediction_scores = [0] * ntrials
            label_hitss = [0] * ntrials
            label_missess = [0] * ntrials
            label_hitrates = np.empty((ntrials, nlabels))
            label_hitrates[:] = np.nan
            importancess = np.empty((ntrials, data.shape[1]))
            cfmats = np.ndarray((ntrials, nlabels, nlabels))

            ind = 0

            bar = Bar('Features: {}. Classifier: {} Data type: {}...'.format(
                ftgroup_name, clsf_type, source),
                      max=ntrials)

            for _ in range(niters):
                traintetset, _unused = dp.split(0, limits=(ipc_min, ipc_max))
                traintetset.make_folds(nfolds, valid_ratio)
                for k in range(nfolds):
                    trainset, testset = traintetset.get_fold(k)
                    train_x = np.array(trainset.data)
                    train_y = np.array(trainset.labels, dtype=np.int32)
                    test_x = np.array(testset.data)
                    test_y = np.array(testset.labels, dtype=np.int32)

                    score, label_hits, label_misses, cfmat, importances = \
                        classifier(train_x, train_y, test_x, test_y, nlabels, True, **best_params)

                    label_prediction_scores[ind] = score
                    label_hitss[ind] = label_hits
                    label_missess[ind] = label_misses

                    # Builtin float replaces the np.float alias, which was
                    # removed in NumPy 1.24.
                    label_hitrate = label_hits / (
                        label_hits + label_misses).astype(float)

                    label_hitrates[ind, :] = label_hitrate
                    importancess[ind, :] = importances
                    cfmats[ind, :, :] = cfmat

                    bar.next()
                    ind += 1
            bar.finish()

            mean_label_prediction_scores = np.nanmean(label_prediction_scores)
            std_label_prediction_scores = np.nanstd(label_prediction_scores)
            sum_cfmat = np.nansum(cfmats, axis=0)

            with open(tsv_file, open_mode, encoding='utf-8') as f:
                if source == 'full':
                    f.write('{}\t{}\t{}\t{}\t{}\n'.format(
                        ftgroup_name, ndims, mean_label_prediction_scores,
                        std_label_prediction_scores,
                        '\t'.join(map(str, np.nanmean(label_hitrates, 0)))))
                else:
                    # NOTE(review): explained/pca_dims are only bound when
                    # source == 'pca'; any other non-'full' value fails here.
                    f.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                        ftgroup_name, ndims, explained, pca_dims,
                        mean_label_prediction_scores,
                        std_label_prediction_scores,
                        '\t'.join(map(str, np.nanmean(label_hitrates, 0)))))

                f.write('Accuracy: \n')
                f.write('\t'.join(list(map(str, label_prediction_scores))))
                f.write('\n')
                # Confusion matrix: header row of labels, then one line per
                # label holding column i of the summed matrix.
                f.write('\t')
                f.write('\t'.join(unique_labels))
                f.write('\n')
                for i in range(nlabels):
                    label = unique_labels[i]
                    cfrow = sum_cfmat[:, i]
                    f.write(label)
                    f.write('\t')
                    f.write('\t'.join(map(str, cfrow)))
                    f.write('\n')
                f.write('\n')
                open_mode = 'a'
예제 #3
0
    def handle(self, *args, **options):
        """Tune classifier hyper-parameters per feature group and score the best on a test set.

        For every entry of ``ftgroup_names`` the matching columns of the
        extracted data are selected (and reduced with ``pca_optimal`` when
        ``source == 'pca'``), the data is split into a train/validation part
        and a test part, and ``fmin`` with ``tpe.suggest`` searches the
        parameter space defined for ``clsf_type``. The loss of a candidate is
        ``1 - score`` as returned by ``perform_k_fold``. The classifier is then
        run once on the test part with the best trial's parameters, and all
        trials plus the test result are written to ``<profile>.tsv``. The
        ``Trials`` object is pickled to ``<profile>.trials``.

        Options read from ``options``: ``clsf_type``, ``database_name``,
        ``source``, ``annotator_name``, ``label_level``, ``min_occur``,
        ``ipc``, ``ratio``, ``profile`` and ``agg``.

        Raises:
            AssertionError: if ``ipc`` exceeds ``min_occur`` or ``clsf_type``
                is not a key of ``classifiers``.
        """
        clsf_type = options['clsf_type']
        database_name = options['database_name']
        source = options['source']
        annotator_name = options['annotator_name']
        label_level = options['label_level']
        min_occur = options['min_occur']
        ipc = options['ipc']
        ratio_ = options['ratio']
        profile = options['profile']
        agg = options['agg']

        tsv_file = profile + '.tsv'
        trials_file = profile + '.trials'
        if ipc is not None:
            assert ipc <= min_occur, 'Instances per class cannot exceed as min-occur'
            ipc_min = ipc
            ipc_max = ipc
        else:
            ipc_min = min_occur
            ipc_max = int(np.floor(min_occur * 1.5))

        train_ratio, valid_ratio, test_ratio = get_ratios(ratio_)

        # The first feature group truncates the output file; later ones append.
        open_mode = 'w'

        assert clsf_type in classifiers.keys(), 'Unknown _classify: {}'.format(
            clsf_type)
        classifier = classifiers[clsf_type]

        database = get_or_error(Database, dict(name__iexact=database_name))
        annotator = get_or_error(User, dict(username__iexact=annotator_name))

        features = list(feature_map.values())
        aggregations = Aggregation.objects.filter(enabled=True).order_by('id')

        if agg == 'all':
            aggregators = [aggregator_map[x.name] for x in aggregations]
        else:
            aggregators = enabled_aggregators[agg]

        _sids, _tids = get_sids_tids(database)

        full_data, col_inds = extract_rawdata(_tids, features, aggregators)

        labels, no_label_ids = get_labels_by_sids(_sids, label_level,
                                                  annotator, min_occur)

        if len(no_label_ids) > 0:
            sids, tids, labels = exclude_no_labels(_sids, _tids, labels,
                                                   no_label_ids)
            # NOTE(review): searchsorted presumes _sids is sorted - confirm
            # that get_sids_tids guarantees this.
            lookup_ids_rows = np.searchsorted(_sids, sids)
            full_data = full_data[lookup_ids_rows, :]

        # Normalise, then zero out NaN/inf values left by the normalisation.
        full_data = zscore(full_data)
        full_data[np.where(np.isnan(full_data))] = 0
        full_data[np.where(np.isinf(full_data))] = 0

        unique_labels = np.unique(labels)
        nlabels = len(unique_labels)

        for ftgroup_name, feature_names in ftgroup_names.items():
            if ftgroup_name == 'all':
                features = list(feature_map.values())
            else:
                features = [feature_map[x] for x in feature_names]

            # Collect the data column indices belonging to this group.
            ft_col_inds = []
            for feature in features:
                if feature.is_fixed_length:
                    col_name = feature.name
                    col_range = col_inds[col_name]
                    ft_col_inds += range(col_range[0], col_range[1])
                else:
                    # Other features have one column range per aggregator.
                    for aggregator in aggregators:
                        col_name = '{}_{}'.format(feature.name,
                                                  aggregator.get_name())
                        col_range = col_inds[col_name]
                        ft_col_inds += range(col_range[0], col_range[1])

            ft_col_inds = np.array(ft_col_inds, dtype=np.int32)
            ndims = len(ft_col_inds)
            data = full_data[:, ft_col_inds]

            if source == 'pca':
                explained, data = pca_optimal(data, ndims, 0.9)
                pca_dims = data.shape[1]

            dp = EnumDataProvider(data, labels, balanced=True)
            trainvalidset, testset = dp.split(test_ratio,
                                              limits=(ipc_min, ipc_max))

            # Validation share relative to the train+validation part only.
            v2t_ratio = valid_ratio / (train_ratio + valid_ratio)
            # The small epsilon guards against float error in the division.
            nfolds = int(np.floor(1. / v2t_ratio + 0.01))

            # Filled in below, in the same order as the search space, so that
            # loss() can map positional samples back to keyword arguments.
            params_names = []
            params_converters = []
            params_count = 0

            def loss(params):
                """Return 1 - k-fold score for one sampled parameter vector."""
                classifier_args = {}
                for i in range(params_count):
                    param_name = params_names[i]
                    param_converter = params_converters[i]
                    param_value = params[i]
                    classifier_args[param_name] = param_converter(param_value)

                print(classifier_args)
                score = perform_k_fold(classifier, trainvalidset, nfolds,
                                       v2t_ratio, nlabels, **classifier_args)
                return 1. - score

            n_estimators_choices = hp.uniform('n_estimators', 40, 100)
            min_samples_split_choices = hp.uniform('min_samples_split', 2, 21)
            min_samples_leaf_choices = hp.uniform('min_samples_leaf', 1, 20)

            n_features = data.shape[1]
            auto_gamma = 1 / n_features
            gamma_choices = hp.uniform('gamma', auto_gamma / 10,
                                       auto_gamma * 10)
            # C is sampled as an exponent; its converter maps x to 10**x.
            c_choices = hp.uniform('C', -1, 2)
            hidden_layer_size_choices = hp.uniform('hidden_layer_sizes', 100,
                                                   5000)
            n_neighbors_choices = hp.uniform('n_neighbors', 1, 10)

            # classifier type -> {argument name: (converter, search expression)}
            choices = {
                'rf': {
                    'n_estimators':
                    (lambda x: int(np.round(x)), n_estimators_choices),
                    'min_samples_split':
                    (lambda x: int(np.round(x)), min_samples_split_choices),
                    'min_samples_leaf':
                    (lambda x: int(np.round(x)), min_samples_leaf_choices),
                },
                'svm_rbf': {
                    'gamma': (float, gamma_choices),
                    'C': (lambda x: 10**x, c_choices),
                },
                'svm_linear': {
                    'C': (lambda x: 10**x, c_choices),
                },
                'nnet': {
                    'hidden_layer_sizes':
                    (lambda x: (int(np.round(x)), ), hidden_layer_size_choices)
                },
                'knn': {
                    'n_neighbors':
                    (lambda x: int(np.round(x)), n_neighbors_choices)
                }
            }

            space = []
            for arg_name, (converter,
                           arg_values) in choices[clsf_type].items():
                space.append(arg_values)
                params_names.append(arg_name)
                params_converters.append(converter)
                params_count += 1

            trials = Trials()
            max_evals = params_count * 10
            best = fmin(fn=loss,
                        space=space,
                        algo=tpe.suggest,
                        max_evals=max_evals,
                        trials=trials)
            print(best)

            # NOTE(review): the same trials file is rewritten for every
            # feature group, so only the last group's trials remain on disk.
            with open(trials_file, 'wb') as f:
                pickle.dump(trials, f)

            # Convert the best trial's raw sampled values (stored as
            # one-element lists) into actual classifier arguments.
            best_trial = trials.best_trial
            best_trial_args_values_ = best_trial['misc']['vals']
            best_trial_args_values = {}
            for arg_name, arg_values in best_trial_args_values_.items():
                converter = choices[clsf_type][arg_name][0]
                arg_value = converter(arg_values[0])
                best_trial_args_values[arg_name] = arg_value

            model_args = ['id'] + list(
                best_trial_args_values.keys()) + ['accuracy']

            # One column per trial; the best trial is labelled 'Best' instead
            # of its index.
            model_args_values = {x: [] for x in model_args}
            for idx, trial in enumerate(trials.trials):
                if trial == best_trial:
                    idx = 'Best'
                trial_args_values = trial['misc']['vals']
                for arg_name in model_args:
                    if arg_name == 'id':
                        model_args_values['id'].append(idx)
                    elif arg_name == 'accuracy':
                        trial_accuracy = 1. - trial['result']['loss']
                        model_args_values['accuracy'].append(trial_accuracy)
                    else:
                        converter = choices[clsf_type][arg_name][0]
                        val = converter(trial_args_values[arg_name][0])
                        model_args_values[arg_name].append(val)

            # Perform classification on the test set
            train_x = np.array(trainvalidset.data)
            train_y = np.array(trainvalidset.labels, dtype=np.int32)
            test_x = np.array(testset.data)
            test_y = np.array(testset.labels, dtype=np.int32)

            score, label_hits, label_misses, cfmat, importances =\
                classifier(train_x, train_y, test_x, test_y, nlabels, True, **best_trial_args_values)
            # Builtin float replaces the np.float alias, which was removed in
            # NumPy 1.24.
            lb_hitrates = label_hits / (label_hits + label_misses).astype(
                float)

            with open(tsv_file, open_mode, encoding='utf-8') as f:
                for arg in model_args:
                    values = model_args_values[arg]
                    f.write('{}\t'.format(arg))
                    f.write('\t'.join(map(str, values)))
                    f.write('\n')

                f.write('Results using best-model\'s paramaters on testset\n')

                if source == 'full':
                    f.write(
                        'Feature group\tNdims\tLabel prediction score\t{}\n'.
                        format('\t '.join(unique_labels)))
                    f.write('{}\t{}\t{}\t{}\n'.format(
                        ftgroup_name, ndims, score,
                        '\t'.join(map(str, lb_hitrates))))
                else:
                    # NOTE(review): explained/pca_dims are only bound when
                    # source == 'pca'; any other non-'full' value fails here.
                    f.write(
                        'Feature group\tNdims\tPCA explained\tPCA Dims\tLabel prediction score\t{}\n'
                        .format('\t '.join(unique_labels)))
                    f.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                        ftgroup_name, ndims, explained, pca_dims, score,
                        '\t'.join(map(str, lb_hitrates))))
                f.write('\n')
                open_mode = 'a'
    def handle(self, *args, **options):
        """Evaluate a classifier for every feature group / aggregator group pair.

        The stored data matrix is loaded once. Then, for each entry of
        ``ftgroup_names`` combined with each entry of ``enabled_aggregators``
        (plus an ``'all'`` group built from the enabled ``Aggregation`` rows),
        the matching columns are selected (and reduced with ``pca_optimal``
        when ``source == 'pca'``) and the classifier is run ``niters`` times
        over ``nfolds`` folds. Results are appended to ``csv_filename`` when it
        is given, otherwise a one-line summary is printed.

        Options read from ``options``: ``clsf_type``, ``database_name``,
        ``source``, ``annotator_name``, ``label_level``, ``min_occur``,
        ``ratio``, ``niters`` and the optional ``csv_filename``.

        Raises:
            AssertionError: if ``clsf_type`` is not a key of ``classifiers``.
            Exception: if no ``DataMatrix`` matches the database and the
                computed feature/aggregation hashes.
        """
        clsf_type = options['clsf_type']
        database_name = options['database_name']
        source = options['source']
        annotator_name = options['annotator_name']
        label_level = options['label_level']
        min_occur = options['min_occur']
        ratio_ = options['ratio']
        niters = options['niters']
        csv_filename = options.get('csv_filename', None)

        train_ratio, valid_ratio = get_ratios(ratio_, 2)

        assert clsf_type in classifiers.keys(), 'Unknown _classify: {}'.format(clsf_type)
        classifier = classifiers[clsf_type]

        database = get_or_error(Database, dict(name__iexact=database_name))
        annotator = get_or_error(User, dict(username__iexact=annotator_name))

        features = Feature.objects.all().order_by('id')
        aggregations = Aggregation.objects.filter(enabled=True).order_by('id')

        # Only features that are also present in feature_map contribute to the
        # hash used to locate the stored data matrix.
        enabled_features = []
        for f in features:
            if f.name in feature_map:
                enabled_features.append(f)

        features_hash = '-'.join(list(map(str, [x.id for x in enabled_features])))
        aggregations_hash = '-'.join(list(map(str, aggregations.values_list('id', flat=True))))

        dm = DataMatrix.objects.filter(database=database, features_hash=features_hash,
                                       aggregations_hash=aggregations_hash).last()
        if dm is None:
            raise Exception('No full data matrix for database {}'.format(database_name))

        dm_sids_path = dm.get_sids_path()
        dm_tids_path = dm.get_tids_path()
        dm_bytes_path = dm.get_bytes_path()
        feature_cols = dm.get_cols_path()
        with open(feature_cols, 'r', encoding='utf-8') as f:
            col_inds = json.load(f)

        # np.unique sorts the ids; sort_order re-orders the companion arrays
        # (tids and data rows) so they stay aligned with the sorted ids.
        _sids = bytes_to_ndarray(dm_sids_path, np.int32)
        _sids, sort_order = np.unique(_sids, return_index=True)

        try:
            _tids = bytes_to_ndarray(dm_tids_path, np.int32)
            _tids = _tids[sort_order]
        except FileNotFoundError:
            # No stored tids file: derive them from the sids instead.
            _tids = get_tids(_sids)

        full_data = get_rawdata_from_binary(dm_bytes_path, len(_sids))
        full_data = full_data[sort_order, :]

        labels, no_label_ids = get_labels_by_sids(_sids, label_level, annotator, min_occur)

        if len(no_label_ids) > 0:
            sids, tids, labels = exclude_no_labels(_sids, _tids, labels, no_label_ids)
            # _sids is sorted, so searchsorted yields the row of each kept id.
            lookup_ids_rows = np.searchsorted(_sids, sids)
            full_data = full_data[lookup_ids_rows, :]

        # Normalise, then zero out NaN/inf values left by the normalisation.
        full_data = zscore(full_data)
        full_data[np.where(np.isnan(full_data))] = 0
        full_data[np.where(np.isinf(full_data))] = 0

        unique_labels = np.unique(labels)
        nlabels = len(unique_labels)

        # Write the header once (truncating the file); result rows are
        # appended inside the loop below.
        if csv_filename:
            with open(csv_filename, 'w', encoding='utf-8') as f:
                if source == 'pca':
                    f.write('Feature group\tAggregators\tNdims\tPCA explained\tPCA Dims\tLabel prediction mean\tstdev'
                            '\t{}\n'.format('\t '.join(unique_labels)))
                else:
                    f.write('Feature group\tAggregators\tNdims\tLabel prediction mean\tstdev\t{}\n'
                            .format('\t '.join(unique_labels)))

        for ftgroup_name, feature_names in ftgroup_names.items():
            for agggroup_name, aggs in list(enabled_aggregators.items()) + [('all', None)]:
                # The 'all' group uses every enabled aggregation.
                if agggroup_name == 'all':
                    aggs = [aggregator_map[x.name] for x in aggregations]
                if ftgroup_name == 'all':
                    features = list(feature_map.values())
                else:
                    features = [feature_map[x] for x in feature_names]

                # Collect the data-matrix column indices for this combination.
                ft_col_inds = []
                for feature in features:
                    if feature.is_fixed_length:
                        col_name = feature.name
                        col_range = col_inds[col_name]
                        ft_col_inds += range(col_range[0], col_range[1])
                    else:
                        # Other features have one column range per aggregator.
                        for aggregator in aggs:
                            col_name = '{}_{}'.format(feature.name, aggregator.get_name())
                            col_range = col_inds[col_name]
                            ft_col_inds += range(col_range[0], col_range[1])

                ft_col_inds = np.array(ft_col_inds, dtype=np.int32)
                ndims = len(ft_col_inds)
                data = full_data[:, ft_col_inds]

                if source == 'pca':
                    explained, data = pca_optimal(data, ndims, 0.9)
                    pca_dims = data.shape[1]

                dp = EnumDataProvider(data, labels, balanced=True)

                # The small epsilon guards against float error in 1 / valid_ratio.
                nfolds = int(np.floor(1 / valid_ratio + 0.01))
                ntrials = nfolds * niters
                label_prediction_scores = [0] * ntrials
                label_hitss = [0] * ntrials
                label_missess = [0] * ntrials
                label_hitrates = np.empty((ntrials, nlabels))
                label_hitrates[:] = np.nan
                importancess = np.empty((ntrials, data.shape[1]))
                cfmats = np.ndarray((ntrials, nlabels, nlabels))

                ind = 0

                bar = Bar('Features: {}. Aggregator: {}. Classifier: {} Data type: {}...'
                          .format(ftgroup_name, agggroup_name, clsf_type, source), max=ntrials)

                for _ in range(niters):
                    traintetset, _unused = dp.split(0, limits=(min_occur, int(np.floor(min_occur * 1.5))))
                    traintetset.make_folds(nfolds, valid_ratio)
                    for k in range(nfolds):
                        trainset, testset = traintetset.get_fold(k)
                        train_x = np.array(trainset.data)
                        train_y = np.array(trainset.labels, dtype=np.int32)
                        test_x = np.array(testset.data)
                        test_y = np.array(testset.labels, dtype=np.int32)

                        score, label_hits, label_misses, cfmat, importances = \
                            classifier(train_x, train_y, test_x, test_y, nlabels, True)

                        label_prediction_scores[ind] = score
                        label_hitss[ind] = label_hits
                        label_missess[ind] = label_misses

                        # Builtin float replaces the np.float alias, which was
                        # removed in NumPy 1.24.
                        label_hitrate = label_hits / (label_hits + label_misses).astype(float)

                        label_hitrates[ind, :] = label_hitrate
                        importancess[ind, :] = importances
                        cfmats[ind, :, :] = cfmat

                        bar.next()
                        ind += 1
                bar.finish()

                mean_label_prediction_scores = np.nanmean(label_prediction_scores)
                std_label_prediction_scores = np.nanstd(label_prediction_scores)
                sum_cfmat = np.nansum(cfmats, axis=0)

                if csv_filename:
                    with open(csv_filename, 'a', encoding='utf-8') as f:
                        if source == 'full':
                            f.write('{}\t{}\t{}\t{}\t{}\t{}\n'
                                    .format(ftgroup_name, agggroup_name, ndims, mean_label_prediction_scores,
                                            std_label_prediction_scores,
                                            '\t'.join(map(str, np.nanmean(label_hitrates, 0)))))
                        else:
                            # NOTE(review): explained/pca_dims are only bound
                            # when source == 'pca'; any other non-'full' value
                            # fails here.
                            f.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'
                                    .format(ftgroup_name, agggroup_name, ndims, explained, pca_dims,
                                            mean_label_prediction_scores, std_label_prediction_scores,
                                            '\t'.join(map(str, np.nanmean(label_hitrates, 0)))))
                        # Confusion matrix: header row of labels, then one
                        # line per label holding column i of the summed matrix.
                        f.write('\t')
                        f.write('\t'.join(unique_labels))
                        f.write('\n')
                        for i in range(nlabels):
                            label = unique_labels[i]
                            cfrow = sum_cfmat[:, i]
                            f.write(label)
                            f.write('\t')
                            f.write('\t'.join(map(str, cfrow)))
                            f.write('\n')
                        f.write('\n')
                else:
                    print('{}/{}: {} by {}: mean = {} std = {}'
                          .format(ftgroup_name, agggroup_name, clsf_type, source, mean_label_prediction_scores,
                                  std_label_prediction_scores))