Example #1
def save_data(data, name):
    """Save a dataset dict as one CSV file per field plus a JSON descriptor."""
    dataset_path = _datasets_path + name
    print("Saving data to {}".format(dataset_path))
    # Each field listed in _csv_files is written to its own CSV file.
    for field in _csv_files:
        filename = field + '.csv'
        file_path = get_path(dataset_path, filename)
        before_save(file_path)
        save_file(data[field], file_path)

    # The remaining metadata fields go into a single JSON descriptor.
    descriptor = {key: data[key] for key in _json_files if key in data}
    save_file(descriptor, get_path(dataset_path, 'spec.json'))
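A minimal usage sketch with hypothetical field names; the actual module-level _csv_files and _json_files lists determine which keys get written:

import numpy as np

# Assumes _csv_files contains 'data' and 'target', and _json_files
# contains 'feature_names'; adjust the keys to the real lists.
dataset = {
    'data': np.random.rand(100, 4),
    'target': np.random.randint(0, 3, size=100),
    'feature_names': ['f0', 'f1', 'f2', 'f3'],
}
save_data(dataset, 'my_dataset')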
Example #2
def categorical2pysbrl_data(x: np.ndarray,
                            y: np.ndarray,
                            data_name,
                            supp=0.05,
                            zmin=1,
                            zmax=3):

    assert len(y.shape) == 1
    # np.int was removed in NumPy 1.24; accept any integer dtype instead.
    assert np.issubdtype(y.dtype, np.integer)
    # Treat the labels as the dense range 0..max(y), even if some values
    # never occur in y.
    labels = np.arange(np.max(y) + 1)

    # Group the samples by label; each group is converted to transactions.
    x_by_labels = []
    for label in labels:
        x_by_labels.append(x[y == label])
    transactions_by_labels = [
        categorical2transactions(_x) for _x in x_by_labels
    ]
    itemsets = transactions2freqitems(transactions_by_labels,
                                      supp=supp,
                                      zmin=zmin,
                                      zmax=zmax)
    rules = [itemset2feature_categories(itemset) for itemset in itemsets]
    # For each rule (features, categories), compute a boolean mask over the
    # samples marking which ones satisfy it.
    data_by_rule = []
    for features, categories in rules:
        satisfied = rule_satisfied(x, features, categories)
        data_by_rule.append(satisfied)

    # Write data file
    data_filename = get_path(_datasets_path, data_name + '.data')
    before_save(data_filename)
    with open(data_filename, 'w') as f:
        for itemset, data in zip(itemsets, data_by_rule):
            rule_str = '{' + ','.join(itemset) + '}' + '  '
            f.write(rule_str)
            bit_s = ' '.join(['1' if bit else '0' for bit in data])
            f.write(bit_s)
            f.write('\n')

    # Write label file
    label_filename = get_path(_datasets_path, data_name + '.label')
    before_save(label_filename)
    with open(label_filename, 'w') as f:
        for label in labels:
            f.write('{label=%d} ' % label)
            bits = y == label
            bit_s = ' '.join(['1' if bit else '0' for bit in bits])
            f.write(bit_s)
            f.write('\n')
    return data_filename, label_filename
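A hedged usage sketch with tiny synthetic inputs; the helper functions the function calls (categorical2transactions, transactions2freqitems, itemset2feature_categories, rule_satisfied) are assumed to be defined in the same module:

import numpy as np

# Hypothetical categorical data: 200 samples, 3 integer-coded features.
x = np.random.randint(0, 4, size=(200, 3))
y = np.random.randint(0, 2, size=200)

data_file, label_file = categorical2pysbrl_data(x, y, 'toy', supp=0.1)
# Each .data line is '{item1,item2}  1 0 1 ...' with one bit per sample;
# the .label file has one '{label=k} ...' line per class.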
Example #3
def load_data(name):
    """Load a dataset saved by save_data: per-field CSVs plus spec.json."""
    dataset_path = _datasets_path + name
    dataset = {}
    for field in _csv_files:
        filename = field + '.csv'
        file_path = get_path(dataset_path, filename)
        dataset[field] = load_file(file_path)

    # Flatten the target column to a 1-D array.
    dataset['target'] = dataset['target'].reshape(-1)

    # Merge the JSON descriptor fields into the dataset dict.
    descriptor = load_file(get_path(dataset_path, 'spec.json'))
    for key, val in descriptor.items():
        dataset[key] = val
    return dataset
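A round-trip sketch pairing this with the save_data from Example #1 (dataset name hypothetical):

dataset = load_data('my_dataset')
print(dataset['data'].shape, dataset['target'].shape)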
Example #4
def load_cache(self, is_train=False):
    file_name = self.name + ('-train' if is_train else '-test') + '.csv'
    file_path = get_path(sample_cache_dir, file_name)
    return load_file(file_path)
Example #5
def cache_sample(self, x, is_train=False):
    file_name = self.name + ('-train' if is_train else '-test') + '.csv'
    file_path = get_path(sample_cache_dir, file_name)
    save_file(x, file_path)
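Examples #4 and #5 form a cache round trip; a sketch assuming a class with a name attribute and the module-level sample_cache_dir from the snippets above:

import numpy as np

class Sampler:
    def __init__(self, name):
        self.name = name
    # Reuse the two functions shown above as methods.
    load_cache = load_cache
    cache_sample = cache_sample

s = Sampler('iris')
s.cache_sample(np.random.rand(10, 4), is_train=True)
cached = s.load_cache(is_train=True)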
Example #6
def _format_name(name):
    return get_path(Config.model_dir(), "{}{}".format(name, FILE_EXTENSION))
Example #7
def get_dataset(data_name,
                discrete=False,
                seed=None,
                split=False,
                train_size=0.75,
                shuffle=True,
                one_hot=True,
                verbose=1):
    if data_name in sklearn_datasets:
        if data_name == 'breast_cancer':
            data = load_breast_cancer()
        elif data_name == 'iris':
            data = load_iris()
        else:  # data_name == 'wine':
            data = load_wine()
        data['is_categorical'] = np.array([False] * data['data'].shape[1])
        data['categories'] = [None] * data['data'].shape[1]
        opts = sklearn_datasets[data_name]
    elif data_name in local_datasets:
        data = load_data(data_name)
        opts = local_datasets[data_name]
    else:
        raise LookupError("Unknown data_name: {}".format(data_name))

    is_categorical = data['is_categorical']

    x = data['data']
    y = data['target']
    # feature_names = data['feature_names']

    if one_hot:
        if verbose:
            print('Converting categorical features to one-hot numeric')
        one_hot_features = is_categorical
        if 'is_binary' in data:  # don't one-hot features that are already binary
            one_hot_features = np.logical_and(
                is_categorical, np.logical_not(data['is_binary']))
        # Note: the categorical_features argument was removed in
        # scikit-learn 0.22; this code requires an older release.
        one_hot_encoder = OneHotEncoder(
            categorical_features=one_hot_features).fit(data['data'])
        data['one_hot_encoder'] = one_hot_encoder
        if verbose:
            print('Total number of categorical features:',
                  np.sum(one_hot_features))
            if hasattr(one_hot_encoder, 'n_values_'):
                print('One hot value numbers:', one_hot_encoder.n_values_)
    if discrete:
        if verbose:
            print(
                'Discretizing all continuous features using MDLP discretizer')
        discretizer_name = data_name + '-discretizer' + (
            '' if seed is None else ('-' + str(seed))) + '.pkl'
        discretizer_path = get_path(_cached_path, discretizer_name)
        min_depth = opts.get('min_depth', 0)
        discretizer = get_discretizer(
            x,
            y,
            continuous_features=np.logical_not(is_categorical),
            filenames=discretizer_path,
            min_depth=min_depth)
        # data['data'] = discretizer.transform(x)
        data['discretizer'] = discretizer

    if split:
        names = [
            get_path(_datasets_path, data_name + suffix) for suffix in
            ['/train_x.npy', '/test_x.npy', '/train_y.npy', '/test_y.npy']
        ]
        train_x, test_x, train_y, test_y = get_split(x,
                                                     y,
                                                     train_size=train_size,
                                                     shuffle=shuffle,
                                                     filenames=names)
        data.update({
            'train_x': train_x,
            'test_x': test_x,
            'train_y': train_y,
            'test_y': test_y,
        })

    # Per-feature (min, max) ranges, shape (n_features, 2).
    mins = np.min(x, axis=0)
    maxs = np.max(x, axis=0)
    ranges = np.vstack([mins, maxs]).T
    data['ranges'] = ranges
    # Hack: some datasets store feature_names/target_names as numpy
    # arrays; convert them to plain lists.
    for key in ['feature_names', 'target_names']:
        if isinstance(data[key], np.ndarray):
            data[key] = data[key].tolist()

    if verbose > 0:
        print("-----------------------")
        print("Data Specs: {:s}".format(data_name))
        print("#data: {:d}".format(len(data['target'])))
        _, counts = np.unique(data['target'], return_counts=True)
        counts = [str(c) for c in counts]
        print("Label distribution: [{}]".format('/'.join(counts)))
        print("#features: {:d}".format(data['data'].shape[1]))
        print("#labels: {:d}".format(len(np.unique(data['target']))))
        print("-----------------------")
    return data
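A usage sketch; 'breast_cancer' is one of the sklearn names the function dispatches on, and the keywords are the signature's own:

data = get_dataset('breast_cancer', split=True, one_hot=False, verbose=1)
train_x, train_y = data['train_x'], data['train_y']
print(data['ranges'][:3])  # per-feature (min, max) for the first 3 features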
Example #8
import numpy as np
from flask import Flask, json

from rule_surrogate.utils.io_utils import get_path

# path = get_path('frontend/dist/static', absolute=True)
# print("Static folder: {:s}".format(path))


class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that serializes np.ndarray values as plain lists."""
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)


class HashableList(list):
    def __hash__(self):
        # Hash the JSON serialization so that equal lists hash equally.
        return hash(json.dumps(self))


app = Flask(__name__)

app.config['FRONT_END_ROOT'] = get_path('front-end/build', absolute=True)
app.config['STATIC_FOLDER'] = get_path('front-end/build/static', absolute=True)

# Make jsonify handle np.ndarray values in the objects it serializes.
# (app.json_encoder is supported in Flask versions before 2.3.)
app.json_encoder = NumpyEncoder
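A quick sketch of what the encoder buys us (route path hypothetical; assumes a Flask version before 2.3, where app.json_encoder is honored):

from flask import jsonify

@app.route('/api/demo')
def demo():
    # The ndarray below is converted to a plain list by NumpyEncoder.
    return jsonify(values=np.arange(5))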