Example #1
def construct_data(model_name, x, y, bins=20):
    model = get_model(model_name)
    data_name = get_model_data(model_name)
    try:
        data = get_dataset(data_name, split=True, verbose=0, discrete=True)
    except LookupError:
        print("Cannot find data with name {}".format(data_name))
        return None
    ranges = data['ranges']
    categories = data['categories']
    discretizer = data['discretizer']
    hists = data2histogram(x, bins, ranges)
    confidence = model.fidelity(x) if isinstance(model,
                                                 SurrogateMixin) else None
    score = model.score(y, model.predict(x))
    ret = {
        'data': x,
        'target': y,
        # 'featureNames': data['feature_names'],
        # 'labelNames': data['target_names'],
        # 'isCategorical': is_categorical,
        # 'categories': categories,
        # 'continuous': [True] * x.shape[1],
        'hists': hists,
        # 'ranges': ranges,
        'ratios': get_category_ratios(x, discretizer, categories),
        # 'discretizers': discretizer2json(discretizer, x),
        'confidence': confidence,
        'score': score
    }
    return ret
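
A minimal usage sketch (the model name and arrays are hypothetical; it only assumes a model and its matching dataset are already registered):

stats = construct_data('pima-nn-20', test_x, test_y, bins=20)
if stats is not None:
    print(stats['score'], stats['confidence'])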
Example #2
def train_svm(name='oversample', dataset='pima', C=1., sample=True, **kwargs):
    from sklearn.svm import SVC
    data = get_dataset(dataset, split=True, discrete=False, one_hot=True)
    train_x, train_y, test_x, test_y, feature_names = \
        data['train_x'], data['train_y'], data['test_x'], data['test_y'], data['feature_names']

    uniq, counts = np.unique(train_y, return_counts=True)
    print('before sample: [{}]'.format('/'.join([str(c) for c in counts])))

    sample_filters = {'Glucose': [105, 121], 'Age': [31.5, 64.4], 'Body Mass Index': [25.7, 100]}

    filters = {feature_names.index(key): value for key, value in sample_filters.items()}
    # filters[train_x.shape[1]] = [0]
    # filters
    print("over sampling training data")
    if sample:
        train_x, train_y = re_sampling(train_x, train_y, filters, rate=1)
    print("#data after over sampling:", len(train_y))

    uniq, counts = np.unique(train_y, return_counts=True)
    print('after sample: [{}]'.format('/'.join([str(c) for c in counts])))

    one_hot_encoder, is_categorical = data['one_hot_encoder'], data['is_categorical']
    model_name = '-'.join([dataset, 'svm'] + [name])
    model = SVC(C=C, probability=True, **kwargs)
    nn = SKClassifier(model, name=model_name, standardize=True, one_hot_encoder=one_hot_encoder)
    nn.train(train_x, train_y)
    nn.evaluate(train_x, train_y, stage='train')
    acc, loss, auc = nn.test(test_x, test_y)
    nn.save()
    return acc, loss, auc
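
A hedged invocation example; extra keyword arguments such as kernel pass straight through to sklearn's SVC:

acc, loss, auc = train_svm(name='oversample', dataset='pima', C=1., sample=True, kernel='rbf')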
Example #3
def cv_nn(dataset, neurons=(20, 20), max_iter=1000):
    from sklearn.model_selection import cross_validate, ShuffleSplit
    from sklearn.neural_network import MLPClassifier

    n_test = 5
    data = get_dataset(dataset, split=True, discrete=False, one_hot=True)
    train_x, train_y, test_x, test_y, feature_names = \
        data['train_x'], data['train_y'], data['test_x'], data['test_y'], data['feature_names']

    train_x, train_y = do_re_sample(train_x, train_y, feature_names)

    alphas = [0.1, 0.5, 1.0]
    cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=0)
    for alpha in alphas:
        clf = MLPClassifier(neurons, alpha=alpha, max_iter=max_iter, tol=1e-5)
        scores = []
        for i in range(n_test):
            cv_scores = cross_validate(clf, train_x, train_y, cv=cv)
            scores += cv_scores['test_score'].tolist()
        mean_score = np.mean(scores)
        std_score = np.std(scores)
        min_score = np.min(scores)
        max_score = np.max(scores)
        print('alpha {}:'.format(alpha))
        print('score: {}, std: {}, min: {}, max: {}\n'.format(mean_score, std_score, min_score, max_score))
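
Usage sketch, assuming 'pima' is a dataset name known to get_dataset; this runs 5 repeated 3-split shuffle CVs per alpha and prints the aggregate scores:

cv_nn('pima', neurons=(20, 20), max_iter=1000)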
Example #4
def model_metric(model_name, data):
    try:
        model = get_model(model_name)
    except FileNotFoundError:
        return None

    if data in ('train', 'test'):
        dataset = get_dataset(get_model_data(model_name), split=True)
        if data == 'train':
            x = dataset['train_x']
            y = dataset['train_y']
        else:
            x = dataset['test_x']
            y = dataset['test_y']
    # elif data == 'sample_train' or 'sample_test':
    #     pass
    else:
        raise ValueError("Unknown data {}".format(data))
    conf_mat = confusion_matrix(y, model.predict(x))
    y_pred = model.predict_prob(x)
    # if y_pred.shape[1] == 2:
    #     auc = roc_auc_score(y, y_pred[:, 1])
    # else:
    auc = auc_score(y, y_pred, average=None)
    ret = {
        # convert numpy arrays so the response is JSON serializable
        'confusionMatrix': conf_mat.tolist(),
        'auc': np.asarray(auc).tolist()
    }
    return jsonify(ret)
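
Since model_metric returns jsonify(...), it presumably runs inside a Flask request; a minimal sketch of the wiring, assuming an existing Flask app object:

@app.route('/api/metric/<model_name>/<data>')
def metric_view(model_name, data):
    return model_metric(model_name, data)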
Example #5
def get_stream(model_name, data_type, conditional=True, bins=20, filters=None):
    model = get_model(model_name)
    dataset = get_dataset(get_model_data(model_name), split=True)
    ranges = dataset['ranges']
    categories = dataset['categories']
    x, y = get_model_x_y(model_name, data_type, filters)
    streams = compute_streams(model, x, y, ranges, categories, conditional, bins)
    return jsonify(streams)
Example #6
def train_nn(name='nn',
             dataset='abalone2',
             neurons=(20, ),
             alpha=0.01,
             **kwargs):
    from sklearn.neural_network import MLPClassifier
    data = get_dataset(dataset, split=True, discrete=False, one_hot=True)
    train_x, train_y, test_x, test_y, feature_names = \
        data['train_x'], data['train_y'], data['test_x'], data['test_y'], data['feature_names']

    uniq, counts = np.unique(train_y, return_counts=True)
    print('before sample: [{}]'.format('/'.join([str(c) for c in counts])))

    # abalone2
    sample_filters = {
        'shell weight': [0.249, 0.432],
        'shucked weight': [0.337, 0.483]
    }

    # wine_quality_red
    # sample_filters = {'alcohol': [10.5, 11.7]}

    filters = {
        feature_names.index(key): value
        for key, value in sample_filters.items()
    }
    # abalone2
    # filters[train_x.shape[1]] = [0, 2]
    # wine_quality_red
    # filters[train_x.shape[1]] = [2, 4]

    # filters
    print("over sampling training data")
    train_x, train_y = over_sampling(train_x, train_y, filters, rate=2)
    print("#data after over sampling:", len(train_y))

    uniq, counts = np.unique(train_y, return_counts=True)
    print('after sample: [{}]'.format('/'.join([str(c) for c in counts])))

    one_hot_encoder, is_categorical = data['one_hot_encoder'], data[
        'is_categorical']
    model_name = '-'.join([dataset, name] +
                          [str(neuron) for neuron in neurons] + ['oversample'])
    model = MLPClassifier(hidden_layer_sizes=neurons,
                          max_iter=5000,
                          alpha=alpha,
                          **kwargs)
    nn = SKClassifier(model,
                      name=model_name,
                      standardize=True,
                      one_hot_encoder=one_hot_encoder)
    nn.train(train_x, train_y)
    nn.evaluate(train_x, train_y, stage='train')
    acc, loss, auc = nn.test(test_x, test_y)
    nn.save()
    return acc, loss, auc
Example #7
def train_svm(name='svm', dataset='wine', C=1.0, problem='classification', rebalance=False, **kwargs):
    data = get_dataset(dataset, split=True, discrete=False, one_hot=True)
    train_x, train_y, test_x, test_y, feature_names = \
        data['train_x'], data['train_y'], data['test_x'], data['test_y'], data['feature_names']
    if rebalance:
        print("balancing training data")
        train_x, train_y = sample_balance(train_x, train_y)
        print("#data after balancing:", len(train_y))
    one_hot_encoder, is_categorical = data['one_hot_encoder'], data['is_categorical']
    model_name = '-'.join([dataset, name])
    svm = SVM(name=model_name, problem=problem, C=C, one_hot_encoder=one_hot_encoder, **kwargs)
    svm.train(train_x, train_y)
    svm.evaluate(train_x, train_y, stage='train')
    acc, loss, auc = svm.test(test_x, test_y)
    return svm, acc
Example #8
def train_nn(name='nn', dataset='wine', neurons=(20,), alpha=0.01, problem='classification', rebalance=False, **kwargs):
    data = get_dataset(dataset, split=True, discrete=False, one_hot=True)
    train_x, train_y, test_x, test_y, feature_names = \
        data['train_x'], data['train_y'], data['test_x'], data['test_y'], data['feature_names']
    if rebalance:
        print("balancing training data")
        train_x, train_y = sample_balance(train_x, train_y)
        print("#data after balancing:", len(train_y))
    one_hot_encoder, is_categorical = data['one_hot_encoder'], data['is_categorical']
    model_name = '-'.join([dataset, name] + [str(neuron) for neuron in neurons])
    nn = NeuralNet(name=model_name, problem=problem, neurons=neurons, max_iter=5000, alpha=alpha,
                   one_hot_encoder=one_hot_encoder, **kwargs)
    nn.train(train_x, train_y)
    nn.evaluate(train_x, train_y, stage='train')
    acc, loss, auc = nn.test(test_x, test_y)
    return nn, acc
Example #9
def get_model_x_y(model_name, data_type='train', filters=None):

    data_name = get_model_data(model_name)
    model = get_model(model_name)
    try:
        data = get_dataset(data_name, split=True, verbose=0, discrete=True)
    except LookupError:
        print("Cannot find data with name {}".format(data_name))
        return None
    if data_type == 'train' or data_type == 'test':
        x = data[data_type + '_x']
        y = data[data_type + '_y']
    elif data_type in ('sample train', 'sample test'):
        x, y = get_surrogate_data(model, data_type)
    else:
        raise ValueError("Unknown data_type {}".format(data_type))
    return filter_data(data['is_categorical'], x, y, filters)
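
A usage sketch with a hypothetical model name; filters follow the {feature_index: [low, high]} convention used by the sampling helpers above:

x, y = get_model_x_y('pima-nn-20', data_type='test', filters=None)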
Example #10
def train_nn(name='nn', dataset='wine', neurons=(20,), alpha=0.01, rebalance=False, **kwargs):
    from sklearn.neural_network import MLPClassifier
    data = get_dataset(dataset, split=True, discrete=False, one_hot=True)
    train_x, train_y, test_x, test_y, feature_names = \
        data['train_x'], data['train_y'], data['test_x'], data['test_y'], data['feature_names']
    if rebalance:
        print("balancing training data")
        train_x, train_y = sample_balance(train_x, train_y)
        print("#data after balancing:", len(train_y))
    one_hot_encoder, is_categorical = data['one_hot_encoder'], data['is_categorical']
    model_name = '-'.join([dataset, name] + [str(neuron) for neuron in neurons])
    model = MLPClassifier(hidden_layer_sizes=neurons, max_iter=5000, alpha=alpha, **kwargs)
    nn = SKClassifier(model, name=model_name, standardize=True, one_hot_encoder=one_hot_encoder)
    nn.train(train_x, train_y)
    nn.evaluate(train_x, train_y, stage='train')
    nn.test(test_x, test_y)
    nn.save()
Example #11
def train_svm(name='svm', dataset='wine', C=1.0, problem='classification', rebalance=False, **kwargs):
    from sklearn.svm import SVC
    data = get_dataset(dataset, split=True, discrete=False, one_hot=True)
    train_x, train_y, test_x, test_y, feature_names = \
        data['train_x'], data['train_y'], data['test_x'], data['test_y'], data['feature_names']
    if rebalance:
        print("balancing training data")
        train_x, train_y = sample_balance(train_x, train_y)
        print("#data after balancing:", len(train_y))
    one_hot_encoder, is_categorical = data['one_hot_encoder'], data['is_categorical']
    model_name = '-'.join([dataset, name])
    model = SVC(C=C, probability=True, **kwargs)
    svm = SKClassifier(model, name=model_name, one_hot_encoder=one_hot_encoder)
    svm.train(train_x, train_y)
    svm.evaluate(train_x, train_y, stage='train')
    svm.test(test_x, test_y)
    svm.save()
Example #12
def train_surrogate(model_file, is_global=True, sampling_rate=5., surrogate='rule',
                    rule_maxlen=2, min_support=0.01, eta=1, iters=50000, _lambda=30, alpha=1):
    is_rule = surrogate == 'rule'
    model = load_model(model_file)
    dataset = model.name.split('-')[0]
    data = get_dataset(dataset, split=True, discrete=is_rule, one_hot=is_rule)
    train_x, train_y, test_x, test_y, feature_names, is_categorical = \
        data['train_x'], data['train_y'], data['test_x'], data['test_y'], data['feature_names'], data['is_categorical']
    # print(feature_names)
    print("Original model:")
    model.test(test_x, test_y)
    print("Surrogate model:")

    model_name = surrogate + '-surrogate-' + model.name
    if surrogate == 'rule':
        surrogate_model = RuleSurrogate(name=model_name, discretizer=data['discretizer'],
                                        rule_minlen=1, rule_maxlen=rule_maxlen, min_support=min_support,
                                        _lambda=_lambda, nchain=30, eta=eta, iters=iters, alpha=alpha)
    elif surrogate == 'tree':
        surrogate_model = TreeSurrogate(name=model_name, max_depth=None, min_samples_leaf=0.01)
    else:
        raise ValueError("Unknown surrogate type {}".format(surrogate))
    constraints = get_constraints(train_x, is_categorical)
    # sigmas = [0] * train_x.shape[1]
    # print(sigmas)
    if is_global:
        instances = train_x
    else:
        instances = train_x[19:20, :]
    # print('train_y:')
    # print(train_y)
    # print('target_y')
    # print(model.predict(instances))
    if isinstance(surrogate_model, RuleSurrogate):
        surrogate_model.surrogate(model, instances, constraints, sampling_rate, rediscretize=True)
    else:
        surrogate_model.surrogate(model, instances, constraints, sampling_rate)
    # surrogate_model.evaluate(train_x, train_y)
    surrogate_model.describe(feature_names=feature_names)
    surrogate_model.save()
    # surrogate_model.self_test()
    if is_global:
        surrogate_model.test(test_x, test_y)
    else:
        surrogate_model.test(train_x[19:20, :], train_y[19:20])
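
A hedged invocation example; the model file argument is hypothetical and is resolved by load_model:

train_surrogate('wine-nn-20', is_global=True, surrogate='rule', rule_maxlen=2, min_support=0.01)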
Example #13
def train_tree(name='tree', dataset='wine', max_depth=None, min_samples_leaf=0.005, rebalance=False, **kwargs):
    data = get_dataset(dataset, split=True, discrete=False, one_hot=True)
    train_x, train_y, test_x, test_y, feature_names, one_hot_encoder = \
        data['train_x'], data['train_y'], data['test_x'], data['test_y'], data['feature_names'], data['one_hot_encoder']

    if rebalance:
        print("balancing training data")
        train_x, train_y = sample_balance(train_x, train_y)
        print("#data after balancing:", len(train_y))

    model_name = '-'.join([dataset, name])
    tree = Tree(name=model_name, max_depth=max_depth, min_samples_leaf=min_samples_leaf,
                one_hot_encoder=one_hot_encoder, **kwargs)
    tree.train(train_x, train_y)
    tree.evaluate(train_x, train_y, stage='train')
    tree.test(test_x, test_y)
    tree.describe()
    tree.export(get_path('models', '{}.json'.format(model_name)))
    tree.save()
Example #14
def train_nn(name='sample', dataset='pima', neurons=(20,), alpha=0.01, sample=True, **kwargs):
    from sklearn.neural_network import MLPClassifier
    data = get_dataset(dataset, split=True, discrete=False, one_hot=True)
    train_x, train_y, test_x, test_y, feature_names = \
        data['train_x'], data['train_y'], data['test_x'], data['test_y'], data['feature_names']

    # filters[train_x.shape[1]] = [0]
    # filters
    if sample:
        train_x, train_y = do_re_sample(train_x, train_y, feature_names)

    one_hot_encoder, is_categorical = data['one_hot_encoder'], data['is_categorical']
    model_name = '-'.join([dataset, 'nn'] + [str(neuron) for neuron in neurons] + [name])
    model = MLPClassifier(hidden_layer_sizes=neurons, max_iter=5000, alpha=alpha, **kwargs)
    nn = SKClassifier(model, name=model_name, standardize=True, one_hot_encoder=one_hot_encoder)
    nn.train(train_x, train_y)
    nn.evaluate(train_x, train_y, stage='train')
    acc, loss, auc = nn.test(test_x, test_y)
    nn.save()
    return acc, loss, auc
Example #15
def train_rule(name='rule', dataset='breast_cancer', rule_max_len=2, rebalance=False, **kwargs):
    from iml.models.rule_model import RuleList
    data = get_dataset(dataset, split=True, discrete=True)
    train_x, train_y, test_x, test_y, feature_names = \
        data['train_x'], data['train_y'], data['test_x'], data['test_y'], data['feature_names']

    if rebalance:
        print("balancing training data")
        train_x, train_y = sample_balance(train_x, train_y)
        print("#data after balancing:", len(train_y))

    # print(train_x.shape, train_x.dtype)
    discretizer = data['discretizer']
    model_name = '-'.join([dataset, name])
    brl = RuleList(name=model_name, rule_maxlen=rule_max_len, discretizer=discretizer, **kwargs)
    brl.train(train_x, train_y)
    brl.evaluate(train_x, train_y, stage='train')
    # print(brl.infer(test_x))
    brl.test(test_x, test_y)
    brl.describe(feature_names=feature_names)
    brl.save()
Example #16
def model_meta(model_name):

    data_name = get_model_data(model_name)
    try:
        data = get_dataset(data_name, split=True, verbose=0, discrete=True)
    except LookupError:
        print("Cannot find data with name {}".format(data_name))
        return None
    discretizer = data['discretizer']
    ranges = data.get('ranges')
    categories = data.get('categories')
    is_categorical = data['is_categorical']

    ret = {
        'featureNames': data['feature_names'],
        'labelNames': data['target_names'],
        'isCategorical': is_categorical,
        'categories': categories,
        # 'continuous': [True] * x.shape[1],
        'ranges': ranges,
        'discretizers': discretizer2json(discretizer),
    }
    return ret
Example #17
def train_surrogate(model_file, sampling_rate=5., sample=True,
                    rule_maxlen=2, min_support=0.01, eta=1, iters=50000, _lambda=30):
    model = load_model(model_file)
    dataset = model.name.split('-')[0]
    data = get_dataset(dataset, split=True, discrete=True, one_hot=False)
    train_x, train_y, test_x, test_y, feature_names, is_categorical = \
        data['train_x'], data['train_y'], data['test_x'], data['test_y'], data['feature_names'], data['is_categorical']
    ranges = data['ranges']

    if sample:
        train_x, train_y = do_re_sample(train_x, train_y, feature_names)
    # print(feature_names)
    print("Original model:")
    model.test(test_x, test_y)
    print("Surrogate model:")

    model_name = 'rule-surrogate-' + model.name
    surrogate_model = RuleSurrogate(name=model_name, discretizer=data['discretizer'],
                                    rule_minlen=1, rule_maxlen=rule_maxlen, min_support=min_support,
                                    _lambda=_lambda, nchain=30, eta=eta, iters=iters)
    constraints = get_constraints(is_categorical, ranges)
    # sigmas = [0] * train_x.shape[1]
    # print(sigmas)
    instances = train_x
    # print('train_y:')
    # print(train_y)
    # print('target_y')
    # print(model.predict(instances))
    # surrogate_model is always a RuleSurrogate here, so the tree/rule
    # isinstance branch from the generic variant is dead code; call directly.
    surrogate_model.surrogate(model, instances, constraints, sampling_rate, rediscretize=True)
    # surrogate_model.evaluate(train_x, train_y)
    surrogate_model.describe(feature_names=feature_names)
    surrogate_model.save()
    # surrogate_model.self_test()
    surrogate_model.test(test_x, test_y)