Example #1
import numpy as np
from sklearn import metrics

from connoisseur.datasets import group_by_paintings
from connoisseur.fusion import Fusion, strategies


def evaluate(model, x, y, names, pairs, tag):
    results = {'samples': pairs, 'labels': y.tolist(), 'evaluations': []}
    x, names = group_by_paintings(x, names=names)
    samples, patches, features = x.shape
    name_indices = {n: i for i, n in enumerate(names)}
    pair_indices = np.array([[name_indices[a], name_indices[b]]
                             for a, b in pairs]).T

    try:
        probabilities = model.predict_proba(x.reshape(-1, features)).reshape(
            samples, patches, -1)
        labels = None
        hyperplane_distance = None
        multi_class = True
    except AttributeError:
        probabilities = None
        labels = model.predict(x.reshape(-1,
                                         features)).reshape(samples, patches)
        hyperplane_distance = model.decision_function(x.reshape(
            -1, features)).reshape(samples, patches, -1)
        multi_class = len(model.classes_) > 2
        if not multi_class:
            hyperplane_distance = np.squeeze(hyperplane_distance, axis=-1)

    for strategy_tag in ('sum', 'mean', 'farthest', 'most_frequent'):
        strategy = getattr(strategies, strategy_tag)

        p = Fusion(strategy=strategy, multi_class=multi_class).predict(
            probabilities=probabilities,
            hyperplane_distance=hyperplane_distance,
            labels=labels)
        p = p[pair_indices]
        p = (p[0] == p[1]).astype(float)
        score = metrics.accuracy_score(y, p)

        if hyperplane_distance is not None:
            # ROC AUC over the product of each pair's mean hyperplane distances.
            hd = hyperplane_distance.reshape(samples, -1).mean(axis=1)[pair_indices]
            hd = hd[0] * hd[1]
            print('roc auc:', metrics.roc_auc_score(y, hd))

        print('score using', strategy_tag, 'strategy:', score, '\n',
              metrics.classification_report(y, p), '\nConfusion matrix:\n',
              metrics.confusion_matrix(y, p))
        print('pairs incorrectly classified:', np.asarray(pairs)[p != y], '\n')

        results['evaluations'].append({
            'strategy': strategy_tag,
            'score': score,
            'p': p.tolist()
        })

    return results
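
A minimal usage sketch for the evaluate above, assuming group_by_paintings groups flat per-patch rows (x of shape (total_patches, features), one painting name per patch) into a (paintings, patches, features) tensor; all data below is illustrative:

import numpy as np
from sklearn.svm import LinearSVC

x = np.random.rand(40, 128)                  # 4 paintings x 10 patches each
names = np.repeat(['a', 'b', 'c', 'd'], 10)  # painting name of each patch
y_patches = np.repeat([0, 0, 1, 1], 10)      # artist label of each patch

# LinearSVC has no predict_proba, so evaluate falls back to decision_function.
model = LinearSVC().fit(x, y_patches)

pairs = [('a', 'b'), ('c', 'd'), ('a', 'c')]
y_pairs = np.array([1, 1, 0])                # 1 = painted by the same artist

results = evaluate(model, x, y_pairs, names, pairs, tag='svm-baseline')
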
Example #2

def evaluate(probabilities, y, names, pairs, binary_strategy):
    import numpy as np
    from scipy import stats
    from sklearn import metrics
    from connoisseur.datasets import group_by_paintings

    print('aggregating patches')
    probabilities, names = group_by_paintings(probabilities, names=names)

    results = {
        'pairs': pairs,
        'labels': y.tolist(),
        'evaluations': [],
        'names': names.tolist()
    }

    print('all done, proceeding to fusion')
    probabilities = probabilities.mean(axis=-2)

    print('generating name map')
    name_indices = {n: i for i, n in enumerate(names)}

    if binary_strategy == 'dot':
        binary_strategy = np.dot
    elif binary_strategy == 'pearsonr':
        binary_strategy = lambda _x, _y: stats.pearsonr(_x, _y)[0]
    else:
        raise ValueError('unknown binary strategy %s' % binary_strategy)

    binary_probabilities = np.clip([
        binary_strategy(probabilities[name_indices[a]],
                        probabilities[name_indices[b]]) for a, b in pairs
    ], 0, 1)

    p = (binary_probabilities > .5).astype(float)

    score = metrics.roc_auc_score(y, binary_probabilities)
    print('roc auc score using mean strategy:', score, '\n',
          metrics.classification_report(y, p), '\nConfusion matrix:\n',
          metrics.confusion_matrix(y, p))
    print('pairs incorrectly classified:', np.asarray(pairs)[p != y], '\n')

    results['evaluations'].append({
        'strategy': 'mean',
        'score': score,
        'probabilities': probabilities.tolist(),
        'binary_probabilities': binary_probabilities.tolist()
    })

    return results
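
A minimal sketch of the two binary strategies above, applied to a hypothetical pair of per-painting mean probability vectors:

import numpy as np
from scipy import stats

a = np.array([0.7, 0.2, 0.1])   # mean class probabilities for painting A
b = np.array([0.6, 0.3, 0.1])   # mean class probabilities for painting B

print(np.dot(a, b))             # 'dot': inner product of the two vectors
print(stats.pearsonr(a, b)[0])  # 'pearsonr': linear correlation coefficient
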
Example #3

def evaluate(probabilities, y, names, tag, group_patches, phase):
    import numpy as np
    from sklearn import metrics

    from connoisseur.datasets import group_by_paintings
    from connoisseur.fusion import Fusion, strategies

    # plot_confusion_matrix is provided by the surrounding module.

    p = np.argmax(probabilities, axis=-1)
    score = metrics.accuracy_score(y, p)
    cm = metrics.confusion_matrix(y, p)
    print('score using raw strategy:', score, '\n',
          metrics.classification_report(y, p), '\nConfusion matrix:\n', cm)

    plot_confusion_matrix(cm, [str(i) for i in np.unique(y)],
                          name='-'.join((tag, phase, 'cm.jpg')),
                          cmap='BuPu')

    results = {
        'samples': names.tolist(),
        'labels': y.tolist(),
        'evaluations': [{
            'strategy': 'raw',
            'score': score,
            'p': p.tolist(),
        }]
    }

    if group_patches:
        probabilities, y, names = group_by_paintings(probabilities,
                                                     y,
                                                     names=names)
        y = np.asarray([_y[0] for _y in y])

        for strategy_tag in ('mean', 'farthest', 'most_frequent'):
            strategy = getattr(strategies, strategy_tag)

            p = Fusion(strategy=strategy).predict(probabilities)
            score = metrics.accuracy_score(y, p)
            print('score using', strategy_tag, 'strategy:', score, '\n',
                  metrics.classification_report(y, p), '\nConfusion matrix:\n',
                  metrics.confusion_matrix(y, p))
            print('samples incorrectly classified:', names[p != y], '\n')

            results['evaluations'].append({
                'strategy': strategy_tag,
                'score': score,
                'p': p.tolist()
            })

    return results
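
For intuition, the 'mean' fusion above is conceptually equivalent to averaging each painting's patch probabilities and taking the argmax; a minimal sketch, assuming grouped probabilities of shape (paintings, patches, classes):

import numpy as np

probabilities = np.random.rand(4, 50, 10)       # (paintings, patches, classes)
p = probabilities.mean(axis=1).argmax(axis=-1)  # average patches, pick best class
print(p)                                        # one predicted label per painting
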
Example #4
import json
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from keras import backend as K  # assumes the standalone Keras used by this project

from connoisseur.datasets import group_by_paintings

# build_siamese_top_meta, load_pickle_data, ArrayPairsSequence and evaluate
# are provided by the surrounding project code.


def run(_run, data_dir, patches, estimator_type, submission_info, solution,
        batch_size, dense_layers, device, ckpt, results_file, submission_file,
        use_multiprocessing, workers, joint_weights, outputs_meta, chunks):
    report_dir = _run.observers[0].dir

    with tf.device(device):
        print('building...')
        model = build_siamese_top_meta(outputs_meta,
                                       joint_weights=joint_weights,
                                       dense_layers=dense_layers)
        model.summary()
        print('loading weights from', ckpt)
        model.load_weights(ckpt)

        print('loading submission and solution...')
        pairs = pd.read_csv(submission_info, quotechar='"',
                            delimiter=',').values[:, 1:]
        labels = pd.read_csv(solution, quotechar='"',
                             delimiter=',').values[:, 1:].flatten()

        print('loading sequential predictions...')
        d = load_pickle_data(data_dir,
                             phases=['test'],
                             keys=['data', 'names'],
                             chunks=chunks)
        d, names = d['test']

        print('signal names:', d.keys())

        inputs = [d[e['n']] for e in outputs_meta]
        del d
        *inputs, names = group_by_paintings(*inputs, names=names)
        inputs = {o['n']: inputs[ix] for ix, o in enumerate(outputs_meta)}

        names = np.asarray([n.split('/')[1] + '.jpg' for n in names])

        # All outputs should have the same number of patches.
        assert len({i.shape[1] for i in inputs.values()}) == 1
        print('test data inputs shape:', [s.shape for s in inputs.values()])

        print('\n# test evaluation')
        test_data = ArrayPairsSequence(inputs, names, pairs, labels,
                                       batch_size)
        probabilities = model.predict_generator(
            test_data,
            use_multiprocessing=use_multiprocessing,
            workers=workers,
            verbose=1).reshape(-1, patches)
        del model
        K.clear_session()

    layer_results = evaluate(labels, probabilities, estimator_type)
    layer_results['phase'] = 'test'
    evaluation_results = [layer_results]

    # generate results file.
    with open(os.path.join(report_dir, results_file), 'w') as file:
        json.dump(evaluation_results, file)

    # generate submission file to Kaggle.
    for v in layer_results['evaluations']:
        predictions_field = 'binary_probabilities' if 'binary_probabilities' in v else 'p'
        p = v[predictions_field]

        with open(submission_file.format(strategy=v['strategy']), 'w') as f:
            f.write('index,sameArtist\n')
            f.writelines(['%i,%f\n' % (i, _p) for i, _p in enumerate(p)])
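
The loop above emits one Kaggle submission per fusion strategy. A quick sanity check over a generated file (the file name 'submission-mean.csv' is illustrative):

import pandas as pd

submission = pd.read_csv('submission-mean.csv')
assert list(submission.columns) == ['index', 'sameArtist']
print(submission.head())
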
Example #5

# Imports as in Example #4, plus the Keras Input layer and Model class, and
# build_siamese_model from the surrounding project code.
def run(_run, image_shape, data_dir, patches, estimator_type, submission_info,
        solution, architecture, weights, batch_size, last_base_layer,
        use_gram_matrix, pooling, dense_layers, device, num_classes,
        limb_weights, predictions_activation, joint, embedding_units,
        dropout_rate, ckpt, results_file, submission_file, use_multiprocessing,
        workers):
    report_dir = _run.observers[0].dir

    with tf.device(device):
        print('building...')
        model = build_siamese_model(
            image_shape,
            architecture,
            dropout_rate,
            weights,
            num_classes,
            last_base_layer,
            use_gram_matrix,
            dense_layers,
            pooling,
            include_base_top=False,
            include_top=True,
            predictions_activation=predictions_activation,
            limb_weights=limb_weights,
            trainable_limbs=False,
            embedding_units=embedding_units,
            joints=joint)
        model.summary()
        print('loading weights from', ckpt)
        model.load_weights(ckpt)

        limb, rest = model.get_layer('model_2'), model.layers[3:]
        x = i = Input(shape=(num_classes, ))
        for l in limb.layers[-5:]:
            x = l(x)
        limb = Model(inputs=i, outputs=x)

        ia, ib = Input(shape=(num_classes, )), Input(shape=(num_classes, ))
        ya = limb(ia)
        yb = limb(ib)

        x = [ya, yb]
        for l in rest:
            x = l(x)

        model = Model(inputs=[ia, ib], outputs=x)

        print('loading submission and solution...')
        pairs = pd.read_csv(submission_info, quotechar='"',
                            delimiter=',').values[:, 1:]
        labels = pd.read_csv(solution, quotechar='"',
                             delimiter=',').values[:, 1:].flatten()

        print('loading sequential predictions...')
        d = load_pickle_data(data_dir, phases=['test'], keys=['data', 'names'])
        d, names = d['test']
        samples = d['predictions']
        del d
        samples, names = group_by_paintings(samples, names=names)
        names = np.asarray([n.split('/')[1] + '.jpg' for n in names])

        print('test data shape:', samples.shape)

        print('\n# test evaluation')
        test_data = ArrayPairsSequence(samples, names, pairs, labels,
                                       batch_size)
        probabilities = model.predict_generator(
            test_data,
            use_multiprocessing=use_multiprocessing,
            workers=workers,
            verbose=1).reshape(-1, patches)
        del model
        K.clear_session()

    layer_results = evaluate(labels, probabilities, estimator_type)
    layer_results['phase'] = 'test'
    evaluation_results = [layer_results]

    # generate results file.
    with open(os.path.join(report_dir, results_file), 'w') as file:
        json.dump(evaluation_results, file)

    # generate submission file to Kaggle.
    for v in layer_results['evaluations']:
        predictions_field = 'binary_probabilities' if 'binary_probabilities' in v else 'p'
        p = v[predictions_field]

        with open(submission_file.format(strategy=v['strategy']), 'w') as f:
            f.write('index,sameArtist\n')
            f.writelines(['%i,%f\n' % (i, _p) for i, _p in enumerate(p)])
Example #6
import numpy as np
import pandas as pd
from sklearn import metrics

from connoisseur.datasets import group_by_paintings
from connoisseur.fusion import Fusion, strategies


def evaluate(model,
             x,
             y,
             names,
             group_patches=False,
             group_recaptures=False,
             max_patches=None):
    try:
        probabilities = model.predict_proba(x)
        labels = np.argmax(probabilities, axis=-1)
        hyperplane_distance = None
        multi_class = True
    except AttributeError:
        probabilities = None
        labels = model.predict(x)
        hyperplane_distance = model.decision_function(x)
        multi_class = len(model.classes_) > 2

    p = probabilities if probabilities is not None else hyperplane_distance

    score = metrics.accuracy_score(y, labels)
    cm = metrics.confusion_matrix(y, labels)
    print('score using raw strategy:', score, '\n',
          metrics.classification_report(y, labels), '\nConfusion matrix:\n',
          cm)

    results = {
        'samples': names.tolist(),
        'labels': y.tolist(),
        'evaluations': [{
            'strategy': 'raw',
            'score': score,
            'p': labels.tolist(),
        }]
    }

    if group_patches:
        labels, p, y, names = group_by_paintings(labels,
                                                 p,
                                                 y,
                                                 names=names,
                                                 max_patches=max_patches)
        y = np.asarray([_y[0] for _y in y])

        patches = p.shape[1] if len(p.shape) > 1 else 'all'

        for strategy_tag in ('sum', 'mean', 'farthest', 'most_frequent'):
            strategy = getattr(strategies, strategy_tag)

            p_ = (Fusion(strategy=strategy, multi_class=multi_class).predict(
                probabilities=p if probabilities is not None else None,
                hyperplane_distance=p
                if hyperplane_distance is not None else None,
                labels=labels))
            score = metrics.accuracy_score(y, p_)
            print('score using', strategy_tag, 'strategy:', score, '\n',
                  metrics.classification_report(y,
                                                p_), '\nConfusion matrix:\n',
                  metrics.confusion_matrix(y, p_), '\n',
                  'samples incorrectly classified:', names[p_ != y])

            if group_recaptures:
                print('combined recaptures score:')
                recaptures = np.asarray(
                    ['-'.join(n.split('-')[:-1]) for n in names])

                rp = (pd.Series(p_, name='p').groupby(recaptures).apply(
                    lambda _x: _x.value_counts().index[0]))
                ry = pd.Series(y, name='y').groupby(recaptures).first()
                ryp = pd.concat([rp, ry], axis=1)
                misses = ryp[ryp['y'] != ryp['p']].index.values

                # Keep the per-strategy score intact: store the recapture
                # score in its own variable instead of overwriting `score`.
                r_score = metrics.accuracy_score(ry, rp)
                print('score using', strategy_tag, 'strategy:', r_score, '\n',
                      metrics.classification_report(ry, rp),
                      '\nConfusion matrix:\n',
                      metrics.confusion_matrix(ry, rp), '\n',
                      'samples incorrectly classified:', misses)

            results['evaluations'].append({
                'strategy': strategy_tag,
                'score': score,
                'p': p_.tolist(),
                'patches': patches
            })

    return results
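
A minimal sketch of the recapture grouping above: names such as 'vangogh-1-1' and 'vangogh-1-2' collapse to 'vangogh-1', and each group's prediction is a majority vote (names and labels are illustrative):

import numpy as np
import pandas as pd

names = np.array(['vangogh-1-1', 'vangogh-1-2', 'monet-3-1'])
p_ = np.array([1, 1, 0])

recaptures = np.asarray(['-'.join(n.split('-')[:-1]) for n in names])
rp = (pd.Series(p_, name='p').groupby(recaptures)
      .apply(lambda x: x.value_counts().index[0]))
print(rp)  # monet-3 -> 0, vangogh-1 -> 1
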
Example #7
import os

import joblib
import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

from connoisseur.datasets import group_by_paintings

# load_pickle_data is provided by the surrounding project code.


def run(_run, data_dir, phases, nb_samples_used, grid_searching, param_grid,
        cv, n_jobs, ckpt_file_name, chunks_loaded, classes, layer, max_patches,
        using_pca):
    report_dir = _run.observers[0].dir

    print('loading data...')
    data = load_pickle_data(data_dir=data_dir, phases=phases,
                            chunks=chunks_loaded,
                            layers=[layer], classes=classes)
    x, y, names = data['train']
    x = x[layer]

    if 'valid' in data:
        # Merge train and valid data, as K-fold
        # cross-validation will be performed afterwards.
        x_valid, y_valid, names_valid = data['valid']
        x_valid = x_valid[layer]

        x = np.concatenate((x, x_valid))
        y = np.concatenate((y, y_valid))
        names = np.concatenate((names, names_valid))
        del x_valid, y_valid, names_valid
    del data

    x, y, names = group_by_paintings(x, y, names=names, max_patches=max_patches)
    x, y = map(np.concatenate, (x, y))

    if nb_samples_used:
        # The training set is too big. Sub-sample it.
        samples = np.arange(x.shape[0])
        np.random.shuffle(samples)
        samples = samples[:nb_samples_used]
        x = x[samples]
        y = y[samples]

    print('%s output shape: %s' % (layer, x.shape))
    print('occurrences:', dict(zip(*np.unique(y, return_counts=True))))

    # Flatten the features, which are rank-3 tensors
    # at the end of InceptionV3's convolutions.
    x = x.reshape(x.shape[0], -1)

    steps = []

    if using_pca:
        steps.append(('pca', PCA(n_components=.99, random_state=7)))

    steps.append(('svc', LinearSVC(dual=False)))
    model = Pipeline(steps)

    if grid_searching:
        print('grid searching...', end=' ')
        grid = GridSearchCV(model, param_grid=param_grid, cv=cv,
                            n_jobs=n_jobs, verbose=10, refit=True)
        grid.fit(x, y)
        model = grid.best_estimator_
        print('best parameters found:', grid.best_params_)
    else:
        print('training...', end=' ')
        model.fit(x, y)

    if using_pca:
        pca = model.steps[0][1]
        print('done -- training score:', model.score(x, y),
              'pca components:', pca.n_components_,
              '(%f energy conserved)' % sum(pca.explained_variance_ratio_))

    print('saving model...', end=' ')
    joblib.dump(model, os.path.join(report_dir, ckpt_file_name))
    print('done.')
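
A plausible param_grid for the pipeline above; the values are hypothetical, as the real grid is injected through the experiment's configuration. Keys are prefixed with the 'svc' step name from the Pipeline:

param_grid = {
    'svc__C': [0.01, 0.1, 1.0, 10.0],
    'svc__class_weight': [None, 'balanced'],
}
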
Example #8

# Imports as in Example #4, plus the Keras Input layer and Model class, and
# build_siamese_mo_model from the surrounding project code.
def run(_run, image_shape, data_dir, patches, estimator_type, submission_info,
        solution, architecture, weights, batch_size, last_base_layer,
        use_gram_matrix, pooling, dense_layers, device, chunks, limb_weights,
        dropout_rate, ckpt, results_file, submission_file, use_multiprocessing,
        workers, outputs_meta, limb_dense_layers, joint_weights):
    report_dir = _run.observers[0].dir

    with tf.device(device):
        print('building...')
        model = build_siamese_mo_model(image_shape,
                                       architecture,
                                       outputs_meta,
                                       dropout_rate,
                                       weights,
                                       last_base_layer=last_base_layer,
                                       use_gram_matrix=use_gram_matrix,
                                       limb_dense_layers=limb_dense_layers,
                                       pooling=pooling,
                                       trainable_limbs=False,
                                       limb_weights=limb_weights,
                                       trainable_joints=False,
                                       joint_weights=joint_weights,
                                       dense_layers=dense_layers)

        print('loading weights from', ckpt)
        model.load_weights(ckpt)

        x = []
        for m in outputs_meta:
            name = m['n']
            shape = [m['e']]
            x += [
                Input(shape, name='%s_ia' % name),
                Input(shape, name='%s_ib' % name)
            ]

        o = []
        for i, m in enumerate(outputs_meta):
            name = m['n']
            y = [x[2 * i], x[2 * i + 1]]
            y = model.get_layer('multiply_%i' % (i + 1))(y)
            y = model.get_layer('%s_binary_predictions' % name)(y)
            o += [y]

        rest = model.layers.index(model.get_layer('concatenate_asg'))
        for l in model.layers[rest:]:
            o = l(o)

        meta_model = Model(inputs=x, outputs=o)
        del model

        print('loading submission and solution...')
        pairs = pd.read_csv(submission_info, quotechar='"',
                            delimiter=',').values[:, 1:]
        labels = pd.read_csv(solution, quotechar='"',
                             delimiter=',').values[:, 1:].flatten()

        print('loading sequential predictions...')
        d = load_pickle_data(data_dir,
                             phases=['test'],
                             keys=['data', 'names'],
                             chunks=chunks)
        samples, names = d['test']
        samples = np.asarray(
            list(zip(*(samples['%s_em3' % o['n']] for o in outputs_meta))))
        samples, names = group_by_paintings(samples, names=names)
        names = np.asarray([n.split('/')[1] + '.jpg' for n in names])

        print('test data shape:', samples.shape)

        print('\n# test evaluation')
        test_data = ArrayPairsSequence(samples, names, pairs, labels,
                                       batch_size)
        probabilities = meta_model.predict_generator(
            test_data,
            use_multiprocessing=use_multiprocessing,
            workers=workers,
            verbose=1).reshape(-1, patches)
        del meta_model
        K.clear_session()

    layer_results = evaluate(labels, probabilities, estimator_type)
    layer_results['phase'] = 'test'
    evaluation_results = [layer_results]

    # generate results file.
    with open(os.path.join(report_dir, results_file), 'w') as file:
        json.dump(evaluation_results, file)

    # generate submission file to Kaggle.
    for v in layer_results['evaluations']:
        predictions_field = 'binary_probabilities' if 'binary_probabilities' in v else 'p'
        p = v[predictions_field]

        with open(
                os.path.join(report_dir,
                             submission_file.format(strategy=v['strategy'])),
                'w') as f:
            f.write('index,sameArtist\n')
            f.writelines(['%i,%f\n' % (i, _p) for i, _p in enumerate(p)])
Example #9

# Imports as in Example #4, plus the Keras Input layer and Model class, and
# build_siamese_gram_model from the surrounding project code.
def run(_run, device, data_dir, input_shape, patches, estimator_type,
        submission_info, solution, chunks, batch_size, embedding_units, joints,
        include_sigmoid_unit, ckpt, results_file, submission_file,
        use_multiprocessing, workers):
    report_dir = _run.observers[0].dir

    with tf.device(device):
        print('building...')

        x = Input(shape=input_shape)
        identity_model = Model(inputs=x, outputs=x)
        model = build_siamese_gram_model(
            input_shape,
            architecture=None,
            dropout_rate=0,
            embedding_units=embedding_units,
            joints=joints,
            include_sigmoid_unit=include_sigmoid_unit,
            limb=identity_model)
        model.load_weights(ckpt, by_name=True)
        model.summary()

        print('loading submission and solution...')
        pairs = pd.read_csv(submission_info, quotechar='"',
                            delimiter=',').values[:, 1:]
        labels = pd.read_csv(solution, quotechar='"',
                             delimiter=',').values[:, 1:].flatten()

        print('loading sequential predictions...')
        d = load_pickle_data(data_dir,
                             phases=['test'],
                             keys=['data', 'names'],
                             chunks=chunks)
        d, names = d['test']
        samples = d['predictions']
        del d
        samples, names = group_by_paintings(samples, names=names)
        names = np.asarray([n.split('/')[1] + '.jpg' for n in names])

        print('test data shape:', samples.shape)

        print('\n# test evaluation')
        test_data = ArrayPairsSequence(samples, names, pairs, labels,
                                       batch_size)
        probabilities = model.predict_generator(
            test_data,
            use_multiprocessing=use_multiprocessing,
            workers=workers,
            verbose=1).reshape(-1, patches)
        del model
        K.clear_session()

    layer_results = evaluate(labels, probabilities, estimator_type)
    layer_results['phase'] = 'test'
    evaluation_results = [layer_results]

    # generate results file.
    with open(os.path.join(report_dir, results_file), 'w') as file:
        json.dump(evaluation_results, file)

    # generate submission file to Kaggle.
    for v in layer_results['evaluations']:
        predictions_field = 'binary_probabilities' if 'binary_probabilities' in v else 'p'
        p = v[predictions_field]

        with open(submission_file.format(strategy=v['strategy']), 'w') as f:
            f.write('index,sameArtist\n')
            f.writelines(['%i,%f\n' % (i, _p) for i, _p in enumerate(p)])