Example #1
def run(data_dir, submission_info_path, solution_path, binary_strategy,
        results_file_name):
    from connoisseur.datasets import load_pickle_data

    pairs = pd.read_csv(submission_info_path, quotechar='"',
                        delimiter=',').values[:, 1:]
    y = pd.read_csv(solution_path, quotechar='"', delimiter=',').values[:, 1:]

    pairs = [[
        'unknown/' + os.path.splitext(a)[0],
        'unknown/' + os.path.splitext(b)[0]
    ] for a, b in pairs]

    results = []
    for phase in ['test']:
        print('\n# %s evaluation' % phase)

        d = load_pickle_data(data_dir, phases=['test'], keys=['data', 'names'])
        d, names = d['test']
        probabilities = d['predictions']
        del d

        layer_results = evaluate(probabilities, y, names, pairs,
                                 binary_strategy)
        layer_results['phase'] = phase
        results.append(layer_results)

    with open(results_file_name, 'w') as file:
        json.dump(results, file)
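Most of these `run` functions are Sacred experiment entry points: the `_run` argument and the `_run.observers[0].dir` report directory seen in the later examples are injected by the Sacred framework, and the remaining parameters are filled in from the experiment config. A minimal sketch of how such a function is usually registered and invoked; the experiment name, config values, and observer directory below are assumptions, not values from the repository:

# Hypothetical wiring for a run() function like Example #1 (not part of the
# original source). Config values are injected by Sacred by parameter name.
from sacred import Experiment
from sacred.observers import FileStorageObserver

ex = Experiment('evaluate-contrastive-predictions')
# Provides _run.observers[0].dir in the experiments that write reports.
# Older Sacred versions use FileStorageObserver.create('./logs') instead.
ex.observers.append(FileStorageObserver('./logs'))


@ex.config
def config():
    data_dir = '/datasets/pbn/preprocessed'         # hypothetical path
    submission_info_path = './submission_info.csv'  # hypothetical path
    solution_path = './solution.csv'                # hypothetical path
    binary_strategy = 'pearsonr'                    # hypothetical strategy name
    results_file_name = './results.json'


@ex.automain
def run(data_dir, submission_info_path, solution_path, binary_strategy,
        results_file_name):
    ...  # body as in Example #1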
Example #2
def run(_run, ex_tag, data_dir, phases, classes, layer, ckpt_file_name,
        submission_info_path, solution_path):
    report_dir = _run.observers[0].dir

    from connoisseur.datasets import load_pickle_data

    print('loading model...', end=' ')
    model = joblib.load(ckpt_file_name)
    print('done.')

    print('loading data...', end=' ')
    data = load_pickle_data(data_dir=data_dir,
                            phases=phases,
                            chunks=(0, 1),
                            classes=classes,
                            layers=[layer])
    print('done.')

    results = []

    pairs = pd.read_csv(submission_info_path, quotechar='"',
                        delimiter=',').values[:, 1:]
    y = pd.read_csv(solution_path, quotechar='"', delimiter=',').values[:, 1:]

    pairs = [[
        'unknown/' + os.path.splitext(a)[0],
        'unknown/' + os.path.splitext(b)[0]
    ] for a, b in pairs]

    for p in phases:
        print('\n# %s evaluation' % p)
        x, _, names = data[p]
        x = x[layer]
        x = x.reshape(x.shape[0], -1)

        layer_results = evaluate(model, x, y, names, pairs=pairs, tag=ex_tag)
        layer_results['phase'] = p
        layer_results['layer'] = layer
        results.append(layer_results)

    with open(os.path.join(report_dir, 'report.json'), 'w') as file:
        json.dump(results, file, indent=0)
Example #3
def run(_run, data_dir, phases, classes, layer, ckpt, results_file_name,
        group_patches, group_recaptures, max_patches):
    report_dir = _run.observers[0].dir

    print('loading model...', end=' ')
    model = joblib.load(ckpt)
    print('done.')

    print('loading data...', end=' ')
    data = load_pickle_data(data_dir=data_dir,
                            phases=phases,
                            chunks=(0, 1),
                            classes=classes,
                            layers=[layer])
    print('done.')

    results = []

    for p in phases:
        print('\n# %s evaluation' % p)
        x, y, names = data[p]
        x = x[layer]
        x = x.reshape(x.shape[0], -1)

        layer_results = evaluate(model,
                                 x,
                                 y,
                                 names,
                                 group_patches=group_patches,
                                 group_recaptures=group_recaptures,
                                 max_patches=max_patches)
        layer_results['phase'] = p
        layer_results['layer'] = layer
        results.append(layer_results)

    with open(os.path.join(report_dir, results_file_name), 'w') as file:
        json.dump(results, file)
Example #4
def run(_run, data_dir, train_info, chunks, train_pairs, valid_pairs,
        train_shuffle, valid_shuffle, joint_weights, trainable_joints,
        dense_layers, batch_size, device, opt_params, dropout_rate, ckpt,
        steps_per_epoch, epochs, validation_steps, initial_epoch,
        early_stop_patience, resuming_ckpt, outputs_meta):
    report_dir = _run.observers[0].dir

    print('loading limb-embedded inputs...')
    d = load_pickle_data(data_dir,
                         keys=['data', 'names'],
                         phases=['train', 'valid'],
                         chunks=chunks)
    x_train, x_valid = d['train'][0], d['valid'][0]
    print('x-train, x-valid shape:', x_train['artist'].shape,
          x_valid['artist'].shape)

    print('loading labels...')
    outputs, name_map = load_multiple_outputs(train_info,
                                              outputs_meta,
                                              encode='sparse')

    ys = []
    for phase in ('train', 'valid'):
        names = d[phase][1]
        names = ['-'.join(os.path.basename(n).split('-')[:-1]) for n in names]
        indices = [name_map[n] for n in names]
        ys += [{o: v[indices] for o, v in outputs.items()}]

    y_train, y_valid = ys

    artists = np.unique(y_train['artist'])
    x_train, y_train = create_pairs(x_train,
                                    y_train,
                                    pairs=train_pairs,
                                    classes=artists,
                                    shuffle=train_shuffle)

    x_valid, y_valid = create_pairs(x_valid,
                                    y_valid,
                                    pairs=valid_pairs,
                                    classes=artists,
                                    shuffle=valid_shuffle)

    for y in (y_train, y_valid):
        y['binary_predictions'] = y['artist_binary_predictions']

    with tf.device(device):
        print('building...')
        model = build_siamese_top_meta(outputs_meta,
                                       dropout_rate=dropout_rate,
                                       joint_weights=joint_weights,
                                       trainable_joints=trainable_joints,
                                       dense_layers=dense_layers)

        if resuming_ckpt:
            print('loading weights from', resuming_ckpt)
            model.load_weights(resuming_ckpt)

        model.compile(optimizer=optimizers.Adam(**opt_params),
                      loss='binary_crossentropy',
                      metrics=['acc'])

        print('training from epoch %i...' % initial_epoch)
        try:
            model.fit(
                x_train,
                y_train,
                steps_per_epoch=steps_per_epoch,
                epochs=epochs,
                validation_data=(x_valid, y_valid),
                validation_steps=validation_steps,
                initial_epoch=initial_epoch,
                batch_size=batch_size,
                verbose=2,
                callbacks=[
                    callbacks.TerminateOnNaN(),
                    callbacks.EarlyStopping(patience=early_stop_patience),
                    callbacks.ReduceLROnPlateau(min_lr=1e-10,
                                                patience=early_stop_patience //
                                                3),
                    callbacks.TensorBoard(report_dir,
                                          batch_size=batch_size,
                                          histogram_freq=1,
                                          write_grads=True,
                                          write_images=True),
                    callbacks.ModelCheckpoint(os.path.join(report_dir, ckpt),
                                              save_best_only=True,
                                              verbose=1),
                ])
        except KeyboardInterrupt:
            print('interrupted by user')
        else:
            print('done')
Example #5
def run(_run, data_dir, patches, estimator_type, submission_info, solution,
        batch_size, dense_layers, device, ckpt, results_file, submission_file,
        use_multiprocessing, workers, joint_weights, outputs_meta, chunks):
    report_dir = _run.observers[0].dir

    with tf.device(device):
        print('building...')
        model = build_siamese_top_meta(outputs_meta,
                                       joint_weights=joint_weights,
                                       dense_layers=dense_layers)
        model.summary()
        print('loading weights from', ckpt)
        model.load_weights(ckpt)

        print('loading submission and solution...')
        pairs = pd.read_csv(submission_info, quotechar='"',
                            delimiter=',').values[:, 1:]
        labels = pd.read_csv(solution, quotechar='"',
                             delimiter=',').values[:, 1:].flatten()

        print('loading sequential predictions...')
        d = load_pickle_data(data_dir,
                             phases=['test'],
                             keys=['data', 'names'],
                             chunks=chunks)
        d, names = d['test']

        print('signal names:', d.keys())

        inputs = [d[e['n']] for e in outputs_meta]
        del d
        *inputs, names = group_by_paintings(*inputs, names=names)
        inputs = {o['n']: inputs[ix] for ix, o in enumerate(outputs_meta)}

        names = np.asarray([n.split('/')[1] + '.jpg' for n in names])

        # All outputs should have the same number of patches.
        patch_counts = {i.shape[1] for i in inputs.values()}
        assert len(patch_counts) == 1, 'outputs disagree on patch count: %s' % patch_counts
        print('test data inputs shape:', [s.shape for s in inputs.values()])

        print('\n# test evaluation')
        test_data = ArrayPairsSequence(inputs, names, pairs, labels,
                                       batch_size)
        probabilities = model.predict_generator(
            test_data,
            use_multiprocessing=use_multiprocessing,
            workers=workers,
            verbose=1).reshape(-1, patches)
        del model
        K.clear_session()

    layer_results = evaluate(labels, probabilities, estimator_type)
    layer_results['phase'] = 'test'
    evaluation_results = [layer_results]

    # generate results file.
    with open(os.path.join(report_dir, results_file), 'w') as file:
        json.dump(evaluation_results, file)

    # generate submission file to Kaggle.
    for v in layer_results['evaluations']:
        predictions_field = 'binary_probabilities' if 'binary_probabilities' in v else 'p'
        p = v[predictions_field]

        with open(submission_file.format(strategy=v['strategy']), 'w') as f:
            f.write('index,sameArtist\n')
            f.writelines(['%i,%f\n' % (i, _p) for i, _p in enumerate(p)])
Example #6
def run(_run, data_dir, shape, batch_size, device, train_info, use_gram_matrix,
        ckpt_file, dense_layers, opt_params, dropout_p, resuming_from, epochs,
        steps_per_epoch, validation_steps, initial_epoch, early_stop_patience,
        first_trainable_layer, class_weight, outputs_meta, layer_name, chunks):
    try:
        report_dir = _run.observers[0].dir
    except IndexError:
        report_dir = './logs/_unlabeled'

    print('loading limb-embedded inputs...')
    d = load_pickle_data(data_dir,
                         keys=['data', 'names'],
                         phases=['train', 'valid'],
                         chunks=chunks)
    (x_train, names_train), (x_valid, names_valid) = d['train'], d['valid']
    x_train, x_valid = (x[layer_name] for x in (x_train, x_valid))
    print('x-train, x-valid shape:', x_train.shape, x_valid.shape)

    p = np.arange(len(x_train))
    np.random.shuffle(p)
    x_train = x_train[p]
    names_train = names_train[p]

    p = np.arange(len(x_valid))
    np.random.shuffle(p)
    x_valid = x_valid[p]
    names_valid = names_valid[p]

    print('loading labels...')
    outputs, name_map = load_multiple_outputs(train_info,
                                              outputs_meta,
                                              encode='sparse')

    ys = []
    for phase, names in zip(('train', 'valid'), (names_train, names_valid)):
        names = ['-'.join(os.path.basename(n).split('-')[:-1]) for n in names]
        indices = [name_map[n] for n in names]
        ys += [{o: v[indices] for o, v in outputs.items()}]

    y_train, y_valid = ys

    # class_weight = sk_utils.compute_class_weight(class_weight, np.unique(y_train), y_train)
    print('data sample:')
    print(x_train[:10])

    print(x_train.shape)
    print(y_train['artist'].shape)

    with tf.device(device):
        print('building...')
        model = build_meta_limb(
            shape,
            dropout_p=dropout_p,
            use_gram_matrix=use_gram_matrix,
            include_top=True,
            dense_layers=dense_layers,
            classes=[o['u'] for o in outputs_meta],
            predictions_name=[o['n'] for o in outputs_meta],
            predictions_activation=[o['a'] for o in outputs_meta])
        layer_names = [l.name for l in model.layers]
        if first_trainable_layer:
            if first_trainable_layer not in layer_names:
                raise ValueError('%s is not a layer in the model: %s' %
                                 (first_trainable_layer, layer_names))
            _trainable = False
            for layer in model.layers:
                if layer.name == first_trainable_layer:
                    _trainable = True
                layer.trainable = _trainable
            del _trainable
        model.compile(optimizer=optimizers.Adam(**opt_params),
                      loss=dict((o['n'], o['l']) for o in outputs_meta),
                      metrics=dict((o['n'], o['m']) for o in outputs_meta),
                      loss_weights=dict(
                          (o['n'], o['w']) for o in outputs_meta))
        model.summary()

        if resuming_from:
            print('re-loading weights...')
            model.load_weights(resuming_from)

        try:
            print('training from epoch %i...' % initial_epoch)
            model.fit(
                x_train,
                y_train,
                epochs=epochs,
                batch_size=batch_size,
                steps_per_epoch=steps_per_epoch,
                validation_data=(x_valid, y_valid),
                validation_steps=validation_steps,
                initial_epoch=initial_epoch,
                verbose=2,
                class_weight=class_weight,
                callbacks=[
                    TerminateOnNaN(),
                    EarlyStopping(patience=early_stop_patience),
                    ReduceLROnPlateau(min_lr=1e-10,
                                      patience=int(early_stop_patience // 3)),
                    # TensorBoard(report_dir,
                    #             batch_size=batch_size, write_grads=True, write_images=True,
                    #             histogram_freq=10),
                    ModelCheckpoint(os.path.join(report_dir, ckpt_file),
                                    save_best_only=True,
                                    verbose=1)
                ])
        except KeyboardInterrupt:
            print('interrupted by user')
        else:
            print('done')
        finally:
            print('train history:', model.history.history)
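Several of these experiments share an `outputs_meta` configuration: a list of dicts with terse keys that the snippets read as `o['n']`, `o['u']`, `o['a']`, `o['l']`, `o['m']`, `o['w']` and `m['e']`. The source never shows its definition, so the entry below is only an inferred illustration of that structure, with made-up values:

# Assumed shape of an outputs_meta entry, inferred from how the keys are used
# in Examples #5, #6 and later: 'n' = output name, 'u' = number of classes,
# 'e' = embedding size, 'a' = predictions activation, 'l' = loss,
# 'm' = metric, 'w' = loss weight. Values are illustrative only.
outputs_meta = [
    {'n': 'artist', 'u': 1584, 'e': 1024, 'a': 'softmax',
     'l': 'sparse_categorical_crossentropy', 'm': 'accuracy', 'w': 1.0},
    {'n': 'style', 'u': 135, 'e': 1024, 'a': 'softmax',
     'l': 'sparse_categorical_crossentropy', 'm': 'accuracy', 'w': 0.2},
]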
Example #7
def run(_run, image_shape, data_dir, patches, estimator_type, submission_info,
        solution, architecture, weights, batch_size, last_base_layer,
        use_gram_matrix, pooling, dense_layers, device, num_classes,
        limb_weights, predictions_activation, joint, embedding_units,
        dropout_rate, ckpt, results_file, submission_file, use_multiprocessing,
        workers):
    report_dir = _run.observers[0].dir

    with tf.device(device):
        print('building...')
        model = build_siamese_model(
            image_shape,
            architecture,
            dropout_rate,
            weights,
            num_classes,
            last_base_layer,
            use_gram_matrix,
            dense_layers,
            pooling,
            include_base_top=False,
            include_top=True,
            predictions_activation=predictions_activation,
            limb_weights=limb_weights,
            trainable_limbs=False,
            embedding_units=embedding_units,
            joints=joint)
        model.summary()
        print('loading weights from', ckpt)
        model.load_weights(ckpt)

        limb, rest = model.get_layer('model_2'), model.layers[3:]
        x = i = Input(shape=(num_classes, ))
        for l in limb.layers[-5:]:
            x = l(x)
        limb = Model(inputs=i, outputs=x)

        ia, ib = Input(shape=(num_classes, )), Input(shape=(num_classes, ))
        ya = limb(ia)
        yb = limb(ib)

        x = [ya, yb]
        for l in rest:
            x = l(x)

        model = Model(inputs=[ia, ib], outputs=x)

        print('loading submission and solution...')
        pairs = pd.read_csv(submission_info, quotechar='"',
                            delimiter=',').values[:, 1:]
        labels = pd.read_csv(solution, quotechar='"',
                             delimiter=',').values[:, 1:].flatten()

        print('loading sequential predictions...')
        d = load_pickle_data(data_dir, phases=['test'], keys=['data', 'names'])
        d, names = d['test']
        samples = d['predictions']
        del d
        samples, names = group_by_paintings(samples, names=names)
        names = np.asarray([n.split('/')[1] + '.jpg' for n in names])

        print('test data shape:', samples.shape)

        print('\n# test evaluation')
        test_data = ArrayPairsSequence(samples, names, pairs, labels,
                                       batch_size)
        probabilities = model.predict_generator(
            test_data,
            use_multiprocessing=use_multiprocessing,
            workers=workers,
            verbose=1).reshape(-1, patches)
        del model
        K.clear_session()

    layer_results = evaluate(labels, probabilities, estimator_type)
    layer_results['phase'] = 'test'
    evaluation_results = [layer_results]

    # generate results file.
    with open(os.path.join(report_dir, results_file), 'w') as file:
        json.dump(evaluation_results, file)

    # generate submission file to Kaggle.
    for v in layer_results['evaluations']:
        predictions_field = 'binary_probabilities' if 'binary_probabilities' in v else 'p'
        p = v[predictions_field]

        with open(submission_file.format(strategy=v['strategy']), 'w') as f:
            f.write('index,sameArtist\n')
            f.writelines(['%i,%f\n' % (i, _p) for i, _p in enumerate(p)])
Example #8
def run(_run, data_dir, phases, nb_samples_used, grid_searching, param_grid, cv, n_jobs,
        ckpt_file_name, chunks_loaded, classes, layer, max_patches, using_pca):
    report_dir = _run.observers[0].dir

    print('loading data...')
    data = load_pickle_data(data_dir=data_dir, phases=phases,
                            chunks=chunks_loaded,
                            layers=[layer], classes=classes)
    x, y, names = data['train']
    x = x[layer]

    if 'valid' in data:
        # Merge train and valid data, as K-fold
        # cross-validation will be performed afterwards.
        x_valid, y_valid, names_valid = data['valid']
        x_valid = x_valid[layer]

        x = np.concatenate((x, x_valid))
        y = np.concatenate((y, y_valid))
        names = np.concatenate((names, names_valid))
        del x_valid, y_valid, names_valid
    del data

    x, y, names = group_by_paintings(x, y, names=names, max_patches=max_patches)
    x, y = map(np.concatenate, (x, y))

    if nb_samples_used:
        # Training set is too big. Sub-sample it.
        samples = np.arange(x.shape[0])
        np.random.shuffle(samples)
        samples = samples[:nb_samples_used]
        x = x[samples]
        y = y[samples]

    print('%s output shape: %s' % (layer, x.shape))
    print('occurrences:', dict(zip(*np.unique(y, return_counts=True))))

    # Flatten the features, which are rank-3 tensors
    # at the end of InceptionV3's convolutions.
    x = x.reshape(x.shape[0], -1)

    steps = []

    if using_pca:
        steps.append(('pca', PCA(n_components=.99, random_state=7)))

    steps.append(('svc', LinearSVC(dual=False)))
    model = Pipeline(steps)

    if grid_searching:
        print('grid searching...', end=' ')
        grid = GridSearchCV(model, param_grid=param_grid, cv=cv, n_jobs=n_jobs, verbose=10, refit=True)
        grid.fit(x, y)
        model = grid.best_estimator_
        print('best parameters found:', grid.best_params_)
    else:
        print('training...', end=' ')
        model.fit(x, y)

    if using_pca:
        pca = model.steps[0][1]
        print('done -- training score:', model.score(x, y),
              'pca components:', pca.n_components_,
              '(%f energy conserved)' % sum(pca.explained_variance_ratio_))

    print('saving model...', end=' ')
    joblib.dump(model, os.path.join(report_dir, ckpt_file_name))
    print('done.')
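The checkpoint written by `joblib.dump` above is the same kind of file that Examples #2 and #3 reload through `joblib.load(ckpt_file_name)`. A minimal sketch of consuming such a pipeline outside the Sacred experiment; the paths and the feature file are hypothetical:

import joblib
import numpy as np

# Load the (optional PCA) + LinearSVC pipeline saved by the experiment above.
model = joblib.load('./logs/1/model.pkl')   # hypothetical checkpoint path

# Features must be flattened exactly as during training:
# (n_patches, height, width, channels) -> (n_patches, n_features).
x = np.load('./test_patch_features.npy')    # hypothetical feature dump
x = x.reshape(x.shape[0], -1)

print('predicted painter ids:', model.predict(x)[:10])
print('decision scores shape:', model.decision_function(x).shape)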
Example #9
def run(_run, image_shape, data_dir, patches, estimator_type, submission_info,
        solution, architecture, weights, batch_size, last_base_layer,
        use_gram_matrix, pooling, dense_layers, device, chunks, limb_weights,
        dropout_rate, ckpt, results_file, submission_file, use_multiprocessing,
        workers, outputs_meta, limb_dense_layers, joint_weights):
    report_dir = _run.observers[0].dir

    with tf.device(device):
        print('building...')
        model = build_siamese_mo_model(image_shape,
                                       architecture,
                                       outputs_meta,
                                       dropout_rate,
                                       weights,
                                       last_base_layer=last_base_layer,
                                       use_gram_matrix=use_gram_matrix,
                                       limb_dense_layers=limb_dense_layers,
                                       pooling=pooling,
                                       trainable_limbs=False,
                                       limb_weights=limb_weights,
                                       trainable_joints=False,
                                       joint_weights=joint_weights,
                                       dense_layers=dense_layers)

        print('loading weights from', ckpt)
        model.load_weights(ckpt)

        x = []
        for m in outputs_meta:
            name = m['n']
            shape = [m['e']]
            x += [
                Input(shape, name='%s_ia' % name),
                Input(shape, name='%s_ib' % name)
            ]

        o = []
        for i, m in enumerate(outputs_meta):
            name = m['n']
            y = [x[2 * i], x[2 * i + 1]]
            y = model.get_layer('multiply_%i' % (i + 1))(y)
            y = model.get_layer('%s_binary_predictions' % name)(y)
            o += [y]

        rest = model.layers.index(model.get_layer('concatenate_asg'))
        for l in model.layers[rest:]:
            o = l(o)

        meta_model = Model(inputs=x, outputs=o)
        del model

        print('loading submission and solution...')
        pairs = pd.read_csv(submission_info, quotechar='"',
                            delimiter=',').values[:, 1:]
        labels = pd.read_csv(solution, quotechar='"',
                             delimiter=',').values[:, 1:].flatten()

        print('loading sequential predictions...')
        d = load_pickle_data(data_dir,
                             phases=['test'],
                             keys=['data', 'names'],
                             chunks=chunks)
        samples, names = d['test']
        samples = np.asarray(
            list(zip(*(samples['%s_em3' % o['n']] for o in outputs_meta))))
        samples, names = group_by_paintings(samples, names=names)
        names = np.asarray([n.split('/')[1] + '.jpg' for n in names])

        print('test data shape:', samples.shape)

        print('\n# test evaluation')
        test_data = ArrayPairsSequence(samples, names, pairs, labels,
                                       batch_size)
        probabilities = meta_model.predict_generator(
            test_data,
            use_multiprocessing=use_multiprocessing,
            workers=workers,
            verbose=1).reshape(-1, patches)
        del meta_model
        K.clear_session()

    layer_results = evaluate(labels, probabilities, estimator_type)
    layer_results['phase'] = 'test'
    evaluation_results = [layer_results]

    # generate results file.
    with open(os.path.join(report_dir, results_file), 'w') as file:
        json.dump(evaluation_results, file)

    # generate submission file to Kaggle.
    for v in layer_results['evaluations']:
        predictions_field = 'binary_probabilities' if 'binary_probabilities' in v else 'p'
        p = v[predictions_field]

        with open(
                os.path.join(report_dir,
                             submission_file.format(strategy=v['strategy'])),
                'w') as f:
            f.write('index,sameArtist\n')
            f.writelines(['%i,%f\n' % (i, _p) for i, _p in enumerate(p)])
Example #10
def run(_run, device, data_dir, input_shape, patches, estimator_type,
        submission_info, solution, chunks, batch_size, embedding_units, joints,
        include_sigmoid_unit, ckpt, results_file, submission_file,
        use_multiprocessing, workers):
    report_dir = _run.observers[0].dir

    with tf.device(device):
        print('building...')

        x = Input(shape=input_shape)
        identity_model = Model(inputs=x, outputs=x)
        model = build_siamese_gram_model(
            input_shape,
            architecture=None,
            dropout_rate=0,
            embedding_units=embedding_units,
            joints=joints,
            include_sigmoid_unit=include_sigmoid_unit,
            limb=identity_model)
        model.load_weights(ckpt, by_name=True)
        model.summary()

        print('loading submission and solution...')
        pairs = pd.read_csv(submission_info, quotechar='"',
                            delimiter=',').values[:, 1:]
        labels = pd.read_csv(solution, quotechar='"',
                             delimiter=',').values[:, 1:].flatten()

        print('loading sequential predictions...')
        d = load_pickle_data(data_dir,
                             phases=['test'],
                             keys=['data', 'names'],
                             chunks=chunks)
        d, names = d['test']
        samples = d['predictions']
        del d
        samples, names = group_by_paintings(samples, names=names)
        names = np.asarray([n.split('/')[1] + '.jpg' for n in names])

        print('test data shape:', samples.shape)

        print('\n# test evaluation')
        test_data = ArrayPairsSequence(samples, names, pairs, labels,
                                       batch_size)
        probabilities = model.predict_generator(
            test_data,
            use_multiprocessing=use_multiprocessing,
            workers=workers,
            verbose=1).reshape(-1, patches)
        del model
        K.clear_session()

    layer_results = evaluate(labels, probabilities, estimator_type)
    layer_results['phase'] = 'test'
    evaluation_results = [layer_results]

    # generate results file.
    with open(os.path.join(report_dir, results_file), 'w') as file:
        json.dump(evaluation_results, file)

    # generate submission file to Kaggle.
    for v in layer_results['evaluations']:
        predictions_field = 'binary_probabilities' if 'binary_probabilities' in v else 'p'
        p = v[predictions_field]

        with open(submission_file.format(strategy=v['strategy']), 'w') as f:
            f.write('index,sameArtist\n')
            f.writelines(['%i,%f\n' % (i, _p) for i, _p in enumerate(p)])