Example no. 1
import h5py
import pandas
# h52np is a buffered HDF5-to-numpy reader from the surrounding ABX toolkit
# (e.g. from ABXpy.h5tools import h52np); adjust the import to your layout.


def task2txt(task_file, out_txt_file):
    """Dump the triplets of an ABX task file to a flat CSV text table."""
    cols = ['file', 'onset', 'offset']
    dfs = []
    with h5py.File(task_file, 'r') as t:
        # 'by' labels may come back as bytes under Python 3; decode them so
        # they can be used as HDF5 / pandas keys below.
        bys = [b.decode('utf-8') if isinstance(b, bytes) else b
               for b in t['bys'][...]]
    for n_by, by in enumerate(bys):
        by_db = pandas.read_hdf(task_file, 'feat_dbs/' + by)
        with h5py.File(task_file, 'r') as t:
            trip_attrs = t['triplets']['by_index'][n_by]
        with h52np.H52NP(task_file) as t:
            inp = t.add_subdataset('triplets', 'data', indexes=trip_attrs)
            for triplets in inp:
                df_A = by_db.loc[triplets[:, 0]]
                df_B = by_db.loc[triplets[:, 1]]
                df_X = by_db.loc[triplets[:, 2]]
                df_A = df_A.reset_index(drop=True)
                df_B = df_B.reset_index(drop=True)
                df_X = df_X.reset_index(drop=True)
                # Assemble one output row per triplet: target (A), other (B)
                # and probe (X) items, each with its file, onset and offset.
                df = pandas.DataFrame()
                df[['file_TGT', 'onset_TGT', 'offset_TGT']] = df_A[cols]
                df[['file_OTH', 'onset_OTH', 'offset_OTH']] = df_B[cols]
                df[['file_X', 'onset_X', 'offset_X']] = df_X[cols]
                dfs.append(df)
    df = pandas.concat(dfs)
    df.to_csv(out_txt_file, index=False, float_format='%.6f')
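A minimal usage sketch, reusing the imports above (the paths are hypothetical placeholders; the task file is expected to contain the 'bys', 'feat_dbs' and 'triplets' groups read by the function):

task2txt('data.abx', 'data_triplets.csv')

# One row per triplet, with file/onset/offset columns for the target (TGT),
# the other item (OTH) and the probe (X).
print(pandas.read_csv('data_triplets.csv').head())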
Example no. 2
import shutil
import time

import h5py
import numpy as np
import lasagne
from lasagne import layers
from lasagne import nonlinearities as nl
# abnet2, sample_data, h52np, np2h5 and tryremove are project-local modules
# and helpers from the surrounding ABnet codebase.


def run(train_words, val_words, alignement_features, input_features,
        name='abnet', dim=40, nframes=7):
    data_train = sample_data.read_word_clusters_file(train_words)
    data_test = sample_data.read_word_clusters_file(val_words)
    data = [data_train, data_test]

    layer_size = 500
    n_layers_init = 2
    n_layers_end = 6
    n_layers = n_layers_init
    # Architecture spec: layer sizes, per-layer nonlinearities and dropout rates.
    architecture = [[dim*nframes] + [layer_size]*n_layers_init + [39],
                    [nl.rectify]*n_layers_init + [nl.linear],
                    [0] + [0.2]*n_layers_init]
    nnet = abnet2.ABnet(*architecture, loss_type='cosine_margin', margin=0.5)
    features_getter = sample_data.FeaturesAPI_cached(alignement_features)
    input_features_getter = sample_data.FeaturesAPI_cached(input_features)


    # Utility functions
    def train_fn(batch_data):
        return nnet.train_fn(*batch_data)
    def val_fn(batch_data):
        return nnet.val_fn(*batch_data)
    def train(data, train_fn, val_fn, network, max_epochs=4000, patience=100):
        (train_words, train_clusters), (test_words, test_clusters) = data
        run = []
        best_model = None
        if patience <= 0:
            patience = max_epochs
        patience_val = 0
        best_val = None

        for epoch in range(max_epochs):
            data_train = sample_data.generate_abnet_batch(
                train_words, train_clusters, epoch, features_getter,
                input_features_getter, return_indexes=False)
            data_val = sample_data.generate_abnet_batch(
                test_words, test_clusters, epoch, features_getter,
                input_features_getter, return_indexes=False)
            start_time = time.time()
            train_err, val_err = abnet2.train_iteration(
                data_train, data_val, train_fn, val_fn)
            if epoch % 20 == 0:
                run.append(layers.get_all_param_values(network))
            if np.isnan(val_err) or np.isnan(train_err):
                print("Train error or validation error is NaN, "
                      "stopping now.")
                break
            # Calculating patience
            if best_val is None or val_err < best_val:
                best_val = val_err
                patience_val = 0
                best_model = layers.get_all_param_values(network)
            else:
                patience_val += 1
                if patience_val > patience:
                    print("No improvements after {} iterations, "
                          "stopping now".format(patience))
                    break

            # Then we print the results for this epoch:
            print("Epoch {} of {} took {:.3f}s".format(
                epoch + 1, max_epochs, time.time() - start_time))
            print("  training loss:\t\t{:.6f}".format(train_err))
            print("  validation loss:\t\t{:.6f}".format(val_err))
            acc = nnet.eer(*data_val)
            print("  score eer:\t\t{:.2f} %".format(acc))
            auc = nnet.auc(*data_val)
            print("  score auc:\t\t{:.2f} %".format(auc))
        return best_model, run


    # Training...
    for i in range(n_layers_end - n_layers_init):
        best_model, run = train(data, train_fn, val_fn, nnet.network)
        # saving model
        weights = best_model
        weights_file = '{}_{}_best_model.npz'.format(name, i)
        run_file = '{}_{}_run.h5'.format(name, i)

        tryremove(weights_file)
        tryremove(run_file)
        np.savez(weights_file, weights)

        # dump the periodic weight snapshots recorded during training
        with h5py.File(run_file, 'w') as fh:
            for n_snap, snap_weights in enumerate(run):
                grp = fh.create_group(str(n_snap * 2))
                # keep only the weight matrices (every other entry is a bias)
                for n_w, w in enumerate(snap_weights[::2]):
                    grp.create_dataset(str(n_w), data=w)

        # adding a layer: grow the network by one hidden layer, initialise the
        # new weights with GlorotUniform and zero biases, and warm-start from
        # the best model found so far
        if i < n_layers_end - n_layers_init - 1:
            n_layers += 1
            W = lasagne.init.GlorotUniform().sample((layer_size, layer_size))
            u = np.zeros((layer_size,), dtype=np.float32)
            architecture = [[dim*nframes] + [layer_size]*n_layers + [39],
                            [nl.rectify]*n_layers + [nl.linear],
                            [0] + [0.2]*n_layers]
            init_weights = best_model[:-2] + [W, u] + best_model[-2:]
            nnet = abnet2.ABnet(*architecture, loss_type='cosine_margin',
                                margin=0.5)
            layers.set_all_param_values(nnet.network, init_weights)


    layers.set_all_param_values(nnet.network, best_model)
    h5features_file = name + '_embeddings.h5f'
    try:
        tryremove(h5features_file)
        shutil.copy(input_features, h5features_file)
        # drop the copied features dataset; it is replaced below by the
        # network embeddings
        with h5py.File(h5features_file, 'a') as fh:
            del fh['features']['features']
        transform = nnet.evaluate
        embedding_size = architecture[0][-1]

        with h52np.H52NP(input_features) as f_in, \
             np2h5.NP2H5(h5features_file) as f_out:
            inp = f_in.add_dataset('features', 'features', buf_size=10000)
            out = f_out.add_dataset(
                'features', 'features', buf_size=10000,
                n_rows=inp.n_rows, n_columns=embedding_size,
                item_type=np.float32)
            for X in inp:
                X = X.astype(np.float32)
                emb_wrd = transform(X)
                out.write(emb_wrd)
    except:
        tryremove(h5features_file)
        raise
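A minimal usage sketch, assuming word-cluster files and h5features files in the formats expected by sample_data (all paths are hypothetical placeholders):

run('train_words.txt', 'val_words.txt',
    'alignment.features', 'input.features',
    name='abnet_demo', dim=40, nframes=7)
# For each depth i this writes abnet_demo_<i>_best_model.npz and
# abnet_demo_<i>_run.h5, then abnet_demo_embeddings.h5f containing the
# embeddings of input.features computed by the final best network.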
Example no. 3
import os

import h5py
import numpy as np
# h52np and type_fitting are helpers from the surrounding ABX toolkit
# (e.g. ABXpy.h5tools.h52np and ABXpy.misc.type_fitting).


def score(task_file, distance_file, score_file=None, score_group='scores'):
    """Calculate the score of a task and put the results in a hdf5 file.

    Parameters
    ----------
    task_file : string
        The hdf5 file containing the task (with the triplets and pairs
        generated)
    distance_file : string
        The hdf5 file containing the distances between the pairs
    score_file : string, optional
        The hdf5 file that will contain the results
    """
    if score_file is None:
        (basename_task, _) = os.path.splitext(task_file)
        (basename_dist, _) = os.path.splitext(distance_file)
        score_file = basename_task + '_' + basename_dist + '.score'
    # file verification:
    assert os.path.exists(task_file), 'Cannot find task file ' + task_file
    assert os.path.exists(distance_file), ('Cannot find distance file ' +
                                           distance_file)
    assert not os.path.exists(score_file), ('score file already exists ' +
                                            score_file)
    # FIXME skip empty by datasets, this should not be necessary anymore when
    # empty datasets are filtered at the task file generation level
    with h5py.File(task_file, 'r') as t:
        # 'by' labels may come back as bytes under Python 3; decode them so
        # they can be used as attribute keys below.
        bys = [b.decode('utf-8') if isinstance(b, bytes) else b
               for b in t['bys'][...]]
        n_triplets = t['triplets']['data'].shape[0]
    with h5py.File(score_file, 'w') as s:
        s.create_dataset('scores', (n_triplets, 1), dtype=np.int8)
        for n_by, by in enumerate(bys):
            with h5py.File(task_file, 'r') as t, h5py.File(distance_file,
                                                           'r') as d:
                trip_attrs = t['triplets']['by_index'][n_by]
                pair_attrs = t['unique_pairs'].attrs[by]
                # FIXME here we make the assumption
                # that this fits into memory ...
                dis = d['distances']['data'][pair_attrs[1]:pair_attrs[2]][...]
                dis = np.reshape(dis, dis.shape[0])
                # FIXME idem + only unique_pairs used ?
                pairs = t['unique_pairs']['data'][pair_attrs[1]:pair_attrs[2]][
                    ...]
                pairs = np.reshape(pairs, pairs.shape[0])
                base = pair_attrs[0]
                pair_key_type = type_fitting.fit_integer_type((base)**2 - 1,
                                                              is_signed=False)
            with h52np.H52NP(task_file) as t:
                inp = t.add_subdataset('triplets', 'data', indexes=trip_attrs)
                idx_start = trip_attrs[0]
                for triplets in inp:
                    triplets = pair_key_type(triplets)
                    idx_end = idx_start + triplets.shape[0]

                    # Encode each (item, X) pair as a single integer key so the
                    # precomputed distances can be looked up with searchsorted.
                    pairs_AX = triplets[:, 0] + base * triplets[:, 2]
                    pairs_BX = triplets[:, 1] + base * triplets[:, 2]
                    # FIXME change the encoding (and type_fitting) so that
                    # A,B and B,A have the same code ... (take a=min(a,b),
                    # b=max(a,b))
                    dis_AX = dis[np.searchsorted(pairs, pairs_AX)]
                    dis_BX = dis[np.searchsorted(pairs, pairs_BX)]
                    scores = (np.int8(dis_AX < dis_BX) -
                              np.int8(dis_AX > dis_BX))
                    # 1 if X closer to A, -1 if X closer to B, 0 if equal
                    # distance (this doesn't use 0, 1/2, 1 to use the
                    # compact np.int8 data format)
                    s['scores'][idx_start:idx_end] = np.reshape(
                        scores, (-1, 1))
                    idx_start = idx_end
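A minimal usage sketch, reusing the imports above (hypothetical paths; the distance file is assumed to have been computed for the same task file):

score('data.abx', 'data.distance', 'data.score')

# Each triplet gets a score in {-1, 0, 1}: 1 if X is closer to A (the target),
# -1 if X is closer to B, 0 on ties.
with h5py.File('data.score', 'r') as s:
    scores = s['scores'][...]
print('mean score:', np.mean(scores))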