Example #1
def load_swbd_same_diff_nopadding(rng, data_dir):

    logger.info("Loading same and different pairs: " + data_dir)

    datasets = []

    for set in ["train", "dev", "test"]:

        npz_fn = path.join(data_dir, "swbd." + set + ".npz")
        logger.info("Reading: " + npz_fn)

        # Load data and shuffle
        npz = np.load(npz_fn)
        utt_ids = sorted(npz.keys())
        rng.shuffle(utt_ids)
        xs = [npz[i] for i in utt_ids]
        ls = np.asarray([len(x) for x in xs], dtype=int)
        base_inds = np.cumsum(ls)
        ends = theano.shared(base_inds, borrow=True)
        base_begins = base_inds.copy()
        base_begins[1:] = base_inds[:-1]
        base_begins[0] = 0
        begins = theano.shared(base_begins, borrow=True)
        # Get labels for each utterance
        labels = swbd_utts_to_labels(utt_ids)

        matches_vec = samediff.generate_matches_array(labels)

        shared_x = theano.shared(
            np.asarray(np.vstack(xs), dtype=THEANOTYPE), borrow=True
            )
        # Create a tuple for this set and add to `data_sets`
        datasets.append((shared_x, begins, ends, matches_vec, labels))

    return datasets
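
The `begins`/`ends` bookkeeping above lets each variable-length utterance be recovered from the single stacked matrix. A minimal NumPy-only sketch of that indexing (Theano shared variables omitted; the 39-dimensional feature size is just an assumption for illustration):

import numpy as np

xs = [np.random.randn(n, 39) for n in (5, 3, 7)]   # three fake utterances
ls = np.asarray([len(x) for x in xs], dtype=int)
ends = np.cumsum(ls)
begins = ends.copy()
begins[1:] = ends[:-1]
begins[0] = 0
stacked = np.vstack(xs)
for k in range(len(xs)):
    # rows begins[k]:ends[k] of the stacked matrix are utterance k
    assert np.array_equal(stacked[begins[k]:ends[k]], xs[k])
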
Example #2
def load_swbd_same_diff(rng, data_dir):

    logger.info("Loading same and different pairs: " + data_dir)

    datasets = []

    for set in ["train", "dev", "test"]:

        npz_fn = path.join(data_dir, "swbd." + set + ".npz")
        logger.info("Reading: " + npz_fn)

        # Load data and shuffle
        npz = np.load(npz_fn)
        utt_ids = sorted(npz.keys())
        while "width" in utt_ids:
            utt_ids.remove("width")
        while "padding" in utt_ids:
            utt_ids.remove("padding")
        rng.shuffle(utt_ids)
        x = [npz[i] for i in utt_ids]

        # Get labels for each utterance
        labels = swbd_utts_to_labels(utt_ids)

        matches_vec = samediff.generate_matches_array(labels)
        shared_x = theano.shared(np.asarray(x, dtype=THEANOTYPE), borrow=True)

        # Create a tuple for this set and add to `data_sets`
        datasets.append((shared_x, matches_vec, labels))

    return datasets
Example #3

def eval_samediff(segments_dict):
    """Returns average precision and recision-recall breakeven."""

    # Generate list of pairs
    segment_keys = sorted(segments_dict.keys())
    pairs = []
    m = len(segment_keys)
    for i in range(0, m - 1):
        for j in range(i + 1, m):
            pairs.append((segment_keys[i], segment_keys[j]))
    # print("No. pairs: {}".format(len(pairs)))

    print("Calculating distances:")
    costs = np.zeros(len(pairs))
    for i_pair, pair in enumerate(tqdm(pairs)):
        utt_id_1, utt_id_2 = pair
        costs[i_pair] = dtw_cost_func(
            np.array(segments_dict[utt_id_1], dtype=np.double, order="c"),
            np.array(segments_dict[utt_id_2], dtype=np.double, order="c"),
            True)

    # Same-different
    distances_vec = np.asarray(costs)
    labels = [key.split("_")[0] for key in segment_keys]
    matches = samediff.generate_matches_array(labels)
    ap, prb = samediff.average_precision(distances_vec[matches == True],
                                         distances_vec[matches == False],
                                         False)
    return (ap, prb)
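
A hypothetical usage sketch for `eval_samediff`: keys are assumed to encode the word type before the first underscore (matching the `key.split("_")[0]` labelling above), values are (n_frames, n_dims) feature arrays, and `dtw_cost_func`, `samediff` and `tqdm` must already be importable. The key format below is made up purely for illustration:

np.random.seed(0)
segments_dict = {
    "yeah_sw02001-A_000123": np.random.randn(40, 39),
    "yeah_sw02005-B_004211": np.random.randn(35, 39),
    "okay_sw02001-A_000987": np.random.randn(50, 39),
    }
ap, prb = eval_samediff(segments_dict)
print("Average precision: {:.4f}".format(ap))
print("Precision-recall breakeven: {:.4f}".format(prb))
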
Example #4
def load_swbd_same_diff(rng, data_dir):

    logger.info("Loading same and different pairs: " + data_dir)

    datasets = []

    for set in ["train", "dev", "test"]:

        npz_fn = path.join(data_dir, "swbd." + set + ".npz")
        logger.info("Reading: " + npz_fn)

        # Load data and shuffle
        npz = np.load(npz_fn)
        utt_ids = sorted(npz.keys())
        rng.shuffle(utt_ids)
        x = [npz[i] for i in utt_ids]

        # Get labels for each utterance
        labels = swbd_utts_to_labels(utt_ids)

        matches_vec = samediff.generate_matches_array(labels)
        shared_x = theano.shared(np.asarray(x, dtype=THEANOTYPE), borrow=True)

        # Create a tuple for this set and add to `data_sets`
        datasets.append((shared_x, matches_vec, labels))

    return datasets
Example #5
def GetTestData(dict):
    labels = []
    data = []
    for i in dict.files:
        labels.append(i[: i.find('_')])
        data.append(dict[i])
    data, lengths = Padding(data)
    matches = samediff.generate_matches_array(labels)
    return data, lengths, matches
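
The `Padding` helper used above is not shown in this snippet. A plausible minimal sketch, assuming it zero-pads each variable-length segment to the longest one and also returns the original lengths (the real implementation may differ):

def Padding(data):
    # Sketch only: zero-pad a list of (n_frames, n_dims) arrays to a common
    # length and return the padded batch together with the original lengths.
    lengths = np.asarray([len(x) for x in data], dtype=np.int32)
    padded = np.zeros(
        (len(data), lengths.max(), data[0].shape[1]), dtype=data[0].dtype)
    for i, x in enumerate(data):
        padded[i, :len(x)] = x
    return padded, lengths
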
Example #6
def main():
    args = check_argv()

    print datetime.datetime.now()

    print "Reading:", args.npz_fn
    npz = np.load(args.npz_fn)

    print datetime.datetime.now()

    # if args.normalize:
    #     print "Normalizing embeddings"
    # else:
    print "Ordering embeddings"
    n_embeds = 0
    X = []
    ids = []
    for label in sorted(npz):
        ids.append(label)
        X.append(npz[label])
        n_embeds += 1
    X = np.array(X)
    print "No. embeddings:", n_embeds
    print "Embedding dimensionality:", X.shape[1]

    print datetime.datetime.now()

    print "Calculating distances"
    distances = pdist(X, metric=args.metric)

    print datetime.datetime.now()

    print "Getting labels"
    labels = []
    for utt_id in ids:
        word = "_".join(utt_id.split("_")[:-2])
        labels.append(word)

    if args.mean_ap:
        print datetime.datetime.now()
        print "Calculating mean average precision"
        mean_ap, mean_prb, ap_dict = samediff.mean_average_precision(distances, labels)
        print "Mean average precision:", mean_ap
        print "Mean precision-recall breakeven:", mean_prb

    print datetime.datetime.now()

    print "Calculating average precision"
    matches = samediff.generate_matches_array(labels)

    ap, prb = samediff.average_precision(distances[matches == True], distances[matches == False])
    print "Average precision:", ap
    print "Precision-recall breakeven:", prb

    print datetime.datetime.now()
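
These scripts index the condensed distance vector from `pdist` with the boolean vector returned by `samediff.generate_matches_array`, so the two must share the same pair ordering: pairs (i, j) with i < j, taken row by row. A NumPy stand-in sketch of that contract (the actual `samediff` implementation may differ in detail):

def generate_matches_array_sketch(labels):
    # True where the pair (i, j), i < j, has the same label, laid out in the
    # same order as scipy.spatial.distance.pdist's condensed output.
    n = len(labels)
    matches = np.zeros(n * (n - 1) // 2, dtype=bool)
    cur = 0
    for i in range(n - 1):
        for j in range(i + 1, n):
            matches[cur] = labels[i] == labels[j]
            cur += 1
    return matches
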
Example #7
    def samediff_val(normalise=True):
        # Embed validation
        np.random.seed(options_dict["rnd_seed"])
        val_batch_iterator = batching.SimpleIterator(val_x, len(val_x), False)
        labels = [val_labels[i] for i in val_batch_iterator.indices]
        speakers = [val_speakers[i] for i in val_batch_iterator.indices]
        saver = tf.train.Saver()
        with tf.Session() as session:
            saver.restore(session, val_model_fn)
            for batch_x_padded, batch_x_lengths in val_batch_iterator:
                np_x = batch_x_padded
                np_x_lengths = batch_x_lengths
                np_z = session.run([encoding],
                                   feed_dict={
                                       x: np_x,
                                       x_lengths: np_x_lengths
                                   })[0]
                break  # single batch

        embed_dict = {}
        for i, utt_key in enumerate(
            [val_keys[i] for i in val_batch_iterator.indices]):
            embed_dict[utt_key] = np_z[i]

        # Same-different
        if normalise:
            np_z_normalised = (np_z - np_z.mean(axis=0)) / np_z.std(axis=0)
            distances = pdist(np_z_normalised, metric="cosine")
        else:
            distances = pdist(np_z, metric="cosine")
        # matches = samediff.generate_matches_array(labels)
        # ap, prb = samediff.average_precision(
        #     distances[matches == True], distances[matches == False]
        #     )
        word_matches = samediff.generate_matches_array(labels)
        speaker_matches = samediff.generate_matches_array(speakers)
        sw_ap, sw_prb, swdp_ap, swdp_prb = samediff.average_precision_swdp(
            distances[np.logical_and(word_matches, speaker_matches)],
            distances[np.logical_and(word_matches, speaker_matches == False)],
            distances[word_matches == False])
        # return [sw_prb, -sw_ap, swdp_prb, -swdp_ap]
        return [swdp_prb, -swdp_ap]
Example #8
    def samediff_val(normalise=True):
        # Embed validation
        np.random.seed(options_dict["rnd_seed"])
        val_batch_iterator = batching.SimpleIterator(val_x, len(val_x), False)
        labels = [val_labels[i] for i in val_batch_iterator.indices]
        saver = tf.train.Saver()
        with tf.Session() as session:
            saver.restore(session, val_model_fn)
            for batch_x_padded, batch_x_lengths in val_batch_iterator:
                np_x = batch_x_padded
                np_x_lengths = batch_x_lengths
                # np_z = session.run(
                    # [z], feed_dict={a: np_x, a_lengths: np_x_lengths}
                    # )[0]
                np_z = session.run(
                    [z_mean], feed_dict={a: np_x, a_lengths: np_x_lengths}
                    )[0]
                # print(np_z)
                break  # single batch

        embed_dict = {}
        for i, utt_key in enumerate(
                [val_keys[i] for i in val_batch_iterator.indices]):
            embed_dict[utt_key] = np_z[i]

        # Same-different
        if normalise:
            # print(np_z.shape)
            np_z_normalised = (np_z - np_z.mean(axis=0))/np_z.std(axis=0)
            distances = pdist(np_z_normalised, metric="cosine")
            matches = samediff.generate_matches_array(labels)
            ap, prb = samediff.average_precision(
                distances[matches == True], distances[matches == False]
                )
        else:
            distances = pdist(np_z, metric="cosine")
            matches = samediff.generate_matches_array(labels)
            ap, prb = samediff.average_precision(
                distances[matches == True], distances[matches == False]
                )    
        return [prb, -ap]
Example #9
def main():
    args = check_argv()

    print(datetime.now())

    print("Reading:", args.npz_fn)
    npz = np.load(args.npz_fn)

    print(datetime.now())

    print("Ordering embeddings")
    n_embeds = 0
    X = []
    ids = []
    for label in sorted(npz):
        ids.append(label)
        X.append(npz[label])
        n_embeds += 1
    X = np.array(X)
    print("No. embeddings:", n_embeds)
    print("Embedding dimensionality:", X.shape[1])

    if args.mvn:
        normed = (X - X.mean(axis=0)) / X.std(axis=0)
        X = normed

    print(datetime.now())

    print("Calculating distances")
    metric = args.metric
    if metric == "kl":
        import scipy.stats
        metric = scipy.stats.entropy
    distances = pdist(X, metric=metric)

    print("Getting labels")
    labels = []
    for utt_id in ids:
        word = utt_id.split("_")[0]  # "_".join(utt_id.split("_")[:-2])
        labels.append(word)

    print("Calculating average precision")
    matches = samediff.generate_matches_array(labels)

    ap, prb = samediff.average_precision(distances[matches == True],
                                         distances[matches == False])
    print("Average precision: {:.4f}".format(ap))
    print("Precision-recall breakeven: {:.4f}".format(prb))

    print(datetime.now())
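
In the "kl" branch above, `pdist` is given `scipy.stats.entropy` as a callable metric, so each embedding pair is scored by KL divergence. A small sketch of that call, assuming the embeddings are non-negative and row-normalised so the divergence is well defined (KL is asymmetric, although `pdist` only evaluates each unordered pair once):

import numpy as np
import scipy.stats
from scipy.spatial.distance import pdist

np.random.seed(0)
X = np.abs(np.random.randn(5, 10))      # fake non-negative embeddings
X = X / X.sum(axis=1, keepdims=True)    # rows behave like distributions
distances = pdist(X, metric=scipy.stats.entropy)
print(distances.shape)                  # (10,) = one entry per (i, j), i < j
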
Example #10

def main():
    args = check_argv()

    print datetime.datetime.now()

    print "Reading:", args.npz_fn
    npz = np.load(args.npz_fn)

    print datetime.datetime.now()

    # if args.normalize:
    #     print "Normalizing embeddings"
    # else:
    print "Ordering embeddings"
    n_embeds = 0
    X = []
    ids = []
    for label in sorted(npz):
        ids.append(label)
        X.append(npz[label])
        n_embeds += 1
    X = np.array(X)
    print "No. embeddings:", n_embeds
    print "Embedding dimensionality:", X.shape[1]

    print datetime.datetime.now()

    print "Calculating distances"
    metric = args.metric
    if metric == "kl":
        import scipy.stats

        metric = scipy.stats.entropy
    distances = pdist(X, metric=metric)

    print "Getting labels"
    labels = []
    for utt_id in ids:
        word = "_".join(utt_id.split("_")[:-2])
        labels.append(word)

    print "Calculating average precision"
    matches = samediff.generate_matches_array(labels)

    ap, prb = samediff.average_precision(distances[matches == True], distances[matches == False])
    print "Average precision:", ap
    print "Precision-recall breakeven:", prb

    print datetime.datetime.now()
Example #11
    def samediff_val(normalise=False):
        # Embed validation
        np.random.seed(options_dict["rnd_seed"])
        val_batch_iterator = batching.LabelledIterator(
            val_x, None, val_x.shape[0], False
            )
        labels = [val_labels[i] for i in val_batch_iterator.indices]
        saver = tf.train.Saver()
        with tf.Session() as session:
            saver.restore(session, val_model_fn)
            for batch_x in val_batch_iterator:
                np_z = session.run(
                    [output], feed_dict={x: batch_x}
                    )[0]
                break  # single batch

        embed_dict = {}
        for i, utt_key in enumerate(
                [val_keys[i] for i in val_batch_iterator.indices]):
            embed_dict[utt_key] = np_z[i]

        # Same-different
        if normalise:
            np_z_normalised = (np_z - np_z.mean(axis=0))/np_z.std(axis=0)
            distances = pdist(np_z_normalised, metric="cosine")
            matches = samediff.generate_matches_array(labels)
            ap, prb = samediff.average_precision(
                distances[matches == True], distances[matches == False]
                )
        else:
            distances = pdist(np_z, metric="cosine")
            matches = samediff.generate_matches_array(labels)
            ap, prb = samediff.average_precision(
                distances[matches == True], distances[matches == False]
                )    
        return [prb, -ap]
Example #12
def load_swbd_same_diff_mask_old(rng, data_dir, filter_length=None, seq_length=200):

    logger.info("Loading same and different pairs: " + data_dir)

    datasets = []

    for set in ["train", "dev", "test"]:

        npz_fn = path.join(data_dir, "swbd." + set + ".npz")
        logger.info("Reading: " + npz_fn)

        # Load data and shuffle
        npz = np.load(npz_fn)
        utt_ids = sorted(npz.keys())
        rng.shuffle(utt_ids)
        
        ls = np.asarray([len(npz[i]) for i in utt_ids], dtype=np.int32)
        max_length = ls.max() if seq_length is None else seq_length
        xs = np.zeros((len(ls), max_length, npz[utt_ids[0]].shape[1]),
                      dtype=THEANOTYPE)
        mask = np.zeros((len(ls), max_length), dtype=THEANOTYPE)
        for j, i in enumerate(utt_ids):
            xs[j][:ls[j]] = npz[i]
            mask[j][:ls[j]] = 1.0

        # Adjust lengths for ConvLSTMs, since a convolution is applied first
        # over the time series
        if filter_length is not None:
            ls -= filter_length - 1


        # Get labels for each utterance
        labels = swbd_utts_to_labels(utt_ids)

        matches_vec = samediff.generate_matches_array(labels)
        shared_x = theano.shared(xs, borrow=True)
        shared_mask = theano.shared(mask, borrow=True)
        shared_ls = theano.shared(ls, borrow=True)
        
        # Create a tuple for this set and add to `data_sets`
        datasets.append((shared_x, shared_mask, shared_ls, matches_vec, labels))

    return datasets
Example #13
def load_swbd_same_diff_mask(rng, data_dir, filter_length=None):

    logger.info("Loading same and different pairs: " + data_dir)

    datasets = []

    for set in ["train", "dev", "test"]:

        npz_fn = path.join(data_dir, "swbd." + set + ".npz")
        width_fn = path.join(data_dir, "width." + set + ".npz")
        padding_fn = path.join(data_dir, "padding." + set + ".npz")
        logger.info("Reading: " + npz_fn)

        # Load data and shuffle
        npz = np.load(npz_fn)
        widths = np.load(width_fn)
        paddings = np.load(padding_fn)
        utt_ids = sorted(npz.keys())
        rng.shuffle(utt_ids)
        x = [npz[i].T for i in utt_ids]
        
        # Get labels for each utterance
        labels = swbd_utts_to_labels(utt_ids)

        matches_vec = samediff.generate_matches_array(labels)
        mask = np.zeros((len(x), len(x[0])), dtype=THEANOTYPE)
        for i, utt_id in enumerate(utt_ids):
            padding = int(paddings[utt_id])
            width = int(widths[utt_id])
            mask[i][padding: padding + width] = 1.0
                
        shared_x = theano.shared(np.asarray(x, dtype=THEANOTYPE), borrow=True)
        shared_m = theano.shared(mask, borrow=True)
        # Create a tuple for this set and add to `data_sets`
        datasets.append((shared_x, shared_m, matches_vec, labels))

    return datasets
Example #14

def train_mlp(options_dict):
    """Train and save a word classifier MLP."""

    # Preliminary

    logger.info(datetime.now())

    if not path.isdir(options_dict["model_dir"]):
        os.makedirs(options_dict["model_dir"])

    if "log_to_file" in options_dict and options_dict["log_to_file"] is True:
        log_fn = path.join(options_dict["model_dir"], "log")
        print "Writing:", log_fn
        root_logger = logging.getLogger()
        if len(root_logger.handlers) > 0:
            root_logger.removeHandler(
                root_logger.handlers[0])  # close open file handler
        logging.basicConfig(filename=log_fn, level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.DEBUG)

    rng = np.random.RandomState(options_dict["rnd_seed"])
    if options_dict["dropout_rates"] is not None:
        srng = RandomStreams(seed=options_dict["rnd_seed"])
    else:
        srng = None

    # Load and format data

    # Load into shared variables
    datasets, word_to_i_map = data_io.load_swbd_labelled(
        rng, options_dict["data_dir"], options_dict["min_count"])
    train_x, train_y = datasets[0]
    dev_x, dev_y = datasets[1]
    test_x, test_y = datasets[2]

    # Get batch sizes and iterators
    class BatchIterator(object):
        def __init__(self, n_batches):
            self.n_batches = n_batches

        def __iter__(self):
            for i_batch in xrange(self.n_batches):
                yield [i_batch]

    n_train_batches = train_x.get_value(
        borrow=True).shape[0] / options_dict["batch_size"]
    n_dev_batches = dev_x.get_value(
        borrow=True).shape[0] / options_dict["batch_size"]
    n_test_batches = test_x.get_value(
        borrow=True).shape[0] / options_dict["batch_size"]
    train_batch_iterator = BatchIterator(n_train_batches)
    validate_batch_iterator = BatchIterator(n_dev_batches)
    test_batch_iterator = BatchIterator(n_test_batches)

    # Flatten data
    d_in = 39 * 200
    train_x = train_x.reshape((-1, d_in))
    dev_x = dev_x.reshape((-1, d_in))
    test_x = test_x.reshape((-1, d_in))
    d_out = len(word_to_i_map)
    options_dict["d_out"] = d_out

    # Save `options_dict`
    options_dict_fn = path.join(options_dict["model_dir"],
                                "options_dict.pkl.gz")
    logger.info("Saving options: " + options_dict_fn)
    f = data_io.smart_open(options_dict_fn, "wb")
    pickle.dump(options_dict, f, -1)
    f.close()

    logger.info("Options: " + str(options_dict))

    # Setup model

    logger.info("Building MLP")

    # Symbolic variables
    i_batch = T.lscalar()  # batch index
    x = T.matrix("x")  # flattened data of shape (n_data, d_in)
    y = T.ivector("y")  # labels

    # Build model
    logger.info("No. of word type targets: " + str(options_dict["d_out"]))
    model = mlp.MLP(rng, x, d_in, options_dict["d_out"],
                    options_dict["hidden_layer_specs"], srng,
                    options_dict["dropout_rates"])
    if options_dict["dropout_rates"] is not None:
        loss = model.dropout_negative_log_likelihood(y)
    else:
        loss = model.negative_log_likelihood(y)
    error = model.errors(y)

    # Add regularization
    if options_dict["l1_weight"] > 0. or options_dict["l2_weight"] > 0.:
        loss = loss + options_dict["l1_weight"] * model.l1 + options_dict[
            "l2_weight"] * model.l2

    # Compile test functions
    outputs = [error, loss]
    validate_model = theano.function(
        inputs=[i_batch],
        outputs=outputs,
        givens={
            x:
            dev_x[i_batch * options_dict["batch_size"]:(i_batch + 1) *
                  options_dict["batch_size"]],
            y:
            dev_y[i_batch * options_dict["batch_size"]:(i_batch + 1) *
                  options_dict["batch_size"]]
        })
    test_model = theano.function(
        inputs=[i_batch],
        outputs=outputs,
        givens={
            x:
            test_x[i_batch * options_dict["batch_size"]:(i_batch + 1) *
                   options_dict["batch_size"]],
            y:
            test_y[i_batch * options_dict["batch_size"]:(i_batch + 1) *
                   options_dict["batch_size"]]
        })

    # Gradients and training updates
    parameters = model.parameters
    gradients = T.grad(loss, parameters)
    learning_rule = options_dict["learning_rule"]
    if learning_rule["type"] == "adadelta":
        updates = training.learning_rule_adadelta(parameters, gradients,
                                                  learning_rule["rho"],
                                                  learning_rule["epsilon"])
    elif learning_rule["type"] == "momentum":
        updates = training.learning_rule_momentum(
            parameters, gradients, learning_rule["learning_rate"],
            learning_rule["momentum"])
    else:
        assert False, "Invalid learning rule: " + learning_rule["type"]

    # Compile training function
    train_model = theano.function(
        inputs=[i_batch],
        outputs=outputs,
        updates=updates,
        givens={
            x:
            train_x[i_batch * options_dict["batch_size"]:(i_batch + 1) *
                    options_dict["batch_size"]],
            y:
            train_y[i_batch * options_dict["batch_size"]:(i_batch + 1) *
                    options_dict["batch_size"]]
        },
    )

    # Train model

    logger.info("Training MLP")
    record_dict_fn = path.join(options_dict["model_dir"], "record_dict.pkl.gz")
    record_dict = training.train_fixed_epochs_with_validation(
        options_dict["n_max_epochs"],
        train_model=train_model,
        train_batch_iterator=train_batch_iterator,
        validate_model=validate_model,
        validate_batch_iterator=validate_batch_iterator,
        test_model=test_model,
        test_batch_iterator=test_batch_iterator,
        save_model_func=model.save,
        save_model_fn=path.join(options_dict["model_dir"], "model.pkl.gz"),
        record_dict_fn=record_dict_fn,
    )

    # Extrinsic evaluation

    # Pass data through model
    logger.info("Performing same-different evaluation")
    layers_output_dict = apply_layers.apply_layers(
        options_dict["model_dir"],
        "dev",
        batch_size=645,
        i_layer=options_dict["i_layer_eval"])
    utt_ids = sorted(layers_output_dict.keys())
    embeddings = np.array([layers_output_dict[i] for i in utt_ids])
    labels = data_io.swbd_utts_to_labels(utt_ids)

    # Perform same-different
    distances = pdist(embeddings, metric="cosine")
    matches = samediff.generate_matches_array(labels)
    ap, prb = samediff.average_precision(distances[matches == True],
                                         distances[matches == False])
    logger.info("Validation average precision: " + str(ap))
    ap_fn = path.join(options_dict["model_dir"], "dev_ap.txt")
    with open(ap_fn, "w") as f:
        f.write(str(ap) + "\n")
Example #15
        for i in xrange(0, N_train, batchsize):
            x_batch = x_train[i : i + batchsize]
            model.forward(x_batch,test=False)
        logger.info("Extracting final layer")
        save_to = args.save_to
        X = []
        for i in xrange(0, N_test):
            utt_id = utt_ids_tst[i]
            x_batch = x_test[i : i + 1]
            X.append(cuda.to_cpu(F.softmax(model.forward(x_batch, test=True)).data))
        X = numpy.asarray(X)[:, 0, :]
        logger.info("Calculating average precision")
        start_time = timeit.default_timer()
        labels = swbd_utts_to_labels(utt_ids_tst)
        distances = pdist(X, metric="cosine")
        matches = samediff.generate_matches_array(labels)
        ap, prb = samediff.average_precision(distances[matches == True], distances[matches == False])
        end_time = timeit.default_timer()
        logger.info("Average precision: %s (processing time: %f [sec])"  % (str(ap), end_time-start_time))

        logger.info('Saving output layer to %s' % save_to+".npz")
        numpy.savez_compressed(save_to, X)

#        dataset = load_swbd_dataset(args.dataset, ratio=1)  
#       x_train, y_train = dataset[0] 
#        x_test, y_test   = dataset[2]
#        N_test=x_test.shape[0]
#        N_train=x_train.shape[0]
#        print "Applying batch normalization"
#        for i in xrange(0, N_train, batchsize):
#            x_batch = x_train[i : i + batchsize]
Example #16

def train_siamese_cnn(options_dict):

    # Preliminary

    logger.info(datetime.now())

    if not path.isdir(options_dict["model_dir"]):
        os.makedirs(options_dict["model_dir"])

    if "log_to_file" in options_dict and options_dict["log_to_file"] is True:
        log_fn = path.join(options_dict["model_dir"], "log")
        print "Writing:", log_fn
        root_logger = logging.getLogger()
        if len(root_logger.handlers) > 0:
            root_logger.removeHandler(root_logger.handlers[0])  # close open file handler
        logging.basicConfig(filename=log_fn, level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.DEBUG)

    rng = np.random.RandomState(options_dict["rnd_seed"])
    if options_dict["dropout_rates"] is not None:
        srng = RandomStreams(seed=options_dict["rnd_seed"])
    else:
        srng = None

    options_dict_fn = path.join(options_dict["model_dir"], "options_dict.pkl.gz")
    logger.info("Saving options: " + options_dict_fn)
    f = data_io.smart_open(options_dict_fn, "wb")
    pickle.dump(options_dict, f, -1)
    f.close()

    logger.info("Options: " + str(options_dict))


    # Load and format data

    # Load into shared variables
    datasets = data_io.load_swbd_same_diff(rng, options_dict["data_dir"])
    train_x, train_matches_vec, train_labels = datasets[0]
    dev_x, dev_matches_vec, dev_labels = datasets[1]
    test_x, test_matches_vec, test_labels = datasets[2]

    # Flatten data
    d_in = 39*200
    train_x = train_x.reshape((-1, d_in))
    dev_x = dev_x.reshape((-1, d_in))
    test_x = test_x.reshape((-1, d_in))

    # Make batch iterators
    train_batch_iterator = BatchIteratorSameDifferent(
        rng, train_matches_vec, options_dict["batch_size"],
        n_same_pairs=options_dict["n_same_pairs"], sample_diff_every_epoch=True
        )
    validate_batch_iterator = BatchIteratorSameDifferent(
        rng, dev_matches_vec, options_dict["batch_size"],
        n_same_pairs=options_dict["n_same_pairs"],
        sample_diff_every_epoch=False
        )
    test_batch_iterator = BatchIteratorSameDifferent(
        rng, test_matches_vec, options_dict["batch_size"],
        n_same_pairs=options_dict["n_same_pairs"],
        sample_diff_every_epoch=False
        )


    # Setup model

    logger.info("Building Siamese CNN")

    # Symbolic variables
    y = T.ivector("y")      # indicates whether x1 and x2 is same (1) or different (0)
    x1 = T.matrix("x1")
    x2 = T.matrix("x2")
    x1_indices = T.ivector("x1_indices")
    x2_indices = T.ivector("x2_indices")

    # Build model
    input_shape = (options_dict["batch_size"], 1, 39, 200)
    model = siamese.SiameseCNN(
        rng, x1, x2, input_shape,
        conv_layer_specs=options_dict["conv_layer_specs"],
        hidden_layer_specs=options_dict["hidden_layer_specs"],
        srng=srng,
        dropout_rates=options_dict["dropout_rates"],
        )
    if options_dict["loss"] == "cos_cos2":
        if options_dict["dropout_rates"] is not None:
            loss = model.dropout_loss_cos_cos2(y)
        else:
            loss = model.loss_cos_cos2(y)
        error = model.loss_cos_cos2(y)  # doesn't include regularization or dropout
    elif options_dict["loss"] == "cos_cos":
        if options_dict["dropout_rates"] is not None:
            loss = model.dropout_loss_cos_cos(y)
        else:
            loss = model.loss_cos_cos(y)
        error = model.loss_cos_cos(y)
    elif options_dict["loss"] == "cos_cos_margin":
        if options_dict["dropout_rates"] is not None:
            loss = model.dropout_loss_cos_cos_margin(y)
        else:
            loss = model.loss_cos_cos_margin(y)
        error = model.loss_cos_cos_margin(y)
    elif options_dict["loss"] == "euclidean_margin":
        if options_dict["dropout_rates"] is not None:
            loss = model.dropout_loss_euclidean_margin(y)
        else:
            loss = model.loss_euclidean_margin(y)
        error = model.loss_euclidean_margin(y)
    else:
        assert False, "Invalid loss: " + options_dict["loss"]

    # Add regularization
    if options_dict["l1_weight"] > 0. or options_dict["l2_weight"] > 0.:
        loss = loss + options_dict["l1_weight"]*model.l1 + options_dict["l2_weight"]* model.l2

    # Compile test functions
    same_distance = model.cos_same(y)  # track the distances of same and different pairs separately
    diff_distance = model.cos_diff(y)
    outputs = [error, loss, same_distance, diff_distance]
    theano_mode = theano.Mode(linker="cvm")
    test_model = theano.function(
        inputs=[x1_indices, x2_indices, y],
        outputs=outputs,
        givens={
            x1: test_x[x1_indices],
            x2: test_x[x2_indices],
            },
        mode=theano_mode,
        )
    validate_model = theano.function(
        inputs=[x1_indices, x2_indices, y],
        outputs=outputs,
        givens={
            x1: dev_x[x1_indices],
            x2: dev_x[x2_indices],
            },
        mode=theano_mode,
        )

    # Gradients and training updates
    parameters = model.parameters
    gradients = T.grad(loss, parameters)
    learning_rule = options_dict["learning_rule"]
    if learning_rule["type"] == "adadelta":
        updates = training.learning_rule_adadelta(
            parameters, gradients, learning_rule["rho"], learning_rule["epsilon"]
            )
    elif learning_rule["type"] == "momentum":
        updates = training.learning_rule_momentum(
            parameters, gradients, learning_rule["learning_rate"], learning_rule["momentum"]
            )
    else:
        assert False, "Invalid learning rule: " + learning_rule["type"]

    # Compile training function
    train_model = theano.function(
        inputs=[x1_indices, x2_indices, y],
        outputs=outputs,
        updates=updates,
        givens={
            x1: train_x[x1_indices],
            x2: train_x[x2_indices],
            },
        mode=theano_mode,
        )


    # Train model

    logger.info("Training Siamese CNN")
    record_dict_fn = path.join(options_dict["model_dir"], "record_dict.pkl.gz")
    record_dict = training.train_fixed_epochs_with_validation(
        options_dict["n_max_epochs"],
        train_model=train_model,
        train_batch_iterator=train_batch_iterator,
        validate_model=validate_model,
        validate_batch_iterator=validate_batch_iterator,
        test_model=test_model,
        test_batch_iterator=test_batch_iterator,
        save_model_func=model.save,
        save_model_fn=path.join(options_dict["model_dir"], "model.pkl.gz"),
        record_dict_fn=record_dict_fn,
        )


    # Extrinsic evaluation

    # Pass data through model
    logger.info("Performing same-different evaluation")
    layers_output_dict = apply_layers.apply_layers(options_dict["model_dir"], "dev", batch_size=645)  # batch size covers 10965 out of 10966 tokens
    utt_ids = sorted(layers_output_dict.keys())
    embeddings = np.array([layers_output_dict[i] for i in utt_ids])
    labels = data_io.swbd_utts_to_labels(utt_ids)

    # Perform same-different
    distances = pdist(embeddings, metric="cosine")
    matches = samediff.generate_matches_array(labels)
    ap, prb = samediff.average_precision(distances[matches == True], distances[matches == False])
    logger.info("Validation average precision: " + str(ap))
    ap_fn = path.join(options_dict["model_dir"], "dev_ap.txt")
    with open(ap_fn, "w") as f:
        f.write(str(ap) + "\n")
Example #17

def train_siamese_triplets_lstm_nn(options_dict):
    """Train and save a Siamese triplets LSTM using the specified options."""

    # Preliminary

    logger.info(datetime.now())

    if not path.isdir(options_dict["model_dir"]):
        os.makedirs(options_dict["model_dir"])

    if "log_to_file" in options_dict and options_dict["log_to_file"] is True:
        log_fn = path.join(options_dict["model_dir"], "log")
        print "Writing:", log_fn
        root_logger = logging.getLogger()
        if len(root_logger.handlers) > 0:
            root_logger.removeHandler(root_logger.handlers[0])  # close open file handler
        logging.basicConfig(filename=log_fn, level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.DEBUG)

    rng = np.random.RandomState(options_dict["rnd_seed"])
    if options_dict["dropout_rates"] is not None:
        srng = RandomStreams(seed=options_dict["rnd_seed"])
    else:
        srng = None

    options_dict_fn = path.join(options_dict["model_dir"], "options_dict.pkl.gz")
    logger.info("Saving options: " + options_dict_fn)
    f = data_io.smart_open(options_dict_fn, "wb")
    pickle.dump(options_dict, f, -1)
    f.close()

    logger.info("Options: " + str(options_dict))

    # Load and format data

    # Load into shared variables
    datasets = data_io.load_swbd_same_diff_mask(rng, options_dict["data_dir"])
    train_x, train_mask, train_lengths, train_matches_vec, train_labels = datasets[0]
    dev_x, dev_mask, dev_lengths, dev_matches_vec, dev_labels = datasets[1]
    test_x, test_mask, test_lengths, test_matches_vec, test_labels = datasets[2]

    # Make batch iterators
    train_triplet_iterator = BatchIteratorTriplets(
        rng,
        train_matches_vec,
        options_dict["batch_size"],
        n_same_pairs=options_dict["n_same_pairs"],
        sample_diff_every_epoch=True,
    )
    validate_triplet_iterator = BatchIteratorTriplets(
        rng,
        dev_matches_vec,
        options_dict["batch_size"],
        n_same_pairs=options_dict["n_same_pairs"],
        sample_diff_every_epoch=False,
    )
    test_triplet_iterator = BatchIteratorTriplets(
        rng,
        test_matches_vec,
        options_dict["batch_size"],
        n_same_pairs=options_dict["n_same_pairs"],
        sample_diff_every_epoch=False,
    )

    # Setup model

    logger.info("Building Siamese triplets LSTM")

    # Symbolic variables
    x1 = tensor.tensor3("x1", dtype=THEANOTYPE)
    x2 = tensor.tensor3("x2", dtype=THEANOTYPE)
    x3 = tensor.tensor3("x3", dtype=THEANOTYPE)
    m1 = tensor.matrix("m1", dtype=THEANOTYPE)
    m2 = tensor.matrix("m2", dtype=THEANOTYPE)
    m3 = tensor.matrix("m3", dtype=THEANOTYPE)
    x1_indices = tensor.ivector("x1_indices")
    x2_indices = tensor.ivector("x2_indices")
    x3_indices = tensor.ivector("x3_indices")
    l1 = tensor.iscalar("l1")
    l2 = tensor.iscalar("l2")
    l3 = tensor.iscalar("l3")

    # Build model
    input_shape = (options_dict["batch_size"], 1, 39, 200)
    model = siamese.SiameseTripletBatchLSTMNN(
        rng,
        x1,
        x2,
        x3,
        m1,
        m2,
        m3,
        n_in=39,
        n_lstm_hiddens=options_dict["n_hiddens"],
        mlp_hidden_specs=options_dict["hidden_layer_specs"],
    )
    if options_dict["loss"] == "hinge_cos":
        if options_dict["dropout_rates"] is not None:
            loss = model.dropout_loss_hinge_cos(options_dict["margin"])
        else:
            loss = model.loss_hinge_cos(options_dict["margin"])
        error = model.loss_hinge_cos(options_dict["margin"])  # doesn't include regularization or dropout
    else:
        assert False, "Invalid loss: " + options_dict["loss"]

    # Add regularization
    if options_dict["l2_weight"] > 0.0:
        loss = loss + options_dict["l2_weight"] * model.l2

    # Compile test functions
    same_distance = model.cos_same()  # track the distances of same and different pairs separately
    diff_distance = model.cos_diff()
    outputs = [error, loss, same_distance, diff_distance]
    theano_mode = theano.Mode(linker="cvm")

    validate_model = theano.function(
        inputs=[x1_indices, x2_indices, x3_indices],
        outputs=outputs,
        givens={
            x1: dev_x[x1_indices].swapaxes(0, 1)[: dev_lengths[x1_indices].max()],
            m1: dev_mask[x1_indices].T[: dev_lengths[x1_indices].max()],
            x2: dev_x[x2_indices].swapaxes(0, 1)[: dev_lengths[x2_indices].max()],
            m2: dev_mask[x2_indices].T[: dev_lengths[x2_indices].max()],
            x3: dev_x[x3_indices].swapaxes(0, 1)[: dev_lengths[x3_indices].max()],
            m3: dev_mask[x3_indices].T[: dev_lengths[x3_indices].max()],
        },
        mode=theano_mode,
    )
    test_model = theano.function(
        inputs=[x1_indices, x2_indices, x3_indices],
        outputs=outputs,
        givens={
            x1: test_x[x1_indices].swapaxes(0, 1)[: test_lengths[x1_indices].max()],
            m1: test_mask[x1_indices].T[: test_lengths[x1_indices].max()],
            x2: test_x[x2_indices].swapaxes(0, 1)[: test_lengths[x2_indices].max()],
            m2: test_mask[x2_indices].T[: test_lengths[x2_indices].max()],
            x3: test_x[x3_indices].swapaxes(0, 1)[: test_lengths[x3_indices].max()],
            m3: test_mask[x3_indices].T[: test_lengths[x3_indices].max()],
        },
        mode=theano_mode,
    )
    # test_model = theano.function(
    #     inputs=[x1_indices, x2_indices, x3_indices],
    #     outputs=outputs,
    #     givens={
    #         l1: test_lengths[x1_indices].max(),
    #         x1: test_x[x1_indices].swapaxes(0, 1)[:l1],
    #         m1: test_mask[x1_indices][:l1],
    #         l2: test_lengths[x2_indices].max(),
    #         x2: test_x[x2_indices].swapaxes(0, 1)[:l2],
    #         m2: test_mask[x2_indices][:l2],
    #         l3: test_lengths[x3_indices].max(),
    #         x3: test_x[x3_indices].swapaxes(0, 1)[:l3],
    #         m3: test_mask[x3_indices][:l3],
    #         },
    #     mode=theano_mode,
    #     )

    # Gradients and training updates
    parameters = model.parameters
    gradients = tensor.grad(loss, parameters)
    learning_rule = options_dict["learning_rule"]
    if learning_rule["type"] == "adadelta":
        updates = training.learning_rule_adadelta(parameters, gradients, learning_rule["rho"], learning_rule["epsilon"])
    elif learning_rule["type"] == "momentum":
        updates = training.learning_rule_momentum(
            parameters, gradients, learning_rule["learning_rate"], learning_rule["momentum"]
        )
    else:
        assert False, "Invalid learning rule: " + learning_rule["type"]

    # Compile training function
    train_model = theano.function(
        inputs=[x1_indices, x2_indices, x3_indices],
        outputs=outputs,
        updates=updates,
        givens={
            x1: train_x[x1_indices].swapaxes(0, 1)[: train_lengths[x1_indices].max()],
            m1: train_mask[x1_indices].T[: train_lengths[x1_indices].max()],
            x2: train_x[x2_indices].swapaxes(0, 1)[: train_lengths[x2_indices].max()],
            m2: train_mask[x2_indices].T[: train_lengths[x2_indices].max()],
            x3: train_x[x3_indices].swapaxes(0, 1)[: train_lengths[x3_indices].max()],
            m3: train_mask[x3_indices].T[: train_lengths[x3_indices].max()],
        },
        mode=theano_mode,
    )
    # train_model = theano.function(
    #     inputs=[x1_indices, x2_indices, x3_indices],
    #     outputs=outputs,
    #     updates=updates,
    #     givens={
    #         l1: train_lengths[x1_indices].max(),
    #         x1: train_x[x1_indices].swapaxes(0, 1)[:l1],
    #         m1: train_mask[x1_indices][:l1],
    #         l2: train_lengths[x2_indices].max(),
    #         x2: train_x[x2_indices].swapaxes(0, 1)[:l2],
    #         m2: train_mask[x2_indices][:l2],
    #         l3: train_lengths[x3_indices].max(),
    #         x3: train_x[x3_indices].swapaxes(0, 1)[:l3],
    #         m3: train_mask[x3_indices][:l3],
    #         },
    #     mode=theano_mode,
    #     )

    # Train model

    logger.info("Training Siamese triplets CNN")
    record_dict_fn = path.join(options_dict["model_dir"], "record_dict.pkl.gz")
    record_dict = training.train_fixed_epochs_with_validation(
        options_dict["n_max_epochs"],
        train_model=train_model,
        train_triplet_iterator=train_triplet_iterator,
        validate_model=validate_model,
        validate_triplet_iterator=validate_triplet_iterator,
        test_model=test_model,
        test_triplet_iterator=test_triplet_iterator,
        save_model_func=model.save,
        save_model_fn=path.join(options_dict["model_dir"], "model.pkl.gz"),
        record_dict_fn=record_dict_fn,
    )

    # Extrinsic evaluation

    # Pass data through model
    logger.info("Performing same-different evaluation")
    layers_output_dict = apply_layers.apply_layers(
        options_dict["model_dir"], "dev", batch_size=645
    )  # batch size covers 10965 out of 10966 tokens
    utt_ids = sorted(layers_output_dict.keys())
    embeddings = np.array([layers_output_dict[i] for i in utt_ids])
    labels = data_io.swbd_utts_to_labels(utt_ids)

    # Perform same-different
    distances = pdist(embeddings, metric="cosine")
    matches = samediff.generate_matches_array(labels)
    ap, prb = samediff.average_precision(distances[matches == True], distances[matches == False])
    logger.info("Validation average precision: " + str(ap))
    ap_fn = path.join(options_dict["model_dir"], "dev_ap.txt")
    with open(ap_fn, "w") as f:
        f.write(str(ap) + "\n")
Example #18
            x_batch = x_train[i:i + batchsize]
            model.forward(x_batch, test=False)
        logger.info("Extracting final layer")
        save_to = args.save_to
        X = []
        for i in xrange(0, N_test):
            utt_id = utt_ids_tst[i]
            x_batch = x_test[i:i + 1]
            X.append(
                cuda.to_cpu(F.softmax(model.forward(x_batch, test=True)).data))
        X = numpy.asarray(X)[:, 0, :]
        logger.info("Calcurating average precision")
        start_time = timeit.default_timer()
        labels = swbd_utts_to_labels(utt_ids_tst)
        distances = pdist(X, metric="cosine")
        matches = samediff.generate_matches_array(labels)
        ap, prb = samediff.average_precision(distances[matches == True],
                                             distances[matches == False])
        end_time = timeit.default_timer()
        logger.info("Average precision: %s (processing time: %f [sec])" %
                    (str(ap), end_time - start_time))

        logger.info('Saving output layer to %s' % save_to + ".npz")
        numpy.savez_compressed(save_to, X)

#        dataset = load_swbd_dataset(args.dataset, ratio=1)
#       x_train, y_train = dataset[0]
#        x_test, y_test   = dataset[2]
#        N_test=x_test.shape[0]
#        N_train=x_train.shape[0]
#        print "Applying batch normalization"
Example #19

def main():
    args = check_argv()

    print(datetime.now())

    print("Reading:", args.npz_fn)
    npz = np.load(args.npz_fn)

    print(datetime.now())

    # if args.normalize:
    #     print("Normalizing embeddings")
    # else:
    print("Ordering embeddings")
    n_embeds = 0
    X = []
    ids = []
    for label in sorted(npz):
        ids.append(label)
        X.append(npz[label])
        n_embeds += 1
    X = np.array(X)
    print("No. embeddings:", n_embeds)
    print("Embedding dimensionality:", X.shape[1])

    if args.mvn:
        normed = (X - X.mean(axis=0)) / X.std(axis=0)
        X = normed

    print(datetime.now())

    print("Calculating distances")
    distances = pdist(X, metric=args.metric)

    print(datetime.now())

    print("Getting labels")
    labels = []
    for utt_id in ids:
        word = utt_id.split("_")[0]
        labels.append(word)

    if args.mean_ap:
        print(datetime.now())
        print("Calculating mean average precision")
        mean_ap, mean_prb, ap_dict = samediff.mean_average_precision(
            distances, labels)
        print("Mean average precision:", mean_ap)
        print("Mean precision-recall breakeven:", mean_prb)

    print(datetime.now())

    print("Calculating average precision")
    matches = samediff.generate_matches_array(labels)

    ap, prb = samediff.average_precision(distances[matches == True],
                                         distances[matches == False])
    print("Average precision:", ap)
    print("Precision-recall breakeven:", prb)

    print(datetime.now())
Example #20
def main():
    args = check_argv()

    print(datetime.now())

    print("Reading:", args.npz_fn)
    npz = np.load(args.npz_fn)

    print(datetime.now())

    print("Ordering embeddings")
    n_embeds = 0
    X = []
    ids = []
    for label in sorted(npz):
        ids.append(label)
        X.append(npz[label])
        n_embeds += 1
    X = np.array(X)
    print("No. embeddings:", n_embeds)
    print("Embedding dimensionality:", X.shape[1])

    if args.mvn:
        normed = (X - X.mean(axis=0)) / X.std(axis=0)
        X = normed

    print(datetime.now())

    print("Calculating distances")
    metric = args.metric
    if metric == "kl":
        import scipy.stats
        metric = scipy.stats.entropy
    distances = pdist(X, metric=metric)

    print(datetime.now())

    print("Getting labels and speakers")
    labels = []
    speakers = []
    for utt_id in ids:
        utt_id = utt_id.split("_")
        word = utt_id[0]
        speaker = utt_id[1]
        labels.append(word)
        speakers.append(speaker)

    if args.mean_ap:
        print(datetime.now())
        print("Calculating mean average precision")
        mean_ap, mean_prb, ap_dict = samediff.mean_average_precision(
            distances, labels)
        print("Mean average precision:", mean_ap)
        print("Mean precision-recall breakeven:", mean_prb)

    print(datetime.now())

    print("Calculating average precision")
    # matches = samediff.generate_matches_array(labels)  # Temp
    word_matches = samediff.generate_matches_array(labels)
    speaker_matches = samediff.generate_matches_array(speakers)
    print("No. same-word pairs:", sum(word_matches))
    print("No. same-speaker pairs:", sum(speaker_matches))

    sw_ap, sw_prb, swdp_ap, swdp_prb = samediff.average_precision_swdp(
        distances[np.logical_and(word_matches, speaker_matches)],
        distances[np.logical_and(word_matches, speaker_matches == False)],
        distances[word_matches == False])
    print("-" * 79)
    print("Average precision: {:.8f}".format(sw_ap))
    print("Precision-recall breakeven: {:.8f}".format(sw_prb))
    print("SWDP average precision: {:.8f}".format(swdp_ap))
    print("SWDP precision-recall breakeven: {:.8f}".format(swdp_prb))
    print("-" * 79)

    print(datetime.now())
Example #21
def train_cnn(options_dict):
    """Train and save a word classifier CNN."""

    # Preliminary

    logger.info(datetime.now())

    if not path.isdir(options_dict["model_dir"]):
        os.makedirs(options_dict["model_dir"])

    if "log_to_file" in options_dict and options_dict["log_to_file"] is True:
        log_fn = path.join(options_dict["model_dir"], "log")
        print "Writing:", log_fn
        root_logger = logging.getLogger()
        if len(root_logger.handlers) > 0:
            root_logger.removeHandler(root_logger.handlers[0])  # close open file handler
        logging.basicConfig(filename=log_fn, level=logging.DEBUG)
        # root_logger = logging.getLogger()
        # formatter = root_logger.handlers[0].formatter
        # root_logger.removeHandler(root_logger.handlers[0])
        # file_handler = logging.FileHandler(log_fn, "a")
        # file_handler.setFormatter(formatter)
        # root_logger.addHandler(file_handler)
    else:
        logging.basicConfig(level=logging.DEBUG)

    rng = np.random.RandomState(options_dict["rnd_seed"])
    if options_dict["dropout_rates"] is not None:
        srng = RandomStreams(seed=options_dict["rnd_seed"])
    else:
        srng = None


    # Load and format data

    # Load into shared variables
    datasets, word_to_i_map = data_io.load_swbd_labelled(rng, options_dict["data_dir"], options_dict["min_count"])
    train_x, train_y = datasets[0]
    dev_x, dev_y = datasets[1]
    test_x, test_y = datasets[2]

    # Get batch sizes and iterators
    class BatchIterator(object):
        def __init__(self, n_batches):
            self.n_batches = n_batches
        def __iter__(self):
            for i_batch in xrange(self.n_batches):
                yield [i_batch]
    n_train_batches = train_x.get_value(borrow=True).shape[0] / options_dict["batch_size"]
    n_dev_batches = dev_x.get_value(borrow=True).shape[0] / options_dict["batch_size"]
    n_test_batches = test_x.get_value(borrow=True).shape[0] / options_dict["batch_size"]
    train_batch_iterator = BatchIterator(n_train_batches)
    validate_batch_iterator = BatchIterator(n_dev_batches)
    test_batch_iterator = BatchIterator(n_test_batches)

    # Flatten data
    d_in = 39*200
    train_x = train_x.reshape((-1, d_in))
    dev_x = dev_x.reshape((-1, d_in))
    test_x = test_x.reshape((-1, d_in))
    d_out = len(word_to_i_map)
    options_dict["d_out"] = d_out

    # Save `options_dict`
    options_dict_fn = path.join(options_dict["model_dir"], "options_dict.pkl.gz")
    logger.info("Saving options: " + options_dict_fn)
    f = data_io.smart_open(options_dict_fn, "wb")
    pickle.dump(options_dict, f, -1)
    f.close()

    logger.info("Options: " + str(options_dict))


    # Setup model

    logger.info("Building CNN")

    # Symbolic variables
    i_batch = T.lscalar()   # batch index
    x = T.matrix("x")       # flattened data of shape (n_data, d_in)
    y = T.ivector("y")      # labels

    # Build model
    logger.info("No. of word type targets: " + str(options_dict["d_out"]))
    input_shape = (options_dict["batch_size"], 1, 39, 200)
    model = cnn.CNN(
        rng, x, input_shape, options_dict["conv_layer_specs"],
        options_dict["hidden_layer_specs"], options_dict["d_out"], srng,
        options_dict["dropout_rates"] 
        )
    if options_dict["dropout_rates"] is not None:
        loss = model.dropout_negative_log_likelihood(y)
    else:
        loss = model.negative_log_likelihood(y)
    error = model.errors(y)

    # Add regularization
    if options_dict["l1_weight"] > 0. or options_dict["l2_weight"] > 0.:
        loss = loss + options_dict["l1_weight"]*model.l1 + options_dict["l2_weight"]* model.l2

    # Compile test functions
    outputs = [error, loss]
    validate_model = theano.function(
        inputs=[i_batch],
        outputs=outputs,
        givens={
            x: dev_x[i_batch * options_dict["batch_size"]: (i_batch + 1) * options_dict["batch_size"]],
            y: dev_y[i_batch * options_dict["batch_size"]: (i_batch + 1) * options_dict["batch_size"]]
            }
        )
    test_model = theano.function(
        inputs=[i_batch],
        outputs=outputs,
        givens={
            x: test_x[i_batch * options_dict["batch_size"]: (i_batch + 1) * options_dict["batch_size"]],
            y: test_y[i_batch * options_dict["batch_size"]: (i_batch + 1) * options_dict["batch_size"]]
            }
        )

    # Gradients and training updates
    parameters = model.parameters
    gradients = T.grad(loss, parameters)
    learning_rule = options_dict["learning_rule"]
    if learning_rule["type"] == "adadelta":
        updates = training.learning_rule_adadelta(
            parameters, gradients, learning_rule["rho"], learning_rule["epsilon"]
            )
    elif learning_rule["type"] == "momentum":
        updates = training.learning_rule_momentum(
            parameters, gradients, learning_rule["learning_rate"], learning_rule["momentum"]
            )
    else:
        assert False, "Invalid learning rule: " + learning_rule["type"]

    # Compile training function
    train_model = theano.function(
        inputs=[i_batch],
        outputs=outputs,
        updates=updates,
        givens={
            x: train_x[i_batch * options_dict["batch_size"]: (i_batch + 1) * options_dict["batch_size"]],
            y: train_y[i_batch * options_dict["batch_size"]: (i_batch + 1) * options_dict["batch_size"]]
            },
        )


    # Train model

    logger.info("Training CNN")
    record_dict_fn = path.join(options_dict["model_dir"], "record_dict.pkl.gz")
    record_dict = training.train_fixed_epochs_with_validation(
        options_dict["n_max_epochs"],
        train_model=train_model,
        train_batch_iterator=train_batch_iterator,
        validate_model=validate_model,
        validate_batch_iterator=validate_batch_iterator,
        test_model=test_model,
        test_batch_iterator=test_batch_iterator,
        save_model_func=model.save,
        save_model_fn=path.join(options_dict["model_dir"], "model.pkl.gz"),
        record_dict_fn=record_dict_fn,
        )


    # Extrinsic evaluation

    # Pass data through model
    logger.info("Performing same-different evaluation")
    layers_output_dict = apply_layers.apply_layers(
        options_dict["model_dir"], "dev", batch_size=645, i_layer=options_dict["i_layer_eval"]
        )  # batch size covers 10965 out of 10966 tokens
    utt_ids = sorted(layers_output_dict.keys())
    embeddings = np.array([layers_output_dict[i] for i in utt_ids])
    labels = data_io.swbd_utts_to_labels(utt_ids)

    # Perform same-different
    distances = pdist(embeddings, metric="cosine")
    matches = samediff.generate_matches_array(labels)
    ap, prb = samediff.average_precision(distances[matches == True], distances[matches == False])
    logger.info("Validation average precision: " + str(ap))
    ap_fn = path.join(options_dict["model_dir"], "dev_ap.txt")
    with open(ap_fn, "w") as f:
        f.write(str(ap) + "\n")
Example #22

def train_siamese_triplets_cnn(options_dict):
    """Train and save a Siamese CNN using the specified options."""

    # Preliminary

    logger.info(datetime.now())

    if not path.isdir(options_dict["model_dir"]):
        os.makedirs(options_dict["model_dir"])

    if "log_to_file" in options_dict and options_dict["log_to_file"] is True:
        log_fn = path.join(options_dict["model_dir"], "log")
        print "Writing:", log_fn
        root_logger = logging.getLogger()
        if len(root_logger.handlers) > 0:
            root_logger.removeHandler(
                root_logger.handlers[0])  # close open file handler
        logging.basicConfig(filename=log_fn, level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.DEBUG)

    rng = np.random.RandomState(options_dict["rnd_seed"])
    if options_dict["dropout_rates"] is not None:
        srng = RandomStreams(seed=options_dict["rnd_seed"])
    else:
        srng = None

    options_dict_fn = path.join(options_dict["model_dir"],
                                "options_dict.pkl.gz")
    logger.info("Saving options: " + options_dict_fn)
    f = data_io.smart_open(options_dict_fn, "wb")
    pickle.dump(options_dict, f, -1)
    f.close()

    logger.info("Options: " + str(options_dict))

    # Load and format data

    # Load into shared variables
    datasets = data_io.load_swbd_same_diff(rng, options_dict["data_dir"])
    train_x, train_matches_vec, train_labels = datasets[0]
    dev_x, dev_matches_vec, dev_labels = datasets[1]
    test_x, test_matches_vec, test_labels = datasets[2]

    # Flatten data
    d_in = 39 * 200
    train_x = train_x.reshape((-1, d_in))
    dev_x = dev_x.reshape((-1, d_in))
    test_x = test_x.reshape((-1, d_in))

    # Make batch iterators
    train_batch_iterator = BatchIteratorTriplets(
        rng,
        train_matches_vec,
        options_dict["batch_size"],
        n_same_pairs=options_dict["n_same_pairs"],
        sample_diff_every_epoch=True)
    validate_batch_iterator = BatchIteratorTriplets(
        rng,
        dev_matches_vec,
        options_dict["batch_size"],
        n_same_pairs=options_dict["n_same_pairs"],
        sample_diff_every_epoch=False)
    test_batch_iterator = BatchIteratorTriplets(
        rng,
        test_matches_vec,
        options_dict["batch_size"],
        n_same_pairs=options_dict["n_same_pairs"],
        sample_diff_every_epoch=False)

    # Setup model

    logger.info("Building Siamese triplets CNN")

    # Symbolic variables
    x1 = T.matrix("x1")
    x2 = T.matrix("x2")
    x3 = T.matrix("x3")
    x1_indices = T.ivector("x1_indices")
    x2_indices = T.ivector("x2_indices")
    x3_indices = T.ivector("x3_indices")

    # Build model
    input_shape = (options_dict["batch_size"], 1, 39, 200)
    model = siamese.SiameseTripletCNN(
        rng,
        x1,
        x2,
        x3,
        input_shape,
        conv_layer_specs=options_dict["conv_layer_specs"],
        hidden_layer_specs=options_dict["hidden_layer_specs"],
        srng=srng,
        dropout_rates=options_dict["dropout_rates"],
    )
    if options_dict["loss"] == "hinge_cos":
        if options_dict["dropout_rates"] is not None:
            loss = model.dropout_loss_hinge_cos(options_dict["margin"])
        else:
            loss = model.loss_hinge_cos(options_dict["margin"])
        # error excludes regularization and dropout
        error = model.loss_hinge_cos(options_dict["margin"])
    else:
        assert False, "Invalid loss: " + options_dict["loss"]
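
    # The hinge cosine loss above is assumed to take the standard triplet form
    #   loss = mean(max(0, margin - cos(y1, y2) + cos(y1, y3)))
    # i.e. a same-word pair (y1, y2) must be at least `margin` more similar,
    # in cosine terms, than the corresponding different-word pair (y1, y3).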

    # Add regularization
    if options_dict["l1_weight"] > 0. or options_dict["l2_weight"] > 0.:
        loss = (loss + options_dict["l1_weight"] * model.l1
                + options_dict["l2_weight"] * model.l2)

    # Compile test functions
    # Track the distances of same and different pairs separately
    same_distance = model.cos_same()
    diff_distance = model.cos_diff()
    outputs = [error, loss, same_distance, diff_distance]
    theano_mode = theano.Mode(linker="cvm")
    validate_model = theano.function(
        inputs=[x1_indices, x2_indices, x3_indices],
        outputs=outputs,
        givens={
            x1: dev_x[x1_indices],
            x2: dev_x[x2_indices],
            x3: dev_x[x3_indices],
        },
        mode=theano_mode,
    )
    test_model = theano.function(
        inputs=[x1_indices, x2_indices, x3_indices],
        outputs=outputs,
        givens={
            x1: test_x[x1_indices],
            x2: test_x[x2_indices],
            x3: test_x[x3_indices],
        },
        mode=theano_mode,
    )

    # Gradients and training updates
    parameters = model.parameters
    gradients = T.grad(loss, parameters)
    learning_rule = options_dict["learning_rule"]
    if learning_rule["type"] == "adadelta":
        updates = training.learning_rule_adadelta(parameters, gradients,
                                                  learning_rule["rho"],
                                                  learning_rule["epsilon"])
    elif learning_rule["type"] == "momentum":
        updates = training.learning_rule_momentum(
            parameters, gradients, learning_rule["learning_rate"],
            learning_rule["momentum"])
    else:
        assert False, "Invalid learning rule: " + learning_rule["type"]
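
    # Adadelta (Zeiler, 2012) keeps running averages of squared gradients and
    # squared updates; the standard per-parameter update is
    #   E[g^2]  <- rho * E[g^2]  + (1 - rho) * g^2
    #   dx       = -sqrt(E[dx^2] + epsilon) / sqrt(E[g^2] + epsilon) * g
    #   E[dx^2] <- rho * E[dx^2] + (1 - rho) * dx^2
    # A standard momentum rule instead keeps a velocity
    #   v <- momentum * v - learning_rate * g, then adds v to the parameter.
    # The training helpers above are assumed to build the Theano equivalents
    # of these updates.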

    # Compile training function
    train_model = theano.function(
        inputs=[x1_indices, x2_indices, x3_indices],
        outputs=outputs,
        updates=updates,
        givens={
            x1: train_x[x1_indices],
            x2: train_x[x2_indices],
            x3: train_x[x3_indices],
        },
        mode=theano_mode,
    )

    # Train model

    logger.info("Training Siamese triplets CNN")
    record_dict_fn = path.join(options_dict["model_dir"], "record_dict.pkl.gz")
    record_dict = training.train_fixed_epochs_with_validation(
        options_dict["n_max_epochs"],
        train_model=train_model,
        train_batch_iterator=train_batch_iterator,
        validate_model=validate_model,
        validate_batch_iterator=validate_batch_iterator,
        test_model=test_model,
        test_batch_iterator=test_batch_iterator,
        save_model_func=model.save,
        save_model_fn=path.join(options_dict["model_dir"], "model.pkl.gz"),
        record_dict_fn=record_dict_fn,
    )

    # Extrinsic evaluation

    # Pass data through the model
    logger.info("Performing same-different evaluation")
    layers_output_dict = apply_layers.apply_layers(
        options_dict["model_dir"], "dev",
        batch_size=645)  # batch size covers 10965 out of 10966 tokens
    utt_ids = sorted(layers_output_dict.keys())
    embeddings = np.array([layers_output_dict[i] for i in utt_ids])
    labels = data_io.swbd_utts_to_labels(utt_ids)

    # Perform same-different
    distances = pdist(embeddings, metric="cosine")
    matches = samediff.generate_matches_array(labels)
    ap, prb = samediff.average_precision(distances[matches == True],
                                         distances[matches == False])
    logger.info("Validation average precision: " + str(ap))
    ap_fn = path.join(options_dict["model_dir"], "dev_ap.txt")
    with open(ap_fn, "w") as f:
        f.write(str(ap) + "\n")
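
For reference, a rough sketch of the kind of triplet sampling BatchIteratorTriplets is assumed to perform in the training function above: same-word pairs are drawn from the condensed matches vector and each anchor is paired with a randomly chosen different-word token. The function below is an illustrative assumption, not the iterator's actual implementation.

import numpy as np

def sample_triplets_sketch(rng, matches_vec, n_same_pairs, batch_size):
    """Illustrative triplet sampler over a condensed (pdist-ordered) matches vector."""
    # Expand the condensed boolean vector into a square same-word matrix
    n = int((1 + np.sqrt(1 + 8 * len(matches_vec))) / 2)
    matches = np.zeros((n, n), dtype=bool)
    matches[np.triu_indices(n, k=1)] = matches_vec
    matches = matches | matches.T
    same_i, same_j = np.where(np.triu(matches, k=1))  # all same-word pairs (i < j)
    np.fill_diagonal(matches, True)  # never draw the anchor itself as the "different" token
    pick = rng.choice(len(same_i), size=min(n_same_pairs, len(same_i)), replace=False)
    x1, x2 = same_i[pick], same_j[pick]
    x3 = np.array([rng.choice(np.where(~matches[i])[0]) for i in x1])
    for start in range(0, len(x1), batch_size):
        yield (x1[start:start + batch_size],
               x2[start:start + batch_size],
               x3[start:start + batch_size])
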
Example #23
def main():
    args = check_argv()

    print("Reading:", args.npz_fn)
    embeddings = np.load(args.npz_fn)

    # # Temp
    # data = {}
    # a = list(embeddings)
    # random.shuffle(a)
    # for key in a[:100]:
    #     data[key] = embeddings[key]
    # embeddings = data

    print("Ordering embeddings:")
    n_embeds = 0
    X = []
    utt_keys = []
    labels = []
    speakers = []
    for utt_key in tqdm(sorted(embeddings)):
        utt_keys.append(utt_key)
        X.append(embeddings[utt_key])
        label, speaker = utt_key.split("_")[:2]
        labels.append(label)
        speakers.append(speaker)
    X = np.array(X)
    print("No. embeddings:", X.shape[0])
    print("Embedding dimensionality:", X.shape[1])

    # Normalise
    normed = (X - X.mean(axis=0)) / X.std(axis=0)
    X = normed

    print("Calculating distances")
    distances = pdist(X, metric="cosine")

    # Plot: Matching words
    print("Getting word matches")
    word_matches = samediff.generate_matches_array(labels)
    print("Total no. pairs:", word_matches.shape[0])
    print("No. same-word pairs:", sum(word_matches))
    distances_pos_avg = np.mean(distances[word_matches == True])
    distances_neg_avg = np.mean(distances[word_matches == False])
    distances_pos_std = np.std(distances[word_matches == True])
    distances_neg_std = np.std(distances[word_matches == False])
    plt.figure()
    plt.bar([0, 1], [distances_neg_avg, distances_pos_avg],
            yerr=[distances_neg_std, distances_pos_std])
    plt.xticks([0, 1], ("No", "Yes"))
    plt.xlabel("Matching words")
    plt.ylabel("Cosine distance")
    plt.ylim([0, 1.2])

    # Plot: Same speakers
    print("Getting speaker matches")
    speaker_matches = samediff.generate_matches_array(speakers)
    print("No. same-speaker pairs:", sum(speaker_matches))
    distances_pos_avg = np.mean(distances[np.logical_and(
        word_matches, speaker_matches)])
    distances_neg_avg = np.mean(distances[np.logical_and(
        word_matches, speaker_matches == False)])
    distances_pos_std = np.std(distances[np.logical_and(
        word_matches, speaker_matches)])
    distances_neg_std = np.std(distances[np.logical_and(
        word_matches, speaker_matches == False)])
    # distances_pos_avg = np.mean(distances[speaker_matches == True])
    # distances_neg_avg = np.mean(distances[speaker_matches == False])
    # distances_pos_std = np.std(distances[speaker_matches == True])
    # distances_neg_std = np.std(distances[speaker_matches == False])
    plt.figure()
    plt.bar([0, 1], [distances_neg_avg, distances_pos_avg],
            yerr=[distances_neg_std, distances_pos_std])
    plt.xticks([0, 1], ("No", "Yes"))
    plt.xlabel("Matching speakers")
    plt.ylabel("Cosine distance")
    plt.ylim([0, 1.2])
    plt.title("Distances between same-word pairs")

    # Plot: Edit distances
    if args.pronunciation is not None:

        # Pronunciations
        pron_fn = path.join("lists", args.pronunciation, "dev.prons")
        print("Reading:", pron_fn)
        pronunciations = read_pronunciations(pron_fn)
        pron_labels = []
        for utt_key in utt_keys:
            pron_labels.append(pronunciations[utt_key])

        # Get distances
        print("Getting edit distances:")
        # edit_distances = editdistance_array(labels)
        edit_distances = editdistance_array(pron_labels)
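        # (a possible editdistance_array sketch is given after this function)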

        # Plot distances
        edits = sorted(set(edit_distances))
        averages = []
        stds = []
        for edit in edits:
            averages.append(np.mean(distances[edit_distances == edit]))
            stds.append(np.std(distances[edit_distances == edit]))
        plt.figure()
        plt.bar(edits, averages, yerr=stds)
        plt.ylim([0, 1.2])
        plt.xlabel("Phone edit distance")
        plt.ylabel("Cosine distance")

    plt.show()
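
editdistance_array is called in main() above but not defined in this listing; it presumably returns one edit distance per pair in the same condensed (i < j) ordering that pdist uses, so it can index distances directly. A possible sketch, with hypothetical names:

import numpy as np

def edit_distance(a, b):
    """Levenshtein distance between two sequences (e.g. phone lists)."""
    prev = list(range(len(b) + 1))
    for i, x in enumerate(a, 1):
        cur = [i]
        for j, y in enumerate(b, 1):
            cur.append(min(prev[j] + 1,              # deletion
                           cur[j - 1] + 1,           # insertion
                           prev[j - 1] + (x != y)))  # substitution / match
        prev = cur
    return prev[-1]

def editdistance_array_sketch(labels):
    """Pairwise edit distances in the same (i < j) order as scipy's pdist."""
    n = len(labels)
    return np.array([edit_distance(labels[i], labels[j])
                     for i in range(n - 1) for j in range(i + 1, n)])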