Example #1
    def get_feed_dict(self,
                      word_idx_seqs,
                      task,
                      label_seqs=None,
                      lr=None,
                      dropout=None):
        word_idx_seqs = [list(word_idxs) for word_idxs in word_idx_seqs]
        word_ids, sequence_lengths = pad_sequences(word_idx_seqs, 0)

        # build feed dictionary
        feed = {
            self.word_idxs: word_ids,
            self.sequence_lengths: sequence_lengths
        }

        if label_seqs is not None:
            label_seqs = [list(labels) for labels in label_seqs]
            labels, _ = pad_sequences(label_seqs, 0)
            if task == 'src1':
                feed[self.labels_src1] = labels
            elif task == 'src2':
                feed[self.labels_src2] = labels
            else:
                feed[self.labels_tar] = labels

        feed[self.lr] = lr
        feed[self.dropout] = dropout

        return feed, sequence_lengths
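
Every variant of get_feed_dict in these examples leans on a pad_sequences helper that pads a batch of id sequences to a common length and also returns the original lengths. The project's own helper (pad_sequences / utils.pad_sequences) is not shown and its signature varies between examples; a minimal sketch of the behavior assumed here:

def pad_sequences(sequences, pad_tok):
    """Pad every sequence to the length of the longest one in the batch.

    Returns (padded_sequences, original_lengths), which is the pair the
    feed-dict builders above unpack. Only a sketch of the assumed helper;
    the real utils.pad_sequences in these projects may differ.
    """
    max_len = max(len(seq) for seq in sequences)
    padded, lengths = [], []
    for seq in sequences:
        seq = list(seq)
        lengths.append(len(seq))
        padded.append(seq + [pad_tok] * (max_len - len(seq)))
    return padded, lengths

# pad_sequences([[4, 12, 7], [9, 3]], 0) -> ([[4, 12, 7], [9, 3, 0]], [3, 2])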
Example #2
    def get_feed_dict(self,
                      word_idx_seqs,
                      label_seqs=None,
                      lr=None,
                      dropout=None,
                      manual_feat=None):
        word_idx_seqs = [list(word_idxs) for word_idxs in word_idx_seqs]
        word_ids, sequence_lengths = utils.pad_sequences(word_idx_seqs, 0)

        # print(len(word_ids))
        # build feed dictionary
        feed = {
            self.word_idxs: word_ids,
            self.sequence_lengths: sequence_lengths
        }

        if label_seqs is not None:
            label_seqs = [list(labels) for labels in label_seqs]
            labels, _ = utils.pad_sequences(label_seqs, 0)
            feed[self.labels] = labels

        if self.manual_feat is not None and manual_feat is not None:
            manual_feat, lens = utils.pad_feat_sequence(
                manual_feat, manual_feat[0].shape[1])
            feed[self.manual_feat] = manual_feat

        feed[self.lr] = lr
        feed[self.dropout] = dropout

        return feed, sequence_lengths
Example #3
    def get_feed_dict(self,
                      word_embeddings,
                      label_seqs=None,
                      lr=None,
                      dropout=None):
        word_embed_seqs, sequence_lengths = utils.pad_embed_sequences(
            word_embeddings, self.word_embed_pad)
        word_embed_seqs = np.array(word_embed_seqs, np.float32)
        # print(word_embed_seqs.shape)

        # print(len(word_ids))
        # build feed dictionary
        feed = {
            self.word_embeddings_input: word_embed_seqs,
            self.sequence_lengths: sequence_lengths
        }

        if label_seqs is not None:
            label_seqs = [list(labels) for labels in label_seqs]
            labels, _ = utils.pad_sequences(label_seqs, 0)
            feed[self.labels] = labels

        feed[self.lr] = lr
        feed[self.dropout] = dropout

        return feed, sequence_lengths
Example #4
    def get_feed_dict_ol(self,
                         embed_arr,
                         seq_lens,
                         lr,
                         dropout,
                         task,
                         label_seqs=None):
        feed = {
            self.word_embeddings_input: embed_arr,
            self.sequence_lengths: seq_lens
        }

        # if label_seqs is not None:
        #     feed[self.labels] = label_seqs

        if label_seqs is not None:
            label_seqs = [list(labels) for labels in label_seqs]
            labels, _ = utils.pad_sequences(label_seqs, 0)
            if task == 'src1':
                feed[self.labels_src1] = labels
            elif task == 'src2':
                feed[self.labels_src2] = labels
            else:
                feed[self.labels_tar] = labels

        feed[self.lr] = lr
        feed[self.dropout] = dropout
        return feed
Example #5
def ubuntu_data_load(params):
    '''
    Load the Ubuntu dialogue data and pad it into fixed-size arrays.

    :param params: dict that overrides the defaults below; "dataset_dir" is
        required, while "phase", "max_num_utterance" and "max_sentence_len"
        are optional
    :return: dict{str: numpy.ndarray} (in the training/validation phase each
        value is wrapped as {"data": ndarray, "type": str})
    '''
    default_params = {
        "dataset_dir": None,
        "phase": "training",
        "training_files": ["responses.pkl", "utterances.pkl"],
        "evaluate_files": ["Evaluate.pkl"],
        "max_num_utterance": 10,
        "max_sentence_len": 50
    }
    default_params.update(params)
    assert default_params["dataset_dir"] and os.path.exists(default_params["dataset_dir"])

    np_dtype = np.int64
    if default_params["phase"] in ["training", "validation"]:
        training_files = default_params["training_files"]
        with open(os.path.join(default_params["dataset_dir"], training_files[0]), 'rb') as f:
            actions = pickle.load(f)
        with open(os.path.join(default_params["dataset_dir"], training_files[1]), 'rb') as f:
            history, true_utt = pickle.load(f)

        # prepare tf dataset
        history, history_len = utils.multi_sequences_padding(history,
                                                             max_num_utterance=default_params["max_num_utterance"],
                                                             max_sentence_len=default_params["max_sentence_len"])
        true_utt_len = np.array(utils.get_sequences_length(true_utt, maxlen=default_params["max_sentence_len"]),
                                dtype=np_dtype)
        true_utt = np.array(pad_sequences(true_utt, default_params["max_sentence_len"], padding='post'),
                            dtype=np_dtype)
        actions_len = np.array(utils.get_sequences_length(actions, maxlen=default_params["max_sentence_len"]), dtype=np_dtype)
        actions = np.array(pad_sequences(actions, default_params["max_sentence_len"], padding='post'),
                           dtype=np_dtype)
        history, history_len = np.array(history, dtype=np_dtype), np.array(history_len, dtype=np_dtype)

        return {
            "history": {"data": history, "type": "normal"},
            "history_len": {"data": history_len, "type": "normal"},
            "true_utt": {"data": true_utt, "type": "normal"},
            "true_utt_len": {"data": true_utt_len, "type": "normal"},
            "actions": {"data": actions, "type": "share"},
            "actions_len": {"data": actions_len, "type": "share"}
        }
    else:
        evaluate_files = default_params["evaluate_files"]
        with open(os.path.join(default_params["dataset_dir"], evaluate_files[0]), 'rb') as f:
            history, true_utt, labels = pickle.load(f)
        history, history_len = utils.multi_sequences_padding(history,
                                                             max_num_utterance=default_params["max_num_utterance"],
                                                             max_sentence_len=default_params["max_sentence_len"])
        true_utt_len = np.array(utils.get_sequences_length(true_utt, maxlen=default_params["max_sentence_len"]),
                                dtype=np_dtype)
        true_utt = np.array(pad_sequences(true_utt, default_params["max_sentence_len"], padding='post'),
                            dtype=np_dtype)
        history, history_len = np.array(history, dtype=np_dtype), np.array(history_len, dtype=np_dtype)
        labels = np.array(labels, dtype=np_dtype)

        return {
            "history": history,
            "history_len": history_len,
            "true_utt": true_utt,
            "true_utt_len": true_utt_len,
            "labels": labels
        }
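
A minimal call-site sketch for ubuntu_data_load; "data/ubuntu" is a hypothetical placeholder path, and only "dataset_dir" is strictly required since the other keys fall back to the defaults above:

# Hypothetical call site; the directory must contain responses.pkl and
# utterances.pkl for the training phase.
train_data = ubuntu_data_load({
    "dataset_dir": "data/ubuntu",
    "phase": "training",
    "max_num_utterance": 10,
    "max_sentence_len": 50,
})
history = train_data["history"]["data"]   # padded utterance ids, one row per dialogue
actions = train_data["actions"]["data"]   # candidate responses, tagged "share" above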
Example #6
    def numpy_process():
        max_batch_size = 100000
        np_cache_file = None
        results = {}
        keys = ["history_np_list", "history_len", "response_features_np_list", "true_utt", "true_utt_len", "labels",
                "history_bert_id_np_list", "history_bert_len", "history_bert_mask_np_list",
                "history_bert_segment_np_list", "history_alignment_np_list",
                "true_utt_bert_id", "true_utt_bert_len", "true_utt_bert_mask",
                "true_utt_bert_segment", "true_utt_alignment"]

        if default_params["phase"] in ["training", "validation"]:
            np_cache_file = os.path.join(default_params["dataset_dir"], "numpy_training_" + training_files[0])
        else:
            np_cache_file = os.path.join(default_params["dataset_dir"], "numpy_evaluate_" + evaluate_files[0][5:])

        if os.path.isfile(np_cache_file):
            results = pickle.load(open(np_cache_file, 'rb'))
        required_keys = keys if use_bert_embeddings else keys[:6]
        if all(key in results for key in required_keys):
            pass
        else:
            inputs = process()
            if "history_np_list" not in results:
                history, history_len = utils.multi_sequences_padding(tqdm(inputs['c'], desc="Sequence Padding"),
                                                                     max_num_utterance=default_params[
                                                                         "max_num_utterance"],
                                                                     max_sentence_len=default_params[
                                                                         "max_sentence_len"])
                results["history_np_list"] = [np.array(history[i: i + max_batch_size], dtype=np_dtype) for i in
                                              range(0, len(history), max_batch_size)]
                results["history_len"] = np.array(history_len, dtype=np_dtype)
            if "response_features_np_list" not in results:
                feature_len = len(inputs["r_feature"][0][0][0])
                response_features, _ = utils.multi_sequences_padding(tqdm(inputs["r_feature"], desc="Feature Sequence Padding"),
                                                                     max_num_utterance=default_params["max_num_utterance"],
                                                                     max_sentence_len=default_params["max_sentence_len"],
                                                                     padding_element=[0] * feature_len)
                results["response_features_np_list"] = [
                    np.array(response_features[i:i + max_batch_size], dtype=np_float_dtype) for i in
                    range(0, len(response_features), max_batch_size)]

            if "true_utt_len" not in results: results["true_utt_len"] = np.array(utils.get_sequences_length(inputs['r'], maxlen=default_params["max_sentence_len"]), dtype=np_dtype)
            if "true_utt" not in results: results["true_utt"]= np.array(pad_sequences(inputs['r'], default_params["max_sentence_len"], padding='post'), dtype=np_dtype)
            if "labels" not in results: results["labels"]= np.array(inputs['y'], dtype=np_dtype)

            if use_bert_embeddings:
                if "history_bert_id_np_list" not in results:
                    history_bert_id, history_bert_len = utils.multi_sequences_padding(
                        tqdm(inputs["c_bert"]["id"], desc="Bert Sequence Padding"),
                        max_num_utterance=default_params["max_num_utterance"],
                        max_sentence_len=default_params["bert_max_sentence_len"])
                    results["history_bert_id_np_list"] = [
                        np.array(history_bert_id[i: i + max_batch_size], dtype=np_dtype)
                        for i in range(0, len(history_bert_id), max_batch_size)]
                    results["history_bert_len"] = np.array(history_bert_len, dtype=np_dtype)
                if "history_bert_mask_np_list" not in results:
                    history_bert_mask, _ = utils.multi_sequences_padding(
                        tqdm(inputs["c_bert"]["mask"], desc="Bert Mask Padding"),
                        max_num_utterance=default_params["max_num_utterance"],
                        max_sentence_len=default_params["bert_max_sentence_len"])
                    results["history_bert_mask_np_list"] = [
                        np.array(history_bert_mask[i: i + max_batch_size], dtype=np_dtype)
                        for i in range(0, len(history_bert_mask), max_batch_size)]
                if "history_bert_segment_np_list" not in results:
                    history_bert_segment, _ = utils.multi_sequences_padding(
                        tqdm(inputs["c_bert"]["segment"], desc="Bert Segment Padding"),
                        max_num_utterance=default_params["max_num_utterance"],
                        max_sentence_len=default_params["bert_max_sentence_len"])
                    results["history_bert_segment_np_list"] = [
                        np.array(history_bert_segment[i: i + max_batch_size], dtype=np_dtype)
                        for i in range(0, len(history_bert_segment), max_batch_size)]
                if "history_alignment_np_list" not in results:
                    history_alignment, _ = utils.multi_sequences_padding(
                        tqdm(inputs["c_bert"]["alignment"], desc="Alignment Padding"),
                        max_num_utterance=default_params["max_num_utterance"],
                        max_sentence_len=default_params["max_sentence_len"])
                    results["history_alignment_np_list"] = [
                        np.array(history_alignment[i: i + max_batch_size], dtype=np_dtype)
                        for i in range(0, len(history_alignment), max_batch_size)]

                if "true_utt_bert_id" not in results: results["true_utt_bert_id"] = np.array(pad_sequences(inputs["r_bert"]["id"], default_params["bert_max_sentence_len"], padding='post'), dtype=np_dtype)
                if "true_utt_bert_len" not in results: results["true_utt_bert_len"] = np.array(utils.get_sequences_length(inputs["r_bert"]["id"], default_params["bert_max_sentence_len"]), dtype=np_dtype)
                if "true_utt_bert_mask" not in results: results["true_utt_bert_mask"] = np.array(pad_sequences(inputs["r_bert"]["mask"], default_params["bert_max_sentence_len"], padding='post'), dtype=np_dtype)
                if "true_utt_bert_segment" not in results: results["true_utt_bert_segment"] = np.array(pad_sequences(inputs["r_bert"]["segment"], default_params["bert_max_sentence_len"], padding='post'), dtype=np_dtype)
                if "true_utt_alignment" not in results: results["true_utt_alignment"] = np.array(pad_sequences(inputs["r_bert"]["alignment"], default_params["max_sentence_len"], padding='post'), dtype=np_dtype)

            with open(np_cache_file, 'wb') as f:
                pickle.dump(results, f)

        results["history"] = np.concatenate(results["history_np_list"], axis=0)
        results["response_features"] = np.concatenate(results["response_features_np_list"], axis=0)
        if use_bert_embeddings:
            results["history_bert_id"] = np.concatenate(results["history_bert_id_np_list"], axis=0)
            results["history_bert_mask"] = np.concatenate(results["history_bert_mask_np_list"], axis=0)
            results["history_bert_segment"] = np.concatenate(results["history_bert_segment_np_list"], axis=0)
            results["history_alignment"] = np.concatenate(results["history_alignment_np_list"], axis=0)

        return results
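
numpy_process follows a compute-once-then-cache pattern: padded arrays are stored per key in a dict, pickled to np_cache_file, and on later runs only the missing keys are recomputed. A stripped-down, self-contained sketch of that pattern (the cache file name and builder callables are hypothetical):

import os
import pickle

def load_or_build(cache_file, builders):
    """builders maps a result key to a zero-argument callable that computes it.

    Keys already present in the pickle cache are reused; only the missing ones
    are rebuilt, and the cache is rewritten afterwards. This mirrors the
    per-key checks in numpy_process without the data-specific code.
    """
    results = {}
    if os.path.isfile(cache_file):
        with open(cache_file, 'rb') as f:
            results = pickle.load(f)
    missing = [key for key in builders if key not in results]
    for key in missing:
        results[key] = builders[key]()
    if missing:
        with open(cache_file, 'wb') as f:
            pickle.dump(results, f)
    return results

# Hypothetical usage:
# data = load_or_build("cache.pkl", {"history": lambda: pad_history(raw_history)})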
Example #7
def main():

    # word vector
    embedding_vec = None
    ## model testing doesn't need data
    if TEST_MODE != 1:
        if TEST_MODE == 2:
            print 'INTO TEST_MODE 2'
        ## load the dataset
        print 'Load data...'
        X_train, y_train_sent, y_train_chunk, train_av, train_lex, train_en,\
          X_val, y_val_sent, y_val_chunk, val_av, val_lex, val_en,\
            X_test, y_test_sent, y_test_chunk, test_av, test_lex, test_en,\
              word_dict, tag_dict, label_dict = load_data()

        # read wordvec file
        if wordvec_init == True:
            embedding_vec = numpy.empty((0, WORD_DIM), float)
            wordvec_dict = cPickle.load(open(WORD_VEC_FILE, 'rb'))
            for i in range(VOCA_SIZE):
                if wordvec_dict.has_key(i):
                    embedding_vec = numpy.append(embedding_vec,
                                                 wordvec_dict[i].reshape(
                                                     1, WORD_DIM),
                                                 axis=0)
                else:
                    embedding_vec = numpy.append(
                        embedding_vec,
                        lasagne.random.get_rng().normal(0.0,
                                                        0.01,
                                                        size=(1, WORD_DIM)),
                        axis=0)
    #print embedding_vec.shape, type(embedding_vec)
    #print embedding_vec[0][:10], len(embedding_vec[0])
        print 'Ok.'
    else:
        print 'INTO TEST_MODE 1'

    ### model inputs
    sents = theano.tensor.itensor3('sents')
    bigrams = theano.tensor.itensor3('bigrams')
    masks = theano.tensor.imatrix('masks')
    av_features = theano.tensor.itensor3('av')
    lex_features = theano.tensor.imatrix('lex')
    en_features = theano.tensor.itensor3('en')
    #masks_seg = theano.tensor.imatrix('masks_seg')
    lr = theano.tensor.fscalar('lr')

    ### model target outputs
    # chunk label target
    chunk_labels = theano.tensor.imatrix()

    ### model target outputs
    def expandMatrix(X, dim=None, exflag=True):
        ## exflag: has padding or not.
        # (batch size, max length) -> (batch size, max length, dim)
        _eye = None
        if exflag:
            _x = theano.tensor.eye(dim)
            _y = theano.tensor.zeros((1, dim))
            _eye = theano.tensor.concatenate([_y, _x], axis=0)
        else:
            _eye = theano.tensor.eye(dim)
        return theano.tensor.cast(_eye[X], dtype='int32')

    chunk_targets = expandMatrix(chunk_labels, N_CHUNK_LABEL)

    ## build model
    print 'Build model...'

    # the model has two outputs
    chunk_out = build_model(sents, bigrams, av_features, lex_features,
                            en_features, masks, chunk_labels, embedding_vec)

    # whether or not use trained model
    if reuse_mode == True:
        with numpy.load('model_best_c_test.npz') as f:
            param_values = [f['arr_%d' % i] for i in range(len(f.files))]
        lasagne.layers.set_all_param_values(chunk_out, param_values)

    # (batch size, )
    chunk_label_mass = lasagne.layers.get_output(chunk_out)
    chunk_loss = theano.tensor.mean(chunk_label_mass[0] /
                                    theano.tensor.sum(masks, axis=1))
    chunk_label_chain = chunk_label_mass[1] * masks

    # l2 penalty
    loss_penalty = lasagne.regularization.regularize_layer_params(
        chunk_out, lasagne.regularization.l2) * LAMBDA

    loss = chunk_loss + loss_penalty

    all_params = lasagne.layers.get_all_params(chunk_out, trainable=True)
    #print all_params

    ## set constraints for Tag Inference Layer
    tag_inference_layer_params = chunk_out.get_params()
    init_tran = tag_inference_layer_params[0]
    tran = tag_inference_layer_params[1]
    halt_tran = tag_inference_layer_params[2]

    def l1_unit_norm(p):
        epsilon = 10e-8
        p = p * theano.tensor.cast(p >= 0., 'float64')
        return p / (epsilon + theano.tensor.sum(p, axis=-1, keepdims=True))

    constraints = {
        init_tran: l1_unit_norm,
        tran: l1_unit_norm,
        halt_tran: l1_unit_norm
    }

    updates = adagrad_norm(loss,
                           all_params,
                           learning_rate=lr,
                           constraints=constraints)

    ## for validation and test
    chunk_pred_label_mass = lasagne.layers.get_output(chunk_out,
                                                      deterministic=True)
    chunk_pred_loss = theano.tensor.mean(chunk_pred_label_mass[0] /
                                         theano.tensor.sum(masks, axis=1))
    chunk_pred_label_chain = chunk_pred_label_mass[1] * masks

    val_loss = chunk_pred_loss

    # for train
    train_fn = theano.function([
        sents, bigrams, av_features, lex_features, en_features, masks,
        chunk_labels, lr
    ],
                               loss,
                               updates=updates)
    # loss, updates=updates)
    # validation or test
    val_fn = theano.function([
        sents, bigrams, av_features, lex_features, en_features, masks,
        chunk_labels
    ], [val_loss, chunk_pred_label_chain])

    if TEST_MODE == 1:
        print 'MODEL BUILDING PASS.'
        sys.exit()

    print 'Ok.'

    best_val_c_f1 = 0
    best_val = 0
    best_test_c_f1 = 0

    data = {'best_val_c_f1': [], 'best_val': [], 'best_test_c_f1': []}

    lr_decayed = numpy.float32(LR)

    train_losses = []

    # Finally, launch the training loop.
    print "Starting training..."
    for epoch in range(NUM_EPOCHS):
        print 'Epoch', epoch

        progbar = generic_utils.Progbar(X_train.shape[0])

        # In each epoch, we do a full pass over the training data:
        train_err = 0
        train_batchs = 0
        start_time = time.time()
        for batch in iterate_minibatches(X_train,
                                         y_train_sent,
                                         y_train_chunk,
                                         train_av,
                                         train_lex,
                                         train_en,
                                         N_BATCH,
                                         shuffle=True):
            inputs, bigrams, av_features, lex_features, en_features, masks, sentiment_targets, chunk_targets = batch
            err = train_fn(inputs, bigrams, av_features, lex_features,
                           en_features, masks, chunk_targets, lr_decayed)
            train_err += err
            train_batchs += 1

            progbar.add(inputs.shape[0], values=[('train loss', err)])
        train_loss = train_err / train_batchs
        # Decrease learning rate if no improvement was seen over last 3 times.
        if len(train_losses) > 3 and train_loss > max(train_losses[-3:]):
            lr_decayed = numpy.float32(lr_decayed * 0.5)

        train_losses.append(train_loss)

        val_cf1 = 0
        # And a full pass over the val data:
        inputs = utils.pad_sequences(X_val, MAX_LENGTH)
        masks = numpy.int32(
            numpy.ones_like(inputs) * (1 - numpy.equal(inputs, 0)))
        inputscw, bigrams = contextwin_bigram(inputs, WINDOW_SIZE)
        chunk_targets = utils.pad_sequences(y_val_chunk, MAX_LENGTH)
        val_av_batch = [
            utils.pad_matrix(val_av[j], sent_maxlen=MAX_LENGTH, feature_dim=5)
            for j in range(len(val_av))
        ]
        val_lex_batch = utils.pad_sequences(val_lex, MAX_LENGTH)
        val_en_batch = [
            utils.pad_matrix(val_en[j], sent_maxlen=MAX_LENGTH, feature_dim=2)
            for j in range(len(val_en))
        ]
        val_err, val_chunk_label = val_fn(inputscw, bigrams, val_av_batch,
                                          val_lex_batch, val_en_batch, masks,
                                          chunk_targets)

        if BIO_C_FLAG:
            c_res_val = utils.cwsEalve(inputs, chunk_targets, val_chunk_label,
                                       word_dict, tag_dict, VAL_GS_FILE,
                                       DICTIONARY, False)
            val_cf1 = c_res_val

        if best_val <= (val_cf1):
            best_val = val_cf1
            best_val_c_f1 = val_cf1

            test_cf1 = 0
            # And a full pass over the test data:
            inputs = utils.pad_sequences(X_test, MAX_LENGTH)
            masks = numpy.int32(
                numpy.ones_like(inputs) * (1 - numpy.equal(inputs, 0)))
            inputscw, bigrams = contextwin_bigram(inputs, WINDOW_SIZE)
            chunk_targets = utils.pad_sequences(y_test_chunk, MAX_LENGTH)
            test_av_batch = [
                utils.pad_matrix(test_av[j],
                                 sent_maxlen=MAX_LENGTH,
                                 feature_dim=5) for j in range(len(test_av))
            ]
            test_lex_batch = utils.pad_sequences(test_lex, MAX_LENGTH)
            test_en_batch = [
                utils.pad_matrix(test_en[j],
                                 sent_maxlen=MAX_LENGTH,
                                 feature_dim=2) for j in range(len(test_en))
            ]
            test_err, test_chunk_label = val_fn(inputscw, bigrams,
                                                test_av_batch, test_lex_batch,
                                                test_en_batch, masks,
                                                chunk_targets)

            if BIO_C_FLAG:
                # c_res_val = utils.cwsEalve(inputs, chunk_targets, test_chunk_label,
                #     word_dict, tag_dict, TEST_GS_FILE, DICTIONARY, False)
                utils.save_test(inputs, chunk_targets, test_chunk_label,
                                word_dict, tag_dict, TEST_GS_FILE, DICTIONARY,
                                False)
                #best_test_c_f1 = c_res_val

            #numpy.savez('model_best_c_test.npz', *lasagne.layers.get_all_param_values(chunk_out))
            # utils.id2original(inputs, chunk_targets,
            #     utils.pad_sequences(y_test_sent, MAX_LENGTH, value=1), test_chunk_label,
            #     utils.pad_sequences(y_test_sent, MAX_LENGTH, value=1),
            #         word_dict=word_dict, tag_dict=tag_dict, label_dict=label_dict,
            #             output_file='test_c_result.txt')

        # Then we print the results for this epoch:
        print "val: %.4f c_f1: %.4f best: %.4f test: %.4f c_f1: %.4f" \
        %(val_err, val_cf1, best_val, test_err, best_test_c_f1)

        data['best_val_c_f1'].append(best_val_c_f1)
        data['best_val'].append(best_val)
        data['best_test_c_f1'].append(best_test_c_f1)
        data['args'] = args_hash
        cPickle.dump(data, open('result_data.pkl', 'wb'))
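
The loop above halves the learning rate whenever an epoch's training loss is worse than all of the previous three epochs. That schedule in isolation, as a small self-contained sketch (the function name and default arguments are assumptions):

def decay_on_plateau(previous_losses, current_loss, lr, factor=0.5, window=3):
    """Multiply lr by `factor` when current_loss is worse than every one of the
    last `window` recorded losses, mirroring the check in the training loop."""
    if len(previous_losses) > window and current_loss > max(previous_losses[-window:]):
        lr *= factor
    return lr

# decay_on_plateau([1.0, 1.0, 1.0, 1.0], 1.2, 0.1) -> 0.05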
Example #8
def iterate_minibatches(inputs,
                        sent_targets,
                        chunk_targets,
                        train_av,
                        train_lex,
                        train_en,
                        batchsize,
                        shuffle=False):

    assert len(inputs) == len(sent_targets)
    assert len(inputs) == len(chunk_targets)

    def gene_mask(X):
        return numpy.int32(numpy.ones_like(X) * (1 - numpy.equal(X, 0)))

    index_set = [[] for _ in BUCKTES]
    for i in range(len(inputs)):
        x = inputs[i]
        for bucket_id, (min_size, max_size) in enumerate(BUCKTES):
            if len(x) >= min_size and len(x) < max_size:
                index_set[bucket_id].append(i)
                break

    if index_set and len(index_set) > 1:  # user set multi buckets
        for i in range(len(index_set)):
            bucket_max_length = BUCKTES[i][1] - 1
            index_bucket = index_set[i]
            if len(index_bucket) == 0:  # empty buckets
                #print 'empty'
                continue
            #print index_bucket
            index_bucket_shuffled = index_bucket[:]
            if shuffle:
                lasagne.random.get_rng().shuffle(index_bucket_shuffled)
            for start_idx in range(0, len(index_bucket), batchsize):
                end_idx = min(start_idx + batchsize, len(index_bucket))
                if shuffle:
                    excerpt = index_bucket_shuffled[start_idx:end_idx]
                else:
                    excerpt = index_bucket[start_idx:end_idx]

                sents_one_batch = utils.pad_sequences(
                    [inputs[j] for j in excerpt], bucket_max_length)
                masks_one_batch = gene_mask(sents_one_batch)
                sentscw_one_batch = contextwin(sents_one_batch, WINDOW_SIZE)
                sent_targets_one_batch = utils.pad_sequences(
                    [sent_targets[j] for j in excerpt], bucket_max_length)
                chunk_targets_one_batch = utils.pad_sequences(
                    [chunk_targets[j] for j in excerpt], bucket_max_length)

                yield sentscw_one_batch, masks_one_batch, sent_targets_one_batch, chunk_targets_one_batch

    else:  # only one default bucket: (0, max_length) or None buckets at all
        if shuffle:
            indices = numpy.arange(len(inputs))
            lasagne.random.get_rng().shuffle(indices)
            #print indices
        for start_idx in range(0, len(inputs), batchsize):
            end_idx = min(start_idx + batchsize, len(inputs))
            if shuffle:
                excerpt = indices[start_idx:end_idx]
            else:
                excerpt = range(start_idx, end_idx)

            sents_one_batch = utils.pad_sequences([inputs[j] for j in excerpt],
                                                  MAX_LENGTH)
            masks_one_batch = gene_mask(sents_one_batch)
            sentscw_one_batch, bigram_one_batch = contextwin_bigram(
                sents_one_batch, WINDOW_SIZE)
            train_av_one_batch = [
                utils.pad_matrix(train_av[j],
                                 sent_maxlen=MAX_LENGTH,
                                 feature_dim=5) for j in excerpt
            ]
            train_lex_one_batch = utils.pad_sequences(
                [train_lex[j] for j in excerpt], MAX_LENGTH)
            train_en_one_batch = [
                utils.pad_matrix(train_en[j],
                                 sent_maxlen=MAX_LENGTH,
                                 feature_dim=2) for j in excerpt
            ]
            sent_targets_one_batch = utils.pad_sequences(
                [sent_targets[j] for j in excerpt], MAX_LENGTH)
            chunk_targets_one_batch = utils.pad_sequences(
                [chunk_targets[j] for j in excerpt], MAX_LENGTH)

            #yield sentscw_one_batch, masks_one_batch, masks_seg_one_batch, sent_targets_one_batch, chunk_targets_one_batch
            yield sentscw_one_batch, bigram_one_batch, train_av_one_batch, train_lex_one_batch, train_en_one_batch, masks_one_batch, \
            sent_targets_one_batch, chunk_targets_one_batch
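
iterate_minibatches first assigns every sentence index to a length bucket (BUCKTES, defined elsewhere, is a list of (min_size, max_size) ranges) so each batch is padded only to its bucket's upper bound instead of the global MAX_LENGTH. The bucketing step in isolation, with example boundaries that are assumptions:

# Example bucket boundaries; the project defines its own BUCKTES constant elsewhere.
BUCKETS = [(0, 10), (10, 25), (25, 50)]

def bucket_indices(sequences, buckets):
    """Group sequence indices by length range, as iterate_minibatches does."""
    index_set = [[] for _ in buckets]
    for i, seq in enumerate(sequences):
        for bucket_id, (min_size, max_size) in enumerate(buckets):
            if min_size <= len(seq) < max_size:
                index_set[bucket_id].append(i)
                break
    return index_set

# bucket_indices([[1, 2], [3] * 12, [4] * 30], BUCKETS) -> [[0], [1], [2]]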
Example #9
    def numpy_process():
        max_batch_size = 100000
        np_cache_file = None
        if default_params["phase"] in ["training", "validation"]:
            np_cache_file = os.path.join(
                default_params["dataset_dir"],
                "numpy_" + training_files[0].rsplit(".", 1)[0] + ".pkl")
        else:
            np_cache_file = os.path.join(
                default_params["dataset_dir"],
                "numpy_" + evaluate_files[0].rsplit(".", 1)[0] + ".pkl")

        if os.path.isfile(np_cache_file):
            context_id_np_list, context_len, context_mask_np_list, context_segment_np_list, \
            response_features_np_list, response_id, response_len, response_mask, response_segment, labels = pickle.load(
                open(np_cache_file, 'rb'))
        else:
            inputs = bert_process()
            context_id, context_len = utils.multi_sequences_padding(
                tqdm(inputs["context_id"], desc="Sequence Padding"),
                max_num_utterance=max_num_utterance,
                max_sentence_len=max_sentence_len)
            context_mask, _ = utils.multi_sequences_padding(
                tqdm(inputs["context_mask"], desc="Sequence Mask Padding"),
                max_num_utterance=max_num_utterance,
                max_sentence_len=max_sentence_len)
            context_segment, _ = utils.multi_sequences_padding(
                tqdm(inputs["context_segment"],
                     desc="Sequence Segment Padding"),
                max_num_utterance=max_num_utterance,
                max_sentence_len=max_sentence_len)
            feature_len = len(inputs["r_feature"][0][0][0])
            response_features, _ = utils.multi_sequences_padding(
                tqdm(inputs["r_feature"], desc="Feature Sequence Padding"),
                max_num_utterance=default_params["max_num_utterance"],
                max_sentence_len=default_params["max_sentence_len"],
                padding_element=[0] * feature_len)

            context_id_np_list = [
                np.array(context_id[i:i + max_batch_size], dtype=np_dtype)
                for i in range(0, len(context_id), max_batch_size)
            ]
            context_len = np.array(context_len, dtype=np_dtype)
            context_mask_np_list = [
                np.array(context_mask[i:i + max_batch_size], dtype=np_dtype)
                for i in range(0, len(context_mask), max_batch_size)
            ]
            context_segment_np_list = [
                np.array(context_segment[i:i + max_batch_size], dtype=np_dtype)
                for i in range(0, len(context_segment), max_batch_size)
            ]
            response_features_np_list = [
                np.array(response_features[i:i + max_batch_size],
                         dtype=np_float_dtype)
                for i in range(0, len(response_features), max_batch_size)
            ]

            response_id = np.array(pad_sequences(inputs["response_id"],
                                                 max_sentence_len,
                                                 padding='post'),
                                   dtype=np_dtype)
            response_len = np.array(inputs["response_len"], dtype=np_dtype)
            response_mask = np.array(pad_sequences(inputs["response_mask"],
                                                   max_sentence_len,
                                                   padding='post'),
                                     dtype=np_dtype)
            response_segment = np.array(pad_sequences(
                inputs["response_segment"], max_sentence_len, padding='post'),
                                        dtype=np_dtype)
            labels = np.array(inputs["labels"], dtype=np_dtype)

            with open(np_cache_file, 'wb') as f:
                pickle.dump([
                    context_id_np_list, context_len, context_mask_np_list,
                    context_segment_np_list, response_features_np_list,
                    response_id, response_len, response_mask, response_segment,
                    labels
                ], f)

        context_id = np.concatenate(context_id_np_list, axis=0)
        context_mask = np.concatenate(context_mask_np_list, axis=0)
        context_segment = np.concatenate(context_segment_np_list, axis=0)
        response_features = np.concatenate(response_features_np_list, axis=0)

        return [
            context_id, context_len, context_mask, context_segment,
            response_features, response_id, response_len, response_mask,
            response_segment, labels
        ]
Example #10
    for curr_epoch in range(num_epochs):
        train_cost = train_ler = train_cost1 = train_ler1 = 0
        start = time.time()

        for batch in range(num_batches_per_epoch):

            # Getting the index
            indexes = [
                i % num_examples
                for i in range(batch * batch_size, (batch + 1) * batch_size)
            ]

            batch_train_inputs = train_inputs[indexes]
            # Padding input to max_time_step of this batch
            batch_train_inputs, batch_train_seq_len = pad_sequences(
                batch_train_inputs)

            # Convert to sparse representation in order to feed the SparseTensor input
            batch_train_targets = sparse_tuple_from(train_targets[indexes])

            feed = {
                inputs: batch_train_inputs,
                targets: batch_train_targets,
                seq_len: batch_train_seq_len
            }

            train_ler1_part = session.run(ler, feed_dict=feed) * batch_size
            train_ler1 += train_ler1_part
            train_cost1_part = session.run(cost, feed) * batch_size
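
The two metric evaluations above run the graph once per tensor; session.run can fetch several tensors in a single pass. A hedged rewrite of that block in the snippet's own names (the train_cost1 accumulation is assumed, since the excerpt is cut off before it):

# Fetch both metrics in one session.run call instead of one call per tensor.
batch_cost, batch_ler = session.run([cost, ler], feed_dict=feed)
train_cost1 += batch_cost * batch_size  # assumed accumulation, mirroring train_ler1
train_ler1 += batch_ler * batch_size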