def run_cnn(exp_name,
            dataset,
            embedding,
            log_fn,
            perf_fn,
            emb_dm=100,
            batch_size=100,
            filter_hs=[1, 2, 3],
            hidden_units=[200, 100, 11],
            type_hidden_units=[200, 100, 6],
            dropout_rate=0.5,
            shuffle_batch=True,
            n_epochs=300,
            lr_decay=0.95,
            activation=ReLU,
            sqr_norm_lim=9,
            non_static=True,
            print_freq=5,
            sen_reg=False,
            L2=False):
    """
    Train and Evaluate CNN event encoder model
    :dataset: list containing three elements[(train_x, train_y), 
            (valid_x, valid_y), (test_x, test_y)]
    :embedding: word embedding with shape (|V| * emb_dm)
    :filter_hs: filter height for each paralle cnn layer
    :dropout_rate: dropout rate for full connected layers
    :n_epochs: the max number of iterations
    
    """
    start_time = timeit.default_timer()
    rng = np.random.RandomState(1234)

    input_height = len(dataset[0][0][0][0])
    num_sens = len(dataset[0][0][0])
    print "--input height ", input_height
    input_width = emb_dm
    num_maps = hidden_units[0]

    ###################
    # start snippet 1 #
    ###################
    print "start to construct the model ...."
    x = T.tensor3("x")
    type_y = T.ivector("y_type")
    pop_y = T.ivector("y_pop")

    words = shared(value=np.asarray(embedding, dtype=theano.config.floatX),
                   name="embedding",
                   borrow=True)

    # define function to keep padding vector as zero
    zero_vector_tensor = T.vector()
    zero_vec = np.zeros(input_width, dtype=theano.config.floatX)
    set_zero = function([zero_vector_tensor],
                        updates=[(words,
                                  T.set_subtensor(words[0, :],
                                                  zero_vector_tensor))])
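    # set_zero is called after every gradient update in the training loop below so
    # that row 0 of the embedding matrix, reserved for the padding token, stays a
    # zero vector even though the embeddings themselves are trained (non_static).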

    layer0_input = words[T.cast(x.flatten(), dtype="int32")].reshape(
        (x.shape[0] * x.shape[1], 1, x.shape[2], emb_dm))

    #########################
    # Construct Sen Vec #####
    #########################
    conv_layers = []
    filter_shape = (num_maps, 1, filter_hs[0], emb_dm)
    pool_size = (input_height - filter_hs[0] + 1, 1)
    conv_layer = nn.ConvPoolLayer(rng,
                                  input=layer0_input,
                                  input_shape=None,
                                  filter_shape=filter_shape,
                                  pool_size=pool_size,
                                  activation=activation)
    sen_vecs = conv_layer.output.reshape((x.shape[0], x.shape[1], num_maps))
    conv_layers.append(conv_layer)

    ########################
    ## Task 1: population ##
    ########################
    pop_layer_sizes = zip(hidden_units, hidden_units[1:])
    pop_layer_input = sen_vecs
    pop_drop_input = sen_vecs
    pop_hidden_outs = []
    pop_drop_outs = []
    pop_hidden_layers = []
    pop_drop_layers = []
    droprate = 0.5
    for layer_size in pop_layer_sizes[:-1]:
        U_value = np.random.random(layer_size).astype(theano.config.floatX)
        b_value = np.zeros((layer_size[-1], ), dtype=theano.config.floatX)

        U = theano.shared(U_value, borrow=True, name="U")
        b = theano.shared(b_value, borrow=True, name="b")

        pop_hidden_layer = nn.HiddenLayer(rng, pop_layer_input, layer_size[0],
                                          layer_size[1], ReLU,
                                          U * (1 - droprate), b)
        pop_drop_hidden_layer = nn.DropoutHiddenLayer(rng, pop_drop_input,
                                                      layer_size[0],
                                                      layer_size[1], ReLU,
                                                      droprate, U, b)

        pop_hidden_layers.append(pop_hidden_layer)
        pop_drop_layers.append(pop_drop_hidden_layer)

        pop_hidden_out = pop_hidden_layer.output
        pop_drop_out = pop_drop_hidden_layer.output

        pop_layer_input = pop_hidden_out
        pop_drop_input = pop_drop_out

        pop_hidden_outs.append(pop_hidden_out)
        pop_drop_outs.append(pop_drop_out)

    # construct pop classifier
    n_in, n_out = pop_layer_sizes[-1]
    W_value = np.random.random((n_in, n_out)).astype(theano.config.floatX)
    b_value = np.zeros((n_out, ), dtype=theano.config.floatX)

    pop_W = theano.shared(W_value, borrow=True, name="pop_W")
    pop_b = theano.shared(b_value, borrow=True, name="pop_b")

    pop_act = T.dot(pop_hidden_outs[-1], pop_W * (1 - droprate)) + pop_b
    pop_drop_act = T.dot(pop_drop_outs[-1], pop_W) + pop_b

    #pop_max_act = T.max(pop_act, axis=1).flatten(2)
    #pop_drop_max_act = T.max(pop_drop_act, axis=1).flatten(2)
    pop_sum_act = T.sum(pop_act, axis=1).flatten(2)
    pop_drop_sum_act = T.sum(pop_drop_act, axis=1).flatten(2)

    pop_sen_max = T.argmax(T.max(pop_act, axis=2).flatten(2), axis=1)
    pop_drop_sen_max = T.argmax(T.max(pop_drop_act, axis=2).flatten(2), axis=1)

    #pop_probs = T.nnet.softmax(pop_max_act)
    #pop_drop_probs = T.nnet.softmax(pop_drop_max_act)

    pop_probs = T.nnet.softmax(pop_sum_act)
    pop_drop_probs = T.nnet.softmax(pop_drop_sum_act)

    pop_y_pred = T.argmax(pop_probs, axis=1)
    pop_drop_y_pred = T.argmax(pop_drop_probs, axis=1)

    pop_neg_loglikelihood = -T.mean(
        T.log(pop_probs)[T.arange(pop_y.shape[0]), pop_y])
    pop_drop_neg_loglikelihood = -T.mean(
        T.log(pop_drop_probs)[T.arange(pop_y.shape[0]), pop_y])
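    # negative log-likelihood of the population labels over the mini-batch:
    #   NLL = -1/N * sum_i log P(pop_y_i | doc_i),
    # where P(. | doc_i) is the softmax over the sentence-summed activations above.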

    pop_errors = T.mean(T.neq(pop_y_pred, pop_y))
    pop_errors_detail = T.neq(pop_y_pred, pop_y)

    pop_cost = pop_neg_loglikelihood
    pop_drop_cost = pop_drop_neg_loglikelihood

    ########################
    ## Task 2: event type ##
    ########################
    type_layer_sizes = zip(type_hidden_units, type_hidden_units[1:])
    type_layer_input = sen_vecs
    type_drop_input = sen_vecs
    type_hidden_outs = []
    type_drop_outs = []
    type_hidden_layers = []
    type_drop_layers = []
    droprate = 0.5
    for layer_size in type_layer_sizes[:-1]:
        U_value = np.random.random(layer_size).astype(theano.config.floatX)
        b_value = np.zeros((layer_size[-1], ), dtype=theano.config.floatX)

        U = theano.shared(U_value, borrow=True, name="U")
        b = theano.shared(b_value, borrow=True, name="b")

        type_hidden_layer = nn.HiddenLayer(rng, type_layer_input,
                                           layer_size[0], layer_size[1], ReLU,
                                           U * (1 - droprate), b)
        type_drop_hidden_layer = nn.DropoutHiddenLayer(rng, type_drop_input,
                                                       layer_size[0],
                                                       layer_size[1], ReLU,
                                                       droprate, U, b)

        type_hidden_layers.append(type_hidden_layer)
        type_drop_layers.append(type_drop_hidden_layer)

        type_hidden_out = type_hidden_layer.output
        type_drop_out = type_drop_hidden_layer.output

        type_layer_input = type_hidden_out
        type_drop_input = type_drop_out

        type_hidden_outs.append(type_hidden_out)
        type_drop_outs.append(type_drop_out)

    # construct the type classifier
    n_in, n_out = type_layer_sizes[-1]
    W_value = np.random.random((n_in, n_out)).astype(theano.config.floatX)
    b_value = np.zeros((n_out, ), dtype=theano.config.floatX)

    type_W = theano.shared(W_value, borrow=True, name="type_W")
    type_b = theano.shared(b_value, borrow=True, name="type_b")

    type_act = T.dot(type_hidden_outs[-1], type_W * (1 - droprate)) + type_b
    type_drop_act = T.dot(type_drop_outs[-1], type_W) + type_b

    #type_max_act = T.max(type_act, axis=1).flatten(2)
    #type_drop_max_act = T.max(type_drop_act, axis=1).flatten(2)

    type_sum_act = T.sum(type_act, axis=1).flatten(2)
    type_drop_sum_act = T.sum(type_drop_act, axis=1).flatten(2)

    type_sen_max = T.argmax(T.max(type_act, axis=2).flatten(2), axis=1)
    type_drop_sen_max = T.argmax(T.max(type_drop_act, axis=2).flatten(2),
                                 axis=1)

    #type_probs = T.nnet.softmax(type_max_act)
    #type_drop_probs = T.nnet.softmax(type_drop_max_act)

    type_probs = T.nnet.softmax(type_sum_act)
    type_drop_probs = T.nnet.softmax(type_drop_sum_act)

    type_y_pred = T.argmax(type_probs, axis=1)
    type_drop_y_pred = T.argmax(type_drop_probs, axis=1)

    type_neg_loglikelihood = -T.mean(
        T.log(type_probs)[T.arange(type_y.shape[0]), type_y])
    type_drop_neg_loglikelihood = -T.mean(
        T.log(type_drop_probs)[T.arange(type_y.shape[0]), type_y])

    type_errors = T.mean(T.neq(type_y_pred, type_y))
    type_errors_detail = T.neq(type_y_pred, type_y)

    type_cost = type_neg_loglikelihood
    type_drop_cost = type_drop_neg_loglikelihood

    ############################################
    ## Choose the max sentence for both tasks ##
    ############################################
    pop_drop_choosed_sens = sen_vecs[T.arange(sen_vecs.shape[0]),
                                     pop_drop_sen_max]
    type_drop_choosed_sens = sen_vecs[T.arange(sen_vecs.shape[0]),
                                      type_drop_sen_max]
    simi_drop_cost = T.mean(
        T.exp(
            T.sum((pop_drop_choosed_sens - type_drop_choosed_sens)**2,
                  axis=1)))

    pop_choosed_sens = sen_vecs[T.arange(sen_vecs.shape[0]), pop_sen_max]
    type_choosed_sens = sen_vecs[T.arange(sen_vecs.shape[0]), type_sen_max]
    simi_cost = T.mean(
        T.exp(T.sum((pop_choosed_sens - type_choosed_sens)**2, axis=1)))
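    # simi_cost grows with the squared distance between the sentence vectors that
    # the two tasks attend to most strongly; when sen_reg is enabled it is added
    # to the objective below, encouraging both tasks to focus on similar sentences.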

    ##################################
    # Collect all the parameters #####
    ##################################
    params = []
    # convolution layer params
    for conv_layer in conv_layers:
        params += conv_layer.params

    # params for population task
    for layer in pop_drop_layers:
        params += layer.params

    params.append(pop_W)
    params.append(pop_b)

    # params for event type task
    for layer in type_drop_layers:
        params += layer.params

    params.append(type_W)
    params.append(type_b)

    if non_static:
        params.append(words)

    total_cost = pop_cost + type_cost
    total_drop_cost = pop_drop_cost + type_drop_cost

    if sen_reg:
        simi_weight = 0.05
        total_cost += simi_weight * simi_cost
        total_drop_cost += simi_drop_cost
    if L2:
        l2_norm = 0.1 * T.sum(pop_W**2) + 0.1 * T.sum(type_W**2)
        for drop_layer in type_drop_layers:
            l2_norm += 0.1 * T.sum(drop_layer.W**2)

        for drop_layer in pop_drop_layers:
            l2_norm += 0.1 * T.sum(drop_layer.W**2)
        total_cost += l2_norm
        total_drop_cost += l2_norm

    total_grad_updates = sgd_updates_adadelta(params, total_drop_cost,
                                              lr_decay, 1e-6, sqr_norm_lim)
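    # sgd_updates_adadelta (defined elsewhere in this repo) is assumed to build
    # AdaDelta updates for all parameters: lr_decay acts as the rho decay rate,
    # 1e-6 as the epsilon term, and sqr_norm_lim as a max-norm cap on the weights.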

    total_preds = [pop_y_pred, type_y_pred]
    total_errors_details = [pop_errors_detail, type_errors_detail]
    total_choosed_sens = [pop_sen_max, type_sen_max]
    total_out = total_preds + total_errors_details + total_choosed_sens

    #####################
    # Construct Dataset #
    #####################
    print "Copy data to GPU and constrct train/valid/test func"
    np.random.seed(1234)

    train_x, train_pop_y, train_type_y = shared_dataset(dataset[0])
    valid_x, valid_pop_y, valid_type_y = shared_dataset(dataset[1])
    test_x, test_pop_y, test_type_y = shared_dataset(dataset[2])

    n_train_batches = int(np.ceil(1.0 * len(dataset[0][0]) / batch_size))
    n_valid_batches = int(np.ceil(1.0 * len(dataset[1][0]) / batch_size))
    n_test_batches = int(np.ceil(1.0 * len(dataset[2][0]) / batch_size))

    #####################
    # Train model func #
    #####################
    index = T.iscalar()
    train_func = function(
        [index],
        total_drop_cost,
        updates=total_grad_updates,
        givens={
            x: train_x[index * batch_size:(index + 1) * batch_size],
            pop_y: train_pop_y[index * batch_size:(index + 1) * batch_size],
            type_y: train_type_y[index * batch_size:(index + 1) * batch_size]
        })

    valid_train_func = function(
        [index],
        total_drop_cost,
        updates=total_grad_updates,
        givens={
            x: valid_x[index * batch_size:(index + 1) * batch_size],
            pop_y: valid_pop_y[index * batch_size:(index + 1) * batch_size],
            type_y: valid_type_y[index * batch_size:(index + 1) * batch_size]
        })

    test_pred_detail = function(
        [index],
        total_out,
        givens={
            x: test_x[index * batch_size:(index + 1) * batch_size],
            pop_y: test_pop_y[index * batch_size:(index + 1) * batch_size],
            type_y: test_type_y[index * batch_size:(index + 1) * batch_size]
        })

    # apply early stop strategy
    patience = 100
    patience_increase = 2
    improvement_threshold = 1.005

    n_valid = len(dataset[1][0])
    n_test = len(dataset[2][0])

    epoch = 0
    best_params = None
    best_validation_score = 0.
    test_perf = 0

    done_loop = False

    log_file = open(log_fn, 'w')

    print "Start to train the model....."

    total_score = 0.0
    while (epoch < n_epochs) and not done_loop:
        start_time = timeit.default_timer()
        epoch += 1
        costs = []
        for minibatch_index in np.random.permutation(range(n_train_batches)):
            cost_epoch = train_func(minibatch_index)
            costs.append(cost_epoch)
            set_zero(zero_vec)

        # do validation
        valid_cost = [
            valid_train_func(i)
            for i in np.random.permutation(xrange(n_valid_batches))
        ]

        if epoch % print_freq == 0:
            # do test
            pop_preds = []
            type_preds = []
            pop_errors = []
            type_errors = []
            pop_sens = []
            type_sens = []

            for i in xrange(n_test_batches):
                test_pop_pred, test_type_pred, test_pop_error, test_type_error, test_pop_sen, test_type_sen = test_pred_detail(
                    i)

                pop_preds.append(test_pop_pred)
                type_preds.append(test_type_pred)
                pop_errors.append(test_pop_error)
                type_errors.append(test_type_error)
                pop_sens.append(test_pop_sen)
                type_sens.append(test_type_sen)

            pop_preds = np.concatenate(pop_preds)
            type_preds = np.concatenate(type_preds)
            pop_errors = np.concatenate(pop_errors)
            type_errors = np.concatenate(type_errors)
            pop_sens = np.concatenate(pop_sens)
            type_sens = np.concatenate(type_sens)

            pop_perf = 1 - np.mean(pop_errors)
            type_perf = 1 - np.mean(type_errors)

            # dump the predictions and the chosen sentences
            with open(
                    os.path.join(perf_fn,
                                 "%s_%d.pop_pred" % (exp_name, epoch)),
                    'w') as epf:
                for p in pop_preds:
                    epf.write("%d\n" % int(p))

            with open(
                    os.path.join(perf_fn,
                                 "%s_%d.type_pred" % (exp_name, epoch)),
                    'w') as epf:
                for p in type_preds:
                    epf.write("%d\n" % int(p))
            print pop_sens
            with open(
                    os.path.join(perf_fn,
                                 "%s_%d.pop_sens" % (exp_name, epoch)),
                    'w') as epf:
                for s in pop_sens:
                    epf.write("%d\n" % int(s))

            with open(
                    os.path.join(perf_fn,
                                 "%s_%d.type_sens" % (exp_name, epoch)),
                    'w') as epf:
                for s in type_sens:
                    epf.write("%d\n" % int(s))

            message = "Epoch %d test pop perf %f, type perf %f, training_cost %f" % (
                epoch, pop_perf, type_perf, np.mean(costs))
            print message
            log_file.write(message + "\n")
            log_file.flush()

            if (pop_perf + type_perf) > total_score:
                total_score = pop_perf + type_perf
                # save the model
                model_name = os.path.join(
                    perf_fn, "%s_%d.best_model" % (exp_name, epoch))
                with open(model_name, 'wb') as mn:
                    for param in params:
                        cPickle.dump(param.get_value(), mn)

        end_time = timeit.default_timer()
        print "Finish one iteration using %f m" % (
            (end_time - start_time) / 60.)

    # output the final model params
    print "Output the final model"
    model_name = os.path.join(perf_fn, "%s_%d.final_model" % (exp_name, epoch))
    with open(model_name, 'wb') as mn:
        for param in params:
            cPickle.dump(param.get_value(), mn)

    log_file.flush()
    log_file.close()
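
# For reference, a minimal sketch of what the shared_dataset helper used above is
# assumed to do (the real helper lives elsewhere in this repo; the name suffix and
# the exact return signature here are assumptions):
def shared_dataset_sketch(data_xyz, borrow=True):
    """Move one (x, pop_y, type_y) split into Theano shared variables so that
    mini-batches can be sliced on the GPU through the `givens` dictionaries."""
    data_x, data_pop_y, data_type_y = data_xyz
    shared_x = theano.shared(np.asarray(data_x, dtype=theano.config.floatX),
                             borrow=borrow)
    shared_pop_y = theano.shared(np.asarray(data_pop_y,
                                            dtype=theano.config.floatX),
                                 borrow=borrow)
    shared_type_y = theano.shared(np.asarray(data_type_y,
                                             dtype=theano.config.floatX),
                                  borrow=borrow)
    # labels are stored as floats on the GPU and cast back to int32 for indexing
    return shared_x, T.cast(shared_pop_y, 'int32'), T.cast(shared_type_y, 'int32')
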
Example #2
    def run_experiment(self, dataset, word_embedding, exp_name):

        # load parameters
        num_maps_word = self.options["num_maps_word"]
        drop_rate_word = self.options["drop_rate_word"]
        drop_rate_sentence = self.options["drop_rate_sentence"]
        word_window = self.options["word_window"]
        word_dim = self.options["word_dim"]
        k_max_word = self.options["k_max_word"]
        batch_size = self.options["batch_size"]
        rho = self.options["rho"]
        epsilon = self.options["epsilon"]
        norm_lim = self.options["norm_lim"]
        max_iteration = self.options["max_iteration"]

        sentence_len = len(dataset[0][0][0][0])

        # compute the sentence flags
        train_flags, test_flags = construct_sentence_flag(dataset)
        train_flags = theano.shared(value=np.asarray(train_flags, dtype=theano.config.floatX), borrow=True)
        test_flags = theano.shared(value=np.asarray(test_flags, dtype=theano.config.floatX), borrow=True)


        # define the parameters
        x = T.tensor3("x")
        y = T.ivector("y")
        sen_flags = T.matrix("flag")
        rng = np.random.RandomState(1234)

        words = theano.shared(value=np.asarray(word_embedding,
            dtype=theano.config.floatX),
            name="embedding", borrow=True)
        zero_vector_tensor = T.vector()
        zero_vec = np.zeros(word_dim, dtype=theano.config.floatX)
        set_zero = theano.function([zero_vector_tensor], updates=[(words, T.set_subtensor(words[0,:], zero_vector_tensor))])

        x_emb = words[T.cast(x.flatten(), dtype="int32")].reshape((x.shape[0]*x.shape[1], 1, x.shape[2], words.shape[1]))

        dropout_x_emb = nn.dropout_from_layer(rng, x_emb, drop_rate_word)

        # compute convolution on words layer
        word_filter_shape = (num_maps_word, 1, word_window, word_dim)
        word_pool_size = (sentence_len - word_window + 1, 1)
        dropout_word_conv = nn.ConvPoolLayer(rng,
                input=dropout_x_emb,
                input_shape=None,
                filter_shape=word_filter_shape,
                pool_size=word_pool_size,
                activation=Tanh,
                k=k_max_word)
        sent_vec_dim = num_maps_word*k_max_word
        dropout_sent_vec = dropout_word_conv.output.reshape((x.shape[0] * x.shape[1], sent_vec_dim))
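        # ConvPoolLayer with k=k_max_word is assumed to apply k-max pooling, so each
        # sentence is represented by the k largest activations of every feature map,
        # i.e. a vector of size num_maps_word * k_max_word.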

        word_conv = nn.ConvPoolLayer(rng,
                input=dropout_x_emb*(1 - drop_rate_word),
                input_shape=None,
                filter_shape=word_filter_shape,
                pool_size=word_pool_size,
                activation=Tanh,
                k=k_max_word,
                W=dropout_word_conv.W,
                b=dropout_word_conv.b)
        sent_vec = word_conv.output.reshape((x.shape[0] * x.shape[1], sent_vec_dim))

        # construct sentence level classifier
        n_in = sent_vec_dim
        n_out = 1
        sen_W_values = np.zeros((n_in, n_out), dtype=theano.config.floatX)
        sen_W = theano.shared(value=sen_W_values, borrow=True, name="logis_W")
        sen_b_value = nn.as_floatX(0.0)
        sen_b = theano.shared(value=sen_b_value, borrow=True, name="logis_b")

        drop_sent_prob = T.nnet.sigmoid(T.dot(dropout_sent_vec, sen_W) + sen_b)
        sent_prob = T.nnet.sigmoid(T.dot(sent_vec, sen_W*(1-drop_rate_sentence)) + sen_b)
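        # the dropout branches (dropout_sent_vec with the unscaled sen_W) are used
        # for training, while the deterministic branches rescale the dropped-out
        # inputs/weights by (1 - drop rate) to approximate dropout at test time.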

        # reshape the sentence probabilities back to the document level
        drop_sent_prob = drop_sent_prob.reshape((x.shape[0], x.shape[1]))
        sent_prob = sent_prob.reshape((x.shape[0], x.shape[1]))
        # the positive bag-label probability is the average of the sentence probabilities
        drop_doc_prob = T.sum(drop_sent_prob * sen_flags, axis=1) / T.sum(sen_flags, axis=1)
        doc_prob = T.sum(sent_prob * sen_flags, axis=1) / T.sum(sen_flags, axis=1)

        drop_doc_prob = T.clip(drop_doc_prob, nn.as_floatX(1e-7), nn.as_floatX(1 - 1e-7 ))
        doc_prob = T.clip(doc_prob, nn.as_floatX(1e-7), nn.as_floatX(1 - 1e-7 ))
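        # e.g. a 3-sentence document with sentence probabilities [0.9, 0.2, 0.1]
        # and all flags set gets bag probability (0.9 + 0.2 + 0.1) / 3 = 0.4;
        # padded sentences (flag 0) are excluded from the average.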
        """
        # the pos probability bag label equals to 1 - all negative
        drop_doc_prob = T.prod(drop_sent_prob, axis=1)
        drop_doc_prob = T.set_subtensor(drop_doc_prob[:,1], 1 - drop_doc_prob[:,0])

        doc_prob = T.prod(sent_prob, axis=1)
        doc_prob = T.set_subtensor(doc_prob[:,1], 1 - doc_prob[:,0])

        # the pos probability bag label is the most positive probability
        drop_doc_prob = T.max(drop_sent_prob, axis=1)
        drop_doc_prob = T.clip(drop_doc_prob, nn.as_floatX(1e-7), nn.as_floatX(1 - 1e-7 ))
        doc_prob = T.max(sent_prob, axis=1)
        doc_prob = T.clip(doc_prob, nn.as_floatX(1e-7), nn.as_floatX(1 - 1e-7 ))
        """

        doc_preds = doc_prob > 0.5

        # instance level cost
        drop_sent_cost = T.sum(
            T.maximum(0.0,
                      nn.as_floatX(.5) -
                      T.sgn(drop_sent_prob.reshape((x.shape[0] * x.shape[1], n_out)) -
                            nn.as_floatX(0.6)) *
                      T.dot(dropout_sent_vec, sen_W)) *
            sen_flags.reshape((x.shape[0] * x.shape[1], n_out))) / T.sum(sen_flags)

        # we require the most positive instance to score at least 0.7 in positive
        # bags and at most 0.1 in negative bags; the number of positive instances
        # should be at least ... and none of the positive instances should fall
        # in the negative bags
        
        # compute the number of positive instance
        positive_count = T.sum((drop_sent_prob * sen_flags) > 0.5, axis=1)
        pos_cost = T.maximum(nn.as_floatX(0.0), nn.as_floatX(2) - positive_count)
        neg_cost = T.maximum(nn.as_floatX(0.0), positive_count)
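        # positive_count is the number of sentences per bag scoring above 0.5;
        # pos_cost penalizes positive bags with fewer than 2 such sentences,
        # neg_cost penalizes any such sentence in a negative bag (see penal_cost).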
        
        """
        most_positive_prob = T.max(drop_sent_prob, axis=1)
        pos_cost = T.maximum(0.0, nn.as_floatX(0.6) - most_positive_prob)
        neg_cost = T.maximum(0.0, most_positive_prob - nn.as_floatX(0.05))
        """
        penal_cost = T.mean(pos_cost * y + neg_cost * (nn.as_floatX(1.0) - y))

        # add the sentence similarity constraint
        sen_sen = T.dot(dropout_sent_vec, dropout_sent_vec.T)
        sen_sqr = T.sum(dropout_sent_vec ** 2, axis=1)
        sen_sqr_left = sen_sqr.dimshuffle(0, 'x')
        sen_sqr_right = sen_sqr.dimshuffle('x', 0)
        # pairwise squared Euclidean distances: ||a||^2 - 2*a.b + ||b||^2
        sen_sim_matrix = sen_sqr_left - 2 * sen_sen + sen_sqr_right
        sen_sim_matrix = T.exp(-1 * sen_sim_matrix)

        sen_sim_prob = drop_sent_prob.reshape((x.shape[0]*x.shape[1], 1)) - drop_sent_prob.flatten()
        sen_sim_prob = sen_sim_prob ** 2

        sen_sim_flag = T.dot(sen_flags.reshape((x.shape[0] * x.shape[1], 1)),
                             sen_flags.reshape((1, x.shape[0] * x.shape[1])))

        sen_sim_cost = T.sum(sen_sim_matrix * sen_sim_prob * sen_sim_flag) / T.sum(sen_sim_flag)
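        # sen_sim_cost is a smoothness constraint: sentence pairs with nearby
        # vectors (large RBF similarity) are pushed towards similar probabilities;
        # the outer product of flags masks out padded sentences.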


        # bag level cost
        drop_bag_cost = T.mean(
            -y * T.log(drop_doc_prob) * nn.as_floatX(0.6) -
            (1 - y) * T.log(1 - drop_doc_prob) * nn.as_floatX(0.4))
        #drop_cost = drop_bag_cost * nn.as_floatX(3.0) + drop_sent_cost + nn.as_floatX(2.0) * penal_cost
        drop_cost = (drop_bag_cost * nn.as_floatX(0.6) +
                     drop_sent_cost * nn.as_floatX(0.1) +
                     penal_cost * nn.as_floatX(0.5) +
                     sen_sim_cost * nn.as_floatX(0.0001))


        # collect parameters
        self.params.append(words)
        self.params += dropout_word_conv.params
        self.params.append(sen_W)
        self.params.append(sen_b)

        grad_updates = nn.sgd_updates_adadelta(self.params,
                drop_cost,
                rho,
                epsilon,
                norm_lim)

        # construct the shared dataset (mini-batches are drawn in random order
        # during training)
        train_x, train_y = nn.shared_dataset(dataset[0])
        test_x, test_y = nn.shared_dataset(dataset[1])
        test_cpu_y = dataset[1][1]

        n_train_batches = int(np.ceil(1.0 * len(dataset[0][0]) / batch_size))
        n_test_batches = int(np.ceil(1.0 * len(dataset[1][0]) / batch_size))

        # construct the train and test functions
        index = T.iscalar()
        train_func = theano.function([index], [drop_cost, drop_bag_cost, drop_sent_cost, penal_cost, sen_sim_cost], updates=grad_updates,
                givens={
                    x: train_x[index*batch_size:(index+1)*batch_size],
                    y: train_y[index*batch_size:(index+1)*batch_size],
                    sen_flags: train_flags[index*batch_size:(index+1)*batch_size]
                    })

        test_func = theano.function([index], doc_preds,
                givens={
                    x:test_x[index*batch_size:(index+1)*batch_size],
                    sen_flags: test_flags[index*batch_size:(index+1)*batch_size]
                    })

        get_train_sent_prob = theano.function([index], sent_prob,
                givens={
                    x:train_x[index*batch_size:(index+1)*batch_size]
                    })

        get_test_sent_prob = theano.function([index], sent_prob,
                givens={
                    x:test_x[index*batch_size:(index+1)*batch_size]
                    })

        epoch = 0
        best_score = 0


        log_file = open("./log/%s.log" % exp_name, 'w')

        while epoch <= max_iteration:
            start_time = timeit.default_timer()
            epoch += 1
            costs = []

            for minibatch_index in np.random.permutation(range(n_train_batches)):
                cost_epoch = train_func(minibatch_index)
                costs.append(cost_epoch)
                set_zero(zero_vec)

            total_train_cost, train_bag_cost, train_sent_cost, train_penal_cost, train_sim_cost = zip(*costs)
            print "Iteration %d, total_cost %f bag_cost %f sent_cost %f penal_cost %f sim cost %f\n" %  (epoch, np.mean(total_train_cost), np.mean(train_bag_cost), np.mean(train_sent_cost), np.mean(train_penal_cost), np.mean(train_sim_cost))

            if epoch % 1 == 0:
                test_preds = []
                for i in xrange(n_test_batches):
                    test_y_pred = test_func(i)
                    test_preds.append(test_y_pred)
                test_preds = np.concatenate(test_preds)
                test_score = 1 - np.mean(np.not_equal(test_cpu_y, test_preds))

                precision, recall, beta, support = precision_recall_fscore_support(test_cpu_y, test_preds, pos_label=1)

                if beta[1] > best_score or epoch % 5 == 0:
                    best_score = beta[1]
                    # save the sentence vectors
                    train_sens = [get_train_sent_prob(i) for i in range(n_train_batches)]
                    test_sens = [get_test_sent_prob(i) for i in range(n_test_batches)]

                    train_sens = np.concatenate(train_sens, axis=0)
                    test_sens = np.concatenate(test_sens, axis=0)

                    out_train_sent_file = "./results/%s_train_sent_%d.vec" % (exp_name, epoch)
                    out_test_sent_file = "./results/%s_test_sent_%d.vec" % (exp_name, epoch)

                    with open(out_test_sent_file, 'w') as test_f, open(out_train_sent_file, 'w') as train_f:
                        cPickle.dump(train_sens, train_f)
                        cPickle.dump(test_sens, test_f)
                    print "Get best performace at %d iteration %f" % (epoch, test_score)
                    log_file.write("Get best performance at %d iteration %f \n" % (epoch, test_score))

                end_time = timeit.default_timer()
                print "Iteration %d , precision, recall, f1" % epoch, precision, recall, beta
                log_file.write("Iteration %d, neg precision %f, pos precision %f, neg recall %f pos recall %f , neg f1 %f, pos f1 %f, total_cost %f bag_cost %f sent_cost %f penal_cost %f\n" % (epoch, precision[0], precision[1], recall[0], recall[1], beta[0], beta[1], np.mean(total_train_cost), np.mean(train_bag_cost), np.mean(train_sent_cost), np.mean(train_penal_cost)))
                print "Using time %f m" % ((end_time -start_time)/60.)
                log_file.write("Uing time %f m\n" % ((end_time - start_time)/60.))
            end_time = timeit.default_timer()
            print "Iteration %d Using time %f m" % ( epoch, (end_time -start_time)/60.)
            log_file.write("Uing time %f m\n" % ((end_time - start_time)/60.))
            log_file.flush()

        log_file.close()
Example #3
def construct_model(params, datasets, filter_hs=[3, 4, 5], batch_size=200):
    rng = np.random.RandomState(1234)
    input_height = len(datasets[0][0]) - 2
    input_width = params["embedding"].shape[1]
    filter_shapes = [p[0].shape for p in params["convs"]]
    pool_sizes = [(input_height - s[2] + 1, input_width - s[3] + 1)
                  for s in filter_shapes]

    param_sizes = {
        "input_height": input_height,
        "input_width": input_width,
        "filter_shapes": filter_shapes,
        "pool_sizes": pool_sizes
    }

    print "Param sizes: ", param_sizes
    index = T.iscalar()
    x = T.matrix('x')
    y = T.ivector('y')

    print '....Construct model'
    word_embedding = params["embedding"]
    words = shared(word_embedding, name='embedding')
    layer0_input = words[T.cast(x.flatten(), dtype="int32")].reshape(\
            (x.shape[0], 1, x.shape[1], words.shape[1]))
    # construct layers
    conv_layers = []
    conv_params = params["convs"]
    layer1_inputs = []
    for i, filter_h in enumerate(filter_hs):
        filter_shape = filter_shapes[i]
        pool_size = pool_sizes[i]
        conv_W = shared(value=np.asarray(conv_params[i][0],
                                         dtype=theano.config.floatX),
                        borrow=True,
                        name='conv_W')
        conv_b = shared(value=np.asarray(conv_params[i][1],
                                         dtype=theano.config.floatX),
                        borrow=True,
                        name='conv_b')
        conv_layer = nn.ConvPoolLayer(rng,
                                      input=layer0_input,
                                      input_shape=(batch_size, 1, input_height,
                                                   input_width),
                                      filter_shape=filter_shape,
                                      pool_size=pool_size,
                                      activation=ReLU,
                                      W=conv_W,
                                      b=conv_b)
        conv_layers.append(conv_layer)
        layer1_input = conv_layer.output.flatten(2)
        layer1_inputs.append(layer1_input)

    layer1_input = T.concatenate(layer1_inputs, 1)
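    # each parallel conv layer contributes one pooled feature vector per document;
    # concatenating them along axis 1 gives the classifier input of dimension
    # (number of feature maps) * len(filter_hs).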

    # population classifier
    pop_hidden_units = [300, 13]
    clf_w, clf_b = params["clf"]
    Ws = [
        shared(value=np.asarray(clf_w, dtype=theano.config.floatX),
               borrow=True,
               name='logis_w')
    ]
    bs = [
        shared(value=np.asarray(clf_b, dtype=theano.config.floatX),
               borrow=True,
               name='logis_b')
    ]

    pop_classifier = nn.MLPDropout(rng,
                                   input=layer1_input,
                                   layer_sizes=pop_hidden_units,
                                   dropout_rates=[0.5],
                                   activations=[ReLU],
                                   Ws=Ws,
                                   bs=bs)

    pop_loss = pop_classifier.errors(y)
    pop_pred = pop_classifier.preds

    # construct data set
    if datasets[0].shape[0] % batch_size > 0:
        extra_data_num = batch_size - datasets[0].shape[0] % batch_size
        train_set = np.random.permutation(datasets[0])
        extra_data = train_set[:extra_data_num]
        new_data = np.append(datasets[0], extra_data, axis=0)
    else:
        new_data = datasets[0]

    new_data = np.random.permutation(new_data)
    n_batches = new_data.shape[0] / batch_size
    n_train_batches = int(np.round(n_batches * 0.9))
    train_set = new_data[:n_train_batches * batch_size, :]
    train_set_x = theano.shared(np.asarray(train_set[:, :input_height],
                                           dtype=theano.config.floatX),
                                borrow=True)
    train_set_pop_y = T.cast(
        theano.shared(np.asarray(train_set[:, -2], dtype=theano.config.floatX),
                      borrow=True), 'int32')

    print '...construct test function'
    test_fn = function(
        inputs=[index],
        outputs=[pop_loss, pop_pred],
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_pop_y[index * batch_size:(index + 1) * batch_size]
        })

    results = [test_fn(i) for i in xrange(n_train_batches)]
    pop_losses = [r[0] for r in results]
    pop_train_perf = 1 - np.mean(pop_losses)
    pop_predictions = np.concatenate([r[1] for r in results])
    rs = {}
    rs["pop_preds"] = list(pop_predictions)
    rs["pop_truth"] = list(map(int, train_set[:, -2]))
    print "Population Train Performance %f" % pop_train_perf

    return rs
def run_cnn(exp_name,
            dataset,
            embedding,
            log_fn,
            perf_fn,
            k=0,
            emb_dm=100,
            batch_size=100,
            filter_hs=[1, 2, 3],
            hidden_units=[200, 100, 11],
            dropout_rate=0.5,
            shuffle_batch=True,
            n_epochs=300,
            lr_decay=0.95,
            activation=ReLU,
            sqr_norm_lim=9,
            non_static=True,
            print_freq=5):
    """
    Train and Evaluate CNN event encoder model
    :dataset: list containing three elements[(train_x, train_y),
            (valid_x, valid_y), (test_x, test_y)]
    :embedding: word embedding with shape (|V| * emb_dm)
    :filter_hs: filter height for each paralle cnn layer
    :dropout_rate: dropout rate for full connected layers
    :n_epochs: the max number of iterations

    """
    start_time = timeit.default_timer()
    rng = np.random.RandomState(1234)

    input_height = len(dataset[0][0][0][0])
    num_sens = len(dataset[0][0][0])
    print "--input height ", input_height
    num_maps = hidden_units[0]

    ###################
    # start snippet 1 #
    ###################
    print "start to construct the model ...."
    word_x = T.tensor3("word_x")
    freq_x = T.tensor3("freq_x")
    pos_x = T.tensor3("pos_x")
    sent_x = T.matrix("sent_x")
    y_event = T.ivector("y_event")

    words = shared(value=np.asarray(embedding, dtype=theano.config.floatX),
                   name="embedding",
                   borrow=True)

    sym_dim = 20
    # the frequency embedding is a 21 x sym_dim matrix
    freq_val = np.random.random((21, sym_dim)).astype(theano.config.floatX)
    freqs = shared(value=freq_val, borrow=True, name="freqs")

    pos_val = np.random.random((21, sym_dim)).astype(theano.config.floatX)
    poss = shared(value=pos_val, borrow=True, name="poss")

    # define function to keep padding vector as zero
    zero_vector_tensor = T.vector()
    zero_vec = np.zeros(emb_dm, dtype=theano.config.floatX)
    set_zero = function([zero_vector_tensor],
                        updates=[(words,
                                  T.set_subtensor(words[0, :],
                                                  zero_vector_tensor))])

    freq_zero_tensor = T.vector()
    freq_zero_vec = np.zeros(sym_dim, dtype=theano.config.floatX)
    freq_set_zero = function([freq_zero_tensor],
                             updates=[(freqs,
                                       T.set_subtensor(freqs[0, :],
                                                       freq_zero_tensor))])

    pos_zero_tensor = T.vector()
    pos_zero_vec = np.zeros(sym_dim, dtype=theano.config.floatX)
    pos_set_zero = function([pos_zero_tensor],
                            updates=[(poss,
                                      T.set_subtensor(poss[0, :],
                                                      pos_zero_tensor))])

    word_x_emb = words[T.cast(word_x.flatten(), dtype="int32")].reshape(
        (word_x.shape[0] * word_x.shape[1], 1, word_x.shape[2], emb_dm))
    freq_x_emb = freqs[T.cast(freq_x.flatten(), dtype="int32")].reshape(
        (freq_x.shape[0] * freq_x.shape[1], 1, freq_x.shape[2], sym_dim))
    pos_x_emb = poss[T.cast(pos_x.flatten(), dtype="int32")].reshape(
        (pos_x.shape[0] * pos_x.shape[1], 1, pos_x.shape[2], sym_dim))

    layer0_input = T.concatenate([word_x_emb, freq_x_emb, pos_x_emb], axis=3)
    conv_layers = []
    layer1_inputs = []

    for i in xrange(len(filter_hs)):
        filter_shape = (num_maps, 1, filter_hs[i], emb_dm + sym_dim + sym_dim)
        pool_size = (input_height - filter_hs[i] + 1, 1)
        conv_layer = nn.ConvPoolLayer(rng,
                                      input=layer0_input,
                                      input_shape=None,
                                      filter_shape=filter_shape,
                                      pool_size=pool_size,
                                      activation=activation)
        sen_vecs = conv_layer.output.reshape(
            (word_x.shape[0], 1, word_x.shape[1], num_maps))
        # construct multi-layer sentence vectors

        conv_layers.append(conv_layer)
        layer1_inputs.append(sen_vecs)

    sen_vec = T.concatenate(layer1_inputs, 3)
    # score the sentences
    theta_value = np.random.random((len(filter_hs) * num_maps, 1))
    theta = shared(value=np.asarray(theta_value, dtype=theano.config.floatX),
                   name="theta",
                   borrow=True)
    weighted_sen_vecs, sen_score = keep_max(sen_vec, theta, k, sent_x)
    sen_score_cost = T.mean(T.sum(sen_score, axis=2).flatten(1))
    doc_vec = T.sum(weighted_sen_vecs, axis=2)
    layer1_input = doc_vec.flatten(2)
    final_sen_score = sen_score.flatten(2)
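    # keep_max (defined elsewhere in this repo) is assumed to score every sentence
    # with theta, keep the top-scoring sentences (controlled by k and masked by
    # sent_x), and return the re-weighted sentence vectors plus their scores.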

    ####################
    # event classifier #
    ####################
    params = []
    for conv_layer in conv_layers:
        params += conv_layer.params

    params.append(theta)
    params.append(words)
    params.append(freqs)
    params.append(poss)

    gamma = as_floatX(0.001)
    beta1 = as_floatX(0.000)
    beta2 = as_floatX(0.000)
    total_cost = gamma * sen_score_cost
    total_dropout_cost = gamma * sen_score_cost

    print "Construct classifier ...."
    hidden_units[0] = num_maps * len(filter_hs)
    model = nn.MLPDropout(rng,
                          input=layer1_input,
                          layer_sizes=hidden_units,
                          dropout_rates=[dropout_rate],
                          activations=[activation])

    params += model.params

    cost = model.negative_log_likelihood(y_event)
    dropout_cost = model.dropout_negative_log_likelihood(y_event)

    total_cost += cost + beta1 * model.L1
    total_dropout_cost += dropout_cost + beta1 * model.L1

    # using adadelta
    total_grad_updates = sgd_updates_adadelta(params, total_dropout_cost,
                                              lr_decay, 1e-6, sqr_norm_lim)

    total_preds = model.preds

    #####################
    # Construct Dataset #
    #####################
    print "Copy data to GPU and constrct train/valid/test func"

    train_word_x, train_freq_x, train_pos_x, train_sent_x, train_event_y = shared_dataset(
        dataset[0])
    test_word_x, test_freq_x, test_pos_x, test_sent_x, test_event_y = shared_dataset(
        dataset[1])

    n_train_batches = int(np.ceil(1.0 * len(dataset[0][0]) / batch_size))
    n_test_batches = int(np.ceil(1.0 * len(dataset[1][0]) / batch_size))

    #####################
    # Train model func #
    #####################
    index = T.iscalar()
    train_func = function(
        [index],
        total_cost,
        updates=total_grad_updates,
        givens={
            word_x: train_word_x[index * batch_size:(index + 1) * batch_size],
            freq_x: train_freq_x[index * batch_size:(index + 1) * batch_size],
            pos_x: train_pos_x[index * batch_size:(index + 1) * batch_size],
            sent_x: train_sent_x[index * batch_size:(index + 1) * batch_size],
            y_event:
            train_event_y[index * batch_size:(index + 1) * batch_size],
        })

    test_pred = function(
        [index],
        total_preds,
        givens={
            word_x: test_word_x[index * batch_size:(index + 1) * batch_size],
            freq_x: test_freq_x[index * batch_size:(index + 1) * batch_size],
            pos_x: test_pos_x[index * batch_size:(index + 1) * batch_size],
            sent_x: test_sent_x[index * batch_size:(index + 1) * batch_size]
        })

    test_sentence_est = function(
        [index],
        final_sen_score,
        givens={
            word_x: test_word_x[index * batch_size:(index + 1) * batch_size],
            freq_x: test_freq_x[index * batch_size:(index + 1) * batch_size],
            pos_x: test_pos_x[index * batch_size:(index + 1) * batch_size],
            sent_x: test_sent_x[index * batch_size:(index + 1) * batch_size]
        })

    train_sentence_est = function(
        [index],
        final_sen_score,
        givens={
            word_x: train_word_x[index * batch_size:(index + 1) * batch_size],
            freq_x: train_freq_x[index * batch_size:(index + 1) * batch_size],
            pos_x: train_pos_x[index * batch_size:(index + 1) * batch_size],
            sent_x: train_sent_x[index * batch_size:(index + 1) * batch_size]
        })

    # apply early stop strategy
    patience = 100
    patience_increase = 2
    improvement_threshold = 1.005

    n_test = len(dataset[1][0])

    epoch = 0
    best_params = None
    best_validation_score = 0.
    test_perf = 0

    done_loop = False

    log_file = open(log_fn, 'w')

    print "Start to train the model....."
    cpu_tst_event_y = np.asarray(dataset[1][4])

    def compute_score(true_list, pred_list):
        mat = np.equal(true_list, pred_list)
        score = np.mean(mat)
        return score
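    # compute_score is plain accuracy: the fraction of predictions that match the
    # gold labels.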

    best_score = 0.0
    while (epoch < n_epochs) and not done_loop:
        start_time = timeit.default_timer()
        epoch += 1
        costs = []
        for minibatch_index in np.random.permutation(range(n_train_batches)):
            cost_epoch = train_func(minibatch_index)
            costs.append(cost_epoch)
            set_zero(zero_vec)
            freq_set_zero(freq_zero_vec)
            pos_set_zero(pos_zero_vec)

        if epoch % 1 == 0:
            # do test
            test_event_preds = np.concatenate(
                [test_pred(i) for i in xrange(n_test_batches)])
            test_event_score = compute_score(cpu_tst_event_y, test_event_preds)

            precision, recall, beta, support = precision_recall_fscore_support(
                cpu_tst_event_y, test_event_preds, pos_label=1)

            with open(
                    os.path.join(perf_fn,
                                 "%s_%d.event_pred" % (exp_name, epoch)),
                    'w') as epf:
                for p in test_event_preds:
                    epf.write("%d\n" % int(p))

            message = "Epoch %d test event perf %f, precision [%f, %f], recall[%f %f] , f1[%f, %f], train cost %f" % (
                epoch, test_event_score, precision[0], precision[1], recall[0],
                recall[1], beta[0], beta[1], np.mean(costs))
            evl_score = beta[1]

            print message
            log_file.write(message + "\n")
            log_file.flush()

            if (evl_score > best_score):
                best_score = evl_score
                # save the sentence score
                test_sen_score = [
                    test_sentence_est(i) for i in xrange(n_test_batches)
                ]
                score_file = "./results/%s_%d_test.score" % (exp_name, epoch)
                with open(score_file, "wb") as sm:
                    cPickle.dump(test_sen_score, sm)

        end_time = timeit.default_timer()
        print "Finish one iteration using %f m" % (
            (end_time - start_time) / 60.)

    log_file.flush()
    log_file.close()
def run_cnn(exp_name,
            dataset,
            embedding,
            log_fn,
            perf_fn,
            emb_dm=100,
            batch_size=100,
            filter_hs=[1, 2, 3],
            hidden_units=[200, 100, 11],
            dropout_rate=0.5,
            shuffle_batch=True,
            n_epochs=300,
            lr_decay=0.95,
            activation=ReLU,
            sqr_norm_lim=9,
            non_static=True,
            sen_weight=False):
    """
    Train and Evaluate CNN event encoder model
    :dataset: list containing three elements[(train_x, train_y), 
            (valid_x, valid_y), (test_x, test_y)]
    :embedding: word embedding with shape (|V| * emb_dm)
    :filter_hs: filter height for each paralle cnn layer
    :dropout_rate: dropout rate for full connected layers
    :n_epochs: the max number of iterations
    
    """
    start_time = timeit.default_timer()
    rng = np.random.RandomState(1234)

    input_height = len(dataset[0][0][0][0])  # number of words in the sentences
    num_sens = len(dataset[0][0][0])  # number of sentences
    print "--input height ", input_height
    input_width = emb_dm
    num_maps = hidden_units[0]

    ###################
    # start snippet 1 #
    ###################
    print "start to construct the model ...."
    x = T.tensor3("x")
    y = T.ivector("y")

    words = shared(value=np.asarray(embedding, dtype=theano.config.floatX),
                   name="embedding",
                   borrow=True)

    # define function to keep padding vector as zero
    zero_vector_tensor = T.vector()
    zero_vec = np.zeros(input_width, dtype=theano.config.floatX)
    set_zero = function([zero_vector_tensor],
                        updates=[(words,
                                  T.set_subtensor(words[0, :],
                                                  zero_vector_tensor))])

    # the input for the sentence level conv layers
    layer0_input = words[T.cast(x.flatten(), dtype="int32")].reshape(
        (x.shape[0] * x.shape[1], 1, x.shape[2], emb_dm))

    conv_layers = []
    layer1_inputs = []

    for i in xrange(len(filter_hs)):
        filter_shape = (num_maps, 1, filter_hs[i], emb_dm)
        pool_size = (input_height - filter_hs[i] + 1, 1)
        conv_layer = nn.ConvPoolLayer(rng,
                                      input=layer0_input,
                                      input_shape=None,
                                      filter_shape=filter_shape,
                                      pool_size=pool_size,
                                      activation=activation)

        sen_vecs = conv_layer.output.reshape(
            (x.shape[0], x.shape[1], num_maps))
        sen_vecs = sen_vecs.dimshuffle(0, 2, 1)
        # construct the weighted sentences
        if sen_weight:  # using sentence weight
            #s_w = 1. / T.arange(1, x.shape[1] + 1)
            s_w = T.arange(1, x.shape[1] + 1)
            s_w = (1.0 * x.shape[0] - s_w) / T.sum(s_w)
            sen_vecs = sen_vecs * s_w
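            # s_w gives linearly decreasing weights to later sentences, so earlier
            # sentences contribute more to the summed document vector below.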

        # sum the sentence vectors (per feature map) to represent the document
        doc_vec = T.sum(sen_vecs, axis=2).flatten(2)
        layer1_inputs.append(doc_vec)
        conv_layers.append(conv_layer)
        """
        doc_filter_shape = (num_maps, 1, 2, num_maps)
        doc_pool_size = (num_sens - 2 + 1, 1)
        doc_conv_layer = nn.ConvPoolLayer(rng, input=sen_vecs, 
                input_shape=None,
                filter_shape=doc_filter_shape,
                pool_size=doc_pool_size,
                activation=activation)

        layer1_input = doc_conv_layer.output.flatten(2)
        conv_layers.append(conv_layer)
        conv_layers.append(doc_conv_layer)

        layer1_inputs.append(layer1_input)
        """

    layer1_input = T.concatenate(layer1_inputs, 1)

    ##############
    # classifier #
    ##############
    print "Construct classifier ...."
    hidden_units[0] = num_maps * len(filter_hs)
    model = nn.MLPDropout(rng,
                          input=layer1_input,
                          layer_sizes=hidden_units,
                          dropout_rates=[dropout_rate],
                          activations=[activation])

    params = model.params
    for conv_layer in conv_layers:
        params += conv_layer.params

    if non_static:
        params.append(words)

    cost = model.negative_log_likelihood(y)
    dropout_cost = model.dropout_negative_log_likelihood(y)
    grad_updates = sgd_updates_adadelta(params, dropout_cost, lr_decay, 1e-6,
                                        sqr_norm_lim)

    #####################
    # Construct Dataset #
    #####################
    print "Copy data to GPU and constrct train/valid/test func"
    np.random.seed(1234)

    train_x, train_y = shared_dataset(dataset[0])
    valid_x, valid_y = shared_dataset(dataset[1])
    test_x, test_y = shared_dataset(dataset[2])

    n_train_batches = int(np.ceil(1.0 * len(dataset[0][0]) / batch_size))
    n_valid_batches = int(np.ceil(1.0 * len(dataset[1][0]) / batch_size))
    n_test_batches = int(np.ceil(1.0 * len(dataset[2][0]) / batch_size))

    #####################
    # Train model func #
    #####################
    index = T.iscalar()
    train_func = function(
        [index],
        cost,
        updates=grad_updates,
        givens={
            x: train_x[index * batch_size:(index + 1) * batch_size],
            y: train_y[index * batch_size:(index + 1) * batch_size]
        })

    valid_train_func = function(
        [index],
        cost,
        updates=grad_updates,
        givens={
            x: valid_x[index * batch_size:(index + 1) * batch_size],
            y: valid_y[index * batch_size:(index + 1) * batch_size]
        })

    train_pred = function(
        [index],
        model.preds,
        givens={x: train_x[index * batch_size:(index + 1) * batch_size]})

    valid_pred = function([index],
                          model.preds,
                          givens={
                              x:
                              valid_x[index * batch_size:(index + 1) *
                                      batch_size],
                          })

    test_pred = function([index],
                         model.preds,
                         givens={
                             x:
                             test_x[index * batch_size:(index + 1) *
                                    batch_size],
                         })

    # apply early stop strategy
    patience = 100
    patience_increase = 2
    improvement_threshold = 1.005

    n_valid = len(dataset[1][0])
    n_test = len(dataset[2][0])

    epoch = 0
    best_params = None
    best_validation_score = 0.
    test_perf = 0

    done_loop = False

    log_file = open(log_fn, 'a')

    print "Start to train the model....."
    cpu_trn_y = np.asarray(dataset[0][1])
    cpu_val_y = np.asarray(dataset[1][1])
    cpu_tst_y = np.asarray(dataset[2][1])

    def compute_score(true_list, pred_list):
        mat = np.equal(true_list, pred_list)
        score = np.mean(mat)
        return score

    best_test_score = 0.
    while (epoch < n_epochs) and not done_loop:
        start_time = timeit.default_timer()
        epoch += 1
        costs = []
        for minibatch_index in np.random.permutation(range(n_train_batches)):
            cost_epoch = train_func(minibatch_index)
            costs.append(cost_epoch)
            set_zero(zero_vec)

        # do validation
        valid_cost = [
            valid_train_func(i)
            for i in np.random.permutation(xrange(n_valid_batches))
        ]

        if epoch % 5 == 0:
            # do test
            test_preds = np.concatenate(
                [test_pred(i) for i in xrange(n_test_batches)])
            test_score = compute_score(cpu_tst_y, test_preds)

            with open(os.path.join(perf_fn, "%s_%d.pred" % (exp_name, epoch)),
                      'w') as epf:
                for p in test_preds:
                    epf.write("%d\n" % int(p))
                message = "Epoch %d test perf %f" % (epoch, test_score)
            print message
            log_file.write(message + "\n")
            log_file.flush()

            # store the best model
            if test_score > best_test_score:
                best_test_score = test_score
                # save the model
                model_name = "%s_%d.model" % (exp_name, epoch)
                with open(model_name, 'wb') as bm:
                    for p in params:
                        cPickle.dump(p.get_value(), bm)

        end_time = timeit.default_timer()
        print "Finish one iteration using %f m" % (
            (end_time - start_time) / 60.)

    log_file.flush()
    log_file.close()
Example #6
def train_cnn_encoder(datasets,
                      word_embedding,
                      input_width=64,
                      filter_hs=[3, 4, 5],
                      hidden_units=[100, 2],
                      dropout_rate=[0.5],
                      shuffle_batch=True,
                      n_epochs=100,
                      batch_size=50,
                      lr_decay=0.95,
                      activations=[ReLU],
                      sqr_norm_lim=9,
                      non_static=True):

    start_time = timeit.default_timer()

    rng = np.random.RandomState(1234)
    input_height = len(datasets[0][0]) - 2
    filter_width = input_width
    feature_maps = hidden_units[0]
    filter_shapes = []
    pool_sizes = []
    for filter_h in filter_hs:
        filter_shapes.append((feature_maps, 1, filter_h, filter_width))
        pool_sizes.append(
            (input_height - filter_h + 1, input_width - filter_width + 1))

    parameters = [("Input Shape", input_height, input_width),
                  ("Filter Shape", filter_shapes), ("Pool Sizes", pool_sizes),
                  ("dropout rate", dropout_rate),
                  ("hidden units", hidden_units),
                  ("shuffle_batch", shuffle_batch), ("n_epochs", n_epochs),
                  ("batch size", batch_size)]
    print parameters

    # construct the model
    index = T.iscalar()
    x = T.matrix("x")
    y = T.ivector("y")
    words = shared(value=word_embedding, name="embedding")

    zero_vector_tensor = T.vector()
    zero_vec = np.zeros(input_width, dtype=theano.config.floatX)
    set_zero = function([zero_vector_tensor],
                        updates=[(words,
                                  T.set_subtensor(words[0, :],
                                                  zero_vector_tensor))])

    layer0_input = words[T.cast(x.flatten(), dtype="int32")].reshape(
        (x.shape[0], 1, x.shape[1], words.shape[1]))

    conv_layers = []
    layer1_inputs = []
    for i in xrange(len(filter_hs)):
        filter_shape = filter_shapes[i]
        pool_size = pool_sizes[i]
        conv_layer = nn.ConvPoolLayer(rng,
                                      input=layer0_input,
                                      input_shape=(batch_size, 1, input_height,
                                                   input_width),
                                      filter_shape=filter_shape,
                                      pool_size=pool_size,
                                      activation=ReLU)
        layer1_input = conv_layer.output.flatten(2)
        conv_layers.append(conv_layer)
        layer1_inputs.append(layer1_input)

    layer1_input = T.concatenate(layer1_inputs, 1)

    ###################
    # Population Task #
    ###################
    hidden_units[0] = feature_maps * len(filter_hs)

    pop_classifier = nn.MLPDropout(rng,
                                   input=layer1_input,
                                   layer_sizes=hidden_units,
                                   dropout_rates=dropout_rate,
                                   activations=activations)

    pop_params = pop_classifier.params
    for conv_layer in conv_layers:
        pop_params += conv_layer.params

    if non_static:
        pop_params.append(words)

    pop_cost = pop_classifier.negative_log_likelihood(y)
    pop_dropout_cost = pop_classifier.dropout_negative_log_likelihood(y)

    pop_grad_updates = sgd_updates_adadelta(pop_params, pop_dropout_cost,
                                            lr_decay, 1e-6, sqr_norm_lim)

    ###################
    # EventType Task #
    ###################
    event_type_hidden_units = [feature_maps * len(filter_hs), 12]
    type_classifier = nn.MLPDropout(rng,
                                    input=layer1_input,
                                    layer_sizes=event_type_hidden_units,
                                    dropout_rates=dropout_rate,
                                    activations=activations)
    type_params = type_classifier.params
    for conv_layer in conv_layers:
        type_params += conv_layer.params

    if non_static:
        type_params.append(words)

    type_cost = type_classifier.negative_log_likelihood(y)
    type_dropout_cost = type_classifier.dropout_negative_log_likelihood(y)
    type_grad_updates = sgd_updates_adadelta(type_params, type_dropout_cost,
                                             lr_decay, 1e-6, sqr_norm_lim)

    ######################
    # Construct Data Set #
    ######################

    np.random.seed(1234)
    if datasets[0].shape[0] % batch_size > 0:
        extra_data_num = batch_size - datasets[0].shape[0] % batch_size
        train_set = np.random.permutation(datasets[0])
        extra_data = train_set[:extra_data_num]
        new_data = np.append(datasets[0], extra_data, axis=0)
    else:
        new_data = datasets[0]

    new_data = np.random.permutation(new_data)
    n_batches = new_data.shape[0] / batch_size
    n_train_batches = int(np.round(n_batches * 0.9))

    # pad the test set to a multiple of the batch size,
    # then divide the padded train data into train/val sets
    if datasets[1].shape[0] % batch_size > 0:
        extra_data_num = batch_size - datasets[1].shape[0] % batch_size
        test_set = np.random.permutation(datasets[1])
        extra_data = test_set[:extra_data_num]
        new_test_data = np.append(datasets[1], extra_data, axis=0)
    else:
        new_test_data = datasets[1]
    test_set_x = new_test_data[:, :input_height]
    test_set_pop_y = np.asarray(new_test_data[:, -2], "int32")
    test_set_type_y = np.asarray(new_test_data[:, -1], "int32")

    train_set = new_data[:n_train_batches * batch_size, :]
    val_set = new_data[n_train_batches * batch_size:, :]

    print train_set[:, -1]
    borrow = True
    train_set_x = theano.shared(np.asarray(train_set[:, :input_height],
                                           dtype=theano.config.floatX),
                                borrow=borrow)
    train_set_pop_y = T.cast(
        theano.shared(np.asarray(train_set[:, -2], dtype=theano.config.floatX),
                      borrow=borrow), 'int32')
    train_set_type_y = T.cast(
        theano.shared(np.asarray(train_set[:, -1], dtype=theano.config.floatX),
                      borrow=borrow), 'int32')

    val_set_x = theano.shared(np.asarray(val_set[:, :input_height],
                                         dtype=theano.config.floatX),
                              borrow=borrow)
    val_set_pop_y = T.cast(
        theano.shared(np.asarray(val_set[:, -2], dtype=theano.config.floatX),
                      borrow=borrow), 'int32')
    val_set_type_y = T.cast(
        theano.shared(np.asarray(val_set[:, -1], dtype=theano.config.floatX),
                      borrow=borrow), 'int32')

    n_val_batches = n_batches - n_train_batches
    n_test_batches = test_set_x.shape[0] / batch_size
    print 'n_test_batches: %d' % n_test_batches
    # transform the data into shared variables for GPU computation
    test_set_x = theano.shared(np.asarray(test_set_x,
                                          dtype=theano.config.floatX),
                               borrow=borrow)
    test_set_pop_y = theano.shared(test_set_pop_y, borrow=True)
    test_set_type_y = theano.shared(test_set_type_y, borrow=True)

    ####################
    # Train Model Func #
    ####################
    # population model
    val_pop_model = function(
        [index],
        pop_classifier.errors(y),
        givens={
            x: val_set_x[index * batch_size:(index + 1) * batch_size],
            y: val_set_pop_y[index * batch_size:(index + 1) * batch_size]
        })

    test_pop_model = function(
        [index],
        pop_classifier.errors(y),
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_pop_y[index * batch_size:(index + 1) * batch_size]
        })

    real_test_pop_model = function(
        [index],
        pop_classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_pop_y[index * batch_size:(index + 1) * batch_size]
        })

    train_pop_model = function(
        [index],
        pop_cost,
        updates=pop_grad_updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_pop_y[index * batch_size:(index + 1) * batch_size]
        })

    # event type model
    val_type_model = function(
        [index],
        type_classifier.errors(y),
        givens={
            x: val_set_x[index * batch_size:(index + 1) * batch_size],
            y: val_set_type_y[index * batch_size:(index + 1) * batch_size]
        })

    test_type_model = function(
        [index],
        type_classifier.errors(y),
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_type_y[index * batch_size:(index + 1) * batch_size]
        })

    real_test_type_model = function(
        [index],
        type_classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_type_y[index * batch_size:(index + 1) * batch_size]
        })

    train_type_model = function(
        [index],
        type_cost,
        updates=type_grad_updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_type_y[index * batch_size:(index + 1) * batch_size]
        })
    """
    test_pred_layers = []
    test_size = test_set_x.shape[0]
    test_layer0_input = words[T.cast(x.flatten(), dtype="int32")].reshape((test_size, 1, input_height, input_width))
    for conv_layer in conv_layers:
        test_layer0_output = conv_layer.predict(test_layer0_input, test_size)
        test_pred_layers.append(test_layer0_output.flatten(2))

    test_layer1_input = T.concatenate(test_pred_layers, 1)

    test_pop_y_pred = pop_classifier.predict(test_layer1_input)
    test_pop_error = T.mean(T.neq(test_pop_y_pred, y))
    test_pop_model_all = function([x, y], test_pop_error)

    test_type_y_pred = type_classifier.predict(test_layer1_input)
    test_type_error = T.mean(T.neq(test_type_y_pred, y))
    test_type_model_all = function([x, y], test_type_error)
    """
    # start training the model
    print "Start training the model...."
    epoch = 0
    best_pop_val_perf = 0
    best_type_val_perf = 0

    while (epoch < n_epochs):
        epoch += 1
        if shuffle_batch:
            for minibatch_index in np.random.permutation(
                    range(n_train_batches)):
                if minibatch_index % 10 == 0:
                    print minibatch_index
                cost_pop_epoch = train_pop_model(minibatch_index)
                set_zero(zero_vec)
                cost_type_epoch = train_type_model(minibatch_index)
                set_zero(zero_vec)
        else:
            for minibatch_index in xrange(n_train_batches):
                cost_pop_epoch = train_pop_model(minibatch_index)
                set_zero(zero_vec)
                cost_type_epoch = train_type_model(minibatch_index)
                set_zero(zero_vec)

        train_pop_losses = [test_pop_model(i) for i in xrange(n_train_batches)]
        train_pop_perf = 1 - np.mean(train_pop_losses)

        train_type_losses = [
            test_type_model(i) for i in xrange(n_train_batches)
        ]
        train_type_perf = 1 - np.mean(train_type_losses)

        val_pop_losses = [val_pop_model(i) for i in xrange(n_val_batches)]
        val_pop_perf = 1 - np.mean(val_pop_losses)

        val_type_losses = [val_type_model(i) for i in xrange(n_val_batches)]
        val_type_perf = 1 - np.mean(val_type_losses)

        print('epoch %i, train pop perf %f %%, val pop perf %f' %
              (epoch, train_pop_perf * 100., val_pop_perf * 100.))
        print('epoch %i, train type perf %f %%, val type perf %f' %
              (epoch, train_type_perf * 100., val_type_perf * 100.))

        if val_pop_perf >= best_pop_val_perf:
            best_pop_val_perf = val_pop_perf
            #test_pop_losses = test_pop_model_all(test_set_x, test_set_pop_y)
            test_pop_losses = [
                real_test_pop_model(i) for i in xrange(n_test_batches)
            ]
            test_pop_perf = 1 - np.mean(test_pop_losses)
            print "Test POP Performance %f under Current Best Valid perf %f" % (
                test_pop_perf, val_pop_perf)

        if val_type_perf >= best_type_val_perf:
            best_type_val_perf = val_type_perf
            #test_type_losses = test_type_model_all(test_set_x, test_set_type_y)
            test_type_losses = [
                real_test_type_model(i) for i in xrange(n_test_batches)
            ]
            test_type_perf = 1 - np.mean(test_type_losses)
            print "Test Type Performance %f under Current Best Valid perf %f" % (
                test_type_perf, val_type_perf)

        end_time = timeit.default_timer()
        print "Epoch %d finish take time %fm " % (epoch,
                                                  (end_time - start_time) /
                                                  60.)
        start_time = timeit.default_timer()

    return test_pop_perf, test_type_perf
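The padding logic above (repeating randomly chosen rows so every split is a multiple of batch_size) is easy to lose in the indexing; here is a plain-numpy restatement, with `pad_to_batch_size` being an illustrative name rather than anything defined in this repo:

import numpy as np

def pad_to_batch_size(data, batch_size, rng=np.random):
    # append randomly permuted rows until the row count divides batch_size,
    # then reshuffle the whole array
    remainder = data.shape[0] % batch_size
    if remainder > 0:
        extra = rng.permutation(data)[:batch_size - remainder]
        data = np.append(data, extra, axis=0)
    return rng.permutation(data)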
Example #7
def run_cnn(exp_name,
        dataset, embedding,
        log_fn, perf_fn,
        k=0,
        emb_dm=100,
        batch_size=100,
        filter_hs=[1, 2, 3],
        hidden_units=[200, 100, 11],
        dropout_rate=0.5,
        shuffle_batch=True,
        n_epochs=300,
        lr_decay=0.95,
        activation=ReLU,
        sqr_norm_lim=9,
        non_static=True,
        print_freq=5):
    """
    Train and Evaluate CNN event encoder model
    :dataset: list containing three elements [(train_x, train_y),
            (valid_x, valid_y), (test_x, test_y)]
    :embedding: word embedding with shape (|V| * emb_dm)
    :filter_hs: filter height for each parallel CNN layer
    :dropout_rate: dropout rate for the fully connected layers
    :n_epochs: the maximum number of iterations
    
    """
    start_time = timeit.default_timer()
    rng = np.random.RandomState(1234)
   
    input_height = len(dataset[0][0][0][0])
    num_sens = len(dataset[0][0][0])
    print "--input height ", input_height 
    input_width = emb_dm
    num_maps = hidden_units[0]

    ###################
    # start snippet 1 #
    ###################
    print "start to construct the model ...."
    x = T.tensor3("x")
    y_type = T.ivector("y_type")
    y_pop = T.ivector("y_pop")

    words = shared(value=np.asarray(embedding,
        dtype=theano.config.floatX), 
        name="embedding", borrow=True)

    # define function to keep padding vector as zero
    zero_vector_tensor = T.vector()
    zero_vec = np.zeros(input_width, dtype=theano.config.floatX)
    set_zero = function([zero_vector_tensor],
            updates=[(words, T.set_subtensor(words[0,:], zero_vector_tensor))])

    layer0_input = words[T.cast(x.flatten(), dtype="int32")].reshape((
        x.shape[0] * x.shape[1], 1, x.shape[2], emb_dm
        ))

    conv_layers = []
    layer1_inputs = []

    for i in xrange(len(filter_hs)):
        filter_shape = (num_maps, 1, filter_hs[i], emb_dm)
        pool_size = (input_height - filter_hs[i] + 1, 1)
        conv_layer = nn.ConvPoolLayer(rng, input=layer0_input, 
                input_shape=None,
                filter_shape=filter_shape,
                pool_size=pool_size, activation=activation)
        sen_vecs = conv_layer.output.reshape((x.shape[0], 1, x.shape[1], num_maps))
        # construct multi-layer sentence vectors

        conv_layers.append(conv_layer)
        layer1_inputs.append(sen_vecs)
    
    sen_vec = T.concatenate(layer1_inputs, 3)
    # score the sentences
    theta_value = np.random.random((len(filter_hs) * num_maps, 1))
    theta = shared(value=np.asarray(theta_value, dtype=theano.config.floatX),
            name="theta", borrow=True)
    weighted_sen_vecs, sen_score = keep_max(sen_vec, theta, k)
    doc_vec = T.max(weighted_sen_vecs, axis=2)
    layer1_input = doc_vec.flatten(2) 
    final_sen_score = sen_score.flatten(2)

    ##############
    # classifier pop#
    ##############
    print "Construct classifier ...."
    hidden_units[0] = num_maps * len(filter_hs)
    model = nn.MLPDropout(rng,
            input=layer1_input,
            layer_sizes=hidden_units,
            dropout_rates=[dropout_rate],
            activations=[activation])

    params = model.params
    for conv_layer in conv_layers:
        params += conv_layer.params

    params.append(theta)
    if non_static:
        params.append(words)

    cost = model.negative_log_likelihood(y_pop)
    dropout_cost = model.dropout_negative_log_likelihood(y_pop)

    #######################
    # classifier Type #####
    #######################
    type_hidden_units = [num for num in hidden_units]
    type_hidden_units[-1] = 5
    type_model = nn.MLPDropout(rng,
            input=layer1_input,
            layer_sizes=type_hidden_units,
            dropout_rates=[dropout_rate],
            activations=[activation])
    params += type_model.params

    type_cost = type_model.negative_log_likelihood(y_type)
    type_dropout_cost = type_model.dropout_negative_log_likelihood(y_type)

    total_cost = cost + type_cost
    total_dropout_cost = dropout_cost  + type_dropout_cost
    # using adagrad
    lr = 0.01
    """
    total_grad_updates = nn.optimizer(total_dropout_cost,
            params,
            lr,
            method="adadelta"
            )
    """
    total_grad_updates = sgd_updates_adadelta(params, 
            total_dropout_cost,
            lr_decay,
            1e-6,
            sqr_norm_lim)
    
    total_preds = [model.preds, type_model.preds]

    #####################
    # Construct Dataset #
    #####################
    print "Copy data to GPU and constrct train/valid/test func"
    np.random.seed(1234)
    
    train_x, train_pop_y, train_type_y = shared_dataset(dataset[0])
    test_x, test_pop_y, test_type_y = shared_dataset(dataset[1])

    n_train_batches = int(np.ceil(1.0 * len(dataset[0][0]) / batch_size))
    n_test_batches = int(np.ceil(1.0 * len(dataset[1][0]) / batch_size))

    #####################
    # Train model func #
    #####################
    index = T.iscalar()
    train_func = function([index], total_cost, updates=total_grad_updates,
            givens={
                x: train_x[index*batch_size:(index+1)*batch_size],
                y_pop: train_pop_y[index*batch_size:(index+1)*batch_size],
                y_type:train_type_y[index*batch_size:(index+1)*batch_size]
                })
    
    test_pred = function([index], total_preds,
            givens={
                x:test_x[index*batch_size:(index+1)*batch_size],
                })
    
    test_sentence_est = function([index], final_sen_score,
            givens={
                x: test_x[index*batch_size:(index+1)*batch_size]
                })
    
    train_sentence_est = function([index], final_sen_score,
            givens={
                x: train_x[index*batch_size:(index+1)*batch_size]
                })


    # apply early-stopping strategy
    patience = 100
    patience_increase = 2
    improvement_threshold = 1.005
    
    n_test = len(dataset[1][0])

    epoch = 0
    best_params = None
    best_validation_score = 0.
    test_perf = 0

    done_loop = False
    
    log_file = open(log_fn, 'w')

    print "Start to train the model....."
    cpu_tst_pop_y = np.asarray(dataset[1][1])
    cpu_tst_type_y = np.asarray(dataset[1][2])

    def compute_score(true_list, pred_list):
        mat = np.equal(true_list, pred_list)
        score = np.mean(mat)
        return score
    
    total_score = 0.0
    while (epoch < n_epochs) and not done_loop:
        start_time = timeit.default_timer()
        epoch += 1
        costs = []
        for minibatch_index in np.random.permutation(range(n_train_batches)):
            cost_epoch = train_func(minibatch_index)
            costs.append(cost_epoch)
            set_zero(zero_vec)
        

        if epoch % print_freq == 0:
            # do test
            test_pop_preds, test_type_preds = map(np.concatenate, zip(*[test_pred(i) for i in xrange(n_test_batches)]))
            test_pop_score = compute_score(cpu_tst_pop_y, test_pop_preds)
            test_type_score = compute_score(cpu_tst_type_y, test_type_preds)
            
            with open(os.path.join(perf_fn, "%s_%d.pop_pred" % (exp_name, epoch)), 'w') as epf:
                for p in test_pop_preds:
                    epf.write("%d\n" % int(p))

            with open(os.path.join(perf_fn, "%s_%d.type_pred" % (exp_name, epoch)), 'w') as epf:
                for p in test_type_preds:
                    epf.write("%d\n" % int(p))
            
            message = "Epoch %d test pop perf %f, type perf %f" % (epoch, test_pop_score, test_type_score)
            print message
            log_file.write(message + "\n")
            log_file.flush()

            if ((test_pop_score + test_type_score) > total_score) or (epoch % 15 == 0):
                total_score = test_pop_score + test_type_score
                # save the sentence score
                test_sen_score = [test_sentence_est(i) for i in xrange(n_test_batches)]
                score_file = "./results/%s_%d_test.score" % (exp_name, epoch)
                with open(score_file, "wb") as sm:
                    cPickle.dump(test_sen_score, sm)
                
                train_sen_score = [train_sentence_est(i) for i in xrange(n_train_batches)]
                score_file = "./results/%s_%d_train.score" % (exp_name, epoch)
                with open(score_file, "wb") as sm:
                    cPickle.dump(train_sen_score, sm)

        end_time = timeit.default_timer()
        print "Finish one iteration using %f m" % ((end_time - start_time)/60.)

    log_file.flush()
    log_file.close()
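sgd_updates_adadelta is used throughout these snippets but its body is not shown; the sketch below is the standard ADADELTA rule (Zeiler, 2012) that it presumably implements, with rho playing the role of lr_decay and eps of the 1e-6 argument (the sqr_norm_lim column-norm clipping is omitted here):

import numpy as np

def adadelta_step(param, grad, acc_g2, acc_dx2, rho=0.95, eps=1e-6):
    # accumulate a decaying average of squared gradients, scale the step by
    # the ratio of RMS(previous steps) to RMS(gradients), then accumulate
    # the squared step
    acc_g2 = rho * acc_g2 + (1. - rho) * grad ** 2
    step = -np.sqrt(acc_dx2 + eps) / np.sqrt(acc_g2 + eps) * grad
    acc_dx2 = rho * acc_dx2 + (1. - rho) * step ** 2
    return param + step, acc_g2, acc_dx2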
    def run_experiment(self, dataset, word_embedding, exp_name):

        # load parameters
        num_maps_word = self.options["num_maps_word"]
        drop_rate_word = self.options["drop_rate_word"]
        word_window = self.options["word_window"]
        word_dim = self.options["word_dim"]
        k_max_word = self.options["k_max_word"]
        num_maps_sentence = self.options["num_maps_sentence"]
        drop_rate_sentence = self.options["drop_rate_sentence"]
        sentence_window = self.options["sentence_window"]
        k_max_sentence = self.options["k_max_sentence"]
        batch_size = self.options["batch_size"]
        rho = self.options["rho"]
        epsilon = self.options["epsilon"]
        norm_lim = self.options["norm_lim"]
        max_iteration = self.options["max_iteration"]

        sentence_len = len(dataset[0][0][0][0])
        sentence_num = len(dataset[0][0][0])

        # define the parameters
        x = T.tensor3("x")
        y = T.ivector("y")
        rng = np.random.RandomState(1234)

        words = theano.shared(value=np.asarray(word_embedding,
                                               dtype=theano.config.floatX),
                              name="embedding",
                              borrow=True)
        zero_vector_tensor = T.vector()
        zero_vec = np.zeros(word_dim, dtype=theano.config.floatX)
        set_zero = theano.function(
            [zero_vector_tensor],
            updates=[(words, T.set_subtensor(words[0, :],
                                             zero_vector_tensor))])

        x_emb = words[T.cast(x.flatten(), dtype="int32")].reshape(
            (x.shape[0] * x.shape[1], 1, x.shape[2], words.shape[1]))

        dropout_x_emb = nn.dropout_from_layer(rng, x_emb, drop_rate_word)

        # compute convolution on words layer
        word_filter_shape = (num_maps_word, 1, word_window, word_dim)
        word_pool_size = (sentence_len - word_window + 1, 1)
        dropout_word_conv = nn.ConvPoolLayer(rng,
                                             input=dropout_x_emb,
                                             input_shape=None,
                                             filter_shape=word_filter_shape,
                                             pool_size=word_pool_size,
                                             activation=Tanh,
                                             k=k_max_word)
        sent_vec_dim = num_maps_word * k_max_word
        dropout_sent_vec = dropout_word_conv.output.reshape(
            (x.shape[0], 1, x.shape[1], sent_vec_dim))
        dropout_sent_vec = nn.dropout_from_layer(rng, dropout_sent_vec,
                                                 drop_rate_sentence)

        word_conv = nn.ConvPoolLayer(rng,
                                     input=dropout_x_emb *
                                     (1 - drop_rate_word),
                                     input_shape=None,
                                     filter_shape=word_filter_shape,
                                     pool_size=word_pool_size,
                                     activation=Tanh,
                                     k=k_max_word,
                                     W=dropout_word_conv.W,
                                     b=dropout_word_conv.b)
        sent_vec = word_conv.output.reshape(
            (x.shape[0], 1, x.shape[1], sent_vec_dim))

        # construct the convolution layer on sentences
        sent_filter_shape = (num_maps_sentence, 1, sentence_window,
                             sent_vec_dim)
        sent_pool_size = (sentence_num - sentence_window + 1, 1)
        dropout_sent_conv = nn.ConvPoolLayer(rng,
                                             input=dropout_sent_vec,
                                             input_shape=None,
                                             filter_shape=sent_filter_shape,
                                             pool_size=sent_pool_size,
                                             activation=Tanh,
                                             k=k_max_sentence)

        sent_conv = nn.ConvPoolLayer(rng,
                                     input=sent_vec * (1 - drop_rate_sentence),
                                     input_shape=None,
                                     filter_shape=sent_filter_shape,
                                     pool_size=sent_pool_size,
                                     activation=Tanh,
                                     k=k_max_sentence,
                                     W=dropout_sent_conv.W,
                                     b=dropout_sent_conv.b)

        dropout_doc_vec = dropout_sent_conv.output.flatten(2)
        doc_vec = sent_conv.output.flatten(2)
        doc_vec_dim = num_maps_sentence * k_max_sentence

        # construct classifier
        dropout_logistic_layer = nn.LogisticRegressionLayer(
            input=dropout_doc_vec, n_in=doc_vec_dim, n_out=2)

        logistic_layer = nn.LogisticRegressionLayer(input=doc_vec,
                                                    n_in=doc_vec_dim,
                                                    n_out=2,
                                                    W=dropout_logistic_layer.W,
                                                    b=dropout_logistic_layer.b)

        dropout_cost = dropout_logistic_layer.negative_log_likelihood(y)
        cost = logistic_layer.negative_log_likelihood(y)

        preds = logistic_layer.y_pred
        errors = logistic_layer.errors(y)

        # collect parameters
        self.params.append(words)
        self.params += dropout_word_conv.params
        self.params += dropout_sent_conv.params
        self.params += dropout_logistic_layer.params

        grad_updates = nn.sgd_updates_adadelta(self.params, dropout_cost, rho,
                                               epsilon, norm_lim)

        # construct the dataset
        train_x, train_y = nn.shared_dataset(dataset[0])
        test_x, test_y = nn.shared_dataset(dataset[1])
        test_cpu_y = dataset[1][1]

        n_train_batches = int(np.ceil(1.0 * len(dataset[0][0]) / batch_size))
        n_test_batches = int(np.ceil(1.0 * len(dataset[1][0]) / batch_size))

        # construct the model
        index = T.iscalar()
        train_func = theano.function(
            [index],
            dropout_cost,
            updates=grad_updates,
            givens={
                x: train_x[index * batch_size:(index + 1) * batch_size],
                y: train_y[index * batch_size:(index + 1) * batch_size]
            })

        test_func = theano.function(
            [index],
            preds,
            givens={x: test_x[index * batch_size:(index + 1) * batch_size]})

        get_train_sentvec = theano.function(
            [index],
            sent_vec,
            givens={x: train_x[index * batch_size:(index + 1) * batch_size]})

        get_test_sentvec = theano.function(
            [index],
            sent_vec,
            givens={x: test_x[index * batch_size:(index + 1) * batch_size]})

        epoch = 0
        best_score = 0
        raw_train_x = dataset[0][0]
        raw_test_x = dataset[1][0]
        # get the number of (non-padding) sentences in each document
        number_train_sens = []
        number_test_sens = []

        for doc in raw_train_x:
            sen_num = 0
            for sen in doc:
                if np.any(sen):
                    sen_num += 1
            number_train_sens.append(sen_num)

        for doc in raw_test_x:
            sen_num = 0
            for sen in doc:
                if np.any(sen):
                    sen_num += 1
            number_test_sens.append(sen_num)

        log_file = open("./log/%s.log" % exp_name, 'w')

        while epoch <= max_iteration:
            start_time = timeit.default_timer()
            epoch += 1
            costs = []

            for minibatch_index in np.random.permutation(
                    range(n_train_batches)):
                cost_epoch = train_func(minibatch_index)
                costs.append(cost_epoch)
                set_zero(zero_vec)

            if epoch % 5 == 0:
                test_preds = []
                for i in xrange(n_test_batches):
                    test_y_pred = test_func(i)
                    test_preds.append(test_y_pred)
                test_preds = np.concatenate(test_preds)
                test_score = 1 - np.mean(np.not_equal(test_cpu_y, test_preds))

                precision, recall, beta, support = precision_recall_fscore_support(
                    test_cpu_y, test_preds, pos_label=1)

                if test_score > best_score:
                    best_score = test_score
                    # save the sentence vectors
                    train_sens = [
                        get_train_sentvec(i) for i in range(n_train_batches)
                    ]
                    test_sens = [
                        get_test_sentvec(i) for i in range(n_test_batches)
                    ]

                    train_sens = np.concatenate(train_sens, axis=0)
                    test_sens = np.concatenate(test_sens, axis=0)

                    out_train_sent_file = "./results/%s_train_sent.vec" % exp_name
                    out_test_sent_file = "./results/%s_test_sent.vec" % exp_name

                    with open(out_train_sent_file,
                              'w') as train_f, open(out_test_sent_file,
                                                    'w') as test_f:
                        for i in range(len(train_sens)):
                            tr_doc_vect = train_sens[i][
                                0][:number_train_sens[i]]
                            train_f.write(
                                json.dumps(tr_doc_vect.tolist()) + "\n")

                        for i in range(len(test_sens)):
                            te_doc_vect = test_sens[i][0][:number_test_sens[i]]
                            test_f.write(
                                json.dumps(te_doc_vect.tolist()) + "\n")
                    print "Get best performace at %d iteration" % epoch
                    log_file.write("Get best performance at %d iteration\n" %
                                   epoch)

                end_time = timeit.default_timer()
                print "Iteration %d , precision, recall, support" % epoch, precision, recall, support
                log_file.write(
                    "Iteration %d, neg precision %f, pos precision %f, neg recall %f pos recall %f \n"
                    %
                    (epoch, precision[0], precision[1], recall[0], recall[1]))
                print "Using time %f m" % ((end_time - start_time) / 60.)
                log_file.write("Uing time %f m\n" %
                               ((end_time - start_time) / 60.))
            end_time = timeit.default_timer()
            print "Iteration %d Using time %f m" % (epoch,
                                                    (end_time - start_time) /
                                                    60.)
            log_file.write("Uing time %f m\n" %
                           ((end_time - start_time) / 60.))
            log_file.flush()

        log_file.close()
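The paired dropout_word_conv/word_conv and dropout_sent_conv/sent_conv layers above share W and b and scale the deterministic path by (1 - drop_rate); below is a minimal dense-layer sketch of that weight-sharing trick, assuming nothing beyond numpy (function and argument names are illustrative):

import numpy as np

def tanh_layer(x, W, b, drop_rate, train, rng=np.random):
    if train:
        # training path: multiply the inputs by a Bernoulli keep-mask
        mask = rng.binomial(n=1, p=1. - drop_rate, size=x.shape)
        return np.tanh(np.dot(x * mask, W) + b)
    # test path: reuse the same W, b but rescale the input by (1 - drop_rate)
    return np.tanh(np.dot(x * (1. - drop_rate), W) + b)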
def train_cnn_encoder(datasets, word_embedding, input_width=64,
                      filter_hs=[3, 4, 5],
                      hidden_units=[100, 2],
                      dropout_rate=[0.5],
                      shuffle_batch=True,
                      n_epochs=100,
                      batch_size=50,
                      lr_decay=0.95,
                      activations=[ReLU],
                      sqr_norm_lim=9,
                      non_static=True):
    rng = np.random.RandomState(1234)
    input_height = len(datasets[0][0]) - 1
    filter_width = input_width
    feature_maps = hidden_units[0]
    filter_shapes = []
    pool_sizes = []
    for filter_h in filter_hs:
        filter_shapes.append((feature_maps, 1, filter_h, filter_width))
        pool_sizes.append((input_height-filter_h+1, input_width-filter_width+1))

    parameters = [("Input Shape", input_height, input_width),
                  ("Filter Shape", filter_shapes),
                  ("Pool Sizes", pool_sizes),
                  ("dropout rate", dropout_rate),
                  ("hidden units", hidden_units),
                  ("shuffle_batch", shuffle_batch),
                  ("n_epochs", n_epochs),
                  ("batch size", batch_size)]
    print parameters

    # construct the model
    index = T.iscalar()
    x = T.matrix("x")
    y = T.ivector("y")
    words = shared(value=word_embedding, name="embedding")

    zero_vector_tensor = T.vector()
    zero_vec = np.zeros(input_width)
    set_zero = function([zero_vector_tensor], updates=[(words, T.set_subtensor(words[0,:], zero_vector_tensor))])

    layer0_input = words[T.cast(x.flatten(), dtype="int32")].reshape((x.shape[0],1,x.shape[1],words.shape[1]))

    conv_layers = []
    layer1_inputs = []
    for i in xrange(len(filter_hs)):
        filter_shape = filter_shapes[i]
        pool_size = pool_sizes[i]
        conv_layer = nn.ConvPoolLayer(rng, input=layer0_input,
            input_shape=(batch_size, 1, input_height, input_width),
            filter_shape=filter_shape,
            pool_size=pool_size, activation=ReLU)
        layer1_input = conv_layer.output.flatten(2)
        conv_layers.append(conv_layer)
        layer1_inputs.append(layer1_input)

    layer1_input = T.concatenate(layer1_inputs, 1)

    hidden_units[0] = feature_maps * len(filter_hs)

    classifier = nn.MLPDropout(rng,
        input=layer1_input,
        layer_sizes=hidden_units,
        dropout_rates=dropout_rate,
        activations=activations)

    params = classifier.params
    for conv_layer in conv_layers:
        params += conv_layer.params

    if non_static:
        params.append(words)


    cost = classifier.negative_log_likelihood(y)
    dropout_cost = classifier.dropout_negative_log_likelihood(y)

    grad_updates = sgd_updates_adadelta(params, dropout_cost, lr_decay, 1e-6, sqr_norm_lim)

    np.random.seed(1234)
    if datasets[0].shape[0] % batch_size > 0:
        extra_data_num = batch_size - datasets[0].shape[0] % batch_size
        train_set = np.random.permutation(datasets[0])
        extra_data = train_set[:extra_data_num]
        new_data = np.append(datasets[0], extra_data, axis=0)
    else:
        new_data = datasets[0]

    new_data = np.random.permutation(new_data)
    n_batches = new_data.shape[0]/batch_size
    n_train_batches = int(np.round(n_batches*0.9))

    # extract the test set and divide the train data into train/val sets
    test_set_x = datasets[1][:,:input_height]
    test_set_y = np.asarray(datasets[1][:,-1], "int32")

    train_set = new_data[:n_train_batches*batch_size,:]
    val_set = new_data[n_train_batches*batch_size:,:]
    print train_set[:,-1]
    train_set_x, train_set_y = shared_dataset((train_set[:,:input_height],train_set[:,-1]))
    val_set_x, val_set_y = shared_dataset((val_set[:,:input_height],val_set[:,-1]))

    n_val_batches = n_batches - n_train_batches
    val_model = function([index], classifier.errors(y),
        givens={
            x: val_set_x[index * batch_size: (index + 1) * batch_size],
            y: val_set_y[index * batch_size: (index + 1) * batch_size]
        })

    test_model = function([index], classifier.errors(y),
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        })

    train_model = function([index], cost, updates=grad_updates,
        givens={
            x: train_set_x[index*batch_size:(index+1)*batch_size],
            y: train_set_y[index*batch_size:(index+1)*batch_size]
        })

    test_pred_layers = []
    test_size = test_set_x.shape[0]
    test_layer0_input = words[T.cast(x.flatten(), dtype="int32")].reshape((test_size, 1, input_height, input_width))
    for conv_layer in conv_layers:
        test_layer0_output = conv_layer.predict(test_layer0_input, test_size)
        test_pred_layers.append(test_layer0_output.flatten(2))

    test_layer1_input = T.concatenate(test_pred_layers, 1)
    test_y_pred = classifier.predict(test_layer1_input)
    test_error = T.mean(T.neq(test_y_pred, y))
    test_model_all = function([x, y], test_error)

    # start training the model
    print "Start training the model...."
    epoch = 0
    best_val_perf = 0
    val_perf = 0
    cost_epoch = 0
    while(epoch < n_epochs):
        epoch += 1
        if shuffle_batch:
            for minibatch_index in np.random.permutation(range(n_train_batches)):
                print minibatch_index
                cost_epoch = train_model(minibatch_index)
                set_zero(zero_vec)
        else:
            for minibatch_index in xrange(n_train_batches):
                cost_epoch = train_model(minibatch_index)
                set_zero(zero_vec)
        train_losses = [test_model(i) for i in xrange(n_train_batches)]
        train_perf = 1 - np.mean(train_losses)
        
        val_losses = [val_model(i) for i in xrange(n_val_batches)]
        val_perf = 1 - np.mean(val_losses)
        print('epoch %i, train perf %f %%, val perf %f' % (epoch, train_perf * 100., val_perf*100.))

        if val_perf >= best_val_perf:
            best_val_perf = val_perf
            test_losses = test_model_all(test_set_x, test_set_y)
            test_perf = 1 - test_losses
            print "Test Performance %f under Current Best Valid perf %f" % (test_perf, val_perf)

    return test_perf
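shared_dataset is called above but not shown; a minimal version in the style of the Theano deep-learning tutorials, matching the inline shared/T.cast pattern used earlier in this file, would look like this (treat it as an assumption, not the repo's actual helper):

import numpy as np
import theano
import theano.tensor as T

def shared_dataset(data_xy, borrow=True):
    data_x, data_y = data_xy
    shared_x = theano.shared(np.asarray(data_x, dtype=theano.config.floatX),
                             borrow=borrow)
    shared_y = theano.shared(np.asarray(data_y, dtype=theano.config.floatX),
                             borrow=borrow)
    # labels live on the GPU as floats and are cast back to int32 when used
    return shared_x, T.cast(shared_y, 'int32')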
Example #10
def run_cnn(exp_name,
        dataset, embedding,
        log_fn, perf_fn,
        emb_dm=100,
        batch_size=100,
        filter_hs=[1, 2, 3],
        hidden_units=[200, 100, 11],
        dropout_rate=0.5,
        shuffle_batch=True,
        n_epochs=300,
        lr_decay=0.95,
        activation=ReLU,
        sqr_norm_lim=9,
        non_static=True,
        print_freq=5):
    """
    Train and Evaluate CNN event encoder model
    :dataset: list containing three elements [(train_x, train_y),
            (valid_x, valid_y), (test_x, test_y)]
    :embedding: word embedding with shape (|V| * emb_dm)
    :filter_hs: filter height for each parallel CNN layer
    :dropout_rate: dropout rate for the fully connected layers
    :n_epochs: the maximum number of iterations
    
    """
    start_time = timeit.default_timer()
    rng = np.random.RandomState(1234)
   
    input_height = len(dataset[0][0][0][0]) # number of words in the sentences
    num_sens = len(dataset[0][0][0]) # number of sentences
    print "--input height ", input_height 
    input_width = emb_dm
    num_maps = hidden_units[0]

    ###################
    # start snippet 1 #
    ###################
    print "start to construct the model ...."
    x = T.tensor3("x")
    y = T.ivector("y")

    words = shared(value=np.asarray(embedding,
        dtype=theano.config.floatX), 
        name="embedding", borrow=True)

    # define function to keep padding vector as zero
    zero_vector_tensor = T.vector()
    zero_vec = np.zeros(input_width, dtype=theano.config.floatX)
    set_zero = function([zero_vector_tensor],
            updates=[(words, T.set_subtensor(words[0,:], zero_vector_tensor))])

    # the input for the sentence level conv layers
    layer0_input = words[T.cast(x.flatten(), dtype="int32")].reshape((
        x.shape[0] * x.shape[1], 1, x.shape[2], emb_dm
        ))

    conv_layers = []
    
    filter_shape = (num_maps, 1, filter_hs[0], emb_dm)
    pool_size = (input_height - filter_hs[0] + 1, 1)
    conv_layer = nn.ConvPoolLayer(rng, input=layer0_input, 
            input_shape=None, 
            filter_shape=filter_shape, 
            pool_size=pool_size, activation=activation)
        
    sen_vecs = conv_layer.output.reshape((x.shape[0] * x.shape[1], num_maps)) 
    conv_layers.append(conv_layer)

    # compute preactivations for each sentence
    layer_sizes = zip(hidden_units, hidden_units[1:])
    full_layer_input = sen_vecs
    dropout_input = sen_vecs
    hidden_outs = []
    drophidden_outs = []
    hidden_layers = []
    dropout_layers = []
    droprate = 0.5
    for lay_size in layer_sizes[:-1]:
        U_value = np.random.random(lay_size).astype(theano.config.floatX)
        b_value = np.zeros((lay_size[1],), dtype=theano.config.floatX)
        U = theano.shared(U_value, borrow=True, name="U")
        b = theano.shared(b_value, borrow=True, name="b")
        hiddenLayer = nn.HiddenLayer(rng, full_layer_input, lay_size[0], lay_size[1], ReLU, U * (1 - droprate), b)
        dropHiddenLayer = nn.DropoutHiddenLayer(rng, dropout_input, lay_size[0], lay_size[1], ReLU, droprate, U, b)

        hidden_layers.append(hiddenLayer)
        dropout_layers.append(dropHiddenLayer)

        hidden_out = hiddenLayer.output
        drophidden_out = dropHiddenLayer.output
        
        hidden_outs.append(hidden_out)
        drophidden_outs.append(drophidden_out)

        full_layer_input = hidden_out
        dropout_input = drophidden_out

    
    # output layer: compute the score for each class
    n_in, n_out = layer_sizes[-1]
    W_value = np.random.random((n_in, n_out)).astype(theano.config.floatX)
    b_value = np.zeros((n_out,), dtype=theano.config.floatX)
    W = theano.shared(W_value, borrow=True, name="logis_W")
    b = theano.shared(b_value, borrow=True, name="logis_b")

    full_act = T.dot(hidden_outs[-1], W*(1 - droprate)) + b
    dropout_act = nn.dropout_from_layer(rng, T.dot(drophidden_outs[-1], W) + b, droprate)
    
    # compute the probability
    sen_full_probs = T.nnet.softmax(full_act)
    sen_dropout_probs = T.nnet.softmax(dropout_act)
    # compute the sentence similarity
    sen_sen = T.dot(sen_vecs, sen_vecs.T)
    sen_sqr = T.sum(sen_vecs ** 2, axis=1)
    sen_sqr_left = sen_sqr.dimshuffle(0, 'x')
    sen_sqr_right = sen_sqr.dimshuffle('x', 0)
    sen_smi_matrix = sen_sqr_left - 2 * sen_sen + sen_sqr_right
    sen_smi_matrix = T.exp(-1 * sen_smi_matrix)
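    # (note) sen_smi_matrix is an RBF-style similarity: for row vectors a, b
    # the quantity ||a||^2 - 2*a.b + ||b||^2 equals ||a - b||^2, so
    # sen_smi_matrix[i, j] = exp(-||s_i - s_j||^2)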

    # compute the delta between sentence probabilities
    prob_prob_full = T.dot(sen_full_probs, sen_full_probs.T)
    prob_sqr_full = T.sum(sen_full_probs ** 2, axis=1)
    prob_sqr_left_full = prob_sqr_full.dimshuffle(0, 'x')
    prob_sqr_right_full = prob_sqr_full.dimshuffle('x', 0)
    prob_delta_full = prob_sqr_left_full - 2 * prob_prob_full + prob_sqr_right_full
    sen_cost_full = T.sum(sen_smi_matrix * prob_delta_full)
    
    prob_prob_drop = T.dot(sen_dropout_probs, sen_dropout_probs.T)
    prob_sqr_drop = T.sum(sen_dropout_probs ** 2, axis=1)
    prob_sqr_left_drop = prob_sqr_drop.dimshuffle(0, 'x')
    prob_sqr_right_drop = prob_sqr_drop.dimshuffle('x', 0)
    prob_delta_drop = prob_sqr_left_drop - 2 * prob_prob_drop + prob_sqr_right_drop
    sen_cost_drop = T.sum(sen_smi_matrix * prob_delta_drop)

    # transform the sen probs to doc probs
    # by using average probs
    doc_full_probs = sen_full_probs.reshape((x.shape[0], x.shape[1], n_out))
    doc_full_probs = T.mean(doc_full_probs, axis=1)
    doc_dropout_probs = sen_dropout_probs.reshape((x.shape[0], x.shape[1], n_out))
    doc_dropout_probs = T.mean(doc_dropout_probs, axis=1)

    doc_full_y_pred = T.argmax(doc_full_probs, axis=1)
    doc_dropout_y_pred = T.argmax(doc_dropout_probs, axis=1)
    
    full_negative_likelihood = T.sum(-T.log(doc_full_probs)[T.arange(y.shape[0]), y])
    dropout_negative_likelihood = T.sum(-T.log(doc_dropout_probs)[T.arange(y.shape[0]), y])

    full_errors = T.mean(T.neq(doc_full_y_pred, y))

    gamma = 2
    full_cost = full_negative_likelihood + gamma * sen_cost_full
    dropout_cost = dropout_negative_likelihood + gamma * sen_cost_drop
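    # (note) with prob_delta[i, j] = ||p_i - p_j||^2 this adds a smoothness
    # penalty: NLL + gamma * sum_ij exp(-||s_i - s_j||^2) * ||p_i - p_j||^2,
    # pushing similar sentences toward similar predicted distributions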
    
    params = []
    for conv_layer in conv_layers:
        params += conv_layer.params

    for dropout_layer in dropout_layers:
        params += dropout_layer.params

    params.append(W)
    params.append(b)
    
    if non_static:
        params.append(words)

    grad_updates = sgd_updates_adadelta(params,
            dropout_cost,
            lr_decay,
            1e-6,
            sqr_norm_lim)


    #####################
    # Construct Dataset #
    #####################
    print "Copy data to GPU and constrct train/valid/test func"
    np.random.seed(1234)
    
    train_x, train_y = shared_dataset(dataset[0])
    valid_x, valid_y = shared_dataset(dataset[1])
    test_x, test_y = shared_dataset(dataset[2])

    n_train_batches = int(np.ceil(1.0 * len(dataset[0][0]) / batch_size))
    n_valid_batches = int(np.ceil(1.0 * len(dataset[1][0]) / batch_size))
    n_test_batches = int(np.ceil(1.0 * len(dataset[2][0]) / batch_size))

    #####################
    # Train model func #
    #####################
    index = T.iscalar()
    train_func = function([index], full_cost, updates=grad_updates,
            givens={
                x: train_x[index*batch_size:(index+1)*batch_size],
                y: train_y[index*batch_size:(index+1)*batch_size]
                })

    train_error = function([index], full_errors,
            givens={
                x: train_x[index*batch_size:(index+1)*batch_size],
                y: train_y[index*batch_size:(index+1)*batch_size]
                })
    
    valid_train_func = function([index], [full_negative_likelihood, sen_cost_full], updates=grad_updates,
            givens={
                x: valid_x[index*batch_size:(index+1)*batch_size],
                y: valid_y[index*batch_size:(index+1)*batch_size]
                })

    test_pred = function([index], doc_full_y_pred,
            givens={
                x:test_x[index*batch_size:(index+1)*batch_size],
                })
    
    
    # apply early-stopping strategy
    patience = 100
    patience_increase = 2
    improvement_threshold = 1.005
    
    n_valid = len(dataset[1][0])
    n_test = len(dataset[2][0])

    epoch = 0
    best_params = None
    best_validation_score = 0.
    test_perf = 0

    done_loop = False
    
    log_file = open(log_fn, 'w')

    print "Start to train the model....."
    cpu_trn_y = np.asarray(dataset[0][1])
    cpu_val_y = np.asarray(dataset[1][1])
    cpu_tst_y = np.asarray(dataset[2][1])

    def compute_score(true_list, pred_list):
        mat = np.equal(true_list, pred_list)
        score = np.mean(mat)
        return score
    
    best_test_score = 0.
    while (epoch < n_epochs) and not done_loop:
        start_time = timeit.default_timer()
        epoch += 1
        costs = []
        
        for minibatch_index in np.random.permutation(range(n_train_batches)):
            cost_epoch = train_func(minibatch_index)
            costs.append(cost_epoch)
            set_zero(zero_vec)

        # do validation
        valid_cost, valid_sen_cost = zip(*[valid_train_func(i) for i in np.random.permutation(xrange(n_valid_batches))])
        if epoch % print_freq == 0:
            # do test
            test_preds = np.concatenate([test_pred(i) for i in xrange(n_test_batches)])
            test_score = compute_score(cpu_tst_y, test_preds)
            
            with open(os.path.join(perf_fn, "%s_%d.pred" % (exp_name, epoch)), 'w') as epf:
                for p in test_preds:
                    epf.write("%d\n" % int(p))
                message = "Epoch %d test perf %f train cost %f, valid_sen_cost %f, valid_doc_cost %f" % (epoch, test_score, np.mean(costs), np.mean(valid_sen_cost), np.mean(valid_cost))


            print message
            log_file.write(message + "\n")
            log_file.flush()
            """
            # store the best model
            if (test_score > best_test_score) or (epoch % 25 == 0):
                best_test_score = test_score
                # save the model
                model_name = "%s_%d.model" % (exp_name, epoch)
                with open(model_name, 'wb') as bm:
                    for p in params:
                        cPickle.dump(p.get_value(), bm)
            """

        end_time = timeit.default_timer()
        print "Finish one iteration using %f m" % ((end_time - start_time)/60.)

    log_file.flush()
    log_file.close()
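The document-level cost in the snippet above averages sentence softmax outputs per document before taking the negative log-likelihood; here is a plain-numpy restatement (function and argument names are illustrative only):

import numpy as np

def doc_nll_from_sentence_probs(sen_probs, n_docs, n_sens, y):
    # sen_probs: (n_docs * n_sens, n_classes) sentence-level softmax outputs
    # y: (n_docs,) gold document labels
    doc_probs = sen_probs.reshape(n_docs, n_sens, -1).mean(axis=1)
    return -np.sum(np.log(doc_probs[np.arange(len(y)), y]))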
Example #11
    def run_experiment(self, dataset, word_embedding, exp_name):
        
        # load parameters
        num_maps_word = self.options["num_maps_word"]
        drop_rate_word = self.options["drop_rate_word"]
        drop_rate_sentence = self.options["drop_rate_sentence"]
        word_window = self.options["word_window"]
        word_dim = self.options["word_dim"]
        k_max_word = self.options["k_max_word"]
        batch_size = self.options["batch_size"]
        rho = self.options["rho"]
        epsilon = self.options["epsilon"]
        norm_lim = self.options["norm_lim"]
        max_iteration = self.options["max_iteration"]
        k = self.options["k_max"]

        sentence_len = len(dataset[0][0][0][0])
        sentence_num = len(dataset[0][0][0])
        
        # define the parameters
        x = T.tensor3("x")
        y = T.ivector("y")
        rng = np.random.RandomState(1234)
        
        words = theano.shared(value=np.asarray(word_embedding,
            dtype=theano.config.floatX),
            name="embedding", borrow=True)
        zero_vector_tensor = T.vector() 
        zero_vec = np.zeros(word_dim, dtype=theano.config.floatX)
        set_zero = theano.function([zero_vector_tensor], updates=[(words, T.set_subtensor(words[0,:], zero_vector_tensor))])

        x_emb = words[T.cast(x.flatten(), dtype="int32")].reshape((x.shape[0]*x.shape[1], 1, x.shape[2], words.shape[1]))

        dropout_x_emb = nn.dropout_from_layer(rng, x_emb, drop_rate_word)

        # compute convolution on words layer
        word_filter_shape = (num_maps_word, 1, word_window, word_dim)
        word_pool_size = (sentence_len - word_window + 1, 1)
        dropout_word_conv = nn.ConvPoolLayer(rng, 
                input=dropout_x_emb,
                input_shape=None,
                filter_shape=word_filter_shape,
                pool_size=word_pool_size,
                activation=Tanh,
                k=k_max_word)
        sent_vec_dim = num_maps_word*k_max_word
        dropout_sent_vec = dropout_word_conv.output.reshape((x.shape[0] * x.shape[1], sent_vec_dim))

        word_conv = nn.ConvPoolLayer(rng, 
                input=dropout_x_emb*(1 - drop_rate_word),
                input_shape=None,
                filter_shape=word_filter_shape,
                pool_size=word_pool_size,
                activation=Tanh,
                k=k_max_word,
                W=dropout_word_conv.W,
                b=dropout_word_conv.b)
        sent_vec = word_conv.output.reshape((x.shape[0] * x.shape[1], sent_vec_dim))
        
        theta_value = np.random.random((sent_vec_dim,1))
        theta = shared(value=np.asarray(theta_value, dtype=theano.config.floatX), name="theta", borrow=True)
        weighted_drop_sent_vec, weighted_sen_score = keep_max(dropout_sent_vec.reshape((x.shape[0], 1, x.shape[1], sent_vec_dim)), theta, k)
        drop_doc_vec = T.sum(weighted_drop_sent_vec, axis=2).flatten(2)
        
        weighted_sent_vec, sen_score = keep_max(sent_vec.reshape((x.shape[0], 1, x.shape[1], sent_vec_dim)), theta, k)
        doc_vec = T.sum(weighted_sent_vec, axis=2).flatten(2)
        # we need to constrain the number of positive sentences in positive
        

        
        # collect parameters
        self.params.append(words)
        self.params += dropout_word_conv.params
        self.params.append(sen_W)
        self.params.append(sen_b)
        
        grad_updates = nn.sgd_updates_adadelta(self.params,
                drop_cost,
                rho,
                epsilon,
                norm_lim)

        # construct the dataset
        train_x, train_y = nn.shared_dataset(dataset[0])
        test_x, test_y = nn.shared_dataset(dataset[1])
        test_cpu_y = dataset[1][1]

        n_train_batches = int(np.ceil(1.0 * len(dataset[0][0]) / batch_size))
        n_test_batches = int(np.ceil(1.0 * len(dataset[1][0]) / batch_size))

        # construct the model
        index = T.iscalar()
        train_func = theano.function([index], [drop_cost, drop_bag_cost, drop_sent_cost, penal_cost], updates=grad_updates,
                givens={
                    x: train_x[index*batch_size:(index+1)*batch_size],
                    y: train_y[index*batch_size:(index+1)*batch_size]
                    })

        test_func = theano.function([index], doc_preds,
                givens={
                    x:test_x[index*batch_size:(index+1)*batch_size]
                    })

        get_train_sent_prob = theano.function([index], sent_prob,
                givens={
                    x:train_x[index*batch_size:(index+1)*batch_size]
                    })

        get_test_sent_prob = theano.function([index], sent_prob,
                givens={
                    x:test_x[index*batch_size:(index+1)*batch_size]
                    })

        epoch = 0
        best_score = 0
        raw_train_x = dataset[0][0]
        raw_test_x = dataset[1][0]
        # get the number of sentences in each document
        number_train_sens = []
        number_test_sens = []


        log_file = open("./log/%s.log" % exp_name, 'w')

        while epoch <= max_iteration:
            start_time = timeit.default_timer()
            epoch += 1
            costs = []

            for minibatch_index in np.random.permutation(range(n_train_batches)):
                cost_epoch = train_func(minibatch_index)
                costs.append(cost_epoch)
                set_zero(zero_vec)

            total_train_cost, train_bag_cost, train_sent_cost, train_penal_cost = zip(*costs)
            print "Iteration %d, total_cost %f bag_cost %f sent_cost %f penal_cost %f\n" %  (epoch, np.mean(total_train_cost), np.mean(train_bag_cost), np.mean(train_sent_cost), np.mean(train_penal_cost))

            if epoch % 5 == 0:
                test_preds = []
                for i in xrange(n_test_batches):
                    test_y_pred = test_func(i)
                    test_preds.append(test_y_pred)
                test_preds = np.concatenate(test_preds)
                test_score = 1 - np.mean(np.not_equal(test_cpu_y, test_preds))

                precision, recall, beta, support = precision_recall_fscore_support(test_cpu_y, test_preds, pos_label=1)

                if test_score > best_score:
                    best_score = test_score
                    # save the sentence vectors
                    train_sens = [get_train_sent_prob(i) for i in range(n_train_batches)]
                    test_sens = [get_test_sent_prob(i) for i in range(n_test_batches)]

                    train_sens = np.concatenate(train_sens, axis=0)
                    test_sens = np.concatenate(test_sens, axis=0)

                    out_train_sent_file = "./results/%s_train_sent.vec" % exp_name
                    out_test_sent_file = "./results/%s_test_sent.vec" % exp_name

                    with open(out_train_sent_file, 'w') as train_f, open(out_test_sent_file, 'w') as test_f:
                        cPickle.dump(train_sens, train_f)
                        cPickle.dump(test_sens, test_f)
                    print "Get best performace at %d iteration %f" % (epoch, test_score)
                    log_file.write("Get best performance at %d iteration %f \n" % (epoch, test_score))

                end_time = timeit.default_timer()
                print "Iteration %d , precision, recall, support" % epoch, precision, recall, support
                log_file.write("Iteration %d, neg precision %f, pos precision %f, neg recall %f pos recall %f , total_cost %f bag_cost %f sent_cost %f penal_cost %f\n" % (epoch, precision[0], precision[1], recall[0], recall[1], np.mean(total_train_cost), np.mean(train_bag_cost), np.mean(train_sent_cost), np.mean(train_penal_cost)))
                print "Using time %f m" % ((end_time -start_time)/60.)
                log_file.write("Uing time %f m\n" % ((end_time - start_time)/60.))
            end_time = timeit.default_timer()
            print "Iteration %d Using time %f m" % ( epoch, (end_time -start_time)/60.)
            log_file.write("Uing time %f m\n" % ((end_time - start_time)/60.))
            log_file.flush()

        log_file.close()
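The precision[0]/precision[1] indexing above relies on sklearn's precision_recall_fscore_support returning one entry per class when no averaging is requested; a small standalone check of that behaviour:

import numpy as np
from sklearn.metrics import precision_recall_fscore_support

y_true = np.array([0, 0, 1, 1, 1])
y_pred = np.array([0, 1, 1, 1, 0])
# with average=None (the default) each returned array has one entry per class
precision, recall, fscore, support = precision_recall_fscore_support(y_true, y_pred)
print "neg/pos precision:", precision[0], precision[1]
print "neg/pos recall:   ", recall[0], recall[1]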
Example #12
def run_cnn(exp_name,
            dataset,
            embedding,
            log_fn,
            perf_fn,
            emb_dm=100,
            batch_size=100,
            filter_hs=[1, 2, 3],
            hidden_units=[200, 100, 11],
            dropout_rate=0.5,
            shuffle_batch=True,
            n_epochs=300,
            lr_decay=0.95,
            activation=ReLU,
            sqr_norm_lim=9,
            non_static=True):
    """
    Train and Evaluate CNN event encoder model
    :dataset: list containing three elements [(train_x, train_y),
            (valid_x, valid_y), (test_x, test_y)]
    :embedding: word embedding with shape (|V| * emb_dm)
    :filter_hs: filter height for each parallel CNN layer
    :dropout_rate: dropout rate for the fully connected layers
    :n_epochs: the maximum number of iterations
    
    """
    start_time = timeit.default_timer()
    rng = np.random.RandomState(1234)

    input_height = len(dataset[0][0][0][0])
    num_sens = len(dataset[0][0][0])
    print "--input height ", input_height
    input_width = emb_dm
    num_maps = hidden_units[0]

    ###################
    # start snippet 1 #
    ###################
    print "start to construct the model ...."
    x = T.tensor3("x")
    y = T.matrix("y")

    words = shared(value=np.asarray(embedding, dtype=theano.config.floatX),
                   name="embedding",
                   borrow=True)

    # define function to keep padding vector as zero
    zero_vector_tensor = T.vector()
    zero_vec = np.zeros(input_width, dtype=theano.config.floatX)
    set_zero = function([zero_vector_tensor],
                        updates=[(words,
                                  T.set_subtensor(words[0, :],
                                                  zero_vector_tensor))])

    layer0_input = words[T.cast(x.flatten(), dtype="int32")].reshape(
        (x.shape[0] * x.shape[1], 1, x.shape[2], emb_dm))

    conv_layers = []
    layer1_inputs = []

    for i in xrange(len(filter_hs)):
        filter_shape = (num_maps, 1, filter_hs[i], emb_dm)
        pool_size = (input_height - filter_hs[i] + 1, 1)
        conv_layer = nn.ConvPoolLayer(rng,
                                      input=layer0_input,
                                      input_shape=None,
                                      filter_shape=filter_shape,
                                      pool_size=pool_size,
                                      activation=activation)
        sen_vecs = conv_layer.output.reshape(
            (x.shape[0], x.shape[1], num_maps))
        sen_vecs = sen_vecs.dimshuffle(0, 2, 1)
        doc_vec = T.sum(sen_vecs, axis=2).flatten(2)
        layer1_inputs.append(doc_vec)
        conv_layers.append(conv_layer)

    layer1_input = T.concatenate(layer1_inputs, 1)
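    # Each filter-size branch sums its sentence vectors over the sentence axis
    # into one document vector, and the per-branch document vectors are
    # concatenated, so the classifier input width is num_maps * len(filter_hs)
    # (hence hidden_units[0] is overwritten just below).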

    ####################
    # Task: population #
    ####################
    print "Construct classifier ...."
    hidden_units[0] = num_maps * len(filter_hs)
    pop_factor = nn.MLDropout(
        rng,
        input=layer1_input,
        layer_sizes=hidden_units,
        dropout_rates=[dropout_rate for i in range(len(hidden_units) - 1)],
        activations=[activation for i in range(len(hidden_units) - 1)])
    pop_factor_output = pop_factor.output.dimshuffle(0, 1, 'x')
    pop_factor_dropout_output = pop_factor.dropout_output.dimshuffle(0, 1, 'x')

    ################
    # Task: type   #
    ################
    type_hidden_units = [num for num in hidden_units]
    type_hidden_units[-1] = 5
    type_factor = nn.MLDropout(
        rng,
        input=layer1_input,
        layer_sizes=type_hidden_units,
        dropout_rates=[
            dropout_rate for i in range(len(type_hidden_units) - 1)
        ],
        activations=[activation for i in range(len(type_hidden_units) - 1)])
    type_factor_output = type_factor.output.dimshuffle(0, 'x', 1)
    type_factor_dropout_output = type_factor.dropout_output.dimshuffle(
        0, 'x', 1)

    ######################
    ## Joint Y matrix   ##
    ######################
    # construct V matrix to model pop type dependency
    V_value = np.random.random((hidden_units[-1], type_hidden_units[-1]))
    V = theano.shared(value=np.asarray(V_value, dtype=theano.config.floatX),
                      name="V",
                      borrow=True)

    # compute the joint probability
    joint_act = T.batched_dot(pop_factor_output, type_factor_output) + V
    joint_act_dropout = T.batched_dot(pop_factor_dropout_output,
                                      type_factor_dropout_output) + V

    joint_probs = T.nnet.softmax(joint_act.flatten(2))
    joint_probs_dropout = T.nnet.softmax(joint_act_dropout.flatten(2))

    neg_likelihood = -T.mean(T.log(T.sum(joint_probs * y, axis=1)))
    neg_likelihood_dropout = -T.mean(
        T.log(T.sum(joint_probs_dropout * y, axis=1)))
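    # The population factor is shaped (batch, n_pop, 1) and the type factor
    # (batch, 1, n_type), so batched_dot produces a per-example outer product
    # of scores; adding the shared bias V and flattening to
    # (batch, n_pop * n_type) before the softmax yields a single distribution
    # over all joint (population, type) cells, which is why y is a one-hot
    # matrix over the n_pop * n_type joint labels.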

    joint_preds = T.argmax(joint_probs, axis=1)
    pop_preds = joint_preds // type_hidden_units[-1]
    type_preds = joint_preds % type_hidden_units[-1]

    y_index = T.argmax(y, axis=1)
    pop_y = y_index // type_hidden_units[-1]
    type_y = y_index % type_hidden_units[-1]
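    # The joint label index is assumed to be laid out row-major as
    # pop * type_hidden_units[-1] + type, so integer division and modulo
    # recover the two components: e.g. with 5 types, joint index 13 gives
    # pop = 13 // 5 = 2 and type = 13 % 5 = 3.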

    pop_error = T.mean(T.neq(pop_preds, pop_y))
    type_error = T.mean(T.neq(type_preds, type_y))

    params = pop_factor.params
    params += type_factor.params
    params.append(V)

    for conv_layer in conv_layers:
        params += conv_layer.params

    if non_static:
        params.append(words)

    grad_updates = sgd_updates_adadelta(params, neg_likelihood_dropout,
                                        lr_decay, 1e-6, sqr_norm_lim)
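    # sgd_updates_adadelta (defined elsewhere in this module) is driven by the
    # dropout cost; lr_decay is the adadelta decay rate, 1e-6 the epsilon, and
    # sqr_norm_lim presumably caps the squared column norms of the weight
    # matrices after each update.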

    #####################
    # Construct Dataset #
    #####################
    print "Copy data to GPU and constrct train/valid/test func"
    np.random.seed(1234)

    train_x, train_y = shared_dataset(dataset[0])
    test_x, test_y = shared_dataset(dataset[1])

    n_train_batches = int(np.ceil(1.0 * len(dataset[0][0]) / batch_size))
    n_test_batches = int(np.ceil(1.0 * len(dataset[1][0]) / batch_size))

    #####################
    # Train model func #
    #####################
    index = T.iscalar()
    train_func = function(
        [index],
        neg_likelihood_dropout,
        updates=grad_updates,
        givens={
            x: train_x[index * batch_size:(index + 1) * batch_size],
            y: train_y[index * batch_size:(index + 1) * batch_size]
        })

    test_pred = function(
        [index], [pop_error, type_error],
        givens={
            x: test_x[index * batch_size:(index + 1) * batch_size],
            y: test_y[index * batch_size:(index + 1) * batch_size]
        })

    # apply early stop strategy
    patience = 100
    patience_increase = 2
    improvement_threshold = 1.005
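    # Note: the patience / early-stopping variables above are initialised but
    # never consulted in this variant; done_loop is never set to True, so the
    # loop below always runs the full n_epochs.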

    n_test = len(dataset[1][0])

    epoch = 0
    best_params = None
    best_validation_score = 0.
    test_perf = 0

    done_loop = False

    log_file = open(log_fn, 'a')

    while (epoch < n_epochs) and not done_loop:
        start_time = timeit.default_timer()
        epoch += 1
        costs = []
        for minibatch_index in np.random.permutation(range(n_train_batches)):
            cost_epoch = train_func(minibatch_index)
            costs.append(cost_epoch)
            set_zero(zero_vec)

        if epoch % 5 == 0:
            # do test
            test_pop_errors = []
            test_type_errors = []
            for i in xrange(n_test_batches):
                test_pop_error, test_type_error = test_pred(i)
                test_pop_errors.append(test_pop_error)
                test_type_errors.append(test_type_error)

            test_pop_score = 1 - np.mean(test_pop_errors)
            test_type_score = 1 - np.mean(test_type_errors)

            message = "Epoch %d test pop perf %f, type perf %f" % (
                epoch, test_pop_score, test_type_score)
            print message
            log_file.write(message + "\n")
            log_file.flush()

        end_time = timeit.default_timer()
        print "Finish one iteration using %f m" % (
            (end_time - start_time) / 60.)

    log_file.flush()
    log_file.close()
def run_cnn(exp_name,
            dataset,
            embedding,
            log_fn,
            perf_fn,
            emb_dm=100,
            batch_size=100,
            filter_hs=[1, 2, 3],
            hidden_units=[200, 100, 11],
            dropout_rate=0.5,
            shuffle_batch=True,
            n_epochs=300,
            lr_decay=0.95,
            activation=ReLU,
            sqr_norm_lim=9,
            non_static=True,
            alpha=0.0001):
    """
    Train and Evaluate CNN event encoder model
    :dataset: list containing three elements[(train_x, train_y), 
            (valid_x, valid_y), (test_x, test_y)]
    :embedding: word embedding with shape (|V| * emb_dm)
    :filter_hs: filter height for each parallel cnn layer
    :dropout_rate: dropout rate for fully connected layers
    :n_epochs: the max number of iterations
    
    """
    start_time = timeit.default_timer()

    input_height = len(dataset[0][0][0])
    print "--input height ", input_height
    input_width = emb_dm
    num_maps = hidden_units[0]

    ###################
    # start snippet 1 #
    ###################
    print "start to construct the model ...."
    word_x = T.matrix("word_x")
    freq_x = T.matrix("freq_x")
    pos_x = T.matrix("pos_x")

    y = T.ivector("y")

    words = shared(value=np.asarray(embedding, dtype=theano.config.floatX),
                   name="embedding",
                   borrow=True)

    sym_dim = 20
    # the frequency embedding is a 21 x sym_dim (21 x 20) matrix
    freq_val = np.random.random((21, sym_dim))
    freqs = shared(value=np.asarray(freq_val, dtype=theano.config.floatX),
                   borrow=True,
                   name="freqs")

    # the position embedding is a 31 x sym_dim (31 x 20) matrix
    poss_val = np.random.random((31, sym_dim))
    poss = shared(value=np.asarray(poss_val, dtype=theano.config.floatX),
                  borrow=True,
                  name="poss")

    # define function to keep padding vector as zero
    zero_vector_tensor = T.vector()
    zero_vec = np.zeros(input_width, dtype=theano.config.floatX)
    set_zero = function([zero_vector_tensor],
                        updates=[(words,
                                  T.set_subtensor(words[0, :],
                                                  zero_vector_tensor))])

    freq_zero_tensor = T.vector()
    freq_zero_vec = np.zeros(sym_dim, dtype=theano.config.floatX)
    freq_set_zero = function([freq_zero_tensor],
                             updates=[(freqs,
                                       T.set_subtensor(freqs[0, :],
                                                       freq_zero_tensor))])

    pos_zero_tensor = T.vector()
    pos_zero_vec = np.zeros(sym_dim, dtype=theano.config.floatX)
    pos_set_zero = function([pos_zero_tensor],
                            updates=[(poss,
                                      T.set_subtensor(poss[0, :],
                                                      pos_zero_tensor))])
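    # Row 0 of each lookup table (words, freqs, poss) is reserved for padding;
    # the three set-zero functions above are called after every training batch
    # to reset those rows so the padding symbol never accumulates updates.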

    word_x_emb = words[T.cast(word_x.flatten(), dtype="int32")].reshape(
        (word_x.shape[0], 1, word_x.shape[1], emb_dm))
    freq_x_emb = freqs[T.cast(freq_x.flatten(), dtype="int32")].reshape(
        (freq_x.shape[0], 1, freq_x.shape[1], sym_dim))
    pos_x_emb = poss[T.cast(pos_x.flatten(), dtype="int32")].reshape(
        (pos_x.shape[0], 1, pos_x.shape[1], sym_dim))

    layer0_input = T.concatenate([word_x_emb, freq_x_emb, pos_x_emb], axis=3)
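    # Word, frequency and position embeddings are concatenated along the
    # feature axis, so each token is represented by emb_dm + 2 * sym_dim
    # features; the convolution filter width below must match this.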

    conv_layers = []
    layer1_inputs = []
    rng = np.random.RandomState()
    for i in xrange(len(filter_hs)):
        filter_shape = (num_maps, 1, filter_hs[i], emb_dm + sym_dim + sym_dim)
        pool_size = (input_height - filter_hs[i] + 1, 1)
        conv_layer = nn.ConvPoolLayer(rng,
                                      input=layer0_input,
                                      input_shape=None,
                                      filter_shape=filter_shape,
                                      pool_size=pool_size,
                                      activation=activation)
        layer1_input = conv_layer.output.flatten(2)
        conv_layers.append(conv_layer)
        layer1_inputs.append(layer1_input)

    layer1_input = T.concatenate(layer1_inputs, 1)

    ##############
    # classifier #
    ##############
    print "Construct classifier ...."
    hidden_units[0] = num_maps * len(filter_hs)
    model = nn.MLPDropout(rng,
                          input=layer1_input,
                          layer_sizes=hidden_units,
                          dropout_rates=[dropout_rate],
                          activations=[activation])

    params = model.params
    for conv_layer in conv_layers:
        params += conv_layer.params

    params.append(words)
    params.append(freqs)
    params.append(poss)

    cost = model.negative_log_likelihood(y) + alpha * model.L2
    dropout_cost = model.dropout_negative_log_likelihood(y) + alpha * model.L2
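    # alpha scales the L2 penalty; model.L2 is assumed to be the summed
    # squared weights of the MLP layers.  The dropout cost drives the
    # parameter updates while the plain cost is what train_func reports.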

    grad_updates = sgd_updates_adadelta(params, dropout_cost, lr_decay, 1e-6,
                                        sqr_norm_lim)

    #####################
    # Construct Dataset #
    #####################
    print "Copy data to GPU and constrct train/valid/test func"

    train_word_x, train_freq_x, train_pos_x, train_y = shared_dataset(
        dataset[0])
    test_word_x, test_freq_x, test_pos_x, test_y = shared_dataset(dataset[1])

    n_train_batches = int(np.ceil(1.0 * len(dataset[0][0]) / batch_size))
    n_test_batches = int(np.ceil(1.0 * len(dataset[1][0]) / batch_size))

    #####################
    # Train model func #
    #####################
    index = T.iscalar()
    train_func = function(
        [index],
        cost,
        updates=grad_updates,
        givens={
            word_x: train_word_x[index * batch_size:(index + 1) * batch_size],
            freq_x: train_freq_x[index * batch_size:(index + 1) * batch_size],
            pos_x: train_pos_x[index * batch_size:(index + 1) * batch_size],
            y: train_y[index * batch_size:(index + 1) * batch_size]
        })

    test_pred = function(
        [index],
        model.preds,
        givens={
            word_x: test_word_x[index * batch_size:(index + 1) * batch_size],
            freq_x: test_freq_x[index * batch_size:(index + 1) * batch_size],
            pos_x: test_pos_x[index * batch_size:(index + 1) * batch_size]
        })

    # apply early stop strategy
    patience = 100
    patience_increase = 2
    improvement_threshold = 1.005

    n_test = len(dataset[1][0])

    epoch = 0
    best_params = None
    best_validation_score = 0.
    test_perf = 0

    done_loop = False

    log_file = open(log_fn, 'a')

    print "Start to train the model....."
    cpu_trn_y = np.asarray(dataset[0][3])
    cpu_tst_y = np.asarray(dataset[1][3])

    def compute_score(true_list, pred_list):
        mat = np.equal(true_list, pred_list)
        score = np.mean(mat)
        return score
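    # compute_score is plain accuracy: the mean of elementwise equality
    # between the gold and predicted label arrays.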

    while (epoch < n_epochs) and not done_loop:
        start_time = timeit.default_timer()
        epoch += 1
        costs = []
        for minibatch_index in np.random.permutation(range(n_train_batches)):
            cost_epoch = train_func(minibatch_index)
            costs.append(cost_epoch)
            set_zero(zero_vec)
            freq_set_zero(freq_zero_vec)
            pos_set_zero(pos_zero_vec)

        if epoch % 5 == 0:
            # do test
            test_preds = np.concatenate(
                [test_pred(i) for i in xrange(n_test_batches)])
            test_score = compute_score(cpu_tst_y, test_preds)

            with open(os.path.join(perf_fn, "%s_%d.pred" % (exp_name, epoch)),
                      'w') as epf:
                for p in test_preds:
                    epf.write("%d\n" % int(p))
                message = "Epoch %d test perf %f with train cost %f" % (
                    epoch, test_score, np.mean(costs))
            print message
            log_file.write(message + "\n")
            log_file.flush()

        end_time = timeit.default_timer()
        print "Finish one iteration using %f m" % (
            (end_time - start_time) / 60.)

    log_file.flush()
    log_file.close()
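
# `shared_dataset` is referenced above but not defined in this snippet.  Below
# is a minimal sketch for the (word_x, freq_x, pos_x, y) layout used by the
# last run_cnn variant, assuming plain numpy inputs and the imports already in
# scope; the name, signature and return order are assumptions, not the
# original implementation.
def shared_dataset(data, borrow=True):
    # Store every array as floatX on the device; integer targets are cast
    # back to int32 symbolically (the standard Theano tutorial pattern) so
    # they can still be sliced inside `givens`.
    shared_vars = [theano.shared(np.asarray(d, dtype=theano.config.floatX),
                                 borrow=borrow) for d in data]
    word_x, freq_x, pos_x, y = shared_vars
    return word_x, freq_x, pos_x, T.cast(y, "int32")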