def feature_sel(data_):
    train_, valid_, test_ = data_helper.split_data(data_)
    x_sel = list(range(10))  # start with all ten candidate feature columns
    # Initial cross entropy.
    model_ = MLPClassifier()
    model_.fit(train_[:, x_sel], train_[:, LABEL_COL])
    prev_min_cros_entrpy = 100  # large sentinel so the first comparison passes
    min_cros_entrpy = cross_entropy(model_, valid_, x_sel)
    while (prev_min_cros_entrpy +
           (0.01 * prev_min_cros_entrpy)) > min_cros_entrpy:
        prev_min_cros_entrpy = min_cros_entrpy
        crs_entr_li = list()
        for feat in x_sel:
            train_features = [x for x in x_sel if x != feat]
            model_ = MLPClassifier()
            print('Training the model with feature(s): {}'.format(
                train_features))
            model_.fit(train_[:, train_features], train_[:, LABEL_COL])
            # print(model_.classes_)
            print('Calculating entropy')
            crs_entr_li.append(cross_entropy(model_, valid_, train_features))
        print('Entropies are: {}'.format(crs_entr_li))
        print('Minimum entropy is {} and is obtained for feature: {}'.format(
            min(crs_entr_li), x_sel[crs_entr_li.index(min(crs_entr_li))]))
        min_cros_entrpy = min(crs_entr_li)
        x_sel.pop(crs_entr_li.index(min(crs_entr_li)))
        print('Remaining features in this iteration: {}'.format(x_sel))
        print('Previous Entropy: {}, Current Entropy: {}'.format(
            prev_min_cros_entrpy, min_cros_entrpy))

    # Report the final results (entropy and AUC) for the selected features.
    entropy_auc(train_, valid_, test_, x_sel)
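The cross_entropy helper called above is defined elsewhere in the project; a minimal sketch of what such a helper typically computes, assuming valid_ is a 2-D array whose label sits in LABEL_COL (hypothetical implementation, not the project's own):

from sklearn.metrics import log_loss

def cross_entropy(model_, valid_, feature_cols):
    # mean cross-entropy (log loss) of the fitted model's predicted
    # probabilities on the validation split, restricted to feature_cols
    probs = model_.predict_proba(valid_[:, feature_cols])
    return log_loss(valid_[:, LABEL_COL], probs, labels=model_.classes_)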
Example #2
def train(config):
    print('parameters:')
    print(config)

    # load data
    print('load data')
    X, y = data_helper.process_data(config)  # X: list of token sequences, y: list of labels

    # make vocab
    print('make vocab...')
    word2index, label2index = data_helper.generate_vocab(X, y, config)

    # padding data
    print('padding data')
    input_x, input_y = data_helper.padding(X, y, config, word2index,
                                           label2index)

    # split data
    print('split data...')
    x_train, y_train, x_test, y_test, x_dev, y_dev = data_helper.split_data(
        input_x, input_y, config)

    print('length train: {}'.format(len(x_train)))
    print('length test: {}'.format(len(x_test)))
    print('length dev: {}'.format(len(x_dev)))

    print('training...')

    with tf.Graph().as_default():
        sess_config = tf.ConfigProto(
            allow_soft_placement=config['allow_soft_placement'],
            log_device_placement=config['log_device_placement'])
        # create the session explicitly; it stays open for the whole training
        # run and is closed after the final test evaluation below
        sess = tf.Session(config=sess_config)
        rcnn = TextRCNN(config)

        # training procedure
        global_step = tf.Variable(0, name='global_step', trainable=False)
        train_op = tf.train.AdamOptimizer(config['learning_rate']).minimize(
            rcnn.loss, global_step=global_step)

        # output dir for models
        timestamp = str(int(time.time()))
        outdir = os.path.abspath(
            os.path.join(os.path.curdir, 'runs', timestamp))
        if not os.path.exists(os.path.join(os.path.curdir, 'runs')):
            os.mkdir(os.path.join(os.path.curdir, 'runs'))
        if not os.path.exists(outdir):
            os.mkdir(outdir)
        print('writing to {}'.format(outdir))

        # checkpoint directory
        checkpoint_dir = os.path.abspath(os.path.join(outdir, 'checkpoints'))
        checkpoint_prefix = os.path.join(checkpoint_dir, 'model')

        if not os.path.exists(checkpoint_dir):
            os.mkdir(checkpoint_dir)

        saver = tf.train.Saver(tf.global_variables(),
                               max_to_keep=config['num_checkpoints'])

        sess.run(tf.global_variables_initializer())

        def train_step(x_batch, y_batch):
            feed_dict = {
                rcnn.input_x: x_batch,
                rcnn.input_y: y_batch,
                rcnn.dropout_keep_prob: config['dropout_keep_prob']
            }

            _, step, loss, accuracy = sess.run(
                [train_op, global_step, rcnn.loss, rcnn.accuracy],
                feed_dict=feed_dict)

            time_str = datetime.datetime.now().isoformat()
            print('{}: step {}, loss {}, acc {}'.format(
                time_str, step, loss, accuracy))

        def dev_step(x_batch, y_batch):
            feed_dict = {
                rcnn.input_x: x_batch,
                rcnn.input_y: y_batch,
                rcnn.dropout_keep_prob: 1.0
            }

            step, loss, accuracy = sess.run(
                [global_step, rcnn.loss, rcnn.accuracy], feed_dict=feed_dict)

            time_str = datetime.datetime.now().isoformat()
            print('{}: step {}, loss {}, acc {}'.format(
                time_str, step, loss, accuracy))

        # generate batches
        batches = data_helper.generate_batchs(x_train, y_train, config)
        for batch in batches:
            x_batch, y_batch = zip(*batch)
            print(y_batch)
            train_step(x_batch, y_batch)
            current_step = tf.train.global_step(sess, global_step)
            if current_step % config['evaluate_every'] == 0:
                print('Evaluation:')
                dev_step(x_dev, y_dev)

            if current_step % config['checkpoint_every'] == 0:
                path = saver.save(sess,
                                  checkpoint_prefix,
                                  global_step=current_step)
                print('save model checkpoint to {}'.format(path))

        # test accuracy
        test_accuracy = sess.run(
            [rcnn.accuracy],
            feed_dict={
                rcnn.input_x: x_test,
                rcnn.input_y: y_test,
                rcnn.dropout_keep_prob: 1.0
            })
        print('Test dataset accuracy: {}'.format(test_accuracy))
        sess.close()
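For reference, these are the config keys the function above reads directly; the values below are illustrative assumptions only (data_helper will typically need additional keys of its own):

# Illustrative values, not the project's defaults.
config = {
    'allow_soft_placement': True,
    'log_device_placement': False,
    'learning_rate': 1e-3,
    'dropout_keep_prob': 0.5,
    'num_checkpoints': 5,
    'evaluate_every': 100,
    'checkpoint_every': 100,
}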
Example #3
File: train.py  Project: Ivy99999/Fasttext
def train(config):
    print('parameters: ')
    print(json.dumps(config, indent=4, ensure_ascii=False))

    # load data
    print('load data .....')
    X, y = data_helper.process_data(config)

    # make vocab
    print('make vocab .....')
    word_to_index, label_to_index = data_helper.generate_vocab(X, y, config)

    # padding data
    print('padding data .....')
    input_x, input_y = data_helper.padding(X, y, config, word_to_index, label_to_index)

    # split data
    print('split data .....')
    x_train, y_train, x_test, y_test, x_dev, y_dev = data_helper.split_data(input_x, input_y, config)

    print('length train: {}'.format(len(x_train)))
    print('length test: {}'.format(len(x_test)))
    print('length dev: {}'.format(len(x_dev)))
    print('training .....')
    with tf.Graph().as_default():
        sess_config = tf.ConfigProto(
            allow_soft_placement=config['allow_soft_placement'],
            log_device_placement=config['log_device_placement']
        )
        with tf.Session(config=sess_config) as sess:
            fast_text = FastText(config)

            # training procedure
            global_step = tf.Variable(0, name='global_step', trainable=False)
            optimizer = tf.train.AdamOptimizer(config['learning_rate'])
            grads_and_vars = optimizer.compute_gradients(fast_text.loss)
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

            # keep track of gradient values and sparsity
            grad_summaries = []
            for g, v in grads_and_vars:
                if g is not None:
                    grad_hist_summary = tf.summary.histogram('{}/grad/hist'.format(v.name), g)
                    sparsity_summary = tf.summary.scalar('{}/grad/sparsity'.format(v.name), tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)

            # output dir for models and summaries
            timestamp = str(int(time.time()))
            outdir = os.path.abspath(os.path.join(os.path.curdir, 'runs', timestamp))
            print('writing to {}'.format(outdir))

            # summary for loss and accuracy
            loss_summary = tf.summary.scalar('loss', fast_text.loss)
            acc_summary = tf.summary.scalar('accuracy', fast_text.accuracy)

            # train summary
            train_summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged])
            train_summary_dir = os.path.join(outdir, 'summaries', 'train')
            train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

            # dev summary
            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            dev_summary_dir = os.path.join(outdir, 'summaries', 'dev')
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

            # checkpoint directory
            checkpoint_dir = os.path.abspath(os.path.join(outdir, 'checkpoints'))
            checkpoint_prefix = os.path.join(checkpoint_dir, 'model.bin')

            if not os.path.exists(checkpoint_dir):
                os.mkdir(checkpoint_dir)

            saver = tf.train.Saver(tf.global_variables(), max_to_keep=config['num_checkpoints'])

            sess.run(tf.global_variables_initializer())

            def train_step(x_batch, y_batch):
                feed_dict = {
                    fast_text.input_x: x_batch,
                    fast_text.input_y: y_batch,
                    fast_text.dropout_keep_prob: config['dropout_keep_prob']
                }

                _, step, summaries, loss, accuracy = sess.run(
                    [train_op, global_step, train_summary_op, fast_text.loss, fast_text.accuracy],
                    feed_dict=feed_dict
                )

                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
                train_summary_writer.add_summary(summaries, step)

            def dev_step(x_batch, y_batch, writer=None):
                feed_dict = {
                    fast_text.input_x: x_batch,
                    fast_text.input_y: y_batch,
                    fast_text.dropout_keep_prob: 1.0
                }

                step, summaries, loss, accuracy = sess.run(
                    [global_step, dev_summary_op, fast_text.loss, fast_text.accuracy],
                    feed_dict=feed_dict
                )

                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
                if writer:
                    writer.add_summary(summaries, step)

            # generate batches
            batches = data_helper.generate_batchs(x_train, y_train, config)
            for batch in batches:
                x_batch, y_batch = zip(*batch)
                train_step(x_batch, y_batch)
                current_step = tf.train.global_step(sess, global_step)
                if current_step % config['evaluate_every'] == 0:
                    print('Evaluation:')
                    dev_step(x_dev, y_dev, writer=dev_summary_writer)

                if current_step % config['checkpoint_every'] == 0:
                    path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                    print('save model checkpoint to {}'.format(path))

            # test accuracy
            test_accuracy = sess.run([fast_text.accuracy], feed_dict={
                fast_text.input_x: x_test, fast_text.input_y: y_test, fast_text.dropout_keep_prob: 1.0})
            print('Test dataset accuracy: {}'.format(test_accuracy))
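After training, the latest checkpoint can be reloaded for evaluation or inference. A minimal sketch, assuming the same graph-construction code (FastText(config)) is rebuilt first and checkpoint_dir points at the runs/<timestamp>/checkpoints directory written above:

with tf.Graph().as_default():
    with tf.Session() as sess:
        fast_text = FastText(config)  # rebuild the identical graph
        saver = tf.train.Saver(tf.global_variables())
        saver.restore(sess, tf.train.latest_checkpoint(checkpoint_dir))
        test_accuracy = sess.run(fast_text.accuracy, feed_dict={
            fast_text.input_x: x_test,
            fast_text.input_y: y_test,
            fast_text.dropout_keep_prob: 1.0})
        print('Restored test accuracy: {}'.format(test_accuracy))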
Example #4
def train(config):
    learning_rate = config['learning_rate']
    clip_grad = config['clip_grad']
    max_model_keep = config['max_model_keep']

    print('parameters: ')
    print(json.dumps(config, indent=4, ensure_ascii=False))

    # load data
    print('load data .....')
    X, y = data_helper.process_data(config)

    # make vocab
    print('make vocab .....')
    word_to_index, label_to_index = data_helper.generate_vocab(X, y, config)
    config['num_tags'] = len(label_to_index)

    # padding data
    print('padding data .....')
    input_x, input_y, sequence_lengths = data_helper.padding(X, y, word_to_index, label_to_index)

    # split data
    print('split data .....')
    x_train, y_train, sequences_length_train, x_test, y_test, sequence_length_test, x_dev, y_dev, sequence_length_dev = \
        data_helper.split_data(input_x, input_y, sequence_lengths, config)

    print('length train: {}'.format(len(x_train)))
    print('length test: {}'.format(len(x_test)))
    print('length dev: {}'.format(len(x_dev)))

    with tf.Graph().as_default():
        with tf.Session() as sess:
            bilstm_crf = BilstmCrf(config)

            # training procedure
            global_step = tf.Variable(0, name='global_step', trainable=False)
            optimizer = tf.train.AdamOptimizer(learning_rate)

            # apply gradient clipping to avoid gradient explosion
            grads_and_vars = optimizer.compute_gradients(bilstm_crf.loss)
            grads_and_vars_clip = [[tf.clip_by_value(g, -clip_grad, clip_grad), v] for g, v in grads_and_vars]
            train_op = optimizer.apply_gradients(grads_and_vars_clip, global_step=global_step)

            # output dir for models and summaries
            timestamp = str(int(time.time()))
            outdir = os.path.abspath(os.path.join(os.path.curdir, 'runs', timestamp))
            print('writing to {} !!!'.format(outdir))

            # summary of loss
            tf.summary.scalar('loss', bilstm_crf.loss)

            # train summary
            train_summary_op = tf.summary.merge_all()
            train_summary_dir = os.path.join(outdir, 'summaries', 'train')
            train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

            # dev summary
            dev_summary_op = tf.summary.merge_all()
            dev_summary_dir = os.path.join(outdir, 'summaries', 'dev')
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

            # checkpoint dir
            checkpoint_dir = os.path.abspath(os.path.join(outdir, 'checkpoints'))
            checkpoint_prefix = os.path.join(checkpoint_dir, 'model')

            if not os.path.exists(checkpoint_dir):
                os.mkdir(checkpoint_dir)

            saver = tf.train.Saver(tf.global_variables(), max_to_keep=max_model_keep)

            sess.run(tf.global_variables_initializer())

            def viterbi_decoder(logits, seq_len_list, transition_params):
                label_list = []
                for logit, seq_len in zip(logits, seq_len_list):
                    viterbi_seq, _ = tf.contrib.crf.viterbi_decode(logit[:seq_len], transition_params)
                    label_list.append(viterbi_seq)
                return label_list

            def train_step(x_batch, y_batch, sequence_lengths):
                feed_dict = {
                    bilstm_crf.input_x: x_batch,
                    bilstm_crf.input_y: y_batch,
                    bilstm_crf.sequence_length: sequence_lengths,
                    bilstm_crf.dropout_keep_prob: config['dropout_keep_prob']
                }

                _, step, summaries, loss, transition_params, logits = sess.run(
                    [train_op, global_step, train_summary_op, bilstm_crf.loss,
                     bilstm_crf.transition_params, bilstm_crf.logits],
                    feed_dict=feed_dict
                )

                label_list = viterbi_decoder(logits, sequence_lengths, transition_params)

                acc, recall, f1 = data_helper.measure(y_batch, label_list, sequence_lengths)

                time_str = datetime.datetime.now().isoformat()
                print("training: {}: step {}, loss {:g}, acc {:.2f} recall {:.2f} f1 {:.2f}".format
                      (time_str, step, loss, acc, recall, f1))
                train_summary_writer.add_summary(summaries, step)

            def dev_step(x_batch, y_batch, sequence_lengths, writer=None):
                feed_dict = {
                    bilstm_crf.input_x: x_batch,
                    bilstm_crf.input_y: y_batch,
                    bilstm_crf.sequence_length: sequence_lengths,
                    bilstm_crf.dropout_keep_prob: 1.0
                }

                step, summaries, loss, transition_params, logits = sess.run(
                    [global_step, dev_summary_op, bilstm_crf.loss, bilstm_crf.transition_params, bilstm_crf.logits],
                    feed_dict=feed_dict
                )

                label_list = viterbi_decoder(logits, sequence_lengths, transition_params)

                acc, recall, f1 = data_helper.measure(y_batch, label_list, sequence_lengths)

                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, f1 {:.2f}".format(time_str, step, loss, f1))
                if writer:
                    writer.add_summary(summaries, step)

            # generate batches
            batches = data_helper.generate_batchs(x_train, y_train, sequences_length_train, config)
            for batch in batches:
                x_batch, y_batch, sequence_length_batch = zip(*batch)
                train_step(x_batch, y_batch, sequence_length_batch)
                current_step = tf.train.global_step(sess, global_step)
                if current_step % config['evaluate_every'] == 0:
                    print('Evaluation:')
                    dev_step(x_dev, y_dev, sequence_length_dev, writer=dev_summary_writer)

                if current_step % config['checkpoint_every'] == 0:
                    path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                    print('save model checkpoint to {}'.format(path))
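For clarity, tf.contrib.crf.viterbi_decode (the primitive wrapped by viterbi_decoder above) runs in plain Python on NumPy arrays, so no session is needed. A toy standalone example, assuming TensorFlow 1.x with tf.contrib available:

import numpy as np
import tensorflow as tf

# 3 time steps, 2 tags: per-step unary scores plus a tag-transition matrix
logit = np.array([[2.0, 0.5],
                  [0.3, 1.5],
                  [1.0, 1.2]], dtype=np.float32)
transition_params = np.array([[0.1, 0.8],
                              [0.4, 0.3]], dtype=np.float32)

viterbi_seq, viterbi_score = tf.contrib.crf.viterbi_decode(logit, transition_params)
print(viterbi_seq)    # best tag index per step, here [0, 1, 1]
print(viterbi_score)  # score of that path, about 5.8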