Example #1
def create_csv(paths, args):
    """
  desc: This function creates a csv file from a downloaded dataset.
        Currently this process works on the imdb dataset but other datasets
        can be easily added.
  args:
    args: dictionary of cli arguments
    paths: project paths
  """

    if args.dataset == "imdb":
        print("creating {} csv".format(args.dataset))
        imdb = IMDB(action="create")
        imdb.createManager(args.binary)
        print("{} csv created".format(args.dataset))
def serialize_data(paths, args):
    """
  desc: write dataset partition to binary file
  args:
    nparray: dataset partition as numpy array to write to binary file 
    filename: name of file to write dataset partition to
  """

    if args.dataset == "imdb":

        # fetch imdb dataset
        imdb = IMDB(action="fetch")
        tic = time.time()  # start time of data fetch
        x_train, y_train, x_test, y_test = imdb.partitionManager(args.dataset)

        toc = time.time()  # end time of data fetch
        print("time taken to fetch {} dataset: {}(sec)".format(
            args.dataset, toc - tic))

        # kill if shapes don't make sense
        assert (len(x_train) == len(y_train)
                ), "x_train length does not match y_train length"
        assert (len(x_test) == len(y_test)
                ), "x_test length does not match y_test length"

        # combine datasets
        x_all = x_train + x_test
        y_all = np.concatenate((y_train, y_test), axis=0)

        # create slices
        train_slice_lim = int(round(len(x_all) * args.train_data_percentage))
        validation_slice_lim = int(
            round((train_slice_lim) +
                  len(x_all) * args.validation_data_percentage))

        # partition dataset into train, validation, and test sets
        x_all, docsize, sent_size = imdb.hanformater(inputs=x_all)

        x_train = x_all[:train_slice_lim]
        y_train = y_all[:train_slice_lim]
        docsize_train = docsize[:train_slice_lim]
        sent_size_train = sent_size[:train_slice_lim]

        # note: slice ends are exclusive, so each split starts exactly where
        # the previous one ends (a +1 would silently drop one sample per boundary)
        x_val = x_all[train_slice_lim:validation_slice_lim]
        y_val = y_all[train_slice_lim:validation_slice_lim]
        docsize_val = docsize[train_slice_lim:validation_slice_lim]
        sent_size_val = sent_size[train_slice_lim:validation_slice_lim]

        x_test = x_all[validation_slice_lim:]
        y_test = y_all[validation_slice_lim:]
        docsize_test = docsize[validation_slice_lim:]
        sent_size_test = sent_size[validation_slice_lim:]

        train_bin_filename_x = os.path.join(paths.ROOT_DATA_DIR, args.dataset,
                                            "train_x.npy")
        train_bin_filename_y = os.path.join(paths.ROOT_DATA_DIR, args.dataset,
                                            "train_y.npy")
        train_bin_filename_docsize = os.path.join(paths.ROOT_DATA_DIR,
                                                  args.dataset,
                                                  "train_docsize.npy")
        train_bin_filename_sent_size = os.path.join(paths.ROOT_DATA_DIR,
                                                    args.dataset,
                                                    "train_sent_size.npy")

        val_bin_filename_x = os.path.join(paths.ROOT_DATA_DIR, args.dataset,
                                          "val_x.npy")
        val_bin_filename_y = os.path.join(paths.ROOT_DATA_DIR, args.dataset,
                                          "val_y.npy")
        val_bin_filename_docsize = os.path.join(paths.ROOT_DATA_DIR,
                                                args.dataset,
                                                "val_docsize.npy")
        val_bin_filename_sent_size = os.path.join(paths.ROOT_DATA_DIR,
                                                  args.dataset,
                                                  "val_sent_size.npy")

        test_bin_filename_x = os.path.join(paths.ROOT_DATA_DIR, args.dataset,
                                           "test_x.npy")
        test_bin_filename_y = os.path.join(paths.ROOT_DATA_DIR, args.dataset,
                                           "test_y.npy")
        test_bin_filename_docsize = os.path.join(paths.ROOT_DATA_DIR,
                                                 args.dataset,
                                                 "test_docsize.npy")
        test_bin_filename_sent_size = os.path.join(paths.ROOT_DATA_DIR,
                                                   args.dataset,
                                                   "test_sent_size.npy")

        _write_binaryfile(nparray=x_train, filename=train_bin_filename_x)
        _write_binaryfile(nparray=y_train, filename=train_bin_filename_y)
        _write_binaryfile(nparray=docsize_train,
                          filename=train_bin_filename_docsize)
        _write_binaryfile(nparray=sent_size_train,
                          filename=train_bin_filename_sent_size)

        _write_binaryfile(nparray=x_val, filename=val_bin_filename_x)
        _write_binaryfile(nparray=y_val, filename=val_bin_filename_y)
        _write_binaryfile(nparray=docsize_val,
                          filename=val_bin_filename_docsize)
        _write_binaryfile(nparray=sent_size_val,
                          filename=val_bin_filename_sent_size)

        _write_binaryfile(nparray=x_test, filename=test_bin_filename_x)
        _write_binaryfile(nparray=y_test, filename=test_bin_filename_y)
        _write_binaryfile(nparray=docsize_test,
                          filename=test_bin_filename_docsize)
        _write_binaryfile(nparray=sent_size_test,
                          filename=test_bin_filename_sent_size)
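
# The _write_binaryfile helper called above is not shown in this example; the
# sketch below is an assumption inferred only from its nparray/filename keyword
# arguments and the .npy extensions of the files it is given, on the guess that
# it thinly wraps numpy.save.
def _write_binaryfile(nparray, filename):
    # ensure the target directory exists, then persist the array in .npy format
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    np.save(filename, np.asarray(nparray))
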
            def test_step(sample_num, x_batch, y_batch, docsize, sent_size,
                          is_training):
                feed_dict = {
                    han.input_x: x_batch,
                    han.input_y: y_batch,
                    han.sentence_lengths: docsize,
                    han.word_lengths: sent_size,
                    han.is_training: is_training
                }

                loss, accuracy = sess.run([han.loss, han.accuracy],
                                          feed_dict=feed_dict)
                return loss, accuracy

            # end

            # generate batches on imdb dataset else quit
            if FLAGS.dataset == "imdb":
                dataset_controller = IMDB(action="fetch")
            else:
                exit("set dataset flag to appropiate dataset")

            x, y, docsize, sent_size = dataset_controller.get_data(
                type_=FLAGS.run_type)  # fetch dataset
            # list of all checkpoint files previously evaluated
            all_evaluated_chkpts = []

            # testing loop
            while True:

                if FLAGS.wait_for_checkpoint_files:
                    time.sleep(
                        2 * MINUTE
                    )  # wait to allow for creation of new checkpoint file
logger.setLevel(logging.INFO)

formatter = logging.Formatter("%(asctime)s:%(name)s:%(message)s")
currentTime = str(datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S"))
logFileName = os.path.join(paths.LOGS_DIR,
                           "HAN_TxtClassification_{}.log".format(currentTime))

fileHandler = logging.FileHandler(logFileName)
fileHandler.setLevel(logging.ERROR)
fileHandler.setFormatter(formatter)

logger.addHandler(fileHandler)

print("Loading data...\n")

if not IMDB.csvExist():
    imdb = IMDB(action="create")
    imdb.createManager()
else:
    imdb = IMDB()

x_train, y_train, x_test, y_test = imdb.partitionManager(type="han")

if FLAGS.run_type == "train":
    print("Training...\n")
    # create a new graph and set it as default
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        session_conf.gpu_options.allocator_type = "BFC"
        # create a new session and set it as default
Example #5
def train():
    paths = prjPaths()

    with open(
            os.path.join(paths.LIB_DIR, CONFIG['dataset'], 'persisted_vars.p'),
            'rb') as handle:
        persisted_vars = pickle.load(handle)

    persisted_vars['embedding_dim'] = CONFIG['embedding_dim']
    persisted_vars['max_grad_norm'] = CONFIG['max_grad_norm']
    persisted_vars['dropout_keep_proba'] = CONFIG['dropout_keep_proba']
    persisted_vars['learning_rate'] = CONFIG['learning_rate']
    with open(
            os.path.join(paths.LIB_DIR, CONFIG['dataset'], 'persisted_vars.p'),
            'wb') as handle:
        pickle.dump(persisted_vars, handle)

    with tf.Graph().as_default():
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=CONFIG[
            'per_process_gpu_memory_fraction'])
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False,
                                      gpu_options=gpu_options)

        session_conf.gpu_options.allocator_type = 'BFC'

        with tf.Session(config=session_conf) as sess:
            han = HAN(max_seq_len=persisted_vars['max_seq_len'],
                      max_sent_len=persisted_vars['max_sent_len'],
                      num_classes=persisted_vars['num_classes'],
                      vocab_size=persisted_vars['vocab_size'],
                      embedding_size=persisted_vars['embedding_dim'],
                      max_grad_norm=persisted_vars['max_grad_norm'],
                      dropout_keep_proba=persisted_vars['dropout_keep_proba'],
                      learning_rate=persisted_vars['learning_rate'])

            global_step = tf.Variable(0, name='global_step', trainable=False)

            # gradient clipping needs the trainable variables
            tvars = tf.trainable_variables()
            grads, global_norm = tf.clip_by_global_norm(
                tf.gradients(han.loss, tvars), han.max_grad_norm)

            optimizer = tf.train.AdamOptimizer(
                han.learning_rate)  # todo: try other optimizer parameters

            train_op = optimizer.apply_gradients(zip(grads, tvars),
                                                 name='train_op',
                                                 global_step=global_step)

            merge_summary_op = tf.summary.merge_all()
            train_summary_writer = tf.summary.FileWriter(
                os.path.join(paths.SUMMARY_DIR, CONFIG['run_type']),
                sess.graph)

            # todo: change the object being saved here to sess
            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=CONFIG['num_checkpoint'])

            sess.run(tf.global_variables_initializer())

            # _________train__________
            def train_step(epoch, x_batch, y_batch, docsize, sent_size,
                           is_training):
                tic = time.time()

                feed_dict = {
                    han.input_x: x_batch,
                    han.input_y: y_batch,
                    han.sentence_lengths: docsize,
                    han.word_lengths: sent_size,
                    han.is_training: is_training
                }
                _, step, loss, accuracy, summaries = sess.run(
                    [
                        train_op, global_step, han.loss, han.accuracy,
                        merge_summary_op
                    ],
                    feed_dict=feed_dict)

                time_elapsed = time.time() - tic

                if is_training:
                    print(
                        'Training || CurrentEpoch: {} || GlobalStep: {} || ({} sec/step) || Loss {:g} || Accuracy {:g}'
                        .format(epoch + 1, step, time_elapsed, loss, accuracy))

                if step % CONFIG['log_summaries_every'] == 0:
                    train_summary_writer.add_summary(summaries, step)
                    print(
                        f'Saved model summaries to {os.path.join(paths.SUMMARY_DIR,CONFIG["run_type"])} \n'
                    )

                if step % CONFIG['checkpoint_every'] == 0:
                    chkpt_path = saver.save(sess,
                                            os.path.join(
                                                paths.CHECKPOINT_DIR, 'han'),
                                            global_step=step)
                    print('Saved model checkpoint to {} \n'.format(chkpt_path))

            imdb = IMDB(action='fetch')
            x_train, y_train, docsize_train, sent_size_train = imdb.get_data(
                type=CONFIG['run_type'])

            for epoch, batch in imdb.get_batch(data=list(
                    zip(x_train, y_train, docsize_train, sent_size_train)),
                                               batch_size=CONFIG['batch_size'],
                                               num_epoch=CONFIG['num_epochs']):
                x_batch, y_batch, docsize, sent_size = zip(*batch)

                train_step(epoch=epoch,
                           x_batch=x_batch,
                           y_batch=y_batch,
                           docsize=docsize,
                           sent_size=sent_size,
                           is_training=True)
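
# Hedged sketch (assumption): the module-level CONFIG dictionary that train()
# reads is not shown in this example. The keys below are exactly the ones the
# function accesses; the values are illustrative placeholders only.
CONFIG = {
    'dataset': 'imdb',
    'run_type': 'train',
    'embedding_dim': 100,
    'max_grad_norm': 5.0,
    'dropout_keep_proba': 0.5,
    'learning_rate': 1e-3,
    'per_process_gpu_memory_fraction': 0.9,
    'num_checkpoint': 5,
    'log_summaries_every': 100,
    'checkpoint_every': 100,
    'batch_size': 32,
    'num_epochs': 10,
}
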
Example #6
def test():
    MINUTE = 60
    paths = prjPaths()
    print('loading persisted variables...')
    with open(
            os.path.join(paths.LIB_DIR, CONFIG['dataset'], 'persisted_vars.p'),
            'rb') as handle:
        persisted_vars = pickle.load(handle)

    graph = tf.Graph()
    with graph.as_default():
        # Set GPU options
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=CONFIG[
            'per_process_gpu_memory_fraction'])
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False,
                                      gpu_options=gpu_options)

        session_conf.gpu_options.allocator_type = 'BFC'

        with tf.Session(config=session_conf) as sess:
            # Insert model
            han = HAN(max_seq_len=persisted_vars['max_seq_len'],
                      max_sent_len=persisted_vars['max_sent_len'],
                      num_classes=persisted_vars['num_classes'],
                      vocab_size=persisted_vars['vocab_size'],
                      embedding_size=persisted_vars['embedding_dim'],
                      max_grad_norm=persisted_vars['max_grad_norm'],
                      dropout_keep_proba=persisted_vars['dropout_keep_proba'],
                      learning_rate=persisted_vars['learning_rate'])

            global_step = tf.Variable(0, name='global_step', trainable=False)
            tvars = tf.trainable_variables()

            # todo: what does this method return?
            grads, global_norm = tf.clip_by_global_norm(
                tf.gradients(han.loss, tvars), han.max_grad_norm)
            optimizer = tf.train.AdamOptimizer(han.learning_rate)
            test_op = optimizer.apply_gradients(
                zip(grads, tvars),
                name=f'{CONFIG["run_type"]}_op',
                global_step=global_step)

            merge_summary_op = tf.summary.merge_all()
            test_summary_writer = tf.summary.FileWriter(
                os.path.join(paths.SUMMARY_DIR, CONFIG['run_type']),
                sess.graph)

            meta_file = get_most_recently_create_file([
                os.path.join(paths.CHECKPOINT_DIR, file)
                for file in os.listdir(paths.CHECKPOINT_DIR)
                if file.endswith('.meta')
            ])
            saver = tf.train.import_meta_graph(meta_file)

            sess.run(tf.global_variables_initializer())

            def test_step(sample_num, x_batch, y_batch, docsize, sent_size,
                          is_training):
                feed_dict = {
                    han.input_x: x_batch,
                    han.input_y: y_batch,
                    han.sentence_lengths: docsize,
                    han.word_lengths: sent_size,
                    han.is_training: is_training
                }
                loss, accuracy = sess.run([han.loss, han.accuracy],
                                          feed_dict=feed_dict)
                return loss, accuracy

            if CONFIG['dataset'] == 'imdb':
                dataset_controller = IMDB(action='fetch')
            else:
                exit('set dataset flag to appropriate dataset')

            x, y, docsize, sent_size = dataset_controller.get_data(
                type=CONFIG['run_type'])
            all_evaluated_chkpts = []

            while True:
                if CONFIG['wait_for_checkpoint_files']:
                    time.sleep(2 * MINUTE)  # wait for a new checkpoint file
                else:
                    time.sleep(0 * MINUTE)

                if tf.train.latest_checkpoint(
                        paths.CHECKPOINT_DIR) in all_evaluated_chkpts:
                    continue

                saver.restore(sess,
                              tf.train.latest_checkpoint(paths.CHECKPOINT_DIR))
                all_evaluated_chkpts.append(
                    tf.train.latest_checkpoint(paths.CHECKPOINT_DIR))

                losses = []
                accuracies = []

                tic = time.time()

                for i, batch in enumerate(
                        tqdm(list(zip(x, y, docsize, sent_size)))):
                    x_batch, y_batch, docsize_batch, sent_size_batch = batch

                    # add a batch dimension of 1 to each sample before feeding it
                    x_batch = np.expand_dims(x_batch, axis=0)
                    y_batch = np.expand_dims(y_batch, axis=0)
                    docsize_batch = np.expand_dims(docsize_batch, axis=0)
                    sent_size_batch = np.expand_dims(sent_size_batch, axis=0)

                    loss, accuracy = test_step(sample_num=1,
                                               x_batch=x_batch,
                                               y_batch=y_batch,
                                               docsize=docsize_batch,
                                               sent_size=sent_size_batch,
                                               is_training=False)

                    losses.append(loss)
                    accuracies.append(accuracy)

                time_elapsed = time.time() - tic
                losses_accuracy_vars = {
                    'losses': losses,
                    'accuracy': accuracies
                }

                print(
                    'Time taken to complete {} evaluation of checkpoint {}: {}'.
                    format(CONFIG['run_type'], all_evaluated_chkpts[-1],
                           time_elapsed))

                for k in losses_accuracy_vars.keys():
                    print('stats for {}:{}'.format(
                        k, stats.describe(losses_accuracy_vars[k])))
                    print(Counter(losses_accuracy_vars[k]))

                filename, ext = os.path.splitext(all_evaluated_chkpts[-1])
                pickle.dump(
                    losses_accuracy_vars,
                    open(
                        os.path.join(
                            paths.LIB_DIR, CONFIG['dataset'],
                            'losses_accuracies_vars_{}.p'.format(
                                filename.split('/')[-1])), 'wb'))
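
# Hedged sketch (assumption): get_most_recently_create_file, used above to pick
# the newest .meta checkpoint file, is not shown in this example. A plausible
# implementation returns the path with the latest filesystem creation time.
def get_most_recently_create_file(files):
    # files: list of candidate paths; pick the one created most recently
    return max(files, key=os.path.getctime)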