Example #1
0
    def __init__(self, action):
        '''Preprocess the IMDB data.'''
        self.paths = prjPaths()
        self.ROOT_DATA_DIR = self.paths.ROOT_DATA_DIR
        self.DATASET = 'imdb'

        self.CSVFILENAME = os.path.join(self.ROOT_DATA_DIR, self.DATASET,
                                        '{}.csv'.format(self.DATASET))
        assert action in ['create', 'fetch'], 'invalid action'

        if action == 'create':
            if os.path.exists(self.CSVFILENAME):
                print('removing existing csv file from {}'.format(
                    self.CSVFILENAME))
                os.remove(self.CSVFILENAME)

            train_dir = os.path.join(self.ROOT_DATA_DIR, self.DATASET,
                                     'aclImdb', 'train')
            test_dir = os.path.join(self.ROOT_DATA_DIR, self.DATASET, 'aclImdb',
                                    'test')

            trainPos_dir = os.path.join(train_dir, 'pos')
            trainNeg_dir = os.path.join(train_dir, 'neg')

            testPos_dir = os.path.join(test_dir, 'pos')
            testNeg_dir = os.path.join(test_dir, 'neg')

            self.data = {
                'trainPos': self._getDirContents(trainPos_dir),
                'trainNeg': self._getDirContents(trainNeg_dir),
                'testPos': self._getDirContents(testPos_dir),
                'testNeg': self._getDirContents(testNeg_dir)
            }
Example #2
0
  def __init__(self, action):
    """
    desc: this class is used to process the imdb dataset
    args:
        action: specify whether to create or fetch the data using the IMDB class
    """
    self.paths = prjPaths()
    self.ROOT_DATA_DIR = self.paths.ROOT_DATA_DIR
    self.DATASET = "imdb"
    
    self.CSVFILENAME = os.path.join(self.ROOT_DATA_DIR, self.DATASET, "{}.csv".format(self.DATASET))
    assert(action in ["create", "fetch"]), "invalid action"

    if action == "create":

      # if creating new csv remove old if one exists
      if os.path.exists(self.CSVFILENAME):
        print("removing existing csv file from {}".format(self.CSVFILENAME))
        os.remove(self.CSVFILENAME)

      # directory structure
      train_dir = os.path.join(self.ROOT_DATA_DIR, self.DATASET, "aclImdb", "train")
      test_dir = os.path.join(self.ROOT_DATA_DIR, self.DATASET, "aclImdb", "test")

      trainPos_dir = os.path.join(train_dir, "pos")
      trainNeg_dir = os.path.join(train_dir, "neg")

      testPos_dir = os.path.join(test_dir, "pos")
      testNeg_dir = os.path.join(test_dir, "neg")

      self.data = {"trainPos": self._getDirContents(trainPos_dir),
                   "trainNeg": self._getDirContents(trainNeg_dir),
                   "testPos": self._getDirContents(testPos_dir),
                   "testNeg": self._getDirContents(testNeg_dir)}
Example #3
0
    def __init__(self, action=None):
        self.paths = prjPaths()
        self.ROOT_DATA_DIR = self.paths.ROOT_DATA_DIR
        assert (action in ["create", None]), "invalid action"

        if action == "create":
            # directory structure
            train_dir = "{}/{}".format(self.ROOT_DATA_DIR, "train")
            test_dir = "{}/{}".format(self.ROOT_DATA_DIR, "test")

            trainPos_dir = "{}/{}".format(train_dir, "pos")
            trainNeg_dir = "{}/{}".format(train_dir, "neg")

            testPos_dir = "{}/{}".format(test_dir, "pos")
            testNeg_dir = "{}/{}".format(test_dir, "neg")

            self.data = {
                "trainPos": self._getDirContents(trainPos_dir),
                "trainNeg": self._getDirContents(trainNeg_dir),
                "testPos": self._getDirContents(testPos_dir),
                "testNeg": self._getDirContents(testNeg_dir)
            }
Example #4
0
def main():
    args = get_args()
    prjPaths_ = prjPaths()

    # determine if gpu present
    if torch.cuda.device_count() > 0:
        gpu_available = True
    else:
        gpu_available = False

    if args.run_type == "train":
        train(gpu_available=gpu_available,
              prjPaths=prjPaths_,
              n=args.n,
              training_steps=args.training_steps,
              batch_size=args.batch_size,
              learning_rate=args.learning_rate,
              show_every_n_steps=args.show_every_n_steps,
              checkpoint_every_n_steps=args.checkpoint_every_n_steps,
              verbose=args.verbose,
              clip_value=args.clip_value)
    elif args.run_type == "inference":
        inference(gpu_available, prjPaths=prjPaths_)
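main() also depends on a get_args() function that is not included in this example. A plausible argparse-based sketch is below; the flag names mirror the attributes used above, while the choices and defaults are purely illustrative assumptions:

import argparse


def get_args():
    # hypothetical sketch: expose the options main() reads; defaults are illustrative only
    parser = argparse.ArgumentParser()
    parser.add_argument("--run_type", choices=["train", "inference"], default="train")
    parser.add_argument("--n", type=int, default=5)
    parser.add_argument("--training_steps", type=int, default=10000)
    parser.add_argument("--batch_size", type=int, default=64)
    parser.add_argument("--learning_rate", type=float, default=1e-3)
    parser.add_argument("--show_every_n_steps", type=int, default=100)
    parser.add_argument("--checkpoint_every_n_steps", type=int, default=1000)
    parser.add_argument("--verbose", action="store_true")
    parser.add_argument("--clip_value", type=float, default=5.0)
    return parser.parse_args()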
                                                   "test_sent_size.npy")

        _write_binaryfile(nparray=x_train, filename=train_bin_filename_x)
        _write_binaryfile(nparray=y_train, filename=train_bin_filename_y)
        _write_binaryfile(nparray=docsize_train,
                          filename=train_bin_filename_docsize)
        _write_binaryfile(nparray=sent_size_train,
                          filename=train_bin_filename_sent_size)

        _write_binaryfile(nparray=x_val, filename=val_bin_filename_x)
        _write_binaryfile(nparray=y_val, filename=val_bin_filename_y)
        _write_binaryfile(nparray=docsize_val,
                          filename=val_bin_filename_docsize)
        _write_binaryfile(nparray=sent_size_val,
                          filename=val_bin_filename_sent_size)

        _write_binaryfile(nparray=x_test, filename=test_bin_filename_x)
        _write_binaryfile(nparray=y_test, filename=test_bin_filename_y)
        _write_binaryfile(nparray=docsize_test,
                          filename=test_bin_filename_docsize)
        _write_binaryfile(nparray=sent_size_test,
                          filename=test_bin_filename_sent_size)


# end

if __name__ == "__main__":
    paths = prjPaths()
    args = get_args()
    serialize_data(paths, args=args)
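The serialize_data() tail above writes every split through a _write_binaryfile helper that is not shown. Assuming the inputs are NumPy arrays and the .npy filenames are meant to be read back with np.load, the helper is probably little more than this sketch:

import numpy as np


def _write_binaryfile(nparray, filename):
    # hypothetical sketch: persist one array per .npy file; np.load(filename) restores it
    np.save(filename, nparray)

np.save stores dtype and shape alongside the data, which is what would let a matching fetch path rebuild x, y, docsize, and sent_size without extra bookkeeping.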
Example #6
0
def csvExist():
    paths = prjPaths()
    csvExists = "imdb.csv" in os.listdir(paths.ROOT_DATA_DIR)
    return csvExists
Example #7
0
def train():
    paths = prjPaths()

    with open(
            os.path.join(paths.LIB_DIR, CONFIG['dataset'], 'persisted_vars.p'),
            'rb') as handle:
        persisted_vars = pickle.load(handle)

    persisted_vars['embedding_dim'] = CONFIG['embedding_dim']
    persisted_vars['max_grad_norm'] = CONFIG['max_grad_norm']
    persisted_vars['dropout_keep_proba'] = CONFIG['dropout_keep_proba']
    persisted_vars['learning_rate'] = CONFIG['learning_rate']
    with open(
            os.path.join(paths.LIB_DIR, CONFIG['dataset'], 'persisted_vars.p'),
            'wb') as handle:
        pickle.dump(persisted_vars, handle)

    with tf.Graph().as_default():
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=CONFIG[
            'per_process_gpu_memory_fraction'])
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False,
                                      gpu_options=gpu_options)

        session_conf.gpu_options.allocator_type = 'BFC'

        with tf.Session(config=session_conf) as sess:
            han = HAN(max_seq_len=persisted_vars['max_seq_len'],
                      max_sent_len=persisted_vars['max_sent_len'],
                      num_classes=persisted_vars['num_classes'],
                      vocab_size=persisted_vars['vocab_size'],
                      embedding_size=persisted_vars['embedding_dim'],
                      max_grad_norm=persisted_vars['max_grad_norm'],
                      dropout_keep_proba=persisted_vars['dropout_keep_proba'],
                      learning_rate=persisted_vars['learning_rate'])

            global_step = tf.Variable(0, name='global_step', trainable=False)

            # gradient clipping needs the trainable variables
            tvars = tf.trainable_variables()
            grads, global_norm = tf.clip_by_global_norm(
                tf.gradients(han.loss, tvars), han.max_grad_norm)

            optimizer = tf.train.AdamOptimizer(
                han.learning_rate)  # TODO: try other parameters

            train_op = optimizer.apply_gradients(zip(grads, tvars),
                                                 name='train_op',
                                                 global_step=global_step)

            merge_summary_op = tf.summary.merge_all()
            train_summary_writer = tf.summary.FileWriter(
                os.path.join(paths.SUMMARY_DIR, CONFIG['run_type']),
                sess.graph)

            # TODO: change the object being saved here to sess
            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=CONFIG['num_checkpoint'])

            sess.run(tf.global_variables_initializer())

            # _________train__________
            def train_step(epoch, x_batch, y_batch, docsize, sent_size,
                           is_training):
                tic = time.time()

                feed_dict = {
                    han.input_x: x_batch,
                    han.input_y: y_batch,
                    han.sentence_lengths: docsize,
                    han.word_lengths: sent_size,
                    han.is_training: is_training
                }
                _, step, loss, accuracy, summaries = sess.run(
                    [
                        train_op, global_step, han.loss, han.accuracy,
                        merge_summary_op
                    ],
                    feed_dict=feed_dict)

                time_elapsed = time.time() - tic

                if is_training:
                    print(
                        'Training || CurrentEpoch: {} || GlobalStep: {} || ({} sec/step) || Loss: {:g} || Accuracy: {:g}'
                        .format(epoch + 1, step, time_elapsed, loss, accuracy))

                if step % CONFIG['log_summaries_every'] == 0:
                    train_summary_writer.add_summary(summaries, step)
                    print(
                        f'Saved model summaries to {os.path.join(paths.SUMMARY_DIR,CONFIG["run_type"])} \n'
                    )

                if step % CONFIG['checkpoint_every'] == 0:
                    chkpt_path = saver.save(sess,
                                            os.path.join(
                                                paths.CHECKPOINT_DIR, 'han'),
                                            global_step=step)
                    print('Saved model checkpoint to {} \n'.format(chkpt_path))

            imdb = IMDB(action='fetch')
            x_train, y_train, docsize_train, sent_size_train = imdb.get_data(
                type=CONFIG['run_type'])

            for epoch, batch in imdb.get_batch(data=list(
                    zip(x_train, y_train, docsize_train, sent_size_train)),
                                               batch_size=CONFIG['batch_size'],
                                               num_epoch=CONFIG['num_epochs']):
                x_batch, y_batch, docsize, sent_size = zip(*batch)

                train_step(epoch=epoch,
                           x_batch=x_batch,
                           y_batch=y_batch,
                           docsize=docsize,
                           sent_size=sent_size,
                           is_training=True)
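The epoch loop at the end of train() iterates over imdb.get_batch(...), which yields (epoch, batch) pairs but is not shown here. A minimal generator with that contract might look like the sketch below; the per-epoch shuffle is an assumption:

import random


def get_batch(data, batch_size, num_epoch):
    # hypothetical sketch: yield (epoch_index, list_of_samples) pairs,
    # matching the "for epoch, batch in imdb.get_batch(...)" loop in train()
    for epoch in range(num_epoch):
        random.shuffle(data)  # assumption: reshuffle the samples each epoch
        for start in range(0, len(data), batch_size):
            yield epoch, data[start:start + batch_size]

In the snippets this is a method on the IMDB class; it is written as a free function here only to keep the sketch self-contained.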
Example #8
0
def test():
    MINUTE = 60
    paths = prjPaths()
    print('loading persisted variables...')
    with open(
            os.path.join(paths.LIB_DIR, CONFIG['dataset'], 'persisted_vars.p'),
            'rb') as handle:
        persisted_vars = pickle.load(handle)

    graph = tf.Graph()
    with graph.as_default():
        # Set GPU options
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=CONFIG[
            'per_process_gpu_memory_fraction'])
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False,
                                      gpu_options=gpu_options)

        session_conf.gpu_options.allocator_type = 'BFC'

        with tf.Session(config=session_conf) as sess:
            # Instantiate the model
            han = HAN(max_seq_len=persisted_vars['max_seq_len'],
                      max_sent_len=persisted_vars['max_sent_len'],
                      num_classes=persisted_vars['num_classes'],
                      vocab_size=persisted_vars['vocab_size'],
                      embedding_size=persisted_vars['embedding_dim'],
                      max_grad_norm=persisted_vars['max_grad_norm'],
                      dropout_keep_proba=persisted_vars['dropout_keep_proba'],
                      learning_rate=persisted_vars['learning_rate'])

            global_step = tf.Variable(0, name='global_step', trainable=False)
            tvars = tf.trainable_variables()

            # TODO: what does this method return?
            grads, global_norm = tf.clip_by_global_norm(
                tf.gradients(han.loss, tvars), han.max_grad_norm)
            optimizer = tf.train.AdamOptimizer(han.learning_rate)
            test_op = optimizer.apply_gradients(
                zip(grads, tvars),
                name=f'{CONFIG["run_type"]}_op',
                global_step=global_step)

            merge_summary_op = tf.summary.merge_all()
            test_summary_writer = tf.summary.FileWriter(
                os.path.join(paths.SUMMARY_DIR, CONFIG['run_type']),
                sess.graph)

            meta_file = get_most_recently_create_file([
                os.path.join(paths.CHECKPOINT_DIR, file)
                for file in os.listdir(paths.CHECKPOINT_DIR)
                if file.endswith('.meta')
            ])
            saver = tf.train.import_meta_graph(meta_file)

            sess.run(tf.global_variables_initializer())

            def test_step(sample_num, x_batch, y_batch, docsize, sent_size,
                          is_training):
                feed_dict = {
                    han.input_x: x_batch,
                    han.input_y: y_batch,
                    han.sentence_lengths: docsize,
                    han.word_lengths: sent_size,
                    han.is_training: is_training
                }
                loss, accuracy = sess.run([han.loss, han.accuracy],
                                          feed_dict=feed_dict)
                return loss, accuracy

            if CONFIG['dataset'] == 'imdb':
                dataset_controller = IMDB(action='fetch')
            else:
                exit('set dataset flag to appropriate dataset')

            x, y, docsize, sent_size = dataset_controller.get_data(
                type=CONFIG['run_type'])
            all_evaluated_chkpts = []

            while True:
                if CONFIG['wait_for_checkpoint_files']:
                    time.sleep(2 * MINUTE)  # wait for a new checkpoint file to be created
                else:
                    time.sleep(0 * MINUTE)

                if tf.train.latest_checkpoint(
                        paths.CHECKPOINT_DIR) in all_evaluated_chkpts:
                    continue

                saver.restore(sess,
                              tf.train.latest_checkpoint(paths.CHECKPOINT_DIR))
                all_evaluated_chkpts.append(
                    tf.train.latest_checkpoint(paths.CHECKPOINT_DIR))

                losses = []
                accuracies = []

                tic = time.time()

                for i, batch in enumerate(
                        tqdm(list(zip(x, y, docsize, sent_size)))):
                    x_batch, y_batch, docsize_batch, sent_size_batch = batch
                    x_batch = np.expand_dims(x_batch, axis=0)
                    y_batch = np.expand_dims(y_batch, axis=0)
                    docsize_batch = np.expand_dims(docsize_batch, axis=0)
                    sent_size_batch = np.expand_dims(sent_size_batch, axis=0)

                    loss, accuracy = test_step(sample_num=i,
                                               x_batch=x_batch,
                                               y_batch=y_batch,
                                               docsize=docsize_batch,
                                               sent_size=sent_size_batch,
                                               is_training=False)

                    losses.append(loss)
                    accuracies.append(accuracy)

                time_elapsed = time.time() - tic
                losses_accuracy_vars = {
                    'losses': losses,
                    'accuracy': accuracies
                }

                print(
                    'Time taken to complete {} evaluation of checkpoint {}: {}'.
                    format(CONFIG['run_type'], all_evaluated_chkpts[-1],
                           time_elapsed))

                for k in losses_accuracy_vars.keys():
                    print('stats for {}:{}'.format(
                        k, stats.describe(losses_accuracy_vars[k])))
                    print(Counter(losses_accuracy_vars[k]))

                filename, ext = os.path.splitext(all_evaluated_chkpts[-1])
                with open(
                        os.path.join(
                            paths.LIB_DIR, CONFIG['dataset'],
                            'losses_accuracies_vars_{}.p'.format(
                                filename.split('/')[-1])), 'wb') as handle:
                    pickle.dump(losses_accuracy_vars, handle)
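test() selects the newest checkpoint metadata via get_most_recently_create_file, which is also not shown. Judging by the call site it takes a list of .meta paths and returns the most recently created one; a one-liner sketch under that assumption:

import os


def get_most_recently_create_file(paths):
    # hypothetical sketch: newest file, judged by creation time on the filesystem
    return max(paths, key=os.path.getctime)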