Example #1
    def _load_data_(self):
        self.text_len = []
        self.label_vocab = Vocabulary()
        # load labels
        self.labels = []
        with open(self.meta_data_path, mode='r') as f:
            for line in f:
                label_token = line.strip().split('\t')[-1]
                self.label_vocab.add_token(label_token)
                self.labels.append(self.label_vocab.token_to_id[label_token])
        # load index
        self.data_index = []
        with open(self.data_index_path, mode='r') as f:
            for line in f:
                if self.dataset_name in ('R8', 'R52', 'ohsumed', 'MR'):
                    if self.is_test and line.find('test') != -1:
                        self.data_index.append(len(self.data_index))
                    elif not self.is_test and line.find('train') != -1:
                        self.data_index.append(len(self.data_index))
                elif self.dataset_name == '20ng':
                    self.data_index.append(int(line.strip()))

        # load data
        self.data = []
        with open(self.data_path, mode='r') as f:
            for line in f:
                ids = self.token_vocab.index_sentence(line.strip())
                if len(ids) > self.max_sequence_length:
                    ids = ids[:self.max_sequence_length]
                    self.text_len.append(self.max_sequence_length)
                else:
                    # also covers len(ids) == max_sequence_length, keeping
                    # text_len aligned one-to-one with data
                    self.text_len.append(len(ids))
                    ids.extend([0] * (self.max_sequence_length - len(ids)))
                self.data.append(ids)
Example #2
class TextDataset(Dataset):
    def __init__(self, data_index_path, data_path, meta_data_path, vocab_path,
                 max_sequence_length, dataset_name, is_test):
        self.data_index_path = data_index_path
        self.data_path = data_path
        self.vocab_path = vocab_path
        self.meta_data_path = meta_data_path
        self.max_sequence_length = max_sequence_length
        self.dataset_name = dataset_name
        self.is_test = is_test

        self.token_vocab = Vocabulary(self.vocab_path, is_padded=True)

        self._load_data_()

    def _load_data_(self):
        self.text_len = []
        self.label_vocab = Vocabulary()
        # load labels
        self.labels = []
        with open(self.meta_data_path, mode='r') as f:
            for line in f:
                label_token = line.strip().split('\t')[-1]
                self.label_vocab.add_token(label_token)
                self.labels.append(self.label_vocab.token_to_id[label_token])
        # load index
        self.data_index = []
        with open(self.data_index_path, mode='r') as f:
            for line in f:
                if self.dataset_name in ('R8', 'R52', 'ohsumed', 'MR'):
                    if self.is_test and line.find('test') != -1:
                        self.data_index.append(len(self.data_index))
                    elif not self.is_test and line.find('train') != -1:
                        self.data_index.append(len(self.data_index))
                elif self.dataset_name == '20ng':
                    self.data_index.append(int(line.strip()))

        # load data
        self.data = []
        with open(self.data_path, mode='r') as f:
            for line in f:
                ids = self.token_vocab.index_sentence(line.strip())
                if len(ids) > self.max_sequence_length:
                    ids = ids[:self.max_sequence_length]
                    self.text_len.append(self.max_sequence_length)
                else:
                    # also covers len(ids) == max_sequence_length, keeping
                    # text_len aligned one-to-one with data
                    self.text_len.append(len(ids))
                    ids.extend([0] * (self.max_sequence_length - len(ids)))
                self.data.append(ids)

    def __getitem__(self, item):
        # data, text_len and labels are all built per line of the source files,
        # so they are indexed through data_index consistently
        index = self.data_index[item]
        input_ids = torch.LongTensor(self.data[index])
        text_len = torch.LongTensor([self.text_len[index]])
        label = torch.LongTensor([self.labels[index]])
        return input_ids, text_len, label

    def __len__(self):
        return len(self.data_index)
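
A minimal usage sketch for TextDataset. The constructor arguments and the (input_ids, text_len, label) batch layout come from the class above; the file paths and batch size are placeholders, not the project's real values.

from torch.utils.data import DataLoader

# Illustrative only: the paths below are hypothetical.
train_set = TextDataset(
    data_index_path='data/20ng.index',
    data_path='data/20ng.texts.txt',
    meta_data_path='data/20ng.meta',
    vocab_path='data/20ng.vocab',
    max_sequence_length=256,
    dataset_name='20ng',
    is_test=False,
)
train_loader = DataLoader(train_set, batch_size=64, shuffle=True)

for input_ids, text_len, label in train_loader:
    # input_ids: (batch, max_sequence_length), text_len: (batch, 1), label: (batch, 1)
    break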
Example #3
    def __init__(self, data_index_path, data_path, meta_data_path, vocab_path,
                 max_sequence_length, dataset_name, is_test):
        self.data_index_path = data_index_path
        self.data_path = data_path
        self.vocab_path = vocab_path
        self.meta_data_path = meta_data_path
        self.max_sequence_length = max_sequence_length
        self.dataset_name = dataset_name
        self.is_test = is_test

        self.token_vocab = Vocabulary(self.vocab_path, is_padded=True)

        self._load_data_()
Example #4
def main(_):
    if not FLAGS.dataset_dir:
        raise ValueError(
            'You must supply the dataset directory with --dataset_dir')

    if not FLAGS.output_dir:
        raise ValueError(
            'You must supply the output directory with --output_dir')
    print('Dataset directory:', FLAGS.dataset_dir)
    print('Output directory:', FLAGS.output_dir)

    vocab = Vocabulary()

    writer = DataWriter(vocab, FLAGS.dataset_dir, FLAGS.output_dir,
                        FLAGS.str_size, FLAGS.name, FLAGS.split)

    writer.build_data()
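
FLAGS is not defined in this snippet. A minimal sketch of how the flags referenced here are commonly declared in TF 1.x; the flag names match the snippet, while the defaults and help strings are assumptions.

import tensorflow as tf

# Illustrative only: TF 1.x flag declarations matching the names used in main().
tf.app.flags.DEFINE_string('dataset_dir', None, 'Directory containing the source dataset.')
tf.app.flags.DEFINE_string('output_dir', None, 'Directory to write converted data to.')
tf.app.flags.DEFINE_integer('str_size', 10, 'Assumed: maximum label string length.')
tf.app.flags.DEFINE_string('name', 'data', 'Assumed: output file name prefix.')
tf.app.flags.DEFINE_string('split', 'train', 'Assumed: which split to convert.')
FLAGS = tf.app.flags.FLAGS

if __name__ == '__main__':
    tf.app.run()  # parses the flags and calls main(_)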
Example #5
    )
    remove_stopwords = True
    min_freq = 5
    lowercase = True

    if args.module == "train":
        train_iter = ReutersDatasetIterator(args.data_root, "training")
        vocab_path = "common_persist/vocab.pkl"
        if os.path.exists(vocab_path):
            log.info("Loading existing vocab")
            vocabulary = file_utils.load_obj(vocab_path)
        else:
            log.info("Vocab doesn't exist. Creating")
            if not os.path.exists("common_persist"):
                os.makedirs("common_persist")
            vocabulary = Vocabulary(
                remove_stopwords, min_freq, lowercase, "./data/reuters/stopwords")
            vocabulary.build(train_iter)
            file_utils.save_obj(vocabulary, vocab_path)

        train_set = ReutersDataset(args.data_root, "training", vocabulary)
        test_set = ReutersDataset(args.data_root, "test", vocabulary)

        train_loader = DataLoader(train_set, shuffle=True, batch_size=1)
        test_loader = DataLoader(test_set, shuffle=False, batch_size=1)

        if args.model == "doc2vec":

            doc2vec_model_path = "common_persist/doc2vec_model.pkl"
            train_tagged_path = "common_persist/train_tagged.pkl"
            test_tagged_path = "common_persist/test_tagged.pkl"
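
The snippet breaks off after defining the doc2vec artifact paths. Purely as an illustration (not the author's continuation), a minimal gensim sketch of how tagged documents and a Doc2Vec model are typically built; the toy corpus stands in for the ReutersDataset iterator.

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Illustrative only: toy token lists in place of the real Reuters documents.
corpus = [["oil", "prices", "rise"], ["grain", "exports", "fall"]]
train_tagged = [TaggedDocument(words=tokens, tags=[i]) for i, tokens in enumerate(corpus)]

model = Doc2Vec(vector_size=100, min_count=1, epochs=20)
model.build_vocab(train_tagged)
model.train(train_tagged, total_examples=model.corpus_count, epochs=model.epochs)

doc_vector = model.infer_vector(["oil", "exports"])  # embedding for an unseen document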
Example #6
def main(_):

    assert FLAGS.file_pattern, "--file_pattern is required"
    assert FLAGS.train_checkpoints, "--train_checkpoints is required"
    assert FLAGS.summaries_dir, "--summaries_dir is required"

    vocab = Vocabulary()

    model_config = configuration.ModelConfig()

    training_config = configuration.TrainingConfig()
    print(FLAGS.learning_rate)
    training_config.initial_learning_rate = FLAGS.learning_rate

    sequence_length = model_config.sequence_length
    batch_size = FLAGS.batch_size

    summaries_dir = FLAGS.summaries_dir
    if not tf.gfile.IsDirectory(summaries_dir):
        tf.logging.info("Creating training directory: %s", summaries_dir)
        tf.gfile.MakeDirs(summaries_dir)

    train_checkpoints = FLAGS.train_checkpoints
    if not tf.gfile.IsDirectory(train_checkpoints):
        tf.logging.info("Creating training directory: %s", train_checkpoints)
        tf.gfile.MakeDirs(train_checkpoints)

    # initialize the input data queue
    input_queue = DataReader(FLAGS.dataset_dir,
                             FLAGS.file_pattern,
                             model_config,
                             batch_size=batch_size)

    g = tf.Graph()
    with g.as_default():
        # input data queue
        with tf.name_scope(None, 'input_queue'):
            input_images, input_labels = input_queue.read()

        # build the model
        model = crnn.CRNN(256, model_config.num_classes, 'train')
        logits = model.build(input_images)

        with tf.name_scope(None, 'loss'):
            loss = tf.reduce_mean(
                tf.nn.ctc_loss(labels=input_labels,
                               inputs=logits,
                               sequence_length=sequence_length *
                               tf.ones(batch_size, dtype=tf.int32)),
                name='compute_loss',
            )
            tf.losses.add_loss(loss)
            total_loss = tf.losses.get_total_loss(False)

        with tf.name_scope(None, 'decoder'):
            decoded, _ = tf.nn.ctc_beam_search_decoder(
                logits,
                sequence_length * tf.ones(batch_size, dtype=tf.int32),
                merge_repeated=False,
            )
            with tf.name_scope(None, 'acurracy'):
                sequence_dist = tf.reduce_mean(
                    tf.edit_distance(tf.cast(decoded[0], tf.int32),
                                     input_labels),
                    name='seq_dist',
                )
            preds = tf.sparse_tensor_to_dense(decoded[0], name='prediction')
            gt_labels = tf.sparse_tensor_to_dense(input_labels,
                                                  name='Ground_Truth')

        # print(len(slim.get_model_variables()))
        # print('>>>>>>>>>>>>>>>>>>>>>>>>>>>')
        # print(len(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)))
        # sys.exit()
        global_step = tf.Variable(initial_value=0,
                                  name="global_step",
                                  trainable=False,
                                  collections=[
                                      tf.GraphKeys.GLOBAL_STEP,
                                      tf.GraphKeys.GLOBAL_VARIABLES
                                  ])

        start_learning_rate = training_config.initial_learning_rate
        learning_rate = tf.train.exponential_decay(
            start_learning_rate,
            global_step,
            decay_steps=training_config.learning_decay_steps,
            decay_rate=training_config.learning_rate_decay_factor,
            staircase=True,
        )

        # summary
        # Add summaries for variables.
        for variable in slim.get_model_variables():
            tf.summary.histogram(variable.op.name, variable)
        tf.summary.scalar(name='Seq_Dist', tensor=sequence_dist)
        tf.summary.scalar(name='global_step', tensor=global_step)
        tf.summary.scalar(name='learning_rate', tensor=learning_rate)
        tf.summary.scalar(name='total_loss', tensor=total_loss)

        # global/secs hook
        globalhook = tf.train.StepCounterHook(
            every_n_steps=FLAGS.log_every_n_steps, )
        # hook for saving checkpoints
        # saver = tf.train.Saver(max_to_keep=training_config.max_checkpoints_to_keep)
        # saverhook = tf.train.CheckpointSaverHook(
        #     checkpoint_dir=FLAGS.train_checkpoints,
        #     save_steps=2000,
        #     saver=saver,
        # )
        # # hook for saving summaries
        # merge_summary_op = tf.summary.merge_all()
        # summaryhook = tf.train.SummarySaverHook(
        #     save_steps=200,
        #     output_dir=FLAGS.summaries_dir,
        #     summary_op=merge_summary_op,
        # )
        # logging hook used during training
        tensors_print = {
            'global_step': global_step,
            'loss': loss,
            'Seq_Dist': sequence_dist,
            # 'accurays':accurays,
        }
        loghook = tf.train.LoggingTensorHook(
            tensors=tensors_print,
            every_n_iter=FLAGS.log_every_n_steps,
        )
        # hook to stop training at the final step
        stophook = tf.train.StopAtStepHook(last_step=FLAGS.number_of_steps)

        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=FLAGS.gpu_memory_fraction)
        session_config = tf.ConfigProto(log_device_placement=False,
                                        gpu_options=gpu_options)

        # extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        # with tf.control_dependencies(extra_update_ops):
        #     optimizer = tf.train.AdadeltaOptimizer(
        #         learning_rate=learning_rate).minimize(loss=total_loss, global_step=global_step)

        optimizer = tf.train.AdadeltaOptimizer(learning_rate=learning_rate)
        train_op = tf.contrib.training.create_train_op(total_loss=total_loss,
                                                       optimizer=optimizer,
                                                       global_step=global_step)
        # train_op = tf.group([optimizer, total_loss, sequence_dist])
        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_checkpoints,
                hooks=[globalhook, loghook, stophook],
                save_checkpoint_secs=180,
                save_summaries_steps=100,
                config=session_config) as sess:
            while not sess.should_stop():
                oloss, opreds, ogt_labels = sess.run(
                    [train_op, preds, gt_labels])
                accuracy = compute_acuracy(opreds, ogt_labels)
                print("accuracy: %9f" % accuracy)
Example #7
def main(_):

    assert FLAGS.file_pattern, "--file_pattern is required"
    assert FLAGS.train_checkpoints, "--train_checkpoints is required"
    assert FLAGS.summaries_dir, "--summaries_dir is required"

    vocab = Vocabulary()

    model_config = configuration.ModelConfig()

    training_config = configuration.TrainingConfig()
    print(FLAGS.learning_rate)
    training_config.initial_learning_rate = FLAGS.learning_rate

    sequence_length = model_config.sequence_length
    batch_size = FLAGS.batch_size

    summaries_dir = FLAGS.summaries_dir
    if not tf.gfile.IsDirectory(summaries_dir):
        tf.logging.info("Creating training directory: %s", summaries_dir)
        tf.gfile.MakeDirs(summaries_dir)

    train_checkpoints = FLAGS.train_checkpoints
    if not tf.gfile.IsDirectory(train_checkpoints):
        tf.logging.info("Creating training directory: %s", train_checkpoints)
        tf.gfile.MakeDirs(train_checkpoints)

    # initialize the input data queue
    input_queue = DataReader(FLAGS.dataset_dir,
                             FLAGS.file_pattern,
                             model_config,
                             batch_size=batch_size)

    g = tf.Graph()
    with g.as_default():
        # input data queue
        with tf.name_scope(None, 'input_queue'):
            input_images, input_labels = input_queue.read()

        # build the model
        model = crnn.CRNN(256, model_config.num_classes, 'train')
        logits = model.build(input_images)

        with tf.name_scope(None, 'loss'):

            loss = tf.reduce_mean(
                tf.nn.ctc_loss(labels=input_labels,
                               inputs=logits,
                               sequence_length=sequence_length *
                               tf.ones(batch_size, dtype=tf.int32)),
                name='compute_loss',
            )
            tf.losses.add_loss(loss)
            total_loss = tf.losses.get_total_loss(False)

        with tf.name_scope(None, 'decoder'):
            decoded, _ = tf.nn.ctc_beam_search_decoder(
                logits,
                sequence_length * tf.ones(batch_size, dtype=tf.int32),
                merge_repeated=False,
            )
            with tf.name_scope(None, 'acurracy'):
                sequence_dist = tf.reduce_mean(
                    tf.edit_distance(tf.cast(decoded[0], tf.int32),
                                     input_labels),
                    name='seq_dist',
                )
            preds = tf.sparse_tensor_to_dense(decoded[0], name='prediction')
            gt_labels = tf.sparse_tensor_to_dense(input_labels,
                                                  name='Ground_Truth')

        global_step = tf.Variable(initial_value=0,
                                  name="global_step",
                                  trainable=False,
                                  collections=[
                                      tf.GraphKeys.GLOBAL_STEP,
                                      tf.GraphKeys.GLOBAL_VARIABLES
                                  ])

        # logging hook used during training
        tensors_print = {
            'global_step': global_step,
            #'loss': loss,
        }
        loghook = tf.train.LoggingTensorHook(
            tensors=tensors_print,
            every_n_iter=FLAGS.log_every_n_steps,
        )
        # hook to stop training at the final step
        stophook = tf.train.StopAtStepHook(last_step=FLAGS.number_of_steps)

        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=FLAGS.gpu_memory_fraction)
        session_config = tf.ConfigProto(log_device_placement=False,
                                        gpu_options=gpu_options)

        train_op = tf.assign_add(global_step, tf.constant(1))
        session = tf.train.ChiefSessionCreator(
            config=session_config,
            checkpoint_dir=FLAGS.train_checkpoints,
        )

        labels_shape = input_labels.dense_shape
        with tf.train.MonitoredSession(session, hooks=[loghook,
                                                       stophook]) as sess:

            while not sess.should_stop():
                test_logits, test_images, test_shape, _ = sess.run(
                    [logits, input_images, labels_shape, input_labels])
                # debug check: flag any batch whose tensors do not match batch_size
                if (test_logits.shape[1] != FLAGS.batch_size
                        or test_images.shape[0] != FLAGS.batch_size
                        or test_shape[0] != FLAGS.batch_size):
                    print("get it!!!!!")
                test_loss = sess.run([loss])
                sess.run(train_op)
Example #8
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

import tensorflow as tf  # required for the tf.* calls below

from data_utils.vocabulary import Vocabulary
from model import configuration, crnn
tf.logging.set_verbosity(tf.logging.INFO)


gpu_options = tf.GPUOptions(allow_growth=True)
config = tf.ConfigProto(log_device_placement=False, gpu_options=gpu_options)
sess = tf.InteractiveSession(config=config)


# build the model
vocab = Vocabulary()

with tf.name_scope(None, 'input_image'):
    img_input = tf.placeholder(tf.uint8, shape=(32, 300, 3))
    image = tf.to_float(img_input)
    image = tf.expand_dims(image, 0)

model = crnn.CRNN(256, 37, 'inference')
logit = model.build(image)

# print(logit.get_shape().as_list())
# print(tf.shape(logit)[0])
# sys.exit()

decodes, _ = tf.nn.ctc_beam_search_decoder(inputs=logit,
                                           sequence_length=tf.shape(