def main():
    global simi_log
    global server_port
    global thread_num
    global word_vec_dim
    # global top_num_simi
    global work_data_dir
    global work_src_file
    global work_src_matrix
    global work_test_matrix
    cp = ConfigParser.SafeConfigParser()
    cp.read('conf_aysimi_skipthought.conf')
    server_port = cp.get('server', 'port')

    thread_num = int(cp.get('simi_calc', 'thread_num'))
    # word_vec_dim = int(cp.get('simi_calc', 'word_vec_dim'))
    word_vec_dim = FLAGS.num_units
    # top_num_simi = int(cp.get('simi_calc', 'top_num_simi'))
    work_data_dir = cp.get('simi_calc', 'work_data_dir')
    work_src_file = cp.get('simi_calc', 'work_src_file')
    work_src_matrix = cp.get('simi_calc', 'work_src_matrix')
    work_test_matrix = cp.get('simi_calc', 'work_test_matrix')

    simi_log = FinalLogger('aysimi_skipthought.log')

    init_tf_model()

    simi_log.info('---start anyou simi skipthought server---')

    application.listen(server_port)
    tornado.ioloop.IOLoop.instance().start()
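
For reference, a minimal conf_aysimi_skipthought.conf consistent with the
cp.get calls above could look like the sketch below; every value is an
illustrative assumption, not a setting from the original project:

    [server]
    port = 8888

    [simi_calc]
    thread_num = 4
    work_data_dir = ./data/
    work_src_file = anyou_src.txt
    work_src_matrix = anyou_src_matrix.npy
    work_test_matrix = anyou_test_matrix.npy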
Example No. 2
    def __init__(self, vocab_size, start_vocab, max_target_len, unit_type,
                 num_units, num_layers, dropout, embedding_size, learning_rate,
                 num_keep_ckpts):
        self.vocab_size = vocab_size  # src & tgt share vocab_size
        self.start_vocab = start_vocab  # start_vocab = ['<pad>', '<go>', '<eos>', '<unk>']
        self.max_target_len = max_target_len
        # net-parameters
        self.unit_type = unit_type
        self.num_units = num_units
        self.num_layers = num_layers
        self.dropout = dropout
        self.embedding_size = embedding_size
        self.learning_rate = learning_rate
        self.embedding_share = None
        # net-output-data
        self.curr_encoder_output = None
        self.curr_encoder_state = None
        self.prev_train_logits = None
        self.prev_predict_logits = None
        self.next_train_logits = None
        self.next_predict_logits = None
        self.loss = None
        self.gradients = None
        self.train_op = None
        # net-transit-data
        self.encoder_output = None
        self.encoder_state = None
        self.prev_train_decoder_output = None
        self.prev_predict_decoder_output = None
        self.next_train_decoder_output = None
        self.next_predict_decoder_output = None
        # init-log
        self._logger = FinalLogger(self.LOG_FILE)
        # init-device
        self.num_gpus = 0
        self._init_device_gpus()
        # init placeholder
        self._init_placeholder()
        # init embeddings
        self._init_embeddings()
        # build graph
        self._build_graph()
        # compute and apply gradients
        self._build_train()
        # predict
        self._build_predict()
        # save train
        self.saver = tf.train.Saver(tf.global_variables(),
                                    max_to_keep=num_keep_ckpts)
Example No. 3
def main():
    global simi_log
    global work_data_dir
    global thread_num
    global word_vec_dim
    global class_server_url
    global seg_a_word

    global minshi_firstlist
    global minshi_nodemap
    global minshi_label_map
    global xingshi_firstlist
    global xingshi_nodemap
    global xingshi_label_map

    simi_log = FinalLogger('aysimi_skipthought_close_beta.log')

    seg_a_word = SegCNAWord()

    cp = ConfigParser.SafeConfigParser()
    cp.read('conf_aysimi_skipthought_close.conf')

    server_port = cp.get('server', 'port')

    work_data_dir = cp.get('simi_calc', 'work_data_dir')
    thread_num = int(cp.get('simi_calc', 'thread_num'))
    word_vec_dim = FLAGS.num_units

    class_server_url = cp.get('class_server', 'server_url')

    node.loadConfig(work_data_dir + 'AY_minshi.xml', minshi_firstlist,
                    minshi_nodemap, minshi_label_map)
    node.loadConfig(work_data_dir + 'AY_xingshi.xml', xingshi_firstlist,
                    xingshi_nodemap, xingshi_label_map)

    init_tf_model()

    simi_log.info('---anyou similar skipthought close beta start server---')

    application.listen(server_port)
    tornado.ioloop.IOLoop.instance().start()
Example No. 4
    def __init__(self,
                 fname,
                 line_min_words=5,
                 line_process_fn=lambda x: x.strip(),
                 max_vocab_size=100000,
                 max_len=100,
                 verbose=10000):
        """Class for reading text data and making batches.

        Args:
            fname (str): File with data.
            line_process_fn (callable): Line processing function (str -> str). Use it if you want
                to do lemmatization or remove stopwords or smth. Default lambda x: x.strip()
            max_vocab_size (int): Maximum vocabulary size. Most frequent words are used.
            verbose (int): Verbosity level on reading data.
        """
        self.verbose = verbose
        self._logger = FinalLogger(
            os.path.dirname(__file__) + '/text_data.log')
        self.fname = fname
        self.line_min_words = line_min_words
        self.max_len = max_len
        self.max_vocab_size = max_vocab_size
        self.line_process_fn = line_process_fn

        self._check_args()

        self.vocab = Vocab()
        self.total_lines = None
        self.prev_sent = None
        self.curr_sent = None
        self.next_sent = None

        # test
        # self.len_over_100 = 0
        # self.num_sents = 0
        # self.len_sents = 0

        self._build_vocabulary_and_stats()
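
A minimal construction sketch for the class above; the corpus path and the
size limits are illustrative assumptions:

    text_data = TextData('corpus_seg.txt',
                         line_min_words=5,
                         max_vocab_size=50000,
                         max_len=100)
    # Reading the file and building the vocabulary happen in __init__,
    # so text_data.vocab can encode/decode immediately afterwards.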
Example No. 5
                            continue
                        prev_predict, next_predict = sess.run(
                            [skip_thought_model.prev_predict_logits, skip_thought_model.next_predict_logits],
                            feed_dict=eval_feed_dict(skip_thought_model, pred_batch)
                        )

                        train_log.info('%d, %s', l, '------')
                        for pred_i in prev_predict:
                            pred_str = ''
                            for pred_j in pred_i:
                                pred_str += text_data.vocab.index2word[pred_j] + ','
                            train_log.info(pred_str)

                        for next_i in next_predict:
                            next_str = ''
                            for next_j in next_i:
                                next_str += text_data.vocab.index2word[next_j] + ','
                            train_log.info(next_str)

                    # save session
                    skip_thought_model.saver.save(sess, FLAGS.checkpoint_dir + 'model.ckpt', global_step=i)

    train_log.info('Elapsed time: ' + str((time.time() - start_time)))


if __name__ == '__main__':
    train_log = FinalLogger('skip_thought_train.log')
    train_log.info('start')
    tf.app.run()
    # tf.app.run() exits via sys.exit(), so this final log line is normally not reached.
    train_log.info('ok')
Example No. 6
class TextData:

    NUM_LINE_TRIPLES = 3
    ONE_LINE_TOKEN = '<one>'

    def __init__(self,
                 fname,
                 line_min_words=5,
                 line_process_fn=lambda x: x.strip(),
                 max_vocab_size=100000,
                 max_len=100,
                 verbose=10000):
        """Class for reading text data and making batches.

        Args:
            fname (str): File with data.
            line_process_fn (callable): Line processing function (str -> str). Use it if you want
                to do lemmatization or remove stopwords or smth. Default lambda x: x.strip()
            max_vocab_size (int): Maximum vocabulary size. Most frequent words are used.
            verbose (int): Verbosity level on reading data.
        """
        self.verbose = verbose
        self._logger = FinalLogger(
            os.path.dirname(__file__) + '/text_data.log')
        self.fname = fname
        self.line_min_words = line_min_words
        self.max_len = max_len
        self.max_vocab_size = max_vocab_size
        self.line_process_fn = line_process_fn

        self._check_args()

        self.vocab = Vocab()
        self.total_lines = None
        self.prev_sent = None
        self.curr_sent = None
        self.next_sent = None

        # test
        # self.len_over_100 = 0
        # self.num_sents = 0
        # self.len_sents = 0

        self._build_vocabulary_and_stats()

    def _check_args(self):
        assert self.max_vocab_size > 0
        assert os.path.isfile(self.fname)

    def _build_vocabulary_and_stats(self):
        """Builds vocabulary, calculates maximum length and total number of
        lines in file.
        """
        with open(self.fname) as f:
            # self.vocab = Vocab()
            self.total_lines = 0
            for line in f:
                tokens = self._tok_line(line)
                # tmp_max_len = max(map(len, map(self._tok_line, seg_sentence(line)))) + 2  # 2 = len([<go>, <eos>])
                # self.len_over_100 += len(filter(
                #     lambda x: x >= 100, map(len, map(self._tok_line, seg_sentence(line)))))
                # self.num_sents += len(map(self._tok_line, seg_sentence(line)))
                # self.len_sents += sum(map(len, map(self._tok_line, seg_sentence(line))))
                # if tmp_max_len > self.max_len:
                #     self.max_len = tmp_max_len
                if not tokens:
                    continue
                self.vocab.add_words(tokens)

                self.total_lines += 1
                if self.total_lines % self.verbose == 0:
                    self._logger.info('Read\t{0} lines.'.format(
                        self.total_lines))
        self.vocab.cut_by_freq(self.max_vocab_size)
        self._logger.info('Read\t{0} lines.'.format(self.total_lines))
        self._logger.info('Done building vocab (%d words) and stats.',
                          len(self.vocab))

    def pro_triples_data(self, batch_size):
        """Generate triples data, reads lines from file and encodes words.
        """
        self.prev_sent = list()
        self.curr_sent = list()
        self.next_sent = list()
        with open(self.fname) as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                seg_line = seg_sentence(line, self.line_min_words)
                if not seg_line:
                    continue

                triples = self.make_lines_triples(seg_line)
                self.prev_sent.extend(triples[0])
                self.curr_sent.extend(triples[1])
                self.next_sent.extend(triples[2])
                if len(self.curr_sent) < batch_size:
                    continue
                # TODO: Optimize batch-data initialization.
                for data_iter in self.triples_data_iterator(
                        self.prev_sent,
                        self.curr_sent,
                        self.next_sent,
                        max_len=self.max_len,
                        batch_size=batch_size):
                    yield data_iter

                self.prev_sent = list()
                self.curr_sent = list()
                self.next_sent = list()

    def pro_tuple_data(self, out_file_name, batch_size=1):
        """Generate one tuple data, reads lines from file and encodes words.
        """
        if not out_file_name or not os.path.exists(out_file_name):
            return
        curr_sent = []
        with open(out_file_name) as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                seg_line = seg_sentence(line, self.line_min_words)
                if not seg_line:
                    continue

                curr_sent.extend(filter(lambda x: x, seg_line))
                for data_iter in self.lines_curr_iterator(
                        curr_sent, batch_size=batch_size):
                    yield data_iter
                yield TextData.ONE_LINE_TOKEN
                curr_sent = []

    def _tok_line(self, line):
        """Tokenizes raw line.

        Args:
            line (str): Raw line.
        Returns:
            tokens (list of str): List of tokens.
        """
        if not line or not isinstance(line, str):
            return
        return self.line_process_fn(line).split()

    def encode_line(self, line, with_eos=False, with_go=False):
        """Encodes raw line to list of word indices. Applies ``line_process_fn`` before encoding.

        Args:
            line (str): Raw line.
            with_eos (bool): Whether to append eos_value at the end or not.
            with_go (bool): Whether to append go_token in the beginning of line or not.
        Returns:
             encoded (list of ints): Encoded line.
        """
        tokens = self._tok_line(line)
        encoded = self.vocab.encode_words(tokens, with_eos, with_go)
        return encoded

    def encode_lines(self, lines, with_eos=False, with_go=False):
        """Encodes raw lines to list of word indices. Applies ``line_process_fn`` for each line.

        Args:
            lines (list of str): List of raw lines.
            with_eos (bool): Whether to append eos_value at the end of each line or not.
            with_go (bool): Whether to append go_token in the beginning of each line or not.
        Returns:
             encoded (list of list of ints): List of encoded lines.
        """
        encoded = [self.encode_line(line, with_eos, with_go) for line in lines]
        return encoded

    def decode_line(self, encoded_line):
        return self.vocab.decode_idxs(encoded_line)

    def make_batch(self, encoded_lines, max_len=None):
        """Makes `Batch` instance based on `encoded_lines`.

        Args:
            encoded_lines (list of list of int): List of encoded lines. Encoded lines
                can be obtained via ``encode_lines`` or ``encode_line`` methods.
            max_len (int): If not None, lines will be padded up to max_len with vocab.pad_value.
                Otherwise, lines will be padded using maximum length of line in ``encoded_lines``.
        Returns:
            batch (Batch): Batch instance.
        """
        if not max_len:
            max_len = min(max(map(len, encoded_lines)), self.max_len)
        encoded_lines = [line[:max_len] for line in encoded_lines]
        padded_lines = utils.pad_sequences(encoded_lines, max_len,
                                           self.vocab.pad_value)
        batch = Batch(padded_lines, self.vocab.pad_value, self.vocab.go_value,
                      self.vocab.eos_value)
        return batch

    @staticmethod
    def _make_triples_for_paragraph(paragraph):
        """Generate prev, curr, next lists based on paragraph.
        """
        if len(paragraph) < TextData.NUM_LINE_TRIPLES:
            return [], [], []
        prev = paragraph[:-2]
        curr = paragraph[1:-1]
        next = paragraph[2:]
        return prev, curr, next

    def make_triples(self, lines):
        """Returns prev, curr, next lists based on lines.

        Context is not shared between different paragraphs in text. So, last line in one paragraph
        will not be in context with first line in the next paragraph.
        Paragraphs must be separated by '\n\n'

        There will be asymmetric context for first and last lines.

        Args:
            lines (list of str): List of lines.
        Returns:
            prev, curr, next (tuple of list of str):
        """
        # Indices of empty lines mark paragraph boundaries. A plain list
        # comprehension is used instead of filter(None, ...), which would
        # wrongly drop an empty line at index 0 (0 is falsy).
        idxs = ([-1] + [i for i in range(len(lines)) if len(lines[i]) == 0]
                + [len(lines)])
        all_prev, all_curr, all_next = [], [], []
        for start, end in zip(idxs[:-1], idxs[1:]):
            tmp_prev, tmp_curr, tmp_next = self._make_triples_for_paragraph(
                lines[start + 1:end])
            if tmp_prev == [] or tmp_curr == [] or tmp_next == []:
                continue
            all_prev.extend(tmp_prev)
            all_curr.extend(tmp_curr)
            all_next.extend(tmp_next)
        return all_prev, all_curr, all_next
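        # Illustrative behavior (an assumed input, with '' marking a
        # paragraph break):
        #   make_triples(['a', 'b', 'c', '', 'd', 'e', 'f'])
        #   -> (['a', 'd'], ['b', 'e'], ['c', 'f'])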

    @staticmethod
    def make_lines_triples(lines):
        """Returns prev, curr, next lists based on lines.
        """
        lines = [x for x in lines if x]  # keep non-empty lines
        if len(lines) < TextData.NUM_LINE_TRIPLES:
            return [], [], []
        # prev, curr, next
        return lines[:-2], lines[1:-1], lines[2:]
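        # e.g. make_lines_triples(['s1', 's2', 's3', 's4'])
        #   -> (['s1', 's2'], ['s2', 's3'], ['s3', 's4'])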

    def triples_data_iterator(self,
                              prev_data,
                              curr_data,
                              next_data,
                              max_len,
                              batch_size=64,
                              shuffle=False):
        """Creates iterator for (current sentence, prev sentence, next sentence)
        data. Is is useful for training skip-thought vectors.

        Args:
            curr_data (list of str): List with raw lines which correspond to current
                sentences. Lines can have different lengths. They will be encoder inputs.
            prev_data (list of str): List with raw previous lines. Lines can have
                different lengths.
            next_data (list of str): List with raw next lines. Lines can have
                different lengths.
            max_len (int): Maximum length for padding previous and next sentences.
            batch_size (int): Size of batch.
            shuffle (bool): Whether to shuffle data or not.

        Yields:
            enc_inp, prev_inp, prev_targ, next_inp, next_targ (Batch)

        """
        if shuffle:
            indices = np.random.permutation(len(curr_data))
            curr_data = [curr_data[i] for i in indices]
            prev_data = [prev_data[i] for i in indices]
            next_data = [next_data[i] for i in indices]

        total_processed_examples = 0
        total_steps = int(np.ceil(len(curr_data) / float(batch_size)))
        for step in range(total_steps):
            batch_start = step * batch_size

            curr = curr_data[batch_start:batch_start + batch_size]
            prev = prev_data[batch_start:batch_start + batch_size]
            next = next_data[batch_start:batch_start + batch_size]

            if batch_start + batch_size > len(curr_data):
                # Pad the final, short batch up to batch_size with randomly
                # sampled triples so every batch has a fixed size.
                num_index = min(batch_size,
                                batch_start + batch_size - len(curr_data))
                data_index = random.sample(range(len(curr_data)), num_index)
                for i_index in data_index:
                    curr.append(curr_data[i_index])
                    prev.append(prev_data[i_index])
                    next.append(next_data[i_index])

            enc_inp = self.make_batch(self.encode_lines(curr))

            prev_inp = self.make_batch(self.encode_lines(prev, with_go=True),
                                       max_len)
            prev_targ = self.make_batch(self.encode_lines(prev, with_eos=True),
                                        max_len)

            next_inp = self.make_batch(self.encode_lines(next, with_go=True),
                                       max_len)
            next_targ = self.make_batch(self.encode_lines(next, with_eos=True),
                                        max_len)
            assert prev_inp.shape == prev_targ.shape == next_inp.shape == next_targ.shape, (
                prev, curr, next)

            yield enc_inp, prev_inp, prev_targ, next_inp, next_targ

            total_processed_examples += len(curr)

            if total_processed_examples >= len(curr_data):
                break

        assert total_processed_examples >= len(curr_data), \
            'Expected {} and processed {}'.format(len(curr_data),
                                                  total_processed_examples)

    def lines_curr_iterator(self, curr_data, batch_size=1, shuffle=False):
        """Creates iterator for current sentence data. Is is useful for predicting | encoding skip-thought vectors.
        """

        if shuffle:
            indices = np.random.permutation(len(curr_data))
            curr_data = [curr_data[i] for i in indices]

        total_processed_examples = 0
        total_steps = int(np.ceil(len(curr_data) / float(batch_size)))
        for step in range(total_steps):
            batch_start = step * batch_size

            curr = curr_data[batch_start:batch_start + batch_size]

            if batch_start + batch_size > len(curr_data):
                num_index = min(batch_size,
                                batch_start + batch_size - len(curr_data))
                data_index = random.sample(range(len(curr_data)), num_index)
                for i_index in data_index:
                    curr.append(curr_data[i_index])

            enc_inp = self.make_batch(self.encode_lines(curr))
            assert curr

            yield enc_inp

            total_processed_examples += len(curr)
            if total_processed_examples >= len(curr_data):
                break
        assert total_processed_examples >= len(curr_data), \
            'Expected {} and processed {}'.format(len(curr_data),
                                                  total_processed_examples)
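
A hedged end-to-end sketch of the batching pipeline above; the corpus path
and the batch size are illustrative assumptions:

    text_data = TextData('corpus_seg.txt', max_vocab_size=50000)
    for batch in text_data.pro_triples_data(batch_size=64):
        enc_inp, prev_inp, prev_targ, next_inp, next_targ = batch
        # enc_inp feeds the encoder; prev_inp/next_inp are <go>-prefixed
        # decoder inputs, prev_targ/next_targ the matching <eos>-suffixed
        # targets, all padded Batch instances.
        pass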
Example No. 7
class SkipThoughtModel(object):
    """
    Model skip-thought
    """
    VOCAB_SIZE_THRESHOLD_CPU = 20000
    MAX_GRADIENT_NORM = 5.0
    LOG_FILE = 'skip_thought_model.log'

    def __init__(self, vocab_size, start_vocab, max_target_len, unit_type,
                 num_units, num_layers, dropout, embedding_size, learning_rate,
                 num_keep_ckpts):
        self.vocab_size = vocab_size  # src & tgt share vocab_size
        self.start_vocab = start_vocab  # start_vocab = ['<pad>', '<go>', '<eos>', '<unk>']
        self.max_target_len = max_target_len
        # net-parameters
        self.unit_type = unit_type
        self.num_units = num_units
        self.num_layers = num_layers
        self.dropout = dropout
        self.embedding_size = embedding_size
        self.learning_rate = learning_rate
        self.embedding_share = None
        # net-output-data
        self.curr_encoder_output = None
        self.curr_encoder_state = None
        self.prev_train_logits = None
        self.prev_predict_logits = None
        self.next_train_logits = None
        self.next_predict_logits = None
        self.loss = None
        self.gradients = None
        self.train_op = None
        # net-transit-data
        self.encoder_output = None
        self.encoder_state = None
        self.prev_train_decoder_output = None
        self.prev_predict_decoder_output = None
        self.next_train_decoder_output = None
        self.next_predict_decoder_output = None
        # init-log
        self._logger = FinalLogger(self.LOG_FILE)
        # init-device
        self.num_gpus = 0
        self._init_device_gpus()
        # init placeholder
        self._init_placeholder()
        # init embeddings
        self._init_embeddings()
        # build graph
        self._build_graph()
        # compute and apply gradients
        self._build_train()
        # predict
        self._build_predict()
        # save train
        self.saver = tf.train.Saver(tf.global_variables(),
                                    max_to_keep=num_keep_ckpts)

    def _init_device_gpus(self):
        """Init device GPU and CPU."""
        gpu_names = [
            x.name for x in device_lib.list_local_devices()
            if x.device_type == 'GPU'
        ]
        self.num_gpus = len(gpu_names)
        self._logger.info('{0} GPUs are detected : {1}'.format(
            self.num_gpus, gpu_names))

    def _init_placeholder(self):
        """Init prev_curr_next data placeholder."""
        self._logger.info('Init prev_curr_next data placeholder.')
        with tf.variable_scope('placeholders'):
            # curr input
            self.curr_source_data = tf.placeholder(tf.int32, [None, None],
                                                   name='curr_data')
            self.curr_source_seq_len = tf.placeholder(tf.int32, [None],
                                                      name='curr_data_seq_len')
            self.batch_size = tf.size(self.curr_source_seq_len,
                                      name='batch_size')
            # prev target
            self.prev_target_data_input = tf.placeholder(
                tf.int32, [None, None], name='prev_targets_input')
            self.prev_target_data_output = tf.placeholder(
                tf.int32, [None, None], name='prev_targets_output')
            self.prev_target_mask = tf.placeholder(tf.float32, [None, None],
                                                   name='prev_targets_mask')
            self.prev_target_seq_len = tf.placeholder(
                tf.int32, [None], name='prev_targets_seq_len')
            # next target
            self.next_target_data_input = tf.placeholder(
                tf.int32, [None, None], name='next_targets_input')
            self.next_target_data_output = tf.placeholder(
                tf.int32, [None, None], name='next_targets_output')
            self.next_target_mask = tf.placeholder(tf.float32, [None, None],
                                                   name='next_targets_mask')
            self.next_target_seq_len = tf.placeholder(
                tf.int32, [None], name='next_targets_seq_len')

    def _build_cell(self, unit_type, num_units, num_layers, dropout):
        """Build cell"""
        cell_list = []
        for i in range(num_layers):
            single_cell = self._create_rnn_cell(
                unit_type=unit_type,
                num_units=num_units,
                dropout=dropout,
                device_str=self._get_device_str(i, self.num_gpus))
            cell_list.append(single_cell)

        if len(cell_list) == 1:
            return cell_list[0]
        else:  # Multi layers
            return tf.contrib.rnn.MultiRNNCell(cell_list)

    def _create_rnn_cell(self, unit_type, num_units, dropout, device_str=None):
        """Create rnn single-cell"""
        # cell
        if unit_type == 'lstm':
            single_cell = tf.contrib.rnn.LSTMCell(
                num_units,
                initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
        elif unit_type == 'gru':
            single_cell = tf.contrib.rnn.GRUCell(num_units)
        else:
            raise ValueError('Unknown cell type %s!' % unit_type)
        # dropout wrapper
        if dropout > 0.0:
            single_cell = tf.contrib.rnn.DropoutWrapper(
                cell=single_cell,
                input_keep_prob=(1.0 - dropout),
                output_keep_prob=1.0)
        # device wrapper
        if device_str:
            single_cell = tf.contrib.rnn.DeviceWrapper(single_cell, device_str)
            self._logger.info('  %s, device=%s' %
                              (type(single_cell).__name__, device_str))

        return single_cell

    @staticmethod
    def _get_device_str(device_id, num_gpus):
        """Return a device string for multi-GPU setup."""
        if num_gpus == 0:
            return '/cpu:0'
        device_str_output = '/gpu:%d' % (device_id % num_gpus)
        return device_str_output

    def _get_embed_device(self, vocab_size):
        """Get embed device"""
        if vocab_size < self.VOCAB_SIZE_THRESHOLD_CPU and self.num_gpus > 0:
            return '/gpu:0'
        else:
            return '/cpu:0'

    def _init_embeddings(self):
        """Init embedding."""
        # share vocab
        self._logger.info('Init embedding src_tgt_share.')
        with tf.device(self._get_embed_device(self.vocab_size)):
            self.embedding_share = tf.get_variable(
                'embedding_share', [self.vocab_size, self.embedding_size],
                dtype=tf.float32)
        self._logger.info('  %s, device=%s' %
                          (type(self.embedding_share).__name__,
                           self._get_embed_device(self.vocab_size)))

    def _build_encoder(self, enc_scope_name):
        """Network encoder."""
        self._logger.info('Build encoder.')
        with tf.variable_scope(enc_scope_name):
            # shape, [batch_size, max_time, embed_size]
            # encoder_embed_input = tf.contrib.layers.embed_sequence(self.curr_source_data, self.vocab_size,
            #                                                        self.embedding_size)
            encoder_embed_input = tf.nn.embedding_lookup(
                self.embedding_share, self.curr_source_data)
            cell = self._build_cell(self.unit_type, self.num_units,
                                    self.num_layers, self.dropout)
            encoder_output, encoder_state = tf.nn.dynamic_rnn(
                cell,
                encoder_embed_input,
                sequence_length=self.curr_source_seq_len,
                dtype=tf.float32)

        return encoder_output, encoder_state

    def _build_decoder(self, encoder_output, encoder_state, target_data,
                       target_seq_len, dec_scope_name):
        """Network decoder."""
        self._logger.info('Build %s.', dec_scope_name)
        with tf.variable_scope(dec_scope_name):
            # decoder_embeddings = tf.Variable(tf.random_uniform([self.vocab_size, self.embedding_size]))

            cell = self._build_cell(self.unit_type, self.num_units,
                                    self.num_layers, self.dropout)

            # attention-model
            cell, encoder_state = self._build_attention(
                encoder_output, encoder_state, cell)
            # output_layer
            output_layer = Dense(
                self.vocab_size,
                use_bias=False,
                kernel_initializer=tf.truncated_normal_initializer(mean=0.0,
                                                                   stddev=0.1))

            self._logger.info(' Build decoder train.')
            with tf.variable_scope(dec_scope_name + '_train'):
                # Data format of target_data: <GO>...<PAD>
                # shape: [batch_size, max_time, embed_size], type: float32.
                decoder_embed_input = tf.nn.embedding_lookup(
                    self.embedding_share, target_data)
                train_helper = tf.contrib.seq2seq.TrainingHelper(
                    inputs=decoder_embed_input,
                    sequence_length=target_seq_len,
                    time_major=False)
                train_decoder = tf.contrib.seq2seq.BasicDecoder(
                    cell, train_helper, encoder_state, output_layer)
                train_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                    train_decoder,
                    impute_finished=True,
                    maximum_iterations=self.max_target_len)

            self._logger.info(' Build decoder predict.')
            with tf.variable_scope(dec_scope_name + '_predict', reuse=True):
                # start_tokens = tf.tile(
                #     tf.constant([self.start_vocab.index('<go>')], dtype=tf.int32),
                #     [self.batch_size], name='start_tokens')
                predict_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                    self.embedding_share,
                    tf.fill([self.batch_size], self.start_vocab.index('<go>')),
                    self.start_vocab.index('<eos>'))
                predict_decoder = tf.contrib.seq2seq.BasicDecoder(
                    cell, predict_helper, encoder_state, output_layer)
                predict_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                    predict_decoder,
                    impute_finished=True,
                    maximum_iterations=self.max_target_len)

        return train_decoder_output, predict_decoder_output

    def _build_attention(self, encoder_output, encoder_state, cell):
        """Attention"""
        # attention_states: [batch_size, max_time, num_units]
        # attention_states = tf.transpose(encoder_output, [1, 0, 2])
        attention_mechanism = tf.contrib.seq2seq.LuongAttention(
            self.num_units,
            encoder_output,
            memory_sequence_length=self.curr_source_seq_len)

        cell = tf.contrib.seq2seq.AttentionWrapper(
            cell, attention_mechanism, attention_layer_size=self.num_units)

        decoder_initial_state = cell.zero_state(
            self.batch_size, tf.float32).clone(cell_state=encoder_state)

        cell = tf.contrib.rnn.DeviceWrapper(
            cell, self._get_device_str(self.num_layers - 1, self.num_gpus))

        return cell, decoder_initial_state

    def _build_graph(self):
        """Build skip-thought model by seq2seq model"""
        self._logger.info('Build graph.')
        # curr_data encoder
        self.encoder_output, self.encoder_state = self._build_encoder(
            'encoder')
        # prev_data decoder
        self.prev_train_decoder_output, self.prev_predict_decoder_output = self._build_decoder(
            self.encoder_output, self.encoder_state,
            self.prev_target_data_input, self.prev_target_seq_len,
            'prev_decoder')
        # next_data decoder
        self.next_train_decoder_output, self.next_predict_decoder_output = self._build_decoder(
            self.encoder_output, self.encoder_state,
            self.next_target_data_input, self.next_target_seq_len,
            'next_decoder')
        self._logger.info('Compute loss.')
        # compute loss
        with tf.device(self._get_device_str(self.num_layers - 1,
                                            self.num_gpus)):
            # prev loss
            prev_train_logits = tf.identity(
                self.prev_train_decoder_output.rnn_output, name='prev_logits')
            prev_loss = self._compute_loss(self.prev_target_data_output,
                                           self.prev_target_mask,
                                           prev_train_logits)
            # next loss
            next_train_logits = tf.identity(
                self.next_train_decoder_output.rnn_output, name='next_logits')
            next_loss = self._compute_loss(self.next_target_data_output,
                                           self.next_target_mask,
                                           next_train_logits)
            # loss
            self.loss = prev_loss + next_loss

    def _compute_loss(self, target_output, target_mask, logits):
        """Compute optimization loss."""
        crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=target_output, logits=logits)
        loss = tf.reduce_sum(crossent * target_mask) / tf.to_float(
            self.batch_size)

        return loss

    def _build_train(self):
        """Train, compute and apply gradients"""
        self._logger.info('Build train.')
        params = tf.trainable_variables()
        gradients = tf.gradients(self.loss, params)
        clipped_grads, _ = tf.clip_by_global_norm(gradients,
                                                  self.MAX_GRADIENT_NORM)

        opt = tf.train.AdamOptimizer(self.learning_rate)
        self.train_op = opt.apply_gradients(zip(clipped_grads, params))

    def _build_predict(self):
        """Predict output: curr_data encoder, prev_predict and next_predict data"""
        self._logger.info('Build predict.')
        with tf.device(self._get_device_str(self.num_layers - 1,
                                            self.num_gpus)):
            with tf.variable_scope('prev'):
                self.prev_train_logits = tf.identity(
                    self.prev_train_decoder_output.rnn_output, name='logits')
                self.prev_predict_logits = tf.identity(
                    self.prev_predict_decoder_output.sample_id,
                    name='predictions')

            with tf.variable_scope('next'):
                self.next_train_logits = tf.identity(
                    self.next_train_decoder_output.rnn_output, name='logits')
                self.next_predict_logits = tf.identity(
                    self.next_predict_decoder_output.sample_id,
                    name='predictions')

            with tf.variable_scope('curr'):
                self.curr_encoder_output = tf.identity(self.encoder_output,
                                                       name='output')
                self.curr_encoder_state = tf.identity(self.encoder_state,
                                                      name='state')
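
A construction sketch for the model above; the graph, train op, predict op,
and saver are all built inside __init__. All hyperparameter values here are
illustrative assumptions, not the project's real settings:

    skip_thought_model = SkipThoughtModel(
        vocab_size=50000,
        start_vocab=['<pad>', '<go>', '<eos>', '<unk>'],
        max_target_len=100,
        unit_type='gru',
        num_units=512,
        num_layers=1,
        dropout=0.2,
        embedding_size=300,
        learning_rate=0.001,
        num_keep_ckpts=5)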
Example No. 8
                pass

            with open(FLAGS.pred_tgt_path, 'w') as f:
                pred_data = text_data.pro_tuple_data(FLAGS.pred_src_path, batch_size=FLAGS.pred_batch_size)
                for j, batch in enumerate(pred_data):
                    if batch == text_data.ONE_LINE_TOKEN:
                        f.write('\n'.encode('utf-8'))
                        continue
                    prev_predict, curr_state = sess.run(
                        [skip_thought_model.prev_predict_logits, skip_thought_model.curr_encoder_state],
                        feed_dict=pred_feed_dict(skip_thought_model, batch)
                    )

                    predict_log.info('%d, %s', j, '------')
                    for pred_i in prev_predict:
                        pred_str = ''
                        for pred_j in pred_i:
                            pred_str += text_data.vocab.index2word[pred_j] + ','
                        predict_log.info(pred_str)

                    f.write((' '.join(map(str, curr_state[-1][-1])) + ' ').encode('utf-8'))

    predict_log.info('Elapsed time: ' + str((time.time() - start_time)))


if __name__ == '__main__':
    predict_log = FinalLogger('skip_thought_pred.log')
    predict_log.info('start')
    tf.app.run()
    # tf.app.run() exits via sys.exit(), so this final log line is normally not reached.
    predict_log.info('ok')
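
pred_feed_dict is not shown in this example. A minimal sketch consistent with
the model's placeholders might look like the following; the Batch attribute
names (data, seq_lengths) are assumptions, not the project's actual fields:

    def pred_feed_dict(model, enc_inp):
        # Greedy decoding only needs the encoder inputs: the padded index
        # matrix and the per-line sequence lengths.
        return {model.curr_source_data: enc_inp.data,
                model.curr_source_seq_len: enc_inp.seq_lengths}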