Example #1
def read_data_sets(train_dir, vocab, hps):
    start_id = vocab.WordToId(data.SENTENCE_START)
    end_id = vocab.WordToId(data.SENTENCE_END)
    pad_id = vocab.WordToId(data.PAD_TOKEN)
    articles_abstracts = data.getArticlesAndAbstracts(train_dir)
    enc_inputs = np.zeros((len(articles_abstracts), hps.enc_timesteps),
                          dtype=np.int32)
    dec_inputs = np.zeros((len(articles_abstracts), hps.dec_timesteps),
                          dtype=np.int32)
    targets = np.zeros((len(articles_abstracts), hps.dec_timesteps),
                       dtype=np.int32)

    origin_articles = []
    origin_abstract = []
    for index, (article, abstract) in enumerate(articles_abstracts):
        # Use the <s> as the <GO> symbol for decoder inputs.
        enc_input = data.GetWordIds(article, vocab)
        dec_input = [start_id] + data.GetWordIds(abstract, vocab)

        enc_input = enc_input[:hps.enc_timesteps]
        dec_input = dec_input[:hps.dec_timesteps]

        # targets is dec_inputs without <s> at beginning, plus </s> at end
        target = dec_input[1:]
        target.append(end_id)

        # Now len(enc_inputs) should be <= enc_timesteps, and
        # len(targets) = len(dec_inputs) should be <= dec_timesteps

        #enc_input_len = len(enc_inputs)
        #dec_output_len = len(targets)

        # Pad if necessary
        while len(enc_input) < hps.enc_timesteps:
            enc_input.append(pad_id)
        while len(dec_input) < hps.dec_timesteps:
            dec_input.append(end_id)
        while len(target) < hps.dec_timesteps:
            target.append(end_id)

        enc_inputs[index] = enc_input
        dec_inputs[index] = dec_input
        targets[index] = target
        origin_articles.append(article)
        origin_abstract.append(abstract)

    return DataSet(enc_inputs, dec_inputs, targets, origin_articles,
                   origin_abstract)
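The heart of this example is how the decoder targets are built: drop the leading <s> from the decoder input, append </s>, then pad everything to the fixed timestep lengths. A minimal self-contained sketch of just that step, using made-up integer IDs in place of the real vocab lookups:

# Illustrative sketch only: toy IDs stand in for vocab.WordToId results.
start_id, end_id, pad_id = 1, 2, 0
enc_timesteps, dec_timesteps = 6, 5

enc_input = [10, 11, 12]            # article word IDs
dec_input = [start_id, 20, 21]      # <s> followed by abstract word IDs

target = dec_input[1:]              # drop <s> ...
target.append(end_id)               # ... and close with </s>

# Pad to the fixed lengths, as in the while-loops above.
enc_input += [pad_id] * (enc_timesteps - len(enc_input))
dec_input += [end_id] * (dec_timesteps - len(dec_input))
target += [end_id] * (dec_timesteps - len(target))

print(enc_input)   # [10, 11, 12, 0, 0, 0]
print(dec_input)   # [1, 20, 21, 2, 2]
print(target)      # [20, 21, 2, 2, 2]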
Example #2
def _extract_we_binary(output_file, vocab_file, we_dic):
    vocab = data.Vocab(vocab_file, 1000000)
    vsize = vocab.NumIds()
    output = codecs.open(output_file, "w", "utf-8")
    unknown_ids = [vocab.WordToId(UNKNOWN_TOKEN)]
    with open(we_dic, "rb") as f:
        header = f.readline()
        vocab_size, layer1_size = map(int, header.split())
        binary_len = np.dtype('float32').itemsize * layer1_size
        print "layer1_size:", layer1_size
        for line in xrange(vocab_size):
            word = []
            while True:
                ch = f.read(1)
                if ch == ' ':
                    word = ''.join(word)
                    break
                if ch != '\n':
                    word.append(ch)
            idx = data.GetWordIds(word, vocab)
            if idx != None and idx != unknown_ids and word == "<s>":
                print idx, ":", word
                output.write(word + ' ' + ' '.join(
                    map(str, np.fromstring(f.read(binary_len),
                                           dtype='float32'))) + '\n')
            else:
                # Skip the vector bytes for entries we do not keep.
                f.read(binary_len)
    output.close()
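The byte-level loop above follows the word2vec binary format: a text header line "vocab_size vector_size", then one record per word consisting of the word terminated by a space followed by vector_size raw float32 values. A standalone reader sketch under that assumption (the file path is illustrative, not part of the original module):

import numpy as np

def iter_word2vec_bin(path):
    """Yield (word, vector) pairs from a word2vec-style binary file."""
    with open(path, 'rb') as f:
        vocab_size, dim = map(int, f.readline().split())
        binary_len = np.dtype('float32').itemsize * dim
        for _ in range(vocab_size):
            chars = []
            while True:
                ch = f.read(1)
                if ch == b' ':        # the word ends at the first space
                    break
                if ch != b'\n':       # skip stray newlines between records
                    chars.append(ch)
            word = b''.join(chars).decode('utf-8', 'replace')
            vec = np.frombuffer(f.read(binary_len), dtype='float32')
            yield word, vec

# Example use:
# for word, vec in iter_word2vec_bin('vectors.bin'):
#     print(word)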
Example #3
 def _loadWord2VecGo(self):
     vsize = self._vocab.NumIds()
     emb_dim = self._hps.emb_dim
     print "vsize:", vsize
     print "emb_dim", emb_dim
     if FLAGS.word2vec:
         # initial matrix with random uniform
         initWE = np.random.uniform(-0.25, 0.25,
                                    (vsize, emb_dim)).astype(np.float32)
         # load any vectors from the word2vec
         print("Load word2vec file {}\n".format(FLAGS.word2vec))
         with open(FLAGS.word2vec, "rb") as f:
             header = f.readline()
             vocab_size, layer1_size = map(int, header.split())
             binary_len = np.dtype('float32').itemsize * layer1_size
             print "start to read"
             for line in xrange(vocab_size):
                 word = []
                 while True:
                     ch = f.read(1)
                     if ch == ' ':
                         word = ''.join(word)
                         break
                     if ch != '\n':
                         word.append(ch)
                 idx = data.GetWordIds(word, self._vocab)
                 if idx != None:
                     initWE[idx] = np.fromstring(f.read(binary_len),
                                                 dtype='float32')
                 else:
                     f.read(binary_len)
         print "initWe loaded:", initWE
         return initWE
Example #4
    def _loadWord2Vec(self):
        vsize = self._vocab.NumIds()
        emb_dim = self._hps.emb_dim
        print "vsize:", vsize
        print "emb_dim", emb_dim
        if FLAGS.word2vec:
            # initial matrix with random uniform
            initWE = np.random.uniform(-0.25, 0.25,
                                       (vsize, emb_dim)).astype(np.float32)
            # load any vectors from the word2vec
            print("Load word2vec file {}\n".format(FLAGS.word2vec))
            f = codecs.open(FLAGS.word2vec, "r")
            for line in f:
                string = line.split(" ")
                word = string[0]
                value = " ".join(x for x in string[1:])
                idx = data.GetWordIds(word, self._vocab)
                if idx != None:
                    initWE[idx] = np.fromstring(value,
                                                dtype='float32',
                                                sep=' ')

            f.close()
            print "initWe loaded:", initWE
            return initWE
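The text-format loader above only needs to split each line on spaces: the word comes first, then its float components. A tiny sketch of parsing a single (made-up) line:

import numpy as np

line = "apple 0.1 -0.2 0.3\n"
parts = line.rstrip().split(" ")
word = parts[0]                      # 'apple'
# np.fromstring with sep=' ' parses whitespace-separated floats, as above;
# np.array(parts[1:], dtype=np.float32) is an equivalent, non-deprecated form.
vec = np.fromstring(" ".join(parts[1:]), dtype='float32', sep=' ')
print(vec)                           # [ 0.1 -0.2  0.3]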
Example #5
  def _Decode(self, article_text):
    """Restore a checkpoint and decode it.
    Args:
      saver: Tensorflow checkpoint saver.
      sess: Tensorflow session.
    Returns:
      If success, returns true, otherwise, false.
    """

    bs = beam_search.BeamSearch(
        self._model, self._hps.batch_size,
        self._vocab.WordToId(data.SENTENCE_START),
        self._vocab.WordToId(data.SENTENCE_END),
        self._hps.dec_timesteps)

    ###################
    #article_text = "How do I know the difference, between class and object"
    article = "<d><p><s>"+article_text+"</s></p></d>"
    article_sentences = [sent.strip() for sent in data.ToSentences(article, include_token=False)]
    pad_id = self._vocab.WordToId(data.PAD_TOKEN)

    enc_inputs = []
    for i in xrange(min(100,len(article_sentences))):
      enc_inputs += data.GetWordIds(article_sentences[i], self._vocab)

    enc_input_len = len(enc_inputs)
    while len(enc_inputs) < self._hps.enc_timesteps:
      enc_inputs.append(pad_id)
    ###################


    # Tile the single example across every row of the (batch_size = 4) batch.
    w, h = 120, 4
    article_batch_cp = [[0 for x in range(w)] for y in range(h)]
    for i in range(0, 4):
      article_batch_cp[i] = enc_inputs  # article_batch[i]

    w, h = 1, 4
    article_lens_cp = [[0 for x in range(w)] for y in range(h)]
    #article_lens_cp = article_lens.copy()
    for i in range(0, 4):
      article_lens_cp[i] = enc_input_len

    best_beam = bs.BeamSearch(self._sess, article_batch_cp, article_lens_cp)
    #print len(best_beam)
    best_beam = best_beam[0]
     
    decode_output = [int(t) for t in best_beam.tokens[1:]]

    QUESTION = article_text

    test = ' '.join(data.Ids2Words(decode_output, self._vocab))
    end_p = test.find(data.SENTENCE_END, 0)

    if end_p != -1:
      test = test[:end_p]
    #print "<Answer>"+test
    ANSWER = test.replace('<UNK>','')

    return QUESTION, ANSWER
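The post-processing at the end of _Decode is plain string surgery: cut the decoded text at the first sentence-end marker and strip unknown tokens. A self-contained sketch with a made-up decoded string:

decoded = "the cat sat </s> <UNK> padding words"
end_p = decoded.find("</s>")
if end_p != -1:
    decoded = decoded[:end_p]
answer = decoded.replace("<UNK>", "").strip()
print(answer)   # the cat sat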
Example #6
 def _add_seq2seq(self):
     hps = self._hps
     vsize = self._vocab.NumIds()
     with tf.variable_scope('seq2seq'):
         encoder_inputs = tf.unpack(tf.transpose(self._articles))
         decoder_inputs = tf.unpack(tf.transpose(self._abstracts))
         targets = tf.unpack(tf.transpose(self._targets))
         loss_weights = tf.unpack(tf.transpose(self._loss_weights))
         article_lens = self._article_lens
         sess = tf.get_default_session()
         print sess
         with tf.variable_scope('Embedding'), tf.device('/gpu:0'):
             # Embedding shared by the input and outputs.
             #embedding = tf.get_variable(
             #      'embedding', [vsize, hps.emb_dim], dtype=tf.float32,
             #   trainable=False,
             #      initializer=tf.truncated_normal_initializer(stddev=1e-4))
             embedding = tf.get_variable(
                 'embedding', [vsize, hps.emb_dim],
                 dtype=tf.float32,
                 trainable=False,
                 initializer=tf.truncated_normal_initializer(stddev=1e-4))
             sess.run(tf.initialize_all_variables())
             if FLAGS.word2vec:
                 # initial matrix with random uniform
                 initW = np.random.uniform(-0.25, 0.25,
                                           (vsize, hps.emb_dim))
                 # load any vectors from the word2vec
                 print("Load word2vec file {}\n".format(FLAGS.word2vec))
                 with open(FLAGS.word2vec, "rb") as f:
                     header = f.readline()
                     vocab_size, layer1_size = map(int, header.split())
                     binary_len = np.dtype('float32').itemsize * layer1_size
                     for line in xrange(vocab_size):
                         word = []
                         while True:
                             ch = f.read(1)
                             if ch == ' ':
                                 word = ''.join(word)
                                 break
                             if ch != '\n':
                                 word.append(ch)
                         idx = data.GetWordIds(word, self._vocab)
                         if idx != None:
                             initW[idx] = np.fromstring(f.read(binary_len),
                                                        dtype='float32')
                         else:
                             f.read(binary_len)
              sess.run(embedding.assign(initW))
              # Hard-coded word IDs used here as a quick embedding-lookup check.
              encoder_inputs = [2, 4, 6, 8]
             emb_encoder_inputs = [
                 tf.nn.embedding_lookup(embedding, x)
                 for x in encoder_inputs
             ]
             print emb_encoder_inputs
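The embedding trick in this example is to create the variable with a random initializer, overwrite it with the word2vec matrix via assign, and then read rows back with embedding_lookup. A minimal sketch against the same old-style TensorFlow API used above (variable name and sizes are arbitrary):

import numpy as np
import tensorflow as tf

vsize, emb_dim = 5, 3
embedding = tf.get_variable('demo_embedding', [vsize, emb_dim], dtype=tf.float32)
initW = np.random.uniform(-0.25, 0.25, (vsize, emb_dim)).astype(np.float32)

with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())
    sess.run(embedding.assign(initW))       # overwrite the random init
    # Look up the embedding rows for word IDs 2 and 4.
    print(sess.run(tf.nn.embedding_lookup(embedding, [2, 4])))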
Example #7
    def fill_input_quest(self, quest):
        start_id = myServer.batcher._vocab.WordToId(data.SENTENCE_START)
        end_id = myServer.batcher._vocab.WordToId(data.SENTENCE_END)
        pad_id = myServer.batcher._vocab.WordToId(data.PAD_TOKEN)
        quest = ' '.join(self.get_words(quest))
        article_sentences = quest.strip()
        abstract_sentences = article_sentences
        enc_inputs = []
        # Use the <s> as the <GO> symbol for decoder inputs.
        dec_inputs = [start_id]
        enc_inputs += data.GetWordIds(article_sentences,
                                      myServer.batcher._vocab)
        dec_inputs += data.GetWordIds(abstract_sentences,
                                      myServer.batcher._vocab)

        # Truncate if the input is too long.
        if len(enc_inputs) > myServer.batcher._hps.enc_timesteps:
            enc_inputs = enc_inputs[:myServer.batcher._hps.enc_timesteps]
        if len(dec_inputs) > myServer.batcher._hps.dec_timesteps:
            dec_inputs = dec_inputs[:myServer.batcher._hps.dec_timesteps]

        # targets is dec_inputs without <s> at beginning, plus </s> at end
        # <s> was already prepended above; append </s> here.
        targets = dec_inputs[1:]
        targets.append(end_id)
        enc_input_len = len(enc_inputs)
        dec_output_len = len(targets)

        # Pad to the fixed lengths: dec_inputs is [<s>, ...], targets is [..., </s>].
        while len(enc_inputs) < myServer.batcher._hps.enc_timesteps:
            enc_inputs.append(pad_id)
        while len(dec_inputs) < myServer.batcher._hps.dec_timesteps:
            dec_inputs.append(end_id)
        while len(targets) < myServer.batcher._hps.dec_timesteps:
            targets.append(end_id)

        # Pack the results into a ModelInput namedtuple.
        element = ModelInput(enc_inputs, dec_inputs, targets, enc_input_len,
                             dec_output_len, article_sentences,
                             abstract_sentences)
        return element
Example #8
    def _FillInputQueue(self):
        """逐行填充输入队列"""
        pad_id = self._vocab.WordToId(parameter_config.PAD_TOKEN)
        if self._hps.mode == 'train':
            input_gen = self._TextGenerator(
                data.ExampleGen(os.path.join(self._data_path, '*')))
        else:
            input_gen = self._TextGenerator(
                data.ExampleGen(os.path.join(self._data_path, '*'), 1))

        while True:
            try:
                (index_id, target, sentence) = input_gen.next()
            except (GeneratorExit, StopIteration):
                break

            enc_inputs = data.GetWordIds(sentence.strip(), self._vocab)
            target = int(target)

            # Filter out too-short input
            if (len(enc_inputs) < self._hps.min_input_len):
                # tf.logging.warning('Drop an example - too short.\nenc:%d\ndec:%d',
                #                   len(enc_inputs), len(dec_inputs))
                continue

            # If we're not truncating input, throw out too-long input
            if not self._truncate_input:
                if (len(enc_inputs) > self._hps.enc_timesteps):
                    # tf.logging.warning('Drop an example - too long.\nenc:%d\ndec:%d',
                    #                  len(enc_inputs), len(dec_inputs))
                    continue
            # If we are truncating input, do so if necessary
            else:
                if len(enc_inputs) > self._hps.enc_timesteps:
                    enc_inputs = enc_inputs[:self._hps.enc_timesteps]

            enc_input_len = len(enc_inputs)

            # Pad if necessary
            while len(enc_inputs) < self._hps.enc_timesteps:
                enc_inputs.append(pad_id)

            element = ModelInput(index_id, target, enc_inputs, enc_input_len)
            self._input_queue.put(element)
Example #9
def _extract_we_text(output_file, vocab_file, we_dic):
    vocab = data.Vocab(vocab_file, 1000000)
    vsize = vocab.NumIds()
    m = copy.deepcopy(vocab._word_to_id)
    unknown_ids = [vocab.WordToId(UNKNOWN_TOKEN)]
    output = codecs.open(output_file, "w", "utf-8")
    with open(we_dic, "rb") as f:
        for line in f:
            string = line.split(" ")
            word = string[0].strip()
            value = " ".join(x for x in string[1:])
            idx = data.GetWordIds(word, vocab)
            if idx != None and idx != unknown_ids and word in m:
                del m[word]
                output.write(word + ' ' + value)
    print "====:", m
    print "---:", len(m)

    output.close()

    # This step ensures that the words in the WE file and in the vocab file are the same.
    del m['<s>']
    del m['</s>']
    del m['<d>']
    del m['</d>']
    del m['<p>']
    del m['</p>']
    tt = m.keys()

    vocab_new = vocab_file + "_new"
    with open(vocab_file, 'r') as f:
        with open(vocab_new, 'w') as g:
            for line in f.readlines():
                if all(string not in line for string in tt):
                    g.write(line)
            if '<UNK>' in m:
                g.write('<UNK> 0\n')
            if '<PAD>' in m:
                g.write('<PAD> 0\n')
    shutil.move(vocab_new, vocab_file)
Example #10
 def _loadWord2VecGo(self, emb_dim):
     sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
     vsize = self._vocab.NumIds()
     with tf.variable_scope('goEmbedding'), tf.device('/gpu:0'):
         embedding = tf.get_variable(
             'embedding', [vsize, emb_dim],
             dtype=tf.float32,
             trainable=False,
             initializer=tf.truncated_normal_initializer(stddev=1e-4))
         sess.run(tf.initialize_all_variables())
     if FLAGS.word2vec:
         # initial matrix with random uniform
         initW = np.random.uniform(-0.25, 0.25, (vsize, emb_dim))
         # load any vectors from the word2vec
         print("Load word2vec file {}\n".format(FLAGS.word2vec))
         with open(FLAGS.word2vec, "rb") as f:
             header = f.readline()
             vocab_size, layer1_size = map(int, header.split())
             binary_len = np.dtype('float32').itemsize * layer1_size
             for line in xrange(vocab_size):
                 word = []
                 while True:
                     ch = f.read(1)
                     if ch == ' ':
                         word = ''.join(word)
                         break
                     if ch != '\n':
                         word.append(ch)
                 idx = data.GetWordIds(word, self._vocab)
                 if idx != None:
                     initW[idx] = np.fromstring(f.read(binary_len),
                                                dtype='float32')
                 else:
                     f.read(binary_len)
         print "embedding first loaded:", embedding
         print(sess.run(tf.nn.embedding_lookup(embedding, 2)))
         sess.run(embedding.assign(initW))
         print "function loaded:", embedding
         print(sess.run(tf.nn.embedding_lookup(embedding, 2)))
         return sess
Example #11
    def _loadWord2Vec(self, embedding, emb_dim):
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        vsize = self._vocab.NumIds()
        sess.run(tf.initialize_all_variables())
        if FLAGS.word2vec:
            # initial matrix with random uniform
            initW = np.random.uniform(-0.25, 0.25, (vsize, emb_dim))
            # load any vectors from the word2vec
            print("Load word2vec file {}\n".format(FLAGS.word2vec))
            f = codecs.open(FLAGS.word2vec, "r")
            for line in f:
                parts = line.split(" ")
                word = parts[0]
                value = " ".join(x for x in parts[1:])
                idx = data.GetWordIds(word, self._vocab)
                if idx != None:
                    initW[idx] = np.fromstring(value, dtype='float32', sep=' ')

            f.close()

            sess.run(embedding.assign(initW))
            sess.run(embedding)
Example #12
    def _add_seq2seq_old(self, sess):
        hps = self._hps
        vsize = self._vocab.NumIds()
        with tf.variable_scope('seq2seq'):
            encoder_inputs = tf.unpack(tf.transpose(self._articles))
            decoder_inputs = tf.unpack(tf.transpose(self._abstracts))
            targets = tf.unpack(tf.transpose(self._targets))
            loss_weights = tf.unpack(tf.transpose(self._loss_weights))
            article_lens = self._article_lens
            with tf.variable_scope('Embedding'), tf.device('/gpu:0'):
                #==============================================================================
                # Embedding shared by the input and outputs.
                #embedding = tf.get_variable(
                #      'embedding', [vsize, hps.emb_dim], dtype=tf.float32,
                #   trainable=False,
                #      initializer=tf.truncated_normal_initializer(stddev=1e-4))
                #sess.run(tf.initialize_all_variables())
                #==============================================================================
                vsize = self._vocab.NumIds()
                embedding = tf.get_variable(
                    'embedding', [vsize, hps.emb_dim],
                    dtype=tf.float32,
                    trainable=False,
                    initializer=tf.truncated_normal_initializer(stddev=1e-4))
                sess.run(tf.initialize_all_variables())
                if FLAGS.word2vec:
                    # initial matrix with random uniform
                    initW = np.random.uniform(-0.25, 0.25,
                                              (vsize, hps.emb_dim))
                    # load any vectors from the word2vec
                    print("Load word2vec file {}\n".format(FLAGS.word2vec))
                    with open(FLAGS.word2vec, "rb") as f:
                        header = f.readline()
                        vocab_size, layer1_size = map(int, header.split())
                        binary_len = np.dtype('float32').itemsize * layer1_size
                        for line in xrange(vocab_size):
                            word = []
                            while True:
                                ch = f.read(1)
                                if ch == ' ':
                                    word = ''.join(word)
                                    break
                                if ch != '\n':
                                    word.append(ch)
                            idx = data.GetWordIds(word, self._vocab)
                            if idx != None:
                                initW[idx] = np.fromstring(f.read(binary_len),
                                                           dtype='float32')
                            else:
                                f.read(binary_len)

                print "to test ... .. . . embedding first loaded:"
                print(sess.run(tf.nn.embedding_lookup(embedding, 2)))
                sess.run(embedding.assign(initW))
                print "to test ... .. .. . function loaded:"
                print(sess.run(tf.nn.embedding_lookup(embedding, 2)))
                #===============================================================================

                # Embedding shared by the input and outputs.
                emb_encoder_inputs = [
                    tf.nn.embedding_lookup(embedding, x)
                    for x in encoder_inputs
                ]
                emb_decoder_inputs = [
                    tf.nn.embedding_lookup(embedding, x)
                    for x in decoder_inputs
                ]

            #matrix factorization
        ## s,u,v=tf.svd(emb_encoder_inputs,compute_uv=True)
        ## eigenSum=tf.reduce_sum(s)
        ## eigen=0
        ## threshold=0
        ## i=0;
        ## for i in range(len(s)):
        ##   eigen=s(i)
        ##   if((eigen/eigenSum)>threshold)
        ##     break;
        #rebuild eigenvector with i length
        ## new_eigenMatrix = tf.Variable(tf.zeros([i,i]))
        ## for j in range(i):
        ##   new_eigenMatrix[j,j]=s(j)
        #decrease embedding dim  [vsize,64]
        ##emb_encoder_inputs=tf.batch_matmul(u[,:j],new_eigenMatrix)
        # new_embedding=u*s
        #or decrease word length [N,128]
        # new_embedding=v*s

            for layer_i in xrange(hps.enc_layers):
                with tf.variable_scope('encoder%d' % layer_i), tf.device(
                        self._next_device()):
                    #bidirectional rnn cell
                    cell_fw = tf.nn.rnn_cell.LSTMCell(
                        hps.num_hidden,
                        initializer=tf.random_uniform_initializer(-0.1,
                                                                  0.1,
                                                                  seed=123),
                        state_is_tuple=False)
                    cell_bw = tf.nn.rnn_cell.LSTMCell(
                        hps.num_hidden,
                        initializer=tf.random_uniform_initializer(-0.1,
                                                                  0.1,
                                                                  seed=113),
                        state_is_tuple=False)
                    cell_fw = tf.nn.rnn_cell.DropoutWrapper(
                        cell_fw,
                        input_keep_prob=hps.input_dropout,
                        output_keep_prob=hps.output_dropout)
                    cell_bw = tf.nn.rnn_cell.DropoutWrapper(
                        cell_bw,
                        input_keep_prob=hps.input_dropout,
                        output_keep_prob=hps.output_dropout)
                    (emb_encoder_inputs, fw_state,
                     _) = tf.nn.bidirectional_rnn(cell_fw,
                                                  cell_bw,
                                                  emb_encoder_inputs,
                                                  dtype=tf.float32,
                                                  sequence_length=article_lens)
            encoder_outputs = emb_encoder_inputs
            print "fw_state:", fw_state
            with tf.variable_scope('output_projection'):
                w = tf.get_variable(
                    'w', [hps.num_hidden, vsize],
                    dtype=tf.float32,
                    initializer=tf.truncated_normal_initializer(stddev=1e-4))
                w_t = tf.transpose(w)
                v = tf.get_variable(
                    'v', [vsize],
                    dtype=tf.float32,
                    initializer=tf.truncated_normal_initializer(stddev=1e-4))

            with tf.variable_scope('decoder'), tf.device(self._next_device()):
                # When decoding, use model output from the previous step
                # for the next step.
                loop_function = None
                if hps.mode == 'decode':
                    loop_function = _extract_argmax_and_embed(
                        embedding, (w, v), update_embedding=False)
                cell = tf.nn.rnn_cell.LSTMCell(
                    hps.num_hidden,
                    initializer=tf.random_uniform_initializer(-0.1,
                                                              0.1,
                                                              seed=113),
                    state_is_tuple=False)
                cell = tf.nn.rnn_cell.DropoutWrapper(
                    cell,
                    input_keep_prob=hps.input_dropout,
                    output_keep_prob=hps.output_dropout)
                encoder_outputs = [
                    tf.reshape(x, [hps.batch_size, 1, 2 * hps.num_hidden])
                    for x in encoder_outputs
                ]
                self._enc_top_states = tf.concat(1, encoder_outputs)
                self._dec_in_state = fw_state
                # During decoding, follow up _dec_in_state are fed from beam_search.
                # dec_out_state are stored by beam_search for next step feeding.
                initial_state_attention = (hps.mode == 'decode')
                decoder_outputs, self._dec_out_state = tf.nn.seq2seq.attention_decoder(
                    emb_decoder_inputs,
                    self._dec_in_state,
                    self._enc_top_states,
                    cell,
                    num_heads=1,
                    loop_function=loop_function,
                    initial_state_attention=initial_state_attention)

                print "====emb_decoder_inputs:", emb_decoder_inputs
                print "====self._dec_in_state:", self._dec_in_state
                print "====self._enc_top_states:", self._enc_top_states
                print "====decoder_outputs:", decoder_outputs
                print "====self._dec_out_state:", self._dec_out_state
            with tf.variable_scope('output'), tf.device(self._next_device()):
                model_outputs = []
                for i in xrange(len(decoder_outputs)):
                    if i > 0:
                        tf.get_variable_scope().reuse_variables()
                    model_outputs.append(
                        tf.nn.xw_plus_b(decoder_outputs[i], w, v))

            if hps.mode == 'decode':
                with tf.variable_scope('decode_output'), tf.device('/gpu:0'):
                    best_outputs = [tf.argmax(x, 1) for x in model_outputs]
                    tf.logging.info('best_outputs%s',
                                    best_outputs[0].get_shape())
                    self._outputs = tf.concat(1, [
                        tf.reshape(x, [hps.batch_size, 1])
                        for x in best_outputs
                    ])

                    self._topk_log_probs, self._topk_ids = tf.nn.top_k(
                        tf.log(tf.nn.softmax(model_outputs[-1])),
                        hps.batch_size * 2)

            with tf.variable_scope('loss'), tf.device(self._next_device()):

                def sampled_loss_func(inputs, labels):
                    with tf.device('/gpu:0'):  # Try gpu.
                        labels = tf.reshape(labels, [-1, 1])
                        tf.logging.info('num_sampled%s',
                                        hps.num_softmax_samples)
                        return tf.nn.sampled_softmax_loss(
                            w_t, v, inputs, labels, hps.num_softmax_samples,
                            vsize)

                if hps.num_softmax_samples != 0 and hps.mode == 'train':
                    self._loss = seq2seq_lib.sampled_sequence_loss(
                        decoder_outputs, targets, loss_weights,
                        sampled_loss_func)
                else:
                    self._loss = tf.nn.seq2seq.sequence_loss(
                        model_outputs, targets, loss_weights)
                tf.scalar_summary('loss', tf.minimum(12.0, self._loss))
Example #13
    def _FillInputQueue(self):
        """Fill input queue with ModelInput."""
        start_id = self._vocab.WordToId(data.SENTENCE_START)
        end_id = self._vocab.WordToId(data.SENTENCE_END)
        pad_id = self._vocab.WordToId(data.PAD_TOKEN)
        input_gen = self._TextGenerator(data.ExampleGen(self._data_path))
        while True:
            (article, abstract) = input_gen.next()
            article_sentences = [
                sent.strip()
                for sent in data.ToSentences(article, include_token=False)
            ]
            abstract_sentences = [
                sent.strip()
                for sent in data.ToSentences(abstract, include_token=False)
            ]

            enc_inputs = []
            # Use the <s> as the <GO> symbol for decoder inputs.
            dec_inputs = [start_id]

            # Convert first N sentences to word IDs, stripping existing <s> and </s>.
            for i in xrange(
                    min(self._max_article_sentences, len(article_sentences))):
                enc_inputs += data.GetWordIds(article_sentences[i],
                                              self._vocab)
            for i in xrange(
                    min(self._max_abstract_sentences,
                        len(abstract_sentences))):
                dec_inputs += data.GetWordIds(abstract_sentences[i],
                                              self._vocab)

            # Filter out too-short input
            if (len(enc_inputs) < self._hps.min_input_len
                    or len(dec_inputs) < self._hps.min_input_len):
                tf.logging.warning(
                    'Drop an example - too short.\nenc:%d\ndec:%d',
                    len(enc_inputs), len(dec_inputs))
                continue

            # If we're not truncating input, throw out too-long input
            if not self._truncate_input:
                if (len(enc_inputs) > self._hps.enc_timesteps
                        or len(dec_inputs) > self._hps.dec_timesteps):
                    tf.logging.warning(
                        'Drop an example - too long.\nenc:%d\ndec:%d',
                        len(enc_inputs), len(dec_inputs))
                    continue
            # If we are truncating input, do so if necessary
            else:
                if len(enc_inputs) > self._hps.enc_timesteps:
                    enc_inputs = enc_inputs[:self._hps.enc_timesteps]
                if len(dec_inputs) > self._hps.dec_timesteps:
                    dec_inputs = dec_inputs[:self._hps.dec_timesteps]

            # targets is dec_inputs without <s> at beginning, plus </s> at end
            targets = dec_inputs[1:]
            targets.append(end_id)

            # Now len(enc_inputs) should be <= enc_timesteps, and
            # len(targets) = len(dec_inputs) should be <= dec_timesteps

            enc_input_len = len(enc_inputs)
            dec_output_len = len(targets)

            # Pad if necessary
            while len(enc_inputs) < self._hps.enc_timesteps:
                enc_inputs.append(pad_id)
            while len(dec_inputs) < self._hps.dec_timesteps:
                dec_inputs.append(end_id)
            while len(targets) < self._hps.dec_timesteps:
                targets.append(end_id)

            element = ModelInput(enc_inputs, dec_inputs, targets,
                                 enc_input_len, dec_output_len,
                                 ' '.join(article_sentences),
                                 ' '.join(abstract_sentences))
            self._input_queue.put(element)
Example #14
    def _FillInputQueue(self):
        """Fills input queue with ModelInput."""

        # input gets padded
        pad_id = self._input_vocab.WordToId(data.PAD_TOKEN)
        # output get start id and padded with end ids
        end_id = self._output_vocab.WordToId(data.SENTENCE_END)

        input_gen = self._TextGenerator(data.ExampleGen(self._data_path))
        while True:
            (source, targets) = next(input_gen)
            # target = choice(targets)
            target = targets[0]

            # Convert sentences to word IDs, stripping existing <s> and </s>.
            enc_inputs = data.GetWordIds(source, self._input_vocab)
            dec_inputs_gen = data.GetWordIds(target, self._output_vocab)
            dec_inputs_cop = data.GetWordIndices(target,
                                                 source,
                                                 self._input_vocab,
                                                 position_based_indexing=True)

            # Filter out too-short input
            if len(enc_inputs) < self._config.min_input_len:
                tf.logging.warning(
                    'Drop an example - input too short: %d (min: %d)',
                    len(enc_inputs), self._config.min_input_len)
                continue

            if len(dec_inputs_gen) < self._config.min_input_len:
                tf.logging.warning(
                    'Drop an example - output too short: %d (min: %d)',
                    len(dec_inputs_gen), self._config.min_input_len)
                continue

            # If we're not truncating input, throw out too-long input
            if not self._truncate_input:
                if len(enc_inputs) > self._config.max_input_len:
                    tf.logging.warning(
                        'Drop an example - input too long: %d (max: %d)',
                        len(enc_inputs), self._config.max_input_len)
                    continue
                if len(dec_inputs_gen) > self._config.max_output_len:
                    tf.logging.warning(
                        'Drop an example - output too long: %d (max: %d)',
                        len(dec_inputs_gen), self._config.max_output_len)
                    continue
            # If we are truncating input, do so if necessary
            else:
                if len(enc_inputs) > self._config.max_input_len:
                    enc_inputs = enc_inputs[:self._config.max_input_len]
                    dec_inputs_cop = [
                        pos if pos <= self._config.max_input_len else 0
                        for pos in dec_inputs_cop
                    ]
                if len(dec_inputs_gen) > self._config.max_output_len:
                    dec_inputs_gen = dec_inputs_gen[:self._config.
                                                    max_output_len]
                    dec_inputs_cop = dec_inputs_cop[:self._config.
                                                    max_output_len]

            # dec_targets_gen is dec_inputs without <s> at beginning, plus </s> at end
            dec_targets_gen = dec_inputs_gen[1:]
            dec_targets_gen.append(end_id)

            # dec_targets_gen is dec_inputs without <s> at beginning, plus </s> at end
            dec_targets_cop = dec_inputs_cop[1:]
            end_position = len(enc_inputs)
            dec_targets_cop.append(end_position)

            enc_input_len = len(enc_inputs)
            dec_output_len = len(
                dec_targets_gen)  # is equal to len(dec_targets_cop)

            # Pad if necessary
            while len(enc_inputs) < self._config.max_input_len:
                enc_inputs.append(pad_id)
            while len(dec_inputs_gen) < self._config.max_output_len:
                dec_inputs_gen.append(end_id)
            while len(dec_targets_gen) < self._config.max_output_len:
                dec_targets_gen.append(end_id)
            while len(dec_targets_cop) < self._config.max_output_len:
                dec_targets_cop.append(end_position)

            element = ModelInput(enc_inputs, dec_inputs_gen, dec_targets_gen,
                                 dec_targets_cop, enc_input_len,
                                 dec_output_len, source, targets)
            self._input_queue.put(element)
Example #15
    def _FillInputQueue(self):
        """Fill input queue with ModelInput.
    SENTENCE_START = '<s>'
    SENTENCE_END = '</s>'
    UNKNOWN_TOKEN = '<UNK>'
    PAD_TOKEN = '<PAD>'
    """
        start_id = self._vocab.WordToId(data.SENTENCE_START)
        end_id = self._vocab.WordToId(data.SENTENCE_END)
        pad_id = self._vocab.WordToId(data.PAD_TOKEN)
        input_gen = self._TextGenerator(data.ExampleGen(self._data_path))
        while True:
            (article, abstract) = six.next(input_gen)
            # Split into sentences; each starts with <s> and ends with </s>. With include_token=False the surrounding <s> and </s> are stripped.
            article_sentences = [
                sent.strip()
                for sent in data.ToSentences(article, include_token=False)
            ]
            abstract_sentences = [
                sent.strip()
                for sent in data.ToSentences(abstract, include_token=False)
            ]

            enc_inputs = []
            # Use the <s> as the <GO> symbol for decoder inputs.
            # Prepend <s> to the decoder inputs.
            dec_inputs = [start_id]

            # Convert first N sentences to word IDs, stripping existing <s> and </s>.
            for i in xrange(
                    min(self._max_article_sentences, len(article_sentences))):
                # Convert one sentence into a list of word IDs.
                enc_inputs += data.GetWordIds(article_sentences[i],
                                              self._vocab)
            for i in xrange(
                    min(self._max_abstract_sentences,
                        len(abstract_sentences))):
                dec_inputs += data.GetWordIds(abstract_sentences[i],
                                              self._vocab)

            # Filter out too-short input
            # Input too short.
            if (len(enc_inputs) < self._hps.min_input_len
                    or len(dec_inputs) < self._hps.min_input_len):
                tf.logging.warning(
                    'Drop an example - too short.\nenc:%d\ndec:%d',
                    len(enc_inputs), len(dec_inputs))
                continue

            # Input too long: drop it unless we are truncating.
            if not self._truncate_input:
                if (len(enc_inputs) > self._hps.enc_timesteps
                        or len(dec_inputs) > self._hps.dec_timesteps):
                    tf.logging.warning(
                        'Drop an example - too long.\nenc:%d\ndec:%d',
                        len(enc_inputs), len(dec_inputs))
                    continue
            # If we are truncating input, do so if necessary
            else:
                if len(enc_inputs) > self._hps.enc_timesteps:
                    enc_inputs = enc_inputs[:self._hps.enc_timesteps]
                if len(dec_inputs) > self._hps.dec_timesteps:
                    dec_inputs = dec_inputs[:self._hps.dec_timesteps]

            # targets is dec_inputs without <s> at beginning, plus </s> at end
            # The decoder input dec_inputs starts with <s>; the targets end with </s>.
            targets = dec_inputs[1:]
            targets.append(end_id)

            # Now len(enc_inputs) should be <= enc_timesteps, and
            # len(targets) = len(dec_inputs) should be <= dec_timesteps

            enc_input_len = len(enc_inputs)
            dec_output_len = len(targets)

            # Pad to the fixed lengths: dec_inputs is [<s>, ...], targets is [..., </s>].
            while len(enc_inputs) < self._hps.enc_timesteps:
                enc_inputs.append(pad_id)  # <PAD>; enc_inputs contains no <s> or </s>
            while len(dec_inputs) < self._hps.dec_timesteps:
                dec_inputs.append(end_id)
            while len(targets) < self._hps.dec_timesteps:
                targets.append(end_id)
            # Put the namedtuple into the queue.
            # Args: enc_inputs is the encoder input, dec_inputs the decoder
            # input, and targets the decoder output target.
            element = ModelInput(enc_inputs, dec_inputs, targets,
                                 enc_input_len, dec_output_len,
                                 ' '.join(article_sentences),
                                 ' '.join(abstract_sentences))
            self._input_queue.put(element)
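All of the batcher examples above pack their results into a ModelInput namedtuple whose definition is not shown on this page. A hypothetical sketch, with field names inferred from the call sites above rather than taken from the original source:

import collections

# Field names are inferred from the calls above (assumption).
ModelInput = collections.namedtuple(
    'ModelInput',
    ['enc_inputs', 'dec_inputs', 'targets', 'enc_input_len',
     'dec_output_len', 'article', 'abstract'])

element = ModelInput([4, 5, 0], [1, 7], [7, 2], 2, 2, 'an article', 'a summary')
print(element.enc_input_len)   # 2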