Python Vocab 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: pomelo.corpora.Vocab

클래스/타입: Vocab

hotexamples.com에서의 예제들: 2

Python Vocab - 2개의 예제가 발견되었습니다. 이것들은 오픈소스 프로젝트에서 추출된 Python의 pomelo.corpora.Vocab.Vocab에 대한 실세계 최고 등급의 예제들입니다. 예제들을 평가하여 예제의 품질 향상에 도움을 줄 수 있습니다.

자주 사용되는 메소드들

보기 숨기기

construct(1)

encode(1)

예제 #1

파일 보기

파일: RNNLM.py 프로젝트: yysherlock/pomelo

    def load_data(self, data_dir, debug=False):
        """Loads starter word-vectors and train/dev/test data. """
        train_fp = data_dir+"{}.txt".format('train')
        valid_fp = data_dir+"{}.txt".format('valid')
        test_fp = data_dir+"{}.txt".format('test')

        self.vocab = Vocab()
        self.vocab.construct(get_dataset(train_fp))
        self.encoded_train = np.array(
            [self.vocab.encode(word) for word in get_dataset(train_fp)],
            dtype=np.int32)
        self.encoded_valid = np.array(
            [self.vocab.encode(word) for word in get_dataset(valid_fp)],
            dtype=np.int32)
        self.encoded_test = np.array(
            [self.vocab.encode(word) for word in get_dataset(test_fp)],
            dtype=np.int32)
        if debug:
            num_debug = 1024*3
            self.encoded_train = self.encoded_train[:num_debug]
            self.encoded_valid = self.encoded_valid[:num_debug]
            self.encoded_test = self.encoded_test[:num_debug]

예제 #2

파일 보기

파일: RNNLM.py 프로젝트: yysherlock/pomelo

class RNNLM_Model(LanguageModel):
    def load_data(self, data_dir, debug=False):
        """Loads starter word-vectors and train/dev/test data. """
        train_fp = data_dir+"{}.txt".format('train')
        valid_fp = data_dir+"{}.txt".format('valid')
        test_fp = data_dir+"{}.txt".format('test')

        self.vocab = Vocab()
        self.vocab.construct(get_dataset(train_fp))
        self.encoded_train = np.array(
            [self.vocab.encode(word) for word in get_dataset(train_fp)],
            dtype=np.int32)
        self.encoded_valid = np.array(
            [self.vocab.encode(word) for word in get_dataset(valid_fp)],
            dtype=np.int32)
        self.encoded_test = np.array(
            [self.vocab.encode(word) for word in get_dataset(test_fp)],
            dtype=np.int32)
        if debug:
            num_debug = 1024*3
            self.encoded_train = self.encoded_train[:num_debug]
            self.encoded_valid = self.encoded_valid[:num_debug]
            self.encoded_test = self.encoded_test[:num_debug]

    def add_placeholders(self):
      """Generate placeholder variables to represent the input tensors

      These placeholders are used as inputs by the rest of the model building
      code and will be fed data during training.  Note that when "None" is in a
      placeholder's shape, it's flexible

      """
      self.input_placeholder = tf.placeholder(tf.int32, shape=[None, self.config.num_steps], name='Input')
      self.labels_placeholder = tf.placeholder(tf.int32, shape=[None, self.config.num_steps], name='Target')
      self.dropout_placeholder = tf.placeholder(tf.float32, name='Dropout')

    def add_embedding(self):
      """Add embedding layer.
    Returns:
        inputs: List of length num_steps, each of whose elements should be
                a tensor of shape (batch_size, embed_size).
      """
      # The embedding lookup is currently only implemented for the CPU
      with tf.device('/cpu:0'):
        embedding = tf.get_variable('Embedding', [len(self.vocab), self.config.embed_size])
        inputs = tf.nn.embedding_lookup(embedding, self.input_placeholder) # (data_size, num_steps, embed_size)
        inputs = [tf.squeeze(x,[1]) for x in tf.split(1, self.config.num_steps, inputs)] # Each element is (data_size, embed_size).
        return inputs

    def add_projection(self, rnn_outputs):
      """Adds a projection layer.
      Args:
        rnn_outputs: List of length num_steps, each of whose elements should be
                     a tensor of shape (batch_size, hidden_size).
      Returns:
        outputs: List of length num_steps, each a tensor of shape
                 (batch_size, len(vocab))
      """
      with tf.variable_scope('Softmax') as scope:
          U = tf.get_variable('U', [self.config.hidden_size, len(self.vocab)])
          b_2 = tf.get_variable('b_2', [len(self.vocab)])
          outputs = [tf.matmul(rnn_output, U) + b_2 for rnn_output in rnn_outputs] # Each  rnn_output is a hidden layer states
      return outputs

    def add_loss_op(self, output):
      """Adds loss ops to the computational graph.
      Hint: Use tensorflow.python.ops.seq2seq.sequence_loss to implement sequence loss.
      Args:
        output: A tensor of shape (None, self.vocab)
      Returns:
        loss: A 0-d tensor (scalar)
      """
      all_ones_weights = [tf.ones([self.config.batch_size * self.config.num_steps])]
      # output is logits
      loss = sequence_loss([output], \
          [tf.reshape(self.labels_placeholder, [-1])],\
          all_ones_weights) # , len(self.vocab)
      return loss

    def add_training_op(self, loss):
      """Sets up the training Ops.
      Args:
        loss: Loss tensor, from cross_entropy_loss.
      Returns:
        train_op: The Op for training.
      """
      tf.scalar_summary("cost", loss)
      opt = tf.train.AdamOptimizer(learning_rate=self.config.lr)
      global_step = tf.Variable(0, name='global_step', trainable=False)
      train_op = opt.minimize(loss,global_step=global_step)
      return train_op

    def __init__(self, config):
        self.config = config
        data_dir = config.data_dir
        self.load_data(data_dir,debug=False)
        self.add_placeholders()
        self.inputs = self.add_embedding()
        self.rnn_outputs = self.add_model(self.inputs)
        self.outputs = self.add_projection(self.rnn_outputs)

        self.predictions = [tf.nn.softmax(tf.cast(o, 'float64')) for o in self.outputs]
        output = tf.reshape(tf.concat(1, self.outputs), [-1,len(self.vocab)])
        self.calculate_loss = self.add_loss_op(output)
        self.train_step = self.add_training_op(self.calculate_loss)


    def add_model(self, inputs):
        """Creates the RNN LM model.

        Args:
          inputs: List of length num_steps, each of whose elements should be
                  a tensor of shape (batch_size, embed_size).
        Returns:
          outputs: List of length num_steps, each of whose elements should be
                   a tensor of shape (batch_size, hidden_size)
        """
        with tf.variable_scope('RNN') as scope:
            self.initial_state = tf.zeros([self.config.batch_size, self.config.hidden_size])
            hidden_state = self.initial_state
            rnn_outputs = []
            for tstep,rnn_input in enumerate(inputs):
                if tstep > 0: scope.reuse_variables()
                H = tf.get_variable('H', [self.config.hidden_size, self.config.hidden_size]) # Wh
                I = tf.get_variable('I', [self.config.embed_size, self.config.hidden_size]) # Wx
                b1 = tf.get_variable('b1', [self.config.hidden_size])
                rnn_input = tf.nn.dropout(rnn_input, self.dropout_placeholder)
                hidden_state = tf.nn.tanh( tf.matmul( rnn_input, I) + b1) + tf.nn.tanh(tf.matmul(hidden_state, H))
                output = tf.nn.dropout(hidden_state, self.dropout_placeholder)
                rnn_outputs.append(output)

        self.final_state = rnn_outputs[-1]

        return rnn_outputs


    def run_epoch(self, session, data, train_op=None, verbose=10):
      config = self.config
      dp = config.dropout
      if not train_op:
        train_op = tf.no_op()
        dp = 1
######################
      """
      batch_size = config.batch_size
      print('batch_size:', batch_size)
      data_len = len(data)
      batch_len = data_len // batch_size
      print('data_len:',data_len)
      print('batch_len:',batch_len)
      epoch_size = (batch_len - 1) // config.num_steps
      print('epoch_size:', epoch_size)
      """
#######################
      total_steps = sum(1 for x in rnn_data_iterator(data, config.batch_size, config.num_steps))

      total_loss = []
      state = self.initial_state.eval()
      for step, (x, y) in enumerate(
        rnn_data_iterator(data, config.batch_size, config.num_steps)):
        # We need to pass in the initial state and retrieve the final state to give
        # the RNN proper history
        feed = {self.input_placeholder: x,
                self.labels_placeholder: y,
                self.initial_state: state,
                self.dropout_placeholder: dp}
        loss, state, _ = session.run(
            [self.calculate_loss, self.final_state, train_op], feed_dict=feed)
        total_loss.append(loss)
        if verbose and step % verbose == 0:
            sys.stdout.write('\r{} / {} : pp = {}'.format(
                step, total_steps, np.exp(np.mean(total_loss))))
            sys.stdout.flush()
      if verbose:
        sys.stdout.write('\r')
      return np.exp(np.mean(total_loss))