Example #1
    def forward(self,
                lemma_indices,
                tag_indices,
                inflected_form_indices=None,
                a_ls_true=None,
                p_gens_true=None):
        """

        Args:
            lemma_indices: list of lists containing lemma indices
            tag_indices: list of lists containing tag indices
            inflected_form_indices: list of lists containing inflected form indices (for teacher forcing)
            a_ls_true: true alignments (for teacher forcing)
            p_gens_true: true p_gens (for teacher forcing)

        Returns:
            p_ws: log probabilities, of shape (bsz, max_decode_len, char_vocab_size)
            a_ls: attention over lemmas, of shape (bsz, max_decode_len, max_lemma_len)
            p_gens: p_gens, of shape (bsz, max_decode_len)
        """

        # (bsz, max_lemma_len, 2*hidden_size), (bsz, max_lemma_len), ((1, bsz, hidden_size), (1, bsz, hidden_size))
        h_l, mask_l, (h_l_n, c_l_n) = self.lemma_encoder(lemma_indices)

        # (bsz, max_tag_len, 2*hidden_size), (bsz, max_tag_len), ((1, bsz, hidden_size), (1, bsz, hidden_size))
        h_tg, mask_tg, (h_tg_n, c_tg_n) = self.tag_encoder(tag_indices)

        # (1, bsz, hidden_size) & (1, bsz, hidden_size) -> (1, bsz, hidden_size)
        s_0 = self.bridge_h(torch.cat([h_l_n, h_tg_n], dim=2))

        # (1, bsz, hidden_size) & (1, bsz, hidden_size) -> (1, bsz, hidden_size)
        c_0 = self.bridge_c(torch.cat([c_l_n, c_tg_n], dim=2))

        lemma_indices_padded = pad_lists(lemma_indices,
                                         self.vocab.padding_idx,
                                         dtype=torch.long,
                                         device=device)

        if inflected_form_indices is not None:
            inflected_form_indices = [
                [self.vocab.char_to_index(self.vocab.START_CHAR)] + seq_indices
                for seq_indices in inflected_form_indices
            ]
            inflected_form_indices = pad_lists(
                inflected_form_indices,
                self.vocab.padding_idx,
                dtype=torch.long,
                device=device)  # (bsz, max_tgt_len)
            decoder_input = self.lemma_encoder.embedder(
                inflected_form_indices)  # (bsz, max_tgt_len, embedding_size)
        else:
            decoder_input = None

        p_ws, a_ls, p_gens = self.decoder(lemma_indices_padded, h_l, h_tg,
                                          mask_l, mask_tg, (s_0, c_0),
                                          decoder_input, a_ls_true,
                                          p_gens_true)

        return p_ws, a_ls, p_gens
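
The PyTorch examples on this page (#1, #2, and #5 through #8) all call a `pad_lists` helper that is not shown. A minimal sketch of what it presumably does, inferred from the call sites (the exact implementation is an assumption):

import torch

def pad_lists(lists, pad_value, pad_len=None, dtype=torch.long, device=None):
    """Pad a list of variable-length index lists with `pad_value` and return a
    (bsz, pad_len) tensor; `pad_len` defaults to the longest list in the batch."""
    max_len = pad_len if pad_len is not None else max(len(x) for x in lists)
    padded = [x + [pad_value] * (max_len - len(x)) for x in lists]
    return torch.tensor(padded, dtype=dtype, device=device)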
Example #2
    def forward(self,
                lemma_indices,
                tag_indices,
                inflected_form_indices=None,
                a_ls_true=None,
                p_gens_true=None):
        """

        Args:
            lemma_indices: list of lists containing lemma indices
            tag_indices: list of lists containing tag indices
            inflected_form_indices: list of lists containing inflected form indices (for teacher forcing)
            a_ls_true: true alignments (for teacher forcing)
            p_gens_true: true p_gens (for teacher forcing)

        Returns:
            p_ws: log probabilities, of shape (bsz, max_decode_len, char_vocab_size)
            a_ls: attention over lemmas, of shape (bsz, max_decode_len, max_lemma_len)
            p_gens: p_gens, of shape (bsz, max_decode_len)
        """

        combined_indices = [x + y for x, y in zip(lemma_indices, tag_indices)]

        # (bsz, max_lemma_tag_len, 2*hidden_size), (bsz, max_lemma_tag_len), ((1, bsz, hidden_size), (1, bsz, hidden_size))
        h, mask, (h_n, c_n) = self.encoder(combined_indices)

        s_0 = h_n  # (1, bsz, hidden_size)
        c_0 = c_n  # (1, bsz, hidden_size)

        input_indices_padded = pad_lists(combined_indices,
                                         self.vocab.padding_idx,
                                         dtype=torch.long,
                                         device=device)

        if inflected_form_indices is not None:
            inflected_form_indices = [
                [self.vocab.char_to_index(self.vocab.START_CHAR)] + seq_indices
                for seq_indices in inflected_form_indices
            ]
            inflected_form_indices = pad_lists(
                inflected_form_indices,
                self.vocab.padding_idx,
                dtype=torch.long,
                device=device)  # (bsz, max_tgt_len)
            decoder_input = self.encoder.embedder(
                inflected_form_indices)  # (bsz, max_tgt_len, embedding_size)
        else:
            decoder_input = None

        p_ws, a_ls, p_gens = self.decoder(input_indices_padded, h, mask,
                                          (s_0, c_0), decoder_input, a_ls_true,
                                          p_gens_true)

        return p_ws, a_ls, p_gens
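Example #3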
def Evaluate(sess, model, dev_data, transition_params_trained, parameters):
  epoch_num = 0
  while True:
    sent_list = []
    sentences = []
    tags = []
    sentence_lengths = []
    while len(sentences) < parameters['batch_size']:
      sent, epoch_num = advance_sent(epoch_num, dev_data)
      sent_list.append(sent)
      sentences.append(sent.word_ids)
      tags.append(sent.tag_ids)
      sentence_lengths.append(sent.get_sent_len())
    feed_dict = {
      model.input_token_indices: utils.pad_lists(sentences),
      model.input_sent_lengths: sentence_lengths,
      model.input_label_indices: utils.pad_lists(tags),
      model.dropout_keep_prob: 1.0  # disable dropout during evaluation
    }
    unary_scores, predictions = sess.run([model.unary_scores, model.predictions], feed_dict)
    for index in range(parameters["batch_size"]):
      if parameters['use_crf']:
        outputs, _ = tf.contrib.crf.viterbi_decode(unary_scores[index], transition_params_trained)
        sent_list[index].set_sent_tags(outputs[1:-1])
      else:
        outputs = predictions[index]
        sent_list[index].set_sent_tags(outputs)
    if epoch_num >= 1:
      break
  num_tokens = 0
  num_correct = 0
  dev_data.reset_index()
  while dev_data.has_next_sent():
    sent = dev_data.get_next_sent()
    gold_tags = sent.tag_ids
    output_tags = sent.get_tag_output()
    assert len(gold_tags) == len(output_tags)
    for idx, tag in enumerate(gold_tags):
      num_tokens += 1
      if gold_tags[idx] == output_tags[idx]:
        num_correct += 1
  dev_data.reset_index()
  logging.info(num_correct)
  logging.info(num_tokens)
  logging.info('token number is %d, accuracy is %.2f%%', num_tokens, (100.0*num_correct/num_tokens))
  return 100.0 * num_correct / num_tokens
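
The TensorFlow snippets call `utils.pad_lists` with a single argument, so that helper is presumably a plain list-padding routine rather than the tensor-building one sketched after Example #1; a minimal sketch under that assumption:

def pad_lists(lists, pad_value=0):
    """Pad each list in a batch with `pad_value` up to the length of the longest one."""
    max_len = max(len(x) for x in lists)
    return [list(x) + [pad_value] * (max_len - len(x)) for x in lists]

Example #4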
def Evaluate(sess, model, data, transition_params_trained, parameters):
    total_token_num = 0
    correct_token_num = 0
    start = time.time()
    while data.has_next_sent('dev'):
        sent = data.get_next_sent('dev')
        feed_dict = {
            model.input_token_indices: sent.word_ids,
            model.input_label_indices: sent.ner_ids,
            model.input_pos_indices: sent.pos_ids,
            model.input_token_character_indices:
            utils.pad_lists(sent.char_lists),
            model.input_token_lengths: sent.word_lengths,
            model.dropout_keep_prob: 1.0  # disable dropout during evaluation
        }
        unary_scores, predictions = sess.run(
            [model.unary_scores, model.predictions], feed_dict)
        if parameters['use_crf']:
            predictions, _ = tf.contrib.crf.viterbi_decode(
                unary_scores, transition_params_trained)
            predictions = predictions[1:-1]
        else:
            predictions = predictions.tolist()
        gold_labels = sent.ner_ids
        total_token_num += len(predictions)
        for idx, p in enumerate(predictions):
            if p == gold_labels[idx]:
                correct_token_num += 1
    data.reset_index('dev')
    return 100.0 * correct_token_num / total_token_num
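
Both Evaluate variants above decode with `tf.contrib.crf.viterbi_decode`, which runs outside the TensorFlow graph on plain NumPy arrays and returns the best-scoring tag sequence together with its score (TF 1.x API). A toy call with made-up scores; the `[1:-1]` slicing in the examples presumably strips special start/end positions that the model adds around each sentence:

import numpy as np
import tensorflow as tf

unary = np.random.rand(6, 5)   # (num_steps, num_tags) per-token scores
trans = np.random.rand(5, 5)   # (num_tags, num_tags) transition scores
best_sequence, best_score = tf.contrib.crf.viterbi_decode(unary, trans)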
Example #5
    def forward(self, a_ls, a_ls_true):
        """Loss on the predicted lemma attention `a_ls` against the true alignments `a_ls_true`."""
        max_decoder_len = a_ls.shape[1]
        max_lemma_len = a_ls.shape[2]
        target = pad_lists(a_ls_true,
                           -1,
                           pad_len=max_decoder_len,
                           dtype=torch.long,
                           device=device)
        loss = self.criterion(
            torch.log(a_ls + 1e-6).view(-1, max_lemma_len), target.view(-1))
        return loss
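
The targets here are padded with -1, which only works if `self.criterion` skips that index. A plausible setup (an assumption, since the constructor is not shown on this page) is an NLLLoss with `ignore_index=-1`, matching the log-probability inputs used here and in Examples #7 and #8:

import torch.nn as nn

criterion = nn.NLLLoss(ignore_index=-1)  # expects log probabilities; positions with target -1 contribute no loss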
Example #6
    def forward(self, indices):
        """

        Args:
            indices: list containing sequences of indices, of length bsz

        Returns:
            h: hidden state at each time step, of shape (bsz, max_src_len, 2*hidden_size)
            mask: True where the input index is the padding index, of shape (bsz, max_src_len)
            (h_n, c_n): final hidden state, a tuple ((1, bsz, hidden_size), (1, bsz, hidden_size))
        """

        # Inspired from here, https://discuss.pytorch.org/t/rnns-sorting-operations-autograd-safe/1461
        # See also, https://discuss.pytorch.org/t/solved-multiple-packedsequence-input-ordering/2106

        lengths = torch.tensor([len(x) for x in indices],
                               dtype=torch.long,
                               device=device)
        indices_padded = pad_lists(indices,
                                   self.vocab.padding_idx,
                                   dtype=torch.long,
                                   device=device)
        lengths_sorted, sorted_idx = lengths.sort(descending=True)
        indices_sorted = indices_padded[sorted_idx]
        embeddings_padded = self.embedder(indices_sorted)
        embeddings_padded = self.dropout_input(embeddings_padded)
        embeddings_packed = pack_padded_sequence(embeddings_padded,
                                                 lengths_sorted.tolist(),
                                                 batch_first=True)
        h, (h_n, c_n) = self.lstm(embeddings_packed)
        h, _ = pad_packed_sequence(h,
                                   batch_first=True,
                                   padding_value=self.vocab.padding_idx)
        h = torch.zeros_like(h).scatter_(
            0,
            sorted_idx.unsqueeze(1).unsqueeze(1).expand(
                -1, h.shape[1], h.shape[2]), h)  # Revert sorting
        h_n = torch.zeros_like(h_n).scatter_(
            1,
            sorted_idx.unsqueeze(0).unsqueeze(2).expand(
                h_n.shape[0], -1, h_n.shape[2]), h_n)  # Revert sorting
        c_n = torch.zeros_like(c_n).scatter_(
            1,
            sorted_idx.unsqueeze(0).unsqueeze(2).expand(
                c_n.shape[0], -1, c_n.shape[2]), c_n)  # Revert sorting
        h = self.dropout_output(h)
        h_n = (h_n[0, :, :] + h_n[1, :, :]).unsqueeze(
            0)  # (1, bsz, hidden_size)
        c_n = (c_n[0, :, :] + c_n[1, :, :]).unsqueeze(
            0)  # (1, bsz, hidden_size)
        mask = indices_padded == self.vocab.padding_idx  # (bsz, max_src_len)

        return h, mask, (h_n, c_n)
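
For readers unfamiliar with the sort, pack, pad, and unsort dance above, here is a small, self-contained illustration of the same pattern on dummy data; it uses an inverse permutation for the unsorting step instead of the `scatter_` calls in the example (an alternative for illustration, not the original code):

import torch
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

lstm = torch.nn.LSTM(input_size=4, hidden_size=3, batch_first=True, bidirectional=True)
seqs = [torch.randn(5, 4), torch.randn(2, 4), torch.randn(3, 4)]  # variable-length sequences
lengths = torch.tensor([len(s) for s in seqs])
padded = pad_sequence(seqs, batch_first=True)                     # (3, 5, 4)

lengths_sorted, sorted_idx = lengths.sort(descending=True)
packed = pack_padded_sequence(padded[sorted_idx], lengths_sorted.tolist(), batch_first=True)
out, (h_n, c_n) = lstm(packed)
out, _ = pad_packed_sequence(out, batch_first=True)               # (3, 5, 2*3)

inverse_idx = sorted_idx.argsort()    # undoes the length-based sort
out = out[inverse_idx]                # rows back in the original batch order
h_n = h_n[:, inverse_idx, :]          # (num_directions, bsz, hidden_size)
c_n = c_n[:, inverse_idx, :]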
Example #7
    def forward(self, p_gens, p_gens_true):
        """Loss on the predicted generation probabilities `p_gens` against the true `p_gens_true`."""
        bsz = p_gens.shape[0]
        max_decoder_len = p_gens.shape[1]
        score = torch.zeros(bsz, max_decoder_len, 2, device=device)
        score[:, :, 0] = 1 - p_gens
        score[:, :, 1] = p_gens
        target = pad_lists(p_gens_true,
                           -1,
                           pad_len=max_decoder_len,
                           dtype=torch.long,
                           device=device)
        loss = self.criterion(
            torch.log(score + 1e-6).view(-1, 2), target.view(-1))
        return loss
Example #8
    def forward(self, p_ws, inflected_forms_indices):
        """Loss on the predicted character log-probabilities `p_ws` against the gold inflected forms."""
        max_decoder_len = p_ws.shape[1]
        tgt_classes = p_ws.shape[2]
        inflected_forms_indices = [
            seq_indices + [self.vocab.char_to_index(self.vocab.STOP_CHAR)]
            for seq_indices in inflected_forms_indices
        ]
        p_ws_target = pad_lists(inflected_forms_indices,
                                -1,
                                pad_len=max_decoder_len,
                                dtype=torch.long,
                                device=device)
        loss = self.criterion(p_ws.view(-1, tgt_classes), p_ws_target.view(-1))
        return loss
Example #9
    def prune(self, interval, keep_end=False):
        """
        Keep only the times (and their associated values) that are at least `interval` distance apart.

        Parameters
        ----------
        interval: numeric, required
            the minimum distance between times to be preserved

        keep_end : bool, optional
            keep the last time and value of the timeseries, even if it is less than `interval` distance from the prior time
        """
        self._times, self._values = pad_lists(interval,
                                              self._times,
                                              self._values,
                                              keep_end=keep_end)
Example #10
    def _new_slice(self, times, values, key):
        """
        slicing functionality for timeseries
        """
        try:
            start, stop, step = key.start, key.stop, key.step
            if all(x is None for x in [start, stop, step]):
                # [:] slice, return everything
                return times, values
        except AttributeError:
            start, stop, step = key, False, None

        if start is not None and start < times[
                0] and self.first_val is not False:
            # add default beginning value to front of list
            times = [start] + times
            values = [self.first_val] + values

        start_idx = index_of(start, times, begin=True)
        if stop is False:
            # slice only wants one value
            if self.interpolate:
                return start, self._interpolate(start, times, values)
            return start, values[start_idx]

        times, values = times[start_idx:], values[start_idx:]
        slice_times, slice_values = [x for x in times], [x for x in values]
        if start > slice_times[0]:
            # reset first time in slice_times
            slice_times[0] = start

        if step:
            slice_times, slice_values = pad_lists(step,
                                                  slice_times,
                                                  slice_values,
                                                  keep_dist=True)

        stop_idx = index_of(stop, slice_times)
        if not stop or stop > slice_times[stop_idx]:
            # hack to include the last value if stop is past the end of list
            stop_idx += 1

        if self.interpolate:
            return slice_times[:stop_idx], self._interpolate(
                slice_times[:stop_idx], times, values)

        return slice_times[:stop_idx], slice_values[:stop_idx]
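Example #11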
def Evaluate(sess, model, dataset, transition_params_trained, parameters,
             epoch_num):

    start = time.time()
    accs = []
    correct_preds, total_correct, total_preds = 0., 0., 0.
    word_count = 0
    while dataset.has_next_sent('test'):
        sent = dataset.get_next_sent('test')
        feed_dict = {
            model.input_token_indices: sent.word_ids,
            model.input_token_character_indices:
            utils.pad_lists(sent.char_lists),
            model.input_token_lengths: sent.word_lengths,
            model.dropout_keep_prob: 1
        }
        unary_scores, predictions = sess.run(
            [model.unary_scores, model.predictions], feed_dict)
        if parameters['use_crf']:
            predictions, _ = tf.contrib.crf.viterbi_decode(
                unary_scores, transition_params_trained)
            predictions = predictions[1:-1]
        gold_labels = sent.ner_ids
        words = sent.word_ids
        word_count += len(words)
        accs += [a == b for (a, b) in zip(gold_labels, predictions)]
        lab_chunks = set(utils.get_chunks(gold_labels, dataset.ner_map))
        lab_pred_chunks = set(utils.get_chunks(predictions, dataset.ner_map))
        #logging.info(sent.ner_ids)
        #logging.info(predictions)
        correct_preds += len(lab_chunks & lab_pred_chunks)
        total_preds += len(lab_pred_chunks)
        total_correct += len(lab_chunks)

    p = correct_preds / total_preds if correct_preds > 0 else 0
    r = correct_preds / total_correct if correct_preds > 0 else 0
    f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0

    test_time = time.time() - start
    dataset.reset_index('test')
    logging.info("epoch: %d, f1 score: %.2f", epoch_num, f1 * 100.0)

    return test_time
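
`utils.get_chunks` is not shown on this page; a common implementation (an assumption) converts a sequence of label ids plus the id-to-tag map into `(type, start, end)` spans, so precision, recall, and F1 are computed over entities rather than tokens:

def get_chunks(label_ids, id_to_tag, outside='O'):
    """Collect (chunk_type, start, end) spans from BIO-style label ids."""
    chunks, chunk_type, chunk_start = [], None, None
    for i, lid in enumerate(label_ids):
        tag = id_to_tag[lid]                      # e.g. 'B-PER', 'I-PER', 'O'
        if tag == outside:
            if chunk_type is not None:
                chunks.append((chunk_type, chunk_start, i))
                chunk_type = None
            continue
        prefix, _, entity = tag.partition('-')
        if chunk_type is None or entity != chunk_type or prefix == 'B':
            if chunk_type is not None:
                chunks.append((chunk_type, chunk_start, i))
            chunk_type, chunk_start = entity, i
    if chunk_type is not None:
        chunks.append((chunk_type, chunk_start, len(label_ids)))
    return chunks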
Example #12
    def pad(self, interval, keep_end=False):
        """
        Pad the timeseries so that there is a time (and value) at every interval.
        If `interpolate` is set, the new values are interpolated; otherwise the previous value is repeated.

        Parameters
        ----------
        interval: numeric, required
            the minimum distance between times to be preserved

        keep_end : bool, optional
            keep the last time and value of the timeseries, even if it is less than `interval` distance from the prior time
        """
        new_times, new_values = pad_lists(interval,
                                          self._times,
                                          self._values,
                                          keep_end=keep_end)
        if self.interpolate:
            new_values = self._interpolate(new_times, self._times,
                                           self._values)
        self._times, self._values = new_times, new_values
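
Note that in the timeseries examples (#9, #10, #12) `pad_lists` has a completely different role and signature, `pad_lists(interval, times, values, keep_end=..., keep_dist=...)`, from the batching helpers above. One plausible reading that is consistent with both the prune() and pad() docstrings (this is an assumption, not the library's actual code) is a resampling onto a regular grid, carrying the previous value forward:

def pad_lists(interval, times, values, keep_end=False, keep_dist=False):
    """Resample (times, values) onto a grid spaced `interval` apart, repeating the latest value."""
    new_times, new_values = [], []
    t, idx = times[0], 0
    while t <= times[-1]:
        while idx + 1 < len(times) and times[idx + 1] <= t:
            idx += 1                      # most recent original point at or before t
        new_times.append(t)
        new_values.append(values[idx])
        t += interval
    if keep_end and new_times[-1] != times[-1]:
        new_times.append(times[-1])       # also keep the final original point
        new_values.append(values[-1])
    return new_times, new_values          # keep_dist is not modelled in this sketch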
def main():
    logging.set_verbosity(logging.INFO)
    if not gfile.IsDirectory(OutputPath('')):
        gfile.MakeDirs(OutputPath(''))
    log_output = codecs.open("ner_training_log1", 'w')
    parameters = {}
    parameters['use_character_lstm'] = True
    parameters['character_embedding_dimension'] = 25
    parameters['token_embedding_dimension'] = 100
    parameters['token_pretrained_embedding_filepath'] = ''
    parameters['character_lstm_hidden_state_dimension'] = 25
    parameters['token_lstm_hidden_state_dimension'] = 100
    parameters['use_crf'] = True
    parameters['optimizer'] = 'sgd'
    parameters['learning_rate'] = 0.01
    parameters['gradient_clipping_value'] = 5.0
    parameters['dropout_rate'] = 0.2
    parameters['maximum_number_of_epochs'] = 10
    parameters['use_tag_embedding'] = True
    parameters['pos_embedding_dimension'] = 16

    loading_time = time.time()
    train_data_path = '/cs/natlang-data/CoNLL/CoNLL-2003/eng.train'
    dev_data_path = '/cs/natlang-data/CoNLL/CoNLL-2003/eng.testa'
    test_data_path = '/cs/natlang-data/CoNLL/CoNLL-2003/eng.testb'
    logging.info("loading data and precomputing features...")

    dataset = Dataset(train_data_path, dev_data_path, test_data_path)
    dataset.load_dataset()

    sess = tf.Session()
    with sess.as_default():
        model = EntityLSTM(dataset, parameters)
        sess.run(tf.global_variables_initializer())
        #load glove token embeddings
        load_pretrained_token_embeddings(sess, model, dataset, parameters)
        epoch_num = 0
        start = time.time()
        best = 0.0
        while True:
            step = 0
            epoch_num += 1
            cost_sum = 0
            while dataset.has_next_sent('train'):
                sent = dataset.get_next_sent('train')
                step += 1
                feed_dict = {
                    model.input_token_indices:
                    sent.word_ids,
                    model.input_label_indices:
                    sent.ner_ids,
                    model.input_pos_indices:
                    sent.pos_ids,
                    model.input_token_character_indices:
                    utils.pad_lists(sent.char_lists),
                    model.input_token_lengths:
                    sent.word_lengths,
                    model.dropout_keep_prob:
                    1 - parameters['dropout_rate']
                }
                _, _, loss, transition_params_trained = sess.run([
                    model.train_op, model.global_step, model.loss,
                    model.transition_parameters
                ], feed_dict)
                cost_sum += loss
                if step % 1000 == 0:
                    current = Evaluate(sess, model, dataset,
                                       transition_params_trained, parameters)
                    log_output.write('EPOCH %d, loss is %.2f\n' %
                                     (epoch_num, cost_sum / 1000))
                    if current > best:
                        logging.info("saving the model...")
                        model_saver = tf.train.Saver(
                            max_to_keep=parameters['maximum_number_of_epochs'])
                        model_saver.save(
                            sess,
                            OutputPath(
                                'char_model_{0:05d}.ckpt'.format(epoch_num)))
                        best = current
                    cost_sum = 0
            dataset.reset_index('train')
            if epoch_num >= parameters['maximum_number_of_epochs']:
                break
        log_output.close()
        logging.info("finished training, time is %.2f", time.time() - start)

        total_token_num = 0
        correct_token_num = 0
        start = time.time()
        out_file = open("ner_out", "w")
        while dataset.has_next_sent('test'):
            sent = dataset.get_next_sent('test')
            feed_dict = {
                model.input_token_indices:
                sent.word_ids,
                model.input_label_indices:
                sent.ner_ids,
                model.input_pos_indices:
                sent.pos_ids,
                model.input_token_character_indices:
                utils.pad_lists(sent.char_lists),
                model.input_token_lengths:
                sent.word_lengths,
                model.dropout_keep_prob:
                1.0  # disable dropout during evaluation
            }
            unary_scores, predictions = sess.run(
                [model.unary_scores, model.predictions], feed_dict)
            if parameters['use_crf']:
                predictions, _ = tf.contrib.crf.viterbi_decode(
                    unary_scores, transition_params_trained)
                predictions = predictions[1:-1]
            else:
                predictions = predictions.tolist()
            total_token_num += len(predictions)
            gold_labels = sent.ner_ids
            words = sent.get_word_list()
            pos = sent.get_pos_list()
            for idx, p in enumerate(predictions):
                tag = gold_labels[idx]
                if p == tag:
                    correct_token_num += 1
                out_file.write("%s %s %s %s\n" %
                               (words[idx], pos[idx], dataset.ner_map[tag],
                                dataset.ner_map[p]))
            out_file.write("\n")
        out_file.close()

        logging.info('token number is %d, accuracy is %.2f%%, time is %.2f',
                     total_token_num,
                     (100.0 * correct_token_num / total_token_num),
                     time.time() - start)
def main():
  logging.set_verbosity(logging.INFO)
  if not gfile.IsDirectory(OutputPath('')):
    gfile.MakeDirs(OutputPath(''))
  log_output = codecs.open("batch_train_out", 'w')
  parameters = {}
  parameters['use_character_lstm'] = True
  parameters['character_embedding_dimension'] = 25
  parameters['token_embedding_dimension'] = 100
  parameters['token_pretrained_embedding_filepath'] = ''
  parameters['character_lstm_hidden_state_dimension'] = 25
  parameters['token_lstm_hidden_state_dimension'] = 100
  parameters['use_crf'] = True
  parameters['optimizer'] = 'adam'
  parameters['learning_rate'] = 0.005
  parameters['gradient_clipping_value'] = 5.0
  parameters['dropout_rate'] = 0.2
  parameters['maximum_number_of_epochs'] = 10

  loading_time = time.time()
  train_data_path = '/cs/natlang-user/vivian/wsj-conll/train.conllu'
  dev_data_path = '/cs/natlang-user/vivian/wsj-conll/dev.conllu'
  logging.info("loading data and precomputing features...")
  train_data = Dataset(train_data_path)
  train_data.load_dataset()
  test_data = Dataset(dev_data_path)
  test_data.load_dataset(train_data.word_map, train_data.tag_map, train_data.char_map)

  sess = tf.Session()
  with sess.as_default():
    model = EntityLSTM(train_data, parameters)
    sess.run(tf.global_variables_initializer())
    #load glove token embeddings
    load_pretrained_token_embeddings(sess, model, train_data, parameters)
    epoch_num = 0
    start = time.time()
    best = 0.0
    while True:
      step = 0
      epoch_num += 1
      cost_sum = 0
      while train_data.has_next_sent():
        sent = train_data.get_next_sent()
        step += 1
        feed_dict = {
          model.input_token_indices: sent.word_ids,
          model.input_label_indices: sent.tag_ids,
          model.input_token_character_indices: utils.pad_lists(sent.char_lists),
          model.input_token_lengths: sent.word_lengths,
          model.dropout_keep_prob: 1-parameters['dropout_rate']
        }
        _, _, loss, transition_params_trained = sess.run(
                    [model.train_op, model.global_step, model.loss, model.transition_parameters],
                    feed_dict)
        cost_sum += loss
        if step % 1000 == 0:
          current = Evaluate(sess, model, test_data, transition_params_trained, parameters)
          log_output.write('EPOCH %d, loss is %.2f, accuracy is %.2f\n'%(epoch_num, cost_sum/1000, current))
          cost_sum = 0
          if current > best:
            logging.info("saving the model...")
            model_saver = tf.train.Saver(max_to_keep=parameters['maximum_number_of_epochs'])
            model_saver.save(sess, OutputPath('char_model_{0:05d}.ckpt'.format(epoch_num)))
            best = current
      train_data.reset_index()
      if epoch_num >= parameters['maximum_number_of_epochs']: 
        break
    log_output.close()
    logging.info("finished training, time is %.2f", time.time()-start)

    total_token_num = 0
    correct_token_num = 0
    start = time.time()
    while test_data.has_next_sent():
      sent = test_data.get_next_sent()
      feed_dict = {
        model.input_token_indices: sent.word_ids,
        model.input_label_indices: sent.tag_ids,
        model.input_token_character_indices: utils.pad_lists(sent.char_lists),
        model.input_token_lengths: sent.word_lengths,
        model.dropout_keep_prob: 1.0  # disable dropout during evaluation
      }
      logging.info("Train...")

      unary_scores, predictions = sess.run([model.unary_scores, model.predictions], feed_dict)
      if parameters['use_crf']:
          predictions, _ = tf.contrib.crf.viterbi_decode(unary_scores, transition_params_trained)
          predictions = predictions[1:-1]
      else:
          predictions = predictions.tolist()
      gold_labels = sent.tag_ids
      assert(len(predictions) == len(gold_labels))
      total_token_num += len(predictions)
      for idx, p in enumerate(predictions):
        if p == gold_labels[idx]:
          correct_token_num += 1

    logging.info('token number is %d, accuracy is %.2f%%, time is %.2f', total_token_num, (100.0*correct_token_num/total_token_num), time.time()-start)
def main():
    logging.set_verbosity(logging.INFO)
    if not gfile.IsDirectory(OutputPath('')):
        gfile.MakeDirs(OutputPath(''))
    parameters = {}
    parameters['use_crf'] = True
    parameters['use_character_lstm'] = True
    parameters['character_embedding_dimension'] = 25
    parameters['token_embedding_dimension'] = 100
    parameters[
        'token_pretrained_embedding_filepath'] = '/cs/natlang-user/vivian/NeuroNER/data/word_vectors/glove.6B.100d.txt'
    #parameters['token_pretrained_embedding_filepath'] = ''
    parameters['character_lstm_hidden_state_dimension'] = 50
    parameters['token_lstm_hidden_state_dimension'] = 100
    parameters['optimizer'] = 'sgd'
    parameters['learning_rate'] = 0.005
    parameters['gradient_clipping_value'] = 0
    parameters['dropout_rate'] = 0.5
    parameters['maximum_number_of_epochs'] = 50
    parameters['freeze_token_embeddings'] = False

    loading_time = time.time()
    train_data_path = '/cs/natlang-user/vivian/engonto.train'
    dev_data_path = '/cs/natlang-user/vivian/engonto.testa'
    test_data_path = '/cs/natlang-user/vivian/engonto.testb'
    logging.info("loading data and precomputing features...")

    dataset = Dataset(train_data_path,
                      dev_data_path,
                      test_data_path,
                      use_char=True)
    dataset.load_dataset()

    logging.info(dataset.ner_map)
    logging.info(dataset.ner_index)
    logging.info(time.time() - loading_time)
    total_time = 0.0

    sess = tf.Session()
    with sess.as_default():
        model = EntityLSTM(dataset, parameters)
        sess.run(tf.global_variables_initializer())
        #load glove token embeddings
        model.load_pretrained_token_embeddings(sess, dataset, parameters)
        epoch_num = 0
        start = time.time()
        best = 0.0
        while True:
            step = 0
            epoch_num += 1
            cost_sum = 0
            while dataset.has_next_sent('train'):
                sent = dataset.get_next_sent('train')
                step += 1
                feed_dict = {
                    model.input_token_indices:
                    sent.word_ids,
                    model.input_label_indices:
                    sent.ner_ids,
                    model.input_token_character_indices:
                    utils.pad_lists(sent.char_lists),
                    model.input_token_lengths:
                    sent.word_lengths,
                    model.dropout_keep_prob:
                    1 - parameters['dropout_rate']
                }
                if parameters['use_crf']:
                    _, loss, transition_params_trained = sess.run([
                        model.train_op, model.loss, model.transition_parameters
                    ], feed_dict)
                else:
                    _, loss = sess.run([model.train_op, model.loss], feed_dict)
                    transition_params_trained = None
                '''
        cost_sum += loss
        if step % 1000 == 0:
          current = Evaluate(sess, model, dataset, transition_params_trained, parameters)
          log_output.write('EPOCH %d, loss is %.2f'%(epoch_num, cost_sum/1000))
          if current > best:
            logging.info("saving the model...")
            model_saver = tf.train.Saver(max_to_keep=parameters['maximum_number_of_epochs'])
            model_saver.save(sess, OutputPath('char_model_{0:05d}.ckpt'.format(epoch_num)))
            best = current
          cost_sum = 0
        '''
            current = Evaluate(sess, model, dataset, transition_params_trained,
                               parameters, epoch_num)
            dataset.reset_index('train')
            if epoch_num >= parameters['maximum_number_of_epochs']:
                break

            model_saver = tf.train.Saver(
                max_to_keep=parameters['maximum_number_of_epochs'])
            model_saver.save(sess, OutputPath('char_model'))

            total_time += Evaluate(sess, model, dataset,
                                   transition_params_trained, parameters,
                                   epoch_num)
        logging.info("done")
def main():
    logging.set_verbosity(logging.INFO)
    if not gfile.IsDirectory(OutputPath('')):
        gfile.MakeDirs(OutputPath(''))
    parameters = {}
    parameters['use_character_lstm'] = True
    parameters['character_embedding_dimension'] = 25
    parameters['token_embedding_dimension'] = 100
    parameters['token_pretrained_embedding_filepath'] = ''
    parameters['pretrained_model_checkpoint_filepath'] = OutputPath(
        'char_model_{0:05d}.ckpt'.format(2))
    parameters['character_lstm_hidden_state_dimension'] = 25
    parameters['token_lstm_hidden_state_dimension'] = 100
    parameters['use_crf'] = True
    parameters['optimizer'] = 'adam'
    parameters['learning_rate'] = 0.005
    parameters['gradient_clipping_value'] = 5.0
    parameters['dropout_rate'] = 0.2
    parameters['maximum_number_of_epochs'] = 10

    loading_time = time.time()
    test_data_path = '/cs/natlang-user/vivian/wsj-conll/test.conllu'
    wordMapPath = 'word_map'
    tagMapPath = 'tag_map'
    charMapPath = 'char_map'
    word_map = readMap(wordMapPath)
    tag_map = readMap(tagMapPath)
    char_map = readMap(charMapPath)

    test_data = Dataset(test_data_path)
    test_data.load_dataset(word_map, tag_map, char_map)

    sess = tf.Session()
    with sess.as_default():
        model = EntityLSTM(test_data, parameters)
        sess.run(tf.global_variables_initializer())

        model_saver = tf.train.Saver(
            max_to_keep=parameters['maximum_number_of_epochs'])
        model_saver.restore(sess,
                            parameters['pretrained_model_checkpoint_filepath'])

        total_token_num = 0
        correct_token_num = 0
        transition_params_trained = sess.run(model.transition_parameters)
        start = time.time()
        while test_data.has_next_sent():
            sent = test_data.get_next_sent()
            feed_dict = {
                model.input_token_indices:
                sent.word_ids,
                model.input_label_indices:
                sent.tag_ids,
                model.input_token_character_indices:
                utils.pad_lists(sent.char_lists),
                model.input_token_lengths:
                sent.word_lengths,
                model.dropout_keep_prob:
                1.0  # disable dropout during evaluation
            }
            unary_scores, predictions = sess.run(
                [model.unary_scores, model.predictions], feed_dict)
            if parameters['use_crf']:
                predictions, _ = tf.contrib.crf.viterbi_decode(
                    unary_scores, transition_params_trained)
                predictions = predictions[1:-1]
            else:
                predictions = predictions.tolist()
            gold_labels = sent.tag_ids
            total_token_num += len(predictions)
            for idx, p in enumerate(predictions):
                if p == gold_labels[idx]:
                    correct_token_num += 1
        logging.info('token number is %d, accuracy is %.2f%%, time is %.2f',
                     total_token_num,
                     (100.0 * correct_token_num / total_token_num),
                     time.time() - start)
def main():
  logging.set_verbosity(logging.INFO)
  if not gfile.IsDirectory(OutputPath('')):
    gfile.MakeDirs(OutputPath(''))
  parameters = {}
  parameters['use_character_lstm'] = False
  parameters['character_embedding_dimension'] = 25
  parameters['token_embedding_dimension'] = 100
  parameters['freeze_token_embeddings'] = False
  parameters['character_lstm_hidden_state_dimension'] = 25
  parameters['token_lstm_hidden_state_dimension'] = 100
  parameters['use_crf'] = True
  parameters['optimizer'] = 'adam'
  parameters['learning_rate'] = 0.002
  parameters['gradient_clipping_value'] = 5.0
  parameters['dropout_rate'] = 0.4
  parameters['maximum_number_of_epochs'] = 10
  parameters['batch_size'] = 32

  loading_time = time.time()
  train_data_path = '/cs/natlang-user/vivian/wsj-conll/train.conllu'
  dev_data_path = '/cs/natlang-user/vivian/wsj-conll/test.conllu'
  logging.info("loading data and precomputing features...")
  train_data = Dataset(train_data_path)
  train_data.load_dataset()
  test_data = Dataset(dev_data_path)
  test_data.load_dataset(train_data.word_map, train_data.tag_map, train_data.char_map)

  sess = tf.Session()
  epoch_num = 0
  with sess.as_default():
    model = EntityLSTM(train_data, parameters)
    sess.run(tf.global_variables_initializer())
    start = time.time()
    best = 0.0
    while True:
      step = 0
      sentences = []
      tags = []
      sentence_lengths = []
      word_lengths = []
      while len(sentences) < parameters['batch_size']:
        sent, epoch_num = advance_sent(epoch_num, train_data)
        sentences.append(sent.word_ids)
        tags.append(sent.tag_ids)
        sentence_lengths.append(sent.get_sent_len())
      feed_dict = {
        model.input_token_indices: utils.pad_lists(sentences),
        model.input_sent_lengths: sentence_lengths,
        model.input_label_indices: utils.pad_lists(tags),
        model.dropout_keep_prob: 1-parameters['dropout_rate']
      }
      _, _, loss, accuracy, transition_params_trained = sess.run(
                    [model.train_op, model.global_step, model.loss, model.accuracy, model.transition_parameters],
                    feed_dict)
      step += 1
      '''
      if step % 10 == 0:
        current = Evaluate(sess, model, test_data, transition_params_trained, parameters)
        if current > best:
          model_saver = tf.train.Saver(max_to_keep=parameters['maximum_number_of_epochs'])
          model_saver.save(sess, OutputPath('char_model_{0:05d}.ckpt'.format(epoch_num)))
          best = current
        logging.info('EPOCH %d, Training %.2f%% done', epoch_num, (100.0*step/train_data.get_sent_num()))
        logging.info('best accuracy is %.2f%%', best)
      '''
      if epoch_num >= parameters['maximum_number_of_epochs']: 
        break
    best = Evaluate(sess, model, test_data, transition_params_trained, parameters)
    logging.info("finished training, time is %.2f", time.time()-start)