def _bert_tokenizer_init(self, bert_pretrained_dir, bert_pretrained):

        self._bert_tokenizer = tokenization_bert.BertTokenizer(
            vocab_file=os.path.join(
                os.path.join(bert_pretrained_dir, bert_pretrained),
                "%s-vocab.txt" % bert_pretrained))
        print("BERT tokenizer init completes")
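None of the snippets on this page show their imports; they all rely on os, pickle, random, and a tokenization_bert module. A minimal import block, assuming the pre-4.x Hugging Face transformers layout that still exposes tokenization_bert as a submodule:

import os
import pickle
import random

# Assumption: transformers < 4.0, where tokenization_bert is importable as a submodule.
# On newer releases, `from transformers import BertTokenizer` is the direct equivalent.
from transformers import tokenization_bert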
Example #2
    def __init__(self, hparams, split=""):
        super().__init__()

        self.hparams = hparams
        self._input_examples = []

        bert_pretrained_dir = os.path.join(self.hparams.bert_pretrained_dir,
                                           self.hparams.bert_pretrained)
        print(bert_pretrained_dir)
        self._tokenizer = tokenization_bert.BertTokenizer(
            vocab_file=os.path.join(
                bert_pretrained_dir, "%s-vocab.txt" %
                self.hparams.bert_pretrained))
        self._vocab = self._tokenizer.vocab

        with open(
                os.path.join(
                    hparams.data_dir,
                    "%s_electra_post_training.pkl" % hparams.task_name),
                "rb") as pkl_handle:
            while True:
                try:
                    self._input_examples.append(pickle.load(pkl_handle))
                    if len(self._input_examples) % 100000 == 0:
                        print("%d examples has been loaded!" %
                              len(self._input_examples))
                except EOFError:
                    break

        print("total post-training examples : %d" % len(self._input_examples))
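The load loop above reads one object at a time until EOFError because the .pkl file is a stream of consecutive pickle.dump calls rather than a single pickled list. A minimal sketch of how such a file could be produced; the file name and records are placeholders, not taken from the original:

import pickle

examples = [{"context": "hello", "response": "hi"}]  # placeholder records

with open("my_task_electra_post_training.pkl", "wb") as pkl_handle:
    for example in examples:
        pickle.dump(example, pkl_handle)  # one object per dump; the reader stops on EOFError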
Example #3
    def _bert_tokenizer_init(self, bert_pretrained='bert-base-cased'):
        bert_pretrained_dir = "/mnt/raid5/shared/bert/pytorch/%s/" % bert_pretrained
        vocab_file_path = "%s-vocab.txt" % bert_pretrained

        self._tokenizer = tokenization_bert.BertTokenizer(
            vocab_file=os.path.join(bert_pretrained_dir, vocab_file_path),
            do_lower_case=False)
        print("bert_tokenizer")
Example #4
  def _bert_tokenizer_init(self, special_tok, bert_pretrained):
    bert_pretrained_dir = os.path.join("./resources", bert_pretrained)
    vocab_file_path = "%s-vocab.txt" % bert_pretrained

    self._tokenizer = tokenization_bert.BertTokenizer(vocab_file=os.path.join(bert_pretrained_dir, vocab_file_path))
    self._tokenizer.add_tokens([special_tok])  # add EOT

    print("BERT tokenizer init completes")
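add_tokens only extends the tokenizer vocabulary; when the new token is later fed to a BERT model, the model's embedding matrix has to grow as well. A sketch of that follow-up step, assuming a Hugging Face BertModel is used downstream (the original snippet does not show the model side):

from transformers import BertModel, BertTokenizer

# Hypothetical model-side counterpart to the add_tokens call above.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenizer.add_tokens(["[EOT]"])

model = BertModel.from_pretrained("bert-base-uncased")
model.resize_token_embeddings(len(tokenizer))  # adds an embedding row for [EOT]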
Example #5
  def __init__(
      self,
      hparams,
      split: str = "",
  ):
    super().__init__()

    self.hparams = hparams
    self.split = split

    # read pkls -> Input Examples
    self.input_examples = []
    utterance_len_dict = dict()

    with open(os.path.join(hparams.data_dir, "%s_%s.pkl" % (hparams.task_name, split)), "rb") as pkl_handle:

      while True:
        try:
          example = pickle.load(pkl_handle)
          num_examples = len(example.utterances) if len(example.utterances) < 10 else 10
          try:
            utterance_len_dict[str(num_examples)] += 1
          except KeyError:
            utterance_len_dict[str(num_examples)] = 1

          if self.hparams.do_shuffle_ressel:
            random.shuffle(example.utterances)

          self.input_examples.append(example)

          if len(self.input_examples) % 100000 == 0:
            print("%d examples has been loaded!" % len(self.input_examples))

            # stop loading early when only a small subset is needed for PCA visualization
            if self.hparams.pca_visualization:
              break
        except EOFError:
          break
    print(utterance_len_dict)
    random.seed(self.hparams.random_seed)
    self.num_input_examples = len(self.input_examples)
    print("total %s examples" % split, self.num_input_examples)

    bert_pretrained_dir = os.path.join(self.hparams.bert_pretrained_dir, self.hparams.bert_pretrained)
    print(bert_pretrained_dir)
    self._bert_tokenizer = tokenization_bert.BertTokenizer(
      vocab_file=os.path.join(bert_pretrained_dir, "%s-vocab.txt" % self.hparams.bert_pretrained))

    # End of Turn Token
    if self.hparams.do_eot:
      self._bert_tokenizer.add_tokens(["[EOT]"])
    if self.hparams.do_sent_insertion:
      self._bert_tokenizer.add_tokens(["[INS]"])
    if self.hparams.do_sent_deletion:
      self._bert_tokenizer.add_tokens(["[DEL]"])
    if self.hparams.do_sent_search:
      self._bert_tokenizer.add_tokens(["[SRCH]"])
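The constructor above reads a number of hparams attributes without defining them. A minimal sketch of the fields it expects, built with argparse.Namespace; all values and the dataset class name are placeholders, only the attribute names come from the code above:

from argparse import Namespace

hparams = Namespace(
    data_dir="./data",                  # directory holding <task_name>_<split>.pkl
    task_name="my_task",                # placeholder
    bert_pretrained_dir="./resources",
    bert_pretrained="bert-base-uncased",
    do_shuffle_ressel=False,
    pca_visualization=False,
    random_seed=42,
    do_eot=True,
    do_sent_insertion=False,
    do_sent_deletion=False,
    do_sent_search=False,
)

dataset = ResponseSelectionDataset(hparams, split="train")  # class name is assumed, not from the original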
Example #6
    def _bert_tokenizer_init(self,
                             bert_pretrained_dir,
                             bert_pretrained='bert-base-uncased'):

        #self._bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
        self._bert_tokenizer = tokenization_bert.BertTokenizer(
            vocab_file=os.path.join(
                os.path.join(bert_pretrained_dir, bert_pretrained),
                "%s-vocab.txt" % bert_pretrained))
        print("BERT tokenizer init completes")
Example #7
    def __init__(
        self,
        hparams,
        split: str = "",
    ):
        super().__init__()

        self.hparams = hparams
        self.split = split

        # read pkls -> Input Examples
        self.input_examples = []
        with open(hparams.data_dir % (hparams.task_name, split),
                  "rb") as pkl_handle:
            while True:
                try:
                    self.input_examples.append(pickle.load(pkl_handle))
                    if len(self.input_examples) % 100000 == 0:
                        print("%d examples has been loaded!" %
                              len(self.input_examples))
                except EOFError:
                    break

        print("total %s examples" % split, len(self.input_examples))

        bert_pretrained_dir = os.path.join(self.hparams.bert_pretrained_dir,
                                           self.hparams.bert_pretrained)
        print(bert_pretrained_dir)
        self._bert_tokenizer = tokenization_bert.BertTokenizer(
            vocab_file=os.path.join(
                bert_pretrained_dir, "%s-vocab.txt" %
                self.hparams.bert_pretrained))

        # End of Turn Token
        if self.hparams.do_eot:
            self._bert_tokenizer.add_tokens(["[EOT]"])
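Each of these __init__ methods calls super().__init__() and accumulates a list of input examples, which is consistent with a torch.utils.data.Dataset subclass whose __len__ and __getitem__ are not shown on this page. Assuming that is the case, a typical way to consume such a dataset would be:

from torch.utils.data import DataLoader

# Assumes `dataset` is an instance of one of the classes above and that it
# implements __len__ and __getitem__ (omitted from the snippets shown here).
loader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=4)

for batch in loader:
    pass  # feed the batch to the BERT/ELECTRA model here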