def _bert_tokenizer_init(self, bert_pretrained_dir, bert_pretrained):
    """Build the BERT wordpiece tokenizer from a locally stored vocab file.

    The vocab file is expected at
    ``<bert_pretrained_dir>/<bert_pretrained>/<bert_pretrained>-vocab.txt``.
    """
    model_dir = os.path.join(bert_pretrained_dir, bert_pretrained)
    vocab_path = os.path.join(model_dir, "%s-vocab.txt" % bert_pretrained)
    self._bert_tokenizer = tokenization_bert.BertTokenizer(vocab_file=vocab_path)
    print("BERT tokenizer init completes")
def __init__(self, hparams, split=""):
    """Set up the tokenizer and load ELECTRA post-training examples.

    Examples are deserialized one-by-one from
    ``<data_dir>/<task_name>_electra_post_training.pkl`` until the pickle
    stream is exhausted (EOFError).
    """
    super().__init__()
    self.hparams = hparams
    self._input_examples = []

    # Tokenizer: vocab file lives next to the pretrained weights.
    pretrained_dir = os.path.join(self.hparams.bert_pretrained_dir,
                                  self.hparams.bert_pretrained)
    print(pretrained_dir)
    vocab_path = os.path.join(pretrained_dir,
                              "%s-vocab.txt" % self.hparams.bert_pretrained)
    self._tokenizer = tokenization_bert.BertTokenizer(vocab_file=vocab_path)
    self._vocab = self._tokenizer.vocab

    # A single pickle file holds many objects appended back-to-back;
    # keep loading until the stream runs out.
    pkl_path = os.path.join(
        hparams.data_dir,
        "%s_electra_post_training.pkl" % hparams.task_name)
    with open(pkl_path, "rb") as handle:
        while True:
            try:
                self._input_examples.append(pickle.load(handle))
            except EOFError:
                break
            if len(self._input_examples) % 100000 == 0:
                print("%d examples has been loaded!" % len(self._input_examples))
    print("total post-training examples : %d" % len(self._input_examples))
def _bert_tokenizer_init(self, bert_pretrained='bert-base-cased',
                         bert_pretrained_dir=None):
    """Initialize a cased BERT tokenizer from a local vocab file.

    Args:
        bert_pretrained: pretrained model identifier; also used to name the
            vocab file (``<bert_pretrained>-vocab.txt``).
        bert_pretrained_dir: directory containing the vocab file. Defaults
            to the historical hard-coded location, so existing callers are
            unaffected.
    """
    # The storage path was previously hard-coded to one machine
    # (/mnt/raid5/...); it is now an overridable parameter with the old
    # value as the backward-compatible default.
    if bert_pretrained_dir is None:
        bert_pretrained_dir = "/mnt/raid5/shared/bert/pytorch/%s/" % bert_pretrained
    vocab_file_path = "%s-vocab.txt" % bert_pretrained
    self._tokenizer = tokenization_bert.BertTokenizer(
        vocab_file=os.path.join(bert_pretrained_dir, vocab_file_path),
        do_lower_case=False)  # cased model: preserve case information
    print("bert_tokenizer")
def _bert_tokenizer_init(self, special_tok, bert_pretrained):
    """Create the BERT tokenizer and register one extra special token.

    The vocab file is read from
    ``./resources/<bert_pretrained>/<bert_pretrained>-vocab.txt``.
    """
    vocab_path = os.path.join("./resources", bert_pretrained,
                              "%s-vocab.txt" % bert_pretrained)
    self._tokenizer = tokenization_bert.BertTokenizer(vocab_file=vocab_path)
    # Extend the vocabulary with the caller-supplied marker (end-of-turn).
    self._tokenizer.add_tokens([special_tok])
    print("BERT tokenizer init completes")
def __init__(
        self,
        hparams,
        split: str = "",
):
    """Load pickled dialogue examples for one split and set up the tokenizer.

    Reads examples from ``<data_dir>/<task_name>_<split>.pkl`` until EOF,
    tracks a histogram of utterance counts (capped at 10), optionally
    shuffles utterances, then builds the BERT tokenizer and registers the
    auxiliary-task special tokens enabled in ``hparams``.
    """
    super().__init__()
    self.hparams = hparams
    self.split = split
    # read pkls -> Input Examples
    self.input_examples = []
    # Histogram: utterance-count bucket (as str) -> number of examples.
    utterance_len_dict = dict()
    with open(os.path.join(hparams.data_dir,
                           "%s_%s.pkl" % (hparams.task_name, split)),
              "rb") as pkl_handle:
        while True:
            try:
                example = pickle.load(pkl_handle)
                # Bucket utterance counts, capping at 10.
                num_examples = len(example.utterances) if len(example.utterances) < 10 else 10
                try:
                    utterance_len_dict[str(num_examples)] += 1
                except KeyError:
                    utterance_len_dict[str(num_examples)] = 1
                if self.hparams.do_shuffle_ressel:
                    random.shuffle(example.utterances)
                self.input_examples.append(example)
                if len(self.input_examples) % 100000 == 0:
                    print("%d examples has been loaded!" % len(self.input_examples))
                    if self.hparams.pca_visualization:
                        # presumably caps loading for PCA visualization runs
                        # — TODO confirm against caller
                        break
                # NOTE(review): this unconditional break exits after the
                # first successful pickle.load, so only one example is ever
                # read per file — looks like a debugging leftover; confirm
                # intended behavior.
                break
            except EOFError:
                break
    print(utterance_len_dict)
    random.seed(self.hparams.random_seed)
    self.num_input_examples = len(self.input_examples)
    print("total %s examples" % split, self.num_input_examples)
    # Tokenizer vocab lives next to the pretrained weights.
    bert_pretrained_dir = os.path.join(self.hparams.bert_pretrained_dir,
                                       self.hparams.bert_pretrained)
    print(bert_pretrained_dir)
    self._bert_tokenizer = tokenization_bert.BertTokenizer(
        vocab_file=os.path.join(bert_pretrained_dir,
                                "%s-vocab.txt" % self.hparams.bert_pretrained))
    # End of Turn Token
    if self.hparams.do_eot:
        self._bert_tokenizer.add_tokens(["[EOT]"])
    # Special tokens for the auxiliary self-supervised tasks.
    if self.hparams.do_sent_insertion:
        self._bert_tokenizer.add_tokens(["[INS]"])
    if self.hparams.do_sent_deletion:
        self._bert_tokenizer.add_tokens(["[DEL]"])
    if self.hparams.do_sent_search:
        self._bert_tokenizer.add_tokens(["[SRCH]"])
def _bert_tokenizer_init(self, bert_pretrained_dir, bert_pretrained='bert-base-uncased'):
    """Instantiate the BERT wordpiece tokenizer from a local vocab file.

    Expects the vocab at
    ``<bert_pretrained_dir>/<bert_pretrained>/<bert_pretrained>-vocab.txt``.
    """
    vocab_path = os.path.join(bert_pretrained_dir, bert_pretrained,
                              "%s-vocab.txt" % bert_pretrained)
    self._bert_tokenizer = tokenization_bert.BertTokenizer(vocab_file=vocab_path)
    print("BERT tokenizer init completes")
def __init__(
        self,
        hparams,
        split: str = "",
):
    """Load pickled examples for one split and init the BERT tokenizer.

    ``hparams.data_dir`` is a %-format template filled with
    ``(task_name, split)`` to locate the pickle file; objects are read
    back-to-back until the stream is exhausted.
    """
    super().__init__()
    self.hparams = hparams
    self.split = split

    # read pkls -> Input Examples
    self.input_examples = []
    with open(hparams.data_dir % (hparams.task_name, split), "rb") as handle:
        while True:
            try:
                self.input_examples.append(pickle.load(handle))
            except EOFError:
                break
            if len(self.input_examples) % 100000 == 0:
                print("%d examples has been loaded!" % len(self.input_examples))
    print("total %s examples" % split, len(self.input_examples))

    # Tokenizer vocab sits alongside the pretrained weights.
    bert_dir = os.path.join(self.hparams.bert_pretrained_dir,
                            self.hparams.bert_pretrained)
    print(bert_dir)
    vocab_path = os.path.join(bert_dir,
                              "%s-vocab.txt" % self.hparams.bert_pretrained)
    self._bert_tokenizer = tokenization_bert.BertTokenizer(vocab_file=vocab_path)

    # End of Turn Token
    if self.hparams.do_eot:
        self._bert_tokenizer.add_tokens(["[EOT]"])