def preprocess_data(args):
    label_counter = Counter([])
    examples_per_file = Counter()

    print("Reading all files for labels.")
    for input_file in args.input_files:
        with xopen(input_file, "rt") as f:
            for example, labels in input_readers[args.task](f):
                examples_per_file[input_file] += 1
                label_counter.update(labels)

    if args.top_n_labels > 0:
        mlb_full = MultiLabelBinarizer(sparse_output=True)
        mlb_full = mlb_full.fit(label_counter.keys())
        label_counter = dict(label_counter.most_common(args.top_n_labels))

    mlb = MultiLabelBinarizer(sparse_output=True)
    # Passing a list in a list because that's what the function wants.
    if args.labels_in:
        labels = json.load(open(args.labels_in))
        mlb = mlb.fit([labels])
    else:
        mlb = mlb.fit([[pair for pair in label_counter]])

    # Save list of partial -> full mapping if doing top N labels.
    if args.top_n_labels > 0:
        label_mapping = np.where(np.in1d(mlb_full.classes_,
                                         mlb.classes_))[0].tolist()
        with xopen(args.label_mapping, "wt") as f:
            f.write(json.dumps(label_mapping))

        # Also save the full labels.
        with xopen(args.full_labels, "wt") as f:
            f.write(json.dumps(list(mlb_full.classes_)))

    # Save list of labels.
    with xopen(args.labels_out, "wt") as f:
        f.write(json.dumps(list(mlb.classes_)))

    # Set parallel tokenization thread count.
    os.environ["RAYON_NUM_THREADS"] = str(args.processes)

    from tokenizers import Tokenizer, decoders, trainers
    from tokenizers.models import WordPiece
    from tokenizers.normalizers import BertNormalizer
    from tokenizers.pre_tokenizers import BertPreTokenizer
    from tokenizers.processors import BertProcessing

    if args.task == 'cafa':
        # Define our custom tokenizer.
        # It is exactly the same as the default BERT tokenizer, except for
        # max_input_chars_per_word being 20000 instead of 100. This tokenizer
        # is very slow on the long protein sequences.
        tokenizer = WordPiece.from_files(args.vocab,
                                         unk_token="[UNK]",
                                         max_input_chars_per_word=20000)
        tokenizer = Tokenizer(tokenizer)
        tokenizer.add_special_tokens(["[UNK]", "[SEP]", "[CLS]"])
        tokenizer.normalizer = BertNormalizer(lowercase=args.do_lower_case)
        tokenizer.pre_tokenizer = BertPreTokenizer()
        tokenizer.post_processor = BertProcessing(
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
            ("[CLS]", tokenizer.token_to_id("[CLS]")))
        tokenizer.decoder = decoders.WordPiece(prefix='##')
    else:
        tokenizer = BertWordPieceTokenizer(args.vocab,
                                           lowercase=args.do_lower_case)

    tokenizer.enable_padding(max_length=args.seq_len)
    tokenizer.enable_truncation(max_length=args.seq_len)

    for input_file in args.input_files:
        with xopen(input_file, 'rt') as in_f:

            file_name = generate_out_filename(input_file, args)

            with xopen(file_name, "wt") as out_f:

                print("Processing to: ", file_name)

                # Write the shape as the first row, useful for the finetuning.
                if args.labels_in:
                    n_labels = len(json.load(open(args.labels_in)))
                else:
                    n_labels = len(label_counter)
                out_f.write(
                    json.dumps((examples_per_file[input_file], n_labels)) + '\n')

                batch_size = min(examples_per_file[input_file],
                                 args.processes * 100)
                example_batch = []
                labels_batch = []
                doc_idx_batch = []

                with ParallelGenerator(input_readers[args.task](in_f),
                                       max_lookahead=batch_size) as g:

                    START_POS = int(args.window_start) / 100

                    for doc_idx, (example, labels) in enumerate(g):
                        #example = ' '.join(example.split(' ')[-510:])
                        example_batch.append(example)
                        labels_batch.append(labels)
                        doc_idx_batch.append(doc_idx)

                        if len(example_batch) == batch_size:
                            example_batch = tokenizer.encode_batch(example_batch)
                            labels_batch = mlb.transform(labels_batch)

                            for example, labels, doc_idx in zip(
                                    example_batch, labels_batch, doc_idx_batch):
                                # Convert sparse arrays to python lists for json dumping.
                                # print(labels);input()
                                labels = labels.nonzero()[1].tolist()
                                """try:
                                    [][0]
                                    print("DOC_LEN:",len(example.overflowing)+1)
                                    mid = len(example.overflowing)//2
                                    out_f.write(json.dumps( [example.overflowing[mid].ids, labels, len(example.overflowing)+1] ) + '\n')
                                except IndexError:
                                    out_f.write(json.dumps( [example.ids, labels, len(example.overflowing)+1] ) + '\n')"""

                                if args.all_blocks or args.n_blocks > 0:
                                    blocks = [example.ids] + [
                                        blk.ids for blk in example.overflowing
                                    ]
                                    #print("BLOCKS:%d,TOKENS:%d" % (len(list(blocks)), sum([len(list(tokens)) for tokens in blocks])))
                                    for b, block in enumerate(blocks, 2):
                                        if b > args.n_blocks and args.n_blocks > 0:
                                            break
                                        out_f.write(
                                            json.dumps([block, labels, doc_idx]) + '\n')
                                else:
                                    window = get_window(example, START_POS)
                                    assert len(window) == 512
                                    assert all([type(y) is int for y in window])
                                    out_f.write(json.dumps([window, labels]) + '\n')

                            example_batch = []
                            labels_batch = []

                    # Write out whatever is left in the last smaller batch.
                    example_batch = tokenizer.encode_batch(example_batch)
                    labels_batch = mlb.transform(labels_batch)

                    for example, labels, doc_idx in zip(
                            example_batch, labels_batch, doc_idx_batch):
                        # Convert sparse arrays to python lists for json dumping.
                        # print(labels);input()
                        labels = labels.nonzero()[1].tolist()
                        """try:
                            [][0]
                            print("DOC_LEN:",len(example.overflowing)+1)
                            mid = len(example.overflowing)//2
                            out_f.write(json.dumps( [example.overflowing[mid].ids, labels, len(example.overflowing)+1] ) + '\n')
                        except IndexError:
                            out_f.write(json.dumps( [example.ids, labels, len(example.overflowing)+1] ) + '\n')"""

                        if args.all_blocks or args.n_blocks > 0:
                            blocks = [example.ids] + [
                                blk.ids for blk in example.overflowing
                            ]
                            #print("BLOCKS:%d,TOKENS:%d" % (len(list(blocks)), sum([len(list(tokens)) for tokens in blocks])))
                            for b, block in enumerate(blocks, 2):
                                if b > args.n_blocks and args.n_blocks > 0:
                                    break
                                out_f.write(
                                    json.dumps([block, labels, doc_idx]) + '\n')
                        else:
                            out_f.write(
                                json.dumps(
                                    [get_window(example, START_POS), labels]) + '\n')
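
# --- Illustrative sketch, not part of the original script ---
# The block-splitting loop above relies on the fast tokenizer returning one
# Encoding plus its `.overflowing` parts once truncation is enabled. A minimal
# standalone check of that behaviour, assuming a BERT vocab file at "vocab.txt"
# (the path is a placeholder, not from the source):
from tokenizers import BertWordPieceTokenizer

_tok = BertWordPieceTokenizer("vocab.txt", lowercase=True)
_tok.enable_truncation(max_length=512)

_enc = _tok.encode("some very long document " * 500)
_blocks = [_enc.ids] + [blk.ids for blk in _enc.overflowing]
print(len(_blocks), "blocks; first block has", len(_enc.ids), "token ids")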
def encodings(self, bert_files):
    tokenizer = BertWordPieceTokenizer.from_file(bert_files["vocab"])
    single_encoding = tokenizer.encode("I love HuggingFace")
    pair_encoding = tokenizer.encode("I love HuggingFace", "Do you?")
    return single_encoding, pair_encoding
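
# A possible companion test for the fixture above (illustrative only; assumes
# `encodings` is registered as a pytest fixture and the vocab contains the
# standard BERT special tokens):
def test_encodings(self, encodings):
    single_encoding, pair_encoding = encodings
    # Single sequences are wrapped as [CLS] ... [SEP] by the BERT post-processor.
    assert single_encoding.tokens[0] == "[CLS]"
    assert single_encoding.tokens[-1] == "[SEP]"
    # Sentence pairs get segment id 0 for the first part and 1 for the second.
    assert 0 in pair_encoding.type_ids and 1 in pair_encoding.type_ids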
from transformers import BertTokenizer, TFBertModel, BertConfig

max_len = 384
configuration = BertConfig()  # default parameters and configuration for BERT

"""
## Set-up BERT tokenizer
"""

# Save the slow pretrained tokenizer
slow_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
save_path = "bert_base_uncased/"
if not os.path.exists(save_path):
    os.makedirs(save_path)
slow_tokenizer.save_pretrained(save_path)

# Load the fast tokenizer from saved file
tokenizer = BertWordPieceTokenizer("bert_base_uncased/vocab.txt", lowercase=True)

"""
## Load the data
"""

train_data_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json"
train_path = keras.utils.get_file("train.json", train_data_url)
eval_data_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json"
eval_path = keras.utils.get_file("eval.json", eval_data_url)

"""
## Preprocess the data

1. Go through the JSON file and store every record as a `SquadExample` object.
2. Go through each `SquadExample` and create `x_train, y_train, x_eval, y_eval`.
"""
)
parser.add_argument(
    "--name",
    default="bert-wordpiece",
    type=str,
    help="The name of the output vocab files",
)
args = parser.parse_args()

files = glob.glob(args.files)
if not files:
    print(f"File does not exist: {args.files}")
    exit(1)

# Initialize an empty tokenizer
tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    handle_chinese_chars=True,
    strip_accents=True,
    lowercase=True,
)

# And then train
tokenizer.train(
    files,
    vocab_size=10000,
    min_frequency=2,
    show_progress=True,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    limit_alphabet=1000,
    wordpieces_prefix="##",
)

# Save the files
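# A plausible completion of the "Save the files" step (illustrative; `args.out`
# is a hypothetical output-directory argument not shown in this excerpt).
# Older tokenizers releases spelled this `tokenizer.save(args.out, args.name)`.
tokenizer.save_model(args.out, args.name)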
# tensorboard log and graph output folder declaration
log_tensorboard_dir = output_path / "runs" / args.word_embedding_type
writer = SummaryWriter(log_tensorboard_dir)

# load datasets
train_path = Path(args.train_path)
test_path = Path(args.test_path)
eval_path = Path(args.eval_path)
train_data = Articles(train_path)
test_data = Articles(test_path)
eval_data = Articles(eval_path, index_file=args.index_file_path)
print("Data Loaded")

# initialize tokenizer from BERT library
tokenizer = BertWordPieceTokenizer(args.tokenizer_file, lowercase=True)
print("Tokenizer Initialized!")

# create and save or load dictionaries based on arguments
if args.create_dicts:
    (
        final_word_ids,
        final_url_ids,
        final_publication_ids,
    ) = dictionary.create_merged_dictionaries(
        train_data.examples, "target", args.tokenizer_file
    )
    print("Dictionaries Created")

    dict_path = Path(args.data_dir) / "dictionaries"
    if not dict_path.is_dir():
def get_tokenizer(self, path):
    tokenizer = BertWordPieceTokenizer(os.path.join(path, 'vocab.txt'))
    return tokenizer
from tokenizers import BertWordPieceTokenizer

tokenizer = BertWordPieceTokenizer()
tokenizer.train(["pg34988.txt"], vocab_size=100)

opt = tokenizer.encode("Welcome to the wonderland.")
print("Output: ")
print(opt.ids, opt.tokens, opt.offsets)
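
# Illustrative follow-up (not in the original snippet): the Encoding can be
# decoded back to text with the same tokenizer; wordpieces are merged and
# special tokens are dropped by default.
print(tokenizer.decode(opt.ids))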
from tokenizers import BertWordPieceTokenizer
import os

MAX_LEN = 64
VOCAB = "vocab.txt"
CONFIG = "config.json"
MODEL = "pytorch_model.bin"
BERT_BASE = "./models/bert_base"
TRAIN_SET = "./data/train.csv"
TEST_SET = "./data/test.csv"
CLEANED_TRAIN_SET = "./data/cleaned_train.csv"
BERT_BASE_TOKENIZER = BertWordPieceTokenizer(os.path.join(BERT_BASE, VOCAB),
                                             lowercase=True)
CHECKPOINT_FOLDER = "./checkpoints"
BERTBASEWITHOUTSENTIMENT = "BertBaseWithOutSentiment"
BERTBASEWITHSENTIMENT = "BertBaseWithSentiment"
SENT_DIS = "./figs/sent_dis.png"
SENTIMENT_DIS = "./figs/sentiment_dis.png"
SENTIMENT_CONDITION_DIS = "./figs/sentiment_condition_dis.png"
LOG_FOLDER = "./logs"
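
# Illustrative usage of the constants above (not in the original module):
# truncate to MAX_LEN and encode a sample sentence with the shared tokenizer.
BERT_BASE_TOKENIZER.enable_truncation(max_length=MAX_LEN)
sample = BERT_BASE_TOKENIZER.encode("This drug worked well for me.")
print(len(sample.ids), sample.tokens[:5])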
import torch
from tokenizers import BertWordPieceTokenizer

from amadeus_model import Amadeus

tokenizer = BertWordPieceTokenizer('data/bert-base-uncased-vocab.txt',
                                   lowercase=True)

model = Amadeus(num_tokens=tokenizer.get_vocab_size(),
                enc_seq_len=4096,
                dec_seq_len=1024)
model.load_state_dict(
    torch.load('models/amadeus-performer-2020-11-03-16.54.13.pt'))
model.eval(fix_proj_matrices=True)

in_seq = torch.randint(0, tokenizer.get_vocab_size(), (1, model.in_seq_len))
out_seq = torch.randint(0, tokenizer.get_vocab_size(), (1, model.out_seq_len))

traced_script_model = torch.jit.trace(model, (in_seq, out_seq),
                                      check_trace=False)
traced_script_model.save('traced.pt')
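
# Illustrative check (not part of the original script): the traced module can
# be reloaded without the Python class definition and run on fresh id tensors.
reloaded = torch.jit.load('traced.pt')
with torch.no_grad():
    preds = reloaded(in_seq, out_seq)
print(type(preds))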
os.environ["CUDA_VISIBLE_DEVICES"] = "-1" import numpy as np import tensorflow_hub as hub from tokenizers import BertWordPieceTokenizer import tensorflow as tf tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) from tensorflow import keras from tensorflow.keras import layers max_seq_length = 384 bert_layer = hub.KerasLayer( "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2", trainable=True) vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy().decode( "utf-8") tokenizer = BertWordPieceTokenizer(vocab=vocab_file, lowercase=True) model = keras.models.load_model('../model') def preprocess(context, question): start_token_idx = -1 end_token_idx = -1 context = " ".join(str(context).split()) question = " ".join(str(question).split()) # tokenize context and question tokenized_context = tokenizer.encode(context) tokenized_question = tokenizer.encode(question) input_ids = tokenized_context.ids + tokenized_question.ids[1:] token_type_ids = [0] * len(tokenized_context.ids) + [1] * len( tokenized_question.ids[1:])
def main(args):
    print(args)

    if args['train']:
        tokenizer = BertWordPieceTokenizer(
            clean_text=True,
            handle_chinese_chars=True,
            strip_accents=True,  # Must be False if cased model
            lowercase=True,
            wordpieces_prefix="##"
        )

        tokenizer.train(
            files=['/data2/BERT/data/naver_news/news_3_preprocessed/naver_news.txt'],
            limit_alphabet=6000,
            vocab_size=32000
        )

        print(tokenizer.save_model("../BertWordPieceTokenizer_32000"))

    elif args['test']:
        # Korean test sentence: roughly, "I use a WordPiece tokenizer.
        # I want to test whether it performs well."
        test_str = '나는 워드피스 토크나이저를 써요. 성능이 좋은지 테스트 해보려 합니다.'

        print("=========== tokenizer ===========")
        tokenizer = BertWordPieceTokenizer("../BertWordPieceTokenizer_32000/vocab.txt")
        print(tokenizer)
        encoded_str = tokenizer.encode(test_str)
        print('encoding: ', encoded_str.ids)
        decoded_str = tokenizer.decode(encoded_str.ids)
        print(decoded_str)

        print("=========== BertTokenizer ===========")
        tokenizer = BertTokenizer("../BertWordPieceTokenizer_32000/vocab.txt")
        print(tokenizer)
        encoded_str = tokenizer.encode(test_str)
        print('encoding: ', encoded_str)
        decoded_str = tokenizer.decode(encoded_str)
        print(decoded_str)

        print("=========== BertTokenizer2 ===========")
        tokenizer = BertTokenizer.from_pretrained("../BertWordPieceTokenizer_32000")
        print(tokenizer)
        encoded_str = tokenizer.encode(test_str)
        print('encoding: ', encoded_str)
        decoded_str = tokenizer.decode(encoded_str)
        print(decoded_str)
import nltk
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
import pickle

sentence_re = r'''(?x)          # set flag to allow verbose regexps
        (?:[A-Z]\.)+            # abbreviations, e.g. U.S.A.
      | \w+(?:-\w+)*            # words with optional internal hyphens
      | \$?\d+(?:\.\d+)?%?      # currency and percentages, e.g. $12.40, 82%
      | \.\.\.                  # ellipsis
      | [][.,;"'?():_`-]        # these are separate tokens; includes ], [
    '''

vocab = "D:/Word embedding/bert/assets/vocab.txt"
tokenizer = BertWordPieceTokenizer(vocab, lowercase=True)

kp_training = pd.read_json('kp/kp20k_training.json', lines=True)
kp_testing = pd.read_json('kp/kp20k_testing.json', lines=True)
kp_validation = pd.read_json('kp/kp20k_validation.json', lines=True)

max_kp = 0
min_len = 1e100
max_len = 512
all_reps = []
att_masks = []
key_positions = []
ref_positions = []

for i in range(len(kp_training[:20000])):
    text = kp_training['abstract'][i]
print(movie_reviews.columns.values)
print(movie_reviews.sentiment.unique())

y = movie_reviews["sentiment"]
y = np.array(list(map(lambda x: 1 if x == "positive" else 0, y)))

slow_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
save_path = "bert_base_uncased/"
if not os.path.exists(save_path):
    os.makedirs(save_path)
slow_tokenizer.save_pretrained(save_path)

# Load the fast tokenizer from saved file
tokenizer = BertWordPieceTokenizer("bert_base_uncased/vocab.txt", lowercase=True)
tokenizer.enable_truncation(MAX_SEQ_LEN - 2)

train_count = 40000  # 40000
test_count = 2000  #

# X_train = convert_sentences_to_features(reviews[:40000], tokenizer)
# X_test = convert_sentences_to_features(reviews[40000:], tokenizer)
X_train = convert_sentences_to_features(reviews[:train_count], tokenizer)
X_test = convert_sentences_to_features(
    reviews[train_count : train_count + test_count], tokenizer)

one_hot_encoded = to_categorical(y)
# one_hot_encoded = tf.one_hot(y, 1)

# y_train = one_hot_encoded[:40000]
#! /usr/bin/env python3

__AUTHORS__ = [("CS17BTECH11044", "YASH KHASBAGE"),
               ("CS17BTECH11029", "PUNEET MANGLA")]

import json
import pandas as pd

from tokenizers import BertWordPieceTokenizer

# Initialize an empty BERT tokenizer
tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    handle_chinese_chars=False,
    strip_accents=False,
    lowercase=True,
)

# prepare text files to train vocab on them
files = ['input.txt']

# train BERT tokenizer
tokenizer.train(files,
                vocab_size=30000,
                min_frequency=10,
                show_progress=True,
                special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'],
                limit_alphabet=1000,
                wordpieces_prefix="##")

# save vocabulary
tokenizer.save('./udc_vocab.txt')
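
# Illustrative sanity check on the freshly trained vocabulary (not part of the
# original submission; the sample utterance is made up):
sample = tokenizer.encode("how do i upgrade to the latest ubuntu release ?")
print(sample.tokens)
print(sample.ids)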
def __init__(self, sentences, bert_path, padding=140):
    self.sentences = sentences
    self.tokenizer = BertWordPieceTokenizer(f'{bert_path}/vocab.txt',
                                            lowercase=True)
    self.padding = padding
from colorama import Fore
from tokenizers import BertWordPieceTokenizer
from tqdm import tqdm
from transformers import ElectraTokenizer, ElectraForQuestionAnswering
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from Reader.Sample import Sample

model = ElectraForQuestionAnswering.from_pretrained("Reader/electra_QA").to(
    device=torch.device('cpu'))
model.load_state_dict(
    torch.load('Reader/weight_electra/weights_3.pth',
               map_location=torch.device('cpu')))
model.eval()

tokenizer = BertWordPieceTokenizer("Reader/electra_base_uncased/vocab.txt",
                                   lowercase=True)


def inference(question, paragraph):
    squad_eg = Sample(tokenizer, question, paragraph)
    squad_eg.preprocess()
    dataset_dict = {
        "input_word_ids": [],
        "input_type_ids": [],
        "input_mask": [],
        "start_token_idx": [],
        "end_token_idx": [],
    }
    if squad_eg.skip is False:
        for key in dataset_dict:
class Reader(object):
    def __init__(self,
                 bert_model: str,
                 tokenizer: BaseTokenizer = None,
                 cls: str = "[CLS]",
                 sep: str = "[SEP]",
                 threshold=6):

        self.tokenizer: BaseTokenizer = tokenizer
        self.cls = cls
        self.sep = sep
        if self.tokenizer is None:
            vocab_path: str = "tokenization/" + bert_model + ".txt"
            self.tokenizer = BertWordPieceTokenizer(
                vocab_path, lowercase="-cased" not in bert_model)
        self.threshold = threshold
        self.subword_alphabet: Optional[Alphabet] = None
        self.label_alphabet: Optional[Alphabet] = None

        self.train: Optional[List[SentInst]] = None
        self.dev: Optional[List[SentInst]] = None
        self.test: Optional[List[SentInst]] = None

    def _read_file(self, filename: str, mode: str = 'train') -> List[SentInst]:
        sent_list = []
        max_len = 0
        num_thresh = 0
        with open(filename, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line == "":  # last few blank lines
                    break

                raw_tokens = line.split(' ')
                tokens = raw_tokens
                chars = [list(t) for t in raw_tokens]

                entities = next(f).strip()
                if entities == "":  # no entities
                    sent_inst = SentInst(tokens, chars, [])
                else:
                    entity_list = []
                    entities = entities.split("|")
                    for item in entities:
                        pointers, label = item.split()
                        pointers = pointers.split(",")
                        if int(pointers[1]) > len(tokens):
                            pdb.set_trace()
                        span_len = int(pointers[1]) - int(pointers[0])
                        if span_len < 0:
                            print("Warning! span_len < 0")
                            continue
                        if span_len > max_len:
                            max_len = span_len
                        if span_len > self.threshold:
                            num_thresh += 1

                        new_entity = (int(pointers[0]), int(pointers[1]), label)
                        # may be duplicate entities in some datasets
                        if (mode == 'train' and new_entity not in entity_list) or (mode != 'train'):
                            entity_list.append(new_entity)

                    # assert len(entity_list) == len(set(entity_list)) # check duplicate
                    sent_inst = SentInst(tokens, chars, entity_list)
                assert next(f).strip() == ""  # separating line

                sent_list.append(sent_inst)
        print("Max length: {}".format(max_len))
        print("Threshold {}: {}".format(self.threshold, num_thresh))
        return sent_list

    def _gen_dic(self) -> None:
        label_set = set()

        for sent_list in [self.train, self.dev, self.test]:
            num_mention = 0
            for sentInst in sent_list:
                for entity in sentInst.entities:
                    label_set.add(entity[2])
                num_mention += len(sentInst.entities)
            print("# mentions: {}".format(num_mention))

        vocab = [
            self.tokenizer.id_to_token(idx)
            for idx in range(self.tokenizer.get_vocab_size())
        ]
        self.subword_alphabet = Alphabet(vocab, 0)
        self.label_alphabet = Alphabet(label_set, 0)

    @staticmethod
    def _pad_batches(input_ids_batches: List[List[List[int]]],
                     first_subtokens_batches: List[List[List[int]]]) \
            -> Tuple[List[List[List[int]]],
                     List[List[List[int]]],
                     List[List[List[bool]]]]:

        padded_input_ids_batches = []
        input_mask_batches = []
        mask_batches = []

        all_batches = list(zip(input_ids_batches, first_subtokens_batches))
        for input_ids_batch, first_subtokens_batch in all_batches:

            batch_len = len(input_ids_batch)
            max_subtokens_num = max(
                [len(input_ids) for input_ids in input_ids_batch])
            max_sent_len = max([
                len(first_subtokens)
                for first_subtokens in first_subtokens_batch
            ])

            padded_input_ids_batch = []
            input_mask_batch = []
            mask_batch = []

            for i in range(batch_len):
                subtokens_num = len(input_ids_batch[i])
                sent_len = len(first_subtokens_batch[i])

                padded_subtoken_vec = input_ids_batch[i].copy()
                padded_subtoken_vec.extend([0] * (max_subtokens_num - subtokens_num))
                input_mask = [1] * subtokens_num + [0] * (max_subtokens_num - subtokens_num)
                mask = [True] * sent_len + [False] * (max_sent_len - sent_len)

                padded_input_ids_batch.append(padded_subtoken_vec)
                input_mask_batch.append(input_mask)
                mask_batch.append(mask)

            padded_input_ids_batches.append(padded_input_ids_batch)
            input_mask_batches.append(input_mask_batch)
            mask_batches.append(mask_batch)

        return padded_input_ids_batches, input_mask_batches, mask_batches

    def get_batches(self, sentences: List[SentInst], batch_size: int) -> Tuple:
        subtoken_dic_dic = defaultdict(lambda: defaultdict(list))
        first_subtoken_dic_dic = defaultdict(lambda: defaultdict(list))
        last_subtoken_dic_dic = defaultdict(lambda: defaultdict(list))
        label_dic_dic = defaultdict(lambda: defaultdict(list))

        this_input_ids_batches = []
        this_first_subtokens_batches = []
        this_last_subtokens_batches = []
        this_label_batches = []

        for sentInst in sentences:
            subtoken_vec = []
            first_subtoken_vec = []
            last_subtoken_vec = []
            subtoken_vec.append(self.tokenizer.token_to_id(self.cls))
            for t in sentInst.tokens:
                encoding = self.tokenizer.encode(t)
                ids = [
                    v for v, mask in zip(encoding.ids,
                                         encoding.special_tokens_mask)
                    if mask == 0
                ]
                first_subtoken_vec.append(len(subtoken_vec))
                subtoken_vec.extend(ids)
                last_subtoken_vec.append(len(subtoken_vec))
            subtoken_vec.append(self.tokenizer.token_to_id(self.sep))

            label_list = [(u[0], u[1], self.label_alphabet.get_index(u[2]))
                          for u in sentInst.entities]

            subtoken_dic_dic[len(
                sentInst.tokens)][len(subtoken_vec)].append(subtoken_vec)
            first_subtoken_dic_dic[len(
                sentInst.tokens)][len(subtoken_vec)].append(first_subtoken_vec)
            last_subtoken_dic_dic[len(
                sentInst.tokens)][len(subtoken_vec)].append(last_subtoken_vec)
            label_dic_dic[len(
                sentInst.tokens)][len(subtoken_vec)].append(label_list)

        input_ids_batches = []
        first_subtokens_batches = []
        last_subtokens_batches = []
        label_batches = []
        for length1 in sorted(subtoken_dic_dic.keys(), reverse=True):
            for length2 in sorted(subtoken_dic_dic[length1].keys(), reverse=True):
                input_ids_batches.extend(subtoken_dic_dic[length1][length2])
                first_subtokens_batches.extend(
                    first_subtoken_dic_dic[length1][length2])
                last_subtokens_batches.extend(
                    last_subtoken_dic_dic[length1][length2])
                label_batches.extend(label_dic_dic[length1][length2])

        [
            this_input_ids_batches.append(input_ids_batches[i:i + batch_size])
            for i in range(0, len(input_ids_batches), batch_size)
        ]
        [
            this_first_subtokens_batches.append(
                first_subtokens_batches[i:i + batch_size])
            for i in range(0, len(first_subtokens_batches), batch_size)
        ]
        [
            this_last_subtokens_batches.append(
                last_subtokens_batches[i:i + batch_size])
            for i in range(0, len(last_subtokens_batches), batch_size)
        ]
        [
            this_label_batches.append(label_batches[i:i + batch_size])
            for i in range(0, len(label_batches), batch_size)
        ]

        this_input_ids_batches, this_input_mask_batches, this_mask_batches \
            = self._pad_batches(this_input_ids_batches, this_first_subtokens_batches)

        return (this_input_ids_batches, this_input_mask_batches,
                this_first_subtokens_batches, this_last_subtokens_batches,
                this_label_batches, this_mask_batches)

    def to_batch(self, batch_size: int) -> Tuple:
        ret_list = []
        for sent_list in [self.train, self.dev, self.test]:
            ret_list.append(self.get_batches(sent_list, batch_size))
        return tuple(ret_list)

    def read_all_data(self, file_path: str, train_file: str, dev_file: str,
                      test_file: str) -> None:
        self.train = self._read_file(file_path + train_file)
        self.dev = self._read_file(file_path + dev_file, mode='dev')
        self.test = self._read_file(file_path + test_file, mode='test')
        self._gen_dic()

    def debug_single_sample(self, subtoken: List[int],
                            label_list: List[Tuple[int, int, int]]) -> None:
        print(" ".join(
            [self.subword_alphabet.get_instance(t) for t in subtoken]))
        for label in label_list:
            print(label[0], label[1],
                  self.label_alphabet.get_instance(label[2]))
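
# A minimal usage sketch of the Reader above (illustrative; the data directory,
# file names and batch size are hypothetical, and SentInst/Alphabet come from
# the surrounding project):
if __name__ == "__main__":
    reader = Reader(bert_model="bert-base-uncased", threshold=6)
    reader.read_all_data("data/nested_ner/", "train.txt", "dev.txt", "test.txt")
    train_batches, dev_batches, test_batches = reader.to_batch(batch_size=16)
    print(len(train_batches[0]), "training batches prepared")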
""" Parse boolean arguments from the command line. """ if s.lower() in ['off', 'false', '0']: return False if s.lower() in ['on', 'true', '1']: return True raise argparse.ArgumentTypeError("invalid value for a boolean flag (0 or 1)") if __name__ == '__main__': parser = argparse.ArgumentParser(description='Unsupervised training') parser.add_argument("--model_path", type=str, default="", help="Pretrained tokenizer path") parser.add_argument("--do_lower_case", type=bool_flag, default=False, help="do_lower_case") parser.add_argument("--input_file", type=str, default="", help="Input file to be tokenized") parser.add_argument("--output_file", type=str, default="", help="Output tokenized file") args = parser.parse_args() tokenizer = BertWordPieceTokenizer(args.model_path, lowercase=args.do_lower_case) count = 0 with open(args.input_file, 'r', encoding='utf-8') as fin: with open(args.output_file, 'w', encoding='utf-8') as fout: for line in fin: if len(line.strip())>0: output = tokenizer.encode(line.strip()) fout.write(' '.join(output.tokens)+'\n') else: fout.write('\n') count += 1 if count % 1000 == 0: fout.flush() print('%d sentences tokenized!'%count)
import os
from tokenizers import BertWordPieceTokenizer
from pathlib import Path

save_dir = "vocab"
paths = [
    str(x) for x in Path("/home/phmay/data/nlp/corpus/ready/").glob("*.txt")
]
print(paths)
vocab_size = 32_767  # 2^15-1
min_frequency = 2

os.makedirs(save_dir, exist_ok=True)

special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
for i in range(767 - 5):
    special_tokens.append('[unused{}]'.format(i))

# https://github.com/huggingface/tokenizers/blob/04fb9e4ebe785a6b2fd428766853eb27ee894645/bindings/python/tokenizers/implementations/bert_wordpiece.py#L11
tokenizer = BertWordPieceTokenizer(strip_accents=False)

tokenizer.train(
    files=paths,
    vocab_size=vocab_size,
    min_frequency=min_frequency,
    special_tokens=special_tokens,
)

tokenizer.save_model(save_dir)
tokenizer.save(save_dir + "/tokenizer.json")
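
# Illustrative reload check (not in the original script): the serialized
# tokenizer.json can be loaded back with the generic Tokenizer class and
# should report a vocabulary size close to the requested 32_767.
from tokenizers import Tokenizer

reloaded = Tokenizer.from_file(save_dir + "/tokenizer.json")
print(reloaded.get_vocab_size())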
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    processor = PextProcessor()
    label_list = processor.get_labels()

    # load tokenizer
    tokenizer = BertWordPieceTokenizer(FLAGS.vocab_file,
                                       add_special_tokens=False)
    tokenizer.no_padding()
    tokenizer.no_truncation()

    # Create Estimator
    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    params = {
        "bert_config": bert_config,
        "num_labels": len(label_list),
        "init_checkpoint": FLAGS.init_checkpoint,
        "learning_rate": FLAGS.learning_rate,
        "num_train_steps": num_train_steps,
        "num_warmup_steps": num_warmup_steps
    }
    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       model_dir=FLAGS.output_dir,
                                       params=params)

    if FLAGS.do_train:
        tf.gfile.MakeDirs(FLAGS.output_dir)

        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        file_based_convert_examples_to_features(train_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, train_file)

        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)

        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            batch_size=FLAGS.train_batch_size)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

        # save config
        out_config_file = os.path.join(FLAGS.output_dir, 'config.json')
        with tf.gfile.GFile(out_config_file, "w") as writer:
            writer.write(bert_config.to_json_string())

        # save tokenizer
        tokenizer.save(FLAGS.output_dir)

    if FLAGS.do_eval:
        # load last ckpt
        ckpt_prefix = get_last_ckpt_prefix(FLAGS.output_dir)
        params = {
            'bert_config':
            modeling.BertConfig.from_json_file(
                os.path.join(FLAGS.output_dir, 'config.json')),
            'num_labels':
            len(label_list),
            'init_checkpoint':
            get_last_ckpt_prefix(FLAGS.output_dir),
        }
        estimator = tf.estimator.Estimator(model_fn=model_fn, params=params)

        # prepare data
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        #file_based_convert_examples_to_features(
        #    eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file)

        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d", len(eval_examples))
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            batch_size=FLAGS.eval_batch_size)

        result = estimator.predict(input_fn=eval_input_fn,
                                   yield_single_examples=False)

        pred_ids, label_ids = [], []
        for prediction in result:
            pred_ids.append(prediction['pred_ids'])
            label_ids.append(prediction['label_ids'])
        pred_ids = np.concatenate(pred_ids, axis=0)
        label_ids = np.concatenate(label_ids, axis=0)

        y_pred = [[] for _ in range(label_ids.shape[0])]
        y_true = [[] for _ in range(label_ids.shape[0])]
        pad_token_label_id = 0
        for i in range(label_ids.shape[0]):
            for j in range(label_ids.shape[1]):
                if label_ids[i, j] != pad_token_label_id:
                    y_pred[i].append(label_list[pred_ids[i, j]])
                    y_true[i].append(label_list[label_ids[i, j]])

        report = metrics.classification_report(y_true, y_pred, digits=4)
        precision = metrics.precision_score(y_true, y_pred)
        recall = metrics.recall_score(y_true, y_pred)
        f1 = metrics.f1_score(y_true, y_pred)

        tf.logging.info("Eval result: \n" + report)
        out_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(out_eval_file, "w") as writer:
            writer.write(f'precision = {precision: f} \n')
            writer.write(f'recall = {recall: f} \n')
            writer.write(f'f1 = {f1 :f} \n')

    if FLAGS.do_predict:
        pass
    type=int,
    required=True,
    help='Vocabulary size',
)
args = parser.parse_args()

files = glob.glob(args.files)
if not files:
    logger.info(f"File does not exist: {args.files}")
    exit(1)

# CHINESE CHARACTERS???!!!
# Initialize an empty tokenizer
tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    handle_chinese_chars=False,
    strip_accents=True,
    lowercase=True,
)

# And then train
trainer = tokenizer.train(
    files,
    vocab_size=10000,
    min_frequency=2,
    show_progress=True,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    limit_alphabet=1000,
    wordpieces_prefix="##",
)

# Save the files
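# A plausible completion of the "Save the files" step (illustrative; the
# output directory name "vocab_out" is hypothetical, not from the source):
os.makedirs("vocab_out", exist_ok=True)
tokenizer.save_model("vocab_out")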
import argparse

from tokenizers import BertWordPieceTokenizer

parser = argparse.ArgumentParser()
parser.add_argument("--corpus_file", type=str, default="../data/namuwiki.txt")
parser.add_argument("--vocab_size", type=int, default=22000)
parser.add_argument("--limit_alphabet", type=int, default=6000)

args = parser.parse_args()

tokenizer = BertWordPieceTokenizer(
    vocab_file=None,
    clean_text=True,
    handle_chinese_chars=True,
    strip_accents=False,  # Must be False if cased model
    lowercase=False,
    wordpieces_prefix="##")

tokenizer.train(files=[args.corpus_file],
                limit_alphabet=args.limit_alphabet,
                vocab_size=args.vocab_size)

tokenizer.save(
    "./ch-{}-wpm-{}-pretty".format(args.limit_alphabet, args.vocab_size),
    True)
class config:
    MAX_LEN = 128
    TOKENIZER = BertWordPieceTokenizer(
        '/home/koushik/Documents/Pretrained Models/bert-base-uncased/vocab.txt'
    )
    BERT_PATH = '/home/koushik/Documents/Pretrained Models/bert-base-uncased'
def __init__(self, args: argparse.Namespace):
    """Initialize a model, tokenizer and config."""
    super().__init__()
    if isinstance(args, argparse.Namespace):
        self.save_hyperparameters(args)
        self.args = args
    else:
        # eval mode
        TmpArgs = namedtuple("tmp_args", field_names=list(args.keys()))
        self.args = args = TmpArgs(**args)

    # self.bert_dir = args.bert_config_dir
    self.data_dir = self.args.data_dir
    self.bert_config_dir = BERT_DIR[self.args.model]
    self.tokenizer = BertWordPieceTokenizer(vocab=self.bert_config_dir +
                                            '/vocab.txt')
    if self.args.model == 'BERTMRC':
        self.tokenizer = BertWordPieceTokenizer(
            vocab=self.bert_config_dir + '/vocab.txt')
        bert_config = BertQueryNerConfig.from_pretrained(
            self.bert_config_dir,
            hidden_dropout_prob=args.bert_dropout,
            attention_probs_dropout_prob=args.bert_dropout,
            mrc_dropout=args.mrc_dropout)
        self.model = BERTModel[self.args.model].from_pretrained(
            self.bert_config_dir, config=bert_config)
    else:
        # self.tokenizer = AutoTokenizer.from_pretrained(self.bert_config_dir, do_lower_case=True)
        # self.model = BertForQuestionAnswering.from_pretrained(self.bert_config_dir)
        self.model = BERTModel[self.args.model](self.bert_config_dir, self.args)

    logging.info(str(self.model))
    logging.info(
        str(args.__dict__ if isinstance(args, argparse.ArgumentParser) else args))
    # self.ce_loss = CrossEntropyLoss(reduction="none")
    self.loss_type = args.loss_type
    # self.loss_type = "bce"
    if self.loss_type == "bce":
        self.bce_loss = BCEWithLogitsLoss(reduction="none")
    else:
        self.dice_loss = DiceLoss(with_logits=True, smooth=args.dice_smooth)
    # todo(yuxian): since the match loss is O(n^2), the loss rate weights should be adjusted accordingly
    weight_sum = args.weight_start + args.weight_end + args.weight_span
    self.weight_start = args.weight_start / weight_sum
    self.weight_end = args.weight_end / weight_sum
    self.weight_span = args.weight_span / weight_sum
    self.flat_ner = args.flat
    self.span_f1 = QuerySpanF1(flat=self.flat_ner)
    self.chinese = args.chinese
    self.optimizer = args.optimizer
    self.span_loss_candidates = args.span_loss_candidates

    self.dataset_train, self.dataset_valid, self.dataset_test = get_dataloader(
        args.tgt_domain,
        args.n_samples,
        args.batch_size,
        self.tokenizer,
        query_type=self.args.query_type)
data_csv = "./data/ner_dataset.csv" """ ## Setup Tokenizers """ # Save the slow pretrained tokenizer slow_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") # save_path = "D:/spyder/tf_torch_model/torch_model/bert_base_uncased/" if not os.path.exists(save_path): os.makedirs(save_path) slow_tokenizer.save_pretrained(save_path) # Load the fast tokenizer from saved file # "bert_base_uncased/vocab.txt" tokenizer = BertWordPieceTokenizer("D:/spyder/tf_torch_model/torch_model/bert_base_uncased/vocab.txt", lowercase=True) """ ## Define model """ loss_object = tf.keras.losses.SparseCategoricalCrossentropy( from_logits=False, reduction=tf.keras.losses.Reduction.NONE ) def masked_ce_loss(real, pred): mask = tf.math.logical_not(tf.math.equal(real, 17)) loss_ = loss_object(real, pred) mask = tf.cast(mask, dtype=loss_.dtype)
def tokenizer(
        self) -> Union[BaseTokenizer, CountVectorizer, TfidfVectorizer]:
    pkl_path = os.path.join(self.tokenizer_path, "model.pkl")
    if self._tokenizer is not None:
        return self._tokenizer
    ### get pickled tokenizer
    if os.path.exists(pkl_path) and not self.retrain_tokenizer:
        with open(pkl_path, 'rb') as f:
            tokenizer = pickle.load(f)
    ### train new tokenizer
    else:
        self.retrain_tokenizer = False
        if self.algorithm == 'bert':
            from tokenizers import BertWordPieceTokenizer
            tokenizer = BertWordPieceTokenizer(
                vocab_file=None if self._init_vocabulary is None else
                os.path.join(self.cache_path, "bert_vocab.txt"))
            tokenizer.enable_truncation(max_length=self.max_length)
            tokenizer.enable_padding(length=self.max_length)
            # train the tokenizer
            if self._init_vocabulary is None:
                path = os.path.join(self.cache_path, 'train.txt')
                with open(path, 'w') as f:
                    for i in chain(self.train_text, self.valid_text,
                                   self.test_text):
                        if len(i) == 0:
                            continue
                        f.write(i + "\n" if i[-1] != "\n" else i)
                tokenizer.train(files=path,
                                vocab_size=self.vocab_size,
                                min_frequency=self.min_frequency,
                                limit_alphabet=self.limit_alphabet,
                                show_progress=True)
                tokenizer.save_model(self.tokenizer_path)
        elif self.algorithm in ('count', 'tf', 'tfidf'):
            if self.algorithm == 'count':
                tokenizer = CountVectorizer(input='content',
                                            ngram_range=self.ngram_range,
                                            min_df=self.min_frequency,
                                            max_df=self.max_frequency,
                                            max_features=self.vocab_size,
                                            vocabulary=self._init_vocabulary,
                                            tokenizer=_simple_tokenizer,
                                            stop_words='english')
            elif self.algorithm in ('tf', 'tfidf'):
                tokenizer = TfidfVectorizer(
                    input='content',
                    ngram_range=self.ngram_range,
                    min_df=self.min_frequency,
                    max_df=self.max_frequency,
                    max_features=self.vocab_size,
                    stop_words='english',
                    vocabulary=self._init_vocabulary,
                    tokenizer=_simple_tokenizer,
                    use_idf=False if self.algorithm == 'tf' else True)
            tokenizer.fit((_simple_preprocess(i) for i in chain(
                self.train_text, self.valid_text, self.test_text)))
        else:
            raise NotImplementedError
        # save the pickled model
        with open(pkl_path, "wb") as f:
            pickle.dump(tokenizer, f)
    ### assign and return
    self._tokenizer = tokenizer
    return self._tokenizer
# plot_list_histogram(ocr_lengths, 'OCR')
# print_statistics(ocr_lengths, 'OCR')

# plot_list_histogram(gs_lengths, 'GS')
# print_statistics(gs_lengths, 'GS')


# In[5]:

full_ocr_tokens_path = os.path.join('results', 'combined_ocr_tokens.pickle')
full_gs_tokens_path = os.path.join('results', 'combined_gs_tokens.pickle')

vocab_path = os.path.join('vocabularies', 'bert-base-cased-vocab.txt')
tokenizer = BertWordPieceTokenizer(vocab_path)

if not os.path.exists(full_ocr_tokens_path) or not os.path.exists(full_gs_tokens_path):
    ocr_tokens = []
    gs_tokens = []
    for i in range(len(ocr_file_data)):
        current_ids = tokenizer.encode(ocr_file_data[i]).ids
        if len(current_ids) > 2000:
            continue

        ocr_tokens.append(current_ids)
        gs_tokens.append(tokenizer.encode(gs_file_data[i]).ids)

    with open(full_ocr_tokens_path, 'wb') as ocr_handle:
        pickle.dump(ocr_tokens, ocr_handle, protocol=-1)
# Random seed define
SEED_NUM = 1234
tf.random.set_seed(SEED_NUM)
np.random.seed(SEED_NUM)

# Save the slow pretrained tokenizer
slow_tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased",
                                               lowercase=False)
save_path = "bert-base-multilingual-cased/"
if not os.path.exists(save_path):
    os.makedirs(save_path)
slow_tokenizer.save_pretrained(save_path)

# Load the fast tokenizer from saved file
tokenizer = BertWordPieceTokenizer("bert-base-multilingual-cased/vocab.txt",
                                   lowercase=False)

# get dataset
train_data_url = "https://korquad.github.io/dataset/KorQuAD_v1.0_train.json"
train_path = keras.utils.get_file("train.json", train_data_url)
eval_data_url = "https://korquad.github.io/dataset/KorQuAD_v1.0_dev.json"
eval_path = keras.utils.get_file("eval.json", eval_data_url)

wget.download(
    'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json',
    out='./bert-base-multilingual-cased/')
os.rename(
    './bert-base-multilingual-cased/bert-base-multilingual-cased-config.json',
    './bert-base-multilingual-cased/config.json')
wget.download(
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer

if __name__ == '__main__':

    # =============================================================================
    # Bert Tokenizer Format
    # =============================================================================
    from tokenizers import BertWordPieceTokenizer

    # Initialize an empty BERT tokenizer
    tokenizer = BertWordPieceTokenizer(
        clean_text=False,
        handle_chinese_chars=False,
        strip_accents=False,
        lowercase=True,
    )

    # prepare text files to train vocab on them
    files = ['data/merged_CC.txt', 'data/merged_wiki.txt']

    # train BERT tokenizer
    tokenizer.train(
        files,
        vocab_size=50000,
        min_frequency=2,
        show_progress=True,
        special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'],
        limit_alphabet=1000,
        wordpieces_prefix="##",
    )
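
    # Illustrative completion (not in the source, which is truncated here):
    # persist the trained WordPiece vocabulary; the directory and prefix
    # "cc_wiki_wordpiece" are hypothetical.
    tokenizer.save_model("data", "cc_wiki_wordpiece")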
import os

# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import tensorflow as tf
import traceback
from keras.backend import set_session
import numpy as np
from tokenizers import BertWordPieceTokenizer
from keras.models import load_model
from keras_bert import get_custom_objects
from flask import Flask
from flask import request, jsonify

max_len = 300

# Load the fast tokenizer from saved file
tokenizer = BertWordPieceTokenizer("chinese_L-12_H-768_A-12/vocab.txt",
                                   lowercase=True)


class SquadExample:
    def __init__(self, question, context, start_char_idx, answer_text,
                 all_answers):
        self.question = question
        self.context = context
        self.start_char_idx = start_char_idx
        self.answer_text = answer_text
        self.all_answers = all_answers
        self.skip = False

    def preprocess(self):
        context = self.context
        question = self.question