def test_full_tokenizer(self):
    """Round-trip the WordPiece tokenizer against a tiny hand-built vocabulary."""
    vocab_tokens = [
        "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed",
        "wa", "un", "runn", "##ing", ",", "low", "lowest",
    ]
    with TemporaryDirectory() as tmpdirname:
        vocab_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
        with open(vocab_file, "w", encoding="utf-8") as vocab_writer:
            vocab_writer.write("".join(token + "\n" for token in vocab_tokens))

        input_text = "UNwant\u00E9d,running"
        output_text = "unwanted, running"
        # Shared save/load/encode checks for all tokenizer implementations.
        create_and_check_tokenizer_commons(
            self, input_text, output_text, BertTokenizer, tmpdirname)

        tokenizer = BertTokenizer(vocab_file)
        pieces = tokenizer.tokenize("UNwant\u00E9d,running")
        self.assertListEqual(pieces, ["un", "##want", "##ed", ",", "runn", "##ing"])
        self.assertListEqual(
            tokenizer.convert_tokens_to_ids(pieces), [7, 4, 5, 10, 8, 9])
def test_full_tokenizer(self):
    """Tokenize a mixed-case accented string; check the pieces and their ids."""
    tokenizer = BertTokenizer(self.vocab_file)
    pieces = tokenizer.tokenize(u"UNwant\u00E9d,running")
    expected_pieces = ["un", "##want", "##ed", ",", "runn", "##ing"]
    self.assertListEqual(pieces, expected_pieces)
    expected_ids = [7, 4, 5, 10, 8, 9]
    self.assertListEqual(tokenizer.convert_tokens_to_ids(pieces), expected_ids)
def encode_documents(documents: list, tokenizer: "BertTokenizer", max_input_length=512):
    """Encode a batch of documents for the document-BERT architectures.

    Each document is tokenized and split into chunks of at most
    ``max_input_length - 2`` wordpieces, leaving room for [CLS] and [SEP].

    :param documents: a list of text documents
    :param tokenizer: tokenizer exposing ``tokenize`` and ``convert_tokens_to_ids``
        (e.g. the sentence-piece BERT tokenizer)
    :param max_input_length: maximum BERT sequence length per chunk
    :return: tuple of
        - a ``len(documents) x max_sequences_per_document x 3 x max_input_length``
          LongTensor stacking input ids / token type ids / attention masks,
        - a LongTensor with the number of sequences generated per document.
    """
    body_length = max_input_length - 2  # room for the [CLS]/[SEP] specials
    tokenized_documents = [tokenizer.tokenize(document) for document in documents]
    max_sequences_per_document = math.ceil(
        max(len(x) / body_length for x in tokenized_documents))
    assert max_sequences_per_document <= 300, "Your document is too large, arbitrary size when writing"

    # BUGFIX: the tensor width and the length asserts below previously
    # hard-coded 512, which broke any call with max_input_length != 512.
    output = torch.zeros(
        size=(len(documents), max_sequences_per_document, 3, max_input_length),
        dtype=torch.long)
    document_seq_lengths = []  # number of sequences generated per document
    for doc_index, tokenized_document in enumerate(tokenized_documents):
        max_seq_index = 0
        for seq_index, i in enumerate(range(0, len(tokenized_document), body_length)):
            raw_tokens = tokenized_document[i:i + body_length]
            tokens = ["[CLS]"] + raw_tokens + ["[SEP]"]
            input_type_ids = [0] * len(tokens)
            input_ids = tokenizer.convert_tokens_to_ids(tokens)
            attention_masks = [1] * len(input_ids)
            # Zero-pad every chunk up to the full sequence length.
            while len(input_ids) < max_input_length:
                input_ids.append(0)
                input_type_ids.append(0)
                attention_masks.append(0)
            assert (len(input_ids) == max_input_length
                    and len(attention_masks) == max_input_length
                    and len(input_type_ids) == max_input_length)
            # Rows: 0 = input ids, 1 = token type ids, 2 = attention mask.
            output[doc_index][seq_index] = torch.cat(
                (torch.LongTensor(input_ids).unsqueeze(0),
                 torch.LongTensor(input_type_ids).unsqueeze(0),
                 torch.LongTensor(attention_masks).unsqueeze(0)), dim=0)
            max_seq_index = seq_index
        document_seq_lengths.append(max_seq_index + 1)
    return output, torch.LongTensor(document_seq_lengths)
def __init__(self):
    """Load the fine-tuned model's tokenizer and choose a compute device."""
    self.model_path = 'output/model'
    #self.processor = ATEPCProcessor()
    #self.labels = self.processor.get_labels()
    #self.n_class = len(self.labels)
    self.tokenizer = BertTokenizer.from_pretrained('./output/model/vocab.txt')
    gpu_available = torch.cuda.is_available()
    self.device = torch.device("cuda:5" if gpu_available else 'cpu')
def __init__(self, params, shared=None):
    """Set up device, tokenizer and the bi-encoder model from ``params``."""
    super(BiEncoderRanker, self).__init__()
    self.params = params
    self.device = torch.device(
        "cuda" if torch.cuda.is_available() and not params["no_cuda"] else "cpu")
    self.n_gpu = torch.cuda.device_count()
    # init tokenizer
    self.NULL_IDX = 0
    self.START_TOKEN = "[CLS]"
    self.END_TOKEN = "[SEP]"
    vocab_path = os.path.join(params["bert_model"], 'vocab.txt')
    if os.path.isfile(vocab_path):
        print(f"Found tokenizer vocabulary at {vocab_path}")
    # NOTE(review): the isfile() test is repeated below; the else branch falls
    # back to the model name/path when no local vocab file exists.
    self.tokenizer = BertTokenizer.from_pretrained(
        vocab_path if os.path.isfile(vocab_path) else params["bert_model"],
        do_lower_case=params["lowercase"])
    # init model
    self.build_model()
    # Path to pytorch_model.bin for the biencoder model (not the pre-trained BERT model)
    model_path = params.get("path_to_biencoder_model")
    if model_path is None:
        # Fall back to the generic model-path key.
        model_path = params.get("path_to_model")
    if model_path is not None:
        self.load_model(model_path)
    self.model = self.model.to(self.device)
    self.data_parallel = params.get("data_parallel")
    if self.data_parallel:
        self.model = torch.nn.DataParallel(self.model)
def __init__(self, params, shared=None):
    """Build the bi-encoder ranker: device, tokenizer, model, optional DataParallel."""
    super(BiEncoderRanker, self).__init__()
    self.params = params
    use_cuda = torch.cuda.is_available() and not params["no_cuda"]
    self.device = torch.device("cuda" if use_cuda else "cpu")
    self.n_gpu = torch.cuda.device_count()
    # Tokenizer plus the special-token bookkeeping used elsewhere in the ranker.
    self.NULL_IDX = 0
    self.START_TOKEN = "[CLS]"
    self.END_TOKEN = "[SEP]"
    self.tokenizer = BertTokenizer.from_pretrained(
        params["bert_model"], do_lower_case=params["lowercase"])
    # Build the model, then optionally restore saved weights.
    self.build_model()
    model_path = params.get("path_to_model", None)
    if model_path is not None:
        self.load_model(
            model_path,
            cand_enc_only=params.get("load_cand_enc_only", False),
        )
    self.model = self.model.to(self.device)
    # todo
    self.data_parallel = params.get("data_parallel")
    if self.data_parallel:
        self.model = torch.nn.DataParallel(self.model)
def main():
    """Pregenerate BERT pre-training data from a blank-line-delimited corpus.

    Reads the corpus (one sentence per line, documents separated by blank
    lines), tokenizes it, and writes one training file per epoch.
    """
    parser = ArgumentParser()
    parser.add_argument('--train_corpus', type=Path, required=True)
    parser.add_argument("--output_dir", type=Path, required=True)
    parser.add_argument("--bert_model", type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--do_lower_case", action="store_true")
    parser.add_argument("--do_whole_word_mask", action="store_true",
                        help="Whether to use whole word masking rather than per-WordPiece masking.")
    parser.add_argument("--reduce_memory", action="store_true",
                        help="Reduce memory usage for large datasets by keeping data on disc rather than in memory")
    parser.add_argument("--num_workers", type=int, default=1,
                        help="The number of workers to use to write the files")
    parser.add_argument("--epochs_to_generate", type=int, default=3,
                        help="Number of epochs of data to pregenerate")
    parser.add_argument("--max_seq_len", type=int, default=128)
    parser.add_argument("--short_seq_prob", type=float, default=0.1,
                        help="Probability of making a short sentence as a training example")
    parser.add_argument("--masked_lm_prob", type=float, default=0.15,
                        help="Probability of masking each token for the LM task")
    parser.add_argument("--max_predictions_per_seq", type=int, default=20,
                        help="Maximum number of tokens to mask in each sequence")
    args = parser.parse_args()

    # The on-disc (reduce_memory) database cannot be shared across workers.
    if args.num_workers > 1 and args.reduce_memory:
        raise ValueError("Cannot use multiple workers while reducing memory")

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
    vocab_list = list(tokenizer.vocab.keys())
    with DocumentDatabase(reduce_memory=args.reduce_memory) as docs:
        with args.train_corpus.open() as f:
            doc = []
            for line in tqdm(f, desc="Loading Dataset", unit=" lines"):
                line = line.strip()
                if line == "":
                    # Blank line: end of the current document.
                    docs.add_document(doc)
                    doc = []
                else:
                    tokens = tokenizer.tokenize(line)
                    doc.append(tokens)
            if doc:
                docs.add_document(doc)  # If the last doc didn't end on a newline, make sure it still gets added
        if len(docs) <= 1:
            exit("ERROR: No document breaks were found in the input file! These are necessary to allow the script to "
                 "ensure that random NextSentences are not sampled from the same document. Please add blank lines to "
                 "indicate breaks between documents in your input file. If your dataset does not contain multiple "
                 "documents, blank lines can be inserted at any natural boundary, such as the ends of chapters, "
                 "sections or paragraphs.")

        args.output_dir.mkdir(exist_ok=True)
        if args.num_workers > 1:
            # Fan out epoch-file writing across a worker pool.
            writer_workers = Pool(min(args.num_workers, args.epochs_to_generate))
            arguments = [(docs, vocab_list, args, idx) for idx in range(args.epochs_to_generate)]
            writer_workers.starmap(create_training_file, arguments)
        else:
            for epoch in trange(args.epochs_to_generate, desc="Epoch"):
                create_training_file(docs, vocab_list, args, epoch)
def load_dataset(task_cfg, split):
    """Return {'eval': loader} for the given task config (only 'eval' is allowed)."""
    assert (split == "eval")
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
    return {split: get_loader(task_cfg, tokenizer, split)}
def __init__(self, params, shared=None):
    """Cross-encoder ranker: tokenizer with entity-marker tokens, model, device."""
    super(CrossEncoderRanker, self).__init__()
    self.params = params
    cuda_ok = torch.cuda.is_available() and not params["no_cuda"]
    self.device = torch.device("cuda" if cuda_ok else "cpu")
    self.n_gpu = torch.cuda.device_count()
    if params.get("roberta"):
        self.tokenizer = RobertaTokenizer.from_pretrained(params["bert_model"],)
    else:
        self.tokenizer = BertTokenizer.from_pretrained(
            params["bert_model"], do_lower_case=params["lowercase"])
    # Register the entity-marker tags as additional special tokens so the
    # tokenizer never splits them.
    self.tokenizer.add_special_tokens({
        "additional_special_tokens": [ENT_START_TAG, ENT_END_TAG, ENT_TITLE_TAG],
    })
    self.NULL_IDX = self.tokenizer.pad_token_id
    self.START_TOKEN = self.tokenizer.cls_token
    self.END_TOKEN = self.tokenizer.sep_token
    # init model
    self.build_model()
    if params["path_to_model"] is not None:
        self.load_model(params["path_to_model"])
    self.model = self.model.to(self.device)
    self.data_parallel = params.get("data_parallel")
    if self.data_parallel:
        self.model = torch.nn.DataParallel(self.model)
def __init__(self, args=None, labels=None, device='cuda', bert_model_path='bert-base-uncased',
             architecture="DocumentBertLSTM", batch_size=10, bert_batch_size=7,
             learning_rate=5e-5, weight_decay=0, use_tensorboard=False):
    """Configure a document-BERT classifier from CLI args or keyword defaults.

    When ``args`` is given its attribute dict is used verbatim; otherwise the
    keyword arguments populate ``self.args``. Builds the tokenizer, the BERT
    config, the document classification model and its Adam optimizer.
    """
    if args is not None:
        self.args = vars(args)
    if not args:
        self.args = {}
        self.args['bert_model_path'] = bert_model_path
        self.args['device'] = device
        self.args['learning_rate'] = learning_rate
        self.args['weight_decay'] = weight_decay
        self.args['batch_size'] = batch_size
        self.args['labels'] = labels
        self.args['bert_batch_size'] = bert_batch_size
        self.args['architecture'] = architecture
        self.args['use_tensorboard'] = use_tensorboard
    if 'fold' not in self.args:
        self.args['fold'] = 0

    assert self.args['labels'] is not None, "Must specify all labels in prediction"

    self.log = logging.getLogger()
    self.bert_tokenizer = BertTokenizer.from_pretrained(self.args['bert_model_path'])

    # account for some random tensorflow naming scheme
    if os.path.exists(self.args['bert_model_path']):
        if os.path.exists(os.path.join(self.args['bert_model_path'], CONFIG_NAME)):
            config = BertConfig.from_json_file(os.path.join(self.args['bert_model_path'], CONFIG_NAME))
        elif os.path.exists(os.path.join(self.args['bert_model_path'], 'bert_config.json')):
            config = BertConfig.from_json_file(os.path.join(self.args['bert_model_path'], 'bert_config.json'))
        else:
            raise ValueError("Cannot find a configuration for the BERT based model you are attempting to load.")
    else:
        config = BertConfig.from_pretrained(self.args['bert_model_path'])
    # Plain attribute assignment instead of the old config.__setattr__ calls.
    config.num_labels = len(self.args['labels'])
    config.bert_batch_size = self.args['bert_batch_size']

    if 'use_tensorboard' in self.args and self.args['use_tensorboard']:
        # BUGFIX: was ``assert 'model_directory' in self.args is not None`` — a
        # chained comparison that only checked membership by accident; assert
        # the membership directly.
        assert 'model_directory' in self.args, "Must have a logging and checkpoint directory set."
        from torch.utils.tensorboard import SummaryWriter
        self.tensorboard_writer = SummaryWriter(os.path.join(
            self.args['model_directory'], "..", "runs",
            self.args['model_directory'].split(os.path.sep)[-1] + '_' +
            self.args['architecture'] + '_' + str(self.args['fold'])))

    self.bert_doc_classification = document_bert_architectures[self.args['architecture']].from_pretrained(
        self.args['bert_model_path'], config=config)
    self.optimizer = torch.optim.Adam(
        self.bert_doc_classification.parameters(),
        weight_decay=self.args['weight_decay'],
        lr=self.args['learning_rate']
    )
def __init__(self, args):
    """Wire up tokenizer, generator/discriminator pair, optimizers and data."""
    torch.manual_seed(args.seed)
    self.args = args
    # Tokenizer, Generator, Discriminator
    if args.load_epoch > -1:
        # NOTE: 0-indexed. Resume both adversaries from a saved checkpoint pair.
        gen_path, dis_path = get_gan_path(self.args.model_out, self.args.load_epoch)
    else:
        gen_path, dis_path = args.bert_model, args.bert_model
    self.tokenizer = BertTokenizer.from_pretrained(gen_path)  # TODO requires_grad = False?
    self.generator = BertForMaskedLM.from_pretrained(gen_path)
    self.discriminator = BertForSequenceClassification.from_pretrained(
        dis_path, num_labels=self.args.num_labels)
    # One optimizer per adversary.
    self.optimizerG = self._get_optimizer_(self.generator)
    self.optimizerD = self._get_optimizer_(self.discriminator)
    # Masked and original views of the same corpus.
    self.msk_data = load_data(args.data_in, args.maxlen, args.batch_size,
                              self.tokenizer, args.seed, 'masked')
    self.org_data = load_data(args.data_in, args.maxlen, args.batch_size,
                              self.tokenizer, args.seed, 'original')
    self.mask_id = self.tokenizer.convert_tokens_to_ids(['[MASK]'])[0]
    self.device = torch.device("cuda:0" if args.cuda else "cpu")
    self.generator.to(self.device)
    self.discriminator.to(self.device)
def __init__(self, debug, args, data_dir, data_process_output):
    """Store training hyper-parameters and build the tokenizer/config.

    :param debug: debug-mode flag
    :param args: parsed command-line arguments
    :param data_dir: directory holding the processed data
    :param data_process_output: where preprocessing output is written
    """
    self.eval_steps = args.eval_steps
    self.adam_epsilon = args.adam_epsilon
    self.warmup_steps = args.warmup_steps
    self.learning_rate = args.learning_rate
    self.weight_decay = args.weight_decay
    self.gradient_accumulation_steps = args.gradient_accumulation_steps
    self.device = torch.device('cuda')
    self.debug = debug
    self.seed = 2019  # fixed seed used by seed_everything() below
    self.args = args
    # BUGFIX/cleanup: self.data_dir was first set to args.data_dir and then
    # unconditionally overwritten with the ``data_dir`` parameter further
    # down; the dead first assignment has been removed.
    self.max_seq_length = args.max_seq_length
    self.batch_size = args.per_gpu_train_batch_size
    self.train_steps = args.train_steps
    self.tokenizer = BertTokenizer.from_pretrained(
        args.model_name_or_path, do_lower_case=args.do_lower_case)
    self.config = BertConfig.from_pretrained(args.model_name_or_path, num_labels=3)
    self.seed_everything()
    self.do_eval = True
    self.data_dir = data_dir
    self.data_process_output = data_process_output
    self.output_dir = './'
def main(args):
    """Build a DocumentDatabase from pre-tokenized MWP lines and pregenerate
    one training file per epoch (masked or LCS variant)."""
    print(f"\nmain({args})\n")
    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
    vocab_list = list(tokenizer.vocab.keys())
    # Pick the input file and output subdirectory for the chosen masking mode.
    if args.rand_mask > 0:
        fin_name = 'general_in_rand_mask.txt'
        args.output_dir = args.output_dir / 'rand_mask_bert_pregen'
    else:
        fin_name = 'general_in_lcs.txt'
        args.output_dir = args.output_dir / 'lcs_bert_pregen'
    with DocumentDatabase(reduce_memory=args.reduce_memory) as docs:
        with open(os.path.join(args.data_path, fin_name)) as f:
            for line in tqdm(f, desc="Loading Dataset", unit=" lines"):
                # mwp_ans is a list of tuples ('hello [MASK] ! [SEP] how are you ? [SEP]', 'world')
                mwp, ans = line[6:].strip().split('$$$')  # [6:] to avoid "[CLS] "
                sents = mwp.split(' [SEP]')[:-1]
                ans = ans.split()
                # Distribute the flat answer list over the sentences: each
                # sentence consumes one answer per '[MASK]' it contains.
                ans = [[ans.pop(0) for _ in range(s.count('[MASK]'))] for s in sents]
                #docs.add_document(list(zip([tokenizer.tokenize(s) for s in sents], ans)))
                docs.add_document(list(zip([s.split() for s in sents], ans)))  # It's bert-tokenized in make_data
        assert len(docs) > 1
        args.output_dir.mkdir(exist_ok=True)
        for epoch in range(args.epochs_to_generate):
            my_create_training_file(docs, vocab_list, args, epoch)
def __init__(self, data_dir, output_dir, num_labels, args):
    """Record paths and hyper-parameters, seed RNGs, and load the tokenizer."""
    self.data_dir = data_dir
    self.output_dir = output_dir
    self.num_labels = num_labels
    # Optimization hyper-parameters.
    self.weight_decay = args.weight_decay
    self.eval_steps = args.eval_steps
    self.gradient_accumulation_steps = args.gradient_accumulation_steps
    self.warmup_steps = args.warmup_steps
    self.learning_rate = args.learning_rate
    self.adam_epsilon = args.adam_epsilon
    self.train_steps = args.train_steps
    # Batch sizes.
    self.per_gpu_eval_batch_size = args.per_gpu_eval_batch_size
    self.train_batch_size = args.per_gpu_train_batch_size
    self.eval_batch_size = self.per_gpu_eval_batch_size
    # Tokenization settings.
    self.do_lower_case = args.do_lower_case
    self.model_name_or_path = args.model_name_or_path
    self.max_seq_length = args.max_seq_length
    self.seed = args.seed
    self.seed_everything()
    gpu_available = torch.cuda.is_available()
    self.device = torch.device("cuda" if gpu_available else "cpu")
    self.tokenizer = BertTokenizer.from_pretrained(
        self.model_name_or_path, do_lower_case=self.do_lower_case)
    self.do_test = args.do_test
    self.do_eval = True
    self.args = args
def load_datasets(task_cfg, splits):
    """Return a dict mapping each requested split name to its data loader."""
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
    return {split: get_loader(task_cfg, tokenizer, split) for split in splits}
def main():
    """Train or evaluate the segmentation-aware BERT sequence classifier."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    set_seed()
    # args.output_dir=os.path.join(args.data_dir,args.task_name,args.output_dir)
    # args.temp_score_file_path=os.path.join(args.data_dir,args.task_name,args.temp_score_file_path)
    # args.input_cache_dir=os.path.join(args.data_dir, args.task_name, args.input_cache_dir)
    # if not os.path.exists(args.output_dir):
    #     os.makedirs(args.output_dir)
    # if not os.path.exists(args.input_cache_dir):
    #     os.makedirs(args.input_cache_dir)
    # myDataProcessorUtt = MyDataProcessorUtt(args.max_utterance_num)
    myDataProcessorSeg = MyDataProcessorSegres()
    # label_list = myDataProcessorUtt.get_labels()
    # num_labels = len(label_list)
    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
    config = BertConfig.from_pretrained(args.bert_model)
    if args.do_train:
        logger.info("start train...")
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        # if(os.path.exists(output_model_file)):
        #     logger.info("load dict...")
        #     model_state_dict = torch.load(output_model_file)
        #     model = BertForSequenceClassificationTS.from_pretrained(args.bert_model, config=config,
        #         state_dict=model_state_dict, num_labels=num_labels)
        # else:
        model = BertForSequenceClassificationTSv3.from_pretrained(
            args.bert_model, config=config, max_seg_num=args.max_segment_num,
            max_seq_len=args.max_seq_length, device=device)
        model.to(device)
        if n_gpu > 1:
            model = torch.nn.DataParallel(model)
        train(model, tokenizer, device, myDataProcessorSeg, n_gpu)
    else:
        logger.info("start test...")
        logger.info("load dict...")
        # Restore the fine-tuned weights written during training.
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        model_state_dict = torch.load(output_model_file)
        model = BertForSequenceClassificationTSv3.from_pretrained(
            args.bert_model, config=config, state_dict=model_state_dict,
            max_seg_num=args.max_segment_num, max_seq_len=args.max_seq_length,
            device=device)
        model.to(device)
        if n_gpu > 1:
            model = torch.nn.DataParallel(model)
        # similar_score(model, tokenizer, device,myDataProcessorSeg)
        result = eval(model, tokenizer, device, myDataProcessorSeg)
        logger.info(
            "Evaluation Result: \nMAP: %f\tMRR: %f\tP@1: %f\tR1: %f\tR2: %f\tR5: %f",
            result[0], result[1], result[2], result[3], result[4], result[5])
        print(result)
def __init__(self, params):
    """Load VisDial dialogs plus dense annotations and set up split bookkeeping."""
    # Number of dialogs exposed per split (capped by num_*_samples / overfit mode).
    self.numDataPoints = {}
    num_samples_train = params['num_train_samples']
    num_samples_val = params['num_val_samples']
    self._image_features_reader = ImageFeaturesH5Reader(
        params['visdial_image_feats'])
    with open(params['visdial_processed_train_dense']) as f:
        self.visdial_data_train = json.load(f)
    if params['overfit']:
        # Overfit mode: tiny fixed subset unless an explicit cap is given.
        if num_samples_train:
            self.numDataPoints['train'] = num_samples_train
        else:
            self.numDataPoints['train'] = 5
    else:
        if num_samples_train:
            self.numDataPoints['train'] = num_samples_train
        else:
            self.numDataPoints['train'] = len(
                self.visdial_data_train['data']['dialogs'])
    with open(params['visdial_processed_val']) as f:
        self.visdial_data_val = json.load(f)
    if params['overfit']:
        if num_samples_val:
            self.numDataPoints['val'] = num_samples_val
        else:
            self.numDataPoints['val'] = 5
    else:
        if num_samples_val:
            self.numDataPoints['val'] = num_samples_val
        else:
            self.numDataPoints['val'] = len(
                self.visdial_data_val['data']['dialogs'])
    self.overfit = params['overfit']
    # Dense NDCG relevance annotations for both splits.
    with open(params['visdial_processed_train_dense_annotations']) as f:
        self.visdial_data_train_ndcg = json.load(f)
    with open(params['visdial_processed_val_dense_annotations']) as f:
        self.visdial_data_val_ndcg = json.load(f)
    #train val setup
    self.numDataPoints['trainval'] = self.numDataPoints[
        'train'] + self.numDataPoints['val']
    self.num_options = params["num_options"]
    self._split = 'train'
    self.subsets = ['train', 'val', 'trainval']
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    self.tokenizer = tokenizer
    # fetching token indicecs of [CLS] and [SEP]
    tokens = ['[CLS]', '[MASK]', '[SEP]']
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
    self.CLS = indexed_tokens[0]
    self.MASK = indexed_tokens[1]
    self.SEP = indexed_tokens[2]
    self.params = params
    # Maximum number of image regions fed to the model.
    self._max_region_num = 37
def __init__(self, config, *args, **kwargs):
    """BERT tokenizer processor; optionally computes question-generation indices."""
    self.max_length = config.max_length
    self.bert_tokenizer = BertTokenizer.from_pretrained(
        'bert-base-uncased')
    # Downstream padding logic relies on the pad token encoding to id 0.
    assert self.bert_tokenizer.encode(self.bert_tokenizer.pad_token) == [0]
    self.get_qgen_inds = getattr(config, 'get_qgen_inds', False)
    if self.get_qgen_inds:
        print('computing question generation indices in bert tokenizer')
def main():
    """Smoke-test BertMCQParallelReader on the bundled dummy data file."""
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    mcq_reader = BertMCQParallelReader()
    out = mcq_reader.read("dummy_data.jsonl", tokenizer, 70, None)
    print(len(out))
    # Unpack the first example to sanity-check the record layout.
    tokens, segs, masks, labels = out[0]
def preprocess(self, data, opt):
    """
    Preprocess the data and convert to ids.

    For each example, runs the BERT tokenizer over it, collects the Stanford
    linguistic annotations, and maps entity/relation types to indices.
    """
    processed = []
    tqdm_data = tqdm(data)
    # Match the tokenizer casing to the 'lower' option.
    if opt["lower"]:
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    else:
        tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    for d in tqdm_data:
        bert_tokenize(tokenizer, d, opt)
        tokens = list(d["token"])
        seq_len = len(tokens) + 2  # +2 for the [CLS]/[SEP] positions
        # BUGFIX/cleanup: removed the unused locals ss, se, os, oe — the old
        # ``os, oe = ...`` also shadowed the ``os`` module inside this loop.
        pos = d['stanford_pos']
        ner = d['stanford_ner']
        deprel = d['stanford_deprel']
        head = [int(x) for x in d['stanford_head']]
        assert any([x == 0 for x in head])  # exactly one root (head == 0)
        # Positions are shifted by 1 to account for the leading [CLS].
        positions = get_positions(d['subj_start'] + 1, d['subj_end'] + 1,
                                  d['obj_start'] + 1, d['obj_end'] + 1,
                                  self.e_type2idx[d["subj_type"]],
                                  self.e_type2idx[d["obj_type"]], seq_len)
        subj_type = d['subj_type']
        obj_type = d['obj_type']
        relation = self.r_type2idx[d['relation']]
        processed.append({
            "len": seq_len,
            "tokens": tokens,
            "pos": pos,
            "ner": ner,
            "deprel": deprel,
            "head": head,
            "position": positions,
            "s_type": subj_type,
            "o_type": obj_type,
            "relation": relation
        })
    return processed
def __init__(self) -> None:
    """Start a CoreNLP client and grab BERT's basic (pre-wordpiece) tokenizer."""
    corenlp_home = '{}/stanford-corenlp-full-2018-10-05'.format(os.environ['HOME'])
    os.environ['CORENLP_HOME'] = corenlp_home
    self.client = CoreNLPClient()
    self.client.ensure_alive()
    # Cased checkpoints keep case; everything else is lower-cased.
    self.do_lower_case = '-cased' not in config.bert_model
    full_tokenizer = BertTokenizer.from_pretrained(config.bert_model,
                                                   do_lower_case=self.do_lower_case)
    self.basic_tokenizer: BasicTokenizer = full_tokenizer.basic_tokenizer
def make_gan_data(mathqa_train, out_dir, bert_model, do_lower_case, subset=1e8):
    """
    Write BERT-tokenized MathQA problems (with category labels) to mathqa.txt.

    Parameters:
        mathqa_train  str  path to MathQA train.json
        out_dir       str  path to output dir
        subset        int  size of subset adopted
    NOTE: use the whole train set rather than 3k (actually 2.4k) for training

    Cleanup: removed a stray ``print('ma')`` debug line and a dead
    ``obj = []`` pre-assignment.
    """
    def is_bad(sent):
        # 21476 out of 29837 MWPs used (0.7198)
        if any(s in '+-*/|@' for s in sent):
            return True
        # Reject sentences that are less than half alphabetic.
        return sum(1 for s in sent if s.isalpha()) < len(sent) / 2

    with open(mathqa_train, 'r') as jsonfile:
        obj = json.load(jsonfile)
    print(f'{len(obj)} MWPs from {mathqa_train}')
    good_mwps, bad_mwps = [], []
    for imwp in trange(len(obj)):
        mwp = obj[imwp]
        if is_bad(mwp['Problem']):  # or mwp['category'] == 'other':  # TODO
            bad_mwps.append(mwp)
            continue
        # 27688 out of 29837, i.e. 92.8% kept
        good_mwps.append(mwp)
        if len(good_mwps) == subset:
            break
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=do_lower_case)
    with open(os.path.join(out_dir, 'mathqa.txt'), 'w') as fout:
        for imwp in trange(len(good_mwps)):
            mwp = good_mwps[imwp]
            problem = ' [SEP]'.join(sent_tokenize(mwp['Problem'])) + ' [SEP]'
            toks = [CLS]
            for tok in problem.split():
                # BUGFIX: was ``tok is not SEP`` — an identity comparison on
                # strings that only worked via interning; compare by value.
                if tok != SEP:
                    toks.extend(tokenizer.tokenize(tok))
                    #if random() > 0.5:
                    #    toks.extend(['[MASK]']*len(tokenizer.tokenize(tok)))
                else:
                    toks.append(SEP)
            fout.writelines(' '.join(toks) + '@@@' + mwp['category'] + '\n')
def main():
    """Run the fine-tuned BERT classifier over the test set and print predictions."""
    # **************************** Basic setup ***********************
    logger = init_logger(log_name=config['model']['arch'],
                         log_dir=config['output']['log_dir'])
    logger.info(f"seed is {config['train']['seed']}")
    device = 'cuda:%d' % config['train']['n_gpu'][0] if len(
        config['train']['n_gpu']) else 'cpu'
    seed_everything(seed=config['train']['seed'], device=device)
    logger.info('starting load data from disk')
    id2label = {value: key for key, value in config['label2id'].items()}
    # **************************** Data preparation ***********************
    DT = DataTransformer(logger=logger, seed=config['train']['seed'])
    # Read the test split (no train/validation split here).
    targets, sentences = DT.read_data(
        raw_data_path=config['data']['test_file_path'],
        preprocessor=EnglishPreProcessor(),
        is_train=False)
    tokenizer = BertTokenizer(
        vocab_file=config['pretrained']['bert']['vocab_path'],
        do_lower_case=config['train']['do_lower_case'])
    # train
    test_dataset = CreateDataset(data=list(zip(sentences, targets)),
                                 tokenizer=tokenizer,
                                 max_seq_len=config['train']['max_seq_len'],
                                 seed=config['train']['seed'],
                                 example_type='test')
    # Test data loader (no shuffling, keep every example).
    test_loader = DataLoader(dataset=test_dataset,
                             batch_size=config['train']['batch_size'],
                             num_workers=config['train']['num_workers'],
                             shuffle=False,
                             drop_last=False,
                             pin_memory=False)
    # **************************** Model ***********************
    logger.info("initializing model")
    model = BertFine.from_pretrained(
        config['pretrained']['bert']['bert_model_dir'],
        cache_dir=config['output']['cache_dir'],
        num_classes=len(id2label))
    # **************************** Prediction ***********************
    logger.info('model predicting....')
    predicter = Predicter(
        model=model,
        logger=logger,
        n_gpu=config['train']['n_gpu'],
        model_path=config['output']['checkpoint_dir'] /
        f"best_{config['model']['arch']}_model.pth",
    )
    # Run prediction with the restored best checkpoint.
    result = predicter.predict(data=test_loader)
    print(result)
    # Release GPU memory.
    if len(config['train']['n_gpu']) > 0:
        torch.cuda.empty_cache()
def __init__(self, config, *args, **kwargs):
    """BERT tokenizer processor that loads its vocab from the pythia data dir."""
    self.max_length = config.max_length
    pythia_root = get_pythia_root()
    VOCAB = 'bert-base-uncased-vocab.txt'
    vocab_path = os.path.join(pythia_root, config.model_data_dir, 'bert', VOCAB)
    self.bert_tokenizer = BertTokenizer.from_pretrained(vocab_path)
    # Downstream padding logic relies on the pad token encoding to id 0.
    assert self.bert_tokenizer.encode(self.bert_tokenizer.pad_token) == [0]
    self.get_qgen_inds = getattr(config, 'get_qgen_inds', False)
    if self.get_qgen_inds:
        print('computing question generation indices in bert tokenizer')
def load_pretrain(configs, model_class, fine_tune_dir, processor, eval=False):
    """Load a model and its tokenizer, optionally from a fine-tuned checkpoint.

    configs: configuration dict
    model_class: model name (one of the keys in the class map below)
    fine_tune_dir: directory where the fine-tuned model is saved
    processor: DataProcessor supplying the label list
    eval: when True, load weights from ``fine_tune_dir`` instead of the
        pre-trained model directory
    """
    model_class_map = {
        'Bert': Bert,
        'BertCRF': BertCRF,
        'BertBiLSTMCRF': BertBiLSTMCRF,
        'BiLSTM': BiLSTM,
        'BiLSTMCRF': BiLSTMCRF
    }
    model_class_ = model_class_map[model_class]
    label_list = processor.get_labels()
    check_dir(fine_tune_dir)
    if eval:
        model_pretrained_path = fine_tune_dir
    else:
        model_pretrained_path = configs['pretrained_model_dir']
    tokenizer = BertTokenizer.from_pretrained(
        model_pretrained_path, do_lower_case=configs['lower_case'])
    if model_class in ['Bert', 'BertCRF', 'BertBiLSTMCRF']:
        bert_config = BertConfig.from_pretrained(model_pretrained_path,
                                                 num_labels=len(label_list),
                                                 finetuning_task="ner")
        model = model_class_.from_pretrained(model_pretrained_path,
                                             config=bert_config,
                                             model_configs=configs)
    elif model_class in ['BiLSTM', 'BiLSTMCRF']:
        configs['num_labels'] = len(label_list)
        if configs['use_pretrained_embedding']:
            # Build the word-embedding matrix from the pre-trained embedding file.
            pretrained_word_embed = build_word_embed(
                tokenizer,
                pretrain_embed_file=configs['pretrain_embed_file'],
                pretrain_embed_pkl=configs['pretrain_embed_pkl'])
            configs['word_vocab_size'] = pretrained_word_embed.shape[0]
            configs['word_embedding_dim'] = pretrained_word_embed.shape[1]
        else:
            pretrained_word_embed = None
        if eval:
            model_pretrained_path = fine_tune_dir
            model = model_class_.from_pretrained(model_pretrained_path,
                                                 pretrained_word_embed)
        else:
            model = model_class_(configs, pretrained_word_embed)
    else:
        raise ValueError("Invalid Model Class")
    return model, tokenizer
def LoadDatasetEval(args, config, task_cfg, task_id):
    """Build the evaluation dataset and DataLoader for one task.

    Returns (batch_size, task2num_iters, dset_val, dl_val).
    """
    # Tokenizer family follows the model name.
    if "roberta" in args.bert_model:
        tokenizer = RobertaTokenizer.from_pretrained(args.bert_model,
                                                     do_lower_case=args.do_lower_case)
    else:
        tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                                  do_lower_case=args.do_lower_case)

    task = "TASK" + task_id
    task_name = task_cfg[task]["name"]

    # initialize the feature reader
    feats_h5path1 = task_cfg[task]["features_h5path1"]
    feats_h5path2 = task_cfg[task]["features_h5path2"]
    features_reader1 = ImageFeaturesH5Reader(feats_h5path1, config, args.in_memory) if feats_h5path1 != "" else None
    features_reader2 = ImageFeaturesH5Reader(feats_h5path2, config, args.in_memory) if feats_h5path2 != "" else None

    batch_size = task_cfg[task].get("eval_batch_size", args.batch_size)
    # Under distributed evaluation, split the batch across the world.
    if args.local_rank != -1:
        batch_size = int(batch_size / dist.get_world_size())

    logger.info("Loading %s Dataset with batch size %d" % (task_name, batch_size))
    # Explicit --split overrides the configured validation split.
    if args.split:
        eval_split = args.split
    else:
        eval_split = task_cfg[task]["val_split"]
    dset_val = DatasetMapEval[task_name](
        task=task_cfg[task]["name"],
        dataroot=task_cfg[task]["dataroot"],
        annotations_jsonpath=task_cfg[task]["val_annotations_jsonpath"],
        split=eval_split,
        image_features_reader=features_reader1,
        gt_image_features_reader=features_reader2,
        tokenizer=tokenizer,
        bert_model=args.bert_model,
        padding_index=0,
        max_seq_length=task_cfg[task]["max_seq_length"],
        max_region_num=task_cfg[task]["max_region_num"],
        num_locs=config.num_locs,
        add_global_imgfeat=config.add_global_imgfeat,
        append_mask_sep=(config.fusion_method == 'vl-bert_vqa'),
    )

    dl_val = DataLoader(
        dset_val,
        shuffle=False,
        batch_size=batch_size,
        num_workers=10,
        pin_memory=True,
        drop_last=args.drop_last,
    )
    task2num_iters = {task: len(dl_val)}

    return batch_size, task2num_iters, dset_val, dl_val
def LoadDatasets(args, task_cfg, ids, split="trainval"):
    """Initialize per-task feature-reader placeholders.

    NOTE(review): as visible here the function only collects the set of unique
    feature h5 paths (one reader slot per distinct path); the construction of
    the actual readers/loaders appears to live in code outside this view.
    """
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

    task_feature_reader1 = {}
    task_feature_reader2 = {}
    for i, task_id in enumerate(ids):
        task = "TASK" + task_id + "1"
        # De-duplicate: one reader slot per distinct feature file.
        if task_cfg[task]["features_h5path1"] not in task_feature_reader1:
            task_feature_reader1[task_cfg[task]["features_h5path1"]] = None
        if task_cfg[task]["features_h5path2"] not in task_feature_reader2:
            task_feature_reader2[task_cfg[task]["features_h5path2"]] = None
def test_sequence_builders(self):
    """Check [CLS]/[SEP] placement for single sentences and sentence pairs."""
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    ids_1 = tokenizer.encode("sequence builders")
    ids_2 = tokenizer.encode("multi-sequence build")

    single = tokenizer.add_special_tokens_single_sentence(ids_1)
    pair = tokenizer.add_special_tokens_sentences_pair(ids_1, ids_2)

    # 101 = [CLS], 102 = [SEP] in the bert-base-uncased vocabulary.
    assert single == [101] + ids_1 + [102]
    assert pair == [101] + ids_1 + [102] + ids_2 + [102]
def tokenize_one_(self, msg: dict, tokenizer: BertTokenizer = None):
    """WordPiece-tokenize one issue message (title + body).

    Returns ``(bert_ids, wp_starts, truncated)``: the token ids including the
    surrounding [CLS]/[SEP], the index of the first wordpiece of each kept
    original token (offset by the leading [CLS]), and a flag set when the
    message was cut at BERT's 512-token limit.

    NOTE(review): ``msg`` was annotated ``str`` but is indexed like a mapping
    (``msg["title"]``/``msg["body"]``); the annotation is corrected to ``dict``.
    """
    bert_tokens = []
    wp_starts = []
    truncated = False
    if tokenizer is None:
        raise Exception('Tokenizer can not be None.')
    tokens = pre_processing.tokenise(f'{msg["title"]}. {msg["body"]}',
                                     lowercase=False,
                                     simple=True,
                                     remove_stopwords=False)
    for i_token, token_str in enumerate(tokens):
        skip_token = False
        wordpieces = tokenizer.tokenize(token_str)
        if not wordpieces:
            # this mainly happens for strange unicode characters
            token_str = '[UNK]'
            wordpieces = tokenizer.tokenize(token_str)
            skip_token = True
        if len(bert_tokens) + len(wordpieces) > 510:
            # bert model is limited to 512 tokens
            truncated = True
            break
        if not skip_token:
            wp_starts.append(len(bert_tokens) + 1)  # first token is [CLS]
        # NOTE(review): reconstructed from collapsed source — the extend is
        # taken to run for every token ([UNK] replacements included), with
        # skip_token only suppressing the wp_starts entry; confirm upstream.
        bert_tokens.extend(wordpieces)
    bert_tokens = ['[CLS]'] + bert_tokens + ['[SEP]']
    assert len(bert_tokens) <= 512, f'{len(bert_tokens)} > 512'
    bert_ids = tokenizer.convert_tokens_to_ids(bert_tokens)
    return bert_ids, wp_starts, truncated
def model_samples_from_json(config, token_id_dict, unknown_token_id, type_id_dict,
                            mentions_file, sents_file):
    """Build entity-typing samples from mention and sentence JSON-lines files.

    Each sample is ``[mention_id, mention_token_ids, sentence_token_ids,
    mention_token_idx, label_ids]``. With ``config.use_bert`` the mention span
    is replaced by a single [MASK] token, the sentence is re-tokenized with
    the BERT tokenizer, and ``mention_token_idx`` points at that [MASK].
    """
    if config.use_bert:
        tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
        print('bert tokenizer loaded')
    # sent_id -> token ids (vocab lookup) and sent_id -> raw tokens.
    sent_tokens_id_dict = dict()
    sent_tokens_dict = dict()
    with open(sents_file, encoding='utf-8') as f:
        for line in f:
            sent = json.loads(line)
            tokens = sent['text'].split(' ')
            sent_tokens_id_dict[sent['sent_id']] = [token_id_dict.get(t, unknown_token_id) for t in tokens]
            sent_tokens_dict[sent['sent_id']] = [t for t in tokens]
    samples = list()
    mentions = datautils.read_json_objs(mentions_file)
    for m in mentions:
        if config.use_bert:
            org_tok_sents = sent_tokens_dict[m['sent_id']]
            # Replace the mention span with a single [MASK] placeholder.
            bert_sent_tokens = org_tok_sents[:m['span'][0]] + ['[MASK]'] + org_tok_sents[m['span'][1]:]
            full_sent = ' '.join(bert_sent_tokens)
            tokens = ["[CLS]"]
            t = tokenizer.tokenize(full_sent)
            tokens.extend(t)
            # Locate the [MASK] among the wordpieces (defaults to 0 if absent).
            mention_token_idx = 0
            for i, x in enumerate(tokens):
                if x == '[MASK]':
                    mention_token_idx = i
                    break
            tokens.append("[SEP]")
            sentence_token = tokenizer.convert_tokens_to_ids(tokens)
        else:
            sentence_token = sent_tokens_id_dict[m['sent_id']]
            mention_token_idx = m['span'][0]
        labels = m['labels']
        label_ids = [type_id_dict[t] for t in labels]
        sample = [m['mention_id'],
                  sent_tokens_id_dict[m['sent_id']][m['span'][0]:m['span'][1]],
                  sentence_token,
                  mention_token_idx,
                  label_ids
                  ]
        samples.append(sample)
    return samples