def __init__(self, args):
    self.args = args
    if args.bert_model == 'bert-base-multilingual-cased':
        self.tokenizer = BertTokenizer.from_pretrained(
            'bert-base-multilingual-cased', do_lower_case=False)
    else:
        self.tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=True)
        if len(self.tokenizer.vocab) == 31748:
            # Custom vocab that lacks the [unused*] control tokens: append them and reload.
            with open(args.bert_model + "/vocab.txt", "a") as f:
                f.write("\n[unused1]\n[unused2]\n[unused3]\n[unused4]\n[unused5]\n[unused6]\n[unused7]")
            self.tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=True)
    self.sep_token = '[SEP]'
    self.cls_token = '[CLS]'
    self.pad_token = '[PAD]'
    self.tgt_bos = '[unused1]'
    self.tgt_eos = '[unused2]'
    self.tgt_sent_split = '[unused3]'
    self.sep_vid = self.tokenizer.vocab[self.sep_token]
    self.cls_vid = self.tokenizer.vocab[self.cls_token]
    self.pad_vid = self.tokenizer.vocab[self.pad_token]
def __init__(self, args):
    self.args = args
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    self.sep_token = '[SEP]'
    self.cls_token = '[CLS]'
    self.pad_token = '[PAD]'
    self.unk_token = '[UNK]'
    self.mask_token = '[MASK]'
    self.tgt_bos = '[unused0]'
    self.tgt_eos = '[unused1]'
    with open(args.src_dict_path) as f:
        line = f.read().strip()
        self.src_dict = json.loads(line)
    with open(args.tgt_dict_path) as f:
        line = f.read().strip()
        self.tgt_dict = json.loads(line)
    with open(args.relation_path) as f:
        line = f.read().strip()
        self.relation_dict = json.loads(line)
    self.sep_vid = self.src_dict[self.sep_token]
    self.cls_vid = self.src_dict[self.cls_token]
    self.pad_vid = self.src_dict[self.pad_token]
    self.unk_vid = self.src_dict[self.unk_token]
def __init__(self, args):
    self.args = args
    if args.cased:
        self.tokenizer = BertTokenizer.from_pretrained('BETO/')
    else:
        self.tokenizer = BertTokenizer.from_pretrained('BETO/', do_lower_case=True)
    print(self.tokenizer)
    self.sep_token = '[SEP]'
    self.cls_token = '[CLS]'
    self.pad_token = '[PAD]'
    self.tgt_bos = '[unused0]'
    self.tgt_eos = '[unused1]'
    self.tgt_sent_split = '[unused2]'
    self.sep_vid = self.tokenizer.vocab[self.sep_token]
    self.cls_vid = self.tokenizer.vocab[self.cls_token]
    self.pad_vid = self.tokenizer.vocab[self.pad_token]
def load_one_text_web(source, device):
    from others.tokenization import BertTokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    sep_vid = tokenizer.vocab['[SEP]']
    cls_vid = tokenizer.vocab['[CLS]']
    max_pos = 512

    def _process_src(raw):
        raw = raw.strip().lower()
        raw = raw.replace('[cls]', '[CLS]').replace('[sep]', '[SEP]')
        src_subtokens = tokenizer.tokenize(raw)
        src_subtokens = ['[CLS]'] + src_subtokens + ['[SEP]']
        src_subtoken_idxs = tokenizer.convert_tokens_to_ids(src_subtokens)
        src_subtoken_idxs = src_subtoken_idxs[:-1][:max_pos]
        src_subtoken_idxs[-1] = sep_vid
        _segs = [-1] + [i for i, t in enumerate(src_subtoken_idxs) if t == sep_vid]
        segs = [_segs[i] - _segs[i - 1] for i in range(1, len(_segs))]
        segments_ids = []
        segs = segs[:max_pos]
        for i, s in enumerate(segs):
            if i % 2 == 0:
                segments_ids += s * [0]
            else:
                segments_ids += s * [1]
        src = torch.tensor(src_subtoken_idxs)[None, :].to(device)
        mask_src = (1 - (src == 0).float()).to(device)
        cls_ids = [[i for i, t in enumerate(src_subtoken_idxs) if t == cls_vid]]
        clss = torch.tensor(cls_ids).to(device)
        mask_cls = 1 - (clss == -1).float()
        clss[clss == -1] = 0
        return src, mask_src, segments_ids, clss, mask_cls

    x = source
    src, mask_src, segments_ids, clss, mask_cls = _process_src(x)
    segs = torch.tensor(segments_ids)[None, :].to(device)
    batch = Batch()
    batch.src = src
    batch.tgt = None
    batch.mask_src = mask_src
    batch.mask_tgt = None
    batch.segs = segs
    batch.src_str = [[sent.replace('[SEP]', '').strip() for sent in x.split('[CLS]')]]
    batch.tgt_str = ['']
    batch.clss = clss
    batch.mask_cls = mask_cls
    batch.batch_size = 1
    yield batch
def __init__(self, args):
    self.CHUNK_LIMIT = 512
    self.args = args
    if args.model_name == 'scibert':
        self.tokenizer = BertTokenizer.from_pretrained('allenai/scibert_scivocab_uncased', do_lower_case=True)
    elif 'bert-base' in args.model_name or 'bert-large' in args.model_name:
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    self.sep_token = '[SEP]'
    self.cls_token = '[CLS]'
    self.pad_token = '[PAD]'
    self.tgt_bos = '[unused0]'
    self.tgt_eos = '[unused1]'
    self.tgt_sent_split = '[unused2]'
    self.sep_vid = self.tokenizer.vocab[self.sep_token]
    self.cls_vid = self.tokenizer.vocab[self.cls_token]
    self.pad_vid = self.tokenizer.vocab[self.pad_token]
def __init__(self, args):
    self.args = args
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    self.sep_token = '[SEP]'
    self.cls_token = '[CLS]'
    self.pad_token = '[PAD]'
    self.tgt_bos = '[unused0]'
    self.tgt_eos = '[unused1]'
    self.tgt_sent_split = '[unused2]'
    self.sep_vid = self.tokenizer.vocab[self.sep_token]
    self.cls_vid = self.tokenizer.vocab[self.cls_token]
    self.pad_vid = self.tokenizer.vocab[self.pad_token]
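# The constructors above differ mainly in which checkpoint and which [unusedN] control
# tokens they pick. A minimal sketch (not from any of these repos; the function name and
# layout are illustrative only) of how the resulting special tokens are typically consumed
# when packing pre-split sentences into a BertSum-style '[CLS] sent [SEP] [CLS] sent [SEP]'
# input, assuming the tokenizer keeps [CLS]/[SEP] intact as others.tokenization does:
def pack_sentences(tokenizer, sents, max_pos=512):
    text = ' [SEP] [CLS] '.join(sents)
    subtokens = ['[CLS]'] + tokenizer.tokenize(text) + ['[SEP]']
    ids = tokenizer.convert_tokens_to_ids(subtokens)[:max_pos]
    ids[-1] = tokenizer.vocab['[SEP]']  # keep a closing [SEP] even after truncation
    cls_positions = [i for i, t in enumerate(ids) if t == tokenizer.vocab['[CLS]']]
    return ids, cls_positions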
def validate(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if pt != '':
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    valid_iter = data_loader.Dataloader(args, load_dataset(args, 'valid', shuffle=False),
                                        args.batch_size, device,
                                        shuffle=False, is_test=False)

    # tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False, cache_dir=args.temp_dir)
    # tokenizer = BertTokenizer.from_pretrained('hubert-wiki', do_lower_case=False, cache_dir=None)
    # tokenizer = BertTokenizer.from_pretrained('hubert-web', do_lower_case=False, cache_dir=None)
    tokenizer = BertTokenizer.from_pretrained('libert-large', do_lower_case=False, cache_dir=None)
    symbols = {
        'BOS': tokenizer.vocab['[unused5]'],
        'EOS': tokenizer.vocab['[unused1]'],
        'PAD': tokenizer.vocab['[PAD]'],
        'EOQ': tokenizer.vocab['[unused2]']
    }

    valid_loss = abs_loss(model.generator, symbols, model.vocab_size, train=False, device=device)

    trainer = build_trainer(args, device_id, model, None, valid_loss)
    stats = trainer.validate(valid_iter, step)
    return stats.xent()
def __init__(self, args): self.args = args self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True) self.sep_token = "[SEP]" self.cls_token = "[CLS]" self.pad_token = "[PAD]" self.tgt_bos = "[unused0]" self.tgt_eos = "[unused1]" self.tgt_sent_split = "[unused2]" self.sep_vid = self.tokenizer.vocab[self.sep_token] self.cls_vid = self.tokenizer.vocab[self.cls_token] self.pad_vid = self.tokenizer.vocab[self.pad_token]
def __init__(self, args):
    self.args = args
    self.tokenizer = BertTokenizer.from_pretrained(BERT_PATH, do_lower_case=False)
    self.sep_token = '[SEP]'
    self.cls_token = '[CLS]'
    self.pad_token = '[PAD]'
    self.tgt_bos = '[unused1]'
    self.tgt_eos = '[unused2]'
    self.tgt_sent_split = '[unused3]'
    self.sep_vid = self.tokenizer.vocab[self.sep_token]
    self.cls_vid = self.tokenizer.vocab[self.cls_token]
    self.pad_vid = self.tokenizer.vocab[self.pad_token]
def __init__(self, args):
    self.args = args
    self.tokenizer = BertTokenizer.from_pretrained(
        '/home/ffajri/Data/Bert/indobert/indobert-vocab-presum.txt', do_lower_case=True)
    self.sep_token = '[SEP]'
    self.cls_token = '[CLS]'
    self.pad_token = '[PAD]'
    self.tgt_bos = '[BOS]'
    self.tgt_eos = '[EOS]'
    self.tgt_sent_split = '[QOS]'
    self.sep_vid = self.tokenizer.vocab[self.sep_token]
    self.cls_vid = self.tokenizer.vocab[self.cls_token]
    self.pad_vid = self.tokenizer.vocab[self.pad_token]
def __init__(self, args):
    self.args = args
    # self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=True)
    # self.tokenizer = BertTokenizer.from_pretrained('hubert', do_lower_case=False)
    self.tokenizer = BertTokenizer.from_pretrained('libert-large', do_lower_case=False)
    self.sep_token = '[SEP]'
    self.cls_token = '[CLS]'
    self.pad_token = '[PAD]'
    self.tgt_bos = '[unused5]'
    self.tgt_eos = '[unused1]'
    self.tgt_sent_split = '[unused2]'
    self.sep_vid = self.tokenizer.vocab[self.sep_token]
    self.cls_vid = self.tokenizer.vocab[self.cls_token]
    self.pad_vid = self.tokenizer.vocab[self.pad_token]
def __init__(self, args):
    self.args = args
    self.tokenizer = BertTokenizer.from_pretrained(args.bert_temp_dir)
    self.sep_token = '[SEP]'
    self.cls_token = '[CLS]'
    self.pad_token = '[PAD]'
    self.unk_token = '[UNK]'
    self.tgt_bos = '[unused1]'
    self.tgt_eos = '[unused2]'
    self.tgt_sent_split = '[unused3]'
    self.role_1 = '[unused4]'
    self.role_2 = '[unused5]'
    self.sep_vid = self.tokenizer.vocab[self.sep_token]
    self.cls_vid = self.tokenizer.vocab[self.cls_token]
    self.pad_vid = self.tokenizer.vocab[self.pad_token]
    self.unk_vid = self.tokenizer.vocab[self.unk_token]
def __init__(self, abs_model_file):
    self.args = self._build_abs_args()

    # load model
    step_abs = int(abs_model_file.split('.')[-2].split('_')[-1])
    checkpoint = torch.load(abs_model_file, map_location=lambda storage, loc: storage)
    self.model_abs = model_bld.AbsSummarizer(self.args, self.args.device, checkpoint)
    self.model_abs.eval()

    # prepare tokenizer and predictor
    self.tokenizer = BertTokenizer.from_pretrained(
        path.join(self.args.bert_model_path, self.model_abs.bert.model_name),
        do_lower_case=True)
    self.symbols = {'BOS': self.tokenizer.vocab['[unused0]'],
                    'EOS': self.tokenizer.vocab['[unused1]'],
                    'PAD': self.tokenizer.vocab['[PAD]'],
                    'EOQ': self.tokenizer.vocab['[unused2]']}
    self.predictor = pred_abs.build_predictor(self.args, self.tokenizer, self.symbols, self.model_abs, logger)

    # special tokens
    self.sep_token = '[SEP]'
    self.cls_token = '[CLS]'
    self.pad_token = '[PAD]'
    self.sep_vid = self.tokenizer.vocab[self.sep_token]
    self.cls_vid = self.tokenizer.vocab[self.cls_token]
    self.pad_vid = self.tokenizer.vocab[self.pad_token]
def __init__(self, min_src_ntokens_per_sent=5, max_src_ntokens_per_sent=200,
             max_src_nsents=max_src_nsents, min_src_nsents=1,
             max_tgt_ntokens=500, min_tgt_ntokens=5):
    self.min_src_ntokens_per_sent = min_src_ntokens_per_sent
    self.max_src_ntokens_per_sent = max_src_ntokens_per_sent
    self.max_src_nsents = max_src_nsents
    self.min_src_nsents = min_src_nsents
    self.max_tgt_ntokens = max_tgt_ntokens
    self.min_tgt_ntokens = min_tgt_ntokens
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    self.sep_token = '[SEP]'
    self.cls_token = '[CLS]'
    self.pad_token = '[PAD]'
    self.tgt_bos = '[unused0]'
    self.tgt_eos = '[unused1]'
    self.tgt_sent_split = '[unused2]'
    self.sep_vid = self.tokenizer.vocab[self.sep_token]
    self.cls_vid = self.tokenizer.vocab[self.cls_token]
    self.pad_vid = self.tokenizer.vocab[self.pad_token]
def load_text(args, source_fp, target_fp, device):
    from others.tokenization import BertTokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    sep_vid = tokenizer.vocab['[SEP]']
    cls_vid = tokenizer.vocab['[CLS]']
    n_lines = len(open(source_fp, encoding='UTF-8').read().split('\n'))

    def _process_src(raw):
        raw = raw.strip().lower()
        src_subtokens = tokenizer.tokenize(raw)
        src_subtokens = ['[CLS]'] + src_subtokens + ['[SEP]']
        # Re-join '[CLS]'/'[SEP]' markers that the lower-cased wordpiece tokenizer split apart.
        src_subtokens_temp = []
        j = 0
        for i in range(len(src_subtokens) - 4):
            if i != j:
                continue
            if ("".join(src_subtokens[i:i + 4])) == '[##cl##s##]':
                src_subtokens_temp.append('[CLS]')
                j = i + 4
            elif ("".join(src_subtokens[i:i + 4])) == '[##se##p##]':
                src_subtokens_temp.append('[SEP]')
                j = i + 4
            else:
                src_subtokens_temp.append(src_subtokens[i])
                j = i + 1
        src_subtokens = src_subtokens_temp + src_subtokens[-3:]
        # print(src_subtokens)
        src_subtoken_idxs = tokenizer.convert_tokens_to_ids(src_subtokens)
        src_subtoken_idxs = src_subtoken_idxs[:-1][:args.max_pos]
        src_subtoken_idxs[-1] = sep_vid
        _segs = [-1] + [i for i, t in enumerate(src_subtoken_idxs) if t == sep_vid]
        segs = [_segs[i] - _segs[i - 1] for i in range(1, len(_segs))]
        segments_ids = []
        segs = segs[:args.max_pos]
        for i, s in enumerate(segs):
            if i % 2 == 0:
                segments_ids += s * [0]
            else:
                segments_ids += s * [1]
        src = torch.tensor(src_subtoken_idxs)[None, :].to(device)
        mask_src = (1 - (src == 0).float()).to(device)
        cls_ids = [[i for i, t in enumerate(src_subtoken_idxs) if t == cls_vid]]
        clss = torch.tensor(cls_ids).to(device)
        mask_cls = 1 - (clss == -1).float()
        clss[clss == -1] = 0
        return src, mask_src, segments_ids, clss, mask_cls

    if target_fp == '':
        with open(source_fp, encoding='UTF-8') as source:
            for x in tqdm(source, total=n_lines):
                src, mask_src, segments_ids, clss, mask_cls = _process_src(x)
                segs = torch.tensor(segments_ids)[None, :].to(device)
                batch = Batch()
                batch.src = src
                batch.tgt = None
                batch.mask_src = mask_src
                batch.mask_tgt = None
                batch.segs = segs
                batch.src_str = [[sent.replace('[SEP]', '').strip() for sent in x.split('[CLS]')]]
                batch.tgt_str = ['']
                batch.clss = clss
                batch.mask_cls = mask_cls
                batch.batch_size = 1
                yield batch
    else:
        with open(source_fp, encoding='UTF-8') as source, open(target_fp, encoding='UTF-8') as target:
            for x, y in tqdm(zip(source, target), total=n_lines):
                x = x.strip()
                y = y.strip()
                y = ' '.join(y.split())
                src, mask_src, segments_ids, clss, mask_cls = _process_src(x)
                segs = torch.tensor(segments_ids)[None, :].to(device)
                batch = Batch()
                batch.src = src
                batch.tgt = None
                batch.mask_src = mask_src
                batch.mask_tgt = None
                batch.segs = segs
                batch.src_str = [[sent.replace('[SEP]', '').strip() for sent in x.split('[CLS]')]]
                batch.tgt_str = [y]
                batch.clss = clss
                batch.mask_cls = mask_cls
                batch.batch_size = 1
                yield batch
def train_abs_single(args, device_id):
    init_logger(args.log_file)
    logger.info(str(args))
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    logger.info('Device ID %d' % device_id)
    logger.info('Device %s' % device)

    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    if device_id >= 0:
        torch.cuda.set_device(device_id)
        torch.cuda.manual_seed(args.seed)

    if args.train_from != '':
        logger.info('Loading checkpoint from %s' % args.train_from)
        checkpoint = torch.load(args.train_from, map_location=lambda storage, loc: storage)
        opt = vars(checkpoint['opt'])
        for k in opt.keys():
            if k in model_flags:
                setattr(args, k, opt[k])
    else:
        checkpoint = None

    if args.load_from_extractive != '':
        logger.info('Loading bert from extractive model %s' % args.load_from_extractive)
        bert_from_extractive = torch.load(args.load_from_extractive,
                                          map_location=lambda storage, loc: storage)
        bert_from_extractive = bert_from_extractive['model']
    else:
        bert_from_extractive = None

    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    def train_iter_fct():
        return data_loader.Dataloader(args, load_dataset(args, 'train', shuffle=True),
                                      args.batch_size, device,
                                      shuffle=True, is_test=False)

    model = AbsSummarizer(args, device, checkpoint, bert_from_extractive)
    if args.sep_optim:
        optim_bert = model_builder.build_optim_bert(args, model, checkpoint)
        optim_dec = model_builder.build_optim_dec(args, model, checkpoint)
        optim = [optim_bert, optim_dec]
    else:
        optim = [model_builder.build_optim(args, model, checkpoint)]
    logger.info(model)

    # tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False, cache_dir=args.temp_dir)
    # tokenizer = BertTokenizer.from_pretrained('hubert-wiki', do_lower_case=False, cache_dir=None)
    # tokenizer = BertTokenizer.from_pretrained('hubert-web', do_lower_case=False, cache_dir=None)
    tokenizer = BertTokenizer.from_pretrained('libert-large', do_lower_case=False, cache_dir=None)
    symbols = {
        'BOS': tokenizer.vocab['[unused5]'],
        'EOS': tokenizer.vocab['[unused1]'],
        'PAD': tokenizer.vocab['[PAD]'],
        'EOQ': tokenizer.vocab['[unused2]']
    }

    train_loss = abs_loss(model.generator, symbols, model.vocab_size, device, train=True,
                          label_smoothing=args.label_smoothing)

    trainer = build_trainer(args, device_id, model, optim, train_loss)
    trainer.train(train_iter_fct, args.train_steps)
def load_text(args, source_fp, target_fp, device):
    from others.tokenization import BertTokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    sep_vid = tokenizer.vocab['[SEP]']
    cls_vid = tokenizer.vocab['[CLS]']
    n_lines = len(open(source_fp).read().split('\n'))

    # Preprocess the raw text following the paper.
    def _process_src(raw):
        raw = raw.strip().lower()
        raw = raw.replace('[cls]', '[CLS]').replace('[sep]', '[SEP]')  # restore the markers to upper case
        src_subtokens = tokenizer.tokenize(raw)
        src_subtokens = ['[CLS]'] + src_subtokens + ['[SEP]']  # add leading/trailing tokens
        src_subtoken_idxs = tokenizer.convert_tokens_to_ids(src_subtokens)
        src_subtoken_idxs = src_subtoken_idxs[:-1][:args.max_pos]
        src_subtoken_idxs[-1] = sep_vid
        _segs = [-1] + [i for i, t in enumerate(src_subtoken_idxs) if t == sep_vid]
        segs = [_segs[i] - _segs[i - 1] for i in range(1, len(_segs))]
        segments_ids = []  # segment encoding: just an alternating 0/1 vector
        segs = segs[:args.max_pos]
        for i, s in enumerate(segs):
            if i % 2 == 0:
                segments_ids += s * [0]
            else:
                segments_ids += s * [1]
        src = torch.tensor(src_subtoken_idxs)[None, :].to(device)
        mask_src = (1 - (src == 0).float()).to(device)  # mask out padding
        cls_ids = [[i for i, t in enumerate(src_subtoken_idxs) if t == cls_vid]]  # sentence positions ([CLS] indices)
        clss = torch.tensor(cls_ids).to(device)
        mask_cls = 1 - (clss == -1).float()
        clss[clss == -1] = 0
        return src, mask_src, segments_ids, clss, mask_cls

    if target_fp == '':
        with open(source_fp) as source:
            for x in tqdm(source, total=n_lines):
                src, mask_src, segments_ids, clss, mask_cls = _process_src(x)
                segs = torch.tensor(segments_ids)[None, :].to(device)
                batch = Batch()
                batch.src = src
                batch.tgt = None
                batch.mask_src = mask_src
                batch.mask_tgt = None
                batch.segs = segs
                batch.src_str = [[sent.replace('[SEP]', '').strip() for sent in x.split('[CLS]')]]
                batch.tgt_str = ['']
                batch.clss = clss
                batch.mask_cls = mask_cls
                batch.batch_size = 1
                yield batch
    else:
        with open(source_fp) as source, open(target_fp) as target:
            for x, y in tqdm(zip(source, target), total=n_lines):
                x = x.strip()
                y = y.strip()
                y = ' '.join(y.split())
                src, mask_src, segments_ids, clss, mask_cls = _process_src(x)
                segs = torch.tensor(segments_ids)[None, :].to(device)
                batch = Batch()
                batch.src = src
                batch.tgt = None
                batch.mask_src = mask_src
                batch.mask_tgt = None
                batch.segs = segs
                batch.src_str = [[sent.replace('[SEP]', '').strip() for sent in x.split('[CLS]')]]
                batch.tgt_str = [y]
                batch.clss = clss
                batch.mask_cls = mask_cls
                batch.batch_size = 1
                yield batch
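# Toy illustration (not from the repos above) of the segment-id logic used in the
# _process_src helpers in this section, shown without any tokenizer. Token id values
# are the bert-base-uncased ids for [CLS] (101) and [SEP] (102); the content ids are made up.
src_subtoken_idxs = [101, 7, 8, 9, 102, 101, 5, 6, 7, 102]
sep_vid = 102

_segs = [-1] + [i for i, t in enumerate(src_subtoken_idxs) if t == sep_vid]  # [-1, 4, 9]
segs = [_segs[i] - _segs[i - 1] for i in range(1, len(_segs))]               # [5, 5] = sentence lengths
segments_ids = []
for i, s in enumerate(segs):
    segments_ids += s * [0] if i % 2 == 0 else s * [1]
print(segments_ids)  # [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] -> consecutive sentences alternate segment ids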
# coding=utf8
import sys
sys.path.append('../src/')
from others.tokenization import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
use_bert_basic_tokenizer = False


def build_dict(input_file, tag):
    tokens = {'[PAD]': 0, '[SEP]': 1, '[CLS]': 2, '[UNK]': 3,
              '[unused0]': 4, '[unused1]': 5, '[unused2]': 6}
    for line in open(input_file):
        sentences = line.strip().split('\t')
        if tag == 'src':
            sentences = sentences[:-1]
        for sent in sentences:
            for tok in sent.split(' '):
                tok = tok.lower()
                if tok not in tokens:
                    tokens[tok] = len(tokens)
    return tokens


def build_dict_bert(input_file, tag):
    tokens = {'[PAD]': 0, '[SEP]': 1, '[CLS]': 2, '[UNK]': 3,
              '[unused0]': 4, '[unused1]': 5, '[unused2]': 6}
    for line in open(input_file):
        sentences = line.strip().split('\t')
        if tag == 'src':
            sentences = sentences[:-1]
        for sent in sentences:
            sent = sent.lower()
            sub_tokens = tokenizer.tokenize(sent, use_bert_basic_tokenizer=use_bert_basic_tokenizer)