def test_tokenization_bert(self):
    # Given
    self.base_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                                        do_lower_case=True,
                                                        cache_dir=self.test_dir)
    self.rust_tokenizer = PyBertTokenizer(
        get_from_cache(self.base_tokenizer.pretrained_vocab_files_map['vocab_file']['bert-base-uncased']),
        do_lower_case=True)
    output_baseline = []
    for example in self.examples:
        output_baseline.append(self.base_tokenizer.encode_plus(example.text_a,
                                                               text_pair=example.text_b,
                                                               add_special_tokens=True,
                                                               return_overflowing_tokens=True,
                                                               return_special_tokens_mask=True,
                                                               max_length=128))

    # When
    output_rust = self.rust_tokenizer.encode_pair_list(
        [(example.text_a, example.text_b) for example in self.examples],
        max_len=128,
        truncation_strategy='longest_first',
        stride=0)

    # Then
    for idx, (rust, baseline) in enumerate(zip(output_rust, output_baseline)):
        assert rust.token_ids == baseline['input_ids'], \
            f'Difference in tokenization for {self.rust_tokenizer.__class__}: \n' \
            f'Sentence a: {self.examples[idx].text_a} \n' \
            f'Sentence b: {self.examples[idx].text_b} \n' \
            f'Token mismatch: {self.get_token_diff(rust.token_ids, baseline["input_ids"])} \n' \
            f'Rust: {rust.token_ids} \n' \
            f'Python: {baseline["input_ids"]}'
        assert rust.segment_ids == baseline['token_type_ids']
        assert rust.special_tokens_mask == baseline['special_tokens_mask']
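# --- Added illustration (not from the original source). A minimal sketch of the
# baseline call whose output the parity test above checks against; assumes a
# transformers version that still exposes `encode_plus`.
from transformers import BertTokenizer

tok = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
enc = tok.encode_plus('first sentence', text_pair='second sentence',
                      add_special_tokens=True,
                      return_special_tokens_mask=True,
                      max_length=128)
# `enc` carries the three fields compared above: 'input_ids',
# 'token_type_ids' and 'special_tokens_mask'.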
def setup_class(self):
    self.use_gpu = torch.cuda.is_available()
    self.test_dir = Path(tempfile.mkdtemp())
    self.base_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                                        do_lower_case=True,
                                                        cache_dir=self.test_dir)
    self.rust_tokenizer = PyBertTokenizer(
        get_from_cache(self.base_tokenizer.pretrained_vocab_files_map['vocab_file']['bert-base-uncased']))
    self.model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                               output_attentions=False).eval()
    if self.use_gpu:
        self.model.cuda()
    self.sentence_list = ['For instance, on the planet Earth, man had always assumed that he was more intelligent '
                          'than dolphins because he had achieved so much—the wheel, New York, wars and so on—whilst'
                          ' all the dolphins had ever done was muck about in the water having a good time. But '
                          'conversely, the dolphins had always believed that they were far more intelligent than '
                          'man—for precisely the same reasons.'] * 64

    # Pre-allocate GPU memory
    tokens_list = [self.base_tokenizer.tokenize(sentence) for sentence in self.sentence_list]
    features = [self.base_tokenizer.convert_tokens_to_ids(tokens) for tokens in tokens_list]
    features = [self.base_tokenizer.prepare_for_model(token_ids, None, add_special_tokens=True, max_length=128)
                for token_ids in features]
    all_input_ids = torch.tensor([f['input_ids'] for f in features], dtype=torch.long)

    if self.use_gpu:
        all_input_ids = all_input_ids.cuda()

    with torch.no_grad():
        _ = self.model(all_input_ids)[0].cpu().numpy()
def __init__(
    self,
    pretrained_model: str,
    use_starting_offsets: bool = False,
    do_lowercase: bool = True,
    never_lowercase: List[str] = None,
    max_pieces: int = 512,
    truncate_long_sequences: bool = True,
) -> None:
    if pretrained_model.endswith("-cased") and do_lowercase:
        logger.warning("Your BERT model appears to be cased, "
                       "but your indexer is lowercasing tokens.")
    elif pretrained_model.endswith("-uncased") and not do_lowercase:
        logger.warning("Your BERT model appears to be uncased, "
                       "but your indexer is not lowercasing tokens.")

    bert_tokenizer = BertTokenizer.from_pretrained(pretrained_model, do_lower_case=do_lowercase)
    super().__init__(
        vocab=bert_tokenizer.vocab,
        wordpiece_tokenizer=bert_tokenizer.wordpiece_tokenizer.tokenize,
        namespace="bert",
        use_starting_offsets=use_starting_offsets,
        max_pieces=max_pieces,
        do_lowercase=do_lowercase,
        never_lowercase=never_lowercase,
        start_tokens=["[CLS]"],
        end_tokens=["[SEP]"],
        separator_token="[SEP]",
        truncate_long_sequences=truncate_long_sequences,
    )
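# --- Added illustration (not from the original source). The cased/uncased
# sanity check above, pulled out as a standalone helper; assumes model names
# follow the usual '-cased'/'-uncased' suffix convention.
def case_settings_consistent(pretrained_model: str, do_lowercase: bool) -> bool:
    # A cased model should not be paired with a lowercasing indexer,
    # and an uncased model should be.
    if pretrained_model.endswith("-cased") and do_lowercase:
        return False
    if pretrained_model.endswith("-uncased") and not do_lowercase:
        return False
    return True

# e.g. case_settings_consistent("bert-base-uncased", True) -> True
#      case_settings_consistent("bert-base-cased", True)   -> False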
def __init__(self, args, device='cpu'):
    print(args.bert_model)
    self.tokenizer = BertTokenizer.from_pretrained(args.bert_model)
    self.data_dir = args.data_dir
    file_list = get_json_file_list(args.data_dir)
    self.data = []
    self.shortt = 0
    self.longg = 0
    # max_article_len = 0
    for file_name in file_list:
        data = json.loads(open(file_name, 'r').read())
        data['high'] = 0
        if 'high' in file_name:
            data['high'] = 1
        self.data.append(data)
        # max_article_len = max(max_article_len, len(nltk.word_tokenize(data['article'])))

    self.data_objs = []
    high_cnt = 0
    middle_cnt = 0
    for sample in self.data:
        high_cnt += sample['high']
        middle_cnt += (1 - sample['high'])
        self.data_objs += self._create_sample(sample)
        # print(self.data_objs[-1].ph)
        # break
    print('high school sample:', high_cnt)
    print('middle school sample:', middle_cnt)
    print('<512:', self.shortt)
    print('>512:', self.longg)

    for i in range(len(self.data_objs)):
        self.data_objs[i].convert_tokens_to_ids(self.tokenizer)
        # break
    torch.save(self.data_objs, args.save_name)
def __init__(self) -> None:
    os.environ['CORENLP_HOME'] = '{}/stanford-corenlp-full-2018-10-05'.format(os.environ['HOME'])
    self.client: CoreNLPClient = CoreNLPClient()
    self.client.ensure_alive()

    self.do_lower_case = '-cased' not in config.bert_model
    self.basic_tokenizer: BasicTokenizer = \
        BertTokenizer.from_pretrained(config.bert_model,
                                      do_lower_case=self.do_lower_case).basic_tokenizer
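# --- Added illustration (not from the original source). Reusing only the
# rule-based `basic_tokenizer` of a full BERT tokenizer, as done above; the
# model name and sample sentence are illustrative.
from transformers import BertTokenizer

basic = BertTokenizer.from_pretrained('bert-base-uncased',
                                      do_lower_case=True).basic_tokenizer
print(basic.tokenize("Don't panic!"))  # expected: ['don', "'", 't', 'panic', '!']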
def __init__(
    self,
    mappings: dict = None,
    instance_conll_file: str = None,
    debug: bool = None,
    singleton_replacement_ratio: float = 0.0,
    bert_use: bool = False,
    bert_voc_dir: str = None,
    bert_lowercase: bool = False,
    pretrained_use: bool = False,
    char_use: bool = False,
    elmo_use: bool = False,
):
    self.mappings = mappings
    self.instance_conll_file = instance_conll_file
    self.debug = debug
    self.singleton_replacement_ratio = singleton_replacement_ratio

    self.bert_use = bert_use
    self.bert_voc_dir = bert_voc_dir
    self.bert_lowercase = bert_lowercase
    self.bert_tokenizer = None

    self.pretrained_use = pretrained_use
    self.char_use = char_use
    self.elmo_use = elmo_use

    if self.bert_use:
        self.bert_tokenizer = BertTokenizer.from_pretrained(self.bert_voc_dir,
                                                            do_lower_case=self.bert_lowercase)

    self.singletons = set(self.extract_singletons())
    self.instances = dict()
    self.load_instances()
def main():
    # Parse command-line arguments
    args = parse_argument()

    # Check the random seed
    if args.seed != -1:
        # Fix the seed for all sources of randomness
        random.seed(args.seed)
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)

    # Select the GPU/CPU device for PyTorch
    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    if device.type != "cpu":
        # Set the GPU random seed
        torch.cuda.manual_seed_all(args.seed)

    # Load the tokenizer of the pretrained BERT model
    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=False,
                                              tokenize_chinese_chars=False)

    if args.do_train:
        # Training
        train(args, tokenizer, device)

    if args.do_generate:
        # Generation
        generate(tokenizer, device,
                 max_iter=args.max_iter,
                 length=args.seq_length,
                 model=args.bert_model,
                 fix_word=args.fix_word,
                 samples=args.samples)
def __init__(self, reader):
    task_cfg = reader
    tokenizer = BertTokenizer.from_pretrained(task_cfg.bert_model,
                                              do_lower_case=task_cfg.do_lower_case)
    task_feature_reader1 = {}
    task_feature_reader2 = {}
    self.task = []
    self._limit_sample_nums = task_cfg.get('limit_nums', None)
    is_train = task_cfg.get('is_train', False)
    ids = task_cfg.tasks.split('-')
    for i, task_id in enumerate(ids):
        task = 'TASK' + task_id
        self.task.append(task)
        cfg = task_cfg.TASKS[task]
        if cfg.features_h5path1 not in task_feature_reader1:
            task_feature_reader1[cfg.features_h5path1] = None
        if cfg.features_h5path2 not in task_feature_reader2:
            task_feature_reader2[cfg.features_h5path2] = None

    # initialize the feature readers
    for features_h5path in task_feature_reader1.keys():
        if features_h5path != '':
            task_feature_reader1[features_h5path] = ImageFeaturesH5Reader(features_h5path,
                                                                          task_cfg.in_memory)
    for features_h5path in task_feature_reader2.keys():
        if features_h5path != '':
            task_feature_reader2[features_h5path] = ImageFeaturesH5Reader(features_h5path,
                                                                          task_cfg.in_memory)

    self.task_datasets = {}
    # only one task now
    for i, task_id in enumerate(ids):
        task = 'TASK' + task_id
        cfg = task_cfg.TASKS[task]
        task_name = cfg.name
        if is_train:
            split = cfg.train_split
            annotations_jsonpath = cfg.train_annotations_jsonpath
        else:
            split = cfg.val_split
            annotations_jsonpath = cfg.val_annotations_jsonpath
        self.task_datasets[task] = DatasetMapTrain[task_name](
            task=cfg.name,
            dataroot=cfg.dataroot,
            annotations_jsonpath=annotations_jsonpath,
            split=split,
            image_features_reader=task_feature_reader1[cfg.features_h5path1],
            gt_image_features_reader=task_feature_reader2[cfg.features_h5path2],
            tokenizer=tokenizer,
            bert_model=task_cfg.bert_model,
            clean_datasets=task_cfg.clean_datasets,
            padding_index=0,
            max_seq_length=cfg.max_seq_length,
            max_region_num=cfg.max_region_num)
def test_tokenization_bert(self):
    # Given
    self.base_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                                        do_lower_case=True,
                                                        cache_dir=self.test_dir)
    self.rust_tokenizer = PyBertTokenizer(
        get_from_cache(self.base_tokenizer.pretrained_vocab_files_map['vocab_file']['bert-base-uncased']))
    output_baseline = []
    for example in self.examples:
        output_baseline.append(self.base_tokenizer.encode_plus(example.text_a,
                                                               add_special_tokens=True,
                                                               return_overflowing_tokens=True,
                                                               return_special_tokens_mask=True,
                                                               max_length=128))

    # When
    output_rust = self.rust_tokenizer.encode_list([example.text_a for example in self.examples],
                                                  max_len=128,
                                                  truncation_strategy='longest_first',
                                                  stride=0)

    # Then
    for rust, baseline in zip(output_rust, output_baseline):
        assert rust.token_ids == baseline['input_ids']
        assert rust.segment_ids == baseline['token_type_ids']
        assert rust.special_tokens_mask == baseline['special_tokens_mask']
def main():
    # truncate the output csv and write the header
    with open('./output/tachikoma_out.csv', 'w') as fd:
        fd.truncate()
        fd.write('tweet\n')

    args = parse_argument()
    if args.seed != -1:
        random.seed(args.seed)
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    if device.type != "cpu":
        torch.cuda.manual_seed_all(args.seed)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=False,
                                              tokenize_chinese_chars=False)

    if args.do_train:
        train(args, tokenizer, device)

    if args.do_generate:
        generate(tokenizer, device,
                 max_iter=args.max_iter,
                 length=args.seq_length,
                 model=args.bert_model,
                 fix_word=args.fix_word,
                 samples=args.samples)
def load_model(model_name, data_dir):
    processors = {"rte": RteProcessor}
    output_modes = {"rte": "classification"}

    # task_name = args.task_name.lower()
    task_name = 'rte'
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    output_mode = output_modes[task_name]

    label_list = processor.get_labels()  # [0, 1]
    num_labels = len(label_list)

    pretrain_model_dir = '{}/FineTuneOn{}'.format(data_dir, model_name)
    # pretrain_model_dir = 'please enter your pretrain models path here/FineTuneOn{}'.format(model_name)

    # Prepare model
    # cache_dir = os.path.join(str(PYTORCH_TRANSFORMERS_CACHE), '{} model distributed_{}'.format(model_name, -1))
    model = BertForSequenceClassification.from_pretrained(pretrain_model_dir, num_labels=num_labels)
    tokenizer = BertTokenizer.from_pretrained(pretrain_model_dir)
    # model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
    #                                                       cache_dir=cache_dir,
    #                                                       num_labels=num_labels)
    # tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    # print(tokenizer)
    return model, tokenizer
def load(cls, pretrained_model_name_or_path, tokenizer_class=None, **kwargs):
    """
    Enables loading of different Tokenizer classes with a uniform interface.
    Either infer the class from `pretrained_model_name_or_path` or define it
    manually via `tokenizer_class`.

    :param pretrained_model_name_or_path: The path of the saved pretrained model or its name (e.g. `bert-base-uncased`)
    :type pretrained_model_name_or_path: str
    :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`)
    :type tokenizer_class: str
    :param kwargs:
    :return: Tokenizer
    """
    pretrained_model_name_or_path = str(pretrained_model_name_or_path)

    # guess tokenizer type from name
    if tokenizer_class is None:
        if "albert" in pretrained_model_name_or_path.lower():
            tokenizer_class = "AlbertTokenizer"
        elif "xlm-roberta" in pretrained_model_name_or_path.lower():
            tokenizer_class = "XLMRobertaTokenizer"
        elif "roberta" in pretrained_model_name_or_path.lower():
            tokenizer_class = "RobertaTokenizer"
        elif "distilbert" in pretrained_model_name_or_path.lower():
            tokenizer_class = "DistilBertTokenizer"
        elif "bert" in pretrained_model_name_or_path.lower():
            tokenizer_class = "BertTokenizer"
        elif "xlnet" in pretrained_model_name_or_path.lower():
            tokenizer_class = "XLNetTokenizer"
        else:
            raise ValueError(
                f"Could not infer tokenizer_class from name '{pretrained_model_name_or_path}'. "
                f"Set arg `tokenizer_class` in Tokenizer.load() to one of: 'AlbertTokenizer', "
                f"'XLMRobertaTokenizer', 'RobertaTokenizer', 'DistilBertTokenizer', "
                f"'BertTokenizer', 'XLNetTokenizer'."
            )
    logger.info(f"Loading tokenizer of type '{tokenizer_class}'")

    # return the appropriate tokenizer object
    ret = None
    if tokenizer_class == "AlbertTokenizer":
        ret = AlbertTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
    elif tokenizer_class == "XLMRobertaTokenizer":
        ret = XLMRobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "RobertaTokenizer":
        ret = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "DistilBertTokenizer":
        ret = DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "BertTokenizer":
        ret = BertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "XLNetTokenizer":
        ret = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
    if ret is None:
        raise Exception("Unable to load tokenizer")
    return ret
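# --- Added illustration (not from the original source). Usage sketch for the
# uniform loader above; assumes it is defined as a classmethod on a `Tokenizer`
# facade class, as the `cls` parameter suggests, and the model name is
# illustrative.
tokenizer = Tokenizer.load("bert-base-uncased", do_lower_case=True)
print(type(tokenizer).__name__)  # 'BertTokenizer', inferred from the name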
def main():
    args = parse_argument()
    if args.seed != -1:
        random.seed(args.seed)
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    if device.type != "cpu":
        torch.cuda.manual_seed_all(args.seed)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=False,
                                              tokenize_chinese_chars=False)

    if args.do_train:
        train(args, tokenizer, device)

    if args.do_generate:
        generate(tokenizer, device,
                 max_iter=args.max_iter,
                 length=args.seq_length,
                 model=args.bert_model,
                 fix_word=args.fix_word,
                 samples=args.samples)
def main():
    parser = ArgumentParser()
    parser.add_argument('--train_corpus', type=Path, required=False)
    parser.add_argument("--output_dir", type=Path, required=False)
    parser.add_argument("--bert_model", type=str, required=False, default=BERT_PRETRAINED_MODEL,
                        choices=["bert-base-uncased", "bert-large-uncased", "bert-base-cased",
                                 "bert-base-multilingual-uncased", "bert-base-chinese",
                                 "bert-base-multilingual-cased"])
    parser.add_argument("--do_lower_case", action="store_true")
    parser.add_argument("--do_whole_word_mask", action="store_true",
                        help="Whether to use whole word masking rather than per-WordPiece masking.")
    parser.add_argument("--reduce_memory", action="store_true",
                        help="Reduce memory usage for large datasets by keeping data on disc rather than in memory")
    parser.add_argument("--num_workers", type=int, default=NUM_CPU,
                        help="The number of workers to use to write the files")
    parser.add_argument("--epochs_to_generate", type=int, default=EPOCHS,
                        help="Number of epochs of data to pregenerate")
    parser.add_argument("--max_seq_len", type=int, default=MAX_SENTIMENT_SEQ_LENGTH)
    parser.add_argument("--short_seq_prob", type=float, default=0.1,
                        help="Probability of making a short sentence as a training example")
    parser.add_argument("--masked_lm_prob", type=float, default=MLM_PROB,
                        help="Probability of masking each token for the LM task")
    parser.add_argument("--max_predictions_per_seq", type=int, default=MAX_PRED_PER_SEQ,
                        help="Maximum number of tokens to mask in each sequence")
    parser.add_argument("--masking_method", type=str, default="double_num_adj",
                        choices=("mlm_prob", "double_num_adj"),
                        help="Method of determining num masked tokens in sentence")

    args = parser.parse_args()

    if args.num_workers > 1 and args.reduce_memory:
        raise ValueError("Cannot use multiple workers while reducing memory")

    tokenizer = BertTokenizer.from_pretrained(BERT_PRETRAINED_MODEL,
                                              do_lower_case=bool(BERT_PRETRAINED_MODEL.endswith("uncased")))
    generate_data_for_treatment(tokenizer, args)
def __init__(
    self,
    token_indexers: Dict[str, TokenIndexer] = None,
    domain_identifier: str = None,
    bert_model_name: str = None,
    **kwargs,
) -> None:
    super().__init__(**kwargs)
    if token_indexers is not None:
        self._token_indexers = token_indexers
    elif bert_model_name is not None:
        from allennlp.data.token_indexers import PretrainedTransformerIndexer
        self._token_indexers = {"tokens": PretrainedTransformerIndexer(bert_model_name)}
    else:
        self._token_indexers = {"tokens": SingleIdTokenIndexer()}
    self._domain_identifier = domain_identifier

    if bert_model_name is not None:
        self.bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name)
        self.lowercase_input = "uncased" in bert_model_name
    else:
        self.bert_tokenizer = None
        self.lowercase_input = False
def generate_data_for_domain(args, domain):
    tokenizer = BertTokenizer.from_pretrained(BERT_PRETRAINED_MODEL,
                                              do_lower_case=bool(BERT_PRETRAINED_MODEL.endswith("uncased")))
    vocab_list = list(tokenizer.vocab.keys())
    with open(SENTIMENT_TOPICS_DOMAIN_TREAT_CONTROL_MAP_FILE, "r") as jsonfile:
        domain_topic_treat_dict = json.load(jsonfile)
    treatment_topic = domain_topic_treat_dict[domain]["treated_topic"]
    control_topic = domain_topic_treat_dict[domain]["control_topics"][-1]
    treatment_column = f"{treatment_topic}_bin"
    control_column = f"{control_topic}_bin"
    with DocumentDatabase(reduce_memory=args.reduce_memory) as docs:
        print(f"\nGenerating data for domain: {domain}")
        output_dir = Path(SENTIMENT_TOPICS_PRETRAIN_DATA_DIR) / domain
        output_dir.mkdir(exist_ok=True, parents=True)
        unique_ids, reviews, treatment_labels, control_labels = list(), list(), list(), list()
        for dataset in ("train", "dev"):
            DATASET_FILE = f"{SENTIMENT_TOPICS_DATASETS_DIR}/topics_{dataset}.csv"
            df = pd.read_csv(DATASET_FILE, header=0, encoding='utf-8',
                             usecols=["id", "review", treatment_column, control_column]
                             ).set_index(keys="id", drop=False).sort_index()
            df = df[df[treatment_column].notnull()]
            unique_ids += df["id"].astype(int).tolist()
            reviews += df["review"].apply(tokenizer.tokenize).tolist()
            treatment_labels += df[treatment_column].astype(int).tolist()
            control_labels += df[control_column].astype(int).tolist()
        for unique_id, doc, treatment_label, control_label in tqdm(
                zip(unique_ids, reviews, treatment_labels, control_labels)):
            if doc:
                # If the last doc didn't end on a newline, make sure it still gets added
                docs.add_document(unique_id, doc, treatment_label, control_label)
        if len(docs) <= 1:
            exit("ERROR: No document breaks were found in the input file! These are necessary to allow the script to "
                 "ensure that random NextSentences are not sampled from the same document. Please add blank lines to "
                 "indicate breaks between documents in your input file. If your dataset does not contain multiple "
                 "documents, blank lines can be inserted at any natural boundary, such as the ends of chapters, "
                 "sections or paragraphs.")

        if args.num_workers > 1:
            writer_workers = Pool(min(args.num_workers, args.epochs_to_generate))
            arguments = [(docs, vocab_list, args, idx, output_dir) for idx in range(args.epochs_to_generate)]
            writer_workers.starmap(create_training_file, arguments)
        else:
            for epoch in trange(args.epochs_to_generate, desc="Epoch"):
                create_training_file(docs, vocab_list, args, epoch, output_dir)
def __init__(self, num_classes, model_name='bert-base-uncased'):
    self.num_classes = num_classes
    self.tokenizer = BertTokenizer.from_pretrained(model_name)
    self.model = TFBertForSequenceClassification.from_pretrained(model_name,
                                                                 num_labels=self.num_classes)
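# --- Added illustration (not from the original source). The same tokenizer and
# TF model used directly, without the wrapper; the output layout varies across
# transformers versions (older releases return a tuple, newer ones expose
# `.logits`).
from transformers import BertTokenizer, TFBertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
inputs = tokenizer.encode_plus('A short example sentence.', return_tensors='tf')
outputs = model(inputs)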
def __init__(self):
    # Load the pretrained tokenizer and model published by Google
    self.tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased",
                                                   do_lower_case=False)
    self.model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased",
                                                               num_labels=2)
    # Load the model fine-tuned on Google Colab
    self.model.load_state_dict(torch.load("bert_evaluator.bin", map_location='cpu'))
def init_tokens(self):
    # self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    self.tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path=self.tokenizer_path)
    tokens = ['[CLS]', '[MASK]', '[SEP]']
    indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokens)
    self.CLS = indexed_tokens[0]
    self.MASK = indexed_tokens[1]
    self.SEP = indexed_tokens[2]
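# --- Added illustration (not from the original source). For the standard
# 'bert-base-uncased' vocabulary, the special-token ids resolved above are
# fixed values:
from transformers import BertTokenizer

tok = BertTokenizer.from_pretrained('bert-base-uncased')
print(tok.convert_tokens_to_ids(['[CLS]', '[MASK]', '[SEP]']))  # [101, 103, 102]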
def __init__(self, n_kws=15):
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    self.n_kws = n_kws
    self.bert_w2i = {w: i for i, w in enumerate(self.tokenizer.vocab)}
    self.bert_vocab = self.tokenizer.vocab
    # self.dataset = h5py.File("/home/phillab/data/headliner_6M.hdf5")
    # self.dset = self.dataset['name']
    self.keyworder = None
    self.i2w = None
def __init__(self, device, model_file=None):
    self.model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    self.tokenizer.max_len = 10000
    self.model.to(device)
    self.device = device
    if model_file is not None:
        self.reload_model(model_file)
def __init__(self, bert_model: str) -> None:
    self.bert_tokenizer: BertTokenizer = BertTokenizer.from_pretrained(
        bert_model, do_lower_case='-cased' not in bert_model)
    self.subword_alphabet: Optional[Alphabet] = None
    self.label_alphabet: Optional[Alphabet] = None

    self.train: Optional[List[SentInst]] = None
    self.dev: Optional[List[SentInst]] = None
    self.test: Optional[List[SentInst]] = None
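# --- Added illustration (not from the original source). The `do_lower_case`
# heuristic above in isolation; note that '-cased' is not a substring of
# '-uncased', so uncased models correctly get lowercasing enabled.
for name in ('bert-base-cased', 'bert-base-uncased', 'bert-base-multilingual-cased'):
    print(name, '-cased' not in name)
# bert-base-cased False
# bert-base-uncased True
# bert-base-multilingual-cased False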
def process(self, data_bundle):
    """
    The input consists of word/definition pairs, e.g.

        测试    这是 一个 测试

    :param data_bundle:
    :return:
    """
    tokenizer = BertTokenizer.from_pretrained(self.bert_name)
    tokenizer.do_basic_tokenize = True
    return _prepare_data_bundle(tokenizer, data_bundle, self.max_word_len)
def __init__(self, file_path, tag2idx, tokenizer_path='', do_lower_case=True):
    self.tag2idx = tag2idx
    self.tokenizer = BertTokenizer.from_pretrained(tokenizer_path, do_lower_case=do_lower_case)
    self._file_path = file_path
    with open(file_path, 'r', encoding='utf-8') as fp:
        self._lines_count = len(fp.readlines())
def load_model(self):
    self.tokenizer = BertTokenizer.from_pretrained(self.args.pretrained_path,
                                                   do_lower_case=self.args.do_lower_case)
    self.config = BertConfig.from_pretrained(self.args.pretrained_path,
                                             num_labels=self.args.num_labels)
    if self.args.resume_model:
        self.model = BertForMultiLable.from_pretrained(self.args.resume_model_path, config=self.config)
        with open(self.threshold_path, 'r') as f:
            self.threshold = float(f.read())  # read the best model's threshold
    else:
        self.model = BertForMultiLable.from_pretrained(self.args.pretrained_path, config=self.config)
    if self.args.cuda:
        self.model.cuda()
        if self.args.n_gpus > 1:
            self.model = DataParallel(self.model)
def __init__(self, method):
    if "gpt" in method:
        from transformers.tokenization_gpt2 import GPT2Tokenizer
        self.tokenizer = GPT2Tokenizer.from_pretrained(method)
    elif "bert" in method:
        from transformers.tokenization_bert import BertTokenizer
        self.tokenizer = BertTokenizer.from_pretrained(method)
    else:
        raise ValueError('`method` has invalid value {}; it should contain "gpt" or "bert"'.format(method))
    self._tokenizer_class_name = self.tokenizer.__class__.__name__
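# --- Added illustration (not from the original source). Usage sketch; the
# enclosing class is not shown above, so `MethodTokenizer` is a stand-in name.
wrapper = MethodTokenizer('bert-base-uncased')
print(wrapper._tokenizer_class_name)  # 'BertTokenizer'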
def __init__(self, task, word2index, config):
    # self.transform = transform  # Torch operations on the input image
    # self.target_transform = target_transform
    self.task = task
    self.config = config
    # self.split = split
    self.word2index = word2index
    self.max_len = int(config["data"]["window"])
    # self.image_roots = self.task.train_roots if self.split == 'train' else self.task.test_roots
    # self.labels = self.task.train_labels if self.split == 'train' else self.task.test_labels
    self.image_roots = self.task.train_roots + self.task.test_roots
    self.labels = self.task.train_labels + self.task.test_labels
    self.tokenizer = BertTokenizer.from_pretrained(config['data']['pretrain_path'])
def __init__(self, data_path: str, treatment: str, subset: str, text_column: str, label_column: str,
             bert_pretrained_model: str = BERT_PRETRAINED_MODEL,
             max_seq_length: int = MAX_SENTIMENT_SEQ_LENGTH):
    super().__init__()
    if subset not in ("train", "dev", "test", "train_debug", "dev_debug", "test_debug"):
        raise ValueError("subset argument must be one of {train, dev, test} or their _debug variants")
    self.dataset_file = f"{data_path}/{treatment}_{subset}.csv"
    self.subset = subset
    self.text_column = text_column
    self.label_column = label_column
    self.max_seq_length = max_seq_length
    self.tokenizer = BertTokenizer.from_pretrained(bert_pretrained_model,
                                                   do_lower_case=bool(bert_pretrained_model.endswith("uncased")))
    self.dataset = self.preprocessing_pipeline()
def load_embedding_tokenizer(pretrained_model_name_or_path, **kwargs):
    # If the pretrained model points to a file on the deepset S3 bucket, we need
    # to adjust the transformers registries accordingly.
    if pretrained_model_name_or_path in PRETRAINED_INIT_CONFIGURATION:
        BertTokenizer.pretrained_vocab_files_map["vocab_file"].update({
            pretrained_model_name_or_path:
                EMBEDDING_VOCAB_FILES_MAP["vocab_file"].get(pretrained_model_name_or_path, None)})
        BertTokenizer.max_model_input_sizes.update({
            pretrained_model_name_or_path:
                MAX_MODEL_INPU_SIZES.get(pretrained_model_name_or_path, None)})
        BertTokenizer.pretrained_init_configuration.update({
            pretrained_model_name_or_path:
                PRETRAINED_INIT_CONFIGURATION.get(pretrained_model_name_or_path, None)})
    ret = BertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    return ret
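# --- Added illustration (not from the original source). The effect of the
# registry updates above, sketched with a hypothetical model name and URL;
# assumes a transformers version whose class-level maps are still writable.
from transformers import BertTokenizer

BertTokenizer.pretrained_vocab_files_map['vocab_file'].update(
    {'my-custom-model': 'https://example.com/vocab.txt'})  # hypothetical entry
# After this, BertTokenizer.from_pretrained('my-custom-model') resolves its
# vocabulary from the registered location instead of the default hub entries.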
def main(args):
    # For Chinese (Ro)BERT(a), the best results come from RoBERTa-wwm-ext
    # (https://github.com/ymcui/Chinese-BERT-wwm). To fine-tune these models,
    # we have to use the same tokenizer: LTP (https://github.com/HIT-SCIR/ltp).
    with open(args.file_name, "r", encoding="utf-8") as f:
        data = f.readlines()

    ltp_tokenizer = LTP(args.ltp)  # faster on a GPU device
    bert_tokenizer = BertTokenizer.from_pretrained(args.bert)

    ref_ids = prepare_ref(data, ltp_tokenizer, bert_tokenizer)

    with open(args.save_path, "w", encoding="utf-8") as f:
        data = [json.dumps(ref) + "\n" for ref in ref_ids]
        f.writelines(data)