def __init__(self, model_directory: str, predictor_name: str, device="cuda") -> None:
    self.device = device
    self.config = RobertaConfig.from_pretrained(model_directory)

    # Load in model related information
    self._tokenizer = RobertaTokenizerFast.from_pretrained(
        model_directory, add_special_tokens=False)
    self._model = model = RobertaForSequenceClassification.from_pretrained(
        model_directory, config=self.config).to(device)
    self._model.eval()

    # Prepare optimizer, splitting parameters by the usual no-decay convention.
    # NOTE: neither group sets weight_decay, so both fall back to the AdamW
    # default and the split has no effect as written.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ]
        },
    ]
    self._optimizer = AdamW(optimizer_grouped_parameters)
    self._optimizer.load_state_dict(
        torch.load(os.path.join(model_directory, "optimizer.pt")))
def tokenize(args):
    src, tgt = args
    if not os.path.exists(src):
        return
    tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
    print("START", flush=True)
    with open(src, "r", encoding="utf-8") as read_f:
        text = read_f.read()
    print(f"Read {src}", flush=True)
    tokens = tokenizer.tokenize(text)
    print(f"Tokenized {src}", flush=True)
    del text  # free the raw text as soon as it has been tokenized
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    print(f"To Token IDs {src}", flush=True)
    with open(tgt, "wb") as dump_f:
        pickle.dump(token_ids, dump_f)
    print(f"Dump {tgt}", flush=True)
    print("END", flush=True)
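# Hedged usage sketch (not part of the original source): `tokenize` takes a
# single (src, tgt) tuple, which makes it easy to fan out over many files with
# a process pool. The file paths below are illustrative.
from multiprocessing import Pool

if __name__ == "__main__":
    pairs = [
        ("corpus/part0.txt", "corpus/part0.pkl"),
        ("corpus/part1.txt", "corpus/part1.pkl"),
    ]
    with Pool(processes=2) as pool:
        pool.map(tokenize, pairs)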
def build(image_set, args):
    img_dir = Path(args.vg_img_path)

    if image_set == "val":
        # We validate on the minival for efficiency
        image_set = "miniv"
    if image_set == "miniv":
        ann_file = Path(args.phrasecut_ann_path) / "finetune_phrasecut_miniv.json"
        image_set = "val"
    else:
        ann_file = Path(args.phrasecut_ann_path) / f"finetune_phrasecut_{image_set}.json"
    if args.test:
        ann_file = Path(args.phrasecut_ann_path) / "finetune_phrasecut_test.json"

    tokenizer = RobertaTokenizerFast.from_pretrained(args.text_encoder_type)
    dataset = PhrasecutDetection(
        img_dir,
        ann_file,
        transforms=make_coco_transforms(image_set, cautious=True),
        return_masks=args.masks,
        return_tokens=True,  # args.contrastive_align_loss,
        tokenizer=tokenizer,
    )
    return dataset
def get_data(self, data, all_data=None, train=True, smoothing_label_factor=0.4):
    if all_data is None:
        all_data = data
    if train:
        self.tokenizer = RobertaTokenizerFast.from_pretrained('blinoff/roberta-base-russian-v0')
        self.encoder = OneHotEncoder()
        self.encoder.fit(np.array(all_data['class']).reshape(-1, 1))

    X = data['purp'].apply(lambda x: ' '.join(re.findall(r'[\w\d\+]+[\.,]*[\w\d\+]*', x)))
    result = self.tokenizer(list(X), padding='longest')
    tokens = np.array(result['input_ids'])
    attn_mask = np.array(result['attention_mask'])

    if train:
        self.padding_len = tokens.shape[1]
    else:
        tokens = pad_sequences(tokens, padding='post', maxlen=self.padding_len)
        attn_mask = pad_sequences(attn_mask, padding='post', maxlen=self.padding_len)

    y = self.encoder.transform(np.array(data['class']).reshape(-1, 1)).toarray()
    if train:
        y = smooth_labels(y, factor=smoothing_label_factor)
    return tokens, attn_mask, y
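# Hedged usage sketch (assumed, not from the original source): `get_data` is a
# method, so an instance of its enclosing class is required; `preprocessor` is
# a hypothetical name, and the DataFrame columns match what the method reads.
import pandas as pd

train_df = pd.DataFrame({
    "purp": ["payment for rent", "monthly salary transfer"],
    "class": ["rent", "salary"],
})
tokens, attn_mask, y = preprocessor.get_data(train_df, train=True)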
def __init__(self, cfg, device):
    super().__init__()
    tokenizer = RobertaTokenizerFast.from_pretrained('./bird_bpe_vocab', max_len=256)
    _config = RobertaConfig(
        vocab_size=tokenizer._tokenizer.get_vocab_size(),
        hidden_size=512,
        num_hidden_layers=4,
        num_attention_heads=8,
        max_position_embeddings=256,
        pad_token_id=1,
        eos_token_id=0,
        bos_token_id=2,
        output_attentions=False,
        output_hidden_states=False,
    )
    _model = RobertaForMaskedLM(_config)
    _model.load_state_dict(torch.load('bert_small/checkpoint-1100/pytorch_model.bin'))
    _model.eval()

    self.tokenizer = tokenizer
    self._model = _model
    self.device = device
    self.pad_token = 0
    self.batch_size = cfg.batch_size

    self.proj = None
    if cfg.proj_lang:
        self.proj = nn.Sequential(*[
            EqualisedLinearLayer(512, cfg.latent_dim, weight_scaling=cfg.weight_scaling),
            nn.Tanh(),
        ])
def load_mask_predictor(model_name='roberta-large'):
    logger.info(
        "Downloading RoBERTa model from huggingface for masked text prediction")
    model = RobertaForMaskedLM.from_pretrained(model_name)
    tokenizer = RobertaTokenizerFast.from_pretrained(model_name)
    device_number = torch.cuda.current_device() if torch.cuda.is_available() else -1
    predictor = FillMaskPipeline(model=model, tokenizer=tokenizer, device=device_number)

    def _postprocess_mask_prediction_token(text):
        # Strip the leading BPE whitespace marker if present
        return text[1:] if text[0] == "Ġ" else text

    def predict_mask(masked_text: str,
                     options: Optional[List[str]] = None,
                     num_results: int = 1) -> List[Dict[str, Any]]:
        results = predictor(masked_text, targets=options, top_k=num_results)
        parsed_results = []
        for result in results:
            parsed_result = {
                "word": _postprocess_mask_prediction_token(result['token_str']),
                "softmax": result["score"],
            }
            parsed_results.append(parsed_result)
        return parsed_results

    return predict_mask
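# Hedged usage sketch (assumed, not from the original source): the returned
# closure expects RoBERTa's "<mask>" token in the input text.
predict_mask = load_mask_predictor('roberta-large')
for prediction in predict_mask("The capital of France is <mask>.", num_results=3):
    print(prediction["word"], prediction["softmax"])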
def less_than_n_tokens(data, n):
    tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
    tokenizer.add_special_tokens(
        {"additional_special_tokens": [AddedToken('<skip>', lstrip=True),
                                       AddedToken('<no_skip>', lstrip=True)]})
    # splits_ratio = [1, 1, 0]
    splits_ratio = [1]
    splits = []
    for split, ratio in zip([data], splits_ratio):
        text = split['social_assesment'].tolist()[:int(len(split['social_assesment']) * ratio)]
        n_samples = len(text)
        if n_samples == 0:
            continue
        batch_size = 10000
        batch_idx = 0
        while batch_idx * batch_size < n_samples:
            batch_text = text[batch_idx * batch_size:min((batch_idx + 1) * batch_size, n_samples)]
            encoded_texts = tokenizer(batch_text,
                                      return_attention_mask=False,
                                      truncation=False,
                                      padding=False)['input_ids']
            greater_than_n_indices = []
            for text_idx, encoded_text in enumerate(encoded_texts):
                if len(encoded_text) > n:
                    # Offset by the batch start so the label matches the
                    # DataFrame's (assumed default) RangeIndex; the original
                    # used the batch-local index, which is wrong past batch 0.
                    greater_than_n_indices.append(batch_idx * batch_size + text_idx)
            split = split.drop(greater_than_n_indices, axis=0)
            print('batch ' + str(batch_idx) + ' done.')
            batch_idx += 1
        splits.append(split)
    return splits
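# Minimal usage sketch (assumed, not from the original source): a pandas
# DataFrame with the 'social_assesment' column the function reads (column
# name kept exactly as in the original, including its spelling).
import pandas as pd

df = pd.DataFrame({"social_assesment": ["short note", "another short note"]})
filtered = less_than_n_tokens(df, 512)
print(len(filtered[0]))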
def __init__(self, hyper_params: Namespace):
    super().__init__()
    print(hyper_params)
    self.model_name = hyper_params.pretrained_encoder
    self.lower_case = "uncased" in self.model_name
    if self.model_name.startswith("bert"):
        # BERT
        self.tokenizer = BertTokenizerFast.from_pretrained(
            self.model_name, do_lower_case=self.lower_case)
        self.pretrained_model = BertModel.from_pretrained(self.model_name)
    else:
        # RoBERTa
        self.tokenizer = RobertaTokenizerFast.from_pretrained(self.model_name)
        self.pretrained_model = RobertaModel.from_pretrained(self.model_name)
        self.CLS = "<s>"
        self.SEP = "</s>"

    # Add the new tokens to the pretrained vocabulary
    self.new_tags = [
        self.START_ARG, self.END_ARG, self.START_PRED, self.END_PRED
    ]
    n_new_tags = self.tokenizer.add_tokens(self.new_tags)
    assert len(self.new_tags) == n_new_tags, "Couldn't add all the new tokens!"
    self.pretrained_model.resize_token_embeddings(len(self.tokenizer))

    self.max_len = 100  # self.pretrained_model.embeddings.position_embeddings.weight.size(0)
    self.dim = self.pretrained_model.embeddings.position_embeddings.weight.size(1)
def __init__(self, model: str = None, service: str = "sentiment"):
    """
    Constructor for the class that does the sentiment analysis processing in the back end.

    :param model: transformer model that will be used for sentiment analysis
    :param service: string to represent the service; defaults to "sentiment"
    """
    if model is None:
        model = "distilbert"
    # path to all the files that will be used for inference
    self.path = f"./app/api/{model}/"
    # json file for mapping of network output to the correct category
    self.mapping = self.path + "mapping.json"
    self.model_path = self.path + "model.bin"
    # Select the correct model based on the passed model input. Default: distilbert
    if model == "roberta":
        self.model = RobertaClass()
        self.tokenizer = RobertaTokenizerFast.from_pretrained(self.path)
    elif model == "distilbert":
        self.model = DistilBertClass()
        self.tokenizer = DistilBertTokenizerFast.from_pretrained(self.path)
    else:
        self.model = DistilBertClass()
        self.tokenizer = DistilBertTokenizerFast.from_pretrained(self.path)
    self.model.eval()
    self.model.load_state_dict(
        torch.load(self.model_path, map_location=device))
    with open(self.mapping) as f:
        self.config = json.load(f)
def test_multiple_sequences(self):
    tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
    model = FlaxRobertaModel.from_pretrained("roberta-base")

    sequences = ["this is an example sentence", "this is another", "and a third one"]
    encodings = tokenizer(sequences,
                          return_tensors=TensorType.JAX,
                          padding=True,
                          truncation=True)

    @jax.jit
    def model_jitted(input_ids, attention_mask=None, token_type_ids=None):
        return model(input_ids, attention_mask, token_type_ids)

    with self.subTest("JIT Disabled"):
        with jax.disable_jit():
            tokens, pooled = model_jitted(**encodings)
            self.assertEqual(tokens.shape, (3, 7, 768))
            self.assertEqual(pooled.shape, (3, 768))

    with self.subTest("JIT Enabled"):
        jitted_tokens, jitted_pooled = model_jitted(**encodings)
        self.assertEqual(jitted_tokens.shape, (3, 7, 768))
        self.assertEqual(jitted_pooled.shape, (3, 768))
def __init__(self, use_gpu=True, tokenizer=None):
    super().__init__()
    MODEL_NAME = 'iarfmoose/roberta-small-bulgarian-pos'
    if tokenizer:
        self.tokenizer = tokenizer
    else:
        self.tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_NAME)
    self.model = RobertaForTokenClassification.from_pretrained(MODEL_NAME)
    self.model.to(self.device)
    self.tag_to_id = {
        'ADJ': 0, 'ADP': 1, 'PUNCT': 2, 'ADV': 3, 'AUX': 4, 'SYM': 5,
        'INTJ': 6, 'CCONJ': 7, 'X': 8, 'NOUN': 9, 'DET': 10, 'PROPN': 11,
        'NUM': 12, 'VERB': 13, 'PART': 14, 'PRON': 15, 'SCONJ': 16
    }
    self.id_to_tag = {self.tag_to_id[tag]: tag for tag in self.tag_to_id}
def __init__(self, config: Dict, datapoints: List[Datapoint]):
    self.data = []
    base_dir = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
    full_model_output_path = os.path.join(base_dir, config["model_output_path"])
    tokenizer = RobertaTokenizerFast.from_pretrained(
        config["tokenizer_path"],
        cache_dir=full_model_output_path,
        padding_side="right")
    for datapoint in datapoints:
        tokenized = tokenizer(datapoint.statement,
                              padding="max_length",
                              max_length=config["max_seq_len"],
                              truncation=True,
                              return_tensors="np",
                              return_token_type_ids=True,
                              return_attention_mask=True,
                              return_special_tokens_mask=True)
        # Only a single encoding since only a single datapoint was tokenized
        self.data.append({
            "ids": tokenized.data["input_ids"].squeeze(),
            "type_ids": tokenized.data["token_type_ids"].squeeze(),
            "attention_mask": tokenized.data["attention_mask"].squeeze(),
            "special_tokens_mask": tokenized.data["special_tokens_mask"].squeeze(),
            "label": np.array(int(datapoint.label)),
        })
def test_from_pytorch(self):
    with torch.no_grad():
        with self.subTest("roberta-base"):
            tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
            fx_model = FlaxRobertaModel.from_pretrained("roberta-base")
            pt_model = RobertaModel.from_pretrained("roberta-base")

            # Check for simple input
            pt_inputs = tokenizer.encode_plus("This is a simple input",
                                              return_tensors=TensorType.PYTORCH)
            fx_inputs = tokenizer.encode_plus("This is a simple input",
                                              return_tensors=TensorType.JAX)
            pt_outputs = pt_model(**pt_inputs)
            fx_outputs = fx_model(**fx_inputs)

            self.assertEqual(len(fx_outputs), len(pt_outputs),
                             "Output lengths differ between Flax and PyTorch")
            for fx_output, pt_output in zip(fx_outputs, pt_outputs.to_tuple()):
                self.assert_almost_equals(fx_output, pt_output.numpy(), 5e-3)
def get_model_and_tokenizer(args, type='pattern'):
    if type == 'pattern':
        dropout = args.pattern_dropout
    elif type == 'classifier':
        dropout = args.classifier_dropout
    else:
        raise ValueError('"type" argument for "get_model_and_tokenizer" must be '
                         '"pattern" or "classifier", not {}'.format(type))

    model, tokenizer = None, None
    if 'roberta' in args.model_name:
        tokenizer = RobertaTokenizerFast.from_pretrained(args.model_name)
        if args.model_type == 'sequence_classification':
            model = RobertaForSequenceClassification.from_pretrained(
                args.model_name,
                hidden_dropout_prob=dropout,
                attention_probs_dropout_prob=dropout,
                num_labels=args.num_labels)
        elif args.model_type == 'MLM':
            model = CompactRobertaForMaskedLM.from_pretrained(
                args.model_name,
                hidden_dropout_prob=dropout,
                attention_probs_dropout_prob=dropout)
        elif args.model_type == 'soft_label_classification':
            model = RobertaForSoftLabelSequenceClassification.from_pretrained(
                args.model_name,
                hidden_dropout_prob=dropout,
                attention_probs_dropout_prob=dropout,
                num_labels=args.num_labels)

    if model and args.eval:
        model = model.from_pretrained(args.model_dir)
    if model and tokenizer:
        model.resize_token_embeddings(len(tokenizer))
        return model, tokenizer
    raise Exception('no such model: name "{}", type "{}"'.format(args.model_name, args.model_type))
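# Hedged usage sketch (assumed, not from the original source): `args` is an
# argparse.Namespace carrying the fields the function reads; the values below
# are illustrative only.
from argparse import Namespace

args = Namespace(model_name='roberta-base',
                 model_type='sequence_classification',
                 pattern_dropout=0.1,
                 classifier_dropout=0.1,
                 num_labels=2,
                 eval=False,
                 model_dir=None)
model, tokenizer = get_model_and_tokenizer(args, type='classifier')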
def build(image_set, args):
    img_dir = Path(args.flickr_img_path) / f"{image_set}"

    if args.GT_type == "merged":
        identifier = "mergedGT"
    elif args.GT_type == "separate":
        identifier = "separateGT"
    else:
        assert False, f"{args.GT_type} is not a valid type of annotation for flickr"

    if args.test:
        ann_file = Path(args.flickr_ann_path) / f"final_flickr_{identifier}_test.json"
    else:
        ann_file = Path(args.flickr_ann_path) / f"final_flickr_{identifier}_{image_set}.json"

    tokenizer = RobertaTokenizerFast.from_pretrained(args.text_encoder_type)
    dataset = FlickrDetection(
        img_dir,
        ann_file,
        transforms=make_coco_transforms(image_set, cautious=True),
        return_masks=False,
        return_tokens=True,  # args.contrastive_align_loss,
        tokenizer=tokenizer,
        is_train=image_set == "train",
    )
    return dataset
def build(image_set, args):
    img_dir = Path(args.coco_path) / "train2014"

    refexp_dataset_name = args.refexp_dataset_name
    if refexp_dataset_name in ["refcoco", "refcoco+", "refcocog"]:
        if args.test:
            test_set = args.test_type
            ann_file = Path(args.refexp_ann_path) / f"finetune_{refexp_dataset_name}_{test_set}.json"
        else:
            ann_file = Path(args.refexp_ann_path) / f"finetune_{refexp_dataset_name}_{image_set}.json"
    elif refexp_dataset_name in ["all"]:
        ann_file = Path(args.refexp_ann_path) / f"final_refexp_{image_set}.json"
    else:
        assert False, f"{refexp_dataset_name} is not a valid dataset name for refexp"

    tokenizer = RobertaTokenizerFast.from_pretrained(args.text_encoder_type)
    dataset = RefExpDetection(
        img_dir,
        ann_file,
        transforms=make_coco_transforms(image_set, cautious=True),
        return_masks=args.masks,
        return_tokens=True,
        tokenizer=tokenizer,
    )
    return dataset
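# Hedged usage sketch (assumed, not from the original source): `args` mirrors
# the fields this build function reads; the paths and values are illustrative.
# The same pattern applies to the flickr and phrasecut builders above.
from argparse import Namespace

args = Namespace(coco_path='data/coco',
                 refexp_dataset_name='refcoco',
                 refexp_ann_path='data/refexp_anns',
                 test=False,
                 test_type='testA',
                 text_encoder_type='roberta-base',
                 masks=False)
dataset = build('train', args)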
def __init__(self, batch_size=256):
    self.input_ids_list = []
    self.attention_mask_list = []
    self.label_list = []
    self.batch_size = batch_size
    self.roberta_tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
def run():
    # load and prepare data
    train, test = load_data()
    train, test = prepare_input(train), prepare_input(test, True)

    # train-test split
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        list(train["input"].values),
        list(train["label_numeric"].values),
        test_size=.2,
        random_state=5)

    # tokenize and train
    tokenizer = RobertaTokenizerFast.from_pretrained('roberta-large')
    train_encodings = tokenize_data(train_texts, tokenizer)
    val_encodings = tokenize_data(val_texts, tokenizer)
    model = start_train(train_encodings, train_labels, val_encodings, val_labels)

    # validate and predict on test and write test output
    validate_model(model, tokenizer, val_texts, val_labels)
    predict_on_test(model, tokenizer, test)

    # save model
    model.save_pretrained("data/roberta_model")
def __init__(self, use_gpu=True, tokenizer=None):
    super().__init__()
    MODEL_NAME = 'iarfmoose/roberta-small-bulgarian-ner'
    if tokenizer:
        self.tokenizer = tokenizer
    else:
        self.tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_NAME)
    self.model = RobertaForTokenClassification.from_pretrained(MODEL_NAME)
    self.model.to(self.device)
    self.tag_to_id = {
        'O': 0, 'I-PRO': 1, 'I-PER': 2, 'I-ORG': 3, 'I-LOC': 4, 'I-EVT': 5,
        'B-PRO': 6, 'B-PER': 7, 'B-ORG': 8, 'B-LOC': 9, 'B-EVT': 10
    }
    self.id_to_tag = {self.tag_to_id[tag]: tag for tag in self.tag_to_id}
def tokenize(data_x, name, length):
    print(f"Started tokenizer for {name} at length {length}")
    tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
    mm = np.memmap(f'vectors/{name}_{length}.mm', dtype='int32', mode='w+',
                   shape=(len(data_x), 3, length))
    # This is run in batches of 1000 due to memory. The slow tokenizer has
    # issues with some files, which causes it to take hours even though
    # batching isn't needed.
    # Round up so the final partial batch isn't silently dropped (the original
    # int(len(data_x) / 1000) lost the tail when the length wasn't a multiple
    # of 1000).
    for i in range((len(data_x) + 999) // 1000):
        tokens = tokenizer.batch_encode_plus(data_x[(i * 1000):((i + 1) * 1000)],
                                             add_special_tokens=True,
                                             pad_to_max_length=True,
                                             truncation=True,
                                             max_length=length,
                                             return_attention_mask=True,
                                             return_token_type_ids=True,
                                             return_tensors='np')
        mm[(i * 1000):((i + 1) * 1000), 0, :] = np.array(tokens.get('input_ids'))
        mm[(i * 1000):((i + 1) * 1000), 1, :] = np.array(tokens.get('attention_mask'))
        mm[(i * 1000):((i + 1) * 1000), 2, :] = np.array(tokens.get('token_type_ids'))
    print(f"Finished tokenizer for {name} at length {length}")
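# Hedged sketch of reading the memmap back (assumed, not from the original
# source); the name 'train' and the sample count are illustrative, and the
# axis-1 layout matches the writer above.
import numpy as np

n_samples, length = 1000, 128  # must match what was written
mm = np.memmap(f'vectors/train_{length}.mm', dtype='int32', mode='r',
               shape=(n_samples, 3, length))
input_ids, attention_mask = mm[:, 0, :], mm[:, 1, :]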
def convert_to_long_model(model_name, tokenizer_name, save_model_to, attention_window, max_pos):
    """
    Starting from a roberta-base checkpoint, convert it into an instance of RobertaLong.

    Args:
        model_name (str): name or path of the source RoBERTa checkpoint
        tokenizer_name (str): name or path of the source tokenizer
        save_model_to (str): path to output dir
        attention_window (int): size of the local attention window
        max_pos (int): max model position before adding the extra 2 tokens for roberta models

    Returns:
        (transformers.RobertaForMaskedLM, transformers.RobertaTokenizerFast):
        the converted RoBERTa model with LM head on top, and its tokenizer
    """
    model = RobertaForMaskedLM.from_pretrained(model_name)
    tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_name, model_max_length=max_pos)
    config = model.config

    # extend position embeddings
    tokenizer.model_max_length = max_pos
    tokenizer.init_kwargs['model_max_length'] = max_pos
    current_max_pos, embed_size = model.roberta.embeddings.position_embeddings.weight.shape
    max_pos += 2  # NOTE: RoBERTa has positions 0,1 reserved, so embedding size is max position + 2
    config.max_position_embeddings = max_pos
    assert max_pos > current_max_pos

    # allocate a larger position embedding matrix
    new_pos_embed = model.roberta.embeddings.position_embeddings.weight.new_empty(
        max_pos, embed_size)
    # copy position embeddings over and over to initialize the new position embeddings
    k = 2
    step = current_max_pos - 2
    while k < max_pos - 1:
        new_pos_embed[k:(k + step)] = model.roberta.embeddings.position_embeddings.weight[2:]
        k += step
    model.roberta.embeddings.position_embeddings.weight.data = new_pos_embed

    # replace the `modeling_bert.BertSelfAttention` object with `LongformerSelfAttention`
    config.attention_window = [attention_window] * config.num_hidden_layers
    for i, layer in enumerate(model.roberta.encoder.layer):
        longformer_self_attn = LongformerSelfAttention(config, layer_id=i)
        longformer_self_attn.query = layer.attention.self.query
        longformer_self_attn.key = layer.attention.self.key
        longformer_self_attn.value = layer.attention.self.value

        longformer_self_attn.query_global = layer.attention.self.query
        longformer_self_attn.key_global = layer.attention.self.key
        longformer_self_attn.value_global = layer.attention.self.value

        layer.attention.self = longformer_self_attn

    logger.info(f'saving model to {save_model_to}')
    model.save_pretrained(save_model_to)
    tokenizer.save_pretrained(save_model_to)
    return model, tokenizer
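# Hedged usage sketch (assumed, not from the original source): converting
# roberta-base into a 4096-position, Longformer-style model; the output path
# and window size are illustrative.
model, tokenizer = convert_to_long_model(
    model_name='roberta-base',
    tokenizer_name='roberta-base',
    save_model_to='roberta-base-4096',
    attention_window=512,
    max_pos=4096,
)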
def getTokenizer(model_name):
    if 'roberta' in model_name:
        return RobertaTokenizerFast.from_pretrained(model_name, add_prefix_space=False)
    elif model_name.startswith('bert'):
        return BertTokenizerFast.from_pretrained(model_name, add_prefix_space=False)
    elif 'bart' in model_name:
        # check https://github.com/huggingface/transformers/blob/68e19f1c228c92d5d800533f558faff24b57127a/src/transformers/tokenization_bart.py#L27
        return RobertaTokenizerFast.from_pretrained('roberta-large', add_prefix_space=False)
    elif 'electra' in model_name:
        return ElectraTokenizerFast.from_pretrained(model_name, add_prefix_space=False)
    else:
        return AutoTokenizer.from_pretrained(model_name, add_prefix_space=False)
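# Hedged usage sketch (assumed, not from the original source): dispatch is
# purely on the model name string.
roberta_tok = getTokenizer('roberta-base')      # RobertaTokenizerFast
bart_tok = getTokenizer('facebook/bart-large')  # reuses the roberta-large tokenizer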
def setUpClass(self):
    self.tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
    self.dataset = pd.DataFrame.from_dict({
        "question": ["question 0", "question 1"],
        "passage": ["passage 0", "passage 1"],
        "idx": [0, 1],
        "label": [True, False],
    })
    self.max_seq_len = 4
def build(dataset_file, image_set, args):
    if dataset_file == "clevr_question":
        if args.clevr_variant == "humans":
            assert args.no_detection, "CLEVR-Humans doesn't have boxes, please disable detection"
            im_set = image_set
            if args.test:
                im_set = "test"
            ann_file = Path(args.clevr_ann_path) / f"CLEVR-Humans-{im_set}.json"
            img_dir = Path(args.clevr_img_path) / f"{im_set}"
            image_set = "train" if im_set == "train" else "val"
        elif args.clevr_variant == "cogent":
            assert image_set != "train", "Please train CoGenT with 'clevr' dataset, not 'clevr_question'"
            im_set = args.cogent_set
            ann_file = Path(args.clevr_ann_path) / f"CLEVR_{im_set}_questions.json"
            img_dir = Path(args.clevr_img_path) / f"{im_set}"
            image_set = "train" if im_set == "train" else "val"
        elif args.clevr_variant == "normal":
            im_set = image_set
            if args.test:
                im_set = "test"
            ann_file = Path(args.clevr_ann_path) / f"CLEVR_{im_set}_questions.json"
            img_dir = Path(args.clevr_img_path) / f"{im_set}"
            image_set = "train" if im_set == "train" else "val"
        else:
            assert False, f"Unknown clevr variant {args.clevr_variant}"
        print("loading ", img_dir, ann_file)
        return ClevrQuestion(
            img_dir,
            ann_file,
            transforms=make_clevr_transforms(image_set, cautious=True),
        )

    tokenizer = RobertaTokenizerFast.from_pretrained(args.text_encoder_type)
    img_dir = Path(args.clevr_img_path) / f"{image_set}"
    ann_file = Path(args.clevr_ann_path) / f"{image_set}.json"
    if args.clevr_variant == "cogent":
        im_set = "trainA" if image_set == "train" else "valA"
        img_dir = Path(args.clevr_img_path) / f"{image_set}A"
    dataset = ClevrDetection(
        img_dir,
        ann_file,
        transforms=make_clevr_transforms(image_set, cautious=True),
        return_masks=False,
        return_tokens=True,
        tokenizer=tokenizer,
        do_qa=args.do_qa,
    )
    return dataset
def __init__(self, path_to_model):
    # model:
    self.model = RobertaForClaimDetection(n_classes=2, unfreeze=False)
    checkpoint = torch.load(path_to_model)
    self.model.load_state_dict(checkpoint['model_state_dict'])
    self.model.eval()
    self.model.to(DEVICE)
    # tokenizer:
    self.tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
def get_fast_tokenizer(self):
    if 'roberta' in self.bert_name:
        tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', do_lower_case=True)
    elif 'xlnet' in self.bert_name:
        tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
    else:
        tokenizer = BertWordPieceTokenizer("data/.bert-base-uncased-vocab.txt", lowercase=True)
    return tokenizer
def get_tokenizer(vocab_size):
    pretrained_tokenizer_path = Path('experiments/tokenizers') / f'{tokenizer_type}-{vocab_size}'
    logger.info(f'loading {tokenizer_type}-{vocab_size} tokenizer from {pretrained_tokenizer_path}')
    if transformer_type == 'roberta':
        return RobertaTokenizerFast.from_pretrained(str(pretrained_tokenizer_path), max_len=512)
    return BertTokenizerFast.from_pretrained(str(pretrained_tokenizer_path), max_len=512)
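# Hedged usage sketch (assumed, not from the original source): get_tokenizer
# reads `tokenizer_type` and `transformer_type` as module-level globals, so
# they must be defined before the call; the values here are illustrative.
tokenizer_type = 'roberta'
transformer_type = 'roberta'
tokenizer = get_tokenizer(vocab_size=52000)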
def __init__(self, config: Bunch) -> None:
    pl.LightningModule.__init__(self)
    self.config = config
    self.model = RobertaForSequenceClassification.from_pretrained(config.pretrained_model)
    roberta_tokenizer = RobertaTokenizerFast.from_pretrained(self.config.pretrained_model)
    tokenizer = PreTrainedTokenizer(roberta_tokenizer, self.config.max_tokens_per_tweet)
    self.data_processor = DataProcessor(config, tokenizer)
    self.loss = CrossEntropyLoss()
def __init__(self, use_gpu=True):
    tokenizer = RobertaTokenizerFast.from_pretrained('iarfmoose/roberta-small-bulgarian')
    self.pos_tagger = POSTagger(use_gpu=use_gpu, tokenizer=tokenizer)
    self.ner_tagger = NERTagger(use_gpu=use_gpu, tokenizer=tokenizer)
    self.entity_types = {
        'PRO': 'PRODUCT',
        'PER': 'PERSON',
        'ORG': 'ORGANISATION',
        'LOC': 'LOCATION',
        'EVT': 'EVENT'
    }
def test_featurize():
    """Test that RxnFeaturizer.featurize() correctly featurizes the reactions
    and correctly outputs the input_ids and attention_mask.
    """
    from transformers import RobertaTokenizerFast
    from deepchem.feat.reaction_featurizer import RxnFeaturizer
    tokenizer = RobertaTokenizerFast.from_pretrained(
        "seyonec/PubChem10M_SMILES_BPE_450k")
    featurizer = RxnFeaturizer(tokenizer, sep_reagent=True)
    reaction = ['CCS(=O)(=O)Cl.OCCBr>CCN(CC)CC.CCOCC>CCS(=O)(=O)OCCBr']
    feats = featurizer.featurize(reaction)
    assert feats.shape == (1, 2, 2, 1)