def __init__(self):
    category_model_dir = './resources/category_model'
    resource_model_dir = './resources/resource_type_model'
    mapping_csv = './resources/mapping.csv'
    hierarchy_json = './resources/dbpedia_hierarchy.json'

    id_to_label = {}
    label_to_id = {}
    with open(mapping_csv) as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for row in reader:
            id_to_label[row[0]] = row[1]
            label_to_id[row[1]] = row[0]
    self.id_to_label = id_to_label
    self.label_to_id = label_to_id

    self.category_tokenizer = DistilBertTokenizer.from_pretrained(
        category_model_dir)
    self.category_model = DistilBertForSequenceClassification.from_pretrained(
        category_model_dir, num_labels=5)
    self.resource_tokenizer = DistilBertTokenizer.from_pretrained(
        resource_model_dir)
    self.resource_model = DistilBertForSequenceClassification.from_pretrained(
        resource_model_dir, num_labels=len(id_to_label))

    hierarchy = {}
    with open(hierarchy_json) as json_file:
        hierarchy = json.load(json_file)
    self.hierarchy = hierarchy
def makeUnilabelModel(self, modelName, num_labels=10, root='', **kwargs):
    if modelName == 'distilbert-base-uncased':
        tokenizer = DistilBertTokenizerFast.from_pretrained(
            'distilbert-base-uncased')
        model = DistilBertForSequenceClassification.from_pretrained(
            root + "distilbert-base-uncased", num_labels=num_labels, **kwargs)
    if modelName == 'gpt2':
        tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        model = GPT2ForSequenceClassification.from_pretrained(
            root + "gpt2", num_labels=num_labels, **kwargs)
        model.resize_token_embeddings(len(tokenizer))  # add padding token
        model.config.pad_token_id = tokenizer('[PAD]').input_ids[0]
    if modelName == 'bertweet':
        tokenizer = AutoTokenizer.from_pretrained('vinai/bertweet-base')
        model = AutoModelForSequenceClassification.from_pretrained(
            root + "vinai/bertweet-base", num_labels=num_labels, **kwargs)
    if modelName == 'distilroberta-base':
        tokenizer = AutoTokenizer.from_pretrained('distilroberta-base')
        model = AutoModelForSequenceClassification.from_pretrained(
            root + "distilroberta-base", num_labels=num_labels, **kwargs)
    if modelName == 'lstm':
        tokenizer = AutoTokenizer.from_pretrained(
            'distilbert-base-uncased')
        model = LSTMCclassifier(128, 64, 2, tokenizer.vocab_size, num_labels)
    return tokenizer, model
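# Usage sketch (not taken from the original sources): exercising a (tokenizer,
# model) pair such as the one returned by a makeUnilabelModel-style factory
# above. The sample text and the 4-label setup are illustrative assumptions.
import torch
from transformers import (DistilBertForSequenceClassification,
                          DistilBertTokenizerFast)

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', num_labels=4)
model.eval()
inputs = tokenizer("example headline", return_tensors='pt', truncation=True)
with torch.no_grad():
    logits = model(**inputs).logits
predicted_class = int(logits.argmax(dim=-1))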
def load_model(manifest):
    """Load the model object from the file at the `model_filepath` key in the config dict."""
    checkpoints_path = manifest["model_filepath"]
    if __name__ == "__main__":
        checkpoints = checkpoints_path
    else:
        checkpoints = client.file(checkpoints_path).getFile().name
    assert_model_md5(checkpoints)
    class_mapping = {
        0: "Movies_Negative",
        1: "Movies_Positive",
        2: "Food_Negative",
        3: "Food_Positive",
        4: "Clothing_Negative",
        5: "Clothing_Positive",
    }
    model = DistilBertForSequenceClassification.from_pretrained(
        "distilbert-base-uncased",
        num_labels=len(class_mapping),
        output_attentions=False,
        output_hidden_states=False,
    )
    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
    model.load_state_dict(
        torch.load(checkpoints, map_location=torch.device("cpu")))
    return model, tokenizer, class_mapping
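# Inference sketch (assumption, not part of the original repo): mapping a
# prediction back through the class_mapping returned by load_model above.
# The helper name predict_label is hypothetical.
import torch

def predict_label(text, model, tokenizer, class_mapping):
    # Tokenize and run a single forward pass on CPU.
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = model(**inputs).logits
    return class_mapping[int(logits.argmax(dim=-1))]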
def __init__(self, config, bert_hidden_states=1, dropout=0.1, update_bert=False):
    config = deepcopy(config)
    config.output_hidden_states = True
    config.dropout = dropout
    super(DistilBertForToxic, self).__init__(config)
    self.bert_hidden_states = bert_hidden_states
    self.num_labels = 1
    self.update_bert = update_bert
    # bert = DistilBertModel(DistilBertConfig())
    bert = DistilBertForSequenceClassification.from_pretrained(
        "distilbert-base-uncased",  # Use the 6-layer DistilBERT model with an uncased vocab.
        num_labels=2,  # The number of output labels -- 2 for binary classification.
                       # You can increase this for multi-class tasks.
        output_attentions=False,  # Whether the model returns attention weights.
        output_hidden_states=True,  # Whether the model returns all hidden states.
    ).distilbert
    bert.config = config
    device = get_device()
    bert = bert.to(device)
    self.bert = bert
    self.qa_outputs = nn.Sequential(
        nn.Dropout(dropout),
        nn.Linear(config.hidden_size * bert_hidden_states, 1),
        nn.Sigmoid())
def main():
    # 1. Get the data into a dataframe
    df = read_into_pandas()
    (mlb_category, df) = replace_column_with_label_representation(df, 'category', 'category_int')
    df_train, df_test = train_test_split(df, test_size=0.2)

    # 2. Transform into BERT format
    df_bert = pd.DataFrame({
        'id': df_train['id'],
        'label': df_train['category_int'],
        'alpha': ['a'] * df_train.shape[0],
        'text': df_train['text'].str[:512].replace(r'\n', ' ', regex=True)
    })
    df_bert_train, df_bert_dev = train_test_split(df_bert, test_size=0.01)
    df_bert_test = pd.DataFrame({
        'id': df_test['id'],
        'text': df_test['text'].str[:512].replace(r'\n', ' ', regex=True)
    })

    # Save the dataframes to .tsv format as required by BERT
    df_bert_train.to_csv('../datasets/Newswire_BERT/train.tsv', sep='\t', index=False, header=False)
    df_bert_dev.to_csv('../datasets/Newswire_BERT/dev.tsv', sep='\t', index=False, header=False)
    df_bert_test.to_csv('../datasets/Newswire_BERT/test.tsv', sep='\t', index=False, header=False)

    # 3. Load the pretrained model
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', return_dict=True)

    # 4. Tokenize, pad, and run a forward pass
    tokenized = df_bert_train['text'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))
    print('Padding')
    max_len = 0
    for i in tokenized.values:
        if len(i) > max_len:
            max_len = len(i)
    padded = np.array([i + [0] * (max_len - len(i)) for i in tokenized.values])
    print('Shape after padding ' + str(np.array(padded).shape))
    attention_mask = np.where(padded != 0, 1, 0)

    input_ids = torch.tensor(padded)
    attention_mask = torch.tensor(attention_mask).to('cuda:0')
    print('Embedding model start')
    model.train()
    with torch.no_grad():
        input_ids = input_ids.clone().detach().to(torch.int64).to('cuda:0')
        model = model.to('cuda:0')
        labels = torch.tensor(df_bert_train['label'].values).to(torch.int64).to('cuda:0')
        print(labels)
        last_hidden_states = model(input_ids, attention_mask=attention_mask, labels=labels)
    print(model)
    model.save_pretrained('models/BERT1')
def main(**kwargs):
    project_path = str(Path(__file__).resolve().parents[1])

    train_dataset = IMDBDataset(project_path + '/data/train.csv', kwargs['seq_length'])
    train_loader = DataLoader(train_dataset,
                              batch_size=kwargs['batch_size'],
                              shuffle=True,
                              num_workers=4)
    valid_dataset = IMDBDataset(project_path + '/data/validation.csv', kwargs['seq_length'])
    valid_loader = DataLoader(valid_dataset,
                              batch_size=kwargs['batch_size'],
                              shuffle=True,
                              num_workers=4)

    print('Downloading the pretrained DistilBert model...')
    model = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased', num_labels=2).to('cuda')

    lrs = [{
        'params': model.distilbert.parameters(),
        'lr': kwargs['lr_transformer']
    }, {
        'params': model.pre_classifier.parameters()
    }, {
        'params': model.classifier.parameters()
    }]
    optim = Adam(lrs, lr=kwargs['lr_classifier'], eps=kwargs['eps'])

    print('Training...')
    st = time.time()
    train(train_loader, valid_loader, model, optim, kwargs['num_epochs'])
    print(f'Training time: {time.time() - st}sec')
def __init__(self):
    super(DistilBertModel, self).__init__()
    self.distilbert = DistilBertForSequenceClassification.from_pretrained(
        "adamlin/bert-distil-chinese", num_labels=2)
    self.device = torch.device("cuda")
    for param in self.distilbert.parameters():
        param.requires_grad = True  # require gradients for every parameter
def main():
    args = parse_args()
    logger = setup_logger()

    train_source = args.training_dir + "/synthetic.train.txt"
    val_source = args.valid_dir + "/synthetic.validation.txt"
    train_labels, train_texts = load_labels_and_texts(train_source)
    val_labels, val_texts = load_labels_and_texts(val_source)
    logger.info("\n------- 01 --------\n")

    train_encodings = tokenizer(train_texts, truncation=True, padding=True)
    val_encodings = tokenizer(val_texts, truncation=True, padding=True)
    train_labels_processed = list(map(make_number, train_labels))
    val_labels_processed = list(map(make_number, val_labels))
    train_dataset = CustomDataset(train_encodings, train_labels_processed)
    val_dataset = CustomDataset(val_encodings, val_labels_processed)
    logger.info("\n------- 02 --------\n")

    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
    )
    model = DistilBertForSequenceClassification.from_pretrained(
        "distilbert-base-uncased")
    logger.info("\n------- 03 --------\n")

    trainer = Trainer(model=model,
                      args=training_args,
                      train_dataset=train_dataset,
                      eval_dataset=val_dataset)
    print('DONE')
    trainer.train()
    logger.info("\n------- 04 --------\n")

    eval_result = trainer.evaluate(eval_dataset=val_dataset)
    with open(os.path.join(args.output_data_dir, "eval_results.txt"), "w") as writer:
        for key, value in sorted(eval_result.items()):
            writer.write(f"{key} = {value}\n")

    trainer.save_model(args.model_dir)
    logger.info("\n------- 05 --------\n")
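# Optional sketch (assumption, not in the original script): a compute_metrics
# hook that could be passed to the Trainer above so trainer.evaluate() reports
# accuracy in addition to the loss.
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float((predictions == labels).mean())}

# trainer = Trainer(..., compute_metrics=compute_metrics)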
def main(argv):
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased')
    if torch.cuda.device_count() > 1:
        logging.info("Using {} GPUs.".format(torch.cuda.device_count()))
        model = nn.DataParallel(model)
    model.to(device)

    train_dataset, val_dataset, test_dataset = dataset.create_dataset(
        FLAGS.data_dir)
    train_loader = DataLoader(train_dataset,
                              batch_size=FLAGS.train_batch_size,
                              shuffle=True)
    valid_loader = DataLoader(val_dataset,
                              batch_size=FLAGS.test_batch_size,
                              shuffle=False)

    optimizer = AdamW(model.parameters(), lr=FLAGS.learning_rate)
    train.train_model(model=model,
                      device=device,
                      num_epochs=FLAGS.num_epochs,
                      optimizer=optimizer,
                      train_loader=train_loader,
                      valid_loader=valid_loader,
                      file_path=FLAGS.output_dir,
                      eval_every=1000)
def getTweets(): print("Getting tweets now ...") # Default keyword if you hit search keyword = request.args.get( 'keyword', default='coronavirus covid vaccine vaccination COVID-19') # Fetch the 20 most recent tweets matching the query. Change the argument # in `items()` to decrease or increase the number of retrieved tweets. # The larger the number the longer the retreival time query = keyword # text from the search box tweets_ = tweepy.Cursor(api.search, query, result_type='recent').items(20) tweets = [tweet.text for tweet in tweets_] print("Done ... retrieving tweets from API based on the keyword=" + keyword) df = pd.DataFrame(data=tweets, columns=['Tweet']) print("Done ... creating dataframe") # Iterate over the tweet texts in `tweets` and pass each item to the model # to obtain a prediction, then write those predictions to a Pandas dataframe model = pipeline( 'sentiment-analysis', model=DistilBertForSequenceClassification.from_pretrained("model"), tokenizer=DistilBertTokenizerFast.from_pretrained( 'distilbert-base-uncased')) results = list(model(tweet) for tweet in tweets) df['Sentiment'] = list(LABELS[s[0].get('label')] for s in results) df['Score'] = list(s_[0].get('score') for s_ in results) print("Done ... sentiment-analysis") print(df) return render_template("covid.html", data=list(df.values.tolist()))
def simple_inference():
    """
    Simpler variant, better suited to the general case: it does not show the
    distribution over all sentiments. It uses the TextClassificationPipeline
    from the transformers library, which is the preferable approach.
    """
    tokenizer = DistilBertTokenizer.from_pretrained("./model_out/")
    model = DistilBertForSequenceClassification.from_pretrained("./model_out/")
    model.to('cpu')
    sentiment_classifier = TextClassificationPipeline(model=model,
                                                      tokenizer=tokenizer,
                                                      device=-1)

    t1 = time.time()
    result = sentiment_classifier("this is so cute!")
    t2 = time.time()
    print(t2 - t1, result)

    result = sentiment_classifier("That's so disgusting!")
    t3 = time.time()
    print(t3 - t2, result)

    result = sentiment_classifier("this is a simple test.")
    t4 = time.time()
    print(t4 - t3, result)
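# Sketch (assumption, not part of the original file): obtaining the full score
# distribution from the same kind of TextClassificationPipeline. The
# "./model_out/" path is the one used in simple_inference above; depending on
# the installed transformers version the argument is `top_k=None` (newer) or
# `return_all_scores=True` (older, deprecated).
from transformers import (DistilBertForSequenceClassification,
                          DistilBertTokenizer, TextClassificationPipeline)

tokenizer = DistilBertTokenizer.from_pretrained("./model_out/")
model = DistilBertForSequenceClassification.from_pretrained("./model_out/")
classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer, device=-1)
all_scores = classifier("this is so cute!", top_k=None)
# e.g. [{'label': 'POSITIVE', 'score': 0.99}, {'label': 'NEGATIVE', 'score': 0.01}]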
def __init__(self, *args, **kwargs):
    # initialize super class with request & response schema, configs
    super().__init__(*args, **kwargs)
    # initialize model and other tools
    self.tokenizer = DistilBertTokenizer.from_pretrained(
        'distilbert-base-uncased')
    self.model = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased-finetuned-sst-2-english')
def __init__(self, data_config: Dict[str, Any], args: argparse.Namespace = None):
    super().__init__()
    self.model = DistilBertForSequenceClassification.from_pretrained(
        "distilbert-base-uncased"
    )
    self.data_config = data_config
    self.idx_2_label = {v: k for k, v in data_config["mapping"].items()}
    self.tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
def __init__(self, hparams: Dict):
    super().__init__()
    self.hparams = hparams
    self.model = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased',
        num_labels=1,
    )
    self.am = AverageMeterSet()
def check_sentiment(text):
    tokenizer = DistilBertTokenizer.from_pretrained('./pretrain_distillbert_full_sst')
    model = DistilBertForSequenceClassification.from_pretrained('./pretrain_distillbert_full_sst')
    sentiment_classifier = SentimentClassifer(model, tokenizer)
    result = sentiment_classifier(text)
    sentiment = max(result, key=result.get)
    sentiment_distribution = list(result.values())
    print("sentiment of {}: {}".format(text, sentiment))
    return sentiment
def __init__(self, hparams, test_dataset=None):
    super().__init__()
    self.hparams = hparams
    print(f"[{dt.datetime.now()}] Building model")
    self.bert = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased', num_labels=2)
    self.data_loaded = False
def generate_tokenizer_and_model(model_name):
    if model_name == "bert-base-uncased":
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
    elif model_name == 'distilbert-base-uncased':
        tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
        model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
    return tokenizer, model
def __init__(
        self, semantic_analysis_config: BrainSentimentAnalysisConfiguration):
    super().__init__()
    self._semantic_analysis_config = semantic_analysis_config
    model_dir = semantic_analysis_config.model_dir
    tokenizer = DistilBertTokenizer.from_pretrained(model_dir)
    model = DistilBertForSequenceClassification.from_pretrained(model_dir)
    self.sentiment_classifier = SentimentClassifer(model, tokenizer)
def __init__(self, path=None, model_name=None):
    if path:
        self.model = DistilBertForSequenceClassification.from_pretrained(
            path)
        tokenizer_path = os.path.join(path, "model/")
        if os.path.exists(tokenizer_path):
            self.tokenizer = DistilBertTokenizerFast.from_pretrained(
                tokenizer_path)
        else:
            self.tokenizer = DistilBertTokenizerFast.from_pretrained(
                "distilbert-base-uncased")
    elif model_name:
        config = DistilBertConfig.from_pretrained(model_name,
                                                  return_dict=True,
                                                  num_labels=2)
        self.model = DistilBertForSequenceClassification.from_pretrained(
            model_name, config=config)
        self.tokenizer = DistilBertTokenizerFast.from_pretrained(
            model_name)
def load_model(ckpt_path: str):
    state_dict = load_pretrained_dict(ckpt_path)
    model = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased',
        num_labels=1,
    )
    model.load_state_dict(state_dict, strict=True)
    model.eval().cuda()
    return model
def __init__(self, pre_trained: str, classes: List[Tuple[Rel, Ent]]):
    super().__init__()
    self.tokenizer = DistilBertTokenizer.from_pretrained(pre_trained)
    self.tokenizer.add_tokens(['[MENTION_START]', '[MENTION_END]', '[MASK]'],
                              special_tokens=True)
    class_count = len(classes)
    self.classes = classes
    self.bert = DistilBertForSequenceClassification.from_pretrained(
        pre_trained, num_labels=class_count)
def model_fn(model_dir):
    config = DistilBertConfig.from_json_file('/opt/ml/model/code/config.json')
    model_path = '{}/{}'.format(model_dir, 'model.pth')
    model = DistilBertForSequenceClassification.from_pretrained(model_path, config=config)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    return model
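# Companion sketch (assumption): model_fn above follows the SageMaker PyTorch
# serving convention, which also supports a predict_fn hook. The tokenizer name
# and max_length here are illustrative, not taken from the original code.
import torch
from transformers import DistilBertTokenizer

def predict_fn(input_data, model):
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    device = next(model.parameters()).device
    encoded = tokenizer(input_data, truncation=True, padding=True,
                        max_length=128, return_tensors='pt').to(device)
    with torch.no_grad():
        logits = model(**encoded).logits
    return logits.argmax(dim=-1).tolist()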
def model_fn(model_dir): config = DistilBertConfig.from_json_file("/opt/ml/model/code/config.json") model_path = "{}/{}".format(model_dir, "model.pth") model = DistilBertForSequenceClassification.from_pretrained(model_path, config=config) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model.to(device) return model
def loadmodel(self, path):
    try:
        self.tokenizer = AutoTokenizer.from_pretrained(path)
        self.model = DistilBertForSequenceClassification.from_pretrained(
            path)
        self.model.to(self.device)
    except Exception as e:
        print(e)
    else:
        print('Loaded model.')
def main():
    # Get the command-line arguments (auto-generated from the docstring at the top of this file)
    args = docopt(__doc__)
    pprint(args)

    # Read the parameters
    lr = float(args['--lr'])
    seq_len = int(args['--seq_len'])
    max_epoch = int(args['--max_epoch'])
    batch_size = int(args['--batch_size'])
    num_train = int(args['--num_train'])
    num_valid = int(args['--num_valid'])

    # Select the model
    pretrained_weights = 'distilbert-base-uncased'
    tokenizer = DistilBertTokenizer.from_pretrained(pretrained_weights)
    config = DistilBertConfig(num_labels=4)
    model = DistilBertForSequenceClassification.from_pretrained(
        pretrained_weights, config=config)

    # Pick the device to use
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Load the data and build the datasets
    encoder = TwinPhraseEncoder(tokenizer, seq_len)
    train_dataset = WordnetDataset(mode='train', num_data=num_train, transform=encoder)
    valid_dataset = WordnetDataset(mode='valid', num_data=num_valid, transform=encoder)
    train_loader = data.DataLoader(train_dataset, batch_size, shuffle=True)
    valid_loader = data.DataLoader(valid_dataset, batch_size, shuffle=True)

    # Define the optimizer
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Training loop
    for epoch in range(1, max_epoch + 1):
        print('=' * 27 + f' Epoch {epoch:0>2} ' + '=' * 27)
        # Training
        loss, accu = train_model(model, optimizer, train_loader, device)
        print(
            f'| Training   | loss-avg : {loss:>8.6f} | accuracy : {accu:>8.3%} |'
        )
        # Validation
        loss, accu = valid_model(model, optimizer, valid_loader, device)
        print(
            f'| Validation | loss-avg : {loss:>8.6f} | accuracy : {accu:>8.3%} |'
        )

    # Save the model
    torch.save(model.state_dict(), '../result/bert.pkl')
def test_unbatch_attentions_hidden_states(self):
    model = DistilBertForSequenceClassification.from_pretrained(
        "hf-internal-testing/tiny-random-distilbert",
        output_hidden_states=True,
        output_attentions=True
    )
    tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-distilbert")
    text_classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer)

    # Used to throw an error because `hidden_states` are a tuple of tensors
    # instead of the expected tensor.
    outputs = text_classifier(["This is great !"] * 20, batch_size=32)
    self.assertEqual(len(outputs), 20)
def __init__(self, config: Dict[str, Union[str, int, float]]):
    mode_path = download_model(bucket_name=config["bucket_name"],
                               model_s3_path=config["model_s3_path"])
    self.model_version = config["model_version"]
    self.model_threshold = config["model_threshold"]
    self.tokenizer = DistilBertTokenizer.from_pretrained(mode_path)
    self.model = DistilBertForSequenceClassification.from_pretrained(
        mode_path)
    self.model.eval()
def get_clf(self):
    if self.flags.use_clf:
        clf = DistilBertForSequenceClassification.from_pretrained(
            'distilbert-base-uncased',
            num_labels=len(self.labels)).to(self.flags.device)
        text_clf_path = Path(
            __file__
        ).parent.parent / 'classifiers/state_dicts/text_clf.pth'
        clf.load_state_dict(
            torch.load(text_clf_path, map_location=self.flags.device))
        return TextClf(self, clf).to(self.flags.device)
def load_existing_model():
    global label
    global current_model

    model_path = ""
    files = os.listdir('data/' + label)
    for file_name in files:
        if file_name.endswith(".model"):
            model_path = 'data/' + label + "/" + file_name

    if model_path != '':
        if verbose:
            print("Loading model from " + model_path)
        current_model = DistilBertForSequenceClassification.from_pretrained(model_path)
        eel.sleep(0.1)
    else:
        if verbose:
            print("Creating new uninitialized model (OK to ignore warnings)")
        current_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
def model_call(txt_input):
    model_dir = os.getcwd() + "/model_save"
    tokenizer = DistilBertTokenizer.from_pretrained(model_dir)
    model_loaded = DistilBertForSequenceClassification.from_pretrained(
        model_dir)

    # Regex to split the input string into an array of sentences
    sentence_enders = r"(?<=[!.?])\s"
    sent_array = re.split(sentence_enders, txt_input)
    txt_output = []
    option = ["Grammatically correct", "Grammatically incorrect"]

    for i in range(len(sent_array)):
        device = torch.device("cpu")
        seq = tokenizer.encode_plus(
            sent_array[i],  # Sentence to encode.
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
            max_length=64,  # Pad & truncate all sentences.
            pad_to_max_length=True,
            return_attention_mask=True,  # Construct attn. masks.
            return_tensors='pt',  # Return pytorch tensors.
        )
        # Take the encoded sentence and its attention mask
        # (which simply differentiates padding from non-padding).
        input_id = seq['input_ids']
        attention_mask = seq['attention_mask']
        input_id = torch.LongTensor(input_id)
        attention_mask = torch.LongTensor(attention_mask)

        # block 2
        model_loaded = model_loaded.to(device)
        input_id = input_id.to(device)
        attention_mask = attention_mask.to(device)

        # block 3
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            outputs = model_loaded(input_id, attention_mask=attention_mask)
        logits = outputs[0]
        # print(logits)
        index = logits.argmax()
        if index == 1:
            print(option[0])
            txt_output.append(option[0])
        else:
            print(option[1])
            txt_output.append(option[1])

    return txt_output  # array