def load_word_embedding_model(model_type, task, vocab_path, word_tokenizer_class,
                              emb_path, num_labels, lower=True, num_layers=None):
    """Build a BERT-shaped classifier whose embedding table is initialized
    from pretrained static word vectors (word2vec or fasttext).

    Args:
        model_type: checkpoint name; a name containing 'word2vec' selects
            400-dim embeddings (8 attention heads), anything else is treated
            as fasttext with 300-dim embeddings (10 heads).
        task: 'sequence_classification', 'token_classification' or
            'multi_label_classification'.
        vocab_path: vocabulary file consumed by ``load_vocab``.
        word_tokenizer_class: zero-argument callable returning a word tokenizer.
        emb_path: path to the pretrained embedding file.
        num_labels: int, or list of ints for multi-task label heads.
        lower: lowercase input text inside the tokenizer.
        num_layers: number of transformer layers. When None, the original
            behavior is kept for backward compatibility (see NOTE below).

    Returns:
        (model, tokenizer) tuple.

    Raises:
        ValueError: if ``task`` is not one of the supported tasks.
    """
    # Base config; hidden size / heads / vocab size are overridden below to
    # match the static embeddings.
    config = BertConfig.from_pretrained('bert-base-uncased')

    # Word-level tokenizer over the supplied vocabulary.
    word_tokenizer = word_tokenizer_class()
    _, vocab_map = load_vocab(vocab_path)
    tokenizer = SimpleTokenizer(vocab_map, word_tokenizer, lower=lower)
    vocab_list = list(tokenizer.vocab.keys())

    # Label heads: a list means one head per sub-task, sized by the largest.
    if isinstance(num_labels, list):
        config.num_labels = max(num_labels)
        config.num_labels_list = num_labels
    else:
        config.num_labels = num_labels

    if num_layers is not None:
        config.num_hidden_layers = num_layers
    elif not isinstance(num_labels, list):
        # NOTE(review): legacy behavior reused num_labels as the layer count,
        # which looks like a copy-paste bug (the sibling load_model uses
        # args["num_layers"]). Preserved when num_layers is not given so
        # existing callers are unaffected; pass num_layers to get the fix.
        config.num_hidden_layers = num_labels

    if 'word2vec' in model_type:
        embeddings = gen_embeddings(vocab_list, emb_path)
        config.hidden_size = 400
        config.num_attention_heads = 8
    else:  # fasttext
        embeddings = gen_embeddings(vocab_list, emb_path, emb_dim=300)
        config.hidden_size = 300
        config.num_attention_heads = 10
    config.vocab_size = len(embeddings)

    # Instantiate the task-specific model, then overwrite its randomly
    # initialized word-embedding table with the pretrained vectors.
    model_classes = {
        'sequence_classification': BertForSequenceClassification,
        'token_classification': BertForWordClassification,
        'multi_label_classification': BertForMultiLabelClassification,
    }
    if task not in model_classes:
        raise ValueError("Unknown task: {}".format(task))
    model = model_classes[task](config)
    model.bert.embeddings.word_embeddings.weight.data.copy_(
        torch.FloatTensor(embeddings))
    return model, tokenizer
def predict(text):
    """Run NER over a raw text string with a pretrained IndoBERT model.

    Args:
        text: raw input string; word-tokenized internally.

    Returns:
        pandas.DataFrame with one row per word: columns 'words' and 'label'.
    """
    # Tokenizer / config / model are reloaded on every call; cache them
    # externally if this ever sits on a hot path.
    tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1')
    config = BertConfig.from_pretrained('indobenchmark/indobert-base-p1')
    config.num_labels = NerGritDataset.NUM_LABELS

    # Only the index->label map is needed for decoding predictions.
    i2w = NerGritDataset.INDEX2LABEL
    model = BertForWordClassification.from_pretrained(
        'indobenchmark/indobert-base-p1', config=config)
    model.eval()  # disable dropout for deterministic inference

    words = word_tokenize(text)
    subwords, subword_to_word_indices = word_subword_tokenize(words, tokenizer)
    subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)
    subword_to_word_indices = torch.LongTensor(subword_to_word_indices).view(
        1, -1).to(model.device)

    with torch.no_grad():  # inference only; skip autograd bookkeeping
        logits = model(subwords, subword_to_word_indices)[0]

    # logits is (1, num_words, num_labels); topk adds a trailing k=1 dim.
    # Squeeze the explicit dims so a single-word input still yields a 1-D
    # array (a bare .squeeze() would collapse it to a scalar and break
    # iteration below).
    preds = torch.topk(logits, k=1, dim=-1)[1].squeeze(0).squeeze(-1)
    labels = [i2w[p] for p in preds.cpu().numpy()]
    return pd.DataFrame({'words': words, 'label': labels})
def load_model(args):
    """Instantiate (model, tokenizer) for the checkpoint named in ``args``.

    Expected ``args`` keys: 'model_checkpoint', 'task', 'num_labels'; the
    fasttext branch additionally needs 'vocab_path', 'word_tokenizer_class',
    'embedding_path', 'lower' and 'num_layers'.

    Returns:
        (model, tokenizer, vocab_path, config_path); the last two are None
        for hub-hosted checkpoints.

    Raises:
        ValueError: for an unrecognized checkpoint or task (the original
            code crashed later with UnboundLocalError instead).
    """

    def _set_num_labels(config):
        # A list of label counts means multi-task heads; the shared head is
        # sized by the largest sub-task.
        if isinstance(args['num_labels'], list):
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

    def _task_class(heads):
        # Resolve the task-specific classifier class, failing loudly on typos.
        try:
            return heads[args['task']]
        except KeyError:
            raise ValueError("Unknown task: {}".format(args['task']))

    checkpoint = args['model_checkpoint']

    if 'bert-base-multilingual' in checkpoint:
        # bert-base-multilingual-uncased or bert-base-multilingual-cased
        vocab_path, config_path = None, None
        tokenizer = BertTokenizer.from_pretrained(checkpoint)
        config = BertConfig.from_pretrained(checkpoint)
        _set_num_labels(config)
        model_class = _task_class({
            'sequence_classification': BertForSequenceClassification,
            'token_classification': BertForWordClassification,
            'multi_label_classification': BertForMultiLabelClassification,
        })
        model = model_class.from_pretrained(checkpoint, config=config)
    elif 'xlm-mlm' in checkpoint:
        # xlm-mlm-100-1280
        vocab_path, config_path = None, None
        tokenizer = XLMTokenizer.from_pretrained(checkpoint)
        config = XLMConfig.from_pretrained(checkpoint)
        _set_num_labels(config)
        model_class = _task_class({
            'sequence_classification': XLMForSequenceClassification,
            'token_classification': XLMForWordClassification,
            'multi_label_classification': XLMForMultiLabelClassification,
        })
        model = model_class.from_pretrained(checkpoint, config=config)
    elif 'xlm-roberta' in checkpoint:
        # xlm-roberta-base or xlm-roberta-large
        vocab_path, config_path = None, None
        tokenizer = XLMRobertaTokenizer.from_pretrained(checkpoint)
        config = XLMRobertaConfig.from_pretrained(checkpoint)
        _set_num_labels(config)
        model_class = _task_class({
            'sequence_classification': XLMRobertaForSequenceClassification,
            'token_classification': XLMRobertaForWordClassification,
            'multi_label_classification': XLMRobertaForMultiLabelClassification,
        })
        model = model_class.from_pretrained(checkpoint, config=config)
    elif 'fasttext' in checkpoint:
        # BERT-shaped classifier initialized from static fasttext vectors.
        vocab_path = args['vocab_path']
        config_path = None
        word_tokenizer = args['word_tokenizer_class']()
        emb_path = args['embedding_path'][checkpoint]
        _, vocab_map = load_vocab(vocab_path)
        tokenizer = SimpleTokenizer(vocab_map, word_tokenizer,
                                    lower=args["lower"])
        vocab_list = list(tokenizer.vocab.keys())

        config = BertConfig.from_pretrained('bert-base-uncased')
        _set_num_labels(config)
        config.num_hidden_layers = args["num_layers"]
        embeddings = gen_embeddings(vocab_list, emb_path, emb_dim=300)
        config.hidden_size = 300
        config.num_attention_heads = 10
        config.vocab_size = len(embeddings)

        model_class = _task_class({
            'sequence_classification': BertForSequenceClassification,
            'token_classification': BertForWordClassification,
            'multi_label_classification': BertForMultiLabelClassification,
        })
        model = model_class(config)
        # Overwrite the randomly initialized embedding table with the
        # pretrained vectors.
        model.bert.embeddings.word_embeddings.weight.data.copy_(
            torch.FloatTensor(embeddings))
    elif 'scratch' in checkpoint:
        # Small randomly-initialized BERT; no pretrained weights loaded.
        vocab_path, config_path = None, None
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        config = BertConfig.from_pretrained("bert-base-uncased")
        _set_num_labels(config)
        config.num_hidden_layers = args["num_layers"]
        config.hidden_size = 300
        config.num_attention_heads = 10
        model_class = _task_class({
            'sequence_classification': BertForSequenceClassification,
            'token_classification': BertForWordClassification,
            'multi_label_classification': BertForMultiLabelClassification,
        })
        model = model_class(config=config)
    elif 'indobenchmark' in checkpoint:
        # indobenchmark models; the 'lite' variants are ALBERT-based.
        vocab_path, config_path = None, None
        tokenizer = BertTokenizer.from_pretrained(checkpoint)
        config = BertConfig.from_pretrained(checkpoint)
        _set_num_labels(config)
        is_lite = 'lite' in checkpoint
        model_class = _task_class({
            'sequence_classification': (AlbertForSequenceClassification
                                        if is_lite
                                        else BertForSequenceClassification),
            'token_classification': (AlbertForWordClassification
                                     if is_lite
                                     else BertForWordClassification),
            'multi_label_classification': (AlbertForMultiLabelClassification
                                           if is_lite
                                           else BertForMultiLabelClassification),
        })
        model = model_class.from_pretrained(checkpoint, config=config)
    else:
        raise ValueError("Unknown model checkpoint: {}".format(checkpoint))

    return model, tokenizer, vocab_path, config_path
def load_model(args):
    """Instantiate (model, tokenizer, vocab_path, config_path) for
    ``args['model_checkpoint']``.

    Three families of checkpoints are supported:
      * locally stored Indonesian BERT/ALBERT pretrains under ../embeddings/
        (vocab/config/weight paths looked up in a table),
      * hub checkpoints (bert-base-multilingual-*, xlm-mlm-*, xlm-roberta-*,
        indobenchmark/*),
      * static word-embedding baselines (word2vec/fasttext) and a randomly
        initialized 'scratch' BERT.

    Expected ``args`` keys: 'model_checkpoint', 'task', 'num_labels'; the
    embedding baselines additionally need 'vocab_path',
    'word_tokenizer_class', 'embedding_path', 'lower' and 'num_layers'.

    Returns:
        (model, tokenizer, vocab_path, config_path); the paths are None for
        hub checkpoints.

    Raises:
        ValueError: unrecognized checkpoint, task, or embedding checkpoint.
            (The original code left `model`, `embeddings` — and, in the
            'albert-large-wwmlm-512' branch, `config_path` — unbound in
            these cases and crashed later with UnboundLocalError.)
    """
    # Task -> classifier-head class, per encoder family.
    bert_heads = {
        'sequence_classification': BertForSequenceClassification,
        'token_classification': BertForWordClassification,
        'multi_label_classification': BertForMultiLabelClassification,
    }
    albert_heads = {
        'sequence_classification': AlbertForSequenceClassification,
        'token_classification': AlbertForWordClassification,
        'multi_label_classification': AlbertForMultiLabelClassification,
    }

    def _task_class(heads):
        # Fail loudly on an unknown task instead of leaving `model` unbound.
        try:
            return heads[args['task']]
        except KeyError:
            raise ValueError("Unknown task: {}".format(args['task']))

    def _set_num_labels(config):
        # A list of label counts means multi-task heads; the shared head is
        # sized by the largest sub-task.
        if isinstance(args['num_labels'], list):
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

    # Locally stored pretrains:
    # name -> (vocab txt, config json, weight file, family, from_tf)
    local_checkpoints = {
        'albert-large-wwmlm-512': (
            "../embeddings/albert-large-wwmlm-512/albert_large_model_bpe_wwmlm_512_vocab_uncased_30000.txt",
            "../embeddings/albert-large-wwmlm-512/albert_large_model_bpe_wwmlm_512_albert_large_config.json",
            "../embeddings/albert-large-wwmlm-512/albert_large_model_bpe_wwmlm_512_pytorch_albert_large_512_629k.bin",
            'albert', False),
        'albert-base-wwmlm-512': (
            "../embeddings/albert-base-wwmlm-512/albert_base_model_bpe_wwmlm_512_vocab_uncased_30000.txt",
            "../embeddings/albert-base-wwmlm-512/albert_base_model_bpe_wwmlm_512_albert_base_config.json",
            "../embeddings/albert-base-wwmlm-512/albert_base_model_bpe_wwmlm_512_pytorch_model_albert_base_162k.bin",
            'albert', False),
        'albert-large-wwmlm-128': (
            "../embeddings/albert-large-wwmlm-128/albert_large_model_bpe_wwmlm_128_vocab_uncased_30000.txt",
            "../embeddings/albert-large-wwmlm-128/albert_large_model_bpe_wwmlm_128_albert_large_config.json",
            "../embeddings/albert-large-wwmlm-128/albert_large_model_bpe_wwmlm_128_pytorch_albert_large_128_500k.bin",
            'albert', False),
        'babert-bpe-mlm-large-512': (
            "../embeddings/babert-bpe-mlm-large-512/babert_model_bpe_mlm_uncased_large_512_dup10-5_vocab_uncased_30522.txt",
            "../embeddings/babert-bpe-mlm-large-512/babert_model_bpe_mlm_uncased_large_512_dup10-5_bert_large_config.json",
            "../embeddings/babert-bpe-mlm-large-512/babert_model_bpe_mlm_uncased_large_512_dup10-5_pytorch_babert_uncased_large_512_dup10-5_1120k.bin",
            'bert', False),
        'albert-base-uncased-112500': (
            "../embeddings/albert-base-uncased-112500/vocab.txt",
            "../embeddings/albert-base-uncased-112500/bert_config.json",
            "../embeddings/albert-base-uncased-112500/albert_base_uncased_112500.bin",
            'albert', False),
        'albert-base-uncased-96000': (
            "../embeddings/albert-base-uncased-96000/vocab.txt",
            "../embeddings/albert-base-uncased-96000/bert_config.json",
            "../embeddings/albert-base-uncased-96000/albert_base_uncased_96000.bin",
            'albert', False),
        'albert-base-uncased-191k': (
            "../embeddings/albert-base-uncased-191k/pytorch_models_albert_base_uncased_191500_vocab_uncased_30000.txt",
            "../embeddings/albert-base-uncased-191k/pytorch_models_albert_base_uncased_191500_albert_base_config.json",
            "../embeddings/albert-base-uncased-191k/pytorch_models_albert_base_uncased_191500_pytorch_model_albert_base_191k.bin",
            'albert', False),
        # TF checkpoint: loaded with from_tf=True.
        'babert-opensubtitle': (
            "../embeddings/babert-opensubtitle/vocab.txt",
            "../embeddings/babert-opensubtitle/bert_config.json",
            "../embeddings/babert-opensubtitle/model.ckpt-1000000.index",
            'bert', True),
        'babert-bpe-mlm-large-uncased-1100k': (
            "../embeddings/babert-bpe-mlm-large-uncased-1100k/pytorch_models_babert_uncased_large_1100k_vocab_uncased_30522.txt",
            "../embeddings/babert-bpe-mlm-large-uncased-1100k/pytorch_models_babert_uncased_large_1100k_bert_config.json",
            "../embeddings/babert-bpe-mlm-large-uncased-1100k/pytorch_models_babert_uncased_large_1100k_pytorch_model_babert_large_1100k.bin",
            'bert', False),
        'babert-bpe-mlm-large-uncased-1m': (
            "../embeddings/babert-bpe-mlm-large-uncased-1m/pytorch_models_babert_uncased_large_1mil_vocab_uncased_30522.txt",
            "../embeddings/babert-bpe-mlm-large-uncased-1m/pytorch_models_babert_uncased_large_1mil_bert_config.json",
            "../embeddings/babert-bpe-mlm-large-uncased-1m/pytorch_models_babert_uncased_large_1mil_pytorch_model_babert_large_1mil.bin",
            'bert', False),
        'babert-base-512': (
            "../embeddings/babert-base-512/pytorch_models_babert_base_512_vocab_uncased_30522.txt",
            "../embeddings/babert-base-512/pytorch_models_babert_base_512_bert_config.json",
            "../embeddings/babert-base-512/pytorch_models_babert_base_512_pytorch_model_babert_base_uncased_512.bin",
            'bert', False),
        'babert-bpe-mlm-large-uncased': (
            "../embeddings/babert-bpe-mlm-large-uncased/pytorch_models_babert_uncased_large_vocab_uncased_30522.txt",
            "../embeddings/babert-bpe-mlm-large-uncased/pytorch_models_babert_uncased_large_bert_config.json",
            "../embeddings/babert-bpe-mlm-large-uncased/pytorch_models_babert_uncased_large_pytorch_model_babert_large_778500.bin",
            'bert', False),
        'babert-bpe-mlm-uncased-128-dup10-5': (
            "../embeddings/babert-bpe-mlm-uncased-128-dup10-5/vocab.txt",
            "../embeddings/babert-bpe-mlm-uncased-128-dup10-5/bert_config.json",
            "../embeddings/babert-bpe-mlm-uncased-128-dup10-5/pytorch_model.bin",
            'bert', False),
    }

    checkpoint = args['model_checkpoint']

    if checkpoint in local_checkpoints:
        vocab_path, config_path, weight_path, family, from_tf = \
            local_checkpoints[checkpoint]
        tokenizer = BertTokenizer(vocab_path)
        if family == 'albert':
            config = AlbertConfig.from_json_file(config_path)
            _set_num_labels(config)
            model = _task_class(albert_heads)(config)
            # Plug the pretrained encoder into the freshly built classifier.
            model.albert = AlbertModel.from_pretrained(
                weight_path, from_tf=from_tf, config=config)
        else:
            config = BertConfig.from_json_file(config_path)
            _set_num_labels(config)
            model = _task_class(bert_heads)(config)
            bert_model = BertForPreTraining.from_pretrained(
                weight_path, from_tf=from_tf, config=config)
            model.bert = bert_model.bert
    elif 'bert-base-multilingual' in checkpoint:
        # bert-base-multilingual-uncased or bert-base-multilingual-cased
        vocab_path, config_path = None, None
        tokenizer = BertTokenizer.from_pretrained(checkpoint)
        config = BertConfig.from_pretrained(checkpoint)
        _set_num_labels(config)
        model = _task_class(bert_heads).from_pretrained(checkpoint,
                                                        config=config)
    elif 'xlm-mlm' in checkpoint:
        # xlm-mlm-100-1280
        vocab_path, config_path = None, None
        tokenizer = XLMTokenizer.from_pretrained(checkpoint)
        config = XLMConfig.from_pretrained(checkpoint)
        _set_num_labels(config)
        model_class = _task_class({
            'sequence_classification': XLMForSequenceClassification,
            'token_classification': XLMForWordClassification,
            'multi_label_classification': XLMForMultiLabelClassification,
        })
        model = model_class.from_pretrained(checkpoint, config=config)
    elif 'xlm-roberta' in checkpoint:
        # xlm-roberta-base or xlm-roberta-large
        vocab_path, config_path = None, None
        tokenizer = XLMRobertaTokenizer.from_pretrained(checkpoint)
        config = XLMRobertaConfig.from_pretrained(checkpoint)
        _set_num_labels(config)
        model_class = _task_class({
            'sequence_classification': XLMRobertaForSequenceClassification,
            'token_classification': XLMRobertaForWordClassification,
            'multi_label_classification': XLMRobertaForMultiLabelClassification,
        })
        model = model_class.from_pretrained(checkpoint, config=config)
    elif 'word2vec' in checkpoint or 'fasttext' in checkpoint:
        # BERT-shaped classifier initialized from static word vectors.
        vocab_path = args['vocab_path']
        config_path = None
        word_tokenizer = args['word_tokenizer_class']()
        emb_path = args['embedding_path'][checkpoint]
        _, vocab_map = load_vocab(vocab_path)
        tokenizer = SimpleTokenizer(vocab_map, word_tokenizer,
                                    lower=args["lower"])
        vocab_list = list(tokenizer.vocab.keys())

        config = BertConfig.from_pretrained('bert-base-uncased')
        _set_num_labels(config)
        config.num_hidden_layers = args["num_layers"]
        if checkpoint == 'word2vec-twitter':
            embeddings = gen_embeddings(vocab_list, emb_path)
            config.hidden_size = 400
            config.num_attention_heads = 8
        elif checkpoint in ('fasttext-cc-id',
                            'fasttext-cc-id-300-no-oov-uncased',
                            'fasttext-4B-id-300-no-oov-uncased'):
            embeddings = gen_embeddings(vocab_list, emb_path, emb_dim=300)
            config.hidden_size = 300
            config.num_attention_heads = 10
        else:
            raise ValueError(
                "Unknown embedding checkpoint: {}".format(checkpoint))
        config.vocab_size = len(embeddings)

        model = _task_class(bert_heads)(config)
        # Overwrite the randomly initialized embedding table with the
        # pretrained vectors.
        model.bert.embeddings.word_embeddings.weight.data.copy_(
            torch.FloatTensor(embeddings))
    elif 'scratch' in checkpoint:
        # Small randomly-initialized BERT; no pretrained weights loaded.
        vocab_path, config_path = None, None
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        config = BertConfig.from_pretrained("bert-base-uncased")
        _set_num_labels(config)
        config.num_hidden_layers = args["num_layers"]
        config.hidden_size = 300
        config.num_attention_heads = 10
        model = _task_class(bert_heads)(config=config)
    elif 'indobenchmark' in checkpoint:
        # indobenchmark models (BERT-based; no 'lite' handling here, matching
        # the original branch).
        vocab_path, config_path = None, None
        tokenizer = BertTokenizer.from_pretrained(checkpoint)
        config = BertConfig.from_pretrained(checkpoint)
        _set_num_labels(config)
        model = _task_class(bert_heads).from_pretrained(checkpoint,
                                                        config=config)
    else:
        raise ValueError("Unknown model checkpoint: {}".format(checkpoint))

    return model, tokenizer, vocab_path, config_path
# Build train/validation loaders; num_workers is pinned at 16.
train_loader = NerDataLoader(dataset=train_dataset,
                             max_seq_len=max_seq_len,
                             batch_size=batch_size,
                             num_workers=16,
                             shuffle=True)
valid_loader = NerDataLoader(dataset=valid_dataset,
                             max_seq_len=max_seq_len,
                             batch_size=eval_batch_size,
                             num_workers=16,
                             shuffle=False)

# Label <-> index maps for the Shopee NER label set.
w2i, i2w = NerShopeeDataset.LABEL2INDEX, NerShopeeDataset.INDEX2LABEL

# Instantiate model
model = BertForWordClassification.from_pretrained(model_name, config=config)

## RESUME
# Resume fine-tuning from a previously saved per-epoch checkpoint; weights
# are loaded onto CPU first, then moved to GPU below.
model_epoch = 2
output_model = "{}model-{}.pth".format(model_dir[:-1] + "-last_trained/",
                                       model_epoch)
logger.info("Loaded model: {}".format(output_model))
checkpoint = torch.load(output_model, map_location='cpu')
model.load_state_dict(checkpoint['model_state_dict'])

# NOTE(review): only the model weights are restored — the Adam optimizer is
# created fresh, so its moment estimates restart from zero. Confirm this is
# intended for the resume path.
learning_rate = 5e-6
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
model = model.cuda()
logger.info("Model: indobert-{}".format(model_version))