def construct_seq_class_transformer(options: KaggleEvaluationOptions) -> Reranker:
    """Build a MonoBERT reranker from Kaggle evaluation options.

    Load order: try a plain PyTorch checkpoint; on OSError retry as a
    TensorFlow checkpoint; if that raises AttributeError, apply a manual
    classifier-head hotfix (BioBERT MS MARCO checkpoints).
    """
    try:
        # Normal path: PyTorch weights, placed on the requested device
        # by get_model itself.
        model = MonoBERT.get_model(options.model_name, device=options.device)
    except OSError:
        try:
            # Fallback: the checkpoint is stored in TensorFlow format.
            model = MonoBERT.get_model(
                options.model_name, from_tf=True, device=options.device)
        except AttributeError:
            # Hotfix for BioBERT MS MARCO. Refactor.
            # Stash zero-initialized head tensors on the class so the load
            # succeeds, then attach them to the loaded instance.
            # NOTE(review): mutating BertForSequenceClassification class
            # attributes is a process-wide side effect — confirm no
            # concurrent model loads rely on this class.
            BertForSequenceClassification.bias = torch.nn.Parameter(
                torch.zeros(2))
            BertForSequenceClassification.weight = torch.nn.Parameter(
                torch.zeros(2, 768))
            model = BertForSequenceClassification.from_pretrained(
                options.model_name, from_tf=True)
            model.classifier.weight = BertForSequenceClassification.weight
            model.classifier.bias = BertForSequenceClassification.bias
            # Only this branch must place the model and set eval mode
            # manually; get_model handled the device in the other branches.
            device = torch.device(options.device)
            model = model.to(device).eval()
    tokenizer = MonoBERT.get_tokenizer(
        options.tokenizer_name, do_lower_case=options.do_lower_case)
    return MonoBERT(model, tokenizer)
def construct_seq_class_transformer(
        options: DocumentRankingEvaluationOptions) -> Reranker:
    """Construct a MonoBERT reranker for document ranking evaluation.

    `get_model` handles checkpoint format (`from_tf`) and device
    placement; `get_tokenizer` resolves the tokenizer by name.
    """
    return MonoBERT(
        MonoBERT.get_model(
            options.model, from_tf=options.from_tf, device=options.device),
        MonoBERT.get_tokenizer(options.tokenizer_name),
    )
def build_bert_reranker(
    name_or_path: str = "castorini/monobert-large-msmarco-finetune-only",
    device: str = None,
):
    """Construct a MonoBERT reranker.

    Args:
        name_or_path: Hugging Face model name or local checkpoint path;
            also used to resolve the matching tokenizer.
        device: device string forwarded to ``MonoBERT.get_model``
            (``None`` lets it pick a default).
    """
    weights = MonoBERT.get_model(name_or_path, device=device)
    vocab = MonoBERT.get_tokenizer(name_or_path)
    return MonoBERT(weights, vocab)
def construct_seq_class_transformer(
        options: DocumentRankingEvaluationOptions) -> Reranker:
    """Load a sequence-classification model and tokenizer, wrap in MonoBERT."""
    # Tokenizer and model resolve independently from the options.
    tok = AutoTokenizer.from_pretrained(options.tokenizer_name)
    net = AutoModelForSequenceClassification.from_pretrained(
        options.model, from_tf=options.from_tf)
    # Move to the requested device and switch to inference mode.
    target = torch.device(options.device)
    net = net.to(target).eval()
    return MonoBERT(net, tok)
def construct_seq_class_transformer(
        options: PassageRankingEvaluationOptions) -> Reranker:
    """Build a MonoBERT reranker for passage ranking evaluation.

    Loads the model via AutoModelForSequenceClassification; on
    AttributeError falls back to a manual classifier-head hotfix
    (BioBERT MS MARCO checkpoints).
    """
    try:
        model = AutoModelForSequenceClassification.from_pretrained(
            options.model, from_tf=options.from_tf)
    except AttributeError:
        # Hotfix for BioBERT MS MARCO. Refactor.
        # Stash zero-initialized head tensors on the class so the load
        # succeeds, then attach them to the loaded instance.
        # NOTE(review): mutating BertForSequenceClassification class
        # attributes is a process-wide side effect — confirm no
        # concurrent model loads rely on this class.
        BertForSequenceClassification.bias = torch.nn.Parameter(torch.zeros(2))
        BertForSequenceClassification.weight = torch.nn.Parameter(
            torch.zeros(2, 768))
        model = BertForSequenceClassification.from_pretrained(
            options.model, from_tf=options.from_tf)
        model.classifier.weight = BertForSequenceClassification.weight
        model.classifier.bias = BertForSequenceClassification.bias
    # Both load paths end here: place the model and set inference mode.
    device = torch.device(options.device)
    model = model.to(device).eval()
    tokenizer = AutoTokenizer.from_pretrained(options.tokenizer_name)
    return MonoBERT(model, tokenizer)
import nltk from bs4 import BeautifulSoup from nltk.tokenize import word_tokenize from collections import Counter from nltk.corpus import stopwords from pygaggle.rerank.base import Query, Text from pygaggle.rerank.transformer import MonoT5 from nltk.tokenize import word_tokenize from pygaggle.rerank.transformer import MonoBERT from pygaggle.rerank.base import hits_to_texts from expanders.relevancefeedback import RelevanceFeedback from cmn import utils reranker = MonoBERT() #@inproceedings{zheng-etal-2020-bert, # title = "{BERT-QE}: {C}ontextualized {Q}uery {E}xpansion for {D}ocument {R}e-ranking", # author = "Zheng, Zhi and Hui, Kai and He, Ben and Han, Xianpei and Sun, Le and Yates, Andrew", # booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2020", # month = nov, # year = "2020", # address = "Online", # publisher = "Association for Computational Linguistics", # url = "https://www.aclweb.org/anthology/2020.findings-emnlp.424", # pages = "4718--4728", #} class BertQE(RelevanceFeedback):