def main(): """This is where it happens""" tok = T5Tokenizer.from_pretrained('t5-small') data = Data( xmi_dir=args.xmi_dir, tokenizer=tok, max_input_length=args.max_input_length, max_output_length=args.max_output_length, partition=args.partition, n_files=args.n_files) for index in range(len(data)): input_ids = data[index]['input_ids'] output_ids = data[index]['decoder_input_ids'] print(tok.decode(input_ids, skip_special_tokens=True)) print(tok.decode(output_ids, skip_special_tokens=True)) print()
def __init__(self,
             model_name_or_path: str,
             max_seq_length: int = 128,
             do_lower_case: bool = True):
    super(T5, self).__init__()
    self.config_keys = ['max_seq_length', 'do_lower_case']
    self.do_lower_case = do_lower_case

    if max_seq_length > 512:
        logging.warning(
            "T5 only allows a max_seq_length of 512. Value will be set to 512")
        max_seq_length = 512
    self.max_seq_length = max_seq_length

    self.enc_model = T5Model.from_pretrained(model_name_or_path)
    self.tokenizer = T5Tokenizer.from_pretrained(
        model_name_or_path, do_lower_case=do_lower_case)
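# Instantiation sketch for the module above; the class name T5 is taken from
# the super() call, and 't5-small' is an illustrative checkpoint, not one the
# snippet itself prescribes.
t5_module = T5('t5-small', max_seq_length=256, do_lower_case=False)
print(t5_module.max_seq_length)  # 256; values above 512 would be clamped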
def __init__(self,
             type_path: str,
             input_length: int,
             output_length: int,
             num_samples: int = None,
             tokenizer=None,
             sql2txt: bool = True) -> None:
    # Avoid loading the tokenizer at import time as a default argument;
    # fall back to t5-small only when no tokenizer is supplied.
    self.tokenizer = tokenizer or T5Tokenizer.from_pretrained('t5-small')
    self.dataset = load_dataset('wikisql', 'all', data_dir='data/',
                                split=type_path)
    if num_samples:
        self.dataset = self.dataset.select(list(range(0, num_samples)))
    self.input_length = input_length
    self.output_length = output_length
    self.sql2txt = sql2txt
def __init__(
    self,
    model_name_or_path,
    tokenizer_name,
    model_cache_dir,
    input_max_length,
    target_max_length,
    summary_column_name,
    document_column_name,
    title_column_name,
    summarize_prefix,
    title_prefix,
    wandb_project,
    wandb_run_name,
    version_column=None,
    **kwargs,
):
    super().__init__(
        input_max_length,
        target_max_length,
        summary_column_name,
        document_column_name,
        wandb_project,
        wandb_run_name,
    )
    self.title_column_name = title_column_name
    self.tokenizer = T5Tokenizer.from_pretrained(
        tokenizer_name if tokenizer_name else model_name_or_path,
        cache_dir=model_cache_dir,
    )
    self.model = T5ForConditionalGeneration.from_pretrained(
        model_name_or_path,
        cache_dir=model_cache_dir,
    )
    self.summarize_prefix = summarize_prefix
    self.title_prefix = title_prefix
    self.version_column = version_column
    self.summarize_prefixes = {
        "en": "summarize",
        "de": "zusammenfassen",
        "fr": "résume"
    }
    self.title_prefixes = {"en": "title", "de": "titel", "fr": "titre"}
def get_tokenizer(self, config):
    tokenizer = None  # if not configured, then no need to assign
    if config.tokenizer_type.startswith('word'):
        tokenizer = nltk.word_tokenize
    elif config.tokenizer_type.startswith('bert-'):
        tokenizer = BertTokenizer.from_pretrained(config.tokenizer_type,
                                                  do_lower_case=True)
    elif config.tokenizer_type.startswith('xlnet'):
        tokenizer = XLNetTokenizer.from_pretrained(config.tokenizer_type,
                                                   do_lower_case=True)
    elif config.tokenizer_type.startswith('t5-'):
        tokenizer = T5Tokenizer.from_pretrained(config.tokenizer_type,
                                                do_lower_case=True)
    elif config.tokenizer_type.startswith('bart-'):
        tokenizer = BartTokenizer.from_pretrained(config.tokenizer_type,
                                                  do_lower_case=True)
    return tokenizer
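# Minimal usage sketch for get_tokenizer. The owning class is not shown above
# and the body never touches self, so a plain call with None works here; the
# SimpleNamespace config is an assumption standing in for the real config
# object, of which only the tokenizer_type attribute is read.
from types import SimpleNamespace

config = SimpleNamespace(tokenizer_type='t5-small')
tokenizer = get_tokenizer(None, config)  # self is unused in the body
print(tokenizer.tokenize('translate English to German: hello'))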
def __init__(self,
             s2v_model_path='s2v_old',
             qg_model_path='Parth/result',
             bq_model_path='ramsrigouthamg/t5_boolean_questions',
             ap_model_path='Parth/boolean',
             t5_tokenizer_path='t5-base'):
    self.tokenizer = T5Tokenizer.from_pretrained(t5_tokenizer_path)
    self.normalized_levenshtein = NormalizedLevenshtein()
    self.rand = random.Random(datetime.now())
    self.device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu")
    self.qg_model = T5ForConditionalGeneration.from_pretrained(
        qg_model_path).to(self.device)
    self.bq_model = T5ForConditionalGeneration.from_pretrained(
        bq_model_path).to(self.device)
    self.ap_model = T5ForConditionalGeneration.from_pretrained(
        ap_model_path).to(self.device)
def _test_TFT5Model(self, size, large=False):
    from transformers import T5Tokenizer, TFT5Model
    tokenizer = T5Tokenizer.from_pretrained(size)
    model = TFT5Model.from_pretrained(size)
    input_ids = \
        tokenizer("Studies have been shown that owning a dog is good for you",
                  return_tensors="tf").input_ids
    decoder_input_ids = \
        tokenizer("Studies show that", return_tensors="tf").input_ids
    input_dict = {
        "input_ids": input_ids,
        "decoder_input_ids": decoder_input_ids
    }
    spec, input_dict = self.spec_and_pad(input_dict)
    outputs = ["last_hidden_state"]
    self.run_test(model, input_dict, input_signature=spec,
                  outputs=outputs, large=large)
def summarize(function_directory, text):
    model_path = get_model_path(function_directory)

    logging.info(f"Loading model from {model_path}")
    start = time()
    tokenizer = T5Tokenizer.from_pretrained(model_path)
    model = T5ForConditionalGeneration.from_pretrained(model_path)
    logging.info(f"Model loaded in {round(time() - start, 2)}s.")

    logging.info("Tokenizing data...")
    input_text = tokenizer.encode(f"summarize: {text}", return_tensors="pt")

    start = time()
    translated = model.generate(input_text)
    logging.info(f"Model executed in {round(time() - start, 2)}s.")

    logging.info("Generating result...")
    start = time()
    result = tokenizer.decode(translated[0], skip_special_tokens=True)
    logging.info(f"Result generated in {round(time() - start, 2)}s.")
    return result
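# Usage sketch for summarize. get_model_path is not defined above, so the
# function_directory argument here is a hypothetical stand-in for whatever
# layout that helper expects.
logging.basicConfig(level=logging.INFO)
article = ("The committee met on Tuesday to review the budget proposal and "
           "approved additional funding for road repairs.")
print(summarize("/path/to/function_dir", article))  # hypothetical directory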
def question_generation(text):
    model = T5ForConditionalGeneration.from_pretrained(
        'ramsrigouthamg/t5_boolean_questions')
    tokenizer = T5Tokenizer.from_pretrained('t5-small')
    passage = text
    truefalse = "yes"
    text = "truefalse: %s passage: %s </s>" % (passage, truefalse)
    max_len = 256
    encoding = tokenizer.encode_plus(text, return_tensors="pt")
    input_ids, attention_masks = encoding["input_ids"], encoding[
        "attention_mask"]
    output = beam_search_decoding(input_ids, attention_masks, model,
                                  tokenizer)
    return output
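# Short usage sketch, assuming beam_search_decoding (not defined above) is in
# scope and returns a list of decoded question strings.
passage = ("The Eiffel Tower was completed in 1889 and remains one of the "
           "most visited monuments in the world.")
for question in question_generation(passage):
    print(question)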
def __init__(self, **kwargs):
    """
    Initialize T5 embedder.

    :param str model_directory: where the weights of the model can be found
    :param device: whether to compute on the CPU or GPU
    :type device: str or torch.device or None
    :param bool decoder: Whether to also use the decoder (default: False)
    :param bool half_precision_model: Use the model in half precision (float16) mode (default: False)
    """
    # HIWI Benjamin
    # The user can request the half precision model either by specifying its path or by setting the flag.
    # The half precision model will be used if either the flag is set or a path is provided.
    # This is done before calling super so that the paths can still be fetched if not provided.
    if ('half_precision_model' in kwargs.keys()
            or 'half_precision_model_directory' in kwargs.keys()) \
            and 'model_directory' not in kwargs.keys():
        # The necessary directories change: 'model_directory' is no longer
        # needed, but 'half_precision_model_directory' is.
        self.necessary_directories = ["half_precision_model_directory"]
        # If the path was provided but the flag wasn't, set the flag for later use.
        kwargs['half_precision_model'] = True

    super().__init__(**kwargs)

    # Set the model directory depending on whether to use half precision.
    if 'half_precision_model' in kwargs.keys(
    ) and 'model_directory' not in kwargs.keys():
        self._model_directory = self._options[
            "half_precision_model_directory"]
    else:
        self._model_directory = self._options["model_directory"]

    # Until we know whether we need the decoder, let's keep it here as an undocumented option.
    # Should the need arise we can just split this class into an encoder and a decoder subclass
    # by setting one subclass to _decoder=True and the other to _decoder=False.
    self._decoder = self._options.get("decoder", False)
    self._half_precision_model = self._options.get("half_precision_model",
                                                   False)

    self._model = self.get_model().to(self._device).eval()
    self._model_fallback = None
    self._tokenizer = T5Tokenizer.from_pretrained(self._model_directory,
                                                  do_lower_case=False)
def __init__(self, hparams: argparse.Namespace, num_labels=None,
             **config_kwargs) -> None:
    super().__init__()
    self.hparams = hparams
    cache_dir = self.hparams.cache_dir if self.hparams.cache_dir else None
    self.config = T5Config.from_pretrained(
        self.hparams.config_name
        if self.hparams.config_name else self.hparams.model_name_or_path,
        **({"num_labels": num_labels} if num_labels is not None else {}),
        cache_dir=cache_dir,
        **config_kwargs,
    )
    self.tokenizer = T5Tokenizer.from_pretrained(
        self.hparams.tokenizer_name
        if self.hparams.tokenizer_name else self.hparams.model_name_or_path,
        cache_dir=cache_dir,
    )
    self.model = T5ForConditionalGeneration.from_pretrained(
        self.hparams.model_name_or_path,
        from_tf=bool(".ckpt" in self.hparams.model_name_or_path),
        config=self.config,
        cache_dir=cache_dir,
    )

    # Fix for the eos token id problem; see
    # https://github.com/huggingface/transformers/issues/5142 for more info
    # on the problem and workaround.
    if self.tokenizer.eos_token_id == 1:
        self.tokenizer.add_special_tokens({'eos_token': '[EOS]'})
        self.model.resize_token_embeddings(len(self.tokenizer))

    self.dataset_kwargs: dict = dict(
        data_dir=self.hparams.input_dir,
        max_source_length=1024,
        max_target_length=56,
    )
    self.loss_names = ["loss"]
    self.metric_names = ROUGE_KEYS
    self.val_metric = "rouge2"
def Summarize(document):
    model = T5ForConditionalGeneration.from_pretrained('t5-small')
    tokenizer = T5Tokenizer.from_pretrained('t5-small')
    device = torch.device('cpu')

    preprocess_text = document.strip().replace("\n", " ")
    t5_prepared_Text = "summarize: " + preprocess_text
    tokenized_text = tokenizer.encode(t5_prepared_Text,
                                      return_tensors="pt",
                                      max_length=5000,
                                      truncation=True).to(device)
    summary_ids = model.generate(tokenized_text,
                                 num_beams=2,
                                 no_repeat_ngram_size=2,
                                 min_length=10,
                                 max_length=100,
                                 early_stopping=False)
    output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return output
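# Usage sketch for Summarize. The article text is illustrative; the first
# call downloads t5-small if it is not already cached.
article = ("Machine translation systems map text from one language to "
           "another. Modern systems are trained end to end on large parallel "
           "corpora and evaluated with automatic metrics such as BLEU.")
print(Summarize(article))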
def __init__(self, hparams):
    super().__init__()

    # Parameters stored in dictionary
    self.hparams = hparams

    # Tokenizer for decoding sentences
    self.tokenizer = T5Tokenizer.from_pretrained(self.hparams.t5_model)

    # Decoder -> decode image embedding combined with the last hidden state of the encoder
    self.decoder = T5ForConditionalGeneration.from_pretrained(
        self.hparams.t5_model)

    # Sentence encoder -> just a transformer encoder for questions
    if self.hparams.same_enc:
        self.sentence_encoder = self.decoder.get_encoder()
    else:
        self.sentence_encoder = T5EncoderModel.from_pretrained(
            self.hparams.t5_model)

    self.sync_dist = self.hparams.gpus > 1
def __init__(self,
             model_size: str = "small",
             num_beams: int = 4,
             no_repeat_ngram_size: int = 2,
             min_length: int = 30,
             max_length: int = 100,
             skip_special_tokens: bool = True):
    if model_size not in ["small", "base", "large", "xl", "xxl"]:
        raise ValueError(
            f'model_size "{model_size}" not found. It might be a typo; '
            'if not, please consult our document.')
    self.model = MT5ForConditionalGeneration.from_pretrained(
        f'google/mt5-{model_size}')
    self.tokenizer = T5Tokenizer.from_pretrained(f'google/mt5-{model_size}')
    self.num_beams = num_beams
    self.no_repeat_ngram_size = no_repeat_ngram_size
    self.min_length = min_length
    self.max_length = max_length
    self.skip_special_tokens = skip_special_tokens
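# Instantiation sketch. The enclosing class is not named above, so
# MT5Summarizer is a hypothetical stand-in; the stored generation settings
# would typically be forwarded to model.generate() as shown.
summarizer = MT5Summarizer(model_size="small")  # hypothetical class name
batch = summarizer.tokenizer(
    "summarize: The quick brown fox jumps over the lazy dog.",
    return_tensors="pt")
summary_ids = summarizer.model.generate(
    batch.input_ids,
    num_beams=summarizer.num_beams,
    no_repeat_ngram_size=summarizer.no_repeat_ngram_size,
    min_length=summarizer.min_length,
    max_length=summarizer.max_length)
print(summarizer.tokenizer.decode(
    summary_ids[0], skip_special_tokens=summarizer.skip_special_tokens))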
def __init__(self, lang_code='en', max_questions=20):
    self.tokenizer = T5Tokenizer.from_pretrained('t5-base')
    model = T5ForConditionalGeneration.from_pretrained('Parth/result')
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # model.eval()
    self.device = device
    self.model = model
    self.nlp = self.try_load_spacy_model(lang_code)
    self.max_questions = int(max_questions)
    self.s2v = Sense2Vec().from_disk(
        '/Users/dev/Develop/text-to-anki/backend/src/Questgen.ai/Questgen.ai/Questgen/s2v_old'
    )
    self.fdist = FreqDist(brown.words())
    self.normalized_levenshtein = NormalizedLevenshtein()
    self.set_seed(42)
def main():
    document = {}
    document['uuid'] = '1234567'
    document['text'] = "Que tal fazer uma poc inicial para vermos a viabilidade e identificarmos as dificuldades?\nA motivação da escolha desse problema " \
        "foi que boa parte dos atos de matrícula passam de 512 tokens, e ainda não temos uma solução definida para fazer treinamento e predições em " \
        "janelas usando o QA.\nEssa limitação dificulta o uso de QA para problemas que não sabemos onde a informação está no documento (por enquanto, " \
        "só aplicamos QA em tarefas que sabemos que a resposta está nos primeiros 512 tokens da matrícula).\nComo esse problema de identificar a proporção " \
        "de cada pessoa são duas tarefas (identificação + relação com uma pessoa), podemos usar a localização da pessoa no texto para selecionar apenas " \
        "uma pedaço do ato de alienação pra passar como contexto pro modelo, evitando um pouco essa limitação dos 512 tokens."
    # The two assignments below overwrite the text above; they are kept as
    # alternative test fixtures.
    document['text'] = "PREFEITURA DE CAUCAIA\nSECRETARIA DE FINAN\u00c7AS,PLANEJAMENTO E OR\u00c7AMENTO\nCERTID\u00c3O NEGATIVA DE TRIBUTOS ECON\u00d4MICOS\nLA SULATE\nN\u00ba 2020000982\nRaz\u00e3o Social\nCOMPASS MINERALS AMERICA DO SUL INDUSTRIA E COMERC\nINSCRI\u00c7\u00c3O ECON\u00d4MICA Documento\nBairro\n00002048159\nC.N.P.J.: 60398138001860\nSITIO SALGADO\nLocalizado ROD CE 422 KM 17, S/N - SALA SUPERIOR 01 CXP - CAUCAIA-CE\nCEP\n61600970\nDADOS DO CONTRIBUINTE OU RESPONS\u00c1VEL\nInscri\u00e7\u00e3o Contribuinte / Nome\n169907 - COMPASS MINERALS AMERICA DO SUL INDUSTRIA E COMERC\nEndere\u00e7o\nROD CE 422 KM 17, S/N SALA SUPERIOR 01 CXP\nDocumento\nC.N.P.J.: 60.398.138/0018-60\nSITIO SALGADO CAUCAIA-CE CEP: 61600970\nNo. Requerimento\n2020000982/2020\nNatureza jur\u00eddica\nPessoa Juridica\nCERTID\u00c3O\nCertificamos para os devidos fins, que revendo os registros dos cadastros da d\u00edvida ativa e de\ninadimplentes desta Secretaria, constata-se - at\u00e9 a presente data \u2013 n\u00e3o existirem em nome do (a)\nrequerente, nenhuma pend\u00eancia relativa a tributos municipais.\nSECRETARIA DE FINAN\u00c7AS, PLANEJAMENTO E OR\u00c7AMENTO se reserva o direito de inscrever e cobrar as\nd\u00edvidas que posteriormente venham a ser apurados. Para Constar, foi lavrada a presente Certid\u00e3o.\nA aceita\u00e7\u00e3o desta certid\u00e3o est\u00e1 condicionada a verifica\u00e7\u00e3o de sua autenticidade na internet, nos\nseguinte endere\u00e7o: http://sefin.caucaia.ce.gov.br/\nCAUCAIA-CE, 03 DE AGOSTO DE 2020\nEsta certid\u00e3o \u00e9 v\u00e1lida por 090 dias contados da data de emiss\u00e3o\nVALIDA AT\u00c9: 31/10/2020\nCOD. VALIDA\u00c7\u00c3O 2020000982"
    document['text'] = 'M Santander\nProposta de Abertura de Conta Poupança, Utilizando de\nProdutos e Serviços e Outras Avenças - Pessoa Física\nP3ID008159105563\n1695\nAgência Nº\nQuantidade de Titulares\nPAB N°\n1\nCondição de Movimentação da Conta\nModalidade de Poupança\nPOUPANCA ESPECIAL PF\nConta Poupança\n0033-1695-000600100141\nConta Corrente Vinculada\nConta Corrente Associada\nDados Básicos do Titular 1\nCPF |06621595271\nNome Completo\nTAISSA RIBEIRO GOMES\nDocumento de Identificação\n02-IDENTIDADE-RG\nNº do Documento \\/ N° da Série (CTPS)\n19117457\nÓrgão Emissor\nPC\nUF PA\nData de Emissão\n24\\/09\\/2018\nData de Vencimento\nData de Nascimento\n(12\\/04\\/2002\nCartório\nNº Livro\nNº Folha\nSexo FEMININO\nPaís de Nascimento\nBRASIL\nNacionalidade\nBRASILEIRA\nNaturalidade\nUF PA\nALTAMIRA\nSOLTEIRO(A)\nEstado Civil\nCondição Pessoal\n101-MAIOR COM RENDA\nNome da Mãe\nCREUZA ROSA RIBEIRO\nNome do Pai\nWELSON GOMES\nCidadania\nBRASILEIRA\nOutro domicilio fiscal\n| BRASIL\nEndereços\nEndereço Residencial\nRua\\/Av\\/Pça\\/Estrada\nTV AGRARIO CAVALCANTE\nNúmero\n| 338 _\n\\/\nComplemento\nCASA 04 VILA\nBairro\nRECREIO\nMunicipio ALTAMIRA\nPaís BRASIL\n| UF PA\n168371140\nCEP\nEndereço Comercial\nRua\\/Av\\/Pça\\/Estrada\nNúmero\nComplemento\nBairro\nMunicípio\n| UF\nUF |\nPaís BRASIL\nCEP\nEndereço Alternativo\nRua\\/Av\\/Pça\\/Estrada\nNúmero\nComplemento\nBairro |\nPag. 1 16\n'

    context_content = 'token'  # 'position_token'
    start_position = 158
    max_size = 200
    # tokenizer = T5Tokenizer.from_pretrained('models/', do_lower_case=False)
    tokenizer = T5Tokenizer.from_pretrained(
        '/home/ramonpires/git/NLP/qa-t5/models/', do_lower_case=False)
    max_tokens = 512  # 150
    question = 'Qual é a proporção?'

    context, offset = get_context(document,
                                  context_content=context_content,
                                  max_size=max_size,
                                  start_position=start_position,
                                  proportion_before=0.2,
                                  return_position_offset=True,
                                  tokenizer=tokenizer,
                                  max_tokens=max_tokens,
                                  question=question,
                                  window_overlap=0.5,
                                  verbose=True)

    print('--> testing the offset:')
    if isinstance(context, list):
        context, offset = context[-1], offset[-1]  # last window
    print('>>>>>>>>>> using the offset\n' +
          document['text'][offset:offset + len(context)])
    print('>>>>>>>>>> returned context\n' + context)
def abstractive(text):
    model = T5ForConditionalGeneration.from_pretrained('t5-small')
    tokenizer = T5Tokenizer.from_pretrained('t5-small')
    device = torch.device('cpu')

    t5_prepared_Text = "summarize: " + text
    # print("original text preprocessed: \n", preprocess_text)

    tokenized_text = tokenizer.encode(t5_prepared_Text,
                                      return_tensors="pt").to(device)

    # summarize
    summary_ids = model.generate(tokenized_text,
                                 num_beams=4,
                                 no_repeat_ngram_size=2,
                                 min_length=30,
                                 max_length=100,
                                 early_stopping=True)
    output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return output
def __init__(
    self,
    name,
    model_name,
    input_max_length,
    device,
    batch_size,
    summarize_prefix,
    title_prefix,
):
    super().__init__(name)
    # Accept a single name for both tokenizer and model, or a
    # [tokenizer_name, model_name] pair.
    if isinstance(model_name, str):
        model_name = [model_name, model_name]
    self.tokenizer = T5Tokenizer.from_pretrained(model_name[0])
    self.model = T5ForConditionalGeneration.from_pretrained(model_name[1])
    self.input_max_length = input_max_length
    self.device = device
    self.batch_size = batch_size
    self.summarize_prefix = summarize_prefix
    self.title_prefix = title_prefix
def load_pretained_model_and_tokenizer(
    base_model: str,
    model_dict_path: str,
    gpu_device: str,
    eval=False,
):
    '''
    Load a pretrained T5 model for UnifiedQA.

    base_model: base model name for T5
    model_dict_path: trained model checkpoint for UnifiedQA
    '''
    tokenizer = T5Tokenizer.from_pretrained(base_model)
    model = T5ForConditionalGeneration(T5Config.from_pretrained(base_model))
    if eval:
        model = torch.load(model_dict_path, map_location=gpu_device)
    else:
        load_tf_weights_in_t5(model, None, model_dict_path)
    return tokenizer, model
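# Usage sketch with hypothetical checkpoint paths; the actual checkpoint
# layout this project expects is an assumption.
tokenizer, model = load_pretained_model_and_tokenizer(
    base_model='t5-small',
    model_dict_path='/path/to/unifiedqa_checkpoint',  # hypothetical path
    gpu_device='cuda:0',
    eval=True)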
def summarize(text):
    model = T5ForConditionalGeneration.from_pretrained("t5-base")
    tokenizer = T5Tokenizer.from_pretrained("t5-base")
    # encode the text into a tensor of integers using the appropriate tokenizer
    inputs = tokenizer.encode("summarize: " + text,
                              return_tensors="pt",
                              max_length=512,
                              truncation=True)
    outputs = model.generate(inputs,
                             max_length=150,
                             min_length=40,
                             length_penalty=2.0,
                             num_beams=4,
                             no_repeat_ngram_size=2,
                             num_return_sequences=4,
                             early_stopping=True)
    # just for debugging
    # print(outputs)
    summary = tokenizer.decode(outputs[0])
    print(summary)
    return summary
def get_summary(self, text):
    modelInfo = self.models.get("doc_summarization")
    model = modelInfo['model']
    model.eval()
    tokenizer = T5Tokenizer.from_pretrained("t5-small")
    df = pd.DataFrame({'text': [""], 'ctext': [text]})
    params = {'batch_size': 1, 'shuffle': True, 'num_workers': 0}
    loader = DataLoader(
        CustomDataset(df, tokenizer, modelInfo["max_text_length"],
                      modelInfo["max_sum_length"]), **params)
    predictions, truth = inference(tokenizer, model, "cpu", loader)
    return predictions[0]
def perform_fine_tuning():
    """Fine-tune and save model"""

    # import data provider (e.g. dtr, rel, or events)
    data = importlib.import_module(args.data_reader)

    # need this to save a fine-tuned model
    if os.path.isdir(args.model_dir):
        shutil.rmtree(args.model_dir)
    os.mkdir(args.model_dir)

    # load pretrained T5 tokenizer
    tokenizer = T5Tokenizer.from_pretrained(args.model_name)

    # load a pretrained T5 model
    model = T5ForConditionalGeneration.from_pretrained(args.model_name)

    train_dataset = data.Data(xmi_dir=args.xmi_dir,
                              tokenizer=tokenizer,
                              max_input_length=args.max_input_length,
                              max_output_length=args.max_output_length,
                              partition='train',
                              n_files=args.n_files)
    train_data_loader = DataLoader(train_dataset,
                                   shuffle=True,
                                   batch_size=args.train_batch_size)

    val_dataset = data.Data(xmi_dir=args.xmi_dir,
                            tokenizer=tokenizer,
                            max_input_length=args.max_input_length,
                            max_output_length=args.max_output_length,
                            partition='dev',
                            n_files=args.n_files)
    val_data_loader = DataLoader(val_dataset,
                                 shuffle=False,
                                 batch_size=args.train_batch_size)

    # fine-tune model on thyme data and save it
    best_loss, optimal_epochs = fit(model, train_data_loader,
                                    val_data_loader, tokenizer)
    print('best loss %.3f after %d epochs\n' % (best_loss, optimal_epochs))
def __init__(self, hparams): """For uninitiated""" super(T5FineTuner, self).__init__() self.hparams = hparams self.model = T5ForConditionalGeneration.from_pretrained( hparams.model_name_or_path) self.tokenizer = T5Tokenizer.from_pretrained( hparams.tokenizer_name_or_path) self.rouge_metric = load_metric('rouge') n_observations_per_split = { "train": self.hparams.n_train, "validation": self.hparams.n_val, "test": self.hparams.n_test, } self.n_obs = { k: v if v >= 0 else None for k, v in n_observations_per_split.items() }
def __init__(self, hparams):
    super().__init__()
    self.hparams = hparams

    if "base" in self.hparams.size:
        self.embedding_extractor = nn.Sequential(ConvBlock(3, 16),
                                                 ConvBlock(16, 64),
                                                 ConvBlock(64, 256),
                                                 ConvBlock(256, 512),
                                                 ConvBlock(512, 768),
                                                 Feature2Embedding(768))
    else:
        self.embedding_extractor = nn.Sequential(ConvBlock(3, 16),
                                                 ConvBlock(16, 64),
                                                 ConvBlock(64, 256),
                                                 ConvBlock(256, 512),
                                                 Feature2Embedding(512))
    print(f"Embedding extractor:\n{self.embedding_extractor}")

    self.decoder = T5ForConditionalGeneration.from_pretrained(
        self.hparams.size)
    self.tokenizer = T5Tokenizer.from_pretrained(self.hparams.size)
def test_greedy_generate(self):
    model = TFT5ForConditionalGeneration.from_pretrained("t5-small")
    tokenizer = T5Tokenizer.from_pretrained("t5-small")

    sentences = ["Yesterday, my name was", "Today is a beautiful day and"]
    input_ids = tokenizer(sentences, return_tensors="tf",
                          padding=True).input_ids

    generation_kwargs = {
        "bad_words_ids": [tokenizer("my").input_ids,
                          tokenizer("ein schöner").input_ids],
        "no_repeat_ngram_size": 3,
        "do_sample": False,
        "repetition_penalty": 2.2,
    }

    output_ids = model.generate(input_ids, **generation_kwargs)
    output_strings = tokenizer.batch_decode(output_ids,
                                            skip_special_tokens=True)
    expected_output_string = ["Yesterday, my name was",
                              "Heute ist ein schöne Tag und"]

    self.assertListEqual(expected_output_string, output_strings)
def custom_init(self):
    self.device = torch.device(
        'cuda' if torch.cuda.is_available() else 'cpu')
    self.tokenizer = T5Tokenizer.from_pretrained(
        "/usr/src/WHOA-FAQ-Answer-Project/WHO-FAQ-Search-Engine/variation_generation/models/"
    )
    config = T5Config.from_json_file(
        '/usr/src/WHOA-FAQ-Answer-Project/WHO-FAQ-Search-Engine/variation_generation/T5config.json'
    )
    # TODO: Add model weight download
    # self.model = torch.load(path, map_location=self.device)
    self.model = T5ForConditionalGeneration.from_pretrained(
        self.path, from_tf=True, config=config)
    self.model.to(self.device)
    self.model.eval()
    self.max_length = self.max_length
    self.num_variations = self.num_variations
    self.initialised = True
def create_model(checkpoint_path):
    """Return a T5 model."""
    if os.path.isdir(checkpoint_path):
        model = T5ForConditionalGeneration.from_pretrained(checkpoint_path)
    else:
        model = T5ForConditionalGeneration.from_pretrained('t5-small')
        os.mkdir(checkpoint_path)
    tokenizer = T5Tokenizer.from_pretrained('t5-small')
    optimizer = Adafactor(params=model.parameters(),
                          lr=1e-4,
                          eps=(1e-30, 1e-3),
                          clip_threshold=1.0,
                          decay_rate=-0.8,
                          beta1=None,
                          weight_decay=0.0,
                          relative_step=False,
                          scale_parameter=False,
                          warmup_init=False)
    return model, optimizer, tokenizer
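# A minimal training-step sketch around create_model. The checkpoint
# directory and the batch are illustrative, not prescribed by the snippet.
model, optimizer, tokenizer = create_model('checkpoints/t5-small')  # hypothetical dir
batch = tokenizer(['translate English to German: How are you?'],
                  return_tensors='pt', padding=True)
labels = tokenizer(['Wie geht es dir?'], return_tensors='pt').input_ids
loss = model(input_ids=batch.input_ids,
             attention_mask=batch.attention_mask,
             labels=labels).loss
loss.backward()
optimizer.step()
optimizer.zero_grad()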
def __init__(self, settings: NtrSettings = NtrSettings(),
             device: str = None):
    super().__init__("Ntr", verbose=settings.verbose)

    # Model settings
    self.max_length = settings.max_length
    self.num_beams = settings.num_beams
    self.early_stopping = settings.early_stopping

    device = device or ("cuda" if torch.cuda.is_available() else "cpu")
    self.device = torch.device(device)

    if self.verbose:
        logging.info(
            f"Initializing T5 using model {settings.model_name}...")

    self.model = (T5ForConditionalGeneration.from_pretrained(
        settings.model_name).to(device).eval())
    self.tokenizer = T5Tokenizer.from_pretrained(settings.model_name)
    self.nlp = English()
    self.history = []
def generate_summaries(lns, output_file_path, model_size, batch_size,
                       device):
    output_file = Path(output_file_path).open("w", encoding="utf-8")

    model = T5ForConditionalGeneration.from_pretrained(model_size)
    model.to(device)

    tokenizer = T5Tokenizer.from_pretrained(model_size)

    # update config with summarization specific params
    task_specific_params = model.config.task_specific_params
    if task_specific_params is not None:
        model.config.update(task_specific_params.get("summarization", {}))

    counter = 0
    for batch in tqdm(list(chunks(lns, batch_size))):
        batch = [model.config.prefix + text for text in batch]

        dct = tokenizer.batch_encode_plus(batch,
                                          max_length=512,
                                          return_tensors="pt",
                                          pad_to_max_length=True)
        input_ids = dct["input_ids"].to(device)
        attention_mask = dct["attention_mask"].to(device)

        summaries = model.generate(input_ids=input_ids,
                                   attention_mask=attention_mask)
        dec = [
            tokenizer.decode(g,
                             skip_special_tokens=True,
                             clean_up_tokenization_spaces=False)
            for g in summaries
        ]

        for hypothesis in dec:
            output_file.write(hypothesis + "\n")
            output_file.flush()

        counter += 1
        if counter > 100:
            break
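# Usage sketch for generate_summaries. chunks (not shown above) is assumed to
# split a list into batch_size-sized slices, and the one-document-per-line
# input file is an assumption as well.
lns = [line.strip() for line in open('articles.txt', encoding='utf-8')]  # hypothetical file
generate_summaries(lns, 'summaries.txt', 't5-small',
                   batch_size=8, device='cpu')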
def __init__(self, rouge_metrics=None, lang="english"):
    if rouge_metrics is None:
        rouge_metrics = ['rouge1', 'rougeL', 'rougeLsum']
        warnings.warn(
            f"Rouge metrics not defined, using default metrics {rouge_metrics}."
        )

    self.LANGUAGE = lang
    stemmer = Stemmer(self.LANGUAGE)

    # single-doc LexRank
    self.lr_sum = LexRankSummarizer(stemmer)
    self.lr_sum.stop_words = get_stop_words(self.LANGUAGE)

    # single-doc LSA
    self.lsa_sum = LsaSummarizer(stemmer)
    self.lsa_sum.stop_words = get_stop_words(self.LANGUAGE)

    # single-doc TextRank
    self.tr_sum = TextRankSummarizer(stemmer)
    self.tr_sum.stop_words = get_stop_words(self.LANGUAGE)

    # single-doc T5
    self.t5_sum_model = T5ForConditionalGeneration.from_pretrained('t5-base')
    self.t5_sum_tokenizer = T5Tokenizer.from_pretrained('t5-base')

    # single-doc BART
    self.bart_tokenizer = BartTokenizer.from_pretrained(
        'facebook/bart-large-cnn')
    # self.bart_sum_model = pipeline('summarization', model='facebook/bart-large-cnn',
    #                                tokenizer='facebook/bart-large-cnn')
    self.bart_sum_model = BartForConditionalGeneration.from_pretrained(
        'facebook/bart-large-cnn')

    # SCORES
    # What is stemming? - https://en.wikipedia.org/wiki/Stemming
    # Stemming is the process of reducing words to their root form.
    # For example: contesting -> contest; contestant -> contest
    self.scorer = rouge_scorer.RougeScorer(rouge_metrics, use_stemmer=True)