Example #1
def main():
  """This is where it happens"""

  tok = T5Tokenizer.from_pretrained('t5-small')
  data = Data(
    xmi_dir=args.xmi_dir,
    tokenizer=tok,
    max_input_length=args.max_input_length,
    max_output_length=args.max_output_length,
    partition=args.partition,
    n_files=args.n_files)

  for index in range(len(data)):
    input_ids = data[index]['input_ids']
    output_ids = data[index]['decoder_input_ids']
    print(tok.decode(input_ids, skip_special_tokens=True))
    print(tok.decode(output_ids, skip_special_tokens=True))
    print()
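The snippet above reads a module-level args namespace; a minimal argparse sketch matching the attributes it uses (the default values here are hypothetical):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--xmi_dir')
parser.add_argument('--max_input_length', type=int, default=512)
parser.add_argument('--max_output_length', type=int, default=512)
parser.add_argument('--partition', default='dev')
parser.add_argument('--n_files', default='all')
args = parser.parse_args()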
Example #2
    def __init__(self,
                 model_name_or_path: str,
                 max_seq_length: int = 128,
                 do_lower_case: bool = True):
        super(T5, self).__init__()
        self.config_keys = ['max_seq_length', 'do_lower_case']
        self.do_lower_case = do_lower_case

        if max_seq_length > 512:
            logging.warning(
                "T5 only allows a max_seq_length of 512. Value will be set to 512"
            )
            max_seq_length = 512
        self.max_seq_length = max_seq_length

        self.enc_model = T5Model.from_pretrained(model_name_or_path)
        self.tokenizer = T5Tokenizer.from_pretrained(
            model_name_or_path, do_lower_case=do_lower_case)
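A brief usage sketch for the module above (the class is named T5, per the super() call); 't5-small' is an example checkpoint:

# Build the embedding module; the arguments are the ones defined in __init__.
t5_module = T5('t5-small', max_seq_length=256, do_lower_case=False)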
Example #3
    def __init__(self,
                 type_path: str,
                 input_length: int,
                 output_length: int,
                 num_samples: int = None,
                 tokenizer=T5Tokenizer.from_pretrained('t5-small'),
                 sql2txt: bool = True) -> None:

        self.dataset = load_dataset('wikisql',
                                    'all',
                                    data_dir='data/',
                                    split=type_path)
        if num_samples:
            self.dataset = self.dataset.select(list(range(0, num_samples)))
        self.input_length = input_length
        self.tokenizer = tokenizer
        self.output_length = output_length
        self.sql2txt = sql2txt
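Note that the tokenizer=T5Tokenizer.from_pretrained('t5-small') default above is evaluated once, at function-definition time, so the tokenizer is loaded even when the caller passes their own. A lazier variant (a sketch, not the original code):

# Hypothetical variant: defer tokenizer loading until construction.
def __init__(self, type_path, input_length, output_length,
             num_samples=None, tokenizer=None, sql2txt=True):
    # ... dataset loading as above ...
    self.tokenizer = tokenizer or T5Tokenizer.from_pretrained('t5-small')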
Example #4
 def __init__(
     self,
     model_name_or_path,
     tokenizer_name,
     model_cache_dir,
     input_max_length,
     target_max_length,
     summary_column_name,
     document_column_name,
     title_column_name,
     summarize_prefix,
     title_prefix,
     wandb_project,
     wandb_run_name,
     version_column=None,
     **kwargs,
 ):
     super().__init__(
         input_max_length,
         target_max_length,
         summary_column_name,
         document_column_name,
         wandb_project,
         wandb_run_name,
     )
     self.title_column_name = title_column_name
     self.tokenizer = T5Tokenizer.from_pretrained(
         tokenizer_name if tokenizer_name else model_name_or_path,
         cache_dir=model_cache_dir,
     )
     self.model = T5ForConditionalGeneration.from_pretrained(
         model_name_or_path,
         cache_dir=model_cache_dir,
     )
     self.summarize_prefix = summarize_prefix
     self.title_prefix = title_prefix
     self.version_column = version_column
     self.summarize_prefixes = {
         "en": "summarize",
         "de": "zusammenfassen",
         "fr": "résume"
     }
     self.title_prefixes = {"en": "title", "de": "titel", "fr": "titre"}
Example #5
    def get_tokenizer(self, config):
        tokenizer = None
        # remains None if tokenizer_type matches none of the known prefixes
        if config.tokenizer_type.startswith('word'):
            tokenizer = nltk.word_tokenize
        elif config.tokenizer_type.startswith('bert-'):
            tokenizer = BertTokenizer.from_pretrained(config.tokenizer_type,
                                                      do_lower_case=True)
        elif config.tokenizer_type.startswith('xlnet'):
            tokenizer = XLNetTokenizer.from_pretrained(config.tokenizer_type,
                                                       do_lower_case=True)
        elif config.tokenizer_type.startswith('t5-'):
            tokenizer = T5Tokenizer.from_pretrained(config.tokenizer_type,
                                                    do_lower_case=True)
        elif config.tokenizer_type.startswith('bart-'):
            tokenizer = BartTokenizer.from_pretrained(config.tokenizer_type,
                                                      do_lower_case=True)

        return tokenizer
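A hedged usage sketch for get_tokenizer; SimpleNamespace stands in for the real config object, and instance is a placeholder for an instance of the enclosing class:

from types import SimpleNamespace

config = SimpleNamespace(tokenizer_type='t5-small')
# Returns a T5Tokenizer here; returns None for unrecognized tokenizer types.
tokenizer = instance.get_tokenizer(config)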
Example #6
    def __init__(self,
                 s2v_model_path='s2v_old',
                 qg_model_path='Parth/result',
                 bq_model_path='ramsrigouthamg/t5_boolean_questions',
                 ap_model_path='Parth/boolean',
                 t5_tokenizer_path='t5-base'):

        self.tokenizer = T5Tokenizer.from_pretrained(t5_tokenizer_path)
        self.normalized_levenshtein = NormalizedLevenshtein()
        self.rand = random.Random(datetime.now())

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.qg_model = T5ForConditionalGeneration.from_pretrained(
            qg_model_path).to(self.device)
        self.bq_model = T5ForConditionalGeneration.from_pretrained(
            bq_model_path).to(self.device)
        self.ap_model = T5ForConditionalGeneration.from_pretrained(
            ap_model_path).to(self.device)
Example #7
 def _test_TFT5Model(self, size, large=False):
     from transformers import T5Tokenizer, TFT5Model
     tokenizer = T5Tokenizer.from_pretrained(size)
     model = TFT5Model.from_pretrained(size)
     input_ids = \
         tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="tf").input_ids
     decoder_input_ids = \
         tokenizer("Studies show that", return_tensors="tf").input_ids
     input_dict = {
         "input_ids": input_ids,
         "decoder_input_ids": decoder_input_ids
     }
     spec, input_dict = self.spec_and_pad(input_dict)
     outputs = ["last_hidden_state"]
     self.run_test(model,
                   input_dict,
                   input_signature=spec,
                   outputs=outputs,
                   large=large)
Example #8
def summarize(function_directory, text):
    model_path = get_model_path(function_directory)
    logging.info(f"Loading model from {model_path}")
    start = time()
    tokenizer = T5Tokenizer.from_pretrained(model_path)
    model = T5ForConditionalGeneration.from_pretrained(model_path)
    logging.info(f"Model loaded in {round(time()-start, 2)}s.")

    logging.info("Tokenizing data...")
    input_text = tokenizer.encode(f"summarize: {text}", return_tensors="pt")
    start = time()
    translated = model.generate(input_text)
    logging.info(f"Model executed in {round(time()-start, 2)}s.")

    logging.info("Generating result...")
    start = time()
    result = tokenizer.decode(translated[0], skip_special_tokens=True)
    logging.info(f"Result generated in {round(time()-start, 2)}s.")
    return result
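A hedged usage example; the directory path is hypothetical and must resolve, via get_model_path, to a saved T5 checkpoint:

summary = summarize('/path/to/function_directory', 'Long article text ...')
print(summary)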
Example #9
def question_generation(text):

    model = T5ForConditionalGeneration.from_pretrained(
        'ramsrigouthamg/t5_boolean_questions')
    tokenizer = T5Tokenizer.from_pretrained('t5-small')
    passage = text
    truefalse = "yes"

    text = "truefalse: %s passage: %s </s>" % (passage, truefalse)

    max_len = 256

    encoding = tokenizer.encode_plus(text, return_tensors="pt")
    input_ids, attention_masks = encoding["input_ids"], encoding[
        "attention_mask"]

    output = beam_search_decoding(input_ids, attention_masks, model, tokenizer)

    return output
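A minimal usage sketch; beam_search_decoding is an external helper in the original codebase, so this runs only where it is defined:

questions = question_generation('The Eiffel Tower is located in Paris.')
print(questions)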
Example #10
    def __init__(self, **kwargs):
        """
        Initialize T5 embedder.

        :param str model_directory: where the weights of the model can be found
        :param device: whether to compute on the CPU or GPU
        :type device: str or torch.device or None
        :param bool decoder: Whether to also use the decoder (default: False)
        :param bool half_precision_model: Use the model in half precision (float16) mode (default: False)
        """
        # HIWI Benjamin
        # The user can use the half precision model either by specifying the path or by setting the flag.
        # The half precision model will be used if either the flag is set or a path is provided.
        # This is performed before calling super so that the paths can be fetched if not provided.
        if ('half_precision_model' in kwargs.keys()
                or 'half_precision_model_directory'
                in kwargs.keys()) and 'model_directory' not in kwargs.keys():
            # the necessary directories change: 'model_directory' is no longer needed, but 'half_precision_model_directory' is
            self.necessary_directories = ["half_precision_model_directory"]
            # if the path was provided but the flag wasn't, set the flag for later use
            kwargs['half_precision_model'] = True

        super().__init__(**kwargs)

        # set the model directory depending on whether to use half precision
        if 'half_precision_model' in kwargs.keys(
        ) and 'model_directory' not in kwargs.keys():
            self._model_directory = self._options[
                "half_precision_model_directory"]
        else:
            self._model_directory = self._options["model_directory"]

        # Until we know whether we need the decoder, let's keep it here as an undocumented option.
        # Should the need arise we can just split this class into an encoder and a decoder subclass
        # by setting one subclass to _decoder=True and the other to _decoder=False
        self._decoder = self._options.get("decoder", False)
        self._half_precision_model = self._options.get("half_precision_model",
                                                       False)

        self._model = self.get_model().to(self._device).eval()
        self._model_fallback = None
        self._tokenizer = T5Tokenizer.from_pretrained(self._model_directory,
                                                      do_lower_case=False)
Example #11
    def __init__(self,
                 hparams: argparse.Namespace,
                 num_labels=None,
                 **config_kwargs) -> None:
        super().__init__()
        self.hparams = hparams
        cache_dir = self.hparams.cache_dir if self.hparams.cache_dir else None
        self.config = T5Config.from_pretrained(
            self.hparams.config_name
            if self.hparams.config_name else self.hparams.model_name_or_path,
            **({
                "num_labels": num_labels
            } if num_labels is not None else {}),
            cache_dir=cache_dir,
            **config_kwargs,
        )
        self.tokenizer = T5Tokenizer.from_pretrained(
            self.hparams.tokenizer_name if self.hparams.tokenizer_name else
            self.hparams.model_name_or_path,
            cache_dir=cache_dir,
        )
        self.model = T5ForConditionalGeneration.from_pretrained(
            self.hparams.model_name_or_path,
            from_tf=bool(".ckpt" in self.hparams.model_name_or_path),
            config=self.config,
            cache_dir=cache_dir,
        )

        # fix for eos token id problem
        # see https://github.com/huggingface/transformers/issues/5142 for more info on the problem and workaround
        if self.tokenizer.eos_token_id == 1:
            self.tokenizer.add_special_tokens({'eos_token': '[EOS]'})
            self.model.resize_token_embeddings(len(self.tokenizer))

        self.dataset_kwargs: dict = dict(
            data_dir=self.hparams.input_dir,
            max_source_length=1024,
            max_target_length=56,
        )

        self.loss_names = ["loss"]
        self.metric_names = ROUGE_KEYS
        self.val_metric = "rouge2"
Example #12
def Summarize(document):
    model = T5ForConditionalGeneration.from_pretrained('t5-small')
    tokenizer = T5Tokenizer.from_pretrained('t5-small')
    device = torch.device('cpu')
    preprocess_text = document.strip().replace("\n", " ")
    t5_prepared_Text = "summarize: " + preprocess_text
    tokenized_text = tokenizer.encode(t5_prepared_Text,
                                      return_tensors="pt",
                                      max_length=5000,
                                      truncation=True).to(device)
    summary_ids = model.generate(tokenized_text,
                                 num_beams=2,
                                 no_repeat_ngram_size=2,
                                 min_length=10,
                                 max_length=100,
                                 early_stopping=False)

    output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return output
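A short usage example, assuming torch and the transformers classes above are imported:

print(Summarize('Long document text that should be condensed into a short summary ...'))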
Example #13
 def __init__(self, hparams):
     super().__init__()
     
     #Parameters stored in dictionary
     self.hparams = hparams
     
     #Tokenizer for decoding sentences
     self.tokenizer = T5Tokenizer.from_pretrained(self.hparams.t5_model)
     
     #Decoder -> Decode image embedding combined with the last hidden state of the encoder
     self.decoder = T5ForConditionalGeneration.from_pretrained(self.hparams.t5_model)
     
     #Sentence encoder -> just transformer encoder for questions
     if self.hparams.same_enc:
         self.sentence_encoder = self.decoder.get_encoder()
     else:
         self.sentence_encoder = T5EncoderModel.from_pretrained(self.hparams.t5_model)
     
     self.sync_dist = self.hparams.gpus > 1
Example #14
 def __init__(self,
              model_size: str = "small",
              num_beams: int = 4,
              no_repeat_ngram_size: int = 2,
              min_length: int = 30,
              max_length: int = 100,
              skip_special_tokens: bool = True):
     if model_size not in ["small", "base", "large", "xl", "xxl"]:
          raise ValueError(f"""model_size "{model_size}" not found.
              It might be a typo; if not, please consult our documentation.""")
     self.model = MT5ForConditionalGeneration.from_pretrained(
         f'google/mt5-{model_size}')
     self.tokenizer = T5Tokenizer.from_pretrained(
         f'google/mt5-{model_size}')
     self.num_beams = num_beams
     self.no_repeat_ngram_size = no_repeat_ngram_size
     self.min_length = min_length
     self.max_length = max_length
     self.skip_special_tokens = skip_special_tokens
Example #15
    def __init__(self, lang_code='en', max_questions=20):

        self.tokenizer = T5Tokenizer.from_pretrained('t5-base')
        model = T5ForConditionalGeneration.from_pretrained('Parth/result')
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)
        # model.eval()
        self.device = device
        self.model = model
        self.nlp = self.try_load_spacy_model(lang_code)
        self.max_questions = int(max_questions)

        self.s2v = Sense2Vec().from_disk(
            '/Users/dev/Develop/text-to-anki/backend/src/Questgen.ai/Questgen.ai/Questgen/s2v_old'
        )

        self.fdist = FreqDist(brown.words())
        self.normalized_levenshtein = NormalizedLevenshtein()
        self.set_seed(42)
Example #16
def main():
    document = {}
    document['uuid'] = '1234567'
    document['text'] = "Que tal fazer uma poc inicial para vermos a viabilidade e identificarmos as dificuldades?\nA motivação da escolha desse problema " \
    "foi que boa parte dos atos de matrícula passam de 512 tokens, e ainda não temos uma solução definida para fazer treinamento e predições em " \
    "janelas usando o QA.\nEssa limitação dificulta o uso de QA para problemas que não sabemos onde a informação está no documento (por enquanto, " \
    "só aplicamos QA em tarefas que sabemos que a resposta está nos primeiros 512 tokens da matrícula).\nComo esse problema de identificar a proporção " \
    "de cada pessoa são duas tarefas (identificação + relação com uma pessoa), podemos usar a localização da pessoa no texto para selecionar apenas " \
    "uma pedaço do ato de alienação pra passar como contexto pro modelo, evitando um pouco essa limitação dos 512 tokens."
    document[
        'text'] = "PREFEITURA DE CAUCAIA\nSECRETARIA DE FINAN\u00c7AS,PLANEJAMENTO E OR\u00c7AMENTO\nCERTID\u00c3O NEGATIVA DE TRIBUTOS ECON\u00d4MICOS\nLA SULATE\nN\u00ba 2020000982\nRaz\u00e3o Social\nCOMPASS MINERALS AMERICA DO SUL INDUSTRIA E COMERC\nINSCRI\u00c7\u00c3O ECON\u00d4MICA Documento\nBairro\n00002048159\nC.N.P.J.: 60398138001860\nSITIO SALGADO\nLocalizado ROD CE 422 KM 17, S/N - SALA SUPERIOR 01 CXP - CAUCAIA-CE\nCEP\n61600970\nDADOS DO CONTRIBUINTE OU RESPONS\u00c1VEL\nInscri\u00e7\u00e3o Contribuinte / Nome\n169907 - COMPASS MINERALS AMERICA DO SUL INDUSTRIA E COMERC\nEndere\u00e7o\nROD CE 422 KM 17, S/N SALA SUPERIOR 01 CXP\nDocumento\nC.N.P.J.: 60.398.138/0018-60\nSITIO SALGADO CAUCAIA-CE CEP: 61600970\nNo. Requerimento\n2020000982/2020\nNatureza jur\u00eddica\nPessoa Juridica\nCERTID\u00c3O\nCertificamos para os devidos fins, que revendo os registros dos cadastros da d\u00edvida ativa e de\ninadimplentes desta Secretaria, constata-se - at\u00e9 a presente data \u2013 n\u00e3o existirem em nome do (a)\nrequerente, nenhuma pend\u00eancia relativa a tributos municipais.\nSECRETARIA DE FINAN\u00c7AS, PLANEJAMENTO E OR\u00c7AMENTO se reserva o direito de inscrever e cobrar as\nd\u00edvidas que posteriormente venham a ser apurados. Para Constar, foi lavrada a presente Certid\u00e3o.\nA aceita\u00e7\u00e3o desta certid\u00e3o est\u00e1 condicionada a verifica\u00e7\u00e3o de sua autenticidade na internet, nos\nseguinte endere\u00e7o: http://sefin.caucaia.ce.gov.br/\nCAUCAIA-CE, 03 DE AGOSTO DE 2020\nEsta certid\u00e3o \u00e9 v\u00e1lida por 090 dias contados da data de emiss\u00e3o\nVALIDA AT\u00c9: 31/10/2020\nCOD. VALIDA\u00c7\u00c3O 2020000982"
    document[
        'text'] = 'M Santander\nProposta de Abertura de Conta Poupança, Utilizando de\nProdutos e Serviços e Outras Avenças - Pessoa Física\nP3ID008159105563\n1695\nAgência Nº\nQuantidade de Titulares\nPAB N°\n1\nCondição de Movimentação da Conta\nModalidade de Poupança\nPOUPANCA ESPECIAL PF\nConta Poupança\n0033-1695-000600100141\nConta Corrente Vinculada\nConta Corrente Associada\nDados Básicos do Titular 1\nCPF |06621595271\nNome Completo\nTAISSA RIBEIRO GOMES\nDocumento de Identificação\n02-IDENTIDADE-RG\nNº do Documento \\/ N° da Série (CTPS)\n19117457\nÓrgão Emissor\nPC\nUF PA\nData de Emissão\n24\\/09\\/2018\nData de Vencimento\nData de Nascimento\n(12\\/04\\/2002\nCartório\nNº Livro\nNº Folha\nSexo FEMININO\nPaís de Nascimento\nBRASIL\nNacionalidade\nBRASILEIRA\nNaturalidade\nUF PA\nALTAMIRA\nSOLTEIRO(A)\nEstado Civil\nCondição Pessoal\n101-MAIOR COM RENDA\nNome da Mãe\nCREUZA ROSA RIBEIRO\nNome do Pai\nWELSON GOMES\nCidadania\nBRASILEIRA\nOutro domicilio fiscal\n| BRASIL\nEndereços\nEndereço Residencial\nRua\\/Av\\/Pça\\/Estrada\nTV AGRARIO CAVALCANTE\nNúmero\n| 338 _\n\\/\nComplemento\nCASA 04 VILA\nBairro\nRECREIO\nMunicipio ALTAMIRA\nPaís BRASIL\n| UF PA\n168371140\nCEP\nEndereço Comercial\nRua\\/Av\\/Pça\\/Estrada\nNúmero\nComplemento\nBairro\nMunicípio\n| UF\nUF |\nPaís BRASIL\nCEP\nEndereço Alternativo\nRua\\/Av\\/Pça\\/Estrada\nNúmero\nComplemento\nBairro |\nPag. 1 16\n'

    context_content = 'token'  #'position_token'
    start_position = 158
    max_size = 200

    # tokenizer = T5Tokenizer.from_pretrained('models/', do_lower_case=False)
    tokenizer = T5Tokenizer.from_pretrained(
        '/home/ramonpires/git/NLP/qa-t5/models/', do_lower_case=False)
    max_tokens = 512  #150
    question = 'Qual é a proporção?'

    context, offset = get_context(document,
                                  context_content=context_content,
                                  max_size=max_size,
                                  start_position=start_position,
                                  proportion_before=0.2,
                                  return_position_offset=True,
                                  tokenizer=tokenizer,
                                  max_tokens=max_tokens,
                                  question=question,
                                  window_overlap=0.5,
                                  verbose=True)

    print('--> testing the offset:')
    if isinstance(context, list):
        context, offset = context[-1], offset[-1]  # last window
    print('>>>>>>>>>> using the offset\n' +
          document['text'][offset:offset + len(context)])
    print('>>>>>>>>>> returned context\n' + context)
Example #17
def abstractive(text):
    model = T5ForConditionalGeneration.from_pretrained('t5-small')
    tokenizer = T5Tokenizer.from_pretrained('t5-small')
    device = torch.device('cpu')
    t5_prepared_Text = "summarize: " + text
    # print ("original text preprocessed: \n", preprocess_text)

    tokenized_text = tokenizer.encode(t5_prepared_Text,
                                      return_tensors="pt").to(device)

    # summarize
    summary_ids = model.generate(tokenized_text,
                                 num_beams=4,
                                 no_repeat_ngram_size=2,
                                 min_length=30,
                                 max_length=100,
                                 early_stopping=True)

    output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return output
Example #18
 def __init__(
     self,
     name,
     model_name,
     input_max_length,
     device,
     batch_size,
     summarize_prefix,
     title_prefix,
 ):
     super().__init__(name)
     if isinstance(model_name, str):
         model_name = [model_name, model_name]
     self.tokenizer = T5Tokenizer.from_pretrained(model_name[0])
     self.model = T5ForConditionalGeneration.from_pretrained(model_name[1])
     self.input_max_length = input_max_length
     self.device = device
     self.batch_size = batch_size
     self.summarize_prefix = summarize_prefix
     self.title_prefix = title_prefix
Example #19
def load_pretained_model_and_tokenizer(
    base_model: str,
    model_dict_path: str,
    gpu_device: str,
    eval=False,
):
    '''
    Load a pretrained T5 model fine-tuned on UnifiedQA.
    base_model: base model name for T5
    model_dict_path: trained model checkpoint for UnifiedQA
    '''
    tokenizer = T5Tokenizer.from_pretrained(base_model)
    model = T5ForConditionalGeneration(T5Config.from_pretrained(base_model))

    if eval:
        model = torch.load(model_dict_path, map_location=gpu_device)
    else:
        load_tf_weights_in_t5(model, None, model_dict_path)

    return tokenizer, model
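A hedged usage sketch; the checkpoint path and device are hypothetical:

tokenizer, model = load_pretained_model_and_tokenizer(
    't5-base', 'checkpoints/unifiedqa_model.pt', 'cuda:0', eval=True)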
Example #20
def summarize(text):
    model = T5ForConditionalGeneration.from_pretrained("t5-base")
    tokenizer = T5Tokenizer.from_pretrained("t5-base")
    # encode the text into tensor of integers using the appropriate tokenizer
    inputs = tokenizer.encode("summarize: " + text,
                              return_tensors="pt",
                              max_length=512,
                              truncation=True)
    outputs = model.generate(inputs,
                             max_length=150,
                             min_length=40,
                             length_penalty=2.0,
                             num_beams=4,
                             no_repeat_ngram_size=2,
                             num_return_sequences=4,
                             early_stopping=True)
    # just for debugging
    #print(outputs)
    summary = tokenizer.decode(outputs[0])
    print(summary)
    return summary
Example #21
def get_summary(self, text):
    modelInfo = self.models.get("doc_summarization")
    model = modelInfo['model']
    model.eval()
    tokenizer = T5Tokenizer.from_pretrained("t5-small")
    df = pd.DataFrame({'text': [""], 'ctext': [text]})
    params = {
        'batch_size': 1,
        'shuffle': True,
        'num_workers': 0
    }
    loader = DataLoader(
        CustomDataset(
            df,
            tokenizer,
            modelInfo["max_text_length"],
            modelInfo["max_sum_length"]),
        **params)
    predictions, truth = inference(tokenizer, model, "cpu", loader)
    return predictions[0]
Example #22
def perform_fine_tuning():
    """Fine-tune and save model"""

    # import data provider (e.g. dtr, rel, or events)
    data = importlib.import_module(args.data_reader)

    # need this to save a fine-tuned model
    if os.path.isdir(args.model_dir):
        shutil.rmtree(args.model_dir)
    os.mkdir(args.model_dir)

    # load pretrained T5 tokenizer
    tokenizer = T5Tokenizer.from_pretrained(args.model_name)

    # load a pretrained T5 model
    model = T5ForConditionalGeneration.from_pretrained(args.model_name)

    train_dataset = data.Data(xmi_dir=args.xmi_dir,
                              tokenizer=tokenizer,
                              max_input_length=args.max_input_length,
                              max_output_length=args.max_output_length,
                              partition='train',
                              n_files=args.n_files)
    train_data_loader = DataLoader(train_dataset,
                                   shuffle=True,
                                   batch_size=args.train_batch_size)

    val_dataset = data.Data(xmi_dir=args.xmi_dir,
                            tokenizer=tokenizer,
                            max_input_length=args.max_input_length,
                            max_output_length=args.max_output_length,
                            partition='dev',
                            n_files=args.n_files)
    val_data_loader = DataLoader(val_dataset,
                                 shuffle=False,
                                 batch_size=args.train_batch_size)

    # fine-tune model on thyme data and save it
    best_loss, optimal_epochs = fit(model, train_data_loader, val_data_loader,
                                    tokenizer)
    print('best loss %.3f after %d epochs\n' % (best_loss, optimal_epochs))
Example #23
    def __init__(self, hparams):
        """For uninitiated"""

        super(T5FineTuner, self).__init__()

        self.hparams = hparams
        self.model = T5ForConditionalGeneration.from_pretrained(
            hparams.model_name_or_path)
        self.tokenizer = T5Tokenizer.from_pretrained(
            hparams.tokenizer_name_or_path)
        self.rouge_metric = load_metric('rouge')

        n_observations_per_split = {
            "train": self.hparams.n_train,
            "validation": self.hparams.n_val,
            "test": self.hparams.n_test,
        }
        self.n_obs = {
            k: v if v >= 0 else None
            for k, v in n_observations_per_split.items()
        }
Example #24
    def __init__(self, hparams):
        super().__init__()
        self.hparams = hparams

        if "base" in self.hparams.size:
            self.embedding_extractor = nn.Sequential(ConvBlock(3, 16),
                                                     ConvBlock(16, 64),
                                                     ConvBlock(64, 256),
                                                     ConvBlock(256, 512),
                                                     ConvBlock(512, 768),
                                                     Feature2Embedding(768))
        else:
            self.embedding_extractor = nn.Sequential(ConvBlock(3, 16),
                                                     ConvBlock(16, 64),
                                                     ConvBlock(64, 256),
                                                     ConvBlock(256, 512),
                                                     Feature2Embedding(512))
        print(f"Embedding extractor:\n{self.embedding_extractor}")
        self.decoder = T5ForConditionalGeneration.from_pretrained(
            self.hparams.size)
        self.tokenizer = T5Tokenizer.from_pretrained(self.hparams.size)
Example #25
    def test_greedy_generate(self):
        model = TFT5ForConditionalGeneration.from_pretrained("t5-small")
        tokenizer = T5Tokenizer.from_pretrained("t5-small")

        sentences = ["Yesterday, my name was", "Today is a beautiful day and"]
        input_ids = tokenizer(sentences, return_tensors="tf", padding=True).input_ids

        generation_kwargs = {
            "bad_words_ids": [tokenizer("my").input_ids, tokenizer("ein schöner").input_ids],
            "no_repeat_ngram_size": 3,
            "do_sample": False,
            "repetition_penalty": 2.2,
        }

        output_ids = model.generate(input_ids, **generation_kwargs)

        output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)

        expected_output_string = ["Yesterday, my name was", "Heute ist ein schöne Tag und"]

        self.assertListEqual(expected_output_string, output_strings)
Example #26
    def custom_init(self):
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')

        self.tokenizer = T5Tokenizer.from_pretrained(
            "/usr/src/WHOA-FAQ-Answer-Project/WHO-FAQ-Search-Engine/variation_generation/models/"
        )
        config = T5Config.from_json_file(
            '/usr/src/WHOA-FAQ-Answer-Project/WHO-FAQ-Search-Engine/variation_generation/T5config.json'
        )

        # TODO : Add model weight download
        # self.model = torch.load(path, map_location=self.device)
        self.model = T5ForConditionalGeneration.from_pretrained(
            self.path, from_tf=True, config=config)
        self.model.to(self.device)
        self.model.eval()

        # max_length and num_variations are expected to be set before this is called
        self.initialised = True
Example #27
def create_model(checkpoint_path):
    """Return a T5 model.
    """
    if os.path.isdir(checkpoint_path):
        model = T5ForConditionalGeneration.from_pretrained(checkpoint_path)
    else:
        model = T5ForConditionalGeneration.from_pretrained('t5-small')
        os.mkdir(checkpoint_path)
    tokenizer = T5Tokenizer.from_pretrained('t5-small')
    optimizer = Adafactor(
        params=model.parameters(),
        lr=1e-4,
        eps=(1e-30, 1e-3),
        clip_threshold=1.0,
        decay_rate=-0.8,
        beta1=None,
        weight_decay=0.0,
        relative_step=False,
        scale_parameter=False,
        warmup_init=False)
    return model, optimizer, tokenizer
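A minimal usage example; 'checkpoints/t5-small' is a hypothetical directory (created on first call if it does not exist):

model, optimizer, tokenizer = create_model('checkpoints/t5-small')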
Example #28
    def __init__(self,
                 settings: NtrSettings = NtrSettings(),
                 device: str = None):
        super().__init__("Ntr", verbose=settings.verbose)

        # Model settings
        self.max_length = settings.max_length
        self.num_beams = settings.num_beams
        self.early_stopping = settings.early_stopping

        device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.device = torch.device(device)

        if self.verbose:
            logging.info(
                f"Initializing T5 using model {settings.model_name}...")
        self.model = (T5ForConditionalGeneration.from_pretrained(
            settings.model_name).to(device).eval())
        self.tokenizer = T5Tokenizer.from_pretrained(settings.model_name)
        self.nlp = English()
        self.history = []
Example #29
def generate_summaries(lns, output_file_path, model_size, batch_size, device):
    output_file = Path(output_file_path).open("w", encoding="utf-8")

    model = T5ForConditionalGeneration.from_pretrained(model_size)
    model.to(device)

    tokenizer = T5Tokenizer.from_pretrained(model_size)

    # update config with summarization specific params
    task_specific_params = model.config.task_specific_params
    if task_specific_params is not None:
        model.config.update(task_specific_params.get("summarization", {}))

    counter = 0
    for batch in tqdm(list(chunks(lns, batch_size))):
        batch = [model.config.prefix + text for text in batch]

        dct = tokenizer.batch_encode_plus(batch,
                                          max_length=512,
                                          return_tensors="pt",
                                          pad_to_max_length=True)
        input_ids = dct["input_ids"].to(device)
        attention_mask = dct["attention_mask"].to(device)

        summaries = model.generate(input_ids=input_ids,
                                   attention_mask=attention_mask)
        dec = [
            tokenizer.decode(g,
                             skip_special_tokens=True,
                             clean_up_tokenization_spaces=False)
            for g in summaries
        ]

        for hypothesis in dec:
            output_file.write(hypothesis + "\n")
            output_file.flush()

        counter += 1
        if counter > 100:
            break
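A hedged usage sketch; the file names are hypothetical, and chunks/tqdm come from the surrounding module:

lns = [line.strip() for line in open('articles.txt', encoding='utf-8')]
generate_summaries(lns, 'summaries.txt', 't5-base', batch_size=8, device='cpu')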
Example #30
    def __init__(self, rouge_metrics=None, lang="english"):
        if rouge_metrics is None:
            rouge_metrics = ['rouge1', 'rougeL', 'rougeLsum']
            warnings.warn(
                f"Rouge metrics not defined, using default metrics {rouge_metrics}."
            )

        self.LANGUAGE = lang
        stemmer = Stemmer(self.LANGUAGE)

        # single-doc LexRank
        self.lr_sum = LexRankSummarizer(stemmer)
        self.lr_sum.stop_words = get_stop_words(self.LANGUAGE)

        # single-doc LSA
        self.lsa_sum = LsaSummarizer(stemmer)
        self.lsa_sum.stop_words = get_stop_words(self.LANGUAGE)

        # single-doc TextRank
        self.tr_sum = TextRankSummarizer(stemmer)
        self.tr_sum.stop_words = get_stop_words(self.LANGUAGE)

        # single-doc T5
        self.t5_sum_model = T5ForConditionalGeneration.from_pretrained(
            't5-base')
        self.t5_sum_tokenizer = T5Tokenizer.from_pretrained('t5-base')

        # single-doc BART
        self.bart_tokenizer = BartTokenizer.from_pretrained(
            'facebook/bart-large-cnn')
        # self.bart_sum_model = pipeline('summarization', model='facebook/bart-large-cnn',
        #                                tokenizer='facebook/bart-large-cnn')
        self.bart_sum_model = BartForConditionalGeneration.from_pretrained(
            'facebook/bart-large-cnn')

        # SCORES
        # what is stemming? - https://en.wikipedia.org/wiki/Stemming
        # Stemming is the process of reducing words to their root form.
        # For example: contesting -> contest ; contestant -> contest
        self.scorer = rouge_scorer.RougeScorer(rouge_metrics, use_stemmer=True)