Example #1
import json

import torch
from sklearn import metrics
from transformers import (XLMRobertaForSequenceClassification,
                          XLMRobertaTokenizerFast)

# Assumed here; the original module defines DEVICE elsewhere.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'


def main(json_path, model_name_or_dir):
    tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name_or_dir)
    model = XLMRobertaForSequenceClassification.from_pretrained(
        model_name_or_dir).to(DEVICE)
    model.eval()

    with open(json_path) as json_file:
        data = json.load(json_file)

    predictions = []
    labels = []

    for pair in data:
        sentence = pair['text']
        label = pair['sentiment']

        inputs = tokenizer.encode(sentence,
                                  padding=False,
                                  truncation=True,
                                  return_tensors='pt').to(DEVICE)

        with torch.no_grad():
            output = model(inputs).logits
            prediction = torch.argmax(output, dim=-1)[0].item()

        predictions.append(prediction)
        labels.append(label)

    print(metrics.classification_report(labels, predictions, digits=6))
Example #2
    def train(self, train_set, dev_set):
        logging.info("Loading training set")

        self.tokenizer = XLMRobertaTokenizerFast.from_pretrained(
            self.settings["model"])
        train_generator = self.get_generator(self.settings["batch_size"],
                                             shuffle=True)
        train_generator.load(train_set)
        steps_per_epoch = min(len(train_generator),
                              self.settings["steps_per_epoch"])

        dev_generator = self.get_generator(self.settings["batch_size"],
                                           shuffle=False)
        dev_generator.load(dev_set)

        model_filename = self.dir + '/' + self.settings["model_file"]
        vocab_filename = self.dir + '/' + self.settings["vocab_file"]
        earlystop = EarlyStopping(monitor='val_f1',
                                  mode='max',
                                  patience=self.settings["patience"],
                                  restore_best_weights=True)

        logging.info("Training classifier")

        strategy = tf.distribute.MirroredStrategy()
        num_devices = strategy.num_replicas_in_sync
        with strategy.scope():
            self.model = self.load_model(self.settings["model"])
            self.model.compile(
                optimizer=self.settings["optimizer"],
                loss=SparseCategoricalCrossentropy(from_logits=True),
                metrics=[FScore(argmax=True),
                         MatthewsCorrCoef(argmax=True)])
        if logging.getLogger().level == logging.DEBUG:
            self.model.summary()
        self.model.fit(train_generator,
                       epochs=self.settings["epochs"],
                       steps_per_epoch=steps_per_epoch,
                       validation_data=dev_generator,
                       batch_size=self.settings["batch_size"],
                       callbacks=[earlystop],
                       verbose=1)
        self.model.save_pretrained(model_filename)
        self.tokenizer.save_pretrained(vocab_filename)

        y_true = dev_generator.y
        y_pred = self.model.predict(dev_generator, verbose=1).logits
        y_pred_probs = self.softmax_pos_prob(y_pred)
        y_pred = np.argmax(y_pred, axis=-1)
        logging.info(f"Dev precision: {precision_score(y_true, y_pred):.3f}")
        logging.info(f"Dev recall: {recall_score(y_true, y_pred):.3f}")
        logging.info(f"Dev f1: {f1_score(y_true, y_pred):.3f}")
        logging.info(f"Dev mcc: {matthews_corrcoef(y_true, y_pred):.3f}")

        A, B = calibrate_output(y_true, y_pred_probs)
        self.settings["calibration_params"] = (A, B)

        return y_true, y_pred
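For reference, a minimal sketch of how Platt-style calibration parameters such as (A, B) are typically applied at inference time. The calibrate_output helper itself is not shown in this excerpt, so the exact convention below is an assumption:

import numpy as np

def apply_calibration(raw_probs, A, B):
    # One common Platt-scaling form: P(y=1 | s) = 1 / (1 + exp(A*s + B)).
    return 1.0 / (1.0 + np.exp(A * raw_probs + B))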
Example #3
def main(
        output_dir,
        logging_dir,
        logging_steps,
        large,
        batch_size,
        gradient_accumulation_steps,
        learning_rate,
        num_train_epochs,
        warmup_ratio):
    sst_train_dataset = load_dataset('glue', 'sst2', split='train')
    sst_validation_dataset = load_dataset('glue', 'sst2', split='validation')

    if large:
        model_name = 'xlm-roberta-large'
    else:
        model_name = 'xlm-roberta-base'

    tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name)
    model = XLMRobertaForSequenceClassification.from_pretrained(model_name)

    def preprocess_function(examples):
        return tokenizer(examples['sentence'], padding=False, truncation=True)

    sst_train_dataset = sst_train_dataset.map(preprocess_function, batched=True)
    sst_validation_dataset = sst_validation_dataset.map(preprocess_function, batched=True)

    training_args = TrainingArguments(
        output_dir=output_dir,
        do_train=True,
        do_eval=True,
        do_predict=False,
        evaluation_strategy='epoch',
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        learning_rate=learning_rate,
        num_train_epochs=num_train_epochs,
        warmup_ratio=warmup_ratio,
        logging_dir=logging_dir,
        logging_strategy='steps',
        logging_steps=logging_steps,
        save_strategy='epoch',
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=sst_train_dataset,
        eval_dataset=sst_validation_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()
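compute_metrics is not defined in this excerpt. A minimal sketch of what it could look like for this binary SST-2 setup (the metric choice is an assumption):

import numpy as np
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    # Trainer passes a (logits, labels) pair.
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {'accuracy': accuracy_score(labels, predictions)}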
Example #4
  def __init__(self, vocab_file, fail_on_mismatch=False):
      # Special tokens registered with either vocabulary.
      special_tokens = [
          "<s>", "</s>", "<pad>", "[YES]", "[NO]", "[NoLongAnswer]",
          "[NoShortAnswer]", "[SA]", "[/SA]", "<UNK>", "<mask>"
      ]
      if 'sentencepiece' in vocab_file:
          from transformers import XLMRobertaTokenizerFast
          self.tokenizer = XLMRobertaTokenizerFast.from_pretrained(vocab_file)
      elif 'wordpiece' in vocab_file:
          from transformers import BertTokenizerFast
          self.tokenizer = BertTokenizerFast.from_pretrained(
              'bert-base-multilingual-cased')
      else:
          raise ValueError(
              "vocab_file must contain 'sentencepiece' or 'wordpiece'")

      self.tokenizer.add_tokens(special_tokens)
      self.vocab = self.tokenizer.get_vocab()
      self.fail_on_mismatch = fail_on_mismatch
Example #5
def preprocessing(df):
    sentences = df.sentence.values
    labels = np.array(df.label.values)

    tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-large")

    encoded_sentences = []
    for sent in sentences:
        encoded_sent = tokenizer.encode(
            sent,
            add_special_tokens=True,
            truncation=True,
            max_length=max_sent_length  # assumed to be defined at module level
        )
        encoded_sentences.append(encoded_sent)

    # encoded_sentences = pad_sequences(encoded_sentences, maxlen=MAX_LEN, dtype="long",
    #                                   value=0, truncating="post", padding="post")

    return encoded_sentences, labels
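The commented-out pad_sequences call suggests padding was applied downstream in the original notebook. With current transformers versions the tokenizer itself can pad a whole batch and return attention masks; a minimal sketch under that assumption:

batch = tokenizer(
    list(sentences),
    padding='max_length',
    truncation=True,
    max_length=max_sent_length,
    return_tensors='pt',
)
# batch['input_ids'] and batch['attention_mask'] are ready for the model.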
Example #6
import torch
from transformers import (XLMRobertaForTokenClassification,
                          XLMRobertaTokenizerFast)

# Assumed here; the original module defines DEVICE elsewhere.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'


def main(sequence):
    # Pretrained model from https://huggingface.co/asahi417/tner-xlm-roberta-base-ontonotes5
    tokenizer = XLMRobertaTokenizerFast.from_pretrained('xlm-roberta-base')
    model = XLMRobertaForTokenClassification.from_pretrained(
        'asahi417/tner-xlm-roberta-base-ontonotes5').to(DEVICE)
    model.eval()

    # Round-trip through encode/decode so the token list includes the special
    # tokens (<s>, </s>) and stays aligned with the model inputs.
    tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sequence)))
    inputs = tokenizer.encode(sequence, return_tensors="pt").to(DEVICE)

    with torch.no_grad():
        outputs = model(inputs).logits
        predictions = torch.argmax(outputs, dim=2)[0]

    id2label = model.config.id2label
    print(id2label)

    for token, prediction in zip(tokens, predictions):
        print("{} - {}".format(token, id2label[prediction.item()]))
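For comparison, a sketch of a less roundabout way to recover the aligned tokens: fast tokenizers expose convert_ids_to_tokens, so the encoded ids can be mapped back directly instead of round-tripping through decode:

inputs = tokenizer.encode(sequence, return_tensors="pt")
tokens = tokenizer.convert_ids_to_tokens(inputs[0])  # includes <s> and </s>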
Example #7
def main(train_json_path, val_json_path, model_name_or_dir, output_dir,
         logging_dir, logging_steps, batch_size, gradient_accumulation_steps,
         learning_rate, num_train_epochs, warmup_ratio, num_classes):
    tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name_or_dir)
    model = XLMRobertaForTokenClassification.from_pretrained(
        model_name_or_dir, num_labels=num_classes)

    sh_ner_train_dataset, sh_ner_val_dataset = create_sh_ner_dataset(
        train_json_path, val_json_path, tokenizer)

    data_collator = DataCollatorForTokenClassification(tokenizer)

    training_args = TrainingArguments(
        output_dir=output_dir,
        do_train=True,
        do_eval=True,
        do_predict=False,
        evaluation_strategy='epoch',
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        learning_rate=learning_rate,
        num_train_epochs=num_train_epochs,
        warmup_ratio=warmup_ratio,
        logging_dir=logging_dir,
        logging_strategy='steps',
        logging_steps=logging_steps,
        save_strategy='epoch',
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=sh_ner_train_dataset,
        eval_dataset=sh_ner_val_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()
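compute_metrics is again external to the excerpt. For token classification it typically has to drop the -100 positions that DataCollatorForTokenClassification uses for padded and special tokens; a sketch, assuming an id_to_label helper like the one used in the evaluation script below:

import numpy as np
from seqeval.metrics import f1_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Keep only positions that carry a real label (-100 marks ignored tokens).
    true_seqs = [[id_to_label(l) for l in seq if l != -100] for seq in labels]
    pred_seqs = [[id_to_label(p) for p, l in zip(p_seq, l_seq) if l != -100]
                 for p_seq, l_seq in zip(predictions, labels)]
    return {'f1': f1_score(true_seqs, pred_seqs)}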
Example #8
def main(json_path, model_name_or_dir):
    tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name_or_dir)
    model = XLMRobertaForTokenClassification.from_pretrained(model_name_or_dir).to(DEVICE)
    model.eval()

    _, sh_ner_val_dataset = create_sh_ner_dataset(json_path, json_path, tokenizer)

    filtered_labels = []
    filtered_preds = []

    for instance in sh_ner_val_dataset:
        with torch.no_grad():
            input_ids = torch.tensor(instance['input_ids']).unsqueeze(0).to(DEVICE)
            attention_mask = torch.tensor(instance['attention_mask']).unsqueeze(0).to(DEVICE)
            outputs = model(input_ids, attention_mask).logits
            predictions = torch.argmax(outputs, dim=2)[0].detach().cpu().numpy()

            filtered_labels_inner = []
            filtered_preds_inner = []

            for label, prediction in zip(instance['labels'], predictions):
                if label != -100:
                    filtered_labels_inner.append(id_to_label(label))
                    filtered_preds_inner.append(id_to_label(prediction))

            filtered_labels.append(filtered_labels_inner)
            filtered_preds.append(filtered_preds_inner)

    # These are scores over lists of per-sentence label sequences, which
    # matches seqeval's API (`metrics` is assumed to be seqeval.metrics).
    accuracy = metrics.accuracy_score(filtered_labels, filtered_preds)
    f1 = metrics.f1_score(filtered_labels, filtered_preds)
    precision = metrics.precision_score(filtered_labels, filtered_preds)
    recall = metrics.recall_score(filtered_labels, filtered_preds)

    print("Accuracy: {}".format(accuracy))
    print("F1: {}".format(f1))
    print("Precision: {}".format(precision))
    print("Recall: {}".format(recall))

    print(metrics.classification_report(filtered_labels, filtered_preds, digits=6))
Example #9
def main(handeset_csv_path, model_name_or_dir, output_dir, logging_dir,
         logging_steps, batch_size, gradient_accumulation_steps, learning_rate,
         num_train_epochs, warmup_ratio):
    tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name_or_dir)
    model = XLMRobertaForSequenceClassification.from_pretrained(
        model_name_or_dir)

    handeset_train_dataset, handeset_val_dataset = create_handeset_dataset(
        handeset_csv_path, tokenizer, test_size=0.2)

    training_args = TrainingArguments(
        output_dir=output_dir,
        do_train=True,
        do_eval=True,
        do_predict=False,
        evaluation_strategy='epoch',
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        learning_rate=learning_rate,
        num_train_epochs=num_train_epochs,
        warmup_ratio=warmup_ratio,
        logging_dir=logging_dir,
        logging_strategy='steps',
        logging_steps=logging_steps,
        save_strategy='epoch',
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=handeset_train_dataset,
        eval_dataset=handeset_val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()
Example #10
def main(
        input_dir_path,
        output_dir_path,
        model_name_or_dir):
    tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name_or_dir)
    model = XLMRobertaForSequenceClassification.from_pretrained(model_name_or_dir).to(DEVICE)
    model.eval()

    os.makedirs(output_dir_path, exist_ok=True)

    for file_name in os.listdir(input_dir_path):
        if file_name.endswith('.json'):
            count = 0
            input_file_path = os.path.join(input_dir_path, file_name)
            with open(input_file_path) as json_file:
                data = json.load(json_file)

                for session in data['sessions']:
                    for speech in session['speeches']:
                        content = []
                        for text in speech['content']:
                            inputs = tokenizer.encode(
                                text, padding=False, truncation=True, return_tensors='pt').to(DEVICE)

                            with torch.no_grad():
                                outputs = model(inputs).logits
                                positive_prob = torch.softmax(outputs, dim=-1)[0, 1].item()

                            content.append({'text': text, 'sentiment': round(positive_prob, 6)})
                            count += 1
                        speech['content'] = content

            output_file_path = os.path.join(output_dir_path, file_name)
            with open(output_file_path, 'w') as json_file:
                json.dump(data, json_file)

            print("File: {}, Count: {}".format(file_name, count))
Example #11
def convert_donut_checkpoint(model_name,
                             pytorch_dump_folder_path=None,
                             push_to_hub=False):
    # load original model
    original_model = DonutModel.from_pretrained(model_name).eval()

    # load HuggingFace model
    encoder_config, decoder_config = get_configs(original_model)
    encoder = DonutSwinModel(encoder_config)
    decoder = MBartForCausalLM(decoder_config)
    model = VisionEncoderDecoderModel(encoder=encoder, decoder=decoder)
    model.eval()

    state_dict = original_model.state_dict()
    new_state_dict = convert_state_dict(state_dict, model)
    model.load_state_dict(new_state_dict)

    # verify results on scanned document
    dataset = load_dataset("hf-internal-testing/example-documents")
    image = dataset["test"][0]["image"].convert("RGB")

    tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name,
                                                        from_slow=True)
    feature_extractor = DonutFeatureExtractor(
        do_align_long_axis=original_model.config.align_long_axis,
        size=original_model.config.input_size[::-1])
    processor = DonutProcessor(feature_extractor, tokenizer)
    pixel_values = processor(image, return_tensors="pt").pixel_values

    if model_name == "naver-clova-ix/donut-base-finetuned-docvqa":
        task_prompt = "<s_docvqa><s_question>{user_input}</s_question><s_answer>"
        question = "When is the coffee break?"
        task_prompt = task_prompt.replace("{user_input}", question)
    elif model_name == "naver-clova-ix/donut-base-finetuned-rvlcdip":
        task_prompt = "<s_rvlcdip>"
    elif model_name in [
            "naver-clova-ix/donut-base-finetuned-cord-v1",
            "naver-clova-ix/donut-base-finetuned-cord-v1-2560",
    ]:
        task_prompt = "<s_cord>"
    elif model_name == "naver-clova-ix/donut-base-finetuned-cord-v2":
        task_prompt = "<s_cord-v2>"
    elif model_name == "naver-clova-ix/donut-base-finetuned-zhtrainticket":
        task_prompt = "<s_zhtrainticket>"
    elif model_name in [
            "naver-clova-ix/donut-proto", "naver-clova-ix/donut-base"
    ]:
        # use a random prompt
        task_prompt = "hello world"
    else:
        raise ValueError("Model name not supported")
    prompt_tensors = original_model.decoder.tokenizer(
        task_prompt, add_special_tokens=False,
        return_tensors="pt")["input_ids"]

    original_patch_embed = original_model.encoder.model.patch_embed(
        pixel_values)
    patch_embeddings, _ = model.encoder.embeddings(pixel_values)
    assert torch.allclose(original_patch_embed, patch_embeddings, atol=1e-3)

    # verify encoder hidden states
    original_last_hidden_state = original_model.encoder(pixel_values)
    last_hidden_state = model.encoder(pixel_values).last_hidden_state
    assert torch.allclose(original_last_hidden_state,
                          last_hidden_state,
                          atol=1e-2)

    # verify decoder hidden states
    original_logits = original_model(pixel_values, prompt_tensors, None).logits
    logits = model(pixel_values, decoder_input_ids=prompt_tensors).logits
    assert torch.allclose(original_logits, logits, atol=1e-3)
    print("Looks ok!")

    if pytorch_dump_folder_path is not None:
        print(f"Saving model and processor to {pytorch_dump_folder_path}")
        model.save_pretrained(pytorch_dump_folder_path)
        processor.save_pretrained(pytorch_dump_folder_path)

    if push_to_hub:
        model.push_to_hub("nielsr/" + model_name.split("/")[-1],
                          commit_message="Update model")
        processor.push_to_hub("nielsr/" + model_name.split("/")[-1],
                              commit_message="Update model")
Example #12
    Task.MULTILABEL_CLS: multilabel_classification_metrics
}

PUBLIC_MODEL = {
    'mbert': {
        'name':
        'bert-base-multilingual-cased',
        'tokenizer':
        BertTokenizerFast.from_pretrained('bert-base-multilingual-cased'),
        'config':
        BertConfig.from_pretrained('bert-base-multilingual-cased'),
    },
    'xlmr': {
        'name': 'xlm-roberta-base',
        'tokenizer':
        XLMRobertaTokenizerFast.from_pretrained('xlm-roberta-base'),
        'config': XLMRobertaConfig.from_pretrained('xlm-roberta-base'),
    },
    'xlmr-large': {
        'name': 'xlm-roberta-large',
        'tokenizer':
        XLMRobertaTokenizerFast.from_pretrained('xlm-roberta-large'),
        'config': XLMRobertaConfig.from_pretrained('xlm-roberta-large'),
    },
}

TOKENIZER_CLS = {
    'spm_camembert': CamembertTokenizer,
    'spm': ThaiRobertaTokenizer,
    'newmm': ThaiWordsNewmmTokenizer,
    'syllable': ThaiWordsSyllableTokenizer,
Example #13
    def __init__(self) -> None:
        self.lists = {}

        # M-BERT
        from transformers import BertTokenizerFast, BertForMaskedLM
        self.bert_multilingual_tokenizer = BertTokenizerFast.from_pretrained(
            'bert-base-multilingual-cased')
        self.bert_multilingual_model = BertForMaskedLM.from_pretrained(
            'bert-base-multilingual-cased').eval()
        self.lists["M-BERT"] = {
            "Tokenizer": self.bert_multilingual_tokenizer,
            "Model": self.bert_multilingual_model
        }
        print("====================================")
        print("[BERT] Google Multilingual BERT loaded")
        print("====================================")

        # KR-BERT
        from transformers import BertTokenizerFast, BertForMaskedLM
        self.krbert_tokenizer = BertTokenizerFast.from_pretrained(
            'snunlp/KR-Medium')
        self.krbert_model = BertForMaskedLM.from_pretrained(
            'snunlp/KR-Medium').eval()
        self.lists["KR-Medium"] = {
            "Tokenizer": self.krbert_tokenizer,
            "Model": self.krbert_model
        }
        print("====================================")
        print("[BERT] KR-BERT loaded")
        print("====================================")

        # BERT-kor-base
        from transformers import BertTokenizerFast, BertForMaskedLM
        self.bert_kor_tokenizer = BertTokenizerFast.from_pretrained(
            'kykim/bert-kor-base')
        self.bert_kor_model = BertForMaskedLM.from_pretrained(
            'kykim/bert-kor-base').eval()
        self.lists["bert-kor-base"] = {
            "Tokenizer": self.bert_kor_tokenizer,
            "Model": self.bert_kor_model
        }
        print("====================================")
        print("[BERT] BERT-kor-base loaded")
        print("====================================")

        # ALBERT
        from transformers import AlbertForMaskedLM
        self.albert_tokenizer = BertTokenizerFast.from_pretrained(
            'kykim/albert-kor-base')
        self.albert_model = AlbertForMaskedLM.from_pretrained(
            'kykim/albert-kor-base').eval()
        self.lists["albert-kor-base"] = {
            "Tokenizer": self.albert_tokenizer,
            "Model": self.albert_model
        }
        print("====================================")
        print("[BERT] ALBERT-kor-base loaded")
        print("====================================")

        # XLM-Roberta
        from transformers import XLMRobertaTokenizerFast, XLMRobertaForMaskedLM
        self.xlmroberta_tokenizer = XLMRobertaTokenizerFast.from_pretrained(
            'xlm-roberta-base')
        self.xlmroberta_model = XLMRobertaForMaskedLM.from_pretrained(
            'xlm-roberta-base').eval()
        self.lists["xlm-roberta-base"] = {
            "Tokenizer": self.xlmroberta_tokenizer,
            "Model": self.xlmroberta_model
        }
        print("====================================")
        print("[BERT] XLM-Roberta-base loaded")
        print("====================================")

        from transformers import BertTokenizerFast, EncoderDecoderModel
        self.tokenizer_bertshared = BertTokenizerFast.from_pretrained(
            "kykim/bertshared-kor-base")
        self.bertshared_model = EncoderDecoderModel.from_pretrained(
            "kykim/bertshared-kor-base")
        self.lists["bertshared-kor-base"] = {
            "Tokenizer": self.tokenizer_bertshared,
            "Model": self.bertshared_model
        }
        print("====================================")
        print("[Seq2seq + BERT] bertshared-kor-base loaded")
        print("====================================")

        # gpt3-kor-small_based_on_gpt2
        from transformers import BertTokenizerFast, GPT2LMHeadModel
        self.tokenizer_gpt3 = BertTokenizerFast.from_pretrained(
            "kykim/gpt3-kor-small_based_on_gpt2")
        self.model_gpt3 = GPT2LMHeadModel.from_pretrained(
            "kykim/gpt3-kor-small_based_on_gpt2")
        self.lists["gpt3-kor-small_based_on_gpt2"] = {
            "Tokenizer": self.tokenizer_gpt3,
            "Model": self.model_gpt3
        }
        print("====================================")
        print("[GPT3] gpt3-small-based-on-gpt2 loaded")
        print("====================================")

        # electra-base-kor
        from transformers import ElectraTokenizerFast, ElectraModel
        self.tokenizer_electra = ElectraTokenizerFast.from_pretrained(
            "kykim/electra-kor-base")
        self.electra_model = ElectraModel.from_pretrained(
            "kykim/electra-kor-base")
        self.lists["electra-kor-base"] = {
            "Tokenizer": self.tokenizer_electra,
            "Model": self.electra_model
        }
        print("====================================")
        print("[ELECTRA] electra-kor-base loaded")
        print("====================================")

        from transformers import ElectraTokenizerFast, ElectraForQuestionAnswering
        self.electra_tokenizer_QA = ElectraTokenizerFast.from_pretrained(
            "monologg/koelectra-base-v3-finetuned-korquad")
        self.electra_model_QA = ElectraForQuestionAnswering.from_pretrained(
            "monologg/koelectra-base-v3-finetuned-korquad")
        self.lists["electra-kor-QA"] = {
            "Tokenizer": self.electra_tokenizer_QA,
            "Model": self.electra_model_QA
        }
        print("====================================")
        print("[ELECTRA] koelectra-base-v3-finetuned-korquad loaded")
        print("====================================")
Example #14
    def load(cls,
             pretrained_model_name_or_path,
             revision=None,
             tokenizer_class=None,
             use_fast=True,
             **kwargs):
        """
        Enables loading of different Tokenizer classes with a uniform interface. The class is either inferred
        from the model config or set manually via `tokenizer_class`.

        :param pretrained_model_name_or_path:  The path of the saved pretrained model or its name (e.g. `bert-base-uncased`)
        :type pretrained_model_name_or_path: str
        :param revision: The version of the model to use from the Hugging Face model hub. Can be a tag name, branch name, or commit hash.
        :type revision: str
        :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`)
        :type tokenizer_class: str
        :param use_fast: (Optional, True by default) Indicates whether FARM should load the fast (Rust) version of the tokenizer (True)
            or the slow Python one (False), for model types that provide a fast variant.
        :type use_fast: bool
        :param kwargs:
        :return: Tokenizer
        """
        pretrained_model_name_or_path = str(pretrained_model_name_or_path)
        kwargs["revision"] = revision

        if tokenizer_class is None:
            tokenizer_class = cls._infer_tokenizer_class(
                pretrained_model_name_or_path)

        logger.info(f"Loading tokenizer of type '{tokenizer_class}'")
        # Return the appropriate tokenizer object. The matching below is
        # substring-based, so more specific names must be checked first
        # (e.g. "XLMRobertaTokenizer" before "RobertaTokenizer").
        ret = None
        if "AlbertTokenizer" in tokenizer_class:
            if use_fast:
                ret = AlbertTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, keep_accents=True, **kwargs)
            else:
                ret = AlbertTokenizer.from_pretrained(
                    pretrained_model_name_or_path, keep_accents=True, **kwargs)
        elif "XLMRobertaTokenizer" in tokenizer_class:
            if use_fast:
                ret = XLMRobertaTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = XLMRobertaTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "RobertaTokenizer" in tokenizer_class:
            if use_fast:
                ret = RobertaTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = RobertaTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "DistilBertTokenizer" in tokenizer_class:
            if use_fast:
                ret = DistilBertTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = DistilBertTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "BertTokenizer" in tokenizer_class:
            if use_fast:
                ret = BertTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = BertTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "XLNetTokenizer" in tokenizer_class:
            if use_fast:
                ret = XLNetTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, keep_accents=True, **kwargs)
            else:
                ret = XLNetTokenizer.from_pretrained(
                    pretrained_model_name_or_path, keep_accents=True, **kwargs)
        elif "ElectraTokenizer" in tokenizer_class:
            if use_fast:
                ret = ElectraTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = ElectraTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "EmbeddingTokenizer":
            if use_fast:
                logger.error(
                    'EmbeddingTokenizerFast is not supported! Using EmbeddingTokenizer instead.'
                )
            ret = EmbeddingTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        elif "CamembertTokenizer" in tokenizer_class:
            if use_fast:
                ret = CamembertTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = CamembertTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "DPRQuestionEncoderTokenizer" in tokenizer_class:
            if use_fast:
                ret = DPRQuestionEncoderTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = DPRQuestionEncoderTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "DPRContextEncoderTokenizer" in tokenizer_class:
            if use_fast:
                ret = DPRContextEncoderTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = DPRContextEncoderTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        if ret is None:
            raise Exception("Unable to load tokenizer")
        else:
            return ret
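A minimal usage sketch, assuming this classmethod lives on FARM's Tokenizer class (the enclosing class is not shown in the excerpt):

# Infer the tokenizer class from the model name; resolves to
# XLMRobertaTokenizerFast for XLM-R checkpoints.
tokenizer = Tokenizer.load("xlm-roberta-base", use_fast=True)

# Or pin the class explicitly.
tokenizer = Tokenizer.load("xlm-roberta-base",
                           tokenizer_class="XLMRobertaTokenizer")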
Example #15
 def load(self):
     ''' Load fine-tuned model '''
     vocab_file = self.dir + '/' + self.settings["vocab_file"]
     self.tokenizer = XLMRobertaTokenizerFast.from_pretrained(vocab_file)
     self.model = self.load_model(self.dir + '/' +
                                  self.settings["model_file"])
Example #16
from transformers import AlbertForMaskedLM, BertTokenizerFast

albert_tokenizer = BertTokenizerFast.from_pretrained('kykim/albert-kor-base')
albert_model = AlbertForMaskedLM.from_pretrained(
    'kykim/albert-kor-base').eval()

# from transformers import BartForConditionalGeneration
# roberta_tokenizer = BertTokenizerFast.from_pretrained('kykim/bart-kor-base')
# roberta_model = BartForConditionalGeneration.from_pretrained('kykim/bart-kor-base').eval()

from transformers import BertTokenizerFast, BertForMaskedLM
bert_multilingual_tokenizer = BertTokenizerFast.from_pretrained(
    'bert-base-multilingual-cased')
bert_multilingual_model = BertForMaskedLM.from_pretrained(
    'bert-base-multilingual-cased').eval()

from transformers import XLMRobertaTokenizerFast, XLMRobertaForMaskedLM
xlmroberta_tokenizer = XLMRobertaTokenizerFast.from_pretrained(
    'xlm-roberta-base')
xlmroberta_model = XLMRobertaForMaskedLM.from_pretrained(
    'xlm-roberta-base').eval()


def decode(tokenizer, pred_idx, top_clean):
    ignore_tokens = string.punctuation + '[PAD][UNK]<pad><unk> '
    tokens = []
    for w in pred_idx:
        token = ''.join(tokenizer.decode(w).split())
        if token not in ignore_tokens:
            tokens.append(token.replace('##', ''))
    return ' / '.join(tokens[:top_clean])


def encode(tokenizer,
Example #17
    label_count_dict["total"] = total_count
    top_tweets_dict = {}
    for i in range(6):
        top_tweets_dict[label_mapping[i]] = top_tweets[i]
    print("DICTIONARY OF LABEL COUNT: ", label_count_dict)
    data = {
        "en": {
            "keywords": keywords,
            "count": label_count_dict,
            "top_tweets": top_tweets_dict
        },
        "hi": {
            "keywords": keywords,
            "count": label_count_dict,
            "top_tweets": top_tweets_dict
        }
    }

    return data


# TODO maybe: jsonify the inner dictionaries?

if __name__ == "__main__":
    output_dir = "data/xlm-roberta_model_save"
    tokenizer = XLMRobertaTokenizerFast.from_pretrained(output_dir)
    model_loaded = XLMRobertaForSequenceClassification.from_pretrained(
        output_dir)
    app.run(debug=True)