Example #1
def predict(input_text, net_trained, candidate_num=3, output_print=False):
    # Load the TEXT field (vocabulary), set up the device and the tokenizer
    TEXT = pickle_load(PKL_FILE)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    tokenizer_bert = BertTokenizer(vocab_file=VOCAB_FILE, do_lower_case=False)

    # Preprocess and tokenize the input, then add the BERT special tokens
    text = preprocessing_text(input_text)
    text = tokenizer_bert.tokenize(text)
    text.insert(0, "[CLS]")
    text.append("[SEP]")

    # Convert tokens to IDs and pad up to max_length (the pad ID is 1 in this vocabulary)
    token_ids = torch.ones((max_length)).to(torch.int64)
    ids_list = list(map(lambda x: TEXT.vocab.stoi[x], text))
    for i, index in enumerate(ids_list):
        token_ids[i] = index
    ids_list = token_ids.unsqueeze_(0)  # add the batch dimension
    input = ids_list.to(device)
    input_mask = (input != 1)  # padding mask (not passed to the model in this example)

    # Run the trained model, also returning the attention weights
    outputs, attention_probs = net_trained(input,
                                           token_type_ids=None,
                                           attention_mask=None,
                                           output_all_encoded_layers=False,
                                           attention_show_flg=True)

    # Subtract the class offsets and return the indices of the top candidate_num classes
    offset_tensor = torch.tensor(offset, device=device)
    outputs -= offset_tensor
    if output_print:
        print(outputs)
    _, preds = torch.topk(outputs, candidate_num)
    return preds
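
A minimal usage sketch for predict() (illustrative; it assumes the module-level constants PKL_FILE, VOCAB_FILE, max_length and offset are defined and that net_trained is an already fine-tuned model):

net_trained.eval()
top_preds = predict("当期の売上高は前年同期に比べ増加しました。", net_trained, candidate_num=3)
print(top_preds)  # indices of the candidate_num highest-scoring classes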
Example #2
def mk_html(input, preds, normlized_weights, TEXT):
    "HTMLデータを作成する"
    tokenizer_bert = BertTokenizer(vocab_file=VOCAB_FILE, do_lower_case=False)
    # Extract the result for position `index` in the mini-batch
    index = 0
    sentence = input.squeeze_(0)  # sentence: torch.Size([1, 256]) -> torch.Size([256])
    pred = preds[0]  # prediction


    # Convert the predicted class to a label string
    if pred == 0:
        pred_str = "Negative"
    else:
        pred_str = "Positive"

    # Build the HTML for display

    html = 'Predicted label: {}<br><br>'.format(pred_str)
    # Visualise the self-attention weights. There are 12 attention heads, so there are 12 attention patterns.

    for i in range(12):

        # Extract and normalise the attention for this example
        # Take the i-th attention head for token 0 ([CLS])
        # `index` is the position of the example within the mini-batch
        attens = normlized_weights[index, i, 0, :]
        attens /= attens.max()

        #html += '[Visualising BERT attention_' + str(i+1) + ']<br>'
        for word, attn in zip(sentence, attens):

            # [SEP] marks the end of the sentence, so stop there
            if tokenizer_bert.convert_ids_to_tokens([word.numpy().tolist()])[0] == "[SEP]":
                break

            # highlight() colours the word; convert_ids_to_tokens() maps the ID back to a word
            #html += highlight(tokenizer_bert.convert_ids_to_tokens(
            #    [word.numpy().tolist()])[0], attn)
        #html += "<br><br>"

    # Average the attention over the 12 heads and normalise by the maximum value
    all_attens = attens * 0  # create a zero tensor of the same shape
    for i in range(12):
        all_attens += normlized_weights[index, i, 0, :]
    all_attens /= all_attens.max()

    html += '[Visualising BERT attention: ALL heads]<br>'
    for word, attn in zip(sentence, all_attens):

        # [SEP] marks the end of the sentence, so stop there
        if tokenizer_bert.convert_ids_to_tokens([word.numpy().tolist()])[0] == "[SEP]":
            break

        # highlight() colours the word; convert_ids_to_tokens() maps the ID back to a word
        html += highlight(tokenizer_bert.convert_ids_to_tokens(
            [word.numpy().tolist()])[0], attn)
    html += "<br><br>"

    return html
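
A minimal sketch of rendering the returned HTML in a Jupyter notebook (illustrative; it assumes `input`, `preds` and `normlized_weights` come from a prior forward pass of the trained model, as in Example #1):

from IPython.display import HTML, display

html_output = mk_html(input, preds, normlized_weights, TEXT)
display(HTML(html_output))  # shows the sentence with attention-based highlighting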
Example #3
    def __init__(self, vocab_file, max_text_length=256, use_basic_form=False, mecab_dict=None):
        self.tokenizer = BertTokenizer(
            vocab_file=vocab_file, do_lower_case=False, do_basic_tokenize=False)
        if mecab_dict is not None:
            self.tagger = MeCab.Tagger("-d {}".format(mecab_dict))
        else:
            self.tagger = MeCab.Tagger("")
        self.text_field, self.label_field = self._prepare(
            max_text_length, use_basic_form)
        self.vocab, self.ids_to_tokens = self._load_vocab(vocab_file)
Example #4
    def __init__(self, vocab_file, max_text_length=256, **kwargs):
        do_normalize_text = kwargs.get("do_normalize_text", False)
        use_basic_form = kwargs.get("use_basic_form", False)
        mecab_dict = kwargs.get("mecab_dict", None)

        self.tokenizer = BertTokenizer(vocab_file=vocab_file, do_lower_case=False, do_basic_tokenize=False)
        if mecab_dict is not None:
            self.tagger = MeCab.Tagger("-d {}".format(mecab_dict))
        else:
            self.tagger = MeCab.Tagger("")
        self.text_field, self.label_field = self._prepare(max_text_length, do_normalize_text, use_basic_form)
        self.vocab, self.ids_to_tokens = self._load_vocab(vocab_file)
Example #5
    def __init__(self,
                 data_dir=r'./',
                 bert_dir=r'./pytorch_advanced/nlp_sentiment_bert/'):
        self.data_dir = data_dir
        self.bert_dir = bert_dir

        # Tokenizer and vocabulary for the English bert-base-uncased model
        self.tokenizer_bert = BertTokenizer(
            vocab_file=self.bert_dir + "vocab/bert-base-uncased-vocab.txt",
            do_lower_case=True)
        self.vocab_bert, self.ids_to_tokens_bert = load_vocab(
            vocab_file=self.bert_dir + "vocab/bert-base-uncased-vocab.txt")

        # Build the BERT model from its config and load the pretrained weights
        config = get_config(file_path=self.bert_dir +
                            "weights/bert_config.json")
        self.net_bert = BertModel(config)
        self.net_bert = set_learned_params(self.net_bert,
                                           weights_path=self.bert_dir +
                                           "weights/pytorch_model.bin")
Example #6
import os
import io
import string
import re
import sys
import random
import spacy
import torchtext
import mojimoji
# import MeCab
from torchtext.vocab import Vectors
from utils.bert import BertTokenizer, load_vocab

# Prepare the tokenizer used for word segmentation
tokenizer_bert = BertTokenizer(vocab_file="./vocab/vocab.txt",
                               do_lower_case=False)


def get_chABSA_DataLoaders_and_TEXT(max_length=256, batch_size=32):
    """Get the DataLoaders and the TEXT object for the chABSA dataset."""
    def preprocessing_text(text):

        # Unify half-width and full-width characters
        text = mojimoji.han_to_zen(text)
        # Remove line breaks, half-width spaces and full-width spaces
        text = re.sub('\r', '', text)
        text = re.sub('\n', '', text)
        text = re.sub(' ', '', text)
        text = re.sub('　', '', text)
        # Replace every run of digits with a single "0"
        text = re.sub(r'[0-9０-９]+', '0', text)  # digits
Example #7
def DataLoader(max_length=256, batch_size=32):
    """Get the DataLoaders and the TEXT object."""
    # Fix the random seeds
    torch.manual_seed(0)
    np.random.seed(0)
    random.seed(0)
    # Prepare the tokenizer used for word segmentation
    tokenizer_bert = BertTokenizer(vocab_file=VOCAB_FILE, do_lower_case=False)

    def preprocessing_text(text):
        # Unify half-width and full-width characters
        text = mojimoji.han_to_zen(text)
        # Remove line breaks, half-width spaces and full-width spaces
        text = re.sub('\r', '', text)
        text = re.sub('\n', '', text)
        text = re.sub(' ', '', text)
        text = re.sub('　', '', text)
        text = re.sub("\"", '', text)
        # Replace every run of digits with a single "0"
        text = re.sub(r'[0-9０-９]+', '0', text)  # digits

        # Replace symbols other than commas and periods with spaces
        for p in string.punctuation:
            if (p == ".") or (p == ","):
                continue
            else:
                text = text.replace(p, " ")
        return text

    # Define a function that combines preprocessing and word segmentation
    # Note: pass tokenizer_bert.tokenize (the tokenisation function), not tokenizer_bert itself
    def tokenizer_with_preprocessing(text, tokenizer=tokenizer_bert.tokenize):
        text = preprocessing_text(text)
        ret = tokenizer(text)  # tokenizer_bert
        return ret

    # Define how the data is processed when it is read
    TEXT = torchtext.data.Field(sequential=True,
                                tokenize=tokenizer_with_preprocessing,
                                use_vocab=True,
                                lower=False,
                                include_lengths=True,
                                batch_first=True,
                                fix_length=max_length,
                                init_token="[CLS]",
                                eos_token="[SEP]",
                                pad_token='[PAD]',
                                unk_token='[UNK]')
    LABEL = torchtext.data.Field(sequential=False, use_vocab=False)

    # Read each csv file from the "data" folder
    # Since the text is processed for BERT, this takes a little under 10 minutes
    train_val_ds, test_ds = torchtext.data.TabularDataset.splits(
        path=DATA_PATH,
        train='train.csv',
        test='test.csv',
        format='csv',
        fields=[('Text', TEXT), ('Label', LABEL)])

    vocab_bert, ids_to_tokens_bert = load_vocab(vocab_file=VOCAB_FILE)
    TEXT.build_vocab(train_val_ds, min_freq=1)
    TEXT.vocab.stoi = vocab_bert

    # Batch sizes of around 16 or 32 are typical for BERT
    train_dl = torchtext.data.Iterator(train_val_ds,
                                       batch_size=batch_size,
                                       train=True)
    val_dl = torchtext.data.Iterator(test_ds,
                                     batch_size=batch_size,
                                     train=False,
                                     sort=False)
    # Collect the loaders into a dictionary
    dataloaders_dict = {"train": train_dl, "val": val_dl}
    return train_dl, val_dl, TEXT, dataloaders_dict
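
A minimal usage sketch for DataLoader() (illustrative; it assumes DATA_PATH and VOCAB_FILE point at the prepared csv files and the BERT vocabulary):

train_dl, val_dl, TEXT, dataloaders_dict = DataLoader(max_length=256, batch_size=32)

# With include_lengths=True, batch.Text is a (token_ids, lengths) tuple
batch = next(iter(dataloaders_dict["train"]))
inputs = batch.Text[0]   # token-ID tensor, shape (batch_size, max_length)
labels = batch.Label     # label tensor, shape (batch_size,)
print(inputs.shape, labels.shape)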
Example #8
def tokenizer_with_preprocessing(text):
    tokenizer_bert = BertTokenizer(vocab_file=VOCAB_FILE, do_lower_case=False)
    text = preprocessing_text(text)
    ret = tokenizer_bert.tokenize(text)
    return ret


def preprocessing_text(text):
    # Replace symbols other than commas and periods with spaces
    for p in string.punctuation:
        if (p == ".") or (p == ","):
            continue
        else:
            text = text.replace(p, " ")

    # Put spaces around periods and commas
    text = text.replace(".", " . ")
    text = text.replace(",", " , ")
    return text


# Prepare the tokenizer used for word segmentation
tokenizer_bert = BertTokenizer(
    vocab_file="./weights/bert-base-uncased-vocab.txt", do_lower_case=True)


# Define a function that combines preprocessing and word segmentation
# Note: pass tokenizer_bert.tokenize (the tokenisation function), not tokenizer_bert itself
def tokenizer_with_preprocessing(text, tokenizer=tokenizer_bert.tokenize):
    text = preprocessing_text(text)
    ret = tokenizer(text)  # tokenizer_bert
    return ret
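
A quick check of the combined preprocessing and tokenisation defined above (illustrative; any English sentence works as input):

print(tokenizer_with_preprocessing("This movie was great, I loved it!"))
# e.g. ['this', 'movie', 'was', 'great', ',', 'i', 'loved', 'it']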


def main():
    # define output dataframe
    sample = pd.read_csv("./data/sample_submission.csv")
    # Define how the data is processed when it is read
    max_length = 256
Example #10
    def __init__(self, vocab_file, max_text_length, **kwargs):
        self.tokenizer = BertTokenizer(vocab_file=vocab_file,
                                       do_lower_case=True)
        self.text_field, self.label_field = self._prepare(max_text_length)
        self.vocab, self.ids_to_tokens = self._load_vocab(vocab_file)