Example No. 1
def preprocess_data(args):

    label_counter = Counter([])
    examples_per_file = Counter()

    print("Reading all files for labels.")
    for input_file in args.input_files:
        with xopen(input_file, "rt") as f:
            for example, labels in input_readers[args.task](f):
                examples_per_file[input_file] += 1
                label_counter.update(labels)

    if args.top_n_labels > 0:
        mlb_full = MultiLabelBinarizer(sparse_output=True)
        mlb_full = mlb_full.fit(label_counter.keys())
        label_counter = dict(label_counter.most_common(args.top_n_labels))

    mlb = MultiLabelBinarizer(sparse_output=True)
    # Passing a list in a list because that's what the function wants.
    if args.labels_in:
        labels = json.load(open(args.labels_in))
        mlb = mlb.fit([labels])
    else:
        mlb = mlb.fit([[pair for pair in label_counter]])

    # Save list of partial -> full mapping if doing top N labels.
    if args.top_n_labels > 0:

        label_mapping = np.where(np.in1d(mlb_full.classes_,
                                         mlb.classes_))[0].tolist()

        with xopen(args.label_mapping, "wt") as f:
            f.write(json.dumps(label_mapping))

        # Also save the full labels.
        with xopen(args.full_labels, "wt") as f:
            f.write(json.dumps(list(mlb_full.classes_)))

    # Save list of labels.
    with xopen(args.labels_out, "wt") as f:
        f.write(json.dumps(list(mlb.classes_)))

    # Set parallel tokenization thread count.
    os.environ["RAYON_NUM_THREADS"] = str(args.processes)

    from tokenizers import Tokenizer, decoders, trainers
    from tokenizers.models import WordPiece
    from tokenizers.normalizers import BertNormalizer
    from tokenizers.pre_tokenizers import BertPreTokenizer
    from tokenizers.processors import BertProcessing

    if args.task == 'cafa':
        # Define our custom tokenizer.
        # It is exactly the same as the default BERT tokenizer, except that max_input_chars_per_word
        # is 20000 instead of 100. This tokenizer is very slow on long protein sequences.
        tokenizer = WordPiece.from_files(args.vocab,
                                         unk_token="[UNK]",
                                         max_input_chars_per_word=20000)
        tokenizer = Tokenizer(tokenizer)
        tokenizer.add_special_tokens(["[UNK]", "[SEP]", "[CLS]"])
        tokenizer.normalizer = BertNormalizer(lowercase=args.do_lower_case)
        tokenizer.pre_tokenizer = BertPreTokenizer()
        tokenizer.post_processor = BertProcessing(
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
            ("[CLS]", tokenizer.token_to_id("[CLS]")))
        tokenizer.decoder = decoders.WordPiece(prefix='##')
    else:
        tokenizer = BertWordPieceTokenizer(args.vocab,
                                           lowercase=args.do_lower_case)

    tokenizer.enable_padding(max_length=args.seq_len)
    tokenizer.enable_truncation(max_length=args.seq_len)

    for input_file in args.input_files:
        with xopen(input_file, 'rt') as in_f:

            file_name = generate_out_filename(input_file, args)

            with xopen(file_name, "wt") as out_f:
                print("Processing to: ", file_name)

                # Write the shape as the first row; useful for finetuning.
                if args.labels_in:
                    n_labels = len(json.load(open(args.labels_in)))
                else:
                    n_labels = len(label_counter)
                out_f.write(
                    json.dumps((examples_per_file[input_file], n_labels)) +
                    '\n')

                batch_size = min(examples_per_file[input_file],
                                 args.processes * 100)
                example_batch = []
                labels_batch = []
                doc_idx_batch = []

                with ParallelGenerator(input_readers[args.task](in_f),
                                       max_lookahead=batch_size) as g:
                    START_POS = int(args.window_start) / 100
                    for doc_idx, (example, labels) in enumerate(g):
                        #example = ' '.join(example.split(' ')[-510:])
                        example_batch.append(example)
                        labels_batch.append(labels)
                        doc_idx_batch.append(doc_idx)

                        if len(example_batch) == batch_size:
                            example_batch = tokenizer.encode_batch(
                                example_batch)
                            labels_batch = mlb.transform(labels_batch)

                            for example, labels, doc_idx in zip(
                                    example_batch, labels_batch,
                                    doc_idx_batch):
                                # Convert sparse arrays to python lists for json dumping.
                                # print(labels);input()
                                labels = labels.nonzero()[1].tolist()
                                """try:
                                    [][0]
                                    print("DOC_LEN:",len(example.overflowing)+1)
                                    mid = len(example.overflowing)//2
                                    out_f.write(json.dumps( [example.overflowing[mid].ids, labels, len(example.overflowing)+1] ) + '\n')
                                except IndexError:
                                    out_f.write(json.dumps( [example.ids, labels, len(example.overflowing)+1] ) + '\n')"""

                                if args.all_blocks or args.n_blocks > 0:
                                    blocks = [example.ids] + [
                                        blk.ids for blk in example.overflowing
                                    ]
                                    #print("BLOCKS:%d,TOKENS:%d" % (len(list(blocks)), sum([len(list(tokens)) for tokens in blocks])))
                                    for b, block in enumerate(blocks, 2):
                                        if b > args.n_blocks and args.n_blocks > 0:
                                            break
                                        out_f.write(
                                            json.dumps(
                                                [block, labels, doc_idx]) +
                                            '\n')
                                else:
                                    window = get_window(example, START_POS)
                                    assert len(window) == 512
                                    assert all(
                                        [type(y) is int for y in window])
                                    out_f.write(
                                        json.dumps([window, labels]) + '\n')

                            example_batch = []
                            labels_batch = []
                            # Also reset the doc indices so later batches and the
                            # final leftover batch pair up with the correct doc_idx.
                            doc_idx_batch = []

                    # Write out whatever is left in the last smaller batch.
                    example_batch = tokenizer.encode_batch(example_batch)
                    labels_batch = mlb.transform(labels_batch)

                    for example, labels, doc_idx in zip(
                            example_batch, labels_batch, doc_idx_batch):
                        # Convert sparse arrays to python lists for json dumping.
                        # print(labels);input()
                        labels = labels.nonzero()[1].tolist()
                        """try:
                            [][0]
                            print("DOC_LEN:",len(example.overflowing)+1)
                            mid = len(example.overflowing)//2
                            out_f.write(json.dumps( [example.overflowing[mid].ids, labels, len(example.overflowing)+1] ) + '\n')
                        except IndexError:
                            out_f.write(json.dumps( [example.ids, labels, len(example.overflowing)+1] ) + '\n')"""

                        if args.all_blocks or args.n_blocks > 0:
                            blocks = [example.ids] + [
                                blk.ids for blk in example.overflowing
                            ]
                            #print("BLOCKS:%d,TOKENS:%d" % (len(list(blocks)), sum([len(list(tokens)) for tokens in blocks])))
                            for b, block in enumerate(blocks, 2):
                                if b > args.n_blocks and args.n_blocks > 0:
                                    break
                                out_f.write(
                                    json.dumps([block, labels, doc_idx]) +
                                    '\n')
                        else:
                            out_f.write(
                                json.dumps(
                                    [get_window(example, START_POS), labels]) +
                                '\n')
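
The loop above writes JSON-lines files whose first row is the pair (number of examples, number of labels) and whose remaining rows are either [token_ids, label_indices, doc_idx] in block mode or [token_ids, label_indices] in window mode. A minimal reader sketch for that format, assuming xopen here is the xopen package's helper and using a hypothetical file name (the real name comes from generate_out_filename()):

import json

from xopen import xopen  # assumed to be the same compressed-file helper used above


def read_preprocessed(path):
    """Yield (token_ids, label_indices) records from one preprocessed file."""
    with xopen(path, "rt") as f:
        n_examples, n_labels = json.loads(f.readline())  # shape header row
        for line in f:
            record = json.loads(line)
            # record[2] is the doc_idx when block output was enabled.
            yield record[0], record[1]


# Hypothetical file name for illustration only.
for token_ids, label_indices in read_preprocessed("dev_preprocessed.jsonl.gz"):
    pass  # feed into the finetuning data pipeline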
Example No. 2
 def encodings(self, bert_files):
     tokenizer = BertWordPieceTokenizer.from_file(bert_files["vocab"])
     single_encoding = tokenizer.encode("I love HuggingFace")
     pair_encoding = tokenizer.encode("I love HuggingFace", "Do you?")
     return single_encoding, pair_encoding
import os

from tensorflow import keras
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, TFBertModel, BertConfig

max_len = 384
configuration = BertConfig()  # default parameters and configuration for BERT
"""
## Set-up BERT tokenizer
"""
# Save the slow pretrained tokenizer
slow_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
save_path = "bert_base_uncased/"
if not os.path.exists(save_path):
    os.makedirs(save_path)
slow_tokenizer.save_pretrained(save_path)

# Load the fast tokenizer from saved file
tokenizer = BertWordPieceTokenizer("bert_base_uncased/vocab.txt",
                                   lowercase=True)
"""
## Load the data
"""
train_data_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json"
train_path = keras.utils.get_file("train.json", train_data_url)
eval_data_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json"
eval_path = keras.utils.get_file("eval.json", eval_data_url)
"""
## Preprocess the data

1. Go through the JSON file and store every record as a `SquadExample` object (a minimal parsing sketch follows below).
2. Go through each `SquadExample` and create `x_train, y_train, x_eval, y_eval`.
"""

Example No. 4
)
parser.add_argument("--name",
                    default="bert-wordpiece",
                    type=str,
                    help="The name of the output vocab files")
args = parser.parse_args()

files = glob.glob(args.files)
if not files:
    print(f"File does not exist: {args.files}")
    exit(1)

# Initialize an empty tokenizer
tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    handle_chinese_chars=True,
    strip_accents=True,
    lowercase=True,
)

# And then train
tokenizer.train(
    files,
    vocab_size=10000,
    min_frequency=2,
    show_progress=True,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    limit_alphabet=1000,
    wordpieces_prefix="##",
)

# Save the files
# tensorboard log and graph output folder declaration
log_tensorboard_dir = output_path / "runs" / args.word_embedding_type
writer = SummaryWriter(log_tensorboard_dir)

# load datasets
train_path = Path(args.train_path)
test_path = Path(args.test_path)
eval_path = Path(args.eval_path)

train_data = Articles(train_path)
test_data = Articles(test_path)
eval_data = Articles(eval_path, index_file=args.index_file_path)
print("Data Loaded")

# initialize tokenizer from BERT library
tokenizer = BertWordPieceTokenizer(args.tokenizer_file, lowercase=True)
print("Tokenizer Initialized!")

# create and save or load dictionaries based on arguments
if args.create_dicts:
    (
        final_word_ids,
        final_url_ids,
        final_publication_ids,
    ) = dictionary.create_merged_dictionaries(
        train_data.examples, "target", args.tokenizer_file
    )
    print("Dictionaries Created")

    dict_path = Path(args.data_dir) / "dictionaries"
    if not dict_path.is_dir():
 def get_tokenizer(self, path):
     tokenizer = BertWordPieceTokenizer(os.path.join(path, 'vocab.txt'))
     return tokenizer
Example No. 7
from tokenizers import BertWordPieceTokenizer

tokenizer = BertWordPieceTokenizer()

tokenizer.train(["pg34988.txt"], vocab_size=100)

opt = tokenizer.encode("Welcome to the wonderland.")

print("Output: ")
print(opt.ids, opt.tokens, opt.offsets)

from tokenizers import BertWordPieceTokenizer
import os

MAX_LEN = 64
VOCAB = "vocab.txt"
CONFIG = "config.json"
MODEL = "pytorch_model.bin"
BERT_BASE = "./models/bert_base"
TRAIN_SET = "./data/train.csv"
TEST_SET = "./data/test.csv"
CLEANED_TRAIN_SET = "./data/cleaned_train.csv"
BERT_BASE_TOKENIZER = BertWordPieceTokenizer(os.path.join(BERT_BASE, VOCAB),
                                             lowercase=True)
CHECKPOINT_FOLDER = "./checkpoints"
BERTBASEWITHOUTSENTIMENT = "BertBaseWithOutSentiment"
BERTBASEWITHSENTIMENT = "BertBaseWithSentiment"

SENT_DIS = "./figs/sent_dis.png"
SENTIMENT_DIS = "./figs/sentiment_dis.png"
SENTIMENT_CONDITION_DIS = "./figs/sentiment_condition_dis.png"
LOG_FOLDER = "./logs"
Example No. 9
import torch
from tokenizers import BertWordPieceTokenizer

from amadeus_model import Amadeus

tokenizer = BertWordPieceTokenizer('data/bert-base-uncased-vocab.txt',
                                   lowercase=True)

model = Amadeus(num_tokens=tokenizer.get_vocab_size(),
                enc_seq_len=4096,
                dec_seq_len=1024)
model.load_state_dict(
    torch.load('models/amadeus-performer-2020-11-03-16.54.13.pt'))
model.eval(fix_proj_matrices=True)

in_seq = torch.randint(0, tokenizer.get_vocab_size(), (1, model.in_seq_len))
out_seq = torch.randint(0, tokenizer.get_vocab_size(), (1, model.out_seq_len))

traced_script_model = torch.jit.trace(model, (in_seq, out_seq),
                                      check_trace=False)
traced_script_model.save('traced.pt')
Example No. 10
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import numpy as np
import tensorflow_hub as hub
from tokenizers import BertWordPieceTokenizer
import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
from tensorflow import keras
from tensorflow.keras import layers

max_seq_length = 384
bert_layer = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2",
    trainable=True)
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy().decode(
    "utf-8")
tokenizer = BertWordPieceTokenizer(vocab=vocab_file, lowercase=True)
model = keras.models.load_model('../model')


def preprocess(context, question):
    start_token_idx = -1
    end_token_idx = -1
    context = " ".join(str(context).split())
    question = " ".join(str(question).split())
    # tokenize context and question
    tokenized_context = tokenizer.encode(context)
    tokenized_question = tokenizer.encode(question)

    input_ids = tokenized_context.ids + tokenized_question.ids[1:]
    token_type_ids = [0] * len(tokenized_context.ids) + [1] * len(
        tokenized_question.ids[1:])
Example No. 11
def main(args):
    print(args)
    if args['train']:
        tokenizer = BertWordPieceTokenizer(
            clean_text=True,
            handle_chinese_chars=True,
            strip_accents=True,  # Must be False if cased model
            lowercase=True,
            wordpieces_prefix="##"
        )

        tokenizer.train(
            files=['/data2/BERT/data/naver_news/news_3_preprocessed/naver_news.txt'],
            limit_alphabet=6000,
            vocab_size=32000
        )

        print(tokenizer.save_model("../BertWordPieceTokenizer_32000"))

    elif args['test']:
        test_str = '나는 워드피스 토크나이저를 써요. 성능이 좋은지 테스트 해보려 합니다.'

        print("=========== tokenizer ===========")
        tokenizer = BertWordPieceTokenizer("../BertWordPieceTokenizer_32000/vocab.txt")
        print(tokenizer)
        encoded_str = tokenizer.encode(test_str)
        print('encoding: ', encoded_str.ids)
        decoded_str = tokenizer.decode(encoded_str.ids)
        print(decoded_str)

        print("=========== BertTokenizer ===========")
        tokenizer = BertTokenizer("../BertWordPieceTokenizer_32000/vocab.txt")
        print(tokenizer)
        encoded_str = tokenizer.encode(test_str)
        print('encoding: ', encoded_str)
        decoded_str = tokenizer.decode(encoded_str)
        print(decoded_str)

        print("=========== BertTokenizer2 ===========")
        tokenizer = BertTokenizer.from_pretrained("../BertWordPieceTokenizer_32000")
        print(tokenizer)
        encoded_str = tokenizer.encode(test_str)
        print('encoding: ', encoded_str)
        decoded_str = tokenizer.decode(encoded_str)
        print(decoded_str)
Example No. 12
import nltk
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

import pickle

import pandas as pd
from tokenizers import BertWordPieceTokenizer

sentence_re = r'''(?x)      # set flag to allow verbose regexps
        (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
      | \w+(?:-\w+)*        # words with optional internal hyphens
      | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
      | \.\.\.              # ellipsis
      | [][.,;"'?():_`-]    # these are separate tokens; includes ], [
    '''

vocab = "D:/Word embedding/bert/assets/vocab.txt"
tokenizer = BertWordPieceTokenizer(vocab, lowercase=True)
kp_training = pd.read_json('kp/kp20k_training.json', lines=True)
kp_testing = pd.read_json('kp/kp20k_testing.json', lines=True)
kp_validation = pd.read_json('kp/kp20k_validation.json', lines=True)

max_kp = 0
min_len = 1e100
max_len = 512

all_reps = []
att_masks = []
key_positions = []
ref_positions = []

for i in range(len(kp_training[:20000])):
    text = kp_training['abstract'][i]
Example No. 13
print(movie_reviews.columns.values)
print(movie_reviews.sentiment.unique())

y = movie_reviews["sentiment"]

y = np.array(list(map(lambda x: 1 if x == "positive" else 0, y)))

slow_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
save_path = "bert_base_uncased/"
if not os.path.exists(save_path):
    os.makedirs(save_path)
slow_tokenizer.save_pretrained(save_path)

# Load the fast tokenizer from saved file
tokenizer = BertWordPieceTokenizer("bert_base_uncased/vocab.txt", lowercase=True)
tokenizer.enable_truncation(MAX_SEQ_LEN - 2)

train_count = 40000  # 40000
test_count = 2000

# X_train = convert_sentences_to_features(reviews[:40000], tokenizer)
# X_test = convert_sentences_to_features(reviews[40000:], tokenizer)

X_train = convert_sentences_to_features(reviews[:train_count], tokenizer)
X_test = convert_sentences_to_features(reviews[train_count : train_count + test_count], tokenizer)

one_hot_encoded = to_categorical(y)
# one_hot_encoded = tf.one_hot(y, 1)

# y_train = one_hot_encoded[:40000]
Example No. 14
#! /usr/bin/env python3
__AUTHORS__ = [("CS17BTECH11044", "YASH KHASBAGE"),
               ("CS17BTECH11029", "PUNEET MANGLA")]

import json
import pandas as pd
from tokenizers import BertWordPieceTokenizer

# Initialize an empty BERT tokenizer
tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    handle_chinese_chars=False,
    strip_accents=False,
    lowercase=True,
)

# prepare text files to train vocab on them
files = ['input.txt']

# train BERT tokenizer
tokenizer.train(files,
                vocab_size=30000,
                min_frequency=10,
                show_progress=True,
                special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'],
                limit_alphabet=1000,
                wordpieces_prefix="##")

# save vocabulary
tokenizer.save('./udc_vocab.txt')
Example No. 15
 def __init__(self, sentences, bert_path, padding=140):
     self.sentences = sentences
     self.tokenizer = BertWordPieceTokenizer(f'{bert_path}/vocab.txt',
                                             lowercase=True)
     self.padding = padding
Example No. 16
from colorama import Fore
from tokenizers import BertWordPieceTokenizer
from tqdm import tqdm
from transformers import ElectraTokenizer, ElectraForQuestionAnswering
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from Reader.Sample import Sample

model = ElectraForQuestionAnswering.from_pretrained("Reader/electra_QA").to(
    device=torch.device('cpu'))
model.load_state_dict(
    torch.load('Reader/weight_electra/weights_3.pth',
               map_location=torch.device('cpu')))
model.eval()
tokenizer = BertWordPieceTokenizer("Reader/electra_base_uncased/vocab.txt",
                                   lowercase=True)


def inference(question, paragraph):
    squad_eg = Sample(tokenizer, question, paragraph)
    squad_eg.preprocess()
    dataset_dict = {
        "input_word_ids": [],
        "input_type_ids": [],
        "input_mask": [],
        "start_token_idx": [],
        "end_token_idx": [],
    }

    if squad_eg.skip is False:
        for key in dataset_dict:
Example No. 17
class Reader(object):
    def __init__(self,
                 bert_model: str,
                 tokenizer: BaseTokenizer = None,
                 cls: str = "[CLS]",
                 sep: str = "[SEP]",
                 threshold=6):

        self.tokenizer: BaseTokenizer = tokenizer
        self.cls = cls
        self.sep = sep
        if self.tokenizer is None:
            vocab_path: str = "tokenization/" + bert_model + ".txt"
            self.tokenizer = BertWordPieceTokenizer(vocab_path,
                                                    lowercase="-cased"
                                                    not in bert_model)

        self.threshold = threshold
        self.subword_alphabet: Optional[Alphabet] = None
        self.label_alphabet: Optional[Alphabet] = None

        self.train: Optional[List[SentInst]] = None
        self.dev: Optional[List[SentInst]] = None
        self.test: Optional[List[SentInst]] = None

    def _read_file(self, filename: str, mode: str = 'train') -> List[SentInst]:
        sent_list = []
        max_len = 0
        num_thresh = 0
        with open(filename, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line == "":  # last few blank lines
                    break

                raw_tokens = line.split(' ')
                tokens = raw_tokens
                chars = [list(t) for t in raw_tokens]

                entities = next(f).strip()
                if entities == "":  # no entities
                    sent_inst = SentInst(tokens, chars, [])
                else:
                    entity_list = []
                    entities = entities.split("|")
                    for item in entities:
                        pointers, label = item.split()
                        pointers = pointers.split(",")
                        if int(pointers[1]) > len(tokens):
                            pdb.set_trace()
                        span_len = int(pointers[1]) - int(pointers[0])
                        if span_len < 0:
                            print("Warning! span_len < 0")
                            continue
                        if span_len > max_len:
                            max_len = span_len
                        if span_len > self.threshold:
                            num_thresh += 1

                        new_entity = (int(pointers[0]), int(pointers[1]),
                                      label)
                        # may be duplicate entities in some datasets
                        if (mode == 'train' and new_entity
                                not in entity_list) or (mode != 'train'):
                            entity_list.append(new_entity)

                    # assert len(entity_list) == len(set(entity_list)) # check duplicate
                    sent_inst = SentInst(tokens, chars, entity_list)
                assert next(f).strip() == ""  # separating line

                sent_list.append(sent_inst)
        print("Max length: {}".format(max_len))
        print("Threshold {}: {}".format(self.threshold, num_thresh))
        return sent_list

    def _gen_dic(self) -> None:
        label_set = set()

        for sent_list in [self.train, self.dev, self.test]:
            num_mention = 0
            for sentInst in sent_list:
                for entity in sentInst.entities:
                    label_set.add(entity[2])
                num_mention += len(sentInst.entities)
            print("# mentions: {}".format(num_mention))

        vocab = [
            self.tokenizer.id_to_token(idx)
            for idx in range(self.tokenizer.get_vocab_size())
        ]
        self.subword_alphabet = Alphabet(vocab, 0)
        self.label_alphabet = Alphabet(label_set, 0)

    @staticmethod
    def _pad_batches(input_ids_batches: List[List[List[int]]],
                     first_subtokens_batches: List[List[List[int]]]) \
            -> Tuple[List[List[List[int]]],
                     List[List[List[int]]],
                     List[List[List[bool]]]]:

        padded_input_ids_batches = []
        input_mask_batches = []
        mask_batches = []

        all_batches = list(zip(input_ids_batches, first_subtokens_batches))
        for input_ids_batch, first_subtokens_batch in all_batches:

            batch_len = len(input_ids_batch)
            max_subtokens_num = max(
                [len(input_ids) for input_ids in input_ids_batch])
            max_sent_len = max([
                len(first_subtokens)
                for first_subtokens in first_subtokens_batch
            ])

            padded_input_ids_batch = []
            input_mask_batch = []
            mask_batch = []

            for i in range(batch_len):

                subtokens_num = len(input_ids_batch[i])
                sent_len = len(first_subtokens_batch[i])

                padded_subtoken_vec = input_ids_batch[i].copy()
                padded_subtoken_vec.extend([0] *
                                           (max_subtokens_num - subtokens_num))
                input_mask = [1] * subtokens_num + [0] * (max_subtokens_num -
                                                          subtokens_num)
                mask = [True] * sent_len + [False] * (max_sent_len - sent_len)

                padded_input_ids_batch.append(padded_subtoken_vec)
                input_mask_batch.append(input_mask)
                mask_batch.append(mask)

            padded_input_ids_batches.append(padded_input_ids_batch)
            input_mask_batches.append(input_mask_batch)
            mask_batches.append(mask_batch)

        return padded_input_ids_batches, input_mask_batches, mask_batches

    def get_batches(self, sentences: List[SentInst], batch_size: int) -> Tuple:
        subtoken_dic_dic = defaultdict(lambda: defaultdict(list))
        first_subtoken_dic_dic = defaultdict(lambda: defaultdict(list))
        last_subtoken_dic_dic = defaultdict(lambda: defaultdict(list))
        label_dic_dic = defaultdict(lambda: defaultdict(list))

        this_input_ids_batches = []
        this_first_subtokens_batches = []
        this_last_subtokens_batches = []
        this_label_batches = []

        for sentInst in sentences:
            subtoken_vec = []
            first_subtoken_vec = []
            last_subtoken_vec = []
            subtoken_vec.append(self.tokenizer.token_to_id(self.cls))
            for t in sentInst.tokens:
                encoding = self.tokenizer.encode(t)
                ids = [
                    v for v, mask in zip(encoding.ids,
                                         encoding.special_tokens_mask)
                    if mask == 0
                ]
                first_subtoken_vec.append(len(subtoken_vec))
                subtoken_vec.extend(ids)
                last_subtoken_vec.append(len(subtoken_vec))
            subtoken_vec.append(self.tokenizer.token_to_id(self.sep))

            label_list = [(u[0], u[1], self.label_alphabet.get_index(u[2]))
                          for u in sentInst.entities]

            subtoken_dic_dic[len(
                sentInst.tokens)][len(subtoken_vec)].append(subtoken_vec)
            first_subtoken_dic_dic[len(
                sentInst.tokens)][len(subtoken_vec)].append(first_subtoken_vec)
            last_subtoken_dic_dic[len(
                sentInst.tokens)][len(subtoken_vec)].append(last_subtoken_vec)
            label_dic_dic[len(
                sentInst.tokens)][len(subtoken_vec)].append(label_list)

        input_ids_batches = []
        first_subtokens_batches = []
        last_subtokens_batches = []
        label_batches = []
        for length1 in sorted(subtoken_dic_dic.keys(), reverse=True):
            for length2 in sorted(subtoken_dic_dic[length1].keys(),
                                  reverse=True):
                input_ids_batches.extend(subtoken_dic_dic[length1][length2])
                first_subtokens_batches.extend(
                    first_subtoken_dic_dic[length1][length2])
                last_subtokens_batches.extend(
                    last_subtoken_dic_dic[length1][length2])
                label_batches.extend(label_dic_dic[length1][length2])

        [
            this_input_ids_batches.append(input_ids_batches[i:i + batch_size])
            for i in range(0, len(input_ids_batches), batch_size)
        ]
        [
            this_first_subtokens_batches.append(
                first_subtokens_batches[i:i + batch_size])
            for i in range(0, len(first_subtokens_batches), batch_size)
        ]
        [
            this_last_subtokens_batches.append(
                last_subtokens_batches[i:i + batch_size])
            for i in range(0, len(last_subtokens_batches), batch_size)
        ]
        [
            this_label_batches.append(label_batches[i:i + batch_size])
            for i in range(0, len(label_batches), batch_size)
        ]

        this_input_ids_batches, this_input_mask_batches, this_mask_batches \
            = self._pad_batches(this_input_ids_batches, this_first_subtokens_batches)

        return (this_input_ids_batches, this_input_mask_batches,
                this_first_subtokens_batches, this_last_subtokens_batches,
                this_label_batches, this_mask_batches)

    def to_batch(self, batch_size: int) -> Tuple:
        ret_list = []
        for sent_list in [self.train, self.dev, self.test]:
            ret_list.append(self.get_batches(sent_list, batch_size))
        return tuple(ret_list)

    def read_all_data(self, file_path: str, train_file: str, dev_file: str,
                      test_file: str) -> None:
        self.train = self._read_file(file_path + train_file)
        self.dev = self._read_file(file_path + dev_file, mode='dev')
        self.test = self._read_file(file_path + test_file, mode='test')
        self._gen_dic()

    def debug_single_sample(self, subtoken: List[int],
                            label_list: List[Tuple[int, int, int]]) -> None:
        print(" ".join(
            [self.subword_alphabet.get_instance(t) for t in subtoken]))
        for label in label_list:
            print(label[0], label[1],
                  self.label_alphabet.get_instance(label[2]))
Example No. 18
    """
    Parse boolean arguments from the command line.
    """
    if s.lower() in ['off', 'false', '0']:
        return False
    if s.lower() in ['on', 'true', '1']:
        return True
    raise argparse.ArgumentTypeError("invalid value for a boolean flag (0 or 1)")

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Unsupervised training')
    parser.add_argument("--model_path", type=str, default="", help="Pretrained tokenizer path")
    parser.add_argument("--do_lower_case", type=bool_flag, default=False, help="do_lower_case")
    parser.add_argument("--input_file", type=str, default="", help="Input file to be tokenized")
    parser.add_argument("--output_file", type=str, default="", help="Output tokenized file")
    args = parser.parse_args()

    tokenizer = BertWordPieceTokenizer(args.model_path, lowercase=args.do_lower_case)
    count = 0
    with open(args.input_file, 'r', encoding='utf-8') as fin:
        with open(args.output_file, 'w', encoding='utf-8') as fout:
            for line in fin:
                if len(line.strip())>0:
                    output = tokenizer.encode(line.strip())
                    fout.write(' '.join(output.tokens)+'\n')
                else:
                    fout.write('\n')
                count += 1
                if count % 1000 == 0:
                    fout.flush()
                    print('%d sentences tokenized!'%count)
import os
from tokenizers import BertWordPieceTokenizer
from pathlib import Path

save_dir = "vocab"
paths = [
    str(x) for x in Path("/home/phmay/data/nlp/corpus/ready/").glob("*.txt")
]
print(paths)
vocab_size = 32_767  # 2^15-1
min_frequency = 2

os.makedirs(save_dir, exist_ok=True)

special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]

for i in range(767 - 5):
    special_tokens.append('[unused{}]'.format(i))

# https://github.com/huggingface/tokenizers/blob/04fb9e4ebe785a6b2fd428766853eb27ee894645/bindings/python/tokenizers/implementations/bert_wordpiece.py#L11
tokenizer = BertWordPieceTokenizer(strip_accents=False)
tokenizer.train(
    files=paths,
    vocab_size=vocab_size,
    min_frequency=min_frequency,
    special_tokens=special_tokens,
)

tokenizer.save_model(save_dir)
tokenizer.save(save_dir + "/tokenizer.json")
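
The script above saves both a plain vocab.txt (via save_model) and a fully serialized tokenizer.json (via save). A minimal sketch of loading either artifact back, assuming those files now exist in the vocab directory:

from tokenizers import BertWordPieceTokenizer, Tokenizer

# Rebuild a BERT WordPiece tokenizer from the plain vocabulary file.
wp_tokenizer = BertWordPieceTokenizer("vocab/vocab.txt", strip_accents=False)

# Or load the fully serialized tokenizer, configuration included.
full_tokenizer = Tokenizer.from_file("vocab/tokenizer.json")

encoding = wp_tokenizer.encode("A short example sentence.")
print(encoding.tokens)
print(wp_tokenizer.get_vocab_size())  # close to the vocab_size requested above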
Example No. 20
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    processor = PextProcessor()
    label_list = processor.get_labels()
    # load tokenizer
    tokenizer = BertWordPieceTokenizer(FLAGS.vocab_file,
                                       add_special_tokens=False)
    tokenizer.no_padding()
    tokenizer.no_truncation()

    # Create Estimator
    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size *
            FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    params = {
        "bert_config": bert_config,
        "num_labels": len(label_list),
        "init_checkpoint": FLAGS.init_checkpoint,
        "learning_rate": FLAGS.learning_rate,
        "num_train_steps": num_train_steps,
        "num_warmup_steps": num_warmup_steps
    }

    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       model_dir=FLAGS.output_dir,
                                       params=params)

    if FLAGS.do_train:
        tf.gfile.MakeDirs(FLAGS.output_dir)
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        file_based_convert_examples_to_features(train_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, train_file)

        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)

        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            batch_size=FLAGS.train_batch_size)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

        # save config
        out_config_file = os.path.join(FLAGS.output_dir, 'config.json')
        with tf.gfile.GFile(out_config_file, "w") as writer:
            writer.write(bert_config.to_json_string())

        # save tokenizer
        tokenizer.save(FLAGS.output_dir)

    if FLAGS.do_eval:
        # load last ckpt
        ckpt_prefix = get_last_ckpt_prefix(FLAGS.output_dir)
        params = {
            'bert_config':
            modeling.BertConfig.from_json_file(
                os.path.join(FLAGS.output_dir, 'config.json')),
            'num_labels':
            len(label_list),
            'init_checkpoint':
            get_last_ckpt_prefix(FLAGS.output_dir),
        }
        estimator = tf.estimator.Estimator(model_fn=model_fn, params=params)

        # prepare data
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        #file_based_convert_examples_to_features(
        #    eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file)

        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d", len(eval_examples))
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            batch_size=FLAGS.eval_batch_size)

        result = estimator.predict(input_fn=eval_input_fn,
                                   yield_single_examples=False)

        pred_ids, label_ids = [], []
        for prediction in result:
            pred_ids.append(prediction['pred_ids'])
            label_ids.append(prediction['label_ids'])

        pred_ids = np.concatenate(pred_ids, axis=0)
        label_ids = np.concatenate(label_ids, axis=0)

        y_pred = [[] for _ in range(label_ids.shape[0])]
        y_true = [[] for _ in range(label_ids.shape[0])]

        pad_token_label_id = 0
        for i in range(label_ids.shape[0]):
            for j in range(label_ids.shape[1]):
                if label_ids[i, j] != pad_token_label_id:
                    y_pred[i].append(label_list[pred_ids[i, j]])
                    y_true[i].append(label_list[label_ids[i, j]])

        report = metrics.classification_report(y_true, y_pred, digits=4)
        precision = metrics.precision_score(y_true, y_pred)
        recall = metrics.recall_score(y_true, y_pred)
        f1 = metrics.f1_score(y_true, y_pred)

        tf.logging.info("Eval result: \n" + report)

        out_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(out_eval_file, "w") as writer:
            writer.write(f'precision = {precision: f} \n')
            writer.write(f'recall = {recall: f} \n')
            writer.write(f'f1 = {f1 :f} \n')

    if FLAGS.do_predict:
        pass
    type=int,
    required=True,
    help='Vocabulary size',
)
args = parser.parse_args()

files = glob.glob(args.files)
if not files:
    logger.info(f"File does not exist: {args.files}")
    exit(1)

# CHINESE CHARACTERS???!!!
# Initialize an empty tokenizer
tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    handle_chinese_chars=False,
    strip_accents=True,
    lowercase=True,
)

# And then train
tokenizer.train(
    files,
    vocab_size=10000,
    min_frequency=2,
    show_progress=True,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    limit_alphabet=1000,
    wordpieces_prefix="##",
)

# Save the files
Example No. 22
import argparse
from tokenizers import BertWordPieceTokenizer

parser = argparse.ArgumentParser()

parser.add_argument("--corpus_file", type=str, default="../data/namuwiki.txt")
parser.add_argument("--vocab_size", type=int, default=22000)
parser.add_argument("--limit_alphabet", type=int, default=6000)

args = parser.parse_args()

tokenizer = BertWordPieceTokenizer(
    vocab_file=None,
    clean_text=True,
    handle_chinese_chars=True,
    strip_accents=False,  # Must be False if cased model
    lowercase=False,
    wordpieces_prefix="##")

tokenizer.train(files=[args.corpus_file],
                limit_alphabet=args.limit_alphabet,
                vocab_size=args.vocab_size)

tokenizer.save(
    "./ch-{}-wpm-{}-pretty".format(args.limit_alphabet, args.vocab_size), True)
class config:
    MAX_LEN = 128
    TOKENIZER = BertWordPieceTokenizer(
        '/home/koushik/Documents/Pretrained Models/bert-base-uncased/vocab.txt'
    )
    BERT_PATH = '/home/koushik/Documents/Pretrained Models/bert-base-uncased'
Example No. 24
    def __init__(self, args: argparse.Namespace):
        """Initialize a model, tokenizer and config."""
        super().__init__()
        if isinstance(args, argparse.Namespace):
            self.save_hyperparameters(args)
            self.args = args
        else:
            # eval mode
            TmpArgs = namedtuple("tmp_args", field_names=list(args.keys()))
            self.args = args = TmpArgs(**args)

        # self.bert_dir = args.bert_config_dir
        self.data_dir = self.args.data_dir
        self.bert_config_dir = BERT_DIR[self.args.model]
        self.tokenizer = BertWordPieceTokenizer(vocab=self.bert_config_dir +
                                                '/vocab.txt')

        if self.args.model == 'BERTMRC':
            self.tokenizer = BertWordPieceTokenizer(
                vocab=self.bert_config_dir + '/vocab.txt')
            bert_config = BertQueryNerConfig.from_pretrained(
                self.bert_config_dir,
                hidden_dropout_prob=args.bert_dropout,
                attention_probs_dropout_prob=args.bert_dropout,
                mrc_dropout=args.mrc_dropout)

            self.model = BERTModel[self.args.model].from_pretrained(
                self.bert_config_dir, config=bert_config)
        else:
            # self.tokenizer = AutoTokenizer.from_pretrained(self.bert_config_dir, do_lower_case=True)
            # self.model = BertForQuestionAnswering.from_pretrained(self.bert_config_dir)
            self.model = BERTModel[self.args.model](self.bert_config_dir,
                                                    self.args)
        logging.info(str(self.model))
        logging.info(
            str(args.__dict__ if isinstance(args, argparse.ArgumentParser
                                            ) else args))
        # self.ce_loss = CrossEntropyLoss(reduction="none")
        self.loss_type = args.loss_type
        # self.loss_type = "bce"
        if self.loss_type == "bce":
            self.bce_loss = BCEWithLogitsLoss(reduction="none")
        else:
            self.dice_loss = DiceLoss(with_logits=True,
                                      smooth=args.dice_smooth)
        # todo(yuxian): since the match loss is O(n^2), the loss rate should be adjusted accordingly
        weight_sum = args.weight_start + args.weight_end + args.weight_span
        self.weight_start = args.weight_start / weight_sum
        self.weight_end = args.weight_end / weight_sum
        self.weight_span = args.weight_span / weight_sum
        self.flat_ner = args.flat
        self.span_f1 = QuerySpanF1(flat=self.flat_ner)
        self.chinese = args.chinese
        self.optimizer = args.optimizer
        self.span_loss_candidates = args.span_loss_candidates
        self.dataset_train, self.dataset_valid, self.dataset_test = get_dataloader(
            args.tgt_domain,
            args.n_samples,
            args.batch_size,
            self.tokenizer,
            query_type=self.args.query_type)
Example No. 25
data_csv = "./data/ner_dataset.csv"

"""
## Setup Tokenizers
"""

# Save the slow pretrained tokenizer
slow_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
save_path = "D:/spyder/tf_torch_model/torch_model/bert_base_uncased/"
if not os.path.exists(save_path):
    os.makedirs(save_path)
slow_tokenizer.save_pretrained(save_path)

# Load the fast tokenizer from saved file
# "bert_base_uncased/vocab.txt"
tokenizer = BertWordPieceTokenizer("D:/spyder/tf_torch_model/torch_model/bert_base_uncased/vocab.txt", lowercase=True)

"""
## Define model
"""

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=False, reduction=tf.keras.losses.Reduction.NONE
)


def masked_ce_loss(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 17))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
Example No. 26
 def tokenizer(
         self) -> Union[BaseTokenizer, CountVectorizer, TfidfVectorizer]:
     pkl_path = os.path.join(self.tokenizer_path, "model.pkl")
     if self._tokenizer is not None:
         return self._tokenizer
     ### get pickled tokenizer
     if os.path.exists(pkl_path) and not self.retrain_tokenizer:
         with open(pkl_path, 'rb') as f:
             tokenizer = pickle.load(f)
     ### train new tokenizer
     else:
         self.retrain_tokenizer = False
         if self.algorithm == 'bert':
             from tokenizers import BertWordPieceTokenizer
             tokenizer = BertWordPieceTokenizer(
                 vocab_file=None if self._init_vocabulary is None else os.
                 path.join(self.cache_path, "bert_vocab.txt"))
             tokenizer.enable_truncation(max_length=self.max_length)
             tokenizer.enable_padding(length=self.max_length)
             # train the tokenizer
             if self._init_vocabulary is None:
                 path = os.path.join(self.cache_path, 'train.txt')
                 with open(path, 'w') as f:
                     for i in chain(self.train_text, self.valid_text,
                                    self.test_text):
                         if len(i) == 0:
                             continue
                         f.write(i + "\n" if i[-1] != "\n" else i)
                 tokenizer.train(files=path,
                                 vocab_size=self.vocab_size,
                                 min_frequency=self.min_frequency,
                                 limit_alphabet=self.limit_alphabet,
                                 show_progress=True)
             tokenizer.save_model(self.tokenizer_path)
         elif self.algorithm in ('count', 'tf', 'tfidf'):
             if self.algorithm == 'count':
                 tokenizer = CountVectorizer(
                     input='content',
                     ngram_range=self.ngram_range,
                     min_df=self.min_frequency,
                     max_df=self.max_frequency,
                     max_features=self.vocab_size,
                     vocabulary=self._init_vocabulary,
                     tokenizer=_simple_tokenizer,
                     stop_words='english')
             elif self.algorithm in ('tf', 'tfidf'):
                 tokenizer = TfidfVectorizer(
                     input='content',
                     ngram_range=self.ngram_range,
                     min_df=self.min_frequency,
                     max_df=self.max_frequency,
                     max_features=self.vocab_size,
                     stop_words='english',
                     vocabulary=self._init_vocabulary,
                     tokenizer=_simple_tokenizer,
                     use_idf=False if self.algorithm == 'tf' else True)
             tokenizer.fit((_simple_preprocess(i) for i in chain(
                 self.train_text, self.valid_text, self.test_text)))
         else:
             raise NotImplementedError
         # save the pickled model
         with open(pkl_path, "wb") as f:
             pickle.dump(tokenizer, f)
     ### assign and return
     self._tokenizer = tokenizer
     return self._tokenizer
Example No. 27
# plot_list_histogram(ocr_lengths, 'OCR')
# print_statistics(ocr_lengths, 'OCR')

# plot_list_histogram(gs_lengths, 'GS')
# print_statistics(gs_lengths, 'GS')


# In[5]:


full_ocr_tokens_path = os.path.join('results', 'combined_ocr_tokens.pickle')
full_gs_tokens_path = os.path.join('results', 'combined_gs_tokens.pickle')

vocab_path = os.path.join('vocabularies', 'bert-base-cased-vocab.txt')
tokenizer = BertWordPieceTokenizer(vocab_path)

if not os.path.exists(full_ocr_tokens_path) or not os.path.exists(full_gs_tokens_path):        
    ocr_tokens = []
    gs_tokens = []
    for i in range(len(ocr_file_data)):
        current_ids = tokenizer.encode(ocr_file_data[i]).ids
        if len(current_ids) > 2000:
            continue
            
        ocr_tokens.append(current_ids)
        gs_tokens.append(tokenizer.encode(gs_file_data[i]).ids)
    
    with open(full_ocr_tokens_path, 'wb') as ocr_handle:
        pickle.dump(ocr_tokens, ocr_handle, protocol=-1)
    
Example No. 28
# Random seed define
SEED_NUM = 1234
tf.random.set_seed(SEED_NUM)
np.random.seed(SEED_NUM)

# Save the slow pretrained tokenizer
slow_tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased",
                                               lowercase=False)
save_path = "bert-base-multilingual-cased/"
if not os.path.exists(save_path):
    os.makedirs(save_path)
slow_tokenizer.save_pretrained(save_path)

# Load the fast tokenizer from saved file
tokenizer = BertWordPieceTokenizer("bert-base-multilingual-cased/vocab.txt",
                                   lowercase=False)

# get dataset
train_data_url = "https://korquad.github.io/dataset/KorQuAD_v1.0_train.json"
train_path = keras.utils.get_file("train.json", train_data_url)
eval_data_url = "https://korquad.github.io/dataset/KorQuAD_v1.0_dev.json"
eval_path = keras.utils.get_file("eval.json", eval_data_url)

wget.download(
    'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json',
    out='./bert-base-multilingual-cased/')
os.rename(
    './bert-base-multilingual-cased/bert-base-multilingual-cased-config.json',
    './bert-base-multilingual-cased/config.json')

wget.download(
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer



if __name__ == '__main__': 
# =============================================================================
# Bert Tokenizer Format   
# =============================================================================
    from tokenizers import BertWordPieceTokenizer
    
    # Initialize an empty BERT tokenizer
    tokenizer = BertWordPieceTokenizer(
      clean_text=False,
      handle_chinese_chars=False,
      strip_accents=False,
      lowercase=True,
    )
    
    # prepare text files to train vocab on them
    files = ['data/merged_CC.txt', 'data/merged_wiki.txt']
    
    # train BERT tokenizer
    tokenizer.train(
      files,
      vocab_size=50000,
      min_frequency=2,
      show_progress=True,
      special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'],
      limit_alphabet=1000,
      wordpieces_prefix="##"
    )
Example No. 30
import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import tensorflow as tf
import traceback
from keras.backend import set_session
import numpy as np
from tokenizers import BertWordPieceTokenizer
from keras.models import load_model
from keras_bert import get_custom_objects
from flask import Flask
from flask import request, jsonify

max_len = 300

# Load the fast tokenizer from saved file
tokenizer = BertWordPieceTokenizer("chinese_L-12_H-768_A-12/vocab.txt",
                                   lowercase=True)


class SquadExample:
    def __init__(self, question, context, start_char_idx, answer_text,
                 all_answers):
        self.question = question
        self.context = context
        self.start_char_idx = start_char_idx
        self.answer_text = answer_text
        self.all_answers = all_answers
        self.skip = False

    def preprocess(self):
        context = self.context
        question = self.question