Example No. 1
    def __init__(self, num_class=4):
        """Constructor"""

        super(BagOfEmbeddings, self).__init__()

        tokenizer = CharBPETokenizer('../Tokenize/thyme-tokenizer-vocab.json',
                                     '../Tokenize/thyme-tokenizer-merges.txt')
        vocab_size = tokenizer.get_vocab_size()

        self.embed = nn.Embedding(num_embeddings=vocab_size,
                                  embedding_dim=cfg.getint('model', 'emb_dim'))

        self.hidden1 = nn.Linear(in_features=cfg.getint('model', 'emb_dim'),
                                 out_features=cfg.getint(
                                     'model', 'hidden_size'))

        self.relu = nn.ReLU()

        self.hidden2 = nn.Linear(
            in_features=cfg.getint('model', 'hidden_size'),
            out_features=cfg.getint('model', 'hidden_size'))

        self.dropout = nn.Dropout(cfg.getfloat('model', 'dropout'))

        self.classif = nn.Linear(in_features=cfg.getint(
            'model', 'hidden_size'),
                                 out_features=num_class)
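The excerpt above only shows the constructor. A minimal forward-pass sketch, assuming the input is a (batch, seq_len) tensor of token ids and that the "bag of embeddings" is a simple mean over the embedded tokens (both assumptions, not shown in the excerpt):

    def forward(self, texts):
        """Hypothetical forward pass; mean-pooling is an assumption"""

        emb = self.embed(texts)               # (batch, seq_len, emb_dim)
        pooled = torch.mean(emb, dim=1)       # average over the token dimension
        out = self.relu(self.hidden1(pooled))
        out = self.relu(self.hidden2(out))
        out = self.dropout(out)
        return self.classif(out)              # (batch, num_class)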
Example No. 2
    def test_basic_encode(self, openai_files):
        tokenizer = CharBPETokenizer(openai_files["vocab"],
                                     openai_files["merges"])

        output = tokenizer.encode("My name is John", "pair")
        assert output.ids == [0, 253, 1362, 544, 0, 7, 12662, 2688]
        assert output.tokens == [
            "<unk>",
            "y</w>",
            "name</w>",
            "is</w>",
            "<unk>",
            "o",
            "hn</w>",
            "pair</w>",
        ]
        assert output.offsets == [
            (0, 1),
            (1, 2),
            (3, 7),
            (8, 10),
            (11, 12),
            (12, 13),
            (13, 15),
            (0, 4),
        ]
        assert output.type_ids == [0, 0, 0, 0, 0, 0, 0, 1]
Example No. 3
    def __init__(self, num_classes=3):
        """We have some of the best constructors in the world"""

        super(TransformerClassifier, self).__init__()

        tokenizer = CharBPETokenizer('../Tokenize/thyme-tokenizer-vocab.json',
                                     '../Tokenize/thyme-tokenizer-merges.txt')
        vocab_size = tokenizer.get_vocab_size()

        self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                      embedding_dim=cfg.getint(
                                          'model', 'emb_dim'))

        self.position = PositionalEncoding(
            embedding_dim=cfg.getint('model', 'emb_dim'))

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=cfg.getint('model', 'emb_dim'),
            nhead=cfg.getint('model', 'num_heads'),
            dim_feedforward=cfg.getint('model', 'feedforw_dim'))

        self.trans_encoder = nn.TransformerEncoder(encoder_layer=encoder_layer,
                                                   num_layers=cfg.getint(
                                                       'model', 'num_layers'))

        self.dropout = nn.Dropout(cfg.getfloat('model', 'dropout'))

        self.linear = nn.Linear(in_features=cfg.getint('model', 'emb_dim'),
                                out_features=num_classes)

        self.init_weights()
Example No. 4
    def test_train_from_iterator(self):
        text = ["A first sentence", "Another sentence", "And a last one"]
        tokenizer = CharBPETokenizer()
        tokenizer.train_from_iterator(text, show_progress=False)

        output = tokenizer.encode("A sentence")
        assert output.tokens == ["A</w>", "sentence</w>"]
Example No. 5
 def __init__(self, text_list, vocab_size, lazy=False):
     if not lazy:
         self.tokenizer = CharBPETokenizer()
         self.tokenizer.train(text_list,
                              vocab_size=vocab_size,
                              special_tokens=[PAD, BOS, EOS, "<unk>"])
         self.tokenizer.add_special_tokens([PAD, BOS, EOS])
     else:
         self.tokenizer = None
Example No. 6
    def __init__(self, cache_dir, max_length=None, vocab_size=400):
        self.vocab_size = vocab_size
        self.max_length = max_length
        self.cache_dir = cache_dir
        self.name = "%d-%s" % (vocab_size, max_length)
        self.tokenizer = None

        vocab = os.path.join(self.cache_dir, self.name + '-vocab.json')
        merges = os.path.join(self.cache_dir, self.name + '-merges.txt')
        if os.path.exists(vocab) and os.path.exists(merges):
            self.tokenizer = CharBPETokenizer(vocab, merges, lowercase=True)
            print('Using cached HuggingFaceTokenizer')
Example No. 7
    def __init__(self, path='subword/'):
        """ 
        Args:
            path: str, a path to vocab file.
        """
        
        # Load vocab
        self.subword_tokenizer = CharBPETokenizer(vocab_file=path+"/bpe-vocab.json", merges_file=path+"/bpe-merges.txt")

        self.encode = self._encode_subwords
        self.id_to_token = self._id_to_subword()
        self.token_to_id = self._subword_to_id()
Example No. 8
 def test_lowercase(self, openai_files):
     tokenizer = CharBPETokenizer(openai_files["vocab"],
                                  openai_files["merges"],
                                  lowercase=True)
     output = tokenizer.encode("My name is John",
                               "pair",
                               add_special_tokens=False)
     assert output.ids == [547, 1362, 544, 2476, 2688]
     assert output.tokens == [
         "my</w>", "name</w>", "is</w>", "john</w>", "pair</w>"
     ]
     assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15), (0, 4)]
     assert output.type_ids == [0, 0, 0, 0, 1]
Example No. 9
    def __init__(self, num_class=4):
        """Constructor"""

        super(LstmClassifier, self).__init__()

        tokenizer = CharBPETokenizer('../Tokenize/thyme-tokenizer-vocab.json',
                                     '../Tokenize/thyme-tokenizer-merges.txt')
        vocab_size = tokenizer.get_vocab_size()

        self.embed = nn.Embedding(num_embeddings=vocab_size,
                                  embedding_dim=cfg.getint('model', 'emb_dim'))
        self.lstm = nn.LSTM(input_size=cfg.getint('model', 'emb_dim'),
                            hidden_size=cfg.getint('model', 'hidden_size'))
        self.dropout = nn.Dropout(p=cfg.getfloat('model', 'dropout'))
        self.linear = nn.Linear(in_features=cfg.getint('model', 'hidden_size'),
                                out_features=num_class)
Example No. 10
    def load(vocab_file=None):
        if not os.path.exists(vocab_file):
            raise Exception("{} is not exist".format(vocab_file))
        path, filename = os.path.split(vocab_file)
        ttype = filename.split("_")[0]
        merges_file = os.path.join(
            path, filename.replace("vocab.json", "merges.txt"))
        if ttype == "byte":
            if not os.path.exists(merges_file):
                raise Exception("{} is not exist".format(merges_file))
            tokenizer = ByteLevelBPETokenizer(
                add_prefix_space=True,  # required
                lowercase=True,  # required
                unicode_normalizer=None,  # required
                vocab_file=vocab_file,
                merges_file=merges_file,
                dropout=None,
                continuing_subword_prefix=None,
                end_of_word_suffix=None)

        elif ttype == "char":
            if not os.path.exists(merges_file):
                raise Exception("{} is not exist".format(merges_file))
            tokenizer = CharBPETokenizer(
                unk_token=unk_token,  # required
                suffix=suffix_token,  # required
                lowercase=True,  # required
                unicode_normalizer=None,  # required
                vocab_file=vocab_file,
                merges_file=merges_file,
                dropout=None)

        elif ttype == "bert":
            tokenizer = BertWordPieceTokenizer(
                clean_text=True,  # required
                handle_chinese_chars=True,  # required
                strip_accents=True,  # required
                lowercase=True,  # required
                vocab_file=vocab_file,
                # add_special_tokens=True,
                unk_token=BUNK,
                sep_token=BSEP,
                cls_token=BCLS,
                wordpieces_prefix=BPRE)

        elif ttype == "sent":
            if not os.path.exists(merges_file):
                raise Exception("{} is not exist".format(merges_file))
            tokenizer = SentencePieceBPETokenizer(
                add_prefix_space=True,  # required
                unk_token=unk_token,
                replacement=rep_token,
                vocab_file=vocab_file,
                merges_file=merges_file,
                dropout=None)

        else:
            raise Exception("Not implement yet")

        return tokenizer
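A hedged usage sketch for the loader above (assuming load is callable as a plain helper): the tokenizer type is taken from the filename prefix before the first underscore, and the merges file is expected to sit next to the vocab file. The path below is hypothetical.

# "char_corpus_vocab.json" selects the CharBPETokenizer branch and implies
# a sibling "char_corpus_merges.txt" in the same directory.
tokenizer = load("models/char_corpus_vocab.json")
print(tokenizer.get_vocab_size())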
Example No. 11
def get_data():
    transcript_folder = os.path.join('data', 'transcripts')
    summary_folder = os.path.join('data', 'summary')

    train_files, train_result_files, test_files, test_result_files = get_dataset_files(transcript_folder,
                                                                                       summary_folder)
    train_data, train_results, test_data, test_results = get_dataset(train_files, train_result_files, test_files,
                                                                     test_result_files)

    tokenizer = CharBPETokenizer()
    all_files = np.concatenate([train_files, train_result_files, test_files, test_result_files])
    tokenizer.train(list(all_files))

    train_data = tokenize_data(tokenizer, train_data)
    test_data = tokenize_data(tokenizer, test_data)

    return train_data, train_results, test_data, test_results
Example No. 12
def test():
    """Test trained tokenizer"""

    tokenizer = CharBPETokenizer('./thyme-tokenizer-vocab.json',
                                 './thyme-tokenizer-merges.txt')

    vocab = tokenizer.get_vocab()
    print('vocab size:', len(vocab))

    encoded = tokenizer.encode('patient dr. who diagnosed with brain abc')
    encoded.pad(15)

    print('encoded:', encoded.ids)
    print('decoded:', tokenizer.decode(encoded.ids))

    print(encoded.tokens)
    print(encoded.attention_mask)
Example No. 13
 def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
     kwargs.setdefault("unk_token", unk_token)
     super().__init__(
         CharBPETokenizer(vocab_file=vocab_file,
                          merges_file=merges_file,
                          unk_token=unk_token,
                          lowercase=True),
         **kwargs,
     )
Example No. 14
def main():
    #argparser
    parser = argparse.ArgumentParser(
        prog="train_mlm_camembert_thai.py",
        description="train mlm for Camembert with huggingface Trainer",
    )

    #required
    parser.add_argument("--bpe_tokenizer",
                        type=str,
                        default='sentencepiece',
                        help='Specify the name of BPE Tokenizer')
    parser.add_argument("--vocab_size", type=int, default=52000)
    parser.add_argument("--min_frequency", type=int, default=2)
    parser.add_argument(
        "--train_dir",
        type=str,
    )
    parser.add_argument(
        "--output_dir",
        type=str,
    )
    parser.add_argument("--ext", type=str, default='.txt')

    args = parser.parse_args()

    fnames = [str(x) for x in glob.glob(f"{args.train_dir}/*{args.ext}")]

    # Initialize a tokenizer
    if args.bpe_tokenizer == 'byte_level':
        _BPE_TOKENIZER = ByteLevelBPETokenizer()
    if args.bpe_tokenizer == 'char':
        _BPE_TOKENIZER = CharBPETokenizer()
    if args.bpe_tokenizer == 'sentencepiece':
        _BPE_TOKENIZER = SentencePieceBPETokenizer()

    tokenizer = _BPE_TOKENIZER

    # Customize training
    tokenizer.train(files=fnames,
                    vocab_size=args.vocab_size,
                    min_frequency=args.min_frequency,
                    special_tokens=[
                        "<s>",
                        "<pad>",
                        "</s>",
                        "<unk>",
                        "<mask>",
                    ])

    # Save files to disk
    tokenizer.save_model(args.output_dir)

    #test
    tokenizer = CamembertTokenizer.from_pretrained(args.output_dir)
    print(tokenizer.encode_plus('สวัสดีครับ hello world'))
Example No. 15
class SubwordEncoder:
    "Subword tokenization" 

    def __init__(self, path='subword/'):
        """ 
        Args:
            path: str, a path to vocab file.
        """
        
        # Load vocab
        self.subword_tokenizer = CharBPETokenizer(vocab_file=path+"/bpe-vocab.json", merges_file=path+"/bpe-merges.txt")

        self.encode = self._encode_subwords
        self.id_to_token = self._id_to_subword()
        self.token_to_id = self._subword_to_id()

    def get_vocab_size(self):
        return self.subword_tokenizer.get_vocab_size()   

    def _encode_subwords(self, sentence, with_eos):
        """ 
        Args:
            sentence: str, texts to be encoded.
            with_eos: end with <EOS> token.
        Returns:
            tokens: list, encoded sequence.
        """
        tokens = self.subword_tokenizer.encode(sentence).ids
        if with_eos:
            tokens += [2] # 2 is the id of <EOS> token
        return tokens

    def _id_to_subword(self):
        id2subword = {}
        for i in range(self.get_vocab_size()):
            id2subword[i] = self.subword_tokenizer.id_to_token(i)
        return id2subword

    def _subword_to_id(self):
        subword2id = {}
        for i in range(self.get_vocab_size()):
            subword2id[self.subword_tokenizer.id_to_token(i)] = i
        return subword2id
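A brief usage sketch for the SubwordEncoder class above, assuming the bpe-vocab.json / bpe-merges.txt pair already exists under subword/ and that id 2 is indeed the <EOS> token, as the comment in _encode_subwords states:

encoder = SubwordEncoder(path='subword/')
ids = encoder.encode('hello world', with_eos=True)    # ends with the <EOS> id 2
print(encoder.get_vocab_size())
print([encoder.id_to_token[i] for i in ids])          # map ids back to subword strings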
Example No. 16
def to_lstm_inputs(texts, max_len=None):
  """Padded at the beginning rather than at the end"""

  tokenizer = CharBPETokenizer(
    '../Tokenize/thyme-tokenizer-vocab.json',
    '../Tokenize/thyme-tokenizer-merges.txt')
  seqs = [tokenizer.encode(text).ids for text in texts]

  if max_len is None:
    # set max_len to the length of the longest sequence
    max_len = max(len(id_seq) for id_seq in seqs)

  ids = torch.zeros(len(seqs), max_len, dtype=torch.long)

  for i, seq in enumerate(seqs):
    if len(seq) > max_len:
      seq = seq[:max_len]
    ids[i, -len(seq):] = torch.tensor(seq)

  return ids
Example No. 17
def to_token_id_sequences(texts, max_len=None):
  """Matrix of token ids"""

  tokenizer = CharBPETokenizer(
    '../Tokenize/thyme-tokenizer-vocab.json',
    '../Tokenize/thyme-tokenizer-merges.txt')
  seqs = [tokenizer.encode(text).ids for text in texts]

  if max_len is None:
    # set max_len to the length of the longest sequence
    max_len = max(len(id_seq) for id_seq in seqs)

  ids = torch.zeros(len(seqs), max_len, dtype=torch.long)

  for i, seq in enumerate(seqs):
    if len(seq) > max_len:
      seq = seq[:max_len]
    ids[i, :len(seq)] = torch.tensor(seq)

  return ids
Example No. 18
def train(args):

    tokenizer = CharBPETokenizer()

    tokenizer.train([args.corpus], vocab_size=1000)

    tokenizer.save("src/dev_scripts/tokenizer.json")
Example No. 19
def create_tokenizer_imbd(data_path, file_name, vocab_size):
    #df = pd.read_csv(os.path.join(data_path, file_name))
    tokenizer = CharBPETokenizer()
    tokenizer.train(
        os.path.join(data_path, file_name),
        vocab_size=vocab_size,
        min_frequency=2,
        show_progress=True,
        special_tokens=["[CLS]", "[PAD]", "[MASK]", "[UNK]", "[SEP]"])

    print("[CLS]: {}, [PAD]: {}, [MASK]: {}, [UNK]: {}, [SEP]: {}".format(
        str(tokenizer.token_to_id("[CLS]")),
        str(tokenizer.token_to_id("[PAD]")),
        str(tokenizer.token_to_id("[MASK]")),
        str(tokenizer.token_to_id("[UNK]")),
        str(tokenizer.token_to_id("[SEP]"))))

    tokenizer.save(data_path, "tokenizer")
Example No. 20
def to_transformer_inputs(texts, max_len=None):
  """Matrix of token ids and a square attention mask for eash sample"""

  tokenizer = CharBPETokenizer(
    '../Tokenize/thyme-tokenizer-vocab.json',
    '../Tokenize/thyme-tokenizer-merges.txt')
  seqs = [tokenizer.encode(text).ids for text in texts]

  if max_len is None:
    # set max_len to the length of the longest sequence
    max_len = max(len(id_seq) for id_seq in seqs)

  ids = torch.zeros(len(seqs), max_len, dtype=torch.long)
  mask = torch.zeros(len(seqs), max_len, max_len, dtype=torch.long)

  for i, seq in enumerate(seqs):
    if len(seq) > max_len:
      seq = seq[:max_len]
    ids[i, :len(seq)] = torch.tensor(seq)
    mask[i, :len(seq), :len(seq)] = 1

  return ids, mask
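A short usage sketch for to_transformer_inputs (its companions to_lstm_inputs and to_token_id_sequences are called the same way), assuming the thyme-tokenizer vocab/merges files exist at the relative paths hard-coded above:

texts = ['patient diagnosed with pneumonia', 'no acute findings']
ids, mask = to_transformer_inputs(texts, max_len=10)
print(ids.shape)     # torch.Size([2, 10])
print(mask.shape)    # torch.Size([2, 10, 10])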
Example No. 21
  def __init__(self, num_class=3):
    """Constructor"""

    super(BagOfEmbeddings, self).__init__()

    tokenizer = CharBPETokenizer(
      '../Tokenize/thyme-tokenizer-vocab.json',
      '../Tokenize/thyme-tokenizer-merges.txt')
    vocab_size = tokenizer.get_vocab_size()

    self.embed = nn.Embedding(
      num_embeddings=vocab_size,
      embedding_dim=cfg.getint('model', 'emb_dim'))

    self.posit = positions.BertPositionalEncoding.from_pretrained(
      'bert-base-uncased')

    self.dropout = nn.Dropout(cfg.getfloat('model', 'dropout'))

    self.classif = nn.Linear(
      in_features=cfg.getint('model', 'emb_dim'),
      out_features=num_class)
Example No. 22
    def build(self, texts):
        if self.tokenizer is not None:
            return

        tmp_file = tempfile.NamedTemporaryFile()

        with open(tmp_file.name, "w") as f:
            f.write(' '.join(texts).lower())

        self.tokenizer = CharBPETokenizer(lowercase=True)
        self.tokenizer.train(
            [tmp_file.name],
            vocab_size=self.vocab_size,
            special_tokens=[
                NUL_token,
                PAD_token,
                BOS_token,
                UNK_token,
            ],
        )
        os.makedirs(self.cache_dir, exist_ok=True)
        self.tokenizer.save(self.cache_dir, self.name)
Example No. 23
    def __init__(self, args):
        self.args = args
        if self.args.type == "byte":
            self.tokenizer = ByteLevelBPETokenizer(
                add_prefix_space=True,  # required
                lowercase=True,  # required
                unicode_normalizer=None,  # required
                vocab_file=None,
                merges_file=None,
                dropout=None,
                continuing_subword_prefix=None,
                end_of_word_suffix=None)

        elif self.args.type == "char":
            self.tokenizer = CharBPETokenizer(
                unk_token=unk_token,  # required
                suffix=suffix_token,  # required
                lowercase=True,  # required
                unicode_normalizer=None,  # required
                vocab_file=None,
                merges_file=None,
                dropout=None)

        elif self.args.type == "bert":
            self.tokenizer = BertWordPieceTokenizer(
                clean_text=True,  # required
                handle_chinese_chars=True,  # required
                strip_accents=True,  # required
                lowercase=True,  # required
                vocab_file=None,
                # add_special_tokens=True,
                unk_token=BUNK,
                sep_token=BSEP,
                cls_token=BCLS,
                wordpieces_prefix=BPRE)

        elif self.args.type == "sent":
            self.tokenizer = SentencePieceBPETokenizer(
                add_prefix_space=True,  # required
                unk_token=unk_token,
                replacement=rep_token,
                vocab_file=None,
                merges_file=None,
                dropout=None)

        else:
            raise Exception("Not implement yet")

        pass
Example No. 24
    def _cbpe(self):
        tokenizer = CharBPETokenizer(
            vocab=self.conf.vocab,
            merges=self.conf.merges,
            unk_token=self.conf.cbpe_unk_token,
            suffix=self.conf.suffix,
            dropout=self.conf.dropout,
            lowercase=self.conf.lowercase,
            unicode_normalizer=self.conf.unicode_normalizer,
            bert_normalizer=self.conf.bert_normalizer,
            split_on_whitespace_only=self.conf.split_on_whitespace_only,
        )

        tokenizer.train(
            files=self.files,
            vocab_size=self.conf.vocab_size,
            min_frequency=self.conf.min_frequency,
            special_tokens=self.conf.special_tokens,
            limit_alphabet=self.conf.limit_alphabet,
            initial_alphabet=self.conf.initial_alphabet,
            suffix=self.conf.cpbe_train_shuffix,
        )

        return tokenizer
Example No. 25
class HuggingFaceTokenizer:
    def __init__(self, cache_dir, max_length=None, vocab_size=400):
        self.vocab_size = vocab_size
        self.max_length = max_length
        self.cache_dir = cache_dir
        self.name = "%d-%s" % (vocab_size, max_length)
        self.tokenizer = None

        vocab = os.path.join(self.cache_dir, self.name + '-vocab.json')
        merges = os.path.join(self.cache_dir, self.name + '-merges.txt')
        if os.path.exists(vocab) and os.path.exists(merges):
            self.tokenizer = CharBPETokenizer(vocab, merges, lowercase=True)
            print('Using cached HuggingFaceTokenizer')

    def build(self, texts):
        if self.tokenizer is not None:
            return

        tmp_file = tempfile.NamedTemporaryFile()

        with open(tmp_file.name, "w") as f:
            f.write(' '.join(texts).lower())

        self.tokenizer = CharBPETokenizer(lowercase=True)
        self.tokenizer.train(
            [tmp_file.name],
            vocab_size=self.vocab_size,
            special_tokens=[
                NUL_token,
                PAD_token,
                BOS_token,
                UNK_token,
            ],
        )
        os.makedirs(self.cache_dir, exist_ok=True)
        self.tokenizer.save(self.cache_dir, self.name)

    def encode(self, text):
        token_ids = self.tokenizer.encode(text.lower()).ids
        token_ids = token_ids[:self.max_length]

        return token_ids

    def decode(self, tokens, skip_special_tokens=True):
        # Note: special tokens were not being skipped here even with
        # skip_special_tokens=True, hence the commented-out manual filter below.
        text = self.tokenizer.decode(
            tokens,
            # [token for token in tokens if token > 3],
            skip_special_tokens=skip_special_tokens,
        )
        return text

    def decode_plus(self, token_batch):
        sentences = []
        for tokens in token_batch:
            sentences.append(self.decode(tokens))
        return sentences
Example No. 26
 def __init__(self, tokenizers=None, cleaner=english_cleaners):
     if tokenizers is None:
         tokenizers = CharBPETokenizer(
             './BPE-1024/-vocab.json',
             './BPE-1024/-merges.txt',
             lowercase=True,
         )
     punctuation = string.punctuation
     punctuation = punctuation.replace("+", "")
     punctuation = punctuation.replace("&", "")
     table = str.maketrans(punctuation, " " * len(punctuation))
     if cleaner is not None:
         print('Using cleaner!')
     self.table = table
     self.cleaner = cleaner
     self.token = tokenizers
     self.vocab_size = self.token.get_vocab_size()
Example No. 27
def create_tokenizer(data_path, vocab_size):

    tokenizer = CharBPETokenizer()
    tokenizer.train([
        os.path.join(data_path, file) for file in
        [f
         for f in os.listdir(data_path) if f.find("uncased_chunk") != -1][:20]
    ],
                    vocab_size=vocab_size,
                    min_frequency=2,
                    show_progress=True,
                    special_tokens=[
                        "[CLS]", "[PAD]", "[MASK]", "[UNK]", "[SEP]"
                    ])

    print("[CLS]: {}, [PAD]: {}, [MASK]: {}, [UNK]: {}, [SEP]: {}".format(
        str(tokenizer.token_to_id("[CLS]")),
        str(tokenizer.token_to_id("[PAD]")),
        str(tokenizer.token_to_id("[MASK]")),
        str(tokenizer.token_to_id("[UNK]")),
        str(tokenizer.token_to_id("[SEP]"))))

    tokenizer.save(data_path, "tokenizer")
Example No. 28
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device',
                        default='0,1,2,3',
                        type=str,
                        required=False,
                        help='which GPUs to use')
    parser.add_argument('--raw_data_path',
                        default='data/train.txt',
                        type=str,
                        required=False,
                        help='path to the raw training corpus')
    parser.add_argument('--batch_size',
                        default=2,
                        type=int,
                        required=False,
                        help='batch size for model inference')
    parser.add_argument('--model_path',
                        default='./model/epoch_5/model.bin',
                        type=str,
                        required=False,
                        help='path to the saved model')

    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    model_path = args.model_path

    # device
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # 此处设置程序使用哪些显卡
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    # tokenizer
    tokenizer = CharBPETokenizer("./vocab/bpe.tokenizer.json-vocab.json",
                                 './vocab/bpe.tokenizer.json-merges.txt')

    # model
    with open('./config/model_config.json', 'r', encoding='utf-8') as f:
        text = f.read()
        config = json.loads(text)
    model = GPT2LMHeadModel(config)
    model.load_state_dict(torch.load(model_path))
    model.eval()
    model.to(device)
Example No. 29
def train():
    """My main man"""

    base = os.environ['DATA_ROOT']
    corpus_path = base + 'Thyme/Text/train+dev+test/*'
    files = glob.glob(corpus_path)

    tokenizer = CharBPETokenizer(lowercase=True)
    tokenizer.train(files=files,
                    vocab_size=10000,
                    min_frequency=3,
                    show_progress=True)
    tokenizer.save('.', name='thyme-tokenizer')
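Once training finishes, the saved pair can be reloaded the way the other examples in this collection do (see Example No. 12); a minimal sketch, assuming save() wrote thyme-tokenizer-vocab.json and thyme-tokenizer-merges.txt to the current directory:

tokenizer = CharBPETokenizer('./thyme-tokenizer-vocab.json',
                             './thyme-tokenizer-merges.txt')
print('vocab size:', tokenizer.get_vocab_size())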
Example No. 30
def build_tokenizer(args):
    tokenizer = None
    if args.tokenizer_type == "bbpe":
        tokenizer = ByteLevelBPETokenizer(unicode_normalizer="nfkc")
    elif args.tokenizer_type == "cbpe":
        tokenizer = CharBPETokenizer(
            unk_token="<unk>",
            unicode_normalizer="nfkc",
            bert_normalizer=False,
            split_on_whitespace_only=True,
        )
    elif args.tokenizer_type == "wp":
        tokenizer = BertWordPieceTokenizer(
            clean_text=False,
            handle_chinese_chars=True,
            strip_accents=False,
            lowercase=False,
        )
    return tokenizer
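A hedged usage sketch for build_tokenizer, with a hypothetical namespace standing in for the parsed command-line arguments:

from types import SimpleNamespace

# Hypothetical args object; the real script presumably fills this via argparse.
args = SimpleNamespace(tokenizer_type='cbpe')
tokenizer = build_tokenizer(args)
print(type(tokenizer).__name__)   # CharBPETokenizer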