Example #1
    def __init__(self):
        self.config_path, self.checkpoint_path, self.dict_path, self.max_seq_len = config_name, ckpt_name, vocab_file, max_seq_len
        # Used globally so it can be called from django, flask, tornado, etc.
        global graph
        graph = tf.get_default_graph()
        global model
        model = load_trained_model_from_checkpoint(self.config_path, self.checkpoint_path,
                                                        seq_len=self.max_seq_len)
        model.summary(120)
        # If only one layer is selected, take only that layer's weights
        if len(layer_indexes) == 1:
            encoder_layer = model.get_layer(index=len(model.layers)-2).output
        # Otherwise iterate over the requested layers, take each layer's weights and concatenate them; shape: 768 * number_of_layers
        else:
            # layer_indexes must be [1,2,3,......12]
            all_layers = [model.get_layer(index=lay).output for lay in layer_indexes]
            encoder_layer = k_keras.concatenate(all_layers, -1)
        output_layer = NonMaskingLayer()(encoder_layer)
        model = Model(model.inputs, output_layer)

        # reader tokenizer
        self.token_dict = {}
        with codecs.open(self.dict_path, 'r', 'utf8') as reader:
            for line in reader:
                token = line.strip()
                self.token_dict[token] = len(self.token_dict)

        self.tokenizer = Tokenizer(self.token_dict)
    def __init__(self):
        self.config_path, self.checkpoint_path, self.dict_path, self.max_seq_len = config_name, ckpt_name, vocab_file, max_seq_len
        # Used globally so it can be called from django, flask, tornado, etc.
        global graph
        graph = tf.get_default_graph()
        global model
        model = load_trained_model_from_checkpoint(self.config_path,
                                                   self.checkpoint_path,
                                                   seq_len=self.max_seq_len)
        print(model.output)
        print(len(model.layers))
        # lay = model.layers
        # 104 layers in total; the first eight are the token/position/embedding layers etc.,
        # after that every group of 4 layers is (MultiHeadAttention, Dropout, Add, LayerNormalization),
        # giving 24 such groups in total
        layer_dict = []
        layer_0 = 7
        for i in range(12):
            layer_0 = layer_0 + 8
            layer_dict.append(layer_0)
        # if no layer is specified, output the model's own output
        if len(layer_indexes) == 0:
            encoder_layer = model.output
        # if only one layer is requested, take only that layer's weights; if the index is invalid, fall back to the last layer's output
        elif len(layer_indexes) == 1:
            if layer_indexes[0] in [i + 1 for i in range(23)]:
                encoder_layer = model.get_layer(
                    index=layer_dict[layer_indexes[0]]).output
            else:
                encoder_layer = model.get_layer(index=layer_dict[-1]).output
        # Otherwise iterate over the requested layers, take each layer's weights and concatenate them; shape: 768 * number_of_layers
        else:
            # layer_indexes must be [1,2,3,......12]
            # all_layers = [model.get_layer(index=lay).output if lay is not 1 else model.get_layer(index=lay).output[0] for lay in layer_indexes]
            all_layers = [
                model.get_layer(index=layer_dict[lay - 1]).output
                if lay in [i + 1 for i in range(23)] else model.get_layer(
                    index=layer_dict[-1]).output  # fall back to the last layer if the index is invalid
                for lay in layer_indexes
            ]
            print(layer_indexes)
            print(all_layers)
            all_layers_select = []
            for all_layers_one in all_layers:
                all_layers_select.append(all_layers_one)
            encoder_layer = Add()(all_layers_select)
            print(encoder_layer.shape)
        print("KerasBertEmbedding:")
        print(encoder_layer.shape)
        output_layer = NonMaskingLayer()(encoder_layer)
        model = Model(model.inputs, output_layer)
        # model.summary(120)
        # reader tokenizer
        self.token_dict = {}
        with codecs.open(self.dict_path, 'r', 'utf8') as reader:
            for line in reader:
                token = line.strip()
                self.token_dict[token] = len(self.token_dict)

        self.tokenizer = Tokenizer(self.token_dict)
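
A minimal usage sketch (not part of the original class) showing how the embedding model built above is typically called; `embedder` is a hypothetical instance of this class, and `graph`/`model` are the globals set in __init__:

import numpy as np

def encode_sentence(embedder, text):
    # encode() pads/truncates to max_seq_len and returns (token_ids, segment_ids)
    token_ids, segment_ids = embedder.tokenizer.encode(first=text, max_len=embedder.max_seq_len)
    with graph.as_default():
        # the Keras model takes [token_ids, segment_ids] batches and returns one vector per token
        return model.predict([np.array([token_ids]), np.array([segment_ids])])[0]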
Example #3
    def make_BERT(self):
        # Location of BERT model
        description = self._get_description('bert-' + self.lang)
        if description is not None:
            self.lang = description["lang"]
            config_file = description["path-config"]
            weight_file = description["path-weights"]
            vocab_file = description["path-vocab"]

            print('init BERT')

            # load the pretrained model
            with self.graph.as_default():
                #    with self.session.as_default():
                #with tf.variable_scope('', reuse=tf.AUTO_REUSE):
                self.bert_model = load_trained_model_from_checkpoint(
                    config_file, weight_file)
                self.bert_model.summary(line_length=120)
                self.bert_model._make_predict_function()

            # init the tokenizer
            token_dict = {}
            with codecs.open(vocab_file, 'r', 'utf8') as reader:
                for line in reader:
                    token = line.strip()
                    token_dict[token] = len(token_dict)
            print('token_dict size:', len(token_dict))
            self.bert_tokenizer = Tokenizer(token_dict, cased=True)
 def init_bert(self,config):
     bert_config = config['bert']['config_json']
     bert_checkpoint = config['bert']['bert_ckpt']
     bert_vocab = config['bert']['bert_vocab']
     bert_vocabs = load_vocabulary(bert_vocab)
     self.bert_token = Tokenizer(bert_vocabs)
     self.bert = self.load_bert(bert_config, bert_checkpoint)
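
The load_bert helper referenced above is not shown; a minimal sketch of what such a wrapper might look like (an assumption, not the project's actual code):

from keras_bert import load_trained_model_from_checkpoint

def load_bert(self, config_path, checkpoint_path):
    # load the pretrained BERT graph; seq_len=None keeps the input length flexible
    return load_trained_model_from_checkpoint(config_path, checkpoint_path, seq_len=None)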
Example #5
    def test_uncased(self):
        tokens = [
            '[PAD]', '[UNK]', '[CLS]', '[SEP]', 'want', '##want',
            '##ed', 'wa', 'un', 'runn', '##ing', ',',
            '\u535A', '\u63A8',
        ]
        token_dict = {token: i for i, token in enumerate(tokens)}
        tokenizer = Tokenizer(token_dict)
        text = u"UNwant\u00E9d, running  \nah\u535A\u63A8zzz\u00AD"
        tokens = tokenizer.tokenize(text)
        expected = [
            '[CLS]', 'un', '##want', '##ed', ',', 'runn', '##ing',
            'a', '##h', '\u535A', '\u63A8', 'z', '##z', '##z',
            '[SEP]',
        ]
        self.assertEqual(expected, tokens)
        indices, segments = tokenizer.encode(text)
        expected = [2, 8, 5, 6, 11, 9, 10, 1, 1, 12, 13, 1, 1, 1, 3]
        self.assertEqual(expected, indices)
        expected = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        self.assertEqual(expected, segments)

        decoded = tokenizer.decode(indices)
        expected = [
            'un', '##want', '##ed', ',', 'runn', '##ing',
            '[UNK]', '[UNK]', '\u535A', '\u63A8', '[UNK]', '[UNK]', '[UNK]',
        ]
        self.assertEqual(expected, decoded)
Example #6
    def make_BERT(self):
        # Location of BERT model
        description = self.get_description('bert-base-' + self.lang)
        if description is not None:
            self.lang = description["lang"]
            config_file = description["path-config"]
            weight_file = description["path-weights"]
            vocab_file = description["path-vocab"]

            print('init BERT')

            # load the pretrained model
            with self.graph.as_default():
                # there are different typical pooling strategies for getting BERT features:
                # - concatenation of 4 last layers (the one from the original BERT paper, BERT_embed_size is then 3072)
                # - last layer (BERT_embed_size is 768)
                # - average of 4 last layers (BERT_embed_size is 768)
                # - sum of the 4 last layers (BERT_embed_size is 768)
                self.bert_model = load_trained_model_from_checkpoint(
                    config_file, weight_file, output_layer_num=4)
                self.bert_model.summary(line_length=120)
                self.bert_model._make_predict_function()

            # init the tokenizer
            token_dict = {}
            with codecs.open(vocab_file, 'r', 'utf8') as reader:
                for line in reader:
                    token = line.strip()
                    token_dict[token] = len(token_dict)
            print('token_dict size:', len(token_dict))
            self.bert_tokenizer = Tokenizer(token_dict, cased=True)
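
The comments above list several pooling strategies; output_layer_num=4 gives the concatenation variant (3072 dims). A minimal sketch of the "average of the 4 last layers" alternative, assuming keras-bert's default layer naming ('Encoder-<n>-FeedForward-Norm' for the output of the n-th transformer block) and a 12-layer BERT-Base checkpoint:

from keras.layers import Average
from keras.models import Model
from keras_bert import load_trained_model_from_checkpoint

bert = load_trained_model_from_checkpoint(config_file, weight_file)  # default output_layer_num=1
last_four = [bert.get_layer('Encoder-%d-FeedForward-Norm' % n).output for n in range(9, 13)]
pooled = Average(name='avg_last_4_layers')(last_four)  # (batch, seq_len, 768) instead of 3072
feature_model = Model(bert.inputs, pooled)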
Example #7
	def _text_process(self, text):
		Tokener = Tokenizer(self.vocab_dict)
		encoder = [Tokener.encode(first=doc[0],second=doc[1], max_len=self.max_seq_len) for doc in text]
		input_ids = [i[0] for i in encoder]
		input_type = [i[1] for i in encoder]
		input_mask = [[0 if l==0 else 1 for l in i] for i in input_ids]
		return (input_ids,input_mask,input_type)
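
A hypothetical call (the `proc` instance and its vocab_dict/max_seq_len are assumptions): every element of `text` must be a (first, second) sentence pair, because encode() is given both arguments.

pairs = [("How old are you?", "I am 25."),
         ("Where do you live?", "In Beijing.")]
input_ids, input_mask, input_type = proc._text_process(pairs)  # three lists, one entry per pair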
Example #8
def load_bert_data(raw_file, train=True):
    config = Config()
    dict_path = './corpus/vocab.txt'
    token_dict = {}
    with codecs.open(dict_path, 'r', 'utf8') as reader:
        for line in reader:
            token = line.strip()
            token_dict[token] = len(token_dict)
    tags2id, id2tags = load_tags('tags.txt')
    x_ids = list()
    x_segments = list()
    x_label = list()
    with codecs.open(raw_file, encoding='utf-8') as f:
        for line in f:
            x = json.loads(line)
            input_sent = x['title']
            tokenizer = Tokenizer(token_dict)
            x_sent_id, x_sent_segment = tokenizer.encode(
                input_sent, max_len=config.max_len_word)
            x_ids.append(x_sent_id)
            x_segments.append(x_sent_segment)
            if train:
                y = load_label(x, tags2id)
                x_label.append(y)
    x_label = np.asarray(x_label)
    return x_ids, x_segments, x_label, id2tags, None
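
A hedged usage sketch (the file path below is a placeholder, not from the original project); the ids and segments feed the two inputs of a keras-bert model:

import numpy as np

x_ids, x_segments, x_label, id2tags, _ = load_bert_data('./corpus/train.json', train=True)
x_ids, x_segments = np.asarray(x_ids), np.asarray(x_segments)  # token-id and segment-id inputs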
    def __init__(self,
                 with_bert=True,
                 fine_tune=True,
                 language_backbone='chinese_wwm_base',
                 batch_size=16):
        """

        :param with_bert:
        :param fine_tune:
        :param model_arch: pointer, seq2seq or transformer
        """
        self.language_backbone = language_backbone
        self.with_bert = with_bert
        self.fine_tune = fine_tune
        self.paths = download_pretrained_bert(language_backbone)
        self.bs = batch_size
        token_dict = {}
        with codecs.open(self.paths.vocab, 'r', 'utf8') as reader:
            for line in reader:
                token = line.strip()
                token_dict[token] = len(token_dict)
        self.tokenizer = Tokenizer(token_dict)
        self.dataloader = Dataloader_v1(tokenizer=self.tokenizer,
                                        batch_size=self.bs,
                                        split_rate=0.1)
Example #10
 def __init__(self, label, bs = 16, token_dict = None):
     self.batch_size = bs
     self.random = random
     self.ans = label
     self.maxlen_doc = 512
     self.tokenizer = Tokenizer(token_dict)
     self.iter_index = np.arange(len(self.ans))
Example #11
File: util.py Project: andy94077/SDML
def load_task2_testX(dict_path, data_dir):
    if not os.path.exists(os.path.join(
            data_dir, 'task2_testX.npy')) or not os.path.exists(
                os.path.join(data_dir, 'task2_test_seg.npy')):
        df = pd.read_csv(os.path.join(data_dir, 'task2_public_testset.csv'),
                         dtype=str)
        abstract = df.values[:, 2]

        # collect words
        token_dict = load_vocabulary(dict_path)
        tokenizer = Tokenizer(token_dict)
        input_data = []
        input_seg = []
        seq_len = 512  # the maximum needed would be 638, but bert-BASE only supports up to 512
        for i in tqdm(abstract):
            j = i.replace('$$$', ' ')
            idx, seg = tokenizer.encode(j, max_len=seq_len)
            input_data.append(idx)
            input_seg.append(seg)
        X = np.asarray(input_data)
        seg = np.asarray(input_seg)

        np.save(os.path.join(data_dir, 'task2_testX.npy'), X)
        np.save(os.path.join(data_dir, 'task2_test_seg.npy'), seg)
    else:
        X, seg = np.load(os.path.join(data_dir, 'task2_testX.npy')), np.load(
            os.path.join(data_dir, 'task2_test_seg.npy'))
    return X, seg
    def work1(self, text1):
        out = []
        if type(text1) == str:
            text1 = [text1]
        for i in text1:
            resu = i.replace('|', '').replace(' ', '').replace('“', '“').replace('”', '”') \
                .replace('‘', '‘').replace('’', '’').replace('〔', '(').replace('〕', ')').replace('/', '') \
                .replace('·', '·').replace('•', '·').replace("\\n", "\n").replace("\\r", "\r").replace("\\t", "\t")
            resu = re.split(r'\s+', resu)
            dr = re.compile(r'<[^>]+>', re.S)
            dd = dr.sub('', '。'.join(resu))
            line = re.sub(self.restr, '', dd)
            eng = [",", "!", "?", ":", ";", "(", ")", "[", "]", "$", "。。"]
            chi = [",", "!", "?", ":", ";", "(", ")", "【", "】", "¥", '。']
            for i, j in zip(eng, chi):
                line = line.replace(i, j)
            out.append(line[:28])
        token_dict = {}
        dict_path = "../chinese_L-12_H-768_A-12/vocab.txt"
        with codecs.open(dict_path, 'r', 'utf8') as reader:
            for line in reader:
                token = line.strip()
                token_dict[token] = len(token_dict)

        tokenizer = Tokenizer(token_dict)
        x1, x2 = [], []
        for text in out:
            indices, segments = tokenizer.encode(first=text, max_len=512)
            x1.append(indices)
            x2.append(segments)
        return x1, x2
Example #13
 def __init__(self, ):
     # load the word-segmentation dictionary
     with cs.open('../data/segment_dic.txt', 'r', 'utf-8') as fp:
         segment_dic = {}
         for line in fp:
             if line.strip():
                 segment_dic[line.strip()] = 0
     self.segment_dic = segment_dic
     self.max_seq_len = 20
     begin = time.time()
     jieba.load_userdict('../data/segment_dic.txt')
     print('Loading the user segmentation dictionary took %.2f s' % (time.time() - begin))
     # load the trained entity-recognition (NER) model
     custom_objects = get_custom_objects()
     self.ner_model = load_model('../data/model/ner_model.h5',
                                 custom_objects=custom_objects)
     # load the BERT tokenizer
     dict_path = '../../news_classifer_task/wwm/vocab.txt'
     token_dict = {}
     with cs.open(dict_path, 'r', 'utf8') as reader:
         for line in reader:
             token = line.strip()
             token_dict[token] = len(token_dict)
     self.tokenizer = Tokenizer(token_dict)
     print('mention extractor loaded')
Example #14
 def test_empty(self):
     tokens = ['[PAD]', '[UNK]', '[CLS]', '[SEP]']
     token_dict = {token: i for i, token in enumerate(tokens)}
     tokenizer = Tokenizer(token_dict)
     text = u''
     self.assertEqual(['[CLS]', '[SEP]'], tokenizer.tokenize(text))
     indices, segments = tokenizer.encode(text)
     self.assertEqual([2, 3], indices)
     self.assertEqual([0, 0], segments)
Example #15
 def init_all(self, config):
     if self.train:
         bert_config = config['bert']['config_json']
         bert_checkpoint = config['bert']['bert_ckpt']
         bert_vocab = config['bert']['bert_vocab']
         bert_vocabs = load_vocabulary(bert_vocab)
         self.bert_token = Tokenizer(bert_vocabs)
         self.bert = self.init_bert(bert_config, bert_checkpoint)
     self.get_sentence(config['train_list'] if self.train else config['eval_list'], training=self.train)
Example #16
 def PreProcessInputData(self, text):
     tokenizer = Tokenizer(self.vocab)
     word_labels = []
     seq_types = []
     for sequence in text:
         code = tokenizer.encode(first=sequence, max_len=self.max_seq_length)
         word_labels.append(code[0])
         seq_types.append(code[1])
     return word_labels, seq_types
 def article_preprocess(self):
     tokenizer = Tokenizer(self.token_dict)
     self.text_split = [ele for ele in self.text.split('。') if len(ele) > 0]
     self.sent_num = len(self.text_split)
     tok = [tokenizer.encode(sent)[0] for sent in self.text_split]
     tok_pad = pad_sequences(tok, maxlen=self.seqence_len)
     self.data_in = [
         tok_pad,
         np.zeros(shape=(self.sent_num, self.seqence_len))
     ]
Example #18
 def tokenizer_init(self):
     """字典"""
     # reader tokenizer
     token2idx = {}
     path_dict = os.path.join(self.path_dir, "vocab.txt")
     with codecs.open(path_dict, 'r', 'utf8') as reader:
         for line in reader:
             token = line.strip()
             token2idx[token] = len(token2idx)
     self.tokenizer = Tokenizer(token2idx)
Example #19
  def tokenizer_init(self):
    pretrained_path = 'uncased_L-12_H-768_A-12'
    config_path = os.path.join(pretrained_path, 'bert_config.json')
    model_path = os.path.join(pretrained_path, 'bert_model.ckpt')
    vocab_path = os.path.join(pretrained_path, 'vocab.txt')

    token_dict = load_vocabulary(vocab_path)
    print("Total vocabulary loaded: {}".format(len(token_dict)))

    self.tokenizer = Tokenizer(token_dict)
Example #20
def load_data(texts):
    tokenizer = Tokenizer(token_dict)
    indices = []
    indices_mask = []
    for text in tqdm(texts):
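        # note: keras-bert's encode() returns (token_ids, segment_ids); the second value is segment ids, not an attention mask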
        ids,masked_ids = tokenizer.encode(text[0],text[1],max_len=SEQ_LEN)
        indices.append(ids)
        indices_mask.append(masked_ids)
    indices = np.array(indices)
    indices_mask = np.array(indices_mask)
    return [indices, indices_mask]
Example #21
class twitterProcessor():
    def __init__(self, vocab_path, data_dir, SEQ_LEN):
        self.vocab_path = vocab_path
        self.data_dir = data_dir
        self.seq_len = SEQ_LEN

    def get_train_examples(self, data_dir):
        token_dict = {}
        with codecs.open(self.vocab_path, 'r', 'utf8') as reader:
            for line in reader:
                token = line.strip()
                token_dict[token] = len(token_dict)
        self.tokenizer = Tokenizer(token_dict)

        with open(data_dir, 'r', encoding='utf-8') as f:
            reader = f.readlines()
            x_train, y_train = self.create_examples(reader, "train")
        return x_train, y_train

    def create_examples(self, lines, set_type):
        examples = []
        indices, labels = [], []
        for index, line in enumerate(lines):
            guid = "%s-%s" % (set_type, index)
            split_line = line.strip().split('+++$+++')
            ids, segments = self.tokenizer.encode(split_line[1],
                                                  max_len=self.seq_len)
            sentiment = split_line[0]
            indices.append(ids)
            labels.append(sentiment)
        return [indices, np.zeros_like(indices)], np.array(labels)

    def get_test_examples(self, data_dir):
        token_dict = {}
        with codecs.open(self.vocab_path, 'r', 'utf8') as reader:
            for line in reader:
                token = line.strip()
                token_dict[token] = len(token_dict)
        self.tokenizer = Tokenizer(token_dict)

        with open(data_dir, 'r', encoding='utf-8') as f:
            reader = f.readlines()
            x_test = self.create_test_examples(reader, "train")
            return x_test

    def create_test_examples(self, lines, set_type):
        examples = []
        indices = []
        for index, line in enumerate(lines):
            guid = "%s-%s" % (set_type, index)
            ids, segments = self.tokenizer.encode(line.strip(),
                                                  max_len=self.seq_len)
            indices.append(ids)
        return [indices, np.zeros_like(indices)]
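
A hypothetical end-to-end call (the paths and SEQ_LEN below are placeholders):

processor = twitterProcessor('vocab.txt', 'training_label.txt', SEQ_LEN=128)
x_train, y_train = processor.get_train_examples('training_label.txt')  # x_train = [indices, zero segments]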
Example #22
 def encode_input_x(self, sentences):
     '''Serialize input data X with the BERT Tokenizer: token ids and segment ids. `sentences` is a list of sentence strings.'''
     tokenizer = Tokenizer(self.vocab)
     sent_token_ids = []
     sent_segment_ids = []
     for sequence in sentences:
         token_ids, segment_ids = tokenizer.encode(
             first=sequence, max_len=self.seq_maxlen)  # the input is a single sentence only!
         sent_token_ids.append(token_ids)
         sent_segment_ids.append(segment_ids)
     return [sent_token_ids, sent_segment_ids]
Example #23
 def __init__(self, docs, vec):
     self.texts = np.array(docs)
     self.vec = vec
     paths = get_checkpoint_paths(".")
     inputs = load_trained_model_from_checkpoint(
         config_file=paths.config,
         checkpoint_file=paths.checkpoint,
         seq_len=50)
     outputs = MaskedGlobalMaxPool1D(name='Pooling')(inputs.output)
     self.model = Model(inputs=inputs.inputs, outputs=outputs)
     self.vocab = load_vocabulary(paths.vocab)
     self.tokenizer = Tokenizer(self.vocab)
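
A follow-up sketch (not in the original class) of a method one might add to embed the stored texts: encode with the same 50-token limit and run the pooled model, which yields one 768-dimensional vector per document.

import numpy as np

def embed_texts(self):
    encoded = [self.tokenizer.encode(t, max_len=50) for t in self.texts]
    ids = np.array([e[0] for e in encoded])
    segs = np.array([e[1] for e in encoded])
    return self.model.predict([ids, segs])  # shape: (len(texts), 768)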
Example #24
    def get_test_examples(self, data_dir):
        token_dict = {}
        with codecs.open(self.vocab_path, 'r', 'utf8') as reader:
            for line in reader:
                token = line.strip()
                token_dict[token] = len(token_dict)
        self.tokenizer = Tokenizer(token_dict)

        with open(data_dir, 'r', encoding='utf-8') as f:
            reader = f.readlines()
            x_test = self.create_test_examples(reader, "train")
            return x_test
Example #25
    def __init__(self, vocab_path, label_file):

        #load vocab
        self.bert_vocab_dict = {}
        self.load_bert_vocab(vocab_path, self.bert_vocab_dict)
        self.tokenizer = Tokenizer(self.bert_vocab_dict)

        #load label
        self.label_2_index = {}
        self.label_table_size = 0
        self.load_label_vocab(label_file, self.label_2_index)
        self.label_table_size = len(self.label_2_index)
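
load_bert_vocab is not shown here; given the vocabulary-loading pattern used throughout these examples, it presumably fills the dict line by line. A hedged sketch:

import codecs

def load_bert_vocab(self, vocab_path, vocab_dict):
    # one token per line; the line number becomes the token id
    with codecs.open(vocab_path, 'r', 'utf8') as reader:
        for line in reader:
            vocab_dict[line.strip()] = len(vocab_dict)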
Example #26
    def __init__(self):
        self.config_path, self.checkpoint_path, self.dict_path, self.max_seq_len = config_name, ckpt_name, vocab_file, max_seq_len
        # Used globally so it can be called from django, flask, tornado, etc.
        global graph
        graph = tf.compat.v1.get_default_graph()
        global model
        model = load_trained_model_from_checkpoint(self.config_path,
                                                   self.checkpoint_path,
                                                   seq_len=self.max_seq_len)

        layer_dict = [7]
        layer_0 = 7
        for i in range(12):
            layer_0 = layer_0 + 8
            layer_dict.append(layer_0)
        # if no layer is specified, output the model's own output
        if len(layer_indexes) == 0:
            encoder_layer = model.output
        # if only one layer is requested, take only that layer's weights; if the index is invalid, fall back to the last layer
        elif len(layer_indexes) == 1:
            if layer_indexes[0] in [i + 1 for i in range(13)]:
                encoder_layer = model.get_layer(
                    index=layer_dict[layer_indexes[0]]).output
            else:
                encoder_layer = model.get_layer(index=layer_dict[-1]).output
        # Otherwise iterate over the requested layers, take each layer's weights and concatenate them; shape: 768 * number_of_layers
        else:
            # layer_indexes must be [1,2,3,......13]
            # all_layers = [model.get_layer(index=lay).output if lay is not 1 else model.get_layer(index=lay).output[0] for lay in layer_indexes]
            all_layers = [
                model.get_layer(index=layer_dict[lay - 1]).output
                if lay in [i + 1 for i in range(13)] else model.get_layer(
                    index=layer_dict[-1]).output  # fall back to the last layer if the index is invalid
                for lay in layer_indexes
            ]
            # the output of layer==1 has the wrong format; the input to the second layer is a list
            all_layers_select = []
            for all_layers_one in all_layers:
                all_layers_select.append(all_layers_one)
            encoder_layer = Add()(all_layers_select)
        output_layer = NonMaskingLayer()(encoder_layer)
        model = Model(model.inputs, output_layer)
        # model.summary(120)
        # reader tokenizer
        self.token_dict = {}
        with codecs.open(self.dict_path, 'r', 'utf8') as reader:
            for line in reader:
                token = line.strip()
                self.token_dict[token] = len(self.token_dict)

        self.tokenizer = Tokenizer(self.token_dict)
        print("完成词向量生成器初始化!")
Example #27
def get_infer_input(input_file, out_file):
    id_type = pd.read_pickle('../data/id_type.pkl')
    type_index = pd.read_pickle('../data/type_index.pkl')
    entity_id = pd.read_pickle('../data/entity_id.pkl')

    id_text = pd.read_pickle('../data/id_text.pkl')

    token_dict = get_token_dict()
    tokenizer = Tokenizer(token_dict)
    out_file = open(out_file, 'w')
    file_index = 0
    with open(input_file) as f:
        for line in f:
            if file_index % 100 == 0:
                print(file_index)
            file_index += 1

            temDict = json.loads(line)
            text = temDict['text']
            mention_data = temDict['mention_data']
            for men in mention_data:
                mention = men['mention']

                offset = int(men['offset'])
                begin = int(offset) + 1
                end = begin + len(mention)

                link_id = get_link_entity_test(mention, entity_id)
                men['link_id'] = link_id
                link_data = {
                    'ids': [],
                    'seg': [],
                    'begin': [],
                    'end': [],
                    'en_type': []
                }
                for id in link_id:

                    kb_text = id_text[id]
                    kb_type = type_index[id_type[id][0]]
                    indice, segment = tokenizer.encode(first=text,
                                                       second=kb_text,
                                                       max_len=256)
                    link_data['ids'].append(indice)
                    link_data['seg'].append(segment)
                    link_data['begin'].append([begin])
                    link_data['end'].append([end])
                    link_data['en_type'].append([kb_type])
                men['link_data'] = link_data

            out_file.write(json.dumps(temDict, ensure_ascii=False))
            out_file.write('\n')
Example #28
def bert_sen_token(token_dict, traininstance, maxlen):
    tokenizer = Tokenizer(token_dict)
    train_indices = []
    train_segments = []
    train_text = []
    for text in traininstance:
        tokens = tokenizer.tokenize(text)
        indices, segments = tokenizer.encode(first=text, max_len=maxlen)
        train_indices.append(indices)
        train_segments.append(segments)
        train_text.append(tokens)

    return train_indices, train_segments, train_text
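
A hypothetical call (token_dict built from vocab.txt as in the other examples):

texts = ["this is the first sentence", "and a second one"]
indices, segments, tokens = bert_sen_token(token_dict, texts, maxlen=64)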
Example #29
    def test_padding(self):
        tokens = ['[PAD]', '[UNK]', '[CLS]', '[SEP]']
        token_dict = {token: i for i, token in enumerate(tokens)}
        tokenizer = Tokenizer(token_dict)
        text = '\u535A\u63A8'

        # single
        indices, segments = tokenizer.encode(first=text, max_len=100)
        expected = [2, 1, 1, 3] + [0] * 96
        self.assertEqual(expected, indices)
        expected = [0] * 100
        self.assertEqual(expected, segments)
        decoded = tokenizer.decode(indices)
        self.assertEqual(['[UNK]', '[UNK]'], decoded)
        indices, segments = tokenizer.encode(first=text, max_len=3)
        self.assertEqual([2, 1, 3], indices)
        self.assertEqual([0, 0, 0], segments)

        # paired
        indices, segments = tokenizer.encode(first=text, second=text, max_len=100)
        expected = [2, 1, 1, 3, 1, 1, 3] + [0] * 93
        self.assertEqual(expected, indices)
        expected = [0, 0, 0, 0, 1, 1, 1] + [0] * 93
        self.assertEqual(expected, segments)
        decoded = tokenizer.decode(indices)
        self.assertEqual((['[UNK]', '[UNK]'], ['[UNK]', '[UNK]']), decoded)
        indices, segments = tokenizer.encode(first=text, second=text, max_len=4)
        self.assertEqual([2, 1, 3, 3], indices)
        self.assertEqual([0, 0, 0, 1], segments)
Example #30
def encode(text):
    vocabs = set()
    with open(dict_path, encoding='utf8') as f:
        for l in f:
            vocabs.add(l.replace('\n', ''))

    token_dict = {}
    with codecs.open(dict_path, 'r', 'utf8') as reader:
        for line in reader:
            token = line.strip()
            token_dict[token] = len(token_dict)
    tokenizer = Tokenizer(token_dict)
    tokens = ['[CLS]'] + [ch if ch in vocabs else '[UNK]'
                          for ch in text] + ['[SEP]']
    return tokenizer._convert_tokens_to_ids(tokens), [0] * len(tokens)
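
A hypothetical call (dict_path must point to a BERT vocab.txt, as in the other examples); note that the text is split character by character rather than with WordPiece:

token_ids, segment_ids = encode('博推测试')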