def extract(args):
    # load_vocab comes from the project's tokenizer module; logger and
    # load_pretrain_embedding are defined elsewhere in the enclosing module.
    from tokenizer import load_vocab
    logger.info('load vocab from {}'.format(args.vocab_path))

    vocab = load_vocab(vocab_path=args.vocab_path)
    logger.info('vocab size: {}'.format(vocab.size()))
    # build/align the pretrained embedding matrix for this vocabulary
    load_pretrain_embedding(vocab,
                            embed_size=args.embed_size,
                            embedding_path=args.file_path)
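
For context, a minimal sketch of how extract could be wired to a command line. The flag names mirror the attributes used inside the function; the parser itself and the default embedding size are assumptions, not part of the original script.

import argparse

parser = argparse.ArgumentParser(description='Extract pretrained embeddings for a vocabulary')
parser.add_argument('--vocab_path', required=True)
parser.add_argument('--embed_size', default=300, type=int)  # default size is an assumption
parser.add_argument('--file_path', required=True, help='path to the pretrained embedding file')

if __name__ == '__main__':
    extract(parser.parse_args())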
Example #2
def convert_vocab(vocab_file):
    """GluonNLP specific code to convert the original vocabulary to nlp.vocab.BERTVocab."""
    original_vocab = load_vocab(vocab_file)
    token_to_idx = dict(original_vocab)
    num_tokens = len(token_to_idx)
    idx_to_token = [None] * len(original_vocab)
    for word in original_vocab:
        idx = int(original_vocab[word])
        idx_to_token[idx] = word

    def swap(token, target_idx, token_to_idx, idx_to_token, swap_idx):
        # Move `token` to `target_idx`, move the token previously at `target_idx`
        # to `token`'s old position, and record the pair so that dependent
        # arrays (e.g. embedding rows) can be reordered the same way.
        original_idx = token_to_idx[token]
        original_token = idx_to_token[target_idx]
        token_to_idx[token] = target_idx
        token_to_idx[original_token] = original_idx
        idx_to_token[target_idx] = token
        idx_to_token[original_idx] = original_token
        swap_idx.append((original_idx, target_idx))

    reserved_tokens = [
        gluonnlp.vocab.BERTVocab.PADDING_TOKEN,
        gluonnlp.vocab.BERTVocab.CLS_TOKEN, gluonnlp.vocab.BERTVocab.SEP_TOKEN,
        gluonnlp.vocab.BERTVocab.MASK_TOKEN
    ]

    unknown_token = gluonnlp.vocab.BERTVocab.UNKNOWN_TOKEN
    padding_token = gluonnlp.vocab.BERTVocab.PADDING_TOKEN
    swap_idx = []
    assert unknown_token in token_to_idx
    assert padding_token in token_to_idx
    swap(unknown_token, 0, token_to_idx, idx_to_token, swap_idx)
    for i, token in enumerate(reserved_tokens):
        swap(token, i + 1, token_to_idx, idx_to_token, swap_idx)

    # sanity checks
    assert len(token_to_idx) == num_tokens
    assert len(idx_to_token) == num_tokens
    assert None not in idx_to_token
    assert len(set(idx_to_token)) == num_tokens

    bert_vocab_dict = {}
    bert_vocab_dict['idx_to_token'] = idx_to_token
    bert_vocab_dict['token_to_idx'] = token_to_idx
    bert_vocab_dict['reserved_tokens'] = reserved_tokens
    bert_vocab_dict['unknown_token'] = unknown_token
    bert_vocab_dict['padding_token'] = padding_token
    bert_vocab_dict['bos_token'] = None
    bert_vocab_dict['eos_token'] = None
    bert_vocab_dict['mask_token'] = gluonnlp.vocab.BERTVocab.MASK_TOKEN
    bert_vocab_dict['sep_token'] = gluonnlp.vocab.BERTVocab.SEP_TOKEN
    bert_vocab_dict['cls_token'] = gluonnlp.vocab.BERTVocab.CLS_TOKEN
    json_str = json.dumps(bert_vocab_dict)
    converted_vocab = gluonnlp.vocab.BERTVocab.from_json(json_str)
    return converted_vocab, swap_idx
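
convert_vocab returns swap_idx alongside the converted vocabulary so that any array indexed by token id (embedding rows, decoder weights) can be reordered to match the new token order. A minimal sketch of how the recorded swaps might be replayed on a pretrained embedding matrix; the array name and file path are hypothetical:

import numpy as np

def apply_swaps(embedding, swap_idx):
    # Replay each (original_idx, target_idx) row exchange recorded by swap()
    # so the embedding rows line up with the converted vocabulary.
    embedding = embedding.copy()
    for original_idx, target_idx in swap_idx:
        embedding[[original_idx, target_idx]] = embedding[[target_idx, original_idx]]
    return embedding

# hypothetical usage:
# converted_vocab, swap_idx = convert_vocab('vocab.txt')
# word_embedding = np.load('word_embedding.npy')   # shape: (num_tokens, hidden_size)
# word_embedding = apply_swaps(word_embedding, swap_idx)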
Example #3
def convert_vocab(vocab_file):
    """GluonNLP specific code to convert the original vocabulary to nlp.vocab.Vocab."""
    original_vocab = load_vocab(vocab_file)
    token_to_idx = dict(original_vocab)
    num_tokens = len(token_to_idx)
    idx_to_token = [None] * len(original_vocab)
    for word in original_vocab:
        idx = int(original_vocab[word])
        idx_to_token[idx] = word

    def swap(token, target_idx, token_to_idx, idx_to_token, swap_idx):
        original_idx = token_to_idx[token]
        original_token = idx_to_token[target_idx]
        token_to_idx[token] = target_idx
        token_to_idx[original_token] = original_idx
        idx_to_token[target_idx] = token
        idx_to_token[original_idx] = original_token
        swap_idx.append((original_idx, target_idx))

    reserved_tokens = ['[PAD]', '[CLS]', '[SEP]', '[MASK]']
    unknown_token = '[UNK]'
    padding_token = '[PAD]'
    swap_idx = []
    assert unknown_token in token_to_idx
    assert padding_token in token_to_idx
    swap(unknown_token, 0, token_to_idx, idx_to_token, swap_idx)
    for i, token in enumerate(reserved_tokens):
        swap(token, i + 1, token_to_idx, idx_to_token, swap_idx)

    # sanity checks
    assert len(token_to_idx) == num_tokens
    assert len(idx_to_token) == num_tokens
    assert None not in idx_to_token
    assert len(set(idx_to_token)) == num_tokens

    vocab_dict = {}
    vocab_dict['idx_to_token'] = idx_to_token
    vocab_dict['token_to_idx'] = token_to_idx
    vocab_dict['reserved_tokens'] = reserved_tokens
    vocab_dict['unknown_token'] = unknown_token
    vocab_dict['padding_token'] = padding_token
    vocab_dict['bos_token'] = None
    vocab_dict['eos_token'] = None
    json_str = json.dumps(vocab_dict)
    converted_vocab = gluonnlp.Vocab.from_json(json_str)
    return converted_vocab, swap_idx
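
A short usage sketch of the non-BERT variant above; the vocabulary file name is hypothetical, and the lookups use the standard gluonnlp.Vocab indexing API (token-to-index via __getitem__, index-to-token via to_tokens):

# hypothetical usage:
vocab, swap_idx = convert_vocab('original_vocab.txt')
print(len(vocab))                  # total number of tokens
print(vocab['[UNK]'])              # 0 -- the unknown token was swapped to index 0
print(vocab['[PAD]'], vocab['[CLS]'], vocab['[SEP]'], vocab['[MASK]'])   # 1 2 3 4
print(vocab.to_tokens([0, 1, 2]))  # ['[UNK]', '[PAD]', '[CLS]']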
Example #4
parser.add_argument('--char_vocab', required=True)  # referenced below via args.char_vocab
parser.add_argument('--word_vocab', required=True)
parser.add_argument('--train_file', required=True)
parser.add_argument('--test_file', required=True)
parser.add_argument('--model_config')
parser.add_argument('--batch_size', default=32, type=int)
parser.add_argument('--learning_rate', default=1e-4, type=float)
parser.add_argument('--num_epoch', default=10, type=int)
parser.add_argument('--device', default='cpu')
parser.add_argument('--log_dir', default='logs')
parser.add_argument('--weight_dir', default='weight')

if __name__ == "__main__":
    args = parser.parse_args()

    print("Load vocab")
    tokenizer = load_vocab(args.char_vocab, args.word_vocab)

    print("Prepare data")
    train_ds = BERTDataset(args.train_file, tokenizer)
    test_ds = BERTDataset(args.test_file, tokenizer)
    train_dl = DataLoader(train_ds, shuffle=True, batch_size=args.batch_size)
    test_dl = DataLoader(test_ds, shuffle=False, batch_size=args.batch_size)

    print("Init model")
    char_vocab_len = len(tokenizer.char_stoi)
    word_vocab_len = len(tokenizer.word_stoi)

    if args.model_config:
        with open(args.model_config) as f:
            config = json.load(f)
    else:
Example #5
        ]
    src = self.tokenizer.tokenize_char(src)
    # pad the character ids up to the fixed target length
    if src.shape[0] < self.tgt_pad_len:
        src = np.concatenate([src, src_pad])
    label = self.tgt[index]
    # cast everything to int64 before returning
    label = np.array(label, dtype=np.int64)
    mask = np.array(mask, dtype=np.int64)
    pos = np.array(pos, dtype=np.int64)
    src = np.array(src, dtype=np.int64)
    # truncate every field to the same fixed target length
    if src.shape[0] > self.tgt_pad_len:
        src = src[:self.tgt_pad_len]
    if mask.shape[0] > self.tgt_pad_len:
        mask = mask[:self.tgt_pad_len]
    if label.shape[0] > self.tgt_pad_len:
        label = label[:self.tgt_pad_len]
    if pos.shape[0] > self.tgt_pad_len:
        pos = pos[:self.tgt_pad_len]
        return src, mask, label, pos


if __name__ == "__main__":
    from tokenizer import load_vocab
    tokenizer = load_vocab('vocab/char_vocab.txt', 'vocab/word_vocab.txt')
    ds = SingleDataset('data/test.txt', tokenizer)
    src, mask, label, pos = ds.__getitem__(0)
    print(src.dtype)
    print(mask.dtype)
    print(label.dtype)
    print(pos.dtype)
    label = tokenizer._id_to_token(label[:10], tokenizer.word_itos)
    print(src[:10])
    print(mask[:10])
    print(label[:10])
    print(pos[:10])