Example #1
from transformers import BertConfig, BertTokenizerFast


def prepare_tokenizer(args):
    config = BertConfig.from_json_file(args.config_file)
    # tokenizer = BertTokenizerFast(args.vocab_file, model_max_length=512)
    # print('config', type(config), config,)
    tokenizer = BertTokenizerFast(
        args.vocab_file, model_max_length=config.max_position_embeddings)
    return tokenizer
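A quick way to exercise prepare_tokenizer is with an argparse-style namespace; the file names below are placeholders, not paths from the original project.

from argparse import Namespace

args = Namespace(config_file="bert_config.json", vocab_file="vocab.txt")
tokenizer = prepare_tokenizer(args)
print(tokenizer.model_max_length)  # equals config.max_position_embeddings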
Example #2
import os


def load_pretrained_bert_tokenizer(vocab_file=None):
    """Create tokenizer from file, using Transformers library"""

    from transformers import BertTokenizerFast

    if vocab_file is None:
        vocab_file = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            "bert-base-uncased-vocab.txt"
        )

    tokenizer = BertTokenizerFast(
        vocab_file=vocab_file,
        # the following arguments are the same as the defaults; listed only for clarity
        clean_text=True,
        tokenize_chinese_chars=True,
        do_lower_case=True,
        strip_accents=True,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
    )
    return tokenizer
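A minimal usage sketch, assuming the bundled bert-base-uncased-vocab.txt sits next to the module:

tokenizer = load_pretrained_bert_tokenizer()
encoded = tokenizer("Hello world", padding="max_length", max_length=8)
print(encoded["input_ids"])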
Example #3
import torch
from transformers import (AutoConfig, AutoModelForMaskedLM, AutoTokenizer,
                          BertTokenizerFast, RobertaTokenizer)


def init_tgt(params):
    """
    Initialize the parameters of the target model
    """
    prob = None
    if params.prob:
        print(' | load word translation probs!')
        prob = torch.load(params.prob)

    print(f'| load English pre-trained model: {params.src_model}')
    config = AutoConfig.from_pretrained(params.src_model,
                                        cache_dir=params.cache_dir)
    model = AutoModelForMaskedLM.from_pretrained(
        params.src_model,
        from_tf=bool(".ckpt" in params.src_model),
        config=config,
        cache_dir=params.cache_dir,
    )
    if 'roberta' in params.src_model:
        assert params.src_merge, "merge file should be provided!"
        src_tokenizer = RobertaTokenizer(params.src_vocab, params.src_merge)
    else:
        # note that we do not lowercase here
        src_tokenizer = AutoTokenizer.from_pretrained(
            params.src_model,
            cache_dir='/home/georgios.vernikos/workspace/LMMT/MonoEgo/cache',
            use_fast=True)

    # get English word-embeddings and bias
    src_embs = model.base_model.embeddings.word_embeddings.weight.detach(
    ).clone()
    src_bias = model.cls.predictions.bias.detach().clone()

    # initialize the target tokenizer; we always use a BERT WordPiece tokenizer (BertTokenizerFast) for the target language
    tgt_tokenizer = BertTokenizerFast(vocab_file=params.tgt_vocab,
                                      do_lower_case=True,
                                      strip_accents=False)

    tgt_embs, tgt_bias = guess(src_embs,
                               src_bias,
                               tgt_tokenizer,
                               src_tokenizer,
                               prob=prob)

    # checksum for debugging purposes
    print(' checksum src | embeddings {:.5f} - bias {:.5f}'.format(
        src_embs.norm().item(),
        src_bias.norm().item()))
    model.base_model.embeddings.word_embeddings.weight.data = tgt_embs
    model.cls.predictions.bias.data = tgt_bias
    model.tie_weights()
    print(' checksum tgt | embeddings {:.5f} - bias {:.5f}'.format(
        model.base_model.embeddings.word_embeddings.weight.norm().item(),
        model.cls.predictions.bias.norm().item()))

    # save the model
    # take care of distributed/parallel training
    model_to_save = model.module if hasattr(model, "module") else model
    model_to_save.save_pretrained(params.tgt_model)
Example #4
from transformers import (AlbertTokenizer, BertTokenizerFast,
                          DistilBertTokenizerFast, GPT2TokenizerFast,
                          RobertaTokenizerFast, TransfoXLTokenizerFast,
                          XLNetTokenizer)


def get_bert_tokenizer(bert_model_type):
    if bert_model_type in [
            'bert-base-uncased', 'prod-bert-base-uncased', 'bert-base-cased',
            'bert-large-uncased', 'tune_bert-base-uncased_nsp',
            'bert-large-uncased-whole-word-masking',
            'bert-large-uncased-whole-word-masking-finetuned-squad'
    ]:
        if '-cased' in bert_model_type:
            do_lower_case = False
        else:
            do_lower_case = True  # default
        return BertTokenizerFast(vocab_file=BERT_VOCAB_FILE[bert_model_type],
                                 do_lower_case=do_lower_case)
    elif bert_model_type in [
            'roberta-base', 'prod-roberta-base-cased', 'roberta-large',
            'roberta-large-mnli', 'distilroberta-base'
    ]:
        return RobertaTokenizerFast(
            vocab_file=BERT_VOCAB_FILE[bert_model_type],
            merges_file=BERT_MERGE_FILE[bert_model_type],
            add_prefix_space=True)
    elif bert_model_type in ['xlnet-base-cased']:
        if '-uncased' in bert_model_type:
            do_lower_case = True
        else:
            do_lower_case = False  # default
        return XLNetTokenizer(vocab_file=BERT_VOCAB_FILE[bert_model_type],
                              do_lower_case=do_lower_case)
    elif bert_model_type in [
            'albert-base-v1', 'albert-large-v1', 'albert-xlarge-v1',
            'albert-xxlarge-v1'
    ]:
        return AlbertTokenizer(vocab_file=BERT_VOCAB_FILE[bert_model_type])
    elif bert_model_type in ['gpt2', 'gpt2-medium']:
        tokenizer = GPT2TokenizerFast(
            vocab_file=BERT_VOCAB_FILE[bert_model_type],
            merges_file=BERT_MERGE_FILE[bert_model_type],
            add_prefix_space=True)
        # https://github.com/huggingface/transformers/issues/3859
        tokenizer.pad_token = tokenizer.eos_token
        return tokenizer
    elif bert_model_type in ['transfo-xl']:
        return TransfoXLTokenizerFast(
            vocab_file=BERT_VOCAB_FILE[bert_model_type])
    elif bert_model_type in [
            'distilbert-base-uncased',
            'distilbert-base-uncased-distilled-squad'
    ]:
        if '-cased' in bert_model_type:
            do_lower_case = False
        else:
            do_lower_case = True  # default
        return DistilBertTokenizerFast(
            vocab_file=BERT_VOCAB_FILE[bert_model_type],
            do_lower_case=do_lower_case)
    else:
        raise ValueError(
            f'`bert_model_type` not understood: {bert_model_type}')
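BERT_VOCAB_FILE and BERT_MERGE_FILE are not shown in this snippet; presumably they are module-level dicts mapping each model name to local tokenizer files. A hypothetical sketch, with placeholder paths:

BERT_VOCAB_FILE = {
    'bert-base-uncased': '/path/to/bert-base-uncased-vocab.txt',
    'roberta-base': '/path/to/roberta-base-vocab.json',
    'gpt2': '/path/to/gpt2-vocab.json',
}
BERT_MERGE_FILE = {
    'roberta-base': '/path/to/roberta-base-merges.txt',
    'gpt2': '/path/to/gpt2-merges.txt',
}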
Example #5
def get_tokenizer(vocab_file):
    tokenizer = BertTokenizerFast(
        vocab_file=vocab_file,
        do_basic_tokenize=True
    )

    special_tokens_dict = {'additional_special_tokens': ["<end>", "<begin>"]}
    tokenizer.add_special_tokens(special_tokens_dict)
    return tokenizer
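Usage sketch for get_tokenizer; the vocab path is a placeholder, and the two added markers receive ids appended after the original vocabulary:

tokenizer = get_tokenizer("vocab.txt")
print(tokenizer.convert_tokens_to_ids(["<begin>", "<end>"]))
print(len(tokenizer))  # vocab size now includes the two extra special tokens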
Example #6
    def __init__(self, output_fname, vocab_file, max_seq_length,
                 blanks_separate_docs, do_lower_case):
        self._blanks_separate_docs = blanks_separate_docs
        tokenizer = BertTokenizerFast(vocab_file=vocab_file,
                                      do_lower_case=do_lower_case)
        self._example_builder = ExampleBuilder(tokenizer, max_seq_length)
        self._writers = []
        self._wd = tf.io.TFRecordWriter(output_fname)
        self.n_written = 0
Example #7
def preprocess_corpus_fixed_lines(ln, mapper, max_seq, lines_limit):
    """
    Merge/separate lines so that every line is made of approximately max_seq tokens. Create:
        - a new baseline corpus, lowercased.
        - a new cID corpus, made of cID-strings.
    """
    original_corpus_path = get_corpus(ln, dense=False, cid=False)
    dense_corpus_path = get_corpus(ln, dense=True, cid=False)
    cID_dense_corpus_path = get_corpus(ln, dense=False, cid=True)


    with open(original_corpus_path, "r", encoding='utf-8') as original_corpus, \
            open(dense_corpus_path, "x", encoding='utf-8') as dense_corpus, \
            open(cID_dense_corpus_path, "x", encoding='utf-8') as cID_dense_corpus:
        lines_list = original_corpus.read().splitlines()
        tokenizer = BertTokenizerFast(Path(args.icebert_folder) /
                                      args.monolingual_tokenizers_root_path /
                                      (ln + '.txt'),
                                      do_lower_case=False,
                                      add_special_tokens=True)
        sentence_tokenizer = NLTKSegmenter()
        dense_line = []
        cID_dense_line = []
        line_length = 0
        # avoid reading beyond the EOF
        line_index = 0
        number_of_dense_lines = 0
        tot_lines = len(lines_list)
        while number_of_dense_lines < lines_limit:
            # reset the lines counter
            if line_index == tot_lines:
                line_index = 0
            sentences = sentence_tokenizer.segment_string(
                lines_list[line_index], lowercase=args.lowercase_corpus)
            line_index += 1
            # we work at sentence level (not line level, to avoid cutting very long lines)
            for sentence in sentences:
                cIDs = encode_cID(
                    fast_tokenize(sentence, ln, tokenizer, mark=True), mapper)
                line_length += len(cIDs)
                dense_line.append(sentence.strip())
                cID_dense_line.append(" ".join(cIDs).strip())
                # once we reach the maximum number of tokens, write out the dense sentence and start building a new one.
                if line_length > max_seq:
                    dense_corpus.write(" ".join(dense_line) + "\n")
                    cID_dense_corpus.write(" ".join(cID_dense_line) + "\n")
                    number_of_dense_lines += 1
                    dense_line = []
                    cID_dense_line = []
                    line_length = 0

    return
Example #8
    def test_export_custom_bert_model(self):
        from transformers import BertModel

        vocab = ["[UNK]", "[SEP]", "[CLS]", "[PAD]", "[MASK]", "some", "other", "words"]
        with NamedTemporaryFile(mode="w+t") as vocab_file:
            vocab_file.write("\n".join(vocab))
            vocab_file.flush()
            tokenizer = BertTokenizerFast(vocab_file.name)

        with TemporaryDirectory() as bert_save_dir:
            model = BertModel(BertConfig(vocab_size=len(vocab)))
            model.save_pretrained(bert_save_dir)
            self._test_export(bert_save_dir, "pt", 12, tokenizer)
Example #9
def build_model(args):
    if args.pretrained_path == '':
        config = GPT2Config.from_json_file(args.model_config)
        model = GPT2LMHeadModel(config)
        tokenizer = BertTokenizerFast(args.vocab)
        # XXX: required; otherwise special tokens inside a string are not tokenized as single tokens
        tokenizer.sanitize_special_tokens()
        info = None
    else:
        config = GPT2Config.from_pretrained(args.pretrained_path)
        model, info = GPT2LMHeadModel.from_pretrained(args.pretrained_path,
                                                      config=config,
                                                      output_loading_info=True)
        tokenizer = BertTokenizerFast.from_pretrained(args.pretrained_path)
    return model, tokenizer, info
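A usage sketch for the from-scratch branch; the namespace fields are placeholders for whatever set_args-style parser the project uses:

from argparse import Namespace

args = Namespace(pretrained_path='', model_config='gpt2_config.json', vocab='vocab.txt')
model, tokenizer, info = build_model(args)  # info is None when training from scratch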
Example #10
    def __init__(self,
                 vocab_path,
                 strip_accents,
                 clean_text,
                 lowercase,
                 from_pretrained=False):
        common_params = {
            'strip_accents': strip_accents,
            'clean_text': clean_text,
            'lowercase': lowercase
        }
        if from_pretrained:
            self._tokenizer = BertTokenizerFast.from_pretrained(
                pretrained_model_name_or_path=vocab_path, **common_params)
        else:
            self._tokenizer = BertTokenizerFast(vocab_file=vocab_path,
                                                **common_params)
Example #11
def main():
    params = parser.parse_args()
    params.lowercase = params.lowercase == 'True'
    print(params)
    model_name = 'bert-base-uncased' if params.lowercase else 'bert-base-cased'
    print(model_name)
    src_tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        cache_dir='/home/georgios.vernikos/workspace/LMMT/MonoEgo/cache',
        use_fast=True)
    tgt_tokenizer = BertTokenizerFast(vocab_file=params.tgt_vocab,
                                      do_lower_case=params.lowercase,
                                      strip_accents=False)

    src_embs, src_subwords = get_subword_embeddings(src_tokenizer,
                                                    params.src_aligned_vec,
                                                    params.topn,
                                                    params.lowercase)
    tgt_embs, tgt_subwords = get_subword_embeddings(tgt_tokenizer,
                                                    params.tgt_aligned_vec,
                                                    params.topn,
                                                    params.lowercase)
    src_embs = renorm(src_embs, 1)
    tgt_embs = renorm(tgt_embs, 1)
    # initialize sparse-max

    sparsemax = Sparsemax(1)

    print(f'| # src subwords found: {len(src_subwords)}')
    print(f'| # tgt subwords found: {len(tgt_subwords)}')
    print('| compute translation probability')
    scores = tgt_embs @ src_embs.t()
    a = sparsemax(scores)  # (Vf, Ve)
    print('| generating translation table!')
    probs = {}

    for i, tt in tqdm(enumerate(tgt_subwords), total=len(tgt_subwords)):
        probs[tt] = {}
        ix = torch.nonzero(a[i]).view(-1)
        px = a[i][ix].tolist()
        wx = [src_subwords[j] for j in ix.tolist()]
        probs[tt] = {w: p for w, p in zip(wx, px)}
    n_avg = np.mean([len(ss) for ss in probs.values()])
    print(f'| average # source / target: {n_avg:.2f} ')
    print(f"| save translation probabilities: {params.save}")
    torch.save(probs, params.save)
Example #12
def main():
    parser = argparse.ArgumentParser()  # creating an ArgumentParser object

    # input data and model directories
    parser.add_argument('--config', type=str, required=True)
    parser.add_argument('--output', type=int, required=True)
    parser.add_argument('--thread', type=int, required=True)
    parser.add_argument('--language', type=str, required=True)

    args, _ = parser.parse_known_args()
    with open(args.config) as f:
        # print(json.load(f)["records"])
        config = json.load(f)
        files_paths = config["paths"]
        record_params = config["records"]
        seed_input = config["seed"]

    language = args.language

    character_tokenizer = BertTokenizerFast(
        Path(files_paths["vocab_file_root"]) / language / "alphabet",
        do_lower_case=False)
    ocr_errors_generator = ErrorTable(character_tokenizer)
    with open(Path(files_paths["ocr_errors_root"]) / language /
              "ocr_errors.txt",
              encoding="utf-8") as f:
        ocr_errors_generator.load_table_from_file(f)
    dataset_generator = CorrectionDatasetGenerator(
        character_tokenizer, ocr_errors_generator,
        record_params["sequence_length"])
    output_dir = Path(files_paths["output_file_root"]) / language
    tf.io.gfile.makedirs(str(output_dir))
    writer = tf.io.TFRecordWriter(str(output_dir / f"tf_record_{args.output}"),
                                  options="GZIP")
    logging.basicConfig(level=logging.INFO)
    inst_idx = 0
    start_time = time.time()

    for repeat in range(record_params["dupe_factor"]):
        example_cache = []
        for inputs, outputs in dataset_generator.generate_dataset(
                files_paths["dataset_dir"], args.thread, seed_input):
            # for inputs, outputs in tqdm.tqdm(dataset_generator.generate_dataset(files_paths["test_input_dir"])):
            inst_idx += 1
            break
Example #13
def get_tokenizer() -> BertTokenizerFast:
    """
    Returns tokenizer for that model.

    Parameters:
        None
    Returns:
        tokenizer (BertTokenizerFast) : loaded and set tokenizer.
    """
    # Loading bert from tf hub
    print(f"\nTrying to load BERT layer from {BERT_LAYER_HUB_URL}\n")
    bert_layer = tf_hub.KerasLayer(BERT_LAYER_HUB_URL, trainable=False)
    print(f"\nLoaded BERT layer from {BERT_LAYER_HUB_URL}")
    # Getting vocab from layer
    vocab_path = bert_layer.resolved_object.vocab_file.asset_path.numpy().decode("utf-8")
    # Creating new tokenizer
    tokenizer = BertTokenizerFast(vocab_path)
    return tokenizer
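BERT_LAYER_HUB_URL is assumed to point at a TF Hub BERT SavedModel that exposes a vocab_file asset; the uncased English base encoder is shown purely as an assumed example:

BERT_LAYER_HUB_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

tokenizer = get_tokenizer()
print(tokenizer.tokenize("hello from the hub-resolved vocab"))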
Example #14
    def __init__(self, train=True):
        if train:
            path = ("/data/data_train.txt", "/data/pos_train.txt")
        else:
            path = ("/data/data_val.txt", "/data/pos_val.txt")
        self.tokenizer = BertTokenizerFast("wiki-vocab.txt")
        self.paragraphs = [[]]
        self.pos_labels = set([])

        valid = True
        with open(path[0], encoding="utf-8") as f_data:
            with open(path[1], encoding="utf-8") as f_pos:
                for d, p in tqdm(zip(f_data, f_pos), desc="load_data"):
                    if len(d.strip()) == 0:
                        if len(self.paragraphs[-1]) > 0:
                            self.paragraphs.append([])
                        else:
                            valid = True
                    elif valid:
                        _d, _p = d.strip().split(), p.strip().split()
                        if len(_d) != len(_p) or len(_p) > 256:
                            valid = False
                            self.paragraphs[-1] = []
                        else:
                            assert len(_d) == len(_p), f"{len(_d)} {len(_p)}"
                            self.paragraphs[-1].append((_d, _p))
                            self.pos_labels |= set(_p)

        print(len(self.paragraphs))

        if train:
            self.pos_labels_to_ids = {}
            for i, pos_label in enumerate(sorted(self.pos_labels)):
                self.pos_labels_to_ids[pos_label] = i + 1
        else:
            with open('./pretrain.wiki.dict') as f:
                self.pos_labels_to_ids = eval(f.read())

            i = len(self.pos_labels_to_ids)
            for _, pos_label in enumerate(sorted(self.pos_labels)):
                if pos_label not in self.pos_labels_to_ids:
                    self.pos_labels_to_ids[pos_label] = i
                    i += 1
Example #15
    def __init__(self, config):
        super().__init__(config)

        self.tokenizer = BertTokenizerFast("../Bert/assets/vocab.txt")

        self.num_labels = config.num_labels

        self.bert = BertModel(config)

        self.cls = BertOnlyMLMHead(config)

        # projected_emb = tf.layers.dense(output_layer, params["projection_size"])
        # projected_emb = tf.keras.layers.LayerNormalization(axis=-1)(projected_emb)
        # if is_training:
        #     projected_emb = tf.nn.dropout(projected_emb, rate=0.1)

        self.dense = nn.Linear(config.hidden_size, 128)
        self.LayerNorm = nn.LayerNorm(128)
        self.projected_emb = nn.Dropout(0.1)
Example #16
 def create_dense_files(original_corpus_path, dense_corpus_path,
                        cID_dense_corpus_path, ln, mapper, max_seq):
     with open(original_corpus_path, "r", encoding='utf-8') as original_corpus, \
             open(dense_corpus_path, "x", encoding='utf-8') as dense_corpus, \
             open(cID_dense_corpus_path, "x", encoding='utf-8') as cID_dense_corpus:
         lines_list = original_corpus.read().splitlines()
         tokenizer = BertTokenizerFast(
             Path(args.icebert_folder) /
             args.monolingual_tokenizers_root_path / (ln + '.txt'),
             do_lower_case=False,
             add_special_tokens=True)
         sentence_tokenizer = NLTKSegmenter()
         dense_line = []
         cID_dense_line = []
         line_length = 0
         marked_lines = 0
         for line in tqdm(lines_list):
             if not (line[:6] == "</doc>" or line[:4] == "<doc"):
                 sentences = sentence_tokenizer.segment_string(
                     line, lowercase=args.lowercase_corpus)
                 # we work at sentence level (not line level, to avoid cutting very long lines)
                 for sentence in sentences:
                     cIDs = encode_cID(
                         fast_tokenize(sentence, ln, tokenizer, mark=True),
                         mapper)
                     line_length += len(cIDs)
                     dense_line.append(sentence.strip())
                     cID_dense_line.append(" ".join(cIDs).strip())
                     # once we reach the maximum number of tokens, write out the dense sentence and start building a new one.
                     if line_length > max_seq:
                         dense_corpus.write(" ".join(dense_line) + "\n")
                         cID_dense_corpus.write(" ".join(cID_dense_line) +
                                                "\n")
                         dense_line = []
                         cID_dense_line = []
                         line_length = 0
             else:
                 marked_lines += 1
     return marked_lines
Example #17
def test_realm_tokenizer(vocabfile):
    tokenizer = BertTokenizerFast(vocabfile)
    tens_tokenizer = bert.tokenization.FullTokenizer(vocab_file=vocabfile,
                                                     do_lower_case=True)

    dataset = NQ().get_train_data()

    print(
        "\nTesting the similarity of the Tensorflow & Pytorch tokenizer using NQ"
    )
    for text in tqdm(dataset):
        if (not tens_tokenizer.convert_tokens_to_ids(
            ['[CLS]'] + tens_tokenizer.tokenize(text['question']) + ['[SEP]'])
                == tokenizer(text['question'])['input_ids']):
            raise Exception(
                "The Tensorflow tokenizer and the Pytorch tokenizer don't have similar matches: {} and {}"
                .format(
                    tens_tokenizer.convert_tokens_to_ids(
                        ['[CLS]'] + tens_tokenizer.tokenize(text['question']) +
                        ['[SEP]']),
                    tokenizer(text['question'])['input_ids']))

    print("Tokenizer is correctly imported")
Example #18
def line_tokenizer_reader(args, file_queue: Queue, line_counter: Counter,
                          file_counter: Counter):
    blanks_separate_docs = args.blanks_separate_docs

    # tokenizer = tokenization.FullTokenizer(
    #     vocab_file=args.vocab_file,
    #     do_lower_case=args.do_lower_case)
    # tokenizer = BertTokenizerFast(vocab_file=args.vocab_file,
    #                           do_lower_case=args.do_lower_case,
    #                           unk_token='[UNK]',
    #                           tokenize_chinese_chars=False,
    #                           wordpieces_prefix='##')
    tokenizer = BertTokenizerFast(vocab_file=args.vocab_file,
                                  do_lower_case=args.do_lower_case,
                                  unk_token='[UNK]',
                                  sep_token='[SEP]',
                                  strip_accents=False,
                                  clean_text=True,
                                  tokenize_chinese_chars=False,
                                  wordpieces_prefix='##')

    example_builder = ExampleBuilder(tokenizer, args.max_seq_length)
    output_filename_template = os.path.join(
        args.output_dir, "pretrain_data-{:04d}.tfrecord.lz")

    sent_messages = 0
    logger = logging.getLogger()

    FOUR_HUNDRED_MB = 400e6
    writer = TFWriter(FOUR_HUNDRED_MB, output_filename_template, file_counter)

    def send(example):
        if not example:
            return 0
        while not shutdown_event.is_set():
            try:
                #message_queue.put(example.SerializeToString(), block=True, timeout=0.05)
                writer.write(example.SerializeToString())
                break
            except queue.Full:
                logger.warning('queue is full')
                time.sleep(0.01)
                continue
        writer.close()
        return 1

    while not shutdown_event.is_set():
        try:
            input_file = file_queue.get(block=True, timeout=0.05)
        except EOFError:
            continue
        except queue.Empty:
            continue
        except ValueError:
            break
        try:
            lines_read = 0
            start = time.time()
            bytes_read = 0
            already_sent = sent_messages
            previous_reading = 0
            with tf.io.gfile.GFile(input_file) as f:
                for line in f:
                    bytes_read += len(line)
                    line = line.strip()
                    if line or blanks_separate_docs:
                        sent_messages += send(example_builder.add_line(line))
                        lines_read += 1
                        elapsed = time.time()
                        if lines_read % 100:
                            line_counter.increment(lines_read -
                                                   previous_reading)
                            previous_reading = lines_read
                        logger.info('reading %.1f lines/sec',
                                    lines_read / (elapsed - start))
                        logger.info('read  %s bytes',
                                    humanize.filesize.naturalsize(bytes_read))
                        logger.info('sending %.1f messages/sec',
                                    (sent_messages - already_sent) /
                                    (elapsed - start))
                sent_messages += send(example_builder.add_line(""))
        except Exception as exc:
            import traceback
            traceback.print_exc()
            logger.error('reading file %r %s', exc, input_file)
    return sent_messages
Example #19
def main():
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name,
                                            cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path,
                                            cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    print("Config before overwrite max_position_embeddings:", config)
    config.max_position_embeddings = 4096
    config.num_hidden_layers = 6
    config.num_attention_heads = 8
    config.hidden_size = 512
    config.intermediate_size = 2048
    print("Config after overwrite max_position_embeddings:", config)

    # if model_args.tokenizer_name:
    #     tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    # elif model_args.model_name_or_path:
    #     tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    # else:
    #     raise ValueError(
    #         "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it,"
    #         "and load it from here, using --tokenizer_name"
    #     )

    logging.info("Loading tokenizer")
    if model_args.tokenizer_name:
        tokenizer = BertTokenizerFast(model_args.tokenizer_name,
                                      clean_text=True,
                                      lowercase=False,
                                      strip_accents=True)
    else:
        raise ValueError("Specify tokenizer name")

    logging.info("Loading model")
    if model_args.model_name_or_path:
        model = AutoModelWithLMHead.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelWithLMHead.from_config(config)

    logging.info("Resizing embeddings")
    model.resize_token_embeddings(len(tokenizer))
    print(len(tokenizer.get_vocab()), len(tokenizer))

    if config.model_type in ["bert", "roberta", "distilbert", "camembert"
                             ] and not data_args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the "
            "--mlm flag (masked language modeling).")

    # Get datasets
    logging.info("Loading train dataset")
    train_dataset = get_dataset(data_args) if training_args.do_train else None
    logging.info("Loading eval dataset")
    eval_dataset = (get_dataset(
        data_args,
        evaluate=True,
    ) if training_args.do_eval else None)
    if config.model_type == "xlnet":
        data_collator = DataCollatorForPermutationLanguageModeling(
            tokenizer=tokenizer,
            plm_probability=data_args.plm_probability,
            max_span_length=data_args.max_span_length,
        )
    else:
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=data_args.mlm,
            mlm_probability=data_args.mlm_probability,
        )

    # Initialize our Trainer
    logging.info("Initializing trainer")
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        prediction_loss_only=True,
    )

    # Training
    if training_args.do_train:
        logging.info("Training")
        model_path = (model_args.model_name_or_path
                      if model_args.model_name_or_path is not None
                      and os.path.isdir(model_args.model_name_or_path) else
                      None)
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        result = {"perplexity": perplexity}

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results_lm.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

        results.update(result)

    return results
Example #20
def main():
    # initialize the arguments
    args = set_args()

    # select which GPUs to use for training
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device

    args.cuda = not args.no_cuda

    if args.batch_size < 2048 and args.warmup_steps <= 4000:
        print('[Warning] The warmup steps may be not enough.\n' \
              '(sz_b, warmup) = (2048, 4000) is the official setting.\n' \
              'Using smaller batch w/o longer warmup may cause ' \
              'the warmup stage ends with only little data trained.')

    # create the logger
    logger = create_logger(args)
    # use the GPU when it is requested and available
    args.cuda = torch.cuda.is_available() and not args.no_cuda
    device = 'cuda:0' if args.cuda else 'cpu'
    args.device = device
    logger.info('using device:{}'.format(device))

    # initialize the tokenizer
    tokenizer = BertTokenizerFast(vocab_file=args.vocab_path,
                                  sep_token="[SEP]",
                                  pad_token="[PAD]",
                                  cls_token="[CLS]")
    args.sep_id = tokenizer.sep_token_id
    args.pad_id = tokenizer.pad_token_id
    args.cls_id = tokenizer.cls_token_id

    # create the output directory for the model
    if not os.path.exists(args.save_model_path):
        os.mkdir(args.save_model_path)

    # create the model
    if args.pretrained_model:  # load a pretrained model
        model = GPT2LMHeadModel.from_pretrained(args.pretrained_model)
    else:  # initialize a new model from the config
        model_config = GPT2Config.from_json_file(args.model_config)
        model = GPT2LMHeadModel(config=model_config)
    model = model.to(device)
    logger.info('model config:\n{}'.format(model.config.to_json_string()))
    assert model.config.vocab_size == tokenizer.vocab_size

    # multi-GPU data-parallel training
    if args.cuda and torch.cuda.device_count() > 1:
        model = DataParallel(model).cuda()
        # model = BalancedDataParallel(args.gpu0_bsz, model, dim=0).cuda()
        logger.info("use GPU {} to train".format(args.device))

    # count the model parameters
    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    logger.info('number of model parameters: {}'.format(num_parameters))

    # log the argument settings
    logger.info("args:{}".format(args))

    # load the training and validation sets
    # ========= Loading Dataset ========= #
    train_dataset, validate_dataset = load_dataset(logger, args)

    train(model, logger, train_dataset, validate_dataset, args)
Example #21
    # the opening lines of this snippet are missing; presumably a tokenizers
    # BertWordPieceTokenizer was being constructed, roughly:
    wp_tokenizer = BertWordPieceTokenizer(
        lowercase=False,
    )

    wp_tokenizer.train(
        files='/opt/ml/code/KBOBERT/KBOBERT_Data.txt',
        vocab_size=32000,
        min_frequency=2,
        show_progress=True,
        special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
        wordpieces_prefix="##")

    wp_tokenizer.save_model('./')

    tokenizer = BertTokenizerFast(
        vocab_file="/opt/ml/code/KBOBERT/vocab.txt",
        max_len=512,
        do_lower_case=False,
    )

    tokenizer.add_special_tokens({'mask_token': '[MASK]'})

    # https://huggingface.co/transformers/model_doc/bert.html#bertconfig

    config = BertConfig(vocab_size=32000,
                        hidden_size=256,
                        num_hidden_layers=6,
                        num_attention_heads=4,
                        intermediate_size=3072,
                        hidden_act="gelu",
                        hidden_dropout_prob=0.1,
                        attention_probs_dropout_prob=0.1,
Example #22
    def __init__(self, train=True):
        if train:
            path = "/data/KorQuAD_v1.0_train.json"
            db_name = "korquad_train.qas"
        else:
            path = "/data/KorQuAD_v1.0_dev.json"
            db_name = "korquad_dev.qas"
        self.tokenizer = BertTokenizerFast("wiki-vocab.txt")

        data = json.load(open(path, encoding="utf-8"))["data"]

        self.qas = []
        if not os.path.exists(db_name):
            with open(db_name, "wb") as f:
                self.mecab = Mecab()
                ignored_cnt = 0
                for paragraphs in tqdm(data):
                    paragraphs = paragraphs["paragraphs"]
                    for paragraph in paragraphs:
                        _context = paragraph["context"]
                        for qa in paragraph["qas"]:
                            question = qa["question"]
                            answer = qa["answers"][0]["text"]
                            (
                                input_ids,
                                token_type_ids,
                                start_token_pos,
                                end_token_pos,
                            ) = self.extract_features(
                                _context,
                                question,
                                answer,
                                qa["answers"][0]["answer_start"],
                            )
                            if len(input_ids) > 512:
                                if not train:
                                    pickle.dump(
                                        (
                                            input_ids,
                                            token_type_ids,
                                            start_token_pos,
                                            end_token_pos,
                                        ),
                                        f,
                                    )
                            else:
                                pickle.dump(
                                    (
                                        input_ids,
                                        token_type_ids,
                                        start_token_pos,
                                        end_token_pos,
                                    ),
                                    f,
                                )

        with open(db_name, "rb") as f:
            while True:
                try:
                    data = pickle.load(f)
                    self.qas.append(data)
                except EOFError:
                    break
            print(len(self.qas))
Example #23
    @classmethod
    def setup_task(cls, args, **kwargs):
        """Setup the task (e.g., load dictionaries).
        Args:
            args (argparse.Namespace): parsed command-line arguments
        """

        # **YD** add BertTokenizerFast to be suitable for CONLL2003 NER task, pipeline is similar to
        # https://github.com/huggingface/transformers/tree/master/examples/token-classification
        # 1.obtain tokenizer and data_collator

        tokenizer = BertTokenizerFast(args.dict)
        data_collator = YD_DataCollatorForELClassification(
            tokenizer, max_length=args.max_pred_length, padding=True)

        # 2. process datasets, (tokenization of NER data)
        # **YD**, add args in option.py for fine-tuning task
        data_files = {}
        if args.train_file is not None:
            data_files["train"] = args.train_file
        if args.validation_file is not None:
            data_files["validation"] = args.validation_file
        if args.test_file is not None:
            data_files["test"] = args.test_file
        extension = args.extension_file
        dataset = datasets.load_dataset(extension, data_files=data_files)

        # 3. setup num_labels
        if 'train' in dataset:
            column_names = dataset["train"].column_names
            features = dataset["train"].features
        elif 'validation' in dataset:
            column_names = dataset["validation"].column_names
            features = dataset["validation"].features
        elif 'test' in dataset:
            column_names = dataset["test"].column_names
            features = dataset["test"].features
        else:
            raise ValueError(
                'dataset must contain "train"/"validation"/"test"')

        text_column_name = 'tokens'
        label_column_name = 'ner_tags'
        entity_column_name = 'entity_names'

        assert text_column_name in column_names
        assert label_column_name in column_names
        assert entity_column_name in column_names

        if isinstance(features[label_column_name].feature, ClassLabel):
            label_list = features[label_column_name].feature.names
            # No need to convert the labels since they are already ints.
            label_to_id = {i: i for i in range(len(label_list))}
        else:
            if 'train' in dataset:
                label_list = get_label_list(
                    dataset["train"][label_column_name])
            elif 'validation' in dataset:
                label_list = get_label_list(
                    dataset["validation"][label_column_name])
            elif 'test' in dataset:
                label_list = get_label_list(
                    dataset["test"][label_column_name])
            else:
                raise ValueError(
                    'dataset must contain "train"/"validation"/"test"')

            label_to_id = {l: i for i, l in enumerate(label_list)}

        num_labels = len(label_list)

        # **YD** preparing ent_name_id from deep_ed to
        # transform (entity name or entity wikiid) to thid (entity embedding lookup index)
        ent_name_id = EntNameID(args)

        # 4. tokenization
        # Tokenize all texts and align the labels with them.
        # Tokenize all texts and align the **NER_labels and Entity_labels** with them.
        # **YD** only mention (GT label= 'B' or 'I') is considered to do entity disambiguation task.
        # in training time, if certain entity in dictionary, it is labeled with correct entity id.
        # if certain entity is not in dictionary, or certain mention has no corresponding entity,
        # it is labelled with incorrect entity.

        # in inference time, NER label together with ED label to do evaluation.
        # if certain token labels with 'B' and has not unknown predicted entity, it is predicted with entity. The mention
        # part is decided with the following 'I' label.
        # otherwise, if it has unknown predicted entity, all 'B' and following 'I' becomes 'O' label.
        def tokenize_and_align_labels(examples, label_all_tokens=False):
            tokenized_inputs = tokenizer(
                examples[text_column_name],
                padding=False,
                truncation=True,
                # We use this argument because the texts in our dataset are lists of words (with a label for each word).
                is_split_into_words=True,
                return_offsets_mapping=True,
            )
            # print('tokenized_inputs', tokenized_inputs)

            offset_mappings = tokenized_inputs.pop("offset_mapping")
            labels = []
            entity_labels = []
            for label, offset_mapping, entity_label in zip(
                    examples[label_column_name], offset_mappings,
                    examples[entity_column_name]):

                label_index = 0
                current_label = -100
                label_ids = []

                current_entity_label = -100
                entity_label_ids = []
                for offset in offset_mapping:
                    # We set the label for the first token of each word.
                    # Special characters will have an offset of (0, 0)
                    # so the test ignores them.
                    if offset[0] == 0 and offset[1] != 0:
                        current_label = label_to_id[label[label_index]]
                        label_index += 1
                        label_ids.append(current_label)

                        current_entity_label = entity_label[label_index - 1]

                        if label[label_index - 1] == NER_LABEL_DICT['O']:
                            current_entity_label = -100
                        else:
                            # print(label[label_index-1])
                            # print(label_to_id)
                            assert label[label_index - 1] == NER_LABEL_DICT['B'] or label[label_index - 1] == \
                                   NER_LABEL_DICT['I']

                            if current_entity_label == _EMPTY_ENTITY_NAME or label[
                                    label_index - 1] == NER_LABEL_DICT['I']:
                                current_entity_label = -100
                            else:
                                assert label[label_index -
                                             1] == NER_LABEL_DICT['B']
                                tmp_label = ent_name_id.get_thid(
                                    ent_name_id.get_ent_wikiid_from_name(
                                        current_entity_label, True))
                                if tmp_label != ent_name_id.unk_ent_thid:
                                    current_entity_label = tmp_label
                                else:
                                    current_entity_label = _OUT_DICT_ENTITY_ID

                        entity_label_ids.append(current_entity_label)
                    # For special tokens, we set the label to -100 so it's automatically ignored in the loss function.
                    elif offset[0] == 0 and offset[1] == 0:
                        label_ids.append(-100)
                        entity_label_ids.append(-100)
                    # For the other tokens in a word, we set the label to either the current label or -100, depending on
                    # the label_all_tokens flag.
                    else:
                        label_ids.append(
                            current_label if label_all_tokens else -100)
                        entity_label_ids.append(
                            current_entity_label if label_all_tokens else -100)

                labels.append(label_ids)
                entity_labels.append(entity_label_ids)

            tokenized_inputs["labels"] = labels
            tokenized_inputs["entity_labels"] = entity_labels

            return tokenized_inputs

        tokenized_datasets = dataset.map(
            tokenize_and_align_labels,
            batched=True,
            num_proc=1,  # set 1 for faster processing
            load_from_cache_file=False,
        )

        # 5. set up dataset format and input/output pipeline of dataset
        tokenized_datasets.set_format(type='torch', columns=_EL_COLUMNS)

        # include components in args
        args.tokenized_datasets = tokenized_datasets
        args.num_labels = num_labels
        args.tokenizer = tokenizer
        args.data_collator = data_collator

        # load entity embedding and set up shape parameters
        args.EntityEmbedding = torch.load(args.ent_vecs_filename,
                                          map_location='cpu')
        args.num_entity_labels = args.EntityEmbedding.shape[0]
        args.dim_entity_emb = args.EntityEmbedding.shape[1]

        return cls(args)
Example #24
    @classmethod
    def setup_task(cls, args, **kwargs):
        """Setup the task (e.g., load dictionaries).
        Args:
            args (argparse.Namespace): parsed command-line arguments
        """

        # **YD** add BertTokenizerFast to be suitable for CONLL2003 NER task, pipeline is similar to
        # https://github.com/huggingface/transformers/tree/master/examples/token-classification
        # 1.obtain tokenizer and data_collator

        tokenizer = BertTokenizerFast(args.dict)
        data_collator = YD_DataCollatorForTokenClassification(
            tokenizer, max_length=args.max_pred_length, padding=True)

        # 2. process datasets, (tokenization of NER data)
        # **YD**, add args in option.py for fine-tuning task
        data_files = {}
        if args.train_file is not None:
            data_files["train"] = args.train_file
        if args.validation_file is not None:
            data_files["validation"] = args.validation_file
        if args.test_file is not None:
            data_files["test"] = args.test_file
        extension = args.extension_file
        dataset = datasets.load_dataset(extension, data_files=data_files)

        # 3. setup num_labels
        if 'train' in dataset:
            column_names = dataset["train"].column_names
            features = dataset["train"].features
        elif 'validation' in dataset:
            column_names = dataset["validation"].column_names
            features = dataset["validation"].features
        elif 'test' in dataset:
            column_names = dataset["test"].column_names
            features = dataset["test"].features
        else:
            raise ValueError(
                'dataset must contain "train"/"validation"/"test"')

        text_column_name = "tokens" if "tokens" in column_names else column_names[
            0]
        label_column_name = ('ner_tags' if 'ner_tags' in column_names else
                             column_names[1])

        if isinstance(features[label_column_name].feature, ClassLabel):
            label_list = features[label_column_name].feature.names
            # No need to convert the labels since they are already ints.
            label_to_id = {i: i for i in range(len(label_list))}
        else:
            if 'train' in dataset:
                label_list = get_label_list(
                    dataset["train"][label_column_name])
            elif 'validation' in dataset:
                label_list = get_label_list(
                    dataset["validation"][label_column_name])
            elif 'test' in dataset:
                label_list = get_label_list(
                    dataset["test"][label_column_name])
            else:
                raise ValueError(
                    'dataset must contain "train"/"validation"/"test"')

            label_to_id = {l: i for i, l in enumerate(label_list)}

        num_labels = len(label_list)

        # 4. tokenization
        # Tokenize all texts and align the labels with them.
        def tokenize_and_align_labels(examples, label_all_tokens=False):
            tokenized_inputs = tokenizer(
                examples[text_column_name],
                padding=False,
                truncation=True,
                # We use this argument because the texts in our dataset are lists of words (with a label for each word).
                is_split_into_words=True,
                return_offsets_mapping=True,
            )
            offset_mappings = tokenized_inputs.pop("offset_mapping")
            labels = []
            for label, offset_mapping in zip(examples[label_column_name],
                                             offset_mappings):
                label_index = 0
                current_label = -100
                label_ids = []
                for offset in offset_mapping:
                    # We set the label for the first token of each word. Special characters will have an offset of (0, 0)
                    # so the test ignores them.
                    if offset[0] == 0 and offset[1] != 0:
                        current_label = label_to_id[label[label_index]]
                        label_index += 1
                        label_ids.append(current_label)
                    # For special tokens, we set the label to -100 so it's automatically ignored in the loss function.
                    elif offset[0] == 0 and offset[1] == 0:
                        label_ids.append(-100)
                    # For the other tokens in a word, we set the label to either the current label or -100, depending on
                    # the label_all_tokens flag.
                    else:
                        label_ids.append(
                            current_label if label_all_tokens else -100)

                labels.append(label_ids)
            tokenized_inputs["labels"] = labels
            return tokenized_inputs

        tokenized_datasets = dataset.map(
            tokenize_and_align_labels,
            batched=True,
            num_proc=args.num_workers,
            load_from_cache_file=False,
        )

        # 5. set up dataset format and input/output pipeline of dataset
        tokenized_datasets.set_format(type='torch', columns=_NER_COLUMNS)

        # include components in args
        args.tokenized_datasets = tokenized_datasets
        args.num_labels = num_labels
        args.tokenizer = tokenizer
        args.data_collator = data_collator

        return cls(args)
Example #25
def main():
    args = set_args()
    logger = create_logger(args)
    # use the GPU when it is requested and available
    args.cuda = torch.cuda.is_available() and not args.no_cuda
    device = 'cuda' if args.cuda else 'cpu'
    logger.info('using device:{}'.format(device))
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    tokenizer = BertTokenizerFast(vocab_file=args.vocab_path,
                                  sep_token="[SEP]",
                                  pad_token="[PAD]",
                                  cls_token="[CLS]")
    # tokenizer = BertTokenizer(vocab_file=args.voca_path)
    model = GPT2LMHeadModel.from_pretrained(args.model_path)
    model = model.to(device)
    model.eval()
    if args.save_samples_path:
        if not os.path.exists(args.save_samples_path):
            os.makedirs(args.save_samples_path)
        samples_file = open(args.save_samples_path + '/samples.txt',
                            'a',
                            encoding='utf8')
        samples_file.write("聊天记录{}:\n".format(datetime.now()))
    # store the chat history; each utterance is stored as a list of token ids
    history = []
    print('开始和chatbot聊天,输入CTRL + Z以退出')

    while True:
        try:
            text = input("user:"******"你好"
            if args.save_samples_path:
                samples_file.write("user:{}\n".format(text))
            text_ids = tokenizer.encode(text, add_special_tokens=False)
            history.append(text_ids)
            input_ids = [tokenizer.cls_token_id]  # every input starts with [CLS]

            for history_id, history_utr in enumerate(
                    history[-args.max_history_len:]):
                input_ids.extend(history_utr)
                input_ids.append(tokenizer.sep_token_id)
            input_ids = torch.tensor(input_ids).long().to(device)
            input_ids = input_ids.unsqueeze(0)
            response = []  # the response generated from the context
            # generate at most max_len tokens
            for _ in range(args.max_len):
                outputs = model(input_ids=input_ids)
                logits = outputs.logits
                next_token_logits = logits[0, -1, :]
                # apply a repetition penalty to each token already generated, lowering its probability
                for id in set(response):
                    next_token_logits[id] /= args.repetition_penalty
                next_token_logits = next_token_logits / args.temperature
                # set the probability of [UNK] to -inf, so the model can never predict the [UNK] token
                next_token_logits[tokenizer.convert_tokens_to_ids(
                    '[UNK]')] = -float('Inf')
                filtered_logits = top_k_top_p_filtering(next_token_logits,
                                                        top_k=args.topk,
                                                        top_p=args.topp)
                # torch.multinomial samples num_samples elements from the candidates without
                # replacement; higher weights are more likely to be drawn; returns the indices
                next_token = torch.multinomial(F.softmax(filtered_logits,
                                                         dim=-1),
                                               num_samples=1)
                if next_token == tokenizer.sep_token_id:  # a [SEP] token means the response is complete
                    break
                response.append(next_token.item())
                input_ids = torch.cat((input_ids, next_token.unsqueeze(0)),
                                      dim=1)
                # his_text = tokenizer.convert_ids_to_tokens(curr_input_tensor.tolist())
                # print("his_text:{}".format(his_text))
            history.append(response)
            text = tokenizer.convert_ids_to_tokens(response)
            print("chatbot:" + "".join(text))
            if args.save_samples_path:
                samples_file.write("chatbot:{}\n".format("".join(text)))
        except KeyboardInterrupt:
            if args.save_samples_path:
                samples_file.close()
            break
Example #26
# the opening of this snippet is missing; presumably a BertConfig was being built, roughly:
config = BertConfig(
    vocab_size=task_id_vocab_size,
    hidden_size=128,
    num_hidden_layers=2,
    num_attention_heads=2,
    max_position_embeddings=512,
    type_vocab_size=1,
)

# tokenizer._tokenizer.post_processor = BertProcessing(
#     ("</s>", tokenizer.token_to_id("</s>")),
#     ("<s>", tokenizer.token_to_id("<s>")),
# )
# tokenizer.enable_truncation(max_length=512)
# tokenizer = BertTokenizerFast.from_pretrained("./tmp", max_len=512)
uid_task_id_sequence_path = 'data/feature_sequence/uid_task_id.txt'  # used by the dataset below
tokenizer = BertTokenizerFast('data/bert_and_tokenizer/uid_task_id-vocab.txt')

model = BertForMaskedLM(config=config)

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=uid_task_id_sequence_path,
    block_size=512,  # maximum sequence length
)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                mlm=True,
                                                mlm_probability=0.15)

training_args = TrainingArguments(
    output_dir="./tmp",
示例#27
0
            torch.tensor(all_attention_mask, dtype=torch.long),
            torch.tensor(all_token_type_ids, dtype=torch.long),
            torch.tensor(all_label_ids, dtype=torch.long))
        sampler = RandomSampler(dataset)
        dataloader = DataLoader(dataset,
                                batch_size=config.batch_size,
                                sampler=sampler)
        return dataloader

    @staticmethod
    def load_dataloader(tokenizer, file_path, verbose=False):
        samples = DataProcessor._read_file(file_path)
        dataloader = DataProcessor._build_dataloader(samples, tokenizer,
                                                     verbose)
        return dataloader


if __name__ == '__main__':
    print(
        max([
            len(x.sent) for x in DataProcessor._read_file('pku_training.utf8')
        ]))  # 1019
    from transformers import BertTokenizerFast
    tokenizer = BertTokenizerFast(config.bert_tokenizer_path)
    dataloader = DataProcessor.load_dataloader(tokenizer,
                                               'pku_training.utf8',
                                               verbose=True)
    for batch in dataloader:
        a, b, c, d = batch
        print(a, b, c, d)
        break
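The first half of this example (reading the corpus and building the feature tensors) is not shown. A rough sketch of code that produces tensors of the shape used above; the batch encoding and the placeholder labels are assumptions standing in for the real segmentation tags:

import torch
from torch.utils.data import TensorDataset, RandomSampler, DataLoader

def build_features(samples, tokenizer, max_len=512):
    # batch-encode the raw sentences; x.sent matches the attribute used above
    enc = tokenizer([x.sent for x in samples],
                    padding='max_length',
                    truncation=True,
                    max_length=max_len)
    all_input_ids = enc['input_ids']
    all_attention_mask = enc['attention_mask']
    all_token_type_ids = enc['token_type_ids']
    # real code would derive per-token label ids here; zeros are placeholders
    all_label_ids = [[0] * max_len for _ in samples]
    return all_input_ids, all_attention_mask, all_token_type_ids, all_label_ids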
import io
import argparse
import torch
from transformers import BertTokenizerFast, BertForTokenClassification
from flask import Flask, jsonify, request
from server.utils import preprocess_data, predict, idx2tag

app = Flask(__name__)
app.config['JSON_SORT_KEYS'] = False

MAX_LEN = 500
NUM_LABELS = 12
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MODEL_PATH = 'bert-base-uncased'
STATE_DICT = torch.load("model-state.bin", map_location=DEVICE)
TOKENIZER = BertTokenizerFast("./vocab/vocab.txt", do_lower_case=True)

model = BertForTokenClassification.from_pretrained(
    MODEL_PATH,
    state_dict=STATE_DICT['model_state_dict'],
    num_labels=NUM_LABELS)
model.to(DEVICE)


@app.route('/predict', methods=['POST'])
def predict_api():
    if request.method == 'POST':
        data = io.BytesIO(request.files.get('resume').read())
        resume_text = preprocess_data(data)
        entities = predict(model, TOKENIZER, idx2tag, DEVICE, resume_text,
                           MAX_LEN)
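The request handler is cut off here. A minimal sketch of how such a route usually finishes, assuming predict returns a JSON-serialisable structure:

        return jsonify(entities)


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)  # port is an assumption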
def create_trelm_roberta_model(pretrained_model_path, vocab_path,
                               do_lower_case, vocab_emb_path, vocab_emb_type,
                               save_model_to, langid_list):

    tokenizer = BertTokenizerFast(vocab_path, do_lower_case=do_lower_case)

    vocab_emb_weights = None
    if vocab_emb_type == 'pth':
        vocab_emb_data = torch.load(vocab_emb_path)
        vocab_emb_weights = vocab_emb_data['vectors']
        assert tokenizer.vocab_size == vocab_emb_weights.size(0)
    elif vocab_emb_type == 'word2vec':
        wv_model = KeyedVectors.load_word2vec_format(vocab_emb_path)
        vocab_emb_weights = torch.FloatTensor(wv_model.vectors)
        assert tokenizer.vocab_size == vocab_emb_weights.size(0)

    model = TrelmRobertaForMaskedLM.from_pretrained(pretrained_model_path)

    if vocab_emb_weights is not None:
        assert model.config.hidden_size == vocab_emb_weights.size(1)

    # set the hyperparameters
    model.config.vocab_size = tokenizer.vocab_size
    model.config.pad_token_id = tokenizer.pad_token_id
    # model.config.bos_token_id = tokenizer.bos_token_id
    # model.config.eos_token_id = tokenizer.eos_token_id
    model.config.max_position_embeddings = model.config.max_position_embeddings - 1  # drop one position slot; the new table below is filled from the old rows [1:]
    model.config.model_type = 'trelm_roberta'
    model.config.architectures = ['TrelmRobertaForMaskedLM']
    model.config.type_vocab_size = 2
    model.config.n_langs = 2
    model.config.langs_to_id = {
        langid: idx
        for idx, langid in enumerate(langid_list)
    }

    # initial the word embeddings
    model.trelm_roberta.embeddings.word_embeddings = nn.Embedding(
        tokenizer.vocab_size,
        model.config.hidden_size,
        padding_idx=model.config.pad_token_id)
    if vocab_emb_weights is not None:
        model.trelm_roberta.embeddings.word_embeddings.weight.data.copy_(
            vocab_emb_weights)
    else:
        logger.info('word_embeddings random initialized!')
        model.trelm_roberta.embeddings.word_embeddings.weight.data.normal_(
            mean=0.0, std=model.config.initializer_range)

    # reset lm_head
    delattr(model, "lm_head")

    # initial the position embeddings
    old_position_emb_weight = model.trelm_roberta.embeddings.position_embeddings.weight.data
    model.trelm_roberta.embeddings.position_embeddings = nn.Embedding(
        model.config.max_position_embeddings,
        model.config.hidden_size,
        padding_idx=model.config.pad_token_id)
    model.trelm_roberta.embeddings.position_embeddings.weight.data.copy_(
        old_position_emb_weight[1:])
    model.trelm_roberta.embeddings.position_ids = torch.arange(
        model.config.max_position_embeddings).expand((1, -1))

    # initial lang embeddings?

    # initial type embeddings
    new_token_type_embeddings = model.trelm_roberta.embeddings.token_type_embeddings.weight.new_empty(
        model.config.type_vocab_size, model.config.hidden_size)
    new_token_type_embeddings[
        0, :] = model.trelm_roberta.embeddings.token_type_embeddings.weight
    model.trelm_roberta.embeddings.token_type_embeddings.weight.data = new_token_type_embeddings

    # initial the translation layer
    layer = model.trelm_roberta.encoder.layer[int(
        model.config.num_hidden_layers / 2)]

    model.trelm_roberta.encoder.tlayer.attention.self.query.weight = layer.attention.self.query.weight
    model.trelm_roberta.encoder.tlayer.attention.self.query.bias = layer.attention.self.query.bias
    model.trelm_roberta.encoder.tlayer.attention.self.key.weight = layer.attention.self.key.weight
    model.trelm_roberta.encoder.tlayer.attention.self.key.bias = layer.attention.self.key.bias
    model.trelm_roberta.encoder.tlayer.attention.self.value.weight = layer.attention.self.value.weight
    model.trelm_roberta.encoder.tlayer.attention.self.value.bias = layer.attention.self.value.bias

    model.trelm_roberta.encoder.tlayer.attention.output.dense.weight = layer.attention.output.dense.weight
    model.trelm_roberta.encoder.tlayer.attention.output.dense.bias = layer.attention.output.dense.bias
    model.trelm_roberta.encoder.tlayer.attention.output.LayerNorm.weight = layer.attention.output.LayerNorm.weight
    model.trelm_roberta.encoder.tlayer.attention.output.LayerNorm.bias = layer.attention.output.LayerNorm.bias

    model.trelm_roberta.encoder.tlayer.intermediate.dense.weight = layer.intermediate.dense.weight
    model.trelm_roberta.encoder.tlayer.intermediate.dense.bias = layer.intermediate.dense.bias

    model.trelm_roberta.encoder.tlayer.output.dense.weight = layer.output.dense.weight
    model.trelm_roberta.encoder.tlayer.output.dense.bias = layer.output.dense.bias
    model.trelm_roberta.encoder.tlayer.output.LayerNorm.weight = layer.output.LayerNorm.weight
    model.trelm_roberta.encoder.tlayer.output.LayerNorm.bias = layer.output.LayerNorm.bias

    logger.info(f'saving model to {save_model_to}')
    model.save_pretrained(save_model_to)
    tokenizer.save_pretrained(save_model_to)
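A hypothetical invocation of create_trelm_roberta_model, just to show the expected argument types; every path and value below is an assumption:

create_trelm_roberta_model(
    pretrained_model_path='roberta-base',        # assumed source model
    vocab_path='vocab/target-vocab.txt',         # assumed target-language vocab
    do_lower_case=False,
    vocab_emb_path='vocab/target-emb.vec',       # assumed word2vec-format embeddings
    vocab_emb_type='word2vec',
    save_model_to='trelm-roberta-init',
    langid_list=['en', 'zh'],                    # assumed language ids
)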
            encoder outputs from Encoder, in shape (T,B,H)
        :param src_len:
            used for masking. NoneType or tensor in shape (B) indicating sequence length
        :return
            the attention-weighted sum of x over dimension 1 (a pooled representation)
        '''
        att = self.attn(x)
        att = torch.tanh(att)  # F.tanh is deprecated
        att = F.softmax(att, 1)
        att_x = att * x
        return att_x.sum(1)
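Only the tail of this attention module survives in the example. A minimal sketch of the module it plausibly belongs to; the class name, the constructor, and the size of the scoring layer are assumptions:

import torch
import torch.nn as nn
import torch.nn.functional as F


class AttentionPooling(nn.Module):  # hypothetical name
    def __init__(self, hidden_size):
        super().__init__()
        # one linear layer that scores every timestep (output size is an assumption)
        self.attn = nn.Linear(hidden_size, hidden_size)

    def forward(self, x, src_len=None):
        '''
        :param x: encoder outputs, assumed here to be (B, T, H)
        '''
        att = self.attn(x)
        att = torch.tanh(att)
        att = F.softmax(att, 1)   # normalise over the time dimension
        att_x = att * x           # weight each timestep
        return att_x.sum(1)       # pooled representation, (B, H)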


# In[6]:

tokenizer = BertTokenizerFast('../model_weight/nezha/vocab.txt')
test_set = CustomDataset(test,
                         maxlen=128,
                         tokenizer=tokenizer,
                         with_labels=False)
test_loader = Data.DataLoader(test_set,
                              batch_size=batch_size,
                              num_workers=5,
                              shuffle=False)

train_set = CustomDataset(train, maxlen=128, tokenizer=tokenizer)
train_loader = Data.DataLoader(train_set,
                               batch_size=batch_size,
                               num_workers=5,
                               shuffle=True)
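
CustomDataset is not defined in this snippet. A rough sketch of a dataset with the same constructor signature, assuming the data is a pandas DataFrame; the column names 'text' and 'label' are assumptions:

import torch
from torch.utils.data import Dataset


class CustomDataset(Dataset):
    def __init__(self, data, maxlen, tokenizer, with_labels=True):
        self.data = data.reset_index(drop=True)
        self.maxlen = maxlen
        self.tokenizer = tokenizer
        self.with_labels = with_labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        enc = self.tokenizer(self.data.loc[idx, 'text'],
                             padding='max_length',
                             truncation=True,
                             max_length=self.maxlen,
                             return_tensors='pt')
        item = (enc['input_ids'].squeeze(0),
                enc['attention_mask'].squeeze(0),
                enc['token_type_ids'].squeeze(0))
        if self.with_labels:
            label = torch.tensor(self.data.loc[idx, 'label'], dtype=torch.long)
            return item + (label,)
        return item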