Example #1
    def __init__(
        self,
        token_indexers: Dict[str, TokenIndexer] = None,
        domain_identifier: str = None,
        bert_model_name: str = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        self._domain_identifier = domain_identifier

        if bert_model_name is not None:
            self.bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name)
            self.lowercase_input = "uncased" in bert_model_name
        else:
            self.bert_tokenizer = None
            self.lowercase_input = False
Example #2
    def setup_class(self):
        self.use_gpu = torch.cuda.is_available()
        self.test_dir = Path(tempfile.mkdtemp())

        self.base_tokenizer = BertTokenizer.from_pretrained(
            'bert-base-uncased', do_lower_case=True, cache_dir=self.test_dir)
        self.rust_tokenizer = PyBertTokenizer(get_from_cache(
            self.base_tokenizer.pretrained_vocab_files_map['vocab_file']
            ['bert-base-uncased']),
                                              do_lower_case=True)
        self.model = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased', output_attentions=False).eval()
        if self.use_gpu:
            self.model.cuda()
        self.sentence_list = [
            'For instance, on the planet Earth, man had always assumed that he was more intelligent '
            'than dolphins because he had achieved so much—the wheel, New York, wars and so on—whilst'
            ' all the dolphins had ever done was muck about in the water having a good time. But '
            'conversely, the dolphins had always believed that they were far more intelligent than '
            'man—for precisely the same reasons.'
        ] * 64

        # Pre-allocate GPU memory
        tokens_list = [
            self.base_tokenizer.tokenize(sentence)
            for sentence in self.sentence_list
        ]
        features = [
            self.base_tokenizer.convert_tokens_to_ids(tokens)
            for tokens in tokens_list
        ]
        features = [
            self.base_tokenizer.prepare_for_model(input,
                                                  None,
                                                  add_special_tokens=True,
                                                  max_length=128)
            for input in features
        ]
        all_input_ids = torch.tensor([f['input_ids'] for f in features],
                                     dtype=torch.long)

        if self.use_gpu:
            all_input_ids = all_input_ids.cuda()

        with torch.no_grad():
            _ = self.model(all_input_ids)[0].cpu().numpy()
Example #3
    def setUp(self):

        self.monkeypatch = MonkeyPatch()
        # monkeypatch the PretrainedBertModel to return the tiny test fixture model
        config_path = self.FIXTURES_ROOT / "bert" / "config.json"
        vocab_path = self.FIXTURES_ROOT / "bert" / "vocab.txt"
        config = BertConfig.from_json_file(config_path)
        self.monkeypatch.setattr(BertModel, "from_pretrained", lambda _: BertModel(config))
        self.monkeypatch.setattr(
            BertTokenizer, "from_pretrained", lambda _: BertTokenizer(vocab_path)
        )

        super().setUp()
        self.set_up_model(
            self.FIXTURES_ROOT / "bert_srl" / "experiment.jsonnet",
            self.FIXTURES_ROOT / "conll_2012",
        )
Example #4
def load_dataset(args, model_name_or_path, type):

    # Only used to define the variables; the actual values are assigned below
    input_file_name_or_path = ''
    max_seq_len = 0
    batch_size = 0

    tokenizer = BertTokenizer.from_pretrained(model_name_or_path)

    pro = processer()
    labellist = pro.get_labels()

    if type == 'train':
        input_file_name_or_path = os.path.join(args.train_file_path,
                                               'train.txt')
        max_seq_len = args.train_max_seq_len
        batch_size = args.train_batch_size

    elif type == 'valid':
        input_file_name_or_path = os.path.join(args.valid_file_path,
                                               'valid.txt')
        max_seq_len = args.valid_max_seq_len
        batch_size = args.valid_batch_size

    elif type == 'test':
        input_file_name_or_path = os.path.join(args.predict_file_path,
                                               'test.txt')
        max_seq_len = args.predict_max_seq_len
        batch_size = args.predict_batch_size

    data = pro.read_txt(filename=input_file_name_or_path)
    examples = pro.create_examples(data=data, type=type)
    features = pro.convert_examples_to_features(examples=examples,
                                                tokenizer=tokenizer,
                                                max_length=max_seq_len,
                                                label_list=labellist,
                                                output_mode='classification')
    dataset = pro.create_dataset(features=features)

    sampler = SequentialSampler(dataset)  # sequential sampling
    dataloader = DataLoader(dataset=dataset,
                            sampler=sampler,
                            batch_size=batch_size,
                            collate_fn=collate_fn)

    return data, dataloader
Example #5
def train_BertSERModel(num_epochs, batch_size, model_file_name, lr, is_hinge=False, train_documents=None):
    dataset_reader = SerSentenceDataset
    
    tokenizer = BertTokenizer.from_pretrained(bert_model_name)
    pretrained_bert = BertModel.from_pretrained(bert_model_name)
    pretrained_bert.eval()
    
    bert_encoder = BertModel.from_pretrained(bert_model_name)
    model = BertSERModel(bert_encoder)
    if is_hinge:
        model.criterion = torch.nn.HingeEmbeddingLoss()
    
    collate_fn = Sent_collate
    indexer = SentIdx(tokenizer, pretrained_bert)
    input_names = ['input_ids', 'token_type_ids', 'attention_mask', 'label']
    trainer = SER_Trainer(model, collate_fn, indexer, dataset_reader, input_names, lr, is_hinge=is_hinge)
    trainer.train(num_epochs, batch_size, model_file_name, train_documents=train_documents)
Example #6
    def __init__(self, save_data_name, save_model_dir):

        self.num_layer = 4
        self.num_iter = 2
        self.labelcomment = ''
        self.resultfile = 'resultfile'
        self.lr = 0.0015
        self.use_biword = True
        self.use_char = True
        self.model_type = 'lstm'
        self.hidden_dim = 300
        self.use_count = True
        self.gpu = False
        self.data = self.load_data(save_data_name)
        self.model = self.load_model(save_model_dir)
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-chinese',
                                                       do_lower_case=True)
Example #7
def train_sgroup_model(num_epochs, batch_size, model_file_name, lr, is_hinge=False, is_score=False):
    dataset_reader = SerSGroupDataset
    
    tokenizer = BertTokenizer.from_pretrained(bert_model_name)
    pretrained_bert = BertModel.from_pretrained(bert_model_name)
    pretrained_bert.eval()
    
    model = SGroupModel.from_pretrained(bert_model_name)
    if is_hinge:
        model.criterion = torch.nn.HingeEmbeddingLoss()
    
    collate_fn = SGroup_collate
    indexer = SGroupIdx(tokenizer, pretrained_bert)
    input_names = ['input_ids', 'token_type_ids', 'attention_mask',
                   'tf_type', 'idf_type', 'sf_score', 'atype_ent_match']
    trainer = SER_Trainer(model, collate_fn, indexer, dataset_reader, input_names, lr, is_hinge=is_hinge)
    trainer.train(num_epochs, batch_size, model_file_name, is_score=is_score)
Example #8
def get_bert_tokenizer(pretrained_cfg_name: str, biencoder: BiEncoder, do_lower_case: bool = True):
    """If needed, this tokenizer will be added one special token [QST] representing the question token"""
    tokenizer = BertTokenizer.from_pretrained(
        pretrained_cfg_name, do_lower_case=do_lower_case
    )
    # Add [QST] token
    encoder_embeddings = biencoder.question_model.resize_token_embeddings()
    before = encoder_embeddings.weight.shape
    tokenizer.add_special_tokens({"additional_special_tokens": ["[QST]"]})

    with torch.no_grad():
        encoder_embeddings = biencoder.question_model.resize_token_embeddings(len(tokenizer))
        encoder_embeddings.weight[-1, :] = encoder_embeddings.weight[tokenizer.cls_token_id, :].detach().clone()  # initialize with the [CLS] embedding
    assert biencoder.ctx_model.resize_token_embeddings().weight.shape[0] == encoder_embeddings.weight.shape[0], \
        "Context and question encoders are not the same!"
    logger.info(f"Added [QST] token: before: {tuple(before)}, after: {tuple(encoder_embeddings.weight.shape)}")

    return tokenizer
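The snippet above follows the usual Hugging Face pattern: register an extra special token on the tokenizer, then grow the model's embedding matrix to match and initialize the new row. Below is a minimal sketch of that pattern using a plain BertModel instead of the project-specific BiEncoder; the model name, the [QST] token, and the [CLS]-based initialization are illustrative assumptions, not part of the original example.

import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
model = BertModel.from_pretrained("bert-base-uncased")

before = model.get_input_embeddings().weight.shape
tokenizer.add_special_tokens({"additional_special_tokens": ["[QST]"]})

with torch.no_grad():
    # resize_token_embeddings returns the (enlarged) input embedding module
    embeddings = model.resize_token_embeddings(len(tokenizer))
    # initialize the newly added row with the [CLS] embedding, as in the example above
    embeddings.weight[-1, :] = embeddings.weight[tokenizer.cls_token_id, :].clone()

print(f"Embedding matrix: before {tuple(before)}, after {tuple(embeddings.weight.shape)}")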
Example #9
    def gen_tokenize_method(self, split_type, user_dict=None, bert_vocab=None):
        lower_split_type = split_type.lower()

        if lower_split_type == "char":
            return self._char_split

        if lower_split_type == "word":
            jieba.setLogLevel(20)
            if user_dict is not None:
                jieba.load_userdict(user_dict)
            return self._word_split

        if lower_split_type == "word_piece":
            bert_vocab = bert_vocab or self.local_bert
            tokenizer = BertTokenizer.from_pretrained(bert_vocab)
            return partial(self._piece_split, tokenizer)

        raise TypeError(f"error tokenize type: {split_type}")
Example #10
def nama_generate(length, fix_word):
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")

    tokenizer = BertTokenizer.from_pretrained('./ML/company_model',
                                              do_lower_case=False,
                                              tokenize_chinese_chars=False)

    company_name = generate(tokenizer,
                            device,
                            max_iter=20,
                            length=length,
                            model='./ML/company_model',
                            fix_word=fix_word,
                            samples=1)
    print('---------------')
    print(company_name)
    return company_name
Example #11
    def __init__(self, device, serial_model_path, par_model_path):
        self.device = device

        pretrained_path = 'cl-tohoku/bert-base-japanese-whole-word-masking'
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_path,
                                                       do_lower_case=False)
        config = BertConfig.from_pretrained(pretrained_path)
        config.num_labels = 4
        self.serial_model = BertForSequenceClassification(config)
        config.num_labels = 2
        self.par_model = BertForSequenceClassification(config)

        self.serial_model.load_state_dict(torch.load(serial_model_path))
        self.serial_model.to(self.device)
        self.serial_model.eval()
        self.par_model.load_state_dict(torch.load(par_model_path))
        self.par_model.to(self.device)
        self.par_model.eval()
Example #12
    def load(cls,
             pretrained_model_name_or_path,
             tokenizer_class=None,
             **kwargs):
        """
        Enables loading of different Tokenizer classes with a uniform interface. Either infer the class from
        `pretrained_model_name_or_path` or define it manually via `tokenizer_class`.

        :param pretrained_model_name_or_path:  The path of the saved pretrained model or its name (e.g. `bert-base-uncased`)
        :type pretrained_model_name_or_path: str
        :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`)
        :type tokenizer_class: str
        :param kwargs:
        :return: Tokenizer
        """
        # guess tokenizer type from name
        if tokenizer_class is None:
            if "albert" in pretrained_model_name_or_path.lower():
                tokenizer_class = "AlbertTokenizer"
            elif "roberta" in pretrained_model_name_or_path.lower():
                tokenizer_class = "RobertaTokenizer"
            elif "bert" in pretrained_model_name_or_path.lower():
                tokenizer_class = "BertTokenizer"
            elif "xlnet" in pretrained_model_name_or_path.lower():
                tokenizer_class = "XLNetTokenizer"
            else:
                raise ValueError(
                    f"Could not infer the tokenizer class from name '{pretrained_model_name_or_path}'. "
                    "Set arg `tokenizer_class` in Tokenizer.load() to one of: "
                    "'AlbertTokenizer', 'RobertaTokenizer', 'BertTokenizer', 'XLNetTokenizer'."
                )
            logger.info(f"Loading tokenizer of type '{tokenizer_class}'")
        # return appropriate tokenizer object
        # TODO raise error if this does not return a tokenizer
        if tokenizer_class == "AlbertTokenizer":
            return AlbertTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        if tokenizer_class == "RobertaTokenizer":
            return RobertaTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "BertTokenizer":
            return BertTokenizer.from_pretrained(pretrained_model_name_or_path,
                                                 **kwargs)
        elif tokenizer_class == "XLNetTokenizer":
            return XLNetTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
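Since the load classmethod above only inspects the lowercased model name, a call site stays short. A hypothetical usage sketch (assuming this class is exposed as Tokenizer and that the local checkpoint path exists; both are illustrative):

# Hypothetical usage of the uniform loader above.
bert_tok = Tokenizer.load("bert-base-uncased")       # name contains "bert"    -> BertTokenizer
roberta_tok = Tokenizer.load("roberta-base")         # name contains "roberta" -> RobertaTokenizer
xlnet_tok = Tokenizer.load("my/local/checkpoint", tokenizer_class="XLNetTokenizer")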
Example #13
def bertTokenizer(*args, **kwargs):
    """
    Instantiate a BertTokenizer from a pre-trained/customized vocab file
    Args:
    pretrained_model_name_or_path: Path to pretrained model archive
                                   or one of pre-trained vocab configs below.
                                       * bert-base-uncased
                                       * bert-large-uncased
                                       * bert-base-cased
                                       * bert-large-cased
                                       * bert-base-multilingual-uncased
                                       * bert-base-multilingual-cased
                                       * bert-base-chinese
    Keyword args:
    cache_dir: an optional path to a specific directory to download and cache
               the pre-trained model weights.
               Default: None
    do_lower_case: Whether to lower case the input.
                   Only has an effect when do_wordpiece_only=False
                   Default: True
    do_basic_tokenize: Whether to do basic tokenization before wordpiece.
                       Default: True
    max_len: An artificial maximum length to truncate tokenized sequences to;
             Effective maximum length is always the minimum of this
             value (if specified) and the underlying BERT model's
             sequence length.
             Default: None
    never_split: List of tokens which will never be split during tokenization.
                 Only has an effect when do_wordpiece_only=False
                 Default: ["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"]

    Example:
        >>> import torch
        >>> sentence = 'Hello, World!'
        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        >>> toks = tokenizer.tokenize(sentence)
        ['Hello', '##,', 'World', '##!']
        >>> ids = tokenizer.convert_tokens_to_ids(toks)
        [8667, 28136, 1291, 28125]
    """
    tokenizer = BertTokenizer.from_pretrained(*args, **kwargs)
    return tokenizer
Example #14
def prepare_ref(lines: List[str], ltp_tokenizer: LTP,
                bert_tokenizer: BertTokenizer):
    ltp_res = []

    for i in range(0, len(lines), 100):
        res = ltp_tokenizer.seg(lines[i:i + 100])[0]
        res = [get_chinese_word(r) for r in res]
        ltp_res.extend(res)
    assert len(ltp_res) == len(lines)

    bert_res = []
    for i in range(0, len(lines), 100):
        res = bert_tokenizer(lines[i:i + 100],
                             add_special_tokens=True,
                             truncation=True,
                             max_length=512)
        bert_res.extend(res["input_ids"])
    assert len(bert_res) == len(lines)

    ref_ids = []
    for input_ids, chinese_word in zip(bert_res, ltp_res):

        input_tokens = []
        for id in input_ids:
            token = bert_tokenizer._convert_id_to_token(id)
            input_tokens.append(token)
        input_tokens = add_sub_symbol(input_tokens, chinese_word)
        ref_id = []
        # Only save the positions of Chinese subwords that start with ##, i.e. pieces that are part of a whole word.
        for i, token in enumerate(input_tokens):
            if token[:2] == "##":
                clean_token = token[2:]
                # save chinese tokens' pos
                if len(clean_token) == 1 and _is_chinese_char(
                        ord(clean_token)):
                    ref_id.append(i)
        ref_ids.append(ref_id)

    assert len(ref_ids) == len(bert_res)

    return ref_ids
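The comment inside the loop above is the key idea: a position is recorded only when the BERT piece starts with "##" and its remainder is a single Chinese character, so whole-word masking can later mask those pieces together with the preceding piece. A tiny self-contained illustration of that convention (hand-made token list; no LTP or BertTokenizer involved, and _is_cjk is a simplified stand-in for _is_chinese_char):

def _is_cjk(ch: str) -> bool:
    # Rough CJK Unified Ideographs range; enough for this demo, not a full check.
    return "\u4e00" <= ch <= "\u9fff"

input_tokens = ["[CLS]", "模", "##型", "好", "[SEP]"]  # "##型" marks a subword of the word 模型
ref_id = [i for i, tok in enumerate(input_tokens)
          if tok.startswith("##") and len(tok[2:]) == 1 and _is_cjk(tok[2:])]
print(ref_id)  # [2] -> position 2 should be masked together with position 1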
Example #15
    def __init__(self,
                 path,
                 tokenizer: BertTokenizer,
                 max_seq_length: int = 512,
                 readin: int = 2000000,
                 dupe_factor: int = 6,
                 small_seq_prob: float = 0.1):
        self.dupe_factor = dupe_factor
        self.max_seq_length = max_seq_length
        self.small_seq_prob = small_seq_prob

        documents = []
        instances = []
        with open(path, encoding='utf-8') as fd:
            document = []
            for i, line in enumerate(tqdm(fd)):
                line = line.replace('\n', '')
                # document = line
                # if len(document.split("<sep>")) <= 3:
                #     continue
                if len(line) == 0:  # This is the end of a document
                    documents.append(document)
                    document = []
                if len(line.split(' ')) > 2:
                    document.append(tokenizer.tokenize(line))
            if len(document) > 0:
                documents.append(document)

        documents = [x for x in documents if x]
        print(documents[0])
        print(len(documents))
        self.documents = documents
        for _ in range(self.dupe_factor):
            for index in range(len(self.documents)):
                instances.extend(self.create_training_instance(index))

        shuffle(instances)
        self.instances = instances
        self.len = len(self.instances)
        self.documents = None
        documents = None
Example #16
    def __init__(self,
                 path,
                 tokenizer: BertTokenizer,
                 max_seq_length,
                 readin: int = 2000000,
                 dupe_factor: int = 5,
                 small_seq_prob: float = 0.1):
        self.dupe_factor = dupe_factor
        self.max_seq_length = max_seq_length
        self.small_seq_prob = small_seq_prob

        documents = []
        instances = []
        with open(path, encoding='utf-8') as fd:
            for i, line in enumerate(tqdm(fd)):
                line = line.replace('\n', '')
                # Expected format (Q,T,U,S,D)
                # query, title, url, snippet, document = line.split('\t')
                # ! remove this following line later
                document = line
                if len(document.split("<sep>")) <= 3:
                    continue
                lines = document.split("<sep>")
                document = []
                for seq in lines:
                    document.append(tokenizer.tokenize(seq))
                # document = list(map(tokenizer.tokenize, lines))
                documents.append(document)

        documents = [x for x in documents if x]

        self.documents = documents
        for _ in range(self.dupe_factor):
            for index in range(len(self.documents)):
                instances.extend(self.create_training_instance(index))

        shuffle(instances)
        self.instances = instances
        self.len = len(self.instances)
        self.documents = None
        documents = None
Example #17
 def __init__(self,
              data_path: str,
              treatment: str,
              subset: str,
              text_column: str,
              label_column: str,
              bert_pretrained_model: str = BERT_PRETRAINED_MODEL,
              max_seq_length: int = MAX_SENTIMENT_SEQ_LENGTH):
     super().__init__()
     if subset not in ("train", "dev", "test", "train_debug", "dev_debug",
                       "test_debug"):
         raise ValueError("subset argument must be {train, dev,test}")
     self.dataset_file = f"{data_path}/{treatment}_{subset}.csv"
     self.subset = subset
     self.text_column = text_column
     self.label_column = label_column
     self.max_seq_length = max_seq_length
     self.tokenizer = BertTokenizer.from_pretrained(
         bert_pretrained_model,
         do_lower_case=bool(bert_pretrained_model.endswith("uncased")))
     self.dataset = self.preprocessing_pipeline()
Example #18
def preprocess(data: List[Dict], model: str, label2idx: Dict,
               max_seq_length: int) -> List[BertInputItem]:
    """
    Runs the full preprocessing pipeline on a list of data items.

    Args:
        data: a list of examples as dicts of the form {"text": ..., "label": ...}
        model: the name of the BERT model
        label2idx: a dict that maps label strings to label ids
        max_seq_length: the maximum sequence length for the input items

    Returns: a list of BertInputItems
    """
    if "distilbert" in model:
        tokenizer = DistilBertTokenizer.from_pretrained(model)
    else:
        tokenizer = BertTokenizer.from_pretrained(model)
    bert_items = convert_data_to_input_items(data, label2idx, max_seq_length,
                                             tokenizer)

    return bert_items
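For orientation, a hypothetical call to preprocess might look as follows; the label set, texts, and max_seq_length are made up for illustration, and BertInputItem comes from the surrounding project.

label2idx = {"negative": 0, "positive": 1}
data = [
    {"text": "The battery lasts all day.", "label": "positive"},
    {"text": "The screen cracked after a week.", "label": "negative"},
]
bert_items = preprocess(data, model="bert-base-uncased",
                        label2idx=label2idx, max_seq_length=128)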
Example #19
    def test_tokenization_bert(self):
        # Given
        self.base_tokenizer = BertTokenizer.from_pretrained(
            'bert-base-uncased', do_lower_case=True, cache_dir=self.test_dir)
        self.rust_tokenizer = PyBertTokenizer(get_from_cache(
            self.base_tokenizer.pretrained_vocab_files_map['vocab_file']
            ['bert-base-uncased']),
                                              do_lower_case=True,
                                              strip_accents=True)
        output_baseline = []
        for example in self.examples:
            output_baseline.append(
                self.base_tokenizer.encode_plus(
                    example.text_a,
                    text_pair=example.text_b,
                    add_special_tokens=True,
                    return_overflowing_tokens=True,
                    return_special_tokens_mask=True,
                    max_length=128))

        # When
        output_rust = self.rust_tokenizer.encode_pair_list(
            [(example.text_a, example.text_b) for example in self.examples],
            max_len=128,
            truncation_strategy='longest_first',
            stride=0)

        # Then
        for idx, (rust,
                  baseline) in enumerate(zip(output_rust, output_baseline)):
            assert rust.token_ids == baseline[
                'input_ids'], f'Difference in tokenization for {self.rust_tokenizer.__class__}: \n ' \
                              f'Sentence a: {self.examples[idx].text_a} \n' \
                              f'Sentence b: {self.examples[idx].text_b} \n' \
                              f'Token mismatch: {self.get_token_diff(rust.token_ids, baseline["input_ids"])} \n' \
                              f'Rust: {rust.token_ids} \n' \
                              f'Python {baseline["input_ids"]}'
            assert (rust.segment_ids == baseline['token_type_ids'])
            assert (
                rust.special_tokens_mask == baseline['special_tokens_mask'])
Example #20
    def __init__(
        self,
        mappings: dict = None,
        model: nn.Module = None,
        options: dict = None,
        model_dir: str = None,
    ):

        self.mappings = mappings
        self.model = model
        self.options = options

        self.bert_use = options.get("embeddings").get("bert").get("use")
        self.bert_voc_dir = os.path.join(model_dir, "embeddings", "bert",
                                         "vocab")
        self.bert_lowercase = (
            options.get("embeddings").get("bert").get("do_lower_case"))

        self.pretrained_use = (
            options.get("embeddings").get("pretrained").get("use"))
        self.elmo_use = options.get("embeddings").get("pretrained").get("use")
        self.char_use = options.get("embeddings").get("chr_cnn").get("use")

        if self.bert_use:
            self.bert_tokenizer = BertTokenizer.from_pretrained(
                self.bert_voc_dir, do_lower_case=self.bert_lowercase)

        self.tok_pad_id = None
        self.chr_pad_id_literal = None
        self.chr_pad_id_utf8 = None

        if self.pretrained_use:
            self.tok_pad_id = mappings.get("toks").get("pad_id")

        if self.char_use:
            self.chr_pad_id_literal = (
                mappings.get("chrs").get("char_literal").get("<pad>"))
            self.chr_pad_id_utf8 = (
                mappings.get("chrs").get("char_utf8").get("<pad>"))
Example #21
def main():
    args = parse_argument()

    if args.seed != -1:
        random.seed(args.seed)
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available()
                          and not args.no_cuda else "cpu")

    if device.type != "cpu":
        torch.cuda.manual_seed_all(args.seed)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=False,
                                              tokenize_chinese_chars=False)

    if args.do_train:
        train(args, tokenizer, device)
    if args.do_generate:
        generate(tokenizer, device, max_iter=args.max_iter,
                 length=args.seq_length, model=args.bert_model,
                 fix_word=args.fix_word, samples=args.samples)
Example #22
    def __init__(self,
                 hparams,
                 root,
                 hidden_size=256,
                 num_attention_heads=8,
                 num_hidden_layers=6):
        super().__init__()

        self.hparams = hparams
        self.root = root
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                                       do_lower_case=True)

        self.bert_decoder_config = BertConfig(
            is_decoder=True,
            hidden_size=hidden_size,
            num_attention_heads=num_attention_heads,
            num_hidden_layers=num_hidden_layers)
        #self.enc_dec_config = EncoderDecoderConfig.from_encoder_decoder_configs(encoder_config= self.bert_config, decoder_config= self.bert_config)
        #self.model = EncoderDecoderModel(config= self.enc_dec_config)
        self.bert_decoder = BertModel(config=self.bert_decoder_config)

        self.detr = DETRdemo(num_classes=91)
        state_dict = torch.hub.load_state_dict_from_url(
            url='https://dl.fbaipublicfiles.com/detr/detr_demo-da2a99e9.pth',
            map_location='cpu',
            check_hash=True)
        self.detr.load_state_dict(state_dict)
        del state_dict

        self.ans_to_index = self._mapping_ansto_index()

        self.classifier = nn.Linear(hidden_size * 2, len(self.ans_to_index))

        self.drop_out = nn.Dropout(p=0.2)
        self.log_softmax = nn.LogSoftmax().cuda()
Example #23
    def __init__(self,
                 pretrain_dir="pretrains/baseline/models",
                 feat_dir=None,
                 max_seq_length=256,
                 batch_size=4,
                 device=torch.device('cpu')):

        self.tokenizer = BertTokenizer.from_pretrained(
            "bert-base-multilingual-cased")
        processor = NERProcessor(None, self.tokenizer)
        self.fe = FeatureExtractor(
            dict_dir=feat_dir) if feat_dir is not None else None
        self.label_list = processor.labels
        self.max_seq_length = max_seq_length
        self.batch_size = batch_size
        self.device = device
        num_labels = processor.get_num_labels()

        _, self.model, self.feature = model_builder_from_pretrained(
            "bert-base-multilingual-cased",
            num_labels,
            pretrain_dir,
            feat_dir=feat_dir)
        self.model.to(device)
Example #24
 def setup_python_tokenizer(self):
     self.base_tokenizer = BertTokenizer.from_pretrained(
         'bert-base-uncased', do_lower_case=True, cache_dir=self.test_dir)
Example #25

if __name__ == '__main__':

    class Args:
        name_model = "bert-base-multilingual-cased"
        bert_model = '../resources/cache_bert_cased'
        max_seq_length = 500
        predict_batch_size = 20
        batch_size = 20
        n_best_size = 20
        max_answer_length = 30
        do_lower_case = False
        max_query_length = 64
        no_cuda = True
        seed = 42
        THRESH_HOLD = 0.95

    args = Args()

    tokenizer = BertTokenizer.from_pretrained(args.name_model,
                                              cache_dir=args.bert_model,
                                              do_lower_case=args.do_lower_case)
    path_input_data = "../dataset/sample_pair_sequence.csv"
    load_squad_to_torch_dataset(path_input_data,
                                tokenizer,
                                args.max_seq_length,
                                args.max_query_length,
                                args.batch_size,
                                is_training=True)
Example #26
                print(
                    f"Model outputs name: {[tmp_obj.name for tmp_obj in sess.get_outputs()]}"
                )
                print(
                    f"Model outputs shape: {[tmp_obj.shape for tmp_obj in sess.get_outputs()]}"
                )
                # Run the model (None = get all the outputs)
                outputs_onnx = sess.run(None, inputs_onnx)
                print("Model inference correctly")
        except RuntimeException as re:
            print("Error while loading the model: {}".format(re))


if __name__ == '__main__':
    # Example
    model_name_or_path = "/Data/enningxie/Codes/transformers_xz/saved_models/intent_detection_2_10_0_onnx/chinese-roberta-wwm-ext"
    # load origin model from transformers.
    tmp_config = BertConfig.from_pretrained(model_name_or_path)
    tmp_tokenizer = BertTokenizer.from_pretrained(model_name_or_path)
    tmp_model = TFBertForSequenceClassification(tmp_config).from_pretrained(
        model_name_or_path)

    # convert
    tmp_saved_path = model_name_or_path
    tmp_onnx_converter = ONNXConverterTF(tmp_tokenizer,
                                         hidden_size=768,
                                         num_heads=12)
    _, optimized_model_saved_path = tmp_onnx_converter.convert(
        tmp_model, tmp_saved_path, 'tf_model')
    tmp_onnx_converter.verify(optimized_model_saved_path)
Example #27
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--data_dir",
                        default='./data/input/',
                        type=str,
                        required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default='bert-base-chinese', type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                             "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--config_file", default='bert-base-chinese', type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                             "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default='xgfy',
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument("--vacab_root",
                        default='./data/model/',
                        type=str,
                        required=True,
                        help="The directory where the vocab file is saved.")
                        
    parser.add_argument("--output_dir",
                        default='./data/output/',
                        type=str,
                        required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument("--weight_name",
                        default='net_weight_1.bin',
                        type=str,
                        )
    parser.add_argument("--config_name",
                        default='config_name_1.bin',
                        type=str,
                        )
    # Other parameters
    parser.add_argument("--cache_dir",
                        default="./data/model/",
                        type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case",
                        action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--log_frq",
                        default=50,
                        type=int)
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=1.0,
                        type=int,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--n_warmup",
                        default=1000,
                        type=int,
                        help="step of training to perform linear learning rate warmup for.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16',
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--parall',
                        action='store_true')
    parser.add_argument('--loss_scale',
                        type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    args = parser.parse_args()


    # COVID-19
    processors = {
        "xgfy": SimProcessor
    }

    num_labels_task = {
        "xgfy": 2,
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda:0" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        # torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    # if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
    #     raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    # if not os.path.exists(args.output_dir):
    #     os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]
    num_labels = num_labels_task[task_name]
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained(args.vacab_root, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{0}')
    # cache_dir = args.cache_dir if args.cache_dir else os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(str(args.local_rank)))
    config = BertConfig.from_pretrained(args.config_file, num_labels=num_labels)
    model = BertForSequenceClassification.from_pretrained(args.bert_model,
                                                          config=config,
                                                          cache_dir=cache_dir)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        model = DDP(model)
    elif n_gpu > 1 and args.parall:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)

    else:
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)


    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    if args.do_train:
        train_features = convert_examples_to_features(
            train_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
        
        
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=args.n_warmup, num_training_steps=t_total
        )
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss, _ = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids)
                if n_gpu > 1 and args.parall:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # The linear warmup/decay schedule is applied by scheduler.step()
                    # below for both fp16 and fp32 runs, so no manual learning-rate
                    # update is needed here.
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()
                    global_step += 1
                    if (global_step) % args.log_frq == 0:
                        logger.info("TrLoss: {:.2f} | Loss: {:.2f} | Lr: {:.2f}".format(tr_loss, loss.item(), scheduler.get_lr()[0]))

    if args.do_train:
        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
        output_model_file = os.path.join(args.output_dir, args.weight_name)
        torch.save(model_to_save.state_dict(), output_model_file)
        output_config_file = os.path.join(args.output_dir, args.config_name)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())

        # Load a trained model and config that you have fine-tuned
        config = BertConfig.from_json_file(output_config_file)
        model = BertForSequenceClassification(config)
        model.load_state_dict(torch.load(output_model_file))
    else:
        output_model_file = os.path.join(args.output_dir, args.weight_name)
        output_config_file = os.path.join(args.output_dir, args.config_name)
        config = BertConfig.from_json_file(output_config_file)
        model = BertForSequenceClassification(config)
        model.load_state_dict(torch.load(output_model_file))
        # model = BertForSequenceClassification.from_pretrained(args.bert_model)
    model.to(device)

    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_features = convert_examples_to_features(
            eval_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0

        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                tmp_eval_loss, logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids)

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(logits, label_ids)

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {'eval_loss': eval_loss,
                  'eval_accuracy': eval_accuracy,
                  'global_step': global_step,
                  'loss': loss}
        logger.info(result)
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
Example #28
def main():
    parser = ArgumentParser()
    parser.add_argument('--pregenerated_neg_data', type=Path, required=True)
    parser.add_argument('--pregenerated_data', type=Path, required=True)
    parser.add_argument('--output_dir', type=Path, required=True)
    parser.add_argument(
        "--bert_model",
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    parser.add_argument("--do_lower_case", action="store_true")
    parser.add_argument(
        "--reduce_memory",
        action="store_true",
        help=
        "Store training data as on-disc memmaps to massively reduce memory usage"
    )

    parser.add_argument("--max_seq_len", default=512, type=int)

    parser.add_argument(
        '--overwrite_cache',
        action='store_true',
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--epochs",
                        type=int,
                        default=3,
                        help="Number of epochs to train for")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--kr_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--kr_freq", default=0.7, type=float)
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--learning_rate",
                        default=1e-4,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")

    args = parser.parse_args()

    assert args.pregenerated_data.is_dir(), \
        "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!"

    samples_per_epoch = []
    for i in range(args.epochs):
        epoch_file = args.pregenerated_data / f"epoch_{i}.json"
        metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json"
        if epoch_file.is_file() and metrics_file.is_file():
            metrics = json.loads(metrics_file.read_text())
            samples_per_epoch.append(metrics['num_training_examples'])
        else:
            if i == 0:
                exit("No training data was found!")
            print(
                f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs})."
            )
            print(
                "This script will loop over the available data, but training diversity may be negatively impacted."
            )
            num_data_epochs = i
            break
    else:
        num_data_epochs = args.epochs

    if args.local_rank == -1 or args.no_cuda:
        print(torch.cuda.is_available())
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
        print(n_gpu)
        print("no gpu?")
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        print("GPU Device: ", device)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
    logging.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    # if n_gpu > 0:
    torch.cuda.manual_seed_all(args.seed)

    pt_output = Path(getenv('PT_OUTPUT_DIR', ''))
    args.output_dir = Path(os.path.join(pt_output, args.output_dir))

    if args.output_dir.is_dir() and list(args.output_dir.iterdir()):
        logging.warning(
            f"Output directory ({args.output_dir}) already exists and is not empty!"
        )
    args.output_dir.mkdir(parents=True, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    total_train_examples = 0
    for i in range(args.epochs):
        # The modulo takes into account the fact that we may loop over limited epochs of data
        total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]

    num_train_optimization_steps = int(total_train_examples /
                                       args.train_batch_size /
                                       args.gradient_accumulation_steps)
    if args.local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
        )

    # Prepare model
    config = BertConfig.from_pretrained(args.bert_model)
    # config.num_hidden_layers = args.num_layers
    model = FuckWrapper(config)
    model.to(device)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=num_train_optimization_steps)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    global_step = 0
    logging.info("***** Running training *****")
    logging.info(f"  Num examples = {total_train_examples}")
    logging.info("  Batch size = %d", args.train_batch_size)
    logging.info("  Num steps = %d", num_train_optimization_steps)
    model.train()

    before_train_path = Path(os.path.join(args.output_dir, "before_training"))
    print("Before training path: ", before_train_path)
    before_train_path.mkdir(parents=True, exist_ok=True)
    model.save_pretrained(os.path.join(args.output_dir, "before_training"))
    tokenizer.save_pretrained(os.path.join(args.output_dir, "before_training"))

    neg_epoch_dataset = PregeneratedDataset(
        epoch=0,
        training_path=args.pregenerated_neg_data,
        tokenizer=tokenizer,
        num_data_epochs=num_data_epochs,
        reduce_memory=args.reduce_memory)
    if args.local_rank == -1:
        neg_train_sampler = RandomSampler(neg_epoch_dataset)
    else:
        neg_train_sampler = DistributedSampler(neg_epoch_dataset)

    neg_train_dataloader = DataLoader(neg_epoch_dataset,
                                      sampler=neg_train_sampler,
                                      batch_size=args.train_batch_size)

    def inf_train_gen():
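        # Cycle endlessly over the negated pregenerated data so negated batches
        # can be interleaved with the regular MLM batches below at any frequency.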
        while True:
            for kr_step, kr_batch in enumerate(neg_train_dataloader):
                yield kr_step, kr_batch

    kr_gen = inf_train_gen()

    for epoch in range(args.epochs):
        epoch_dataset = PregeneratedDataset(
            epoch=epoch,
            training_path=args.pregenerated_data,
            tokenizer=tokenizer,
            num_data_epochs=num_data_epochs,
            reduce_memory=args.reduce_memory)
        if args.local_rank == -1:
            train_sampler = RandomSampler(epoch_dataset)
        else:
            train_sampler = DistributedSampler(epoch_dataset)

        train_dataloader = DataLoader(epoch_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0

        if (n_gpu > 1 and args.local_rank == -1) or n_gpu <= 1:
            logging.info("** ** * Saving fine-tuned model ** ** * ")
            model.save_pretrained(args.output_dir)
            tokenizer.save_pretrained(args.output_dir)

        with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar:
            for step, batch in enumerate(train_dataloader):
                model.train()

                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids = batch

                outputs = model(input_ids=input_ids,
                                attention_mask=input_mask,
                                token_type_ids=segment_ids,
                                masked_lm_labels=lm_label_ids,
                                negated=False)
                loss = outputs[0]
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)

                if args.local_rank == 0 or args.local_rank == -1:
                    nb_tr_steps += 1
                    pbar.update(1)
                    mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps
                    pbar.set_postfix_str(f"Loss: {mean_loss:.5f}")
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    optimizer.zero_grad()
                    global_step += 1

                if random.random() > args.kr_freq:
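                    # With probability (1 - args.kr_freq), also take a gradient step
                    # on a batch of negated data, passing negated=True to the model.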
                    kr_step, kr_batch = next(kr_gen)
                    kr_batch = tuple(t.to(device) for t in kr_batch)
                    input_ids, input_mask, segment_ids, lm_label_ids = kr_batch

                    outputs = model(input_ids=input_ids,
                                    attention_mask=input_mask,
                                    token_type_ids=segment_ids,
                                    masked_lm_labels=lm_label_ids,
                                    negated=True)
                    loss = outputs[0]
                    if n_gpu > 1:
                        loss = loss.mean()  # mean() to average on multi-gpu.
                    if args.gradient_accumulation_steps > 1:
                        loss = loss / args.gradient_accumulation_steps

                    if args.fp16:
                        with amp.scale_loss(loss, optimizer) as scaled_loss:
                            scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), args.max_grad_norm)
                    else:
                        loss.backward()
                        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       args.max_grad_norm)

                    tr_loss += loss.item()
                    nb_tr_examples += input_ids.size(0)
                    if args.local_rank == -1:
                        nb_tr_steps += 1
                        mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps
                        pbar.set_postfix_str(f"Loss: {mean_loss:.5f}")
                    if (step + 1) % args.gradient_accumulation_steps == 0:
                        optimizer.step()
                        scheduler.step()  # Update learning rate schedule
                        optimizer.zero_grad()
                        global_step += 1

    # Save a trained model
    if (n_gpu > 1 and args.local_rank == -1) or n_gpu <= 1:
        logging.info("** ** * Saving fine-tuned model ** ** * ")
        model.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
Example #29
0
def read_instance_with_gaz(num_layer,
                           input_file,
                           gaz,
                           word_alphabet,
                           biword_alphabet,
                           biword_count,
                           char_alphabet,
                           gaz_alphabet,
                           gaz_count,
                           gaz_split,
                           label_alphabet,
                           number_normalized,
                           max_sent_length,
                           char_padding_size=-1,
                           char_padding_symbol='</pad>'):
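    # Reads a whitespace-separated "token ... label" file and builds, per sentence,
    # word/bi-word/character/label ids plus gazetteer matches bucketed as B/M/E/S,
    # their padding masks, and BERT token ids for '[CLS]' + tokens + '[SEP]'.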

    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese',
                                              do_lower_case=True)

    with open(input_file, 'r', encoding="utf-8") as f:
        in_lines = f.readlines()
    instence_texts = []
    instence_Ids = []
    words = []
    biwords = []
    chars = []
    labels = []
    word_Ids = []
    biword_Ids = []
    char_Ids = []
    label_Ids = []
    for idx in range(len(in_lines)):
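        # Non-empty lines carry "token ... label"; a blank line ends the current sentence.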
        line = in_lines[idx]
        if len(line) > 2:
            pairs = line.strip().split()
            word = pairs[0]
            if number_normalized:
                word = normalize_word(word)
            label = pairs[-1]
            if idx < len(in_lines) - 1 and len(in_lines[idx + 1]) > 2:
                biword = word + in_lines[idx + 1].strip().split()[0]
            else:
                biword = word + NULLKEY
            biwords.append(biword)
            words.append(word)
            labels.append(label)
            word_Ids.append(word_alphabet.get_index(word))
            biword_index = biword_alphabet.get_index(biword)
            biword_Ids.append(biword_index)
            label_Ids.append(label_alphabet.get_index(label))
            char_list = []
            char_Id = []
            for char in word:
                char_list.append(char)
            if char_padding_size > 0:
                char_number = len(char_list)
                if char_number < char_padding_size:
                    char_list = char_list + [char_padding_symbol] * (
                        char_padding_size - char_number)
                assert (len(char_list) == char_padding_size)
            else:
                ### not padding
                pass
            for char in char_list:
                char_Id.append(char_alphabet.get_index(char))
            chars.append(char_list)
            char_Ids.append(char_Id)

        else:
            if ((max_sent_length < 0) or
                (len(words) < max_sent_length)) and (len(words) > 0):
                gaz_Ids = []
                layergazmasks = []
                gazchar_masks = []
                w_length = len(words)

                # An additional list could be added here alongside the 4 lists kept for each character
                gazs = [
                    [[] for i in range(4)] for _ in range(w_length)
                ]  # gazs:[c1,c2,...,cn]  ci:[B,M,E,S]  B/M/E/S :[w_id1,w_id2,...]  None:0
                gazs_count = [[[] for i in range(4)] for _ in range(w_length)]

                gaz_char_Id = [
                    [[] for i in range(4)] for _ in range(w_length)
                ]  ## gazs:[c1,c2,...,cn]  ci:[B,M,E,S]  B/M/E/S :[[w1c1,w1c2,...],[],...]

                max_gazlist = 0
                max_gazcharlen = 0
                for idx in range(w_length):
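                    # Collect lexicon matches starting at this character and assign them to
                    # B/M/E/S buckets (Begin/Middle/End/Single) across the covered positions.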

                    matched_list = gaz.enumerateMatchList(words[idx:])
                    matched_length = [len(a) for a in matched_list]
                    matched_Id = [
                        gaz_alphabet.get_index(entity)
                        for entity in matched_list
                    ]

                    if matched_length:
                        max_gazcharlen = max(max(matched_length),
                                             max_gazcharlen)

                    for w in range(len(matched_Id)):
                        gaz_chars = []
                        g = matched_list[w]
                        for c in g:
                            gaz_chars.append(word_alphabet.get_index(c))

                        if matched_length[w] == 1:  ## Single
                            gazs[idx][3].append(matched_Id[w])
                            gazs_count[idx][3].append(1)  # why is a fixed 1 appended here?
                            gaz_char_Id[idx][3].append(gaz_chars)
                        else:
                            gazs[idx][0].append(matched_Id[w])  ## Begin
                            gazs_count[idx][0].append(gaz_count[matched_Id[w]])
                            gaz_char_Id[idx][0].append(gaz_chars)
                            wlen = matched_length[w]
                            gazs[idx + wlen - 1][2].append(
                                matched_Id[w])  ## End
                            gazs_count[idx + wlen - 1][2].append(
                                gaz_count[matched_Id[w]])
                            gaz_char_Id[idx + wlen - 1][2].append(gaz_chars)
                            for l in range(wlen - 2):
                                gazs[idx + l + 1][1].append(
                                    matched_Id[w])  ## Middle
                                gazs_count[idx + l + 1][1].append(
                                    gaz_count[matched_Id[w]])
                                gaz_char_Id[idx + l + 1][1].append(gaz_chars)

                    for label in range(4):
                        if not gazs[idx][label]:
                            gazs[idx][label].append(0)
                            gazs_count[idx][label].append(1)
                            gaz_char_Id[idx][label].append([0])

                        max_gazlist = max(len(gazs[idx][label]), max_gazlist)

                    matched_Id = [
                        gaz_alphabet.get_index(entity)
                        for entity in matched_list
                    ]  # lexicon word ids (recomputed from matched_list)
                    if matched_Id:
                        gaz_Ids.append([matched_Id, matched_length])
                    else:
                        gaz_Ids.append([])

                ## batch_size = 1
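                # Pad every B/M/E/S bucket to max_gazlist entries and every matched word to
                # max_gazcharlen characters; the masks mark padded positions with 1.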
                for idx in range(w_length):
                    gazmask = []
                    gazcharmask = []

                    for label in range(4):
                        label_len = len(gazs[idx][label])
                        count_set = set(gazs_count[idx][label])
                        if len(count_set) == 1 and 0 in count_set:
                            gazs_count[idx][label] = [1] * label_len

                        mask = label_len * [0]
                        mask += (max_gazlist - label_len) * [1]

                        gazs[idx][label] += (max_gazlist -
                                             label_len) * [0]  ## padding
                        gazs_count[idx][label] += (max_gazlist -
                                                   label_len) * [0]  ## padding

                        char_mask = []
                        for g in range(len(gaz_char_Id[idx][label])):
                            glen = len(gaz_char_Id[idx][label][g])
                            charmask = glen * [0]
                            charmask += (max_gazcharlen - glen) * [1]
                            char_mask.append(charmask)
                            gaz_char_Id[idx][label][g] += (max_gazcharlen -
                                                           glen) * [0]
                        gaz_char_Id[idx][label] += (
                            max_gazlist -
                            label_len) * [[0 for i in range(max_gazcharlen)]]
                        char_mask += (max_gazlist - label_len) * [[
                            1 for i in range(max_gazcharlen)
                        ]]

                        gazmask.append(mask)
                        gazcharmask.append(char_mask)
                    layergazmasks.append(gazmask)
                    gazchar_masks.append(gazcharmask)

                texts = ['[CLS]'] + words + ['[SEP]']
                bert_text_ids = tokenizer.convert_tokens_to_ids(texts)

                sentences = "".join(words)
                simi_sentence = ''.join(sentences)  # 得到batch的句子
                simi = cal_simi.calc_sim(
                    'TCM_corpus/dictionary_smptom_afterprocess.txt',
                    simi_sentence, 'TCM_corpus/word_sp.txt',
                    'TCM_corpus/simi/simitrain.txt')

                instence_texts.append([words, biwords, chars, gazs, labels])
                instence_Ids.append([
                    word_Ids, biword_Ids, char_Ids, gaz_Ids, label_Ids, gazs,
                    gazs_count, gaz_char_Id, layergazmasks, gazchar_masks,
                    bert_text_ids, words, simi
                ])

            words = []
            biwords = []
            chars = []
            labels = []
            word_Ids = []
            biword_Ids = []
            char_Ids = []
            label_Ids = []

    return instence_texts, instence_Ids
    def get_tokenizer(self, **kwargs):
        return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs)