def __init__(self,
             vocab_file,
             max_seq_length=128,
             init_checkpoint=None,
             output_dir=None,
             gpu_ids=None,
             hidden_size=768,
             num_hidden_layers=12,
             num_attention_heads=12,
             max_position_embeddings=1024,
             do_lower_case=True,
             truncate_method='LIFO'):
    super(LMModule, self).__init__(init_checkpoint, output_dir, gpu_ids)

    self.batch_size = 0
    self.max_seq_length = max_seq_length
    self.truncate_method = truncate_method
    self._given = 1
    self._id_to_label = None
    self.__init_args__ = locals()

    self.tokenizer = get_word_piece_tokenizer(vocab_file, do_lower_case)
    self.gpt2_config = get_gpt2_config(
        n_vocab=len(self.tokenizer.vocab),
        n_predict=max_seq_length,
        n_ctx=max_position_embeddings,
        n_embed=hidden_size,
        n_head=num_attention_heads,
        n_layer=num_hidden_layers)
    self._key_to_depths = get_key_to_depths(num_hidden_layers)

    if '<eos>' not in self.tokenizer.vocab:
        self.tokenizer.add('<eos>')
        self.gpt2_config.n_vocab += 1
    self._eos_id = self.tokenizer.convert_tokens_to_ids(['<eos>'])[0]
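# Usage sketch (assumption: this constructor backs a GPT-2 language-model
# class, e.g. `GPT2LM`; the class name, file path and `fit`/`predict` calls
# below are illustrative and not defined in this file):
#
#   model = GPT2LM('vocab.txt', max_seq_length=128, output_dir='outputs')
#   model.fit(X_train)              # unsupervised LM training on raw text
#   samples = model.predict(X_seed)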
def __init__(self,
             vocab_file,
             source_max_seq_length=64,
             target_max_seq_length=64,
             init_checkpoint=None,
             output_dir=None,
             gpu_ids=None,
             hidden_size=768,
             num_hidden_layers=6,
             num_attention_heads=12,
             do_lower_case=True,
             truncate_method='LIFO'):
    super(MTModule, self).__init__(init_checkpoint, output_dir, gpu_ids)

    self.batch_size = 0
    self.source_max_seq_length = source_max_seq_length
    self.target_max_seq_length = target_max_seq_length
    self.truncate_method = truncate_method
    self._hidden_size = hidden_size
    self._num_hidden_layers = num_hidden_layers
    self._num_attention_heads = num_attention_heads
    self._id_to_label = None
    self.__init_args__ = locals()

    self.tokenizer = get_word_piece_tokenizer(vocab_file, do_lower_case)
    self._key_to_depths = get_key_to_depths(num_hidden_layers)

    if '<s>' not in self.tokenizer.vocab:
        self.tokenizer.add('<s>')
        tf.logging.info('Add necessary token `<s>` into vocabulary.')
    if '</s>' not in self.tokenizer.vocab:
        self.tokenizer.add('</s>')
        tf.logging.info('Add necessary token `</s>` into vocabulary.')
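# Usage sketch (assumption: a machine-translation class, e.g. `TransformerMT`,
# wraps this constructor; names and paths are illustrative). Source and
# target sequences are capped independently via the two length arguments:
#
#   model = TransformerMT(
#       'vocab.txt', source_max_seq_length=64, target_max_seq_length=64)
#   model.fit(X_source, y_target)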
def __init__(self,
             config_file,
             vocab_file,
             max_seq_length=128,
             label_size=None,
             label_weight=None,
             init_checkpoint=None,
             output_dir=None,
             gpu_ids=None,
             drop_pooler=False,
             do_lower_case=True,
             truncate_method='LIFO'):
    super(ClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids)

    self.batch_size = 0
    self.max_seq_length = max_seq_length
    self.label_size = label_size
    self.label_weight = label_weight
    self.truncate_method = truncate_method
    self._drop_pooler = drop_pooler
    self._id_to_label = None
    self.__init_args__ = locals()

    self.albert_config = get_albert_config(config_file)
    self.tokenizer = get_word_piece_tokenizer(vocab_file, do_lower_case)
    self._key_to_depths = get_key_to_depths(
        self.albert_config.num_hidden_layers)
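# Usage sketch (assumption: an ALBERT classifier class, e.g.
# `ALBERTClassifier`, wraps this constructor; names and paths are
# illustrative). `label_weight` plausibly rescales the per-class loss,
# which helps on imbalanced data:
#
#   model = ALBERTClassifier(
#       'albert_config.json', 'vocab.txt',
#       label_size=2, label_weight=[1.0, 3.0])
#   model.fit(X_train, y_train)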
def __init__(self,
             vocab_file,
             max_seq_length=128,
             label_size=None,
             init_checkpoint=None,
             output_dir=None,
             gpu_ids=None,
             filter_sizes='2,4,6',
             num_channels=6,
             hidden_size=256,
             do_lower_case=True,
             truncate_method='LIFO'):
    super(ClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids)

    self.batch_size = 0
    self.max_seq_length = max_seq_length
    self.label_size = label_size
    self.truncate_method = truncate_method
    self._filter_sizes = filter_sizes
    self._num_channels = num_channels
    self._hidden_size = hidden_size
    self._id_to_label = None
    self.__init_args__ = locals()

    self.tokenizer = get_word_piece_tokenizer(vocab_file, do_lower_case)
    self._key_to_depths = get_key_to_depths()

    # This module builds no pretrained-model config, so only the tokenizer's
    # vocabulary needs to grow when special tokens are missing.
    if '[CLS]' not in self.tokenizer.vocab:
        self.tokenizer.add('[CLS]')
        tf.logging.info('Add necessary token `[CLS]` into vocabulary.')
    if '[SEP]' not in self.tokenizer.vocab:
        self.tokenizer.add('[SEP]')
        tf.logging.info('Add necessary token `[SEP]` into vocabulary.')
def __init__(self,
             config_file,
             vocab_file,
             max_seq_length=128,
             label_size=None,
             init_checkpoint=None,
             output_dir=None,
             gpu_ids=None,
             drop_pooler=False,
             hidden_size=384,
             num_hidden_layers=4,
             do_lower_case=True,
             truncate_method='LIFO'):
    super(ClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids)

    self.batch_size = 0
    self.max_seq_length = max_seq_length
    self.label_size = label_size
    self.truncate_method = truncate_method
    self._drop_pooler = drop_pooler
    self._id_to_label = None
    self.__init_args__ = locals()

    self.bert_config = get_bert_config(config_file)
    self.tokenizer = get_word_piece_tokenizer(vocab_file, do_lower_case)
    self._key_to_depths = 'unsupported'

    # The student copies the teacher's config, then shrinks it.
    self.student_config = copy.deepcopy(self.bert_config)
    self.student_config.hidden_size = hidden_size
    self.student_config.intermediate_size = 4 * hidden_size
    self.student_config.num_hidden_layers = num_hidden_layers
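# Usage sketch (assumption: a distillation classifier, e.g.
# `TinyBERTClassifier`, wraps this constructor; names and paths are
# illustrative). The student keeps the teacher's architecture but with
# `hidden_size=384`, an intermediate size pinned to 4x hidden, and 4 layers:
#
#   model = TinyBERTClassifier(
#       'bert_config.json', 'vocab.txt',
#       init_checkpoint='teacher/bert_model.ckpt', label_size=2)
#   model.fit(X_train, y_train)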
def __init__(self,
             config_file,
             vocab_file,
             max_seq_length=128,
             label_size=None,
             init_checkpoint=None,
             output_dir=None,
             gpu_ids=None,
             drop_pooler=False,
             do_lower_case=True,
             truncate_method='LIFO'):
    super(ClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids)

    self.batch_size = 0
    self.max_seq_length = max_seq_length
    self.label_size = label_size
    self.truncate_method = truncate_method
    self._drop_pooler = drop_pooler
    self._id_to_label = None
    self.__init_args__ = locals()

    self.albert_config = get_albert_config(config_file)
    self.tokenizer = get_word_piece_tokenizer(vocab_file, do_lower_case)
    self._key_to_depths = get_key_to_depths(
        self.albert_config.num_hidden_layers)

    if '[CLS]' not in self.tokenizer.vocab:
        self.tokenizer.add('[CLS]')
        self.albert_config.vocab_size += 1
        tf.logging.info('Add necessary token `[CLS]` into vocabulary.')
    if '[SEP]' not in self.tokenizer.vocab:
        self.tokenizer.add('[SEP]')
        self.albert_config.vocab_size += 1
        tf.logging.info('Add necessary token `[SEP]` into vocabulary.')
def __init__(self,
             vocab_file,
             max_seq_length=128,
             init_checkpoint=None,
             output_dir=None,
             gpu_ids=None,
             reduced_size=64,
             topic_size=1024,
             hidden_size=256,
             num_hidden_layers=6,
             num_attention_heads=8,
             do_lower_case=True,
             truncate_method='LIFO'):
    super(LMModule, self).__init__(init_checkpoint, output_dir, gpu_ids)

    self.batch_size = 0
    self.max_seq_length = max_seq_length
    self.truncate_method = truncate_method
    self._reduced_size = reduced_size
    self._topic_size = topic_size
    self._hidden_size = hidden_size
    self._num_hidden_layers = num_hidden_layers
    self._num_attention_heads = num_attention_heads
    self._bias = 0
    self._id_to_label = None
    self.__init_args__ = locals()

    self.tokenizer = get_word_piece_tokenizer(vocab_file, do_lower_case)
    self._key_to_depths = get_key_to_depths(num_hidden_layers)

    if '[SEP]' not in self.tokenizer.vocab:
        self.tokenizer.add('[SEP]')
        tf.logging.info('Add necessary token `[SEP]` into vocabulary.')
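# Usage sketch (assumption: a latent-variable LM class, e.g. `VAELM`, wraps
# this constructor; the class name is illustrative, and the reading of
# `reduced_size`/`topic_size` as latent dimensions is inferred, not stated
# in this file):
#
#   model = VAELM('vocab.txt', reduced_size=64, topic_size=1024)
#   model.fit(X_train)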
def __init__(self,
             config_file,
             vocab_file,
             max_seq_length=128,
             init_checkpoint=None,
             output_dir=None,
             gpu_ids=None,
             drop_pooler=False,
             do_sample_next_sentence=True,
             max_predictions_per_seq=20,
             masked_lm_prob=0.15,
             short_seq_prob=0.1,
             do_whole_word_mask=False,
             mode='bi',
             do_lower_case=True,
             truncate_method='LIFO'):
    super(LMModule, self).__init__(init_checkpoint, output_dir, gpu_ids)

    self.batch_size = 0
    self.max_seq_length = max_seq_length
    self.label_size = 2
    self.do_sample_next_sentence = do_sample_next_sentence
    self.masked_lm_prob = masked_lm_prob
    self.short_seq_prob = short_seq_prob
    self.do_whole_word_mask = do_whole_word_mask
    self.truncate_method = truncate_method
    self._drop_pooler = drop_pooler
    self._max_predictions_per_seq = max_predictions_per_seq
    self.mode = mode
    self._id_to_label = None
    self.__init_args__ = locals()

    assert mode in ('bi', 'l2r', 'r2l', 's2s'), (
        'Wrong value of `mode`: %s. Pick one from `bi` (bidirectional), '
        '`l2r` (left-to-right), `r2l` (right-to-left) and '
        '`s2s` (seq-to-seq).' % mode)
    tf.logging.info(
        'LM Mode: `%s`. Use method `.to_mode()` to convert it into '
        '`bi`, `l2r`, `r2l` or `s2s`.' % mode)

    self.bert_config = get_bert_config(config_file)
    self.tokenizer = get_word_piece_tokenizer(vocab_file, do_lower_case)
    self._key_to_depths = get_key_to_depths(
        self.bert_config.num_hidden_layers)

    if '[CLS]' not in self.tokenizer.vocab:
        self.tokenizer.add('[CLS]')
        self.bert_config.vocab_size += 1
        tf.logging.info('Add necessary token `[CLS]` into vocabulary.')
    if '[SEP]' not in self.tokenizer.vocab:
        self.tokenizer.add('[SEP]')
        self.bert_config.vocab_size += 1
        tf.logging.info('Add necessary token `[SEP]` into vocabulary.')
    if '[EOS]' not in self.tokenizer.vocab:
        self.tokenizer.add('[EOS]')
        self.bert_config.vocab_size += 1
        tf.logging.info('Add necessary token `[EOS]` into vocabulary.')
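# Usage sketch (assumption: a BERT pre-training class, e.g. `BERTLM`, wraps
# this constructor; names and paths are illustrative). `.to_mode()` is the
# method referenced by the logging message above and switches the masking
# scheme after construction:
#
#   model = BERTLM('bert_config.json', 'vocab.txt', mode='bi')
#   model.to_mode('s2s')    # bidirectional -> seq-to-seq
#   model.fit(X_train)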
def __init__(self,
             config_file,
             vocab_file,
             max_seq_length=256,
             init_checkpoint=None,
             output_dir=None,
             gpu_ids=None,
             do_lower_case=True,
             reading_module='bert',
             matching_mechanism='cross-attention',
             beta_1=0.5,
             beta_2=0.5,
             threshold=1.0,
             truncate_method='longer-FO'):
    super(MRCModule, self).__init__(init_checkpoint, output_dir, gpu_ids)

    self.batch_size = 0
    self.max_seq_length = max_seq_length
    self.truncate_method = truncate_method
    self.beta_1 = beta_1
    self.beta_2 = beta_2
    self._do_lower_case = do_lower_case
    self._on_predict = False
    self._reading_module = reading_module
    self._matching_mechanism = matching_mechanism
    self._threshold = threshold
    self.__init_args__ = locals()

    assert reading_module in ('bert', 'roberta', 'albert', 'electra'), (
        'Invalid value of `reading_module`: %s. Pick one from '
        '`bert`, `roberta`, `albert` and `electra`.' % reading_module)
    assert matching_mechanism in ('cross-attention', 'matching-attention'), (
        'Invalid value of `matching_mechanism`: %s. Pick one from '
        '`cross-attention` and `matching-attention`.' % matching_mechanism)

    if reading_module == 'albert':
        self.bert_config = get_albert_config(config_file)
    else:
        self.bert_config = get_bert_config(config_file)

    self.tokenizer = get_word_piece_tokenizer(vocab_file, do_lower_case)
    self._key_to_depths = get_key_to_depths(
        self.bert_config.num_hidden_layers)

    if '[CLS]' not in self.tokenizer.vocab:
        self.tokenizer.add('[CLS]')
        self.bert_config.vocab_size += 1
        tf.logging.info('Add necessary token `[CLS]` into vocabulary.')
    if '[SEP]' not in self.tokenizer.vocab:
        self.tokenizer.add('[SEP]')
        self.bert_config.vocab_size += 1
        tf.logging.info('Add necessary token `[SEP]` into vocabulary.')
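# Usage sketch (assumption: a retrospective-reader MRC class, e.g.
# `RetroReaderMRC`, wraps this constructor; names and paths are
# illustrative). `beta_1`/`beta_2` presumably weight the verification
# scores and `threshold` gates answerability, per the retro-reader design:
#
#   model = RetroReaderMRC(
#       'bert_config.json', 'vocab.txt',
#       reading_module='bert', matching_mechanism='cross-attention',
#       beta_1=0.5, beta_2=0.5, threshold=1.0)
#   model.fit(X_train, y_train)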
def __init__(self,
             config_file,
             vocab_file,
             max_seq_length=128,
             init_checkpoint=None,
             output_dir=None,
             gpu_ids=None,
             drop_pooler=False,
             do_sample_sentence=True,
             max_predictions_per_seq=20,
             dupe_factor=1,
             masked_lm_prob=0.15,
             short_seq_prob=0.1,
             n_gram=3,
             favor_shorter_ngram=True,
             do_permutation=False,
             do_whole_word_mask=True,
             do_lower_case=True,
             truncate_method='LIFO'):
    super(LMModule, self).__init__(init_checkpoint, output_dir, gpu_ids)

    self.batch_size = 0
    self.max_seq_length = max_seq_length
    self.label_size = 2
    self.do_sample_sentence = do_sample_sentence
    self.dupe_factor = dupe_factor
    self.masked_lm_prob = masked_lm_prob
    self.short_seq_prob = short_seq_prob
    self.ngram = n_gram
    self.favor_shorter_ngram = favor_shorter_ngram
    self.do_whole_word_mask = do_whole_word_mask
    self.truncate_method = truncate_method
    self._drop_pooler = drop_pooler
    self._max_predictions_per_seq = max_predictions_per_seq
    self._do_permutation = do_permutation
    self._id_to_label = None
    self.__init_args__ = locals()

    self.albert_config = get_albert_config(config_file)
    self.tokenizer = get_word_piece_tokenizer(vocab_file, do_lower_case)
    self._key_to_depths = get_key_to_depths(
        self.albert_config.num_hidden_layers)

    if '[CLS]' not in self.tokenizer.vocab:
        self.tokenizer.add('[CLS]')
        self.albert_config.vocab_size += 1
        tf.logging.info('Add necessary token `[CLS]` into vocabulary.')
    if '[SEP]' not in self.tokenizer.vocab:
        self.tokenizer.add('[SEP]')
        self.albert_config.vocab_size += 1
        tf.logging.info('Add necessary token `[SEP]` into vocabulary.')
def __init__(self,
             config_file,
             vocab_file,
             max_seq_length=128,
             label_size=None,
             init_checkpoint=None,
             output_dir=None,
             gpu_ids=None,
             drop_pooler=False,
             hidden_size=384,
             num_hidden_layers=4,
             pred_temperature=1.0,
             emd_temperature=1.0,
             beta=0.01,
             do_lower_case=True,
             truncate_method='LIFO'):
    super(ClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids)

    self.batch_size = 0
    self.max_seq_length = max_seq_length
    self.label_size = label_size
    self.truncate_method = truncate_method
    self.pred_temperature = pred_temperature
    self.emd_temperature = emd_temperature
    self.beta = beta
    self._drop_pooler = drop_pooler
    self._id_to_label = None
    self.__init_args__ = locals()

    self.bert_config = get_bert_config(config_file)
    self.tokenizer = get_word_piece_tokenizer(vocab_file, do_lower_case)
    self._key_to_depths = 'unsupported'

    # The student copies the teacher's config, then shrinks it.
    self.student_config = copy.deepcopy(self.bert_config)
    self.student_config.hidden_size = hidden_size
    self.student_config.intermediate_size = 4 * hidden_size
    self.student_config.num_hidden_layers = num_hidden_layers

    if '[CLS]' not in self.tokenizer.vocab:
        self.tokenizer.add('[CLS]')
        self.bert_config.vocab_size += 1
        tf.logging.info('Add necessary token `[CLS]` into vocabulary.')
    if '[SEP]' not in self.tokenizer.vocab:
        self.tokenizer.add('[SEP]')
        self.bert_config.vocab_size += 1
        tf.logging.info('Add necessary token `[SEP]` into vocabulary.')
def __init__(self,
             config_file,
             vocab_file,
             max_seq_length=256,
             init_checkpoint=None,
             output_dir=None,
             gpu_ids=None,
             do_lower_case=True,
             reading_module='bert',
             split_sign='. ',
             alpha=0.5,
             truncate_method='longer-FO'):
    super(MRCModule, self).__init__(init_checkpoint, output_dir, gpu_ids)

    self.batch_size = 0
    self.max_seq_length = max_seq_length
    self.truncate_method = truncate_method
    self.split_sign = split_sign
    self._do_lower_case = do_lower_case
    self._on_predict = False
    self._reading_module = reading_module
    self._alpha = alpha
    self.__init_args__ = locals()

    assert reading_module in ('bert', 'albert', 'electra'), (
        'Invalid value of `reading_module`: %s. Pick one from '
        '`bert`, `albert` and `electra`.' % reading_module)

    if reading_module == 'albert':
        self.bert_config = get_albert_config(config_file)
    else:
        self.bert_config = get_bert_config(config_file)

    self.tokenizer = get_word_piece_tokenizer(vocab_file, do_lower_case)
    self._key_to_depths = get_key_to_depths(
        self.bert_config.num_hidden_layers)

    if '[CLS]' not in self.tokenizer.vocab:
        self.tokenizer.add('[CLS]')
        self.bert_config.vocab_size += 1
        tf.logging.info('Add necessary token `[CLS]` into vocabulary.')
    if '[SEP]' not in self.tokenizer.vocab:
        self.tokenizer.add('[SEP]')
        self.bert_config.vocab_size += 1
        tf.logging.info('Add necessary token `[SEP]` into vocabulary.')
def __init__(self,
             config_file,
             vocab_file,
             max_seq_length=128,
             label_size=None,
             init_checkpoint=None,
             output_dir=None,
             gpu_ids=None,
             wide_features=None,
             deep_module='bert',
             do_lower_case=True,
             truncate_method='LIFO'):
    super(ClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids)

    self.batch_size = 0
    self.max_seq_length = max_seq_length
    self.label_size = label_size
    self.truncate_method = truncate_method
    self.wide_features = wide_features
    self._deep_module = deep_module
    self._id_to_label = None
    self.__init_args__ = locals()

    assert deep_module in ('bert', 'roberta', 'albert', 'electra'), (
        'Invalid value of `deep_module`: %s. Pick one from '
        '`bert`, `roberta`, `albert` and `electra`.' % deep_module)

    if deep_module == 'albert':
        self.bert_config = get_albert_config(config_file)
    else:
        self.bert_config = get_bert_config(config_file)

    self.tokenizer = get_word_piece_tokenizer(vocab_file, do_lower_case)
    self._key_to_depths = get_key_to_depths(
        self.bert_config.num_hidden_layers)

    if '[CLS]' not in self.tokenizer.vocab:
        self.tokenizer.add('[CLS]')
        self.bert_config.vocab_size += 1
        tf.logging.info('Add necessary token `[CLS]` into vocabulary.')
    if '[SEP]' not in self.tokenizer.vocab:
        self.tokenizer.add('[SEP]')
        self.bert_config.vocab_size += 1
        tf.logging.info('Add necessary token `[SEP]` into vocabulary.')
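# Usage sketch (assumption: a wide-and-deep classifier class, e.g.
# `WideDeepClassifier`, wraps this constructor; names are illustrative).
# `wide_features` presumably lists the discrete features fed to the wide
# side alongside the `deep_module` text encoder:
#
#   model = WideDeepClassifier(
#       'bert_config.json', 'vocab.txt',
#       wide_features=['length', 'source'], deep_module='bert')
#   model.fit(X_train, y_train)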
def __init__(self,
             config_file,
             vocab_file,
             max_seq_length=256,
             init_checkpoint=None,
             output_dir=None,
             gpu_ids=None,
             do_lower_case=True,
             truncate_method='longer-FO'):
    super(MRCModule, self).__init__(init_checkpoint, output_dir, gpu_ids)

    self.batch_size = 0
    self.max_seq_length = max_seq_length
    self.truncate_method = truncate_method
    self._id_to_label = None
    self.__init_args__ = locals()

    self.bert_config = get_bert_config(config_file)
    self.tokenizer = get_word_piece_tokenizer(vocab_file, do_lower_case)
    self._key_to_depths = get_key_to_depths(
        self.bert_config.num_hidden_layers)
def __init__(self,
             vocab_file,
             model_size='base',
             max_seq_length=128,
             init_checkpoint=None,
             output_dir=None,
             gpu_ids=None,
             generator_weight=1.0,
             discriminator_weight=50.0,
             max_predictions_per_seq=20,
             masked_lm_prob=0.15,
             do_whole_word_mask=False,
             do_lower_case=True,
             truncate_method='LIFO'):
    super(LMModule, self).__init__(init_checkpoint, output_dir, gpu_ids)

    self.batch_size = 0
    self.max_seq_length = max_seq_length
    self.generator_weight = generator_weight
    self.discriminator_weight = discriminator_weight
    self.masked_lm_prob = masked_lm_prob
    self.do_whole_word_mask = do_whole_word_mask
    self.truncate_method = truncate_method
    self._model_size = model_size
    self._max_predictions_per_seq = max_predictions_per_seq
    self._id_to_label = None
    self.__init_args__ = locals()

    self.tokenizer = get_word_piece_tokenizer(vocab_file, do_lower_case)
    self._key_to_depths = 'unsupported'

    if '[CLS]' not in self.tokenizer.vocab:
        self.tokenizer.add('[CLS]')
        tf.logging.info('Add necessary token `[CLS]` into vocabulary.')
    if '[SEP]' not in self.tokenizer.vocab:
        self.tokenizer.add('[SEP]')
        tf.logging.info('Add necessary token `[SEP]` into vocabulary.')
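# Usage sketch (assumption: an ELECTRA pre-training class, e.g. `ELECTRALM`,
# wraps this constructor; names and paths are illustrative). The two weights
# balance the generator's masked-LM loss against the discriminator's
# replaced-token-detection loss, with the discriminator dominating by
# default (1.0 vs. 50.0):
#
#   model = ELECTRALM(
#       'vocab.txt', model_size='base',
#       generator_weight=1.0, discriminator_weight=50.0)
#   model.fit(X_train)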