Example No. 1
def simple_bert():
    set_seed(33)

    opt = Adam(learning_rate=2e-5)

    id_ = Input((MAX_SEQUENCE_LENGTH, ), dtype=tf.int32)
    mask_ = Input((MAX_SEQUENCE_LENGTH, ), dtype=tf.int32)
    atn_ = Input((MAX_SEQUENCE_LENGTH, ), dtype=tf.int32)

    config = BertConfig()
    config.output_hidden_states = False  # Set to True to obtain hidden states
    bert_model = TFBertModel.from_pretrained('bert-base-uncased',
                                             config=config)

    embedding = bert_model(id_, attention_mask=mask_, token_type_ids=atn_)[0]
    x = Bidirectional(  # wrapping the LSTM makes it bidirectional
        LSTM(  # a single (unidirectional) LSTM
            64,
            # weight initialization
            kernel_initializer='he_normal',
            # return the output for every token; with False only the last one is returned
            return_sequences=True))(embedding)
    #x=Attention(128)(x)

    x = GlobalAveragePooling1D()(x)
    #x = Dropout(0.2)(x)
    #x = Dense(64, activation='relu')(embedding)
    out = Dense(len(map_label), activation='softmax')(x)

    model = Model(inputs=[id_, mask_, atn_], outputs=out)
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])

    return model
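A minimal usage sketch for the model above (not part of the original example). It assumes the globals the function already relies on (MAX_SEQUENCE_LENGTH, map_label), a recent transformers release, and hypothetical `texts` / `labels` arrays; the three inputs are fed in the order the model expects (ids, attention mask, token type ids):

from transformers import BertTokenizer
import numpy as np

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
enc = tokenizer(list(texts),                 # hypothetical list of strings
                padding='max_length',
                truncation=True,
                max_length=MAX_SEQUENCE_LENGTH,
                return_tensors='np')

model = simple_bert()
model.fit([enc['input_ids'], enc['attention_mask'], enc['token_type_ids']],
          np.asarray(labels),                # integer class ids for sparse_categorical_crossentropy
          epochs=2,
          batch_size=16)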
Example No. 2
def simple_bert():
    set_seed(33)

    opt = Adam(learning_rate=2e-5)

    id_ = Input((MAX_SEQUENCE_LENGTH, ), dtype=tf.int32)
    mask_ = Input((MAX_SEQUENCE_LENGTH, ), dtype=tf.int32)
    atn_ = Input((MAX_SEQUENCE_LENGTH, ), dtype=tf.int32)

    config = BertConfig()
    config.output_hidden_states = False  # Set to True to obtain hidden states
    bert_model = TFBertModel.from_pretrained('bert-base-uncased',
                                             config=config)

    embedding = bert_model(id_, attention_mask=mask_, token_type_ids=atn_)[0]

    x = GlobalAveragePooling1D()(embedding)
    #x = Dropout(0.2)(x)
    #x = Dense(64, activation='relu')(embedding)
    out = Dense(len(map_label), activation='softmax')(x)

    model = Model(inputs=[id_, mask_, atn_], outputs=out)
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])

    return model
Example No. 3
def get_bert_hidden(n_hidden_layers=1, bert_path=BERT_PATH):
    input_word_ids = tf.keras.layers.Input(
        (MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_word_ids')
    input_masks = tf.keras.layers.Input(
        (MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_masks')
    input_segments = tf.keras.layers.Input(
        (MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_segments')
    
    config = BertConfig()
    config.output_hidden_states = True
    bert_layer = TFBertModel.from_pretrained(bert_path, config=config)
    hidden_layers = bert_layer([input_word_ids, input_masks, input_segments])[-1]  
    
    selected_hidden_layers = list()
    for i in range(n_hidden_layers):
        layer_idx = -(i+1)
        selected_hidden_layers.append(
            tf.reshape(hidden_layers[layer_idx][:,0], (-1,768))
        )
    if n_hidden_layers > 1:
        output_layer = tf.keras.layers.concatenate(inputs=selected_hidden_layers, axis=1)
    else:
        output_layer = selected_hidden_layers[0]
    
    bert_model = tf.keras.models.Model(inputs=[input_word_ids, input_masks, input_segments], 
                                       outputs=output_layer)
    return bert_model
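An illustrative usage sketch for the feature extractor above (not from the original source); `word_ids`, `masks`, and `segments` are hypothetical int32 arrays of shape (n, MAX_SEQUENCE_LENGTH):

feature_extractor = get_bert_hidden(n_hidden_layers=4)
cls_features = feature_extractor.predict([word_ids, masks, segments])
print(cls_features.shape)  # (n, 4 * 768): the [CLS] vectors of the last four layers, concatenated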
Example No. 4
    def __init__(self, bert_path, dropout, hidden_size, output_size):
        super().__init__()
        config = BertConfig()
        config.output_hidden_states = True
        self.bert_layer = BertModel.from_pretrained(bert_path, config=config)
        self.dropout_layer = torch.nn.Dropout(dropout)
        self.linear_layer = torch.nn.Linear(hidden_size, output_size)
        self.activation = torch.nn.Sigmoid()
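The constructor above is shown without its forward pass. A minimal sketch of a forward method consistent with these layers, assuming the pooled [CLS] representation feeds the linear head (an assumption about intent, not the original author's code):

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        # hypothetical forward pass: pooled [CLS] -> dropout -> linear -> sigmoid
        outputs = self.bert_layer(input_ids,
                                  attention_mask=attention_mask,
                                  token_type_ids=token_type_ids)
        pooled = outputs[1]  # pooler output for the [CLS] token
        x = self.dropout_layer(pooled)
        return self.activation(self.linear_layer(x))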
Example No. 5
def get_book_feature(b_isbn_embedd, b_author_embedd, b_year_embedd,
                     b_publisher_embedd, book_title_id, book_title_type_id,
                     book_title_mask):
    # first attach a Dense layer to each of the first four features
    b_isbn_dense = keras.layers.Dense(b_dense,
                                      activation='relu',
                                      kernel_regularizer=tf.nn.l2_loss,
                                      name='b_isbn_dense')(b_isbn_embedd)
    b_author_dense = keras.layers.Dense(b_dense,
                                        activation='relu',
                                        kernel_regularizer=tf.nn.l2_loss,
                                        name='b_author_dense')(b_author_embedd)
    b_year_dense = keras.layers.Dense(b_dense,
                                      activation='relu',
                                      kernel_regularizer=tf.nn.l2_loss,
                                      name='b_year_dense')(b_year_embedd)
    b_publisher_dense = keras.layers.Dense(
        b_dense,
        activation='relu',
        kernel_regularizer=tf.nn.l2_loss,
        name='b_publisher_dense')(b_publisher_embedd)
    # concatenate these four features; b_combine_four shape = (?, 1, 16)
    b_combine_four = keras.layers.concatenate(
        [b_isbn_dense, b_author_dense, b_year_dense, b_publisher_dense],
        name='b_four_combine')
    print('b_combine_four.shape', b_combine_four.shape)
    b_combine_four_reshape = keras.layers.Reshape(
        [b_combine_four.shape[2]],
        name='b_combine_four_reshape')(b_combine_four)

    config = BertConfig()
    # also return the hidden states
    config.output_hidden_states = True
    bert_model = TFBertModel.from_pretrained(bert_path +
                                             'bert-base-uncased-tf_model.h5',
                                             config=config)
    book_title_cls = bert_model(book_title_id,
                                attention_mask=book_title_mask,
                                token_type_ids=book_title_type_id)
    print(len(book_title_cls))
    print(book_title_cls[0].shape)
    print(book_title_cls[1].shape)
    book_feature_layer = keras.layers.Dense(64, activation='tanh')(
        book_title_cls[1])
    b_combine_book = keras.layers.concatenate(
        [book_feature_layer, b_combine_four_reshape],
        axis=1,
        name='b_combine_book')
    # produce the book feature matrix
    b_feature_layer = keras.layers.Dense(200,
                                         name='b_feature_layer',
                                         activation='tanh')(b_combine_book)
    return b_feature_layer
Example No. 6
def dual_bert():
    set_seed(33)

    opt = Adam(learning_rate=2e-5)

    id1 = Input((MAX_SEQUENCE_LENGTH, ), dtype=tf.int32)
    id2 = Input((MAX_SEQUENCE_LENGTH, ), dtype=tf.int32)

    mask1 = Input((MAX_SEQUENCE_LENGTH, ), dtype=tf.int32)
    mask2 = Input((MAX_SEQUENCE_LENGTH, ), dtype=tf.int32)

    atn1 = Input((MAX_SEQUENCE_LENGTH, ), dtype=tf.int32)
    atn2 = Input((MAX_SEQUENCE_LENGTH, ), dtype=tf.int32)

    config = BertConfig()
    config.output_hidden_states = False  # Set to True to obtain hidden states
    bert_model1 = TFBertModel.from_pretrained('bert-base-uncased',
                                              config=config)
    bert_model2 = TFBertModel.from_pretrained('bert-base-uncased',
                                              config=config)

    embedding1 = bert_model1(id1, attention_mask=mask1, token_type_ids=atn1)[0]
    embedding2 = bert_model2(id2, attention_mask=mask2, token_type_ids=atn2)[0]
    x = Concatenate()([embedding1, embedding2])
    x = keras.layers.Bidirectional(  # wrapping the LSTM makes it bidirectional
        keras.layers.LSTM(  # a single (unidirectional) LSTM
            64,
            # weight initialization
            kernel_initializer='he_normal',
            # return the output for every token; with False only the last one is returned
            return_sequences=True))(x)
    #x = Lambda(lambda x: x[:, 0], name='CLS-token')(x)  # dimensionality reduction
    #x1 = GlobalAveragePooling1D()(embedding1)
    #x2 = GlobalAveragePooling1D()(embedding2)

    #x = Concatenate()([x1, x2])
    x = Attention(128)(x)  # add an attention layer

    x = Dense(64, activation='relu')(x)
    x = Dropout(0.2)(x)
    #out = Dense(len(map_label), activation='softmax')(x)
    out = Dense(5, activation='softmax')(x)

    model = Model(inputs=[id1, mask1, atn1, id2, mask2, atn2], outputs=out)
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])  # add an evaluation metric

    return model
Example No. 7
def get_book_feature(book_title_id, book_title_type_id, book_title_mask):
    config = BertConfig()
    # also return the hidden states
    config.output_hidden_states = True
    bert_model = TFBertModel.from_pretrained(bert_path +
                                             'bert-base-uncased-tf_model.h5',
                                             config=config)
    book_title_cls = bert_model(book_title_id,
                                attention_mask=book_title_mask,
                                token_type_ids=book_title_type_id)
    print(len(book_title_cls))
    print(book_title_cls[0].shape)
    print(book_title_cls[1].shape)
    book_feature_layer = keras.layers.Dense(100, activation='tanh')(
        book_title_cls[1])
    return book_feature_layer
Example No. 8
    def __init__(self, config: BertConfig, num_hidden_layers=None):
        super().__init__()
        self.logger = get_logger(__name__)
        config.output_hidden_states = True
        self.embeddings = BertEmbeddings(config)
        num_hidden_layers = config.num_hidden_layers if num_hidden_layers is None else num_hidden_layers
        assert num_hidden_layers > 0, 'bert_layers must > 0'

        # note: the output differs from that of the original transformers BERT encoder
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states

        layer = BertLayer(config)
        self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(num_hidden_layers)])
        self.config = config
        self.num_hidden_layers = num_hidden_layers
        self.apply(self.init_bert_weights)
Example No. 9
def create_model1():
    q_id = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH, ), dtype=tf.int32)
    a_id = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH, ), dtype=tf.int32)

    q_mask = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH, ), dtype=tf.int32)
    a_mask = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH, ), dtype=tf.int32)

    q_atn = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH, ), dtype=tf.int32)
    a_atn = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH, ), dtype=tf.int32)

    config = BertConfig()
    config.output_hidden_states = False

    bert_model = TFBertModel.from_pretrained('bert-base-uncased',
                                             config=config)

    q_embedding = bert_model(q_id, attention_mask=q_mask,
                             token_type_ids=q_atn)[0]
    a_embedding = bert_model(a_id, attention_mask=a_mask,
                             token_type_ids=a_atn)[0]

    q = tf.keras.layers.GlobalAveragePooling1D()(q_embedding)
    a = tf.keras.layers.GlobalAveragePooling1D()(a_embedding)

    x = tf.keras.layers.Concatenate()([q, a])  # combine question and answer embeddings
    x = tf.keras.layers.Reshape((1, x.shape[-1]))(x)

    cnn = tf.keras.layers.Conv1D(64, 3, padding='same', activation='relu')(x)
    cnn = tf.keras.layers.MaxPooling1D(pool_size=1, strides=2)(cnn)
    cnn = tf.keras.layers.BatchNormalization()(cnn)

    lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=64))(cnn)
    lstm = tf.keras.layers.Dropout(0.2)(lstm)

    dense = tf.keras.layers.Dense(64, activation='relu')(lstm)
    dense = tf.keras.layers.BatchNormalization()(dense)

    x = tf.keras.layers.Dense(TARGET_COUNT1, activation='softmax')(dense)
    x = tf.keras.layers.BatchNormalization()(x)

    model = tf.keras.models.Model(
        inputs=[q_id, q_mask, q_atn, a_id, a_mask, a_atn], outputs=x)

    return model
Example No. 10
def create_model():
    q_id = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH, ), dtype=tf.int32)
    q_mask = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH, ), dtype=tf.int32)
    q_atn = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH, ), dtype=tf.int32)

    config = BertConfig()
    config.output_hidden_states = False

    bert_model = TFBertModel.from_pretrained('bert-base-uncased',
                                             config=config)
    q_embedding = bert_model(q_id, attention_mask=q_mask,
                             token_type_ids=q_atn)[0]
    q = tf.keras.layers.GlobalAveragePooling1D()(q_embedding)

    x = tf.keras.layers.Dropout(0.2)(q)
    x = tf.keras.layers.Dense(TARGET_COUNT, activation='sigmoid')(x)

    model = tf.keras.models.Model(inputs=[q_id, q_mask, q_atn], outputs=x)
    return model
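The model above ends in TARGET_COUNT independent sigmoid units and is returned uncompiled, which suggests a multi-label setup; one plausible way to compile it (an assumption, not shown in the original):

model = create_model()
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
              loss='binary_crossentropy',
              metrics=[tf.keras.metrics.AUC()])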
Example No. 11
def create_model():
    id = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH, ), dtype=tf.int32)
    mask = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH, ), dtype=tf.int32)
    attn = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH, ), dtype=tf.int32)
    config = BertConfig()
    config.output_hidden_states = True
    bert_model = TFBertModel.from_pretrained(
        'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-tf_model.h5',
        config=config)
    _, _, hidden_states = bert_model(id,
                                     attention_mask=mask,
                                     token_type_ids=attn)
    h12 = tf.reshape(hidden_states[-1][:, 0], (-1, 1, 768))
    h11 = tf.reshape(hidden_states[-2][:, 0], (-1, 1, 768))
    h10 = tf.reshape(hidden_states[-3][:, 0], (-1, 1, 768))
    h09 = tf.reshape(hidden_states[-4][:, 0], (-1, 1, 768))
    concat_hidden = tf.keras.layers.Concatenate(axis=2)([h12, h11, h10, h09])
    x = tf.keras.layers.GlobalAveragePooling1D()(concat_hidden)
    x = tf.keras.layers.Dropout(0.2)(x)
    x = tf.keras.layers.Dense(MAX_TARGET_LEN, activation='sigmoid')(x)
    model = tf.keras.models.Model(inputs=[id, mask, attn], outputs=x)
    return model
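A note on the tuple unpacking above: `_, _, hidden_states = bert_model(...)` assumes the model returns a plain tuple, which older transformers releases do; newer releases return a dict-like output object by default. Inside the function, a more version-tolerant equivalent (illustrative sketch) would be:

    outputs = bert_model(id, attention_mask=mask, token_type_ids=attn)
    hidden_states = outputs[-1]  # tuple of 13 tensors for bert-base: embedding output + 12 encoder layers
    # on recent transformers versions this is the same as: outputs.hidden_states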
Example No. 12
    def __init__(
        self,
        pretrained_bert_model,
        language,
        name,
        prediction_type,
        output_hidden_states,
        output_attentions,
        attention_length_before=1,
        attention_length_after=1,
        config_path=None,
        max_length=512,
        number_of_sentence=1,
        number_of_sentence_before=0,
        number_of_sentence_after=0,
        seed=1111,
        hidden_dropout_prob=0.,
        attention_probs_dropout_prob=0.,
        stop_attention_at_sent_before=None,
        stop_attention_before_sent=0,
    ):
        super(BertExtractor, self).__init__()
        # Load pre-trained model tokenizer (vocabulary)
        # Crucially, do not do basic tokenization; PTB is tokenized. Just do wordpiece tokenization.
        if config_path is None:
            configuration = BertConfig()
            configuration.hidden_dropout_prob = hidden_dropout_prob
            configuration.attention_probs_dropout_prob = attention_probs_dropout_prob
            configuration.output_hidden_states = output_hidden_states
            configuration.output_attentions = output_attentions
            self.model = BertModel.from_pretrained(
                pretrained_bert_model,
                config=configuration)  #, config=configuration
        else:
            self.model = BertModel.from_pretrained(
                pretrained_bert_model)  #, config=configuration
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_bert_model)

        self.language = language
        self.attention_length_before = attention_length_before
        self.attention_length_after = attention_length_after
        self.pretrained_bert_model = pretrained_bert_model
        self.NUM_HIDDEN_LAYERS = self.model.config.num_hidden_layers
        self.FEATURE_COUNT = self.model.config.hidden_size
        self.NUM_ATTENTION_HEADS = self.model.config.num_attention_heads
        self.name = name
        self.config = {
            'max_length': max_length,
            'seed': seed,
            'number_of_sentence': number_of_sentence,
            'number_of_sentence_before': number_of_sentence_before,
            'number_of_sentence_after': number_of_sentence_after,
            'attention_length_before': attention_length_before,
            'attention_length_after': attention_length_after,
            'stop_attention_at_sent_before': stop_attention_at_sent_before,
            'stop_attention_before_sent': stop_attention_before_sent,
            'output_hidden_states': output_hidden_states,
            'output_attentions': output_attentions,
            'model_type': 'bert',
            'hidden_size': self.model.config.hidden_size,
            'hidden_act': self.model.config.hidden_act,
            'initializer_range': self.model.config.initializer_range,
            'vocab_size': self.model.config.vocab_size,
            'hidden_dropout_prob': self.model.config.hidden_dropout_prob,
            'num_attention_heads': self.model.config.num_attention_heads,
            'type_vocab_size': self.model.config.type_vocab_size,
            'max_position_embeddings': self.model.config.max_position_embeddings,
            'num_hidden_layers': self.model.config.num_hidden_layers,
            'intermediate_size': self.model.config.intermediate_size,
            'attention_probs_dropout_prob': self.model.config.attention_probs_dropout_prob
        }
        if config_path is not None:
            with open(config_path, 'r') as f:
                self.config.update(json.load(f))

        self.prediction_type = prediction_type  # ['sentence', 'token-level']
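A hypothetical instantiation of the extractor above (argument values are illustrative; per the comment in the constructor, prediction_type is expected to be 'sentence' or 'token-level'):

extractor = BertExtractor(
    pretrained_bert_model='bert-base-uncased',
    language='english',
    name='bert-base_demo',
    prediction_type='sentence',
    output_hidden_states=True,
    output_attentions=False,
)
print(extractor.NUM_HIDDEN_LAYERS, extractor.FEATURE_COUNT)  # 12 768 for bert-base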
Example No. 13
def run_model(pos_train_file, neg_train_file, pos_dev_file, neg_dev_file,
              nrows_train, nrows_dev, epochs, out_dir):
    batch_size = 16

    #x_train = _read_data('../data/train_bal.csv', nrows_train)
    #x_dev = _read_data('../data/dev_bal.csv', nrows_dev)

    #train_data = list( zip( x_train['comment_text'].values, x_train['target'].values  ))

    #train_dataloader = DataLoader(  train_data,
    #                            collate_fn=my_collate,
    #                            batch_size=batch_size , shuffle=True,  )
    # #

    #dev_data = list( zip( x_dev['comment_text'].values, x_dev['target'].values  ))

    #dev_dataloader = DataLoader(  dev_data,
    #                            collate_fn=my_collate,
    #                            batch_size=batch_size, shuffle=False,  )

    train_dataloader = get_data_loader_bal(pos_train_file,
                                           neg_train_file,
                                           batch_size=batch_size,
                                           nrows_pos=nrows_train,
                                           nrows_neg=nrows_train,
                                           mode='train')
    dev_dataloader = get_data_loader_bal(pos_dev_file,
                                         neg_dev_file,
                                         batch_size=batch_size,
                                         nrows_pos=nrows_dev,
                                         nrows_neg=nrows_dev,
                                         mode='dev')

    device = get_device()

    bert_hidden_states = 4
    config = BertConfig()
    config.output_hidden_states = True

    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",         # use the 12-layer BERT model with an uncased vocab
        num_labels=2,                # 2 output labels for binary classification;
                                     # increase this for multi-class tasks
        output_attentions=False,     # whether the model returns attention weights
        output_hidden_states=False,  # whether the model returns all hidden states
    )
    model = model.to(device)

    optimizer = AdamW(
        model.parameters(),
        lr=2e-5,  # args.learning_rate - default is 5e-5, our notebook had 2e-5
        eps=1e-8  # args.adam_epsilon  - default is 1e-8.
    )

    total_steps = len(train_dataloader) * epochs

    # Create the learning rate scheduler.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,  # Default value in run_glue.py
        num_training_steps=total_steps)

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    best_score = -np.inf

    stats_vec = []
    for epoch in range(epochs):
        stats = train_epoch(model, train_dataloader, dev_dataloader, optimizer,
                            scheduler)
        print(stats)

        if stats['accuracy'] > best_score:
            best_score = stats['accuracy']
            f = out_dir + '/' + 'best_model_ch.pt'
            torch.save({
                'epoch': epoch,
                'model': model,
                'stats': stats,
            }, f)

        stats_vec.append(stats)

    stats_vec = pd.DataFrame(stats_vec)

    f = out_dir + '/' + 'last_model_ch.pt'
    torch.save({
        'epoch': epoch,
        'model': model,
        'stats': stats,
    }, f)

    print(stats_vec)
    stats_vec.to_csv(out_dir + '/' + 'stats.csv')
Example No. 14
    def __init__(self, bert_path):
        super().__init__()
        config = BertConfig()
        config.output_hidden_states = True
        self.bert_layer = BertModel.from_pretrained(bert_path, config=config)
Example No. 15
tokenizer_new = BertTokenizer.from_pretrained(distil_bert,
                                              do_lower_case=True,
                                              add_special_tokens=True,
                                              max_length=128,
                                              pad_to_max_length=True)

def tokenize_new(sentences, tokenizer_new):
    input_ids, input_masks, input_segments = [],[],[]
    for sentence in tqdm(sentences):
        inputs = tokenizer_new.encode_plus(sentence, add_special_tokens=True, max_length=128, pad_to_max_length=True, 
                                             return_attention_mask=True, return_token_type_ids=True)
        input_ids.append(inputs['input_ids'])
        input_masks.append(inputs['attention_mask'])
        input_segments.append(inputs['token_type_ids'])        
        
    return np.asarray(input_ids, dtype='int32'), np.asarray(input_masks, dtype='int32'), np.asarray(input_segments, dtype='int32')

config1 = BertConfig(dropout=0.2, attention_dropout=0.2)
config1.output_hidden_states = False
transformer_model = TFBertModel.from_pretrained(distil_bert, config=config1)

input_ids_in = tf.keras.layers.Input(shape=(128,), name='input_token', dtype='int32')
input_masks_in = tf.keras.layers.Input(shape=(128,), name='masked_token', dtype='int32') 

embedding_layer = transformer_model(input_ids_in, attention_mask=input_masks_in)[0]
X = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(50,
                         return_sequences=True,
                         dropout=0.1,
                         recurrent_dropout=0.1,
                         kernel_initializer='normal'))(embedding_layer)
X = tf.keras.layers.GlobalMaxPool1D()(X)
X = tf.keras.layers.Dense(50, activation='relu',kernel_initializer='normal')(X)
X = tf.keras.layers.Dropout(0.2)(X)
X = tf.keras.layers.Dense(1, activation='linear',kernel_initializer='normal')(X)
model1 = tf.keras.Model(inputs=[input_ids_in, input_masks_in], outputs = X)

for layer in model1.layers[:3]:
    layer.trainable = False
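The snippet stops after freezing the input and BERT layers; a plausible continuation (an assumption, not the original code: the linear output suggests a regression target, and `train_texts` / `train_targets` are hypothetical):

ids, masks, segments = tokenize_new(train_texts, tokenizer_new)  # segments are unused by model1
model1.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
               loss='mse',
               metrics=['mae'])
model1.fit([ids, masks], np.asarray(train_targets), epochs=2, batch_size=32)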
Example No. 16
from utils import DataCollatorForMLM, MLMDataset
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
# from train_args import TrainingArguments
from middle_train import Middle_Trainer
import numpy
import torch
import torch.nn as nn
import os
import argparse
from transformers import AutoModelForMaskedLM, BertConfig, BertTokenizer

parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", type=int)
args = parser.parse_args()
os.environ['CUDA_VISIBLE_DEVICES'] = '1,2,3'
config = BertConfig()
config.output_hidden_states = True
config.vocab_size = 41460
model = AutoModelForMaskedLM.from_config(config)
model.bert.embeddings.word_embeddings = nn.Embedding(1839, 768, padding_idx=0)
con_tokenizer = BertTokenizer.from_pretrained('y2d1')
lab_tokenizer = BertTokenizer.from_pretrained('z2d')
data_collator = DataCollatorForMLM(tokenizer=con_tokenizer,
                                   mlm=True,
                                   mlm_probability=0.2)
train_dataset = MLMDataset(con_tokenizer=con_tokenizer,
                           lab_tokenizer=lab_tokenizer,
                           file_path='./data/trainpath')
eval_dataset = MLMDataset(con_tokenizer=con_tokenizer,
                          lab_tokenizer=lab_tokenizer,
                          file_path='./data/evalpath')
training_args = TrainingArguments(
Example No. 17
def run_model(pos_train_file,
              neg_train_file,
              pos_dev_file,
              neg_dev_file,
              nrows_train,
              nrows_dev,
              epochs,
              out_dir,
              dropout=0.2,
              model='bert',
              batch_size=16,
              test_file='../data/test_data_clean.csv',
              lr=2e-5,
              lmda=10.0,
              stnc_emb='last'):

    device = get_device()

    bert_hidden_states = 4

    if model == 'bert':
        config = BertConfig()
        config.output_hidden_states = True
        model = BertForToxic(
            config,
            bert_hidden_states=bert_hidden_states,
            dropout=dropout,
            update_bert=True,
        )
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                                  do_lower_case=True)

    if model == 'distilbert':
        #config = DistilBertConfig()
        config = BertConfig()
        config.output_hidden_states = True
        model = DistilBertForToxic(config,
                                   bert_hidden_states=bert_hidden_states,
                                   dropout=dropout,
                                   update_bert=True,
                                   lmda=lmda,
                                   stnc_emb=stnc_emb)
        #tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                                  do_lower_case=True)

    train_dataloader = get_data_loader_bal(pos_train_file,
                                           neg_train_file,
                                           batch_size=batch_size,
                                           nrows_pos=nrows_train,
                                           nrows_neg=nrows_train * 10,
                                           mode='train',
                                           tokenizer=tokenizer)
    dev_dataloader = get_data_loader_bal(pos_dev_file,
                                         neg_dev_file,
                                         batch_size=batch_size,
                                         nrows_pos=nrows_dev,
                                         nrows_neg=nrows_dev,
                                         mode='dev',
                                         tokenizer=tokenizer)

    model.to(device)

    optimizer = AdamW(
        model.parameters(),
        lr=lr,  # args.learning_rate - default is 5e-5, our notebook had 2e-5
        eps=1e-8  # args.adam_epsilon  - default is 1e-8.
    )

    total_steps = len(train_dataloader) * epochs

    # Create the learning rate scheduler.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,  # Default value in run_glue.py
        num_training_steps=total_steps)

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    best_score = -np.inf

    stats_vec = []
    dev_pred_vec = []
    for epoch in range(epochs):
        stats, dev_pred = train_epoch(model, train_dataloader, dev_dataloader,
                                      optimizer, scheduler)
        print(epoch, stats)

        if stats['accuracy'] > best_score:
            best_score = stats['accuracy']
            f = out_dir + '/' + 'best_model_ch.pt'
            torch.save({
                'epoch': epoch,
                'model': model,
                'stats': stats,
            }, f)

        stats_vec.append(stats)
        dev_pred_vec.append(dev_pred)

    stats_vec = pd.DataFrame(stats_vec)
    dev_pred_vec = pd.concat(dev_pred_vec, axis=0)

    f = out_dir + '/' + 'last_model_ch.pt'
    torch.save({
        'epoch': epoch,
        'model': model,
        'stats': stats,
    }, f)

    print(stats_vec)
    stats_vec.to_csv(out_dir + '/' + 'stats.csv')

    out_file = out_dir + '/train_pred.csv'
    df = get_data_pred(
        train_dataloader,
        model,
        out_file,
    )

    out_file = out_dir + '/dev_pred.csv'
    df = get_data_pred(
        dev_dataloader,
        model,
        out_file,
    )

    test_dataloader = get_data_loader_pred(test_file, tokenizer, nrows=None)
    out_file = out_dir + '/test_pred.csv'
    df = get_data_pred(
        test_dataloader,
        model,
        out_file,
    )
Example No. 18
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        help="The name of the task for training.")
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument("--bert_model",
                        default="bert-base-uncased",
                        type=str,
                        help="student bert model configuration folder")
    parser.add_argument("--encoder_checkpoint",
                        default=None,
                        type=str,
                        help="check point for student encoder")
    parser.add_argument("--cls_checkpoint",
                        default=None,
                        type=str,
                        help="check point for student classifier")
    parser.add_argument("--alpha",
                        default=0.95,
                        type=float,
                        help="alpha for distillation")
    parser.add_argument("--T",
                        default=10.,
                        type=float,
                        help="temperature for distillation")
    parser.add_argument("--beta",
                        default=0.0,
                        type=float,
                        help="weight for AT loss")
    parser.add_argument("--fc_layer_idx",
                        default=None,
                        type=str,
                        help="layers ids we will put FC layers on")
    parser.add_argument("--normalize_patience",
                        default=False,
                        help="normalize patience or not")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="do training or not")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="do evaluation during training or not")

    parser.add_argument("--train_type", default="finetune_teacher",
                        choices=["finetune_teacher","train_student"],
                        help="choose which to train")
    parser.add_argument("--log_every_step",
                        default=50,
                        type=int,
                        help="output to log every global x training steps, default is 1")
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=2e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--num_train_epochs",
                        default=3,
                        type=int,
                        help="Total number of training epochs to perform.")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--logging_steps',
                        type=int,
                        default=1000,
                        help="Log every X updates steps.")
    parser.add_argument('--student_hidden_layers',
                        type=int,
                        default=12,
                        help="number of transformer layers for student, default is None (use all layers)")
    parser.add_argument('--teacher_prediction',
                        type=str,
                        default=None,
                        help="teacher prediction file to guild the student's output")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument('--overwrite_output_dir', action='store_true',
                        help="Overwrite the content of the output directory")
    args = parser.parse_args()

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
    logger.info('actual batch size on all GPU = %d' % args.train_batch_size)

    if args.train_type == 'finetune_teacher':
        args.student_hidden_layers = 12 if 'base' in args.bert_model else 24
        args.alpha = 0.0   # alpha = 0 is equivalent to fine-tuning for KD
    elif args.train_type == "train_student":
        args.student_hidden_layers = 6
        args.kd_model = "kd.cls"
        args.alpha = 0.7
        args.beta = 500
        args.T = 10
        args.fc_layer_idx = "1,3,5,7,9"   # this for pkd-skip
        args.normalize_patience = True
    else:
        raise ValueError("please pick train_type from finetune_teacher,train_student")

    if args.encoder_checkpoint is None:
        args.encoder_checkpoint = os.path.join(args.bert_model, 'pytorch_model.bin')
        logger.info('encoder checkpoint not provided, use pre-trained at %s instead' % args.encoder_checkpoint)

    if args.do_train:
        # Create output directory if needed
        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir))


    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()
    #args.n_gpu = 1
    logger.info("device: {} n_gpu: {}".format(args.device, args.n_gpu))

    # set seed
    set_seed(args)

    # prepare task
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    args.num_labels = len(label_list)

    # prepare tokenizer and model
    config = BertConfig()
    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=True)

    config.output_hidden_states = True

    encoder = BertForSequenceClassificationEncoder(config, num_hidden_layers=args.student_hidden_layers)
    classifier = FCClassifierForSequenceClassification(config, args.num_labels, config.hidden_size, 0)

    n_student_layer = len(encoder.bert.encoder.layer)
    encoder = load_model(encoder, args.encoder_checkpoint, args, 'student', verbose=True)
    logger.info('*' * 77)
    classifier = load_model(classifier, args.cls_checkpoint, args, 'classifier', verbose=True)


    n_param_student = count_parameters(encoder) + count_parameters(classifier)
    logger.info('number of layers in student model = %d' % n_student_layer)
    logger.info('num parameters in student model are %d' % n_param_student)

    # Training
    if args.do_train:
        read_set = 'train'
        if args.train_type == "train_student":
            assert args.teacher_prediction is not None
            assert args.alpha > 0
            logger.info('loading teacher\'s prediction')
            teacher_predictions = pickle.load(open(args.teacher_prediction, 'rb'))['train'] if args.teacher_prediction is not None else None
            logger.info('teacher acc = %.2f, teacher loss = %.5f' % (
                teacher_predictions['acc'] * 100, teacher_predictions['loss']))
            train_examples, train_dataloader, _ = get_task_dataloader(args, read_set, tokenizer,
                                                                      SequentialSampler,
                                                                      batch_size=args.train_batch_size,
                                                                      knowledge=teacher_predictions['pred_logit'],
                                                                      extra_knowledge=teacher_predictions[
                                                                          'feature_maps'])
        else:
            assert args.alpha == 0
            logger.info("running teacher fine-tuning")
            train_examples, train_dataloader, _ = get_task_dataloader(args, read_set, tokenizer,
                                                                      SequentialSampler,
                                                                      batch_size=args.train_batch_size)

        global_step, tr_loss = train(args, train_dataloader, encoder, classifier, tokenizer)
        #################
        # information of teacher model (like [CLS])
        #################
        if args.train_type == "finetune_teacher":
            all_res = {'train': None}

            encoder_file = os.path.join(args.output_dir,f'{args.train_type}_epoch{args.num_train_epochs-1}.encoder.pkl')
            cls_file = os.path.join(args.output_dir,f'{args.train_type}_epoch{args.num_train_epochs-1}.cls.pkl')
            print("encoder_file")

            encoder = BertForSequenceClassificationEncoder(config, num_hidden_layers=args.student_hidden_layers)
            classifier = FCClassifierForSequenceClassification(config, args.num_labels, config.hidden_size, 0)

            encoder = load_model(encoder, encoder_file, args, 'exact', verbose=True)
            classifier = load_model(classifier, cls_file, args, 'exact', verbose=True)
            
            train_res = eval_model_dataloader(encoder, classifier, train_dataloader, args.device, detailed=True,
                                              verbose=False)
            all_res['train'] = train_res

            logger.info('saving teacher results')

            fname = os.path.join(args.output_dir,
                                 args.task_name + f'_teacher_{args.student_hidden_layers}layer_information.pkl')
            with open(fname, 'wb') as fp:
                pickle.dump(all_res, fp)

        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Evaluation
    if args.do_eval:


        test_examples, test_dataloader, test_label_ids = get_task_dataloader(args, 'dev', tokenizer,
                                                                             SequentialSampler,
                                                                             batch_size=args.eval_batch_size)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(test_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        result = evaluate(args, test_label_ids, encoder, classifier, test_dataloader)

        output_test_file = os.path.join(args.output_dir, "test_results_" + '.txt')
        with open(output_test_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
    return
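An illustrative way to drive the script above from Python (the script name, task, and paths are placeholders; the task must be registered in `processors`):

import sys
sys.argv = [
    'train_distill.py',            # hypothetical script name
    '--task_name', 'mrpc',         # must be a key in `processors`
    '--data_dir', './data/MRPC',
    '--output_dir', './output/mrpc_teacher',
    '--train_type', 'finetune_teacher',
    '--do_train', '--do_eval',
    '--num_train_epochs', '3',
    '--train_batch_size', '32',
]
main()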