Example #1
def train_on_all_set():
    # Fine-tune the multi-output BERT model on the full training set.
    train_config = get_config()
    bert_config = get_bert_config(train_config)
    import pickle
    with open('tok_text_uncased.pkl', 'rb') as h:
        text = pickle.load(h)
    with open('y_train.pkl', 'rb') as h:
        label = pickle.load(h)
    with open('y_aux.pkl', 'rb') as h:
        aux = pickle.load(h)
    iden_df = pd.read_csv('processed_data/train_tok_iden.csv')
    weights = get_weights_new(iden_df)  # per-example sample weights from the identity columns
    del iden_df
    lw = 1 / np.mean(weights)  # loss weight that renormalizes the weighted main loss to unit mean
    train_seg = [[0 for _ in t] for t in text]  # all-zero segment ids (single-segment inputs)

    train_gen = GeneralDataGenerator(
        inputs=[text, train_seg],
        outputs=[label, aux],
        sample_weights=[weights, np.ones_like(weights)],
        batch_size=32,
        pad_fn=[
            lambda x: seq_padding(x, truncate=False),
            lambda x: seq_padding(x, truncate=False)
        ])
    # train_gen = AllDataGenerator(text, label, aux, sample_weight)

    with tf.device('/cpu:0'):
        # Build the template model on CPU so multi_gpu_model can replicate it across GPUs.
        model = get_bert_multi_model(bert_config)
    # model.load_weights('save_models/bert.weights.h5')

    # optimizer parameters
    lr = 2e-5
    weight_decay = 0.01
    bsz = 32  # unused here; the batch size is set on the generator above
    decay_steps = 1 * len(train_gen)  # decay over one full epoch of steps
    warmup_steps = int(0.1 * decay_steps)  # 10% linear warmup

    optimizer = AdamWarmup(
        decay_steps=decay_steps,
        warmup_steps=warmup_steps,
        lr=lr,
        weight_decay=weight_decay,
    )

    parallel_model = multi_gpu_model(model, gpus=2)
    parallel_model.compile(loss='binary_crossentropy',
                           optimizer=optimizer,
                           loss_weights=[lw, 1.])
    parallel_model.fit_generator(
        iter(train_gen),
        steps_per_epoch=len(train_gen),
        epochs=1,
        max_queue_size=100,
    )
    model.save('save_models/bert.weights-uncased-new_weight_all.h5')
    print("DONE")
Example #2
def train_on_train_test_split():
    # Fine-tune the base BERT model on a train/validation split,
    # logging validation metrics through the Logger callback.
    train_config = get_config()
    bert_config = get_bert_config(train_config)
    cased = train_config.BERT_DIR.split('/')[-1].startswith('cased')
    tokenizer = FullTokenizer(bert_config.vocab, do_lower_case=not cased)  # lowercase only for uncased models

    with tf.device('/cpu:0'):
        model = get_bert_base_model(bert_config)

    text, label = load_data(os.path.join(train_config.DATA_DIR, 'train.csv'))
    train_text, val_text, train_label, val_label = train_test_split(
        text, label, test_size=0.055, random_state=59)
    train_gen = DataGenerator(train_text,
                              train_label,
                              tokenizer,
                              batch_size=32)

    val_text = tokenize_examples(val_text, tokenizer, max_len=512)
    val_text = seq_padding(val_text)

    logger = Logger(model=model,
                    val_text=val_text,
                    val_label=(val_label > 0.5).astype(np.float32))

    # optimizer parameters
    lr = 2e-5
    weight_decay = 0.01
    bsz = 32  # unused here; the batch size is set on the generator above
    decay_steps = 1 * len(train_gen)
    warmup_steps = int(0.1 * decay_steps)

    optimizer = AdamWarmup(
        decay_steps=decay_steps,
        warmup_steps=warmup_steps,
        lr=lr,
        weight_decay=weight_decay,
    )

    parallel_model = multi_gpu_model(model, gpus=4)
    parallel_model.compile(loss='binary_crossentropy', optimizer=optimizer)
    parallel_model.fit_generator(iter(train_gen),
                                 steps_per_epoch=len(train_gen),
                                 epochs=1,
                                 callbacks=[logger],
                                 max_queue_size=100)
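
The Logger callback used above is defined elsewhere; judging by its arguments it evaluates the single-GPU template model on the held-out split at the end of each epoch. A minimal sketch of such a callback, assuming ROC-AUC as the reported metric (the metric and method body are assumptions; only the constructor arguments come from the call above):

from sklearn.metrics import roc_auc_score
from keras.callbacks import Callback

class Logger(Callback):
    def __init__(self, model, val_text, val_label):
        super().__init__()
        self.eval_model = model  # the template model, not the multi-GPU replica
        self.val_text = val_text
        self.val_label = val_label

    def on_epoch_end(self, epoch, logs=None):
        # Predicting with the template model keeps evaluation on one device.
        pred = self.eval_model.predict(self.val_text, batch_size=64)
        print(f'epoch {epoch}: val AUC = {roc_auc_score(self.val_label, pred):.5f}')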
Example #3
def train_ml_all_set():
    # Fine-tune the multi-output BERT model on the weighted training split.
    train_config = get_config()
    bert_config = get_bert_config(train_config)
    import pickle
    with open('tok_text_uncased.pkl', 'rb') as h:
        text = pickle.load(h)
    with open('y_train.pkl', 'rb') as h:
        label = pickle.load(h)
    with open('y_aux.pkl', 'rb') as h:
        aux = pickle.load(h)
    iden_df = pd.read_csv('processed_data/train_tok_iden.csv')
    weights = get_weights_new(iden_df)
    del iden_df

    train_text, _, train_label, _, train_aux, _, train_weights, _ = train_test_split(
        text, label, aux, weights, test_size=0.055, random_state=59)
    train_seg = [[0 for _ in t] for t in train_text]

    train_gen = GeneralDataGenerator(
        inputs=[train_text, train_seg],
        outputs=[train_label, train_aux],
        sample_weights=[train_weights,
                        np.ones_like(train_weights)],
        pad_fn=[
            lambda x: seq_padding(x, truncate=False),
            lambda x: seq_padding(x, truncate=False)
        ],
        batch_size=64)

    with tf.device('/cpu:0'):
        model = get_bert_multi_model(bert_config)

    # optimizer = Adam(lr=2e-5)
    # optimizer parameters
    lr = 2e-5
    weight_decay = 0.01
    bsz = 32  # unused; the generator above uses batch_size=64
    decay_steps = 1 * len(train_gen)
    warmup_steps = int(0.1 * decay_steps)

    optimizer = AdamWarmup(
        decay_steps=decay_steps,
        warmup_steps=warmup_steps,
        lr=lr,
        weight_decay=weight_decay,
        weight_decay_pattern=[  # decay only variables whose names match these fragments
            'embeddings', 'kernel', 'W1', 'W2', 'Wk', 'Wq', 'Wv', 'Wo'
        ],
    )

    parallel_model = multi_gpu_model(model, gpus=4)
    # parallel_model.compile(loss=[focal_loss(gamma=2., alpha=.25), 'binary_crossentropy'], optimizer=optimizer)
    parallel_model.compile(loss='binary_crossentropy', optimizer=optimizer)

    parallel_model.fit_generator(
        iter(train_gen),
        steps_per_epoch=len(train_gen),
        epochs=1,
        max_queue_size=20,
    )
    model.save('save_models/bert.weights-large-nw.h5')
    # print('SAVED')
    # parallel_model.fit_generator(train_gen.__iter__(),
    #                              steps_per_epoch=len(train_gen),
    #                              epochs=1,
    #                              max_queue_size=20,
    #                              )
    # model.save('save_models/bert.weights-uncased-ml2-e2.h5')
    print("DONE")
Example #4
def train_elmo():
    # Train the ELMo + word2vec LSTM model with weighted samples.
    # Vocabulary bookkeeping: 489603 words; <S> = 489604, </S> = 489605.
    df = pd.read_csv('new_processed_data/train_tok.csv')
    iden_df = pd.read_csv('processed_data/train_tok_iden.csv')

    datadir = os.path.join('elmo_model')
    options_file = os.path.join(
        datadir, 'elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json')
    weight_file = os.path.join(
        datadir, 'elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5')

    texts = df['comment_text'].values
    labels_aux = df[AUX_COLUMNS].values
    identities = df[IDENTITY_COLUMNS].fillna(0).values
    labels = df[TARGET_COLUMN].values
    weights = get_weights_new(iden_df)

    # import pickle
    # word_index = pickle.load(open('new_processed_data/word_index.pkl', 'rb'))
    # with open('new_processed_data/vocab.txt', 'w', encoding='utf8') as f:
    #     for k in word_index.keys():
    #         f.write(f'{k}\n')
    #     f.write('</S>')
    #     f.write('<S>')
    import pickle
    embedding_matrix = pickle.load(open('new_processed_data/emb.pkl', 'rb'))
    texts_ids = pickle.load(open('new_processed_data/texts.pkl', 'rb'))

    batcher = BT('new_processed_data/vocab.txt', 50)  # character batcher for ELMo (max 50 chars per token)

    del iden_df
    del df

    train_ind, val_ind = train_test_split(range(len(texts)),
                                          random_state=59,
                                          test_size=0.055)
    # FOR ELMO
    train_texts, val_texts = texts[train_ind], texts[val_ind]
    train_texts = [s.split(' ')[:512] for s in train_texts]
    val_texts = [s.split(' ')[:512] for s in val_texts]

    # FOR W2v
    train_texts_ids = [texts_ids[i] for i in train_ind]
    val_texts_ids = [texts_ids[i] for i in val_ind]

    train_texts_ids = [ti[:512] for ti in train_texts_ids]
    val_texts_ids = [ti[:512] for ti in val_texts_ids]

    train_labels, val_labels = labels[train_ind], labels[val_ind]
    train_weight = weights[train_ind]
    lw = 1 / np.mean(train_weight)
    train_aux_labels = labels_aux[train_ind]
    train_iden, val_iden = identities[train_ind], identities[val_ind]

    # One pad function per input stream: word ids first, ELMo character ids second.
    pad_fn = [
        lambda x: seq_padding(x, truncate=False), batcher.batch_sentences
    ]
    train_gen = GeneralDataGenerator(
        inputs=[train_texts_ids, train_texts],
        outputs=[train_labels, train_aux_labels],
        sample_weights=[train_weight, np.ones_like(train_weight)],
        batch_size=32,
        pad_fn=pad_fn)
    val_gen = ELMoPredictGenerator(text_ids=val_texts_ids,
                                   text=val_texts,
                                   pad_fn=pad_fn,
                                   batch_size=32)

    model = get_lstm_elmo_model(embedding_matrix, weight_file, options_file,
                                1024, len(AUX_COLUMNS))

    # lr = 1e-3
    # weight_decay = 0.01
    # bsz = 32
    # decay_steps = 3 * len(train_gen)
    # warmup_steps = int(0.05 * decay_steps)
    #
    # optimizer = AdamWarmup(
    #     decay_steps=decay_steps,
    #     warmup_steps=warmup_steps,
    #     lr=lr,
    #     weight_decay=weight_decay
    # )
    optimizer = Adam(1e-3)
    load_model_weights(model,
                       'save_models/weights.3-93.597.elmo_w2v_lstm2_dp05.pkl')
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                  loss_weights=[lw, 1.])
    model.summary()

    logger = KFoldLogger('elmo_w2v_lstm2_dp05',
                         val_gen,
                         val_true=val_labels,
                         val_iden=val_iden)

    model.fit_generator(iter(train_gen),
                        steps_per_epoch=len(train_gen),
                        epochs=5,
                        callbacks=[logger],
                        initial_epoch=3)  # resume from epoch 3 to match the loaded checkpoint
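
BT looks like an alias for the Batcher class from the bilm-tf package, which turns whitespace-tokenized sentences into character-id arrays (here capped at 50 characters per token) for ELMo's character CNN. A usage sketch under that assumption:

from bilm import Batcher  # assumption: BT = Batcher from bilm-tf

batcher = Batcher('new_processed_data/vocab.txt', 50)
sentences = [['the', 'comment', 'text'], ['another', 'example']]
# shape: (batch, longest sentence + 2, 50); the +2 covers <S> and </S>
char_ids = batcher.batch_sentences(sentences)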
Example #5
def pad_fn(ts):
    # BPE-encode each text, truncate to 512 tokens, then pad the batch.
    ts = [bpe.encode(t)[:512] for t in ts]
    return seq_padding(ts, truncate=False)
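
A hypothetical usage of this helper, assuming bpe is any subword encoder whose encode method returns a list of token ids (the texts below are made up):

texts = ['first comment', 'a somewhat longer second comment']
batch = pad_fn(texts)  # ids truncated to 512 tokens, zero-padded to equal length
print(batch.shape)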