示例#1
0
def train_5_round():
    """Run 5 rounds of 5-fold CV training for the word-level ESIM model.

    Each fold trains a fresh model, restores the best checkpoint (by
    validation F1, tracked by the ``CB`` callback) and saves validation
    and test-set predictions under ``../prediction/esim_word/``.  Fold
    F1 scores are appended incrementally to
    ``../log/5round_esim_word_f1.csv``.
    """
    # Let TensorFlow grow GPU memory on demand instead of pre-allocating it.
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    session = tf.Session(config=tf_config)
    KTF.set_session(session)

    f1_record = []
    for round_id in range(1, 6):
        f1_record.append(['round', round_id])

        vocabulary, embed_matrix = dh.load_embed(file_name='../data/word_embedding.txt')

        train_file_name = '../data/train.csv'
        test_file_name = '../data/test.csv'
        # NOTE(review): unlike the sibling trainers, data_pipeline is called
        # here without maxlen=config['maxlen'] — confirm the default matches.
        q1, q2, y = data_pipeline(train_file_name, vocabulary=vocabulary, return_label=True)
        q1_test, q2_test = data_pipeline(test_file_name, vocabulary=vocabulary, return_label=False)

        kf = KFold(n_splits=5)
        split_id = 0
        for t_idx, v_idx in kf.split(q1, q2, y):
            split_id += 1

            print('Round [{}] fold [{}]'.format(round_id, split_id))

            q1_train = q1[t_idx]
            q1_val = q1[v_idx]

            q2_train = q2[t_idx]
            q2_val = q2[v_idx]

            t_y = y[t_idx]
            v_y = y[v_idx]

            # Drop the previous fold's graph so GPU memory does not accumulate.
            keras.backend.clear_session()
            model = esim(embeddings=embed_matrix, maxlen=config['maxlen'])
            # FIX: 'roung_' -> 'round_' typo; this path is only produced and
            # consumed inside this function (saved by CB, re-loaded below).
            model_path = '../models/esim_word/round_{}_kf_{}.h5'.format(round_id, split_id)
            cb = CB(val_data=([q1_val, q2_val], v_y), model_path=model_path)
            model.fit(x=[q1_train, q2_train], y=t_y, batch_size=config['batch_size'], epochs=config['epochs'],
                      validation_data=([q1_val, q2_val], v_y), verbose=True, callbacks=[cb])

            # Restore the best-F1 checkpoint before predicting.
            model.load_weights(model_path)

            # Record the fold's max F1 and persist the running log.
            max_f1 = cb.get_max_f1()
            f1_record.append([split_id, max_f1])
            f1_record_df = pd.DataFrame(f1_record, columns=['fold', 'f1'])
            f1_record_df.to_csv('../log/5round_esim_word_f1.csv', index=False)
            print('Kfold [{}] max f1: {}'.format(split_id, max_f1))

            # Out-of-fold predictions on the validation split.
            v_pred = model.predict(x=[q1_val, q2_val])
            np.save('../prediction/esim_word/round_{}_train_kf_{}.npy'.format(round_id, split_id), v_pred)

            # Predictions on the test set.
            y_pred = model.predict(x=[q1_test, q2_test])
            np.save('../prediction/esim_word/round_{}_test_kf_{}.npy'.format(round_id, split_id), y_pred)
    for item in f1_record:
        print('{}\t{}'.format(item[0], item[1]))
示例#2
0
def train():
    """Train the word-level decomposable-attention model with 5-fold CV.

    The best checkpoint per fold is saved by the ``CB`` callback to
    ``config['model_path']``; the running fold/F1 log is written to
    ``config['log_path']`` after every fold.
    """
    # Let TensorFlow grow GPU memory on demand instead of pre-allocating it.
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    session = tf.Session(config=tf_config)
    KTF.set_session(session)

    vocabulary, embed_matrix = dh.load_embed(
        file_name='../data/word_embedding.txt')

    train_file_name = '../data/train.csv'
    test_file_name = '../data/test.csv'
    q1, q2, y = data_pipeline(train_file_name,
                              vocabulary=vocabulary,
                              return_label=True,
                              maxlen=config['maxlen'])
    # NOTE(review): the preprocessed test split is never used below —
    # presumably prediction code was removed; kept for parity with siblings.
    q1_test, q2_test = data_pipeline(test_file_name,
                                     vocabulary=vocabulary,
                                     return_label=False,
                                     maxlen=config['maxlen'])

    kf = KFold(n_splits=5)
    split_id = 0
    f1_record = []
    for t_idx, v_idx in kf.split(q1, q2, y):
        split_id += 1
        print('## KFOLD [{}]'.format(split_id))

        q1_train = q1[t_idx]
        q1_val = q1[v_idx]

        q2_train = q2[t_idx]
        q2_val = q2[v_idx]

        train_y = y[t_idx]
        val_y = y[v_idx]

        # FIX: free the previous fold's graph; without this every fold
        # keeps its model in memory (the sibling trainers already do this).
        keras.backend.clear_session()

        model = decomposable_attention(embeddings=embed_matrix,
                                       maxlen=config['maxlen'])
        model_path = config['model_path'].format(split_id)
        cb = CB(val_data=([q1_val, q2_val], val_y), model_path=model_path)

        model.fit(x=[q1_train, q2_train],
                  y=train_y,
                  batch_size=config['batch_size'],
                  epochs=config['epochs'],
                  validation_data=([q1_val, q2_val], val_y),
                  verbose=True,
                  callbacks=[cb])
        # Restore the best-F1 checkpoint.
        model.load_weights(model_path)
        max_f1 = cb.get_max_f1()
        print('kfold [{}] max f1: {}'.format(split_id, max_f1))
        f1_record.append([split_id, max_f1])
        f1_record_df = pd.DataFrame(f1_record, columns=['fold', 'f1'])
        f1_record_df.to_csv(config['log_path'], index=False)
示例#3
0
def train():
    """Train the char-level model with 5-fold CV and print per-fold F1.

    The best checkpoint of every fold is written to a shared temporary
    file; prediction/persistence steps are currently commented out.
    """
    # Let TensorFlow grow GPU memory on demand instead of pre-allocating it.
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    session = tf.Session(config=tf_config)
    KTF.set_session(session)

    vocabulary, embed_matrix = dh.load_embed(
        file_name='../data/char_embedding.txt')

    train_file_name = '../data/train.csv'
    test_file_name = '../data/test.csv'

    q1, q2, y = data_pipeline(train_file_name,
                              vocabulary=vocabulary,
                              return_label=True,
                              maxlen=config['maxlen'])

    # NOTE(review): only needed by the commented-out prediction block below.
    q1_test, q2_test = data_pipeline(test_file_name,
                                     vocabulary=vocabulary,
                                     return_label=False,
                                     maxlen=config['maxlen'])

    kf = KFold(n_splits=5)
    f1_record = []
    split_id = 0
    for t_idx, v_idx in kf.split(q1, q2, y):
        split_id += 1

        q1_train = q1[t_idx]
        q1_val = q1[v_idx]

        q2_train = q2[t_idx]
        q2_val = q2[v_idx]

        train_y = y[t_idx]
        val_y = y[v_idx]

        # Free the previous fold's graph so GPU memory does not accumulate.
        keras.backend.clear_session()

        model = get_model(embed_matrix=embed_matrix, maxlen=config['maxlen'])

        # FIX: dropped a no-op .format(split_id) — the string has no
        # placeholder, so every fold overwrites this same temporary file.
        model_path = '../models/tmp.h5'
        cb = CB(val_data=([q1_val, q2_val], val_y), model_path=model_path)
        model.fit(x=[q1_train, q2_train],
                  y=train_y,
                  batch_size=config['batch_size'],
                  epochs=config['epochs'],
                  validation_data=([q1_val, q2_val], val_y),
                  callbacks=[cb])

        # Record the fold's max F1.
        max_f1 = cb.get_max_f1()
        f1_record.append([split_id, max_f1])
        f1_record_df = pd.DataFrame(f1_record, columns=['fold', 'f1'])
        # f1_record_df.to_csv(config['log_file'], index=False)

        # # Predictions on the validation split
        # model.load_weights(model_path)
        # v_pred = model.predict([q1_val, q2_val])
        # np.save('../prediction/bimpm_char/train_kf_{}.npy'.format(split_id), v_pred)
        #
        # # Predictions on the test set
        # y_pred = model.predict([q1_test, q2_test])
        # np.save('../prediction/bimpm_char/test_kf_{}.npy'.format(split_id), y_pred)

        # NOTE(review): this prints the cumulative record on every fold;
        # move it outside the loop if a single summary is intended.
        for item in f1_record:
            print('{}\t{}'.format(item[0], item[1]))
示例#4
0
def train_5_round():
    """Run 5 rounds of 5-fold CV for the char-level ESIM + glove + feature model.

    Inputs per example are: char sequences (two embeddings: word2vec-style
    and glove-style vocabularies) plus a dense hand-crafted feature vector.
    Per fold the best checkpoint (by validation F1, via ``CB``) is restored
    and validation/test predictions are saved under
    ``../prediction/esim_char_gl_feature/``.
    """
    # Let TensorFlow grow GPU memory on demand instead of pre-allocating it.
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    session = tf.Session(config=tf_config)
    KTF.set_session(session)

    f1_record = []
    for round_id in range(1, 6):
        f1_record.append(['round', round_id])
        vocabulary, embed_matrix = dh.load_embed(
            file_name='../data/char_embedding.txt')
        gl_vocabulary, gl_embed_matrix = dh.load_embed(
            file_name='../data/gl_char_vectors.txt')

        train_file_name = '../data/train.csv'
        test_file_name = '../data/test.csv'
        q1, q2, y = data_pipeline(train_file_name,
                                  vocabulary=vocabulary,
                                  return_label=True,
                                  maxlen=config['maxlen'])
        q1_gl, q2_gl = data_pipeline(train_file_name,
                                     vocabulary=gl_vocabulary,
                                     return_label=False,
                                     maxlen=config['maxlen'])

        q1_test, q2_test = data_pipeline(test_file_name,
                                         vocabulary=vocabulary,
                                         return_label=False,
                                         maxlen=config['maxlen'])
        q1_test_gl, q2_test_gl = data_pipeline(test_file_name,
                                               vocabulary=gl_vocabulary,
                                               return_label=False,
                                               maxlen=config['maxlen'])

        train_feature = pd.read_csv('../feature/train_feature.csv')
        train_feature = train_feature.values
        test_feature = pd.read_csv('../feature/test_feature.csv')
        test_feature = test_feature.values

        kf = KFold(n_splits=5)
        split_id = 0

        for t_idx, v_idx in kf.split(q1, q2, y):
            split_id += 1

            print('Round [{}] fold [{}]'.format(round_id, split_id))

            q1_train = q1[t_idx]
            q1_val = q1[v_idx]

            q2_train = q2[t_idx]
            q2_val = q2[v_idx]

            q1_gl_train = q1_gl[t_idx]
            q1_gl_val = q1_gl[v_idx]

            q2_gl_train = q2_gl[t_idx]
            q2_gl_val = q2_gl[v_idx]

            f_train = train_feature[t_idx]
            f_val = train_feature[v_idx]

            t_y = y[t_idx]
            v_y = y[v_idx]

            print('Ratio : {}'.format(np.sum(v_y) / len(v_y)))

            # Swap q1 and q2 to build augmented (symmetric) training data.
            # tmp1 = np.concatenate([q1_train, q2_train], axis=0)
            # tmp2 = np.concatenate([q2_train, q1_train], axis=0)
            # tmp3 = np.concatenate([q1_gl_train, q2_gl_train], axis=0)
            # tmp4 = np.concatenate([q2_gl_train, q1_gl_train], axis=0)
            #
            # q1_train = copy.deepcopy(tmp1)
            # q2_train = copy.deepcopy(tmp2)
            # q1_gl_train = copy.deepcopy(tmp3)
            # q2_gl_train = copy.deepcopy(tmp4)
            # f_train = np.concatenate([f_train, f_train], axis=0)
            # t_y = copy.deepcopy(np.concatenate([t_y, t_y], axis=0))

            # Drop the previous fold's graph so GPU memory does not accumulate.
            keras.backend.clear_session()

            model = esim(embeddings=embed_matrix,
                         gl_embed_matrix=gl_embed_matrix,
                         maxlen=config['maxlen'],
                         lr=0.001,
                         f_dim=test_feature.shape[1])

            # FIX: 'roung_' -> 'round_' typo; this path is only produced and
            # consumed inside this function (saved by CB, re-loaded below).
            model_path = '../models/esim_char_gl_feature/round_{}_fold_{}.h5'.format(
                round_id, split_id)
            cb = CB(val_data=([q1_val, q2_val, q1_gl_val, q2_gl_val,
                               f_val], v_y),
                    model_path=model_path)
            model.fit(
                x=[q1_train, q2_train, q1_gl_train, q2_gl_train, f_train],
                y=t_y,
                batch_size=config['batch_size'],
                epochs=config['epochs'],
                validation_data=([q1_val, q2_val, q1_gl_val, q2_gl_val,
                                  f_val], v_y),
                verbose=True,
                callbacks=[cb])

            # Restore the best-F1 checkpoint.
            model.load_weights(model_path)

            # Record the fold's max F1 and persist the running log.
            max_f1 = cb.get_max_f1()
            f1_record.append([split_id, max_f1])
            f1_record_df = pd.DataFrame(f1_record, columns=['fold', 'f1'])
            f1_record_df.to_csv('../log/5round_esim_char_gl_feature_f1.csv',
                                index=False)
            print('Kfold [{}] max f1: {}'.format(split_id, max_f1))

            # Out-of-fold predictions on the validation split.
            # FIX: was f_train — validation rows must be paired with the
            # validation feature matrix, not the training one.
            v_pred = model.predict(
                x=[q1_val, q2_val, q1_gl_val, q2_gl_val, f_val])
            np.save(
                '../prediction/esim_char_gl_feature/round_{}_train_kf_{}.npy'.
                format(round_id, split_id), v_pred)

            y_pred = model.predict(
                x=[q1_test, q2_test, q1_test_gl, q2_test_gl, test_feature])
            np.save(
                '../prediction/esim_char_gl_feature/round_{}_test_kf_{}.npy'.
                format(round_id, split_id), y_pred)

        # Print the cumulative record accumulated so far.
        for item in f1_record:
            print('{}\t{}'.format(item[0], item[1]))
示例#5
0
def train():
    """Train the char-level ESIM + glove + feature model with 5-fold CV.

    Only fold F1 scores are collected and printed; predictions are not
    saved by this variant.
    """
    # Let TensorFlow grow GPU memory on demand instead of pre-allocating it.
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    session = tf.Session(config=tf_config)
    KTF.set_session(session)

    vocabulary, embed_matrix = dh.load_embed(
        file_name='../data/char_embedding.txt')
    gl_vocabulary, gl_embed_matrix = dh.load_embed(
        file_name='../data/gl_char_vectors.txt')

    train_file_name = '../data/train.csv'
    test_file_name = '../data/test.csv'
    q1, q2, y = data_pipeline(train_file_name,
                              vocabulary=vocabulary,
                              return_label=True,
                              maxlen=config['maxlen'])
    q1_gl, q2_gl = data_pipeline(train_file_name,
                                 vocabulary=gl_vocabulary,
                                 return_label=False,
                                 maxlen=config['maxlen'])

    # NOTE(review): the preprocessed test split is never used below —
    # presumably prediction code was removed; kept for parity with siblings.
    q1_test, q2_test = data_pipeline(test_file_name,
                                     vocabulary=vocabulary,
                                     return_label=False,
                                     maxlen=config['maxlen'])

    train_feature = pd.read_csv('../feature/train_feature.csv')
    train_feature = train_feature.values
    # test_feature is still needed: its column count fixes the model's f_dim.
    test_feature = pd.read_csv('../feature/test_feature.csv')
    test_feature = test_feature.values

    print('Feature dim : {}'.format(test_feature.shape[-1]))

    kf = KFold(n_splits=5)
    split_id = 0
    f1_record = []
    for t_idx, v_idx in kf.split(q1, q2, y):
        split_id += 1

        print('Kfold [{}]'.format(split_id))

        q1_train = q1[t_idx]
        q1_val = q1[v_idx]

        q2_train = q2[t_idx]
        q2_val = q2[v_idx]

        q1_gl_train = q1_gl[t_idx]
        q1_gl_val = q1_gl[v_idx]

        q2_gl_train = q2_gl[t_idx]
        q2_gl_val = q2_gl[v_idx]

        f_train = train_feature[t_idx]
        f_val = train_feature[v_idx]

        t_y = y[t_idx]
        v_y = y[v_idx]

        # Drop the previous fold's graph so GPU memory does not accumulate.
        keras.backend.clear_session()
        model = esim(embeddings=embed_matrix,
                     gl_embed_matrix=gl_embed_matrix,
                     maxlen=config['maxlen'],
                     lr=0.001,
                     f_dim=test_feature.shape[1])

        model_path = config['model_path'].format(split_id)
        cb = CB(val_data=([q1_val, q2_val, q1_gl_val, q2_gl_val, f_val], v_y),
                model_path=model_path)
        model.fit(
            x=[q1_train, q2_train, q1_gl_train, q2_gl_train, f_train],
            y=t_y,
            batch_size=config['batch_size'],
            epochs=config['epochs'],
            validation_data=([q1_val, q2_val, q1_gl_val, q2_gl_val,
                              f_val], v_y),
            verbose=True,
            callbacks=[cb])

        # Record the fold's max F1.
        max_f1 = cb.get_max_f1()
        f1_record.append([split_id, max_f1])

        # NOTE(review): this prints the cumulative record on every fold;
        # move it outside the loop if a single summary is intended.
        for item in f1_record:
            print('{}\t{}'.format(item[0], item[1]))
示例#6
0
def train_5_round():
    """Run 5 rounds of 5-fold CV for the char-level decomposable-attention model.

    Per fold the best checkpoint (by validation F1, via ``CB``) is restored
    and validation/test predictions are saved under
    ``../prediction/de_att_char/``; F1 scores are appended incrementally to
    ``../log/5round_de_att_char_f1.csv``.
    """
    # Let TensorFlow grow GPU memory on demand instead of pre-allocating it.
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    session = tf.Session(config=tf_config)
    KTF.set_session(session)

    f1_record = []
    for round_id in range(1, 6):
        f1_record.append(['round', round_id])

        vocabulary, embed_matrix = dh.load_embed(
            file_name='../data/char_embedding.txt')

        train_file_name = '../data/train.csv'
        test_file_name = '../data/test.csv'

        q1, q2, y = data_pipeline(train_file_name,
                                  vocabulary=vocabulary,
                                  return_label=True,
                                  maxlen=config['maxlen'])
        q1_test, q2_test = data_pipeline(test_file_name,
                                         vocabulary=vocabulary,
                                         return_label=False,
                                         maxlen=config['maxlen'])

        kf = KFold(n_splits=5)
        split_id = 0
        for t_idx, v_idx in kf.split(q1, q2, y):
            split_id += 1
            print('Round [{}] fold [{}]'.format(round_id, split_id))

            q1_train = q1[t_idx]
            q1_val = q1[v_idx]

            q2_train = q2[t_idx]
            q2_val = q2[v_idx]

            train_y = y[t_idx]
            val_y = y[v_idx]

            # FIX: free the previous fold's graph; without this every fold
            # keeps its model in memory (the sibling trainers already do this).
            keras.backend.clear_session()

            model = decomposable_attention(embeddings=embed_matrix,
                                           maxlen=config['maxlen'])
            # NOTE(review): path has no '.h5' extension, unlike the sibling
            # trainers — works, but confirm it is intentional.
            model_path = '../models/de_att_char/round_{}_kf_{}'.format(
                round_id, split_id)
            cb = CB(val_data=([q1_val, q2_val], val_y), model_path=model_path)

            model.fit(x=[q1_train, q2_train],
                      y=train_y,
                      batch_size=config['batch_size'],
                      epochs=config['epochs'],
                      validation_data=([q1_val, q2_val], val_y),
                      verbose=True,
                      callbacks=[cb])
            # Restore the best-F1 checkpoint before predicting.
            model.load_weights(model_path)

            max_f1 = cb.get_max_f1()
            print('Round [{}] fold [{}] max f1: {}'.format(
                round_id, split_id, max_f1))
            f1_record.append([split_id, max_f1])
            f1_record_df = pd.DataFrame(f1_record, columns=['fold', 'f1'])
            f1_record_df.to_csv('../log/5round_de_att_char_f1.csv',
                                index=False)

            # Out-of-fold predictions on the validation split.
            v_pred = model.predict([q1_val, q2_val])
            np.save(
                '../prediction/de_att_char/round_{}_train_kf_{}.npy'.format(
                    round_id, split_id), v_pred)

            y_pred = model.predict([q1_test, q2_test])
            np.save(
                '../prediction/de_att_char/round_{}_test_kf_{}.npy'.format(
                    round_id, split_id), y_pred)
示例#7
0
def train():
    """Train the char-level ESIM model with 5-fold CV.

    The best checkpoint per fold is saved by ``CB`` to
    ``config['model_path']``; the running fold/F1 log is written to
    ``config['log_file']``.  Prediction saving is currently commented out.
    """
    # Let TensorFlow grow GPU memory on demand instead of pre-allocating it.
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    session = tf.Session(config=tf_config)
    KTF.set_session(session)

    vocabulary, embed_matrix = dh.load_embed(
        file_name='../data/char_embedding.txt')

    train_file_name = '../data/train.csv'
    test_file_name = '../data/test.csv'
    q1, q2, y = data_pipeline(train_file_name,
                              vocabulary=vocabulary,
                              return_label=True,
                              maxlen=config['maxlen'])
    # NOTE(review): only needed by the commented-out prediction block below.
    q1_test, q2_test = data_pipeline(test_file_name,
                                     vocabulary=vocabulary,
                                     return_label=False,
                                     maxlen=config['maxlen'])

    kf = KFold(n_splits=5)
    split_id = 0
    f1_record = []
    for t_idx, v_idx in kf.split(q1, q2, y):
        split_id += 1

        print('Kfold [{}]'.format(split_id))

        q1_train = q1[t_idx]
        q1_val = q1[v_idx]

        q2_train = q2[t_idx]
        q2_val = q2[v_idx]

        t_y = y[t_idx]
        v_y = y[v_idx]

        # Drop the previous fold's graph so GPU memory does not accumulate.
        keras.backend.clear_session()
        model = esim(embeddings=embed_matrix,
                     maxlen=config['maxlen'],
                     lr=0.001)

        model_path = config['model_path'].format(split_id)
        cb = CB(val_data=([q1_val, q2_val], v_y), model_path=model_path)
        model.fit(x=[q1_train, q2_train],
                  y=t_y,
                  batch_size=config['batch_size'],
                  epochs=config['epochs'],
                  validation_data=([q1_val, q2_val], v_y),
                  verbose=True,
                  callbacks=[cb])

        # Restore the best-F1 checkpoint.
        model.load_weights(model_path)

        # Record the fold's max F1 and persist the running log.
        max_f1 = cb.get_max_f1()
        f1_record.append([split_id, max_f1])
        f1_record_df = pd.DataFrame(f1_record, columns=['fold', 'f1'])
        f1_record_df.to_csv(config['log_file'], index=False)
        print('Kfold [{}] max f1: {}'.format(split_id, max_f1))

        # # Predictions on the validation split
        # v_pred = model.predict(x=[q1_val, q2_val])
        # np.save('../prediction/esim_char/train_kf_{}.npy'.format(split_id), v_pred)
        #
        # y_pred = model.predict(x=[q1_test, q2_test])
        # np.save('../prediction/esim_char/test_kf_{}.npy'.format(split_id), y_pred)

        # NOTE(review): this prints the cumulative record on every fold;
        # move it outside the loop if a single summary is intended.
        for item in f1_record:
            print('{}\t{}'.format(item[0], item[1]))