def classify_pair_corpus(bert_model):
    """Load the WeBank sentence-pair corpus and split it into train/test sets.

    The file at ``path_webank_sim`` is read as GBK text. Its first line is
    treated as a header and skipped; blank lines are ignored. Every remaining
    line is split on commas: the first field is question 1, the last field is
    the integer label, and the fields in between are concatenated into
    question 2. Labels are one-hot encoded as ``[0, 1]`` for label ``1`` and
    ``[1, 0]`` for anything else. Rows are shuffled with the ``random`` module
    (so ``random.seed`` controls the split) and divided 90% / 10%.

    Args:
        bert_model: Object exposing ``process_pair(pairs)`` which returns the
            triple ``(input_ids, input_masks, input_type_ids)``.

    Returns:
        A 10-tuple ``(train_x, train_y, test_x, test_y, input_ids,
        input_masks, input_type_ids, input_ids2, input_masks2,
        input_type_ids2)``. The first three ``input_*`` values come from
        ``bert_model.process_pair(train_x)``, the ``*2`` values from
        ``bert_model.process_pair(test_x)``.

    Raises:
        ValueError: If the last field of a non-blank data line is not an
            integer.
    """
    # Data preprocessing
    import random
    from utils.text_tools import text_preprocess, txtRead
    from conf.path_config import path_webank_sim

    corpus_lines = txtRead(path_webank_sim, encodeType='gbk')
    questions = []
    labels = []
    for line in corpus_lines[1:]:  # first line is the header
        # A blank line would otherwise reach int('') and abort the whole load.
        if not line.strip():
            continue
        fields = line.split(',')
        q_1 = fields[0]
        # NOTE(review): commas inside question 2 are dropped by this join, and
        # question 1 is assumed to contain no commas — confirm against the data.
        q_2 = "".join(fields[1:-1])
        questions.append([text_preprocess(q_1), text_preprocess(q_2)])
        # int() tolerates surrounding whitespace such as a trailing newline.
        labels.append([0, 1] if int(fields[-1]) == 1 else [1, 0])

    questions = np.array(questions)
    labels = np.array(labels)

    # Shuffle questions and labels with one shared permutation to keep them aligned.
    order = list(range(len(labels)))
    random.shuffle(order)
    questions = questions[order]
    labels = labels[order]

    len_train = int(len(labels) * 0.9)
    train_x, train_y = questions[:len_train], labels[:len_train]
    test_x, test_y = questions[len_train:], labels[len_train:]

    input_ids, input_masks, input_type_ids = bert_model.process_pair(train_x)
    input_ids2, input_masks2, input_type_ids2 = bert_model.process_pair(test_x)

    return (train_x, train_y, test_x, test_y,
            input_ids, input_masks, input_type_ids,
            input_ids2, input_masks2, input_type_ids2)
def classify_pair_corpus_webank(bert_model, path_webank):
    """Load a WeBank-format sentence-pair file and encode it for BERT.

    The file at ``path_webank`` is read as UTF-8 text. Its first line is
    treated as a header and skipped; blank lines are ignored. Every remaining
    line is split on commas: the first field is question 1, the last field is
    the integer label, and the fields in between are concatenated into
    question 2. Labels are one-hot encoded as ``[0, 1]`` for label ``1`` and
    ``[1, 0]`` for anything else. No shuffling or splitting is performed.

    Args:
        bert_model: Object exposing ``process_pair(pairs)`` which returns the
            triple ``(input_ids, input_masks, input_type_ids)``.
        path_webank: Path of the corpus file, passed to ``txtRead``.

    Returns:
        A 5-tuple ``(questions, labels, input_ids, input_masks,
        input_type_ids)`` where ``questions`` and ``labels`` are NumPy arrays
        and the remaining values come from
        ``bert_model.process_pair(questions)``.

    Raises:
        ValueError: If the last field of a non-blank data line is not an
            integer.
    """
    # Data preprocessing
    from utils.text_tools import text_preprocess, txtRead

    corpus_lines = txtRead(path_webank, encodeType='utf-8')
    questions = []
    labels = []
    for line in corpus_lines[1:]:  # first line is the header
        # A blank line would otherwise reach int('') and abort the whole load.
        if not line.strip():
            continue
        fields = line.split(',')
        q_1 = fields[0]
        # NOTE(review): commas inside question 2 are dropped by this join, and
        # question 1 is assumed to contain no commas — confirm against the data.
        q_2 = "".join(fields[1:-1])
        questions.append([text_preprocess(q_1), text_preprocess(q_2)])
        # int() tolerates surrounding whitespace such as a trailing newline.
        labels.append([0, 1] if int(fields[-1]) == 1 else [1, 0])

    questions = np.array(questions)
    labels = np.array(labels)

    input_ids, input_masks, input_type_ids = bert_model.process_pair(questions)

    return questions, labels, input_ids, input_masks, input_type_ids