예제 #1
0
def data_generator(fold=0, is_train=True):
    """Yield batches ``([X, S], L)`` of labelled sentence pairs, forever.

    On the first ``next()`` call the generator loads
    ``train_<fold>.joblib`` or ``val_<fold>.joblib`` from ``MID_PATH``,
    flattens every piece into ``(pair, label)`` tuples (label 1 for the
    pairs in ``piece[0]``, label 0 for the pairs in ``piece[1]``), shuffles
    them once and stores their count in the module-level ``train_len`` or
    ``eva_len``.

    It then cycles over that list endlessly. The list is not reshuffled
    between passes, and a trailing partial batch at the end of a pass is
    dropped rather than carried over into the next pass.

    Args:
        fold: Index of the cross-validation fold to load.
        is_train: When true, load the training file, expand each record
            with ``trains_pairs`` and run every sample through
            ``data_aug.trans_main``. When false, load the validation file,
            which is used as stored.

    Yields:
        ``[X, S], L`` where ``X`` and ``S`` are the ``padding``-ed outputs
        of ``tokenizer.encode`` and ``L`` is ``one_hot(labels, 2)``.
        (Presumably token ids and segment ids -- confirm against the
        tokenizer in use.)

    Raises:
        ValueError: If the fold holds fewer than ``batch_size`` labelled
            pairs. No full batch could ever be produced, and the loop
            would otherwise spin forever without yielding.
    """
    global train_len
    global eva_len
    logger.info('generator pre ...   info fold: {} is_train: {}'.format(
        fold, is_train))
    file_name = ('train_{}.joblib' if is_train else 'val_{}.joblib').format(
        fold)
    pieces = joblib.load(join(MID_PATH, file_name))
    if is_train:
        # Training records are stored raw and are expanded into
        # (positives, negatives) here; validation files are used as loaded.
        pieces = [trains_pairs(d) for d in pieces]

    # Convert the positive / negative pair lists into labelled pairs.
    labeled_data = []
    for piece in pieces:
        labeled_data.extend((pos, 1) for pos in piece[0])
        labeled_data.extend((neg, 0) for neg in piece[1])
    del pieces

    random.shuffle(labeled_data)
    if is_train:
        train_len = len(labeled_data)
    else:
        eva_len = len(labeled_data)
    logger.info('generator pre over.  data num: {}'.format(len(labeled_data)))

    if len(labeled_data) < batch_size:
        # Fail loudly: with too few samples the loop below never fills a
        # batch and the consumer would hang.
        raise ValueError(
            'fold {} has {} labelled pairs, fewer than batch_size {}'.format(
                fold, len(labeled_data), batch_size))

    while True:
        X, S, L = [], [], []
        for pair, label in labeled_data:
            if is_train:
                # Augmentation may rewrite both texts and the label.
                a, b, label = data_aug.trans_main(pair[0], pair[1], label)
            else:
                a, b = pair[0], pair[1]
            # NOTE(review): the "- 3" presumably leaves room for the special
            # tokens the tokenizer adds around the pair -- confirm.
            a = a[:max_set_len - 3]
            b = b[:max_set_len - 3]
            x, s = tokenizer.encode(a, b)
            X.append(x)
            S.append(s)
            L.append(label)
            if len(X) == batch_size:
                X = padding(X)
                S = padding(S)
                L = one_hot(L, 2)
                yield [X, S], L
                X, S, L = [], [], []
예제 #2
0
def split_data(split_n=MAX_FOLD):
    """Shuffle the training set and write ``split_n`` cross-validation folds.

    For every fold ``i`` two files are written under ``MID_PATH``:

    * ``train_<i>.joblib`` -- the raw records outside the fold.
    * ``val_<i>.joblib`` -- the fold's records, each passed through
      ``trains_pairs(d, 2)`` and then shuffled.

    Folds hold ``len(data) // split_n`` records each; the last fold also
    takes the remainder. Training and validation sets of a fold are
    disjoint.

    Args:
        split_n: Number of folds to produce.
    """
    print('read...')
    data = list(get_train())
    print('shuffle...')
    random.shuffle(data)
    val_len = len(data) // split_n
    for i in range(split_n):
        start = val_len * i
        # The last fold absorbs the remainder. The same ``end`` must bound
        # both slices, otherwise the remainder would land in the training
        # set as well as the validation set.
        end = len(data) if i == split_n - 1 else start + val_len
        val = data[start:end]
        train = data[:start] + data[end:]
        val_final = [trains_pairs(d, 2) for d in val]
        random.shuffle(val_final)
        print('save {}'.format(i))
        joblib.dump(train, join(MID_PATH, 'train_{}.joblib'.format(i)))
        joblib.dump(val_final, join(MID_PATH, 'val_{}.joblib'.format(i)))
예제 #3
0
def test_label_num():
    """Interactively run the augmenter over the labelled pairs of fold 0.

    Loads ``train_0.joblib``, expands every record with ``trains_pairs``,
    labels the resulting pairs (1 for ``piece[0]``, 0 for ``piece[1]``),
    shuffles them and feeds each one to ``DataAug.trans_main``. A progress
    counter is printed every 87 samples. Whenever the counter is 999 modulo
    1000 the user is prompted: ``q`` prints ``describe()`` and stops,
    ``des`` prints ``describe()`` and continues, anything else continues.
    """
    from data.aa_augmentation import DataAug
    from data.aa_read import trains_pairs
    import random
    import joblib

    raw_records = joblib.load(join(MID_PATH, 'train_{}.joblib'.format(0)))
    data_aug = DataAug()
    pieces = [trains_pairs(record) for record in raw_records]
    del raw_records

    # Positives of a piece come before its negatives, as the shuffle result
    # depends on the pre-shuffle order.
    labeled_data = []
    for piece in pieces:
        labeled_data.extend((pos, 1) for pos in piece[0])
        labeled_data.extend((neg, 0) for neg in piece[1])
    random.shuffle(labeled_data)

    for num, (pair, label) in enumerate(labeled_data, start=1):
        data_aug.trans_main(pair[0], pair[1], label)
        if num % 87 == 0:
            print('\rnum: {}'.format(num), end='     ')
        if num % 1000 != 999:
            continue
        answer = input('continue? ')
        if answer in ('q', 'des'):
            data_aug.describe()
        if answer == 'q':
            break