Example #1
import os
from collections import Counter, defaultdict

# `read`, `k_fold_cross_validation`, `generate_tsv`, `modes`, and the path
# constants (data_path, processed_dir, bert_input) come from the surrounding project.


def cross_validation():
    # data is read as [labels, questions]: data[0] holds the labels used for
    # the label -> integer mapping, data[1] the question text.
    data = read.textfile2list(data_path)
    counter_label = dict(Counter(data[0]))
    label2int = {label: idx for idx, label in enumerate(counter_label.keys())}
    read.save_in_json(os.path.join(bert_input, "label2int"), label2int)
    labels, questions = data
    data_set = list(zip(labels, questions))
    question_idx = list(range(len(questions)))
    for fold, [train_idx, validation_idx, test_idx] in enumerate(
            k_fold_cross_validation(question_idx, 5, randomize=True)):
        train = [data_set[question_id] for question_id in train_idx]
        validation = [data_set[question_id] for question_id in validation_idx]
        test = [data_set[question_id] for question_id in test_idx]
        input_data = [train, validation, test]
        for idx, mode in enumerate(modes):
            # Each data_set item is a (label, question) pair; explicit fold-level
            # names also avoid shadowing the built-in input().
            questions_fold = [dataitem[1] for dataitem in input_data[idx]]
            labels_fold = [dataitem[0] for dataitem in input_data[idx]]
            read.save_in_json(
                os.path.join(processed_dir,
                             "question_" + mode + "_" + str(fold)),
                input_data[idx])
            generate_tsv(
                questions_fold, labels_fold, label2int,
                os.path.join(bert_input,
                             "grad_" + mode + "_" + str(fold) + ".tsv"))


def split_label(level, mode):
    # Project each label onto the given level of the label hierarchy (via
    # get_label_at_level) and save the per-mode result under that level's
    # subdirectory.
    labels = read.read_from_json(os.path.join(processed_dir, "label_" + mode))
    labels_level = []
    for label in labels:
        label_level = get_label_at_level(label, level)
        labels_level.append(label_level)
    read.save_in_json(
        os.path.join(processed_dir,
                     str(level) + "/" + "label_" + mode), labels_level)


def label_dict():
    # For each label hierarchy level, count label frequencies across all modes
    # and build a per-level label -> integer mapping.
    for layer in range(1, 3):
        label_counts = defaultdict(float)
        for mode in modes:
            split_label(layer, mode)
            labels = read.read_from_json(
                os.path.join(processed_dir,
                             str(layer) + "/" + "label_" + mode))
            for label in labels:
                label = label.strip()
                label_counts[label] += 1.0
        label2int = {label: idx for idx, label in enumerate(label_counts)}
        read.save_in_json(
            os.path.join(processed_dir,
                         str(layer) + "/label_counts"), label_counts)
        read.save_in_json(os.path.join(bert_input,
                                       str(layer) + "/label2int"), label2int)
Example #4
import os
from collections import Counter

# As in Example #1, the `read` helper module and the bert_input path are
# assumed to come from the surrounding project.


def read_questions(path):
    # Skip the header row; column 2 holds the question text and column 3 a
    # comma-separated list of labels.
    data = read.read_from_tsv(path)[1:]

    labels = []
    questions = []
    labels_powerset = []
    labels_all = []
    for line in data:
        labels.append(line[3].split(","))
        labels_all += line[3].split(",")
        labels_powerset.append(line[3])
        questions.append(line[2])

    counter_labels = dict(Counter(labels_all))
    label2int = {label: idx for idx, label in enumerate(counter_labels.keys())}
    read.save_in_json(os.path.join(bert_input, "label2int"), label2int)
    return questions, labels
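
The read module used throughout (read_from_tsv here, plus save_in_json, read_from_json, and textfile2list in the other examples) is a project-local I/O helper that is not included. A minimal sketch of what the three most-used functions might look like follows; the ".json" extension handling and the TSV dialect are assumptions.

import csv
import json


def read_from_tsv(path):
    # Return the file as a list of rows, each row a list of column strings.
    with open(path, encoding="utf-8") as f:
        return list(csv.reader(f, delimiter="\t"))


def save_in_json(path, obj):
    # The callers pass paths without an extension, so ".json" is appended here.
    with open(path + ".json", "w", encoding="utf-8") as f:
        json.dump(obj, f, ensure_ascii=False, indent=2)


def read_from_json(path):
    with open(path + ".json", encoding="utf-8") as f:
        return json.load(f)
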
Example #5
import os
import random


def generate_cv():
    # read_questions() (Example #4) returns (questions, labels) and has already
    # written label2int to disk, so it is loaded back here.
    data = read_questions(data_path)
    label2int = read.read_from_json(os.path.join(bert_input, "label2int"))

    questions, labels = data
    data_set = list(zip(questions, labels))
    question_idx = list(range(len(questions)))
    for fold, [train_idx, validation_idx, test_idx] in enumerate(
            k_fold_cross_validation(question_idx, 10, randomize=True)):
        train = [data_set[question_id] for question_id in train_idx]
        validation = [data_set[question_id] for question_id in validation_idx]
        test = [data_set[question_id] for question_id in test_idx]
        data = [train, validation, test]
        for idx, mode in enumerate(modes):
            questions_fold = [dataitem[0] for dataitem in data[idx]]
            labels_fold = [dataitem[1] for dataitem in data[idx]]
            # Multi-label data: map every label of every question to its index.
            idx_label = [[label2int[label_single] for label_single in label] for label in labels_fold]
            read.save_in_json(os.path.join(processed_dir, "index_label_" + mode + "_" + str(fold)), idx_label)
            read.save_in_json(os.path.join(processed_dir, "question_" + mode + "_" + str(fold)), data[idx])
            # Pass this fold's labels (not the full label list) to the TSV writer.
            generate_tsv(questions_fold, labels_fold, label2int,
                         os.path.join(bert_input, "lat_" + mode + "_" + str(fold) + ".tsv"), mode)
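

# generate_tsv() is another project helper that is not shown in these examples.
# The sketch below is an assumption about its behaviour: one
# "label-id<TAB>question" row per example in the format the BERT input pipeline
# expects. The column order, the optional `mode` argument, and the comma-join
# for the multi-label case are guesses, not the project's actual implementation.
def generate_tsv(questions, labels, label2int, path, mode=None):
    with open(path, "w", encoding="utf-8") as f:
        for question, label in zip(questions, labels):
            if isinstance(label, list):
                # Multi-label case (generate_cv above): join label ids with commas.
                label_field = ",".join(str(label2int[single]) for single in label)
            else:
                label_field = str(label2int[label])
            f.write(label_field + "\t" + question + "\n")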


def process():
    # `train` and `test` are module-level paths to the raw train / test TSV
    # files defined elsewhere in the project.
    dataset_all = read.read_from_tsv(train)[1:]
    dataset_test = read.read_from_tsv(test)[1:]

    random.seed(2009221)
    question_idx = list(range(0, len(dataset_all)))
    folds = []
    for training, validation in k_fold_cross_validation(question_idx,
                                                        10,
                                                        randomize=True):
        folds.append([training, validation])

    # Only the first of the ten folds is used to split off a dev set.
    dataset_train = [dataset_all[question_id] for question_id in folds[0][0]]
    dataset_dev = [dataset_all[question_id] for question_id in folds[0][1]]
    data = [dataset_train, dataset_dev, dataset_test]

    for idx, mode in enumerate(modes):
        # Column 0 holds the question id, column 4 the label, and column 5
        # the question text in the raw TSV rows.
        questions = [dataitem[5] for dataitem in data[idx]]
        labels = [dataitem[4] for dataitem in data[idx]]
        qids = [dataitem[0] for dataitem in data[idx]]
        read.save_in_json(os.path.join(processed_dir, "label_" + mode), labels)
        read.save_in_json(os.path.join(processed_dir, "question_" + mode),
                          questions)
        read.save_in_json(os.path.join(processed_dir, "question_id_" + mode),
                          qids)
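
The module-level names these examples rely on (modes, data_path, processed_dir, bert_input, and the train/test paths used by process()) are defined elsewhere in the project. The snippet below is a purely illustrative guess at how they could be configured and how process() might be driven; every path and value in it is hypothetical.

import os

modes = ["train", "dev", "test"]      # must match the [train, validation, test] order above
data_path = "data/questions.txt"      # hypothetical raw input file
processed_dir = "data/processed"
bert_input = "data/bert_input"
train = "data/train.tsv"              # TSV files read by process()
test = "data/test.tsv"

os.makedirs(processed_dir, exist_ok=True)
os.makedirs(bert_input, exist_ok=True)

if __name__ == "__main__":
    process()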