# Standard-library imports used by the functions below. The `read` helper module,
# `generate_tsv`, `k_fold_cross_validation`, `get_label_at_level`, `modes`, and the
# module-level paths (data_path, bert_input, processed_dir, train, test) are defined
# elsewhere in the project.
import os
import random
from collections import Counter, defaultdict


def cross_validation():
    data = read.textfile2list(data_path)
    questions, labels = data
    # Build the label vocabulary from the label column (not the question texts).
    counter_label = dict(Counter(labels))
    label2int = {label: idx for idx, label in enumerate(counter_label.keys())}
    read.save_in_json(os.path.join(bert_input, "label2int"), label2int)
    data_set = list(zip(questions, labels))
    question_idx = list(range(len(questions)))
    for fold, (train_idx, validation_idx, test_idx) in enumerate(
            k_fold_cross_validation(question_idx, 5, randomize=True)):
        train = [data_set[question_id] for question_id in train_idx]
        validation = [data_set[question_id] for question_id in validation_idx]
        test = [data_set[question_id] for question_id in test_idx]
        input_data = [train, validation, test]
        for idx, mode in enumerate(modes):
            # Each data_set item is (question, label), so index 0 is the question
            # text and index 1 is its label.
            split_questions = [dataitem[0] for dataitem in input_data[idx]]
            split_labels = [dataitem[1] for dataitem in input_data[idx]]
            read.save_in_json(
                os.path.join(processed_dir, "question_" + mode + "_" + str(fold)),
                input_data[idx])
            generate_tsv(
                split_questions, split_labels, label2int,
                os.path.join(bert_input, "grad_" + mode + "_" + str(fold) + ".tsv"))
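# k_fold_cross_validation() is not defined in this module. A minimal sketch of what
# cross_validation() and generate_cv() appear to expect is given below: each fold
# yields three disjoint index lists (train, validation, test), using one fold for
# validation, the next for test, and the rest for training. This is an assumption
# inferred from the call sites; the real helper may differ (process() below, for
# instance, unpacks only two lists from it).
def _k_fold_cross_validation_sketch(items, k, randomize=False):
    items = list(items)
    if randomize:
        random.shuffle(items)
    folds = [items[i::k] for i in range(k)]
    for i in range(k):
        validation = folds[i]
        test = folds[(i + 1) % k]
        train = [item for j, fold in enumerate(folds)
                 for item in fold if j != i and j != (i + 1) % k]
        yield train, validation, test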
def split_label(level, mode):
    labels = read.read_from_json(os.path.join(processed_dir, "label_" + mode))
    labels_level = []
    for label in labels:
        label_level = get_label_at_level(label, level)
        labels_level.append(label_level)
    read.save_in_json(
        os.path.join(processed_dir, str(level) + "/" + "label_" + mode),
        labels_level)
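# get_label_at_level() is defined elsewhere. As a rough sketch, assuming the labels
# are delimiter-separated hierarchies (e.g. "LOC:city" truncated to "LOC" at level 1),
# it could look like the function below; the delimiter and the exact truncation rule
# are assumptions, not the project's actual implementation.
def _get_label_at_level_sketch(label, level, delimiter=":"):
    parts = label.strip().split(delimiter)
    return delimiter.join(parts[:level])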
def label_dict():
    for layer in range(1, 3):
        label_counts = defaultdict(float)
        for mode in modes:
            split_label(layer, mode)
            labels = read.read_from_json(
                os.path.join(processed_dir, str(layer) + "/" + "label_" + mode))
            for label in labels:
                label = label.strip()
                label_counts[label] += 1.0
        label2int = {j: i for i, j in enumerate(label_counts)}
        read.save_in_json(
            os.path.join(processed_dir, str(layer) + "/label_counts"), label_counts)
        read.save_in_json(os.path.join(bert_input, str(layer) + "/label2int"),
                          label2int)
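# The `read` module (textfile2list, read_from_json, save_in_json, read_from_tsv) is
# project-specific and not shown here. Minimal stand-ins are sketched below under the
# assumption that JSON files are stored at the given path plus a ".json" extension and
# that TSV files are plain tab-separated text; the real helpers may behave differently.
import csv
import json

def _save_in_json_sketch(path, obj):
    with open(path + ".json", "w", encoding="utf-8") as handle:
        json.dump(obj, handle, ensure_ascii=False, indent=2)

def _read_from_json_sketch(path):
    with open(path + ".json", "r", encoding="utf-8") as handle:
        return json.load(handle)

def _read_from_tsv_sketch(path):
    with open(path, "r", encoding="utf-8", newline="") as handle:
        return list(csv.reader(handle, delimiter="\t"))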
def read_questions(path):
    data = read.read_from_tsv(path)[1:]
    labels = []
    questions = []
    labels_powerset = []
    labels_all = []
    for line in data:
        labels.append(line[3].split(","))
        labels_all += line[3].split(",")
        labels_powerset.append(line[3])
        questions.append(line[2])
    counter_labels = dict(Counter(labels_all))
    label2int = {label: idx for idx, label in enumerate(counter_labels.keys())}
    read.save_in_json(os.path.join(bert_input, "label2int"), label2int)
    return questions, labels
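# generate_tsv() is not defined in this module. Based on its call sites (questions,
# labels, a label2int mapping, an output path, and optionally a mode), a plausible
# sketch writes one tab-separated row per question; the exact column layout expected
# by the downstream BERT reader is an assumption.
def _generate_tsv_sketch(questions, labels, label2int, out_path, mode=None):
    with open(out_path, "w", encoding="utf-8") as handle:
        for question, label in zip(questions, labels):
            if isinstance(label, list):
                # Multi-label case: join the integer ids with commas.
                label_ids = ",".join(str(label2int[lab]) for lab in label)
            else:
                label_ids = str(label2int[label])
            handle.write("{}\t{}\n".format(label_ids, question))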
def generate_cv():
    questions, labels = read_questions(data_path)
    label2int = read.read_from_json(os.path.join(bert_input, "label2int"))
    data_set = list(zip(questions, labels))
    question_idx = list(range(len(questions)))
    for fold, (train_idx, validation_idx, test_idx) in enumerate(
            k_fold_cross_validation(question_idx, 10, randomize=True)):
        train = [data_set[question_id] for question_id in train_idx]
        validation = [data_set[question_id] for question_id in validation_idx]
        test = [data_set[question_id] for question_id in test_idx]
        splits = [train, validation, test]
        for idx, mode in enumerate(modes):
            split_questions = [dataitem[0] for dataitem in splits[idx]]
            split_labels = [dataitem[1] for dataitem in splits[idx]]
            # Multi-label: each question carries a list of labels, mapped to their
            # integer ids.
            idx_label = [[label2int[label_single] for label_single in label_list]
                         for label_list in split_labels]
            read.save_in_json(
                os.path.join(processed_dir, "index_label_" + mode + "_" + str(fold)),
                idx_label)
            read.save_in_json(
                os.path.join(processed_dir, "question_" + mode + "_" + str(fold)),
                splits[idx])
            generate_tsv(
                split_questions, split_labels, label2int,
                os.path.join(bert_input, "lat_" + mode + "_" + str(fold) + ".tsv"),
                mode)
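# Illustration only (made-up label names): how generate_cv() converts multi-label
# annotations into the index lists saved as index_label_<mode>_<fold>.
def _index_label_example():
    example_label2int = {"LOC": 0, "HUM": 1}
    example_labels = [["LOC"], ["LOC", "HUM"]]
    return [[example_label2int[lab] for lab in label_list]
            for label_list in example_labels]  # -> [[0], [0, 1]]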
def process():
    dataset_all = read.read_from_tsv(train)[1:]
    dataset_test = read.read_from_tsv(test)[1:]
    random.seed(2009221)
    question_idx = list(range(len(dataset_all)))
    folds = []
    for training, validation in k_fold_cross_validation(question_idx, 10, randomize=True):
        folds.append([training, validation])
    dataset_train = [dataset_all[question_id] for question_id in folds[0][0]]
    dataset_dev = [dataset_all[question_id] for question_id in folds[0][1]]
    data = [dataset_train, dataset_dev, dataset_test]
    for idx, mode in enumerate(modes):
        questions = [dataitem[5] for dataitem in data[idx]]
        labels = [dataitem[4] for dataitem in data[idx]]
        qids = [dataitem[0] for dataitem in data[idx]]
        read.save_in_json(os.path.join(processed_dir, "label_" + mode), labels)
        read.save_in_json(os.path.join(processed_dir, "question_" + mode), questions)
        read.save_in_json(os.path.join(processed_dir, "question_id_" + mode), qids)
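# A hypothetical entry point showing one plausible order in which these steps could
# be run; the actual driver script is not part of this module, and the module-level
# paths and `modes` must be configured beforehand.
if __name__ == "__main__":
    process()       # split the raw TSV into train/dev/test questions, labels, ids
    label_dict()    # build per-level label counts and label2int (levels 1 and 2)
    generate_cv()   # build 10-fold cross-validation splits and BERT TSV inputs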