def transform(input_path, output_path):
    """Read a CSV, drop its header row, and write (index, col0, binary-flag) rows.

    The flag is 1 when the second column is the literal string "True", else 0.
    """
    rows = csv_handler.csv_readlines(input_path)[1:]

    def encode(i):
        # only the exact string "True" maps to 1; everything else is 0
        flag = 1 if rows[i][1] == "True" else 0
        return (i, rows[i][0], flag)

    encoded = transformer.map_func(range(len(rows)), encode)
    csv_handler.csv_writelines(output_path, encoded)
def bert_estimate(input_path, text_col, output_path, model_dir, gpu, with_header):
    """Score one text column of a CSV with a BERT model.

    Appends three columns per row: sigmoid(logit_0), sigmoid(logit_1), and
    the argmax class. Column numbering on the CLI is 1-based.
    """
    # pin the process to the requested GPU before the model is constructed
    os.environ['CUDA_VISIBLE_DEVICES'] = gpu

    # fall back to the cached default model when none is supplied
    if model_dir == '':
        model_dir = os.path.join(os.environ['FACTMINE_HOME'], 'cache/company_model')
    print("Loading model from %s ...\n" % (model_dir))
    model = BertModel(model_dir)

    print("Calculating negative score, positive score, and argmax class...\n")
    rows = csv_handler.csv_readlines(input_path)
    header = None
    if with_header == True:
        header, rows = rows[0], rows[1:]

    # convert the 1-based CLI column number to a 0-based index
    col = text_col - 1
    texts = transform.map_func(rows, lambda row: row[col])
    raw_preds = model.predict(texts)
    assert (len(rows) == len(raw_preds))

    def squash(quad):
        # sigmoid the two logits; keep the class and the fourth field untouched
        return [util.sigmoid(quad[0]), util.sigmoid(quad[1]), quad[2], quad[3]]

    scores = transform.map_func(raw_preds, squash)

    # append score_0, score_1, arg_class to each data row in place
    for row, quad in zip(rows, scores):
        row.extend(quad[:3])

    out_rows = rows
    if with_header == True:
        header.extend(['score_0', 'score_1', 'arg_class'])
        out_rows = [header] + rows
    csv_handler.csv_writelines(output_path, out_rows)
    if output_path != "":
        print("Finished, Results are ready at %s " % (output_path))
def extract(data, typename):
    """Build ./<data>.csv with rows (index, text, label) from ./dataset/<data>set.csv.

    label is 1 when the row's third column equals *typename*, else 0.
    """
    source = csv_handler.csv_readlines("./dataset/" + data + "set.csv")
    indexed = transformer.indexleft_func(source)
    triplets = transformer.map_func(indexed,
                                    lambda p: (p[0], p[1][1], p[1][2]))
    output_path = "./" + data + ".csv"

    def relabel(t):
        # binary label: matching type -> 1, anything else -> 0
        return (t[0], t[1], 1 if t[2] == typename else 0)

    csv_handler.csv_writelines(output_path, transformer.map_func(triplets, relabel))
def max_balancer(input_csv_path, output_csv_path='./output.csv'):
    """Downsample the negative ('0') rows so they match the positive ('1') count.

    Rows are selected by id (column 0); the original file order is preserved
    because selection is a filter over the full dataset.
    """
    rows = csv_handler.csv_readlines(input_csv_path)
    positives = transform.filter_func(rows, lambda row: row[2] == '1')
    negatives = transform.filter_func(rows, lambda row: row[2] == '0')
    # balancing assumes negatives are the majority class
    assert (len(positives) <= len(negatives))

    negatives = Sampler().sample_rows(negatives, len(positives))
    keep_ids = set(transform.map_func(positives, lambda row: row[0]) +
                   transform.map_func(negatives, lambda row: row[0]))
    balanced = transform.filter_func(rows, lambda row: row[0] in keep_ids)
    csv_handler.csv_writelines(output_csv_path, balanced)
def extract(data, typename):
    """Build ./<data>.csv with rows (id, text, label) from ./dataset/<data>_raw.csv.

    Normally label is 1 when column 2 equals *typename*; for the special type
    'NoArgument' the labels are inverted (a match means 0).
    """
    dataset = csv_handler.csv_readlines("./dataset/" + data + "_raw.csv")
    output_path = "./" + data + ".csv"
    # 'NoArgument' flips the label convention
    inverted = (typename == 'NoArgument')

    def relabel(t):
        matched = (t[2] == typename)
        if inverted:
            label = 0 if matched else 1
        else:
            label = 1 if matched else 0
        return (t[0], t[1], label)

    csv_handler.csv_writelines(output_path, transformer.map_func(dataset, relabel))
def evaluate(input_path, col_true, col_pred, metric, output_path, with_header):
    '''evaluate the quality of predictions with a metric (f1 by default),
    and output the metric scores

    Reads ground truth and prediction columns (1-based CLI numbering) from
    input_path, checks both are binary (0/1), and writes the chosen metric
    to output_path as a small CSV: a header row naming the metric, then the
    score row(s). Exits the process if the metric name is unknown.
    '''
    result = []
    dataset = csv_handler.csv_readlines(input_path)
    if with_header == True:
        dataset = dataset[1:]
    # convert 1-based CLI column numbers to 0-based indices
    col_true = int(col_true) - 1
    col_pred = int(col_pred) - 1
    y_true = transform.map_func(dataset, lambda row : int(row[col_true]))
    y_pred = transform.map_func(dataset, lambda row : int(row[col_pred]))

    def check_validity(class_array):
        # only binary classification is supported
        for cls in class_array:
            assert(cls == 0 or cls == 1)

    check_validity(y_true)
    check_validity(y_pred)
    support_set = {'f1', 'accuracy', 'cohen', 'quad'}
    if metric not in support_set:
        sys.exit('please specify a valid metric in terms of f1, accuracy, cohen, or quad (i.e. precision_recall_fscore_support)')
    elif metric == 'f1':
        result.append(['f1'])
        result.append([f1_score(y_true, y_pred)])
    elif metric == 'accuracy':
        result.append(['accuracy'])
        result.append([accuracy_score(y_true, y_pred)])
    elif metric == 'cohen':
        # FIX: write the metric-name header row, consistent with the
        # f1/accuracy/quad branches (it was previously missing here)
        result.append(['cohen'])
        result.append([cohen_kappa_score(y_true, y_pred)])
    elif metric == 'quad':
        (precision, recall, fscore, support) = precision_recall_fscore_support(y_true, y_pred)
        result.append(['class', 'precision', 'recall', 'fscore', 'support'])
        result.append([0, precision[0], recall[0], fscore[0], support[0]])
        result.append([1, precision[1], recall[1], fscore[1], support[1]])
    csv_handler.csv_writelines(output_path, result)
def write_preds(ori_preds, labels, dev_path , output_path = "./tmp.csv"):
    """Join dev examples with their prediction scores and write a CSV.

    Output columns: id, sent, label, one p<i> per score, and the argmax pred.
    Asserts that predictions, labels, and dev examples line up one-to-one and
    that each example's stored label matches the supplied one.
    """
    eval_examples = csv_handler.csv_readlines(dev_path)
    assert(len(ori_preds) == len(labels))
    assert(len(ori_preds) == len(eval_examples))

    # header: fixed columns, then one probability column per class
    header = ['id', 'sent', 'label']
    header += ['p' + str(j) for j in range(len(ori_preds[0]))]
    header.append('pred')

    final = [header]
    for example, gold, probs in zip(eval_examples, labels, ori_preds):
        sent_label = int(example[2])
        # sanity check: file label must agree with the label passed in
        assert(sent_label == gold)
        row = [example[0], example[1], sent_label]
        for p in probs:
            row.append(p)
        row.append(np.argmax(probs))
        final.append(row)
    csv_handler.csv_writelines(output_path, final)
    # NOTE(review): tail of an enclosing method whose `def` is outside this
    # chunk — wraps each batch text into an InputExample with a dummy label
    # (the real label is unknown at prediction time).
    examples = []
    for (i, txt) in enumerate(batch):
        guid = "%s" % (i)
        text_a = txt
        label = self.dummy_label
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
    return examples


if __name__ == "__main__":
    # usage: <data_csv> <model_dir> <output_csv>
    data_path = sys.argv[1]
    model_dir = sys.argv[2]
    output_path = sys.argv[3]

    # load test dataset: column 0 is the id, column 1 the text
    raw_dataset = csv_handler.csv_readlines(data_path)
    ids = transform.map_func(raw_dataset, lambda row : row[0])
    texts = transform.map_func(raw_dataset, lambda row : row[1])

    # load model and predict in batches (100 is presumably the batch size —
    # TODO confirm against BertModel.predict)
    model = BertModel(model_dir)
    pred = model.predict(texts, 100)
    assert(len(ids) == len(pred))
    # prepend each id to its prediction row before writing
    output = transform.map_func(range(len(ids)),
                                lambda idx : [ids[idx]] + pred[idx])
    csv_handler.csv_writelines(output_path, output)
    # NOTE(review): tail of row_functor (its `def` is outside this chunk);
    # each record becomes a (record-id, sentence, label, split) quadruple.
    return (rid, sent, label, split)

records = transformer.map_func(range(len(records)),
                               lambda i: row_functor(i, records))
print(len(records))
# append the freshly mapped records (skipping their first/header row)
dataset = dataset + records[1:]
print(len(dataset))
print(dataset[0])

def select(split, dataset):
    # keep only rows for the given split, then drop the split column
    final = transformer.filter_func(dataset, lambda row: row[3] == split)
    final = transformer.map_func(final, lambda row: (row[0], row[1], row[2]))
    return final

train_set = select("train", dataset)
print(len(train_set))
# NOTE(review): dev_raw.csv is filled from the "test" split and val_raw.csv
# from the "val" split — looks intentional, but verify the naming.
dev_set = select("test", dataset)
print(len(dev_set))
val_set = select("val", dataset)
print(len(val_set))
csv_handler.csv_writelines(data_root + "/train_raw.csv", train_set)
csv_handler.csv_writelines(data_root + "/dev_raw.csv", dev_set)
csv_handler.csv_writelines(data_root + "/val_raw.csv", val_set)
#('clf', LogisticRegression(class_weight='balanced', random_state=seed, solver='liblinear')), ('clf', LogisticRegression(random_state=seed, solver='liblinear')), ]) text_clf.fit(X_train, y_train) train_finish_time = time.time() train_duration = train_finish_time - start_time print("train time is " + str(train_finish_time - start_time)) print("predicting...") predicted = text_clf.predict(X_dev) predicted_proba = text_clf.predict_proba(X_dev) assert (len(predicted_proba) == len(X_dev)) assert (len(X_dev) == len(y_dev)) print("logging...") csv_handler.append_row(output_path, ['score_0', 'score_1', 'predict', 'text', 'ground']) result = [] for i in range(len(predicted_proba)): score_0 = predicted_proba[i][0] score_1 = predicted_proba[i][1] predict = predicted[i] text = X_dev[i] ground = y_dev[i] result.append([score_0, score_1, predict, text, ground]) csv_handler.csv_writelines(output_path, result)
# NOTE(review): continuation of a script whose earlier statements (anno_ids,
# files, get_id, train/test id datasets, output paths) are outside this chunk.
train_test_ids.sort()
# annotation ids must exactly cover the train+test ids
assert (anno_ids == train_test_ids)
id_to_file = transformer.map_func(files, lambda file: (get_id(file), file))
id_to_file = dict(id_to_file)

def extractor(anno_dir, id_to_file, paper_id):
    # read one paper's tab-separated (label, sentence) file and emit
    # ("<paper_id>_<line_no>", sentence, label) triples
    file_path = anno_dir + id_to_file[paper_id]
    label_sent_dataset = csv_handler.csv_readlines(file_path, delimit='\t')
    indexed_result = transformer.indexleft_func(label_sent_dataset)
    final = transformer.map_func(
        indexed_result,
        lambda p: (paper_id + "_" + str(p[0]), p[1][1], p[1][0]))
    return final

# spot-check one known paper id before processing everything
tmp = extractor(anno_dir, id_to_file, 'r18RxrXlG')
transformer.print_rows(tmp, 3)

train_dataset = transformer.flatmap_func(
    train_id_dataset,
    lambda paper_id: extractor(anno_dir, id_to_file, paper_id))
csv_handler.csv_writelines(train_output_path, train_dataset)
test_dataset = transformer.flatmap_func(
    test_id_dataset,
    lambda paper_id: extractor(anno_dir, id_to_file, paper_id))
csv_handler.csv_writelines(test_output_path, test_dataset)
import transform as transform
import csv_handler as csv_handler

if __name__ == "__main__":
    # usage: <first_csv> <first_cols> <second_csv> <second_cols> <output_csv>
    # column lists are comma-separated 0-based indices, e.g. "0,2"
    first_file = sys.argv[1]
    first_idx = transform.map_func(sys.argv[2].split(','),
                                   lambda idx: int(idx))
    second_file = sys.argv[3]
    second_idx = transform.map_func(sys.argv[4].split(','),
                                    lambda idx: int(idx))
    output_csv_file = sys.argv[5]

    first_dataset = csv_handler.csv_readlines(first_file)
    result = transform.map_func(first_dataset,
                                lambda row: [row[i] for i in first_idx])
    second_dataset = csv_handler.csv_readlines(second_file)
    second_result = transform.map_func(second_dataset,
                                       lambda row: [row[i] for i in second_idx])
    # the two files must pair up row-for-row
    assert (len(first_dataset) == len(second_dataset))
    # concatenate the projected columns side by side
    final = transform.map_func(zip(result, second_result),
                               lambda pair: pair[0] + pair[1])
    csv_handler.csv_writelines(output_csv_file, final)
# get idx for review_id, text, and funny_count
def get_triplet_idx(header):
    """Locate the review_id / text / funny columns in the header row."""
    return (header.index('review_id'),
            header.index('text'),
            header.index('funny'))

def selector(row, idx_id, idx_text, idx_count):
    """Map a raw row to (id, text, is_funny).

    is_funny is 1 for >=5 funny votes, 0 for exactly none, and None for the
    ambiguous 1-4 range (those rows are filtered out below).
    """
    funny_count = int(row[idx_count])
    if funny_count >= 5:
        is_funny = 1
    elif funny_count == 0:
        is_funny = 0
    else:
        is_funny = None
    return (row[idx_id], row[idx_text], is_funny)

(idx_id, idx_text, idx_count) = get_triplet_idx(dataset[0])
selected_datasets = transformer.map_func(
    dataset[1:],
    lambda line: selector(line, idx_id, idx_text, idx_count))
# drop the ambiguous rows (label None)
final_datasets = transformer.filter_func(selected_datasets,
                                         lambda row: row[2] != None)
csv_handler.csv_writelines(review_output_path, final_datasets)
    # NOTE(review): tail of format_func (its `def` and the num_true/num_false
    # counting are outside this chunk); patches the one bare JSON boolean so
    # the line parses, then decodes it.
    elif num_true == 1:
        assert (num_false == 0)
        line = line.replace('"has_spoiler": true,', '"has_spoiler": "true",')
    else:
        assert (False)
    line = json.loads(line)
    return line

json_dataset = transformer.map_func(dataset, lambda line: format_func(line))
# pair each review id with its list of (label, sentence) entries
id_sents = transformer.map_func(
    json_dataset,
    lambda jsn: (jsn['review_id'], jsn['review_sentences']))

def flat_sents_func(review_id, sents):
    # flatten one review into ("<review_id>###<i>", sentence, label) triples
    result = []
    for i in range(len(sents)):
        sent_id = review_id + "###" + str(i)
        sent_text = sents[i][1]
        sent_label = sents[i][0]
        result.append((sent_id, sent_text, sent_label))
    return result

final_sents = transformer.flatmap_func(id_sents,
                                       lambda p: flat_sents_func(p[0], p[1]))
csv_handler.csv_writelines(output_path, final_sents)