import os

import numpy as np

# NOTE: load_models, get_all_records, get_x_and_y, get_clf_pred_probs,
# calculate_and_print_metrics and write_lines are project-local helpers,
# assumed to be importable from the surrounding package.


def add_clf_scores_to_file(model_file, eval_file, error_types, skip_deps=False):
    """Score every annotation in eval_file with a trained classifier and
    write the scored sentences to a new file."""
    print("Loading models")
    clf, scaler, selector = load_models(model_file)

    print("Loading data records")
    records, all_ann_sents = get_all_records(eval_file, error_types=error_types)
    print(f"{len(records)} records and {len(all_ann_sents)} "
          f"sentences were loaded")

    print("Doing feature preprocessing")
    features, labels, features_names = get_x_and_y(records, skip_deps)
    # Binarize the labels and apply the fitted scaler and feature selector
    labels = np.array([int(x > 0.5) for x in labels])
    features_norm = scaler.transform(features)
    features_selected = selector.transform(features_norm)

    print("Doing prediction")
    scores = get_clf_pred_probs(clf, features_selected)

    print("Adding scores to sentences")
    cnt = 0
    label_stats = []
    for ann_sent in all_ann_sents:
        for ann in ann_sent.iter_annotations():
            # scores[cnt] holds [P(class 0), P(class 1)]
            ann.meta['clf_score'] = scores[cnt][1]
            label_stats.append(float(ann.meta['label']))
            cnt += 1
    assert cnt == len(records)

    print("Calculating accuracy")
    acc = calculate_and_print_metrics(labels, scores)

    print("Saving results to the file")
    output = [x.get_annotated_text() for x in all_ann_sents]
    clf_name = os.path.basename(model_file).replace(".pkl", "")
    et = "None" if not error_types else " ".join(error_types)
    out_file = eval_file.replace(".txt", f"_scored_by_{clf_name}_on_{et}.txt")
    write_lines(out_file, output)
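# Usage sketch (the paths below are hypothetical, and "Prepositions" is only
# an example error type; the .pkl/.txt naming mirrors what the function
# itself expects):
if __name__ == '__main__':
    add_clf_scores_to_file(model_file="models/clf.pkl",
                           eval_file="data/eval_sentences.txt",
                           error_types=["Prepositions"])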
def evaluate_with_m2(gold_annotations, output_annotations, tmp_filename):
    """Convert gold annotations to M2 format and score the corrected system
    output with the official m2scorer."""
    assert len(gold_annotations) == len(output_annotations)
    gold_ann_tokens = [AnnotatedTokens(AnnotatedText(anno_text))
                       for anno_text in gold_annotations]
    gold_m2_annotations = []
    for ann_tokens in gold_ann_tokens:
        try:
            converted = MultiAnnotatedSentence.from_annotated_tokens(
                ann_tokens).to_m2_str() + '\n'
            gold_m2_annotations.append(converted)
        except Exception:
            # Conversion can fail on annotations that carry no suggestions;
            # drop those annotations and convert again.
            for ann in ann_tokens.iter_annotations():
                if not ann.suggestions \
                        or str(ann.suggestions[0]) == "NO_SUGGESTIONS":
                    ann_tokens.remove(ann)
            new_converted = MultiAnnotatedSentence.from_annotated_tokens(
                ann_tokens).to_m2_str() + '\n'
            gold_m2_annotations.append(new_converted)
    output_corrected_texts = [AnnotatedText(anno_text).get_corrected_text()
                              for anno_text in output_annotations]

    # Write both sides as text files
    gold_file_processed = f"g_{os.path.basename(tmp_filename)}"
    sub_file_processed = f"o_{os.path.basename(tmp_filename)}"
    write_lines(gold_file_processed, gold_m2_annotations)
    write_lines(sub_file_processed, output_corrected_texts)

    # Run m2scorer (official version 3.2,
    # http://www.comp.nus.edu.sg/~nlp/conll14st.html)
    system(f'./m2scorer/m2scorer {sub_file_processed} {gold_file_processed}')
    remove_file(sub_file_processed)
    remove_file(gold_file_processed)
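# Usage sketch (hypothetical file names; both files are expected to hold one
# annotated-text string per line, and read_lines is the project helper used
# elsewhere in this repo):
#
#     gold = read_lines("gold_annotations.txt")
#     out = read_lines("system_output.txt")
#     evaluate_with_m2(gold, out, tmp_filename="tmp_eval.txt")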
def evaluate_from_m2_file(m2_file, sub_lines, tmp_filename):
    """Score system output lines against gold annotations that are already
    in M2 format."""
    output_corrected = [AnnotatedText(x).get_corrected_text()
                        for x in sub_lines]
    sub_file_processed = f"o_{os.path.basename(tmp_filename)}"
    write_lines(sub_file_processed, output_corrected)
    # Locate the m2scorer binary relative to the gandalf repository root
    m2_path = os.path.join(os.getcwd().split("/gandalf/")[0],
                           "gandalf/scorer/m2scorer/m2scorer")
    system(f'{m2_path} {sub_file_processed} {m2_file}')
    remove_file(sub_file_processed)
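# Usage sketch (hypothetical paths): here the gold side is already an .m2
# file, so only the system output needs to be converted to corrected text:
if __name__ == '__main__':
    sub_lines = read_lines("system_output.txt")
    evaluate_from_m2_file("gold_annotations.m2", sub_lines, "tmp_eval.txt")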
def main(args):
    # Read original texts
    clc_reader = ClcCsvReader(args.input_file)
    output = []
    for _, _, _, _, relabeled_anno_text, _ in clc_reader.iter_items():
        output.append(relabeled_anno_text)
    write_lines(args.output_file, output)
def main(args):
    # Read original texts
    unfiltered_data = read_lines(args.unfiltered_file)
    # Filter text
    output, cnt = filter_by_error_type(unfiltered_data,
                                       error_type=args.error_type,
                                       system_type=args.system_type)
    # Save results
    out_file = args.unfiltered_file.replace('.txt',
                                            f'_by_{args.error_type}.txt')
    write_lines(out_file, output)
def main(args):
    # Read original texts
    test_orig = read_lines(args.test_orig)
    # Run checks in parallel and save the raw result
    out_file = args.test_orig.replace('.txt', f'_{args.system_type}.txt')
    run_check_parallel(test_orig,
                       check_type=args.system_type,
                       error_type=args.error_type,
                       n_threads=args.n_threads,
                       fn_out=out_file)
    # Filter the output; filter_by_error_type returns (lines, count),
    # so unpack the tuple instead of keeping it whole
    unfiltered_data = read_lines(out_file)
    output, _ = filter_by_error_type(unfiltered_data,
                                     error_type=args.error_type,
                                     system_type=args.system_type)
    # Save results
    out_filtered_file = out_file.replace('.txt',
                                         f'_by_{args.error_type}.txt')
    write_lines(out_filtered_file, output)
from multiprocessing import Pool


def run_check_parallel(orig_list, check_type, error_type, n_threads, fn_out):
    """Run the requested checker over orig_list in parallel and write the
    normalized annotated results to fn_out."""
    if check_type == 'Patterns':
        combined_data = get_combined_data(orig_list, check_type)
    elif check_type == 'OPC-with-filters':
        filters = {"<ErrorTypesFilter(types=None)>": {"types": [error_type]}}
        combined_data = get_combined_data(orig_list, check_type,
                                          addr='PREPROD', filters=filters)
    elif check_type == 'OPC-without-filters':
        combined_data = get_combined_data(orig_list, check_type,
                                          addr='PREPROD', filters=False)
    elif check_type == 'UPC5-high-precision':
        combined_data = get_combined_data(orig_list, check_type)
    elif check_type == 'UPC5-high-recall':
        upc_addr = "upc-high-recall-server.phantasm.gnlp.io:8081"
        combined_data = get_combined_data(orig_list, check_type,
                                          addr=upc_addr, custom_server=True)
    else:
        raise ValueError('Unknown check_type = %s' % check_type)

    # Helper object that splits the data into batches
    batcher = Batcher(combined_data, batch_size=n_threads, verbose=True)
    # Worker pool for parallel checking
    pool = Pool(processes=n_threads)
    result_anno = list()
    for batch in batcher.iter_batches():
        result_anno_batch = pool.map(wrapped_check_func, batch)
        result_anno.extend(result_anno_batch)
    pool.close()
    pool.join()

    # Normalizing trick: a round-trip through AnnotatedTokens canonicalizes
    # the annotated-text representation
    normalized_result_anno = [
        AnnotatedTokens(AnnotatedText(x)).get_annotated_text()
        for x in result_anno
    ]
    write_lines(fn_out, normalized_result_anno)
    return normalized_result_anno
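# Usage sketch (hypothetical file name and parameters; check_type must be
# one of the values handled above):
if __name__ == '__main__':
    orig = read_lines("test_orig.txt")
    run_check_parallel(orig,
                       check_type='OPC-with-filters',
                       error_type='Prepositions',
                       n_threads=8,
                       fn_out='test_orig_OPC-with-filters.txt')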
def main(args):
    clc_csv_reader = ClcCsvReader(fn=args.fn_clc_csv)
    error_types_bank = ErrorTypesBank()
    target_error_types_list = error_types_bank.patterns22_to_clc89(
        args.target_error_type)
    orig_lines = list()  # original texts
    gold_annotations = list()  # gold corrections as AnnotatedText strings
    for _, _, _, _, gold_relabeled_anno_text, gold_error_types_list \
            in clc_csv_reader.iter_items(max_item_number=args.max_item_number):
        # Skip text samples that don't contain at least one target error type
        if not is_lists_intersection(gold_error_types_list,
                                     target_error_types_list):
            continue
        # Keep only annotations of the target error types
        ann_tokens = AnnotatedTokens(AnnotatedText(gold_relabeled_anno_text))
        for ann in ann_tokens.iter_annotations():
            if ann.meta['error_type'] not in target_error_types_list:
                ann_tokens.remove(ann)
        gold_annotations_renormalized = ann_tokens.get_annotated_text()
        # Add the renormalized texts to the lists
        orig_sent = ann_tokens.get_original_text()
        orig_lines.append(orig_sent)
        gold_annotations.append(gold_annotations_renormalized)
    assert len(orig_lines) == len(gold_annotations)
    print('%d lines in unfiltered outputs.' % len(orig_lines))
    gold_annotations_filtered, orig_lines_filtered = \
        filter_by_nosuggestions_in_gold(gold_annotations, orig_lines)
    assert len(gold_annotations_filtered) == len(orig_lines_filtered)
    print('%d lines left after filtering by the NO_SUGGESTIONS flag.'
          % len(orig_lines_filtered))
    # Write to files
    fn_out_gold_file = args.fn_clc_csv.replace(
        '.csv', f'_{args.target_error_type}_gold.txt')
    fn_out_orig_file = args.fn_clc_csv.replace(
        '.csv', f'_{args.target_error_type}_orig.txt')
    write_lines(fn=fn_out_gold_file, lines=gold_annotations_filtered)
    write_lines(fn=fn_out_orig_file, lines=orig_lines_filtered)
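# A possible command-line entry point (a sketch; the argument names simply
# mirror the attributes accessed on `args` above):
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('fn_clc_csv',
                        help='Path to the CLC csv file')
    parser.add_argument('target_error_type',
                        help='Target error type in Patterns-22 notation')
    parser.add_argument('--max_item_number', type=int, default=None,
                        help='Read at most this many items from the csv')
    main(parser.parse_args())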