Example #1
def add_clf_scores_to_file(model_file,
                           eval_file,
                           error_types,
                           skip_deps=False):
    print("Loading models")
    clf, scaler, selector = load_models(model_file)
    print("Loading data records")
    records, all_ann_sents = get_all_records(eval_file,
                                             error_types=error_types)
    print(f"{len(records)} records and {len(all_ann_sents)} "
          f"sentences were loaded")
    print("Doing feature preprocessing")
    features, labels, features_names = get_x_and_y(records, skip_deps)
    labels = np.array([int(x > 0.5) for x in labels])  # binarize soft labels
    features_norm = scaler.transform(features)
    features_selected = selector.transform(features_norm)
    print("Doing prediction")
    scores = get_clf_pred_probs(clf, features_selected)
    print("Adding scores to sentences")
    cnt = 0
    label_stats = []
    for ann_sent in all_ann_sents:
        for ann in ann_sent.iter_annotations():
            ann.meta['clf_score'] = scores[cnt][1]  # positive-class probability
            label_stats.append(float(ann.meta['label']))
            cnt += 1
    assert cnt == len(records)
    print("Calculate accuracy")
    acc = calculate_and_print_metrics(labels, scores)
    print("Saving results to the file")
    output = [x.get_annotated_text() for x in all_ann_sents]
    clf_name = os.path.basename(model_file).replace(".pkl", "")
    et = "None" if not error_types else " ".join(error_types)
    out_file = eval_file.replace(".txt", f"_scored_by_{clf_name}_on_{et}.txt")
    write_lines(out_file, output)
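
None of the examples on this page define the file helpers they call. Below is a minimal sketch consistent with how read_lines, write_lines, and remove_file are used here; the bodies are assumptions, not the actual library code:

import os

def read_lines(fn):
    # Assumed behavior: return the file's lines without trailing newlines.
    with open(fn, encoding="utf-8") as f:
        return [line.rstrip("\n") for line in f]

def write_lines(fn, lines):
    # Assumed behavior: write one item per line.
    with open(fn, "w", encoding="utf-8") as f:
        f.write("\n".join(str(line) for line in lines) + "\n")

def remove_file(fn):
    # Assumed behavior: delete a temporary file if it exists.
    if os.path.exists(fn):
        os.remove(fn)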
Example #2
def evaluate_with_m2(gold_annotations, output_annotations, tmp_filename):
    assert len(gold_annotations) == len(output_annotations)
    gold_ann_tokens = [AnnotatedTokens(AnnotatedText(anno_text)) for anno_text in gold_annotations]
    gold_m2_annotations = []
    for ann_tokens in gold_ann_tokens:
        try:
            converted = MultiAnnotatedSentence.from_annotated_tokens(ann_tokens).to_m2_str() + '\n'
            gold_m2_annotations.append(converted)
        except Exception:
            # Conversion to M2 failed: drop annotations without a usable
            # suggestion and retry the conversion below.
            for ann in ann_tokens.iter_annotations():
                if not ann.suggestions or str(ann.suggestions[0]) == "NO_SUGGESTIONS":
                    ann_tokens.remove(ann)
            new_converted = MultiAnnotatedSentence.from_annotated_tokens(ann_tokens).to_m2_str() + '\n'
            gold_m2_annotations.append(new_converted)

    output_corrected_texts = [AnnotatedText(anno_text).get_corrected_text() for anno_text in output_annotations]
    # Write as text files

    gold_file_processed = f"g_{os.path.basename(tmp_filename)}"
    sub_file_processed = f"o_{os.path.basename(tmp_filename)}"
    write_lines(gold_file_processed, gold_m2_annotations)
    write_lines(sub_file_processed, output_corrected_texts)
    # Run m2scorer (OFFICIAL VERSION 3.2, http://www.comp.nus.edu.sg/~nlp/conll14st.html)
    system(f'./m2scorer/m2scorer {sub_file_processed} {gold_file_processed}')
    remove_file(sub_file_processed)
    remove_file(gold_file_processed)
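
A possible invocation of evaluate_with_m2, assuming gold.txt and system.txt (hypothetical file names) each hold one sentence per line in annotated-text markup:

gold_annotations = read_lines("gold.txt")      # hypothetical file name
output_annotations = read_lines("system.txt")  # hypothetical file name
evaluate_with_m2(gold_annotations, output_annotations, "tmp_eval.txt")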
Example #3
def evaluate_from_m2_file(m2_file, sub_lines, tmp_filename):
    output_corrected = [AnnotatedText(x).get_corrected_text() for x in sub_lines]
    sub_file_processed = f"o_{os.path.basename(tmp_filename)}"
    write_lines(sub_file_processed, output_corrected)
    m2_path = os.path.join(os.getcwd().split("/gandalf/")[0],
                           "gandalf/scorer/m2scorer/m2scorer")
    system(f'{m2_path} {sub_file_processed} {m2_file}')
    remove_file(sub_file_processed)
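
Similarly for the M2-file variant, assuming the gold side is already stored in M2 format (file names again hypothetical):

sub_lines = read_lines("system_output.txt")  # system output in annotated-text markup
evaluate_from_m2_file("gold.m2", sub_lines, "tmp_eval.txt")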
Example #4
def main(args):
    # Read original texts
    clc_reader = ClcCsvReader(args.input_file)
    output = []
    for _, _, _, _, relabeled_anno_text, _ in clc_reader.iter_items():
        output.append(relabeled_anno_text)

    write_lines(args.output_file, output)
Example #5
def main(args):
    # Read original texts
    unfiltered_data = read_lines(args.unfiltered_file)
    # Filter text
    output, cnt = filter_by_error_type(unfiltered_data,
                                       error_type=args.error_type,
                                       system_type=args.system_type)
    # Save results
    out_file = args.unfiltered_file.replace('.txt',
                                            f'_by_{args.error_type}.txt')
    write_lines(out_file, output)
Example #6
def main(args):
    # Read original texts
    test_orig = read_lines(args.test_orig)
    # Run checks in parallel and save result
    out_file = args.test_orig.replace('.txt', f'_{args.system_type}.txt')
    run_check_parallel(test_orig,
                       check_type=args.system_type,
                       error_type=args.error_type,
                       n_threads=args.n_threads,
                       fn_out=out_file)
    # Filter output
    unfiltered_data = read_lines(out_file)
    output = filter_by_error_type(unfiltered_data,
                                  error_type=args.error_type,
                                  system_type=args.system_type)
    # Save results
    out_filtered_file = out_file.replace('.txt', f'_by_{args.error_type}.txt')
    write_lines(out_filtered_file, output)
Example #7
def run_check_parallel(orig_list, check_type, error_type, n_threads, fn_out):
    if check_type == 'Patterns':
        combined_data = get_combined_data(orig_list, check_type)
    elif check_type == 'OPC-with-filters':
        filters = {"<ErrorTypesFilter(types=None)>": {"types": [error_type]}}
        combined_data = get_combined_data(orig_list,
                                          check_type,
                                          addr='PREPROD',
                                          filters=filters)
    elif check_type == 'OPC-without-filters':
        filters = False
        combined_data = get_combined_data(orig_list,
                                          check_type,
                                          addr='PREPROD',
                                          filters=filters)
    elif check_type == 'UPC5-high-precision':
        combined_data = get_combined_data(orig_list, check_type)
    elif check_type == 'UPC5-high-recall':
        upc_addr = "upc-high-recall-server.phantasm.gnlp.io:8081"
        combined_data = get_combined_data(orig_list,
                                          check_type,
                                          addr=upc_addr,
                                          custom_server=True)
    else:
        raise ValueError(f'Unknown check_type = {check_type}')

    # Helper object that splits the combined data into batches
    batcher = Batcher(combined_data, batch_size=n_threads, verbose=True)
    pool = Pool(processes=n_threads)  # worker pool for running checks in parallel
    result_anno = list()
    for batch in batcher.iter_batches():
        result_anno_batch = pool.map(wrapped_check_func, batch)
        result_anno.extend(result_anno_batch)
    pool.close()
    pool.join()
    # Round-trip through AnnotatedTokens to normalize the annotated text
    normalized_result_anno = [
        AnnotatedTokens(AnnotatedText(x)).get_annotated_text()
        for x in result_anno
    ]
    write_lines(fn_out, normalized_result_anno)
    return normalized_result_anno
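
The Batcher class is not defined in the snippet; here is a minimal sketch that matches the constructor and iter_batches() calls above, as an assumption rather than the original implementation:

class Batcher:
    def __init__(self, data, batch_size, verbose=False):
        self.data = data
        self.batch_size = batch_size
        self.verbose = verbose

    def iter_batches(self):
        # Yield consecutive fixed-size chunks of the data.
        total = len(self.data)
        for start in range(0, total, self.batch_size):
            if self.verbose:
                print(f"Batch {start}-{min(start + self.batch_size, total)} of {total}")
            yield self.data[start:start + self.batch_size]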
Example #8
def main(args):
    clc_csv_reader = ClcCsvReader(fn=args.fn_clc_csv)
    error_types_bank = ErrorTypesBank()
    target_error_types_list = error_types_bank.patterns22_to_clc89(
        args.target_error_type)
    orig_lines = list()  # original texts
    gold_annotations = list()  # gold corrections in AnnotatedText string format
    for _, _, _, _, gold_relabeled_anno_text, gold_error_types_list \
            in clc_csv_reader.iter_items(max_item_number=args.max_item_number):
        # Skip text samples that don't contain at least one target
        # error type
        if not is_lists_intersection(gold_error_types_list,
                                     target_error_types_list):
            continue
        ann_tokens = AnnotatedTokens(AnnotatedText(gold_relabeled_anno_text))
        for ann in ann_tokens.iter_annotations():
            if ann.meta['error_type'] not in target_error_types_list:
                ann_tokens.remove(ann)
        gold_annotations_renormalized = ann_tokens.get_annotated_text()
        # Add renormalized texts to the lists
        orig_sent = ann_tokens.get_original_text()
        orig_lines.append(orig_sent)
        gold_annotations.append(gold_annotations_renormalized)
    assert len(orig_lines) == len(gold_annotations)
    print('%d lines in unfiltered outputs.' % len(orig_lines))
    gold_annotations_filtered, orig_lines_filtered = filter_by_nosuggestions_in_gold(
        gold_annotations, orig_lines)
    assert len(gold_annotations_filtered) == len(orig_lines_filtered)
    print('%d lines left after filtering by the NO_SUGGESTIONS flag.' %
          len(orig_lines_filtered))
    # Write to files
    fn_out_gold_file = args.fn_clc_csv.replace(
        '.csv', f'_{args.target_error_type}_gold.txt')
    fn_out_orig_file = args.fn_clc_csv.replace(
        '.csv', f'_{args.target_error_type}_orig.txt')
    write_lines(fn=fn_out_gold_file, lines=gold_annotations_filtered)
    write_lines(fn=fn_out_orig_file, lines=orig_lines_filtered)
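
is_lists_intersection is also undefined here; given how it is used, it plausibly reduces to a set-intersection check (a sketch, not the original helper):

def is_lists_intersection(list_a, list_b):
    # True if the two lists share at least one element.
    return bool(set(list_a) & set(list_b))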