def check_spans(doc_fn, tag_fn, options):
    doc_count, span_count, mismatches = 0, 0, 0
    with open_file(doc_fn, 'r', options) as doc_f:
        doc_reader = DocReader(doc_f)
        with open_file(tag_fn, 'r', options) as tag_f:
            span_reader = SpanReader(tag_f)
            for doc in doc_reader:
                for span in span_reader.document_spans(doc.id):
                    doc_span_text = doc.text[span.start:span.end + 1]
                    if doc_span_text != span.text:
                        dt, st = safe_str(doc_span_text), safe_str(span.text)
                        print(f'text mismatch in {doc.id}: "{dt}" '
                              f'vs "{st}": {span}')
                        mismatches += 1
                    span_count += 1
                doc_count += 1
                if doc_count % 10000 == 0:
                    print(f'processed {doc_count} documents '
                          f'({span_count} spans)', file=sys.stderr)
            span_count, errors = span_reader.iter.index - 1, span_reader.errors
            if span_reader.current_doc_id() is not None:
                print(f'ERROR: extra lines in {tag_fn}')
    if mismatches or errors:
        print(f'Checked {span_count} spans, found {errors} errors '
              f'and {mismatches} mismatches')
    else:
        print(f'OK, checked {span_count} spans')
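
# safe_str() used above is defined elsewhere in this module. As a minimal
# sketch of what such a helper might do -- assuming it only needs to produce
# a single-line, printable rendering of span text for error messages -- the
# following would suffice; example_safe_str is a hypothetical stand-in, not
# the actual implementation:
def example_safe_str(text, max_len=80):
    """Return a one-line, escaped rendering of text for messages."""
    escaped = text.encode('unicode_escape').decode('ascii')
    return escaped if len(escaped) <= max_len else escaped[:max_len] + '...'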
def cut_tags(doc_fn, tag_fn, out_fn, options):
    removed, total = 0, 0
    with open_file(doc_fn, 'r', options) as doc_f:
        doc_reader = DocReader(doc_f)
        with open_file(tag_fn, 'r', options) as tag_f:
            span_reader = SpanReader(tag_f, no_type_mapping=True)
            with open_file(out_fn, 'w', options) as out_f:
                for doc_idx, doc in enumerate(doc_reader):
                    offset_map = get_offset_map(doc, options)
                    if offset_map is None:
                        # no-op, quick copy without parsing
                        for span in span_reader.document_lines(doc.id):
                            print(span, end='', file=out_f)
                            total += 1
                    else:
                        # need to parse, map and filter
                        spans = list(span_reader.document_spans(doc.id))
                        mapped = apply_offset_map(spans, offset_map)
                        removed += len(spans) - len(mapped)
                        total += len(spans)
                        for span in mapped:
                            print(span, file=out_f)
                    if (doc_idx + 1) % 100000 == 0:
                        print(f'processed {doc_idx+1} documents',
                              file=sys.stderr)
    print(f'removed {removed}/{total} spans ({removed/total:.1%})',
          file=sys.stderr)
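
# get_offset_map() and apply_offset_map() used above are defined elsewhere.
# A rough sketch of the assumed semantics -- remap each span through an
# old-to-new offset map and drop spans that fall into removed text -- under
# the assumption that the map behaves like a dict; example_apply_offset_map
# is a hypothetical stand-in:
def example_apply_offset_map(spans, offset_map):
    """Remap span offsets, dropping spans that overlap removed text."""
    mapped = []
    for span in spans:
        new_start = offset_map.get(span.start)
        new_end = offset_map.get(span.end)
        if new_start is None or new_end is None:
            continue    # span overlaps a cut region, drop it
        span.start, span.end = new_start, new_end
        mapped.append(span)
    return mapped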
def char_to_byte_offsets(doc_fn, tag_fn, options):
    doc_count = 0
    with open(doc_fn, encoding=options.encoding) as doc_f:
        doc_reader = DocReader(doc_f)
        with open(tag_fn, encoding=options.encoding) as tag_f:
            span_reader = SpanReader(tag_f)
            for doc in doc_reader:
                if options.max_docs and doc_count >= options.max_docs:
                    break
                if char_and_byte_offsets_are_identical(doc.text, options):
                    # fast common case for trivial mapping
                    for span in span_reader.document_lines(doc.id):
                        print(span, end='')
                else:
                    # non-trivial mapping
                    offset_map = make_offset_map(doc.text, options)
                    for span in span_reader.document_spans(doc.id):
                        span.start = offset_map[span.start]
                        # offsets are end-inclusive, so take the last byte
                        # before the next character
                        span.end = offset_map[span.end+1] - 1
                        print(span)
                doc_count += 1
                if doc_count % 10000 == 0:
                    print(f'processed {doc_count} documents', file=sys.stderr)
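
# make_offset_map() and char_and_byte_offsets_are_identical() used above are
# defined elsewhere. A minimal sketch of the underlying idea, assuming UTF-8
# byte offsets: the map gives, for each character index (plus one past the
# end), the byte offset at which that character starts. The helper names
# below are illustrative only:
def example_char_to_byte_map(text, encoding='utf-8'):
    """Map each char index (and len(text)) to its starting byte offset."""
    offsets, byte_pos = [], 0
    for char in text:
        offsets.append(byte_pos)
        byte_pos += len(char.encode(encoding))
    offsets.append(byte_pos)    # one past the last character
    return offsets

def example_offsets_are_identical(text, encoding='utf-8'):
    """True if byte and character offsets coincide (e.g. pure ASCII UTF-8)."""
    return len(text) == len(text.encode(encoding))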
def cut_documents(doc_fn, out_fn, options):
    cut_count = 0
    with open_file(doc_fn, 'r', options) as doc_f:
        doc_reader = DocReader(doc_f)
        with open_file(out_fn, 'w', options) as out_f:
            for doc_idx, doc in enumerate(doc_reader):
                cut_count += cut_document(doc, options)
                print(doc, file=out_f)
                if (doc_idx + 1) % 100000 == 0:
                    print(f'processed {doc_idx+1} documents', file=sys.stderr)
    doc_count = doc_idx + 1
    print(f'cut {cut_count}/{doc_count} documents ({cut_count/doc_count:.1%})',
          file=sys.stderr)
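
# cut_document() used above is defined elsewhere; from its use it appears to
# modify the document in place and return 1 if a cut was made, 0 otherwise.
# A hedged sketch under that assumption; the max_chars limit and the helper
# name are hypothetical:
def example_cut_document(doc, max_chars=100000):
    """Truncate doc.text to max_chars; return 1 if cut, else 0."""
    if len(doc.text) <= max_chars:
        return 0
    doc.text = doc.text[:max_chars]
    return 1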
def convert_to_standoff(doc_fn, tag_fn, out_dir, options):
    NOTE_TYPE = 'AnnotatorNotes'
    with open_file(doc_fn, 'r', options) as doc_f:
        doc_reader = DocReader(doc_f)
        with open_file(tag_fn, 'r', options) as tag_f:
            # Read spans that include source information
            span_reader = SpanReader(tag_f, source=True)
            for doc in doc_reader:
                spans = list(span_reader.document_spans(doc.id))
                try:
                    convert_single(doc, spans, out_dir, options)
                except Exception as e:
                    error(f'failed to convert {doc.id}: {e}')
                    raise
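
# convert_single() used above is defined elsewhere. The NOTE_TYPE constant
# suggests brat standoff output (a .txt file plus a .ann file with textbound
# "T" lines and "#" note lines). A minimal sketch under that assumption,
# further assuming spans carry type, text and source attributes; the helper
# name and file layout are illustrative only:
import os

def example_write_standoff(doc, spans, out_dir, note_type='AnnotatorNotes'):
    """Write doc text and spans in brat standoff format (sketch)."""
    with open(os.path.join(out_dir, f'{doc.id}.txt'), 'w') as txt_f:
        txt_f.write(doc.text)
    with open(os.path.join(out_dir, f'{doc.id}.ann'), 'w') as ann_f:
        for i, span in enumerate(spans, start=1):
            # brat textbounds use exclusive end offsets, hence end+1
            ann_f.write(f'T{i}\t{span.type} {span.start} {span.end+1}\t'
                        f'{span.text}\n')
            if getattr(span, 'source', None):
                # attach the source tagger as an annotator note
                ann_f.write(f'#{i}\t{note_type} T{i}\t{span.source}\n')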
def compare_spans(doc_fn, tag_fns, names, doc_out, tag_out, options):
    if names is None:
        names = tag_fns
    doc_count = 0
    stats = Stats(names)
    with open_file(doc_fn, 'r', options) as doc_f:
        doc_reader = DocReader(doc_f)
        tag_fs = []
        for tag_fn in tag_fns:
            tag_fs.append(open_file(tag_fn, 'r', options))
        span_readers = [
            SpanReader(tag_f, source=name)
            for tag_f, name in zip(tag_fs, names)
        ]
        for doc_idx, doc in enumerate(doc_reader):
            if options.max_docs and doc_count >= options.max_docs:
                break
            spans = [r.document_spans(doc.id) for r in span_readers]
            spans = [validate_spans(doc.id, doc.text, s) for s in spans]
            spans = [filter_spans(s, options) for s in spans]
            spans = [deduplicate_spans(s, options) for s in spans]
            selected_for_output = False
            for i in range(len(spans)):
                for j in range(i + 1, len(spans)):
                    doc_stats = compare_document_spans(
                        doc.id, names[i], names[j], spans[i], spans[j],
                        options)
                    stats.add_stats(doc_stats)
                    if select_document_for_output(doc, doc_stats, options):
                        selected_for_output = True
            if (selected_for_output and
                    (options.sample is None or
                     random.random() < options.sample)):
                print(doc, file=doc_out)
                for s in (s for sp in spans for s in sp):
                    print(s, file=tag_out)
            doc_count += 1
            if doc_count % 10000 == 0:
                print(f'processed {doc_count} documents', file=sys.stderr,
                      flush=True)
            if (options.save_interval and
                    doc_count % options.save_interval == 0):
                save_results(options.output, stats, options)
                doc_out.flush()
                tag_out.flush()
    stats.trim()
    save_results(options.output, stats, options)
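
# compare_document_spans() used above is defined elsewhere. A rough sketch of
# the kind of per-document, pairwise statistics it is assumed to produce
# (exact-match agreement between two span sets), with a hypothetical name,
# a simplified return value, and the assumption that spans carry a type
# attribute:
def example_compare_document_spans(spans_a, spans_b):
    """Count spans unique to each set and common to both (exact match)."""
    key = lambda s: (s.start, s.end, s.type)
    keys_a, keys_b = {key(s) for s in spans_a}, {key(s) for s in spans_b}
    return {
        'common': len(keys_a & keys_b),
        'only_a': len(keys_a - keys_b),
        'only_b': len(keys_b - keys_a),
    }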
def filter_documents(doc_fn, out_fn, ids, options):
    out_count = 0
    with open_file(doc_fn, 'r', options) as doc_f:
        doc_reader = DocReader(doc_f)
        with open_file(out_fn, 'w', options) as out_f:
            for doc_idx, doc in enumerate(doc_reader):
                if doc.id in ids:
                    print(doc, file=out_f, flush=True)
                    out_count += 1
                if (doc_idx + 1) % 100000 == 0:
                    print(f'processed {doc_idx+1}, output {out_count}',
                          file=sys.stderr)
    doc_count = doc_idx + 1
    print(f'output {out_count}/{doc_count} documents '
          f'({out_count/doc_count:.1%})', file=sys.stderr)
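
# filter_documents() expects ids to support fast membership tests. A small
# sketch of how such an id set might be loaded from a file with one document
# id per line; example_load_id_set is a hypothetical helper, not part of this
# module:
def example_load_id_set(id_fn):
    """Read one document id per line into a set for O(1) lookups."""
    with open(id_fn) as id_f:
        return {line.strip() for line in id_f if line.strip()}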