def take_stats(txt, ann, fn, stats, options):
    """Accumulate annotation statistics for one document.

    Parses brat-style standoff lines from `ann` (only "T" textbound and
    "N" normalization lines are handled; anything else is logged and
    skipped), updates the nested counters in `stats` in place, and
    records whether the document's spans are mutually consistent.

    Args:
        txt: document text (unused here; kept for interface compatibility).
        ann: standoff annotation content as a single string.
        fn: source filename, used only in log messages.
        stats: mapping of category key -> Counter-like mapping, updated
            in place.
        options: parsed options; `options.taxdata`, when not None,
            provides taxonomy rank/division lookups.
    """
    annotations = []
    for ln, line in enumerate(ann.splitlines(), start=1):
        if not line or line.isspace() or line[0] not in 'TN':
            info('skipping line {} in {}: {}'.format(ln, fn, line))
            # fix: without this `continue`, an empty line crashes on
            # line[0] below and other skipped lines hit the assert.
            continue
        if line[0] == 'T':
            id_, type_span, text = line.split('\t')
            type_, span = type_span.split(' ', 1)
            stats[ENTITY_TYPE][type_] += 1
            stats[ENTITY_TEXT][text] += 1
            stats[TEXT_BY_TYPE.format(type_)][text] += 1
            stats[TOTALS]['textbounds'] += 1
            # multi-segment ("fragmented") spans use ";" as separator
            if len(span.split(';')) > 1:
                stats[FRAGMENTED_SPAN][type_] += 1
            annotations.append(Textbound(id_, type_, span, text))
        elif line[0] == 'N':
            id_, type_rid_tid, text = line.split('\t')
            type_, rid, tid = type_rid_tid.split(' ')
            if (tid.startswith(TAXONOMY_PREFIX) and
                options.taxdata is not None):
                tax_id = tid[len(TAXONOMY_PREFIX):]
                rank = options.taxdata.get_rank(tax_id)
                if rank == '<UNKNOWN>':
                    stats[TAXONOMY_UNKNOWN][tax_id] += 1
                division = options.taxdata.get_division(tax_id)
                stats[TAXONOMY_RANK][rank] += 1
                stats[TAXONOMY_DIV][division] += 1
                stats[TAXONOMY_RANK_DIV]['/'.join([rank, division])] += 1
                stats[TEXT_BY_RANK.format(rank)][text] += 1
            stats[TOTALS]['normalizations'] += 1
        else:
            assert False, 'internal error'
    stats[TOTALS]['documents'] += 1

    # Span-level consistency: identical spans with conflicting types and
    # crossing spans make the document inconsistent; containment is
    # recorded but allowed.
    is_consistent = True
    overlapping = find_overlapping(annotations)
    for t1, t2 in overlapping:
        sorted_types = '{}-{}'.format(*sorted([t1.type, t2.type]))
        if t1.span_matches(t2):
            # fix: comparison was `==`, contradicting its own comment and
            # flagging same-type duplicates while ignoring true conflicts.
            if t1.type != t2.type:    # same span, different types
                is_consistent = False
                stats[SAME_SPAN][sorted_types] += 1
                stats[SAME_SPAN_TEXT][t1.text] += 1
        elif t1.contains(t2):
            stats[CONTAINMENT]['{} in {}'.format(t2.type, t1.type)] += 1
            stats[CONTAINMENT_TEXT]['{} in {}'.format(t2.text, t1.text)] += 1
        elif t2.contains(t1):
            stats[CONTAINMENT]['{} in {}'.format(t1.type, t2.type)] += 1
            stats[CONTAINMENT_TEXT]['{} in {}'.format(t1.text, t2.text)] += 1
        elif t1.span_crosses(t2):
            is_consistent = False
            stats[CROSSING_SPAN]['{}/{}'.format(t1.type, t2.type)] += 1
            stats[CROSSING_SPAN_TEXT]['{}/{}'.format(t1.text,
                                                     t2.text)] += 1
        else:
            assert False, 'internal error'
    if is_consistent:
        stats[CONSISTENCY]['consistent'] += 1
    else:
        stats[CONSISTENCY]['inconsistent'] += 1
def standoffs(self, index):
    """Return sentence annotations as a list of Standoff objects.

    Textbound ids are assigned sequentially starting at `index`; spans
    keep their document-level offsets while the surface string is taken
    from the sentence-local text.
    """
    results = []
    for offset, (type_, start, end) in enumerate(self.get_tagged()):
        # map document-relative offsets into this sentence's text
        local_start = start - self.base_offset
        local_end = end - self.base_offset
        surface = self.text[local_start:local_end]
        results.append(
            Textbound('T%d' % (index + offset), type_, start, end, surface))
    return results
def make_textbound(type_, span_str, text):
    """Create a Textbound covering the full extent of `span_str`.

    A fragmented span ("s1 e1;s2 e2;...") is merged into one contiguous
    (min start, max end) span, with a warning about the replacement.
    """
    fragments = []
    for frag in span_str.split(';'):
        frag_start, frag_end = (int(tok) for tok in frag.split())
        fragments.append((frag_start, frag_end))
    starts, ends = zip(*fragments)
    start, end = min(starts), max(ends)
    if len(fragments) > 1:
        warning('replacing fragmented span {} with {} {}'.format(
            span_str, start, end))
    return Textbound(generate_id('T'), type_, start, end, text)
def mentions_to_standoffs(mentions, options):
    """Convert mentions into standoff annotations.

    Mentions sharing (start, end, type, text) collapse into a single
    textbound that carries one normalization per grouped mention.
    Returns the combined list of Textbound and Normalization objects.
    """
    # Group mentions that denote the same textbound.
    by_span = defaultdict(list)
    for mention in mentions:
        key = (mention.start, mention.end, mention.typename, mention.text)
        by_span[key].append(mention)
    annotations = []
    t_number, n_number = 0, 0
    for (start, end, type_, text), members in sorted(by_span.items()):
        t_number += 1
        t_id = 'T{}'.format(t_number)
        annotations.append(Textbound(t_id, type_, start, end, text))
        for mention in members:
            n_number += 1
            name = get_norm_name(mention.serial, mention.text, options)
            # if we have a species name, add it to the norm text
            if mention.species:
                name = name + ' ({})'.format(mention.species)
            ref = get_norm_id(mention.serial,
                              'TAGGER:{}'.format(mention.serial), options)
            ref = rewrite_norm_id(ref, type_, mention.species)
            annotations.append(
                Normalization('N{}'.format(n_number), t_id, ref, name))
    return annotations