示例#1
0
 def sent_rm_pos(sent):
     """Drop single-token annotations whose FinnPOS properties match a rule.

     Every ``./annotations/annotation`` element below *sent* is examined.
     Annotations whose length (the second value returned by
     ``get_ann_pos``) is not 1 are always kept.  For the others, the item at
     index 1 of the FinnPOS entry for the annotation's token is handed to
     each predicate in ``to_remove``; the annotation is dropped as soon as
     one predicate is truthy.  The full list and the surviving list are then
     passed to ``trim_anns``.
     """
     analyses = get_finnpos_analys(sent)
     all_anns = sent.xpath("./annotations/annotation")

     def is_unwanted(ann):
         # Only annotations covering exactly one token are candidates.
         tok_idx, length = get_ann_pos(ann)
         if length != 1:
             return False
         tags = analyses[tok_idx][1]
         return any(rule(tags) for rule in to_remove)

     survivors = [ann for ann in all_anns if not is_unwanted(ann)]
     trim_anns(all_anns, survivors)
示例#2
0
 def sent_span_dom(sent):
     """Reduce the annotations of *sent* to those chosen by ``greedy_max_span``.

     Annotations are grouped by the first value returned by ``get_ann_pos``;
     each group holds ``(length, annotation)`` pairs, and the resulting
     mapping is given to ``greedy_max_span``.  When ``sup_only`` is truthy,
     annotations for which ``HasSupportTournament.rank`` is falsy take no
     part in that selection and are appended unchanged to its result.  The
     full list and the final selection are then passed to ``trim_anns``.
     """
     all_anns = sent.xpath("./annotations/annotation")

     def bypasses_selection(ann):
         # Only relevant in sup_only mode: unranked annotations skip the
         # span competition entirely.
         return sup_only and not HasSupportTournament.rank(ann)

     by_token = {}
     for ann in all_anns:
         if bypasses_selection(ann):
             continue
         tok_idx, length = get_ann_pos(ann)
         by_token.setdefault(tok_idx, []).append((length, ann))

     selected = greedy_max_span(by_token)
     if sup_only:
         # Re-attach the annotations that were held out of the selection.
         selected.extend(
             ann for ann in all_anns if not HasSupportTournament.rank(ann)
         )
     trim_anns(all_anns, selected)
示例#3
0
 def sent_rm_ambg(sent):
     """Remove every annotation whose span is shared by another annotation.

     The span of each ``./annotations/annotation`` element below *sent* is
     whatever ``get_ann_pos`` returns for it (it must be hashable).  Any
     span that occurs two or more times is treated as ambiguous and all
     annotations carrying it are dropped; annotations with a unique span
     are kept in their original order.  The full list and the surviving
     list are then passed to ``trim_anns``.
     """
     from collections import Counter

     anns = sent.xpath("./annotations/annotation")
     # Compute each span once and reuse it for both counting and filtering.
     spans = [get_ann_pos(ann) for ann in anns]
     span_counts = Counter(spans)
     new_anns = [
         ann for ann, span in zip(anns, spans) if span_counts[span] < 2
     ]
     trim_anns(anns, new_anns)
示例#4
0
 def sent_span_dom(sent):
     """Reduce the annotations of *sent* using character-offset based spans.

     Each ``./annotations/annotation`` element has a start offset (the
     ``"char"`` entry of ``get_ann_pos_dict(ann)``, converted to ``int``)
     and an anchor length (``len(ann.attrib["anchor"])``).  Its span is the
     number of annotation start offsets, duplicates included, that lie in
     the closed interval ``[start, start + anchor_len]``.  Annotations are
     grouped by start offset as ``(span, annotation)`` pairs, the mapping is
     given to ``greedy_max_span``, and the full list plus the selection are
     passed to ``trim_anns``.

     Raises:
         KeyError: if an annotation has no ``anchor`` attribute or its
             position dict has no ``"char"`` entry.
     """
     from bisect import bisect_left, bisect_right

     anns = sent.xpath("./annotations/annotation")

     # Gather (start offset, anchor length, annotation) in a single pass so
     # the position helper is invoked only once per annotation.
     located = []
     for ann in anns:
         start = int(get_ann_pos_dict(ann)["char"])
         located.append((start, len(ann.attrib["anchor"]), ann))

     starts = sorted(start for start, _, _ in located)

     char_positions = {}
     for start, anchor_len, ann in located:
         # ``starts`` is sorted, so binary search finds the first occurrence
         # of this start and the end of the run of offsets that are
         # <= start + anchor_len (the upper bound is inclusive).
         first_idx = bisect_left(starts, start)
         end_idx = bisect_right(starts, start + anchor_len)
         span = end_idx - first_idx
         char_positions.setdefault(start, []).append((span, ann))

     new_anns = greedy_max_span(char_positions)
     trim_anns(anns, new_anns)