def _process(self, input_pack: DataPack):
    """Remove duplicate ``WikiAnchor`` entries that share the same span.

    Anchors are grouped by ``(span.begin, span.end)``. For every group
    holding more than one anchor, the first anchor yielded by
    ``input_pack.get`` is kept and the remaining ones are deleted from
    the pack. Groups with more than two copies are also reported through
    ``logging.error``.

    Args:
        input_pack: The pack whose anchors are de-duplicated in place.
    """
    anchors_by_span = defaultdict(list)

    anchor: WikiAnchor
    for anchor in input_pack.get(WikiAnchor):
        anchors_by_span[(anchor.span.begin, anchor.span.end)].append(anchor)

    # All anchors are collected above before anything is deleted below.
    for duplicates in anchors_by_span.values():
        if len(duplicates) <= 1:
            continue

        if len(duplicates) > 2:
            # Report and carry on: an interactive debugger breakpoint here
            # would stall (or crash) any unattended run of the pipeline.
            logging.error(
                "There are links that have more than 2 copies. "
                "pack=%s target_page=%s copies=%d",
                input_pack.pack_name,
                duplicates[0].target_page_name,
                len(duplicates),
            )

        # Keep the first anchor of the group, drop every other copy.
        for extra in duplicates[1:]:
            input_pack.delete_entry(extra)
def _process(self, input_pack: DataPack):
    """Delete every ``Hopper`` entry found in ``input_pack``.

    Args:
        input_pack: The pack to remove the entries from (modified in place).
    """
    # Take a full snapshot before deleting anything -- presumably so the
    # pack is not modified while the ``get`` iterator is still being consumed.
    to_remove = tuple(input_pack.get(Hopper))

    for entry in to_remove:
        input_pack.delete_entry(entry)