def initialize(self, args, raw_path, cache_path, companion_path, precomputed_dataset=None): print("Caching the dataset...\n", flush=True) data = utils.load_dataset(raw_path, framework=self.framework) utils.add_companion(data, companion_path, self.language) utils.tokenize(data, mode="aggressive") if self.language == "deu": translator = Translator("de", "en") for i, sentence in enumerate(data.values()): sentence["lemmas"] = translator.translate(sentence["lemmas"]) print(f"{i} / {len(data)}", flush=True) for node, sentence in utils.node_generator(data): assert "label" not in node or '│' not in node["label"] for sentence in data.values(): self.relations_to_edges(sentence) for node in sentence["nodes"]: if "label" not in node: node["anchors"] = [] node["label"] = "<SCOPE>" else: node["anchors"] = list(range(len(sentence["input"]))) if node["label"][0] == '"' and node["label"][-1] == '"': node["property"] = True node["label"] = node["label"][1:-1] # create relative labels if precomputed_dataset is None: utils.create_possible_rules(data, DRGParser._create_possible_rules, prune=False) rule_set = utils.get_smallest_rule_set(data, approximate=False) else: utils.create_possible_rules(data, DRGParser._create_possible_rules, prune=False) rule_set = set(r[2] for e in precomputed_dataset.values() for n in e["nodes"] for r in n["possible rules"][1]) print(f" -> # relative labels: {len(rule_set)}\n", flush=True) for n, _ in utils.node_generator(data): n["possible rules"] = [item for item in n["possible rules"] if item["rule"] in rule_set] if precomputed_dataset is None: utils.change_unnecessary_relative_rules(data) rule_counter = Counter() for n, d in utils.node_generator(data): rule_counter.update((item["rule"] for item in n["possible rules"])) for rule, count in rule_counter.most_common(): print(f"- `{rule}`: {count}") print(flush=True) with open(cache_path, "w", encoding="utf8") as f: for example in data.values(): json.dump(example, f, ensure_ascii=False) f.write("\n")
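# Illustrative sketch (not part of the parser; the function name and toy input are
# assumptions): the node normalization performed by the loop above, applied to a toy
# sentence whose "input" field is assumed to hold the tokenized sentence.  Unlabeled
# nodes become "<SCOPE>" nodes without anchors; labeled nodes are anchored to every
# token, and quoted labels are marked as properties with the quotes stripped.
def _normalize_drg_nodes_sketch(sentence):
    for node in sentence["nodes"]:
        if "label" not in node:
            node["anchors"] = []
            node["label"] = "<SCOPE>"
        else:
            node["anchors"] = list(range(len(sentence["input"])))
            if node["label"][0] == '"' and node["label"][-1] == '"':
                node["property"] = True
                node["label"] = node["label"][1:-1]
    return sentence

# Example:
#   _normalize_drg_nodes_sketch({"input": ["John", "sleeps"],
#                                "nodes": [{"id": 0}, {"id": 1, "label": '"John"'}]})
#   -> {'input': ['John', 'sleeps'],
#       'nodes': [{'id': 0, 'anchors': [], 'label': '<SCOPE>'},
#                 {'id': 1, 'anchors': [0, 1], 'property': True, 'label': 'John'}]}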
def initialize(self, args, raw_path, cache_path, companion_path, precomputed_dataset=None): print("Caching the dataset...\n", flush=True) data = utils.load_dataset(raw_path, framework=self.framework) utils.add_companion(data, companion_path, self.language) utils.tokenize(data, mode="aggressive") utils.anchor_ids_from_intervals(data) for node, _ in utils.node_generator(data): if "label" not in node: node["label"] = "<TOP>" node["label"] = node["label"].replace('\\/', '/') assert '│' not in node["label"] # create relative labels if precomputed_dataset is None: utils.create_possible_rules(data, DcrParser._create_possible_rules, prune=True) rule_set = utils.get_smallest_rule_set( data, approximate=self.language == "eng") else: utils.create_possible_rules(data, DcrParser._create_possible_rules, prune=False) rule_set = set(r[2] for e in precomputed_dataset.values() for n in e["nodes"] for r in n["possible rules"][1]) print(f" -> # relative labels: {len(rule_set)}\n", flush=True) for n, _ in utils.node_generator(data): n["possible rules"] = [ item for item in n["possible rules"] if item["rule"] in rule_set ] utils.change_unnecessary_relative_rules(data) rule_counter = Counter() for n, _ in utils.node_generator(data): rule_counter.update((item["rule"] for item in n["possible rules"])) for rule, count in rule_counter.most_common(): print(f"- `{rule}`: {count}") print(flush=True) with open(cache_path, "w", encoding="utf8") as f: for example in data.values(): json.dump(example, f, ensure_ascii=False) f.write("\n")
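# Minimal sketch (hypothetical helper, not part of the original code) for reading the
# cache written above: each example is serialized as one JSON object per line (JSON
# Lines), so the file can be streamed back without loading everything at once.
import json

def load_cached_dataset(cache_path):
    with open(cache_path, "r", encoding="utf8") as f:
        for line in f:
            line = line.strip()
            if line:
                yield json.loads(line)

# Usage:
#   for example in load_cached_dataset(cache_path):
#       ...  # each example comes back as a plain dict, exactly as it was dumped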
def __init__(self, args, framework: str, language: str, fields):
    path = args.test_data[(framework, language)]

    self.data = utils.load_dataset(path, framework=framework, language=language)
    utils.add_companion(self.data, args.companion_data[(framework, language)], language)
    utils.tokenize(self.data, mode="aggressive")

    for sentence in self.data.values():
        sentence["token anchors"] = [[a["from"], a["to"]] for a in sentence["token anchors"]]

    utils.create_bert_tokens(self.data, args.encoder)

    super(EvaluationParser, self).__init__(fields, self.data)
def initialize(self, args, raw_path, cache_path, companion_path):
    print("Caching the dataset...", flush=True)

    data = utils.load_dataset(raw_path, framework=self.framework)
    utils.add_companion(data, companion_path, self.language)
    utils.tokenize(data, mode="aggressive")

    # divide the nodes into leaf and inner nodes and induce anchors for the inner nodes
    for sentence in data.values():
        out_edges = [[] for _ in sentence["nodes"]]
        in_edges = [[] for _ in sentence["nodes"]]
        for edge in sentence["edges"]:
            out_edges[edge["source"]].append(edge["target"])
            in_edges[edge["target"]].append(edge["source"])

        leaves = []
        for node in sentence["nodes"]:
            if len(out_edges[node["id"]]) == 0:
                node["label"] = "leaf"
                leaves.append(node["id"])
            else:
                node["label"] = "inner"

            if "anchors" not in node:
                node["anchors"] = []
            node["anchors"] = {(a["from"], a["to"]): a for a in node["anchors"]}
            node["parents"] = set()

        # top-down pass: record each node's depth and its set of parents
        depth = 0
        layer = [sentence["tops"][0]]
        while len(layer) > 0:
            new_layer = []
            for n in layer:
                sentence["nodes"][n]["depth"] = depth
                for child in out_edges[n]:
                    if n not in sentence["nodes"][child]["parents"]:
                        sentence["nodes"][child]["parents"].add(n)
                        new_layer.append(child)
            depth += 1
            layer = new_layer

        # bottom-up pass: propagate anchors from the leaves to the inner nodes
        layer = leaves
        while len(layer) > 0:
            new_layer = []
            for n in layer:
                for parent in in_edges[n]:
                    for a in sentence["nodes"][n]["anchors"].keys():
                        if a not in sentence["nodes"][parent]["anchors"]:
                            new_layer.append(parent)
                            break
                    sentence["nodes"][parent]["anchors"].update(sentence["nodes"][n]["anchors"])
            layer = new_layer

        for node in sentence["nodes"]:
            node["anchors"] = list(node["anchors"].values())
            del node["parents"]

    utils.anchor_ids_from_intervals(data)

    with open(cache_path, "w", encoding="utf8") as f:
        for sentence in data.values():
            json.dump(sentence, f, ensure_ascii=False)
            f.write("\n")
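# Standalone sketch (hypothetical, simplified from the method above) of the bottom-up
# anchor induction: starting from the leaves, every inner node accumulates the union of
# its descendants' anchor intervals, layer by layer, until no node gains new anchors.
def propagate_anchors_sketch(anchors, edges):
    # anchors: {node id: set of (from, to) intervals}; edges: list of (source, target) pairs
    out_edges = {n: [] for n in anchors}
    in_edges = {n: [] for n in anchors}
    for source, target in edges:
        out_edges[source].append(target)
        in_edges[target].append(source)

    layer = [n for n in anchors if not out_edges[n]]  # start from the leaves
    while layer:
        new_layer = []
        for n in layer:
            for parent in in_edges[n]:
                if not anchors[n] <= anchors[parent]:  # parent is still missing some anchors
                    new_layer.append(parent)
                anchors[parent] |= anchors[n]
        layer = new_layer
    return anchors

# Example: node 0 is the top with children 1 and 2; only the leaves carry anchors.
#   propagate_anchors_sketch({0: set(), 1: {(0, 4)}, 2: {(5, 11)}}, [(0, 1), (0, 2)])
#   -> {0: {(0, 4), (5, 11)}, 1: {(0, 4)}, 2: {(5, 11)}}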