Example #1
    def initialize(self, args, raw_path, cache_path, companion_path, precomputed_dataset=None):
        print("Caching the dataset...\n", flush=True)

        data = utils.load_dataset(raw_path, framework=self.framework)

        utils.add_companion(data, companion_path, self.language)
        utils.tokenize(data, mode="aggressive")

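        # for German data, translate the lemmas into English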
        if self.language == "deu":
            translator = Translator("de", "en")
            for i, sentence in enumerate(data.values()):
                sentence["lemmas"] = translator.translate(sentence["lemmas"])
                print(f"{i} / {len(data)}", flush=True)

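        # sanity check: node labels must not contain the '│' character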
        for node, sentence in utils.node_generator(data):
            assert "label" not in node or '│' not in node["label"]

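        # convert relations into edges; unlabeled nodes become <SCOPE> placeholders,
        # labeled nodes are anchored to all input tokens, and quoted labels are marked as properties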
        for sentence in data.values():
            self.relations_to_edges(sentence)

            for node in sentence["nodes"]:
                if "label" not in node:
                    node["anchors"] = []
                    node["label"] = "<SCOPE>"
                else:
                    node["anchors"] = list(range(len(sentence["input"])))

                    if node["label"][0] == '"' and node["label"][-1] == '"':
                        node["property"] = True
                        node["label"] = node["label"][1:-1]

        # create relative labels

        if precomputed_dataset is None:
            utils.create_possible_rules(data, DRGParser._create_possible_rules, prune=False)
            rule_set = utils.get_smallest_rule_set(data, approximate=False)
        else:
            utils.create_possible_rules(data, DRGParser._create_possible_rules, prune=False)
            rule_set = set(r[2] for e in precomputed_dataset.values() for n in e["nodes"] for r in n["possible rules"][1])

        print(f" -> # relative labels: {len(rule_set)}\n", flush=True)

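        # keep only the candidate rules that appear in the selected rule set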
        for n, _ in utils.node_generator(data):
            n["possible rules"] = [item for item in n["possible rules"] if item["rule"] in rule_set]

        if precomputed_dataset is None:
            utils.change_unnecessary_relative_rules(data)

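        # print the frequency of every rule for inspection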
        rule_counter = Counter()
        for n, _ in utils.node_generator(data):
            rule_counter.update((item["rule"] for item in n["possible rules"]))

        for rule, count in rule_counter.most_common():
            print(f"- `{rule}`: {count}")
        print(flush=True)

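        # cache the processed examples, one JSON object per line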
        with open(cache_path, "w", encoding="utf8") as f:
            for example in data.values():
                json.dump(example, f, ensure_ascii=False)
                f.write("\n")
Example #2
    def initialize(self,
                   args,
                   raw_path,
                   cache_path,
                   companion_path,
                   precomputed_dataset=None):
        print("Caching the dataset...\n", flush=True)

        data = utils.load_dataset(raw_path, framework=self.framework)

        utils.add_companion(data, companion_path, self.language)
        utils.tokenize(data, mode="aggressive")
        utils.anchor_ids_from_intervals(data)

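        # normalize labels: missing labels become <TOP>, escaped slashes are unescaped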
        for node, _ in utils.node_generator(data):
            if "label" not in node:
                node["label"] = "<TOP>"
            node["label"] = node["label"].replace('\\/', '/')
            assert '│' not in node["label"]

        # create relative labels

        if precomputed_dataset is None:
            utils.create_possible_rules(data,
                                        DcrParser._create_possible_rules,
                                        prune=True)
            rule_set = utils.get_smallest_rule_set(
                data, approximate=self.language == "eng")
        else:
            utils.create_possible_rules(data,
                                        DcrParser._create_possible_rules,
                                        prune=False)
            rule_set = set(r[2] for e in precomputed_dataset.values()
                           for n in e["nodes"] for r in n["possible rules"][1])

        print(f" -> # relative labels: {len(rule_set)}\n", flush=True)

        for n, _ in utils.node_generator(data):
            n["possible rules"] = [
                item for item in n["possible rules"]
                if item["rule"] in rule_set
            ]

        utils.change_unnecessary_relative_rules(data)

        rule_counter = Counter()
        for n, _ in utils.node_generator(data):
            rule_counter.update((item["rule"] for item in n["possible rules"]))

        for rule, count in rule_counter.most_common():
            print(f"- `{rule}`: {count}")
        print(flush=True)

        with open(cache_path, "w", encoding="utf8") as f:
            for example in data.values():
                json.dump(example, f, ensure_ascii=False)
                f.write("\n")
Example #3
    def __init__(self, args, framework: str, language: str, fields):
        path = args.test_data[(framework, language)]
        self.data = utils.load_dataset(path,
                                       framework=framework,
                                       language=language)

        utils.add_companion(self.data,
                            args.companion_data[(framework, language)],
                            language)
        utils.tokenize(self.data, mode="aggressive")

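        # flatten token anchor intervals into [from, to] pairs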
        for sentence in self.data.values():
            sentence["token anchors"] = [[a["from"], a["to"]]
                                         for a in sentence["token anchors"]]

        utils.create_bert_tokens(self.data, args.encoder)

        super(EvaluationParser, self).__init__(fields, self.data)
Example #4
    def initialize(self, args, raw_path, cache_path, companion_path):
        print("Caching the dataset...", flush=True)

        data = utils.load_dataset(raw_path, framework=self.framework)
        utils.add_companion(data, companion_path, self.language)
        utils.tokenize(data, mode="aggressive")

        # divide nodes into leaf and inner nodes and induce inner anchors

        for sentence in data.values():
            out_edges = [[] for _ in sentence["nodes"]]
            in_edges = [[] for _ in sentence["nodes"]]

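            # build adjacency lists of outgoing and incoming edges per node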
            for edge in sentence["edges"]:
                out_edges[edge["source"]].append(edge["target"])
                in_edges[edge["target"]].append(edge["source"])

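            # mark each node as a leaf or inner node based on its out-degree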
            leaves = []
            for node in sentence["nodes"]:
                if len(out_edges[node["id"]]) == 0:
                    node["label"] = "leaf"
                    leaves.append(node["id"])
                else:
                    node["label"] = "inner"
                node["label"] = "leaf" if len(
                    out_edges[node["id"]]) == 0 else "inner"

                if "anchors" not in node:
                    node["anchors"] = []

                node["anchors"] = {(a["from"], a["to"]): a
                                   for a in node["anchors"]}
                node["parents"] = set()

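            # breadth-first pass from the top node to assign a depth to every node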
            depth = 0
            layer = [sentence["tops"][0]]
            while len(layer) > 0:
                new_layer = []
                for n in layer:
                    sentence["nodes"][n]["depth"] = depth
                    for child in out_edges[n]:
                        if n not in sentence["nodes"][child]["parents"]:
                            sentence["nodes"][child]["parents"].add(n)
                            new_layer.append(child)
                depth += 1
                layer = new_layer

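            # propagate anchors upwards from the leaves so each inner node
            # inherits the anchors of its descendants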
            layer = leaves
            while len(layer) > 0:
                new_layer = []
                for n in layer:
                    for parent in in_edges[n]:
                        for a in sentence["nodes"][n]["anchors"].keys():
                            if a not in sentence["nodes"][parent]["anchors"]:
                                new_layer.append(parent)
                                break
                        sentence["nodes"][parent]["anchors"].update(
                            sentence["nodes"][n]["anchors"])
                layer = new_layer

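            # convert anchors back to lists and drop the temporary parent sets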
            for node in sentence["nodes"]:
                node["anchors"] = list(node["anchors"].values())
                del node["parents"]

        utils.anchor_ids_from_intervals(data)

        with open(cache_path, "w", encoding="utf8") as f:
            for sentence in data.values():
                json.dump(sentence, f, ensure_ascii=False)
                f.write("\n")