Example #1
    def __init__(self, args, framework: str, language: str, part: str, fields, precomputed_dataset=None, filter_pred=None, **kwargs):
        assert part == "training" or part == "validation"
        path = args.training_data[(framework, language)] if part == "training" else args.validation_data[(framework, language)]

        self.framework = framework
        self.language = language

        cache_path = f"{path}_cache"
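        # Build the cache the first time only; below it is read back as one JSON object per line.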
        if not os.path.exists(cache_path):
            self.initialize(args, path, cache_path, args.companion_data[(framework, language)], precomputed_dataset=precomputed_dataset)

        print("Loading the cached dataset")

        self.data = {}
        with io.open(cache_path, encoding="utf8") as reader:
            for line in reader:
                sentence = json.loads(line)
                self.data[sentence["id"]] = sentence

                # For Chinese (zho), use the surface tokens in place of lemmas.
                if language == "zho":
                    sentence["lemmas"] = sentence["input"]

        self.node_counter, self.edge_counter, self.no_edge_counter = 0, 0, 0
        anchor_count, n_node_token_pairs = 0, 0

        # Reduce node properties to a single binary flag: whether the node originally carried a "property" key.
        for node, sentence in utils.node_generator(self.data):
            self.node_counter += 1
            node["properties"] = {"transformed": int("property" in node)}
            # assert len(node["anchors"]) > 0

        # Build anchor-aligned label rules and count them; label smoothing comes from the arguments.
        utils.create_aligned_rules(self.data, constrained_anchors=True)
        self.rule_counter = utils.count_rules(self.data, args.label_smoothing)

        # Tokenize for the configured encoder, assign each node its best matching rule as the label,
        # and precompute edge permutations based on node similarity.
        utils.create_bert_tokens(self.data, args.encoder)
        utils.assign_labels_as_best_rules(self.data, self.rule_counter)
        utils.create_edge_permutations(self.data, EDSParser.node_similarity_key)

        # create edge vectors
        for sentence in self.data.values():
            N = len(sentence["nodes"])

            # Count the labelled edges; every remaining ordered node pair counts as "no edge".
            edge_count = utils.create_edges(sentence, attributes=False, normalize=args.normalize)
            self.edge_counter += edge_count
            self.no_edge_counter += N * (N - 1) - edge_count

            # Anchor edges are (node index, token index) pairs, prefixed by the node and token counts.
            sentence["anchor edges"] = [N, len(sentence["input"]), []]
            for i, node in enumerate(sentence["nodes"]):
                for anchor in node["anchors"]:
                    sentence["anchor edges"][-1].append((i, anchor))

                anchor_count += len(node["anchors"])
                n_node_token_pairs += len(sentence["input"])

            sentence["id"] = [sentence["id"]]
            sentence["top"] = sentence["tops"][0]

        # Average fraction of node-token pairs that are joined by an anchor.
        self.anchor_freq = anchor_count / n_node_token_pairs
        self.input_count = sum(len(sentence["input"]) for sentence in self.data.values())

        super(EDSParser, self).__init__(fields, self.data, filter_pred)
Example #2
    def __init__(self,
                 args,
                 framework: str,
                 language: str,
                 part: str,
                 fields,
                 precomputed_dataset=None,
                 filter_pred=None,
                 **kwargs):
        assert part == "training" or part == "validation"
        path = (args.training_data[(framework, language)]
                if part == "training"
                else args.validation_data[(framework, language)])

        self.framework = framework
        self.language = language

        cache_path = f"{path}_cache"
        if not os.path.exists(cache_path):
            self.initialize(args,
                            path,
                            cache_path,
                            args.companion_data[(framework, language)],
                            precomputed_dataset=precomputed_dataset)

        print("Loading the cached dataset")

        self.data = {}
        with io.open(cache_path, encoding="utf8") as reader:
            for line in reader:
                sentence = json.loads(line)
                self.data[sentence["id"]] = sentence

        self.node_counter, self.edge_counter, self.no_edge_counter = 0, 0, 0
        anchor_count, n_node_token_pairs = 0, 0

        # Changed: give every node a single dummy property, as is done for UCCA.
        for node, sentence in utils.node_generator(self.data):
            self.node_counter += 1
            node["properties"] = {"dummy": 0}
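        # Disabled alternative kept for reference: collect every property key across nodes and fill missing ones with "<NONE>".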
        """
        property_keys = {p for node, _ in utils.node_generator(self.data) for p in node["properties"].keys()}
        for node, sentence in utils.node_generator(self.data):
            self.node_counter += 1

            properties = {}
            for key in property_keys:
                if key not in node["properties"]:
                    properties[key] = "<NONE>"
                    continue

                value = node["properties"][key]
                properties[key] = value[value.find('f'):] if key == "frame" else value

            node["properties"] = properties
        """

        utils.create_aligned_rules(self.data, constrained_anchors=True)
        self.rule_counter = utils.count_rules(self.data, args.label_smoothing)

        utils.create_bert_tokens(self.data, args.encoder)
        utils.assign_labels_as_best_rules(self.data, self.rule_counter)
        utils.create_edge_permutations(self.data,
                                       DcrParser.node_similarity_key)

        # create edge vectors
        for sentence in self.data.values():
            N = len(sentence["nodes"])

            # Unlike Example #1, edges here also carry attributes.
            edge_count = utils.create_edges(sentence, attributes=True)
            self.edge_counter += edge_count
            self.no_edge_counter += N * (N - 1) - edge_count

            sentence["anchor edges"] = [N, len(sentence["input"]), []]

            for i, node in enumerate(sentence["nodes"]):
                for anchor in node["anchors"]:
                    sentence["anchor edges"][-1].append((i, anchor))

                anchor_count += len(node["anchors"])
                n_node_token_pairs += len(sentence["input"])

            sentence["id"] = [sentence["id"]]
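            # The top node is hard-coded to index 0 here; no "tops" field is read.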
            sentence["top"] = 0
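            # Disabled debug dump of every sentence field: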
            """
            for k in sentence.keys():
                print(k)
                print(sentence[k])
                print("*****************************************")
            """

        self.anchor_freq = anchor_count / n_node_token_pairs
        self.input_count = sum(
            len(sentence["input"]) for sentence in self.data.values())

        super(DcrParser, self).__init__(fields, self.data, filter_pred)