def __init__(self, args, framework: str, language: str, part: str, fields, precomputed_dataset=None, filter_pred=None, **kwargs):
    assert part == "training" or part == "validation"
    path = args.training_data[(framework, language)] if part == "training" else args.validation_data[(framework, language)]

    self.framework = framework
    self.language = language

    # build the preprocessed cache on the first run, then always load from it
    cache_path = f"{path}_cache"
    if not os.path.exists(cache_path):
        self.initialize(args, path, cache_path, args.companion_data[(framework, language)], precomputed_dataset=precomputed_dataset)

    print("Loading the cached dataset")

    # load the cached sentences from the JSON-lines file, keyed by sentence id
    self.data = {}
    with io.open(cache_path, encoding="utf8") as reader:
        for line in reader:
            sentence = json.loads(line)
            self.data[sentence["id"]] = sentence

            # for Chinese, fall back to the raw tokens in place of lemmas
            if language == "zho":
                sentence["lemmas"] = sentence["input"]

    self.node_counter, self.edge_counter, self.no_edge_counter = 0, 0, 0
    anchor_count, n_node_token_pairs = 0, 0

    for node, sentence in utils.node_generator(self.data):
        self.node_counter += 1
        # single property flag: 1 if the node carries a "property" key, 0 otherwise
        node["properties"] = {"transformed": int("property" in node)}
        # assert len(node["anchors"]) > 0

    # derive anchor-constrained label rules and their (smoothed) frequencies
    utils.create_aligned_rules(self.data, constrained_anchors=True)
    self.rule_counter = utils.count_rules(self.data, args.label_smoothing)

    utils.create_bert_tokens(self.data, args.encoder)
    utils.assign_labels_as_best_rules(self.data, self.rule_counter)
    utils.create_edge_permutations(self.data, EDSParser.node_similarity_key)

    # create edge vectors
    for sentence in self.data.values():
        N = len(sentence["nodes"])

        edge_count = utils.create_edges(sentence, attributes=False, normalize=args.normalize)
        self.edge_counter += edge_count
        self.no_edge_counter += N * (N - 1) - edge_count

        # anchor edges are stored as [num_nodes, num_tokens, [(node_index, token_index), ...]]
        sentence["anchor edges"] = [N, len(sentence["input"]), []]
        for i, node in enumerate(sentence["nodes"]):
            for anchor in node["anchors"]:
                sentence["anchor edges"][-1].append((i, anchor))

            anchor_count += len(node["anchors"])
            n_node_token_pairs += len(sentence["input"])

        sentence["id"] = [sentence["id"]]
        sentence["top"] = sentence["tops"][0]

    self.anchor_freq = anchor_count / n_node_token_pairs
    self.input_count = sum(len(sentence["input"]) for sentence in self.data.values())

    super(EDSParser, self).__init__(fields, self.data, filter_pred)
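
# The loop above stores sentence["anchor edges"] as a triple
# [num_nodes, num_tokens, [(node_index, token_index), ...]].  The sketch below is
# not part of the original file (`densify_anchor_edges` is a hypothetical helper);
# it only illustrates how that triple could be expanded into a dense 0/1
# node-to-token alignment matrix.
def densify_anchor_edges(anchor_edges):
    """Turn [N, T, [(i, j), ...]] into an N x T list-of-lists 0/1 matrix."""
    n_nodes, n_tokens, pairs = anchor_edges
    matrix = [[0] * n_tokens for _ in range(n_nodes)]
    for node_index, token_index in pairs:
        matrix[node_index][token_index] = 1
    return matrix

# Example: two nodes over three tokens, node 0 anchored to tokens 0-1, node 1 to token 2.
# densify_anchor_edges([2, 3, [(0, 0), (0, 1), (1, 2)]])  ->  [[1, 1, 0], [0, 0, 1]]
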
def __init__(self, args, framework: str, language: str, part: str, fields, precomputed_dataset=None, filter_pred=None, **kwargs):
    assert part == "training" or part == "validation"
    path = args.training_data[(framework, language)] if part == "training" else args.validation_data[(framework, language)]

    self.framework = framework
    self.language = language

    # build the preprocessed cache on the first run, then always load from it
    cache_path = f"{path}_cache"
    if not os.path.exists(cache_path):
        self.initialize(args, path, cache_path, args.companion_data[(framework, language)], precomputed_dataset=precomputed_dataset)

    print("Loading the cached dataset")

    # load the cached sentences from the JSON-lines file, keyed by sentence id
    self.data = {}
    with io.open(cache_path, encoding="utf8") as reader:
        for line in reader:
            sentence = json.loads(line)
            self.data[sentence["id"]] = sentence

    self.node_counter, self.edge_counter, self.no_edge_counter = 0, 0, 0
    anchor_count, n_node_token_pairs = 0, 0

    # changed - add properties as done for UCCA: every node gets a single dummy property
    for node, sentence in utils.node_generator(self.data):
        self.node_counter += 1
        node["properties"] = {"dummy": 0}

    # previous variant (disabled): derive node properties from the annotation instead of the dummy flag
    # property_keys = {p for node, _ in utils.node_generator(self.data) for p in node["properties"].keys()}
    # for node, sentence in utils.node_generator(self.data):
    #     self.node_counter += 1
    #
    #     properties = {}
    #     for key in property_keys:
    #         if key not in node["properties"]:
    #             properties[key] = "<NONE>"
    #             continue
    #
    #         value = node["properties"][key]
    #         properties[key] = value[value.find('f'):] if key == "frame" else value
    #     node["properties"] = properties

    # derive anchor-constrained label rules and their (smoothed) frequencies
    utils.create_aligned_rules(self.data, constrained_anchors=True)
    self.rule_counter = utils.count_rules(self.data, args.label_smoothing)

    utils.create_bert_tokens(self.data, args.encoder)
    utils.assign_labels_as_best_rules(self.data, self.rule_counter)
    utils.create_edge_permutations(self.data, DcrParser.node_similarity_key)

    # create edge vectors
    for sentence in self.data.values():
        N = len(sentence["nodes"])

        edge_count = utils.create_edges(sentence, attributes=True)
        self.edge_counter += edge_count
        self.no_edge_counter += N * (N - 1) - edge_count

        # anchor edges are stored as [num_nodes, num_tokens, [(node_index, token_index), ...]]
        sentence["anchor edges"] = [N, len(sentence["input"]), []]
        for i, node in enumerate(sentence["nodes"]):
            for anchor in node["anchors"]:
                sentence["anchor edges"][-1].append((i, anchor))

            anchor_count += len(node["anchors"])
            n_node_token_pairs += len(sentence["input"])

        sentence["id"] = [sentence["id"]]
        sentence["top"] = 0

    self.anchor_freq = anchor_count / n_node_token_pairs
    self.input_count = sum(len(sentence["input"]) for sentence in self.data.values())

    super(DcrParser, self).__init__(fields, self.data, filter_pred)
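
# Both constructors accumulate edge statistics with the identity
# no_edges = N * (N - 1) - edge_count: a directed graph on N nodes without
# self-loops has N * (N - 1) ordered node pairs, so every pair not carrying an
# edge is counted as a "no edge" example.  A minimal sketch of that bookkeeping
# (hypothetical helper, not part of the original file):
def edge_statistics(n_nodes, edges):
    """Return (edge_count, no_edge_count) for a directed graph without self-loops."""
    edge_count = len(edges)
    no_edge_count = n_nodes * (n_nodes - 1) - edge_count
    return edge_count, no_edge_count

# Example: 3 nodes and 2 edges leave 3 * 2 - 2 = 4 unconnected ordered pairs.
# edge_statistics(3, [(0, 1), (1, 2)])  ->  (2, 4)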