Пример #1
0
def get_all_rels(file_name,
                 rel_json,
                 rel_neo_csv,
                 entities=None,
                 rel_csv_header=(":START_ID", ":END_ID", ":TYPE", "name"),
                 parse_line=parse_line):
    """Collect the relations of a triple file and export them as CSV and JSON.

    Each line of ``file_name`` is stripped and passed to ``parse_line``, which
    is expected to return an ``(ent1, rel, ent2)`` triple. Lines that fail to
    parse or unpack are printed and skipped.

    Args:
        file_name: Path handed to ``read_from_file``.
        rel_json: Target handed to ``dump_json`` for the relation dictionary.
        rel_neo_csv: Target handed to ``write_csv`` for the relation rows.
        entities: Mapping looked up by entity name to obtain the start/end
            ids of each row. When falsy it is rebuilt from ``file_name`` with
            ``get_all_entities`` using the same ``parse_line``.
        rel_csv_header: Header passed to ``write_csv``.
        parse_line: Callable turning one stripped line into a triple.

    Returns:
        The ``relations`` dictionary populated through ``incDic``
        (presumably relation name -> running index; confirm against
        ``incDic``).

    Raises:
        KeyError: If a parsed entity is missing from ``entities``.
    """
    lines = read_from_file(file_name)
    relations = {}
    idx = 0
    rel_data = []
    if not entities:
        # Forward parse_line so entities and relations come from the same parser.
        entities = get_all_entities(file_name, None, None,
                                    parse_line=parse_line)
    for raw_line in tqdm(lines):
        line = raw_line.strip()
        try:
            ent1, rel, ent2 = parse_line(line)
        except Exception:
            # Malformed line: report and skip. KeyboardInterrupt/SystemExit
            # are deliberately no longer swallowed here.
            print(line)
            continue
        if rel not in relations:
            incDic(relations, rel, idx)
            idx += 1
        rel_data.append((entities[ent1], entities[ent2], 'Relation', rel))
    logger.info("origin relationships: %d", len(rel_data))
    # Drop duplicate rows while keeping first-seen order, so the CSV is
    # reproducible between runs.
    rel_data = list(dict.fromkeys(rel_data))
    logger.info("duplicated data deleted: %d", len(rel_data))
    write_csv(rel_neo_csv, rel_data, rel_csv_header)
    dump_json(rel_json, relations)
    return relations
Пример #2
0
def get_all_entities(file_name,
                     entity_json,
                     ent_neo_csv,
                     ent_csv_header=("id:ID", "name", ":LABEL"),
                     parse_line=parse_line):
    """Collect the distinct entities of a triple file and export them.

    Each line of ``file_name`` is stripped and passed to ``parse_line``, which
    is expected to return an ``(ent1, rel, ent2)`` triple. Lines that fail to
    parse or unpack are printed (split on tabs) and skipped.

    Args:
        file_name: Path handed to ``read_from_file``.
        entity_json: Target handed to ``dump_json`` for the entity dictionary.
        ent_neo_csv: Target handed to ``write_csv``; when falsy no CSV is
            written.
        ent_csv_header: Header passed to ``write_csv``.
        parse_line: Callable turning one stripped line into a triple.

    Returns:
        The ``entities`` dictionary populated through ``incDic``.
    """
    lines = read_from_file(file_name)
    entities = {}
    idx = 0
    csv_data = []
    for raw_line in tqdm(lines):
        line = raw_line.strip()
        try:
            ent1, rel, ent2 = parse_line(line)
        except Exception:
            # Malformed line: report and skip. KeyboardInterrupt/SystemExit
            # are deliberately no longer swallowed here.
            print(line.split('\t'))
            continue
        # Head first, then tail, so index assignment order is unchanged.
        for ent in (ent1, ent2):
            if ent not in entities:
                incDic(entities, ent, idx)
                idx += 1
                # NOTE(review): the CSV id is written after the increment, so
                # it is one greater than the idx given to incDic. If incDic
                # stores idx as the entity id, CSV ids and dictionary ids
                # differ by one -- confirm this is intended.
                csv_data.append((str(idx), ent, 'Entity'))
    if ent_neo_csv:
        write_csv(ent_neo_csv, csv_data, ent_csv_header)
    dump_json(entity_json, entities)
    return entities
Пример #3
0
def split_train_valid_test_triplets(file_name,
                                    rel_tris_num,
                                    save_files,
                                    train_valid_test=(0.8, 0.1, 0.1),
                                    parse_line=parse_clean_line):
    """Split the triples of a file into train/valid/test sets per relation.

    Triples are grouped by relation. For every relation the first
    ``ceil(test_ratio * n)`` triples go to the test set, the next
    ``ceil(valid_ratio * n)`` to the validation set and the rest to the
    training set.

    Args:
        file_name: Path handed to ``read_from_file``.
        rel_tris_num: Target handed to ``dump_json`` for the per-relation
            triple counts (after ``sorted_dict``).
        save_files: Indexable of three targets: train, valid, test.
        train_valid_test: Ratios; only indexes 1 (valid) and 2 (test) are
            read, the training share is whatever remains.
        parse_line: Callable turning one stripped line into a triple.
    """
    lines = read_from_file(file_name)

    # Group every parsed triple under its relation.
    rel_with_tris = {}
    for raw_line in tqdm(lines):
        triple = parse_line(raw_line.strip())
        _head, relation, _tail = triple
        appendDic(rel_with_tris, relation, triple)

    train_triplets, valid_triplets, test_triplets = [], [], []
    rel_tri_nums = {}
    for relation, group in rel_with_tris.items():
        total = len(group)
        rel_tri_nums[relation] = total
        test_end = math.ceil(train_valid_test[2] * total)
        valid_end = math.ceil(train_valid_test[1] * total) + test_end
        # Slice boundaries: [0, test_end) test, [test_end, valid_end) valid,
        # [valid_end, total) train.
        train_triplets.extend(group[valid_end:])
        valid_triplets.extend(group[test_end:valid_end])
        test_triplets.extend(group[:test_end])

    rel_tri_nums = sorted_dict(rel_tri_nums)
    dump_json(rel_tris_num, rel_tri_nums)
    buckets = (train_triplets, valid_triplets, test_triplets)
    for slot, bucket in enumerate(buckets):
        write_to_file(save_files[slot], get_lines_format(bucket))
Пример #4
0
 def get_rel_barrel_graph(self, graph, barrel, **kwargs):
     """Build the filtered relation dual graph and persist it.

     The dual seed graph from ``GetSeedGraph.get_dual_seed(graph, barrel)``
     is converted with ``get_rel_dict``; the resulting graph is written to
     ``self.rel_graph`` and the transposed relation dictionary is dumped to
     ``self.rel_dual_rel2dict``. Intermediate results are kept on ``self``.

     Returns:
         The set of keys of ``self.filtered_rel2id``.
     """
     dual_seed = GetSeedGraph.get_dual_seed(graph, barrel)
     self.filtered_rel_dual_graph_origin = dual_seed

     rel2id, dual_graph = get_rel_dict(dual_seed)
     self.filtered_rel2id = rel2id
     self.filtered_rel_dual_graph = dual_graph

     id2rel = transpose_dict(rel2id)
     self.filtered_rel_dict2rel = id2rel

     save_tuples_to_txt(dual_graph, self.rel_graph)
     dump_json(self.rel_dual_rel2dict, id2rel)
     return set(rel2id.keys())
def get_post_community_with_attr_and_entity(community_json, info_save_json):
    """Summarise each community by its size and its attribute members.

    Members whose string starts with ``"_<"`` are treated as attributes; they
    are recorded with their first character removed.

    Args:
        community_json: Source handed to ``load_json``.
        info_save_json: Target handed to ``dump_json`` for the summary.

    Returns:
        A ``(communities, info_dic)`` tuple: the loaded communities and the
        per-community summary.
    """
    communities = load_json(community_json)
    info_dic = {}
    for comm_id, members in communities.items():
        attrs = [member[1:] for member in members if member.startswith("_<")]
        info_dic[comm_id] = {
            "ent_and_attr_number": len(members),
            "attr_number": len(attrs),
            "attr": attrs,
        }
    dump_json(info_save_json, info_dic)
    return communities, info_dic
Пример #6
0
 def save_related_attr_infos(self,
                             community_graphs,
                             community_related_infos,
                             community_significant_infos,
                             community_graphs_json=None,
                             left_related_infos_json=None,
                             left_significants_json=None,
                             no_attr_comm_json=None,
                             single_ent_comm_json=None):
     """Dump the community results and the filtered communities to JSON.

     The three payload arguments are written to their matching ``*_json``
     targets, followed by ``self.no_rel_comm`` and ``self.filtered_comm``.
     Every target is passed to ``dump_json`` as given, including ``None``.
     """
     argument_outputs = (
         (community_graphs_json, community_graphs),
         (left_related_infos_json, community_related_infos),
         (left_significants_json, community_significant_infos),
     )
     for target, payload in argument_outputs:
         dump_json(target, payload)
     # Instance state is read only after the argument payloads are written.
     dump_json(no_attr_comm_json, self.no_rel_comm)
     dump_json(single_ent_comm_json, self.filtered_comm)
Пример #7
0
def get_type2ents(file_name,
                  json_file,
                  ent_type_maps=None,
                  type_key='<类型>',
                  encoding_format='utf-8'):
    """Invert an entity -> types mapping into a type -> entities mapping.

    Args:
        file_name: Triple file used to build ``ent_type_maps`` when it is not
            supplied.
        json_file: Target handed to ``dump_json`` for the inverted mapping.
        ent_type_maps: Mapping of entity to an iterable of its types. When
            ``None`` it is built with ``get_type_dic``.
        type_key: Relation name marking a type triple; forwarded to
            ``get_type_dic``.
        encoding_format: Forwarded to ``get_type_dic``.

    Returns:
        The inverted mapping filled through ``appendDic``, matching
        ``get_type_dic``, which also returns the mapping it dumps.
    """
    if ent_type_maps is None:
        ent_type_maps = get_type_dic(file_name, None, type_key,
                                     encoding_format)
    type2ents_map = {}
    for ent, types in ent_type_maps.items():
        for typ in types:
            appendDic(type2ents_map, typ, ent)
    # The original logged the undefined name ``type2ent2_map`` here.
    logger.info("number of types: %d", len(type2ents_map))
    dump_json(json_file, type2ents_map)
    return type2ents_map
Пример #8
0
    def load_graph(self, graph_triples=None, head_triples=None, tail_triples=None, rel_triples=None, **kwargs):
        """Load the working graph and derive its indexes, counts and files.

        With no ``graph_triples`` a ``BaseGetDualGraph`` is created from
        ``self.config`` and ``kwargs`` and its graph, head/tail/relation
        indexes and (when available) attribute graph are copied onto
        ``self``. Otherwise the supplied triples become ``self.graph``; if
        any of the three indexes is missing, all three are rebuilt from the
        triples.

        Afterwards, when ``self.is_dual_graph`` is set, the dual graph is
        computed and saved; entity and relation counts are stored; and the
        graph is written to ``self.graph_txt`` unless it is the origin graph,
        in which case ``self.graph_txt`` is pointed at ``self.triple_txt``.

        Args:
            graph_triples: Iterable of ``(head, rel, tail)`` triples, or
                ``None`` to load through ``BaseGetDualGraph``.
            head_triples: Optional index keyed by head.
            tail_triples: Optional index keyed by tail.
            rel_triples: Optional index keyed by relation.
            **kwargs: Forwarded to ``BaseGetDualGraph`` when it is created.
        """
        if graph_triples is None:
            self.graphcls = BaseGetDualGraph(Config=self.config, **kwargs)
            self.graph = self.graphcls.graph
            self.graph_head_triples = self.graphcls.graph_head_triples
            self.graph_tail_triples = self.graphcls.graph_tail_triples
            self.graph_rel_triples = self.graphcls.graph_rel_triples
            is_origin_graph = self.is_origin_graph
            try:
                self.attr_graph = self.graphcls.attr_graph
                self.attr_head_triples, self.attr_tail_triples, self.attr_rel_triples \
                    = self.graphcls.attr_head_triples, self.graphcls.attr_tail_triples, self.graphcls.attr_rel_triples
            except Exception:
                # Best effort: the attribute graph is optional. Narrowed from
                # a bare except so KeyboardInterrupt/SystemExit propagate.
                pass
        else:
            if head_triples is None or tail_triples is None or rel_triples is None:
                # An incomplete set of indexes is discarded and rebuilt.
                head_triples, tail_triples, rel_triples = {}, {}, {}
                for triple in graph_triples:
                    head, rel, tail = triple
                    incDicWithAdd(head_triples, head, triple)
                    incDicWithAdd(tail_triples, tail, triple)
                    incDicWithAdd(rel_triples, rel, triple)
            self.graph = graph_triples
            # The head index belongs on graph_head_triples; the original
            # stored it on attr_head_triples when all indexes were supplied.
            self.graph_head_triples = head_triples
            self.graph_tail_triples = tail_triples
            self.graph_rel_triples = rel_triples
            is_origin_graph = False

        if self.is_dual_graph:
            # NOTE(review): self.graphcls is only assigned above when
            # graph_triples is None; with supplied triples this relies on it
            # having been set elsewhere -- confirm.
            self.dual_graphs, self.dict2rel = self.graphcls.get_dual_graph_from_tuples(self.graph, isTrans=None)
            self.dual_graph_origin, self.dual_graph = self.dual_graphs
            save_tuples_to_txt(self.dual_graph, self.dual_graph_txt)
            dump_json(self.origin_dual_rel2dict, self.dict2rel)
        entity, relations = get_tuples_dict(self.graph)
        self.total_ent_num = len(entity)
        self.total_rel_num = len(relations)
        if is_origin_graph:
            self.graph_txt = self.triple_txt
        else:
            save_tuples_to_txt(self.graph, self.graph_txt)
Пример #9
0
def get_type_dic(file_name,
                 json_file,
                 type_key='<类型>',
                 encoding_format='utf-8',
                 parse_line=parse_line):
    """Build an entity -> types mapping from the type triples of a file.

    Each line of ``file_name`` is stripped and passed to ``parse_line``, which
    is expected to return an ``(ent1, rel, ent2)`` triple. Lines that fail to
    parse or unpack are printed and skipped. Triples whose relation equals
    ``type_key`` add ``ent2`` under ``ent1`` through ``appendDic``.

    Args:
        file_name: Path handed to ``read_from_file``.
        json_file: Target handed to ``dump_json`` for the mapping.
        type_key: Relation name that marks a type triple.
        encoding_format: Accepted for interface compatibility; this function
            does not use it.
        parse_line: Callable turning one stripped line into a triple.

    Returns:
        The mapping filled through ``appendDic``.
    """
    lines = read_from_file(file_name)
    ent_types_maps = {}
    for raw_line in tqdm(lines):
        line = raw_line.strip()
        try:
            ent1, rel, ent2 = parse_line(line)
        except Exception:
            # Malformed line: report and skip. KeyboardInterrupt/SystemExit
            # are deliberately no longer swallowed here.
            print(line)
            continue
        if rel == type_key:
            appendDic(ent_types_maps, ent1, ent2)
    dump_json(json_file, ent_types_maps)
    return ent_types_maps
Пример #10
0
 def reverse_graph_tree(
         cls,
         filename: str,
         labelfile: str,
         outfile: Union[str, None],
         node2node_renumber: Union[str, None] = None) -> dict:
     """Turn a graph-tree file into a labelled community dictionary.

     The tree lines are split with ``cls.split_graph_tree`` and clustered
     with ``cls.cluster_community``. If ``node2node_renumber`` is given, the
     partition is first mapped through that JSON file with
     ``cls.reverse_node2label``. It is then mapped through the labels loaded
     from ``labelfile``, converted with ``cls.revert_list_to_dict`` and
     dumped to ``outfile``.

     Args:
         filename: Path handed to ``read_from_file``.
         labelfile: JSON source of the id -> label mapping.
         outfile: Target handed to ``dump_json``.
         node2node_renumber: Optional JSON source of a node renumbering.

     Returns:
         The community dictionary that was dumped.
     """
     print("filename: {}".format(filename))
     tree_lines = read_from_file(filename)
     print("lines: {}".format(tree_lines))
     label_by_id = load_json(labelfile)

     partition = cls.cluster_community(cls.split_graph_tree(tree_lines))
     if node2node_renumber:
         renumbering = load_json(node2node_renumber)
         partition = cls.reverse_node2label(renumbering, partition)

     labelled = cls.reverse_node2label(label_by_id, partition)
     communs_dict = cls.revert_list_to_dict(labelled)
     dump_json(outfile, communs_dict)
     return communs_dict
def convert_triple_to_needs(filename,
                            outfile,
                            direct=True,
                            save_node2idx=None,
                            weight_property=None):
    """Convert a triple file into weighted ``"src dst weight"`` edge lines.

    Each stripped line is parsed with ``parse_line``; lines for which it
    returns ``None`` are skipped. The edge weight is ``float(rel)`` when the
    relation parses as a number; otherwise it is ``weight_property`` if that
    is set and the relation starts with a double quote, else ``None``.
    Weights are accumulated per edge key through ``incDicWithWeightAdd``.

    Args:
        filename: Path handed to ``read_from_file``.
        outfile: Target handed to ``write_to_file`` for the edge lines.
        direct: When false, every edge is also added in reverse direction.
        save_node2idx: When not ``None``, nodes are renumbered through
            ``node2idx``; otherwise node names must be convertible with
            ``int``. The index -> node mapping is passed to ``dump_json``
            with this target in either case.
        weight_property: Fallback weight for quoted, non-numeric relations.
    """
    lines = read_from_file(filename)
    node_idx = {}
    node_weights = {}
    for raw_line in tqdm(lines):
        triple = parse_line(raw_line.strip())
        if triple is None:
            continue
        ent1, rel, ent2 = triple
        if save_node2idx is not None:
            ent1idx = node2idx(node_idx, ent1)
            ent2idx = node2idx(node_idx, ent2)
        else:
            ent1idx = int(ent1)
            ent2idx = int(ent2)
        try:
            val = float(rel)
        except (TypeError, ValueError):
            # Only a failed conversion is expected here; anything else
            # (including KeyboardInterrupt) now propagates.
            val = None
            if weight_property is not None and rel.startswith("\""):
                val = weight_property
        key = " ".join([str(ent1idx), str(ent2idx)])
        incDicWithWeightAdd(node_weights, key, val)
        if not direct:
            key = " ".join([str(ent2idx), str(ent1idx)])
            incDicWithWeightAdd(node_weights, key, val)
    all_out_lines = [
        key + " " + str(val) + '\n'
        for key, val in tqdm(node_weights.items())
    ]
    # Invert node -> index so the saved JSON maps index -> node.
    idx_node = {idx: node for node, idx in tqdm(node_idx.items())}
    dump_json(save_node2idx, idx_node)
    write_to_file(outfile, all_out_lines)
Пример #12
0
def filter_less_use_community(info_dic, left_comm_json, filtered_comm_json,
                              no_rel_comm_json):
    """Sort community summaries into kept, filtered and attribute-less groups.

    A community with ``attr_number == 0`` goes to the attribute-less group.
    One with at most a single non-attribute member
    (``ent_and_attr_number - attr_number <= 1``) goes to the filtered group.
    All others are kept. Within each group, communities are re-keyed with
    consecutive integers starting at 0.

    Args:
        info_dic: Mapping of community key to its summary dictionary.
        left_comm_json: ``dump_json`` target for the kept communities.
        filtered_comm_json: ``dump_json`` target for the filtered ones.
        no_rel_comm_json: ``dump_json`` target for the attribute-less ones.

    Returns:
        A ``(left_comm, filtered_comm, no_rel_comm)`` tuple.
    """
    left_comm, filtered_comm, no_rel_comm = {}, {}, {}
    for info in info_dic.values():
        if info["attr_number"] == 0:
            bucket = no_rel_comm
        elif info["ent_and_attr_number"] - info["attr_number"] <= 1:
            bucket = filtered_comm
        else:
            bucket = left_comm
        # The next free integer key is the bucket's current size.
        bucket[len(bucket)] = info
    dump_json(left_comm_json, left_comm)
    dump_json(filtered_comm_json, filtered_comm)
    dump_json(no_rel_comm_json, no_rel_comm)
    return left_comm, filtered_comm, no_rel_comm
Пример #13
0
 def __call__(self, **kwargs):
     """Run community detection and dump the result to ``self.outfile``.

     Loads the graph on ``self.modularity``, obtains the last community with
     ``self.modularity_setting`` plus ``kwargs``, converts it with
     ``ProcessModularity.revert_list_to_dict`` and writes it with
     ``dump_json``.
     """
     detector = self.modularity
     detector.load_graph()
     partition = detector.get_last_community(
         **self.modularity_setting, **kwargs)
     partition_dict = ProcessModularity.revert_list_to_dict(partition)
     dump_json(self.outfile, partition_dict)