예제 #1
0
 def to_graph(self) -> Graph:
     """Copy this object's nodes and links into a new, fully indexed ``Graph``.

     The new graph is created with node-type, node-label and link-label
     indexing switched on, is pre-sized from ``get_n_nodes()`` /
     ``get_n_links()``, and carries ``self.name``.

     Links are added with the source/target ids reported by the original
     links, unchanged.
     NOTE(review): this presumes the new graph hands out node ids that line
     up with the originals -- confirm against ``Graph.add_new_node``.
     """
     graph_options = dict(
         index_node_type=True,
         index_node_label=True,
         index_link_label=True,
         estimated_n_nodes=self.get_n_nodes(),
         estimated_n_links=self.get_n_links(),
         name=self.name,
     )
     copy = Graph(**graph_options)

     # Nodes go in first so that every link endpoint already exists.
     for node in self.iter_nodes():
         copy.add_new_node(node.type, node.label)

     for link in self.iter_links():
         copy.add_new_link(link.type, link.label, link.source_id,
                           link.target_id)

     return copy
예제 #2
0
    def from_json(obj: dict, ont: Ontology) -> 'SSD':
        """Build an ``SSD`` from its JSON (dict) representation.

        Keys read from ``obj``: ``'mappings'`` (items with ``'node'`` and
        ``'attribute'``), ``'attributes'`` (items with ``'id'``, ``'name'``
        and ``'columnIds'``), ``'semanticModel'`` (with ``'nodes'`` and
        ``'links'``) and ``'name'``.

        Args:
            obj: the decoded JSON object.
            ont: ontology used to shorten ``prefix + label`` URIs through
                ``simplify_uri``.

        Returns:
            An ``SSD`` holding one ``SSDAttribute`` per ``'DataNode'`` and a
            graph whose node/link labels are the encoded (bytes) labels.

        Raises:
            AssertionError: if an attribute does not have exactly one column
                id equal to its own id.
        """
        graph = Graph(True, True, True)
        attr_id_of_node = {m['node']: m['attribute'] for m in obj['mappings']}

        # Index the raw attributes by id, checking the one-column invariant.
        attr_by_id = {}
        for raw in obj['attributes']:
            column_ids = raw['columnIds']
            assert len(column_ids) == 1 and column_ids[0] == raw['id']
            attr_by_id[raw['id']] = raw

        ssd_attrs = []
        graph_id_of = {}  # JSON node id -> id assigned by the new graph
        for node in obj['semanticModel']['nodes']:
            if node['type'] != 'DataNode':
                kind = GraphNodeType.CLASS_NODE
                label = ont.simplify_uri(node['prefix'] + node['label'])
            else:
                kind = GraphNodeType.DATA_NODE
                # A data node is labelled with the name of its mapped attribute.
                label = attr_by_id[attr_id_of_node[node['id']]]['name']
                ssd_attrs.append(SSDAttribute(node['id'], label))

            new_node = graph.add_new_node(kind, label.encode())
            graph_id_of[node['id']] = new_node.id

        for link in obj['semanticModel']['links']:
            link_label = ont.simplify_uri(link['prefix'] + link['label'])
            graph.add_new_link(GraphLinkType.UNSPECIFIED, link_label.encode(),
                               graph_id_of[link['source']],
                               graph_id_of[link['target']])

        return SSD(obj['name'], ssd_attrs, graph, ont)
예제 #3
0
def make_ssd(sm: SemanticModel, keys: Set[str], ont: Ontology) -> SSD:
    """Convert a ``SemanticModel`` into an ``SSD``.

    Args:
        sm: source semantic model; its ``attrs`` and ``graph`` are read.
        keys: the set of labels that every attribute label must belong to.
        ont: ontology handed to the resulting ``SSD`` unchanged.

    Returns:
        An ``SSD`` with id ``sm.id``, one ``SSDAttribute`` per attribute of
        ``sm`` and a new graph copied from ``sm.graph``.

    Raises:
        AssertionError: if an attribute label is not in ``keys``.
    """
    ssd_attrs = {}
    for sm_attr in sm.attrs:
        # The label is used verbatim; Schema.PATH_DELIMITER is deliberately
        # not rewritten to "." here.
        name = sm_attr.label
        ssd_attrs[sm_attr.id] = SSDAttribute(sm_attr.id, name)
        assert name in keys

    graph = Graph()
    for node in sm.graph.iter_nodes():
        # Data nodes take the (encoded) attribute name; other nodes keep
        # their own label.
        node_label = (ssd_attrs[node.id].name.encode()
                      if node.is_data_node() else node.label)
        graph.add_new_node(node.type, node_label)

    for link in sm.graph.iter_links():
        graph.add_new_link(link.type, link.label, link.source_id,
                           link.target_id)

    return SSD(sm.id, list(ssd_attrs.values()), graph, ont)
예제 #4
0
 def mask_dnode(self, g: Graph) -> Graph:
     """deprecated

     Return a copy of ``g`` in which every node that is not a class node is
     relabelled ``b"DataNode"``; class nodes and all links keep their labels.

     Args:
         g: the graph to copy; it is only read.

     Returns:
         A new fully indexed ``Graph`` whose node and link ids match ``g``'s.

     Raises:
         AssertionError: if the copy assigns a node or link an id different
             from the original's.
     """
     g2 = Graph(True, True, True, g.get_n_nodes(), g.get_n_links())
     for n in g.iter_nodes():
         label = n.label if n.type == GraphNodeType.CLASS_NODE else b"DataNode"
         # Do the insertion outside the assert: assert statements are removed
         # under ``python -O``, which would otherwise skip the insertion and
         # return an empty graph.
         new_node = g2.add_new_node(n.type, label)
         assert new_node.id == n.id
     for e in g.iter_links():
         # Same reasoning as above: mutate first, then sanity-check the id.
         new_link = g2.add_new_link(e.type, e.label, e.source_id,
                                    e.target_id)
         assert new_link.id == e.id
     return g2
예제 #5
0
    def clear_serene_footprint(self, remove_unknown: bool = True) -> 'SSD':
        """Strip the Serene helper nodes/links from ``self.graph`` in place.

        The node labelled ``b"serene:All"`` is always removed. When
        ``remove_unknown`` is true, the node labelled ``b"serene:Unknown"``
        and every node it links to are removed as well. Links labelled
        ``b"serene:connect"`` are dropped, and so are ``b"serene:unknown"``
        links when ``remove_unknown`` is true.

        Args:
            remove_unknown: also remove the "unknown" node, its targets and
                its links.

        Returns:
            ``self``. ``self.graph`` is replaced by the pruned graph, or left
            as is when there is nothing to remove.

        Raises:
            AssertionError: if a target of the "unknown" node is not a data
                node.

        NOTE(review): a kept link that touches a removed node would fail the
        id lookup below with ``KeyError`` -- confirm that only the skipped
        labels can connect to removed nodes.
        """
        pruned = Graph(True, True, True)

        # Locate the two special nodes (the last match wins if duplicated).
        all_node = None
        unknown_node = None
        for node in self.graph.iter_nodes():
            if node.label == b"serene:All":
                all_node = node
            elif node.label == b"serene:Unknown":
                unknown_node = node

        dropped_ids = set()
        if all_node is not None:
            dropped_ids.add(all_node.id)

        if remove_unknown and unknown_node is not None:
            dropped_ids.add(unknown_node.id)
            for link in self.graph.iter_links():
                if link.source_id != unknown_node.id:
                    continue
                assert link.get_target_node().is_data_node()
                dropped_ids.add(link.target_id)

        if not dropped_ids:
            # no serene footprint to remove
            return self

        # Copy the surviving nodes, remembering the id each one receives.
        new_id_of = {}
        for node in self.graph.iter_nodes():
            if node.id not in dropped_ids:
                new_id_of[node.id] = pruned.add_new_node(node.type,
                                                         node.label).id

        for link in self.graph.iter_links():
            link_label = link.label
            is_connect = link_label == b"serene:connect"
            if is_connect or (remove_unknown
                              and link_label == b"serene:unknown"):
                continue
            pruned.add_new_link(link.type, link_label,
                                new_id_of[link.source_id],
                                new_id_of[link.target_id])

        self.graph = pruned
        return self
예제 #6
0
    def apply_cmds(self, tbl: DataTable) -> SemanticModel:
        """Replay ``self.commands`` on ``tbl`` and build its semantic model.

        Commands are processed in order. Transformation commands mutate
        ``tbl`` in place (its rows and/or its schema); modelling commands
        (``SetSemanticTypeCmd``, ``SetInternalLinkCmd``) add nodes and links
        to the graph of the returned model.

        Args:
            tbl: the table to transform; modified in place.

        Returns:
            ``SemanticModel(tbl.id, attrs, g)`` where ``attrs`` holds one
            ``Attribute`` per ``SetSemanticTypeCmd`` and ``g`` is the graph
            built from the modelling commands.

        Raises:
            NotImplementedError: for a command of an unsupported class.
            AssertionError: if an attribute path gets a semantic type twice,
                if the target of an internal link already has an incoming
                link, or if ``UnpackOneElementListCmd`` targets an attribute
                whose schema type is not ``Schema.LIST_VALUE``.
        """
        g = Graph(index_node_type=True,
                  index_node_label=True,
                  index_link_label=True,
                  name=tbl.id.encode("utf-8"))
        attrs: List[Attribute] = []
        # Maps both attribute paths (data nodes) and command node ids (class
        # nodes) to the ids assigned by `g`.
        id_map: Dict[str, int] = {}

        for cmd in self.commands:
            if isinstance(cmd, PyTransformNewColumnCmd):
                # The new column is placed next to the first input attribute:
                # same parent path, last segment replaced by the new name.
                # TODO: fix me! currently the new attr_path is generated from first input_attr_path
                # we should be explicitly about the output, since the first input attr path can be different
                # may be it should be the deepest attr path
                new_attr_path = Schema.PATH_DELIMITER.join(
                    cmd.input_attr_paths[0].split(Schema.PATH_DELIMITER)[:-1] +
                    [cmd.new_attr_name])
                # assert not tbl.schema.has_attr_path(new_attr_path)
                # TODO: fix me!! not handle list of input attr path properly (cmd.input_attr_paths[0])
                # The new path takes the type of the first input path; the
                # last input path is passed as the third argument
                # (NOTE(review): its meaning is defined by
                # add_new_attr_path, not visible here).
                tbl.schema.add_new_attr_path(
                    new_attr_path,
                    tbl.schema.get_attr_type(cmd.input_attr_paths[0]),
                    cmd.input_attr_paths[-1])
                self.pytransform(tbl, cmd)
            elif isinstance(cmd, SetSemanticTypeCmd):
                # One data node per attribute path, labelled with the path.
                lbl = cmd.input_attr_path.encode("utf-8")
                assert cmd.input_attr_path not in id_map
                id_map[cmd.input_attr_path] = g.add_new_node(
                    GraphNodeType.DATA_NODE, lbl).id
                # The class node is created on its first mention only.
                if cmd.node_id not in id_map:
                    id_map[cmd.node_id] = g.add_new_node(
                        GraphNodeType.CLASS_NODE,
                        cmd.domain.encode("utf-8")).id

                attrs.append(
                    Attribute(id_map[cmd.input_attr_path], cmd.input_attr_path,
                              []))
                # Link goes from the class node to the data node.
                g.add_new_link(GraphLinkType.UNSPECIFIED,
                               cmd.type.encode("utf-8"), id_map[cmd.node_id],
                               id_map[cmd.input_attr_path])
            elif isinstance(cmd, SetInternalLinkCmd):
                # Create either endpoint if no earlier command introduced it.
                if cmd.source_id not in id_map:
                    id_map[cmd.source_id] = g.add_new_node(
                        GraphNodeType.CLASS_NODE,
                        cmd.source_uri.encode('utf-8')).id
                if cmd.target_id not in id_map:
                    id_map[cmd.target_id] = g.add_new_node(
                        GraphNodeType.CLASS_NODE,
                        cmd.target_uri.encode('utf-8')).id

                # The target must not have an incoming link yet.
                assert g.get_node_by_id(
                    id_map[cmd.target_id]).n_incoming_links == 0
                g.add_new_link(GraphLinkType.UNSPECIFIED,
                               cmd.link_lbl.encode("utf-8"),
                               id_map[cmd.source_id], id_map[cmd.target_id])
            elif isinstance(cmd, ZipAttributesCmd):
                for row in tbl.rows:
                    cmd.zip_attributes(row)
                # TODO: fix me!! re-build schema, which is very expensive
                tbl.rebuild_schema()
            elif isinstance(cmd, UnpackOneElementListCmd):
                assert tbl.schema.get_attr_type(
                    cmd.input_attr) == Schema.LIST_VALUE
                for row in tbl.rows:
                    cmd.unpack(row)
                # After unpacking, the attribute is recorded as single-valued.
                tbl.schema.update_attr_path(cmd.input_attr,
                                            Schema.SINGLE_VALUE)
            elif isinstance(cmd, AddLiteralColumnCmd):
                # Register the new path in the schema, then fill every row.
                tbl.schema.add_new_attr_path(cmd.input_attr_path,
                                             tbl.schema.SINGLE_VALUE)
                for row in tbl.rows:
                    cmd.add_literal(row)
            elif isinstance(cmd, JoinListCmd):
                for row in tbl.rows:
                    cmd.execute(row)
                # After the join, the attribute is recorded as single-valued.
                tbl.schema.update_attr_path(cmd.input_attr_path,
                                            Schema.SINGLE_VALUE)
            else:
                raise NotImplementedError(cmd.__class__.__name__)

        return SemanticModel(tbl.id, attrs, g)