def construct_aggregation_tree(self, flat_paths):
    """
    Merge flat aggregation paths into a shared tree of ``AggregatedNode`` objects.

    Every path is walked starting at ``self.mapping["root"]``. A node is
    identified by the pair (node name, edge table leading to it), so paths that
    share a prefix reuse the same node objects. The reducers of a path are
    attached to the node reached by the last step of that path.

    :param flat_paths: iterable of objects exposing ``path`` (a sequence of
        steps handed to ``get_edge_table``) and ``reducers`` (an iterable of
        reducer definitions having an ``"fn"`` entry)
    :return: tuple ``(nodes, leaves)`` where ``nodes`` lists every node in
        creation order and ``leaves`` is ``Parser.get_leaves(nodes)``
    """
    node_index = {}  # (node name, edge table) -> position in ``nodes``
    nodes = []
    for flat_path in flat_paths:
        label = self.mapping["root"]
        edge_up = None
        steps = flat_path.path
        # ``depth`` doubles as the level of the node the step starts from.
        for depth, step in enumerate(steps):
            parent_key = (label, edge_up)
            if parent_key not in node_index:
                nodes.append(
                    AggregatedNode(
                        label,
                        get_node_table_name(self.model, label),
                        edge_up,
                        depth,
                    )
                )
                node_index[parent_key] = len(nodes) - 1
            parent_node = nodes[node_index[parent_key]]

            child_label, child_edge = get_edge_table(self.model, label, step)
            child_key = (child_label, child_edge)
            child_is_new = child_key not in node_index
            if child_is_new:
                child_node = AggregatedNode(
                    child_label,
                    get_node_table_name(self.model, child_label),
                    child_edge,
                    depth + 1,
                )
            else:
                child_node = nodes[node_index[child_key]]
            child_node.parent = parent_node

            # Only the final step of a path carries the path's reducers.
            if depth == len(steps) - 1:
                for reducer in flat_path.reducers:
                    prop = self.create_prop_from_json(
                        self.doc_type, reducer, None)
                    child_node.reducers.append(Reducer(prop, reducer["fn"]))

            parent_node.add_child(child_node)
            if child_is_new:
                nodes.append(child_node)
                node_index[child_key] = len(nodes) - 1

            label, edge_up = child_label, child_edge
    return nodes, Parser.get_leaves(nodes)
def get_table_list_from_path(self, p, root, path):
    """
    List the tables visited when following ``path`` starting at ``root``.

    :param p: object whose ``model`` attribute is handed to the lookup helpers
    :param root: name of the node the walk starts from
    :param path: dot-separated path segments; a falsy value (``None`` or an
        empty string) means no segment is followed
    :return: list holding the root's node table followed, for every segment,
        by the edge table and then the node table of the node reached
    """
    steps = path.split(".") if path else []
    tables = [get_node_table_name(p.model, root)]
    for step in steps:
        # Each hop moves ``root`` to the node at the other end of the edge.
        root, edge_tbl = get_edge_table(p.model, root, step)
        tables.append(edge_tbl)
        tables.append(get_node_table_name(p.model, root))
    return tables
def translate_parent(self, root_df):
    """
    Join the properties collected along every parent path onto the root records.

    For each entry of ``self.parser.parent_nodes`` the chain of nodes starting
    at ``head`` is followed through ``child``: the running dataset is joined
    with the node's upward edge, re-keyed by the value coming from the edge,
    and then joined with the node's own table so that the node's props are
    merged in. The result is re-keyed with ``make_key_from_property`` on the
    root key property and left-outer-joined onto ``root_df``.

    :param root_df: keyed dataset of root records (presumably a PySpark pair
        RDD -- confirm against the caller)
    :return: ``root_df`` itself when the parser has no parent nodes, otherwise
        the dataset enriched with the parent properties
    """
    if len(self.parser.parent_nodes) == 0:
        return root_df
    root_tbl = get_node_table_name(self.parser.model, self.parser.root)
    root_id = self.parser.get_key_prop().id
    for parent_path in self.parser.parent_nodes:
        df = self.translate_table(root_tbl, props=[])
        node = parent_path.head
        is_first_hop = True
        while node is not None:
            df = df.join(self.translate_edge(node.edge_up_tbl, reversed=False))
            if is_first_hop:
                # Assuming join output of the form (key, (left, right)):
                # re-key by the edge value and keep the original key in the
                # record under ``root_id`` so it can be restored afterwards.
                df = df.map(
                    lambda x: (x[1][1], ({root_id: x[0]},) + (x[1][0],))
                ).mapValues(lambda x: merge_dictionary(x[0], x[1]))
                is_first_hop = False
            else:
                df = df.map(lambda x: (x[1][1], x[1][0]))
            cur_props = node.props
            node_df = self.translate_table(node.tbl, props=cur_props)
            # ``cur_props`` is rebound on every iteration; binding it as a
            # default argument freezes the value for this transformation even
            # if the callable is only evaluated later.
            df = node_df.join(df).mapValues(
                lambda x, props=cur_props: merge_and_fill_empty_props(x, props)
            )
            node = node.child
        df = df.map(lambda x: make_key_from_property(x[1], root_id))
        root_df = root_df.leftOuterJoin(df).mapValues(
            lambda x: merge_dictionary(x[0], x[1]))
    return root_df
def json_to_parent_node(self, path):
    """
    Build a linked chain of ``ParentNode`` objects from a parent path string.

    ``path`` is split on ``"."``. Every segment consists of a name handed to
    ``get_edge_table`` optionally followed by a bracketed, comma-separated
    property list, e.g. ``edge[prop_a,prop_b]``. Segments are resolved one
    after another starting from ``self.root``; each resulting node becomes the
    ``child`` of the previous one.

    :param path: dot-separated parent path expression
    :return: the first ``ParentNode`` of the chain
    :raises IndexError: if a segment is empty
    """
    # Raw string: "\[" and "\]" are not valid escapes in a normal literal.
    segments = [
        tuple(part for part in re.split(r"[\[\]]", word) if part)
        for word in path.split(".")
    ]
    first = None
    prev = None
    prev_label = self.root
    for segment in segments:
        edge_name = segment[0]
        prop_list = segment[1] if len(segment) > 1 else None
        parent_name, edge_tbl = get_edge_table(self.model, prev_label,
                                               edge_name)
        parent_tbl = get_node_table_name(self.model, parent_name)
        if prop_list is not None:
            # ``get_src_name`` yields indexable pairs: [0] name, [1] source.
            json_props = [{
                "name": name_src[0],
                "src": name_src[1]
            } for name_src in self.get_src_name(prop_list.split(","))]
            props = self.create_props_from_json(self.doc_type,
                                                json_props,
                                                node_label=parent_name)
        else:
            props = []
        cur = ParentNode(parent_name, parent_tbl, edge_tbl, props)
        if prev is not None:
            prev.child = cur
        else:
            first = cur
        prev_label = parent_name
        prev = cur
    return first
def json_to_parent_node(self, path):
    """
    Turn a parent path string into a linked chain of ``ParentNode`` objects.

    The path is split on ``'.'``; every segment is a name resolved through
    ``get_edge_table``, optionally followed by a bracketed, comma-separated
    list of properties (``edge[prop_a,prop_b]``). Resolution starts at
    ``self.root`` and each created node is linked as ``child`` of the
    previous one.

    :param path: dot-separated parent path expression
    :return: the head ``ParentNode`` of the chain
    :raises IndexError: if a segment is empty
    """
    # Raw string avoids the invalid "\[" / "\]" escape sequences.
    bracket_splitter = r'[\[\]]'
    nodes = [
        tuple(filter(None, re.split(bracket_splitter, word)))
        for word in path.split('.')
    ]
    first = None
    prev = None
    prev_label = self.root
    for nd in nodes:
        edge_name = nd[0]
        prop_list = nd[1] if len(nd) > 1 else None
        parent_name, edge_tbl = get_edge_table(self.model, prev_label,
                                               edge_name)
        parent_tbl = get_node_table_name(self.model, parent_name)
        if prop_list is None:
            props = []
        else:
            # ``get_src_name`` yields indexable pairs: [0] name, [1] source.
            json_props = [{
                'name': pair[0],
                'src': pair[1]
            } for pair in self.get_src_name(prop_list.split(','))]
            props = self.create_props_from_json(self.doc_type,
                                                json_props,
                                                node_label=parent_name)
        cur = ParentNode(parent_name, parent_tbl, edge_tbl, props)
        if prev is None:
            first = cur
        else:
            prev.child = cur
        prev_label = parent_name
        prev = cur
    return first
def json_to_special_node(self, path):
    """
    Create the chain of nodes on the path of a special aggregation.

    ``path`` is split on ``"."`` and every segment must have the form
    ``name[prop_a,prop_b]``: the name is resolved through ``get_edge_table``
    starting from ``self.root`` and the bracketed, comma-separated properties
    are used as both name and source of the node's props. Each node becomes
    the ``child`` of the previous one.

    :param path: path defining the nodes and the props to be aggregated
    :return: the first ``SpecialNode`` of the chain
    :raises ValueError: if a segment does not split into exactly a name and a
        property list
    """
    # Raw string: "\[" and "\]" are not valid escapes in a normal literal.
    nodes = [
        tuple(part for part in re.split(r"[\[\]]", word) if part)
        for word in path.split(".")
    ]
    first = None
    prev = None
    prev_label = self.root
    for (n, str_p) in nodes:
        child_name, edge_tbl = get_edge_table(self.model, prev_label, n)
        child_tbl = get_node_table_name(self.model, child_name)
        json_props = [{"name": p, "src": p} for p in str_p.split(",")]
        props = self.create_props_from_json(self.doc_type,
                                            json_props,
                                            node_label=child_name)
        cur = SpecialNode(child_name, child_tbl, edge_tbl, props)
        if prev is not None:
            prev.child = cur
        else:
            first = cur
        prev_label = child_name
        prev = cur
    return first
def translate(self):
    """
    Produce the dataset for the parser's root node.

    The root table is translated with the parser's props and then passed, in
    order, through ``translate_special``, ``translate_parent`` and
    ``get_direct_children``. When the parser has aggregated nodes, the result
    is additionally joined with ``aggregate_nested_properties()`` and each
    joined pair is combined with ``merge_dictionary``.

    :return: the resulting dataset
    """
    root_table = get_node_table_name(self.parser.model, self.parser.root)
    result = self.translate_table(root_table, props=self.parser.props)
    result = self.translate_special(result)
    result = self.translate_parent(result)
    result = self.get_direct_children(result)
    if len(self.parser.aggregated_nodes) > 0:
        aggregated = self.aggregate_nested_properties()
        return result.join(aggregated).mapValues(
            lambda pair: merge_dictionary(pair[0], pair[1]))
    return result
def get_props_for_nodes(self):
    """
    Create the nodes that carry the props declared under ``injecting_props``.

    Every entry of ``self.mapping["injecting_props"]`` becomes a
    ``CollectingNode``, except ``"program"`` which becomes a ``RootNode``.
    A ``"project"`` collecting node is always present: when the mapping does
    not declare one, it is created with only the ``PROJECT_CODE`` prop.

    Note that the ``props`` lists stored in the mapping are modified in place:
    the ``PROJECT_CODE`` entry is appended to the project props when no prop
    named ``"project_code"`` is declared, and the ``PROGRAM_NAME`` entry is
    appended to the program props on every call.

    :return: tuple ``(prop_nodes, roots)`` of dictionaries keyed by node label
    """
    collecting_nodes = {}
    root_nodes = {}
    injecting_props = self.mapping.get("injecting_props", {})
    for label, config in injecting_props.items():
        if label == "project":
            declared_names = [
                prop.get("name") for prop in config.get("props")
            ]
            if "project_code" not in declared_names:
                config.get("props").append({
                    "name": PROJECT_CODE,
                    "src": "code"
                })
        if label == "program":
            node_props = config.get("props")
            node_props.append({"name": PROGRAM_NAME, "src": "name"})
            root_nodes[label] = RootNode(
                label,
                get_node_table_name(self.model, label),
                self.create_props_from_json(self.doc_type,
                                            node_props,
                                            node_label=label,
                                            is_additional=True),
            )
        else:
            collecting_nodes[label] = CollectingNode(
                label,
                get_node_table_name(self.model, label),
                props=self.create_props_from_json(self.doc_type,
                                                  config.get("props"),
                                                  node_label=label),
            )
    if "project" not in collecting_nodes:
        # Fall back to a project node carrying only the project code.
        default_project_props = [{"name": PROJECT_CODE, "src": "code"}]
        collecting_nodes["project"] = CollectingNode(
            "project",
            get_node_table_name(self.model, "project"),
            props=self.create_props_from_json(
                self.doc_type,
                default_project_props,
                node_label="project",
                is_additional=True,
            ),
        )
    return collecting_nodes, root_nodes
def create_auth_path_root(self):
    """
    Build the two-level root used for the auth path: program, then project.

    The program root is named ``'auth_path_root'`` and carries the
    ``program_name`` prop (source ``name``); the project root carries the
    ``project_code`` prop (source ``code``) together with the edge table
    returned by ``get_edge_table`` for ``'project'`` / ``'programs'``. The
    project root is attached as ``root_child`` of the program root.

    :return: the program ``RootNode``
    """
    program_tbl = get_node_table_name(self.model, 'program')
    project_tbl = get_node_table_name(self.model, 'project')
    _, project_edge_up_tbl = get_edge_table(self.model, 'project', 'programs')

    program_props = self.create_props_from_json(
        self.doc_type,
        [{'name': 'program_name', 'src': 'name'}],
        node_label='program')
    program_root = RootNode('auth_path_root', program_tbl, program_props)

    project_props = self.create_props_from_json(
        self.doc_type,
        [{'name': 'project_code', 'src': 'code'}],
        node_label='project')
    project_root = RootNode('project', project_tbl, project_props,
                            project_edge_up_tbl)

    program_root.root_child = project_root
    return program_root
def add_collecting_node(self, child, collectors, fst):
    """
    Link ``child`` to the collecting node found by following ``fst`` upwards.

    The parent collecting node is taken from ``collectors`` when already
    registered under its label, otherwise a new ``CollectingNode`` is created.
    In both cases the node is (re)stored in ``collectors``.

    :param child: node to attach; its ``name`` is used for the lookups
    :param collectors: dictionary of collecting nodes keyed by label, updated
        in place
    :param fst: path segment leading from ``child`` to its parent
    :return: the parent collecting node
    """
    raw_parent_name = get_parent_name(self.model, child.name, fst)
    parent_label = get_node_label(self.model, raw_parent_name)
    _, edge_up_tbl = get_edge_table(self.model, child.name, fst)
    parent_tbl = get_node_table_name(
        self.model, get_parent_label(self.model, child.name, fst))

    if parent_label not in collectors:
        parent_node = CollectingNode(parent_label, parent_tbl)
    else:
        parent_node = collectors[parent_label]

    parent_node.add_child(child)
    child.add_parent(parent_node.name, edge_up_tbl)
    collectors[parent_label] = parent_node
    return parent_node
def add_root_node(self, child, roots, segment):
    """
    Link ``child`` to the root node found by following ``segment`` upwards.

    The root is reused from ``roots`` when already registered under its label;
    otherwise a new ``RootNode`` is created whose props come from the
    ``injecting_props`` entry of the mapping for that label. The root is
    (re)stored in ``roots``.

    :param child: node to attach; its ``name`` is used for the lookups
    :param roots: dictionary of root nodes keyed by label, updated in place
    :param segment: path segment leading from ``child`` to the root
    """
    raw_root_name = get_parent_name(self.model, child.name, segment)
    root_label = get_node_label(self.model, raw_root_name)
    _, edge_up_tbl = get_edge_table(self.model, child.name, segment)
    root_tbl = get_node_table_name(
        self.model, get_parent_label(self.model, child.name, segment))

    if root_label in roots:
        top_node = roots[root_label]
    else:
        top_node = RootNode(
            root_label,
            root_tbl,
            self.create_props_from_json(
                self.doc_type,
                self.mapping['injecting_props'][root_label]['props'],
                node_label=root_label))

    child.add_parent(top_node.name, edge_up_tbl)
    top_node.add_child(child)
    roots[root_label] = top_node
def get_orphan_paths(self, selected_category, leaves):
    """
    Find paths for nodes of a category that are not yet among ``leaves``.

    Every node of ``self.dictionary.schema`` whose ``'category'`` equals
    ``selected_category`` is added to ``self.leaves`` as a ``LeafNode``; those
    whose name is absent from ``leaves`` are treated as orphans.

    :param selected_category: category value to select schema nodes by
    :param leaves: container of leaf names that are already covered
    :return: the result of ``get_shortest_path_from_root`` for the roots
        ``['program', 'project']`` and the orphan names, or an empty set when
        there is no orphan
    """
    category_nodes = [
        node_name
        for node_name, schema in self.dictionary.schema.items()
        if schema.get('category', None) == selected_category
    ]
    orphans = set()
    for node_name in category_nodes:
        self.leaves.add(
            LeafNode(node_name, get_node_table_name(self.model, node_name)))
        if node_name not in leaves:
            orphans.add(node_name)
    if not orphans:
        return set()
    return self.get_shortest_path_from_root(['program', 'project'], orphans)
def get_collecting_nodes(self):
    """
    Populate ``self.leaves``, ``self.collectors`` and ``self.roots``.

    Collecting paths are created from ``"program"`` for the nodes whose
    category equals the mapping's ``"category"`` (``"data_file"`` when not
    set). The source of every path is registered as a ``LeafNode``; the
    collector tree is then built by ``create_tree_from_generated_edges`` from
    those paths and the nodes returned by ``get_props_for_nodes``. Finally
    ``update_level`` is called and the collectors are sorted in place.
    """
    selected_category = self.mapping.get("category", "data_file")
    flat_paths = self.create_collecting_paths_from_root(
        "program",
        lambda label: get_node_category(self.dictionary, label) ==
        selected_category,
    )

    # Register each distinct path source once.
    for src in {flat_path.src for flat_path in flat_paths}:
        self.leaves.add(LeafNode(src, get_node_table_name(self.model, src)))

    nodes_with_props, roots = self.get_props_for_nodes()
    self.collectors, self.roots = self.create_tree_from_generated_edges(
        flat_paths, nodes_with_props, roots)
    self.update_level()
    self.collectors.sort()
def translate_special(self, root_df):
    """
    Run the special functions declared by the ``special_props`` entry of the
    etlMapping and merge their results into the root records.

    Only special nodes whose first ``fn`` element is ``"sliding"`` are
    processed; any other special node is skipped. For a sliding node the chain
    starting at ``head`` is followed through ``child`` (joining the upward
    edge and the node table at every hop), the collected values are ordered by
    property key, passed to ``sliding`` with the remaining three ``fn``
    elements, and the outcome is stored under the id of the prop named after
    the special node.

    :param root_df: rdd of the hosted document's root (case or subject); the
        special function shares the same root
    :return: the original rdd with the results of the special functions
        merged in (``root_df`` itself when there are no special nodes)
    """
    if len(self.parser.special_nodes) == 0:
        return root_df
    root_tbl = get_node_table_name(self.parser.model, self.parser.root)
    root_id = self.parser.get_key_prop().id
    for f in self.parser.special_nodes:
        if f.fn[0] != "sliding":
            continue
        df = self.translate_table(root_tbl, props=[])
        node = f.head
        is_first_hop = True
        while node is not None:
            df = df.join(self.translate_edge(node.edge_up_tbl))
            if is_first_hop:
                # Re-key by the value coming from the edge and keep the
                # original key in the record under ``root_id``.
                df = df.map(
                    lambda x: (x[1][1], ({root_id: x[0]},) + (x[1][0],))
                ).mapValues(lambda x: merge_dictionary(x[0], x[1]))
                is_first_hop = False
            else:
                df = df.map(lambda x: (x[1][1], x[1][0]))
            cur_props = node.props
            node_df = self.translate_table(node.tbl, props=cur_props)
            # ``cur_props`` is rebound on every iteration; binding it as a
            # default argument freezes the value for this transformation even
            # if the callable is only evaluated later.
            df = node_df.join(df).mapValues(
                lambda x, props=cur_props: merge_and_fill_empty_props(x, props)
            )
            node = node.child
        df = df.map(lambda x: make_key_from_property(x[1], root_id))

        n_arg, fn1, fn2 = f.fn[1:]
        fid = self.parser.get_prop_by_name(f.name).id
        # Values ordered by their property key.
        df = df.mapValues(
            lambda x: tuple(v for _, v in sorted(x.items())))
        # ``fid`` changes per special node, so it is bound eagerly as well.
        df = sliding(df, int(n_arg.strip()), fn1.strip(),
                     fn2.strip()).mapValues(lambda x, fid=fid: {fid: x})
        root_df = root_df.leftOuterJoin(df).mapValues(
            lambda x: merge_dictionary(x[0], x[1]))
    return root_df
def create_tree_from_generated_edges(self, flat_paths, nodes_with_props,
                                     roots):
    """
    Build the collector tree from the flat paths whose first edge was generated.

    A path is ignored when the edge table of its first segment is not in
    ``self.generated_edges``. Otherwise a ``CollectingNode`` is ensured for
    the path source, one collecting node is chained per intermediate segment
    via ``add_collecting_node``, and the last node is attached to a root via
    ``add_root_node`` using the final segment.

    :param flat_paths: iterable of objects exposing ``src`` and ``path``
    :param nodes_with_props: dictionary of pre-built collecting nodes keyed by
        label; it is extended in place
    :param roots: dictionary of root nodes keyed by label; it is handed to
        ``add_root_node`` for every accepted path
    :return: tuple ``(collectors, roots)`` as lists of the dictionaries' values
    :raises IndexError: if a path has no segment
    """
    collectors = nodes_with_props
    known_edges = set(self.generated_edges)
    for flat_path in flat_paths:
        segments = list(flat_path.path)
        src = flat_path.src
        _, first_edge_tbl = get_edge_table(self.model, src, segments[0])
        if first_edge_tbl not in known_edges:
            continue
        if src not in collectors:
            collectors[src] = CollectingNode(
                src, get_node_table_name(self.model, src))
        child = collectors[src]
        # Every segment but the last one yields an intermediate collector.
        for segment in segments[:-1]:
            # NOTE(review): this looks up the edge for (src, segments[0])
            # again rather than for the current node/segment, so it repeats
            # the check already passed above -- confirm whether checking
            # (child.name, segment) was intended.
            _, edge_tbl = get_edge_table(self.model, src, segments[0])
            if edge_tbl not in known_edges:
                break
            child = self.add_collecting_node(child, collectors, segment)
        self.add_root_node(child, roots, segments[-1])
    return list(collectors.values()), list(roots.values())
def add_root_node(self, child, roots, segment):
    """
    Attach ``child`` to the root node reached by following ``segment`` upwards.

    An existing entry of ``roots`` is reused when the root label is already
    registered; otherwise a ``RootNode`` is created carrying a single
    additional prop ``program_name`` sourced from ``name``. The root is
    (re)stored in ``roots``.

    :param child: node to attach; its ``name`` is used for the lookups
    :param roots: dictionary of root nodes keyed by label, updated in place
    :param segment: path segment leading from ``child`` to the root
    """
    raw_root_name = get_parent_name(self.model, child.name, segment)
    root_label = get_node_label(self.model, raw_root_name)
    _, edge_up_tbl = get_edge_table(self.model, child.name, segment)
    root_tbl = get_node_table_name(
        self.model, get_parent_label(self.model, child.name, segment))

    if root_label not in roots:
        default_props = self.create_props_from_json(
            self.doc_type,
            [{"name": "program_name", "src": "name"}],
            node_label=root_label,
            is_additional=True,
        )
        top_node = RootNode(root_label, root_tbl, default_props)
    else:
        top_node = roots[root_label]

    child.add_parent(top_node.name, edge_up_tbl)
    top_node.add_child(child)
    roots[root_label] = top_node