예제 #1
0
    def translate_parent(self, root_df):
        """
        Attach properties collected along each configured parent chain to the root RDD.

        For every entry of ``self.parser.parent_nodes`` the chain starting at
        ``head`` is walked through its ``child`` links. At each step the running
        RDD is joined with the step's edge table, re-keyed, and merged with the
        step's node table. The result is re-keyed via ``make_key_from_property``
        and left-outer-joined onto ``root_df``.

        :param root_df: pair RDD of the root node (assumed to be keyed by root id)
        :return: ``root_df`` itself when there are no parent nodes, otherwise the
            RDD produced by the successive left outer joins
        """
        if not self.parser.parent_nodes:
            return root_df
        root_tbl = get_node_table_name(self.parser.model, self.parser.root)
        root_id = self.parser.get_key_prop().id
        for f in self.parser.parent_nodes:
            df = self.translate_table(root_tbl, props=[])
            n = f.head
            first = True
            while n is not None:
                df = df.join(self.translate_edge(n.edge_up_tbl, reversed=False))
                if first:
                    # First hop: remember the root id inside the value so it can
                    # be turned back into the key once the chain is exhausted.
                    df = df.map(lambda x: (x[1][1], ({root_id: x[0]},) + (x[1][0],))) \
                        .mapValues(lambda x: merge_dictionary(x[0], x[1]))
                    first = False
                else:
                    df = df.map(lambda x: (x[1][1], x[1][0]))
                cur_props = n.props
                n_df = self.translate_table(n.tbl, props=cur_props)

                # Bind cur_props as a default argument: the lambda runs lazily,
                # and a plain closure would see whatever value the loop variable
                # holds when Spark finally serializes the function.
                df = n_df.join(df).mapValues(
                    lambda x, props=cur_props: merge_and_fill_empty_props(x, props))
                n = n.child
            df = df.map(lambda x: make_key_from_property(x[1], root_id))
            root_df = root_df.leftOuterJoin(df).mapValues(
                lambda x: merge_dictionary(x[0], x[1]))
        return root_df
예제 #2
0
 def get_direct_children(self, root_df):
     """
     Get data of all directly connected nodes and attach it to the root node.

     For each entry of ``self.parser.flatten_props`` the edge table is joined
     with the node table, optionally reduced to the first record per root
     according to ``sorted_by`` / ``desc_order``, and left-outer-joined onto
     ``root_df``.

     :param root_df: pair RDD of the root node
     :return: ``root_df`` enriched with the properties of every flattened node
     """
     for n in self.parser.flatten_props:
         # if n is a child of root node, we don't need to swap order of the pair ids
         edge_df = self.translate_edge(n.edge, not n.props_from_child)
         # Copy the list: appending the sort property to n.props itself would
         # permanently alter the parser's node definition on every call.
         props = list(n.props)
         sort_id = None
         if n.sorted_by is not None:
             sorting_prop = PropFactory.adding_prop(self.parser.doc_type,
                                                    n.sorted_by,
                                                    n.sorted_by, [])
             props.append(sorting_prop)
             sort_id = sorting_prop.id
         child_df = self.translate_table(n.tbl_name, props=props)
         child_by_root = edge_df.join(child_df).map(
             lambda x: (x[1][0], x[1][1]))
         if sort_id is not None:
             # Loop-dependent values are bound as lambda defaults so that each
             # lazily evaluated function keeps the values of its own iteration.
             child_by_root = child_by_root.groupByKey().mapValues(
                 lambda it, sort_id=sort_id, desc=n.desc_order:
                 sort_by_field(it, sort_id, desc)[0])
             # The sort key was only fetched for ordering; drop it again.
             child_by_root = child_by_root.mapValues(
                 lambda x, sort_id=sort_id:
                 {k: v for (k, v) in x.items() if k != sort_id})
         # NOTE(review): as before, the fill list still contains the sort
         # property (if any), so unmatched roots receive an empty value for it
         # while matched roots do not carry it - confirm whether that is wanted.
         root_df = root_df.leftOuterJoin(child_by_root).mapValues(
             lambda x, props=props: merge_and_fill_empty_props(x, props))
         child_df.unpersist()
         child_by_root.unpersist()
     return root_df
예제 #3
0
 def join_no_aggregate(self, df, joining_df, dual_props):
     """
     Left-outer-join selected properties of ``joining_df`` onto ``df`` without aggregation.

     :param df: pair RDD that receives the properties
     :param joining_df: pair RDD the properties are taken from; it is first
         reduced through ``self.get_props_from_df``
     :param dual_props: iterable of mappings; the ``"dst"`` entry of each one is
         passed to ``merge_and_fill_empty_props`` as the property list
     :return: the joined RDD
     """
     joining_df = self.get_props_from_df(joining_df, dual_props)
     # Build the destination property list once instead of once per record
     # inside the lambda.
     dst_props = [p.get("dst") for p in dual_props]
     df = df.leftOuterJoin(joining_df).mapValues(
         lambda x: merge_and_fill_empty_props(x, dst_props))
     joining_df.unpersist()
     return df
예제 #4
0
 def join_and_aggregate(self, df, joining_df, dual_props, joining_node):
     """
     Aggregate ``joining_df`` per key and left-outer-join the result onto ``df``.

     The selected properties are converted with ``get_normal_frame``, combined
     per key with ``aggregateByKey`` (starting from ``get_frame_zero``), and each
     aggregated frame is turned into a dict from the second and third element
     of every triple it contains.

     :param df: pair RDD that receives the aggregated properties
     :param joining_df: pair RDD the properties are taken from
     :param dual_props: iterable of mappings; the ``'dst'`` entry of each one is
         passed to ``merge_and_fill_empty_props`` as the property list
     :param joining_node: node whose ``getting_fields`` drive the aggregation
     :return: the joined RDD
     """
     frame_zero = get_frame_zero(joining_node.getting_fields)
     joining_df = self.get_props_from_df(joining_df, dual_props)\
         .mapValues(get_normal_frame(joining_node.getting_fields))\
         .aggregateByKey(frame_zero, seq_aggregate_with_reducer, merge_aggregate_with_reducer)\
         .mapValues(lambda x: {x1: x2 for (x0, x1, x2) in x})
     # Build the destination property list once instead of once per record
     # inside the lambda.
     dst_props = [p.get('dst') for p in dual_props]
     df = df.leftOuterJoin(joining_df)\
         .mapValues(lambda x: merge_and_fill_empty_props(x, dst_props))
     joining_df.unpersist()
     return df
예제 #5
0
 def collect_leaf(self, child, edge_df, collected_leaf_dfs, root_props=None):
     """
     Join a leaf node's table with the accumulated edge data and collect the result.

     Nothing happens unless ``child`` is a ``LeafNode`` whose table is non-empty.
     Otherwise the joined rows are stored under ``collected_leaf_dfs['final']``
     (unioned with any previous content and de-duplicated), an RDD stored under
     the child's own name is unpersisted, and ``child.done`` is set.

     :param child: node to process
     :param edge_df: pair RDD joined onto the leaf table
     :param collected_leaf_dfs: dict of collected RDDs, updated in place
     :param root_props: properties handed to ``merge_and_fill_empty_props``;
         defaults to ``self.root_props``
     :return: None
     """
     root_props = self.root_props if root_props is None else root_props
     # isinstance (rather than an exact type comparison) also accepts
     # subclasses of LeafNode.
     if not isinstance(child, LeafNode):
         return
     child_df = self.translate_table(child.tbl_name, props=self.parser.props)
     if child_df.isEmpty():
         return
     child_df = child_df.join(edge_df).mapValues(
         lambda x: merge_and_fill_empty_props(x, root_props, to_tuple=True))
     if 'final' in collected_leaf_dfs:
         collected_leaf_dfs['final'] = collected_leaf_dfs['final'].union(child_df).distinct()
     else:
         collected_leaf_dfs['final'] = child_df
     if child.name in collected_leaf_dfs:
         collected_leaf_dfs[child.name].unpersist()
     child.done = True
예제 #6
0
 def translate_special(self, root_df):
     """
     If etlMapping has a special_props entry that defines a special function, run this translation.

     Only entries whose function name (``fn[0]``) is ``"sliding"`` are handled.
     For those, the node chain starting at ``head`` is joined hop by hop, the
     collected values are ordered by property key, and ``sliding`` is applied
     with the three remaining ``fn`` arguments.

     :param root_df: The special function has the same root as the hosted document (case or subject)
     :return: The original RDD with the result of the special function merged in
     """
     if not self.parser.special_nodes:
         return root_df
     root_tbl = get_node_table_name(self.parser.model, self.parser.root)
     root_id = self.parser.get_key_prop().id
     for f in self.parser.special_nodes:
         if f.fn[0] != "sliding":
             continue
         df = self.translate_table(root_tbl, props=[])
         n = f.head
         first = True
         while n is not None:
             df = df.join(self.translate_edge(n.edge_up_tbl))
             if first:
                 # First hop: keep the root id inside the value so it can be
                 # restored as the key after the chain has been walked.
                 df = df.map(lambda x: (x[1][1], ({
                     root_id: x[0]
                 }, ) + (x[1][0], ))).mapValues(
                     lambda x: merge_dictionary(x[0], x[1]))
                 first = False
             else:
                 df = df.map(lambda x: (x[1][1], x[1][0]))
             cur_props = n.props
             n_df = self.translate_table(n.tbl, props=cur_props)
             # Bind cur_props as a default argument so the lazily evaluated
             # lambda keeps the value of this iteration.
             df = n_df.join(df).mapValues(
                 lambda x, props=cur_props: merge_and_fill_empty_props(x, props))
             n = n.child
         df = df.map(lambda x: make_key_from_property(x[1], root_id))
         window, fn1, fn2 = f.fn[1:]
         fid = self.parser.get_prop_by_name(f.name).id
         # Values ordered by their property key.
         df = df.mapValues(
             lambda x: tuple(v for (_, v) in sorted(x.items())))
         df = sliding(df, int(window.strip()), fn1.strip(),
                      fn2.strip()).mapValues(lambda x, fid=fid: {fid: x})
         root_df = root_df.leftOuterJoin(df).mapValues(
             lambda x: merge_dictionary(x[0], x[1]))
     return root_df
예제 #7
0
 def merge_auth_root(self, root):
     """
     Join a root table with every node along its ``root_child`` chain.

     Each hop joins the running RDD with the hop's edge table, re-keys it, and
     merges in the hop's node table. Finally ``construct_project_id`` is applied
     to every value with the accumulated property list and the id of the
     ``project_id`` property (created through ``PropFactory`` when the parser
     does not know it yet).

     :param root: root node providing ``tbl_name``, ``props`` and ``root_child``
     :return: pair RDD whose values went through ``construct_project_id``
     """
     df = self.translate_table(root.tbl_name, props=root.props)
     child = root.root_child
     # Copy so that extending the list below leaves root.props untouched.
     props = copy(root.props)
     while child is not None:
         child_props = child.props
         df = df.join(self.translate_edge(child.edge_to_parent)) \
             .map(lambda x: (x[1][1], x[1][0]))
         # Bind child_props as a default argument: the lambda is evaluated
         # lazily and the loop variable is reassigned before the next join
         # forces evaluation of this one.
         df = df.join(self.translate_table(child.tbl_name, props=child_props)) \
             .mapValues(lambda x, cp=child_props: merge_and_fill_empty_props(x, cp))
         props.extend(child_props)
         child = child.root_child
     project_id_prop = self.parser.get_prop_by_name('project_id')
     if project_id_prop is None:
         project_id_prop = PropFactory.adding_prop(self.parser.doc_type, 'project_id', None, [])
     root_id = project_id_prop.id
     return df.mapValues(lambda x: construct_project_id(x, props, root_id))
예제 #8
0
 def merge_roots_to_children(self):
     """
     Join every root in ``self.parser.roots`` with each of its children.

     A root without ``root_child`` is read directly from its table and its id
     is injected into the values under the ``<root name>_id`` property; other
     roots are built by ``merge_auth_root``. Each joined RDD is handed to
     ``self.collect_collecting_child``.

     :return: tuple ``(collected_collecting_dfs, collected_leaf_dfs)``; the
         second dict is created here and returned without being filled by this
         method
     """
     collected_leaf_dfs = {}
     collected_collecting_dfs = {}
     for root in self.parser.roots:
         if root.root_child is None:
             df = self.translate_table(root.tbl_name, props=root.props)
             root_id = self.parser.get_prop_by_name('{}_id'.format(root.name)).id
         else:
             df = self.merge_auth_root(root)
         props = root.props
         for child in root.children:
             edge_tbl = child.parents[root.name]
             tmp_df = df.join(self.translate_edge(edge_tbl))
             if root.root_child is None:
                 # root_id and props change with every root; bind them as
                 # defaults because these lambdas are evaluated lazily, possibly
                 # after the loop has moved on to another root.
                 tmp_df = tmp_df.map(
                     lambda x, root_id=root_id: (x[1][1], ({root_id: x[0]},) + (x[1][0],)))\
                     .mapValues(
                         lambda x, props=props: merge_and_fill_empty_props(x, props, to_tuple=True))
             else:
                 tmp_df = tmp_df.map(lambda x: (x[1][1], x[1][0])) \
                     .mapValues(lambda x: tuple(x.items()))
             self.collect_collecting_child(child, tmp_df, collected_collecting_dfs)
     return collected_collecting_dfs, collected_leaf_dfs
예제 #9
0
 def collect_leaf(self,
                  child,
                  edge_df,
                  collected_leaf_dfs,
                  root_props=None):
     """
     Join a leaf node's table with the accumulated edge data and collect the result.

     Every row of the leaf table is tagged with the leaf's name under the
     ``"source_node"`` key. Nothing further happens when ``child`` is not a
     ``LeafNode`` or its table is empty. Otherwise the joined rows are stored
     under ``collected_leaf_dfs["final"]`` (unioned with previous content and
     de-duplicated), an RDD stored under the child's own name is unpersisted,
     and ``child.done`` is set.

     :param child: node to process
     :param edge_df: pair RDD joined onto the leaf table
     :param collected_leaf_dfs: dict of collected RDDs, updated in place
     :param root_props: properties handed to ``merge_and_fill_empty_props``;
         defaults to ``self.root_props``
     :return: None
     """
     if root_props is None:
         root_props = self.root_props
     if not isinstance(child, LeafNode):
         return

     leaf_df = self.translate_table(child.tbl_name, props=self.parser.props)
     leaf_df = leaf_df.mapValues(
         lambda row: merge_dictionary({"source_node": child.name}, row))
     if leaf_df.isEmpty():
         return

     leaf_df = leaf_df.join(edge_df).mapValues(
         lambda pair: merge_and_fill_empty_props(
             pair, root_props, to_tuple=True))

     if "final" in collected_leaf_dfs:
         merged = collected_leaf_dfs["final"].union(leaf_df).distinct()
     else:
         merged = leaf_df
     collected_leaf_dfs["final"] = merged

     if child.name in collected_leaf_dfs:
         collected_leaf_dfs[child.name].unpersist()
     child.done = True