Пример #1
0
    def merge_project(self, child, edge_df, collected_collecting_dfs):
        """
        Join the child's table with ``edge_df``, add a project id value and
        store the result under ``collected_collecting_dfs[child.name]``.

        ``child.no_parent_to_map`` is decremented on every call. When
        ``edge_df`` is ``None`` or empty, nothing else is done.

        :param child: node exposing ``tbl_name``, ``props``, ``name`` and the
            ``no_parent_to_map`` counter
        :param edge_df: keyed dataset joined against the child's rows, or None
        :param collected_collecting_dfs: dict updated in place, keyed by the
            child's name
        :return: None
        """
        # The emptiness check runs before the counter is touched, exactly as
        # in the two-branch form.
        has_edges = edge_df is not None and not edge_df.isEmpty()
        child.no_parent_to_map -= 1
        if not has_edges:
            return

        def merge_pair(pair):
            # A join yields (left, right) pairs as values.
            return merge_dictionary(pair[0], pair[1])

        rows = self.translate_table(child.tbl_name, props=child.props)
        rows = rows.join(edge_df).mapValues(merge_pair)

        project_code_id = self.parser.get_prop_by_name(PROJECT_CODE).id
        program_name_id = self.parser.get_prop_by_name(PROGRAM_NAME).id
        project_id_prop = self.parser.get_prop_by_name(PROJECT_ID)
        if project_id_prop is None:
            # The PROJECT_ID property is not known to the parser yet, so it
            # is registered through PropFactory.
            project_id_prop = PropFactory.adding_prop(
                self.parser.doc_type, PROJECT_ID, None, [], prop_type=(str, ))

        def add_project_id(row):
            # Value has the form "<program name>-<project code>".
            return merge_dictionary(
                row,
                {
                    project_id_prop.id:
                    "{}-{}".format(row.get(program_name_id),
                                   row.get(project_code_id))
                },
            )

        def to_item_tuple(row):
            return tuple(row.items())

        rows = rows.mapValues(add_project_id)
        collected_collecting_dfs[child.name] = rows.mapValues(to_item_tuple)
Пример #2
0
    def translate_parent(self, root_df):
        """
        Merge properties collected along each parent chain into the root rows.

        For every chain in ``self.parser.parent_nodes`` the root table is
        joined edge by edge (``n.edge_up_tbl``) with each node's table
        (``n.tbl`` / ``n.props``), following ``n.child`` until it is None.
        The collected values are then re-keyed with
        ``make_key_from_property(..., root_id)`` and left-outer-joined onto
        ``root_df``.

        :param root_df: keyed dataset of root rows
        :return: ``root_df`` unchanged when there are no parent nodes,
            otherwise ``root_df`` with the collected properties merged in
        """
        if not self.parser.parent_nodes:
            return root_df
        root_tbl = get_node_table_name(self.parser.model, self.parser.root)
        root_id = self.parser.get_key_prop().id
        for f in self.parser.parent_nodes:
            df = self.translate_table(root_tbl, props=[])
            n = f.head
            first = True
            while n is not None:
                df = df.join(self.translate_edge(n.edge_up_tbl, reversed=False))
                if first:
                    # Re-key by the joined edge value and remember the
                    # original key under root_id so it can be restored after
                    # the walk.
                    df = df.map(lambda x, root_id=root_id: (
                        x[1][1],
                        merge_dictionary({root_id: x[0]}, x[1][0]),
                    ))
                    first = False
                else:
                    # Re-key by the joined edge value, carrying the
                    # accumulated value along.
                    df = df.map(lambda x: (x[1][1], x[1][0]))
                cur_props = n.props
                n_df = self.translate_table(n.tbl, props=cur_props)

                # cur_props is rebound on every iteration while the
                # transformation is evaluated lazily; binding it as a default
                # freezes the value that belongs to this node.
                df = n_df.join(df).mapValues(
                    lambda x, props=cur_props: merge_and_fill_empty_props(
                        x, props))
                n = n.child
            df = df.map(lambda x, root_id=root_id: make_key_from_property(
                x[1], root_id))
            root_df = root_df.leftOuterJoin(df).mapValues(
                lambda x: merge_dictionary(x[0], x[1]))
        return root_df
Пример #3
0
 def collect_collecting_child(self, child, edge_df,
                              collected_collecting_dfs):
     """
     Combine the child's own properties with ``edge_df`` and accumulate the
     result under ``collected_collecting_dfs[child.name]``.

     ``child.no_parent_to_map`` is decremented on every call. When
     ``edge_df`` is ``None`` or empty, nothing else is done.

     :param child: node exposing ``tbl_name``, ``props``, ``name`` and the
         ``no_parent_to_map`` counter
     :param edge_df: keyed dataset whose values are concatenated (``+``)
         onto the child's values, or None
     :param collected_collecting_dfs: dict updated in place, keyed by the
         child's name
     :return: None
     """
     # The emptiness check runs before the counter is touched.
     nothing_to_join = edge_df is None or edge_df.isEmpty()
     child.no_parent_to_map -= 1
     if nothing_to_join:
         return

     if len(child.props) > 0:
         rows = self.translate_table(child.tbl_name, props=child.props)
         node = self.parser.get_prop_by_name(get_node_id_name(child.name))
         node_id = None if node is None else node.id
         if node_id is None:
             # No usable node-id property: only convert each row to a tuple
             # of (key, value) pairs.
             rows = rows.mapValues(lambda row: tuple(row.items()))
         else:
             # Store the row key under the node-id property as well.
             rows = rows.map(lambda kv: (
                 kv[0],
                 merge_dictionary(kv[1], {node_id: kv[0]}, to_tuple=True),
             ))
         combined = rows.join(edge_df).mapValues(
             lambda pair: pair[0] + pair[1])
     else:
         # The child contributes no properties of its own.
         combined = edge_df

     if child.name in collected_collecting_dfs:
         previous = collected_collecting_dfs[child.name]
         collected_collecting_dfs[child.name] = previous.fullOuterJoin(
             combined).mapValues(lambda pair: merge_data_frames(pair))
     else:
         collected_collecting_dfs[child.name] = combined
Пример #4
0
    def translate_final(self):
        """
        Build the final dataset from the data loaded by
        ``self.load_from_hadoop()``.

        One file can belong to multiple root nodes (case, subject), so in the
        final step of the file document the aggregating properties are
        aggregated per key and merged back into the remaining, de-duplicated
        properties.

        :return: the loaded dataset unchanged when there are no aggregating
            properties, otherwise the remaining properties joined with the
            aggregated ones
        """
        df = self.load_from_hadoop()
        aggregating_props = self.get_aggregating_props()
        if len(aggregating_props) == 0:
            return df

        frame_zero = get_frame_zero(aggregating_props)

        def select_aggregating(row):
            return get_props_to_tuple(row, aggregating_props)

        def triples_to_dict(triples):
            # Only the second and third element of each triple are kept.
            return {name: value for (_, name, value) in triples}

        def drop_aggregating(row):
            return remove_props_from_tuple(row, aggregating_props)

        def pairs_to_dict(pairs):
            return {name: value for (name, value) in pairs}

        aggregated = df.mapValues(select_aggregating)
        aggregated = aggregated.aggregateByKey(
            frame_zero, seq_aggregate_with_prop, merge_aggregate_with_prop)
        aggregated = aggregated.mapValues(triples_to_dict)

        remaining = df.mapValues(drop_aggregating).distinct()
        remaining = remaining.mapValues(pairs_to_dict)

        return remaining.join(aggregated).mapValues(
            lambda pair: merge_dictionary(pair[0], pair[1]))
Пример #5
0
 def translate_special(self, root_df):
     """
     If etlMapping has a special_props entry that defines a special
     function, run that translation. Only entries whose ``fn[0]`` is
     ``"sliding"`` are processed; any other entry is skipped.

     For a sliding entry the root table is joined edge by edge with each
     node of the chain starting at ``f.head``, the collected values are
     re-keyed by the root id, ordered by property key, passed to
     ``sliding`` and merged into ``root_df`` under the id of the property
     named ``f.name``.

     :param root_df: The special function has the same root as the hosting
         document (case or subject)
     :return: ``root_df`` unchanged when there are no special nodes,
         otherwise the original rdd with the special function results
         merged in
     """
     if not self.parser.special_nodes:
         return root_df
     root_tbl = get_node_table_name(self.parser.model, self.parser.root)
     root_id = self.parser.get_key_prop().id
     for f in self.parser.special_nodes:
         if f.fn[0] != "sliding":
             continue
         df = self.translate_table(root_tbl, props=[])
         node = f.head
         first = True
         while node is not None:
             df = df.join(self.translate_edge(node.edge_up_tbl))
             if first:
                 # Re-key by the joined edge value and remember the
                 # original key under root_id so it can be restored after
                 # the walk.
                 df = df.map(lambda x, root_id=root_id: (
                     x[1][1],
                     merge_dictionary({root_id: x[0]}, x[1][0]),
                 ))
                 first = False
             else:
                 df = df.map(lambda x: (x[1][1], x[1][0]))
             cur_props = node.props
             n_df = self.translate_table(node.tbl, props=cur_props)
             # cur_props is rebound on every iteration while the
             # transformation is evaluated lazily; binding it as a default
             # freezes the value that belongs to this node.
             df = n_df.join(df).mapValues(
                 lambda x, props=cur_props: merge_and_fill_empty_props(
                     x, props))
             node = node.child
         df = df.map(lambda x, root_id=root_id: make_key_from_property(
             x[1], root_id))
         # f.fn[1:] carries three strings: the window size and two function
         # names handed to sliding().
         (window, fn1, fn2) = tuple(f.fn[1:])
         fid = self.parser.get_prop_by_name(f.name).id
         # Values ordered by property key.
         df = df.mapValues(
             lambda x: tuple(v for (_, v) in sorted(x.items())))
         # fid changes per special node, so it is bound as a default too.
         df = sliding(df, int(window.strip()), fn1.strip(),
                      fn2.strip()).mapValues(lambda x, fid=fid: {fid: x})
         root_df = root_df.leftOuterJoin(df).mapValues(
             lambda x: merge_dictionary(x[0], x[1]))
     return root_df
Пример #6
0
 def translate(self):
     """
     Build the document dataset for the parser's root node.

     The root table is translated first, then passed in order through
     ``translate_special``, ``translate_parent`` and
     ``get_direct_children``. When the parser has aggregated nodes, the
     result of ``aggregate_nested_properties`` is joined in and merged.

     :return: the resulting keyed dataset
     """
     root_tbl = get_node_table_name(self.parser.model, self.parser.root)
     root_df = self.translate_table(root_tbl, props=self.parser.props)

     # Each stage takes the current dataset and returns the enriched one.
     stages = (
         self.translate_special,
         self.translate_parent,
         self.get_direct_children,
     )
     for stage in stages:
         root_df = stage(root_df)

     if len(self.parser.aggregated_nodes) > 0:
         return root_df.join(self.aggregate_nested_properties()).mapValues(
             lambda pair: merge_dictionary(pair[0], pair[1]))
     return root_df
Пример #7
0
 def ensure_project_id_exist(self, df):
     """
     Add a project id value to every row when the parser has no PROJECT_ID
     property yet.

     :param df: keyed dataset whose values support ``.get``
     :return: ``df`` unchanged when the PROJECT_ID property already exists,
         otherwise ``df`` with a "<program name>-<project code>" value
         merged in under the newly registered property's id
     """
     if self.parser.get_prop_by_name(PROJECT_ID) is not None:
         return df

     # Register the missing property, then derive its value per row.
     project_id_prop = PropFactory.adding_prop(
         self.parser.doc_type, PROJECT_ID, None, [], prop_type=(str, ))
     project_code_id = self.parser.get_prop_by_name(PROJECT_CODE).id
     program_name_id = self.parser.get_prop_by_name(PROGRAM_NAME).id

     def add_project_id(row):
         return merge_dictionary(
             row,
             {
                 project_id_prop.id:
                 "{}-{}".format(row.get(program_name_id),
                                row.get(project_code_id))
             },
         )

     return df.mapValues(add_project_id)
Пример #8
0
 def collect_leaf(self,
                  child,
                  edge_df,
                  collected_leaf_dfs,
                  root_props=None):
     """
     Collect the rows of a leaf node into ``collected_leaf_dfs["final"]``.

     Nothing is done when ``child`` is not a ``LeafNode`` or when its
     translated table is empty. Otherwise the rows are tagged with
     ``"source_node"``, joined with ``edge_df``, unioned (de-duplicated)
     into the ``"final"`` entry, and ``child.done`` is set to True.

     :param child: node to collect; only ``LeafNode`` instances are handled
     :param edge_df: keyed dataset joined against the leaf rows
     :param collected_leaf_dfs: dict updated in place
     :param root_props: properties passed to ``merge_and_fill_empty_props``;
         defaults to ``self.root_props``
     :return: None
     """
     if root_props is None:
         root_props = self.root_props
     if not isinstance(child, LeafNode):
         return

     leaf_df = self.translate_table(child.tbl_name, props=self.parser.props)
     leaf_df = leaf_df.mapValues(
         lambda row: merge_dictionary({"source_node": child.name}, row))
     if leaf_df.isEmpty():
         return

     leaf_df = leaf_df.join(edge_df).mapValues(
         lambda pair: merge_and_fill_empty_props(
             pair, root_props, to_tuple=True))

     if "final" in collected_leaf_dfs:
         collected_leaf_dfs["final"] = (
             collected_leaf_dfs["final"].union(leaf_df).distinct())
     else:
         collected_leaf_dfs["final"] = leaf_df

     # Release any dataset stored under this child's own name.
     if child.name in collected_leaf_dfs:
         collected_leaf_dfs[child.name].unpersist()
     child.done = True