示例#1
0
 def collect_collecting_child(self, child, edge_df,
                              collected_collecting_dfs):
     """
     Fold one parent's edge rdd into the accumulated rdd of a collecting child.

     :param child: collecting child node; its ``no_parent_to_map`` counter is
         decremented exactly once per call, whether or not edge_df has data.
     :param edge_df: rdd of (child_id, props) pairs produced by one parent,
         or None when that parent produced nothing.
     :param collected_collecting_dfs: dict mapping child name -> accumulated
         rdd; updated in place.
     :return: None
     """
     # One parent is accounted for per call regardless of whether it
     # produced data, so decrement unconditionally instead of repeating
     # the decrement in both branches.
     child.no_parent_to_map -= 1
     if edge_df is None or edge_df.isEmpty():
         return
     if len(child.props) > 0:
         # The child carries its own properties: translate its table and
         # attach those properties to the incoming edge data.
         child_df = self.translate_table(child.tbl_name, props=child.props)
         node = self.parser.get_prop_by_name(get_node_id_name(child.name))
         node_id = node.id if node is not None else None
         if node_id is not None:
             # An id prop is registered for this child: inject the row key
             # into the property dict under that id.
             child_df = child_df.map(lambda x: (
                 x[0],
                 merge_dictionary(x[1], {node_id: x[0]}, to_tuple=True),
             ))
         else:
             # No id prop registered: just flatten the dict to item tuples.
             child_df = child_df.mapValues(
                 lambda x: tuple((k, v) for (k, v) in x.items()))
         child_df = child_df.join(edge_df).mapValues(lambda x: x[0] + x[1])
     else:
         # No own properties: the edge data is all there is to collect.
         child_df = edge_df
     # Merge with whatever earlier parents already contributed for this child.
     if child.name not in collected_collecting_dfs:
         collected_collecting_dfs[child.name] = child_df
     else:
         collected_collecting_dfs[child.name] = (
             collected_collecting_dfs[child.name].fullOuterJoin(
                 child_df).mapValues(lambda x: merge_data_frames(x)))
示例#2
0
    def join_to_an_index(self, df, translator, joining_node):
        """
        Perform the join between two indices. It will:
         - load the rdd to be joined from HDFS
         - join it with df
        :param df: rdd of the translator that performs the join
        :param translator: translator holding the rdd to be joined with this one
        :param joining_node: joining node defined in the yaml (etlMapping) file.
        :return: the joined rdd, keyed by the same field df was keyed by
        """
        joining_df = translator.load_from_hadoop()

        # For joining two indices, we need to swap the property field and key of one of the index.
        # based on join_on value in the etlMapping, we know what field is used as joining field.
        # We swap the index that have name of key field different than the name of joining field
        joining_df_key_id = translator.parser.get_key_prop().id
        id_field_in_joining_df = translator.parser.get_prop_by_name(
            joining_node.joining_field).id
        # field which is identity of a node is named as _{node}_id now
        # before in etl-mapping for joining_props, we use {node}_id
        # for backward compatibility, we check first with the value in mapping file.
        # if there is not any Prop object like that, we check with new format _{node}_id
        id_field_in_df = self.parser.get_prop_by_name(
            joining_node.joining_field)
        if id_field_in_df is None:
            id_field_in_df = self.parser.get_prop_by_name(
                get_node_id_name(self.parser.doc_type))
        if id_field_in_df is None:
            # joining field resolvable under neither naming scheme: fail loudly
            raise Exception("{} field does not exist in index {}".format(
                joining_node.joining_field, self.parser.doc_type))
        id_field_in_df_id = id_field_in_df.id
        df_key_id = self.parser.get_key_prop().id

        # only df's swap is remembered: joining_df is discarded after the join,
        # while df must be swapped back before returning (see below)
        swap_df = False
        if joining_df_key_id != id_field_in_joining_df:
            joining_df = swap_property_as_key(joining_df,
                                              id_field_in_joining_df,
                                              joining_df_key_id)
        if df_key_id != id_field_in_df_id:
            df = swap_property_as_key(df, id_field_in_df_id, df_key_id)
            swap_df = True

        # Join can be done with or without an aggregation function like max, min, sum, ...
        # these two types of join require different map-reduce steps
        props_with_fn, props_without_fn = self.get_joining_props(
            translator, joining_node)
        if len(props_with_fn) > 0:
            df = self.join_and_aggregate(df, joining_df, props_with_fn,
                                         joining_node)
        if len(props_without_fn) > 0:
            df = self.join_no_aggregate(df, joining_df, props_without_fn)

        # restore df's original key so callers see the keying they passed in
        if swap_df:
            df = swap_property_as_key(df, df_key_id, id_field_in_df_id)
        return df
示例#3
0
 def __init__(self, mapping, model):
     """
     Initialize the parser from one etl-mapping index entry.

     :param mapping: dict for a single index of the etl mapping; must
         contain the "name", "root" and "doc_type" keys.
     :param model: data model object the mapping refers to.
     """
     self.mapping = mapping
     self.model = model
     # Required top-level fields of the mapping entry.
     for field in ("name", "root", "doc_type"):
         setattr(self, field, mapping[field])
     self.joining_nodes = []
     self.additional_props = []
     self.types = []
     # Register the document id prop (presumably named "_{doc_type}_id"
     # by get_node_id_name) so it can be resolved like any other prop.
     PropFactory.adding_prop(
         self.doc_type,
         get_node_id_name(self.doc_type),
         "",
         [],
         src_node=None,
         src_index=None,
         fn=None,
         prop_type=(str,),
     )
示例#4
0
 def get_joining_props(self, translator, joining_index):
     """
     Get joining props added by an additional join between indices/documents.

     :param translator: translator owning the source index of the join
     :param joining_index: Joining index created from parser
     :return: (props_with_fn, props_without_fn) — each a list of
         {"src": Prop, "dst": Prop} dicts, split on whether the field
         carries an aggregation function.
     """
     with_fn = []
     without_fn = []
     remote_parser = translator.parser
     for field in joining_index.getting_fields:
         src_prop = remote_parser.get_prop_by_name(field.prop.name)
         # field which is identity of a node is named as _{node}_id now;
         # before, etl-mapping joining_props used {node}_id. For backward
         # compatibility try the mapping-file value first, then fall back
         # to the new _{node}_id format.
         if src_prop is None and field.prop.src == get_node_id_name_without_prefix(
                 remote_parser.doc_type):
             src_prop = remote_parser.get_prop_by_name(
                 get_node_id_name(remote_parser.doc_type))
         pair = {
             "src": src_prop,
             "dst": self.parser.get_prop_by_name(field.prop.name),
         }
         # Route to the appropriate bucket based on presence of an fn.
         (without_fn if field.fn is None else with_fn).append(pair)
     return with_fn, without_fn
示例#5
0
 def get_key_prop(self):
     """Return the Prop object registered as this document's id field."""
     key_field = get_node_id_name(self.doc_type)
     return PropFactory.get_prop_by_name(self.doc_type, key_field)
示例#6
0
def json_export(x, doc_type):
    """
    Serialize one (node_id, values) pair into a (node_id, json_string) pair.

    Mutates the values dict in place: the node id is injected both under
    the doc-type-specific id key and under the legacy "node_id" key.

    :param x: (node_id, values_dict) tuple
    :param doc_type: document type used to derive the id field name
    :return: (node_id, json string of the enriched values dict)
    """
    node_id, values = x
    values[get_node_id_name(doc_type)] = node_id
    # redundant field kept for backward compatibility with arranger
    values["node_id"] = node_id
    return (node_id, json.dumps(values))