def collect_collecting_child(self, child, edge_df, collected_collecting_dfs):
    """Fold one parent's edge dataframe into the per-child accumulator.

    Decrements the child's pending-parent counter on every call. When the
    edge dataframe is empty there is nothing to collect; otherwise the
    child's own props (if any) are translated, flattened to tuples, and
    joined with the edges, then merged into ``collected_collecting_dfs``
    under the child's name.

    :param child: collecting child node; its ``no_parent_to_map`` counter
        is decremented as a side effect.
    :param edge_df: rdd of edges from the parent, or None.
    :param collected_collecting_dfs: dict accumulating one rdd per child name.
    """
    # Each call accounts for exactly one parent edge, collected or not.
    child.no_parent_to_map -= 1
    if edge_df is None or edge_df.isEmpty():
        return

    if not child.props:
        child_df = edge_df
    else:
        child_df = self.translate_table(child.tbl_name, props=child.props)
        id_prop = self.parser.get_prop_by_name(get_node_id_name(child.name))
        prop_id = id_prop.id if id_prop is not None else None
        if prop_id is not None:
            # Inject the node id into each value dict, flattened to tuples.
            child_df = child_df.map(lambda x: (
                x[0],
                merge_dictionary(x[1], {prop_id: x[0]}, to_tuple=True),
            ))
        else:
            child_df = child_df.mapValues(
                lambda v: tuple((k, val) for (k, val) in v.items()))
        child_df = child_df.join(edge_df).mapValues(lambda x: x[0] + x[1])

    previous = collected_collecting_dfs.get(child.name)
    if previous is None:
        collected_collecting_dfs[child.name] = child_df
    else:
        # Same child reached via another parent: merge the two rdds.
        collected_collecting_dfs[child.name] = previous.fullOuterJoin(
            child_df).mapValues(lambda x: merge_data_frames(x))
def join_to_an_index(self, df, translator, joining_node):
    """
    Perform the join between two indices. It will:
    - load the rdd to be joined from HDFS
    - join it with df

    :param df: rdd of the translator that does the join
    :param translator: translator owning the rdd to be joined into this one
    :param joining_node: joining node defined in the yaml (etlMapping) file.
    :return: df with the joined properties added (re-keyed back if swapped)
    """
    joining_df = translator.load_from_hadoop()

    # For joining two indices, we need to swap the property field and key of one of the index.
    # based on join_on value in the etlMapping, we know what field is used as joining field.
    # We swap the index that have name of key field different than the name of joining field
    joining_df_key_id = translator.parser.get_key_prop().id
    id_field_in_joining_df = translator.parser.get_prop_by_name(
        joining_node.joining_field).id

    # field which is identity of a node is named as _{node}_id now
    # before in etl-mapping for joining_props, we use {node}_id
    # for backward compatibility, we check first with the value in mapping file.
    # if there is not any Prop object like that, we check with new format _{node}_id
    id_field_in_df = self.parser.get_prop_by_name(
        joining_node.joining_field)
    if id_field_in_df is None:
        id_field_in_df = self.parser.get_prop_by_name(
            get_node_id_name(self.parser.doc_type))
    if id_field_in_df is None:
        raise Exception("{} field does not exist in index {}".format(
            joining_node.joining_field, self.parser.doc_type))
    id_field_in_df_id = id_field_in_df.id
    df_key_id = self.parser.get_key_prop().id

    swap_df = False
    if joining_df_key_id != id_field_in_joining_df:
        joining_df = swap_property_as_key(joining_df, id_field_in_joining_df,
                                          joining_df_key_id)
    if df_key_id != id_field_in_df_id:
        df = swap_property_as_key(df, id_field_in_df_id, df_key_id)
        swap_df = True

    # Join can be done with or without an aggregation function like max, min, sum, ...
    # these two types of join require different map-reduce steps
    props_with_fn, props_without_fn = self.get_joining_props(
        translator, joining_node)
    if len(props_with_fn) > 0:
        df = self.join_and_aggregate(df, joining_df, props_with_fn,
                                     joining_node)
    if len(props_without_fn) > 0:
        df = self.join_no_aggregate(df, joining_df, props_without_fn)

    # Restore df's original key if it was swapped for the join above.
    if swap_df:
        df = swap_property_as_key(df, df_key_id, id_field_in_df_id)
    return df
def __init__(self, mapping, model):
    """Initialize the parser from one etlMapping entry and the data model.

    :param mapping: dict for a single index from the etlMapping yaml;
        must contain "name", "root" and "doc_type" keys (KeyError otherwise).
    :param model: the dictionary/data model object used during translation.
    """
    self.mapping = mapping
    self.model = model
    # Required etlMapping fields; fail fast with a clear error if absent.
    try:
        self.name = mapping["name"]
        self.root = mapping["root"]
        self.doc_type = mapping["doc_type"]
    except KeyError as e:
        raise ValueError(
            "etlMapping entry is missing required key: {}".format(e)
        ) from e
    self.joining_nodes = []
    self.additional_props = []
    # Register the synthetic identity prop (_{doc_type}_id) for this index.
    PropFactory.adding_prop(
        self.doc_type,
        get_node_id_name(self.doc_type),
        "",
        [],
        src_node=None,
        src_index=None,
        fn=None,
        prop_type=(str,),
    )
    self.types = []
def get_joining_props(self, translator, joining_index):
    """
    Get joining props added by an additional join between indices/documents.

    Splits the requested fields into those with an aggregation function
    and those without, resolving each field to a (src, dst) prop pair.

    :param translator: translator owning the source index's props
    :param joining_index: joining index created from the parser
    :return: tuple (props_with_fn, props_without_fn) of {"src", "dst"} dicts
    """
    with_fn = []
    without_fn = []
    for field in joining_index.getting_fields:
        src = translator.parser.get_prop_by_name(field.prop.name)
        # field which is identity of a node is named as _{node}_id now
        # before in etl-mapping for joining_props, we use {node}_id
        # for backward compatibility, we check first with the value in
        # the mapping file; if there is no such Prop object, we retry
        # with the new format _{node}_id
        if src is None and field.prop.src == get_node_id_name_without_prefix(
                translator.parser.doc_type):
            src = translator.parser.get_prop_by_name(
                get_node_id_name(translator.parser.doc_type))
        dst = self.parser.get_prop_by_name(field.prop.name)
        pair = {"src": src, "dst": dst}
        if field.fn is None:
            without_fn.append(pair)
        else:
            with_fn.append(pair)
    return with_fn, without_fn
def get_key_prop(self):
    """Return the Prop serving as this document type's key (its node id)."""
    key_name = get_node_id_name(self.doc_type)
    return PropFactory.get_prop_by_name(self.doc_type, key_name)
def json_export(x, doc_type):
    """Serialize one (key, doc) pair to (key, json_string).

    Mutates the doc in place, stamping the node id under both the
    _{doc_type}_id name and the legacy "node_id" field.

    :param x: (node_id, document_dict) pair
    :param doc_type: document type used to build the id field name
    :return: (node_id, json-encoded document)
    """
    key, doc = x
    doc[get_node_id_name(doc_type)] = key
    # redundant field for backward compatibility with arranger
    doc["node_id"] = key
    return (key, json.dumps(doc))