def graph_join_tree(first_collection_constructor,
                    collection_constructor_morphism,
                    second_collection_constructor,
                    left=False):
    """Join a graph collection with a tree collection and store the result as a graph.

    Every object of the first collection is looked up through the collection
    relationship. The dictionaries of all related elements are merged into the
    attribute dictionary of the vertex or edge, which is then added to a new
    ``nx.DiGraph``. The graph is pickled to the path built by
    ``parse_file_path(first_file_path, result_file_name)``.

    Args:
        first_collection_constructor: Passed to ``parse_info_for_join``;
            supplies the first (graph) collection and its model.
        collection_constructor_morphism: Passed to ``parse_info_for_join``;
            supplies the collection and model relationships.
        second_collection_constructor: Passed to ``parse_info_for_join``;
            supplies the second collection and its model.
        left: When ``True``, objects of the first collection that are not
            related to anything are copied to the result unchanged.

    Returns:
        A pair ``(CollectionConstructor, ModelCategoryJoin)`` describing the
        joined collection and the joined model.
    """
    (name, first_collection, first_model, collection_relationship,
     model_relationship, second_collection, second_model, first_file_path,
     result_file_name) = parse_info_for_join(first_collection_constructor,
                                             collection_constructor_morphism,
                                             second_collection_constructor)
    G = nx.DiGraph()
    objects = first_collection.get_iterable_collection_of_objects()
    ## Now elem is either (vertex_id, dict) or (source_id, target_id, dict)
    ## The result is assumed to be a simple dictionary
    ## We do not necessarily include all the elements in the second collection
    ## or in the first collection. This is the default.
    for elem in objects:
        ## The result set is a collection of subtrees
        result_list = collection_relationship.get_relationship(elem)
        if len(result_list) > 0:
            if len(elem) == 2:
                merged_dict = dict()
                for elem2 in result_list:
                    merged_dict = merge_two_dicts(
                        merged_dict, merge_two_dicts(elem[1], elem2))
                G.add_nodes_from([(elem[0], merged_dict)])
            elif len(elem) == 3:
                merged_dict = dict()
                for elem2 in result_list:
                    merged_dict = merge_two_dicts(
                        merged_dict, merge_two_dicts(elem[2], elem2))
                G.add_edges_from([(elem[0], elem[1], merged_dict)])
        ## We add all the elements from the first collection even if they are
        ## not in relation with any element in the second collection.
        ## The explicit comparison with True is kept so that only True (or 1)
        ## switches the left join on, exactly as before.
        elif left == True:
            if len(elem) == 2:
                G.add_nodes_from([elem])
            elif len(elem) == 3:
                G.add_edges_from([elem])
    result_file_path = parse_file_path(first_file_path, result_file_name)
    ## NOTE(review): nx.write_gpickle was removed in NetworkX 3.0 -- confirm
    ## the NetworkX version this project pins.
    nx.write_gpickle(G, result_file_path, protocol=pickle.HIGHEST_PROTOCOL)
    result_collection = GraphCollection(name)
    result_collection.set_target_file_path(result_file_path)
    result_model = ModelCategoryJoin(first_model, model_relationship,
                                     second_model, left)
    result = CollectionConstructor(name, result_model.get_result(),
                                   result_collection)
    return result, result_model
def tree_join_table(first_collection_constructor,
                    collection_constructor_morphism,
                    second_collection_constructor, left, attributes):
    """Join a tree collection with a table collection and store the result as a tree.

    The first (tree) collection is copied into a new shelf located at
    ``parse_file_path(first_file_path, result_file_name)``. The nodes carrying
    one of the given ``attributes`` are then looked up through the collection
    relationship. A node with related elements is replaced by the merge of the
    node and the last component of every related element. A node without
    related elements is removed from the copy when ``left`` is ``False``.

    Args:
        first_collection_constructor: Passed to ``parse_info_for_join``;
            supplies the first (tree) collection and its model.
        collection_constructor_morphism: Passed to ``parse_info_for_join``;
            supplies the collection and model relationships.
        second_collection_constructor: Passed to ``parse_info_for_join``;
            supplies the second collection and its model.
        left: When ``False``, nodes without related elements are removed from
            the result; otherwise they are kept untouched.
        attributes: Iterable of attribute names; the nodes having these
            attributes are the ones that get joined.

    Returns:
        A pair ``(CollectionConstructor, ModelCategoryJoin)`` describing the
        joined collection and the joined model.

    Raises:
        Exception: If one of the attributes matches no node in the tree.
    """
    (name, first_collection, first_model, collection_relationship,
     model_relationship, second_collection, second_model, first_file_path,
     result_file_name) = parse_info_for_join(first_collection_constructor,
                                             collection_constructor_morphism,
                                             second_collection_constructor)
    result_path = parse_file_path(first_file_path, result_file_name)
    result = shelve.open(result_path)
    ## The shelf is closed in the finally clause so that the handle is
    ## released even when the join fails half way.
    try:
        ## First we create a copy of the original tree:
        all_objects = first_collection.get_iterable_collection_of_objects()
        for key in all_objects.keys():
            result[key] = all_objects[key]
        ## Then we store the result
        result_collection = TreeCollection(result_file_name,
                                           target_file_path=result_path)
        ## After this we modify the copy so that the original data are not affected
        objects = []
        for attribute in attributes:
            ## Because it is inefficient to loop over all the nodes in the
            ## tree, the user must specify the attributes that we loop over.
            ## Each attribute has a path related to it that allows faster
            ## access to the object.
            result_objects = result_collection.find_elements_with_attribute_and_path(
                attribute, "")
            if len(result_objects) == 0:
                raise Exception("No nodes for the given attribute.", attributes)
            objects = objects + result_objects
        for pair in objects:
            ## Each pair consists of the node that has the attribute the user
            ## gave and a path to that node in the tree. The path gives a
            ## unique and relatively fast way to access the node again and to
            ## substitute the new value into the tree. Unlike graphs and
            ## tables, trees do not have a unique id system in this demo.
            elem, path = pair[0], pair[1]
            result_list = collection_relationship.get_relationship(elem)
            if len(result_list) > 0:
                new_elem = dict()
                for elem2 in result_list:
                    ## The payload of a related element is its last component.
                    new_elem = merge_two_dicts(
                        new_elem,
                        merge_two_dicts(elem, elem2[len(elem2) - 1]))
                update(result, path, new_elem)
            ## The explicit comparison with False is kept so that only False
            ## (or 0) triggers the removal, exactly as before.
            elif left == False:
                remove(path, result)
        result_model = ModelCategoryJoin(first_model, model_relationship,
                                         second_model, left)
        return CollectionConstructor(name, result_model.get_result(),
                                     result_collection), result_model
    finally:
        result.close()
def graph_join_graph(first_collection_constructor,
                     collection_constructor_morphism,
                     second_collection_constructor,
                     left=False,
                     right=False):
    """Join two graph collections and store the result as a graph.

    With ``left`` and ``right`` both ``True`` the result is the union of the
    two graphs and the relationship is not consulted. Otherwise every vertex
    or edge of the first graph is looked up through the collection
    relationship and the attribute dictionaries of the related vertices and
    edges are merged into it. ``left`` additionally keeps the unrelated
    objects of the first graph. ``right`` (without ``left``) unions the second
    graph into the result.

    Args:
        first_collection_constructor: Passed to ``parse_info_for_join``;
            supplies the first graph collection and its model.
        collection_constructor_morphism: Passed to ``parse_info_for_join``;
            supplies the collection and model relationships.
        second_collection_constructor: Passed to ``parse_info_for_join``;
            supplies the second graph collection and its model.
        left: Keep objects of the first graph that have no related element.
        right: Include the second graph in the result.

    Returns:
        A pair ``(CollectionConstructor, ModelCategoryJoin)`` describing the
        joined collection and the joined model.
    """
    join_info = parse_info_for_join(first_collection_constructor,
                                    collection_constructor_morphism,
                                    second_collection_constructor)
    (name, first_collection, first_model, collection_relationship,
     model_relationship, second_collection, second_model, first_file_path,
     result_file_name) = join_info

    ## Position of the attribute dictionary by tuple length:
    ## (vertex_id, dict) -> 1 and (source_id, target_id, dict) -> 2.
    payload_slot = {2: 1, 3: 2}

    joined = nx.DiGraph()
    first_items = first_collection.get_iterable_collection_of_objects()
    if right == True and left == True:
        ## Full join: all the elements of both graphs are included. The
        ## relation does not matter here; same elements are identified.
        joined = graph_union(first_collection.get_graph(),
                             second_collection.get_graph())
    else:
        ## Default: not every element of either collection ends up in the result.
        for item in first_items:
            matches = collection_relationship.get_relationship(item)
            if len(matches) > 0:
                own_slot = payload_slot.get(len(item))
                if own_slot is None:
                    ## Neither a vertex nor an edge: nothing to add.
                    continue
                is_vertex = own_slot == 1
                combined = dict()
                for match in matches:
                    if is_vertex:
                        print(item, match)
                    match_slot = payload_slot.get(len(match))
                    if match_slot is not None:
                        combined = merge_two_dicts(
                            combined,
                            merge_two_dicts(item[own_slot], match[match_slot]))
                if is_vertex:
                    joined.add_nodes_from([(item[0], combined)])
                else:
                    joined.add_edges_from([(item[0], item[1], combined)])
            elif left == True:
                ## Left join: keep the elements of the first collection that
                ## are not in relation with any element of the second one.
                if len(item) == 2:
                    joined.add_nodes_from([item])
                elif len(item) == 3:
                    joined.add_edges_from([item])
        if right == True and left == False:
            ## Right join: include the elements of the second collection that
            ## are not in the image of the relation.
            joined = graph_union(joined, second_collection.get_graph())

    target_path = parse_file_path(first_file_path, result_file_name)
    nx.write_gpickle(joined, target_path, protocol=pickle.HIGHEST_PROTOCOL)
    joined_collection = GraphCollection(name)
    joined_collection.set_target_file_path(target_path)
    joined_model = ModelCategoryJoin(first_model, model_relationship,
                                     second_model, left, right)
    constructor = CollectionConstructor(name, joined_model.get_result(),
                                        joined_collection)
    return constructor, joined_model
def graph_join_table(first_collection_constructor,
                     collection_constructor_morphism,
                     second_collection_constructor,
                     left=False):
    """Join a graph collection with a table collection and store the result as a graph.

    Every vertex or edge of the first collection is looked up through the
    collection relationship. The related elements are merged into its
    attribute dictionary and the enriched object is added to a new
    ``nx.DiGraph``, which is pickled to the result path.

    Args:
        first_collection_constructor: Passed to ``parse_info_for_join``;
            supplies the first (graph) collection and its model.
        collection_constructor_morphism: Passed to ``parse_info_for_join``;
            supplies the collection and model relationships.
        second_collection_constructor: Passed to ``parse_info_for_join``;
            supplies the second collection and its model.
        left: When ``True``, objects without related elements are copied to
            the result unchanged.

    Returns:
        A pair ``(CollectionConstructor, ModelCategoryJoin)`` describing the
        joined collection and the joined model.
    """
    join_info = parse_info_for_join(first_collection_constructor,
                                    collection_constructor_morphism,
                                    second_collection_constructor)
    (name, first_collection, first_model, collection_relationship,
     model_relationship, second_collection, second_model, first_file_path,
     result_file_name) = join_info

    def merge_with_matches(own_attributes, matches):
        ## Fold every related element into the object's own attributes.
        ## The related elements are assumed to be simple dictionaries.
        combined = dict()
        for match in matches:
            combined = merge_two_dicts(
                combined, merge_two_dicts(own_attributes, match))
        return combined

    joined = nx.DiGraph()
    ## Each item is either (vertex_id, dict) or (source_id, target_id, dict).
    for item in first_collection.get_iterable_collection_of_objects():
        matches = collection_relationship.get_relationship(item)
        if len(matches) == 0:
            ## Unrelated objects survive only in a left join.
            if left == True:
                if len(item) == 2:
                    joined.add_nodes_from([item])
                elif len(item) == 3:
                    joined.add_edges_from([item])
            continue
        if len(item) == 2:
            joined.add_nodes_from(
                [(item[0], merge_with_matches(item[1], matches))])
        elif len(item) == 3:
            joined.add_edges_from(
                [(item[0], item[1], merge_with_matches(item[2], matches))])

    target_path = parse_file_path(first_file_path, result_file_name)
    nx.write_gpickle(joined, target_path, protocol=pickle.HIGHEST_PROTOCOL)
    joined_collection = GraphCollection(name)
    joined_collection.set_target_file_path(target_path)
    joined_model = ModelCategoryJoin(first_model, model_relationship,
                                     second_model, left)
    constructor = CollectionConstructor(name, joined_model.get_result(),
                                        joined_collection)
    return constructor, joined_model
def table_join_graph(first_collection_constructor,
                     collection_constructor_morphism,
                     second_collection_constructor,
                     second_description,
                     left=False):
    """Join a table collection with a graph collection and store the result as a table.

    The result table has the columns of the first collection followed by the
    columns given in ``second_description``. One row is written for every
    pair of a first-collection row and a related element. The values of the
    second part are read from the last component of the related element.

    Args:
        first_collection_constructor: Passed to ``parse_info_for_join``;
            supplies the first (table) collection and its model.
        collection_constructor_morphism: Passed to ``parse_info_for_join``
            and ``create_h5file``.
        second_collection_constructor: Passed to ``parse_info_for_join``;
            supplies the second collection and its model.
        second_description: Dictionary whose keys are the columns picked from
            the related elements. The elements are assumed to follow it.
        left: When ``True``, rows without related elements are written too,
            with the second-collection columns left at their defaults.

    Returns:
        A pair ``(CollectionConstructor, ModelCategoryJoin)`` describing the
        joined collection and the joined model.
    """
    (name, first_collection, first_model, collection_relationship,
     model_relationship, second_collection, second_model, first_file_path,
     result_file_name) = parse_info_for_join(first_collection_constructor,
                                             collection_constructor_morphism,
                                             second_collection_constructor)
    first_collection_description = first_collection.get_attributes_datatypes_dict()
    if len(
            set(first_collection_description.keys()).intersection(
                set(second_description.keys()))) > 0:
        print(
            "Warning: The descriptions are not disjoint. This might cause problems in the evaluation."
        )
    result_description = merge_two_dicts(first_collection_description,
                                         second_description)
    length_of_first_collection_description = len(first_collection_description)
    second_file_path = second_collection.get_target_file_path()
    result_collection, result_table_row, result_h5file = create_h5file(
        result_description, first_file_path, second_file_path,
        collection_constructor_morphism)
    ## The file is closed in the finally clause so that a failure while
    ## filling the table does not leave it open.
    try:
        objects = first_collection.get_iterable_collection_of_objects()
        for elem in objects:
            result_list = collection_relationship.get_relationship(elem)
            ## We implicitly assume that the elements in the result are in the
            ## right format i.e. they follow the second description given in
            ## the parameters.
            if len(result_list) > 0:
                for elem2 in result_list:
                    ## The first columns of the result description belong to
                    ## the first collection, the rest to the second one.
                    for j, key in enumerate(result_description):
                        if j >= length_of_first_collection_description:
                            result_table_row[key] = elem2[len(elem2) - 1][key]
                        else:
                            result_table_row[key] = elem[key]
                    result_table_row.append()
            elif left == True:
                ## If we do not set values for all the columns in a row, the
                ## predefined default value is used which serves as NULL.
                for j, key in enumerate(result_description):
                    if j < length_of_first_collection_description:
                        result_table_row[key] = elem[key]
                ## The row has to be appended, otherwise the unmatched
                ## element never reaches the result table.
                result_table_row.append()
    finally:
        result_h5file.close()
    result_model = ModelCategoryJoin(first_model, model_relationship,
                                     second_model, left)
    result = CollectionConstructor(name, result_model.get_result(),
                                   result_collection)
    return result, result_model
def table_join_table(first_collection_constructor,
                     collection_constructor_morphism,
                     second_collection_constructor,
                     left=False):
    """Join two table collections and store the result as a table.

    The result table has the columns of the first collection followed by the
    columns of the second collection. One row is written for every pair of a
    first-collection row and a related second-collection row.

    Args:
        first_collection_constructor: Passed to ``parse_info_for_join``;
            supplies the first table collection and its model.
        collection_constructor_morphism: Passed to ``parse_info_for_join``
            and ``create_h5file``.
        second_collection_constructor: Passed to ``parse_info_for_join``;
            supplies the second table collection and its model.
        left: When ``True``, rows without related elements are written too,
            with the second-collection columns left at their defaults.

    Returns:
        A pair ``(CollectionConstructor, ModelCategoryJoin)`` describing the
        joined collection and the joined model.
    """
    (name, first_collection, first_model, collection_relationship,
     model_relationship, second_collection, second_model, first_file_path,
     result_file_name) = parse_info_for_join(first_collection_constructor,
                                             collection_constructor_morphism,
                                             second_collection_constructor)
    first_collection_description = first_collection.get_attributes_datatypes_dict()
    second_collection_description = second_collection.get_attributes_datatypes_dict()
    length_of_first_collection_description = len(first_collection_description)
    result_description = merge_two_dicts(first_collection_description,
                                         second_collection_description)
    second_file_path = second_collection.get_target_file_path()
    result_collection, result_table_row, result_h5file = create_h5file(
        result_description, first_file_path, second_file_path,
        collection_constructor_morphism)
    ## The file is closed in the finally clause so that a failure while
    ## filling the table does not leave it open.
    try:
        objects = first_collection.get_iterable_collection_of_objects()
        for elem in objects:
            result_list = collection_relationship.get_relationship(elem)
            if len(result_list) > 0:
                for elem2 in result_list:
                    ## The first columns of the result description belong to
                    ## the first collection, the rest to the second one.
                    for j, key in enumerate(result_description):
                        if j >= length_of_first_collection_description:
                            result_table_row[key] = elem2[key]
                        else:
                            result_table_row[key] = elem[key]
                    result_table_row.append()
            elif left == True:
                ## If we do not set values for all the columns in a row, the
                ## predefined default value is used which serves as NULL.
                for j, key in enumerate(result_description):
                    if j < length_of_first_collection_description:
                        result_table_row[key] = elem[key]
                ## The row has to be appended, otherwise the unmatched
                ## element never reaches the result table.
                result_table_row.append()
    finally:
        result_h5file.close()
    ## ``left`` is forwarded so that the joined model matches the joined
    ## collection, as table_join_graph does.
    result_model = ModelCategoryJoin(first_model, model_relationship,
                                     second_model, left)
    result = CollectionConstructor(name, result_model.get_result(),
                                   result_collection)
    return result, result_model
def table_join_tree(first_collection_constructor,
                    collection_constructor_morphism,
                    second_collection_constructor,
                    second_description,
                    left=False):
    """Join a table collection with a tree collection and store the result as a table.

    The result table has the columns of the first collection followed by the
    columns given in ``second_description``. One row is written for every
    pair of a first-collection row and a related subtree. The values of the
    second part are picked from the subtree with ``find_values_from_tree``.
    When several values are found the first one is used; when none is found
    the column keeps its default.

    Args:
        first_collection_constructor: Passed to ``parse_info_for_join``;
            supplies the first (table) collection and its model.
        collection_constructor_morphism: Passed to ``parse_info_for_join``
            and ``create_h5file``.
        second_collection_constructor: Passed to ``parse_info_for_join``;
            supplies the second collection and its model.
        second_description: Dictionary whose keys are the columns picked from
            the related subtrees.
        left: When ``True``, rows without related elements are written too,
            with the second-collection columns left at their defaults.

    Returns:
        A pair ``(CollectionConstructor, ModelCategoryJoin)`` describing the
        joined collection and the joined model.
    """
    (name, first_collection, first_model, collection_relationship,
     model_relationship, second_collection, second_model, first_file_path,
     result_file_name) = parse_info_for_join(first_collection_constructor,
                                             collection_constructor_morphism,
                                             second_collection_constructor)
    first_collection_description = first_collection.get_attributes_datatypes_dict()
    if len(
            set(first_collection_description.keys()).intersection(
                set(second_description.keys()))) > 0:
        print(
            "Warning: The descriptions are not disjoint. This might cause problems in the evaluation."
        )
    result_description = merge_two_dicts(first_collection_description,
                                         second_description)
    length_of_first_collection_description = len(first_collection_description)
    second_file_path = second_collection.get_target_file_path()
    result_collection, result_table_row, result_h5file = create_h5file(
        result_description, first_file_path, second_file_path,
        collection_constructor_morphism)
    ## The file is closed in the finally clause so that a failure while
    ## filling the table does not leave it open.
    try:
        objects = first_collection.get_iterable_collection_of_objects()
        for elem in objects:
            result_list = collection_relationship.get_relationship(elem)
            if len(result_list) > 0:
                ## Here we assume that every element in the result has a tree
                ## structure. From the tree we pick the wanted elements
                ## defined in the second description parameter.
                for elem2 in result_list:
                    ## The first columns of the result description belong to
                    ## the first collection, the rest to the second one.
                    for j, key in enumerate(result_description):
                        if j < length_of_first_collection_description:
                            result_table_row[key] = elem[key]
                            continue
                        picked_values_from_tree = find_values_from_tree(
                            elem2, key)
                        if len(picked_values_from_tree) == 0:
                            print("No value for " + str(key) +
                                  " in the subtree.")
                            print(
                                "The table will have the default value for "
                                + str(key) + " in this row.")
                        else:
                            if len(picked_values_from_tree) > 1:
                                print(
                                    "Warning! With key " + str(key) +
                                    " exist multiple values. The algorithm picks the first."
                                )
                            result_table_row[key] = picked_values_from_tree[0]
                    result_table_row.append()
            elif left == True:
                ## If we do not set values for all the columns in a row, the
                ## predefined default value is used which serves as NULL.
                for j, key in enumerate(result_description):
                    if j < length_of_first_collection_description:
                        result_table_row[key] = elem[key]
                ## The row has to be appended, otherwise the unmatched
                ## element never reaches the result table.
                result_table_row.append()
    finally:
        result_h5file.close()
    ## ``left`` is forwarded so that the joined model matches the joined
    ## collection, as table_join_graph does.
    result_model = ModelCategoryJoin(first_model, model_relationship,
                                     second_model, left)
    result = CollectionConstructor(name, result_model.get_result(),
                                   result_collection)
    return result, result_model