def eval_sheet(targets, count, smallest_hash, a_builder, alignment, children, automated_decision):

    # APPEND A ROW FOR THE CURRENT CLUSTER TO THE EVALUATION SHEET
    first = False
    a_builder.write("\n{:<5}\t{:<20}{:12}{:20}{:23}{:23}".format(
        count, smallest_hash, "", "", automated_decision, ""))

    if targets is None:
        a_builder.write(Cls.disambiguate_network(alignment, children))
    else:
        response = Cls.disambiguate_network_2(children, targets, output=False)
        if response:
            temp = ""
            dataset = ""
            # for line in response:
            #     print line
            for i in range(1, len(response)):
                resource = Ut.get_uri_local_name(response[i][0])
                if i == 1:
                    temp = "{:25}: {}".format(resource, response[i][1])
                elif dataset == response[i][0]:
                    # SAME DATASET: APPEND THE VALUE TO THE CURRENT ROW
                    temp = "{:25} | {}".format(temp, response[i][1])
                else:
                    # NEW DATASET: FLUSH THE CURRENT ROW AND START A NEW ONE
                    if first is False:
                        a_builder.write(" {}\n".format(temp))
                    else:
                        a_builder.write("{:108}{}\n".format("", temp))
                    first = True
                    temp = "{:25}: {}".format(resource, response[i][1])
                dataset = response[i][0]
            # FLUSH THE LAST ROW
            a_builder.write("{:108}{}\n".format("", temp))
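# A small, runnable illustration of the fixed-width row layout written by
# eval_sheet() above: str.format() pads each column to its declared width,
# so the sheet stays aligned when opened as plain text. The helper name and
# sample values are hypothetical.
def _example_sheet_row(count, cluster_id, decision):
    return "\n{:<5}\t{:<20}{:12}{:20}{:23}{:23}".format(
        count, cluster_id, "", "", decision, "")

# e.g. _example_sheet_row(1, "4_P12345", "GOOD [3/4]")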
def generate_lens_name(datasets, operator="union"):

    # BUILD A DETERMINISTIC LENS NAME FROM THE SORTED DATASET NAMES
    datasets.sort()
    ds_concat = ""
    for dataset in datasets:
        ds_concat += dataset

    # RETURN THE LIST OF DATASET UNIQUE NAMES
    unique_list = list()

    # THE QUERY FOR CHECKING WHETHER THE LENS EXISTS
    query = ask_union(datasets)

    for dataset in datasets:
        lens_targets_unique(unique_list, dataset)
    # print unique_list

    name = ""
    for i in range(0, len(unique_list)):
        local_name = Ut.get_uri_local_name(unique_list[i])
        link = "" if i == 0 else "_"
        # print (local_name[0]).upper()
        name += link + (local_name[0]).upper() + local_name[1:]

    # ENCODE THE SIGN OF THE HASH: "N" FOR NEGATIVE, "P" FOR POSITIVE
    hash_value = hash(name + ds_concat)
    hash_value = str(hash_value).replace("-", "N") \
        if str(hash_value).__contains__("-") else "P{}".format(hash_value)
    name = "{}_{}_{}".format(operator, name, hash_value)
    # print name
    # print query
    # print hash(name)

    return {"name": name, "query": query}
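# A minimal, runnable sketch of the sign-encoding convention used above and
# throughout this module: a negative builtin hash() gets its "-" replaced by
# "N", a positive one is prefixed with "P". Illustrative only; the scheme
# relies on Python 2's hash() being deterministic for strings within a setup.
def _example_sign_encoded_hash(text):
    value = hash(text)
    return str(value).replace("-", "N") if value < 0 else "P{}".format(value)

# e.g. _example_sign_encoded_hash("union_Grid_Orgref") -> "N..." or "P..."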
def view_data(view_specs, view_filter, display=False):

    # GENERATING THE METADATA FOR REGISTERING A VIEW.
    #
    # THE VIEW IS COMPOSED OF
    #   - EXACTLY ONE LENS
    #   - ONE OR MORE FILTERS
    #
    # A FILTER IS COMPOSED OF
    #   - EXACTLY ONE DATASET
    #   - ONE OR MORE PROPERTIES
    #
    # view_specs = {
    #     St.researchQ_URI: question_uri,
    #     St.datasets: view_lens,
    #     St.lens_operation: Ns.lensOpi
    # }

    # TEXT BUFFERS
    string_buffer = StringIO()
    main_buffer = StringIO()
    dataset_opt = []  # LIST OF DATASETS THAT HAVE ONLY OPTIONAL PROPERTIES

    # HOLDER VARIABLE (STRING) FOR THE RESEARCH QUESTION URI
    question_uri = str(view_specs[St.researchQ_URI]).strip()

    # HOLDER VARIABLE (LIST) FOR LINKSETS AND/OR LENSES THAT COMPOSE THE LENS
    view_lens = view_specs[St.datasets]

    # KEY FUNCTION FOR ACCESSING THE ELEMENT ON WHICH TO SORT
    def get_key(item):
        return item[St.graph]

    # SORT THE LIST OF FILTERS BASED ON THE GRAPH (DATASET) NAME OF EACH DICTIONARY
    sorted_datasets = sorted(view_filter, key=get_key)
    # print sorted_datasets

    # [DESCRIPTION] RESEARCH QUESTION X
    main_buffer.write("\t### THE VIEW\n".format(question_uri))
    main_buffer.write("\t\t\t<{}>\n".format(question_uri))

    # [DESCRIPTION] CREATED A VIEW
    main_buffer.write("\t\t\t\talivocab:created\t\t\t<@URI> .\n\n")

    # [DESCRIPTION] THE VIEW
    main_buffer.write(
        "\t\t\t### THE COMPONENT OF THE VIEW: THE TYPE, THE LENS AND THE FILTERS\n".format(Ns.view))
    main_buffer.write("\t\t\t<@URI>\n".format(Ns.view))

    # [DESCRIPTION] IS A TYPE OF RISIS:VIEW
    main_buffer.write("\t\t\t\ta\t\t\t\t\t\t\t<{}View> ;\n".format(Ns.riclass))

    # [DESCRIPTION] THAT HAS A LENS
    main_buffer.write("\t\t\t\talivocab:hasViewLens\t\t<{}view_lens_@> ;".format(Ns.view))

    # SORT THE PROPERTIES IN EACH DICTIONARY
    count_ds = 0
    for filter in sorted_datasets:
        count_ds += 1
        append_ds = ";" if count_ds < len(sorted_datasets) else ".\n"
        if St.graph in filter:

            # [DESCRIPTION] THAT HAS A NUMBER OF FILTERS
            dataset_name = Ut.get_uri_local_name(filter[St.graph])

            # DATA IS AN ARRAY OF DICTIONARIES WHERE, FOR EACH DATATYPE,
            # WE HAVE A LIST OF PROPERTIES SELECTED
            data = filter["data"]
            count_sub_filter = 0
            for dictionary in data:
                count_sub_filter += 1
                ent_type = dictionary["entity_datatype"]
                pro_list = dictionary["properties"]

                # APPEND THE GRAPH
                string_buffer.write("\n\t\t\t### FILTER {}_{}".format(count_ds, count_sub_filter))
                if len(data) > 1:
                    append_ds = ";" if count_sub_filter < len(data) else ".\n"

                if St.entity_datatype in filter:
                    entity_type_name = Ut.get_uri_local_name(filter[St.entity_datatype])
                    filter_c = "<{}filter_{}_{}_{}_@>".format(
                        Ns.view, dataset_name, count_sub_filter, entity_type_name)
                else:
                    filter_c = "<{}filter_{}_{}_@>".format(Ns.view, dataset_name, count_sub_filter)
                string_buffer.write("\n\t\t\t{}".format(filter_c))

                # [DESCRIPTION] A FILTER HAS A DATASET
                string_buffer.write("\n\t\t\t\tvoid:target\t\t\t\t\t<{}> ;".format(filter[St.graph]))
                has_filter = "\n\t\t\t\talivocab:hasFilter\t\t\t{} {}".format(filter_c, append_ds)

                # [DESCRIPTION] ADDING THE FILTERS BELONGING TO THE VIEW
                main_buffer.write(has_filter)

                # ADDING THE DATATYPE IF ANY
                if St.entity_datatype in dictionary:
                    string_buffer.write(
                        "\n\t\t\t\tvoid:hasDatatype\t\t\t<{}> ;".format(dictionary[St.entity_datatype]))

                # APPEND THE PROPERTIES
                # print "\n>>>>>>> FILTER:", filter
                if St.properties in dictionary:
                    dictionary[St.properties].sort()
                    count = 0
                    pro = None

                    # [DESCRIPTION] WHERE EACH FILTER IS COMPOSED OF A NUMBER OF PROPERTIES
                    check_optional = False
                    total_properties = len(dictionary[St.properties])
                    for ds_property in dictionary[St.properties]:
                        append = ";" if count < total_properties - 1 else ".\n"
                        if type(ds_property) is tuple and len(ds_property) == 2:
                            cur_property = str(ds_property[0]).strip()
                            if len(cur_property) > 0 and ds_property[1] is True:
                                pro = "\n\t\t\t\talivocab:selectedOptional\t<{}> {}".format(
                                    ds_property[0], append)
                            else:
                                check_optional = True
                                pro = "\n\t\t\t\talivocab:selected\t\t\t<{}> {}".format(
                                    cur_property, append)
                        else:
                            cur_property = str(ds_property).strip()
                            if len(cur_property) > 0:
                                check_optional = True
                                pro = "\n\t\t\t\talivocab:selected\t\t\t<{}> {}".format(
                                    cur_property, append)
                        if pro is not None:
                            string_buffer.write(pro)
                        count += 1

                    # THESE DATASETS ARE COMPOSED OF ONLY OPTIONAL PROPERTIES
                    if check_optional is False:
                        dataset_opt += [filter[St.graph]]

    # THE VIEW_LENS IS COMPOSED OF A NUMBER OF LENSES AND LINKSETS SELECTED
    main_buffer.write("\n\t\t\t### THE COMPONENT OF THE LENS".format(Ns.view))
    main_buffer.write("\n\t\t\t<{}view_lens_@>".format(Ns.view))
    count_ls = 0
    for linkset_lens in view_lens:
        append_ls = ";" if count_ls < len(view_lens) - 1 else ".\n"
        main_buffer.write("\n\t\t\t\talivocab:selected\t\t\t<{}> {}".format(linkset_lens, append_ls))
        count_ls += 1

    main_triples = main_buffer.getvalue()
    triples = string_buffer.getvalue()

    # HASH THE STRING
    hash_value = hash(main_triples + triples)

    # CHANGE THE NEGATIVE SIGN "-" TO "N" AND PREFIX POSITIVE VALUES WITH "P"
    hash_value = str(hash_value).replace('-', "N") \
        if str(hash_value).__contains__('-') else "P" + str(hash_value)

    # GENERATE THE URI FOR THE VIEW
    uri = "{}View_{}".format(Ns.view, hash_value)

    query = PREFIX + """
    INSERT DATA
    {{
        GRAPH <{}>
        {{
{}{}\t\t}}\n\t}}
    """.format(question_uri, main_triples.replace("@URI", uri), triples).replace("@", hash_value)

    message = "\nThe metadata was generated as: {}".format(uri)
    print message
    if display:
        print "\nVIEW INSERT QUERY:", query

    # return {St.message: message, St.insert_query: query, St.result: uri}
    return {
        St.message: message,
        St.insert_query: query,
        St.result: uri,
        "sparql_issue": dataset_opt
    }
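# A minimal usage sketch for view_data(), with hypothetical graph and
# property URIs. It illustrates the input shapes documented above:
# view_specs carries the research question URI plus the linksets/lenses of
# the view lens, and view_filter is a list of per-graph filters, each with
# "data" entries holding an entity datatype and (property, is_optional)
# tuples. The example function name and all URIs are assumptions.
def _example_view_data_call():
    view_specs = {
        St.researchQ_URI: "http://risis.eu/activity/idea_example",  # hypothetical
        St.datasets: ["http://risis.eu/linkset/example_linkset"]    # hypothetical
    }
    view_filter = [{
        St.graph: "http://risis.eu/dataset/example_grid",           # hypothetical
        "data": [{
            "entity_datatype": "http://risis.eu/ontology/Organisation",
            "properties": [("http://www.w3.org/2000/01/rdf-schema#label", False)]
        }]
    }]
    # display=True also prints the generated INSERT query
    return view_data(view_specs, view_filter, display=True)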
def activity_overview(question_uri, get_text=True):

    idea = ""
    ds_mapping = ""
    alignments_data = ""
    lenses = ""
    views_data = ""

    """ 1. RESEARCH QUESTION LABEL """
    idea_result = research_label(question_uri)
    idea += "\tQuestion URI: {}\n\tLabel: {}\n".format(question_uri, idea_result)

    """ 2. RESEARCH QUESTION DATASETS """
    datasets = datasets_selected(question_uri)
    if datasets:
        for dataset in datasets:
            ds_mapping += "\t{} | {} | {} instances found\n".format(dataset[0], dataset[1], dataset[2])

    """ 3. RESEARCH QUESTION LINKSETS """
    alignments = alignments_mappings(question_uri)
    if alignments:
        for i in range(len(alignments)):

            # THE ALIGNMENT
            alignments_data += "\t{:2} - {}\n".format(i + 1, alignments[i])

            # THE DESCRIPTION OF THE ALIGNMENT
            ali_description = alignments_mappings_description(question_uri, alignments[i])
            for info in ali_description:
                pro = Ut.get_uri_local_name(info[0])
                ls = str(info[1]).replace("http://risis.eu/linkset/", "linkset:")

                # LINKSETS CREATED
                if pro == "created" or pro == "used":
                    size = get_namedgraph_size(info[1], isdistinct=False)
                    alignments_data += "\t\t>>> {:13}: \t{} | {} correspondences found\n".format(pro, ls, size)

                # DESCRIPTION + EVOLUTION
                elif pro != "type":
                    # print info
                    alignments_data += "\t\t{:17}:\t{}\n".format(pro, ls)
            alignments_data += "\n"

    """ 4. RESEARCH QUESTION LENSES """
    used_lenses = created_used_lens(question_uri)
    if used_lenses:
        for lens in used_lenses:
            pro = Ut.get_uri_local_name(lens[0])
            les = str(lens[1]).replace("http://risis.eu/lens/", "lens:")
            lenses += "\t\t{:17}:\t{} | {} correspondences\n".format(pro, les, lens[2])

    """ 5. RESEARCH QUESTION VIEWS """
    views_uri = views(question_uri)
    views_requested = 0

    # EXTRACTING ALL THE VIEWS FOR THIS RESEARCH QUESTION
    if views_uri:
        views_requested = len(views_uri) - 1
        for i in range(1, len(views_uri)):
            view_uri = views_uri[i][0]
            views_data += "\n\tView_Lens {}: {}".format(i, view_uri)
            view_composition = linksets_and_lenses(question_uri, view_uri)
            view_filters = filters(question_uri, view_uri)

            # DESCRIBING THE COMPOSITION OF EACH VIEW LENS
            for element in view_composition:
                views_data += "\n\t\tComposition: {}".format(element)
            views_data += "\n"

            # EXTRACTING THE FILTERS
            for n in range(1, len(view_filters)):
                filter_uri = view_filters[n][0]
                views_data += "\n\t\tFilter {}: {}".format(n, view_filters[n][0])
                filter_dt = filter_data(question_uri, filter_uri)

                # FILTER'S DATASET
                views_data += "\n\t\t\tDataset: {}".format(filter_dt[1][0])
                for m in range(1, len(filter_dt)):
                    views_data += "\n\t\t\tProperty: {}".format(filter_dt[m][1])
                views_data += "\n"

    if get_text:
        activity_buffer = StringIO()
        activity_buffer.write("\n>>> IDEA\n{}".format(idea))
        activity_buffer.write("\n>>> DATASET MAPPINGS\n{}".format(ds_mapping))
        activity_buffer.write("\n>>> ALIGNMENT & LINKSETS\n{}".format(alignments_data))
        activity_buffer.write("\n>>> LENSES\n{}".format(lenses))
        # SINGULAR OR PLURAL HEADER, DEPENDING ON THE NUMBER OF VIEWS
        activity_buffer.write(
            "\n>>> VIEW REQUESTED [{}].\n{}".format(views_requested, views_data)
            if views_requested == 1
            else "\n>>> VIEWS REQUESTED [{}].\n{}".format(views_requested, views_data))
        print activity_buffer.getvalue()
        return activity_buffer.getvalue()
    else:
        result = {
            "idea": idea,
            "dataset_mappings": ds_mapping,
            "alignment_mappings": alignments_data,
            "lenses": lenses,
            "view_dic": views_data
        }
        # print alignments_data
        return result
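# A minimal usage sketch, assuming a hypothetical research-question URI.
# With get_text=True the overview is printed and returned as one string;
# with get_text=False the same sections come back as a dictionary.
def _example_activity_overview_call():
    question_uri = "http://risis.eu/activity/idea_example"  # hypothetical
    as_text = activity_overview(question_uri, get_text=True)
    as_dict = activity_overview(question_uri, get_text=False)
    return as_text, as_dict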
def view(view_specs, view_filter, save=False, limit=10):
    """
    :param view_specs:
    :param view_filter: AN ARRAY OF DICTIONARIES. EACH DICTIONARY CONTAINS THE GRAPH AND
        PROPERTIES KEYWORDS. THE VALUE OF THE PROPERTIES KEYWORD IS AN ARRAY OF
        PROPERTIES AVAILABLE IN THE GRAPH.
    :param save:
    :param limit: LIMIT FOR THE OUTPUT DISPLAY TABLE
    :return:
    """

    # LIMIT FOR THE VARIABLES IN THE SELECT
    str_limit = 70
    ns = dict()
    view_where = ""
    view_select = ""
    variables_list = dict()
    namespace = dict()
    namespace_str = ""
    count = 1
    is_problematic = False

    try:
        # 1. GENERATE THE INSERT METADATA.
        # RETURNS {St.message: message, St.insert_query: query, St.result: uri},
        # WHERE THE RESULT IS THE VIEW URI.
        view_metadata = view_data(view_specs, view_filter)
        # print view_metadata
        # print view_filter

        # 2. CHECK FOR POTENTIAL SPARQL TIMEOUT
        opt_list = view_metadata["sparql_issue"]
        if len(opt_list) != 0:
            is_problematic = True
            the_list = ""
            for ds in opt_list:
                the_list += "{} ".format(ds)
            message = "The insertion metadata was generated but not inserted. The properties listed in these " \
                      "datasets [{}] are ALL OPTIONAL. The presence of at least one non OPTIONAL property is " \
                      "required.".format(the_list)
            view_metadata[St.message] = message
            print message

        # 3. REGISTER THE METADATA IF SAVE IS SET TO TRUE
        if save:
            if is_problematic is False:
                print "We are in save mode!"
                is_metadata_inserted = boolean_endpoint_response(view_metadata[St.insert_query])
                print "IS THE METADATA INSERTED?: {}".format(is_metadata_inserted)
                message = "The insertion metadata was successfully inserted as: {}".format(view_metadata[St.result]) \
                    if (is_metadata_inserted == "true" or is_metadata_inserted == Ec.ERROR_STARDOG_1) \
                    else "The metadata could not be inserted."
                print message
                view_metadata[St.message] = message
                # print view_metadata[St.insert_query]

        # GENERATE THE INTERSECTION AND DISPLAY THE QUERIES NEEDED
        inter = intersection(view_specs, display=False)
        if inter is None:
            print "WE CANNOT PROCEED AS THERE IS A PROBLEM WITH THE PROVIDED DATASETS."

        # FOR EACH DESIGN VIEW, WE HAVE THE DATASET OF INTEREST AND THE LIST OF
        # PROPERTIES TO DISPLAY IN A FILTER.
        # THE FILTER IS A LIST OF GRAPH DICTIONARIES: [GRAPH1, GRAPH2, GRAPH3, ...]
        for graph in view_filter:

            optional = ""

            # THE GRAPH CONTAINS GRAPH AND DATA
            graph_uri = graph[St.graph]

            # ABOUT THE DATASET: [NAMESPACE, NAME]
            ds_ns_name = Ut.get_uri_ns_local_name(graph_uri)
            if ds_ns_name[1][0].isdigit():
                ds_ns_name[1] = "D{}".format(ds_ns_name[1])
            print ds_ns_name

            # SHORTENING THE PREFIX LENGTH
            short_name = ds_ns_name[1]

            # HOLDING INFORMATION ABOUT THIS GRAPH
            # (FOR EACH ENTITY DATATYPE, THE PROPERTIES SELECTED)
            graph_data = graph["data"]

            # ADDING THE DATASET NAME TO THE NAMESPACE DICTIONARY [LOCAL NAME: NAMESPACE]
            if ds_ns_name is not None:
                if ds_ns_name[1] not in ns:
                    ns[ds_ns_name[1]] = ds_ns_name[0]

            # GENERATE THE DATASET DESIGN VIEW, WHICH LOOKS LIKE
            #     ### DATASET: grid
            #     GRAPH <http://risis.eu/genderc/grid>
            #     {
            view_where += "\n\t### DATASET: {}\n\tGRAPH <{}>\n\t{{".format(ds_ns_name[1], graph_uri)

            # graph_data IS A LIST OF DICTIONARIES HOLDING THE TYPES AND THEIR LISTED PROPERTIES
            count_ns = 0
            for data_info in graph_data:

                e_type_uri = data_info[St.entity_datatype]
                type_triple = ""
                if e_type_uri == "no_type":
                    e_type = ""
                    # print "!!!!!!!!!!!!!!!!!!!!!!!!!! NO TYPE"
                else:
                    e_type = Ut.get_uri_local_name(e_type_uri)
                    # print "!!!!!!!!!!!!!!!!!!!!!!!!!! e_type", e_type
                if e_type:
                    e_type = "_{}".format(e_type[short:])
                    type_triple = "\n\t\t\ta{:54} <{}> ;".format("", e_type_uri)

                # ?GRID
                # TODO ADD THE TYPE TO THE ALIGNMENTS IN THE INTERSECT
                # SUBJECT: ADDING 1 AT THE END SO THAT SAME SOURCE AND TARGET ARE TAKEN CARE OF
                view_where += "\n\t\t?{}{}_1{}".format(ds_ns_name[1], e_type, type_triple)
                # view_where += "\n\t\t?{}{}{}".format(ds_ns_name[1], "", type_triple)

                # ADDING THE RESOURCE AS A VARIABLE TO THE VARIABLE LIST
                view_select += " ?{}{}_1".format(ds_ns_name[1], e_type)

                t_properties = data_info[St.properties]

                # FOR BACKWARD COMPATIBILITY, REMOVE "<" AND ">"
                for i in range(len(t_properties)):
                    # print "PROPERTY TUPLE:", t_properties[i]
                    if type(t_properties[i]) is tuple:
                        # print "PROPERTY:", t_properties[i][0]
                        t_properties[i] = (re.sub('[<>]', "", t_properties[i][0]), t_properties[i][1])
                        # print "PROPERTY:", t_properties[i]
                    else:
                        t_properties[i] = re.sub('[<>]', "", t_properties[i])

                # 3-CHARACTER STRING TO DIFFERENTIATE THE PROPERTIES OF A DATASET
                attache = ds_ns_name[1][short:]

                # VARIABLES
                if type(t_properties) is not list:
                    print "THIS <PROPERTIES> NEEDS TO BE OF TYPE LIST"
                    return None

                # GOING THROUGH THE PROPERTIES OF INTEREST
                for i in range(len(t_properties)):

                    # >>> PROPERTY IS JUST A STRING
                    if type(t_properties[i]) is str:

                        # EXTRACTING THE NAMESPACE TO USE FOR THE PROPERTY
                        curr_ns = Ut.get_uri_ns_local_name(t_properties[i])
                        if type(curr_ns) is list:

                            # SETTING UP THE PREFIX AND PREDICATE
                            predicate = "{}voc:{}".format(short_name, curr_ns[1])
                            prefix = "{}voc".format(short_name)

                            # GENERATE THE LIST OF OPTIONAL PROPERTIES
                            # optional += "\n\t\tOPTIONAL{{ ?{} {:55} ?{}_{} .}}".format(
                            #     ds_ns_name[1], predicate, attache, curr_ns[1])

                            # ADDING NAMESPACE TO THE VIEW QUERY
                            if prefix not in namespace:
                                namespace[prefix] = curr_ns[0]
                                namespace_str += "\nPREFIX {}: <{}>".format(prefix, curr_ns[0])

                            # ADDING PREDICATES
                            if i == len(t_properties) - 1:
                                if namespace[prefix] != curr_ns[0]:
                                    view_where += "\n\t\t\t<{}> ?{}{}_{} .".format(
                                        t_properties[i], attache, e_type, curr_ns[1])
                                else:
                                    view_where += "\n\t\t\t{:55} ?{}{}_{} .".format(
                                        predicate, attache, e_type, curr_ns[1])
                            else:
                                if namespace[prefix] != curr_ns[0]:
                                    view_where += "\n\t\t\t<{}> ?{}{}_{} ;".format(
                                        t_properties[i], attache, e_type, curr_ns[1])
                                else:
                                    view_where += "\n\t\t\t{:55} ?{}{}_{} ;".format(
                                        predicate, attache, e_type, curr_ns[1])

                            # ADDING TO THE VARIABLE LIST, MAKING IT UNIQUE
                            # TO A DATASET WITH THE VARIABLE "attache" VALUE
                            value = (" ?{}{}_{}".format(attache, e_type, curr_ns[1]))
                            if len(view_select + value) > str_limit:
                                variables_list[count] = view_select
                                view_select = value
                                count += 1
                            else:
                                view_select += value

                        # IN THIS CASE, ONLY THE SUBJECT IS PROVIDED
                        else:
                            # TODO check this
                            view_where += ".\n\t\t?{}\n\t\t\t?p ?o .".format(curr_ns)

                    # >>> HERE, WE ARE DEALING WITH A SUBJECT AND A PREDICATE
                    elif type(t_properties[i]) is list:
                        if len(t_properties[i]) == 2:
                            curr_ns = Ut.get_uri_ns_local_name(t_properties[i][1])
                            if type(curr_ns) is list:
                                predicate = "{}voc:{}".format(short_name, curr_ns[1])
                                prefix = "{}voc".format(short_name)

                                # ADDING NAMESPACE
                                if prefix not in namespace:
                                    namespace[prefix] = curr_ns[0]
                                    namespace_str += "\nPREFIX {}: <{}>".format(prefix, curr_ns[0])

                                # REMOVE PREVIOUS PUNCTUATION
                                # print "REMOVING PREDICATE"
                                view_where = view_where[:len(view_where) - 2]
                                view_where += " .\n\t\t?{}\n\t\t\t{:55} ?{}{}_{} .".format(
                                    t_properties[i][0], predicate, attache, e_type, curr_ns[1])

                                # ADDING THE VARIABLE LIST
                                value = (" ?{}{}_{}".format(attache, e_type, curr_ns[1]))
                                if len(view_select + value) > str_limit:
                                    variables_list[count] = view_select
                                    view_select = value
                                    count += 1
                                else:
                                    view_select += value

                    # >>> PROPERTY IS A TUPLE OF THE PROPERTY AND A BOOLEAN
                    # VALUE INDICATING WHETHER OR NOT THE PROPERTY IS OPTIONAL
                    elif type(t_properties[i]) is tuple:

                        # SETTING UP THE PREFIX AND PREDICATE
                        curr_ns = Ut.get_uri_ns_local_name(t_properties[i][0])
                        prefix = "{}voc_{}".format(short_name, str(count_ns))

                        # ADDING NAMESPACE
                        if curr_ns[0] not in namespace:
                            count_ns += 1
                            namespace[curr_ns[0]] = prefix
                            namespace_str += "\nPREFIX {}: <{}>".format(prefix, curr_ns[0])

                        # ACCESSING THE RIGHT NAMESPACE
                        prefix = namespace[curr_ns[0]]

                        # SETTING THE PREDICATE WITH THE RIGHT NAMESPACE
                        predicate = "{}:{}".format(prefix, curr_ns[1])

                        # CHECKING IF TUPLE OF 2
                        if len(t_properties[i]) == 2:

                            # ADDING PREDICATE AND SPARQL PUNCTUATION
                            if i == len(t_properties) - 1:
                                if t_properties[i][1] is True:
                                    optional += "\n\t\tOPTIONAL{{ ?{}{:15} {:60} ?{}{}_{} . }}".format(
                                        ds_ns_name[1], "{}_1".format(e_type), predicate, attache, e_type, curr_ns[1])
                                else:
                                    view_where += "\n\t\t\t{:55} ?{}{}_{} .".format(
                                        predicate, attache, e_type, curr_ns[1])
                            else:
                                if t_properties[i][1] is True:
                                    optional += "\n\t\tOPTIONAL{{ ?{}{:15} {:60} ?{}{}_{} . }}".format(
                                        ds_ns_name[1], "{}_1".format(e_type), predicate, attache, e_type, curr_ns[1])
                                else:
                                    view_where += "\n\t\t\t{:55} ?{}{}_{} ;".format(
                                        predicate, attache, e_type, curr_ns[1])

                            # ADDING THE VARIABLE LIST
                            value = (" ?{}{}_{}".format(attache, e_type, curr_ns[1]))
                            if len(view_select + value) > str_limit:
                                variables_list[count] = view_select
                                view_select = value
                                count += 1
                            else:
                                view_select += value

            # IN CASE THE SELECTED PROPERTIES ARE ALL OPTIONAL, REMOVE THE RESOURCE
            # print "######################## WHERE", view_where
            # view_where = view_where.replace("?{}".format(ds_ns_name[1]), "")
            if len(optional) > 0:
                if view_where[len(view_where) - 1] == ".":
                    pass  # DO NOTHING
                elif view_where[len(view_where) - 1] == ";":
                    view_where = "{}.".format(view_where[:len(view_where) - 1])
                else:
                    # IN CASE THE SELECTED PROPERTIES ARE ALL OPTIONAL, REMOVE THE RESOURCE
                    # print "######################## WHERE", view_where
                    # print "############", view_where[len(view_where) - 1]
                    view_where = view_where.replace("?{}".format(ds_ns_name[1]), "")
                view_where += "\n\t\t### OPTIONAL PROPERTIES{}\n\t".format(optional)

            # REFRESH
            optional = ""

            # CLOSE
            view_where += "\n\t}\n"

        my_list = ""
        for key, variable in variables_list.items():
            my_list += "\n" + variable

        if limit == 0:
            lmt = ""
        else:
            lmt = "LIMIT {}".format(limit)

        query = "{}\n\nSELECT DISTINCT {}\n{{{}{}\n}} {}".format(
            namespace_str, my_list + view_select, inter, view_where, lmt)
        # print "\nVIEW QUERY FOR GENERATING TABLE:", query

        # table = sparql_xml_to_matrix(query)
        # display_matrix(table, spacing=80, limit=limit, is_activated=False)
        print "\nDONE GENERATING THE VIEW"

        # return {"metadata": view_metadata, "query": query, "table": table}
        return {
            "metadata": view_metadata,
            "query": query,
            "sparql_issue": is_problematic
        }

    except Exception as err:
        print ">>> ERROR:", err
        view_metadata = {St.message: "Fatal Error"}
        return {
            "metadata": view_metadata,
            "query": None,
            "sparql_issue": is_problematic
        }
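# A minimal usage sketch for view(), assuming the same hypothetical shapes as
# in the view_data() example above. With save=False nothing is written to the
# triple store; the returned dictionary carries the metadata, the generated
# SELECT query, and a flag signalling an all-OPTIONAL (timeout-prone) filter.
# The St.* keys are assumed to match the literal "entity_datatype" and
# "properties" keys used by view_data().
def _example_view_call():
    view_specs = {
        St.researchQ_URI: "http://risis.eu/activity/idea_example",  # hypothetical
        St.datasets: ["http://risis.eu/linkset/example_linkset"]    # hypothetical
    }
    view_filter = [{
        St.graph: "http://risis.eu/dataset/example_grid",           # hypothetical
        "data": [{
            St.entity_datatype: "http://risis.eu/ontology/Organisation",
            St.properties: [("http://www.w3.org/2000/01/rdf-schema#label", False)]
        }]
    }]
    result = view(view_specs, view_filter, save=False, limit=10)
    print result["query"]
    return result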
def enrich(specs, directory, endpoint):

    # TODO RUN IT ONLY IF THERE IS NO GRAPH ENRICHED WITH THE SAME NAME
    # specs[St.graph] = "http://grid.ac/20170712"

    print "ENRICHING DATA/GRAPH FROM EXPORT-ALIGNMENT"
    print "GRAPH:", specs[St.graph]
    print "ENTITY TYPE:", specs[St.entity_datatype]
    print "LAT PREDICATE:", specs[St.lat_predicate]
    print "LONG PREDICATE:", specs[St.long_predicate]
    print "FILE DIRECTORY:", directory

    name = Ut.get_uri_local_name(specs[St.graph])
    print endpoint

    # CHECKING WHETHER THE SOURCE GRAPH IS AT THE VIRTUOSO TRIPLE STORE
    data_1 = Qry.virtuoso_request(
        "ask {{ GRAPH <{}> {{ ?x ?y ?z . }} }}".format(specs[St.graph]), endpoint)
    data_1 = regex.findall("rs:boolean[ ]*(.*)[ ]*\.", data_1["result"])
    if len(data_1) > 0:
        data_1 = data_1[0].strip() == "true"
    if data_1 is False:
        print "GRAPH: {} {}".format(specs[St.graph], "DOES NOT EXIST AT THE REMOTE VIRTUOSO SITE.")

    # CHECKING WHETHER THE GADM DATASET IS AT THE VIRTUOSO TRIPLE STORE
    data_2 = Qry.virtuoso_request(
        "ask {GRAPH <http://geo.risis.eu/gadm>{ ?x ?y ?z . }}", endpoint)
    data_2 = regex.findall("rs:boolean[ ]*(.*)[ ]*\.", data_2["result"])
    if len(data_2) > 0:
        data_2 = data_2[0].strip() == "true"
    if data_2 is False:
        print "GRAPH: {} {}".format(
            "http://geo.risis.eu/gadm", "DOES NOT EXIST AT THE REMOTE VIRTUOSO SITE.")

    # BOTH DATASETS NEED TO BE PRESENT
    if data_1 is False or data_2 is False:
        message = "BECAUSE BOTH DATASETS NEED TO BE PRESENT AT OUR TRIPLE STORE, " \
                  "WE ARE UNABLE TO EXECUTE THE REQUEST."
        return {
            St.message: message,
            St.result: 'The dataset {} cannot be enriched with GADM boundary '
                       'at the moment.'.format(specs[St.graph])
        }

    total = 0
    limit = 20000
    date = datetime.date.isoformat(datetime.date.today()).replace('-', '')
    f_path = "{0}{1}{1}{2}_enriched_{3}.ttl".format(directory, os.path.sep, name, date)
    b_path = "{0}{1}{1}{2}_enriched_{3}{4}".format(directory, os.path.sep, name, date, Ut.batch_extension())

    # MAKE SURE THE FOLDER EXISTS
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
    except OSError as err:
        print "\n\t[utility_LOAD_TRIPLE_STORE:]", err
        return

    print "\n1. GETTING THE TOTAL NUMBER OF TRIPLES."
    count_query = enrich_query(specs, limit=0, offset=0, is_count=True)
    print count_query
    count_res = Qry.virtuoso_request(count_query, endpoint)
    result = count_res['result']

    # GET THE TOTAL NUMBER OF TRIPLES
    if result is None:
        print "NO RESULT FOR THIS ENRICHMENT."
        return count_res
    g = rdflib.Graph()
    g.parse(data=result, format="turtle")
    attribute = rdflib.URIRef("http://www.w3.org/2005/sparql-results#value")
    for subject, predicate, obj in g.triples((None, attribute, None)):
        total = int(obj)

    # NUMBER OF REQUESTS NEEDED
    iterations = total / limit if total % limit == 0 else total / limit + 1
    print "\n2. TOTAL TRIPLES TO RETRIEVE : {} \n\tTOTAL NUMBER OF ITERATIONS : {}\n".format(total, iterations)

    writer = codecs.open(f_path, "wb", "utf-8")
    batch_writer = codecs.open(b_path, "wb", "utf-8")

    print "3. GENERATING THE BATCH FILE TEXT"
    enriched_graph = "{}_enriched".format(specs[St.graph])
    stardog_path = '' if Ut.OPE_SYS == "windows" else Svr.settings[St.stardog_path]
    load_text = """echo "Loading data"
{}stardog data add {} -g {} "{}"
""".format(stardog_path, Svr.settings[St.database], enriched_graph, f_path)
    batch_writer.write(to_unicode(load_text))
    batch_writer.close()

    # RUN THE ITERATIONS
    for i in range(0, iterations):
        offset = i * limit + 1
        print "\tROUND: {} OFFSET: {}".format(i + 1, offset)

        # 1. GENERATING THE ENRICHMENT QUERY
        virtuoso = enrich_query(specs, limit=limit, offset=offset, is_count=False)
        # print virtuoso
        # exit(0)
        # print Qry.virtuoso(virtuoso)["result"]

        # 2. RUNNING THE QUERY + WRITING THE RESULT TO FILE
        writer.write(Qry.virtuoso_request(virtuoso, endpoint)["result"])
    writer.close()

    print "\n4. RUNNING THE BATCH FILE"
    print "\tTHE DATA IS BEING LOADED OVER HTTP POST." if Svr.settings[St.split_sys] is True \
        else "\tTHE DATA IS BEING LOADED AT THE STARDOG LOCAL HOST FROM BATCH."
    # os.system(b_path)  # RUN THE BATCH FILE
    print "\tFILE: {}".format(f_path)
    print "\tBATCH: {}\n".format(b_path)
    os.chmod(b_path, 0o777)
    Ut.batch_load(b_path)
    if os.path.exists(b_path) is True:
        os.remove(b_path)

    # TODO 1. REGISTER THE DATASET TO BE ENRICHED IF NOT YET REGISTERED
    # TODO 2. ADD THE ENRICHED DATASET TO THE RESEARCH QUESTION (REGISTER).
    # TODO 3. MAYBE, CREATE THE LINKSET BETWEEN THE SOURCE AND THE RESULT
    size = Qry.get_namedgraph_size(enriched_graph)
    print "JOB DONE...!!!!!!"

    return {
        St.message: "The selected dataset was enriched with the GADM boundary as {}. "
                    "{} triples were created.".format(enriched_graph, size),
        St.result: enriched_graph
    }
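# A small, self-contained illustration of the paging arithmetic used above:
# the number of LIMIT/OFFSET requests is the ceiling of total/limit, and the
# n-th request starts at offset n*limit + 1 (offsets are 1-based in this
# code). Runnable as-is under Python 2; the helper name is hypothetical.
def _example_paging_math(total, limit=20000):
    iterations = total / limit if total % limit == 0 else total / limit + 1
    offsets = [i * limit + 1 for i in range(0, iterations)]
    return iterations, offsets

# _example_paging_math(45000) -> (3, [1, 20001, 40001])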
def visualise(graphs, directory, credential):

    # production_directory = "/scratch/risis/data/rdf-data/links"
    # directory = production_directory

    writer = Buffer.StringIO()
    g = rdflib.Graph()
    source = {}
    target = {}
    attribute = {}
    src_count = 0
    trg_count = 0
    prd_count = 0
    singletons = {}
    triples = 0
    datasets = [None, None]
    code = 0

    for graph in graphs:
        # print graph
        code += 1
        links = export_alignment(graph)

        # THE MECHANISM USED
        mechanism = links['mechanism']
        # print "mechanism", mechanism

        # THE SOURCE AND TARGET DATASETS
        if datasets == [None, None]:
            if str(links["type"]) == Ns.lens_type:
                datasets = links["lens_targets"]
            else:
                datasets = [links["source"], links['target']]

        # MAKE SURE THAT FOR ALL ALIGNMENTS, THE SOURCE AND TARGET DATASETS ARE THE SAME
        elif datasets != [links["source"], links['target']]:
            print "No visualisation for different set of source-target"
            return None

        print "DATASETS: ", datasets
        # print links['result']

        if links['result'] is not None:

            # LOAD THE CORRESPONDENCES INTO THE MAIN GRAPH
            g.parse(data=links['result'], format="turtle")

            # INDEX THE CORRESPONDENCES USING THE SINGLETON PROPERTY
            sg = rdflib.Graph()
            sg.parse(data=links['result'], format="turtle")
            triples += len(sg)
            for subject, predicate, obj in sg.triples((None, None, None)):
                mech = "{}_{}".format(mechanism, code)
                if predicate not in singletons:
                    singletons[predicate] = [mech]
                elif mech not in singletons[predicate]:
                    singletons[predicate] += [mech]

    # WRITING THE FILE
    count = 0
    writer.write("PREFIX ll: <{}>\n".format(Ns.alivocab))
    writer.write("PREFIX rdf: <{}>\n".format(Ns.rdf))
    writer.write("PREFIX link: <http://risis.eu/alignment/link/>\n")
    writer.write("PREFIX plot: <http://risis.eu/alignment/plot/>\n")
    writer.write("PREFIX mechanism: <{}>\n".format(Ns.mechanism))

    print "size: ", len(datasets)
    if len(datasets) > 2:
        name = hash("".join(datasets))
        name = "{}".format(str(name).replace("-", "P")) \
            if str(name).__contains__("-") else "P{}".format(name)
    else:
        name = "{}_{}".format(Ut.get_uri_local_name(datasets[0]), Ut.get_uri_local_name(datasets[1]))
    print "NAME: ", name

    # DROP THE GRAPH IF IT ALREADY EXISTS
    writer.write("\n#DROP SILENT GRAPH plot:{} ;\n".format(name))

    # INSERT NEW DATA
    writer.write("#INSERT DATA\n#{")
    writer.write("\n\tplot:{}\n".format(name))
    writer.write("\t{")

    # GOING THROUGH ALL CORRESPONDENCES OF THE MAIN (MERGED) GRAPH
    for subject, predicate, obj in g.triples((None, None, None)):
        count += 1

        # INDEX THE SOURCE CORRESPONDENCE
        if subject not in source:
            src_count += 1
            source[subject] = src_count

        # INDEX THE TARGET CORRESPONDENCE
        if obj not in target:
            trg_count += 1
            target[obj] = trg_count

        # INDEX THE PAIR
        pre_code = "{}_{}".format(source[subject], target[obj])
        if pre_code not in attribute:
            prd_count += 1
            attribute[pre_code] = prd_count

        # WRITE THE PLOT COORDINATE AND ITS METADATA
        writer.write("\n\t\t### [ {} ]\n".format(count))
        writer.write("\t\t{}\n".format(predicate).replace(Ns.alivocab, "ll:"))
        writer.write("\t\t\tlink:source {} ;\n".format(source[subject]))
        writer.write("\t\t\tlink:target {} ;\n".format(target[obj]))
        writer.write("\t\t\tlink:source_uri <{}> ;\n".format(subject))
        writer.write("\t\t\tlink:target_uri <{}> ;\n".format(obj))
        for value in singletons[predicate]:
            if str(value) != "None_1":
                writer.write("\t\t\tlink:mechanism {} ;\n".format(value).replace(Ns.mechanism, "mechanism:"))
        writer.write("\t\t\trdf:type link:Link .\n")
        writer.write("")
    writer.write("\t}\n#}")

    # THE PATH OF THE OUTPUT FILES
    date = datetime.date.isoformat(datetime.date.today()).replace('-', '')
    f_path = "{0}{1}{1}{2}_plots_{3}.trig".format(directory, os.path.sep, name, date)
    b_path = "{0}{1}{1}{2}_plots_{3}{4}".format(directory, os.path.sep, name, date, Ut.batch_extension())
    print "DIRECTORY:", directory

    # MAKE SURE THE FOLDER EXISTS
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
    except OSError as err:
        print "\n\t[utility_LOAD_TRIPLE_STORE:]", err
        return

    # CREATE THE FILES
    plot_writer = codecs.open(f_path, "wb", "utf-8")
    batch_writer = codecs.open(b_path, "wb", "utf-8")

    # print "3. GENERATING THE BATCH FILE TEXT"
    # enriched_graph = "{}{}_plots".format(Ns.plot, name)
    # stardog_path = '' if Ut.OPE_SYS == "windows" else Svr.settings[St.stardog_path]
    # load_text = """echo "Loading data"
    # {}stardog data add {} -g {} "{}"
    # """.format(stardog_path, Svr.DATABASE, enriched_graph, f_path)

    # GENERATE THE BATCH FILE FOR AUTOMATIC LOAD
    user = "******"
    password = "******"
    if credential is not None:
        if "user" in credential:
            user = credential["user"]
        if "password" in credential:
            password = credential["password"]
    load_text = "echo \"Loading data\"\n" \
                "/usr/local/virtuoso-opensource/bin/isql 1112 {} {} exec=\"DB.DBA.TTLP_MT (file_to_string_output" \
                "('/scratch/risis/data/rdf-data/links/Plots/{}_plots{}.trig'), '', 'http://risis.eu/converted', " \
                "256);\"".format(user, password, name, date)
    batch_writer.write(to_unicode(load_text))
    batch_writer.close()
    os.chmod(b_path, 0o777)

    # WRITE THE CORRESPONDENCES TO FILE
    plot_writer.write(writer.getvalue())
    plot_writer.close()

    print "PLOT: {}".format(f_path)
    print "BATCH: {}".format(b_path)
    print "Job Done!!!"
    # Qry.virtuoso_request(writer.getvalue())
    # print count, triples
    # file.close()

    return {'result': writer.getvalue(), 'message': "Constructed"}
def cluster_d_test_statss(linkset, network_size=3, targets=None, directory=None,
                          greater_equal=True, print_it=False, limit=None, activated=False):

    network = []
    print "LINK NETWORK INVESTIGATION"
    if activated is False:
        print "\tTHE FUNCTION IS NOT ACTIVATED"
        return ""

    date = datetime.date.isoformat(datetime.date.today()).replace('-', '')
    linkset_name = Ut.get_uri_local_name(linkset)
    count_1 = 0
    count_2 = 0
    sheet_builder = Buffer.StringIO()
    analysis_builder = Buffer.StringIO()
    sheet_builder.write("Count ID STRUCTURE E-STRUCTURE-SIZE A. NETWORK QUALITY"
                        " M. NETWORK QUALITY REFERENCE\n")
    linkset = linkset.strip()
    check = False

    # RUN THE CLUSTERING
    clusters_0 = Cls.links_clustering(linkset, limit)

    for i_cluster in clusters_0.items():

        # network = []
        resources = ""
        uri_size = 0
        count_1 += 1
        children = i_cluster[1][St.children]
        cluster_size = len(children)
        # if "<http://www.grid.ac/institutes/grid.10493.3f>" not in children:
        #     continue
        check = cluster_size >= network_size if greater_equal else cluster_size == network_size

        # NETWORK OF A PARTICULAR SIZE
        if check:
            count_2 += 1
            # file_name = i_cluster[0]

            # 2: FETCHING THE CORRESPONDENTS
            smallest_hash = float('inf')
            child_list = ""
            for child in children:
                hashed = hash(child)
                if hashed <= smallest_hash:
                    smallest_hash = hashed

                # GENERAL INFO 1: RESOURCES INVOLVED
                child_list += "\t{}\n".format(child)
                use = "<{}>".format(child) if Ut.is_nt_format(child) is not True else child
                resources += "\n\t\t\t\t{}".format(use)
                if len(child) > uri_size:
                    uri_size = len(child)

            if directory:

                # MAKE SURE THE FILE NAME OF THE CLUSTER IS ALWAYS THE SAME
                file_name = "{}".format(str(smallest_hash).replace("-", "N")) \
                    if str(smallest_hash).startswith("-") else "P{}".format(smallest_hash)

                # QUERY FOR FETCHING ALL LINKED RESOURCES FROM THE LINKSET
                query = """
    PREFIX prov: <{3}>
    PREFIX ll: <{4}>
    SELECT DISTINCT ?lookup ?object ?Strength ?Evidence
    {{
        VALUES ?lookup{{ {0} }}

        {{ GRAPH <{1}> {{ ?lookup ?predicate ?object . }} }}
        UNION
        {{ GRAPH <{1}> {{ ?object ?predicate ?lookup . }} }}

        GRAPH <{2}>
        {{
            ?predicate prov:wasDerivedFrom ?DerivedFrom .
            OPTIONAL {{ ?DerivedFrom ll:hasStrength ?Strength . }}
            OPTIONAL {{ ?DerivedFrom ll:hasEvidence ?Evidence . }}
        }}
    }}""".format(resources, linkset, linkset.replace("lens", "singletons"), Ns.prov, Ns.alivocab)
                # print query

                # THE RESULT OF THE QUERY ABOUT THE LINKED RESOURCES
                response = Qry.sparql_xml_to_matrix(query)

                # A DICTIONARY OF KEY: (SUBJECT, OBJECT) VALUE: STRENGTH
                response_dic = dict()
                result = response[St.result]
                if result:
                    for i in range(1, len(result)):
                        key = (result[i][0], result[i][1])
                        if key not in response_dic:
                            response_dic[key] = result[i][2]
                # print response_dic

                # GENERAL INFO 2:
                info = "SIZE {} \nCLUSTER {} \nNAME {}\n".format(cluster_size, count_1, file_name)
                info2 = "CLUSTER [{}] NAME [{}] SIZE [{}]".format(count_1, file_name, cluster_size)
                analysis_builder.write("{}\n".format(info))
                print "{:>5} {}".format(count_2, info2)

                analysis_builder.write("RESOURCES INVOLVED\n")
                analysis_builder.write(child_list)
                analysis_builder.write("\nCORRESPONDENT FOUND ")
                analysis_builder.write(
                    Qry.display_matrix(response, spacing=uri_size, output=True, line_feed='.', is_activated=True))

                # INFO TYPE 3: PROPERTY-VALUES OF THE RESOURCES INVOLVED
                analysis_builder.write("\n\nDISAMBIGUATION HELPER ")
                if targets is None:
                    analysis_builder.write(Cls.disambiguate_network(linkset, children))
                else:
                    analysis_builder.write(Cls.disambiguate_network_2(children, targets))

                position = i_cluster[1][St.row]
                if St.annotate in i_cluster[1]:
                    analysis_builder.write("\n\nANNOTATED CLUSTER PROCESS")
                    analysis_builder.write(i_cluster[1][St.annotate])

                # THE CLUSTER
                # print "POSITION: {}".format(position)
                # print "\nMATRIX DISPLAY\n"
                # for i in range(0, position):
                #     resource = (i_cluster[1][St.matrix])[i]
                #     print "\t{}".format(resource[:position])
                #     print "\t{}".format(resource)

                # GENERATING THE NETWORK AS A LIST OF TUPLES,
                # WHERE A TUPLE REPRESENTS TWO RESOURCES IN A RELATIONSHIP :-)
                network = []
                for i in range(1, position):
                    for j in range(1, position):
                        if (i, j) in (i_cluster[1][St.matrix_d]) and (i_cluster[1][St.matrix_d])[(i, j)] != 0:
                            r = (i_cluster[1][St.matrix_d])[(i, 0)]
                            c = (i_cluster[1][St.matrix_d])[(0, j)]
                            r_name = "{}:{}".format(i, Ut.get_uri_local_name(r))
                            c_name = "{}:{}".format(j, Ut.get_uri_local_name(c))
                            network += [(r_name, c_name)]
                            # network += [(r_smart, c_smart)]
                # print "\tNETWORK", network

                if print_it:
                    print ""
                    print analysis_builder.getvalue()

                # SETTING THE DIRECTORY
                if directory:
                    # linkset_name = Ut.get_uri_local_name(linkset)
                    # date = datetime.date.isoformat(datetime.date.today()).replace('-', '')
                    temp_directory = "{}{}".format(directory, "\{}_Analysis_{}\{}\{}_{}\\".format(
                        network_size, date, linkset_name, cluster_size, file_name))
                    if not os.path.exists(temp_directory):
                        os.makedirs(temp_directory)

                    """ PLOTTING """
                    # FIRE THE DRAWING. SUPPORTED FORMATS: eps, pdf, pgf, png, ps, raw, rgba, svg, svgz.
                    analysis_builder.write(
                        draw_graph(graph=network,
                                   file_path="{}{}.{}".format(
                                       temp_directory, "cluster_{}".format(file_name), "pdf"),
                                   show_image=False))

                    """ WRITING TO DISC """
                    Ut.write_2_disc(file_directory=temp_directory, file_name="cluster_{}".format(file_name),
                                    data=analysis_builder.getvalue(), extension="txt")
                    analysis_builder = Buffer.StringIO()

            if directory:
                if network:
                    automated_decision = metric(network)["AUTOMATED_DECISION"]
                    eval_sheet(targets, count_2, "{}_{}".format(cluster_size, file_name),
                               sheet_builder, linkset, children, automated_decision)
                else:
                    print network

    if directory:
        # if len(sheet_builder.getvalue()) > 150 and count_2 == 2:
        if len(sheet_builder.getvalue()) > 150 and len(clusters_0) == count_1:
            tmp_directory = "{}{}".format(directory, "\{}_Analysis_{}\{}\\".format(
                network_size, date, linkset_name))

            """ WRITING CLUSTER SHEET TO DISC """
            print "\nWRITING CLUSTER SHEET AT\n\t{}".format(tmp_directory)
            Ut.write_2_disc(file_directory=tmp_directory, file_name="{}_ClusterSheet".format(cluster_size),
                            data=sheet_builder.getvalue(), extension="txt")
        # if count_2 == 2:
        #     break

    print ">>> FOUND: {}".format(count_2)
    if directory is None:
        return "{}\t{}".format(network_size, count_2)
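# A minimal usage sketch, assuming a hypothetical linkset URI. The function
# only runs with activated=True; with directory=None it returns the tab
# separated "<network size>\t<cluster count>" summary instead of writing
# the per-cluster analysis files and the cluster sheet to disc.
def _example_cluster_stats_call():
    linkset = "http://risis.eu/linkset/example_linkset"  # hypothetical
    return cluster_d_test_statss(linkset, network_size=3, greater_equal=True,
                                 limit=100, activated=True)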
def set_linkset_expands_name(specs):

    unique = ""
    source = specs[St.source]
    target = specs[St.target]

    if St.reducer in source:
        unique += source[St.reducer]

    # GEO DATA
    # unit_value = ""
    if St.longitude in source:
        unique += source[St.longitude]
    if St.latitude in source:
        unique += source[St.latitude]
    if St.longitude in target:
        unique += target[St.longitude]
    if St.latitude in target:
        unique += target[St.latitude]
    if St.unit in specs:
        unique += str(specs[St.unit])
        unit = Ut.get_uri_local_name(str(specs[St.unit]))
        unique += unit
    if St.unit_value in specs:
        unique += str(specs[St.unit_value])
        unit_value = str(specs[St.unit_value])
        unique += unit_value

    if St.reducer in specs[St.target]:
        unique += target[St.reducer]

    if St.intermediate_graph in specs:
        unique = str(specs[St.intermediate_graph])

    if St.threshold in specs:
        unique += str(specs[St.threshold])

    if St.delta in specs:
        unique += str(specs[St.delta])

    if St.aligns_name in source:
        unique += source[St.aligns_name]
    elif St.latitude_name in source:
        # src_aligns += source[St.latitude_name]
        unique += "Latitude"

    if St.longitude_name in source:
        # src_aligns += source[St.longitude_name]
        unique += "Longitude"

    if St.aligns_name in target:
        unique += target[St.aligns_name]
    elif St.latitude_name in target:
        # trg_aligns += target[St.latitude_name]
        unique += "Latitude"

    if St.longitude_name in target:
        # trg_aligns += target[St.longitude_name]
        unique += "Longitude"

    dir_name = DIRECTORY
    date = datetime.date.isoformat(datetime.date.today()).replace('-', '')
    unique = Ut.hash_it(specs[St.mechanism] + source[St.graph_name] + target[St.graph_name] +
                        source[St.entity_datatype] + target[St.entity_datatype] + unique)

    if St.expands in specs:
        specs[St.linkset_name] = "expands_{}_{}".format(specs[St.expands_name], unique)
        specs[St.linkset] = "{}{}".format(Ns.linkset, specs[St.linkset_name])

        singleton_metadata_file = "{}(SingletonMetadata)-{}.trig".format(specs[St.linkset_name], date)
        singleton_metadata_output = "{}/{}".format(dir_name, singleton_metadata_file)
        future_path = os.path.join(DIRECTORY, singleton_metadata_output)
        future_path = future_path.replace("\\", "/").replace("//", "/")

        # FALL BACK TO A FULLY HASHED NAME WHEN THE OUTPUT PATH WOULD EXCEED 255 CHARACTERS
        if len(future_path) > 255:
            full_hashed = Ut.hash_it(specs[St.linkset_name])
            specs[St.linkset_name] = "expands_{}_{}_{}".format(
                source[St.graph_name], specs[St.mechanism], full_hashed)

        print "\t- specs[St.linkset]", specs[St.linkset]
        return specs[St.linkset]
    else:
        return set_linkset_name(specs)
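# A self-contained illustration of the path-length guard above: when the
# future metadata file path would exceed 255 characters (a common file
# system limit), the name falls back to a shorter, fully hashed variant.
# The helper names are hypothetical; md5 stands in for Ut.hash_it.
def _example_path_guard(linkset_name, directory="/tmp"):
    import hashlib

    def _fake_hash(text):
        return hashlib.md5(text).hexdigest()  # hypothetical stand-in for Ut.hash_it

    future_path = "{}/{}(SingletonMetadata)-20170712.trig".format(directory, linkset_name)
    if len(future_path) > 255:
        return "expands_{}".format(_fake_hash(linkset_name))
    return linkset_name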
def intersection_extended(specs, lens_name, display=False):

    # print Ut.headings("EXTENDED INTERSECTION")

    inter = ""
    insert = Buffer.StringIO()
    insert_sing = Buffer.StringIO()

    model_1 = """
    ### ABOUT {0}
    GRAPH <{0}>
    {{
        {1}
    }}
    """

    model_2 = """
    ### {2}. ABOUT {0}
    GRAPH <{0}>
    {{
        ?{1} ?pred_{2} ?{3} .
    }}

    ### SINGLETONS
    GRAPH <{4}>
    {{
        ?pred_{2} ?x_{2} ?y_{2} .
    }}"""

    count_graph = 1
    for graph in specs[St.datasets]:

        query = """
    PREFIX void: <{}>
    PREFIX bdb: <{}>
    SELECT DISTINCT ?subTarget ?objTarget ?subjectEntityType ?objectEntityType
    {{
        <{}>
            # void:target*/(void:subjectsTarget|void:objectsTarget)* ?x ;
            void:target*/(void:subjectsTarget|void:objectsTarget)* ?x .
        ?x
            void:subjectsTarget ?subTarget ;
            void:objectsTarget ?objTarget ;
            bdb:subjectsDatatype ?subjectEntityType ;
            bdb:objectsDatatype ?objectEntityType .
        FILTER NOT EXISTS {{ ?subTarget a void:Linkset }}
        FILTER NOT EXISTS {{ ?objTarget a void:Linkset }}
    }}""".format(Ns.void, Ns.bdb, graph)
        # print "INTERSECTION QUERY:", query

        response = sparql_xml_to_matrix(query)
        if display:
            print "INTERSECTION QUERY:", query
            # print "\nGRAPH:", graph
            # print "RESPONSE:", response
        # exit(0)

        if response:
            targets = response[St.result]

            # IF THE RESULT HAS MORE THAN ONE TARGET PAIR (HEADER + ROWS)
            # print "LENGTH:", len(targets)
            if targets is not None and len(targets) > 2:
                union = ""
                for i in range(1, len(targets)):
                    append = "UNION" if i < len(targets) - 1 else ""
                    tab = "" if i == 1 else ""
                    src = Ut.get_uri_local_name(targets[i][0])
                    trg = Ut.get_uri_local_name(targets[i][1])
                    if src[0].isdigit():
                        src = "D{}".format(src)
                    if trg[0].isdigit():
                        trg = "D{}".format(trg)
                    src_TYPE = Ut.get_uri_local_name(targets[i][2])
                    trg_TYPE = Ut.get_uri_local_name(targets[i][3])
                    src_variable = "{}_{}_1".format(src, src_TYPE[short:])
                    if src == trg and src_TYPE == trg_TYPE:
                        trg_variable = "{}_{}_2".format(trg, trg_TYPE[short:])
                    else:
                        trg_variable = "{}_{}_1".format(trg, trg_TYPE[short:])
                    union += "\n\t\t{0}{{ ?{1} ?predicate_{2} ?{3} . }} {4}".format(
                        tab, src_variable, i, trg_variable, append)
                union = model_1.format(graph, union)
                # print "UNION:", union
                inter += union

            # ONLY TWO TARGETS (HEADER + ONE ROW)
            elif targets and len(targets) == 2:
                src = Ut.get_uri_local_name(targets[1][0])
                trg = Ut.get_uri_local_name(targets[1][1])
                if src[0].isdigit():
                    src = "D{}".format(src)
                if trg[0].isdigit():
                    trg = "D{}".format(trg)
                src_TYPE = Ut.get_uri_local_name(targets[1][2])
                trg_TYPE = Ut.get_uri_local_name(targets[1][3])
                src_variable = "{}_{}_1".format(src, src_TYPE[short:])
                if src == trg and src_TYPE == trg_TYPE:
                    trg_variable = "{}_{}_2".format(trg, trg_TYPE[short:])
                else:
                    trg_variable = "{}_{}_1".format(trg, trg_TYPE[short:])
                inter += model_2.format(
                    graph, src_variable, count_graph, trg_variable,
                    "{}{}".format(Ns.singletons, Ut.get_uri_local_name_plus(graph)))
                insert.write("\t\t?{} ?pred_{} ?{} .\n".format(src_variable, count_graph, trg_variable))
                insert_sing.write("\t\t?pred_{0} ?x_{0} ?y_{0}.\n".format(count_graph))

        count_graph += 1

    # print inter
    # exit(0)

    insert_query = """INSERT
{{
    ### LINKS
    GRAPH <{5}{4}>
    {{
{1}\t}}

    ### METADATA
    GRAPH <{6}{4}>
    {{
{3}\t}}
}}
WHERE
{{{2}
}}
""".format("", insert.getvalue(), inter, insert_sing.getvalue(), lens_name, Ns.lens, Ns.singletons)

    return insert_query
def intersection2(specification):

    print "\nINTERSECTION TASK" \
          "\n======================================================" \
          "========================================================\n"

    query = ""
    p_count = 0
    up_count = 0
    check = dict()
    union_check = dict()
    # target_check = dict()
    view_lens = specification[St.datasets]
    # print datasets

    for graph in view_lens:

        if type(graph) is not str and type(graph) is not unicode:
            # print dataset
            print "THE DATASET MUST BE OF TYPE STRING. {} WAS GIVEN.".format(type(graph))
            return None

        # NAME OF THE GRAPH
        graph_name = Ut.get_uri_local_name(graph)
        # print "Dataset:", dataset

        # GET THE TYPE OF THE GRAPH
        graph_type = get_graph_type(graph)

        if graph_type[St.message] != "NO RESPONSE":

            if graph_type[St.result] is not None:
                # print "\tABLE TO RETRIEVE THE TYPE {}".format(graph_type)

                # EXPECTING ONE RESULT. BECAUSE THE MATRIX HAS A HEADER, THE LENGTH NEEDS TO BE 2
                if len(graph_type[St.result]) == 2:

                    # EXPECTING A LENS DATATYPE
                    if graph_type[St.result][1][0] == "{}Lens".format(Ns.bdb):

                        operator = get_lens_operator(graph)
                        # print "\tLENS GENERATED BY {}".format(operator)

                        if (operator is not None) and (operator == "{}".format(Ns.lensOpu)):

                            if graph not in check:
                                check[graph] = 1

                                # print "\tGETTING TARGET GRAPHS"
                                targets = get_graph_targets(graph)

                                if targets[St.result] is not None:
                                    # print "\tABLE TO RETRIEVE TARGETS {}".format(targets)
                                    union_query = ""
                                    graphs = list()

                                    for i in range(1, len(targets[St.result])):
                                        target = targets[St.result][i][0]
                                        # print "target: ", target

                                        # GET SOURCE AND TARGET DATASETS
                                        src_trg = get_graph_source_target(target)
                                        if src_trg[St.result] is not None:
                                            src = Ut.get_uri_local_name(src_trg[St.result][1][0])
                                            trg = Ut.get_uri_local_name(src_trg[St.result][1][1])
                                            # print "\tSOURCE: {} TARGET: {}".format(src, trg)

                                            if "{}_{}".format(src, trg) in union_check:
                                                # THE PAIR ALREADY EXISTS
                                                up_count += 1
                                                # print "\t{}_{} already exists".format(src, trg)
                                            else:
                                                union_check["{}_{}".format(src, trg)] = up_count
                                                temp = "\n\t\tGRAPH <{}> \n\t\t{{ " \
                                                       "\n\t\t\t?{} ?pred_{} ?{} . " \
                                                       "\n\t\t}}".format(graph, src, up_count, trg)
                                                # print "\tTHE RESULTING GRAPH {}".format(temp)
                                                graphs.append(temp)
                                        else:
                                            pass  # NO SOURCE AND TARGET DATASETS

                                    # query += "\n\tGRAPH <{}> \n\t{{ {} \n\t}}".format(dataset, triples)
                                    if len(union_check) > 1:
                                        for i in range(len(graphs)):
                                            if i == 0:
                                                union_query += "\n\t### LENS BY UNION: {}\n\t{{{}\n\t}}".format(
                                                    graph_name, graphs[i])
                                            elif i > 0:
                                                union_query += "\n\tUNION\n\t{{{}\n\t}}".format(graphs[i])
                                    else:
                                        union_query += "\n\t### LENS BY UNION: {}\n\t{{{}\n\t}}".format(
                                            graph_name, graphs[0])
                                    query += union_query
                                else:
                                    pass  # NO TARGET FOR THIS GRAPH
                            else:
                                # THE DATASET ALREADY EXISTS
                                # print "\tTHE DATASET ALREADY EXISTS"
                                pass
                        else:
                            pass  # NOT A UNION OPERATOR

                    # IT IS A LINKSET
                    elif graph_type[St.result][1][0] == "{}Linkset".format(Ns.void):

                        # GET SOURCE AND TARGET DATASETS
                        src_trg = get_graph_source_target(graph)
                        if src_trg is not None:
                            src = Ut.get_uri_local_name(src_trg[St.result][1][0])
                            trg = Ut.get_uri_local_name(src_trg[St.result][1][1])
                            if graph in check:
                                print "already exists"
                            else:
                                p_count += 1
                                check[graph] = p_count
                                query += "\n\t### LINKSET: {}\n\tGRAPH <{}> \n\t{{\n\t\t?{} ?predicate_{} ?{} .\n\t}}". \
                                    format(graph_name, graph, src, p_count, trg)
            else:
                print "WE COULD NOT ACCESS THE TYPE OF THE GRAPH: <{}>.".format(graph)
        else:
            print Ec.ERROR_CODE_1
            return None

    # print query
    return query
def intersection(specs, display=False):

    inter = ""
    count_graph = 1
    for graph in specs[St.datasets]:

        query = """
    PREFIX void: <{}>
    PREFIX bdb: <{}>
    SELECT DISTINCT ?subTarget ?objTarget ?subjectEntityType ?objectEntityType
    {{
        <{}>
            # void:target*/(void:subjectsTarget|void:objectsTarget)* ?x ;
            void:target*/(void:subjectsTarget|void:objectsTarget)* ?x .
        ?x
            void:subjectsTarget ?subTarget ;
            void:objectsTarget ?objTarget ;
            bdb:subjectsDatatype ?subjectEntityType ;
            bdb:objectsDatatype ?objectEntityType .
        FILTER NOT EXISTS {{ ?subTarget a void:Linkset }}
        FILTER NOT EXISTS {{ ?objTarget a void:Linkset }}
    }}""".format(Ns.void, Ns.bdb, graph)
        # print "INTERSECTION QUERY:", query

        response = sparql_xml_to_matrix(query)
        if display:
            print "INTERSECTION QUERY:", query
            # print "\nGRAPH:", graph
            # print "RESPONSE:", response
        # exit(0)

        if response:
            targets = response[St.result]

            # IF THE RESULT HAS MORE THAN ONE TARGET PAIR (HEADER + ROWS)
            # print "LENGTH:", len(targets)
            if targets is not None and len(targets) > 2:
                union = ""
                for i in range(1, len(targets)):
                    append = "UNION" if i < len(targets) - 1 else ""
                    tab = "" if i == 1 else ""
                    src = Ut.get_uri_local_name(targets[i][0])
                    trg = Ut.get_uri_local_name(targets[i][1])
                    if src[0].isdigit():
                        src = "D{}".format(src)
                    if trg[0].isdigit():
                        trg = "D{}".format(trg)
                    src_TYPE = Ut.get_uri_local_name(targets[i][2])
                    trg_TYPE = Ut.get_uri_local_name(targets[i][3])
                    src_variable = "{}_{}_1".format(src, src_TYPE[short:])
                    if src == trg and src_TYPE == trg_TYPE:
                        trg_variable = "{}_{}_2".format(trg, trg_TYPE[short:])
                    else:
                        trg_variable = "{}_{}_1".format(trg, trg_TYPE[short:])
                    union += "\n\t\t{0}{{ ?{1} ?predicate_{2} ?{3} . }} {4}".format(
                        tab, src_variable, i, trg_variable, append)
                union = """
    ### ABOUT {0}
    GRAPH <{0}>
    {{
        {1}
    }}
    """.format(graph, union)
                # print "UNION:", union
                inter += union

            # ONLY TWO TARGETS (HEADER + ONE ROW)
            elif targets and len(targets) == 2:
                src = Ut.get_uri_local_name(targets[1][0])
                trg = Ut.get_uri_local_name(targets[1][1])
                if src[0].isdigit():
                    src = "D{}".format(src)
                if trg[0].isdigit():
                    trg = "D{}".format(trg)
                src_TYPE = Ut.get_uri_local_name(targets[1][2])
                trg_TYPE = Ut.get_uri_local_name(targets[1][3])
                src_variable = "{}_{}_1".format(src, src_TYPE[short:])
                if src == trg and src_TYPE == trg_TYPE:
                    trg_variable = "{}_{}_2".format(trg, trg_TYPE[short:])
                else:
                    trg_variable = "{}_{}_1".format(trg, trg_TYPE[short:])
                inter += """
    ### ABOUT {0}
    GRAPH <{0}>
    {{
        ?{1} ?pred_{2} ?{3} .
    }}
    """.format(graph, src_variable, count_graph, trg_variable)

        count_graph += 1

    # print inter
    # exit(0)
    return inter
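# A minimal usage sketch: intersection() expects specs with St.datasets, a
# list of linkset/lens URIs (hypothetical below), and returns the GRAPH
# patterns that view() splices into the WHERE clause of the final SELECT.
def _example_intersection_call():
    specs = {St.datasets: ["http://risis.eu/linkset/example_linkset"]}  # hypothetical
    print intersection(specs, display=True)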
def geo_match_query(specs):

    # NOTE THAT FOR WKT FORMATTED POINTS, THE LOCATION IS <long, lat>.
    # THE LOCATION OF THE WHITE HOUSE CAN ALSO BE ENCODED USING WGS 84.

    # source = specs[St.source]
    # target = specs[St.target]
    # src_lat = source[St.latitude]
    # src_long = source[St.longitude]

    is_de_duplication = (specs[St.source][St.graph] == specs[St.target][St.graph]) and \
                        (specs[St.source][St.entity_datatype] == specs[St.target][St.entity_datatype])
    number_of_load = '{}_1'.format(specs[St.lens_name]) if is_de_duplication is True \
        else "{}_2".format(specs[St.lens_name])
    unit = "{}(s)".format(Ut.get_uri_local_name(specs[St.unit]).lower())

    match = """
    ######################################################################
    ### INSERTING MATCHES FOUND IN A TEMPORARY GRAPH
    ######################################################################
    PREFIX ll: <{0}>
    PREFIX tmpvocab: <{0}>
    PREFIX tmpgraph: <{1}>
    PREFIX lens: <{5}>
    PREFIX singleton: <{7}>
    PREFIX prov: <{12}>
    PREFIX geof: <http://www.opengis.net/def/function/geosparql/>
    PREFIX wgs: <http://www.w3.org/2003/01/geo/wgs84_pos#>
    INSERT
    {{
        GRAPH lens:{6}
        {{
            ?src_resource ?singPre ?trg_resource .
        }}
        GRAPH singleton:{6}
        {{
            ?singPre rdf:singletonPropertyOf ll:nearbyGeoSim{10} .
            ?singPre ll:hasEvidence "Near each other by at most {3} {9}" .
            ?singPre ll:hasStrength 1 .
            ?singPre ?pre_derived ?obj_derived .
            ?singPre ?der_pre ?der_obj .
        }}
    }}
    WHERE
    {{
        ### THE ALIGNMENT TO REFINE
        GRAPH lens:{11}
        {{
            ?src_resource ?singleton ?trg_resource .
        }}
        GRAPH singleton:{11}
        {{
            ?singleton ?pre_derived ?obj_derived .
            OPTIONAL
            {{
                ?obj_derived prov:wasDerivedFrom* ?der_from .
                ?der_from ?der_pre ?der_obj .
            }}
        }}

        ### SOURCE DATASET WITH GEO-COORDINATES
        GRAPH tmpgraph:load_{6}_1
        {{
            ?src_resource wgs:long ?src_longitude .
            ?src_resource wgs:lat ?src_latitude .

            ### CREATE A SINGLETON URI
            BIND( replace("{0}{8}_#", "#", STRAFTER(str(UUID()), "uuid:")) as ?pre )
            BIND( iri(?pre) as ?singPre )
        }}

        ### TARGET DATASET WITH GEO-COORDINATES
        GRAPH tmpgraph:load_{2}
        {{
            ?trg_resource wgs:long ?trg_longitude .
            ?trg_resource wgs:lat ?trg_latitude .
        }}

        ### MATCHING TARGETS NEARBY THE SOURCE
        ?src_resource geof:nearby (?trg_resource {3} <{4}>) .
    }}
    """.format(
        # 0            1            2               3                     4
        Ns.alivocab, Ns.tmpgraph, number_of_load, specs[St.unit_value], specs[St.unit],
        # 5      6                    7              8                    9     10
        Ns.lens, specs[St.lens_name], Ns.singletons, specs[St.mechanism], unit, specs[St.sameAsCount],
        # 11                    12
        specs[St.refined_name], Ns.prov)

    return match
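# A minimal usage sketch for geo_match_query(), assuming hypothetical spec
# values throughout (the unit URI in particular is an assumption). The
# generated SPARQL relies on the geof:nearby function used above and is only
# printed here, not executed.
def _example_geo_match_query():
    specs = {
        St.source: {St.graph: "http://risis.eu/dataset/example_grid",    # hypothetical
                    St.entity_datatype: "http://risis.eu/ontology/Organisation"},
        St.target: {St.graph: "http://risis.eu/dataset/example_orgref",  # hypothetical
                    St.entity_datatype: "http://risis.eu/ontology/Organisation"},
        St.lens_name: "example_lens",              # hypothetical
        St.refined_name: "example_refined_lens",   # hypothetical
        St.unit: "http://www.ontology-of-units-of-measure.org/resource/om-2/kilometre",
        St.unit_value: 5,
        St.mechanism: "nearbyGeo",
        St.sameAsCount: 1
    }
    print geo_match_query(specs)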
def cluster_d_test(linkset, network_size=3, network_size_max=3, targets=None,
                   constraint_targets=None, constraint_text="", directory=None,
                   greater_equal=True, print_it=False, limit=None, only_good=False, activated=False):

    # FOR THE CONSTRAINTS TO WORK, [constraint_targets] SHOULD NOT BE NONE
    network = []
    print "\nLINK NETWORK INVESTIGATION"

    if activated is False:
        print "\tTHE FUNCTION IS NOT ACTIVATED"
        return ""
    elif network_size > network_size_max and greater_equal is False:
        print "\t[network_size] SHOULD BE SMALLER THAN [network_size_max]"
        return ""

    date = datetime.date.isoformat(datetime.date.today()).replace('-', '')
    linkset_name = Ut.get_uri_local_name(linkset)
    linkset = linkset.strip()

    # INVESTIGATING A SINGLE SIZE MAKES "GREATER OR EQUAL" POINTLESS
    if network_size_max - network_size == 0:
        greater_equal = False
    check = False

    # RUN THE CLUSTERING
    clusters_0 = Cls.links_clustering(linkset, limit)

    # IN "GREATER OR EQUAL" MODE, EXTEND THE UPPER BOUND TO THE SIZE OF THE BIGGEST NETWORK FOUND
    if greater_equal is True:
        temp_size = 0
        for cluster, cluster_val in clusters_0.items():
            new_size = len(list(cluster_val["nodes"]))
            if new_size > temp_size:
                temp_size = new_size
        network_size_max = temp_size
        print "THE BIGGEST NETWORK'S SIZE: {}".format(network_size_max)

    def check_constraint():

        # THE CONSTRAINT TEXT IS A COMMA-SEPARATED LIST OF ACCEPTED (CASE-INSENSITIVE) VALUES
        text = constraint_text.lower()
        text = text.split(",")

        # CONSTRAINT BUILDER
        c_builder = Buffer.StringIO()
        if constraint_targets is not None:
            for dictionary in constraint_targets:

                graph = dictionary[St.graph]
                data_list = dictionary[St.data]
                properties = data_list[0][St.properties]
                prop = properties[0] if Ut.is_nt_format(properties[0]) else "<{}>".format(properties[0])

                # WRITING THE CONSTRAINT ON THE GRAPH
                graph_q = """
        {{ GRAPH <{0}>
            {{ ?lookup {1} ?constraint . }}
        }}
        """.format(graph, prop)
                if len(c_builder.getvalue()) == 0:
                    c_builder.write(graph_q)
                else:
                    c_builder.write("UNION {}".format(graph_q))

        # WRITING THE FILTER
        if len(c_builder.getvalue()) > 0:
            for i in range(0, len(text)):
                if i == 0:
                    c_builder.write(""" FILTER (LCASE(STR(?constraint)) = "{}" """.format(text[i].strip()))
                else:
                    c_builder.write(""" || LCASE(STR(?constraint)) = "{}" """.format(text[i].strip()))
            c_builder.write(")")

        # THE RESULT OF THE QUERY ABOUT THE LINKED RESOURCES.
        # [resources] IS A CLOSURE VARIABLE: IT IS ASSIGNED IN THE LOOP BELOW BEFORE THIS FUNCTION IS CALLED
        query = Qry.cluster_rsc_strengths_query(resources, linkset)
        query = query.replace("# CONSTRAINTS IF ANY", c_builder.getvalue())
        # print query
        response = Qry.sparql_xml_to_matrix(query)
        if response[St.result] is None:
            return False
        return True

    for index in range(network_size, network_size_max + 1):

        count_1 = 0
        count_2 = 0
        curr_network_size = index
        print "\nCLUSTERS OF SIZE {}".format(index)
        sheet_builder = Buffer.StringIO()
        analysis_builder = Buffer.StringIO()
        sheet_builder.write("Count ID STRUCTURE E-STRUCTURE-SIZE A. NETWORK QUALITY"
                            " M. NETWORK QUALITY REFERENCE\n")

        for cluster, cluster_val in clusters_0.items():

            # network = []
            resources = ""
            uri_size = 0
            count_1 += 1
            children = list(cluster_val["nodes"])
            strengths = cluster_val["strengths"]
            cluster_size = len(children)
            # if "<http://www.grid.ac/institutes/grid.10493.3f>" not in children:
            #     continue
            check = cluster_size >= curr_network_size if greater_equal else cluster_size == curr_network_size

            # NETWORK OF A PARTICULAR SIZE
            if check:

                # file_name = i_cluster[0]

                # 2: FETCHING THE CORRESPONDENTS
                smallest_hash = float('inf')
                child_list = ""

                for child in children:

                    # CREATE THE HASHED ID AS THE CLUSTER NAME
                    hashed = hash(child)
                    if hashed <= smallest_hash:
                        smallest_hash = hashed

                    # GENERAL INFO 1: RESOURCES INVOLVED
                    child_list += "\t{}\n".format(child)

                    # LIST OF RESOURCES IN THE CLUSTER
                    use = "<{}>".format(child) if Ut.is_nt_format(child) is not True else child
                    resources += "\n\t\t\t\t{}".format(use)
                    if len(child) > uri_size:
                        uri_size = len(child)

                # MAKE SURE THE FILE NAME OF THE CLUSTER IS ALWAYS THE SAME
                file_name = "{}".format(str(smallest_hash).replace("-", "N")) \
                    if str(smallest_hash).startswith("-") else "P{}".format(smallest_hash)

                # SKIP THE CLUSTER WHEN IT DOES NOT MEET THE CONSTRAINT
                if constraint_targets is not None and check_constraint() is False:
                    continue

                count_2 += 1

                # THE RESULT OF THE QUERY ABOUT THE LINKED RESOURCES
                query = Qry.cluster_rsc_strengths_query(resources, linkset)
                response = Qry.sparql_xml_to_matrix(query)

                # GENERAL INFO 2:
                info = "SIZE {} \nCLUSTER {} \nNAME {}\n".format(cluster_size, count_1, file_name)
                info2 = "CLUSTER [{}] NAME [{}] SIZE [{}]".format(count_1, file_name, cluster_size)
                analysis_builder.write("{}\n".format(info))
                analysis_builder.write("RESOURCES INVOLVED\n")
                analysis_builder.write(child_list)
                analysis_builder.write("\nCORRESPONDENT FOUND ")
                analysis_builder.write(
                    Qry.display_matrix(response, spacing=uri_size, output=True, line_feed='.', is_activated=True))

                # INFO TYPE 3: PROPERTY-VALUES OF THE RESOURCES INVOLVED
                analysis_builder.write("\n\nDISAMBIGUATION HELPER ")
                if targets is None:
                    analysis_builder.write(Cls.disambiguate_network(linkset, children))
                else:
                    report = Cls.disambiguate_network_2(children, targets)
                    if report is not None:
                        analysis_builder.write(report)

                # GENERATING THE NETWORK AS A LIST OF TUPLES WHERE EACH TUPLE REPRESENTS
                # TWO RESOURCES IN A RELATIONSHIP :-)
                network = []
                link_count = 0
                for link in cluster_val["links"]:
                    link_count += 1
                    name_1 = "{}-{}".format(Ut.hash_it(link[0]), Ut.get_uri_local_name(link[0]))
                    name_2 = "{}-{}".format(Ut.hash_it(link[1]), Ut.get_uri_local_name(link[1]))
                    network += [(name_1, name_2)]

                # GET THE AUTOMATED FLAG
                if print_it:
                    print ""
                    print analysis_builder.getvalue()

                # SETTING THE DIRECTORY
                if directory:

                    if network:
                        automated_decision = metric(network)["AUTOMATED_DECISION"]
                        if only_good is True and automated_decision.startswith("GOOD") is not True:
                            count_2 -= 1
                            continue
                        print "{:>5} {}".format(count_2, info2)
                        eval_sheet(targets, count_2, "{}_{}".format(cluster_size, file_name),
                                   sheet_builder, linkset, children, automated_decision)
                    else:
                        print network

                    # linkset_name = Ut.get_uri_local_name(linkset)
                    # date = datetime.date.isoformat(datetime.date.today()).replace('-', '')
                    temp_directory = "{}{}".format(directory, "\\{}_Analysis_{}\\{}\\{}_{}\\".format(
                        curr_network_size, date, linkset_name, cluster_size, file_name))
                    if not os.path.exists(temp_directory):
                        os.makedirs(temp_directory)

                    # ============ PLOTTING ============
                    # FIRE THE DRAWING. SUPPORTED FORMATS: eps, pdf, pgf, png, ps, raw, rgba, svg, svgz.
                    analysis_builder.write(
                        draw_graph(graph=network,
                                   file_path="{}{}.{}".format(
                                       temp_directory, "cluster_{}".format(file_name), "pdf"),
                                   show_image=False)
                    )

                    # ============ WRITING TO DISC ============
                    Ut.write_2_disc(file_directory=temp_directory, file_name="cluster_{}".format(file_name),
                                    data=analysis_builder.getvalue(), extension="txt")
                    analysis_builder = Buffer.StringIO()

            if directory:

                # if len(sheet_builder.getvalue()) > 150 and count_2 == 2:
                if len(sheet_builder.getvalue()) > 150 and len(clusters_0) == count_1:

                    tmp_directory = "{}{}".format(directory, "\\{}_Analysis_{}\\{}\\".format(
                        curr_network_size, date, linkset_name))

                    # ============ WRITING CLUSTER SHEET TO DISC ============
                    print "\n\tWRITING CLUSTER SHEET AT\n\t{}".format(tmp_directory)
                    Ut.write_2_disc(file_directory=tmp_directory,
                                    file_name="{}_ClusterSheet".format(cluster_size),
                                    data=sheet_builder.getvalue(), extension="txt")

                # if count_2 == 2:
                #     break

        if greater_equal is True:
            # NO NEED TO CONTINUE AS WE ALREADY COVERED ALL NETWORKS GREATER THAN
            # OR EQUAL TO THE [network_size] INPUT
            break

        print "\t>>> FOUND: {} CLUSTERS OF SIZE {}".format(count_2, curr_network_size)

    if directory is None:
        return "{}\t{}".format(curr_network_size, count_2)
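
# ----------------------------------------------------------------------------------------------
# A MINIMAL USAGE SKETCH, NOT PART OF THE ORIGINAL API. THE LINKSET URI BELOW IS AN INVENTED
# PLACEHOLDER. IT ILLUSTRATES THE TWO REPORTING MODES OF cluster_d_test: WITH directory=None THE
# FUNCTION ONLY RETURNS A "size<TAB>count" SUMMARY STRING; WITH A DIRECTORY IT ALSO WRITES THE
# ANALYSIS FILES, THE PDF PLOTS AND THE CLUSTER SHEETS TO DISC.
# ----------------------------------------------------------------------------------------------
def demo_cluster_d_test():
    example_linkset = "http://risis.eu/linkset/example"  # HYPOTHETICAL URI
    summary = cluster_d_test(
        example_linkset,
        network_size=3, network_size_max=5,
        greater_equal=False,  # INVESTIGATE SIZES 3, 4 AND 5 ONE BY ONE
        print_it=True,        # ECHO EACH CLUSTER ANALYSIS TO THE CONSOLE
        limit=500,            # CAP THE NUMBER OF LINKS PASSED TO THE CLUSTERING
        activated=True)       # THE FUNCTION IS A NO-OP UNLESS EXPLICITLY ACTIVATED
    print summary             # E.G. "5\t12": LAST SIZE INVESTIGATED AND ITS CLUSTER COUNT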
def cluster_d_test_stats(linkset, network_size=3, targets=None, directory=None,
                         greater_equal=True, print_it=False, limit=None, activated=False):

    network = []
    print "LINK NETWORK INVESTIGATION"

    if activated is False:
        print "\tTHE FUNCTION IS NOT ACTIVATED"
        return ""

    date = datetime.date.isoformat(datetime.date.today()).replace('-', '')
    linkset_name = Ut.get_uri_local_name(linkset)
    count_1 = 0
    count_2 = 0
    sheet_builder = Buffer.StringIO()
    analysis_builder = Buffer.StringIO()
    sheet_builder.write("Count ID STRUCTURE E-STRUCTURE-SIZE A. NETWORK QUALITY"
                        " M. NETWORK QUALITY REFERENCE\n")
    linkset = linkset.strip()
    check = False

    # RUN THE CLUSTERING
    clusters_0 = Cls.links_clustering(linkset, limit)

    for cluster, cluster_val in clusters_0.items():

        # network = []
        resources = ""
        uri_size = 0
        count_1 += 1
        children = list(cluster_val["nodes"])
        strengths = cluster_val["strengths"]
        cluster_size = len(children)
        # if "<http://www.grid.ac/institutes/grid.10493.3f>" not in children:
        #     continue
        check = cluster_size >= network_size if greater_equal else cluster_size == network_size

        # NETWORK OF A PARTICULAR SIZE
        if check:

            count_2 += 1
            # file_name = i_cluster[0]

            # 2: FETCHING THE CORRESPONDENTS
            smallest_hash = float('inf')
            child_list = ""

            for child in children:

                hashed = hash(child)
                if hashed <= smallest_hash:
                    smallest_hash = hashed

                # GENERAL INFO 1: RESOURCES INVOLVED
                child_list += "\t{}\n".format(child)
                use = "<{}>".format(child) if Ut.is_nt_format(child) is not True else child
                resources += "\n\t\t\t\t{}".format(use)
                if len(child) > uri_size:
                    uri_size = len(child)

            if directory:

                # MAKE SURE THE FILE NAME OF THE CLUSTER IS ALWAYS THE SAME
                file_name = "{}".format(str(smallest_hash).replace("-", "N")) \
                    if str(smallest_hash).startswith("-") else "P{}".format(smallest_hash)

                # THE RESULT OF THE QUERY ABOUT THE LINKED RESOURCES
                query = Qry.cluster_rsc_strengths_query(resources, linkset)
                response = Qry.sparql_xml_to_matrix(query)

                # GENERAL INFO 2:
                info = "SIZE {} \nCLUSTER {} \nNAME {}\n".format(cluster_size, count_1, file_name)
                info2 = "CLUSTER [{}] NAME [{}] SIZE [{}]".format(count_1, file_name, cluster_size)
                analysis_builder.write("{}\n".format(info))
                print "{:>5} {}".format(count_2, info2)

                analysis_builder.write("RESOURCES INVOLVED\n")
                analysis_builder.write(child_list)
                analysis_builder.write("\nCORRESPONDENT FOUND ")
                analysis_builder.write(
                    Qry.display_matrix(response, spacing=uri_size, output=True, line_feed='.', is_activated=True))

                # INFO TYPE 3: PROPERTY-VALUES OF THE RESOURCES INVOLVED
                analysis_builder.write("\n\nDISAMBIGUATION HELPER ")
                if targets is None:
                    analysis_builder.write(Cls.disambiguate_network(linkset, children))
                else:
                    analysis_builder.write(Cls.disambiguate_network_2(children, targets))

                # GENERATING THE NETWORK AS A LIST OF TUPLES WHERE EACH TUPLE REPRESENTS
                # TWO RESOURCES IN A RELATIONSHIP :-)
                network = []
                link_count = 0
                for link in cluster_val["links"]:
                    link_count += 1
                    name_1 = "{}".format(Ut.get_uri_local_name(link[0]))
                    name_2 = "{}".format(Ut.get_uri_local_name(link[1]))
                    network += [(name_1, name_2)]

                if print_it:
                    print ""
                    print analysis_builder.getvalue()

                # SETTING THE DIRECTORY
                # linkset_name = Ut.get_uri_local_name(linkset)
                # date = datetime.date.isoformat(datetime.date.today()).replace('-', '')
                temp_directory = "{}{}".format(directory, "\\{}_Analysis_{}\\{}\\{}_{}\\".format(
                    network_size, date, linkset_name, cluster_size, file_name))
                if not os.path.exists(temp_directory):
                    os.makedirs(temp_directory)

                # ============ PLOTTING ============
                # FIRE THE DRAWING. SUPPORTED FORMATS: eps, pdf, pgf, png, ps, raw, rgba, svg, svgz.
                analysis_builder.write(
                    draw_graph(graph=network,
                               file_path="{}{}.{}".format(
                                   temp_directory, "cluster_{}".format(file_name), "pdf"),
                               show_image=False)
                )

                # ============ WRITING TO DISC ============
                Ut.write_2_disc(file_directory=temp_directory, file_name="cluster_{}".format(file_name),
                                data=analysis_builder.getvalue(), extension="txt")
                analysis_builder = Buffer.StringIO()

                if network:
                    automated_decision = metric(network)["AUTOMATED_DECISION"]
                    eval_sheet(targets, count_2, "{}_{}".format(cluster_size, file_name),
                               sheet_builder, linkset, children, automated_decision)
                else:
                    print network

        if directory:

            # if len(sheet_builder.getvalue()) > 150 and count_2 == 2:
            if len(sheet_builder.getvalue()) > 150 and len(clusters_0) == count_1:

                tmp_directory = "{}{}".format(directory, "\\{}_Analysis_{}\\{}\\".format(
                    network_size, date, linkset_name))

                # ============ WRITING CLUSTER SHEET TO DISC ============
                print "\nWRITING CLUSTER SHEET AT\n\t{}".format(tmp_directory)
                Ut.write_2_disc(file_directory=tmp_directory,
                                file_name="{}_ClusterSheet".format(cluster_size),
                                data=sheet_builder.getvalue(), extension="txt")

            # if count_2 == 2:
            #     break

    print ">>> FOUND: {}".format(count_2)

    if directory is None:
        return "{}\t{}".format(network_size, count_2)
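
# A MINIMAL SKETCH (HYPOTHETICAL HELPER, NOT IN THE ORIGINAL CODE BASE): SWEEP cluster_d_test_stats
# OVER SEVERAL EXACT NETWORK SIZES AND COLLECT THE "size<TAB>count" ROWS IT RETURNS WHEN NO
# DIRECTORY IS GIVEN, YIELDING A SMALL TAB-SEPARATED SIZE/COUNT TABLE.
def sweep_cluster_sizes(linkset, sizes=(3, 4, 5)):
    rows = []
    for size in sizes:
        # WITH directory=None THE FUNCTION ONLY COUNTS THE CLUSTERS OF THE EXACT GIVEN SIZE
        row = cluster_d_test_stats(linkset, network_size=size,
                                   greater_equal=False, activated=True)
        if row:
            rows.append(row)
    return "\n".join(rows)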
def lens_refine_name(specs, lens_type):

    extra = ""
    source = specs[St.source]
    target = specs[St.target]

    if St.reducer in source:
        extra += source[St.reducer]

    # GEO DATA
    unit_value = ""
    if St.longitude in source:
        extra += source[St.longitude]
    if St.latitude in source:
        extra += source[St.latitude]
    if St.longitude in target:
        extra += target[St.longitude]
    if St.latitude in target:
        extra += target[St.latitude]
    if St.unit in specs:
        extra += str(specs[St.unit])
        unit = Ut.get_uri_local_name(str(specs[St.unit]))
    if St.unit_value in specs:
        extra += str(specs[St.unit_value])
        unit_value = str(specs[St.unit_value])

    if St.reducer in specs[St.target]:
        extra += target[St.reducer]
    if St.intermediate_graph in specs:
        intermediate = str(specs[St.intermediate_graph])
    if St.threshold in specs:
        extra += str(specs[St.threshold])
    if St.delta in specs:
        extra += str(specs[St.delta])

    if St.aligns_name in source:
        extra += source[St.aligns_name]
    elif St.latitude_name in source:
        # src_aligns += source[St.latitude_name]
        extra += "Latitude"
        if St.longitude_name in source:
            # src_aligns += source[St.longitude_name]
            extra += "Longitude"

    if St.aligns_name in target:
        extra += target[St.aligns_name]
    elif St.latitude_name in target:
        # trg_aligns += target[St.latitude_name]
        extra += "Latitude"
        if St.longitude_name in target:
            # trg_aligns += target[St.longitude_name]
            extra += "Longitude"

    # THE NAMESPACE COMES FIRST SO THAT THE RESULT IS A VALID URI
    unique = Ut.hash_it(extra)
    specs[St.lens] = u"{}refine_{}_{}".format(Ns.lens, specs[St.refined_name], unique)
    update_specification(specs)
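
# A MINIMAL SKETCH (HYPOTHETICAL HELPER) OF THE HASH-NAMING CONVENTION USED IN generate_lens_name
# AND set_linkset_name: PYTHON'S SIGNED hash() IS MADE NAME-SAFE BY PREFIXING "P" WHEN POSITIVE
# AND REPLACING THE MINUS SIGN WITH "N" WHEN NEGATIVE.
def uri_safe_hash(text):
    hashed = hash(text)
    return str(hashed).replace("-", "N") if hashed < 0 else "P{}".format(hashed)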
def stats_optimised(graph, display_table=False, display_text=False, boolean=True):

    optional = dict()
    text = Buffer.StringIO()

    # 1. FIND ALL TYPES IN THE GRAPH
    qry_types = """
    ### RETRIEVE ALL TYPES FROM THE GRAPH
    SELECT DISTINCT ?Types (count(distinct ?resource) as ?EntityCount)
    {{
        GRAPH <{}>
        {{
            ?resource a ?Types .
        }}
    }} GROUP BY ?Types
    ORDER BY ?Types
    """.format(graph)
    # print qry_types
    types_matrix = sparql_xml_to_matrix(qry_types)
    # print types_matrix
    # if display_table:
    display_matrix(types_matrix, spacing=70, limit=100, is_activated=display_table)

    # 2. FOR EACH TYPE, RETRIEVE ALL ITS PROPERTIES
    if types_matrix["result"] is not None:

        types = types_matrix["result"]
        for i in range(1, len(types)):

            curr_type = types[i][0]
            type_name = Ut.get_uri_local_name(curr_type)
            instances = int(types[i][1])
            optional[curr_type] = dict()

            qry_properties = """
    ### RETRIEVE ALL PROPERTIES FOR THE TYPE [{0}]
    SELECT DISTINCT ?Properties_for_{0}
    {{
        GRAPH <{1}>
        {{
            ?resource a <{2}> ;
                ?Properties_for_{0} ?object .
        }}
    }}
    """.format(type_name, graph, curr_type)
            properties_matrix = sparql_xml_to_matrix(qry_properties)
            # if display_table:
            #     print "\nPROPERTY COUNT:", len(properties_matrix["result"]) - 1
            display_matrix(properties_matrix, spacing=70, limit=100, is_activated=display_table)

            # PROPERTY OCCURRENCE COUNT: ONE GROUPED QUERY PER TYPE INSTEAD OF ONE QUERY PER PROPERTY
            pro_text = Buffer.StringIO()
            if properties_matrix["result"] is not None:

                pro_text.write("\nSELECT ?predicate (COUNT(distinct ?resource) as ?Occurrences)")
                pro_text.write("\n{{\n\tGRAPH <{}> ".format(graph))
                pro_text.write("\n\t{{\n\t\t?resource a <{}> .".format(curr_type))
                pro_text.write("\n\t\t?resource ?predicate ?object .")
                pro_text.write("\n\t}\n}\nGROUP BY ?predicate")
                cur_dic = optional[curr_type]

                # RUN THE QUERY FOR THE PROPERTY OCCURRENCES
                qry_property_stats = pro_text.getvalue()
                # print qry_property_stats
                occurrences_matrix = sparql_xml_to_matrix(qry_property_stats)
                # if display_table:
                display_matrix(occurrences_matrix, spacing=70, limit=100, is_activated=display_table)

                if occurrences_matrix["result"] is not None:
                    occurrences = occurrences_matrix["result"]
                    for j in range(1, len(occurrences)):
                        # THE PROPERTY IS THE KEY OF THE DICTIONARY.
                        # boolean=True FLAGS A PROPERTY AS OPTIONAL WHEN ITS OCCURRENCE COUNT IS NOT AN
                        # EXACT MULTIPLE OF THE INSTANCE COUNT; OTHERWISE THE COVERAGE PERCENTAGE IS STORED
                        if boolean is True:
                            cur_dic[occurrences[j][0]] = int(occurrences[j][1]) % float(instances) != 0
                        else:
                            cur_dic[occurrences[j][0]] = math.floor(100 * int(occurrences[j][1]) / float(instances))

    text.write("\nGRAPH: {}".format(graph))
    for key, value in optional.items():
        line = "-------------------------------------------------------------------------------------------------"
        text.write("\n\n\tENTITY TYPE: {}".format(key))
        text.write("\n\t\t{:100}{}".format(line, "------------"))
        text.write("\n\t\t{:<5}{:97}{}".format(len(optional[key]), "Properties", "Optional"))
        text.write("\n\t\t{:100}{}".format(line, "------------"))
        for pro, opt in value.items():
            if opt:
                text.write("\n\t\t{:100}{}".format("{} ***".format(pro), opt))
            else:
                text.write("\n\t\t{:100}{}".format(pro, opt))

    if display_text:
        print text.getvalue()

    return optional
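
# A MINIMAL USAGE SKETCH (THE GRAPH URI WOULD BE A PLACEHOLDER). WITH boolean=True THE RETURNED
# MAPPING FLAGS EACH PROPERTY OF EACH TYPE AS OPTIONAL OR NOT; THIS PRINTS THE OPTIONAL ONES ONLY.
def report_optional_properties(graph):
    per_type = stats_optimised(graph, display_table=False, display_text=False, boolean=True)
    for rdf_type, predicates in per_type.items():
        for predicate, is_optional in predicates.items():
            if is_optional:
                print "{:60} {}".format(rdf_type, predicate)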
def set_linkset_name(specs, inverse=False):

    src_aligns = ""
    trg_aligns = ""
    reducer = ""
    intermediate = ""
    threshold = ""
    delta = ""
    geo = ""
    unit = ""
    source = specs[St.source]
    target = specs[St.target]

    if St.reducer in source:
        reducer += source[St.reducer]

    # GEO DATA
    unit_value = ""
    if St.longitude in source:
        geo += source[St.longitude]
    if St.latitude in source:
        geo += source[St.latitude]
    if St.longitude in target:
        geo += target[St.longitude]
    if St.latitude in target:
        geo += target[St.latitude]
    if St.unit in specs:
        geo += str(specs[St.unit])
        unit = Ut.get_uri_local_name(str(specs[St.unit]))
    if St.unit_value in specs:
        geo += str(specs[St.unit_value])
        unit_value = str(specs[St.unit_value])

    if St.reducer in specs[St.target]:
        reducer += target[St.reducer]
    if St.intermediate_graph in specs:
        intermediate = str(specs[St.intermediate_graph])
    if St.threshold in specs:
        threshold += str(specs[St.threshold])
    if St.delta in specs:
        delta += str(specs[St.delta])

    if St.aligns_name in source:
        src_aligns += source[St.aligns_name]
    elif St.latitude_name in source:
        # src_aligns += source[St.latitude_name]
        src_aligns += "Latitude"
        if St.longitude_name in source:
            # src_aligns += source[St.longitude_name]
            src_aligns += "Longitude"

    if St.aligns_name in target:
        trg_aligns += target[St.aligns_name]
    elif St.latitude_name in target:
        # trg_aligns += target[St.latitude_name]
        trg_aligns += "Latitude"
        if St.longitude_name in target:
            # trg_aligns += target[St.longitude_name]
            trg_aligns += "Longitude"

    dir_name = DIRECTORY
    date = datetime.date.isoformat(datetime.date.today()).replace('-', '')

    if inverse is False:

        h_name = specs[St.mechanism] + \
            source[St.graph_name] + src_aligns + \
            target[St.graph_name] + trg_aligns + \
            source[St.entity_datatype] + target[St.entity_datatype] + "-" + \
            reducer + intermediate + threshold + delta + geo
        hashed = hash(h_name)
        append = str(hashed).replace(
            "-", "N") if str(hashed).__contains__("-") else "P{}".format(hashed)
        specs[St.linkset_name] = "{}_{}_{}{}{}_{}_{}_{}".format(
            source[St.graph_name], target[St.graph_name], specs[St.mechanism],
            unit_value, unit, source[St.entity_name], src_aligns, append)

        # IF THE FULL OUTPUT PATH WOULD EXCEED THE 255-CHARACTER LIMIT, FALL BACK TO A HASHED NAME
        singleton_metadata_file = "{}(SingletonMetadata)-{}.trig".format(specs[St.linkset_name], date)
        singleton_metadata_output = "{}/{}".format(dir_name, singleton_metadata_file)
        future_path = os.path.join(DIRECTORY, singleton_metadata_output)
        future_path = future_path.replace("\\", "/").replace("//", "/")
        if len(future_path) > 255:
            full_hashed = Ut.hash_it(specs[St.linkset_name])
            specs[St.linkset_name] = "{}_{}_{}".format(
                source[St.graph_name], specs[St.mechanism], full_hashed)
        # if len(specs[St.linkset_name]) > 255:
        #     specs[St.linkset_name] = Ut.hash_it(specs[St.linkset_name])

        specs[St.linkset] = "{}{}".format(Ns.linkset, specs[St.linkset_name])
        return specs[St.linkset]

    else:

        h_name = specs[St.mechanism] + \
            target[St.graph_name] + trg_aligns + \
            source[St.graph_name] + src_aligns + \
            target[St.entity_datatype] + source[St.entity_datatype] + "-" + \
            reducer + intermediate + threshold + delta + geo
        hashed = hash(h_name)
        append = str(hashed).replace(
            "-", "N") if str(hashed).__contains__("-") else "P{}".format(hashed)
        specs[St.linkset_name] = "{}_{}_{}{}{}_{}_{}_{}".format(
            target[St.graph_name], source[St.graph_name], specs[St.mechanism],
            unit_value, unit, target[St.entity_name], trg_aligns, append)

        # IF THE FULL OUTPUT PATH WOULD EXCEED THE 255-CHARACTER LIMIT, FALL BACK TO A HASHED NAME
        singleton_metadata_file = "{}(SingletonMetadata)-{}.trig".format(specs[St.linkset_name], date)
        singleton_metadata_output = "{}/{}".format(dir_name, singleton_metadata_file)
        future_path = os.path.join(DIRECTORY, singleton_metadata_output)
        future_path = future_path.replace("\\", "/").replace("//", "/")
        if len(future_path) > 255:
            full_hashed = Ut.hash_it(specs[St.linkset_name])
            specs[St.linkset_name] = "{}_{}_{}".format(
                target[St.graph_name], specs[St.mechanism], full_hashed)
        # if len(specs[St.linkset_name]) > 255:
        #     specs[St.linkset_name] = Ut.hash_it(specs[St.linkset_name])

        specs[St.linkset] = "{}{}".format(Ns.linkset, specs[St.linkset_name])
        print "\t- specs[St.linkset]", specs[St.linkset]
        return specs[St.linkset]
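
# ILLUSTRATIVE ONLY: EVERY VALUE BELOW IS AN INVENTED PLACEHOLDER; REAL SPECS ARE ASSEMBLED
# ELSEWHERE IN THE PIPELINE. THIS SHOWS THE MINIMAL KEYS set_linkset_name READS ON EVERY RUN:
# THE MECHANISM PLUS, FOR BOTH SOURCE AND TARGET, THE GRAPH NAME, ENTITY NAME, ENTITY DATATYPE
# AND AN ALIGNMENT (aligns_name, OR latitude_name/longitude_name FOR GEO MATCHING).
def demo_set_linkset_name():
    example_specs = {
        St.mechanism: "exactMatch",
        St.source: {St.graph_name: "grid", St.entity_name: "Organisation",
                    St.entity_datatype: "http://risis.eu/class/Organisation",  # HYPOTHETICAL URI
                    St.aligns_name: "name"},
        St.target: {St.graph_name: "orgref", St.entity_name: "Organisation",
                    St.entity_datatype: "http://risis.eu/class/Organisation",  # HYPOTHETICAL URI
                    St.aligns_name: "label"},
    }
    # EXPECTED SHAPE: "<Ns.linkset>grid_orgref_exactMatch_Organisation_name_P<hash>"
    print demo_result(example_specs) if False else set_linkset_name(example_specs)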
def stats(graph, display_table=False, display_text=False):

    optional = dict()
    stat = {}
    text = Buffer.StringIO()

    # 1. FIND ALL TYPES IN THE GRAPH
    qry_types = """
    ### RETRIEVE ALL TYPES FROM THE GRAPH
    SELECT DISTINCT ?Types (count(distinct ?resource) as ?EntityCount)
    {{
        GRAPH <{}>
        {{
            ?resource a ?Types .
        }}
    }} GROUP BY ?Types
    ORDER BY ?Types
    """.format(graph)
    types_matrix = sparql_xml_to_matrix(qry_types)
    # print types_matrix
    if display_table:
        display_matrix(types_matrix, spacing=70, limit=100, is_activated=True)

    # 2. FOR EACH TYPE, RETRIEVE ALL ITS PROPERTIES
    if types_matrix["result"] is not None:

        types = types_matrix["result"]
        for i in range(1, len(types)):

            curr_type = types[i][0]
            type_name = Ut.get_uri_local_name(curr_type)
            instances = int(types[i][1])
            optional[type_name] = dict()

            qry_properties = """
    ### RETRIEVE ALL PROPERTIES FOR THE TYPE [{0}]
    SELECT DISTINCT ?Properties_for_{0}
    {{
        GRAPH <{1}>
        {{
            ?resource a <{2}> ;
                ?Properties_for_{0} ?object .
        }}
    }}
    """.format(type_name, graph, curr_type)
            properties_matrix = sparql_xml_to_matrix(qry_properties)

            if properties_matrix["result"] is not None:

                columns = 4
                rows = len(properties_matrix["result"])
                if display_table:
                    print "\nPROPERTY COUNT:", len(properties_matrix["result"]) - 1
                display_matrix(properties_matrix, spacing=70, limit=100, is_activated=False)

                # PROPERTY OCCURRENCE COUNT
                matrix = [["" for x in range(columns)] for y in range(rows)]
                properties = properties_matrix["result"]
                matrix[0][0] = properties[0][0]
                matrix[0][1] = "Optional"
                matrix[0][2] = "Instances"
                matrix[0][3] = "Percentage"
                # print type_name
                cur_dic = optional[type_name]

                for j in range(1, len(properties)):

                    qry_occurrence = """
    ### RETRIEVE THE NUMBER OF OCCURRENCES FOR THIS PROPERTY
    ### TYPE     : {2}
    ### PROPERTY : {3}
    ### GRAPH    : {1}
    SELECT (count(?object) as ?Occurrences)
    {{
        GRAPH <{1}>
        {{
            ?resource a <{2}> ;
                <{3}> ?object .
        }}
    }}
    """.format(type_name, graph, curr_type, properties[j][0])
                    # print qry_occurrence
                    occurrences_matrix = sparql_xml_to_matrix(qry_occurrence)

                    if occurrences_matrix["result"] is not None:
                        # print occurrences_matrix["result"][1][0]
                        occurrences = int(occurrences_matrix["result"][1][0])
                        matrix[j][0] = properties[j][0]
                        matrix[j][2] = occurrences_matrix["result"][1][0]
                        matrix[j][3] = occurrences / float(instances)
                        # A PROPERTY IS MANDATORY WHEN ITS OCCURRENCE COUNT IS AN EXACT MULTIPLE
                        # OF THE INSTANCE COUNT; OTHERWISE IT IS FLAGGED AS OPTIONAL
                        if occurrences % float(instances) == 0:
                            matrix[j][1] = False
                            cur_dic[properties[j][0]] = False
                        else:
                            matrix[j][1] = True
                            cur_dic[properties[j][0]] = True

                # matrix = properties_matrix["result"] + matrix
                # print matrix
                to_display = {"message": "OK", "result": matrix}
                if display_table:
                    display_matrix(to_display, spacing=50, limit=100, is_activated=True)
                stat[type_name] = matrix

    text.write("\nGRAPH: {}".format(graph))
    for key, value in optional.items():
        line = "-------------------------------------------------------------------------------------------------"
        text.write("\n\n\tENTITY TYPE: {}".format(key))
        text.write("\n\t\t{:100}{}".format(line, "------------"))
        text.write("\n\t\t{:<3}{:97}{}".format(len(optional[key]), "Properties", "Optional"))
        text.write("\n\t\t{:100}{}".format(line, "------------"))
        for pro, opt in value.items():
            if opt:
                text.write("\n\t\t{:100}{}".format("{} ***".format(pro), opt))
            else:
                text.write("\n\t\t{:100}{}".format(pro, opt))

    if display_text:
        print text.getvalue()

    return optional
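
# A MINIMAL SKETCH (HYPOTHETICAL CONSISTENCY CHECK; THE GRAPH URI WOULD BE A PLACEHOLDER).
# stats() FIRES ONE COUNT QUERY PER PROPERTY WHILE stats_optimised() GROUPS THEM INTO A SINGLE
# QUERY PER TYPE, SO THE OPTIMISED VARIANT IS PREFERABLE ON LARGE GRAPHS. NOTE THE KEY DIFFERENCE:
# stats() KEYS ITS RESULT BY THE TYPE'S LOCAL NAME, stats_optimised() BY THE FULL TYPE URI.
def compare_optional_reports(graph):
    slow = stats(graph)                          # {type_local_name: {property: optional?}}
    fast = stats_optimised(graph, boolean=True)  # {type_uri: {property: optional?}}
    print "TYPES (stats): {} | TYPES (stats_optimised): {}".format(len(slow), len(fast))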