def export_flat_alignment_service(alignment):

    alignment = str(alignment).strip()
    row_alignment = alignment
    alignment = alignment if Ut.is_nt_format(alignment) is True else "<{}>".format(alignment)

    # CONSTRUCT QUERY
    query = """
    PREFIX ll: <{0}>
    PREFIX linkset: <{1}>
    PREFIX lens: <{2}>
    PREFIX singletons: <{3}>

    CONSTRUCT {{ ?srcCorr ll:mySameAs ?trgCorr .
                 ?trgCorr ll:mySameAs ?srcCorr . }}
    WHERE
    {{
        BIND( {4} as ?alignment )

        # THE ALIGNMENT GRAPH WITH EXPLICIT SYMMETRY
        GRAPH ?alignment {{ ?srcCorr ?singleton ?trgCorr . }}
    }} ;

    CONSTRUCT {{ ?alignment ?pred ?obj .
                 ?obj ?predicate ?object . }}
    WHERE
    {{
        # THE METADATA
        BIND( {4} as ?alignment )
        ?alignment ?pred ?obj .
        OPTIONAL {{ ?obj ?predicate ?object . }}
    }}
    """.format(Ns.alivocab, Ns.linkset, Ns.lens, Ns.singletons, alignment)

    # print query
    # exit(0)

    # FIRE THE CONSTRUCT AGAINST THE TRIPLE STORE
    alignment_construct = Qry.endpointconstruct(query)

    # COUNT THE CORRESPONDENCES AND REMOVE EMPTY LINES
    triples = len(regex.findall('ll:mySameAs', alignment_construct))
    alignment_construct = "\n".join(
        [line for line in alignment_construct.splitlines() if line.strip()])

    result = "### TRIPLE COUNT: {}\n### LINKSET: {}\n".format(triples, alignment) + alignment_construct
    message = "You have just downloaded the graph [{}] which contains [{}] correspondences. ".format(
        row_alignment, triples)

    return {'result': result, 'message': message}
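
# The function above (and most functions in this module) normalises a graph URI
# with the same idiom: keep it if it is already in N-Triples format, otherwise
# wrap it in angle brackets. A minimal self-contained sketch of that idiom,
# assuming (hypothetically) that Ut.is_nt_format only checks the brackets:
def _wrap_uri_sketch(uri):
    uri = str(uri).strip()
    # "<http://...>" can be injected into a SPARQL template as-is; anything else gets wrapped
    return uri if uri.startswith("<") and uri.endswith(">") else "<{}>".format(uri)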
def properties(graph, datatype=None):

    comment = "# " if datatype is None else ""
    datatype = datatype if Ut.is_nt_format(datatype) is True else "<{}>".format(datatype)
    graph = graph if Ut.is_nt_format(graph) is True else "<{}>".format(graph)

    properties = """
    # <http://www.w3.org/1999/02/22-rdf-syntax-ns#type>
    SELECT DISTINCT ?predicate
    WHERE
    {{
        GRAPH {}
        {{
            {}?subj {} ?type .
            ?subj ?predicate ?obj .
        }}
    }}
    """.format(graph, comment, datatype)

    print properties
    Qr.display_result(query=properties, spacing=50, limit=0, is_activated=True)
def linkset_createdorused(question_uri, alignment_mapping_uri, specs, is_created=True):

    if "<<" in alignment_mapping_uri:
        alignment_mapping_uri = str(alignment_mapping_uri).replace("<<", "<").replace(">>", ">")
    if Ut.is_nt_format(alignment_mapping_uri) is False:
        alignment_mapping_uri = "<{}>".format(alignment_mapping_uri)

    linkset_uri = specs[St.refined] if St.refined in specs else specs[St.linkset]
    comment = "#"

    if is_created is True:
        created = "alivocab:created"
        opposed = "prov:used\t\t"
        print "REGISTERING [{}] AS CREATED".format(linkset_uri)
    else:
        created = "prov:used\t\t"
        opposed = "alivocab:created"
        comment = "#"
        print "REGISTERING [{}] AS IMPORTED".format(linkset_uri)

    query = PREFIX + """
    INSERT
    {{
        GRAPH <{0}>
        {{
            {1} {2} <{3}> .
            {4}{1} prov:wasDerivedFrom <{3}> .
        }}
    }}
    WHERE
    {{
        GRAPH <{0}>
        {{
            FILTER NOT EXISTS {{ {1} {5} <{3}> . }}
        }}
        ### BIND(iri("{1}") AS ?aligns)
    }}
    """.format(question_uri, alignment_mapping_uri, created, linkset_uri, comment, opposed)

    # print query
    return query
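
# A hypothetical usage sketch (URIs invented; PREFIX is assumed to be the
# module's shared SPARQL prefix block). With is_created=True the linkset is
# registered under alivocab:created; with is_created=False under prov:used.
# The returned INSERT query is then fired against the triple store by the caller:
#
# query = linkset_createdorused(
#     "http://example.org/researchQ/Q1",                 # question_uri
#     "http://example.org/alignmentMapping/M1",          # alignment_mapping_uri
#     {St.linkset: "http://example.org/linkset/L1"})     # specs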
def target_datatype_properties(model, label, linkset_label):

    main_tabs = "\t\t\t"
    tabs = "{}\t\t\t\t\t\t\t\t\t\t\t\t".format(main_tabs)

    # ALIGNMENT COMBINATION: LIST OF DICTIONARIES
    alignment_targets = ""
    property_list_bind = ""
    count = 0

    for item in model:

        count += 1
        target = item[St.graph]
        data = item[St.data]

        # LIST OF DICTIONARIES
        for n in range(0, len(data)):

            code = "llTarget:{}_{}".format(label, Ut.hash_it(target + str(data[n])))
            datatype = data[n][St.entity_datatype]
            properties = data[n][St.properties]
            property_list = ""

            # LIST OF PROPERTIES
            for i in range(0, len(properties)):

                i_property = properties[i] if Ut.is_nt_format(properties[i]) \
                    else "<{}>".format(properties[i])
                property_list += "?property_{}_{}_{} ".format(count, n, i) if i == 0 \
                    else ",\n{}?property_{}_{}_{} ".format(tabs, count, n, i)

                if i == 0 and count == 1:
                    property_list_bind += """BIND( IRI("{}") AS ?property_{}_{}_{})""".format(
                        i_property, count, n, i)
                else:
                    property_list_bind += """\n{}BIND( IRI("{}") AS ?property_{}_{}_{})""".format(
                        main_tabs, i_property, count, n, i)

            triples = """
{5}linkset:{4} ll:hasAlignmentTarget {0} .
{5}{0} ll:hasTarget <{1}> .
{5}{0} ll:hasDatatype <{2}> .
{5}{0} ll:aligns {3}.
""".format(code, target, datatype, property_list, linkset_label, main_tabs)
            # print triples
            alignment_targets += triples

    return {"list": alignment_targets, "binds": property_list_bind}
def check_constraint():

    # NOTE: THIS HELPER READS constraint_text, constraint_targets, resources AND
    # linkset FROM THE ENCLOSING SCOPE (SEE ITS NESTED TWIN IN cluster_d_test BELOW)
    text = constraint_text.lower()
    text = text.split(",")

    # CONSTRAINT BUILDER
    c_builder = Buffer.StringIO()
    if constraint_targets is not None:
        for dictionary in constraint_targets:

            graph = dictionary[St.graph]
            data_list = dictionary[St.data]
            properties = data_list[0][St.properties]
            prop = properties[0] if Ut.is_nt_format(properties[0]) else "<{}>".format(properties[0])

            # WRITING THE CONSTRAINT ON THE GRAPH
            graph_q = """
    {{ GRAPH <{0}>
        {{ ?lookup {1} ?constraint . }}
    }}
    """.format(graph, prop)
            c_builder.write(graph_q) if len(c_builder.getvalue()) == 0 \
                else c_builder.write("UNION {}".format(graph_q))

    # WRITING THE FILTER
    if len(c_builder.getvalue()) > 0:
        for i in range(0, len(text)):
            if i == 0:
                c_builder.write("""FILTER (LCASE(STR(?constraint)) = "{}" """.format(text[i].strip()))
            else:
                c_builder.write("""|| LCASE(STR(?constraint)) = "{}" """.format(text[i].strip()))
        c_builder.write(")")

    # THE RESULT OF THE QUERY ABOUT THE LINKED RESOURCES
    query = Qry.cluster_rsc_strengths_query(resources, linkset)
    query = query.replace("# CONSTRAINTS IF ANY", c_builder.getvalue())
    # print query
    response = Qry.sparql_xml_to_matrix(query)

    if response[St.result] is None:
        return False
    return True
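
# For illustration: with two hypothetical constraint targets and
# constraint_text = "Amsterdam, Rotterdam", the builder above yields one graph
# pattern per target joined by UNION, followed by
#
#     FILTER (LCASE(STR(?constraint)) = "amsterdam" || LCASE(STR(?constraint)) = "rotterdam")
#
# which replaces the "# CONSTRAINTS IF ANY" placeholder in the query returned
# by Qry.cluster_rsc_strengths_query.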
def export_flat_alignment_and_metadata(alignment):

    flat = export_flat_alignment(alignment)

    alignment = str(alignment).strip()
    row_alignment = alignment
    alignment = alignment if Ut.is_nt_format(alignment) is True else "<{}>".format(alignment)

    # CONSTRUCT QUERY
    query = """
    PREFIX ll: <{0}>
    PREFIX linkset: <{1}>
    PREFIX lens: <{2}>
    PREFIX singletons: <{3}>

    CONSTRUCT {{ ?alignment ?pred ?obj .
                 ?obj ?predicate ?object . }}
    WHERE
    {{
        BIND ({4} AS ?alignment)

        # THE METADATA
        ?alignment ?pred ?obj .
        OPTIONAL {{ ?obj ?predicate ?object . }}
    }} #LIMIT 10
    """.format(Ns.alivocab, Ns.linkset, Ns.lens, Ns.singletons, alignment)
    # print query

    # FIRE THE CONSTRUCT AGAINST THE TRIPLE STORE
    alignment_construct = Qry.endpointconstruct(query, clean=False)

    # REMOVE EMPTY LINES
    triples = flat["triples"]
    # triples = len(re.findall('ll:mySameAs', alignment_construct))
    alignment_construct = "\n".join(
        [line for line in alignment_construct.splitlines() if line.strip()]) + "\n\n" + flat['result']

    result = "### GENERIC METADATA FOR \n### LINKSET: {}\n\n{}".format(alignment, alignment_construct)
    message = "You have just downloaded the graph [{}] which contains [{}] correspondences. ".format(
        row_alignment, triples)

    print result
    return {'result': result, 'message': message}
def export_flat_alignment(alignment):

    print Ut.headings("EXPORTING THE ALIGNMENT WITH NO METADATA")
    print "Export for: {}".format(alignment)

    alignment = str(alignment).strip()
    row_alignment = alignment
    alignment = alignment if Ut.is_nt_format(alignment) is True else "<{}>".format(alignment)

    # CONSTRUCT QUERY
    query = """
    PREFIX ll: <{}>
    CONSTRUCT {{ ?x ll:mySameAs ?z }}
    WHERE
    {{
        GRAPH {} {{ ?x ?y ?z }}
    }} order by ?x
    """.format(Ns.alivocab, alignment)
    # print query

    # FIRE THE CONSTRUCT AGAINST THE TRIPLE STORE
    alignment_construct = Qry.endpointconstruct(query)

    # COUNT THE CORRESPONDENCES AND REMOVE EMPTY LINES.
    # A COMMA IS COUNTED WHENEVER A SUBJECT HAS MORE THAN ONE OBJECT
    triples = len(regex.findall('ll:mySameAs', alignment_construct)) + \
        len(regex.findall(',', alignment_construct))
    alignment_construct = "\n".join(
        [line for line in alignment_construct.splitlines() if line.strip()])
    alignment_construct = alignment_construct.replace("{", "{}\n{{".format(alignment))

    # RESULTS
    result = "### TRIPLE COUNT: {0}\n### LINKSET: {1}\n".format(triples, alignment) + alignment_construct
    message = "You have just downloaded the graph [{}] which contains [{}] correspondences. ".format(
        row_alignment, triples)

    return {'result': result, 'message': message, "triples": triples}
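
# A hypothetical usage sketch (graph URI invented). The returned dictionary
# carries the flat serialisation, a user-facing message and the triple count:
# flat = export_flat_alignment("http://risis.eu/linkset/example_linkset")
# print flat['message']   # "You have just downloaded the graph [...] ..."
# print flat['triples']   # number of correspondences counted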
def get_table(dataset_specs, reducer=None):

    # ADD THE REDUCER IF SET. THE REDUCER (OR DATASET REDUCER) HELPS ELIMINATE
    # THE COMPUTATION OF SIMILARITY FOR INSTANCES THAT WERE ALREADY MATCHED
    print "\nLOADING: {} {}".format(dataset_specs[St.graph], dataset_specs[St.entity_datatype])

    if reducer is None:
        reducer_comment = "#"
        reducer = ""
    else:
        reducer_comment = ""

    aligns = dataset_specs[St.aligns] if Ut.is_nt_format(dataset_specs[St.aligns]) \
        else "<{}>".format(dataset_specs[St.aligns])

    query = """
    SELECT DISTINCT *
    {{
        GRAPH <{0}>
        {{
            ?subject
                a    <{1}> ;
                {2}  ?object .
        }}
        {4}FILTER NOT EXISTS
        {4}{{
        {4}    GRAPH <{3}>
        {4}    {{
        {4}        {{ ?subject ?pred ?obj . }}
        {4}        UNION
        {4}        {{ ?obj ?pred ?subject . }}
        {4}    }}
        {4}}}
    }} {5}
    """.format(dataset_specs[St.graph], dataset_specs[St.entity_datatype],
               aligns, reducer, reducer_comment, LIMIT)

    table_matrix = Qry.sparql_xml_to_matrix(query)
    # Qry.display_matrix(table_matrix, is_activated=True)
    # print table_matrix
    # print query

    if table_matrix[St.result]:
        print "\tINPUT SIZE: {}".format(str(len(table_matrix[St.result]) - 1))

    return table_matrix[St.result]
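
# The reducer relies on a small templating trick used throughout this module:
# a "#" substituted into the template comments whole SPARQL lines out of the
# query. A minimal self-contained sketch of the trick (names invented):
def _comment_toggle_sketch(reducer_graph=None):
    toggle = "#" if reducer_graph is None else ""
    # WITH NO REDUCER GRAPH, EVERY LINE BELOW IS PREFIXED WITH "#" AND IGNORED
    return """
    {0}FILTER NOT EXISTS
    {0}{{ GRAPH <{1}> {{ ?subject ?pred ?obj . }} }}
    """.format(toggle, reducer_graph)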
def linkset_evolution(research_question_uri, refined_linkset_uri):

    # BUILD THE SPECIFICATION
    specs = {
        St.researchQ_URI: research_question_uri.strip(),
        St.linkset: refined_linkset_uri
    }
    # print specs

    # DOCUMENT THE ALIGNMENT
    document = ""
    metadata = linkset_evolution_composition(alignment_mapping=specs)
    # print "METADATA:", metadata

    if metadata:

        # 1: GETTING SUBJECT - OBJECT & MECHANISM
        elements1 = re.findall('\t.*:aligns(.*) "<{0,1}', metadata)
        # elements2 = re.findall('(<.*?>)', metadata, re.S)
        elements2 = re.findall('"(<{0,1}.*?>{0,1})"', metadata, re.S)
        # print "1:", elements1
        # print "2:", elements2

        if len(elements1) == len(elements2) == 3:
            for i in range(3):
                append = " | " if i < 2 else ""
                two = elements2[i] if Ut.is_nt_format(elements2[i]) else "<{}>".format(elements2[i])
                document += "{}={}{}".format(elements1[i], two, append)
            document = "[{}]".format(document)

        # FOLLOW DOWN THE PATH: RECURSIVE CALL ON THE DERIVED LINKSET
        new_link = linkset_wasderivedfrom(refined_linkset_uri)
        new_document = linkset_evolution(research_question_uri, new_link)

        return document + ";\n" + new_document if new_document else document

    # NO EVOLUTION RESULT
    return document
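
# A self-contained illustration of the two regex patterns above (the sample
# line is invented; real input comes from linkset_evolution_composition, and
# the module-level "re" import is assumed):
def _aligns_regex_sketch():
    sample = '\talivocab:alignsSubjects "<http://example.org/dataset/A>"'
    print re.findall('\t.*:aligns(.*) "<{0,1}', sample)    # -> ['Subjects']
    print re.findall('"(<{0,1}.*?>{0,1})"', sample, re.S)  # -> ['<http://example.org/dataset/A>']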
def get_table(dataset_specs, reducer=None):

    # ADD THE REDUCER IF SET
    if reducer is None:
        reducer_comment = "#"
        reducer = ""
    else:
        reducer_comment = ""

    aligns = dataset_specs[St.aligns] if Ut.is_nt_format(dataset_specs[St.aligns]) \
        else "<{}>".format(dataset_specs[St.aligns])

    query = """
    SELECT DISTINCT *
    {{
        GRAPH <{0}>
        {{
            ?subject
                a    <{1}> ;
                {2}  ?object .
        }}
        {4}FILTER NOT EXISTS
        {4}{{
        {4}    GRAPH <{3}>
        {4}    {{
        {4}        {{ ?subject ?pred ?obj . }}
        {4}        UNION
        {4}        {{ ?obj ?pred ?subject . }}
        {4}    }}
        {4}}}
    }} {5}
    """.format(dataset_specs[St.graph], dataset_specs[St.entity_datatype],
               aligns, reducer, reducer_comment, LIMIT)

    table_matrix = Qry.sparql_xml_to_matrix(query)
    # Qry.display_matrix(table_matrix, is_activated=True)
    # print table_matrix
    # print query

    return table_matrix[St.result]
def export_alignment(alignment, limit=5000):

    # COMMENT OUT THE LIMIT IF IT IS EQUAL TO NONE.
    # This function returns all the links + some metadata about the alignment.
    # METADATA: source dataset, target dataset and mechanism

    use = alignment
    alignment = str(alignment).strip()
    row_alignment = alignment
    alignment = alignment if Ut.is_nt_format(alignment) is True else "<{}>".format(alignment)

    src_dataset = None
    trg_dataset = None
    lens_targets = []
    mec_dataset = None
    rdf_type = None

    # GET THE METADATA OF THE ALIGNMENT: THE QUERY
    meta = """
    PREFIX ll: <{0}>
    CONSTRUCT {{ {1} ?y ?z . ?z ?p ?o . }}
    WHERE
    {{
        {1} ?y ?z .
        #?z ?p ?o .
    }} order by ?y
    """.format(Ns.alivocab, alignment)
    # print meta

    # GET THE METADATA OF THE ALIGNMENT: RUN THE QUERY
    meta_construct = Qry.endpointconstruct(meta, clean=False)
    meta_construct = meta_construct.replace("{", "").replace("}", "")
    # print meta_construct

    # LOAD THE METADATA USING RDFLIB
    sg = rdflib.Graph()
    sg.parse(data=meta_construct, format="turtle")

    # EXTRACT FROM THE RESPONSE: THE SOURCE AND TARGET DATASETS AND THE ALIGNMENT
    sbj = rdflib.URIRef(use)
    source = rdflib.URIRef("http://rdfs.org/ns/void#subjectsTarget")
    target = rdflib.URIRef("http://rdfs.org/ns/void#objectsTarget")
    lens_uri_targets = rdflib.URIRef("http://rdfs.org/ns/void#target")
    rdf_uri_type = rdflib.URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")
    mechanism = rdflib.URIRef("http://risis.eu/alignment/predicate/alignsMechanism")

    # EXTRACT THE ALIGNMENT TYPE
    for item in sg.objects(sbj, rdf_uri_type):
        rdf_type = item
    print "TYPE: ", rdf_type

    if str(rdf_type) == Ns.lens_type:

        # EXTRACT THE LENS TARGETS
        for item in sg.objects(sbj, lens_uri_targets):
            lens_targets += [str(item)]
        print "{} TARGETS in {}".format(len(lens_targets), alignment)
        for trg_item in lens_targets:
            print "\t- {}".format(trg_item)

    else:

        # EXTRACT THE SOURCE DATASET
        for item in sg.objects(sbj, source):
            src_dataset = item

        # EXTRACT THE TARGET DATASET
        for item in sg.objects(sbj, target):
            trg_dataset = item

        # EXTRACT THE MECHANISM USED FOR THIS ALIGNMENT
        for item in sg.objects(sbj, mechanism):
            mec_dataset = item

    # CONSTRUCT QUERY FOR EXTRACTING THE CORRESPONDENCES
    comment = "" if limit else "#"
    query = """
    PREFIX ll: <{}>
    CONSTRUCT {{ ?x ?y ?z }}
    WHERE
    {{
        GRAPH {} {{ ?x ?y ?z }}
    }} order by ?x
    {}LIMIT {}
    """.format(Ns.alivocab, alignment, comment, limit)
    # print query

    # FIRE THE CONSTRUCT FOR CORRESPONDENCES AGAINST THE TRIPLE STORE
    alignment_construct = Qry.endpointconstruct(query, clean=False)
    triples = 0
    links = None

    # RESULTS
    if alignment_construct is not None:
        links = "### TRIPLE COUNT: {}\n### LINKSET: {}\n".format(triples, alignment) + alignment_construct
        links = links.replace("{", "").replace("}", "")

    message = "You have just downloaded the graph [{}] which contains [{}] correspondences. ".format(
        row_alignment, triples)

    print "Done with graph: {}".format(alignment)
    return {"type": rdf_type, 'result': links, 'message': message,
            'source': src_dataset, "target": trg_dataset,
            "lens_targets": lens_targets, 'mechanism': mec_dataset}
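
# A hypothetical usage sketch (URI invented). For a lens graph the result
# carries "lens_targets"; for a plain linkset it carries "source", "target"
# and "mechanism":
# exported = export_alignment("http://risis.eu/lens/union_example", limit=1000)
# if str(exported["type"]) == Ns.lens_type:
#     print exported["lens_targets"]
# else:
#     print exported["source"], exported["target"], exported["mechanism"]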
def export_alignment_all(alignment, directory=None, limit=5000):

    directory = os.path.join(directory, "")
    print directory
    if os.path.isdir(os.path.dirname(directory)) is False or os.path.exists(directory) is False:
        print "CREATING THE DIRECTORY"
        os.mkdir(os.path.dirname(directory))

    # COMMENT OUT THE LIMIT IF IT IS EQUAL TO NONE.
    # This function returns all the links + some metadata about the alignment.
    # METADATA: source dataset, target dataset and mechanism

    use = alignment
    alignment = str(alignment).strip()
    row_alignment = alignment
    alignment = alignment if Ut.is_nt_format(alignment) is True else "<{}>".format(alignment)

    # ****************************************************
    # 1. GET THE METADATA OF THE ALIGNMENT: THE QUERY
    # ****************************************************
    meta = """
    PREFIX ll: <{0}>
    CONSTRUCT {{ {1} ?y ?z . ?z ?p ?o . }}
    WHERE
    {{
        {1} ?y ?z .
        OPTIONAL {{ ?z ?p ?o . }}
        OPTIONAL {{ ?O ?Q ?R . }}
    }} order by ?y
    """.format(Ns.alivocab, alignment)
    # print meta

    # GET THE METADATA OF THE ALIGNMENT: RUN THE QUERY
    meta_construct = Qry.endpointconstruct(meta, clean=False)
    meta_construct = meta_construct.replace("{", "").replace("}", "")
    with open(os.path.join(directory, "metadata.ttl"), "wb") as metadata:
        metadata.write(meta_construct)
    # print meta_construct

    # ****************************************************
    # 2. GET THE CORRESPONDENCES OF THE LINKSET
    # ****************************************************

    # CONSTRUCT QUERY FOR EXTRACTING THE CORRESPONDENCES
    comment = "" if limit else "#"
    query = """
    PREFIX ll: <{}>
    CONSTRUCT {{ ?x ?y ?z }}
    WHERE
    {{
        GRAPH {} {{ ?x ?y ?z }}
    }} order by ?x
    {}LIMIT {}
    """.format(Ns.alivocab, alignment, comment, limit)
    # print query

    # FIRE THE CONSTRUCT FOR CORRESPONDENCES AGAINST THE TRIPLE STORE
    alignment_construct = Qry.endpointconstruct(query, clean=False)
    if alignment_construct:
        alignment_construct = alignment_construct.replace("{", "{}\n{{".format(alignment))
        # print alignment_construct
        with open(os.path.join(directory, "linkset.trig"), "wb") as links:
            links.write(alignment_construct)

    # ****************************************************
    # 3. GET THE METADATA OF THE CORRESPONDENCES' PREDICATES
    # ****************************************************
    singleton_graph_uri = Ut.from_alignment2singleton(alignment)
    singleton_query = """
    PREFIX ll: <{0}>
    PREFIX singletons: <{1}>
    CONSTRUCT {{ ?predicate ?x ?y }}
    WHERE
    {{
        {{
            SELECT ?predicate
            {{
                GRAPH {2} {{ ?subject ?predicate ?object }}
            }} order by ?x
            {3}LIMIT {4}
        }}
        GRAPH {5} {{ ?predicate ?x ?y }}
    }}
    """.format(Ns.alivocab, Ns.singletons, alignment, comment, limit, singleton_graph_uri)
    # print singleton_query

    # FIRE THE CONSTRUCT FOR SINGLETONS AGAINST THE TRIPLE STORE
    singleton_construct = Qry.endpointconstruct(singleton_query, clean=False)
    if singleton_construct:
        singleton_construct = singleton_construct.replace("{", "{}\n{{".format(singleton_graph_uri))
        # print singleton_construct
        with open(os.path.join(directory, "singletons.trig"), "wb") as singletons:
            singletons.write(singleton_construct)

    # LOAD THE METADATA USING RDFLIB
    sg = rdflib.Graph()
    sg.parse(data=meta_construct, format="turtle")

    # EXTRACT FROM THE RESPONSE: THE SOURCE AND TARGET DATASETS AND THE ALIGNMENT
    sbj = rdflib.URIRef(use)
    triples_uri = rdflib.URIRef("http://rdfs.org/ns/void#triples")

    # EXTRACT THE TRIPLE COUNT
    triples = ""
    for item in sg.objects(sbj, triples_uri):
        triples = item
    print "TRIPLES: ", triples

    if alignment_construct is not None:
        links = "### TRIPLE COUNT: {}\n### LINKSET: {}\n".format(triples, alignment) + alignment_construct
        links = links.replace("{", "").replace("}", "")

    message = "You have just downloaded the graph [{}] which contains [{}] correspondences. ".format(
        row_alignment, triples)

    host = Svr.settings[St.stardog_host_name]
    endpoint = b"http://{}/annex/{}/sparql/query?".format(host, Svr.settings[St.database])

    local_name = Ut.get_uri_local_name_plus(alignment)
    file_at_parent_directory = os.path.join(
        os.path.abspath(os.path.join(directory, os.pardir)), "{}.zip".format(local_name))
    zipped_file = Ut.zip_folder(directory, output_file_path=file_at_parent_directory)
    print "\t>>> THE ZIPPED FILE IS LOCATED AT:\n\t\t- {}".format(zipped_file)

    print "Done with graph: {}".format(alignment)
    # return {'result': {
    #     "generic_metadata": meta_construct,
    #     'specific_metadata': singleton_construct,
    #     'data': alignment_construct}, 'message': message}
    return {'result': zipped_file, 'message': message}
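
# A hypothetical usage sketch (URI and directory invented). The call writes
# metadata.ttl, linkset.trig and singletons.trig into the directory, zips the
# folder next to it and returns the zip path under 'result':
# out = export_alignment_all("http://risis.eu/linkset/example_linkset",
#                            directory="/tmp/export/example_linkset", limit=None)
# print out['result']   # e.g. /tmp/export/example_linkset.zip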
def cluster_d_test(linkset, network_size=3, network_size_max=3, targets=None,
                   constraint_targets=None, constraint_text="", directory=None,
                   greater_equal=True, print_it=False, limit=None, only_good=False, activated=False):

    # FOR CONSTRAINTS TO WORK, constraint_targets SHOULD NOT BE NONE
    network = []
    print "\nLINK NETWORK INVESTIGATION"

    if activated is False:
        print "\tTHE FUNCTION IS NOT ACTIVATED"
        return ""
    elif network_size > network_size_max and greater_equal is False:
        print "\t[network_size] SHOULD BE SMALLER THAN [network_size_max]"
        return ""

    date = datetime.date.isoformat(datetime.date.today()).replace('-', '')
    linkset_name = Ut.get_uri_local_name(linkset)
    linkset = linkset.strip()

    if network_size_max - network_size == 0:
        greater_equal = False
    check = False

    # RUN THE CLUSTER
    clusters_0 = Cls.links_clustering(linkset, limit)

    if greater_equal is True:
        temp_size = 0
        for cluster, cluster_val in clusters_0.items():
            new_size = len(list(cluster_val["nodes"]))
            if new_size > temp_size:
                temp_size = new_size
        network_size_max = temp_size
        print "THE BIGGEST NETWORK'S SIZE: {}".format(network_size_max)

    def check_constraint():

        text = constraint_text.lower()
        text = text.split(",")

        # CONSTRAINT BUILDER
        c_builder = Buffer.StringIO()
        if constraint_targets is not None:
            for dictionary in constraint_targets:

                graph = dictionary[St.graph]
                data_list = dictionary[St.data]
                properties = data_list[0][St.properties]
                prop = properties[0] if Ut.is_nt_format(properties[0]) else "<{}>".format(properties[0])

                # WRITING THE CONSTRAINT ON THE GRAPH
                graph_q = """
        {{ GRAPH <{0}>
            {{ ?lookup {1} ?constraint . }}
        }}
        """.format(graph, prop)
                c_builder.write(graph_q) if len(c_builder.getvalue()) == 0 \
                    else c_builder.write("UNION {}".format(graph_q))

        # WRITING THE FILTER
        if len(c_builder.getvalue()) > 0:
            for i in range(0, len(text)):
                if i == 0:
                    c_builder.write("""FILTER (LCASE(STR(?constraint)) = "{}" """.format(text[i].strip()))
                else:
                    c_builder.write("""|| LCASE(STR(?constraint)) = "{}" """.format(text[i].strip()))
            c_builder.write(")")

        # THE RESULT OF THE QUERY ABOUT THE LINKED RESOURCES
        query = Qry.cluster_rsc_strengths_query(resources, linkset)
        query = query.replace("# CONSTRAINTS IF ANY", c_builder.getvalue())
        # print query
        response = Qry.sparql_xml_to_matrix(query)

        if response[St.result] is None:
            return False
        return True

    for index in range(network_size, network_size_max + 1):

        count_1 = 0
        count_2 = 0
        curr_network_size = index
        print "\nCLUSTERS OF SIZE {}".format(index)
        sheet_builder = Buffer.StringIO()
        analysis_builder = Buffer.StringIO()
        sheet_builder.write("Count ID STRUCTURE E-STRUCTURE-SIZE A. NETWORK QUALITY"
                            " M. NETWORK QUALITY REFERENCE\n")

        for cluster, cluster_val in clusters_0.items():

            # network = []
            resources = ""
            uri_size = 0
            count_1 += 1
            children = list(cluster_val["nodes"])
            strengths = cluster_val["strengths"]
            cluster_size = len(children)
            # if "<http://www.grid.ac/institutes/grid.10493.3f>" not in children:
            #     continue
            check = cluster_size >= curr_network_size if greater_equal else cluster_size == curr_network_size

            # NETWORK OF A PARTICULAR SIZE
            if check:

                # file_name = i_cluster[0]

                # 2: FETCHING THE CORRESPONDENTS
                smallest_hash = float('inf')
                child_list = ""
                for child in children:

                    # CREATE THE HASHED ID AS THE CLUSTER NAME
                    hashed = hash(child)
                    if hashed <= smallest_hash:
                        smallest_hash = hashed

                    # GENERAL INFO 1: RESOURCES INVOLVED
                    child_list += "\t{}\n".format(child)

                    # LIST OF RESOURCES IN THE CLUSTER
                    use = "<{}>".format(child) if Ut.is_nt_format(child) is not True else child
                    resources += "\n\t\t\t\t{}".format(use)
                    if len(child) > uri_size:
                        uri_size = len(child)

                # MAKE SURE THE FILE NAME OF THE CLUSTER IS ALWAYS THE SAME
                file_name = "{}".format(str(smallest_hash).replace("-", "N")) \
                    if str(smallest_hash).startswith("-") else "P{}".format(smallest_hash)

                if constraint_targets is not None and check_constraint() is False:
                    continue
                count_2 += 1

                # THE RESULT OF THE QUERY ABOUT THE LINKED RESOURCES
                query = Qry.cluster_rsc_strengths_query(resources, linkset)
                response = Qry.sparql_xml_to_matrix(query)

                # GENERAL INFO 2:
                info = "SIZE    {} \nCLUSTER {} \nNAME    {}\n".format(cluster_size, count_1, file_name)
                info2 = "CLUSTER [{}] NAME [{}] SIZE [{}]".format(count_1, file_name, cluster_size)
                analysis_builder.write("{}\n".format(info))
                analysis_builder.write("RESOURCES INVOLVED\n")
                analysis_builder.write(child_list)
                analysis_builder.write("\nCORRESPONDENT FOUND ")
                analysis_builder.write(
                    Qry.display_matrix(response, spacing=uri_size, output=True, line_feed='.', is_activated=True))

                # INFO TYPE 3: PROPERTY-VALUES OF THE RESOURCES INVOLVED
                analysis_builder.write("\n\nDISAMBIGUATION HELPER ")
                if targets is None:
                    analysis_builder.write(Cls.disambiguate_network(linkset, children))
                else:
                    report = Cls.disambiguate_network_2(children, targets)
                    if report is not None:
                        analysis_builder.write(report)

                # GENERATING THE NETWORK AS A LIST OF TUPLES,
                # WHERE EACH TUPLE REPRESENTS TWO RESOURCES IN A RELATIONSHIP :-)
                network = []
                link_count = 0
                for link in cluster_val["links"]:
                    link_count += 1
                    name_1 = "{}-{}".format(Ut.hash_it(link[0]), Ut.get_uri_local_name(link[0]))
                    name_2 = "{}-{}".format(Ut.hash_it(link[1]), Ut.get_uri_local_name(link[1]))
                    network += [(name_1, name_2)]

                if print_it:
                    print ""
                    print analysis_builder.getvalue()

                # SETTING THE DIRECTORY
                if directory:

                    # GET THE AUTOMATED FLAG
                    if network:
                        automated_decision = metric(network)["AUTOMATED_DECISION"]
                        if only_good is True and automated_decision.startswith("GOOD") is not True:
                            count_2 -= 1
                            continue
                        print "{:>5} {}".format(count_2, info2)
                        eval_sheet(targets, count_2, "{}_{}".format(cluster_size, file_name),
                                   sheet_builder, linkset, children, automated_decision)
                    else:
                        print network

                    # linkset_name = Ut.get_uri_local_name(linkset)
                    # date = datetime.date.isoformat(datetime.date.today()).replace('-', '')
                    temp_directory = "{}{}".format(directory, "\{}_Analysis_{}\{}\{}_{}\\".format(
                        curr_network_size, date, linkset_name, cluster_size, file_name))
                    if not os.path.exists(temp_directory):
                        os.makedirs(temp_directory)

                    # ************************* PLOTTING *************************
                    # FIRE THE DRAWING. SUPPORTED FORMATS: eps, pdf, pgf, png, ps, raw, rgba, svg, svgz
                    analysis_builder.write(
                        draw_graph(graph=network,
                                   file_path="{}{}.{}".format(
                                       temp_directory, "cluster_{}".format(file_name), "pdf"),
                                   show_image=False))

                    # ************************* WRITING TO DISC *************************
                    Ut.write_2_disc(file_directory=temp_directory, file_name="cluster_{}".format(file_name),
                                    data=analysis_builder.getvalue(), extension="txt")
                    analysis_builder = Buffer.StringIO()

            if directory:

                # if len(sheet_builder.getvalue()) > 150 and count_2 == 2:
                if len(sheet_builder.getvalue()) > 150 and len(clusters_0) == count_1:

                    tmp_directory = "{}{}".format(directory, "\{}_Analysis_{}\{}\\".format(
                        curr_network_size, date, linkset_name))

                    # ************************* WRITING CLUSTER SHEET TO DISC *************************
                    print "\n\tWRITING CLUSTER SHEET AT\n\t{}".format(tmp_directory)
                    Ut.write_2_disc(file_directory=tmp_directory, file_name="{}_ClusterSheet".format(cluster_size),
                                    data=sheet_builder.getvalue(), extension="txt")

            # if count_2 == 2:
            #     break

        if greater_equal is True:
            # NO NEED TO CONTINUE: ALL NETWORKS OF SIZE GREATER THAN OR EQUAL
            # TO THE network_size INPUT WERE ALREADY PROCESSED IN THIS PASS
            break

        print "\t>>> FOUND: {} CLUSTERS OF SIZE {}".format(count_2, curr_network_size)

    if directory is None:
        return "{}\t{}".format(curr_network_size, count_2)
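
# A hypothetical usage sketch (URI and directory invented): investigate all
# clusters of size 4 and above and keep only networks whose automated flag
# starts with GOOD. With directory=None the function instead returns a
# "<size>\t<count>" summary string:
# cluster_d_test("http://risis.eu/linkset/example_linkset",
#                network_size=4, network_size_max=10, greater_equal=True,
#                directory="C:\\analysis", only_good=True, activated=True)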
def cluster_d_test_stats(linkset, network_size=3, targets=None, directory=None,
                         greater_equal=True, print_it=False, limit=None, activated=False):

    network = []
    print "LINK NETWORK INVESTIGATION"

    if activated is False:
        print "\tTHE FUNCTION IS NOT ACTIVATED"
        return ""

    date = datetime.date.isoformat(datetime.date.today()).replace('-', '')
    linkset_name = Ut.get_uri_local_name(linkset)
    count_1 = 0
    count_2 = 0
    sheet_builder = Buffer.StringIO()
    analysis_builder = Buffer.StringIO()
    sheet_builder.write("Count ID STRUCTURE E-STRUCTURE-SIZE A. NETWORK QUALITY"
                        " M. NETWORK QUALITY REFERENCE\n")
    linkset = linkset.strip()
    check = False

    # RUN THE CLUSTER
    clusters_0 = Cls.links_clustering(linkset, limit)

    for cluster, cluster_val in clusters_0.items():

        # network = []
        resources = ""
        uri_size = 0
        count_1 += 1
        children = list(cluster_val["nodes"])
        strengths = cluster_val["strengths"]
        cluster_size = len(children)
        # if "<http://www.grid.ac/institutes/grid.10493.3f>" not in children:
        #     continue
        check = cluster_size >= network_size if greater_equal else cluster_size == network_size

        # NETWORK OF A PARTICULAR SIZE
        if check:

            count_2 += 1
            # file_name = i_cluster[0]

            # 2: FETCHING THE CORRESPONDENTS
            smallest_hash = float('inf')
            child_list = ""
            for child in children:

                hashed = hash(child)
                if hashed <= smallest_hash:
                    smallest_hash = hashed

                # GENERAL INFO 1: RESOURCES INVOLVED
                child_list += "\t{}\n".format(child)
                use = "<{}>".format(child) if Ut.is_nt_format(child) is not True else child
                resources += "\n\t\t\t\t{}".format(use)
                if len(child) > uri_size:
                    uri_size = len(child)

            if directory:

                # MAKE SURE THE FILE NAME OF THE CLUSTER IS ALWAYS THE SAME
                file_name = "{}".format(str(smallest_hash).replace("-", "N")) \
                    if str(smallest_hash).startswith("-") else "P{}".format(smallest_hash)

                # THE RESULT OF THE QUERY ABOUT THE LINKED RESOURCES
                query = Qry.cluster_rsc_strengths_query(resources, linkset)
                response = Qry.sparql_xml_to_matrix(query)

                # GENERAL INFO 2:
                info = "SIZE    {} \nCLUSTER {} \nNAME    {}\n".format(cluster_size, count_1, file_name)
                info2 = "CLUSTER [{}] NAME [{}] SIZE [{}]".format(count_1, file_name, cluster_size)
                analysis_builder.write("{}\n".format(info))
                print "{:>5} {}".format(count_2, info2)

                analysis_builder.write("RESOURCES INVOLVED\n")
                analysis_builder.write(child_list)
                analysis_builder.write("\nCORRESPONDENT FOUND ")
                analysis_builder.write(
                    Qry.display_matrix(response, spacing=uri_size, output=True, line_feed='.', is_activated=True))

                # INFO TYPE 3: PROPERTY-VALUES OF THE RESOURCES INVOLVED
                analysis_builder.write("\n\nDISAMBIGUATION HELPER ")
                if targets is None:
                    analysis_builder.write(Cls.disambiguate_network(linkset, children))
                else:
                    analysis_builder.write(Cls.disambiguate_network_2(children, targets))

            # GENERATING THE NETWORK AS A LIST OF TUPLES,
            # WHERE EACH TUPLE REPRESENTS TWO RESOURCES IN A RELATIONSHIP :-)
            network = []
            link_count = 0
            for link in cluster_val["links"]:
                link_count += 1
                name_1 = "{}".format(Ut.get_uri_local_name(link[0]))
                name_2 = "{}".format(Ut.get_uri_local_name(link[1]))
                network += [(name_1, name_2)]

            if print_it:
                print ""
                print analysis_builder.getvalue()

            # SETTING THE DIRECTORY
            if directory:

                # linkset_name = Ut.get_uri_local_name(linkset)
                # date = datetime.date.isoformat(datetime.date.today()).replace('-', '')
                temp_directory = "{}{}".format(directory, "\{}_Analysis_{}\{}\{}_{}\\".format(
                    network_size, date, linkset_name, cluster_size, file_name))
                if not os.path.exists(temp_directory):
                    os.makedirs(temp_directory)

                # ************************* PLOTTING *************************
                # FIRE THE DRAWING. SUPPORTED FORMATS: eps, pdf, pgf, png, ps, raw, rgba, svg, svgz
                analysis_builder.write(
                    draw_graph(graph=network,
                               file_path="{}{}.{}".format(temp_directory, "cluster_{}".format(file_name), "pdf"),
                               show_image=False))

                # ************************* WRITING TO DISC *************************
                Ut.write_2_disc(file_directory=temp_directory, file_name="cluster_{}".format(file_name),
                                data=analysis_builder.getvalue(), extension="txt")
                analysis_builder = Buffer.StringIO()

                if network:
                    automated_decision = metric(network)["AUTOMATED_DECISION"]
                    eval_sheet(targets, count_2, "{}_{}".format(cluster_size, file_name),
                               sheet_builder, linkset, children, automated_decision)
                else:
                    print network

        if directory:

            # if len(sheet_builder.getvalue()) > 150 and count_2 == 2:
            if len(sheet_builder.getvalue()) > 150 and len(clusters_0) == count_1:

                tmp_directory = "{}{}".format(directory, "\{}_Analysis_{}\{}\\".format(
                    network_size, date, linkset_name))

                # ************************* WRITING CLUSTER SHEET TO DISC *************************
                print "\nWRITING CLUSTER SHEET AT\n\t{}".format(tmp_directory)
                Ut.write_2_disc(file_directory=tmp_directory, file_name="{}_ClusterSheet".format(cluster_size),
                                data=sheet_builder.getvalue(), extension="txt")

        # if count_2 == 2:
        #     break

    print ">>> FOUND: {}".format(count_2)
    if directory is None:
        return "{}\t{}".format(network_size, count_2)
def cluster_d_test_statss(linkset, network_size=3, targets=None, directory=None,
                          greater_equal=True, print_it=False, limit=None, activated=False):

    network = []
    print "LINK NETWORK INVESTIGATION"

    if activated is False:
        print "\tTHE FUNCTION IS NOT ACTIVATED"
        return ""

    date = datetime.date.isoformat(datetime.date.today()).replace('-', '')
    linkset_name = Ut.get_uri_local_name(linkset)
    count_1 = 0
    count_2 = 0
    sheet_builder = Buffer.StringIO()
    analysis_builder = Buffer.StringIO()
    sheet_builder.write("Count ID STRUCTURE E-STRUCTURE-SIZE A. NETWORK QUALITY"
                        " M. NETWORK QUALITY REFERENCE\n")
    linkset = linkset.strip()
    check = False

    # RUN THE CLUSTER
    clusters_0 = Cls.links_clustering(linkset, limit)

    for i_cluster in clusters_0.items():

        # network = []
        resources = ""
        uri_size = 0
        count_1 += 1
        children = i_cluster[1][St.children]
        cluster_size = len(children)
        # if "<http://www.grid.ac/institutes/grid.10493.3f>" not in children:
        #     continue
        check = cluster_size >= network_size if greater_equal else cluster_size == network_size

        # NETWORK OF A PARTICULAR SIZE
        if check:

            count_2 += 1
            # file_name = i_cluster[0]

            # 2: FETCHING THE CORRESPONDENTS
            smallest_hash = float('inf')
            child_list = ""
            for child in children:

                hashed = hash(child)
                if hashed <= smallest_hash:
                    smallest_hash = hashed

                # GENERAL INFO 1: RESOURCES INVOLVED
                child_list += "\t{}\n".format(child)
                use = "<{}>".format(child) if Ut.is_nt_format(child) is not True else child
                resources += "\n\t\t\t\t{}".format(use)
                if len(child) > uri_size:
                    uri_size = len(child)

            if directory:

                # MAKE SURE THE FILE NAME OF THE CLUSTER IS ALWAYS THE SAME
                file_name = "{}".format(str(smallest_hash).replace("-", "N")) \
                    if str(smallest_hash).startswith("-") else "P{}".format(smallest_hash)

                # QUERY FOR FETCHING ALL LINKED RESOURCES FROM THE LINKSET
                query = """
    PREFIX prov: <{3}>
    PREFIX ll: <{4}>
    SELECT DISTINCT ?lookup ?object ?Strength ?Evidence
    {{
        VALUES ?lookup {{ {0} }}

        {{ GRAPH <{1}> {{ ?lookup ?predicate ?object . }} }}
        UNION
        {{ GRAPH <{1}> {{ ?object ?predicate ?lookup . }} }}

        GRAPH <{2}>
        {{
            ?predicate prov:wasDerivedFrom ?DerivedFrom .
            OPTIONAL {{ ?DerivedFrom ll:hasStrength ?Strength . }}
            OPTIONAL {{ ?DerivedFrom ll:hasEvidence ?Evidence . }}
        }}
    }}
    """.format(resources, linkset, linkset.replace("lens", "singletons"), Ns.prov, Ns.alivocab)
                # print query

                # THE RESULT OF THE QUERY ABOUT THE LINKED RESOURCES
                response = Qry.sparql_xml_to_matrix(query)

                # A DICTIONARY WITH KEY: (SUBJECT, OBJECT) AND VALUE: STRENGTH
                response_dic = dict()
                result = response[St.result]
                if result:
                    for i in range(1, len(result)):
                        key = (result[i][0], result[i][1])
                        if key not in response_dic:
                            response_dic[key] = result[i][2]
                # print response_dic

                # GENERAL INFO 2:
                info = "SIZE    {} \nCLUSTER {} \nNAME    {}\n".format(cluster_size, count_1, file_name)
                info2 = "CLUSTER [{}] NAME [{}] SIZE [{}]".format(count_1, file_name, cluster_size)
                analysis_builder.write("{}\n".format(info))
                print "{:>5} {}".format(count_2, info2)

                analysis_builder.write("RESOURCES INVOLVED\n")
                analysis_builder.write(child_list)
                analysis_builder.write("\nCORRESPONDENT FOUND ")
                analysis_builder.write(
                    Qry.display_matrix(response, spacing=uri_size, output=True, line_feed='.', is_activated=True))

                # INFO TYPE 3: PROPERTY-VALUES OF THE RESOURCES INVOLVED
                analysis_builder.write("\n\nDISAMBIGUATION HELPER ")
                if targets is None:
                    analysis_builder.write(Cls.disambiguate_network(linkset, children))
                else:
                    analysis_builder.write(Cls.disambiguate_network_2(children, targets))

                position = i_cluster[1][St.row]
                if St.annotate in i_cluster[1]:
                    analysis_builder.write("\n\nANNOTATED CLUSTER PROCESS")
                    analysis_builder.write(i_cluster[1][St.annotate])

                # THE CLUSTER
                # print "POSITION: {}".format(position)
                # print "\nMATRIX DISPLAY\n"
                # for i in range(0, position):
                #     resource = (i_cluster[1][St.matrix])[i]
                #     print "\t{}".format(resource[:position])
                #     print "\t{}".format(resource)

                # GENERATING THE NETWORK AS A LIST OF TUPLES,
                # WHERE EACH TUPLE REPRESENTS TWO RESOURCES IN A RELATIONSHIP :-)
                network = []
                for i in range(1, position):
                    for j in range(1, position):
                        if (i, j) in (i_cluster[1][St.matrix_d]) and (i_cluster[1][St.matrix_d])[(i, j)] != 0:
                            r = (i_cluster[1][St.matrix_d])[(i, 0)]
                            c = (i_cluster[1][St.matrix_d])[(0, j)]
                            r_name = "{}:{}".format(i, Ut.get_uri_local_name(r))
                            c_name = "{}:{}".format(j, Ut.get_uri_local_name(c))
                            network += [(r_name, c_name)]
                            # network += [(r_smart, c_smart)]
                # print "\tNETWORK", network

            if print_it:
                print ""
                print analysis_builder.getvalue()

            # SETTING THE DIRECTORY
            if directory:

                # linkset_name = Ut.get_uri_local_name(linkset)
                # date = datetime.date.isoformat(datetime.date.today()).replace('-', '')
                temp_directory = "{}{}".format(directory, "\{}_Analysis_{}\{}\{}_{}\\".format(
                    network_size, date, linkset_name, cluster_size, file_name))
                if not os.path.exists(temp_directory):
                    os.makedirs(temp_directory)

                # ************************* PLOTTING *************************
                # FIRE THE DRAWING. SUPPORTED FORMATS: eps, pdf, pgf, png, ps, raw, rgba, svg, svgz
                analysis_builder.write(
                    draw_graph(graph=network,
                               file_path="{}{}.{}".format(temp_directory, "cluster_{}".format(file_name), "pdf"),
                               show_image=False))

                # ************************* WRITING TO DISC *************************
                Ut.write_2_disc(file_directory=temp_directory, file_name="cluster_{}".format(file_name),
                                data=analysis_builder.getvalue(), extension="txt")
                analysis_builder = Buffer.StringIO()

            if directory:

                if network:
                    automated_decision = metric(network)["AUTOMATED_DECISION"]
                    eval_sheet(targets, count_2, "{}_{}".format(cluster_size, file_name),
                               sheet_builder, linkset, children, automated_decision)
                else:
                    print network

        if directory:

            # if len(sheet_builder.getvalue()) > 150 and count_2 == 2:
            if len(sheet_builder.getvalue()) > 150 and len(clusters_0) == count_1:

                tmp_directory = "{}{}".format(directory, "\{}_Analysis_{}\{}\\".format(
                    network_size, date, linkset_name))

                # ************************* WRITING CLUSTER SHEET TO DISC *************************
                print "\nWRITING CLUSTER SHEET AT\n\t{}".format(tmp_directory)
                Ut.write_2_disc(file_directory=tmp_directory, file_name="{}_ClusterSheet".format(cluster_size),
                                data=sheet_builder.getvalue(), extension="txt")

        # if count_2 == 2:
        #     break

    print ">>> FOUND: {}".format(count_2)
    if directory is None:
        return "{}\t{}".format(network_size, count_2)
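
# All three cluster functions above derive a stable file name from the
# smallest Python hash among the cluster's resources. A self-contained sketch
# of that naming scheme:
def _cluster_file_name_sketch(children):
    smallest_hash = min(hash(child) for child in children)
    # NEGATIVE HASHES ARE PREFIXED WITH "N", POSITIVE ONES WITH "P"
    return str(smallest_hash).replace("-", "N") if str(smallest_hash).startswith("-") \
        else "P{}".format(smallest_hash)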