def align_ntriples(ntriples_file_to_align, alignment_file,
                   predicates_to_align=[rdfs_label],
                   alignment_uri="http://example.org/aligned",
                   number_of_words=5):
    """Align subjects of one ntriples file against another via text fragments.

    Fragments the literal values of ``predicates_to_align`` in
    ``ntriples_file_to_align`` into word n-grams (up to ``number_of_words``
    words), matches each fragment against the objects in ``alignment_file``,
    and writes every match as a triple
    ``<subject> <alignment_uri> <matched subject> .`` to a new file named
    ``ntriples_file_to_align + ".alignment.nt"``.

    Fixes vs. original: fragment files and all other handles are now closed
    (``faf`` previously leaked once per loop iteration), and the loop variable
    no longer shadows the ``alignment_file`` parameter.

    NOTE: the ``[rdfs_label]`` default list is shared across calls; it is
    never mutated here, so the default is kept for signature compatibility.
    """
    print("Generating file to align into memory")
    triples_to_fragment = pyTripleSimple.SimpleTripleStore()
    with open(ntriples_file_to_align) as ff:
        triples_to_fragment.load_ntriples(ff)

    print("Generating fragments to align")
    fte_obj = FreeTextExpanderTripleStore(triples_to_fragment,
                                          predicates_to_align)
    fte_obj.generate(number_of_words)
    fragment_file_names = fte_obj.write_out_to_ntriples()

    print("Reading alignment file into memory")
    alignment_obj = pyTripleSimple.SimpleTripleStore()
    with open(alignment_file) as fa:
        alignment_obj.load_ntriples(fa)

    aligned_obj = pyTripleSimple.SimpleTripleStore()
    for fragment_file_name in fragment_file_names:
        fragments_obj = pyTripleSimple.SimpleTripleStore()
        print("Loading fragments to align into memory '%s'" % fragment_file_name)
        with open(fragment_file_name) as faf:
            fragments_obj.load_ntriples(faf)
        for fragment in fragments_obj.iterator_triples():
            # Find subjects in the alignment store whose object equals this
            # fragment's literal; the first match wins.
            result = alignment_obj.simple_pattern_match(
                [("s", "p", "o")], [("o", "in", [fragment.object])], ["s"])
            if result:
                aligned_obj.load_ntriples([
                    "<%s> <%s> %s ." % (fragment.subject, alignment_uri,
                                        result[0][0][0])
                ])

    ntriples_aligned_file = ntriples_file_to_align + ".alignment.nt"
    print("Exporting aligned file")
    with open(ntriples_aligned_file, "w") as fo:
        aligned_obj.export_to_ntriples_file(fo)
def main(ntriples_file_name, free_text_predicates=None):
    """Load an ntriples file and write out its free-text index triples.

    Builds a :class:`FreeTextSimpleTripleStore` over the loaded store
    (optionally restricted to ``free_text_predicates``), generates the
    free-text triples, and writes one output file per indexed predicate,
    each prefixed with ``ntriples_file_name + "."``.

    Returns the list of written file names.

    Fix vs. original: the input file handle is now closed (``with``).
    """
    ts = pyTripleSimple.SimpleTripleStore()  #pyTripleSimple.ShelveTripleEngine(ntriples_file_name)
    print('Loading "%s"' % os.path.abspath(ntriples_file_name))
    # time.clock() kept for compatibility with this Python-2-era codebase;
    # timing values currently feed only the commented-out report below.
    start_time = time.clock()
    with open(ntriples_file_name, "r") as f:
        ts.load_ntriples(f)
    end_time = time.clock()
    print("Finished loading ntriples file")
    #print("Number of triples %s loaded in %s seconds (%s triples/second)" % (number_of_triples, end_time - start_time,(number_of_triples * 1.0)/ (end_time - start_time)))

    if free_text_predicates is not None:
        ft = FreeTextSimpleTripleStore(
            ts, predicates_to_index=free_text_predicates)
    else:
        ft = FreeTextSimpleTripleStore(ts)
    ft.generate()
    file_names = ft.write_out_to_ntriples(ntriples_file_name + ".")
    print("Generated free text triples '%s'" % ntriples_file_name)
    for file_name in file_names:
        print("Wrote '%s'" % file_name)
    return file_names
def test_find_triples(self):
    """Exercise find_triples with subject/predicate/object/literal filters.

    Expected counts are tied to the 30-triple fixture in self.test_source.
    """
    ts = pyTripleSimple.SimpleTripleStore()
    ts.load_ntriples(self.test_source)
    # Unknown subject -> empty result set.
    r1 = ts.find_triples(subjects="<http://example.org/resource999>")
    self.assertEquals(set([]),r1,"Should return an empty list")
    r2 = ts.find_triples(subjects="<http://example.org/resource9>")
    self.assertEquals(1,len(r2))
    # NOTE(review): predicate given without <> wrapper still matches all 30
    # triples — presumably find_triples normalizes bare URIs; confirm.
    r3 = ts.find_triples(predicates="http://example.org/property")
    self.assertEquals(30,len(r3))
    r4 = ts.find_triples(objects="<http://example.org/resource2>")
    self.assertEquals(7,len(r4))
    # Literal filter matches by string value.
    r5 = ts.find_triples(literals="chat")
    self.assertEquals(3,len(r5))
    # Multiple subjects; second one deliberately lacks <> (see r3 note).
    r6 = ts.find_triples(subjects=['<http://example.org/resource26>','http://example.org/resource25'])
    self.assertEquals(3,len(r6))
    # Subject list combined with a predicate filter.
    r7 = ts.find_triples(subjects=['<http://example.org/resource26>','http://example.org/resource25'], predicates="<http://example.org/property>")
    self.assertEquals(3,len(r7))
    # Non-existent predicate -> no matches even with valid subjects.
    r8 = ts.find_triples(['<http://example.org/resource26>','http://example.org/resource25'], predicates="<http://example.org/propertyX>")
    self.assertEquals(0,len(r8))
    # Subject + predicate + literal; matches one triple.
    r9 = ts.find_triples("<http://example.org/resource14>", "<http://example.org/property>", literals="x")
    self.assertEquals(1,len(r9))
    # Same value passed as an object (resource) filter matches nothing:
    # "x" is stored as a literal, not a URI object.
    r10 = ts.find_triples("<http://example.org/resource14>", "<http://example.org/property>", objects="x")
    self.assertEquals(0,len(r10))
def test_simple_pattern_match(self):
    """Exercise simple_pattern_match patterns/restrictions against acme.nt.

    Expected counts are tied to the acme.nt fixture (57 triples total,
    14 rdf:type triples over 5 classes).
    """
    ts = pyTripleSimple.SimpleTripleStore()
    f = open("acme.nt","r")
    ts.load_ntriples(f)
    # Single pattern restricted to rdf:type; solution variable "b" (the class).
    # NOTE(review): ("b") is just the string "b", not a 1-tuple — presumably
    # simple_pattern_match accepts either form; confirm.
    r1 = ts.simple_pattern_match([("a","p","b")],[("p","in",["<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>"])],("b"))
    self.assertEquals(5,len(r1))
    # Predicate that does not occur -> empty result.
    r2 = ts.simple_pattern_match([("a","p","b")],[("p","in",["<http://example.org/predicateDoesNotExist>"])],("b"))
    self.assertEquals(0,len(r2))
    # Join across three patterns: links between typed subjects and objects.
    r3 = ts.simple_pattern_match([("a","p","b"),("a","r","ca"),("b","r","cb")],[("r","in",["<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>"])],("p","ca","cb"))
    self.assertEquals(5,len(r3))
    # Same join with p != r, excluding the rdf:type self-links.
    r4 = ts.simple_pattern_match([("a","p","b"),("a","r","ca"),("b","r","cb")],[("p", "!=", "r"),("r","in",["<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>"])],("p","ca","cb"))
    self.assertEquals(2,len(r4))
    # No restrictions -> every triple is a solution.
    r5 = ts.simple_pattern_match([('a','p','b')], [],['a','p','b'])
    self.assertEquals(57,len(r5))
    # "in" / "not in" restrictions must partition the 57 triples.
    r6 = ts.simple_pattern_match([('a','p','b')], [('p','in',['<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>'])],['a','p','b'])
    self.assertEquals(14,len(r6))
    r7 = ts.simple_pattern_match([('a','p','b')], [('p','not in',['<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>'])],['a','p','b'])
    self.assertEquals(57 - 14,len(r7))
    # Same partition property for a literal object value.
    r8 = ts.simple_pattern_match([('a','p','b')], [('b','in',['Hollywood'])],['a','p','b'])
    self.assertEquals(2,len(r8))
    r9 = ts.simple_pattern_match([('a','p','b')], [('b','not in',['Hollywood'])],['a','p','b'])
    self.assertEquals(57 - 2,len(r9))
    # is_literal projection groups results by literal-ness -> two buckets.
    r10 = ts.simple_pattern_match([('s','p','o')],[],[pyTripleSimple.is_literal('o')])
    self.assertEquals(2,len(r10))
def test_EmbeddedQuote(self):
    """Literals containing escaped quotes must survive a load/iterate round trip."""
    source_lines = [
        r'<http://link.informatics.stonybrook.edu/umls/SUI/S7018982> <http://www.w3.org/2000/01/rdf-schema#label> "Instruct parent/caregiver of infant 0-3 months to limit water intake to Z\\\"x oz to 1 oz at a time, 4 oz daily" .',
        r'<http://link.informatics.stonybrook.edu/umls/SUI/S11718919> <http://www.w3.org/2000/01/rdf-schema#label> "Vetscription Worm Away 7 Praziquantel 30 MG \\\" Pyrantel Pamoate 30 MG Chewable Tablet" .',
    ]
    expected_objects = [
        r"Instruct parent/caregiver of infant 0-3 months to limit water intake to Z\\\"x oz to 1 oz at a time, 4 oz daily",
        r"Vetscription Worm Away 7 Praziquantel 30 MG \\\" Pyrantel Pamoate 30 MG Chewable Tablet",
    ]
    store = pyTripleSimple.SimpleTripleStore()
    store.load_ntriples(source_lines)
    loaded = list(store.iterator_triples())
    # Each loaded object literal must match its source byte-for-byte.
    for triple, expected in zip(loaded, expected_objects):
        self.assertEquals(triple.object, expected)
def test_generate_graphml(self):
    """GraphML export should contain key and graph elements and, when
    networkx is installed, parse back with networkx.read_graphml.

    Fixes vs. original: removed a stray second ``f.close()`` on the
    already-closed input handle inside the networkx block, and all file
    handles now use ``with`` so they close even on failure.
    """
    self.ts = pyTripleSimple.SimpleTripleStore()
    with open("acme.nt") as f:
        self.ts.load_ntriples(f)
    egfrsts_obj = pyTripleSimple.ExtractGraphFromSimpleTripleStore(self.ts)
    egfrsts_obj.register_label()
    egfrsts_obj.register_class()
    egfrsts_obj.add_pattern_for_links(
        [['a', 'b', 'c']],
        [('b', 'in', ['<http://acme.com/rdf#isLabeller>'])],
        ("a", "c"), "labeller")
    egfrsts_obj.register_node_predicate(
        "<http://acme.com/rdf#ndc/date_issued>", "date", lambda x: x.upper())
    result_xml = egfrsts_obj.translate_into_graphml_file()

    from xml.etree.ElementTree import XML
    elements = XML(result_xml)
    xml_tags = [element.tag for element in elements]
    self.assertTrue("{http://graphml.graphdrawing.org/xmlns}key" in xml_tags)
    self.assertTrue("{http://graphml.graphdrawing.org/xmlns}graph" in xml_tags)

    # Optional round-trip check: only runs when networkx is available.
    try:
        import networkx
        with open("acme.graphml", "w") as fo:
            fo.write(result_xml)
        networkx.read_graphml("acme.graphml")
    except ImportError:
        pass
def test_TripleIterator(self):
    """Both triple iterators should yield one entry per stored triple."""
    store = pyTripleSimple.SimpleTripleStore()
    store.load_ntriples(self.test_source)
    parsed_triples = [triple for triple in store.iterator_triples()]
    self.assertEquals(30, len(parsed_triples), "Wrong number of triples iterated")
    raw_lines = [line for line in store.iterator_ntriples()]
    self.assertEquals(30, len(raw_lines), "Wrong number of triples iterated")
def __init__(self, triple_simple_store, predicates_to_index=None):
    """Set up a free-text index over selected predicates of a triple store.

    Parameters:
        triple_simple_store: the pyTripleSimple store to index.
        predicates_to_index: predicates whose literal values are indexed;
            defaults to ``[rdfs_label]``.

    Fix vs. original: the default was a module-level mutable list
    (``=[rdfs_label]``) stored directly on the instance, so mutating
    ``self.predicates_to_index`` on one instance would affect every
    instance sharing the default. A ``None`` sentinel builds a fresh
    list per instance; behavior is otherwise unchanged.
    """
    if predicates_to_index is None:
        predicates_to_index = [rdfs_label]
    # Predicate used to emit the generated free-text keyword triples.
    self.predicate_for_word = "http://vivoweb.org/ontology/core#freetextKeyword"
    self.triple_simple_store = triple_simple_store
    self.predicates_to_index = predicates_to_index
    self.lexer = FreeTextLexer()
    # One output store per indexed predicate.
    self.predicates_triple_store = {}
    for predicate_to_index in self.predicates_to_index:
        self.predicates_triple_store[
            predicate_to_index] = pyTripleSimple.SimpleTripleStore()
def main():
    """Command-line entry point: load an ntriples file and either report
    corpus statistics or run a simple triple-pattern query against it.

    Output goes to the file named by -w, or is buffered in memory and
    echoed to stdout at the end.
    """
    parser = OptionParser(usage="usage: %prog [options]", version="%prog 1.0")
    parser.add_option("-f", "--file", action="store", dest="file_name",
                      default=False, help="ntriples file to read in")
    parser.add_option("-c", "--command", action="store", dest="command",
                      default="statistics",
                      help="Supported commands are: 'statistics' and 'query'")
    parser.add_option("-q", "--query", action="store", dest="query",
                      default=False,
                      help="Specify the pattern to match the solution for")
    parser.add_option("-r", "--restrictions", action="store",
                      dest="restrictions", default=None,
                      help="Specify restrictions on the solution")
    parser.add_option("-v", "--variables", action="store", dest="variables",
                      default=False, help="Specify the variables for output")
    parser.add_option("-n", "--limit", action="store", dest="display_n",
                      default="50", help="Limit the number of results")
    parser.add_option(
        "-o", "--output-format", action="store", dest="output_format",
        default="stdout",
        help="Query output format: stdout, json, delimited, ntriples")
    parser.add_option("-w", "--output-file", action="store",
                      dest="output_file_name", default=0,
                      help="Send results to named file")
    parser.add_option("--header", action="store", dest="header", default=1,
                      help="For table output add a header row")
    parser.add_option("--delimiter", action="store", dest="delimiter",
                      default="\t", help="Delimiter to use in table output")
    parser.add_option("--clean", action="store", dest="clean", default=0,
                      help="Strips string and <> uri designations")
    (options, args) = parser.parse_args()

    ts = pyTripleSimple.SimpleTripleStore(
    )  #pyTripleSimple.ShelveTripleEngine(ntriples_file_name)

    # "All" disables the result limit (display_n becomes None, and list
    # slicing with [:None] returns everything); otherwise it is a row count.
    if options.display_n == "All":
        display_n = None
    else:
        display_n = int(options.display_n)

    file_name = options.file_name
    if file_name:
        try:
            f = open(file_name, "r")
        except IOError:
            raise
    # NOTE(review): if -f is omitted, f is unbound and both commands below
    # will raise NameError — presumably -f is effectively required; confirm.

    # Output sink: a named file, or an in-memory buffer echoed at the end.
    if options.output_file_name:
        try:
            fo = open(options.output_file_name, 'w')
        except IOError:
            raise
    else:
        fo = StringIO.StringIO()

    if options.command == "statistics":
        fo.write('Loading "%s"\n' % os.path.abspath(file_name))
        start_time = time.clock()
        ts.load_ntriples(f)
        end_time = time.clock()
        fo.write("Finished loading ntriples file\n")
        number_of_triples = ts.n_triples()
        Nt = number_of_triples
        fo.write(
            "Number of triples %s loaded in %s seconds (%s triples/second)\n"
            % (number_of_triples, end_time - start_time,
               (number_of_triples * 1.0) / (end_time - start_time)))

        # is_literal("o") groups triples into literal / non-literal buckets;
        # the bucket keyed '"1"' holds the literal-object count.
        object_breakdown = ts.simple_pattern_match(
            [("s", "p", "o")], [], [pyTripleSimple.is_literal("o")])
        number_of_literals = 0
        for result in object_breakdown:
            if result[0][0] == '"1"':
                number_of_literals = result[1]
        Nl = number_of_literals
        number_of_objects = number_of_triples - number_of_literals
        No = number_of_objects

        # Count rdf:type triples; a single grouped row carries the total.
        rdf_type_breakdown = ts.simple_pattern_match(
            [("a", "r", "c")],
            [("r", "in",
              ["<" + pyTripleSimple.common_prefixes["rdf"] + "type>"])],
            ["r"])
        if len(rdf_type_breakdown):
            number_of_instances = rdf_type_breakdown[0][1]
        else:
            number_of_instances = 0
        Ni = number_of_instances

        number_of_symbols = ts.n_symbols()
        Ns = number_of_symbols
        number_of_distinct_literals = ts.n_literals()
        Ndl = number_of_distinct_literals
        # Distinct classes: one row per rdf:type object.
        classes_results = ts.simple_pattern_match(
            [("a", "r", "c")],
            [("r", "in",
              ["<" + pyTripleSimple.common_prefixes["rdf"] + "type>"])],
            ["c"])
        number_of_distinct_classes = len(classes_results)
        Ndc = number_of_distinct_classes
        number_of_distinct_objects = ts.n_objects(
        ) - number_of_distinct_literals
        Ndo = number_of_distinct_objects
        number_of_distinct_subjects = ts.n_subjects()
        Nds = number_of_distinct_subjects
        number_of_distinct_predicates = ts.n_predicates()
        Ndp = number_of_distinct_predicates

        # Union of subject and object symbols, then drop quoted literals.
        subject_uris = ts.simple_pattern_match([("s", "p", "o")], [], ["s"])
        object_uris = ts.simple_pattern_match([("s", "p", "o")], [], ["o"])
        subject_objects_literals_uris = ts.union_pattern_match_result_set(
            subject_uris, object_uris)
        # NOTE(review): the trailing `and uresult[0][0][-1]` only tests that
        # the last character is truthy (non-empty) — it looks like it was
        # meant to be `!= '"'`; confirm against the library's symbol format.
        subject_objects_uris = [
            uresult for uresult in subject_objects_literals_uris
            if uresult[0][0][0] != '"' and uresult[0][0][-1]
        ]
        number_of_distinct_uris = len(subject_objects_uris)
        Nu = number_of_distinct_uris
        # NOTE(review): divides by Ni — a store with no rdf:type triples
        # (Ni == 0) would raise ZeroDivisionError here.
        class_coverage = [(class_result[1] * 1.0) / Ni
                          for class_result in classes_results]

        fo.write("\n")
        fo.write("Number of triples (Nt): %s\n" % number_of_triples)
        fo.write("Number of literals (Nl): %s\n" % number_of_literals)
        fo.write("Number of objects (No): %s\n" % number_of_objects)
        fo.write("Number of typed instances (Ni): %s\n" %
                 number_of_instances)
        fo.write("Number of URIs excluding predicates (Nu): %s\n" %
                 number_of_distinct_uris)
        fo.write("Number of distinct classes (Nc): %s\n" %
                 number_of_distinct_classes)
        fo.write("Number of distinct subjects (Nds): %s\n" %
                 number_of_distinct_subjects)
        fo.write("Number of distinct predicates (Ndp): %s\n" %
                 number_of_distinct_predicates)
        fo.write("Number of distinct objects (Ndo): %s\n" %
                 number_of_distinct_objects)
        fo.write("Number of distinct literals (Ndl): %s\n" %
                 number_of_distinct_literals)
        fo.write("Number of distinct lexical symbols (Ndls): %s\n" %
                 number_of_symbols)
        fo.write("\n")
        # Derived ratios.
        fo.write("Literalness (Nl/Nt): %s\n" % ((Nl * 1.0) / Nt))
        if Nl > 0:
            fo.write("Literal uniqueness (Ndl/Nl): %s\n" % ((Ndl * 1.0) / Nl))
        else:
            # NOTE(review): missing trailing \n relative to sibling lines.
            fo.write("Literal uniqueness (Ndl/Nl): undefined")
        fo.write("Object uniqueness (Ndo/No): %s\n" % ((Ndo * 1.0) / No))
        fo.write("Interconnectedness (1 - (Nl+Ni)/Nt): %s\n" %
                 (1.0 - (Nl + Ni) / (Nt * 1.0)))
        fo.write("Subject coverage (Nds/Nu): %s\n" % ((1.0 * Nds) / Nu))
        fo.write("Object coverage (Ndo/Nu): %s\n" % ((1.0 * Ndo) / Nu))
        fo.write("Class coverage: %s\n" % class_coverage)
        #fo.write("Fraction of objects that are literals: %s\n" % ((number_of_distinct_literals * 1.0) / number_of_distinct_objects))
        fo.write("\n")
        fo.write("Top subjects are:\n")
        pprint.pprint(ts.top_subjects(display_n), fo)
        fo.write("\n")
        fo.write("Top objects are:\n")
        pprint.pprint(ts.top_objects(display_n), fo)
        fo.write("\n")
        fo.write("Top predicates are:\n")
        pprint.pprint(ts.top_predicates(None), fo)
        fo.write("\n")
        fo.write("Top classes are:\n")
        pprint.pprint(classes_results, fo)
    elif options.command == "query":
        ts.load_ntriples(f)
        # NOTE(review): eval() on user-supplied command-line strings — fine
        # for a local CLI tool, but never expose this to untrusted input.
        query = eval(options.query)
        if options.restrictions:
            restrictions = eval(options.restrictions)
        else:
            restrictions = []
        if options.variables:
            solution_variables = eval(options.variables)
        else:
            solution_variables = None
        result_set = ts.simple_pattern_match(query, restrictions,
                                             solution_variables)
        # NOTE(review): display_n is None or int here, never "All" (that was
        # options.display_n); the slice still works because [:None] is a
        # full copy — the comparison is dead code.
        if display_n == "All":
            pass
        else:
            result_set = result_set[:display_n]
        if options.output_format == "stdout":
            pprint.pprint(result_set, fo)
            fo.write("Query returned %s results" % len(result_set))
        elif options.output_format == "ntriples":
            # Emit solutions as ntriples; assumes exactly three solution
            # variables per result (subject, predicate, object).
            for result in result_set:
                i = 1
                for solution in result[0]:
                    if i % 3 == 1:
                        fo.write(result[0][0] + " ")
                    elif i % 3 == 2:
                        fo.write(result[0][1] + " ")
                    elif i % 3 == 0:
                        fo.write(result[0][2] + " .\n")
                    i += 1
        elif options.output_format == "json":
            import json
            json.dump(result_set, fo)
        elif options.output_format == "delimited":
            header = options.header
            delimiter = options.delimiter
            string_tab = ""
            # Optional header row: the solution variable names plus "count".
            if header:
                if len(result_set):
                    for solution_variable in solution_variables:
                        fo.write("%s%s" % (solution_variable, delimiter))
                    fo.write("count\n")
                else:
                    pass
            for result in result_set:
                for solution in result[0]:
                    # --clean strips the surrounding <...> or "..." wrapper.
                    if options.clean:
                        if len(solution):
                            solution = solution[1:-1]
                    fo.write("%s%s" % (solution, delimiter))
                fo.write("%s\n" % result[1])

    # In-memory buffer: echo to stdout; a real file was already written to.
    if options.output_file_name:
        pass
    else:
        print(fo.getvalue())
    fo.close()
def test_PyTripleSimpleStore(self):
    """Loading the fixture should produce exactly 30 triples."""
    store = pyTripleSimple.SimpleTripleStore()
    store.load_ntriples(self.test_source)
    self.assertEquals(30, store.n_triples(), "Wrong number of triples extracted")
def main(ntriples_file_name):
    """Build a Gephi .gexf class-relationship graph from an ntriples file.

    Nodes are rdf:type classes (sized by mean-normalized instance count);
    edges are predicates linking typed instances, with parallel edges
    collapsed by accumulating their normalized weights. Writes
    ``ntriples_file_name + ".gexf"`` and prints a per-predicate count table
    to stdout.

    Fixes vs. original: the IOError handler referenced the undefined name
    ``ntriple_file_name`` (NameError while reporting the error); the local
    ``object`` shadowed the builtin; the write-error message had mismatched
    quotes ("File %s''"); the input handle is now closed.
    """
    ts = pyt.SimpleTripleStore()  # Create a triple store object
    try:
        f = open(ntriples_file_name)
    except IOError:
        print("File '%s' could not be read" %
              os.path.abspath(ntriples_file_name))
        raise
    ts.load_ntriples(f)
    f.close()

    rdf_type = "<" + pyt.common_prefixes["rdf"] + 'type>'
    # Get all classes defined, with instance counts.
    classes_result = ts.simple_pattern_match([('a', 't', 'c')],
                                             [('t', 'in', [rdf_type])],
                                             ['c'])
    class_count = len(classes_result)
    class_sizes = [class_result[1] for class_result in classes_result]
    class_mean = (sum(class_sizes) * 1.0) / class_count
    # Normalize the counts by the mean so node sizes are comparable.
    class_count_normalized = [
        class_size / class_mean for class_size in class_sizes
    ]

    # Get all predicate links between typed subjects and typed objects
    # (excluding the rdf:type predicate itself).
    property_class_results = ts.simple_pattern_match(
        [('a', 'p', 'b'), ('a', 't', 'ca'), ('b', 't', 'cb')],
        [('t', 'in', [rdf_type]), ('p', '!=', 't')],
        ['p', 'ca', 'cb'])
    property_class_relations_count = len(property_class_results)
    property_class_relations_sizes = [
        property_class_result[1]
        for property_class_result in property_class_results
    ]
    property_class_mean = (1.0 * sum(property_class_relations_sizes)
                           ) / property_class_relations_count
    property_class_count_normalized = [
        property_class_size / property_class_mean
        for property_class_size in property_class_relations_sizes
    ]

    gexf_string = ""
    gexf = GephiGexf()
    gexf_string += gexf.xml_header()
    gexf_string += gexf.metadata()
    gexf_string += gexf.open_graph()
    gexf_string += gexf.open_nodes()
    # Map class URI (without <>) -> node id.
    class_dict = {}
    for i in range(class_count):  # Create nodes
        class_name = classes_result[i][0][0][1:-1]
        class_dict[class_name] = i
        gexf_string += gexf.open_node(i, class_name,
                                      size=class_count_normalized[i])
        gexf_string += gexf.close_node()
    gexf_string += gexf.close_nodes()

    # Gephi does not support parallel edges: collapse duplicate
    # (subject class, object class) pairs by accumulating weights.
    property_dict_normalized = {}
    for i in range(property_class_relations_count):
        subject_class, object_class = (property_class_results[i][0][1][1:-1],
                                       property_class_results[i][0][2][1:-1])
        subject_id = class_dict[subject_class]
        object_id = class_dict[object_class]
        relation_pair = (subject_id, object_id)
        if relation_pair in property_dict_normalized:
            property_dict_normalized[
                relation_pair] += property_class_count_normalized[i]
        else:
            property_dict_normalized[
                relation_pair] = property_class_count_normalized[i]

    gexf_string += gexf.open_edges()
    i = 0
    for relation_pair in property_dict_normalized.keys():  # Output edges
        gexf_string += gexf.open_edge(i, relation_pair[0], relation_pair[1],
                                      property_dict_normalized[relation_pair])
        gexf_string += gexf.close_edge()
        i += 1
    gexf_string += gexf.close_edges()
    gexf_string += gexf.close_graph()
    gexf_string += gexf.close_xml()

    # Write out Gephi file.
    try:
        gexf_file_name = ntriples_file_name + ".gexf"
        fg = open(gexf_file_name, "w")
    except IOError:
        print("File '%s' could not be written" %
              os.path.abspath(gexf_file_name))
        raise
    fg.write(gexf_string)
    fg.close()

    # Write out predicate counts to standard output.
    print("count\tclass1\tpredicate\tclass2\tclass1Count\tclass2Count")
    for property_class_result in property_class_results:
        count = property_class_result[1]
        property_class_pair = property_class_result[0]
        class_1 = property_class_pair[1][1:-1]
        class_1i = class_dict[class_1]
        predicate = property_class_pair[0][1:-1]
        class_2 = property_class_pair[2][1:-1]
        class_2i = class_dict[class_2]
        class_1n = class_sizes[class_1i]
        class_2n = class_sizes[class_2i]
        print(str(count) + '\t' + class_1 + '\t' + predicate + '\t' +
              class_2 + '\t' + str(class_1n) + '\t' + str(class_2n))