    def setUp(self):
        ontology_content = '''
            @prefix : <http://dig.isi.edu/ontologies/dig/> .
            @prefix owl: <http://www.w3.org/2002/07/owl#> .
            @prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
            @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
            @prefix schema: <http://schema.org/> .
            @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

            :Person a owl:Class ;
                rdfs:subClassOf :Actor, :Biological_Object ;
                :common_properties :label, :title, :religion ;
                .
            :has_name a owl:DatatypeProperty ;
                schema:domainIncludes :Person ;
                schema:rangeIncludes xsd:string ;
                .
            :has_child a owl:ObjectProperty ;
                schema:domainIncludes :Person ;
                schema:rangeIncludes :Person ;
                .
        '''
        ontology = Ontology(ontology_content, validation=False,
                            include_undefined_class=True, quiet=True)
        kg_schema = KGSchema(ontology.merge_with_master_config(dict()))
        etk = ETK(kg_schema=kg_schema, ontology=ontology, generate_json_ld=True)
        etk2 = ETK(kg_schema=kg_schema, ontology=ontology, generate_json_ld=False)
        self.doc = etk.create_document(dict(), doc_id='http://xxx/1',
                                       type_=[DIG.Person.toPython()])
        self.doc2 = etk2.create_document(dict(), doc_id='http://xxx/2',
                                         type_=[DIG.Person.toPython()])
    def setUp(self):
        ontology_content = '''
            @prefix : <http://dig.isi.edu/ontologies/dig/> .
            @prefix dig: <http://dig.isi.edu/ontologies/dig/> .
            @prefix owl: <http://www.w3.org/2002/07/owl#> .
            @prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
            @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
            @prefix schema: <http://schema.org/> .
            @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

            :Person a owl:Class ;
                rdfs:subClassOf :Actor, :Biological_Object ;
                :common_properties :label, :title, :religion ;
                .
            :has_name a owl:DatatypeProperty ;
                schema:domainIncludes :Person ;
                schema:rangeIncludes xsd:string ;
                .
            :has_child a owl:ObjectProperty ;
                schema:domainIncludes :Person ;
                schema:rangeIncludes :Person ;
                .
        '''
        kg_schema = KGSchema()
        kg_schema.add_schema(ontology_content, 'ttl')
        etk = ETK(kg_schema=kg_schema)
        self.doc = etk.create_document(dict(), doc_id='http://xxx/1',
                                       type_=[URI('dig:Person')])
    def setUp(self):
        sample_doc = {
            "projects": [{
                "name": "etk",
                "description": "version 2 of etk, implemented by Runqi12 Shao, Dongyu Li, Sylvia lin, Amandeep and others.",
                "members": ["dongyu", "amandeep", "sylvia", "Runqi12"],
                "date": "2007-12-05",
                "place": "columbus:georgia:united states:-84.98771:32.46098",
                "s": "segment_test_1"
            }, {
                "name": "rltk",
                "description": "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students.",
                "members": ["mayank", "yixiang"],
                "date": ["2007-12-05T23:19:00"],
                "cost": -3213.32,
                "s": "segment_test_2"
            }]
        }
        kg_schema = KGSchema(json.load(open('etk/unit_tests/ground_truth/test_config.json')))
        etk = ETK(kg_schema)
        self.doc = etk.create_document(sample_doc)
    def test_website_patterns_condition(self) -> None:
        etk = ETK()
        doc = etk.create_document(sample_input)
        default_doc_selector = DefaultDocumentSelector()
        res_true = default_doc_selector.select_document(
            doc, website_patterns=[".*unittest", ".*abc"])
        res_false = default_doc_selector.select_document(
            doc, website_patterns=[".*ABc", ".*hhhh"])
        self.assertEqual(True, res_true)
        self.assertEqual(False, res_false)
    def test_segment(self) -> None:
        etk = ETK()
        doc = etk.create_document(sample_input)
        descriptions = doc.select_segments("projects[*].description")
        description_value = [i.value for i in descriptions]
        expected = [
            'version 2 of etk, implemented by Runqi, Dongyu, Sylvia, Amandeep and others.',
            'record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students.'
        ]
        self.assertEqual(description_value, expected)
    def test_json_paths_and_json_paths_regex(self) -> None:
        etk = ETK()
        doc = etk.create_document(sample_input)
        default_doc_selector = DefaultDocumentSelector()
        res_true = default_doc_selector.select_document(
            doc,
            json_paths=["$.website"],
            json_paths_regex=[".*unittest", ".*abc"])
        res_false = default_doc_selector.select_document(
            doc,
            json_paths=["$.website"],
            json_paths_regex=[".*hhhh"])
        self.assertEqual(True, res_true)
        self.assertEqual(False, res_false)
    def test_segment(self) -> None:
        kg_schema = KGSchema(json.load(open('etk/unit_tests/ground_truth/test_config.json')))
        etk = ETK(kg_schema=kg_schema)
        doc = etk.create_document(sample_input)
        descriptions = doc.select_segments("projects[*].description")
        description_value = [i.value for i in descriptions]
        expected = [
            'version 2 of etk, implemented by Runqi, Dongyu, Sylvia, Amandeep and others.',
            'record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students.'
        ]
        self.assertEqual(description_value, expected)
    def test_all_condition(self) -> None:
        etk = ETK()
        doc = etk.create_document(sample_input)
        default_doc_selector = DefaultDocumentSelector()
        res_true = default_doc_selector.select_document(
            doc,
            datasets=[".*unittest", ".*abc"],
            url_patterns=[".*unittest", ".*zxc"],
            website_patterns=[".*unittest", ".*abc"],
            json_paths=["$.website"],
            json_paths_regex=[".*unittest", ".*abc"])
        res_false = default_doc_selector.select_document(
            doc,
            datasets=[".*abc", ".*hhhh"],
            url_patterns=[".*ZXc", ".*hhhh"],
            website_patterns=[".*ABc", ".*hhhh"],
            json_paths=["$.website"],
            json_paths_regex=[".*hhhh"])
        self.assertEqual(True, res_true)
        self.assertEqual(False, res_false)
    def test_KnowledgeGraph_provenance(self) -> None:
        sample_doc = {
            "projects": [
                {
                    "name": "etk",
                    "description": "version 2 of etk, implemented by Runqi12 Shao, Dongyu Li, Sylvia lin, Amandeep and others.",
                    "members": ["dongyu", "amandeep", "sylvia", "Runqi12"],
                    "date": "2007-12-05",
                    "place": "columbus:georgia:united states:-84.98771:32.46098"
                },
                {
                    "name": "rltk",
                    "description": "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students.",
                    "members": ["mayank", "yixiang"],
                    "date": ["2007-12-05T23:19:00"],
                    "cost": -3213.32
                }
            ]
        }
        kg_schema = KGSchema(json.load(open('etk/unit_tests/ground_truth/test_config.json')))
        etk = ETK(kg_schema)
        doc = etk.create_document(sample_doc)

        try:
            doc.kg.add_value("developer", json_path="projects[*].members[*]")
        except KgValueError:
            pass

        try:
            doc.kg.add_value("test_date", json_path="projects[*].date[*]")
        except KgValueError:
            pass

        try:
            doc.kg.add_value("test_add_value_date",
                             value=[date(2018, 3, 28), {}, datetime(2018, 3, 28, 1, 1, 1)],
                             json_path_extraction="projects[0].date")
        except KgValueError:
            pass

        try:
            doc.kg.add_value("test_location", json_path="projects[*].place")
        except KgValueError:
            pass

        # print(json.dumps(doc.value, indent=2))
        expected_provenances = [
            {
                "@id": 0,
                "@type": "kg_provenance_record",
                "reference_type": "location",
                "value": "dongyu",
                "json_path": "projects.[0].members.[0]"
            },
            {
                "@id": 1,
                "@type": "kg_provenance_record",
                "reference_type": "location",
                "value": "amandeep",
                "json_path": "projects.[0].members.[1]"
            },
            {
                "@id": 2,
                "@type": "kg_provenance_record",
                "reference_type": "location",
                "value": "sylvia",
                "json_path": "projects.[0].members.[2]"
            },
            {
                "@id": 3,
                "@type": "kg_provenance_record",
                "reference_type": "location",
                "value": "Runqi12",
                "json_path": "projects.[0].members.[3]"
            },
            {
                "@id": 4,
                "@type": "kg_provenance_record",
                "reference_type": "location",
                "value": "mayank",
                "json_path": "projects.[1].members.[0]"
            },
            {
                "@id": 5,
                "@type": "kg_provenance_record",
                "reference_type": "location",
                "value": "yixiang",
                "json_path": "projects.[1].members.[1]"
            },
            {
                "@id": 6,
                "@type": "kg_provenance_record",
                "reference_type": "location",
                "value": "2007-12-05T00:00:00",
                "json_path": "projects.[0].date.[0]"
            },
            {
                "@id": 7,
                "@type": "kg_provenance_record",
                "reference_type": "location",
                "value": "2007-12-05T23:19:00",
                "json_path": "projects.[1].date.[0]"
            },
            {
                "@id": 8,
                "@type": "kg_provenance_record",
                "reference_type": "constant",
                "value": "2018-03-28",
                "json_path": "projects[0].date"
            },
            {
                "@id": 9,
                "@type": "kg_provenance_record",
                "reference_type": "constant",
                "value": "2018-03-28T01:01:01",
                "json_path": "projects[0].date"
            },
            {
                "@id": 10,
                "@type": "kg_provenance_record",
                "reference_type": "location",
                "value": "columbus:georgia:united states:-84.98771:32.46098",
                "json_path": "projects.[0].place"
            }
        ]
        self.assertEqual(expected_provenances, doc.value["provenances"])
        projects = doc.select_segments("projects[*]")
        for d, p in zip(descriptions, projects):
            names = doc.extract(self.rule_extractor, d)
            p.store(names, "members")
        return list()


if __name__ == "__main__":
    sample_input = {
        "projects": [{
            "name": "etk",
            "description": "version 2 of etk, implemented by Runqi12 Shao, Dongyu Li, Sylvia lin, Amandeep and others."
        }, {
            "name": "rltk",
            "description": "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students."
        }]
    }
    etk = ETK(modules=RuleETKModule)
    doc = etk.create_document(sample_input)
    docs = etk.process_ems(doc)
    print(json.dumps(docs[0].value, indent=2))
import os, sys, json

sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))

from etk.etk import ETK
from etk.knowledge_graph import KGSchema
from examples.config_to_em.em_base_generator import EmBaseGenerator

ebg = EmBaseGenerator('template.tpl')
ebg.generate_em_base('master_config.json', 'ems/em_base.py')

kg_schema = KGSchema(json.load(open("master_config.json", "r")))
etk = ETK(kg_schema, ["./ems"])
doc = etk.create_document(json.load(open('sample_html.jl', 'r')))
docs = etk.process_ems(doc)
print(json.dumps(docs[0].value, indent=2))
        },
        "matched_sentence": {
            "type": "string"
        },
        "date": {
            "type": "string"
        }
    }
}

kg_schema = KGSchema(master_config)
etk = ETK(kg_schema, ["./"])

# read the news
news_file = open(
    '/Users/amandeep/Github/etk/examples/ranking_pipeline/resources/new_2018-04-03-first-10000.jl')
# news_file = open('/Users/amandeep/Github/etk/examples/ranking_pipeline/resources/news_stories_3.jl')
news_stories = [
    etk.create_document(json.loads(line),
                        url=json.loads(line)['tld'],
                        doc_id=json.loads(line)['doc_id'])
    for line in news_file
]

results = list()
for news_story in news_stories:
    results.extend(etk.process_ems(news_story))

o = open('ifp_news_similarity.jl', 'w')
for result in results:
    o.write(json.dumps(result.value))
    o.write('\n')
def __init__(self, query_server=None, update_server=None): self.punctuation_table = str.maketrans( dict.fromkeys(string.punctuation)) if query_server and update_server: self.query_server = query_server self.update_server = update_server else: self.query_server = DATAMRT_SERVER self.update_server = DATAMRT_SERVER # initialize kg_schema = KGSchema() kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl') etk = ETK(kg_schema=kg_schema, modules=ETKModule) self.doc = etk.create_document( {}, doc_id="http://isi.edu/default-ns/projects") # bind prefixes self.doc.kg.bind('wikibase', 'http://wikiba.se/ontology#') self.doc.kg.bind('wd', 'http://www.wikidata.org/entity/') self.doc.kg.bind('wdt', 'http://www.wikidata.org/prop/direct/') self.doc.kg.bind('wdtn', 'http://www.wikidata.org/prop/direct-normalized/') self.doc.kg.bind('wdno', 'http://www.wikidata.org/prop/novalue/') self.doc.kg.bind('wds', 'http://www.wikidata.org/entity/statement/') self.doc.kg.bind('wdv', 'http://www.wikidata.org/value/') self.doc.kg.bind('wdref', 'http://www.wikidata.org/reference/') self.doc.kg.bind('p', 'http://www.wikidata.org/prop/') self.doc.kg.bind('pr', 'http://www.wikidata.org/prop/reference/') self.doc.kg.bind('prv', 'http://www.wikidata.org/prop/reference/value/') self.doc.kg.bind( 'prn', 'http://www.wikidata.org/prop/reference/value-normalized/') self.doc.kg.bind('ps', 'http://www.wikidata.org/prop/statement/') self.doc.kg.bind('psv', 'http://www.wikidata.org/prop/statement/value/') self.doc.kg.bind( 'psn', 'http://www.wikidata.org/prop/statement/value-normalized/') self.doc.kg.bind('pq', 'http://www.wikidata.org/prop/qualifier/') self.doc.kg.bind('pqv', 'http://www.wikidata.org/prop/qualifier/value/') self.doc.kg.bind( 'pqn', 'http://www.wikidata.org/prop/qualifier/value-normalized/') self.doc.kg.bind('skos', 'http://www.w3.org/2004/02/skos/core#') self.doc.kg.bind('prov', 'http://www.w3.org/ns/prov#') self.doc.kg.bind('schema', 'http://schema.org/') # give definition of the nodes we definied p = WDProperty('C2001', Datatype.MonolingualText) p.add_label('keywords', lang='en') p.add_description('identifier of a dataset in the Datamart system', lang='en') p.add_statement('P31', Item('Q19847637')) p.add_statement('P1629', Item('Q1172284')) self.doc.kg.add_subject(p) p = WDProperty('C2004', Datatype.StringValue) p.add_label('datamart identifier', lang='en') p.add_description( 'keywords associated with an item to facilitate finding the item using text search', lang='en') p.add_statement('P31', Item('Q18616576')) self.doc.kg.add_subject(p) p = WDProperty('C2005', Datatype.StringValue) p.add_label('variable measured', lang='en') p.add_description('the variables measured in a dataset', lang='en') p.add_statement('P31', Item('Q18616576')) p.add_statement('P1628', URLValue('http://schema.org/variableMeasured')) self.doc.kg.add_subject(p) p = WDProperty('C2006', Datatype.StringValue) p.add_label('values', lang='en') p.add_description( 'the values of a variable represented as a text document', lang='en') p.add_statement('P31', Item('Q18616576')) self.doc.kg.add_subject(p) p = WDProperty('C2007', Datatype.Item) p.add_label('data type', lang='en') p.add_description( 'the data type used to represent the values of a variable, integer (Q729138), Boolean (Q520777), ' 'Real (Q4385701), String (Q184754), Categorical (Q2285707)', lang='en') p.add_statement('P31', Item('Q18616576')) self.doc.kg.add_subject(p) p = WDProperty('C2008', Datatype.URLValue) p.add_label('semantic type', lang='en') p.add_description( 'a URL that 
identifies the semantic type of a variable in a dataset', lang='en') p.add_statement('P31', Item('Q18616576')) self.doc.kg.add_subject(p) # get the starting source id sparql_query = """ prefix wdt: <http://www.wikidata.org/prop/direct/> prefix wd: <http://www.wikidata.org/entity/> prefix wikibase: <http://wikiba.se/ontology#> PREFIX p: <http://www.wikidata.org/prop/> PREFIX pqv: <http://www.wikidata.org/prop/qualifier/value/> PREFIX pq: <http://www.wikidata.org/prop/qualifier/> PREFIX ps: <http://www.wikidata.org/prop/statement/> prefix bd: <http://www.bigdata.com/rdf#> prefix bds: <http://www.bigdata.com/rdf/search#> select ?x where { wd:Z00000 wdt:P1114 ?x . } """ try: sparql = SPARQLWrapper(self.query_server) sparql.setQuery(sparql_query) sparql.setReturnFormat(JSON) sparql.setMethod(POST) sparql.setRequestMethod(URLENCODED) results = sparql.query().convert()['results']['bindings'] except: print("Getting query of wiki data failed!") raise ValueError("Unable to initialize the datamart query service") if not results: print( "[WARNING] No starting source id found! Will initialize the starting source with D1000001" ) self.resource_id = 1000001 else: self.resource_id = 1000001
def main(): filename = sys.argv[1] query_title = sys.argv[2] ranking_criteria = sys.argv[3] top_k = sys.argv[4] if ranking_criteria not in ('TITLE', 'SENTENCE'): print('Wrong mode! Please check the input argument!') return master_config = { "fields": { "developer": { "type": "string" }, "student_developer": { "type": "string" }, "spacy_name": { "type": "string" }, "date": { "type": "date" } } } kg_schema = KGSchema(master_config) etk = ETK(kg_schema, ["./extraction_modules/"]) nlp = spacy.load('en_core_web_lg') date_extractor = DateExtractor(etk=etk) queries = dict() queries_ent_map = dict() with open(query_title) as f: for line in f: orig_ifp_title = line # remove date information from query term res = date_extractor.extract(text=line) start, end = float('inf'), -1 for i in res: start = min(start, i.provenance['start_char']) end = max(end, i.provenance['end_char']) # delete date from query term if len(res) != 0: line = line[:start] + line[end+1:] queries[orig_ifp_title] = line queries_ent_map[line] = list() # extract entities from query term doc = nlp(line) for ent in doc.ents: queries_ent_map[line].append(re.escape(ent.text.strip())) # remove empty entities queries_ent_map[line] = list(filter(bool, queries_ent_map[line])) # the list of selected docs for given query term query_docs_mapping = dict() docs = list() with open(filename) as f: for line in f: json_obj = json.loads(line) docs.append(etk.create_document(json_obj)) ds = DefaultDocumentSelector() for orig_query, proc_query in queries.items(): content_regex = queries_ent_map[proc_query] query_docs_mapping[proc_query] = list() for doc in docs: if len(content_regex) == 0 \ or ds.select_document(document=doc, json_paths=['$.lexisnexis.doc_description'], json_paths_regex=content_regex): query_docs_mapping[proc_query].append(doc) # TODO: pass ifp_id in for orig_query, proc_query in queries.items(): # print(len(query_docs_mapping[proc_query])) dr_processor = DocRetrieveProcessor(etk=etk, ifp_id="1233", ifp_title=proc_query, orig_ifp_title=orig_query) heap = list() for doc in query_docs_mapping[proc_query]: processed_doc = dict() if ranking_criteria == 'SENTENCE': processed_doc = dr_processor.process_by_sentence(doc=doc, threshold=0).cdr_document elif ranking_criteria == 'TITLE': processed_doc = dr_processor.process_by_title(doc=doc, threshold=0).cdr_document if len(heap) < top_k: heappush(heap, (processed_doc['similarity'], processed_doc['date'], processed_doc)) else: if processed_doc['similarity'] > heap[0][0]: heappush(heap, (processed_doc['similarity'], processed_doc['date'], processed_doc)) heap.sort(reverse=True) output_filename = './resources/output/'+orig_ifp_title+"_result.jl" with open(output_filename, 'a+b') as f: for item in heap: print(item[0]) jl_str = json.dumps(item[2]) + '\n' f.write(jl_str.encode())
def model_statement(self): # initialize KGSchema kg_schema = KGSchema() kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl') etk = ETK(kg_schema=kg_schema, modules=ETKModule) doc = etk.create_document({}, doc_id='http://isi.edu/default-ns/projects') # bind prefix doc = create_custom_prefix(doc, custom_dict={self.ns: self.uri}) # extract files self.extract_files() # model statement inputs = self.data['inputs'] for k, v in inputs.items(): if k != 'metadata': # construct wikifier instance if k == 'wikifier' and not v['existed']: q = WDItem(v['qnode'], namespace=self.ns, creator=':datamart') q.add_label('A wikifier file for ' + inputs['dataset']['content']['filename'], lang='en') q.add_statement('P31', Item( 'SDQ1001', namespace=self.ns)) # an instance of Wikifier q.add_statement('P127', Item('SDQ1003', namespace=self.ns)) # belongs to q.add_statement('SDP3003', StringValue(v['content']), namespace=self.ns) # hasFileContent q.add_statement('SDP3004', StringValue(v['hashcode']), namespace=self.ns) # hashValue # construct mapping_file instance elif k == 'mappingFile' and not v['existed']: q = WDItem(v['qnode'], namespace=self.ns, creator=':datamart') q.add_label('A mapping file for ' + inputs['dataset']['content']['filename'], lang='en') q.add_statement('P31', Item( 'SDQ1002', namespace=self.ns)) # an instance of MappingFile q.add_statement('P170', StringValue('T2WML')) q.add_statement('P127', Item('SDQ1003', namespace=self.ns)) q.add_statement('SDP3003', StringValue(json.dumps(v['content'])), namespace=self.ns) q.add_statement('SDP3004', StringValue(v['hashcode']), namespace=self.ns) # construct dataset instance elif k == 'dataset' and not v['existed']: q = WDItem(v['qnode'], namespace=self.ns, creator=':datamart') q.add_label(v['content']['title'], lang='en') q.add_description(v['content']['description'], lang='en') q.add_statement('P31', Item('Q1172284')) # an instance of Dataset q.add_statement('SDP3001', Item(inputs['wikifier']['qnode'], namespace=self.ns), namespace=self.ns) # a wikifier file q.add_statement('SDP3002', Item(inputs['mappingFile']['qnode'], namespace=self.ns), namespace=self.ns) # a mapping file q.add_statement('P1476', StringValue( v['content']['title'])) # title q.add_statement( 'P921', StringValue(v['content']['description'])) # described q.add_statement('P127', Item('SDQ1003', namespace=self.ns)) # belongs to q.add_statement('SDP2004', StringValue(', '.join( v['content']['keywords'])), namespace=self.ns) # keywords q.add_statement('SDP3004', StringValue(v['hashcode']), namespace=self.ns) if self.data['storeColumnValue']: for data in v['content']['variable_measured']: statement = q.add_statement( 'SDP2005', StringValue(data['column_name']), namespace=self.ns) # variable measured statement.add_qualifier( 'SDP2006', StringValue(data['values_of_a_column']), namespace=self.ns) # the values of a column statement.add_qualifier( 'SDP2007', Item(data['data_structure_type']), namespace=self.ns) # data structure type statement.add_qualifier( 'SDP2008', URLValue(data['semantic_type_identifier']), namespace=self.ns) # semantic type statement.add_qualifier( 'P1545', QuantityValue( data['column_index'], namespace=self.ns)) # column index doc.kg.add_subject(q) return doc
def model_data() -> None: """ This function generates triples for user defined properties for uploading them to wikidata :return: """ stream = open(Path.cwd().parent / "Datasets/new-property-configuration.yaml", 'r', encoding='utf8') yaml_data = yaml.safe_load(stream) # initialize kg_schema = KGSchema() kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl') etk = ETK(kg_schema=kg_schema, modules=ETKModule) doc = etk.create_document({}, doc_id="http://isi.edu/default-ns/projects") # bind prefixes doc.kg.bind('wikibase', 'http://wikiba.se/ontology#') doc.kg.bind('wd', 'http://www.wikidata.org/entity/') doc.kg.bind('wdt', 'http://www.wikidata.org/prop/direct/') doc.kg.bind('wdtn', 'http://www.wikidata.org/prop/direct-normalized/') doc.kg.bind('wdno', 'http://www.wikidata.org/prop/novalue/') doc.kg.bind('wds', 'http://www.wikidata.org/entity/statement/') doc.kg.bind('wdv', 'http://www.wikidata.org/value/') doc.kg.bind('wdref', 'http://www.wikidata.org/reference/') doc.kg.bind('p', 'http://www.wikidata.org/prop/') doc.kg.bind('pr', 'http://www.wikidata.org/prop/reference/') doc.kg.bind('prv', 'http://www.wikidata.org/prop/reference/value/') doc.kg.bind('prn', 'http://www.wikidata.org/prop/reference/value-normalized/') doc.kg.bind('ps', 'http://www.wikidata.org/prop/statement/') doc.kg.bind('psv', 'http://www.wikidata.org/prop/statement/value/') doc.kg.bind('psn', 'http://www.wikidata.org/prop/statement/value-normalized/') doc.kg.bind('pq', 'http://www.wikidata.org/prop/qualifier/') doc.kg.bind('pqv', 'http://www.wikidata.org/prop/qualifier/value/') doc.kg.bind('pqn', 'http://www.wikidata.org/prop/qualifier/value-normalized/') doc.kg.bind('skos', 'http://www.w3.org/2004/02/skos/core#') doc.kg.bind('prov', 'http://www.w3.org/ns/prov#') doc.kg.bind('schema', 'http://schema.org/') sparql_endpoint = "https://query.wikidata.org/sparql" type_map = {'quantity': Datatype.QuantityValue, 'url': URLValue} property_type_cache = {} for k, v in yaml_data.items(): p = WDProperty(k, type_map[v['type']], creator='http://www.isi.edu/t2wml') for lang, value in v['label'].items(): for val in value: p.add_label(val, lang=lang) for lang, value in v['description'].items(): for val in value: p.add_description(val, lang=lang) for pnode, items in v['statements'].items(): for item in items: try: property_type = property_type_cache[pnode] except KeyError: property_type = get_property_type(pnode, sparql_endpoint) property_type_cache[pnode] = property_type if property_type == "WikibaseItem": value = Item(str(item['value'])) elif property_type == "WikibaseProperty": value = Property(item['value']) elif property_type == "String": value = StringValue(item['value']) elif property_type == "Quantity": value = QuantityValue(item['value']) elif property_type == "Time": value = TimeValue( str(item['value']), Item(item["calendar"]), translate_precision_to_integer(item["precision"]), item["time_zone"]) elif property_type == "Url": value = URLValue(item['value']) elif property_type == "Monolingualtext": value = MonolingualText(item['value'], item["lang"]) elif property_type == "ExternalId": value = ExternalIdentifier(item['value']) elif property_type == "GlobeCoordinate": value = GlobeCoordinate(item["latitude"], item["longitude"], item["precision"]) p.add_statement(pnode, value) doc.kg.add_subject(p) with open(Path.cwd().parent / "new_properties/result.ttl", "w") as f: data = doc.kg.serialize('ttl') f.write(data)
        douglas.add_statement('P2048', QuantityValue(1.96, unit=Item('Q11573')))

        # official website
        # statement = douglas.add_statement('P856', URLValue('http://douglasadams.com/'))
        statement = douglas.add_truthy_statement('P856', URLValue('http://douglasadams.com/'))
        statement.add_qualifier('P407', Item('Q1860'))

        # Freebase ID
        douglas.add_statement('P646', ExternalIdentifier('/m/0282x', URLValue('http://g.co/kg/m/0282x')))

        doc.kg.add_subject(douglas)
        return list()


if __name__ == "__main__":
    kg_schema = KGSchema()
    kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
    etk = ETK(kg_schema=kg_schema, modules=ExampleETKModule)
    doc = etk.create_document({}, doc_id="http://isi.edu/default-ns/projects")
    revise(True)
    docs = etk.process_ems(doc)
    print(docs[0].kg.serialize('ttl'))
class TestProvenance(unittest.TestCase): def test_Provenance(self) -> None: kg_schema = KGSchema( json.load(open('etk/unit_tests/ground_truth/test_config.json'))) self.etk = ETK(kg_schema=kg_schema) g = [ 'runqi', 'sylvia', 'dongyu', 'mayank', 'pedro', 'amandeep', 'yixiang' ] self.name_extractor = GlossaryExtractor(g, "name_extractor", self.etk.default_tokenizer, case_sensitive=False, ngrams=1) doc = self.etk.create_document(sample_input) descriptions = doc.select_segments("projects[*].description") projects = doc.select_segments("projects[*]") for d, p in zip(descriptions, projects): names = doc.extract(self.name_extractor, d) p.store(names, "members") expected_provenances = [{ "@id": 0, "@type": "extraction_provenance_record", "method": "name_extractor", "confidence": 1.0, "origin_record": { "path": "projects.[0].description", "start_char": 33, "end_char": 38 } }, { "@id": 1, "@type": "extraction_provenance_record", "method": "name_extractor", "confidence": 1.0, "origin_record": { "path": "projects.[0].description", "start_char": 40, "end_char": 46 } }, { "@id": 2, "@type": "extraction_provenance_record", "method": "name_extractor", "confidence": 1.0, "origin_record": { "path": "projects.[0].description", "start_char": 48, "end_char": 54 } }, { "@id": 3, "@type": "extraction_provenance_record", "method": "name_extractor", "confidence": 1.0, "origin_record": { "path": "projects.[0].description", "start_char": 56, "end_char": 64 } }, { "@id": 4, "@type": "storage_provenance_record", "doc_id": None, "field": None, "destination": "projects.[0].members", "parent_provenances": { "Runqi": 0, "Dongyu": 1, "Sylvia": 2, "Amandeep": 3 } }, { "@id": 5, "@type": "extraction_provenance_record", "method": "name_extractor", "confidence": 1.0, "origin_record": { "path": "projects.[1].description", "start_char": 39, "end_char": 44 } }, { "@id": 6, "@type": "extraction_provenance_record", "method": "name_extractor", "confidence": 1.0, "origin_record": { "path": "projects.[1].description", "start_char": 46, "end_char": 52 } }, { "@id": 7, "@type": "extraction_provenance_record", "method": "name_extractor", "confidence": 1.0, "origin_record": { "path": "projects.[1].description", "start_char": 54, "end_char": 61 } }, { "@id": 8, "@type": "storage_provenance_record", "doc_id": None, "field": None, "destination": "projects.[1].members", "parent_provenances": { "Pedro": 5, "Mayank": 6, "Yixiang": 7 } }] expected_projects = [{ "name": "etk", "description": "version 2 of etk, implemented by Runqi, Dongyu, Sylvia, Amandeep and others.", "members": ["Runqi", "Dongyu", "Sylvia", "Amandeep"] }, { "name": "rltk", "description": "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students.", "members": ["Pedro", "Mayank", "Yixiang"] }] #print ("hiiiiiiiiiiiiiiiii") #print ("projects: " + str(doc.value["projects"])) #print ("provenances: " + str(doc.value["provenances"])) self.assertEqual(expected_projects, doc.value["projects"]) self.assertEqual(expected_provenances, doc.value["provenances"])
class ETKWorker(object):
    def __init__(self, master_config, em_paths, logger, worker_id,
                 project_name, kafka_input_args=None, kafka_output_args=None):
        self.logger = logger
        self.worker_id = worker_id
        self.check_interval = 1000
        self.exit_sign = False

        try:
            kg_schema = KGSchema(master_config)
            self.etk_ins = ETK(kg_schema, em_paths, logger=logger)
        except Exception as e:
            logger.exception('ETK initialization failed')
            raise e

        # kafka input
        self.kafka_input_server = config['input_server']
        self.kafka_input_session_timeout = config['input_session_timeout']
        self.kafka_input_group_id = config['input_group_id']
        self.kafka_input_topic = '{project_name}_in'.format(project_name=project_name)
        self.kafka_input_args = dict() if kafka_input_args is None else kafka_input_args
        self.kafka_consumer = KafkaConsumer(
            bootstrap_servers=self.kafka_input_server,
            group_id=self.kafka_input_group_id,
            consumer_timeout_ms=self.check_interval,
            value_deserializer=lambda v: json.loads(v.decode('utf-8')),
            **self.kafka_input_args
        )
        self.kafka_consumer.subscribe([self.kafka_input_topic])

        # kafka output
        self.kafka_output_server = config['output_server']
        self.kafka_output_topic = '{project_name}_out'.format(project_name=project_name)
        self.kafka_output_args = dict() if kafka_output_args is None else kafka_output_args
        self.kafka_producer = KafkaProducer(
            bootstrap_servers=self.kafka_output_server,
            value_serializer=lambda v: json.dumps(v).encode('utf-8'),
            **self.kafka_output_args
        )

        self.timeout_count = self.kafka_input_session_timeout / self.check_interval
        self.current_timeout_count = 0

    def process(self):
        # prev_doc_sent_time = None

        while not self.exit_sign:
            # high level api handles batching
            # will exit once timeout
            try:
                for msg in self.kafka_consumer:
                    # force to commit, block till getting response
                    self.kafka_consumer.commit()
                    # get message, clear timeout count
                    self.current_timeout_count = 0

                    cdr = msg.value
                    # TODO better way to add execution profile
                    # cdr['@execution_profile'] = {'@worker_id': self.worker_id}
                    # doc_arrived_time = time.time()
                    # cdr['@execution_profile']['@doc_arrived_time'] = \
                    #     datetime.utcfromtimestamp(doc_arrived_time).isoformat()
                    # cdr['@execution_profile']['@doc_wait_time'] = \
                    #     0.0 if not prev_doc_sent_time \
                    #     else float(doc_arrived_time - prev_doc_sent_time)
                    # cdr['@execution_profile']['@doc_length'] = len(json.dumps(cdr))

                    if 'doc_id' not in cdr or len(cdr['doc_id']) == 0:
                        self.logger.error('invalid cdr: unknown doc_id')
                        continue

                    self.logger.info('processing %s' % cdr['doc_id'])
                    try:
                        # start_run_core_time = time.time()
                        # run etk module
                        doc = self.etk_ins.create_document(cdr, url=cdr['url'], doc_id=cdr['doc_id'])
                        # process_ems returns a list of Documents
                        results = self.etk_ins.process_ems(doc)
                        for result in results:
                            cdr_result = result.cdr_document
                            # indexing
                            # TODO
                            indexed_cdr = index_knowledge_graph_fields(cdr_result)
                            if not indexed_cdr:
                                logger.error('indexing in sandpaper failed')
                                continue
                            # cdr = indexed_cdr

                            # cdr['@execution_profile']['@run_core_time'] = \
                            #     float(time.time() - start_run_core_time)
                            # doc_sent_time = time.time()
                            # cdr['@execution_profile']['@doc_sent_time'] = \
                            #     datetime.utcfromtimestamp(doc_sent_time).isoformat()
                            # prev_doc_sent_time = doc_sent_time
                            # cdr['@execution_profile']['@doc_processed_time'] = \
                            #     float(doc_sent_time - doc_arrived_time)

                            # output result
                            r = self.kafka_producer.send(self.kafka_output_topic, indexed_cdr)
                            r.get(timeout=60)  # wait till sent
                            self.logger.info('{} done'.format(indexed_cdr['doc_id']))
                    except Exception as e:
                        self.logger.exception('failed at %s' % cdr['doc_id'])

            except ValueError as e:
                # I/O operation on closed epoll fd
                self.logger.info('consumer closed')
                self.exit_sign = True
            except StopIteration as e:
                # timeout
                self.current_timeout_count += 1
                if self.current_timeout_count >= self.timeout_count:
                    self.exit_sign = True
            except CommitFailedError as e:
                self.exit_sign = True
                # https://github.com/dpkp/kafka-python/blob/535d8f6a85969c4e07de0bc81e14513c677995be/kafka/errors.py#L65
                # if this worker is dead, restart and reattach to the group
                g_restart_worker = True

    def __del__(self):
        self.logger.info('ETK worker {} is exiting...'.format(self.worker_id))
        try:
            self.kafka_consumer.close()
        except:
            pass
        try:
            self.kafka_producer.close()
        except:
            pass
def _init_etk(): # initialize for etk kg_schema = KGSchema() kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl') etk = ETK(kg_schema=kg_schema, modules=ETKModule) doc = etk.create_document({}, doc_id="http://isi.edu/default-ns/projects") # bind prefixes doc.kg.bind('wikibase', 'http://wikiba.se/ontology#') doc.kg.bind('wd', 'http://www.wikidata.org/entity/') doc.kg.bind('wdt', 'http://www.wikidata.org/prop/direct/') doc.kg.bind('wdtn', 'http://www.wikidata.org/prop/direct-normalized/') doc.kg.bind('wdno', 'http://www.wikidata.org/prop/novalue/') doc.kg.bind('wds', 'http://www.wikidata.org/entity/statement/') doc.kg.bind('wdv', 'http://www.wikidata.org/value/') doc.kg.bind('wdref', 'http://www.wikidata.org/reference/') doc.kg.bind('p', 'http://www.wikidata.org/prop/') doc.kg.bind('pr', 'http://www.wikidata.org/prop/reference/') doc.kg.bind('prv', 'http://www.wikidata.org/prop/reference/value/') doc.kg.bind( 'prn', 'http://www.wikidata.org/prop/reference/value-normalized/') doc.kg.bind('ps', 'http://www.wikidata.org/prop/statement/') doc.kg.bind('psv', 'http://www.wikidata.org/prop/statement/value/') doc.kg.bind( 'psn', 'http://www.wikidata.org/prop/statement/value-normalized/') doc.kg.bind('pq', 'http://www.wikidata.org/prop/qualifier/') doc.kg.bind('pqv', 'http://www.wikidata.org/prop/qualifier/value/') doc.kg.bind( 'pqn', 'http://www.wikidata.org/prop/qualifier/value-normalized/') doc.kg.bind('skos', 'http://www.w3.org/2004/02/skos/core#') doc.kg.bind('prov', 'http://www.w3.org/ns/prov#') doc.kg.bind('schema', 'http://schema.org/') # give definition of the nodes we definied p = WDProperty('C2001', Datatype.MonolingualText) p.add_label('datamart identifier', lang='en') p.add_description('identifier of a dataset in the Datamart system', lang='en') p.add_statement('P31', Item('Q19847637')) p.add_statement('P1629', Item('Q1172284')) doc.kg.add_subject(p) p = WDProperty('C2004', Datatype.StringValue) p.add_label('keywords', lang='en') p.add_description( 'keywords associated with an item to facilitate finding the item using text search', lang='en') p.add_statement('P31', Item('Q18616576')) doc.kg.add_subject(p) p = WDProperty('C2005', Datatype.StringValue) p.add_label('variable measured', lang='en') p.add_description('the variables measured in a dataset', lang='en') p.add_statement('P31', Item('Q18616576')) p.add_statement('P1628', URLValue('http://schema.org/variableMeasured')) doc.kg.add_subject(p) p = WDProperty('C2006', Datatype.StringValue) p.add_label('values', lang='en') p.add_description( 'the values of a variable represented as a text document', lang='en') p.add_statement('P31', Item('Q18616576')) doc.kg.add_subject(p) p = WDProperty('C2007', Datatype.Item) p.add_label('data type', lang='en') p.add_description( 'the data type used to represent the values of a variable, integer (Q729138), Boolean (Q520777), ' 'Real (Q4385701), String (Q184754), Categorical (Q2285707)', lang='en') p.add_statement('P31', Item('Q18616576')) doc.kg.add_subject(p) p = WDProperty('C2008', Datatype.URLValue) p.add_label('semantic type', lang='en') p.add_description( 'a URL that identifies the semantic type of a variable in a dataset', lang='en') p.add_statement('P31', Item('Q18616576')) doc.kg.add_subject(p) p = WDProperty('C2010', Datatype.StringValue) p.add_label('extra information', lang='en') p.add_description( 'some extra information that may needed for this dataset', lang='en') doc.kg.add_subject(p) p = WDProperty('C2011', Datatype.TimeValue) p.add_label('start date', lang='en') 
p.add_description( 'The earlist time exist in this dataset, only valid when there exists time format data in this dataset', lang='en') p.add_statement('P31', Item('Q18616576')) doc.kg.add_subject(p) p = WDProperty('C2012', Datatype.TimeValue) p.add_label('end date', lang='en') p.add_description( 'The latest time exist in this dataset, only valid when there exists time format data in this dataset', lang='en') p.add_statement('P31', Item('Q18616576')) doc.kg.add_subject(p) p = WDProperty('C2013', Datatype.QuantityValue) p.add_label('time granularity', lang='en') p.add_description('time granularity in a dataset', lang='en') p.add_statement('P31', Item('Q18616576')) doc.kg.add_subject(p) p = WDProperty('C2014', Datatype.StringValue) p.add_label('uploader information', lang='en') p.add_description('information about who uploaded and when uploaded', lang='en') doc.kg.add_subject(p) return doc
            doc.kg.add_value("developer", member.value)
        return list()


if __name__ == "__main__":
    sample_input = {
        "projects": [{
            "name": "etk",
            "description": "version 2 of etk, implemented by Runqi Shao, Dongyu Li, Sylvia lin, Amandeep and "
                           "others."
        }, {
            "name": "rltk",
            "description": "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students."
        }]
    }
    kg_schema = KGSchema(json.load(open("master_config.json", "r")))
    etk = ETK(kg_schema=kg_schema, modules=ExampleETKModule)
    doc = etk.create_document(sample_input, doc_id="http://isi.edu/default-ns/projects")
    docs = etk.process_ems(doc)
    print(json.dumps(docs[0].kg.value, indent=2))
    print(docs[0].kg.get_values('developer'))
class TripleGenerator(Generator): def __init__(self, **kwargs): super().__init__(**kwargs) prop_declaration = kwargs.pop("prop_declaration") dest_fp = kwargs.pop("dest_fp") truthy = kwargs.pop("truthy") use_id = kwargs.pop("use_id") prefix_path = kwargs.pop("prefix_path") self.datatype_mapping = { # nomenclature from https://w.wiki/Tfn "item": Item, "WikibaseItem": Item, "time": TimeValue, "Time": TimeValue, "globe-coordinate": GlobeCoordinate, "GlobeCoordinate": GlobeCoordinate, "quantity": QuantityValue, "Quantity": QuantityValue, "monolingualtext": MonolingualText, "Monolingualtext": MonolingualText, "string": StringValue, "String": StringValue, "external-identifier": ExternalIdentifier, "ExternalId": ExternalIdentifier, "url": StringValue, #TODO bug potentially in rdflib "Url": StringValue, "property": WDProperty, "WikibaseProperty": WDProperty } self.set_prefix(prefix_path) self.prop_declaration = prop_declaration self.set_properties(self.prop_file) self.fp = dest_fp self.truthy = truthy self.reset_etk_doc() self.serialize_prefix() self.use_id = use_id def set_prefix(self, prefix_path: str): self.prefix_dict = {} if prefix_path != "NONE": with open(prefix_path, "r") as fp: for line_num, edge in enumerate(fp): edge_list = edge.strip("\r\n").split("\t") if line_num == 0: node1_index, node2_index = edge_list.index( "node1"), edge_list.index("node2") else: prefix, expand = edge_list[node1_index], edge_list[ node2_index] self.prefix_dict[prefix] = expand def read_prop_declaration(self, line_number: int, edge: str): node1, node2, prop, e_id = self.parse_edges(edge) if prop == "data_type": self.prop_types[node1] = self.datatype_mapping[node2.strip()] return def set_properties(self, prop_file: str): self.prop_types = {} if prop_file == "NONE": return with open(prop_file, "r") as fp: props = fp.readlines() for line in props[1:]: node1, _, node2 = line.split("\t") try: self.prop_types[node1] = self.datatype_mapping[node2.strip()] except: raise KGTKException( "DataType {} of node {} is not supported.\n".format( node2, node1)) def _node_2_entity(self, node: str): ''' A node can be Qxxx or Pxxx, return the proper entity. ''' if node in self.prop_types: entity = WDProperty(node, self.prop_types[node]) else: entity = WDItem(TripleGenerator.replace_illegal_string(node)) return entity def reset_etk_doc(self, doc_id: str = "http://isi.edu/default-ns/projects"): """ reset the doc object and return it. Called at initialization and after outputting triples. """ kg_schema = KGSchema() kg_schema.add_schema("@prefix : <http://isi.edu/> .", "ttl") self.etk = ETK(kg_schema=kg_schema, modules=ETKModule) self.doc = self.etk.create_document({}, doc_id=doc_id) for k, v in wiki_namespaces.items(): if k in self.prefix_dict: self.doc.kg.bind(k, self.prefix_dict[k]) else: self.doc.kg.bind(k, v) def serialize(self): """ Seriealize the triples. Used a hack to avoid serializing the prefix again. """ docs = self.etk.process_ems(self.doc) self.fp.write("\n\n".join( docs[0].kg.serialize("ttl").split("\n\n")[1:])) self.fp.flush() self.reset() def serialize_prefix(self): """ This function should be called only once after the doc object is initialized. 
In order to serialize the prefix at the very begining it has to be printed per the change of rdflib 4.2.2->5.0.0 Relevent issue: https://github.com/RDFLib/rdflib/issues/965 """ for k, v in wiki_namespaces.items(): if k in self.prefix_dict: line = "@prefix " + k + ": <" + self.prefix_dict[k] + "> .\n" else: line = "@prefix " + k + ": <" + v + "> .\n" self.fp.write(line) self.fp.write("\n") self.fp.flush() self.reset() def reset(self): self.to_append_statement_id = None self.to_append_statement = None self.read_num_of_lines = 0 self.reset_etk_doc() def generate_label_triple(self, node1: str, node2: str) -> bool: entity = self._node_2_entity(node1) text_string, lang = TripleGenerator.process_text_string(node2) entity.add_label(text_string, lang=lang) self.doc.kg.add_subject(entity) return True def generate_description_triple(self, node1: str, node2: str) -> bool: entity = self._node_2_entity(node1) text_string, lang = TripleGenerator.process_text_string(node2) entity.add_description(text_string, lang=lang) self.doc.kg.add_subject(entity) return True def generate_alias_triple(self, node1: str, node2: str) -> bool: entity = self._node_2_entity(node1) text_string, lang = TripleGenerator.process_text_string(node2) entity.add_alias(text_string, lang=lang) self.doc.kg.add_subject(entity) return True def generate_prop_declaration_triple(self, node1: str, node2: str) -> bool: # update the known prop_types if node1 in self.prop_types: if not self.prop_declaration: raise KGTKException( "Duplicated property definition of {} found!".format( node1)) else: self.prop_types[node1] = node2 prop = WDProperty(node1, self.datatype_mapping[node2]) self.doc.kg.add_subject(prop) return True def generate_normal_triple(self, node1: str, property: str, node2: str, is_qualifier_edge: bool, e_id: str) -> bool: if self.use_id: e_id = TripleGenerator.replace_illegal_string(e_id) entity = self._node_2_entity(node1) edge_type = self.prop_types[property] if edge_type == Item: object = WDItem(TripleGenerator.replace_illegal_string(node2)) elif edge_type == WDProperty: object = WDProperty(TripleGenerator.replace_illegal_string(node2), self.prop_types[node2]) elif edge_type == TimeValue: if self.yyyy_mm_dd_pattern.match(node2): try: dateTimeString = node2 object = TimeValue( value=dateTimeString, # TODO calendar=Item("Q1985727"), precision=Precision.year, time_zone=0, ) except: return False elif self.yyyy_pattern.match(node2): try: dateTimeString = node2 + "-01-01" object = TimeValue( value=dateTimeString, # TODO calendar=Item("Q1985727"), precision=Precision.year, time_zone=0, ) except: return False else: try: # TODO, in future, the two cases above will be dropped in principle to comply with the iso format # now it is iso format assert (node2[0] == "^") node2 = node2[1:] # remove ^ if node2.startswith("+"): node2 = node2[1:] dateTimeString, precision = node2.split("/") dateTimeString = dateTimeString[:-1] # remove Z object = TimeValue( value=dateTimeString, calendar=Item("Q1985727"), precision=precision, time_zone=0, ) except: return False elif edge_type == GlobeCoordinate: latitude, longitude = node2[1:].split("/") latitude = float(latitude) longitude = float(longitude) object = GlobeCoordinate(latitude, longitude, 0.0001, globe=Item("Q2")) # earth elif edge_type == QuantityValue: # +70[+60,+80]Q743895 res = self.quantity_pattern.match(node2).groups() amount, lower_bound, upper_bound, unit = res amount = TripleGenerator.clean_number_string(amount) num_type = self.xsd_number_type(amount) lower_bound = 
TripleGenerator.clean_number_string(lower_bound) upper_bound = TripleGenerator.clean_number_string(upper_bound) if unit != None: if upper_bound != None and lower_bound != None: object = QuantityValue(amount, unit=Item(unit), upper_bound=upper_bound, lower_bound=lower_bound, type=num_type) else: object = QuantityValue(amount, unit=Item(unit), type=num_type) else: if upper_bound != None and lower_bound != None: object = QuantityValue(amount, upper_bound=upper_bound, lower_bound=lower_bound, type=num_type) else: object = QuantityValue(amount, type=num_type) elif edge_type == MonolingualText: text_string, lang = TripleGenerator.process_text_string(node2) object = MonolingualText(text_string, lang) elif edge_type == ExternalIdentifier: object = ExternalIdentifier(node2) elif edge_type == URLValue: if TripleGenerator.is_valid_uri_with_scheme_and_host(node2): object = URLValue(node2) else: return False else: # treat everything else as stringValue object = StringValue(node2) if type(object) == WDItem or type(object) == WDProperty: self.doc.kg.add_subject(object) if is_qualifier_edge: # edge: e8 p9 ^2013-01-01T00:00:00Z/11 # create qualifier edge on previous STATEMENT and return the updated STATEMENT self.to_append_statement.add_qualifier(property, object) self.doc.kg.add_subject(self.to_append_statement) else: # edge: q1 p8 q2 e8 # create brand new property edge and replace STATEMENT if self.truthy: self.to_append_statement = entity.add_truthy_statement( property, object, statement_id=e_id ) if self.use_id else entity.add_truthy_statement( property, object) else: self.to_append_statement = entity.add_statement( property, object, statement_id=e_id ) if self.use_id else entity.add_statement(property, object) self.doc.kg.add_subject(entity) return True def entry_point(self, line_number: int, edge: str): # print(line_number,edge) """ generates a list of two, the first element is the determination of the edge type using corresponding edge type the second element is a bool indicating whether this is a valid property edge or qualifier edge. 
Call corresponding downstream functions """ if line_number == 1: # initialize the order_map self.initialize_order_map(edge) return # use the order_map to map the node node1, node2, prop, e_id = self.parse_edges(edge) if line_number == 2: # by default a statement edge is_qualifier_edge = False else: if node1 != self.to_append_statement_id and node1 != self.corrupted_statement_id: is_qualifier_edge = False # also a new statement edge if self.read_num_of_lines >= self.n: self.serialize() else: # qualifier edge or property declaration edge is_qualifier_edge = True if node1 == self.corrupted_statement_id: self.warn_log.write( "QUALIFIER edge at line [{}] associated of corrupted statement edge of id [{}] dropped.\n" .format(line_number, self.corrupted_statement_id)) return if prop in self.label_set: success = self.generate_label_triple(node1, node2) elif prop in self.description_set: success = self.generate_description_triple(node1, node2) elif prop in self.alias_set: success = self.generate_alias_triple(node1, node2) elif prop == "data_type": # special edge of prop declaration success = self.generate_prop_declaration_triple(node1, node2) else: if prop in self.prop_types: success = self.generate_normal_triple(node1, prop, node2, is_qualifier_edge, e_id) else: raise KGTKException( "property [{}]'s type is unknown at line [{}].\n".format( prop, line_number)) if (not success) and self.warning: if not is_qualifier_edge: self.warn_log.write( "CORRUPTED_STATEMENT edge at line: [{}] with edge id [{}].\n" .format(line_number, e_id)) self.corrupted_statement_id = e_id else: self.warn_log.write( "CORRUPTED_QUALIFIER edge at line: [{}] with edge id [{}].\n" .format(line_number, e_id)) else: self.read_num_of_lines += 1 if not is_qualifier_edge: self.to_append_statement_id = e_id @staticmethod def xsd_number_type(num): if isinstance(num, float) and 'e' in str(num).lower(): return LiteralType.double return LiteralType.decimal
def generate_triples(user_id: str, resolved_excel: list, sparql_endpoint: str, filetype: str = 'ttl', created_by: str = 't2wml') -> str: """ This function uses ETK to generate the RDF triples :param user_id: :param resolved_excel: :param sparql_endpoint: :param filetype: :return: """ # initialize kg_schema = KGSchema() kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl') etk = ETK(kg_schema=kg_schema, modules=ETKModule) doc = etk.create_document({}, doc_id="http://isi.edu/default-ns/projects") property_type_map = property_type_dict # bind prefixes doc.kg.bind('wikibase', 'http://wikiba.se/ontology#') doc.kg.bind('wd', 'http://www.wikidata.org/entity/') doc.kg.bind('wdt', 'http://www.wikidata.org/prop/direct/') doc.kg.bind('wdtn', 'http://www.wikidata.org/prop/direct-normalized/') doc.kg.bind('wdno', 'http://www.wikidata.org/prop/novalue/') doc.kg.bind('wds', 'http://www.wikidata.org/entity/statement/') doc.kg.bind('wdv', 'http://www.wikidata.org/value/') doc.kg.bind('wdref', 'http://www.wikidata.org/reference/') doc.kg.bind('p', 'http://www.wikidata.org/prop/') doc.kg.bind('pr', 'http://www.wikidata.org/prop/reference/') doc.kg.bind('prv', 'http://www.wikidata.org/prop/reference/value/') doc.kg.bind('prn', 'http://www.wikidata.org/prop/reference/value-normalized/') doc.kg.bind('ps', 'http://www.wikidata.org/prop/statement/') doc.kg.bind('psv', 'http://www.wikidata.org/prop/statement/value/') doc.kg.bind('psn', 'http://www.wikidata.org/prop/statement/value-normalized/') doc.kg.bind('pq', 'http://www.wikidata.org/prop/qualifier/') doc.kg.bind('pqv', 'http://www.wikidata.org/prop/qualifier/value/') doc.kg.bind('pqn', 'http://www.wikidata.org/prop/qualifier/value-normalized/') doc.kg.bind('skos', 'http://www.w3.org/2004/02/skos/core#') doc.kg.bind('prov', 'http://www.w3.org/ns/prov#') doc.kg.bind('schema', 'http://schema.org/') # property_type_cache = {} is_error = False for i in resolved_excel: _item = i["statement"]["item"] if _item is not None: item = WDItem(_item, creator='http://www.isi.edu/{}'.format(created_by)) try: property_type = property_type_map[i["statement"]["property"]] except KeyError: property_type = get_property_type(i["statement"]["property"], sparql_endpoint) if property_type != "Property Not Found" and i["statement"][ "property"] not in property_type_map: property_type_map[i["statement"] ["property"]] = property_type if property_type == "WikibaseItem": value = Item(str(i["statement"]["value"])) elif property_type == "WikibaseProperty": value = Property(i["statement"]["value"]) elif property_type == "String": value = StringValue(i["statement"]["value"]) elif property_type == "Quantity": _value = i["statement"]["value"] _value = str(_value).replace(',', '') value = QuantityValue(_value) elif property_type == "Time": value = TimeValue( str(i["statement"]["value"]), Item(i["statement"]["calendar"]), translate_precision_to_integer( i["statement"]["precision"]), i["statement"]["time_zone"]) elif property_type == "Url": value = URLValue(i["statement"]["value"]) elif property_type == "Monolingualtext": value = MonolingualText(i["statement"]["value"], i["statement"]["lang"]) elif property_type == "ExternalId": value = ExternalIdentifier(i["statement"]["value"]) elif property_type == "GlobeCoordinate": value = GlobeCoordinate(i["statement"]["latitude"], i["statement"]["longitude"], i["statement"]["precision"]) elif property_type == "Property Not Found": is_error = True break s = item.add_statement(i["statement"]["property"], value) doc.kg.add_subject(item) if "qualifier" in 
i["statement"]: for j in i["statement"]["qualifier"]: try: property_type = property_type_map[j["property"]] except KeyError: property_type = get_property_type( j["property"], sparql_endpoint) if property_type != "Property Not Found" and i[ "statement"][ "property"] not in property_type_map: property_type_map[i["statement"] ["property"]] = property_type if property_type == "WikibaseItem": value = Item(str(j["value"])) elif property_type == "WikibaseProperty": value = Property(j["value"]) elif property_type == "String": value = StringValue(j["value"]) elif property_type == "Quantity": value = QuantityValue(j["value"]) elif property_type == "Time": value = TimeValue(str(j["value"]), Item(j["calendar"]), j["precision"], j["time_zone"]) elif property_type == "Url": value = URLValue(j["value"]) elif property_type == "Monolingualtext": value = MonolingualText(j["value"], j["lang"]) elif property_type == "ExternalId": value = ExternalIdentifier(j["value"]) elif property_type == "GlobeCoordinate": value = GlobeCoordinate(j["latitude"], j["longitude"], j["precision"]) elif property_type == "Property Not Found": is_error = True if value is None: continue else: s.add_qualifier(j["property"], value) doc.kg.add_subject(s) if not is_error: data = doc.kg.serialize(filetype) else: # data = "Property Not Found" raise Exception('data exception while generating triples') return data
if __name__ == "__main__":
    with open('date_ground_truth.txt', 'r') as f:
        texts = f.readlines()

    etk = ETK(modules=DateETKModule)
    res = []
    for text in texts:
        text = text.strip()
        if text and text[0] != '#':
            temp = text.split('|')
            if len(temp) == 3:
                input_text, expected, format = temp
                doc = etk.create_document({
                    'input': input_text,
                    'expected': expected,
                    'format': format
                })
                docs = etk.process_ems(doc)
                res.append(docs[0].value)

    for r in res:
        extracted = r['extracted_date'][0] if 'extracted_date' in r and r['extracted_date'] else ' '
        expected = r['expected'].replace('@today', datetime.datetime.now().isoformat()[:10])
        print('extracted: ', extracted, '\texpected:', expected)
    def model_schema(self):
        # read data
        data = self.read_data(self.data['schema'])

        # initialize KGSchema
        custom_dict, ns_dict = {}, {'wd': 'http://www.wikidata.org/entity/'}
        for each in data['prefix']:
            for k, v in each.items():
                custom_dict[k] = v
                if k != 'wd':
                    ns_dict[k] = v + '/entity'
        kg_schema = KGSchema()
        kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
        etk = ETK(kg_schema=kg_schema, modules=ETKModule)
        doc = etk.create_document({}, doc_id='http://isi.edu/default-ns/projects')

        # bind prefix
        doc = create_custom_prefix(doc, custom_dict)

        type_map = {
            'quantity': Datatype.QuantityValue,
            'url': URLValue,
            'item': Datatype.Item,
            'time': Datatype.TimeValue,
            'string': Datatype.StringValue,
            'text': Datatype.MonolingualText
        }

        # model schema
        for k, v in data.items():
            if ':' in k:
                k = k.split(':')
                if 'Q' in k[1]:
                    p = WDItem(k[1], namespace=k[0], creator=':datamart')
                elif 'P' in k[1]:
                    p = WDProperty(k[1], type_map[v['type']], namespace=k[0], creator=':datamart')
                else:
                    raise Exception('There is no P/Q information.')
                    return None

                for lang, value in v['description'].items():
                    for val in value:
                        p.add_description(val, lang=lang)
                for lang, value in v['label'].items():
                    for val in value:
                        p.add_label(val, lang=lang)
                for node, value in v['statements'].items():
                    ns = node.split(':')[0] if ':' in node else 'wd'
                    for val in value:
                        prop_type = self.get_property_type(node, ns_dict[ns])
                        if prop_type == 'WikibaseItem':
                            v = Item(str(val['value']))
                        elif prop_type == 'WikibaseProperty':
                            v = Property(val['value'])
                        elif prop_type == 'String':
                            v = StringValue(val['value'])
                        elif prop_type == 'Quantity':
                            v = QuantityValue(val['value'])
                        elif prop_type == 'Url':
                            v = URLValue(val['value'])
                        elif prop_type == 'Monolingualtext':
                            v = MonolingualText(val['value'], val['lang'])
                        p.add_statement(node, v)
                doc.kg.add_subject(p)

        return doc
def generate_triples(user_id: str, resolved_excel: list, sparql_endpoint: str, filetype: str = 'ttl') -> str:
    """
    This function uses ETK to generate the RDF triples
    :param user_id: identifier of the user whose results are being generated
    :param resolved_excel: list of resolved statements (item, property, value, optional qualifiers)
    :param sparql_endpoint: SPARQL endpoint used to look up property types that are not cached
    :param filetype: serialization format passed to doc.kg.serialize(), e.g. 'ttl'
    :return: the serialized knowledge graph, or "Property Not Found" on error
    """
    # initialize
    kg_schema = KGSchema()
    kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
    etk = ETK(kg_schema=kg_schema, modules=ETKModule)
    doc = etk.create_document({}, doc_id="http://isi.edu/default-ns/projects")

    # bind prefixes
    doc.kg.bind('wikibase', 'http://wikiba.se/ontology#')
    doc.kg.bind('wd', 'http://www.wikidata.org/entity/')
    doc.kg.bind('wdt', 'http://www.wikidata.org/prop/direct/')
    doc.kg.bind('wdtn', 'http://www.wikidata.org/prop/direct-normalized/')
    doc.kg.bind('wdno', 'http://www.wikidata.org/prop/novalue/')
    doc.kg.bind('wds', 'http://www.wikidata.org/entity/statement/')
    doc.kg.bind('wdv', 'http://www.wikidata.org/value/')
    doc.kg.bind('wdref', 'http://www.wikidata.org/reference/')
    doc.kg.bind('p', 'http://www.wikidata.org/prop/')
    doc.kg.bind('pr', 'http://www.wikidata.org/prop/reference/')
    doc.kg.bind('prv', 'http://www.wikidata.org/prop/reference/value/')
    doc.kg.bind('prn', 'http://www.wikidata.org/prop/reference/value-normalized/')
    doc.kg.bind('ps', 'http://www.wikidata.org/prop/statement/')
    doc.kg.bind('psv', 'http://www.wikidata.org/prop/statement/value/')
    doc.kg.bind('psn', 'http://www.wikidata.org/prop/statement/value-normalized/')
    doc.kg.bind('pq', 'http://www.wikidata.org/prop/qualifier/')
    doc.kg.bind('pqv', 'http://www.wikidata.org/prop/qualifier/value/')
    doc.kg.bind('pqn', 'http://www.wikidata.org/prop/qualifier/value-normalized/')
    doc.kg.bind('skos', 'http://www.w3.org/2004/02/skos/core#')
    doc.kg.bind('prov', 'http://www.w3.org/ns/prov#')
    doc.kg.bind('schema', 'http://schema.org/')

    # property_type_cache = {}
    is_error = False
    for i in resolved_excel:
        item = WDItem(i["statement"]["item"], creator='http://www.isi.edu/t2wml')

        # look up the property type, caching results in property_type_map
        try:
            property_type = property_type_map[i["statement"]["property"]]
        except KeyError:
            property_type = get_property_type(i["statement"]["property"], sparql_endpoint)
            property_type_map[i["statement"]["property"]] = property_type

        if property_type == "WikibaseItem":
            value = Item(str(i["statement"]["value"]))
        elif property_type == "WikibaseProperty":
            value = Property(i["statement"]["value"])
        elif property_type == "String":
            value = StringValue(i["statement"]["value"])
        elif property_type == "Quantity":
            value = QuantityValue(i["statement"]["value"])
        elif property_type == "Time":
            value = TimeValue(str(i["statement"]["value"]),
                              Item(i["statement"]["calendar"]),
                              translate_precision_to_integer(i["statement"]["precision"]),
                              i["statement"]["time_zone"])
        elif property_type == "Url":
            value = URLValue(i["statement"]["value"])
        elif property_type == "Monolingualtext":
            value = MonolingualText(i["statement"]["value"], i["statement"]["lang"])
        elif property_type == "ExternalId":
            value = ExternalIdentifier(i["statement"]["value"])
        elif property_type == "GlobeCoordinate":
            value = GlobeCoordinate(i["statement"]["latitude"],
                                    i["statement"]["longitude"],
                                    i["statement"]["precision"])
        elif property_type == "Property Not Found":
            is_error = True
            break

        s = item.add_statement(i["statement"]["property"], value)
        doc.kg.add_subject(item)

        if "qualifier" in i["statement"]:
            for j in i["statement"]["qualifier"]:
                try:
                    property_type = property_type_map[j["property"]]
                except KeyError:
                    property_type = get_property_type(j["property"], sparql_endpoint)
                    property_type_map[j["property"]] = property_type

                if property_type == "WikibaseItem":
                    value = Item(str(j["value"]))
                elif property_type == "WikibaseProperty":
                    value = Property(j["value"])
                elif property_type == "String":
                    value = StringValue(j["value"])
                elif property_type == "Quantity":
                    value = QuantityValue(j["value"])
                elif property_type == "Time":
                    value = TimeValue(str(j["value"]), Item(j["calendar"]), j["precision"], j["time_zone"])
                elif property_type == "Url":
                    value = URLValue(j["value"])
                elif property_type == "Monolingualtext":
                    value = MonolingualText(j["value"], j["lang"])
                elif property_type == "ExternalId":
                    value = ExternalIdentifier(j["value"])
                elif property_type == "GlobeCoordinate":
                    value = GlobeCoordinate(j["latitude"], j["longitude"], j["precision"])
                elif property_type == "Property Not Found":
                    is_error = True
                    break  # stop processing qualifiers for an unknown property
                s.add_qualifier(j["property"], value)
            doc.kg.add_subject(s)

    if not is_error:
        data = doc.kg.serialize(filetype)
    else:
        data = "Property Not Found"

    # os.makedirs(Path.cwd() / "new_properties", exist_ok=True)
    # results_file_name = user_id + "_results.ttl"
    # changes_file_name = user_id + "_changes.tsv"
    # with open(Path(app.config['downloads']) / results_file_name, "w") as fp:
    #     fp.write(data)
    # with open(Path(app.config['downloads']) / changes_file_name, "w") as fp:
    #     serialize_change_record(fp)
    return data
parser.add_option("-o", "--output_file", action="store", type="string", dest="output_file") (c_options, args) = parser.parse_args() input_file = c_options.input_file output_file = c_options.output_file f = open(input_file, mode='r', encoding='utf-8') o = open(output_file, mode='w', encoding='utf-8') l = open('{}.log'.format(output_file), mode='w', encoding='utf-8') print('Starting to process file: {}'.format(input_file)) count = 0 sum = 0 for line in f: if count == 10000: sum += count l.write('Processed {} lines'.format(str(sum))) l.write('\n') count = 0 json_x = json.loads(line) doc = etk.create_document(json_x) doc.doc_id = json_x['doc_id'] sentences = etk.process_ems(doc) for s in sentences: o.write(json.dumps(s.value)) o.write('\n') count += 1
    @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
    @prefix skos: <http://www.w3.org/2004/02/skos/core#> .
    @prefix schema: <http://schema.org/> .

    :Event a owl:Class ; .
    :Entity a owl:Class ; .
    :Organization a owl:Class ; .
    :MOVEMENT_TRANSPORT a owl:Class ; .
    :GeopoliticalEntity a owl:Class ; .

    skos:prefLabel a owl:DatatypeProperty ;
        schema:domainIncludes :Entity, :Event ;
        rdfs:range xsd:string ;
        .
    :conflict_attack_place a owl:ObjectProperty ;
        schema:domainIncludes :Entity, :Event ;
        schema:rangeIncludes :GeopoliticalEntity ;
        .
    '''
ontology = Ontology(ontology_content, validation=False, include_undefined_class=True, quiet=True)
kg_schema = KGSchema(ontology.merge_with_master_config(dict()))
etk = ETK(modules=ExampleETKModule, kg_schema=kg_schema, ontology=ontology)

input_data = {'doc_id': '1', 'data': json.loads(sample_input)}
doc = etk.create_document(input_data)
docs = etk.process_ems(doc)

kgs = [json.dumps(doc.kg.value) for doc in docs[1:]]
with open('output.jsonl', 'w') as f:
    f.write('\n'.join(kgs))
with open('output.nt', 'w') as f:
    f.writelines(map(rdf_generation, kgs))
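# rdf_generation() is defined elsewhere in the original script. A rough sketch
# of such a helper (an assumption, not the original implementation): parse each
# JSON-LD knowledge graph with rdflib and re-serialize it as N-Triples.
# JSON-LD parsing is built into rdflib 6+; older versions need rdflib-jsonld.
from rdflib import Graph

def rdf_generation(jsonld_str: str) -> str:
    g = Graph()
    g.parse(data=jsonld_str, format='json-ld')
    return g.serialize(format='nt')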
import os, sys, json, codecs

sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))

from etk.etk import ETK
from etk.extractors.html_content_extractor import HTMLContentExtractor, Strategy
from etk.extractors.html_metadata_extractor import HTMLMetadataExtractor
from etk.extractors.inferlink_extractor import InferlinkExtractor, InferlinkRuleSet

sample_html = json.load(codecs.open('sample_html.json', 'r'))  # read sample file from disk

etk = ETK()
doc = etk.create_document(sample_html, mime_type="text/html", url="http://ex.com/123")

metadata_extractor = HTMLMetadataExtractor()
content_extractor = HTMLContentExtractor()
landmark_extractor = InferlinkExtractor(
    InferlinkRuleSet(
        InferlinkRuleSet.load_rules_file('sample_inferlink_rules.json')))

root = doc.select_segments("$")[0]
raw = doc.select_segments("$.raw_content")[0]

# root.store_extractions(doc.invoke_extractor(metadata_extractor, extract_title=True), "title")
# root.store_extractions(doc.invoke_extractor(metadata_extractor, extract_meta=True), "metadata")

root.store_extractions(
    doc.invoke_extractor(content_extractor, raw, strategy=Strategy.ALL_TEXT),
    "etk2_text")
root.store_extractions(
            json_path='$.factoid.metadata.file_name')
            extracted_doc.kg.add_value('provenance_sheet', json_path='$.factoid.metadata.sheet_name')
            extracted_doc.kg.add_value('value', json_path='$.factoid.value')
            extracted_doc.kg.add_value('type', json_path='$.factoid.type')
            extracted_doc.kg.add_value('identifier_key', json_path='$.factoid.identifier_key')
            extracted_doc.kg.add_value('identifier_value', json_path='$.factoid.identifier_value')
            extracted_docs.append(extracted_doc)
        return extracted_docs


if __name__ == "__main__":
    # elicit_alignment/m9/datasets/orig/structured/west_african_food_composition/example/
    dir_path = sys.argv[1]
    file_name = 'West African Food Composition.xls'
    input_path = os.path.join(dir_path, file_name)
    output_path = os.path.join(dir_path, file_name + '.jl')

    kg_schema = KGSchema(json.load(open('master_config.json')))
    etk = ETK(modules=ElicitWestAmericanFoodModule, kg_schema=kg_schema)
    doc = etk.create_document({'file_path': input_path})
    docs = etk.process_ems(doc)

    with open(output_path, 'w') as f:
        for i in range(1, len(docs)):  # ignore the first document
            f.write(json.dumps(docs[i].value) + '\n')