def reset_etk_doc(self, doc_id: str = "http://isi.edu/default-ns/projects"):
    """
    Reset the doc object and return it.

    Called at initialization and after outputting triples. A new KGSchema and
    ETK instance are created, a new empty document is stored on ``self.doc``
    and every prefix in ``wiki_namespaces`` is bound on its knowledge graph.

    :param doc_id: identifier given to the newly created document.
    :return: the freshly created document (also available as ``self.doc``).
    """
    schema = KGSchema()
    schema.add_schema("@prefix : <http://isi.edu/> .", "ttl")
    self.etk = ETK(kg_schema=schema, modules=ETKModule)
    self.doc = self.etk.create_document({}, doc_id=doc_id)

    for prefix, default_uri in wiki_namespaces.items():
        # An entry in self.prefix_dict takes precedence over the default
        # namespace URI listed in wiki_namespaces.
        if prefix in self.prefix_dict:
            uri = self.prefix_dict[prefix]
        else:
            uri = default_uri
        self.doc.kg.bind(prefix, uri)

    # The documented contract is "reset ... and return it"; previously the
    # method fell off the end and returned None.
    return self.doc
def __init__(
    self,
    propFile: str,
    labelSet: str,
    aliasSet: str,
    descriptionSet: str,
    n: int,
    destFp: TextIO = sys.stdout,
):
    """
    Set up the generator state, the ETK instance and the working document.

    :param propFile: passed to ``__setPropTypes`` to build ``self.propTypes``.
    :param labelSet: passed to ``__setSets``; result stored as ``self.labelSet``.
    :param aliasSet: passed to ``__setSets``; result stored as ``self.aliasSet``.
    :param descriptionSet: passed to ``__setSets``; result stored as
        ``self.descriptionSet``.
    :param n: converted with ``int()`` and stored as ``self.n``.
    :param destFp: destination stream stored as ``self.fp``.
    """
    self.propTypes = self.__setPropTypes(propFile)
    parsed_sets = self.__setSets(labelSet, aliasSet, descriptionSet)
    self.labelSet, self.aliasSet, self.descriptionSet = parsed_sets

    # TODO handle standard output
    self.fp = destFp
    self.n = int(n)
    self.read = 0

    # Build the ETK instance first: the document and the prefix
    # serialization below are created after it, in the original order.
    schema = KGSchema()
    schema.add_schema("@prefix : <http://isi.edu/> .", "ttl")
    self.etk = ETK(kg_schema=schema, modules=ETKModule)
    self.doc = self.__setDoc()
    self.__serialize_prefix()
def model_schema(self):
    """
    Build an ETK document from the schema description.

    The schema is loaded with ``self.read_data(self.data['schema'])``. Its
    ``'prefix'`` entry supplies the custom prefixes; every other key that
    contains ``':'`` is read as ``<namespace>:<id>`` and turned into a
    ``WDItem`` or ``WDProperty`` with its descriptions, labels and statements.

    :return: the populated document.
    :raises Exception: if an identifier contains neither ``'Q'`` nor ``'P'``.
    :raises ValueError: if ``self.get_property_type`` reports a type this
        method cannot convert to a value object.
    """
    # read data
    schema_data = self.read_data(self.data['schema'])

    # Collect the prefixes: custom_dict is bound on the document, ns_dict
    # is what gets passed to get_property_type for each namespace.
    custom_dict, ns_dict = {}, {'wd': 'http://www.wikidata.org/entity/'}
    for prefix_group in schema_data['prefix']:
        for prefix, uri in prefix_group.items():
            custom_dict[prefix] = uri
            if prefix != 'wd':
                ns_dict[prefix] = uri + '/entity'

    # initialize KGSchema
    kg_schema = KGSchema()
    kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
    etk = ETK(kg_schema=kg_schema, modules=ETKModule)
    doc = etk.create_document({}, doc_id='http://isi.edu/default-ns/projects')

    # bind prefix
    doc = create_custom_prefix(doc, custom_dict)

    # NOTE(review): 'url' maps to the bare URLValue class while every other
    # entry uses a Datatype member -- confirm this is intended.
    type_map = {
        'quantity': Datatype.QuantityValue,
        'url': URLValue,
        'item': Datatype.Item,
        'time': Datatype.TimeValue,
        'string': Datatype.StringValue,
        'text': Datatype.MonolingualText
    }

    # model schema
    for key, spec in schema_data.items():
        if ':' not in key:
            continue
        parts = key.split(':')
        namespace, node_id = parts[0], parts[1]

        if 'Q' in node_id:
            subject = WDItem(node_id, namespace=namespace, creator=':datamart')
        elif 'P' in node_id:
            subject = WDProperty(node_id, type_map[spec['type']],
                                 namespace=namespace, creator=':datamart')
        else:
            raise Exception('There is no P/Q information.')

        for lang, texts in spec['description'].items():
            for text in texts:
                subject.add_description(text, lang=lang)

        for lang, texts in spec['label'].items():
            for text in texts:
                subject.add_label(text, lang=lang)

        for node, values in spec['statements'].items():
            # Statements without an explicit prefix are looked up under 'wd'.
            ns = node.split(':')[0] if ':' in node else 'wd'
            for val in values:
                prop_type = self.get_property_type(node, ns_dict[ns])
                if prop_type == 'WikibaseItem':
                    value = Item(str(val['value']))
                elif prop_type == 'WikibaseProperty':
                    value = Property(val['value'])
                elif prop_type == 'String':
                    value = StringValue(val['value'])
                elif prop_type == 'Quantity':
                    value = QuantityValue(val['value'])
                elif prop_type == 'Url':
                    value = URLValue(val['value'])
                elif prop_type == 'Monolingualtext':
                    value = MonolingualText(val['value'], val['lang'])
                else:
                    # Previously an unknown type silently re-used whatever
                    # the shared variable held from an earlier iteration.
                    raise ValueError(
                        'Unsupported property type {!r} for statement {!r}'.format(
                            prop_type, node))
                subject.add_statement(node, value)

        doc.kg.add_subject(subject)

    return doc
def model_statement(self):
    """
    Build an ETK document describing the wikifier, mapping file and dataset inputs.

    For every entry of ``self.data['inputs']`` other than ``'metadata'`` whose
    ``'existed'`` flag is falsy, a ``WDItem`` is created under ``self.ns`` and
    added to the document's knowledge graph.

    :return: the populated document.
    """
    # initialize KGSchema
    kg_schema = KGSchema()
    kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
    etk = ETK(kg_schema=kg_schema, modules=ETKModule)
    doc = etk.create_document({}, doc_id='http://isi.edu/default-ns/projects')

    # bind prefix
    doc = create_custom_prefix(doc, custom_dict={self.ns: self.uri})

    # extract files
    self.extract_files()

    # model statement
    inputs = self.data['inputs']
    for k, v in inputs.items():
        if k == 'metadata':
            continue

        # Reset per entry so that an entry which builds nothing (already
        # existing, or an unrecognized key) never adds a stale or undefined
        # subject to the graph.
        q = None

        # construct wikifier instance
        if k == 'wikifier' and not v['existed']:
            q = WDItem(v['qnode'], namespace=self.ns, creator=':datamart')
            q.add_label('A wikifier file for ' + inputs['dataset']['content']['filename'], lang='en')
            q.add_statement('P31', Item('SDQ1001', namespace=self.ns))  # an instance of Wikifier
            q.add_statement('P127', Item('SDQ1003', namespace=self.ns))  # belongs to
            q.add_statement('SDP3003', StringValue(v['content']), namespace=self.ns)  # hasFileContent
            q.add_statement('SDP3004', StringValue(v['hashcode']), namespace=self.ns)  # hashValue

        # construct mapping_file instance
        elif k == 'mappingFile' and not v['existed']:
            q = WDItem(v['qnode'], namespace=self.ns, creator=':datamart')
            q.add_label('A mapping file for ' + inputs['dataset']['content']['filename'], lang='en')
            q.add_statement('P31', Item('SDQ1002', namespace=self.ns))  # an instance of MappingFile
            q.add_statement('P170', StringValue('T2WML'))
            q.add_statement('P127', Item('SDQ1003', namespace=self.ns))
            q.add_statement('SDP3003', StringValue(json.dumps(v['content'])), namespace=self.ns)
            q.add_statement('SDP3004', StringValue(v['hashcode']), namespace=self.ns)

        # construct dataset instance
        elif k == 'dataset' and not v['existed']:
            content = v['content']
            q = WDItem(v['qnode'], namespace=self.ns, creator=':datamart')
            q.add_label(content['title'], lang='en')
            q.add_description(content['description'], lang='en')
            q.add_statement('P31', Item('Q1172284'))  # an instance of Dataset
            q.add_statement('SDP3001', Item(inputs['wikifier']['qnode'], namespace=self.ns),
                            namespace=self.ns)  # a wikifier file
            q.add_statement('SDP3002', Item(inputs['mappingFile']['qnode'], namespace=self.ns),
                            namespace=self.ns)  # a mapping file
            q.add_statement('P1476', StringValue(content['title']))  # title
            q.add_statement('P921', StringValue(content['description']))  # described
            q.add_statement('P127', Item('SDQ1003', namespace=self.ns))  # belongs to
            q.add_statement('SDP2004', StringValue(', '.join(content['keywords'])),
                            namespace=self.ns)  # keywords
            q.add_statement('SDP3004', StringValue(v['hashcode']), namespace=self.ns)

            if self.data['storeColumnValue']:
                for column in content['variable_measured']:
                    statement = q.add_statement(
                        'SDP2005', StringValue(column['column_name']),
                        namespace=self.ns)  # variable measured
                    statement.add_qualifier(
                        'SDP2006', StringValue(column['values_of_a_column']),
                        namespace=self.ns)  # the values of a column
                    statement.add_qualifier(
                        'SDP2007', Item(column['data_structure_type']),
                        namespace=self.ns)  # data structure type
                    statement.add_qualifier(
                        'SDP2008', URLValue(column['semantic_type_identifier']),
                        namespace=self.ns)  # semantic type
                    # NOTE(review): here `namespace` is passed to QuantityValue,
                    # not to add_qualifier as in the calls above -- confirm
                    # which one was intended.
                    statement.add_qualifier(
                        'P1545', QuantityValue(
                            column['column_index'], namespace=self.ns))  # column index

        if q is not None:
            doc.kg.add_subject(q)

    return doc
douglas.add_statement('P2048', QuantityValue(1.96, unit=Item('Q11573'))) # official website # statement = douglas.add_statement('P856', URLValue('http://douglasadams.com/')) statement = douglas.add_truthy_statement( 'P856', URLValue('http://douglasadams.com/')) statement.add_qualifier('P407', Item('Q1860')) # Freebase ID douglas.add_statement( 'P646', ExternalIdentifier('/m/0282x', URLValue('http://g.co/kg/m/0282x'))) doc.kg.add_subject(douglas) return list() if __name__ == "__main__": kg_schema = KGSchema() kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl') etk = ETK(kg_schema=kg_schema, modules=ExampleETKModule) doc = etk.create_document({}, doc_id="http://isi.edu/default-ns/projects") revise(True) docs = etk.process_ems(doc) print(docs[0].kg.serialize('ttl'))
def _init_etk():
    """
    Create an ETK document with the Wikidata prefixes bound and the custom
    'C20xx' property definitions added to its knowledge graph.

    :return: the prepared document.
    """
    # initialize for etk
    schema = KGSchema()
    schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
    etk = ETK(kg_schema=schema, modules=ETKModule)
    doc = etk.create_document({}, doc_id="http://isi.edu/default-ns/projects")

    # bind prefixes (kept in the original binding order)
    namespace_bindings = (
        ('wikibase', 'http://wikiba.se/ontology#'),
        ('wd', 'http://www.wikidata.org/entity/'),
        ('wdt', 'http://www.wikidata.org/prop/direct/'),
        ('wdtn', 'http://www.wikidata.org/prop/direct-normalized/'),
        ('wdno', 'http://www.wikidata.org/prop/novalue/'),
        ('wds', 'http://www.wikidata.org/entity/statement/'),
        ('wdv', 'http://www.wikidata.org/value/'),
        ('wdref', 'http://www.wikidata.org/reference/'),
        ('p', 'http://www.wikidata.org/prop/'),
        ('pr', 'http://www.wikidata.org/prop/reference/'),
        ('prv', 'http://www.wikidata.org/prop/reference/value/'),
        ('prn', 'http://www.wikidata.org/prop/reference/value-normalized/'),
        ('ps', 'http://www.wikidata.org/prop/statement/'),
        ('psv', 'http://www.wikidata.org/prop/statement/value/'),
        ('psn', 'http://www.wikidata.org/prop/statement/value-normalized/'),
        ('pq', 'http://www.wikidata.org/prop/qualifier/'),
        ('pqv', 'http://www.wikidata.org/prop/qualifier/value/'),
        ('pqn', 'http://www.wikidata.org/prop/qualifier/value-normalized/'),
        ('skos', 'http://www.w3.org/2004/02/skos/core#'),
        ('prov', 'http://www.w3.org/ns/prov#'),
        ('schema', 'http://schema.org/'),
    )
    for prefix, uri in namespace_bindings:
        doc.kg.bind(prefix, uri)

    # Definitions of the custom nodes, one row per property:
    # (id, datatype, English label, English description, statements) where
    # each statement is (property id, value class, constructor argument).
    instance_of_q18616576 = ('P31', Item, 'Q18616576')
    property_rows = (
        ('C2001', Datatype.MonolingualText, 'datamart identifier',
         'identifier of a dataset in the Datamart system',
         (('P31', Item, 'Q19847637'), ('P1629', Item, 'Q1172284'))),
        ('C2004', Datatype.StringValue, 'keywords',
         'keywords associated with an item to facilitate finding the item using text search',
         (instance_of_q18616576,)),
        ('C2005', Datatype.StringValue, 'variable measured',
         'the variables measured in a dataset',
         (instance_of_q18616576,
          ('P1628', URLValue, 'http://schema.org/variableMeasured'))),
        ('C2006', Datatype.StringValue, 'values',
         'the values of a variable represented as a text document',
         (instance_of_q18616576,)),
        ('C2007', Datatype.Item, 'data type',
         'the data type used to represent the values of a variable, integer (Q729138), Boolean (Q520777), '
         'Real (Q4385701), String (Q184754), Categorical (Q2285707)',
         (instance_of_q18616576,)),
        ('C2008', Datatype.URLValue, 'semantic type',
         'a URL that identifies the semantic type of a variable in a dataset',
         (instance_of_q18616576,)),
        ('C2010', Datatype.StringValue, 'extra information',
         'some extra information that may needed for this dataset',
         ()),
        ('C2011', Datatype.TimeValue, 'start date',
         'The earlist time exist in this dataset, only valid when there exists time format data in this dataset',
         (instance_of_q18616576,)),
        ('C2012', Datatype.TimeValue, 'end date',
         'The latest time exist in this dataset, only valid when there exists time format data in this dataset',
         (instance_of_q18616576,)),
        ('C2013', Datatype.QuantityValue, 'time granularity',
         'time granularity in a dataset',
         (instance_of_q18616576,)),
        ('C2014', Datatype.StringValue, 'uploader information',
         'information about who uploaded and when uploaded',
         ()),
    )
    for prop_id, datatype, label, description, statements in property_rows:
        prop = WDProperty(prop_id, datatype)
        prop.add_label(label, lang='en')
        prop.add_description(description, lang='en')
        for statement_pid, value_cls, value_arg in statements:
            # Value objects are built right before use, as in the
            # hand-written sequence this table replaces.
            prop.add_statement(statement_pid, value_cls(value_arg))
        doc.kg.add_subject(prop)

    return doc
def __init__(self, query_server=None, update_server=None):
    """
    Prepare the working document and query the server for the starting source id.

    :param query_server: SPARQL query endpoint. Used only when
        ``update_server`` is given as well; otherwise both endpoints fall
        back to ``DATAMRT_SERVER``.
    :param update_server: SPARQL update endpoint (see ``query_server``).
    :raises ValueError: if the initial SPARQL query cannot be executed or its
        response lacks the expected ``results``/``bindings`` structure.
    """
    self.punctuation_table = str.maketrans(
        dict.fromkeys(string.punctuation))
    if query_server and update_server:
        self.query_server = query_server
        self.update_server = update_server
    else:
        self.query_server = DATAMRT_SERVER
        self.update_server = DATAMRT_SERVER

    # initialize
    kg_schema = KGSchema()
    kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
    etk = ETK(kg_schema=kg_schema, modules=ETKModule)
    self.doc = etk.create_document(
        {}, doc_id="http://isi.edu/default-ns/projects")

    # bind prefixes (kept in the original binding order)
    namespace_bindings = (
        ('wikibase', 'http://wikiba.se/ontology#'),
        ('wd', 'http://www.wikidata.org/entity/'),
        ('wdt', 'http://www.wikidata.org/prop/direct/'),
        ('wdtn', 'http://www.wikidata.org/prop/direct-normalized/'),
        ('wdno', 'http://www.wikidata.org/prop/novalue/'),
        ('wds', 'http://www.wikidata.org/entity/statement/'),
        ('wdv', 'http://www.wikidata.org/value/'),
        ('wdref', 'http://www.wikidata.org/reference/'),
        ('p', 'http://www.wikidata.org/prop/'),
        ('pr', 'http://www.wikidata.org/prop/reference/'),
        ('prv', 'http://www.wikidata.org/prop/reference/value/'),
        ('prn', 'http://www.wikidata.org/prop/reference/value-normalized/'),
        ('ps', 'http://www.wikidata.org/prop/statement/'),
        ('psv', 'http://www.wikidata.org/prop/statement/value/'),
        ('psn', 'http://www.wikidata.org/prop/statement/value-normalized/'),
        ('pq', 'http://www.wikidata.org/prop/qualifier/'),
        ('pqv', 'http://www.wikidata.org/prop/qualifier/value/'),
        ('pqn', 'http://www.wikidata.org/prop/qualifier/value-normalized/'),
        ('skos', 'http://www.w3.org/2004/02/skos/core#'),
        ('prov', 'http://www.w3.org/ns/prov#'),
        ('schema', 'http://schema.org/'),
    )
    for prefix, uri in namespace_bindings:
        self.doc.kg.bind(prefix, uri)

    # Definitions of the custom nodes, one row per property:
    # (id, datatype, English label, English description, statements) where
    # each statement is (property id, value class, constructor argument).
    # The labels of C2001 and C2004 used to be swapped, contradicting the
    # descriptions they are paired with; each label now matches its
    # description.
    instance_of_q18616576 = ('P31', Item, 'Q18616576')
    property_rows = (
        ('C2001', Datatype.MonolingualText, 'datamart identifier',
         'identifier of a dataset in the Datamart system',
         (('P31', Item, 'Q19847637'), ('P1629', Item, 'Q1172284'))),
        ('C2004', Datatype.StringValue, 'keywords',
         'keywords associated with an item to facilitate finding the item using text search',
         (instance_of_q18616576,)),
        ('C2005', Datatype.StringValue, 'variable measured',
         'the variables measured in a dataset',
         (instance_of_q18616576,
          ('P1628', URLValue, 'http://schema.org/variableMeasured'))),
        ('C2006', Datatype.StringValue, 'values',
         'the values of a variable represented as a text document',
         (instance_of_q18616576,)),
        ('C2007', Datatype.Item, 'data type',
         'the data type used to represent the values of a variable, integer (Q729138), Boolean (Q520777), '
         'Real (Q4385701), String (Q184754), Categorical (Q2285707)',
         (instance_of_q18616576,)),
        ('C2008', Datatype.URLValue, 'semantic type',
         'a URL that identifies the semantic type of a variable in a dataset',
         (instance_of_q18616576,)),
    )
    for prop_id, datatype, label, description, statements in property_rows:
        p = WDProperty(prop_id, datatype)
        p.add_label(label, lang='en')
        p.add_description(description, lang='en')
        for statement_pid, value_cls, value_arg in statements:
            p.add_statement(statement_pid, value_cls(value_arg))
        self.doc.kg.add_subject(p)

    # get the starting source id
    # The query text is assembled from adjacent literals so that its
    # whitespace stays exactly as it was.
    sparql_query = (
        " prefix wdt: <http://www.wikidata.org/prop/direct/>"
        " prefix wd: <http://www.wikidata.org/entity/>"
        " prefix wikibase: <http://wikiba.se/ontology#>"
        " PREFIX p: <http://www.wikidata.org/prop/>"
        " PREFIX pqv: <http://www.wikidata.org/prop/qualifier/value/>"
        " PREFIX pq: \n"
        "<http://www.wikidata.org/prop/qualifier/>"
        " PREFIX ps: <http://www.wikidata.org/prop/statement/>"
        " prefix bd: <http://www.bigdata.com/rdf#>"
        " prefix bds: <http://www.bigdata.com/rdf/search#>"
        " select ?x where { wd:Z00000 wdt:P1114 ?x . } "
    )
    try:
        sparql = SPARQLWrapper(self.query_server)
        sparql.setQuery(sparql_query)
        sparql.setReturnFormat(JSON)
        sparql.setMethod(POST)
        sparql.setRequestMethod(URLENCODED)
        results = sparql.query().convert()['results']['bindings']
    except Exception as error:
        # `except Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate; the original cause is chained for debugging.
        print("Getting query of wiki data failed!")
        raise ValueError(
            "Unable to initialize the datamart query service") from error

    if not results:
        print(
            "[WARNING] No starting source id found! Will initialize the starting source with D1000001"
        )
        self.resource_id = 1000001
    else:
        # NOTE(review): the queried value is not used -- both branches set
        # the same constant. Confirm whether the id returned in `results`
        # was meant to be read here.
        self.resource_id = 1000001
# Example ontology in Turtle: three classes (Software, Person, Developer),
# one datatype property (name) and one object property (developer).
# NOTE(review): the domain/range predicates are written as rdf:domain and
# rdf:range; RDF Schema defines them as rdfs:domain / rdfs:range -- confirm
# which form KGSchema expects before relying on them.
ontology = """ @prefix : <http://isi.edu/xij-rule-set#> . @prefix owl: <http://www.w3.org/2002/07/owl#> . @prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> . @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> . @prefix xsd: <http://www.w3.org/2001/XMLSchema#> . :Software a owl:Class ; rdfs:label "Software" . :Person a owl:Class ; rdfs:label "Person" . :Developer a owl:Class ; rdfs:label "Developer" . :name a owl:DatatypeProperty ; rdf:domain :Person ; rdf:range xsd:string . :developer a owl:ObjectProperty ; rdfs:label "developer" ; rdf:domain :Software ; rdf:range :Developer . """

# Load the ontology into a schema and create an ETK instance driven by
# ExampleETKModule.
kg_schema = KGSchema()
kg_schema.add_schema(ontology, 'ttl')
etk = ETK(kg_schema=kg_schema, modules=ExampleETKModule)
# `sample_input` is defined outside this excerpt.
doc = etk.create_document(sample_input, doc_id="http://isi.edu/default-ns/projects")
# Process the document and print the first resulting knowledge graph as Turtle.
docs = etk.process_ems(doc)
print(docs[0].kg.serialize('ttl'))