def _node_2_entity(self, node: str):
    """Map a node id (Qxxx or Pxxx) to the matching entity object.

    Ids present in ``self.prop_types`` are treated as properties and get
    their registered datatype; everything else becomes an item after
    illegal characters are replaced.
    """
    if node in self.prop_types:
        return WDProperty(node, self.prop_types[node])
    return WDItem(TripleGenerator.replace_illegal_string(node))
def genLabelTriple(self, node1: str, label: str, node2: str) -> bool:
    """Attach ``node2`` as a label to the entity named by ``node1``.

    :param node1: entity id (property if present in ``self.propTypes``,
        otherwise treated as an item)
    :param label: edge label (unused here; kept for the caller's uniform
        triple-handler signature)
    :param node2: label text, optionally suffixed with ``@lang``
    :return: always True
    """
    if node1 in self.propTypes:
        entity = WDProperty(node1.upper(), self.propTypes[node1])
    else:
        entity = WDItem(node1.upper())
    if "@" in node2:
        # Split on the LAST "@" — label text may itself contain "@",
        # and plain split("@") would raise ValueError on unpacking.
        node2, lang = node2.rsplit("@", 1)
        entity.add_label(node2.replace('"', "").replace("'", ""), lang=lang)
    else:
        entity.add_label(node2.replace('"', "").replace("'", ""), lang="en")  # default
    self.doc.kg.add_subject(entity)
    return True
def process_one_column(self, column_data: pd.Series, item: WDItem,
                       column_number: int,
                       semantic_type: typing.List[str]) -> bool:
    """
    Model one dataframe column as a 'variable measured' statement on *item*.

    :param column_data: a pandas series data
    :param item: the target q node aimed to add on
    :param column_number: the column number
    :param semantic_type: a list indicating the semantic type of this column
    :return: a bool indicating succeeded or not
    """
    try:
        all_data = set(column_data.tolist())
        all_value_str_set = set()
        for each in all_data:
            # set to lower characters, remove punctuation and split by the space
            words_processed = str(each).lower().translate(
                self.punctuation_table).split()
            all_value_str_set.update(words_processed)
        all_value_str = " ".join(all_value_str_set)

        statement = item.add_statement(
            'C2005', StringValue(column_data.name))  # variable measured
        statement.add_qualifier('C2006', StringValue(all_value_str))  # values

        if 'http://schema.org/Float' in semantic_type:
            semantic_type_url = 'http://schema.org/Float'
            data_type = "float"
        elif 'http://schema.org/Integer' in semantic_type:
            data_type = "int"
            semantic_type_url = 'http://schema.org/Integer'
        elif 'http://schema.org/Text' in semantic_type:
            data_type = "string"
            semantic_type_url = 'http://schema.org/Text'
        else:
            # Unknown semantic type. Previously this path fell through to
            # an unbound-variable NameError that the bare except swallowed;
            # report the failure explicitly instead.
            return False

        statement.add_qualifier('C2007',
                                Item(data_type))  # data structure type
        statement.add_qualifier(
            'C2008', URLValue(semantic_type_url))  # semantic type identifier
        statement.add_qualifier(
            'P1545', QuantityValue(column_number))  # column index
        return True
    except Exception:
        # Best-effort contract: any modelling failure is reported to the
        # caller as False rather than propagating (narrowed from bare except
        # so KeyboardInterrupt/SystemExit still escape).
        return False
def model_schema(self):
    """Build and return an ETK document modelling the schema file.

    Reads the schema definition referenced by ``self.data['schema']``,
    registers its prefixes, then creates one WDItem/WDProperty per
    namespaced key (``ns:Qxxx`` / ``ns:Pxxx``) with its descriptions,
    labels, and statements.

    :return: the populated ETK document
    :raises Exception: when a namespaced key is neither a Q nor a P node
    """
    # read data
    data = self.read_data(self.data['schema'])
    # initialize KGSchema
    custom_dict, ns_dict = {}, {'wd': 'http://www.wikidata.org/entity/'}
    for each in data['prefix']:
        for k, v in each.items():
            custom_dict[k] = v
            if k != 'wd':
                ns_dict[k] = v + '/entity'
    kg_schema = KGSchema()
    kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
    etk = ETK(kg_schema=kg_schema, modules=ETKModule)
    doc = etk.create_document({}, doc_id='http://isi.edu/default-ns/projects')
    # bind prefix
    doc = create_custom_prefix(doc, custom_dict)
    type_map = {
        'quantity': Datatype.QuantityValue,
        'url': URLValue,
        'item': Datatype.Item,
        'time': Datatype.TimeValue,
        'string': Datatype.StringValue,
        'text': Datatype.MonolingualText
    }
    # model schema
    for k, v in data.items():
        if ':' in k:
            k = k.split(':')
            if 'Q' in k[1]:
                p = WDItem(k[1], namespace=k[0], creator=':datamart')
            elif 'P' in k[1]:
                p = WDProperty(k[1], type_map[v['type']],
                               namespace=k[0], creator=':datamart')
            else:
                # (removed an unreachable "return None" that followed this raise)
                raise Exception('There is no P/Q information.')
            for lang, value in v['description'].items():
                for val in value:
                    p.add_description(val, lang=lang)
            for lang, value in v['label'].items():
                for val in value:
                    p.add_label(val, lang=lang)
            for node, value in v['statements'].items():
                ns = node.split(':')[0] if ':' in node else 'wd'
                for val in value:
                    prop_type = self.get_property_type(node, ns_dict[ns])
                    # Use a dedicated name: the original re-bound the outer
                    # loop variable `v` (the schema entry) here.
                    if prop_type == 'WikibaseItem':
                        stmt_value = Item(str(val['value']))
                    elif prop_type == 'WikibaseProperty':
                        stmt_value = Property(val['value'])
                    elif prop_type == 'String':
                        stmt_value = StringValue(val['value'])
                    elif prop_type == 'Quantity':
                        stmt_value = QuantityValue(val['value'])
                    elif prop_type == 'Url':
                        stmt_value = URLValue(val['value'])
                    elif prop_type == 'Monolingualtext':
                        stmt_value = MonolingualText(val['value'], val['lang'])
                    else:
                        # Unknown property type: skip instead of silently
                        # re-adding a stale value from a previous iteration.
                        continue
                    p.add_statement(node, stmt_value)
            doc.kg.add_subject(p)
    return doc
def model_statement(self):
    """Model the extracted input files as Wikidata-style items.

    Builds one WDItem each for the wikifier file, the T2WML mapping file
    and the dataset described in ``self.data['inputs']`` (skipping any
    that already exist), links them together, and returns the populated
    ETK document.

    :return: the ETK document holding the generated items
    """
    # initialize KGSchema
    kg_schema = KGSchema()
    kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
    etk = ETK(kg_schema=kg_schema, modules=ETKModule)
    doc = etk.create_document({}, doc_id='http://isi.edu/default-ns/projects')
    # bind prefix
    doc = create_custom_prefix(doc, custom_dict={self.ns: self.uri})
    # extract files
    self.extract_files()
    # model statement
    inputs = self.data['inputs']
    for k, v in inputs.items():
        if k != 'metadata':
            # construct wikifier instance
            if k == 'wikifier' and not v['existed']:
                q = WDItem(v['qnode'], namespace=self.ns, creator=':datamart')
                q.add_label('A wikifier file for ' + inputs['dataset']['content']['filename'], lang='en')
                q.add_statement('P31', Item('SDQ1001', namespace=self.ns))  # an instance of Wikifier
                q.add_statement('P127', Item('SDQ1003', namespace=self.ns))  # belongs to
                q.add_statement('SDP3003', StringValue(v['content']), namespace=self.ns)  # hasFileContent
                q.add_statement('SDP3004', StringValue(v['hashcode']), namespace=self.ns)  # hashValue
            # construct mapping_file instance
            elif k == 'mappingFile' and not v['existed']:
                q = WDItem(v['qnode'], namespace=self.ns, creator=':datamart')
                q.add_label('A mapping file for ' + inputs['dataset']['content']['filename'], lang='en')
                q.add_statement('P31', Item('SDQ1002', namespace=self.ns))  # an instance of MappingFile
                q.add_statement('P170', StringValue('T2WML'))
                q.add_statement('P127', Item('SDQ1003', namespace=self.ns))
                q.add_statement('SDP3003', StringValue(json.dumps(v['content'])), namespace=self.ns)
                q.add_statement('SDP3004', StringValue(v['hashcode']), namespace=self.ns)
            # construct dataset instance
            elif k == 'dataset' and not v['existed']:
                q = WDItem(v['qnode'], namespace=self.ns, creator=':datamart')
                q.add_label(v['content']['title'], lang='en')
                q.add_description(v['content']['description'], lang='en')
                q.add_statement('P31', Item('Q1172284'))  # an instance of Dataset
                q.add_statement('SDP3001', Item(inputs['wikifier']['qnode'], namespace=self.ns),
                                namespace=self.ns)  # a wikifier file
                q.add_statement('SDP3002', Item(inputs['mappingFile']['qnode'], namespace=self.ns),
                                namespace=self.ns)  # a mapping file
                q.add_statement('P1476', StringValue(v['content']['title']))  # title
                q.add_statement('P921', StringValue(v['content']['description']))  # described
                q.add_statement('P127', Item('SDQ1003', namespace=self.ns))  # belongs to
                q.add_statement('SDP2004', StringValue(', '.join(v['content']['keywords'])),
                                namespace=self.ns)  # keywords
                q.add_statement('SDP3004', StringValue(v['hashcode']), namespace=self.ns)
                if self.data['storeColumnValue']:
                    for data in v['content']['variable_measured']:
                        statement = q.add_statement(
                            'SDP2005', StringValue(data['column_name']),
                            namespace=self.ns)  # variable measured
                        statement.add_qualifier(
                            'SDP2006', StringValue(data['values_of_a_column']),
                            namespace=self.ns)  # the values of a column
                        statement.add_qualifier(
                            'SDP2007', Item(data['data_structure_type']),
                            namespace=self.ns)  # data structure type
                        statement.add_qualifier(
                            'SDP2008', URLValue(data['semantic_type_identifier']),
                            namespace=self.ns)  # semantic type
                        statement.add_qualifier(
                            'P1545', QuantityValue(data['column_index'],
                                                   namespace=self.ns))  # column index
            # NOTE(review): add_subject placement reconstructed from a
            # whitespace-mangled source; if an 'existed' entry is seen
            # before any q is built this raises NameError — confirm against
            # the original file.
            doc.kg.add_subject(q)
    return doc
def generate_triples(user_id: str, resolved_excel: list, sparql_endpoint: str, filetype: str = 'ttl', created_by: str = 't2wml') -> str:
    """
    This function uses ETK to generate the RDF triples

    :param user_id: id of the requesting user (not used in the body)
    :param resolved_excel: list of resolved statement dicts, each with an
        "statement" entry holding item/property/value (+ optional qualifier list)
    :param sparql_endpoint: endpoint queried for property types missing
        from the module-level property_type_dict cache
    :param filetype: serialization format passed to doc.kg.serialize
    :param created_by: suffix of the creator URI recorded on new items
    :return: the serialized knowledge graph
    :raises Exception: when any statement's property type cannot be found
    """
    # initialize
    kg_schema = KGSchema()
    kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
    etk = ETK(kg_schema=kg_schema, modules=ETKModule)
    doc = etk.create_document({}, doc_id="http://isi.edu/default-ns/projects")
    # module-level dict used as a shared cache of property -> wikibase type
    property_type_map = property_type_dict
    # bind prefixes
    doc.kg.bind('wikibase', 'http://wikiba.se/ontology#')
    doc.kg.bind('wd', 'http://www.wikidata.org/entity/')
    doc.kg.bind('wdt', 'http://www.wikidata.org/prop/direct/')
    doc.kg.bind('wdtn', 'http://www.wikidata.org/prop/direct-normalized/')
    doc.kg.bind('wdno', 'http://www.wikidata.org/prop/novalue/')
    doc.kg.bind('wds', 'http://www.wikidata.org/entity/statement/')
    doc.kg.bind('wdv', 'http://www.wikidata.org/value/')
    doc.kg.bind('wdref', 'http://www.wikidata.org/reference/')
    doc.kg.bind('p', 'http://www.wikidata.org/prop/')
    doc.kg.bind('pr', 'http://www.wikidata.org/prop/reference/')
    doc.kg.bind('prv', 'http://www.wikidata.org/prop/reference/value/')
    doc.kg.bind('prn', 'http://www.wikidata.org/prop/reference/value-normalized/')
    doc.kg.bind('ps', 'http://www.wikidata.org/prop/statement/')
    doc.kg.bind('psv', 'http://www.wikidata.org/prop/statement/value/')
    doc.kg.bind('psn', 'http://www.wikidata.org/prop/statement/value-normalized/')
    doc.kg.bind('pq', 'http://www.wikidata.org/prop/qualifier/')
    doc.kg.bind('pqv', 'http://www.wikidata.org/prop/qualifier/value/')
    doc.kg.bind('pqn', 'http://www.wikidata.org/prop/qualifier/value-normalized/')
    doc.kg.bind('skos', 'http://www.w3.org/2004/02/skos/core#')
    doc.kg.bind('prov', 'http://www.w3.org/ns/prov#')
    doc.kg.bind('schema', 'http://schema.org/')
    # property_type_cache = {}
    is_error = False
    for i in resolved_excel:
        _item = i["statement"]["item"]
        # When item is None, `item` deliberately keeps its value from the
        # previous row (rows without an item attach to the last seen item).
        if _item is not None:
            item = WDItem(_item, creator='http://www.isi.edu/{}'.format(created_by))
        try:
            property_type = property_type_map[i["statement"]["property"]]
        except KeyError:
            property_type = get_property_type(i["statement"]["property"],
                                              sparql_endpoint)
            if property_type != "Property Not Found" and i["statement"][
                    "property"] not in property_type_map:
                property_type_map[i["statement"]["property"]] = property_type
        if property_type == "WikibaseItem":
            value = Item(str(i["statement"]["value"]))
        elif property_type == "WikibaseProperty":
            value = Property(i["statement"]["value"])
        elif property_type == "String":
            value = StringValue(i["statement"]["value"])
        elif property_type == "Quantity":
            _value = i["statement"]["value"]
            # strip thousands separators before building the quantity
            _value = str(_value).replace(',', '')
            value = QuantityValue(_value)
        elif property_type == "Time":
            value = TimeValue(
                str(i["statement"]["value"]),
                Item(i["statement"]["calendar"]),
                translate_precision_to_integer(i["statement"]["precision"]),
                i["statement"]["time_zone"])
        elif property_type == "Url":
            value = URLValue(i["statement"]["value"])
        elif property_type == "Monolingualtext":
            value = MonolingualText(i["statement"]["value"],
                                    i["statement"]["lang"])
        elif property_type == "ExternalId":
            value = ExternalIdentifier(i["statement"]["value"])
        elif property_type == "GlobeCoordinate":
            value = GlobeCoordinate(i["statement"]["latitude"],
                                    i["statement"]["longitude"],
                                    i["statement"]["precision"])
        elif property_type == "Property Not Found":
            is_error = True
            break
        s = item.add_statement(i["statement"]["property"], value)
        doc.kg.add_subject(item)
        if "qualifier" in i["statement"]:
            for j in i["statement"]["qualifier"]:
                try:
                    property_type = property_type_map[j["property"]]
                except KeyError:
                    property_type = get_property_type(
                        j["property"], sparql_endpoint)
                    # NOTE(review): this caches under the STATEMENT's
                    # property key, not j["property"] — compare the other
                    # generate_triples in this file, which caches under
                    # j["property"]; looks like a copy/paste bug.
                    if property_type != "Property Not Found" and i[
                            "statement"][
                            "property"] not in property_type_map:
                        property_type_map[i["statement"]
                                          ["property"]] = property_type
                if property_type == "WikibaseItem":
                    value = Item(str(j["value"]))
                elif property_type == "WikibaseProperty":
                    value = Property(j["value"])
                elif property_type == "String":
                    value = StringValue(j["value"])
                elif property_type == "Quantity":
                    value = QuantityValue(j["value"])
                elif property_type == "Time":
                    value = TimeValue(str(j["value"]), Item(j["calendar"]),
                                      j["precision"], j["time_zone"])
                elif property_type == "Url":
                    value = URLValue(j["value"])
                elif property_type == "Monolingualtext":
                    value = MonolingualText(j["value"], j["lang"])
                elif property_type == "ExternalId":
                    value = ExternalIdentifier(j["value"])
                elif property_type == "GlobeCoordinate":
                    value = GlobeCoordinate(j["latitude"], j["longitude"],
                                            j["precision"])
                elif property_type == "Property Not Found":
                    is_error = True
                # NOTE(review): when the qualifier's property type is not
                # found, `value` is stale from the previous branch/iteration;
                # the None check below only helps if something set it to None.
                if value is None:
                    continue
                else:
                    s.add_qualifier(j["property"], value)
            doc.kg.add_subject(s)
    if not is_error:
        data = doc.kg.serialize(filetype)
    else:
        # data = "Property Not Found"
        raise Exception('data exception while generating triples')
    return data
def generate_normal_triple(self, node1: str, property: str, node2: str, is_qualifier_edge: bool, e_id: str) -> bool:
    """Generate one normal (non-label) triple from a kgtk edge.

    The object's type is taken from ``self.prop_types[property]`` and
    ``node2`` is parsed per the kgtk value conventions. Updates
    ``self.to_append_statement`` (the current statement) and the document's
    knowledge graph.

    :param node1: subject node id (Qxxx or Pxxx)
    :param property: edge property id (must exist in self.prop_types)
    :param node2: kgtk-encoded object value
    :param is_qualifier_edge: True when this edge qualifies the previous
        statement rather than starting a new one
    :param e_id: edge id, used as the statement id when self.use_id is set
    :return: True on success, False when node2 cannot be parsed
    """
    if self.use_id:
        e_id = TripleGenerator.replace_illegal_string(e_id)
    entity = self._node_2_entity(node1)
    edge_type = self.prop_types[property]
    if edge_type == Item:
        object = WDItem(TripleGenerator.replace_illegal_string(node2))
    elif edge_type == WDProperty:
        object = WDProperty(TripleGenerator.replace_illegal_string(node2),
                            self.prop_types[node2])
    elif edge_type == TimeValue:
        # Legacy date formats: plain yyyy-mm-dd or bare yyyy.
        if self.yyyy_mm_dd_pattern.match(node2):
            try:
                dateTimeString = node2
                object = TimeValue(
                    value=dateTimeString,  # TODO
                    calendar=Item("Q1985727"),
                    precision=Precision.year,
                    time_zone=0,
                )
            except:
                return False
        elif self.yyyy_pattern.match(node2):
            try:
                dateTimeString = node2 + "-01-01"
                object = TimeValue(
                    value=dateTimeString,  # TODO
                    calendar=Item("Q1985727"),
                    precision=Precision.year,
                    time_zone=0,
                )
            except:
                return False
        else:
            try:
                # TODO, in future, the two cases above will be dropped in principle to comply with the iso format
                # now it is iso format, e.g. ^2013-01-01T00:00:00Z/11
                assert (node2[0] == "^")
                node2 = node2[1:]  # remove ^
                if node2.startswith("+"):
                    node2 = node2[1:]
                dateTimeString, precision = node2.split("/")
                dateTimeString = dateTimeString[:-1]  # remove Z
                # NOTE(review): here `precision` is the raw string from the
                # value (the legacy branches pass Precision.year) — confirm
                # TimeValue accepts both forms.
                object = TimeValue(
                    value=dateTimeString,
                    calendar=Item("Q1985727"),
                    precision=precision,
                    time_zone=0,
                )
            except:
                return False
    elif edge_type == GlobeCoordinate:
        # kgtk coordinate form: @lat/long (leading marker stripped)
        latitude, longitude = node2[1:].split("/")
        latitude = float(latitude)
        longitude = float(longitude)
        object = GlobeCoordinate(latitude, longitude, 0.0001,
                                 globe=Item("Q2"))  # earth
    elif edge_type == QuantityValue:
        # +70[+60,+80]Q743895
        res = self.quantity_pattern.match(node2).groups()
        amount, lower_bound, upper_bound, unit = res
        amount = TripleGenerator.clean_number_string(amount)
        num_type = self.xsd_number_type(amount)
        lower_bound = TripleGenerator.clean_number_string(lower_bound)
        upper_bound = TripleGenerator.clean_number_string(upper_bound)
        if unit != None:
            if upper_bound != None and lower_bound != None:
                object = QuantityValue(amount, unit=Item(unit),
                                       upper_bound=upper_bound,
                                       lower_bound=lower_bound,
                                       type=num_type)
            else:
                object = QuantityValue(amount, unit=Item(unit), type=num_type)
        else:
            if upper_bound != None and lower_bound != None:
                object = QuantityValue(amount, upper_bound=upper_bound,
                                       lower_bound=lower_bound, type=num_type)
            else:
                object = QuantityValue(amount, type=num_type)
    elif edge_type == MonolingualText:
        text_string, lang = TripleGenerator.process_text_string(node2)
        object = MonolingualText(text_string, lang)
    elif edge_type == ExternalIdentifier:
        object = ExternalIdentifier(node2)
    elif edge_type == URLValue:
        if TripleGenerator.is_valid_uri_with_scheme_and_host(node2):
            object = URLValue(node2)
        else:
            return False
    else:
        # treat everything else as stringValue
        object = StringValue(node2)
    # items/properties used as objects must also exist as subjects in the kg
    if type(object) == WDItem or type(object) == WDProperty:
        self.doc.kg.add_subject(object)
    if is_qualifier_edge:
        # edge: e8 p9 ^2013-01-01T00:00:00Z/11
        # create qualifier edge on previous STATEMENT and return the updated STATEMENT
        self.to_append_statement.add_qualifier(property, object)
        self.doc.kg.add_subject(self.to_append_statement)
    else:
        # edge: q1 p8 q2 e8
        # create brand new property edge and replace STATEMENT
        if self.truthy:
            self.to_append_statement = entity.add_truthy_statement(
                property, object, statement_id=e_id
            ) if self.use_id else entity.add_truthy_statement(
                property, object)
        else:
            self.to_append_statement = entity.add_statement(
                property, object, statement_id=e_id
            ) if self.use_id else entity.add_statement(property, object)
        self.doc.kg.add_subject(entity)
    return True
def generate_triples(user_id: str, resolved_excel: list, sparql_endpoint: str, filetype: str = 'ttl') -> str:
    """
    This function uses ETK to generate the RDF triples

    :param user_id: id of the requesting user (only used in the commented-out
        file-writing code below)
    :param resolved_excel: list of resolved statement dicts, each with a
        "statement" entry holding item/property/value (+ optional qualifier list)
    :param sparql_endpoint: endpoint queried for property types missing
        from the property_type_map cache
    :param filetype: serialization format passed to doc.kg.serialize
    :return: the serialized knowledge graph, or the literal string
        "Property Not Found" when a property's type cannot be resolved
    """
    # initialize
    kg_schema = KGSchema()
    kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
    etk = ETK(kg_schema=kg_schema, modules=ETKModule)
    doc = etk.create_document({}, doc_id="http://isi.edu/default-ns/projects")
    # bind prefixes
    doc.kg.bind('wikibase', 'http://wikiba.se/ontology#')
    doc.kg.bind('wd', 'http://www.wikidata.org/entity/')
    doc.kg.bind('wdt', 'http://www.wikidata.org/prop/direct/')
    doc.kg.bind('wdtn', 'http://www.wikidata.org/prop/direct-normalized/')
    doc.kg.bind('wdno', 'http://www.wikidata.org/prop/novalue/')
    doc.kg.bind('wds', 'http://www.wikidata.org/entity/statement/')
    doc.kg.bind('wdv', 'http://www.wikidata.org/value/')
    doc.kg.bind('wdref', 'http://www.wikidata.org/reference/')
    doc.kg.bind('p', 'http://www.wikidata.org/prop/')
    doc.kg.bind('pr', 'http://www.wikidata.org/prop/reference/')
    doc.kg.bind('prv', 'http://www.wikidata.org/prop/reference/value/')
    doc.kg.bind('prn', 'http://www.wikidata.org/prop/reference/value-normalized/')
    doc.kg.bind('ps', 'http://www.wikidata.org/prop/statement/')
    doc.kg.bind('psv', 'http://www.wikidata.org/prop/statement/value/')
    doc.kg.bind('psn', 'http://www.wikidata.org/prop/statement/value-normalized/')
    doc.kg.bind('pq', 'http://www.wikidata.org/prop/qualifier/')
    doc.kg.bind('pqv', 'http://www.wikidata.org/prop/qualifier/value/')
    doc.kg.bind('pqn', 'http://www.wikidata.org/prop/qualifier/value-normalized/')
    doc.kg.bind('skos', 'http://www.w3.org/2004/02/skos/core#')
    doc.kg.bind('prov', 'http://www.w3.org/ns/prov#')
    doc.kg.bind('schema', 'http://schema.org/')
    # property_type_cache = {}
    is_error = False
    for i in resolved_excel:
        item = WDItem(i["statement"]["item"],
                      creator='http://www.isi.edu/t2wml')
        try:
            property_type = property_type_map[i["statement"]["property"]]
        except KeyError:
            property_type = get_property_type(i["statement"]["property"],
                                              sparql_endpoint)
            property_type_map[i["statement"]["property"]] = property_type
        if property_type == "WikibaseItem":
            value = Item(str(i["statement"]["value"]))
        elif property_type == "WikibaseProperty":
            value = Property(i["statement"]["value"])
        elif property_type == "String":
            value = StringValue(i["statement"]["value"])
        elif property_type == "Quantity":
            value = QuantityValue(i["statement"]["value"])
        elif property_type == "Time":
            value = TimeValue(str(i["statement"]["value"]),
                              Item(i["statement"]["calendar"]),
                              translate_precision_to_integer(i["statement"]["precision"]),
                              i["statement"]["time_zone"])
        elif property_type == "Url":
            value = URLValue(i["statement"]["value"])
        elif property_type == "Monolingualtext":
            value = MonolingualText(i["statement"]["value"],
                                    i["statement"]["lang"])
        elif property_type == "ExternalId":
            value = ExternalIdentifier(i["statement"]["value"])
        elif property_type == "GlobeCoordinate":
            value = GlobeCoordinate(i["statement"]["latitude"],
                                    i["statement"]["longitude"],
                                    i["statement"]["precision"])
        elif property_type == "Property Not Found":
            is_error = True
            break
        s = item.add_statement(i["statement"]["property"], value)
        doc.kg.add_subject(item)
        if "qualifier" in i["statement"]:
            for j in i["statement"]["qualifier"]:
                try:
                    property_type = property_type_map[j["property"]]
                except KeyError:
                    property_type = get_property_type(j["property"],
                                                      sparql_endpoint)
                    property_type_map[j["property"]] = property_type
                if property_type == "WikibaseItem":
                    value = Item(str(j["value"]))
                elif property_type == "WikibaseProperty":
                    value = Property(j["value"])
                elif property_type == "String":
                    value = StringValue(j["value"])
                elif property_type == "Quantity":
                    value = QuantityValue(j["value"])
                elif property_type == "Time":
                    value = TimeValue(str(j["value"]), Item(j["calendar"]),
                                      j["precision"], j["time_zone"])
                elif property_type == "Url":
                    value = URLValue(j["value"])
                elif property_type == "Monolingualtext":
                    value = MonolingualText(j["value"], j["lang"])
                elif property_type == "ExternalId":
                    value = ExternalIdentifier(j["value"])
                elif property_type == "GlobeCoordinate":
                    value = GlobeCoordinate(j["latitude"], j["longitude"],
                                            j["precision"])
                elif property_type == "Property Not Found":
                    is_error = True
                # NOTE(review): when the qualifier's property type is not
                # found, `value` is stale from the previous branch/iteration
                # but is still added below — confirm intended.
                s.add_qualifier(j["property"], value)
            doc.kg.add_subject(s)
    if not is_error:
        data = doc.kg.serialize(filetype)
    else:
        data = "Property Not Found"
    # os.makedirs(Path.cwd() / "new_properties", exist_ok=True)
    # results_file_name = user_id + "_results.ttl"
    # changes_file_name = user_id + "_changes.tsv"
    # with open(Path(app.config['downloads']) / results_file_name, "w") as fp:
    #     fp.write(data)
    # with open(Path(app.config['downloads']) / changes_file_name, "w") as fp:
    #     serialize_change_record(fp)
    return data
def upload(self):
    """
    Upload the dataset: reset the Z00000 counter node, serialize the
    document's kg as turtle, POST it to the update server, then push the
    truthy statements recorded in the change log.

    :raises ValueError: when the query service is unreachable or the
        upload returns a non-200 status
    """
    # This special Q node is used to store the next count to store the new Q node
    sparql_query = """
        prefix wdt: <http://www.wikidata.org/prop/direct/>
        prefix wdtn: <http://www.wikidata.org/prop/direct-normalized/>
        prefix wdno: <http://www.wikidata.org/prop/novalue/>
        prefix wds: <http://www.wikidata.org/entity/statement/>
        prefix wdv: <http://www.wikidata.org/value/>
        prefix wdref: <http://www.wikidata.org/reference/>
        prefix wd: <http://www.wikidata.org/entity/>
        prefix wikibase: <http://wikiba.se/ontology#>
        prefix p: <http://www.wikidata.org/prop/>
        prefix pqv: <http://www.wikidata.org/prop/qualifier/value/>
        prefix pq: <http://www.wikidata.org/prop/qualifier/>
        prefix ps: <http://www.wikidata.org/prop/statement/>
        prefix psn: <http://www.wikidata.org/prop/statement/value-normalized/>
        prefix prv: <http://www.wikidata.org/prop/reference/value/>
        prefix psv: <http://www.wikidata.org/prop/statement/value/>
        prefix prn: <http://www.wikidata.org/prop/reference/value-normalized/>
        prefix pr: <http://www.wikidata.org/prop/reference/>
        prefix pqn: <http://www.wikidata.org/prop/qualifier/value-normalized/>
        prefix skos: <http://www.w3.org/2004/02/skos/core#>
        prefix prov: <http://www.w3.org/ns/prov#>
        prefix schema: <http://schema.org/>
        prefix bd: <http://www.bigdata.com/rdf#>
        prefix bds: <http://www.bigdata.com/rdf/search#>

        delete {
            wd:Z00000 wdt:P1114 ?x .
        }
        where {
            wd:Z00000 wdt:P1114 ?x .
        }
        """
    # (fixed: the schema prefix IRI previously contained a stray apostrophe,
    # <http://schema.org/'>, making the prefix declaration malformed)
    try:
        sparql = SPARQLWrapper(self.update_server)
        sparql.setQuery(sparql_query)
        sparql.setReturnFormat(JSON)
        sparql.setMethod(POST)
        sparql.setRequestMethod(URLENCODED)
        sparql.setCredentials(config.user, config.password)
        results = sparql.query()  # .convert()['results']['bindings']
    except Exception:
        # narrowed from a bare except so KeyboardInterrupt/SystemExit escape
        print("Updating the count for datamart failed!")
        raise ValueError("Unable to connect to datamart query service")
    # add datamart count to ttl
    q = WDItem('Z00000')
    q.add_label('Datamart datasets count', lang='en')
    q.add_statement('P1114', QuantityValue(self.resource_id))  # title
    self.doc.kg.add_subject(q)
    # upload
    extracted_data = self.doc.kg.serialize("ttl")
    headers = {
        'Content-Type': 'application/x-turtle',
    }
    response = requests.post(self.update_server,
                             data=extracted_data.encode('utf-8'),
                             headers=headers,
                             auth=HTTPBasicAuth(config.user, config.password))
    print('Upload file finished with status code: {}!'.format(
        response.status_code))
    if response.status_code != 200:
        raise ValueError("Uploading file failed")
    else:
        # upload truthy: replay the change record (tab-separated
        # node/property pairs) against the truthy updater
        temp_output = StringIO()
        serialize_change_record(temp_output)
        temp_output.seek(0)
        tu = TruthyUpdater(self.update_server, False, config.user, config.password)
        np_list = []
        for l in temp_output.readlines():
            if not l:
                continue
            node, prop = l.strip().split('\t')
            np_list.append((node, prop))
        tu.build_truthy(np_list)
        print('Update truthy finished!')
def model_data(self, input_df: pd.DataFrame, metadata: dict):
    """Model a dataframe as a new D-node dataset item.

    Creates the dataset item with its url/file-type/title/keyword
    statements, then models every string column via process_one_column.

    :param input_df: the dataframe whose columns are modelled
    :param metadata: optional dict with title/keywords/file_type/url and a
        per-column 'variables' entry; may be None
    """
    if metadata is None:
        metadata = {}
    title = metadata.get("title") or ""
    keywords = metadata.get("keywords") or ""
    file_type = metadata.get("file_type") or ""
    # TODO: if no url given?
    url = metadata.get("url") or "https://"
    if isinstance(keywords, list):
        keywords = " ".join(keywords)

    node_id = 'D' + str(self.resource_id)
    q = WDItem(node_id)
    self.resource_id += 1
    q.add_label(node_id, lang='en')
    q.add_statement(
        'P31', Item('Q1172284'))  # indicate it is subclass of a dataset
    q.add_statement('P2699', URLValue(url))  # url
    q.add_statement('P2701', StringValue(file_type))  # file type
    q.add_statement('P1476', MonolingualText(title, lang='en'))  # title
    q.add_statement('C2001', StringValue(node_id))  # datamart identifier
    q.add_statement('C2004', StringValue(keywords))  # keywords

    # each columns
    for i in self.columns_are_string:
        try:
            semantic_type = metadata['variables'][i]['semantic_type']
        except (KeyError, IndexError):
            # KeyError was previously uncaught: metadata without a
            # 'variables' entry (e.g. the {} default) crashed here instead
            # of falling back to Text.
            semantic_type = 'http://schema.org/Text'
        res = self.process_one_column(column_data=input_df.iloc[:, i],
                                      item=q, column_number=i,
                                      semantic_type=semantic_type)
        if not res:
            print("Error when adding column " + str(i))
    self.doc.kg.add_subject(q)
def model_data(properties_file_path, output_file_path) -> None:
    """
    This function generates triples for user defined properties for uploading them to wikidata

    :param properties_file_path: YAML file declaring the new Q/P nodes
    :param output_file_path: path the serialized ttl output is written to
    :return:
    """
    # stream = open(Path.cwd().parent / "Datasets/new-property-configuration.yaml", 'r', encoding='utf8')
    # Context manager so the handle is closed even if parsing fails
    # (the original left the stream open).
    with open(properties_file_path, 'r', encoding='utf8') as stream:
        yaml_data = yaml.safe_load(stream)
    # initialize
    kg_schema = KGSchema()
    kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
    etk = ETK(kg_schema=kg_schema, modules=ETKModule)
    doc = etk.create_document({}, doc_id="http://isi.edu/default-ns/projects")
    # bind prefixes
    doc.kg.bind('wikibase', 'http://wikiba.se/ontology#')
    doc.kg.bind('wd', 'http://www.wikidata.org/entity/')
    doc.kg.bind('wdt', 'http://www.wikidata.org/prop/direct/')
    doc.kg.bind('wdtn', 'http://www.wikidata.org/prop/direct-normalized/')
    doc.kg.bind('wdno', 'http://www.wikidata.org/prop/novalue/')
    doc.kg.bind('wds', 'http://www.wikidata.org/entity/statement/')
    doc.kg.bind('wdv', 'http://www.wikidata.org/value/')
    doc.kg.bind('wdref', 'http://www.wikidata.org/reference/')
    doc.kg.bind('p', 'http://www.wikidata.org/prop/')
    doc.kg.bind('pr', 'http://www.wikidata.org/prop/reference/')
    doc.kg.bind('prv', 'http://www.wikidata.org/prop/reference/value/')
    doc.kg.bind('prn', 'http://www.wikidata.org/prop/reference/value-normalized/')
    doc.kg.bind('ps', 'http://www.wikidata.org/prop/statement/')
    doc.kg.bind('psv', 'http://www.wikidata.org/prop/statement/value/')
    doc.kg.bind('psn', 'http://www.wikidata.org/prop/statement/value-normalized/')
    doc.kg.bind('pq', 'http://www.wikidata.org/prop/qualifier/')
    doc.kg.bind('pqv', 'http://www.wikidata.org/prop/qualifier/value/')
    doc.kg.bind('pqn', 'http://www.wikidata.org/prop/qualifier/value-normalized/')
    doc.kg.bind('skos', 'http://www.w3.org/2004/02/skos/core#')
    doc.kg.bind('prov', 'http://www.w3.org/ns/prov#')
    doc.kg.bind('schema', 'http://schema.org/')

    # sparql_endpoint = "https://query.wikidata.org/sparql"
    sparql_endpoint = "http://dsbox02.isi.edu:8899/bigdata/namespace/wdq/sparql"
    type_map = {'quantity': Datatype.QuantityValue, 'url': URLValue}
    property_type_cache = {}
    for k, v in yaml_data.items():
        if k.startswith('Q'):
            p = WDItem(k, creator='http://www.isi.edu/t2wml')
        elif k.startswith('P'):
            p = WDProperty(k, type_map[v['type']],
                           creator='http://www.isi.edu/t2wml')
        else:
            # Keys that are neither Q nor P nodes previously left `p`
            # unbound (NameError) or silently reused the previous one.
            continue
        for lang, value in v['label'].items():
            if not isinstance(value, list):
                value = [value]
            for val in value:
                p.add_label(val, lang=lang)
        for lang, value in v['description'].items():
            if not isinstance(value, list):
                value = [value]
            for val in value:
                p.add_description(val, lang=lang)
        for pnode, items in v['statements'].items():
            if not isinstance(items, list):
                items = [items]
            for item in items:
                # query each property's type at most once
                try:
                    property_type = property_type_cache[pnode]
                except KeyError:
                    property_type = get_property_type(pnode, sparql_endpoint)
                    property_type_cache[pnode] = property_type
                if property_type == "WikibaseItem":
                    values = item['value']
                    if not isinstance(values, list):
                        values = [values]
                    value = [Item(qnode) for qnode in values if qnode is not None]
                elif property_type == "WikibaseProperty":
                    value = Property(item['value'])
                elif property_type == "String":
                    value = StringValue(item['value'])
                elif property_type == "Quantity":
                    values = item['value']
                    if not isinstance(values, list):
                        values = [values]
                    value = [QuantityValue(qty) for qty in values]
                elif property_type == "Time":
                    value = TimeValue(
                        str(item['value']), Item(item["calendar"]),
                        translate_precision_to_integer(item["precision"]),
                        item["time_zone"])
                elif property_type == "Url":
                    value = URLValue(item['value'])
                elif property_type == "Monolingualtext":
                    value = MonolingualText(item['value'], item["lang"])
                elif property_type == "ExternalId":
                    value = ExternalIdentifier(item['value'])
                elif property_type == "GlobeCoordinate":
                    value = GlobeCoordinate(item["latitude"],
                                            item["longitude"],
                                            item["precision"])
                else:
                    # unknown / not-found property type: skip rather than
                    # reusing a stale `value` from a previous iteration
                    continue
                # Normalize scalar branches to a list: the loop below
                # otherwise tried to iterate a single value object — a
                # latent crash in the original for non-list types.
                if not isinstance(value, list):
                    value = [value]
                for val in value:
                    p.add_statement(pnode, val)
        doc.kg.add_subject(p)

    # with open(Path.cwd().parent / "new_properties/result.ttl", "w") as f:
    with open(output_file_path, "w") as f:
        data = doc.kg.serialize('ttl')
        f.write(data)
def genNormalTriple(self, node1: str, label: str, node2: str, isPropEdge: bool) -> bool: """ The normal triple's type is determined by 1. label's datatype in prop_types.tsv 2. kgtk format convention of node2 field Update the self.STATEMENT """ # determine the node type [property|item] if node1 in self.propTypes: entity = WDProperty(node1.upper(), self.propTypes[node1]) else: entity = WDItem(node1.upper()) # determine the edge type edgeType = self.propTypes[label] if edgeType == Item: OBJECT = Item(node2.upper()) elif edgeType == TimeValue: # https://www.wikidata.org/wiki/Help:Dates # ^201301-01T00:00:00Z/11 dateTimeString, precision = node2[1:].split("/") dateString, timeString = dateTimeString.split("T") OBJECT = TimeValue( value=dateString, calendar=Item("Q1985727"), precision=precision, time_zone=0, ) elif edgeType == GlobeCoordinate: latitude, longitude = node2[1:].split("/") OBJECT = GlobeCoordinate(latitude, longitude, 0.0001, globe=StringValue("Earth")) elif edgeType == QuantityValue: amount, unit = (re.compile("([\+|\-]?[0-9]+\.?[0-9]*)U([0-9]+)" ).match(node2).groups()) OBJECT = QuantityValue(amount=float(amount), unit=Item(unit)) elif edgeType == MonolingualText: try: textString, lang = node2.split("@") OBJECT = MonolingualText(textString, lang) except: OBJECT = MonolingualText(textString, "en") else: # treat everything else as stringValue OBJECT = StringValue(node2) if isPropEdge: # edge: q1 p8 q2 e8 # create brand new property edge and replace STATEMENT self.STATEMENT = entity.add_statement(label.upper(), OBJECT) else: # edge: e8 p9 ^2013-01-01T00:00:00Z/11 # create qualifier edge on previous STATEMENT and return the updated STATEMENT self.STATEMENT.add_qualifier(label.upper(), OBJECT) self.doc.kg.add_subject(self.STATEMENT) return True