示例#1
0
文件: generator.py 项目: yyht/kgtk
 def _node_2_entity(self, node: str):
     """
     Build the entity object that represents ``node``.

     A node registered in ``self.prop_types`` becomes a ``WDProperty``
     carrying the registered type. Any other node becomes a ``WDItem`` whose
     identifier is first passed through
     ``TripleGenerator.replace_illegal_string``.
     """
     # Guard clause: anything that is not a known property is an item.
     if node not in self.prop_types:
         return WDItem(TripleGenerator.replace_illegal_string(node))
     return WDProperty(node, self.prop_types[node])
示例#2
0
 def genLabelTriple(self, node1: str, label: str, node2: str) -> bool:
     """
     Attach a label to the entity for ``node1`` and add it to the graph.

     ``node2`` holds the label text, optionally followed by ``@<lang>``.
     When no language suffix is present the label is stored as English.
     Single and double quote characters are removed from the label text.
     The ``label`` argument is not used by this method.

     :param node1: identifier of the entity; upper-cased before use
     :param label: unused
     :param node2: label text, optionally suffixed with ``@<lang>``
     :return: always ``True``
     """
     if node1 in self.propTypes:
         entity = WDProperty(node1.upper(), self.propTypes[node1])
     else:
         entity = WDItem(node1.upper())

     lang = "en"  # default
     if "@" in node2:
         # Split on the LAST "@" only: label text may itself contain "@"
         # (e.g. an e-mail address), which made a plain split() raise
         # ValueError on unpacking.
         node2, lang = node2.rsplit("@", 1)
     entity.add_label(node2.replace('"', "").replace("'", ""), lang=lang)
     self.doc.kg.add_subject(entity)
     return True
示例#3
0
    def process_one_column(self, column_data: pd.Series, item: WDItem,
                           column_number: int,
                           semantic_type: typing.List[str]) -> bool:
        """
        Describe one column of a dataset as a statement on ``item``.

        A ``C2005`` statement holding the column name is added, qualified
        with the column's unique words (``C2006``), its data type
        (``C2007``), its semantic type URL (``C2008``) and its column index
        (``P1545``).

        :param column_data: a pandas series data
        :param item: the target q node aimed to add on
        :param column_number: the column number
        :param semantic_type: a list indicate the semantic type of this column
        :return: ``True`` on success; ``False`` when none of the supported
            semantic types (Float, Integer, Text) is present or when any
            step raises. In both failure cases the ``C2005`` statement and
            its ``C2006`` qualifier may already have been added.
        """
        try:
            # Lower-case every distinct cell value, strip punctuation, split
            # on whitespace and keep the unique words.
            all_value_str_set = {
                word
                for each in set(column_data.tolist())
                for word in str(each).lower().translate(
                    self.punctuation_table).split()
            }
            all_value_str = " ".join(all_value_str_set)

            statement = item.add_statement(
                'C2005', StringValue(column_data.name))  # variable measured
            statement.add_qualifier('C2006',
                                    StringValue(all_value_str))  # values
            if 'http://schema.org/Float' in semantic_type:
                semantic_type_url = 'http://schema.org/Float'
                data_type = "float"
            elif 'http://schema.org/Integer' in semantic_type:
                data_type = "int"
                semantic_type_url = 'http://schema.org/Integer'
            elif 'http://schema.org/Text' in semantic_type:
                data_type = "string"
                semantic_type_url = 'http://schema.org/Text'
            else:
                # Unsupported semantic type: report failure explicitly
                # instead of relying on an unbound-variable error below.
                return False

            statement.add_qualifier('C2007',
                                    Item(data_type))  # data structure type
            statement.add_qualifier(
                'C2008',
                URLValue(semantic_type_url))  # semantic type identifier
            statement.add_qualifier(
                'P1545', QuantityValue(column_number))  # column index
            return True
        except Exception:
            # Best-effort: any processing error is reported as a failure.
            # KeyboardInterrupt / SystemExit are deliberately not swallowed.
            return False
示例#4
0
    def model_schema(self):
        """
        Build an ETK document from the schema file ``self.data['schema']``.

        The ``prefix`` entries of the file are bound as custom prefixes.
        Every other top-level key containing ``:`` is split into a namespace
        and an identifier and modelled as a ``WDItem`` (identifier contains
        ``Q``) or a ``WDProperty`` (identifier contains ``P``), with its
        descriptions, labels and statements attached.

        Statements whose property type is not one of WikibaseItem,
        WikibaseProperty, String, Quantity, Url or Monolingualtext are
        skipped.

        :return: the populated ETK document
        :raises Exception: when an identifier contains neither ``Q`` nor ``P``
        """
        # read data
        data = self.read_data(self.data['schema'])

        # initialize KGSchema
        custom_dict, ns_dict = {}, {'wd': 'http://www.wikidata.org/entity/'}
        for each in data['prefix']:
            for prefix, uri in each.items():
                custom_dict[prefix] = uri
                if prefix != 'wd':
                    ns_dict[prefix] = uri + '/entity'
        kg_schema = KGSchema()
        kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
        etk = ETK(kg_schema=kg_schema, modules=ETKModule)
        doc = etk.create_document({},
                                  doc_id='http://isi.edu/default-ns/projects')

        # bind prefix
        doc = create_custom_prefix(doc, custom_dict)

        type_map = {
            'quantity': Datatype.QuantityValue,
            'url': URLValue,
            'item': Datatype.Item,
            'time': Datatype.TimeValue,
            'string': Datatype.StringValue,
            'text': Datatype.MonolingualText
        }

        # model schema
        for key, spec in data.items():
            if ':' not in key:
                continue
            parts = key.split(':')
            if 'Q' in parts[1]:
                p = WDItem(parts[1], namespace=parts[0], creator=':datamart')
            elif 'P' in parts[1]:
                p = WDProperty(parts[1],
                               type_map[spec['type']],
                               namespace=parts[0],
                               creator=':datamart')
            else:
                raise Exception('There is no P/Q information.')

            for lang, value in spec['description'].items():
                for val in value:
                    p.add_description(val, lang=lang)

            for lang, value in spec['label'].items():
                for val in value:
                    p.add_label(val, lang=lang)

            for node, value in spec['statements'].items():
                ns = node.split(':')[0] if ':' in node else 'wd'
                for val in value:
                    prop_type = self.get_property_type(node, ns_dict[ns])
                    if prop_type == 'WikibaseItem':
                        statement_value = Item(str(val['value']))
                    elif prop_type == 'WikibaseProperty':
                        statement_value = Property(val['value'])
                    elif prop_type == 'String':
                        statement_value = StringValue(val['value'])
                    elif prop_type == 'Quantity':
                        statement_value = QuantityValue(val['value'])
                    elif prop_type == 'Url':
                        statement_value = URLValue(val['value'])
                    elif prop_type == 'Monolingualtext':
                        statement_value = MonolingualText(
                            val['value'], val['lang'])
                    else:
                        # Unsupported property type: skip it rather than
                        # add a stale value from an earlier statement (or
                        # the raw spec dict) as this statement's value.
                        continue
                    p.add_statement(node, statement_value)
            doc.kg.add_subject(p)

        return doc
示例#5
0
    def model_statement(self):
        """
        Build an ETK document holding the wikifier, mapping-file and dataset
        items described by ``self.data['inputs']``.

        Every entry except ``metadata`` is inspected. An item is created only
        for the ``wikifier``, ``mappingFile`` and ``dataset`` entries whose
        ``existed`` flag is falsy; entries that produce no item add nothing
        to the document.

        :return: the populated ETK document
        """
        # initialize KGSchema
        kg_schema = KGSchema()
        kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
        etk = ETK(kg_schema=kg_schema, modules=ETKModule)
        doc = etk.create_document({},
                                  doc_id='http://isi.edu/default-ns/projects')

        # bind prefix
        doc = create_custom_prefix(doc, custom_dict={self.ns: self.uri})

        # extract files
        self.extract_files()

        # model statement
        inputs = self.data['inputs']
        for k, v in inputs.items():
            if k == 'metadata':
                continue

            # Reset per entry: an entry that builds nothing must not raise
            # NameError or re-register the item of a previous iteration.
            q = None

            # construct wikifier instance
            if k == 'wikifier' and not v['existed']:
                q = WDItem(v['qnode'],
                           namespace=self.ns,
                           creator=':datamart')
                q.add_label('A wikifier file for ' +
                            inputs['dataset']['content']['filename'],
                            lang='en')
                q.add_statement('P31', Item(
                    'SDQ1001',
                    namespace=self.ns))  # an instance of Wikifier
                q.add_statement('P127',
                                Item('SDQ1003',
                                     namespace=self.ns))  # belongs to
                q.add_statement('SDP3003',
                                StringValue(v['content']),
                                namespace=self.ns)  # hasFileContent
                q.add_statement('SDP3004',
                                StringValue(v['hashcode']),
                                namespace=self.ns)  # hashValue

            # construct mapping_file instance
            elif k == 'mappingFile' and not v['existed']:
                q = WDItem(v['qnode'],
                           namespace=self.ns,
                           creator=':datamart')
                q.add_label('A mapping file for ' +
                            inputs['dataset']['content']['filename'],
                            lang='en')
                q.add_statement('P31', Item(
                    'SDQ1002',
                    namespace=self.ns))  # an instance of MappingFile
                q.add_statement('P170', StringValue('T2WML'))
                q.add_statement('P127', Item('SDQ1003', namespace=self.ns))
                q.add_statement('SDP3003',
                                StringValue(json.dumps(v['content'])),
                                namespace=self.ns)
                q.add_statement('SDP3004',
                                StringValue(v['hashcode']),
                                namespace=self.ns)

            # construct dataset instance
            elif k == 'dataset' and not v['existed']:
                q = WDItem(v['qnode'],
                           namespace=self.ns,
                           creator=':datamart')
                q.add_label(v['content']['title'], lang='en')
                q.add_description(v['content']['description'], lang='en')
                q.add_statement('P31',
                                Item('Q1172284'))  # an instance of Dataset
                q.add_statement('SDP3001',
                                Item(inputs['wikifier']['qnode'],
                                     namespace=self.ns),
                                namespace=self.ns)  # a wikifier file
                q.add_statement('SDP3002',
                                Item(inputs['mappingFile']['qnode'],
                                     namespace=self.ns),
                                namespace=self.ns)  # a mapping file
                q.add_statement('P1476', StringValue(
                    v['content']['title']))  # title
                q.add_statement(
                    'P921',
                    StringValue(v['content']['description']))  # described
                q.add_statement('P127',
                                Item('SDQ1003',
                                     namespace=self.ns))  # belongs to
                q.add_statement('SDP2004',
                                StringValue(', '.join(
                                    v['content']['keywords'])),
                                namespace=self.ns)  # keywords
                q.add_statement('SDP3004',
                                StringValue(v['hashcode']),
                                namespace=self.ns)

                if self.data['storeColumnValue']:
                    for data in v['content']['variable_measured']:
                        statement = q.add_statement(
                            'SDP2005',
                            StringValue(data['column_name']),
                            namespace=self.ns)  # variable measured
                        statement.add_qualifier(
                            'SDP2006',
                            StringValue(data['values_of_a_column']),
                            namespace=self.ns)  # the values of a column
                        statement.add_qualifier(
                            'SDP2007',
                            Item(data['data_structure_type']),
                            namespace=self.ns)  # data structure type
                        statement.add_qualifier(
                            'SDP2008',
                            URLValue(data['semantic_type_identifier']),
                            namespace=self.ns)  # semantic type
                        # NOTE(review): here ``namespace`` is passed to
                        # QuantityValue rather than to add_qualifier as in
                        # the qualifiers above - confirm this is intended.
                        statement.add_qualifier(
                            'P1545',
                            QuantityValue(
                                data['column_index'],
                                namespace=self.ns))  # column index

            if q is not None:
                doc.kg.add_subject(q)

        return doc
示例#6
0
def generate_triples(user_id: str,
                     resolved_excel: list,
                     sparql_endpoint: str,
                     filetype: str = 'ttl',
                     created_by: str = 't2wml') -> str:
    """
    This function uses ETK to generate the RDF triples.

    Each entry of ``resolved_excel`` whose ``statement.item`` is not ``None``
    becomes one statement on a ``WDItem``, plus one qualifier per entry of
    its optional ``qualifier`` list. Property types are looked up in the
    shared property-type map first and fetched from ``sparql_endpoint`` on a
    miss; successfully resolved types are cached in that map.

    Statements and qualifiers whose property type is not handled here are
    skipped.

    :param user_id: unused by this function
    :param resolved_excel: list of dicts, each with a ``statement`` entry
    :param sparql_endpoint: endpoint used to resolve unknown property types
    :param filetype: serialization format passed to ``doc.kg.serialize``
    :param created_by: suffix of the creator URL recorded on each item
    :return: the serialized knowledge graph
    :raises Exception: when any statement or qualifier property cannot be
        found
    """
    # initialize
    kg_schema = KGSchema()
    kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
    etk = ETK(kg_schema=kg_schema, modules=ETKModule)
    doc = etk.create_document({}, doc_id="http://isi.edu/default-ns/projects")
    property_type_map = property_type_dict

    # bind prefixes
    doc.kg.bind('wikibase', 'http://wikiba.se/ontology#')
    doc.kg.bind('wd', 'http://www.wikidata.org/entity/')
    doc.kg.bind('wdt', 'http://www.wikidata.org/prop/direct/')
    doc.kg.bind('wdtn', 'http://www.wikidata.org/prop/direct-normalized/')
    doc.kg.bind('wdno', 'http://www.wikidata.org/prop/novalue/')
    doc.kg.bind('wds', 'http://www.wikidata.org/entity/statement/')
    doc.kg.bind('wdv', 'http://www.wikidata.org/value/')
    doc.kg.bind('wdref', 'http://www.wikidata.org/reference/')
    doc.kg.bind('p', 'http://www.wikidata.org/prop/')
    doc.kg.bind('pr', 'http://www.wikidata.org/prop/reference/')
    doc.kg.bind('prv', 'http://www.wikidata.org/prop/reference/value/')
    doc.kg.bind('prn',
                'http://www.wikidata.org/prop/reference/value-normalized/')
    doc.kg.bind('ps', 'http://www.wikidata.org/prop/statement/')
    doc.kg.bind('psv', 'http://www.wikidata.org/prop/statement/value/')
    doc.kg.bind('psn',
                'http://www.wikidata.org/prop/statement/value-normalized/')
    doc.kg.bind('pq', 'http://www.wikidata.org/prop/qualifier/')
    doc.kg.bind('pqv', 'http://www.wikidata.org/prop/qualifier/value/')
    doc.kg.bind('pqn',
                'http://www.wikidata.org/prop/qualifier/value-normalized/')
    doc.kg.bind('skos', 'http://www.w3.org/2004/02/skos/core#')
    doc.kg.bind('prov', 'http://www.w3.org/ns/prov#')
    doc.kg.bind('schema', 'http://schema.org/')

    is_error = False
    for i in resolved_excel:
        stmt = i["statement"]
        _item = stmt["item"]
        if _item is None:
            # Nothing to attach a statement to. Skipping here also keeps
            # the add_subject() call below from running on a statement
            # left over from a previous row.
            continue

        item = WDItem(_item,
                      creator='http://www.isi.edu/{}'.format(created_by))
        try:
            property_type = property_type_map[stmt["property"]]
        except KeyError:
            property_type = get_property_type(stmt["property"],
                                              sparql_endpoint)
            if property_type != "Property Not Found":
                property_type_map[stmt["property"]] = property_type

        # Reset per row so an unsupported type can never reuse the value
        # built for an earlier row.
        value = None
        if property_type == "WikibaseItem":
            value = Item(str(stmt["value"]))
        elif property_type == "WikibaseProperty":
            value = Property(stmt["value"])
        elif property_type == "String":
            value = StringValue(stmt["value"])
        elif property_type == "Quantity":
            # drop thousands separators before building the quantity
            _value = str(stmt["value"]).replace(',', '')
            value = QuantityValue(_value)
        elif property_type == "Time":
            value = TimeValue(
                str(stmt["value"]),
                Item(stmt["calendar"]),
                translate_precision_to_integer(stmt["precision"]),
                stmt["time_zone"])
        elif property_type == "Url":
            value = URLValue(stmt["value"])
        elif property_type == "Monolingualtext":
            value = MonolingualText(stmt["value"], stmt["lang"])
        elif property_type == "ExternalId":
            value = ExternalIdentifier(stmt["value"])
        elif property_type == "GlobeCoordinate":
            value = GlobeCoordinate(stmt["latitude"],
                                    stmt["longitude"],
                                    stmt["precision"])
        elif property_type == "Property Not Found":
            is_error = True
            break
        if value is None:
            # unsupported property type: skip this statement entirely
            continue

        s = item.add_statement(stmt["property"], value)
        doc.kg.add_subject(item)

        if "qualifier" in stmt:
            for j in stmt["qualifier"]:
                try:
                    property_type = property_type_map[j["property"]]
                except KeyError:
                    property_type = get_property_type(
                        j["property"], sparql_endpoint)
                    if property_type != "Property Not Found":
                        # Cache under the QUALIFIER's property; caching
                        # under the statement's property would corrupt
                        # the map.
                        property_type_map[j["property"]] = property_type

                # Reset so the "value is None" guard below actually fires
                # instead of reusing the statement's (or a previous
                # qualifier's) value.
                value = None
                if property_type == "WikibaseItem":
                    value = Item(str(j["value"]))
                elif property_type == "WikibaseProperty":
                    value = Property(j["value"])
                elif property_type == "String":
                    value = StringValue(j["value"])
                elif property_type == "Quantity":
                    value = QuantityValue(j["value"])
                elif property_type == "Time":
                    # NOTE(review): unlike the statement branch, the
                    # precision is not passed through
                    # translate_precision_to_integer here - confirm.
                    value = TimeValue(str(j["value"]), Item(j["calendar"]),
                                      j["precision"], j["time_zone"])
                elif property_type == "Url":
                    value = URLValue(j["value"])
                elif property_type == "Monolingualtext":
                    value = MonolingualText(j["value"], j["lang"])
                elif property_type == "ExternalId":
                    value = ExternalIdentifier(j["value"])
                elif property_type == "GlobeCoordinate":
                    value = GlobeCoordinate(j["latitude"], j["longitude"],
                                            j["precision"])
                elif property_type == "Property Not Found":
                    is_error = True
                if value is None:
                    continue
                s.add_qualifier(j["property"], value)
        doc.kg.add_subject(s)

    if is_error:
        raise Exception('data exception while generating triples')
    return doc.kg.serialize(filetype)
示例#7
0
文件: generator.py 项目: yyht/kgtk
    def generate_normal_triple(self, node1: str, property: str, node2: str,
                               is_qualifier_edge: bool, e_id: str) -> bool:
        """
        Turn one edge into a statement or a qualifier in the document graph.

        The object value is built from ``node2`` according to the type
        registered for ``property`` in ``self.prop_types``. For a qualifier
        edge the value is added as a qualifier on the most recently created
        statement (``self.to_append_statement``); otherwise a new statement
        is created on the entity for ``node1`` and remembered as
        ``self.to_append_statement``.

        :param node1: subject node identifier
        :param property: property identifier; must be a key of
            ``self.prop_types``
        :param node2: raw object value of the edge
        :param is_qualifier_edge: whether the edge qualifies the previous
            statement
        :param e_id: edge identifier, used as statement id when
            ``self.use_id`` is set
        :return: ``True`` when the edge was added; ``False`` when a time,
            quantity or URL value could not be parsed or validated
        """
        if self.use_id:
            e_id = TripleGenerator.replace_illegal_string(e_id)
        entity = self._node_2_entity(node1)
        edge_type = self.prop_types[property]
        if edge_type == Item:
            obj = WDItem(TripleGenerator.replace_illegal_string(node2))
        elif edge_type == WDProperty:
            obj = WDProperty(TripleGenerator.replace_illegal_string(node2),
                             self.prop_types[node2])

        elif edge_type == TimeValue:
            if self.yyyy_mm_dd_pattern.match(node2):
                # NOTE(review): a full date is still recorded with year
                # precision, as in the original (marked TODO there).
                date_time_string, precision = node2, Precision.year
            elif self.yyyy_pattern.match(node2):
                date_time_string = node2 + "-01-01"
                precision = Precision.year
            else:
                # TODO, in future, the two cases above will be dropped in
                # principle to comply with the iso format
                # ISO form: ^[+]<datetime>Z/<precision>
                # Explicit check rather than ``assert`` so the validation
                # survives ``python -O``.
                if not node2.startswith("^"):
                    return False
                node2 = node2[1:]  # remove ^
                if node2.startswith("+"):
                    node2 = node2[1:]
                try:
                    date_time_string, precision = node2.split("/")
                except ValueError:
                    return False
                date_time_string = date_time_string[:-1]  # remove Z
            try:
                obj = TimeValue(
                    value=date_time_string,
                    calendar=Item("Q1985727"),
                    precision=precision,
                    time_zone=0,
                )
            except Exception:
                # malformed time value: reject the edge
                return False

        elif edge_type == GlobeCoordinate:
            latitude, longitude = node2[1:].split("/")
            obj = GlobeCoordinate(float(latitude),
                                  float(longitude),
                                  0.0001,
                                  globe=Item("Q2"))  # earth

        elif edge_type == QuantityValue:
            # +70[+60,+80]Q743895
            matched = self.quantity_pattern.match(node2)
            if matched is None:
                # not a quantity literal: reject the edge instead of
                # failing on ``None.groups()``
                return False
            amount, lower_bound, upper_bound, unit = matched.groups()

            amount = TripleGenerator.clean_number_string(amount)
            num_type = self.xsd_number_type(amount)

            lower_bound = TripleGenerator.clean_number_string(lower_bound)
            upper_bound = TripleGenerator.clean_number_string(upper_bound)

            # Only pass the optional parts that are actually present;
            # bounds are used only when both are given.
            quantity_kwargs = {"type": num_type}
            if unit is not None:
                quantity_kwargs["unit"] = Item(unit)
            if upper_bound is not None and lower_bound is not None:
                quantity_kwargs["upper_bound"] = upper_bound
                quantity_kwargs["lower_bound"] = lower_bound
            obj = QuantityValue(amount, **quantity_kwargs)

        elif edge_type == MonolingualText:
            text_string, lang = TripleGenerator.process_text_string(node2)
            obj = MonolingualText(text_string, lang)
        elif edge_type == ExternalIdentifier:
            obj = ExternalIdentifier(node2)
        elif edge_type == URLValue:
            if not TripleGenerator.is_valid_uri_with_scheme_and_host(node2):
                return False
            obj = URLValue(node2)
        else:
            # treat everything else as stringValue
            obj = StringValue(node2)

        if isinstance(obj, (WDItem, WDProperty)):
            self.doc.kg.add_subject(obj)

        if is_qualifier_edge:
            # edge: e8 p9 ^2013-01-01T00:00:00Z/11
            # create qualifier edge on previous STATEMENT and return the
            # updated STATEMENT
            self.to_append_statement.add_qualifier(property, obj)
            self.doc.kg.add_subject(self.to_append_statement)
        else:
            # edge: q1 p8 q2 e8
            # create brand new property edge and replace STATEMENT
            if self.truthy:
                add_statement = entity.add_truthy_statement
            else:
                add_statement = entity.add_statement
            if self.use_id:
                self.to_append_statement = add_statement(
                    property, obj, statement_id=e_id)
            else:
                self.to_append_statement = add_statement(property, obj)
            self.doc.kg.add_subject(entity)
        return True
示例#8
0
def generate_triples(user_id: str, resolved_excel: list, sparql_endpoint: str, filetype: str = 'ttl') -> str:
	"""
	This function uses ETK to generate the RDF triples.

	Each entry of ``resolved_excel`` becomes one statement on a ``WDItem``,
	plus one qualifier per entry of its optional ``qualifier`` list.
	Property types are looked up in ``property_type_map`` first and fetched
	from ``sparql_endpoint`` on a miss; successfully resolved types are
	cached in that map.

	Statements and qualifiers whose property type is not handled here are
	skipped.

	:param user_id: unused by this function
	:param resolved_excel: list of dicts, each with a ``statement`` entry
	:param sparql_endpoint: endpoint used to resolve unknown property types
	:param filetype: serialization format passed to ``doc.kg.serialize``
	:return: the serialized knowledge graph, or the string
		``"Property Not Found"`` when any property could not be found
	"""
	# initialize
	kg_schema = KGSchema()
	kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
	etk = ETK(kg_schema=kg_schema, modules=ETKModule)
	doc = etk.create_document({}, doc_id="http://isi.edu/default-ns/projects")

	# bind prefixes
	doc.kg.bind('wikibase', 'http://wikiba.se/ontology#')
	doc.kg.bind('wd', 'http://www.wikidata.org/entity/')
	doc.kg.bind('wdt', 'http://www.wikidata.org/prop/direct/')
	doc.kg.bind('wdtn', 'http://www.wikidata.org/prop/direct-normalized/')
	doc.kg.bind('wdno', 'http://www.wikidata.org/prop/novalue/')
	doc.kg.bind('wds', 'http://www.wikidata.org/entity/statement/')
	doc.kg.bind('wdv', 'http://www.wikidata.org/value/')
	doc.kg.bind('wdref', 'http://www.wikidata.org/reference/')
	doc.kg.bind('p', 'http://www.wikidata.org/prop/')
	doc.kg.bind('pr', 'http://www.wikidata.org/prop/reference/')
	doc.kg.bind('prv', 'http://www.wikidata.org/prop/reference/value/')
	doc.kg.bind('prn', 'http://www.wikidata.org/prop/reference/value-normalized/')
	doc.kg.bind('ps', 'http://www.wikidata.org/prop/statement/')
	doc.kg.bind('psv', 'http://www.wikidata.org/prop/statement/value/')
	doc.kg.bind('psn', 'http://www.wikidata.org/prop/statement/value-normalized/')
	doc.kg.bind('pq', 'http://www.wikidata.org/prop/qualifier/')
	doc.kg.bind('pqv', 'http://www.wikidata.org/prop/qualifier/value/')
	doc.kg.bind('pqn', 'http://www.wikidata.org/prop/qualifier/value-normalized/')
	doc.kg.bind('skos', 'http://www.w3.org/2004/02/skos/core#')
	doc.kg.bind('prov', 'http://www.w3.org/ns/prov#')
	doc.kg.bind('schema', 'http://schema.org/')

	is_error = False
	for i in resolved_excel:
		stmt = i["statement"]
		item = WDItem(stmt["item"],  creator='http://www.isi.edu/t2wml')
		try:
			property_type = property_type_map[stmt["property"]]
		except KeyError:
			property_type = get_property_type(stmt["property"], sparql_endpoint)
			# Do not cache a failed lookup: the property may exist later.
			if property_type != "Property Not Found":
				property_type_map[stmt["property"]] = property_type

		# Reset per row so an unsupported type can never reuse the value
		# built for an earlier row (or hit an unbound name on the first).
		value = None
		if property_type == "WikibaseItem":
			value = Item(str(stmt["value"]))
		elif property_type == "WikibaseProperty":
			value = Property(stmt["value"])
		elif property_type == "String":
			value = StringValue(stmt["value"])
		elif property_type == "Quantity":
			value = QuantityValue(stmt["value"])
		elif property_type == "Time":
			value = TimeValue(str(stmt["value"]), Item(stmt["calendar"]), translate_precision_to_integer(stmt["precision"]), stmt["time_zone"])
		elif property_type == "Url":
			value = URLValue(stmt["value"])
		elif property_type == "Monolingualtext":
			value = MonolingualText(stmt["value"], stmt["lang"])
		elif property_type == "ExternalId":
			value = ExternalIdentifier(stmt["value"])
		elif property_type == "GlobeCoordinate":
			value = GlobeCoordinate(stmt["latitude"], stmt["longitude"], stmt["precision"])
		elif property_type == "Property Not Found":
			is_error = True
			break
		if value is None:
			# unsupported property type: skip this statement entirely
			continue

		s = item.add_statement(stmt["property"], value)
		doc.kg.add_subject(item)

		if "qualifier" in stmt:
			for j in stmt["qualifier"]:
				try:
					property_type = property_type_map[j["property"]]
				except KeyError:
					property_type = get_property_type(j["property"], sparql_endpoint)
					if property_type != "Property Not Found":
						property_type_map[j["property"]] = property_type

				# Reset so a missing or unsupported qualifier type never
				# reuses the statement's (or a previous qualifier's) value.
				value = None
				if property_type == "WikibaseItem":
					value = Item(str(j["value"]))
				elif property_type == "WikibaseProperty":
					value = Property(j["value"])
				elif property_type == "String":
					value = StringValue(j["value"])
				elif property_type == "Quantity":
					value = QuantityValue(j["value"])
				elif property_type == "Time":
					# NOTE(review): unlike the statement branch, the
					# precision is not passed through
					# translate_precision_to_integer here - confirm.
					value = TimeValue(str(j["value"]), Item(j["calendar"]), j["precision"], j["time_zone"])
				elif property_type == "Url":
					value = URLValue(j["value"])
				elif property_type == "Monolingualtext":
					value = MonolingualText(j["value"], j["lang"])
				elif property_type == "ExternalId":
					value = ExternalIdentifier(j["value"])
				elif property_type == "GlobeCoordinate":
					value = GlobeCoordinate(j["latitude"], j["longitude"], j["precision"])
				elif property_type == "Property Not Found":
					is_error = True
				if value is None:
					continue
				s.add_qualifier(j["property"], value)
		doc.kg.add_subject(s)

	if is_error:
		return "Property Not Found"
	return doc.kg.serialize(filetype)
示例#9
0
    def upload(self):
        """
        Upload the dataset held in ``self.doc`` to ``self.update_server``.

        Steps, in order:

        1. Delete the current ``P1114`` value of the special node
           ``wd:Z00000`` through a SPARQL update.
        2. Add ``Z00000`` with ``self.resource_id`` as its new ``P1114``
           value to the document and POST the document as Turtle.
        3. On HTTP 200, read the serialized change record and rebuild the
           truthy statements for every (node, property) pair in it.

        :raises ValueError: when the SPARQL update fails or the upload does
            not answer with status code 200
        """
        # This special Q node is used to store the next count to store the new Q node
        sparql_query = """
            prefix wdt: <http://www.wikidata.org/prop/direct/>
            prefix wdtn: <http://www.wikidata.org/prop/direct-normalized/>
            prefix wdno: <http://www.wikidata.org/prop/novalue/>
            prefix wds: <http://www.wikidata.org/entity/statement/>
            prefix wdv: <http://www.wikidata.org/value/>
            prefix wdref: <http://www.wikidata.org/reference/>
            prefix wd: <http://www.wikidata.org/entity/>
            prefix wikibase: <http://wikiba.se/ontology#>
            prefix p: <http://www.wikidata.org/prop/>
            prefix pqv: <http://www.wikidata.org/prop/qualifier/value/>
            prefix pq: <http://www.wikidata.org/prop/qualifier/>
            prefix ps: <http://www.wikidata.org/prop/statement/>
            prefix psn: <http://www.wikidata.org/prop/statement/value-normalized/>
            prefix prv: <http://www.wikidata.org/prop/reference/value/>
            prefix psv: <http://www.wikidata.org/prop/statement/value/>
            prefix prn: <http://www.wikidata.org/prop/reference/value-normalized/>
            prefix pr: <http://www.wikidata.org/prop/reference/>
            prefix pqn: <http://www.wikidata.org/prop/qualifier/value-normalized/>
            prefix skos: <http://www.w3.org/2004/02/skos/core#>
            prefix prov: <http://www.w3.org/ns/prov#>
            prefix schema: <http://schema.org/>
            prefix bd: <http://www.bigdata.com/rdf#>
            prefix bds: <http://www.bigdata.com/rdf/search#>

            delete {
                  wd:Z00000 wdt:P1114 ?x .
                }
                where {
                    wd:Z00000 wdt:P1114 ?x .
                }
            """
        try:
            sparql = SPARQLWrapper(self.update_server)
            sparql.setQuery(sparql_query)
            sparql.setReturnFormat(JSON)
            sparql.setMethod(POST)
            sparql.setRequestMethod(URLENCODED)
            sparql.setCredentials(config.user, config.password)
            sparql.query()
        except Exception as e:
            print("Updating the count for datamart failed!")
            # keep the original failure attached for debugging
            raise ValueError(
                "Unable to connect to datamart query service") from e
        # add datamart count to ttl
        q = WDItem('Z00000')
        q.add_label('Datamart datasets count', lang='en')
        q.add_statement('P1114', QuantityValue(self.resource_id))  # count
        self.doc.kg.add_subject(q)
        # upload
        extracted_data = self.doc.kg.serialize("ttl")
        headers = {
            'Content-Type': 'application/x-turtle',
        }
        response = requests.post(self.update_server,
                                 data=extracted_data.encode('utf-8'),
                                 headers=headers,
                                 auth=HTTPBasicAuth(config.user,
                                                    config.password))
        print('Upload file finished with status code: {}!'.format(
            response.status_code))

        if response.status_code != 200:
            raise ValueError("Uploading file failed")

        # upload truthy
        temp_output = StringIO()
        serialize_change_record(temp_output)
        temp_output.seek(0)
        tu = TruthyUpdater(self.update_server, False, config.user,
                           config.password)
        np_list = []
        for line in temp_output:
            # Strip BEFORE the emptiness check: a blank line still holds
            # its newline, so the unstripped test never skipped it and the
            # unpacking below failed.
            line = line.strip()
            if not line:
                continue
            node, prop = line.split('\t')
            np_list.append((node, prop))
        tu.build_truthy(np_list)
        print('Update truthy finished!')
示例#10
0
 def model_data(self, input_df: pd.DataFrame, metadata: dict):
     """
     Model one dataset as a ``WDItem`` and add it to ``self.doc``.

     The item is identified as ``'D' + str(self.resource_id)``, and
     ``self.resource_id`` is incremented. Title, keywords, file type and URL
     are read from ``metadata``; missing or empty values fall back to an
     empty string (``"https://"`` for the URL). Each column index in
     ``self.columns_are_string`` is described through
     ``self.process_one_column``; a failure there is printed and does not
     stop processing.

     :param input_df: the dataset; columns are accessed by position
     :param metadata: dataset metadata, or ``None`` for no metadata
     """
     if metadata is None:
         metadata = {}
     title = metadata.get("title") or ""
     keywords = metadata.get("keywords") or ""
     file_type = metadata.get("file_type") or ""
     # TODO: if no url given?
     url = metadata.get("url") or "https://"
     if isinstance(keywords, list):
         keywords = " ".join(keywords)
     node_id = 'D' + str(self.resource_id)
     q = WDItem(node_id)
     self.resource_id += 1
     q.add_label(node_id, lang='en')
     q.add_statement('P31', Item('Q1172284'))  # instance of dataset
     q.add_statement('P2699', URLValue(url))  # url
     q.add_statement('P2701', StringValue(file_type))  # file type
     q.add_statement('P1476', MonolingualText(title, lang='en'))  # title
     q.add_statement('C2001', StringValue(node_id))  # datamart identifier
     q.add_statement('C2004', StringValue(keywords))  # keywords
     # each columns
     for i in self.columns_are_string:
         try:
             semantic_type = metadata['variables'][i]['semantic_type']
         except (KeyError, IndexError, TypeError):
             # Metadata may be empty (see the default above) or lack this
             # column, so a missing key must fall back just like a missing
             # index. The fallback is a list, matching the list of semantic
             # types that process_one_column is given otherwise.
             semantic_type = ['http://schema.org/Text']
         res = self.process_one_column(column_data=input_df.iloc[:, i],
                                       item=q,
                                       column_number=i,
                                       semantic_type=semantic_type)
         if not res:
             print("Error when adding column " + str(i))
     self.doc.kg.add_subject(q)
示例#11
0
def _build_statement_values(item, property_type):
    """
    Build the statement values described by one YAML statement entry.

    :param item: mapping for one statement; "value" is read for most types,
        plus type-specific keys ("calendar", "precision", "time_zone",
        "lang", "latitude", "longitude")
    :param property_type: the property type string returned by
        ``get_property_type``
    :return: a list of value objects (always a list, so callers can iterate)
    :raises ValueError: if ``property_type`` is not one of the handled types
    """
    if property_type == "WikibaseItem":
        raw_values = item['value']
        if not isinstance(raw_values, list):
            raw_values = [raw_values]
        return [Item(raw) for raw in raw_values if raw is not None]
    if property_type == "Quantity":
        raw_values = item['value']
        if not isinstance(raw_values, list):
            raw_values = [raw_values]
        return [QuantityValue(raw) for raw in raw_values]
    if property_type == "WikibaseProperty":
        return [Property(item['value'])]
    if property_type == "String":
        return [StringValue(item['value'])]
    if property_type == "Time":
        return [
            TimeValue(str(item['value']), Item(item["calendar"]),
                      translate_precision_to_integer(item["precision"]),
                      item["time_zone"])
        ]
    if property_type == "Url":
        return [URLValue(item['value'])]
    if property_type == "Monolingualtext":
        return [MonolingualText(item['value'], item["lang"])]
    if property_type == "ExternalId":
        return [ExternalIdentifier(item['value'])]
    if property_type == "GlobeCoordinate":
        return [
            GlobeCoordinate(item["latitude"], item["longitude"],
                            item["precision"])
        ]
    # Previously an unhandled type silently reused the value of the previous
    # statement (or failed with a NameError on the first one).
    raise ValueError(
        "Unsupported property type: {!r}".format(property_type))


def model_data(properties_file_path, output_file_path) -> None:
    """
    Generate triples for user defined properties and items (intended for
    upload to Wikidata) and write them to a Turtle file.

    The YAML file maps node ids to a definition with the keys "label",
    "description" and "statements"; ids starting with 'P' additionally need
    a "type" ("quantity" or "url"). The type of each statement property is
    looked up through ``get_property_type`` and cached per property.

    :param properties_file_path: path of the YAML file to read (utf8)
    :param output_file_path: path of the Turtle file to write (utf8)
    :return: None
    :raises ValueError: if a node id starts with neither 'Q' nor 'P', or a
        statement property has a type that is not handled
    """
    # Close the input file deterministically; an empty YAML document loads
    # as None, which is treated as "no definitions".
    with open(properties_file_path, 'r', encoding='utf8') as stream:
        yaml_data = yaml.safe_load(stream) or {}

    # initialize
    kg_schema = KGSchema()
    kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
    etk = ETK(kg_schema=kg_schema, modules=ETKModule)
    doc = etk.create_document({}, doc_id="http://isi.edu/default-ns/projects")

    # bind prefixes
    prefixes = (
        ('wikibase', 'http://wikiba.se/ontology#'),
        ('wd', 'http://www.wikidata.org/entity/'),
        ('wdt', 'http://www.wikidata.org/prop/direct/'),
        ('wdtn', 'http://www.wikidata.org/prop/direct-normalized/'),
        ('wdno', 'http://www.wikidata.org/prop/novalue/'),
        ('wds', 'http://www.wikidata.org/entity/statement/'),
        ('wdv', 'http://www.wikidata.org/value/'),
        ('wdref', 'http://www.wikidata.org/reference/'),
        ('p', 'http://www.wikidata.org/prop/'),
        ('pr', 'http://www.wikidata.org/prop/reference/'),
        ('prv', 'http://www.wikidata.org/prop/reference/value/'),
        ('prn', 'http://www.wikidata.org/prop/reference/value-normalized/'),
        ('ps', 'http://www.wikidata.org/prop/statement/'),
        ('psv', 'http://www.wikidata.org/prop/statement/value/'),
        ('psn', 'http://www.wikidata.org/prop/statement/value-normalized/'),
        ('pq', 'http://www.wikidata.org/prop/qualifier/'),
        ('pqv', 'http://www.wikidata.org/prop/qualifier/value/'),
        ('pqn', 'http://www.wikidata.org/prop/qualifier/value-normalized/'),
        ('skos', 'http://www.w3.org/2004/02/skos/core#'),
        ('prov', 'http://www.w3.org/ns/prov#'),
        ('schema', 'http://schema.org/'),
    )
    for prefix, namespace in prefixes:
        doc.kg.bind(prefix, namespace)

    # The public endpoint would be "https://query.wikidata.org/sparql".
    sparql_endpoint = "http://dsbox02.isi.edu:8899/bigdata/namespace/wdq/sparql"
    type_map = {'quantity': Datatype.QuantityValue, 'url': URLValue}
    property_type_cache = {}
    for node_id, spec in yaml_data.items():
        if node_id.startswith('Q'):
            p = WDItem(node_id, creator='http://www.isi.edu/t2wml')
        elif node_id.startswith('P'):
            p = WDProperty(node_id,
                           type_map[spec['type']],
                           creator='http://www.isi.edu/t2wml')
        else:
            # Without this guard the labels and statements below would be
            # attached to the entity left over from the previous iteration.
            raise ValueError(
                "Node id must start with 'Q' or 'P': {!r}".format(node_id))

        for lang, labels in spec['label'].items():
            if not isinstance(labels, list):
                labels = [labels]
            for text in labels:
                p.add_label(text, lang=lang)

        for lang, descriptions in spec['description'].items():
            if not isinstance(descriptions, list):
                descriptions = [descriptions]
            for text in descriptions:
                p.add_description(text, lang=lang)

        for pnode, items in spec['statements'].items():
            if not isinstance(items, list):
                items = [items]
            for item in items:
                # Look each property type up only once per property.
                try:
                    property_type = property_type_cache[pnode]
                except KeyError:
                    property_type = get_property_type(pnode, sparql_endpoint)
                    property_type_cache[pnode] = property_type

                for val in _build_statement_values(item, property_type):
                    p.add_statement(pnode, val)

        doc.kg.add_subject(p)

    # Serialize before opening the output so a serialization failure does not
    # leave a truncated file behind; write utf8 to match the input encoding.
    data = doc.kg.serialize('ttl')
    with open(output_file_path, "w", encoding='utf8') as f:
        f.write(data)
示例#12
0
        def genNormalTriple(self, node1: str, label: str, node2: str,
                            isPropEdge: bool) -> bool:
            """
            The normal triple's type is determined by
            1. label's datatype in prop_types.tsv
            2. kgtk format convention of node2 field

            Update the self.STATEMENT

            :param node1: subject id; treated as a property when it is a key
                of ``self.propTypes``, otherwise as an item
            :param label: edge label; must be a key of ``self.propTypes``
            :param node2: the raw value, parsed according to the label's type
            :param isPropEdge: True to create a new statement on the subject,
                False to add a qualifier to the current ``self.STATEMENT``
            :return: always True
            :raises ValueError: if node2 does not have the shape required by
                the label's type (time, coordinate or quantity)
            """
            # determine the node type [property|item]
            if node1 in self.propTypes:
                entity = WDProperty(node1.upper(), self.propTypes[node1])
            else:
                entity = WDItem(node1.upper())
            # determine the edge type
            edgeType = self.propTypes[label]
            if edgeType == Item:
                OBJECT = Item(node2.upper())

            elif edgeType == TimeValue:
                # https://www.wikidata.org/wiki/Help:Dates
                # ^2013-01-01T00:00:00Z/11
                # The first character is dropped; only the date part before
                # the "T" is used.
                dateTimeString, precision = node2[1:].split("/")
                dateString, _ = dateTimeString.split("T")
                OBJECT = TimeValue(
                    value=dateString,
                    calendar=Item("Q1985727"),
                    precision=precision,
                    time_zone=0,
                )

            elif edgeType == GlobeCoordinate:
                # first character is dropped, rest is "<latitude>/<longitude>"
                latitude, longitude = node2[1:].split("/")
                OBJECT = GlobeCoordinate(latitude,
                                         longitude,
                                         0.0001,
                                         globe=StringValue("Earth"))

            elif edgeType == QuantityValue:
                # "<signed number>U<unit digits>"; a raw string avoids the
                # invalid escape sequences of the previous pattern.
                quantityMatch = re.match(
                    r"([+-]?[0-9]+\.?[0-9]*)U([0-9]+)", node2)
                if quantityMatch is None:
                    raise ValueError(
                        "Malformed quantity value: {!r}".format(node2))
                amount, unit = quantityMatch.groups()
                OBJECT = QuantityValue(amount=float(amount), unit=Item(unit))

            elif edgeType == MonolingualText:
                # "<text>@<lang>"; the language is whatever follows the last
                # "@". Without an "@" the whole value is the text and the
                # language defaults to "en".
                textString, separator, lang = node2.rpartition("@")
                if not separator:
                    textString, lang = node2, "en"
                try:
                    OBJECT = MonolingualText(textString, lang)
                except Exception:
                    # Keep the original best-effort fallback to English when
                    # the value cannot be built with the parsed language.
                    OBJECT = MonolingualText(textString, "en")
            else:
                # treat everything else as stringValue
                OBJECT = StringValue(node2)

            if isPropEdge:
                # edge: q1 p8 q2 e8
                # create brand new property edge and replace STATEMENT
                self.STATEMENT = entity.add_statement(label.upper(), OBJECT)
            else:
                # edge: e8 p9 ^2013-01-01T00:00:00Z/11
                # create qualifier edge on previous STATEMENT and return the updated STATEMENT
                self.STATEMENT.add_qualifier(label.upper(), OBJECT)
            self.doc.kg.add_subject(self.STATEMENT)
            return True