Example #1
 def setUp(self):
     ontology_content = '''
         @prefix : <http://dig.isi.edu/ontologies/dig/> .
         @prefix dig: <http://dig.isi.edu/ontologies/dig/> .
         @prefix owl: <http://www.w3.org/2002/07/owl#> .
         @prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
         @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
         @prefix schema: <http://schema.org/> .
         @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
         :Person a owl:Class ;
             rdfs:subClassOf :Actor, :Biological_Object ;
             :common_properties :label, :title, :religion ; .
         :has_name a owl:DatatypeProperty ;
             schema:domainIncludes :Person ;
             schema:rangeIncludes xsd:string ; .
         :has_child a owl:ObjectProperty ;
             schema:domainIncludes :Person ;
             schema:rangeIncludes :Person ; .
         '''
     kg_schema = KGSchema()
     kg_schema.add_schema(ontology_content, 'ttl')
     etk = ETK(kg_schema=kg_schema)
     self.doc = etk.create_document(dict(),
                                    doc_id='http://xxx/1',
                                    type_=[URI('dig:Person')])
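
A hedged sketch (not part of the original test case) of a test method that could follow this setUp: it adds a value for the has_name property declared in the ontology above and checks that the field landed in the document's knowledge graph. The literal is illustrative, and it assumes the KGSchema exposes has_name as a field.

 def test_add_name(self):
     # has_name is the owl:DatatypeProperty declared in ontology_content above
     self.doc.kg.add_value('has_name', value='John Doe')
     self.assertIn('has_name', self.doc.kg.value)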
Example #2
    def setUp(self):
        sample_doc = {
            "projects": [{
                "name": "etk",
                "description":
                "version 2 of etk, implemented by Runqi12 Shao, Dongyu Li, Sylvia lin, Amandeep and others.",
                "members": ["dongyu", "amandeep", "sylvia", "Runqi12"],
                "date": "2007-12-05",
                "place": "columbus:georgia:united states:-84.98771:32.46098",
                "s": "segment_test_1"
            }, {
                "name": "rltk",
                "description":
                "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students.",
                "members": ["mayank", "yixiang"],
                "date": ["2007-12-05T23:19:00"],
                "cost": -3213.32,
                "s": "segment_test_2"
            }]
        }
        kg_schema = KGSchema(
            json.load(open('etk/unit_tests/ground_truth/test_config.json')))

        etk = ETK(kg_schema)
        self.doc = etk.create_document(sample_doc)
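
A hedged sketch of a follow-on test for this fixture, selecting values out of sample_doc with a JSONPath expression in the style of Example #3 below; it is not part of the original source.

    def test_project_names(self):
        # JSONPath segment selection over the fixture document
        names = [seg.value for seg in self.doc.select_segments("projects[*].name")]
        self.assertEqual(names, ["etk", "rltk"])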
Example #3
 def test_segment(self) -> None:
     etk = ETK()
     doc = etk.create_document(sample_input)
     descriptions = doc.select_segments("projects[*].description")
     description_value = [i.value for i in descriptions]
     expected = [
         'version 2 of etk, implemented by Runqi, Dongyu, Sylvia, Amandeep and others.',
         'record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students.'
     ]
     self.assertEqual(description_value, expected)
Example #4
 def test_website_patterns_condition(self) -> None:
     etk = ETK()
     doc = etk.create_document(sample_input)
     default_doc_selector = DefaultDocumentSelector()
     res_true = default_doc_selector.select_document(
         doc, website_patterns=[".*unittest", ".*abc"])
     res_false = default_doc_selector.select_document(
         doc, website_patterns=[".*ABc", ".*hhhh"])
     self.assertEqual(True, res_true)
     self.assertEqual(False, res_false)
Example #5
 def test_json_paths_and_json_paths_regex(self) -> None:
     etk = ETK()
     doc = etk.create_document(sample_input)
     default_doc_selector = DefaultDocumentSelector()
     res_true = default_doc_selector.select_document(
         doc,
         json_paths=["$.website"],
         json_paths_regex=[".*unittest", ".*abc"])
     res_false = default_doc_selector.select_document(
         doc, json_paths=["$.website"], json_paths_regex=[".*hhhh"])
     self.assertEqual(True, res_true)
     self.assertEqual(False, res_false)
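
These selector tests constrain what sample_input must contain. A hypothetical document shape that would satisfy them (every field name and value here is an assumption inferred from the regexes; the real fixture is not shown on this page):

 sample_input = {
     "doc_id": "1",
     "dataset": "unittest",                # matched by ".*unittest"
     "url": "http://www.unittest/doc/1",   # matched by ".*unittest"
     "website": "www.unittest.org",        # matched by ".*unittest" and "$.website"
     "projects": []
 }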
Example #6
    def test_segment(self) -> None:
        kg_schema = KGSchema(
            json.load(open('etk/unit_tests/ground_truth/test_config.json')))

        etk = ETK(kg_schema=kg_schema)
        doc = etk.create_document(sample_input)
        descriptions = doc.select_segments("projects[*].description")
        description_value = [i.value for i in descriptions]
        expected = [
            'version 2 of etk, implemented by Runqi, Dongyu, Sylvia, Amandeep and others.',
            'record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students.'
        ]
        self.assertEqual(description_value, expected)
Example #7
File: generator.py  Project: yyht/kgtk
 def reset_etk_doc(self,
                   doc_id: str = "http://isi.edu/default-ns/projects"):
     """
     reset the doc object and return it. Called at initialization and after outputting triples.
     """
     kg_schema = KGSchema()
     kg_schema.add_schema("@prefix : <http://isi.edu/> .", "ttl")
     self.etk = ETK(kg_schema=kg_schema, modules=ETKModule)
     self.doc = self.etk.create_document({}, doc_id=doc_id)
     for k, v in wiki_namespaces.items():
         if k in self.prefix_dict:
             self.doc.kg.bind(k, self.prefix_dict[k])
         else:
             self.doc.kg.bind(k, v)
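
wiki_namespaces is not shown in this snippet; it is presumably a module-level mapping from prefix to namespace URI. An illustrative subset, with URIs matching the bindings used in Examples #18 and #21:

 wiki_namespaces = {
     "wd": "http://www.wikidata.org/entity/",
     "wdt": "http://www.wikidata.org/prop/direct/",
     "p": "http://www.wikidata.org/prop/",
     "ps": "http://www.wikidata.org/prop/statement/",
     "pq": "http://www.wikidata.org/prop/qualifier/",
 }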
Example #8
    def __init__(self,
                 master_config,
                 em_paths,
                 logger,
                 worker_id,
                 project_name,
                 kafka_input_args=None,
                 kafka_output_args=None):
        self.logger = logger
        self.worker_id = worker_id
        self.check_interval = 1000
        self.exit_sign = False

        try:
            kg_schema = KGSchema(master_config)
            self.etk_ins = ETK(kg_schema, em_paths, logger=logger)
        except Exception as e:
            logger.exception('ETK initialization failed')
            raise e

        # kafka input ("config" was undefined in this snippet; assuming the
        # Kafka settings live in master_config)
        self.kafka_input_server = master_config['input_server']
        self.kafka_input_session_timeout = master_config['input_session_timeout']
        self.kafka_input_group_id = master_config['input_group_id']
        self.kafka_input_topic = '{project_name}_in'.format(
            project_name=project_name)
        self.kafka_input_args = dict(
        ) if kafka_input_args is None else kafka_input_args
        self.kafka_consumer = KafkaConsumer(
            bootstrap_servers=self.kafka_input_server,
            group_id=self.kafka_input_group_id,
            consumer_timeout_ms=self.check_interval,
            value_deserializer=lambda v: json.loads(v.decode('utf-8')),
            **self.kafka_input_args)
        self.kafka_consumer.subscribe([self.kafka_input_topic])

        # kafka output
        self.kafka_output_server = master_config['output_server']
        self.kafka_output_topic = '{project_name}_out'.format(
            project_name=project_name)
        self.kafka_output_args = dict(
        ) if kafka_output_args is None else kafka_output_args
        self.kafka_producer = KafkaProducer(
            bootstrap_servers=self.kafka_output_server,
            value_serializer=lambda v: json.dumps(v).encode('utf-8'),
            **self.kafka_output_args)

        self.timeout_count = self.kafka_input_session_timeout / self.check_interval
        self.current_timeout_count = 0
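
The constructor above only wires things up. Below is a hedged sketch of the kind of polling loop such a worker might run; it is not from the original source and relies only on standard kafka-python behavior (iterating the consumer stops after consumer_timeout_ms without messages) plus the ETK calls seen elsewhere on this page.

    def run(self):
        while not self.exit_sign:
            got_message = False
            for msg in self.kafka_consumer:  # value_deserializer already decoded the JSON
                got_message = True
                doc = self.etk_ins.create_document(msg.value)
                for result in self.etk_ins.process_ems(doc):
                    self.kafka_producer.send(self.kafka_output_topic,
                                             value=result.cdr_document)
            if got_message:
                self.current_timeout_count = 0
            else:
                # consumer timed out with no messages; count consecutive timeouts
                self.current_timeout_count += 1
                if self.current_timeout_count >= self.timeout_count:
                    break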
Example #9
    def test_EmailExtractor(self) -> None:
        kg_schema = KGSchema(
            json.load(open('etk/unit_tests/ground_truth/test_config.json')))

        etk = ETK(kg_schema=kg_schema, use_spacy_tokenizer=True)

        text = "[email protected] [email protected] " \
               "[email protected] [email protected]  E-mail:[email protected] [email protected]"

        email_extractor = EmailExtractor(nlp=etk.default_nlp,
                                         tokenizer=etk.default_tokenizer,
                                         extractor_name="email_extractor")

        extractions = email_extractor.extract(text)

        extracted = []
        for i in extractions:
            extracted_value = {
                "value":
                i.value,
                "start_char":
                i.provenance["start_char"],
                "end_char":
                i.provenance["end_char"],
                "value_from_text":
                text[i.provenance["start_char"]:i.provenance["end_char"]]
            }
            extracted.append(extracted_value)
            self.assertEqual(extracted_value["value"],
                             extracted_value["value_from_text"])

        expected = [{
            'value': '*****@*****.**',
            'start_char': 97,
            'end_char': 122,
            'value_from_text': '*****@*****.**'
        }, {
            'value': '*****@*****.**',
            'start_char': 0,
            'end_char': 16,
            'value_from_text': '*****@*****.**'
        }, {
            'value': '*****@*****.**',
            'start_char': 77,
            'end_char': 96,
            'value_from_text': '*****@*****.**'
        }, {
            'value': '*****@*****.**',
            'start_char': 17,
            'end_char': 40,
            'value_from_text': '*****@*****.**'
        }, {
            'value': '*****@*****.**',
            'start_char': 51,
            'end_char': 68,
            'value_from_text': '*****@*****.**'
        }]

        self.assertEqual(sorted(expected, key=lambda x: x["start_char"]),
                         sorted(extracted, key=lambda x: x["start_char"]))
Example #10
 def test_all_condition(self) -> None:
     etk = ETK()
     doc = etk.create_document(sample_input)
     default_doc_selector = DefaultDocumentSelector()
     res_true = default_doc_selector.select_document(
         doc,
         datasets=[".*unittest", ".*abc"],
         url_patterns=[".*unittest", ".*zxc"],
         website_patterns=[".*unittest", ".*abc"],
         json_paths=["$.website"],
         json_paths_regex=[".*unittest", ".*abc"])
     res_false = default_doc_selector.select_document(
         doc,
         datasets=[".*abc", ".*hhhh"],
         url_patterns=[".*ZXc", ".*hhhh"],
         website_patterns=[".*ABc", ".*hhhh"],
         json_paths=["$.website"],
         json_paths_regex=[".*hhhh"])
     self.assertEqual(True, res_true)
     self.assertEqual(False, res_false)
Example #11
 def setUp(self):
     ontology_content = '''
             @prefix : <http://dig.isi.edu/ontologies/dig/> .
             @prefix owl: <http://www.w3.org/2002/07/owl#> .
             @prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
             @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
             @prefix schema: <http://schema.org/> .
             @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
             :Person a owl:Class ;
                 rdfs:subClassOf :Actor, :Biological_Object ;
                 :common_properties :label, :title, :religion ; .
             :has_name a owl:DatatypeProperty ;
                 schema:domainIncludes :Person ;
                 schema:rangeIncludes xsd:string ; .
             :has_child a owl:ObjectProperty ;
                 schema:domainIncludes :Person ;
                 schema:rangeIncludes :Person ; .
         '''
     ontology = Ontology(ontology_content,
                         validation=False,
                         include_undefined_class=True,
                         quiet=True)
     kg_schema = KGSchema(ontology.merge_with_master_config(dict()))
     etk = ETK(kg_schema=kg_schema,
               ontology=ontology,
               generate_json_ld=True)
     etk2 = ETK(kg_schema=kg_schema,
                ontology=ontology,
                generate_json_ld=False)
     self.doc = etk.create_document(dict(),
                                    doc_id='http://xxx/1',
                                    type_=[DIG.Person.toPython()])
     self.doc2 = etk2.create_document(dict(),
                                      doc_id='http://xxx/2',
                                      type_=[DIG.Person.toPython()])
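
DIG here is presumably an rdflib-style Namespace over the same prefix used in ontology_content, so DIG.Person.toPython() yields the plain URI string passed as the document type. An illustrative reconstruction (the import and definition are assumptions):

 from rdflib import Namespace

 DIG = Namespace('http://dig.isi.edu/ontologies/dig/')
 print(DIG.Person.toPython())  # 'http://dig.isi.edu/ontologies/dig/Person'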
Example #12
 def __init__(
     self,
     propFile: str,
     labelSet: str,
     aliasSet: str,
     descriptionSet: str,
     n: int,
     destFp: TextIO = sys.stdout,
 ):
     self.propTypes = self.__setPropTypes(propFile)
     self.labelSet, self.aliasSet, self.descriptionSet = self.__setSets(
         labelSet, aliasSet, descriptionSet)
     # TODO handle standard output
     self.fp = destFp
     self.n = int(n)
     self.read = 0
      # serialize prefix
     kg_schema = KGSchema()
     kg_schema.add_schema("@prefix : <http://isi.edu/> .", "ttl")
     self.etk = ETK(kg_schema=kg_schema, modules=ETKModule)
     self.doc = self.__setDoc()
     self.__serialize_prefix()
Example #13
 def test_etk_crf_glossary_extraction(self):
     etk = ETK(use_spacy_tokenizer=False)
     s = time.time()
     city_extractor = GlossaryExtractor(
         ['los angeles', 'new york', 'angeles'],
         'city_extractor',
         etk.default_tokenizer,
         case_sensitive=False,
         ngrams=3)
     doc_json = {
         'text': 'i live in los angeles. my hometown is Beijing. I love New York City.'
     }
     doc = Document(etk,
                    cdr_document=doc_json,
                    mime_type='json',
                    url='',
                    doc_id='1')
     t_segments = doc.select_segments("$.text")
     for t_segment in t_segments:
         extracted_cities = doc.extract(city_extractor, t_segment)
         for extracted_city in extracted_cities:
             self.assertTrue(extracted_city.value in
                             ['los angeles', 'New York', 'angeles'])
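
As an aside, the same document could presumably be built through the ETK facade used in the other examples on this page instead of constructing Document directly, e.g.:

     doc = etk.create_document(doc_json, doc_id='1')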
Example #14
from etk.timeseries_processor import TimeseriesProcessor
import pprint


class TimeseriesETKModule(ETKModule):
    """
       Abstract class for extraction module
       """
    def __init__(self, etk):
        ETKModule.__init__(self, etk)

    def process_document(self, doc):
        pass


if __name__ == "__main__":
    etk = ETK(modules=TimeseriesETKModule)

    annotation = './resources/DIESEL_june_annotation.json'
    spreadsheet = './resources/DIESEL_june_2017.xlsx'
    timeseries_processor = TimeseriesProcessor(etk=etk,
                                               annotation=annotation,
                                               spreadsheet=spreadsheet)
    file_name = 'test_file_name'
    data_set = 'test_data_set'

    docs = [
        doc.cdr_document for doc in timeseries_processor.timeseries_extractor(
            file_name=file_name, data_set=data_set)
    ]
    pprint.pprint(docs)
Example #15
 #                     "groundpig, whistlepig, whistler, thickwood badger, "
 #                     "Canada marmot, monax, moonack, weenusk, red monk and, "
 #                     "among French Canadians in eastern Canada, siffleur"
 #         },
 #         {
 #             "name": "Test3 - Social Media",
 #             "description": "Parser stress test for tweets",
 #             "text": "Slides onto twitter..... \n"
 #                     ".......slippery floor....... \n"
 #                     "............slides out the other side..."
 #         }
 #     ],
 #     "doc_id": 42069
 # }
 #
 etk = ETK(modules=SentenceSplittingETKModule)
 # doc = etk.create_document(toy_doc)
 #
 # split_doc = etk.process_ems(doc)
 #
 # print(json.dumps(split_doc[0].value, indent=2))
 parser = OptionParser(conflict_handler="resolve")
 parser.add_option("-i",
                   "--input_file",
                   action="store",
                   type="string",
                   dest="input_file")
 parser.add_option("-o",
                   "--output_file",
                   action="store",
                   type="string",
Example #16
    def model_statement(self):
        # initialize KGSchema
        kg_schema = KGSchema()
        kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
        etk = ETK(kg_schema=kg_schema, modules=ETKModule)
        doc = etk.create_document({},
                                  doc_id='http://isi.edu/default-ns/projects')

        # bind prefix
        doc = create_custom_prefix(doc, custom_dict={self.ns: self.uri})

        # extract files
        self.extract_files()

        # model statement
        inputs = self.data['inputs']
        for k, v in inputs.items():
            if k != 'metadata':
                # construct wikifier instance
                if k == 'wikifier' and not v['existed']:
                    q = WDItem(v['qnode'],
                               namespace=self.ns,
                               creator=':datamart')
                    q.add_label('A wikifier file for ' +
                                inputs['dataset']['content']['filename'],
                                lang='en')
                    q.add_statement('P31', Item(
                        'SDQ1001',
                        namespace=self.ns))  # an instance of Wikifier
                    q.add_statement('P127',
                                    Item('SDQ1003',
                                         namespace=self.ns))  # belongs to
                    q.add_statement('SDP3003',
                                    StringValue(v['content']),
                                    namespace=self.ns)  # hasFileContent
                    q.add_statement('SDP3004',
                                    StringValue(v['hashcode']),
                                    namespace=self.ns)  # hashValue

                # construct mapping_file instance
                elif k == 'mappingFile' and not v['existed']:
                    q = WDItem(v['qnode'],
                               namespace=self.ns,
                               creator=':datamart')
                    q.add_label('A mapping file for ' +
                                inputs['dataset']['content']['filename'],
                                lang='en')
                    q.add_statement('P31', Item(
                        'SDQ1002',
                        namespace=self.ns))  # an instance of MappingFile
                    q.add_statement('P170', StringValue('T2WML'))
                    q.add_statement('P127', Item('SDQ1003', namespace=self.ns))
                    q.add_statement('SDP3003',
                                    StringValue(json.dumps(v['content'])),
                                    namespace=self.ns)
                    q.add_statement('SDP3004',
                                    StringValue(v['hashcode']),
                                    namespace=self.ns)

                # construct dataset instance
                elif k == 'dataset' and not v['existed']:
                    q = WDItem(v['qnode'],
                               namespace=self.ns,
                               creator=':datamart')
                    q.add_label(v['content']['title'], lang='en')
                    q.add_description(v['content']['description'], lang='en')
                    q.add_statement('P31',
                                    Item('Q1172284'))  # an instance of Dataset
                    q.add_statement('SDP3001',
                                    Item(inputs['wikifier']['qnode'],
                                         namespace=self.ns),
                                    namespace=self.ns)  # a wikifier file
                    q.add_statement('SDP3002',
                                    Item(inputs['mappingFile']['qnode'],
                                         namespace=self.ns),
                                    namespace=self.ns)  # a mapping file
                    q.add_statement('P1476', StringValue(
                        v['content']['title']))  # title
                    q.add_statement(
                        'P921',
                        StringValue(v['content']['description']))  # described
                    q.add_statement('P127',
                                    Item('SDQ1003',
                                         namespace=self.ns))  # belongs to
                    q.add_statement('SDP2004',
                                    StringValue(', '.join(
                                        v['content']['keywords'])),
                                    namespace=self.ns)  # keywords
                    q.add_statement('SDP3004',
                                    StringValue(v['hashcode']),
                                    namespace=self.ns)

                    if self.data['storeColumnValue']:
                        for data in v['content']['variable_measured']:
                            statement = q.add_statement(
                                'SDP2005',
                                StringValue(data['column_name']),
                                namespace=self.ns)  # variable measured
                            statement.add_qualifier(
                                'SDP2006',
                                StringValue(data['values_of_a_column']),
                                namespace=self.ns)  # the values of a column
                            statement.add_qualifier(
                                'SDP2007',
                                Item(data['data_structure_type']),
                                namespace=self.ns)  # data structure type
                            statement.add_qualifier(
                                'SDP2008',
                                URLValue(data['semantic_type_identifier']),
                                namespace=self.ns)  # semantic type
                            statement.add_qualifier(
                                'P1545',
                                QuantityValue(
                                    data['column_index'],
                                    namespace=self.ns))  # column index

                doc.kg.add_subject(q)

        return doc
Example #17
    def model_schema(self):
        # read data
        data = self.read_data(self.data['schema'])

        # initialize KGSchema
        custom_dict, ns_dict = {}, {'wd': 'http://www.wikidata.org/entity/'}
        for each in data['prefix']:
            for k, v in each.items():
                custom_dict[k] = v
                if k != 'wd':
                    ns_dict[k] = v + '/entity'
        kg_schema = KGSchema()
        kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
        etk = ETK(kg_schema=kg_schema, modules=ETKModule)
        doc = etk.create_document({},
                                  doc_id='http://isi.edu/default-ns/projects')

        # bind prefix
        doc = create_custom_prefix(doc, custom_dict)

        type_map = {
            'quantity': Datatype.QuantityValue,
            'url': URLValue,
            'item': Datatype.Item,
            'time': Datatype.TimeValue,
            'string': Datatype.StringValue,
            'text': Datatype.MonolingualText
        }

        # model schema
        for k, v in data.items():
            if ':' in k:
                k = k.split(':')
                if 'Q' in k[1]:
                    p = WDItem(k[1], namespace=k[0], creator=':datamart')
                elif 'P' in k[1]:
                    p = WDProperty(k[1],
                                   type_map[v['type']],
                                   namespace=k[0],
                                   creator=':datamart')
                else:
                    raise Exception('There is no P/Q information.')

                for lang, value in v['description'].items():
                    for val in value:
                        p.add_description(val, lang=lang)

                for lang, value in v['label'].items():
                    for val in value:
                        p.add_label(val, lang=lang)

                for node, value in v['statements'].items():
                    ns = node.split(':')[0] if ':' in node else 'wd'
                    for val in value:
                        prop_type = self.get_property_type(node, ns_dict[ns])
                        if prop_type == 'WikibaseItem':
                            v = Item(str(val['value']))
                        elif prop_type == 'WikibaseProperty':
                            v = Property(val['value'])
                        elif prop_type == 'String':
                            v = StringValue(val['value'])
                        elif prop_type == 'Quantity':
                            v = QuantityValue(val['value'])
                        elif prop_type == 'Url':
                            v = URLValue(val['value'])
                        elif prop_type == 'Monolingualtext':
                            v = MonolingualText(val['value'], val['lang'])
                        p.add_statement(node, v)
                doc.kg.add_subject(p)

        return doc
Example #18
def generate_triples(user_id: str,
                     resolved_excel: list,
                     sparql_endpoint: str,
                     filetype: str = 'ttl',
                     created_by: str = 't2wml') -> str:
    """
    This function uses ETK to generate the RDF triples
    :param user_id:
    :param resolved_excel:
    :param sparql_endpoint:
    :param filetype:
    :return:
    """
    # initialize
    kg_schema = KGSchema()
    kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
    etk = ETK(kg_schema=kg_schema, modules=ETKModule)
    doc = etk.create_document({}, doc_id="http://isi.edu/default-ns/projects")
    property_type_map = property_type_dict

    # bind prefixes
    doc.kg.bind('wikibase', 'http://wikiba.se/ontology#')
    doc.kg.bind('wd', 'http://www.wikidata.org/entity/')
    doc.kg.bind('wdt', 'http://www.wikidata.org/prop/direct/')
    doc.kg.bind('wdtn', 'http://www.wikidata.org/prop/direct-normalized/')
    doc.kg.bind('wdno', 'http://www.wikidata.org/prop/novalue/')
    doc.kg.bind('wds', 'http://www.wikidata.org/entity/statement/')
    doc.kg.bind('wdv', 'http://www.wikidata.org/value/')
    doc.kg.bind('wdref', 'http://www.wikidata.org/reference/')
    doc.kg.bind('p', 'http://www.wikidata.org/prop/')
    doc.kg.bind('pr', 'http://www.wikidata.org/prop/reference/')
    doc.kg.bind('prv', 'http://www.wikidata.org/prop/reference/value/')
    doc.kg.bind('prn',
                'http://www.wikidata.org/prop/reference/value-normalized/')
    doc.kg.bind('ps', 'http://www.wikidata.org/prop/statement/')
    doc.kg.bind('psv', 'http://www.wikidata.org/prop/statement/value/')
    doc.kg.bind('psn',
                'http://www.wikidata.org/prop/statement/value-normalized/')
    doc.kg.bind('pq', 'http://www.wikidata.org/prop/qualifier/')
    doc.kg.bind('pqv', 'http://www.wikidata.org/prop/qualifier/value/')
    doc.kg.bind('pqn',
                'http://www.wikidata.org/prop/qualifier/value-normalized/')
    doc.kg.bind('skos', 'http://www.w3.org/2004/02/skos/core#')
    doc.kg.bind('prov', 'http://www.w3.org/ns/prov#')
    doc.kg.bind('schema', 'http://schema.org/')

    # property_type_cache = {}
    is_error = False
    for i in resolved_excel:
        _item = i["statement"]["item"]
        if _item is not None:
            item = WDItem(_item,
                          creator='http://www.isi.edu/{}'.format(created_by))
            try:
                property_type = property_type_map[i["statement"]["property"]]
            except KeyError:
                property_type = get_property_type(i["statement"]["property"],
                                                  sparql_endpoint)
                if property_type != "Property Not Found" and i["statement"][
                        "property"] not in property_type_map:
                    property_type_map[i["statement"]
                                      ["property"]] = property_type
            if property_type == "WikibaseItem":
                value = Item(str(i["statement"]["value"]))
            elif property_type == "WikibaseProperty":
                value = Property(i["statement"]["value"])
            elif property_type == "String":
                value = StringValue(i["statement"]["value"])
            elif property_type == "Quantity":
                _value = i["statement"]["value"]
                _value = str(_value).replace(',', '')
                value = QuantityValue(_value)
            elif property_type == "Time":
                value = TimeValue(
                    str(i["statement"]["value"]),
                    Item(i["statement"]["calendar"]),
                    translate_precision_to_integer(
                        i["statement"]["precision"]),
                    i["statement"]["time_zone"])
            elif property_type == "Url":
                value = URLValue(i["statement"]["value"])
            elif property_type == "Monolingualtext":
                value = MonolingualText(i["statement"]["value"],
                                        i["statement"]["lang"])
            elif property_type == "ExternalId":
                value = ExternalIdentifier(i["statement"]["value"])
            elif property_type == "GlobeCoordinate":
                value = GlobeCoordinate(i["statement"]["latitude"],
                                        i["statement"]["longitude"],
                                        i["statement"]["precision"])
            elif property_type == "Property Not Found":
                is_error = True
                break
            s = item.add_statement(i["statement"]["property"], value)
            doc.kg.add_subject(item)

            if "qualifier" in i["statement"]:
                for j in i["statement"]["qualifier"]:
                    try:
                        property_type = property_type_map[j["property"]]

                    except KeyError:
                        property_type = get_property_type(
                            j["property"], sparql_endpoint)
                        if property_type != "Property Not Found" and i[
                                "statement"][
                                    "property"] not in property_type_map:
                            property_type_map[i["statement"]
                                              ["property"]] = property_type
                    if property_type == "WikibaseItem":
                        value = Item(str(j["value"]))
                    elif property_type == "WikibaseProperty":
                        value = Property(j["value"])
                    elif property_type == "String":
                        value = StringValue(j["value"])
                    elif property_type == "Quantity":
                        value = QuantityValue(j["value"])
                    elif property_type == "Time":
                        value = TimeValue(str(j["value"]), Item(j["calendar"]),
                                          j["precision"], j["time_zone"])
                    elif property_type == "Url":
                        value = URLValue(j["value"])
                    elif property_type == "Monolingualtext":
                        value = MonolingualText(j["value"], j["lang"])
                    elif property_type == "ExternalId":
                        value = ExternalIdentifier(j["value"])
                    elif property_type == "GlobeCoordinate":
                        value = GlobeCoordinate(j["latitude"], j["longitude"],
                                                j["precision"])
                    elif property_type == "Property Not Found":
                        is_error = True
                    if value is None:
                        continue
                    else:
                        s.add_qualifier(j["property"], value)
            doc.kg.add_subject(s)
    if not is_error:
        data = doc.kg.serialize(filetype)
    else:
        # data = "Property Not Found"
        raise Exception('data exception while generating triples')

    return data
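
A hedged usage sketch for this function; every concrete value below is illustrative, not from the source, and the call assumes property_type_dict or the SPARQL endpoint can resolve the property's type:

 resolved_excel = [{
     "statement": {
         "item": "Q42",       # subject item
         "property": "P31",   # instance of
         "value": "Q5",       # human
     }
 }]
 ttl = generate_triples(user_id="u1",
                        resolved_excel=resolved_excel,
                        sparql_endpoint="https://query.wikidata.org/sparql")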
Example #19
    """
    def __init__(self, etk):
        ETKModule.__init__(self, etk)
        self.inferlink_extractor = InferlinkExtractor(
            InferlinkRuleSet(
                InferlinkRuleSet.load_rules_file(
                    '../html_basic/sample_inferlink_rules.json')))

    def process_document(self, doc):
        """
        Add your code for processing the document
        """

        raw = doc.select_segments("$.raw_content")[0]
        extractions = doc.extract(self.inferlink_extractor, raw)
        doc.store(extractions, "inferlink_extraction")
        return list()


if __name__ == "__main__":
    sample_html = json.load(codecs.open('../html_basic/sample_html.json',
                                        'r'))  # read sample file from disk

    etk = ETK(modules=InferlinkETKModule)
    doc = etk.create_document(sample_html,
                              mime_type="text/html",
                              url="http://ex.com/123")

    docs = etk.process_ems(doc)

    print(json.dumps(docs[0].value, indent=2))
Example #20
File: generator.py  Project: yyht/kgtk
class TripleGenerator(Generator):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        prop_declaration = kwargs.pop("prop_declaration")
        dest_fp = kwargs.pop("dest_fp")
        truthy = kwargs.pop("truthy")
        use_id = kwargs.pop("use_id")
        prefix_path = kwargs.pop("prefix_path")
        self.datatype_mapping = {
            # nomenclature from https://w.wiki/Tfn
            "item": Item,
            "WikibaseItem": Item,
            "time": TimeValue,
            "Time": TimeValue,
            "globe-coordinate": GlobeCoordinate,
            "GlobeCoordinate": GlobeCoordinate,
            "quantity": QuantityValue,
            "Quantity": QuantityValue,
            "monolingualtext": MonolingualText,
            "Monolingualtext": MonolingualText,
            "string": StringValue,
            "String": StringValue,
            "external-identifier": ExternalIdentifier,
            "ExternalId": ExternalIdentifier,
            "url": StringValue,  #TODO bug potentially in rdflib
            "Url": StringValue,
            "property": WDProperty,
            "WikibaseProperty": WDProperty
        }
        self.set_prefix(prefix_path)
        self.prop_declaration = prop_declaration
        self.set_properties(self.prop_file)
        self.fp = dest_fp
        self.truthy = truthy
        self.reset_etk_doc()
        self.serialize_prefix()
        self.use_id = use_id

    def set_prefix(self, prefix_path: str):
        self.prefix_dict = {}
        if prefix_path != "NONE":
            with open(prefix_path, "r") as fp:
                for line_num, edge in enumerate(fp):
                    edge_list = edge.strip("\r\n").split("\t")
                    if line_num == 0:
                        node1_index, node2_index = edge_list.index(
                            "node1"), edge_list.index("node2")
                    else:
                        prefix, expand = edge_list[node1_index], edge_list[
                            node2_index]
                        self.prefix_dict[prefix] = expand

    def read_prop_declaration(self, line_number: int, edge: str):
        node1, node2, prop, e_id = self.parse_edges(edge)
        if prop == "data_type":
            self.prop_types[node1] = self.datatype_mapping[node2.strip()]
        return

    def set_properties(self, prop_file: str):
        self.prop_types = {}
        if prop_file == "NONE":
            return

        with open(prop_file, "r") as fp:
            props = fp.readlines()
        for line in props[1:]:
            node1, _, node2 = line.split("\t")
            try:
                self.prop_types[node1] = self.datatype_mapping[node2.strip()]
            except KeyError:
                raise KGTKException(
                    "DataType {} of node {} is not supported.\n".format(
                        node2, node1))

    def _node_2_entity(self, node: str):
        '''
        A node can be Qxxx or Pxxx, return the proper entity.
        '''
        if node in self.prop_types:
            entity = WDProperty(node, self.prop_types[node])
        else:
            entity = WDItem(TripleGenerator.replace_illegal_string(node))
        return entity

    def reset_etk_doc(self,
                      doc_id: str = "http://isi.edu/default-ns/projects"):
        """
        reset the doc object and return it. Called at initialization and after outputting triples.
        """
        kg_schema = KGSchema()
        kg_schema.add_schema("@prefix : <http://isi.edu/> .", "ttl")
        self.etk = ETK(kg_schema=kg_schema, modules=ETKModule)
        self.doc = self.etk.create_document({}, doc_id=doc_id)
        for k, v in wiki_namespaces.items():
            if k in self.prefix_dict:
                self.doc.kg.bind(k, self.prefix_dict[k])
            else:
                self.doc.kg.bind(k, v)

    def serialize(self):
        """
        Serialize the triples. Uses a hack to avoid serializing the prefix again.
        """
        docs = self.etk.process_ems(self.doc)
        self.fp.write("\n\n".join(
            docs[0].kg.serialize("ttl").split("\n\n")[1:]))
        self.fp.flush()
        self.reset()

    def serialize_prefix(self):
        """
        This function should be called only once, after the doc object is initialized.
        To serialize the prefix at the very beginning, it has to be written out directly because of the rdflib 4.2.2 -> 5.0.0 change.
        Relevant issue: https://github.com/RDFLib/rdflib/issues/965
        """
        for k, v in wiki_namespaces.items():
            if k in self.prefix_dict:
                line = "@prefix " + k + ": <" + self.prefix_dict[k] + "> .\n"
            else:
                line = "@prefix " + k + ": <" + v + "> .\n"
            self.fp.write(line)
        self.fp.write("\n")
        self.fp.flush()
        self.reset()

    def reset(self):
        self.to_append_statement_id = None
        self.to_append_statement = None
        self.read_num_of_lines = 0
        self.reset_etk_doc()

    def generate_label_triple(self, node1: str, node2: str) -> bool:
        entity = self._node_2_entity(node1)
        text_string, lang = TripleGenerator.process_text_string(node2)
        entity.add_label(text_string, lang=lang)
        self.doc.kg.add_subject(entity)
        return True

    def generate_description_triple(self, node1: str, node2: str) -> bool:
        entity = self._node_2_entity(node1)
        text_string, lang = TripleGenerator.process_text_string(node2)
        entity.add_description(text_string, lang=lang)
        self.doc.kg.add_subject(entity)
        return True

    def generate_alias_triple(self, node1: str, node2: str) -> bool:
        entity = self._node_2_entity(node1)
        text_string, lang = TripleGenerator.process_text_string(node2)
        entity.add_alias(text_string, lang=lang)
        self.doc.kg.add_subject(entity)
        return True

    def generate_prop_declaration_triple(self, node1: str, node2: str) -> bool:
        # update the known prop_types
        if node1 in self.prop_types:
            if not self.prop_declaration:
                raise KGTKException(
                    "Duplicated property definition of {} found!".format(
                        node1))
        else:
            self.prop_types[node1] = node2

        prop = WDProperty(node1, self.datatype_mapping[node2])
        self.doc.kg.add_subject(prop)
        return True

    def generate_normal_triple(self, node1: str, property: str, node2: str,
                               is_qualifier_edge: bool, e_id: str) -> bool:
        if self.use_id:
            e_id = TripleGenerator.replace_illegal_string(e_id)
        entity = self._node_2_entity(node1)
        edge_type = self.prop_types[property]
        if edge_type == Item:
            object = WDItem(TripleGenerator.replace_illegal_string(node2))
        elif edge_type == WDProperty:
            object = WDProperty(TripleGenerator.replace_illegal_string(node2),
                                self.prop_types[node2])

        elif edge_type == TimeValue:
            if self.yyyy_mm_dd_pattern.match(node2):
                try:
                    dateTimeString = node2
                    object = TimeValue(
                        value=dateTimeString,  # TODO
                        calendar=Item("Q1985727"),
                        precision=Precision.year,
                        time_zone=0,
                    )
                except:
                    return False
            elif self.yyyy_pattern.match(node2):
                try:
                    dateTimeString = node2 + "-01-01"
                    object = TimeValue(
                        value=dateTimeString,  # TODO
                        calendar=Item("Q1985727"),
                        precision=Precision.year,
                        time_zone=0,
                    )
                except:
                    return False
            else:
                try:
                    # TODO, in future, the two cases above will be dropped in principle to comply with the iso format
                    # now it is iso format
                    assert (node2[0] == "^")
                    node2 = node2[1:]  # remove ^
                    if node2.startswith("+"):
                        node2 = node2[1:]
                    dateTimeString, precision = node2.split("/")
                    dateTimeString = dateTimeString[:-1]  # remove Z
                    object = TimeValue(
                        value=dateTimeString,
                        calendar=Item("Q1985727"),
                        precision=precision,
                        time_zone=0,
                    )
                except:
                    return False

        elif edge_type == GlobeCoordinate:
            latitude, longitude = node2[1:].split("/")
            latitude = float(latitude)
            longitude = float(longitude)
            object = GlobeCoordinate(latitude,
                                     longitude,
                                     0.0001,
                                     globe=Item("Q2"))  # earth

        elif edge_type == QuantityValue:
            # +70[+60,+80]Q743895
            res = self.quantity_pattern.match(node2).groups()
            amount, lower_bound, upper_bound, unit = res

            amount = TripleGenerator.clean_number_string(amount)
            num_type = self.xsd_number_type(amount)

            lower_bound = TripleGenerator.clean_number_string(lower_bound)
            upper_bound = TripleGenerator.clean_number_string(upper_bound)
            if unit is not None:
                if upper_bound is not None and lower_bound is not None:
                    object = QuantityValue(amount,
                                           unit=Item(unit),
                                           upper_bound=upper_bound,
                                           lower_bound=lower_bound,
                                           type=num_type)
                else:
                    object = QuantityValue(amount,
                                           unit=Item(unit),
                                           type=num_type)
            else:
                if upper_bound is not None and lower_bound is not None:
                    object = QuantityValue(amount,
                                           upper_bound=upper_bound,
                                           lower_bound=lower_bound,
                                           type=num_type)
                else:
                    object = QuantityValue(amount, type=num_type)

        elif edge_type == MonolingualText:
            text_string, lang = TripleGenerator.process_text_string(node2)
            object = MonolingualText(text_string, lang)
        elif edge_type == ExternalIdentifier:
            object = ExternalIdentifier(node2)
        elif edge_type == URLValue:
            if TripleGenerator.is_valid_uri_with_scheme_and_host(node2):
                object = URLValue(node2)
            else:
                return False
        else:
            # treat everything else as stringValue
            object = StringValue(node2)

        if type(object) == WDItem or type(object) == WDProperty:
            self.doc.kg.add_subject(object)

        if is_qualifier_edge:
            # edge: e8 p9 ^2013-01-01T00:00:00Z/11
            # create qualifier edge on previous STATEMENT and return the updated STATEMENT
            self.to_append_statement.add_qualifier(property, object)
            self.doc.kg.add_subject(self.to_append_statement)
        else:
            # edge: q1 p8 q2 e8
            # create brand new property edge and replace STATEMENT
            if self.truthy:
                self.to_append_statement = entity.add_truthy_statement(
                    property, object, statement_id=e_id
                ) if self.use_id else entity.add_truthy_statement(
                    property, object)
            else:
                self.to_append_statement = entity.add_statement(
                    property, object, statement_id=e_id
                ) if self.use_id else entity.add_statement(property, object)
            self.doc.kg.add_subject(entity)
        return True

    def entry_point(self, line_number: int, edge: str):
        # print(line_number,edge)
        """
        generates a list of two, the first element is the determination of the edge type using corresponding edge type
        the second element is a bool indicating whether this is a valid property edge or qualifier edge.
        Call corresponding downstream functions
        """
        if line_number == 1:
            # initialize the order_map
            self.initialize_order_map(edge)
            return

        # use the order_map to map the node
        node1, node2, prop, e_id = self.parse_edges(edge)
        if line_number == 2:
            # by default a statement edge
            is_qualifier_edge = False
        else:
            if node1 != self.to_append_statement_id and node1 != self.corrupted_statement_id:
                is_qualifier_edge = False
                # also a new statement edge
                if self.read_num_of_lines >= self.n:
                    self.serialize()
            else:
                # qualifier edge or property declaration edge
                is_qualifier_edge = True
                if node1 == self.corrupted_statement_id:
                    self.warn_log.write(
                        "QUALIFIER edge at line [{}] associated with corrupted statement edge of id [{}] dropped.\n"
                        .format(line_number, self.corrupted_statement_id))
                    return
        if prop in self.label_set:
            success = self.generate_label_triple(node1, node2)
        elif prop in self.description_set:
            success = self.generate_description_triple(node1, node2)
        elif prop in self.alias_set:
            success = self.generate_alias_triple(node1, node2)
        elif prop == "data_type":
            # special edge of prop declaration
            success = self.generate_prop_declaration_triple(node1, node2)
        else:
            if prop in self.prop_types:
                success = self.generate_normal_triple(node1, prop, node2,
                                                      is_qualifier_edge, e_id)
            else:
                raise KGTKException(
                    "property [{}]'s type is unknown at line [{}].\n".format(
                        prop, line_number))
        if (not success) and self.warning:
            if not is_qualifier_edge:
                self.warn_log.write(
                    "CORRUPTED_STATEMENT edge at line: [{}] with edge id [{}].\n"
                    .format(line_number, e_id))
                self.corrupted_statement_id = e_id
            else:
                self.warn_log.write(
                    "CORRUPTED_QUALIFIER edge at line: [{}] with edge id [{}].\n"
                    .format(line_number, e_id))

        else:
            self.read_num_of_lines += 1
            if not is_qualifier_edge:
                self.to_append_statement_id = e_id

    @staticmethod
    def xsd_number_type(num):
        if isinstance(num, float) and 'e' in str(num).lower():
            return LiteralType.double
        return LiteralType.decimal
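
The xsd_number_type helper at the end maps an amount to an XSD literal type: a float whose string form uses scientific notation becomes xsd:double, everything else xsd:decimal. For instance:

    TripleGenerator.xsd_number_type(1e-07)  # 'e' in '1e-07' -> LiteralType.double
    TripleGenerator.xsd_number_type(3.14)   # no exponent    -> LiteralType.decimal
    TripleGenerator.xsd_number_type(42)     # not a float    -> LiteralType.decimal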
Example #21
def generate_triples(user_id: str, resolved_excel: list, sparql_endpoint: str, filetype: str = 'ttl') -> str:
	"""
	This function uses ETK to generate the RDF triples
	:param user_id:
	:param resolved_excel:
	:param sparql_endpoint:
	:param filetype:
	:return:
	"""
	# initialize
	kg_schema = KGSchema()
	kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
	etk = ETK(kg_schema=kg_schema, modules=ETKModule)
	doc = etk.create_document({}, doc_id="http://isi.edu/default-ns/projects")

	# bind prefixes
	doc.kg.bind('wikibase', 'http://wikiba.se/ontology#')
	doc.kg.bind('wd', 'http://www.wikidata.org/entity/')
	doc.kg.bind('wdt', 'http://www.wikidata.org/prop/direct/')
	doc.kg.bind('wdtn', 'http://www.wikidata.org/prop/direct-normalized/')
	doc.kg.bind('wdno', 'http://www.wikidata.org/prop/novalue/')
	doc.kg.bind('wds', 'http://www.wikidata.org/entity/statement/')
	doc.kg.bind('wdv', 'http://www.wikidata.org/value/')
	doc.kg.bind('wdref', 'http://www.wikidata.org/reference/')
	doc.kg.bind('p', 'http://www.wikidata.org/prop/')
	doc.kg.bind('pr', 'http://www.wikidata.org/prop/reference/')
	doc.kg.bind('prv', 'http://www.wikidata.org/prop/reference/value/')
	doc.kg.bind('prn', 'http://www.wikidata.org/prop/reference/value-normalized/')
	doc.kg.bind('ps', 'http://www.wikidata.org/prop/statement/')
	doc.kg.bind('psv', 'http://www.wikidata.org/prop/statement/value/')
	doc.kg.bind('psn', 'http://www.wikidata.org/prop/statement/value-normalized/')
	doc.kg.bind('pq', 'http://www.wikidata.org/prop/qualifier/')
	doc.kg.bind('pqv', 'http://www.wikidata.org/prop/qualifier/value/')
	doc.kg.bind('pqn', 'http://www.wikidata.org/prop/qualifier/value-normalized/')
	doc.kg.bind('skos', 'http://www.w3.org/2004/02/skos/core#')
	doc.kg.bind('prov', 'http://www.w3.org/ns/prov#')
	doc.kg.bind('schema', 'http://schema.org/')

	# property_type_cache = {}
	is_error = False
	for i in resolved_excel:
		item = WDItem(i["statement"]["item"],  creator='http://www.isi.edu/t2wml')
		try:
			property_type = property_type_map[i["statement"]["property"]]
		except KeyError:
			property_type = get_property_type(i["statement"]["property"], sparql_endpoint)
			property_type_map[i["statement"]["property"]] = property_type
		if property_type == "WikibaseItem":
			value = Item(str(i["statement"]["value"]))
		elif property_type == "WikibaseProperty":
			value = Property(i["statement"]["value"])
		elif property_type == "String":
			value = StringValue(i["statement"]["value"])
		elif property_type == "Quantity":
			value = QuantityValue(i["statement"]["value"])
		elif property_type == "Time":
			value = TimeValue(str(i["statement"]["value"]), Item(i["statement"]["calendar"]), translate_precision_to_integer(i["statement"]["precision"]), i["statement"]["time_zone"])
		elif property_type == "Url":
			value = URLValue(i["statement"]["value"])
		elif property_type == "Monolingualtext":
			value = MonolingualText(i["statement"]["value"], i["statement"]["lang"])
		elif property_type == "ExternalId":
			value = ExternalIdentifier(i["statement"]["value"])
		elif property_type == "GlobeCoordinate":
			value = GlobeCoordinate(i["statement"]["latitude"], i["statement"]["longitude"], i["statement"]["precision"])
		elif property_type == "Property Not Found":
			is_error = True
			break
		s = item.add_statement(i["statement"]["property"], value)
		doc.kg.add_subject(item)

		if "qualifier" in i["statement"]:
			for j in i["statement"]["qualifier"]:
				try:
					property_type = property_type_map[j["property"]]
				except KeyError:
					property_type = get_property_type(j["property"], sparql_endpoint)
					property_type_map[j["property"]] = property_type
				if property_type == "WikibaseItem":
					value = Item(str(j["value"]))
				elif property_type == "WikibaseProperty":
					value = Property(j["value"])
				elif property_type == "String":
					value = StringValue(j["value"])
				elif property_type == "Quantity":
					value = QuantityValue(j["value"])
				elif property_type == "Time":
					value = TimeValue(str(j["value"]), Item(j["calendar"]), j["precision"], j["time_zone"])
				elif property_type == "Url":
					value = URLValue(j["value"])
				elif property_type == "Monolingualtext":
					value = MonolingualText(j["value"], j["lang"])
				elif property_type == "ExternalId":
					value = ExternalIdentifier(j["value"])
				elif property_type == "GlobeCoordinate":
					value = GlobeCoordinate(j["latitude"], j["longitude"], j["precision"])
				elif property_type == "Property Not Found":
					is_error = True
				s.add_qualifier(j["property"], value)
		doc.kg.add_subject(s)
	if not is_error:
		data = doc.kg.serialize(filetype)
	else:
		data = "Property Not Found"
	# os.makedirs(Path.cwd() / "new_properties", exist_ok=True)
	# results_file_name = user_id + "_results.ttl"
	# changes_file_name = user_id + "_changes.tsv"

	# with open(Path(app.config['downloads']) / results_file_name, "w") as fp:
	# 	fp.write(data)
	# with open(Path(app.config['downloads']) / changes_file_name, "w") as fp:
	# 	serialize_change_record(fp)
	return data
Example #22
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
@prefix schema: <http://schema.org/> .
:Event a owl:Class ; .
:Entity a owl:Class ; .
:Organization a owl:Class ; .
:MOVEMENT_TRANSPORT a owl:Class ; .
:GeopoliticalEntity a owl:Class ; .
skos:prefLabel a owl:DatatypeProperty ; 
    schema:domainIncludes :Entity, :Event ;
    rdfs:range xsd:string ; .
:conflict_attack_place a owl:ObjectProperty ;
    schema:domainIncludes :Entity, :Event ;
    schema:rangeIncludes :GeopoliticalEntity ; .
    '''

    ontology = Ontology(ontology_content,
                        validation=False,
                        include_undefined_class=True,
                        quiet=True)
    kg_schema = KGSchema(ontology.merge_with_master_config(dict()))
    etk = ETK(modules=ExampleETKModule, kg_schema=kg_schema, ontology=ontology)
    input_data = {'doc_id': '1', 'data': json.loads(sample_input)}
    doc = etk.create_document(input_data)
    docs = etk.process_ems(doc)
    kgs = [json.dumps(doc.kg.value) for doc in docs[1:]]
    with open('output.jsonl', 'w') as f:
        f.write('\n'.join(kgs))
    with open('output.nt', 'w') as f:
        f.writelines(map(rdf_generation, kgs))
Example #23
            "news_story": {
                "type": "string"
            },
            "similarity": {
                "type": "number"
            },
            "matched_sentence": {
                "type": "string"
            },
            "date": {
                "type": "string"
            }
        }
    }
    kg_schema = KGSchema(master_config)
    etk = ETK(kg_schema, ["./"])

    # read the news
    news_file = open(
        '/Users/amandeep/Github/etk/examples/ranking_pipeline/resources/new_2018-04-03-first-10000.jl'
    )
    # news_file = open('/Users/amandeep/Github/etk/examples/ranking_pipeline/resources/news_stories_3.jl')
    news_stories = [
        etk.create_document(json.loads(line),
                            url=json.loads(line)['tld'],
                            doc_id=json.loads(line)['doc_id'])
        for line in news_file
    ]
    results = list()
    for news_story in news_stories:
        results.extend(etk.process_ems(news_story))
Example #24
import unittest, json
from etk.timeseries_processor import TimeseriesProcessor
from etk.etk import ETK
from etk.knowledge_graph import KGSchema

kg_schema = KGSchema(
    json.load(open('etk/unit_tests/ground_truth/test_config.json')))

etk = ETK(kg_schema=kg_schema)

# python -m unittest etk.unit_tests.test_timeseries_processor to run all unittests


class TestTimeseriesProcessor(unittest.TestCase):
    def test_excel_file(self) -> None:
        annotation = 'etk/timeseries/DIESEL_june_annotation.json'
        spreadsheet = 'etk/unit_tests/ground_truth/DIESEL_june_2017.xlsx'

        timeseriesProcessor = TimeseriesProcessor(etk=etk,
                                                  annotation=annotation,
                                                  spreadsheet=spreadsheet)
        docs = [
            doc.cdr_document
            for doc in timeseriesProcessor.timeseries_extractor()
        ]
        selected_docs = docs[1]
        expected_metadata = {
            "name": "AVERAGE DIESEL (AUTOMATIVE GAS OIL) PRICES/ Litre NGN",
            "granularity": "monthly",
            "provenance": {
                "filename": "DIESEL_june_2017.xlsx",
        "projects": [{
            "name":
            "etk",
            "description":
            "version 2 of etk, implemented by Runqi, Dongyu, Sylvia, Amandeep, Anika and others."
        }]
    }, {
        "projects": [{
            "name":
            "rltk",
            "description":
            "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students."
        }]
    }]

    etk = ETK(modules=ExampleETKModule)
    extractions = []

    def mapper(sample, _idx):
        doc = etk.create_document(sample)
        docs = etk.process_ems(doc)
        sys.stdout.flush()
        re = docs[0].value
        # print(re)
        return re

    def collect(extracted):
        extractions.append(extracted)

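    # Map the samples on two parallel worker processes; collect() appends each
    # extracted result to `extractions`.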
    pp = ParallelProcessor(2,
                           mapper=mapper,
Example #26
0
        # Add a title to the actor document
        doc.kg.add_value("title", json_path="$.Side")

        # Return an empty list because we didn't create new documents
        return []


# The main is for testing, and is not used in the DIG pipeline
if __name__ == "__main__":

    # Tell ETK the schema of the fields in the KG; the DIG master_config can be used as the schema.
    kg_schema = KGSchema(json.load(open('master_config.json')))

    # Instantiate ETK, with the two processing modules and the schema.
    etk = ETK(modules=[UCDPModule, UCDPActorModule], kg_schema=kg_schema)

    # Create a CSV processor to create documents for the relevant rows in the
    # Excel sheet; heading_row=1 takes the column names from row 1.
    cp = CsvProcessor(etk=etk, heading_row=1)

    with open("ucdp.jl", "w") as f:
        # Iterate over all the rows in the spreadsheet
        for doc in cp.tabular_extractor(filename="ucdp_sample.xls", dataset='ucdp'):
            # Each row produces a document, which we send to ETK.
            # Note that each invocation of process_ems will also process any
            # new documents created while processing each doc.
            etk.process_and_frame(doc)
            f.write(json.dumps(doc.cdr_document) + "\n")
            # for result in etk.process_ems(doc):
            #     # print(result.cdr_document["knowledge_graph"])
            #     f.write(json.dumps(result.cdr_document) + "\n")
Example #27
0
def model_data() -> None:
    """
	This function generates triples for user defined properties for uploading them to wikidata
	:return:
	"""
    with open(Path.cwd().parent /
              "Datasets/new-property-configuration.yaml",
              'r',
              encoding='utf8') as stream:
        yaml_data = yaml.safe_load(stream)
    # initialize
    kg_schema = KGSchema()
    kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
    etk = ETK(kg_schema=kg_schema, modules=ETKModule)
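    # The document body is empty; it only serves as a container for the
    # knowledge graph that will hold the property triples.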
    doc = etk.create_document({}, doc_id="http://isi.edu/default-ns/projects")

    # bind prefixes
    doc.kg.bind('wikibase', 'http://wikiba.se/ontology#')
    doc.kg.bind('wd', 'http://www.wikidata.org/entity/')
    doc.kg.bind('wdt', 'http://www.wikidata.org/prop/direct/')
    doc.kg.bind('wdtn', 'http://www.wikidata.org/prop/direct-normalized/')
    doc.kg.bind('wdno', 'http://www.wikidata.org/prop/novalue/')
    doc.kg.bind('wds', 'http://www.wikidata.org/entity/statement/')
    doc.kg.bind('wdv', 'http://www.wikidata.org/value/')
    doc.kg.bind('wdref', 'http://www.wikidata.org/reference/')
    doc.kg.bind('p', 'http://www.wikidata.org/prop/')
    doc.kg.bind('pr', 'http://www.wikidata.org/prop/reference/')
    doc.kg.bind('prv', 'http://www.wikidata.org/prop/reference/value/')
    doc.kg.bind('prn',
                'http://www.wikidata.org/prop/reference/value-normalized/')
    doc.kg.bind('ps', 'http://www.wikidata.org/prop/statement/')
    doc.kg.bind('psv', 'http://www.wikidata.org/prop/statement/value/')
    doc.kg.bind('psn',
                'http://www.wikidata.org/prop/statement/value-normalized/')
    doc.kg.bind('pq', 'http://www.wikidata.org/prop/qualifier/')
    doc.kg.bind('pqv', 'http://www.wikidata.org/prop/qualifier/value/')
    doc.kg.bind('pqn',
                'http://www.wikidata.org/prop/qualifier/value-normalized/')
    doc.kg.bind('skos', 'http://www.w3.org/2004/02/skos/core#')
    doc.kg.bind('prov', 'http://www.w3.org/ns/prov#')
    doc.kg.bind('schema', 'http://schema.org/')
    sparql_endpoint = "https://query.wikidata.org/sparql"
    type_map = {'quantity': Datatype.QuantityValue, 'url': URLValue}
    property_type_cache = {}
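    # Cache each P-node's datatype so the SPARQL endpoint is queried at most
    # once per property.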
    for k, v in yaml_data.items():
        p = WDProperty(k,
                       type_map[v['type']],
                       creator='http://www.isi.edu/t2wml')
        for lang, value in v['label'].items():
            for val in value:
                p.add_label(val, lang=lang)
        for lang, value in v['description'].items():
            for val in value:
                p.add_description(val, lang=lang)
        for pnode, items in v['statements'].items():
            for item in items:
                try:
                    property_type = property_type_cache[pnode]
                except KeyError:
                    property_type = get_property_type(pnode, sparql_endpoint)
                    property_type_cache[pnode] = property_type
                if property_type == "WikibaseItem":
                    value = Item(str(item['value']))
                elif property_type == "WikibaseProperty":
                    value = Property(item['value'])
                elif property_type == "String":
                    value = StringValue(item['value'])
                elif property_type == "Quantity":
                    value = QuantityValue(item['value'])
                elif property_type == "Time":
                    value = TimeValue(
                        str(item['value']), Item(item["calendar"]),
                        translate_precision_to_integer(item["precision"]),
                        item["time_zone"])
                elif property_type == "Url":
                    value = URLValue(item['value'])
                elif property_type == "Monolingualtext":
                    value = MonolingualText(item['value'], item["lang"])
                elif property_type == "ExternalId":
                    value = ExternalIdentifier(item['value'])
                elif property_type == "GlobeCoordinate":
                    value = GlobeCoordinate(item["latitude"],
                                            item["longitude"],
                                            item["precision"])

                p.add_statement(pnode, value)

        doc.kg.add_subject(p)

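    # Serialize the assembled knowledge graph as Turtle for the upload step.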
    with open(Path.cwd().parent / "new_properties/result.ttl", "w") as f:
        data = doc.kg.serialize('ttl')
        f.write(data)
Example #28
0
import os, sys, json
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
from etk.etk import ETK
from etk.knowledge_graph import KGSchema
from examples.config_to_em.em_base_generator import EmBaseGenerator

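# Generate the extraction-module skeleton from the master config; ETK then
# loads it from the ./ems directory below.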
ebg = EmBaseGenerator('template.tpl')
ebg.generate_em_base('master_config.json', 'ems/em_base.py')

kg_schema = KGSchema(json.load(open("master_config.json", "r")))

etk = ETK(kg_schema, ["./ems"])

doc = etk.create_document(json.load(open('sample_html.jl', 'r')))

docs = etk.process_ems(doc)

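# The first document returned by process_ems is the annotated input document.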
print(json.dumps(docs[0].value, indent=2))
Example #29
0
import os, sys, json, codecs
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
from etk.etk import ETK
from etk.extractors.html_content_extractor import HTMLContentExtractor, Strategy
from etk.extractors.html_metadata_extractor import HTMLMetadataExtractor
from etk.extractors.inferlink_extractor import InferlinkExtractor, InferlinkRuleSet

sample_html = json.load(codecs.open('sample_html.json',
                                    'r'))  # read sample file from disk

etk = ETK()
doc = etk.create_document(sample_html,
                          mime_type="text/html",
                          url="http://ex.com/123")

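# Three complementary extractors: HTML metadata, text-content extraction, and
# rule-based landmark (inferlink) extraction.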
metadata_extractor = HTMLMetadataExtractor()
content_extractor = HTMLContentExtractor()
landmark_extractor = InferlinkExtractor(
    InferlinkRuleSet(
        InferlinkRuleSet.load_rules_file('sample_inferlink_rules.json')))

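# "$" selects the document root; "$.raw_content" points at the raw HTML that
# the content extractor consumes.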
root = doc.select_segments("$")[0]
raw = doc.select_segments("$.raw_content")[0]

# root.store_extractions(doc.invoke_extractor(metadata_extractor, extract_title=True), "title")
# root.store_extractions(doc.invoke_extractor(metadata_extractor, extract_meta=True), "metadata")
root.store_extractions(
    doc.invoke_extractor(content_extractor, raw, strategy=Strategy.ALL_TEXT),
    "etk2_text")
root.store_extractions(
Example #30
0
        projects = doc.select_segments("projects[*]")

        for d, p in zip(descriptions, projects):
            names = doc.extract(self.rule_extractor, d)
            p.store(names, "members")
        return list()


if __name__ == "__main__":

    sample_input = {
        "projects": [{
            "name":
            "etk",
            "description":
            "version  2 of etk, implemented by Runqi12 Shao, Dongyu Li, Sylvia lin, Amandeep and others."
        }, {
            "name":
            "rltk",
            "description":
            "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students."
        }]
    }

    etk = ETK(modules=RuleETKModule)
    doc = etk.create_document(sample_input)

    docs = etk.process_ems(doc)

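    # docs[0] is the input document, now annotated with the extracted members.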
    print(json.dumps(docs[0].value, indent=2))