Example No. 1
 def setUp(self):
     ontology_content = '''
             @prefix : <http://dig.isi.edu/ontologies/dig/> .
             @prefix owl: <http://www.w3.org/2002/07/owl#> .
             @prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
             @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
             @prefix schema: <http://schema.org/> .
             @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
             :Person a owl:Class ;
                 rdfs:subClassOf :Actor, :Biological_Object ;
                 :common_properties :label, :title, :religion ; .
             :has_name a owl:DatatypeProperty ;
                 schema:domainIncludes :Person ;
                 schema:rangeIncludes xsd:string ; .
             :has_child a owl:ObjectProperty ;
                 schema:domainIncludes :Person ;
                 schema:rangeIncludes :Person ; .
         '''
     ontology = Ontology(ontology_content,
                         validation=False,
                         include_undefined_class=True,
                         quiet=True)
     kg_schema = KGSchema(ontology.merge_with_master_config(dict()))
     etk = ETK(kg_schema=kg_schema,
               ontology=ontology,
               generate_json_ld=True)
     etk2 = ETK(kg_schema=kg_schema,
                ontology=ontology,
                generate_json_ld=False)
     self.doc = etk.create_document(dict(),
                                    doc_id='http://xxx/1',
                                    type_=[DIG.Person.toPython()])
     self.doc2 = etk2.create_document(dict(),
                                      doc_id='http://xxx/2',
                                      type_=[DIG.Person.toPython()])
Example No. 2
    def setUp(self):
        sample_doc = {
            "projects": [{
                "name": "etk",
                "description":
                "version 2 of etk, implemented by Runqi12 Shao, Dongyu Li, Sylvia lin, Amandeep and others.",
                "members": ["dongyu", "amandeep", "sylvia", "Runqi12"],
                "date": "2007-12-05",
                "place": "columbus:georgia:united states:-84.98771:32.46098",
                "s": "segment_test_1"
            }, {
                "name": "rltk",
                "description":
                "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students.",
                "members": ["mayank", "yixiang"],
                "date": ["2007-12-05T23:19:00"],
                "cost": -3213.32,
                "s": "segment_test_2"
            }]
        }
        kg_schema = KGSchema(
            json.load(open('etk/unit_tests/ground_truth/test_config.json')))

        etk = ETK(kg_schema)
        self.doc = etk.create_document(sample_doc)
Example No. 3
 def setUp(self):
     ontology_content = '''
         @prefix : <http://dig.isi.edu/ontologies/dig/> .
         @prefix dig: <http://dig.isi.edu/ontologies/dig/> .
         @prefix owl: <http://www.w3.org/2002/07/owl#> .
         @prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
         @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
         @prefix schema: <http://schema.org/> .
         @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
         :Person a owl:Class ;
             rdfs:subClassOf :Actor, :Biological_Object ;
             :common_properties :label, :title, :religion ; .
         :has_name a owl:DatatypeProperty ;
             schema:domainIncludes :Person ;
             schema:rangeIncludes xsd:string ; .
         :has_child a owl:ObjectProperty ;
             schema:domainIncludes :Person ;
             schema:rangeIncludes :Person ; .
         '''
     kg_schema = KGSchema()
     kg_schema.add_schema(ontology_content, 'ttl')
     etk = ETK(kg_schema=kg_schema)
     self.doc = etk.create_document(dict(),
                                    doc_id='http://xxx/1',
                                    type_=[URI('dig:Person')])
Example No. 4
    def test_EmailExtractor(self) -> None:
        kg_schema = KGSchema(
            json.load(open('etk/unit_tests/ground_truth/test_config.json')))

        etk = ETK(kg_schema=kg_schema, use_spacy_tokenizer=True)

        text = "[email protected] [email protected] " \
               "[email protected] [email protected]  E-mail:[email protected] [email protected]"

        email_extractor = EmailExtractor(nlp=etk.default_nlp,
                                         tokenizer=etk.default_tokenizer,
                                         extractor_name="email_extractor")

        extractions = email_extractor.extract(text)

        extracted = []
        for i in extractions:
            extracted_value = {
                "value":
                i.value,
                "start_char":
                i.provenance["start_char"],
                "end_char":
                i.provenance["end_char"],
                "value_from_text":
                text[i.provenance["start_char"]:i.provenance["end_char"]]
            }
            extracted.append(extracted_value)
            self.assertEqual(extracted_value["value"],
                             extracted_value["value_from_text"])

        expected = [{
            'value': '*****@*****.**',
            'start_char': 97,
            'end_char': 122,
            'value_from_text': '*****@*****.**'
        }, {
            'value': '*****@*****.**',
            'start_char': 0,
            'end_char': 16,
            'value_from_text': '*****@*****.**'
        }, {
            'value': '*****@*****.**',
            'start_char': 77,
            'end_char': 96,
            'value_from_text': '*****@*****.**'
        }, {
            'value': '*****@*****.**',
            'start_char': 17,
            'end_char': 40,
            'value_from_text': '*****@*****.**'
        }, {
            'value': '*****@*****.**',
            'start_char': 51,
            'end_char': 68,
            'value_from_text': '*****@*****.**'
        }]

        self.assertEqual(sorted(expected, key=lambda x: x["start_char"]),
                         sorted(extracted, key=lambda x: x["start_char"]))
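The addresses in the test above were redacted by the hosting site, so here is a minimal standalone sketch of the same EmailExtractor pattern with placeholder addresses; the etk.extractors.email_extractor import path is an assumption based on the other extractor imports in this listing.

from etk.etk import ETK
from etk.extractors.email_extractor import EmailExtractor  # import path assumed

etk = ETK(use_spacy_tokenizer=True)
email_extractor = EmailExtractor(nlp=etk.default_nlp,
                                 tokenizer=etk.default_tokenizer,
                                 extractor_name="email_extractor")

text = "contact alice@example.org or bob@example.org"
for extraction in email_extractor.extract(text):
    # each extraction carries its value plus character offsets into the input text
    start = extraction.provenance["start_char"]
    end = extraction.provenance["end_char"]
    assert extraction.value == text[start:end]
    print(extraction.value, start, end)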
Example No. 5
 def test_segment(self) -> None:
     etk = ETK()
     doc = etk.create_document(sample_input)
     descriptions = doc.select_segments("projects[*].description")
     description_value = [i.value for i in descriptions]
     expected = [
         'version 2 of etk, implemented by Runqi, Dongyu, Sylvia, Amandeep and others.',
         'record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students.'
     ]
     self.assertEqual(description_value, expected)
Example No. 6
 def test_website_patterns_condition(self) -> None:
     etk = ETK()
     doc = etk.create_document(sample_input)
     default_doc_selector = DefaultDocumentSelector()
     res_true = default_doc_selector.select_document(
         doc, website_patterns=[".*unittest", ".*abc"])
     res_false = default_doc_selector.select_document(
         doc, website_patterns=[".*ABc", ".*hhhh"])
     self.assertEqual(True, res_true)
     self.assertEqual(False, res_false)
Example No. 7
 def test_json_paths_and_json_paths_regex(self) -> None:
     etk = ETK()
     doc = etk.create_document(sample_input)
     default_doc_selector = DefaultDocumentSelector()
     res_true = default_doc_selector.select_document(
         doc,
         json_paths=["$.website"],
         json_paths_regex=[".*unittest", ".*abc"])
     res_false = default_doc_selector.select_document(
         doc, json_paths=["$.website"], json_paths_regex=[".*hhhh"])
     self.assertEqual(True, res_true)
     self.assertEqual(False, res_false)
Example No. 8
    def test_segment(self) -> None:
        kg_schema = KGSchema(
            json.load(open('etk/unit_tests/ground_truth/test_config.json')))

        etk = ETK(kg_schema=kg_schema)
        doc = etk.create_document(sample_input)
        descriptions = doc.select_segments("projects[*].description")
        description_value = [i.value for i in descriptions]
        expected = [
            'version 2 of etk, implemented by Runqi, Dongyu, Sylvia, Amandeep and others.',
            'record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students.'
        ]
        self.assertEqual(description_value, expected)
Example No. 9
 def reset_etk_doc(self,
                   doc_id: str = "http://isi.edu/default-ns/projects"):
     """
     reset the doc object and return it. Called at initialization and after outputting triples.
     """
     kg_schema = KGSchema()
     kg_schema.add_schema("@prefix : <http://isi.edu/> .", "ttl")
     self.etk = ETK(kg_schema=kg_schema, modules=ETKModule)
     self.doc = self.etk.create_document({}, doc_id=doc_id)
     for k, v in wiki_namespaces.items():
         if k in self.prefix_dict:
             self.doc.kg.bind(k, self.prefix_dict[k])
         else:
             self.doc.kg.bind(k, v)
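For reference, the calls used in reset_etk_doc compose into a minimal end-to-end sketch; the prefix binding and doc_id below are illustrative, and the wiki_namespaces/prefix_dict bookkeeping from the method above is omitted.

from etk.etk import ETK
from etk.etk_module import ETKModule
from etk.knowledge_graph import KGSchema

kg_schema = KGSchema()
kg_schema.add_schema("@prefix : <http://isi.edu/> .", "ttl")
etk = ETK(kg_schema=kg_schema, modules=ETKModule)
doc = etk.create_document({}, doc_id="http://isi.edu/default-ns/projects")
doc.kg.bind("schema", "http://schema.org/")  # bind any namespace the triples will use
print(doc.kg.serialize("ttl"))               # serialize the (still empty) graph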
Example No. 10
    def __init__(self,
                 master_config,
                 em_paths,
                 logger,
                 worker_id,
                 project_name,
                 kafka_input_args=None,
                 kafka_output_args=None):
        self.logger = logger
        self.worker_id = worker_id
        self.check_interval = 1000
        self.exit_sign = False

        try:
            kg_schema = KGSchema(master_config)
            self.etk_ins = ETK(kg_schema, em_paths, logger=logger)
        except Exception as e:
            logger.exception('ETK initialization failed')
            raise e

        # kafka input
        self.kafka_input_server = config['input_server']
        self.kafka_input_session_timeout = config['input_session_timeout']
        self.kafka_input_group_id = config['input_group_id']
        self.kafka_input_topic = '{project_name}_in'.format(
            project_name=project_name)
        self.kafka_input_args = dict(
        ) if kafka_input_args is None else kafka_input_args
        self.kafka_consumer = KafkaConsumer(
            bootstrap_servers=self.kafka_input_server,
            group_id=self.kafka_input_group_id,
            consumer_timeout_ms=self.check_interval,
            value_deserializer=lambda v: json.loads(v.decode('utf-8')),
            **self.kafka_input_args)
        self.kafka_consumer.subscribe([self.kafka_input_topic])

        # kafka output
        self.kafka_output_server = config['output_server']
        self.kafka_output_topic = '{project_name}_out'.format(
            project_name=project_name)
        self.kafka_output_args = dict(
        ) if kafka_output_args is None else kafka_output_args
        self.kafka_producer = KafkaProducer(
            bootstrap_servers=self.kafka_output_server,
            value_serializer=lambda v: json.dumps(v).encode('utf-8'),
            **self.kafka_output_args)

        self.timeout_count = self.kafka_input_session_timeout / self.check_interval
        self.current_timeout_count = 0
Example No. 11
 def test_all_condition(self) -> None:
     etk = ETK()
     doc = etk.create_document(sample_input)
     default_doc_selector = DefaultDocumentSelector()
     res_true = default_doc_selector.select_document(
         doc,
         datasets=[".*unittest", ".*abc"],
         url_patterns=[".*unittest", ".*zxc"],
         website_patterns=[".*unittest", ".*abc"],
         json_paths=["$.website"],
         json_paths_regex=[".*unittest", ".*abc"])
     res_false = default_doc_selector.select_document(
         doc,
         datasets=[".*abc", ".*hhhh"],
         url_patterns=[".*ZXc", ".*hhhh"],
         website_patterns=[".*ABc", ".*hhhh"],
         json_paths=["$.website"],
         json_paths_regex=[".*hhhh"])
     self.assertEqual(True, res_true)
     self.assertEqual(False, res_false)
Example No. 12
 def __init__(
     self,
     propFile: str,
     labelSet: str,
     aliasSet: str,
     descriptionSet: str,
     n: int,
     destFp: TextIO = sys.stdout,
 ):
     self.propTypes = self.__setPropTypes(propFile)
     self.labelSet, self.aliasSet, self.descriptionSet = self.__setSets(
         labelSet, aliasSet, descriptionSet)
     # TODO handle standard output
     self.fp = destFp
     self.n = int(n)
     self.read = 0
     # serialize prefix
     kg_schema = KGSchema()
     kg_schema.add_schema("@prefix : <http://isi.edu/> .", "ttl")
     self.etk = ETK(kg_schema=kg_schema, modules=ETKModule)
     self.doc = self.__setDoc()
     self.__serialize_prefix()
Example No. 13
 def test_etk_crf_glossary_extraction(self):
     etk = ETK(use_spacy_tokenizer=False)
     s = time.time()
     city_extractor = GlossaryExtractor(
         ['los angeles', 'new york', 'angeles'],
         'city_extractor',
         etk.default_tokenizer,
         case_sensitive=False,
         ngrams=3)
     doc_json = {
         'text':
         'i live in los angeles. my hometown is Beijing. I love New York City.'
     }
     doc = Document(etk,
                    cdr_document=doc_json,
                    mime_type='json',
                    url='',
                    doc_id='1')
     t_segments = doc.select_segments("$.text")
     for t_segment in t_segments:
         extracted_cities = doc.extract(city_extractor, t_segment)
         for extracted_city in extracted_cities:
             self.assertTrue(extracted_city.value in
                             ['los angeles', 'New York', 'angeles'])
Example No. 14
def model_data() -> None:
    """
	This function generates triples for user defined properties for uploading them to wikidata
	:return:
	"""
    stream = open(Path.cwd().parent /
                  "Datasets/new-property-configuration.yaml",
                  'r',
                  encoding='utf8')
    yaml_data = yaml.safe_load(stream)
    # initialize
    kg_schema = KGSchema()
    kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
    etk = ETK(kg_schema=kg_schema, modules=ETKModule)
    doc = etk.create_document({}, doc_id="http://isi.edu/default-ns/projects")

    # bind prefixes
    doc.kg.bind('wikibase', 'http://wikiba.se/ontology#')
    doc.kg.bind('wd', 'http://www.wikidata.org/entity/')
    doc.kg.bind('wdt', 'http://www.wikidata.org/prop/direct/')
    doc.kg.bind('wdtn', 'http://www.wikidata.org/prop/direct-normalized/')
    doc.kg.bind('wdno', 'http://www.wikidata.org/prop/novalue/')
    doc.kg.bind('wds', 'http://www.wikidata.org/entity/statement/')
    doc.kg.bind('wdv', 'http://www.wikidata.org/value/')
    doc.kg.bind('wdref', 'http://www.wikidata.org/reference/')
    doc.kg.bind('p', 'http://www.wikidata.org/prop/')
    doc.kg.bind('pr', 'http://www.wikidata.org/prop/reference/')
    doc.kg.bind('prv', 'http://www.wikidata.org/prop/reference/value/')
    doc.kg.bind('prn',
                'http://www.wikidata.org/prop/reference/value-normalized/')
    doc.kg.bind('ps', 'http://www.wikidata.org/prop/statement/')
    doc.kg.bind('psv', 'http://www.wikidata.org/prop/statement/value/')
    doc.kg.bind('psn',
                'http://www.wikidata.org/prop/statement/value-normalized/')
    doc.kg.bind('pq', 'http://www.wikidata.org/prop/qualifier/')
    doc.kg.bind('pqv', 'http://www.wikidata.org/prop/qualifier/value/')
    doc.kg.bind('pqn',
                'http://www.wikidata.org/prop/qualifier/value-normalized/')
    doc.kg.bind('skos', 'http://www.w3.org/2004/02/skos/core#')
    doc.kg.bind('prov', 'http://www.w3.org/ns/prov#')
    doc.kg.bind('schema', 'http://schema.org/')
    sparql_endpoint = "https://query.wikidata.org/sparql"
    type_map = {'quantity': Datatype.QuantityValue, 'url': URLValue}
    property_type_cache = {}
    for k, v in yaml_data.items():
        p = WDProperty(k,
                       type_map[v['type']],
                       creator='http://www.isi.edu/t2wml')
        for lang, value in v['label'].items():
            for val in value:
                p.add_label(val, lang=lang)
        for lang, value in v['description'].items():
            for val in value:
                p.add_description(val, lang=lang)
        for pnode, items in v['statements'].items():
            for item in items:
                try:
                    property_type = property_type_cache[pnode]
                except KeyError:
                    property_type = get_property_type(pnode, sparql_endpoint)
                    property_type_cache[pnode] = property_type
                if property_type == "WikibaseItem":
                    value = Item(str(item['value']))
                elif property_type == "WikibaseProperty":
                    value = Property(item['value'])
                elif property_type == "String":
                    value = StringValue(item['value'])
                elif property_type == "Quantity":
                    value = QuantityValue(item['value'])
                elif property_type == "Time":
                    value = TimeValue(
                        str(item['value']), Item(item["calendar"]),
                        translate_precision_to_integer(item["precision"]),
                        item["time_zone"])
                elif property_type == "Url":
                    value = URLValue(item['value'])
                elif property_type == "Monolingualtext":
                    value = MonolingualText(item['value'], item["lang"])
                elif property_type == "ExternalId":
                    value = ExternalIdentifier(item['value'])
                elif property_type == "GlobeCoordinate":
                    value = GlobeCoordinate(item["latitude"],
                                            item["longitude"],
                                            item["precision"])

                p.add_statement(pnode, value)

        doc.kg.add_subject(p)

    with open(Path.cwd().parent / "new_properties/result.ttl", "w") as f:
        data = doc.kg.serialize('ttl')
        f.write(data)
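For orientation, here is an illustrative shape for the dictionary that yaml.safe_load returns in model_data, inferred from the key accesses in the loop above; every identifier in it is hypothetical.

yaml_data = {
    "P9999": {                # hypothetical property id
        "type": "quantity",   # keyed into type_map above
        "label": {"en": ["example property"]},
        "description": {"en": ["a description of the example property"]},
        "statements": {
            "P31": [{"value": "Q18616576"}]  # instance of: Wikidata property
        },
    }
}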
        "projects": [{
            "name":
            "etk",
            "description":
            "version 2 of etk, implemented by Runqi, Dongyu, Sylvia, Amandeep, Anika and others."
        }]
    }, {
        "projects": [{
            "name":
            "rltk",
            "description":
            "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students."
        }]
    }]

    etk = ETK(modules=ExampleETKModule)
    extractions = []

    def mapper(sample, _idx):
        doc = etk.create_document(sample)
        docs = etk.process_ems(doc)
        sys.stdout.flush()
        re = docs[0].value
        # print(re)
        return re

    def collect(extracted):
        extractions.append(extracted)

    pp = ParallelProcessor(2,
                           mapper=mapper,
Example No. 16
from etk.etk import ETK
from etk.etk_module import ETKModule  # assumed import path, matching the usage below
from etk.timeseries_processor import TimeseriesProcessor
import pprint


class TimeseriesETKModule(ETKModule):
    """
       Abstract class for extraction module
       """
    def __init__(self, etk):
        ETKModule.__init__(self, etk)

    def process_document(self, doc):
        pass


if __name__ == "__main__":
    etk = ETK(modules=TimeseriesETKModule)

    annotation = './resources/DIESEL_june_annotation.json'
    spreadsheet = './resources/DIESEL_june_2017.xlsx'
    timeseries_processor = TimeseriesProcessor(etk=etk,
                                               annotation=annotation,
                                               spreadsheet=spreadsheet)
    file_name = 'test_file_name'
    data_set = 'test_data_set'

    docs = [
        doc.cdr_document for doc in timeseries_processor.timeseries_extractor(
            file_name=file_name, data_set=data_set)
    ]
    pprint.pprint(docs)
Example No. 17
        for d, p in zip(descriptions, projects):
            names = doc.extract(self.name_extractor, d)
            p.store(names, "members")
        return list()


if __name__ == "__main__":

    # example for glossary extractor:

    sample_input = {
        "projects": [
            {
                "name": "etk",
                "description": "version 2 of etk, implemented by Runqi, Dongyu, Sylvia, Amandeep and others."
            },
            {
                "name": "rltk",
                "description": "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students."
            }
        ],
        "doc_id": 123
    }

    etk = ETK(modules=HelloWorldETKModule)
    doc = etk.create_document(sample_input)

    docs = etk.process_ems(doc)
    print(json.dumps(docs[0].value, indent=2))
Example No. 18
    def model_statement(self):
        # initialize KGSchema
        kg_schema = KGSchema()
        kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
        etk = ETK(kg_schema=kg_schema, modules=ETKModule)
        doc = etk.create_document({},
                                  doc_id='http://isi.edu/default-ns/projects')

        # bind prefix
        doc = create_custom_prefix(doc, custom_dict={self.ns: self.uri})

        # extract files
        self.extract_files()

        # model statement
        inputs = self.data['inputs']
        for k, v in inputs.items():
            if k != 'metadata':
                # construct wikifier instance
                if k == 'wikifier' and not v['existed']:
                    q = WDItem(v['qnode'],
                               namespace=self.ns,
                               creator=':datamart')
                    q.add_label('A wikifier file for ' +
                                inputs['dataset']['content']['filename'],
                                lang='en')
                    q.add_statement('P31', Item(
                        'SDQ1001',
                        namespace=self.ns))  # an instance of Wikifier
                    q.add_statement('P127',
                                    Item('SDQ1003',
                                         namespace=self.ns))  # belongs to
                    q.add_statement('SDP3003',
                                    StringValue(v['content']),
                                    namespace=self.ns)  # hasFileContent
                    q.add_statement('SDP3004',
                                    StringValue(v['hashcode']),
                                    namespace=self.ns)  # hashValue

                # construct mapping_file instance
                elif k == 'mappingFile' and not v['existed']:
                    q = WDItem(v['qnode'],
                               namespace=self.ns,
                               creator=':datamart')
                    q.add_label('A mapping file for ' +
                                inputs['dataset']['content']['filename'],
                                lang='en')
                    q.add_statement('P31', Item(
                        'SDQ1002',
                        namespace=self.ns))  # an instance of MappingFile
                    q.add_statement('P170', StringValue('T2WML'))
                    q.add_statement('P127', Item('SDQ1003', namespace=self.ns))
                    q.add_statement('SDP3003',
                                    StringValue(json.dumps(v['content'])),
                                    namespace=self.ns)
                    q.add_statement('SDP3004',
                                    StringValue(v['hashcode']),
                                    namespace=self.ns)

                # construct dataset instance
                elif k == 'dataset' and not v['existed']:
                    q = WDItem(v['qnode'],
                               namespace=self.ns,
                               creator=':datamart')
                    q.add_label(v['content']['title'], lang='en')
                    q.add_description(v['content']['description'], lang='en')
                    q.add_statement('P31',
                                    Item('Q1172284'))  # an instance of Dataset
                    q.add_statement('SDP3001',
                                    Item(inputs['wikifier']['qnode'],
                                         namespace=self.ns),
                                    namespace=self.ns)  # a wikifier file
                    q.add_statement('SDP3002',
                                    Item(inputs['mappingFile']['qnode'],
                                         namespace=self.ns),
                                    namespace=self.ns)  # a mapping file
                    q.add_statement('P1476', StringValue(
                        v['content']['title']))  # title
                    q.add_statement(
                        'P921',
                        StringValue(v['content']['description']))  # described
                    q.add_statement('P127',
                                    Item('SDQ1003',
                                         namespace=self.ns))  # belongs to
                    q.add_statement('SDP2004',
                                    StringValue(', '.join(
                                        v['content']['keywords'])),
                                    namespace=self.ns)  # keywords
                    q.add_statement('SDP3004',
                                    StringValue(v['hashcode']),
                                    namespace=self.ns)

                    if self.data['storeColumnValue']:
                        for data in v['content']['variable_measured']:
                            statement = q.add_statement(
                                'SDP2005',
                                StringValue(data['column_name']),
                                namespace=self.ns)  # variable measured
                            statement.add_qualifier(
                                'SDP2006',
                                StringValue(data['values_of_a_column']),
                                namespace=self.ns)  # the values of a column
                            statement.add_qualifier(
                                'SDP2007',
                                Item(data['data_structure_type']),
                                namespace=self.ns)  # data structure type
                            statement.add_qualifier(
                                'SDP2008',
                                URLValue(data['semantic_type_identifier']),
                                namespace=self.ns)  # semantic type
                            statement.add_qualifier(
                                'P1545',
                                QuantityValue(
                                    data['column_index'],
                                    namespace=self.ns))  # column index

                doc.kg.add_subject(q)

        return doc
Example No. 19
    def model_schema(self):
        # read data
        data = self.read_data(self.data['schema'])

        # initialize KGSchema
        custom_dict, ns_dict = {}, {'wd': 'http://www.wikidata.org/entity/'}
        for each in data['prefix']:
            for k, v in each.items():
                custom_dict[k] = v
                if k != 'wd':
                    ns_dict[k] = v + '/entity'
        kg_schema = KGSchema()
        kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
        etk = ETK(kg_schema=kg_schema, modules=ETKModule)
        doc = etk.create_document({},
                                  doc_id='http://isi.edu/default-ns/projects')

        # bind prefix
        doc = create_custom_prefix(doc, custom_dict)

        type_map = {
            'quantity': Datatype.QuantityValue,
            'url': URLValue,
            'item': Datatype.Item,
            'time': Datatype.TimeValue,
            'string': Datatype.StringValue,
            'text': Datatype.MonolingualText
        }

        # model schema
        for k, v in data.items():
            if ':' in k:
                k = k.split(':')
                if 'Q' in k[1]:
                    p = WDItem(k[1], namespace=k[0], creator=':datamart')
                elif 'P' in k[1]:
                    p = WDProperty(k[1],
                                   type_map[v['type']],
                                   namespace=k[0],
                                   creator=':datamart')
                else:
                    raise Exception('There is no P/Q information.')

                for lang, value in v['description'].items():
                    for val in value:
                        p.add_description(val, lang=lang)

                for lang, value in v['label'].items():
                    for val in value:
                        p.add_label(val, lang=lang)

                for node, value in v['statements'].items():
                    ns = node.split(':')[0] if ':' in node else 'wd'
                    for val in value:
                        prop_type = self.get_property_type(node, ns_dict[ns])
                        if prop_type == 'WikibaseItem':
                            v = Item(str(val['value']))
                        elif prop_type == 'WikibaseProperty':
                            v = Property(val['value'])
                        elif prop_type == 'String':
                            v = StringValue(val['value'])
                        elif prop_type == 'Quantity':
                            v = QuantityValue(val['value'])
                        elif prop_type == 'Url':
                            v = URLValue(val['value'])
                        elif prop_type == 'Monolingualtext':
                            v = MonolingualText(val['value'], val['lang'])
                        p.add_statement(node, v)
                doc.kg.add_subject(p)

        return doc
Example No. 20
def generate_triples(user_id: str,
                     resolved_excel: list,
                     sparql_endpoint: str,
                     filetype: str = 'ttl',
                     created_by: str = 't2wml') -> str:
    """
    This function uses ETK to generate the RDF triples
    :param user_id:
    :param resolved_excel:
    :param sparql_endpoint:
    :param filetype:
    :param created_by:
    :return:
    """
    # initialize
    kg_schema = KGSchema()
    kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
    etk = ETK(kg_schema=kg_schema, modules=ETKModule)
    doc = etk.create_document({}, doc_id="http://isi.edu/default-ns/projects")
    property_type_map = property_type_dict

    # bind prefixes
    doc.kg.bind('wikibase', 'http://wikiba.se/ontology#')
    doc.kg.bind('wd', 'http://www.wikidata.org/entity/')
    doc.kg.bind('wdt', 'http://www.wikidata.org/prop/direct/')
    doc.kg.bind('wdtn', 'http://www.wikidata.org/prop/direct-normalized/')
    doc.kg.bind('wdno', 'http://www.wikidata.org/prop/novalue/')
    doc.kg.bind('wds', 'http://www.wikidata.org/entity/statement/')
    doc.kg.bind('wdv', 'http://www.wikidata.org/value/')
    doc.kg.bind('wdref', 'http://www.wikidata.org/reference/')
    doc.kg.bind('p', 'http://www.wikidata.org/prop/')
    doc.kg.bind('pr', 'http://www.wikidata.org/prop/reference/')
    doc.kg.bind('prv', 'http://www.wikidata.org/prop/reference/value/')
    doc.kg.bind('prn',
                'http://www.wikidata.org/prop/reference/value-normalized/')
    doc.kg.bind('ps', 'http://www.wikidata.org/prop/statement/')
    doc.kg.bind('psv', 'http://www.wikidata.org/prop/statement/value/')
    doc.kg.bind('psn',
                'http://www.wikidata.org/prop/statement/value-normalized/')
    doc.kg.bind('pq', 'http://www.wikidata.org/prop/qualifier/')
    doc.kg.bind('pqv', 'http://www.wikidata.org/prop/qualifier/value/')
    doc.kg.bind('pqn',
                'http://www.wikidata.org/prop/qualifier/value-normalized/')
    doc.kg.bind('skos', 'http://www.w3.org/2004/02/skos/core#')
    doc.kg.bind('prov', 'http://www.w3.org/ns/prov#')
    doc.kg.bind('schema', 'http://schema.org/')

    # property_type_cache = {}
    is_error = False
    for i in resolved_excel:
        _item = i["statement"]["item"]
        if _item is not None:
            item = WDItem(_item,
                          creator='http://www.isi.edu/{}'.format(created_by))
            try:
                property_type = property_type_map[i["statement"]["property"]]
            except KeyError:
                property_type = get_property_type(i["statement"]["property"],
                                                  sparql_endpoint)
                if property_type != "Property Not Found" and i["statement"][
                        "property"] not in property_type_map:
                    property_type_map[i["statement"]
                                      ["property"]] = property_type
            if property_type == "WikibaseItem":
                value = Item(str(i["statement"]["value"]))
            elif property_type == "WikibaseProperty":
                value = Property(i["statement"]["value"])
            elif property_type == "String":
                value = StringValue(i["statement"]["value"])
            elif property_type == "Quantity":
                _value = i["statement"]["value"]
                _value = str(_value).replace(',', '')
                value = QuantityValue(_value)
            elif property_type == "Time":
                value = TimeValue(
                    str(i["statement"]["value"]),
                    Item(i["statement"]["calendar"]),
                    translate_precision_to_integer(
                        i["statement"]["precision"]),
                    i["statement"]["time_zone"])
            elif property_type == "Url":
                value = URLValue(i["statement"]["value"])
            elif property_type == "Monolingualtext":
                value = MonolingualText(i["statement"]["value"],
                                        i["statement"]["lang"])
            elif property_type == "ExternalId":
                value = ExternalIdentifier(i["statement"]["value"])
            elif property_type == "GlobeCoordinate":
                value = GlobeCoordinate(i["statement"]["latitude"],
                                        i["statement"]["longitude"],
                                        i["statement"]["precision"])
            elif property_type == "Property Not Found":
                is_error = True
                break
            s = item.add_statement(i["statement"]["property"], value)
            doc.kg.add_subject(item)

            if "qualifier" in i["statement"]:
                for j in i["statement"]["qualifier"]:
                    try:
                        property_type = property_type_map[j["property"]]

                    except KeyError:
                        property_type = get_property_type(
                            j["property"], sparql_endpoint)
                        if property_type != "Property Not Found" and i[
                                "statement"][
                                    "property"] not in property_type_map:
                            property_type_map[i["statement"]
                                              ["property"]] = property_type
                    if property_type == "WikibaseItem":
                        value = Item(str(j["value"]))
                    elif property_type == "WikibaseProperty":
                        value = Property(j["value"])
                    elif property_type == "String":
                        value = StringValue(j["value"])
                    elif property_type == "Quantity":
                        value = QuantityValue(j["value"])
                    elif property_type == "Time":
                        value = TimeValue(str(j["value"]), Item(j["calendar"]),
                                          j["precision"], j["time_zone"])
                    elif property_type == "Url":
                        value = URLValue(j["value"])
                    elif property_type == "Monolingualtext":
                        value = MonolingualText(j["value"], j["lang"])
                    elif property_type == "ExternalId":
                        value = ExternalIdentifier(j["value"])
                    elif property_type == "GlobeCoordinate":
                        value = GlobeCoordinate(j["latitude"], j["longitude"],
                                                j["precision"])
                    elif property_type == "Property Not Found":
                        is_error = True
                    if value is None:
                        continue
                    else:
                        s.add_qualifier(j["property"], value)
            doc.kg.add_subject(s)
    if not is_error:
        data = doc.kg.serialize(filetype)
    else:
        # data = "Property Not Found"
        raise Exception('data exception while generating triples')

    return data
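A hypothetical invocation of generate_triples, assuming the module-level property_type_dict and get_property_type helpers referenced above are in scope; the resolved_excel shape is inferred from the statement["item"/"property"/"value"] accesses, and P1082 (population) is a Quantity property, so the value goes through the QuantityValue branch.

resolved_excel = [{
    "statement": {
        "item": "Q30",           # subject item (hypothetical row)
        "property": "P1082",     # population, a Quantity property
        "value": "328,239,523",  # commas are stripped in the Quantity branch
    }
}]
ttl = generate_triples("user-1",
                       resolved_excel,
                       "https://query.wikidata.org/sparql")
print(ttl)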
Example No. 21
    """
    def __init__(self, etk):
        ETKModule.__init__(self, etk)
        self.inferlink_extractor = InferlinkExtractor(
            InferlinkRuleSet(
                InferlinkRuleSet.load_rules_file(
                    '../html_basic/sample_inferlink_rules.json')))

    def process_document(self, doc):
        """
        Add your code for processing the document
        """

        raw = doc.select_segments("$.raw_content")[0]
        extractions = doc.extract(self.inferlink_extractor, raw)
        doc.store(extractions, "inferlink_extraction")
        return list()


if __name__ == "__main__":
    sample_html = json.load(codecs.open('../html_basic/sample_html.json',
                                        'r'))  # read sample file from disk

    etk = ETK(modules=InferlinkETKModule)
    doc = etk.create_document(sample_html,
                              mime_type="text/html",
                              url="http://ex.com/123")

    docs = etk.process_ems(doc)

    print(json.dumps(docs[0].value, indent=2))
Example No. 22
def generate_triples(user_id: str, resolved_excel: list, sparql_endpoint: str, filetype: str = 'ttl') -> str:
	"""
	This function uses ETK to generate the RDF triples
	:param user_id:
	:param resolved_excel:
	:param sparql_endpoint:
	:param filetype:
	:return:
	"""
	# initialize
	kg_schema = KGSchema()
	kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
	etk = ETK(kg_schema=kg_schema, modules=ETKModule)
	doc = etk.create_document({}, doc_id="http://isi.edu/default-ns/projects")

	# bind prefixes
	doc.kg.bind('wikibase', 'http://wikiba.se/ontology#')
	doc.kg.bind('wd', 'http://www.wikidata.org/entity/')
	doc.kg.bind('wdt', 'http://www.wikidata.org/prop/direct/')
	doc.kg.bind('wdtn', 'http://www.wikidata.org/prop/direct-normalized/')
	doc.kg.bind('wdno', 'http://www.wikidata.org/prop/novalue/')
	doc.kg.bind('wds', 'http://www.wikidata.org/entity/statement/')
	doc.kg.bind('wdv', 'http://www.wikidata.org/value/')
	doc.kg.bind('wdref', 'http://www.wikidata.org/reference/')
	doc.kg.bind('p', 'http://www.wikidata.org/prop/')
	doc.kg.bind('pr', 'http://www.wikidata.org/prop/reference/')
	doc.kg.bind('prv', 'http://www.wikidata.org/prop/reference/value/')
	doc.kg.bind('prn', 'http://www.wikidata.org/prop/reference/value-normalized/')
	doc.kg.bind('ps', 'http://www.wikidata.org/prop/statement/')
	doc.kg.bind('psv', 'http://www.wikidata.org/prop/statement/value/')
	doc.kg.bind('psn', 'http://www.wikidata.org/prop/statement/value-normalized/')
	doc.kg.bind('pq', 'http://www.wikidata.org/prop/qualifier/')
	doc.kg.bind('pqv', 'http://www.wikidata.org/prop/qualifier/value/')
	doc.kg.bind('pqn', 'http://www.wikidata.org/prop/qualifier/value-normalized/')
	doc.kg.bind('skos', 'http://www.w3.org/2004/02/skos/core#')
	doc.kg.bind('prov', 'http://www.w3.org/ns/prov#')
	doc.kg.bind('schema', 'http://schema.org/')

	# property_type_cache = {}
	is_error = False
	for i in resolved_excel:
		item = WDItem(i["statement"]["item"],  creator='http://www.isi.edu/t2wml')
		try:
			property_type = property_type_map[i["statement"]["property"]]
		except KeyError:
			property_type = get_property_type(i["statement"]["property"], sparql_endpoint)
			property_type_map[i["statement"]["property"]] = property_type
		if property_type == "WikibaseItem":
			value = Item(str(i["statement"]["value"]))
		elif property_type == "WikibaseProperty":
			value = Property(i["statement"]["value"])
		elif property_type == "String":
			value = StringValue(i["statement"]["value"])
		elif property_type == "Quantity":
			value = QuantityValue(i["statement"]["value"])
		elif property_type == "Time":
			value = TimeValue(str(i["statement"]["value"]), Item(i["statement"]["calendar"]), translate_precision_to_integer(i["statement"]["precision"]), i["statement"]["time_zone"])
		elif property_type == "Url":
			value = URLValue(i["statement"]["value"])
		elif property_type == "Monolingualtext":
			value = MonolingualText(i["statement"]["value"], i["statement"]["lang"])
		elif property_type == "ExternalId":
			value = ExternalIdentifier(i["statement"]["value"])
		elif property_type == "GlobeCoordinate":
			value = GlobeCoordinate(i["statement"]["latitude"], i["statement"]["longitude"], i["statement"]["precision"])
		elif property_type == "Property Not Found":
			is_error = True
			break
		s = item.add_statement(i["statement"]["property"], value)
		doc.kg.add_subject(item)

		if "qualifier" in i["statement"]:
			for j in i["statement"]["qualifier"]:
				try:
					property_type = property_type_map[j["property"]]
				except KeyError:
					property_type = get_property_type(j["property"], sparql_endpoint)
					property_type_map[j["property"]] = property_type
				if property_type == "WikibaseItem":
					value = Item(str(j["value"]))
				elif property_type == "WikibaseProperty":
					value = Property(j["value"])
				elif property_type == "String":
					value = StringValue(j["value"])
				elif property_type == "Quantity":
					value = QuantityValue(j["value"])
				elif property_type == "Time":
					value = TimeValue(str(j["value"]), Item(j["calendar"]), j["precision"], j["time_zone"])
				elif property_type == "Url":
					value = URLValue(j["value"])
				elif property_type == "Monolingualtext":
					value = MonolingualText(j["value"], j["lang"])
				elif property_type == "ExternalId":
					value = ExternalIdentifier(j["value"])
				elif property_type == "GlobeCoordinate":
					value = GlobeCoordinate(j["latitude"], j["longitude"], j["precision"])
				elif property_type == "Property Not Found":
					is_error = True
				s.add_qualifier(j["property"], value)
		doc.kg.add_subject(s)
	if not is_error:
		data = doc.kg.serialize(filetype)
	else:
		data = "Property Not Found"
	# os.makedirs(Path.cwd() / "new_properties", exist_ok=True)
	# results_file_name = user_id + "_results.ttl"
	# changes_file_name = user_id + "_changes.tsv"

	# with open(Path(app.config['downloads']) / results_file_name, "w") as fp:
	# 	fp.write(data)
	# with open(Path(app.config['downloads']) / changes_file_name, "w") as fp:
	# 	serialize_change_record(fp)
	return data
Example No. 23
 #                     "groundpig, whistlepig, whistler, thickwood badger, "
 #                     "Canada marmot, monax, moonack, weenusk, red monk and, "
 #                     "among French Canadians in eastern Canada, siffleur"
 #         },
 #         {
 #             "name": "Test3 - Social Media",
 #             "description": "Parser stress test for tweets",
 #             "text": "Slides onto twitter..... \n"
 #                     ".......slippery floor....... \n"
 #                     "............slides out the other side..."
 #         }
 #     ],
 #     "doc_id": 42069
 # }
 #
 etk = ETK(modules=SentenceSplittingETKModule)
 # doc = etk.create_document(toy_doc)
 #
 # split_doc = etk.process_ems(doc)
 #
 # print(json.dumps(split_doc[0].value, indent=2))
 parser = OptionParser(conflict_handler="resolve")
 parser.add_option("-i",
                   "--input_file",
                   action="store",
                   type="string",
                   dest="input_file")
 parser.add_option("-o",
                   "--output_file",
                   action="store",
                   type="string",
Example No. 24
        projects = doc.select_segments("projects[*]")

        for d, p in zip(descriptions, projects):
            names = doc.extract(self.rule_extractor, d)
            p.store(names, "members")
        return list()


if __name__ == "__main__":

    sample_input = {
        "projects": [{
            "name":
            "etk",
            "description":
            "version  2 of etk, implemented by Runqi12 Shao, Dongyu Li, Sylvia lin, Amandeep and others."
        }, {
            "name":
            "rltk",
            "description":
            "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students."
        }]
    }

    etk = ETK(modules=RuleETKModule)
    doc = etk.create_document(sample_input)

    docs = etk.process_ems(doc)

    print(json.dumps(docs[0].value, indent=2))
Example No. 25
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
@prefix schema: <http://schema.org/> .
:Event a owl:Class ; .
:Entity a owl:Class ; .
:Organization a owl:Class ; .
:MOVEMENT_TRANSPORT a owl:Class ; .
:GeopoliticalEntity a owl:Class ; .
skos:prefLabel a owl:DatatypeProperty ; 
    schema:domainIncludes :Entity, :Event ;
    rdfs:range xsd:string ; .
:conflict_attack_place a owl:ObjectProperty ;
    schema:domainIncludes :Entity, :Event ;
    schema:rangeIncludes :GeopoliticalEntity ; .
    '''

    ontology = Ontology(ontology_content,
                        validation=False,
                        include_undefined_class=True,
                        quiet=True)
    kg_schema = KGSchema(ontology.merge_with_master_config(dict()))
    etk = ETK(modules=ExampleETKModule, kg_schema=kg_schema, ontology=ontology)
    input_data = {'doc_id': '1', 'data': json.loads(sample_input)}
    doc = etk.create_document(input_data)
    docs = etk.process_ems(doc)
    kgs = [json.dumps(doc.kg.value) for doc in docs[1:]]
    with open('output.jsonl', 'w') as f:
        f.write('\n'.join(kgs))
    with open('output.nt', 'w') as f:
        f.writelines(map(rdf_generation, kgs))
Example No. 26
import unittest, json
from etk.timeseries_processor import TimeseriesProcessor
from etk.etk import ETK
from etk.knowledge_graph import KGSchema

kg_schema = KGSchema(
    json.load(open('etk/unit_tests/ground_truth/test_config.json')))

etk = ETK(kg_schema=kg_schema)

# python -m unittest etk.unit_tests.test_timeseries_processor to run all unittests


class TestTimeseriesProcessor(unittest.TestCase):
    def test_excel_file(self) -> None:
        annotation = 'etk/timeseries/DIESEL_june_annotation.json'
        spreadsheet = 'etk/unit_tests/ground_truth/DIESEL_june_2017.xlsx'

        timeseriesProcessor = TimeseriesProcessor(etk=etk,
                                                  annotation=annotation,
                                                  spreadsheet=spreadsheet)
        docs = [
            doc.cdr_document
            for doc in timeseriesProcessor.timeseries_extractor()
        ]
        selected_docs = docs[1]
        expected_metadata = {
            "name": "AVERAGE DIESEL (AUTOMATIVE GAS OIL) PRICES/ Litre NGN",
            "granularity": "monthly",
            "provenance": {
                "filename": "DIESEL_june_2017.xlsx",
Example No. 27
        # Add a title to the actor document
        doc.kg.add_value("title", json_path="$.Side")

        # Return an empty list because we didn't create new documents
        return []


# The main is for testing, and is not used in the DIG pipeline
if __name__ == "__main__":

    # Tell ETK the schema of the fields in the KG; the DIG master_config can be used as the schema.
    kg_schema = KGSchema(json.load(open('master_config.json')))

    # Instantiate ETK, with the two processing modules and the schema.
    etk = ETK(modules=[UCDPModule, UCDPActorModule], kg_schema=kg_schema)

    # Create a CSV processor to create documents for the relevant rows in the Excel sheet
    cp = CsvProcessor(etk=etk, heading_row=1)

    with open("ucdp.jl", "w") as f:
        # Iterate over all the rows in the spreadsheet
        for doc in cp.tabular_extractor(filename="ucdp_sample.xls", dataset='ucdp'):
            # Each row produces a document, which we send to ETK.
            # Note that each invocation of process_ems will also process any new documents created while
            # processing each doc
            etk.process_and_frame(doc)
            f.write(json.dumps(doc.cdr_document) + "\n")
            # for result in etk.process_ems(doc):
            #     # print(result.cdr_document["knowledge_graph"])
            #     f.write(json.dumps(result.cdr_document) + "\n")
Example No. 28
            "news_story": {
                "type": "string"
            },
            "similarity": {
                "type": "number"
            },
            "matched_sentence": {
                "type": "string"
            },
            "date": {
                "type": "string"
            }
        }
    }
    kg_schema = KGSchema(master_config)
    etk = ETK(kg_schema, ["./"])

    # read the news
    news_file = open(
        '/Users/amandeep/Github/etk/examples/ranking_pipeline/resources/new_2018-04-03-first-10000.jl'
    )
    # news_file = open('/Users/amandeep/Github/etk/examples/ranking_pipeline/resources/news_stories_3.jl')
    news_stories = [
        etk.create_document(json.loads(line),
                            url=json.loads(line)['tld'],
                            doc_id=json.loads(line)['doc_id'])
        for line in news_file
    ]
    results = list()
    for news_story in news_stories:
        results.extend(etk.process_ems(news_story))
Example No. 29
import os, sys, json, codecs
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
from etk.etk import ETK
from etk.extractors.html_content_extractor import HTMLContentExtractor, Strategy
from etk.extractors.html_metadata_extractor import HTMLMetadataExtractor
from etk.extractors.inferlink_extractor import InferlinkExtractor, InferlinkRuleSet

sample_html = json.load(codecs.open('sample_html.json',
                                    'r'))  # read sample file from disk

etk = ETK()
doc = etk.create_document(sample_html,
                          mime_type="text/html",
                          url="http://ex.com/123")

metadata_extractor = HTMLMetadataExtractor()
content_extractor = HTMLContentExtractor()
landmark_extractor = InferlinkExtractor(
    InferlinkRuleSet(
        InferlinkRuleSet.load_rules_file('sample_inferlink_rules.json')))

root = doc.select_segments("$")[0]
raw = doc.select_segments("$.raw_content")[0]

# root.store_extractions(doc.invoke_extractor(metadata_extractor, extract_title=True), "title")
# root.store_extractions(doc.invoke_extractor(metadata_extractor, extract_meta=True), "metadata")
root.store_extractions(
    doc.invoke_extractor(content_extractor, raw, strategy=Strategy.ALL_TEXT),
    "etk2_text")
root.store_extractions(
Example No. 30
import os, sys, json
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
from etk.etk import ETK
from etk.knowledge_graph import KGSchema
from examples.config_to_em.em_base_generator import EmBaseGenerator

ebg = EmBaseGenerator('template.tpl')
ebg.generate_em_base('master_config.json', 'ems/em_base.py')

kg_schema = KGSchema(json.load(open("master_config.json", "r")))

etk = ETK(kg_schema, ["./ems"])

doc = etk.create_document(json.load(open('sample_html.jl', 'r')))

docs = etk.process_ems(doc)

print(json.dumps(docs[0].value, indent=2))