def setUp(self):
    ontology_content = '''
        @prefix : <http://dig.isi.edu/ontologies/dig/> .
        @prefix owl: <http://www.w3.org/2002/07/owl#> .
        @prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
        @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
        @prefix schema: <http://schema.org/> .
        @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

        :Person a owl:Class ;
            rdfs:subClassOf :Actor, :Biological_Object ;
            :common_properties :label, :title, :religion ; .
        :has_name a owl:DatatypeProperty ;
            schema:domainIncludes :Person ;
            schema:rangeIncludes xsd:string ; .
        :has_child a owl:ObjectProperty ;
            schema:domainIncludes :Person ;
            schema:rangeIncludes :Person ; .
    '''
    ontology = Ontology(ontology_content, validation=False,
                        include_undefined_class=True, quiet=True)
    kg_schema = KGSchema(ontology.merge_with_master_config(dict()))
    etk = ETK(kg_schema=kg_schema, ontology=ontology, generate_json_ld=True)
    etk2 = ETK(kg_schema=kg_schema, ontology=ontology, generate_json_ld=False)
    self.doc = etk.create_document(dict(), doc_id='http://xxx/1',
                                   type_=[DIG.Person.toPython()])
    self.doc2 = etk2.create_document(dict(), doc_id='http://xxx/2',
                                     type_=[DIG.Person.toPython()])
def setUp(self):
    sample_doc = {
        "projects": [{
            "name": "etk",
            "description": "version 2 of etk, implemented by Runqi12 Shao, Dongyu Li, Sylvia lin, Amandeep and others.",
            "members": ["dongyu", "amandeep", "sylvia", "Runqi12"],
            "date": "2007-12-05",
            "place": "columbus:georgia:united states:-84.98771:32.46098",
            "s": "segment_test_1"
        }, {
            "name": "rltk",
            "description": "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students.",
            "members": ["mayank", "yixiang"],
            "date": ["2007-12-05T23:19:00"],
            "cost": -3213.32,
            "s": "segment_test_2"
        }]
    }
    kg_schema = KGSchema(
        json.load(open('etk/unit_tests/ground_truth/test_config.json')))
    etk = ETK(kg_schema)
    self.doc = etk.create_document(sample_doc)
def setUp(self):
    ontology_content = '''
        @prefix : <http://dig.isi.edu/ontologies/dig/> .
        @prefix dig: <http://dig.isi.edu/ontologies/dig/> .
        @prefix owl: <http://www.w3.org/2002/07/owl#> .
        @prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
        @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
        @prefix schema: <http://schema.org/> .
        @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

        :Person a owl:Class ;
            rdfs:subClassOf :Actor, :Biological_Object ;
            :common_properties :label, :title, :religion ; .
        :has_name a owl:DatatypeProperty ;
            schema:domainIncludes :Person ;
            schema:rangeIncludes xsd:string ; .
        :has_child a owl:ObjectProperty ;
            schema:domainIncludes :Person ;
            schema:rangeIncludes :Person ; .
    '''
    kg_schema = KGSchema()
    kg_schema.add_schema(ontology_content, 'ttl')
    etk = ETK(kg_schema=kg_schema)
    self.doc = etk.create_document(dict(), doc_id='http://xxx/1',
                                   type_=[URI('dig:Person')])
def test_EmailExtractor(self) -> None:
    kg_schema = KGSchema(
        json.load(open('etk/unit_tests/ground_truth/test_config.json')))
    etk = ETK(kg_schema=kg_schema, use_spacy_tokenizer=True)

    text = "[email protected] [email protected] " \
           "[email protected] [email protected] E-mail:[email protected] [email protected]"

    email_extractor = EmailExtractor(nlp=etk.default_nlp,
                                     tokenizer=etk.default_tokenizer,
                                     extractor_name="email_extractor")

    extractions = email_extractor.extract(text)
    extracted = []
    for i in extractions:
        extracted_value = {
            "value": i.value,
            "start_char": i.provenance["start_char"],
            "end_char": i.provenance["end_char"],
            "value_from_text": text[i.provenance["start_char"]:i.provenance["end_char"]]
        }
        extracted.append(extracted_value)
        self.assertEqual(extracted_value["value"],
                         extracted_value["value_from_text"])

    expected = [{
        'value': '*****@*****.**',
        'start_char': 97,
        'end_char': 122,
        'value_from_text': '*****@*****.**'
    }, {
        'value': '*****@*****.**',
        'start_char': 0,
        'end_char': 16,
        'value_from_text': '*****@*****.**'
    }, {
        'value': '*****@*****.**',
        'start_char': 77,
        'end_char': 96,
        'value_from_text': '*****@*****.**'
    }, {
        'value': '*****@*****.**',
        'start_char': 17,
        'end_char': 40,
        'value_from_text': '*****@*****.**'
    }, {
        'value': '*****@*****.**',
        'start_char': 51,
        'end_char': 68,
        'value_from_text': '*****@*****.**'
    }]

    self.assertEqual(sorted(expected, key=lambda x: x["start_char"]),
                     sorted(extracted, key=lambda x: x["start_char"]))
def test_segment(self) -> None:
    etk = ETK()
    doc = etk.create_document(sample_input)
    descriptions = doc.select_segments("projects[*].description")
    description_value = [i.value for i in descriptions]
    expected = [
        'version 2 of etk, implemented by Runqi, Dongyu, Sylvia, Amandeep and others.',
        'record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students.'
    ]
    self.assertEqual(description_value, expected)
def test_website_patterns_condition(self) -> None:
    etk = ETK()
    doc = etk.create_document(sample_input)
    default_doc_selector = DefaultDocumentSelector()
    res_true = default_doc_selector.select_document(
        doc, website_patterns=[".*unittest", ".*abc"])
    res_false = default_doc_selector.select_document(
        doc, website_patterns=[".*ABc", ".*hhhh"])
    self.assertEqual(True, res_true)
    self.assertEqual(False, res_false)
def test_json_paths_and_json_paths_regex(self) -> None:
    etk = ETK()
    doc = etk.create_document(sample_input)
    default_doc_selector = DefaultDocumentSelector()
    res_true = default_doc_selector.select_document(
        doc,
        json_paths=["$.website"],
        json_paths_regex=[".*unittest", ".*abc"])
    res_false = default_doc_selector.select_document(
        doc,
        json_paths=["$.website"],
        json_paths_regex=[".*hhhh"])
    self.assertEqual(True, res_true)
    self.assertEqual(False, res_false)
def test_segment(self) -> None:
    kg_schema = KGSchema(
        json.load(open('etk/unit_tests/ground_truth/test_config.json')))
    etk = ETK(kg_schema=kg_schema)
    doc = etk.create_document(sample_input)
    descriptions = doc.select_segments("projects[*].description")
    description_value = [i.value for i in descriptions]
    expected = [
        'version 2 of etk, implemented by Runqi, Dongyu, Sylvia, Amandeep and others.',
        'record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students.'
    ]
    self.assertEqual(description_value, expected)
def reset_etk_doc(self, doc_id: str = "http://isi.edu/default-ns/projects"):
    """
    Reset the doc object and return it. Called at initialization and after
    outputting triples.
    """
    kg_schema = KGSchema()
    kg_schema.add_schema("@prefix : <http://isi.edu/> .", "ttl")
    self.etk = ETK(kg_schema=kg_schema, modules=ETKModule)
    self.doc = self.etk.create_document({}, doc_id=doc_id)
    for k, v in wiki_namespaces.items():
        if k in self.prefix_dict:
            self.doc.kg.bind(k, self.prefix_dict[k])
        else:
            self.doc.kg.bind(k, v)
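# A minimal sketch of the cycle the method above supports, assembled only from
# calls that appear in these snippets (assumption: `wiki_namespaces` maps
# prefixes such as 'wd' and 'wdt' to their URIs): build a fresh document, bind
# the namespaces, add subjects, serialize, then reset before the next batch.
from etk.etk import ETK
from etk.etk_module import ETKModule
from etk.knowledge_graph import KGSchema

kg_schema = KGSchema()
kg_schema.add_schema("@prefix : <http://isi.edu/> .", "ttl")
etk = ETK(kg_schema=kg_schema, modules=ETKModule)
doc = etk.create_document({}, doc_id="http://isi.edu/default-ns/projects")
doc.kg.bind('wd', 'http://www.wikidata.org/entity/')  # one bind per namespace
# ... doc.kg.add_subject(...) for each item or property to emit ...
triples = doc.kg.serialize('ttl')                     # flush the accumulated triples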
def __init__(self, master_config, em_paths, logger, worker_id,
             project_name, kafka_input_args=None, kafka_output_args=None):
    self.logger = logger
    self.worker_id = worker_id
    self.check_interval = 1000
    self.exit_sign = False

    try:
        kg_schema = KGSchema(master_config)
        self.etk_ins = ETK(kg_schema, em_paths, logger=logger)
    except Exception as e:
        logger.exception('ETK initialization failed')
        raise e

    # kafka input
    self.kafka_input_server = config['input_server']
    self.kafka_input_session_timeout = config['input_session_timeout']
    self.kafka_input_group_id = config['input_group_id']
    self.kafka_input_topic = '{project_name}_in'.format(project_name=project_name)
    self.kafka_input_args = dict() if kafka_input_args is None else kafka_input_args
    self.kafka_consumer = KafkaConsumer(
        bootstrap_servers=self.kafka_input_server,
        group_id=self.kafka_input_group_id,
        consumer_timeout_ms=self.check_interval,
        value_deserializer=lambda v: json.loads(v.decode('utf-8')),
        **self.kafka_input_args)
    self.kafka_consumer.subscribe([self.kafka_input_topic])

    # kafka output
    self.kafka_output_server = config['output_server']
    self.kafka_output_topic = '{project_name}_out'.format(project_name=project_name)
    self.kafka_output_args = dict() if kafka_output_args is None else kafka_output_args
    self.kafka_producer = KafkaProducer(
        bootstrap_servers=self.kafka_output_server,
        value_serializer=lambda v: json.dumps(v).encode('utf-8'),
        **self.kafka_output_args)

    self.timeout_count = self.kafka_input_session_timeout / self.check_interval
    self.current_timeout_count = 0
def test_all_condition(self) -> None:
    etk = ETK()
    doc = etk.create_document(sample_input)
    default_doc_selector = DefaultDocumentSelector()
    res_true = default_doc_selector.select_document(
        doc,
        datasets=[".*unittest", ".*abc"],
        url_patterns=[".*unittest", ".*zxc"],
        website_patterns=[".*unittest", ".*abc"],
        json_paths=["$.website"],
        json_paths_regex=[".*unittest", ".*abc"])
    res_false = default_doc_selector.select_document(
        doc,
        datasets=[".*abc", ".*hhhh"],
        url_patterns=[".*ZXc", ".*hhhh"],
        website_patterns=[".*ABc", ".*hhhh"],
        json_paths=["$.website"],
        json_paths_regex=[".*hhhh"])
    self.assertEqual(True, res_true)
    self.assertEqual(False, res_false)
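# The selector and segment tests above rely on a `sample_input` fixture that is
# not shown in these snippets. A hedged sketch of what it could look like, with
# field values chosen only so that the ".*unittest" patterns above would match;
# the concrete "dataset", "url", and "website" values are assumptions, not the
# real ground-truth fixture.
sample_input = {
    "dataset": "unittest_dataset",
    "url": "http://www.unittest.org/doc/1",
    "website": "www.unittest.org",
    "projects": [{
        "name": "etk",
        "description": "version 2 of etk, implemented by Runqi, Dongyu, Sylvia, Amandeep and others."
    }, {
        "name": "rltk",
        "description": "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students."
    }]
}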
def __init__(
    self,
    propFile: str,
    labelSet: str,
    aliasSet: str,
    descriptionSet: str,
    n: int,
    destFp: TextIO = sys.stdout,
):
    self.propTypes = self.__setPropTypes(propFile)
    self.labelSet, self.aliasSet, self.descriptionSet = self.__setSets(
        labelSet, aliasSet, descriptionSet)
    # TODO handle standard output
    self.fp = destFp
    self.n = int(n)
    self.read = 0
    # serialize prefix
    kg_schema = KGSchema()
    kg_schema.add_schema("@prefix : <http://isi.edu/> .", "ttl")
    self.etk = ETK(kg_schema=kg_schema, modules=ETKModule)
    self.doc = self.__setDoc()
    self.__serialize_prefix()
def test_etk_crf_glossary_extraction(self):
    etk = ETK(use_spacy_tokenizer=False)
    s = time.time()
    city_extractor = GlossaryExtractor(
        ['los angeles', 'new york', 'angeles'],
        'city_extractor',
        etk.default_tokenizer,
        case_sensitive=False,
        ngrams=3)
    doc_json = {
        'text': 'i live in los angeles. my hometown is Beijing. I love New York City.'
    }
    doc = Document(etk, cdr_document=doc_json, mime_type='json', url='', doc_id='1')
    t_segments = doc.select_segments("$.text")
    for t_segment in t_segments:
        extracted_cities = doc.extract(city_extractor, t_segment)
        for extracted_city in extracted_cities:
            self.assertTrue(extracted_city.value in ['los angeles', 'New York', 'angeles'])
def model_data() -> None:
    """
    This function generates triples for user defined properties for uploading them to wikidata
    :return:
    """
    stream = open(Path.cwd().parent / "Datasets/new-property-configuration.yaml",
                  'r', encoding='utf8')
    yaml_data = yaml.safe_load(stream)

    # initialize
    kg_schema = KGSchema()
    kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
    etk = ETK(kg_schema=kg_schema, modules=ETKModule)
    doc = etk.create_document({}, doc_id="http://isi.edu/default-ns/projects")

    # bind prefixes
    doc.kg.bind('wikibase', 'http://wikiba.se/ontology#')
    doc.kg.bind('wd', 'http://www.wikidata.org/entity/')
    doc.kg.bind('wdt', 'http://www.wikidata.org/prop/direct/')
    doc.kg.bind('wdtn', 'http://www.wikidata.org/prop/direct-normalized/')
    doc.kg.bind('wdno', 'http://www.wikidata.org/prop/novalue/')
    doc.kg.bind('wds', 'http://www.wikidata.org/entity/statement/')
    doc.kg.bind('wdv', 'http://www.wikidata.org/value/')
    doc.kg.bind('wdref', 'http://www.wikidata.org/reference/')
    doc.kg.bind('p', 'http://www.wikidata.org/prop/')
    doc.kg.bind('pr', 'http://www.wikidata.org/prop/reference/')
    doc.kg.bind('prv', 'http://www.wikidata.org/prop/reference/value/')
    doc.kg.bind('prn', 'http://www.wikidata.org/prop/reference/value-normalized/')
    doc.kg.bind('ps', 'http://www.wikidata.org/prop/statement/')
    doc.kg.bind('psv', 'http://www.wikidata.org/prop/statement/value/')
    doc.kg.bind('psn', 'http://www.wikidata.org/prop/statement/value-normalized/')
    doc.kg.bind('pq', 'http://www.wikidata.org/prop/qualifier/')
    doc.kg.bind('pqv', 'http://www.wikidata.org/prop/qualifier/value/')
    doc.kg.bind('pqn', 'http://www.wikidata.org/prop/qualifier/value-normalized/')
    doc.kg.bind('skos', 'http://www.w3.org/2004/02/skos/core#')
    doc.kg.bind('prov', 'http://www.w3.org/ns/prov#')
    doc.kg.bind('schema', 'http://schema.org/')

    sparql_endpoint = "https://query.wikidata.org/sparql"
    type_map = {'quantity': Datatype.QuantityValue, 'url': URLValue}
    property_type_cache = {}

    for k, v in yaml_data.items():
        p = WDProperty(k, type_map[v['type']], creator='http://www.isi.edu/t2wml')
        for lang, value in v['label'].items():
            for val in value:
                p.add_label(val, lang=lang)
        for lang, value in v['description'].items():
            for val in value:
                p.add_description(val, lang=lang)
        for pnode, items in v['statements'].items():
            for item in items:
                try:
                    property_type = property_type_cache[pnode]
                except KeyError:
                    property_type = get_property_type(pnode, sparql_endpoint)
                    property_type_cache[pnode] = property_type

                if property_type == "WikibaseItem":
                    value = Item(str(item['value']))
                elif property_type == "WikibaseProperty":
                    value = Property(item['value'])
                elif property_type == "String":
                    value = StringValue(item['value'])
                elif property_type == "Quantity":
                    value = QuantityValue(item['value'])
                elif property_type == "Time":
                    value = TimeValue(
                        str(item['value']), Item(item["calendar"]),
                        translate_precision_to_integer(item["precision"]),
                        item["time_zone"])
                elif property_type == "Url":
                    value = URLValue(item['value'])
                elif property_type == "Monolingualtext":
                    value = MonolingualText(item['value'], item["lang"])
                elif property_type == "ExternalId":
                    value = ExternalIdentifier(item['value'])
                elif property_type == "GlobeCoordinate":
                    value = GlobeCoordinate(item["latitude"],
                                            item["longitude"],
                                            item["precision"])

                p.add_statement(pnode, value)
        doc.kg.add_subject(p)

    with open(Path.cwd().parent / "new_properties/result.ttl", "w") as f:
        data = doc.kg.serialize('ttl')
        f.write(data)
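# The if/elif chain above (and its twins in the generate_triples examples below)
# maps a Wikidata property-type string to an etk value object. A hedged
# refactoring sketch: `make_value` is a hypothetical helper, not part of etk or
# t2wml; it assumes the same value classes (Item, Property, StringValue, ...)
# and translate_precision_to_integer are imported as in the snippets above, and
# returns None for "Property Not Found" or any unrecognized type.
def make_value(property_type, item):
    if property_type == "WikibaseItem":
        return Item(str(item['value']))
    if property_type == "WikibaseProperty":
        return Property(item['value'])
    if property_type == "String":
        return StringValue(item['value'])
    if property_type == "Quantity":
        return QuantityValue(item['value'])
    if property_type == "Time":
        return TimeValue(str(item['value']), Item(item["calendar"]),
                         translate_precision_to_integer(item["precision"]),
                         item["time_zone"])
    if property_type == "Url":
        return URLValue(item['value'])
    if property_type == "Monolingualtext":
        return MonolingualText(item['value'], item["lang"])
    if property_type == "ExternalId":
        return ExternalIdentifier(item['value'])
    if property_type == "GlobeCoordinate":
        return GlobeCoordinate(item["latitude"], item["longitude"], item["precision"])
    return None  # "Property Not Found" or anything unrecognized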
"projects": [{ "name": "etk", "description": "version 2 of etk, implemented by Runqi, Dongyu, Sylvia, Amandeep, Anika and others." }] }, { "projects": [{ "name": "rltk", "description": "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students." }] }] etk = ETK(modules=ExampleETKModule) extractions = [] def mapper(sample, _idx): doc = etk.create_document(sample) docs = etk.process_ems(doc) sys.stdout.flush() re = docs[0].value # print(re) return re def collect(extracted): extractions.append(extracted) pp = ParallelProcessor(2, mapper=mapper,
from etk.timeseries_processor import TimeseriesProcessor
import pprint


class TimeseriesETKModule(ETKModule):
    """
    Abstract class for extraction module
    """
    def __init__(self, etk):
        ETKModule.__init__(self, etk)

    def process_document(self, doc):
        pass


if __name__ == "__main__":
    etk = ETK(modules=TimeseriesETKModule)
    annotation = './resources/DIESEL_june_annotation.json'
    spreadsheet = './resources/DIESEL_june_2017.xlsx'
    timeseries_processor = TimeseriesProcessor(etk=etk,
                                               annotation=annotation,
                                               spreadsheet=spreadsheet)
    file_name = 'test_file_name'
    data_set = 'test_data_set'
    docs = [
        doc.cdr_document
        for doc in timeseries_processor.timeseries_extractor(
            file_name=file_name, data_set=data_set)
    ]
    pprint.pprint(docs)
        for d, p in zip(descriptions, projects):
            names = doc.extract(self.name_extractor, d)
            p.store(names, "members")
        return list()


if __name__ == "__main__":
    # example for glossary extractor:
    sample_input = {
        "projects": [
            {
                "name": "etk",
                "description": "version 2 of etk, implemented by Runqi, Dongyu, Sylvia, Amandeep and others."
            },
            {
                "name": "rltk",
                "description": "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students."
            }
        ],
        "doc_id": 123
    }

    etk = ETK(modules=HelloWorldETKModule)
    doc = etk.create_document(sample_input)
    docs = etk.process_ems(doc)
    print(json.dumps(docs[0].value, indent=2))
def model_statement(self):
    # initialize KGSchema
    kg_schema = KGSchema()
    kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
    etk = ETK(kg_schema=kg_schema, modules=ETKModule)
    doc = etk.create_document({}, doc_id='http://isi.edu/default-ns/projects')

    # bind prefix
    doc = create_custom_prefix(doc, custom_dict={self.ns: self.uri})

    # extract files
    self.extract_files()

    # model statement
    inputs = self.data['inputs']
    for k, v in inputs.items():
        if k != 'metadata':
            # construct wikifier instance
            if k == 'wikifier' and not v['existed']:
                q = WDItem(v['qnode'], namespace=self.ns, creator=':datamart')
                q.add_label('A wikifier file for ' + inputs['dataset']['content']['filename'],
                            lang='en')
                q.add_statement('P31', Item('SDQ1001', namespace=self.ns))  # an instance of Wikifier
                q.add_statement('P127', Item('SDQ1003', namespace=self.ns))  # belongs to
                q.add_statement('SDP3003', StringValue(v['content']),
                                namespace=self.ns)  # hasFileContent
                q.add_statement('SDP3004', StringValue(v['hashcode']),
                                namespace=self.ns)  # hashValue

            # construct mapping_file instance
            elif k == 'mappingFile' and not v['existed']:
                q = WDItem(v['qnode'], namespace=self.ns, creator=':datamart')
                q.add_label('A mapping file for ' + inputs['dataset']['content']['filename'],
                            lang='en')
                q.add_statement('P31', Item('SDQ1002', namespace=self.ns))  # an instance of MappingFile
                q.add_statement('P170', StringValue('T2WML'))
                q.add_statement('P127', Item('SDQ1003', namespace=self.ns))
                q.add_statement('SDP3003', StringValue(json.dumps(v['content'])),
                                namespace=self.ns)
                q.add_statement('SDP3004', StringValue(v['hashcode']),
                                namespace=self.ns)

            # construct dataset instance
            elif k == 'dataset' and not v['existed']:
                q = WDItem(v['qnode'], namespace=self.ns, creator=':datamart')
                q.add_label(v['content']['title'], lang='en')
                q.add_description(v['content']['description'], lang='en')
                q.add_statement('P31', Item('Q1172284'))  # an instance of Dataset
                q.add_statement('SDP3001',
                                Item(inputs['wikifier']['qnode'], namespace=self.ns),
                                namespace=self.ns)  # a wikifier file
                q.add_statement('SDP3002',
                                Item(inputs['mappingFile']['qnode'], namespace=self.ns),
                                namespace=self.ns)  # a mapping file
                q.add_statement('P1476', StringValue(v['content']['title']))  # title
                q.add_statement('P921', StringValue(v['content']['description']))  # described
                q.add_statement('P127', Item('SDQ1003', namespace=self.ns))  # belongs to
                q.add_statement('SDP2004',
                                StringValue(', '.join(v['content']['keywords'])),
                                namespace=self.ns)  # keywords
                q.add_statement('SDP3004', StringValue(v['hashcode']),
                                namespace=self.ns)

                if self.data['storeColumnValue']:
                    for data in v['content']['variable_measured']:
                        statement = q.add_statement(
                            'SDP2005', StringValue(data['column_name']),
                            namespace=self.ns)  # variable measured
                        statement.add_qualifier(
                            'SDP2006', StringValue(data['values_of_a_column']),
                            namespace=self.ns)  # the values of a column
                        statement.add_qualifier(
                            'SDP2007', Item(data['data_structure_type']),
                            namespace=self.ns)  # data structure type
                        statement.add_qualifier(
                            'SDP2008', URLValue(data['semantic_type_identifier']),
                            namespace=self.ns)  # semantic type
                        statement.add_qualifier(
                            'P1545', QuantityValue(data['column_index'],
                                                   namespace=self.ns))  # column index

            doc.kg.add_subject(q)
    return doc
def model_schema(self):
    # read data
    data = self.read_data(self.data['schema'])

    # initialize KGSchema
    custom_dict, ns_dict = {}, {'wd': 'http://www.wikidata.org/entity/'}
    for each in data['prefix']:
        for k, v in each.items():
            custom_dict[k] = v
            if k != 'wd':
                ns_dict[k] = v + '/entity'
    kg_schema = KGSchema()
    kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
    etk = ETK(kg_schema=kg_schema, modules=ETKModule)
    doc = etk.create_document({}, doc_id='http://isi.edu/default-ns/projects')

    # bind prefix
    doc = create_custom_prefix(doc, custom_dict)

    type_map = {
        'quantity': Datatype.QuantityValue,
        'url': URLValue,
        'item': Datatype.Item,
        'time': Datatype.TimeValue,
        'string': Datatype.StringValue,
        'text': Datatype.MonolingualText
    }

    # model schema
    for k, v in data.items():
        if ':' in k:
            k = k.split(':')
            if 'Q' in k[1]:
                p = WDItem(k[1], namespace=k[0], creator=':datamart')
            elif 'P' in k[1]:
                p = WDProperty(k[1], type_map[v['type']], namespace=k[0], creator=':datamart')
            else:
                raise Exception('There is no P/Q information.')
                return None

            for lang, value in v['description'].items():
                for val in value:
                    p.add_description(val, lang=lang)
            for lang, value in v['label'].items():
                for val in value:
                    p.add_label(val, lang=lang)

            for node, value in v['statements'].items():
                ns = node.split(':')[0] if ':' in node else 'wd'
                for val in value:
                    prop_type = self.get_property_type(node, ns_dict[ns])
                    if prop_type == 'WikibaseItem':
                        v = Item(str(val['value']))
                    elif prop_type == 'WikibaseProperty':
                        v = Property(val['value'])
                    elif prop_type == 'String':
                        v = StringValue(val['value'])
                    elif prop_type == 'Quantity':
                        v = QuantityValue(val['value'])
                    elif prop_type == 'Url':
                        v = URLValue(val['value'])
                    elif prop_type == 'Monolingualtext':
                        v = MonolingualText(val['value'], val['lang'])
                    p.add_statement(node, v)
            doc.kg.add_subject(p)
    return doc
def generate_triples(user_id: str, resolved_excel: list, sparql_endpoint: str,
                     filetype: str = 'ttl', created_by: str = 't2wml') -> str:
    """
    This function uses ETK to generate the RDF triples
    :param user_id:
    :param resolved_excel:
    :param sparql_endpoint:
    :param filetype:
    :param created_by:
    :return:
    """
    # initialize
    kg_schema = KGSchema()
    kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
    etk = ETK(kg_schema=kg_schema, modules=ETKModule)
    doc = etk.create_document({}, doc_id="http://isi.edu/default-ns/projects")
    property_type_map = property_type_dict

    # bind prefixes
    doc.kg.bind('wikibase', 'http://wikiba.se/ontology#')
    doc.kg.bind('wd', 'http://www.wikidata.org/entity/')
    doc.kg.bind('wdt', 'http://www.wikidata.org/prop/direct/')
    doc.kg.bind('wdtn', 'http://www.wikidata.org/prop/direct-normalized/')
    doc.kg.bind('wdno', 'http://www.wikidata.org/prop/novalue/')
    doc.kg.bind('wds', 'http://www.wikidata.org/entity/statement/')
    doc.kg.bind('wdv', 'http://www.wikidata.org/value/')
    doc.kg.bind('wdref', 'http://www.wikidata.org/reference/')
    doc.kg.bind('p', 'http://www.wikidata.org/prop/')
    doc.kg.bind('pr', 'http://www.wikidata.org/prop/reference/')
    doc.kg.bind('prv', 'http://www.wikidata.org/prop/reference/value/')
    doc.kg.bind('prn', 'http://www.wikidata.org/prop/reference/value-normalized/')
    doc.kg.bind('ps', 'http://www.wikidata.org/prop/statement/')
    doc.kg.bind('psv', 'http://www.wikidata.org/prop/statement/value/')
    doc.kg.bind('psn', 'http://www.wikidata.org/prop/statement/value-normalized/')
    doc.kg.bind('pq', 'http://www.wikidata.org/prop/qualifier/')
    doc.kg.bind('pqv', 'http://www.wikidata.org/prop/qualifier/value/')
    doc.kg.bind('pqn', 'http://www.wikidata.org/prop/qualifier/value-normalized/')
    doc.kg.bind('skos', 'http://www.w3.org/2004/02/skos/core#')
    doc.kg.bind('prov', 'http://www.w3.org/ns/prov#')
    doc.kg.bind('schema', 'http://schema.org/')

    # property_type_cache = {}
    is_error = False
    for i in resolved_excel:
        _item = i["statement"]["item"]
        if _item is not None:
            item = WDItem(_item, creator='http://www.isi.edu/{}'.format(created_by))
            try:
                property_type = property_type_map[i["statement"]["property"]]
            except KeyError:
                property_type = get_property_type(i["statement"]["property"],
                                                  sparql_endpoint)
                if property_type != "Property Not Found" and \
                        i["statement"]["property"] not in property_type_map:
                    property_type_map[i["statement"]["property"]] = property_type

            if property_type == "WikibaseItem":
                value = Item(str(i["statement"]["value"]))
            elif property_type == "WikibaseProperty":
                value = Property(i["statement"]["value"])
            elif property_type == "String":
                value = StringValue(i["statement"]["value"])
            elif property_type == "Quantity":
                _value = i["statement"]["value"]
                _value = str(_value).replace(',', '')
                value = QuantityValue(_value)
            elif property_type == "Time":
                value = TimeValue(
                    str(i["statement"]["value"]),
                    Item(i["statement"]["calendar"]),
                    translate_precision_to_integer(i["statement"]["precision"]),
                    i["statement"]["time_zone"])
            elif property_type == "Url":
                value = URLValue(i["statement"]["value"])
            elif property_type == "Monolingualtext":
                value = MonolingualText(i["statement"]["value"],
                                        i["statement"]["lang"])
            elif property_type == "ExternalId":
                value = ExternalIdentifier(i["statement"]["value"])
            elif property_type == "GlobeCoordinate":
                value = GlobeCoordinate(i["statement"]["latitude"],
                                        i["statement"]["longitude"],
                                        i["statement"]["precision"])
            elif property_type == "Property Not Found":
                is_error = True
                break

            s = item.add_statement(i["statement"]["property"], value)
            doc.kg.add_subject(item)

            if "qualifier" in i["statement"]:
                for j in i["statement"]["qualifier"]:
                    try:
                        property_type = property_type_map[j["property"]]
                    except KeyError:
                        property_type = get_property_type(j["property"],
                                                          sparql_endpoint)
                        if property_type != "Property Not Found" and \
                                i["statement"]["property"] not in property_type_map:
                            property_type_map[i["statement"]["property"]] = property_type

                    if property_type == "WikibaseItem":
                        value = Item(str(j["value"]))
                    elif property_type == "WikibaseProperty":
                        value = Property(j["value"])
                    elif property_type == "String":
                        value = StringValue(j["value"])
                    elif property_type == "Quantity":
                        value = QuantityValue(j["value"])
                    elif property_type == "Time":
                        value = TimeValue(str(j["value"]), Item(j["calendar"]),
                                          j["precision"], j["time_zone"])
                    elif property_type == "Url":
                        value = URLValue(j["value"])
                    elif property_type == "Monolingualtext":
                        value = MonolingualText(j["value"], j["lang"])
                    elif property_type == "ExternalId":
                        value = ExternalIdentifier(j["value"])
                    elif property_type == "GlobeCoordinate":
                        value = GlobeCoordinate(j["latitude"], j["longitude"],
                                                j["precision"])
                    elif property_type == "Property Not Found":
                        is_error = True

                    if value is None:
                        continue
                    else:
                        s.add_qualifier(j["property"], value)
            doc.kg.add_subject(s)

    if not is_error:
        data = doc.kg.serialize(filetype)
    else:
        # data = "Property Not Found"
        raise Exception('data exception while generating triples')

    return data
""" def __init__(self, etk): ETKModule.__init__(self, etk) self.inferlink_extractor = InferlinkExtractor( InferlinkRuleSet( InferlinkRuleSet.load_rules_file( '../html_basic/sample_inferlink_rules.json'))) def process_document(self, doc): """ Add your code for processing the document """ raw = doc.select_segments("$.raw_content")[0] extractions = doc.extract(self.inferlink_extractor, raw) doc.store(extractions, "inferlink_extraction") return list() if __name__ == "__main__": sample_html = json.load(codecs.open('../html_basic/sample_html.json', 'r')) # read sample file from disk etk = ETK(modules=InferlinkETKModule) doc = etk.create_document(sample_html, mime_type="text/html", url="http://ex.com/123") docs = etk.process_ems(doc) print(json.dumps(docs[0].value, indent=2))
def generate_triples(user_id: str, resolved_excel: list, sparql_endpoint: str,
                     filetype: str = 'ttl') -> str:
    """
    This function uses ETK to generate the RDF triples
    :param user_id:
    :param resolved_excel:
    :param sparql_endpoint:
    :param filetype:
    :return:
    """
    # initialize
    kg_schema = KGSchema()
    kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
    etk = ETK(kg_schema=kg_schema, modules=ETKModule)
    doc = etk.create_document({}, doc_id="http://isi.edu/default-ns/projects")

    # bind prefixes
    doc.kg.bind('wikibase', 'http://wikiba.se/ontology#')
    doc.kg.bind('wd', 'http://www.wikidata.org/entity/')
    doc.kg.bind('wdt', 'http://www.wikidata.org/prop/direct/')
    doc.kg.bind('wdtn', 'http://www.wikidata.org/prop/direct-normalized/')
    doc.kg.bind('wdno', 'http://www.wikidata.org/prop/novalue/')
    doc.kg.bind('wds', 'http://www.wikidata.org/entity/statement/')
    doc.kg.bind('wdv', 'http://www.wikidata.org/value/')
    doc.kg.bind('wdref', 'http://www.wikidata.org/reference/')
    doc.kg.bind('p', 'http://www.wikidata.org/prop/')
    doc.kg.bind('pr', 'http://www.wikidata.org/prop/reference/')
    doc.kg.bind('prv', 'http://www.wikidata.org/prop/reference/value/')
    doc.kg.bind('prn', 'http://www.wikidata.org/prop/reference/value-normalized/')
    doc.kg.bind('ps', 'http://www.wikidata.org/prop/statement/')
    doc.kg.bind('psv', 'http://www.wikidata.org/prop/statement/value/')
    doc.kg.bind('psn', 'http://www.wikidata.org/prop/statement/value-normalized/')
    doc.kg.bind('pq', 'http://www.wikidata.org/prop/qualifier/')
    doc.kg.bind('pqv', 'http://www.wikidata.org/prop/qualifier/value/')
    doc.kg.bind('pqn', 'http://www.wikidata.org/prop/qualifier/value-normalized/')
    doc.kg.bind('skos', 'http://www.w3.org/2004/02/skos/core#')
    doc.kg.bind('prov', 'http://www.w3.org/ns/prov#')
    doc.kg.bind('schema', 'http://schema.org/')

    # property_type_cache = {}
    is_error = False
    for i in resolved_excel:
        item = WDItem(i["statement"]["item"], creator='http://www.isi.edu/t2wml')
        try:
            property_type = property_type_map[i["statement"]["property"]]
        except KeyError:
            property_type = get_property_type(i["statement"]["property"],
                                              sparql_endpoint)
            property_type_map[i["statement"]["property"]] = property_type

        if property_type == "WikibaseItem":
            value = Item(str(i["statement"]["value"]))
        elif property_type == "WikibaseProperty":
            value = Property(i["statement"]["value"])
        elif property_type == "String":
            value = StringValue(i["statement"]["value"])
        elif property_type == "Quantity":
            value = QuantityValue(i["statement"]["value"])
        elif property_type == "Time":
            value = TimeValue(str(i["statement"]["value"]),
                              Item(i["statement"]["calendar"]),
                              translate_precision_to_integer(i["statement"]["precision"]),
                              i["statement"]["time_zone"])
        elif property_type == "Url":
            value = URLValue(i["statement"]["value"])
        elif property_type == "Monolingualtext":
            value = MonolingualText(i["statement"]["value"], i["statement"]["lang"])
        elif property_type == "ExternalId":
            value = ExternalIdentifier(i["statement"]["value"])
        elif property_type == "GlobeCoordinate":
            value = GlobeCoordinate(i["statement"]["latitude"],
                                    i["statement"]["longitude"],
                                    i["statement"]["precision"])
        elif property_type == "Property Not Found":
            is_error = True
            break

        s = item.add_statement(i["statement"]["property"], value)
        doc.kg.add_subject(item)

        if "qualifier" in i["statement"]:
            for j in i["statement"]["qualifier"]:
                try:
                    property_type = property_type_map[j["property"]]
                except KeyError:
                    property_type = get_property_type(j["property"], sparql_endpoint)
                    property_type_map[j["property"]] = property_type

                if property_type == "WikibaseItem":
                    value = Item(str(j["value"]))
                elif property_type == "WikibaseProperty":
                    value = Property(j["value"])
                elif property_type == "String":
                    value = StringValue(j["value"])
                elif property_type == "Quantity":
                    value = QuantityValue(j["value"])
                elif property_type == "Time":
                    value = TimeValue(str(j["value"]), Item(j["calendar"]),
                                      j["precision"], j["time_zone"])
                elif property_type == "Url":
                    value = URLValue(j["value"])
                elif property_type == "Monolingualtext":
                    value = MonolingualText(j["value"], j["lang"])
                elif property_type == "ExternalId":
                    value = ExternalIdentifier(j["value"])
                elif property_type == "GlobeCoordinate":
                    value = GlobeCoordinate(j["latitude"], j["longitude"],
                                            j["precision"])
                elif property_type == "Property Not Found":
                    is_error = True

                s.add_qualifier(j["property"], value)
        doc.kg.add_subject(s)

    if not is_error:
        data = doc.kg.serialize(filetype)
    else:
        data = "Property Not Found"

    # os.makedirs(Path.cwd() / "new_properties", exist_ok=True)
    # results_file_name = user_id + "_results.ttl"
    # changes_file_name = user_id + "_changes.tsv"
    # with open(Path(app.config['downloads']) / results_file_name, "w") as fp:
    #     fp.write(data)
    # with open(Path(app.config['downloads']) / changes_file_name, "w") as fp:
    #     serialize_change_record(fp)
    return data
# "groundpig, whistlepig, whistler, thickwood badger, " # "Canada marmot, monax, moonack, weenusk, red monk and, " # "among French Canadians in eastern Canada, siffleur" # }, # { # "name": "Test3 - Social Media", # "description": "Parser stress test for tweets", # "text": "Slides onto twitter..... \n" # ".......slippery floor....... \n" # "............slides out the other side..." # } # ], # "doc_id": 42069 # } # etk = ETK(modules=SentenceSplittingETKModule) # doc = etk.create_document(toy_doc) # # split_doc = etk.process_ems(doc) # # print(json.dumps(split_doc[0].value, indent=2)) parser = OptionParser(conflict_handler="resolve") parser.add_option("-i", "--input_file", action="store", type="string", dest="input_file") parser.add_option("-o", "--output_file", action="store", type="string",
        projects = doc.select_segments("projects[*]")
        for d, p in zip(descriptions, projects):
            names = doc.extract(self.rule_extractor, d)
            p.store(names, "members")
        return list()


if __name__ == "__main__":
    sample_input = {
        "projects": [{
            "name": "etk",
            "description": "version 2 of etk, implemented by Runqi12 Shao, Dongyu Li, Sylvia lin, Amandeep and others."
        }, {
            "name": "rltk",
            "description": "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students."
        }]
    }

    etk = ETK(modules=RuleETKModule)
    doc = etk.create_document(sample_input)
    docs = etk.process_ems(doc)
    print(json.dumps(docs[0].value, indent=2))
        @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
        @prefix skos: <http://www.w3.org/2004/02/skos/core#> .
        @prefix schema: <http://schema.org/> .

        :Event a owl:Class ; .
        :Entity a owl:Class ; .
        :Organization a owl:Class ; .
        :MOVEMENT_TRANSPORT a owl:Class ; .
        :GeopoliticalEntity a owl:Class ; .

        skos:prefLabel a owl:DatatypeProperty ;
            schema:domainIncludes :Entity, :Event ;
            rdfs:range xsd:string ; .
        :conflict_attack_place a owl:ObjectProperty ;
            schema:domainIncludes :Entity, :Event ;
            schema:rangeIncludes :GeopoliticalEntity ; .
    '''
    ontology = Ontology(ontology_content, validation=False,
                        include_undefined_class=True, quiet=True)
    kg_schema = KGSchema(ontology.merge_with_master_config(dict()))
    etk = ETK(modules=ExampleETKModule, kg_schema=kg_schema, ontology=ontology)
    input_data = {'doc_id': '1', 'data': json.loads(sample_input)}
    doc = etk.create_document(input_data)
    docs = etk.process_ems(doc)
    kgs = [json.dumps(doc.kg.value) for doc in docs[1:]]
    with open('output.jsonl', 'w') as f:
        f.write('\n'.join(kgs))
    with open('output.nt', 'w') as f:
        f.writelines(map(rdf_generation, kgs))
import unittest, json

from etk.timeseries_processor import TimeseriesProcessor
from etk.etk import ETK
from etk.knowledge_graph import KGSchema

kg_schema = KGSchema(
    json.load(open('etk/unit_tests/ground_truth/test_config.json')))
etk = ETK(kg_schema=kg_schema)


# python -m unittest etk.unit_tests.test_timeseries_processor to run all unittests
class TestTimeseriesProcessor(unittest.TestCase):
    def test_excel_file(self) -> None:
        annotation = 'etk/timeseries/DIESEL_june_annotation.json'
        spreadsheet = 'etk/unit_tests/ground_truth/DIESEL_june_2017.xlsx'
        timeseriesProcessor = TimeseriesProcessor(etk=etk,
                                                  annotation=annotation,
                                                  spreadsheet=spreadsheet)
        docs = [
            doc.cdr_document
            for doc in timeseriesProcessor.timeseries_extractor()
        ]
        selected_docs = docs[1]
        expected_metadata = {
            "name": "AVERAGE DIESEL (AUTOMATIVE GAS OIL) PRICES/ Litre NGN",
            "granularity": "monthly",
            "provenance": {
                "filename": "DIESEL_june_2017.xlsx",
        # Add a title to the actor document
        doc.kg.add_value("title", json_path="$.Side")

        # Return an empty list because we didn't create new documents
        return []


# The main is for testing, and is not used in the DIG pipeline
if __name__ == "__main__":
    # Tell ETK the schema of the fields in the KG; the DIG master_config can be used as the schema.
    kg_schema = KGSchema(json.load(open('master_config.json')))

    # Instantiate ETK, with the two processing modules and the schema.
    etk = ETK(modules=[UCDPModule, UCDPActorModule], kg_schema=kg_schema)

    # Create a CSV processor to create documents for the relevant rows in the Excel sheet
    cp = CsvProcessor(etk=etk, heading_row=1)

    with open("ucdp.jl", "w") as f:
        # Iterate over all the rows in the spreadsheet
        for doc in cp.tabular_extractor(filename="ucdp_sample.xls", dataset='ucdp'):
            # Each row produces a document, which we send to ETK.
            # Note that each invocation of process_ems will also process any new documents created while
            # processing each doc
            etk.process_and_frame(doc)
            f.write(json.dumps(doc.cdr_document) + "\n")
        # for result in etk.process_ems(doc):
        #     # print(result.cdr_document["knowledge_graph"])
        #     f.write(json.dumps(result.cdr_document) + "\n")
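# A hedged sketch of what one of the modules referenced above (e.g. UCDPActorModule)
# might look like; the class body is assembled from the fragment at the top of
# this example, and the dataset check in document_selector is an assumption, not
# the actual DIG implementation.
from etk.etk_module import ETKModule


class UCDPActorModuleSketch(ETKModule):
    def __init__(self, etk):
        ETKModule.__init__(self, etk)

    def document_selector(self, doc):
        # Only handle documents produced from the 'ucdp' dataset
        return doc.cdr_document.get("dataset") == "ucdp"

    def process_document(self, doc):
        # Add a title to the actor document
        doc.kg.add_value("title", json_path="$.Side")
        # Return an empty list because we didn't create new documents
        return []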
"news_story": { "type": "string" }, "similarity": { "type": "number" }, "matched_sentence": { "type": "string" }, "date": { "type": "string" } } } kg_schema = KGSchema(master_config) etk = ETK(kg_schema, ["./"]) # read the news news_file = open( '/Users/amandeep/Github/etk/examples/ranking_pipeline/resources/new_2018-04-03-first-10000.jl' ) # news_file = open('/Users/amandeep/Github/etk/examples/ranking_pipeline/resources/news_stories_3.jl') news_stories = [ etk.create_document(json.loads(line), url=json.loads(line)['tld'], doc_id=json.loads(line)['doc_id']) for line in news_file ] results = list() for news_story in news_stories: results.extend(etk.process_ems(news_story))
import os, sys, json, codecs

sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))

from etk.etk import ETK
from etk.extractors.html_content_extractor import HTMLContentExtractor, Strategy
from etk.extractors.html_metadata_extractor import HTMLMetadataExtractor
from etk.extractors.inferlink_extractor import InferlinkExtractor, InferlinkRuleSet

sample_html = json.load(codecs.open('sample_html.json', 'r'))  # read sample file from disk

etk = ETK()
doc = etk.create_document(sample_html, mime_type="text/html", url="http://ex.com/123")

metadata_extractor = HTMLMetadataExtractor()
content_extractor = HTMLContentExtractor()
landmark_extractor = InferlinkExtractor(
    InferlinkRuleSet(
        InferlinkRuleSet.load_rules_file('sample_inferlink_rules.json')))

root = doc.select_segments("$")[0]
raw = doc.select_segments("$.raw_content")[0]

# root.store_extractions(doc.invoke_extractor(metadata_extractor, extract_title=True), "title")
# root.store_extractions(doc.invoke_extractor(metadata_extractor, extract_meta=True), "metadata")

root.store_extractions(
    doc.invoke_extractor(content_extractor, raw, strategy=Strategy.ALL_TEXT),
    "etk2_text")
root.store_extractions(
import os, sys, json

sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))

from etk.etk import ETK
from etk.knowledge_graph import KGSchema
from examples.config_to_em.em_base_generator import EmBaseGenerator

ebg = EmBaseGenerator('template.tpl')
ebg.generate_em_base('master_config.json', 'ems/em_base.py')

kg_schema = KGSchema(json.load(open("master_config.json", "r")))
etk = ETK(kg_schema, ["./ems"])
doc = etk.create_document(json.load(open('sample_html.jl', 'r')))
docs = etk.process_ems(doc)
print(json.dumps(docs[0].value, indent=2))