def make(self, triples_filename, links_filename, verbose=False): #if not triples_filename.startswith('/'): # triples_filename = os.path.dirname(os.path.realpath(__file__)) + '/' + triples_filename #if not links_filename == None and not links_filename.startswith('/'): # links_filename = os.path.dirname(os.path.realpath(__file__)) + '/' + links_filename print('Processing triples from {}'.format(triples_filename)) self.__prefixed = { 'http://www.w3.org/2000/01/rdf-schema#': 'rdfs', 'http://local/local.owl#': 'local', 'http://local/verbnet_roles.owl#': 'vn.role' } if not links_filename == None: with open(links_filename, 'r') as links_file: for line in links_file.readlines(): if len(line) < 2: continue if line.startswith('@PREFIX'): line_list = line.split() prefix = line_list[1] prefix = prefix[:prefix.rfind(':')] uri = line_list[2] uri = uri[uri.find('<') + 1:uri.find('>')] self.__prefixed.update({uri: prefix}) elif line.startswith('@LINK'): line_list = line.split('\t') predicate = line_list[1] links = '\t'.join(line_list[2:]) self.__links.update({predicate: links}) links_file.close() with open(triples_filename, 'r') as triples_file: for line in triples_file.readlines(): line_lst = line.replace('\"', '').split('\t') sentence_number = line_lst[0].strip() subject = line_lst[1].strip() predicate = line_lst[2].strip() object = line_lst[3].strip() if not predicate in self.__links and predicate.find(':') < 0: if verbose: print( 'Warning: no match for predicate "{}" was found in the links! Skipping triple ...' .format(predicate)) # continue predicate_link = '' #if predicate.find(':') < 0: #Predicates that are already resources/links if predicate in self.__links: predicate_link = self.__links[predicate] entities = set([str(X) for X in self.__links.keys()]) closest_subjects = difflib.get_close_matches(subject, entities, n=3, cutoff=1.0) closest_objects = difflib.get_close_matches(object, entities, n=3, cutoff=1.0) if len(closest_subjects) < 1: if verbose: print( 'Warning: no match for subject "{}" was found in the links! Attempting partials ...' .format(subject)) #subj = subject # Reverse sorted list of entities by string length #lst_entities = sorted(list(entities), key=len, reverse=True) #for elem in lst_entities: # if elem in subj: # if verbose: # print('-- Found: {}'.format(elem)) # closest_subjects.append(elem) # subj = subj.replace(elem, '') #if len(closest_subjects) < 1: # if verbose: # print('WARNING: not even partial matches were found for subject "{}" in the links!'.format(subject)) # continue if len(closest_objects) < 1: if verbose: print( 'Warning: no match for object "{}" was found in the links! Atempting partials ...' .format(object)) obj = object # Reverse sorted list of entities by string length #lst_entities = sorted(list(entities), key=len, reverse=True) #for elem in lst_entities: # if elem in obj: # if verbose: # print('-- Found: {}'.format(elem)) # closest_objects.append(elem) # obj = obj.replace(elem, '') #if len(closest_objects) < 1: # if verbose: # print('WARNING: not even partial matches were found for object "{}" in the links!'.format(object)) # continue # Check for exact matches and discard the others if that's the case #for sub in closest_subjects: # if subject == sub: # closest_subjects = [sub] # break #for ob in closest_objects: # if object == ob: # closest_objects = [ob] # break subject_links = [] for subj in closest_subjects: subject_links += [self.__links[subj]] object_links = [] for obj in closest_objects: object_links += [self.__links[obj]] triple = Triple(sentence_number, subject, predicate, object, subject_links, predicate_link, object_links) prefixes, classes, properties, mapped, relation = triple.to_turtle( ) self.__prefixed.update(prefixes) self.__classes.update(classes) self.__mapped_relations.update(mapped) self.__properties.update(properties) self.__relations.add(relation) triples_file.close() output_filename = os.path.splitext(triples_filename)[0] output_filename = output_filename[:output_filename.rfind('_' )] + '_kg.ttl' open(output_filename, 'w').close() # Clean the file in case it exists with open(output_filename, 'a') as output_file: for key in self.__prefixed.keys(): output_file.write('@prefix\t{}:\t<{}>\t.\n'.format( self.__prefixed[key], key)) output_file.write('\n#### Classes ####\n\n') for key in self.__classes.keys(): output_file.write('{}\n\n'.format(self.__classes[key])) output_file.write('#### Properties ####\n\n') for key in self.__properties.keys(): output_file.write('{}\n\n'.format(self.__properties[key])) output_file.write('#### Mapped Relations ####\n\n') for mapping in self.__mapped_relations: output_file.write('{}\n'.format(mapping)) output_file.write('\n#### Relations ####\n\n') for relation in self.__relations: output_file.write('{}\n'.format(relation)) output_file.close() print('Linked entities were stored at {}'.format(output_filename)) return output_filename
def make(self, triples_filename, links_filename, verbose=False): if not triples_filename.startswith('/'): triples_filename = os.path.dirname( os.path.realpath(__file__)) + '/' + triples_filename if not links_filename.startswith('/'): links_filename = os.path.dirname( os.path.realpath(__file__)) + '/' + links_filename print('Processing predicates from {}'.format(triples_filename)) self.__predicates = {} self.__prefixed = { 'http://www.w3.org/2000/01/rdf-schema#': 'rdfs', 'http://local/local.owl#': 'local' } self.__entities = {} with open(links_filename, 'r') as links_file: for line in links_file.readlines(): if len(line) < 2: continue if line.startswith('@PREFIX'): line_list = line.split() prefix = line_list[1] prefix = prefix[:prefix.rfind(':')] uri = line_list[2] uri = uri[uri.find('<') + 1:uri.find('>')] self.__prefixed.update({uri: prefix}) elif line.startswith('@PREDICATE'): line_list = line.split() line = ' '.join(line_list[1:]) line_list = line.split(';') predicate = line_list[0] link = line_list[1] self.__predicates.update({predicate: link}) elif line.startswith('@ENTITY'): line_list = line.split() line = ' '.join(line_list[1:]) line_list = line.split(';') entity = line_list[0] links = line_list[1].split(',') self.__entities.update({entity: links}) links_file.close() with open(triples_filename, 'r') as triples_file: for line in triples_file.readlines(): line_lst = line.replace('\"', '').split('\t') sentence_number = line_lst[0] subject = line_lst[1] predicate = line_lst[2] object = line_lst[3] if not predicate in self.__predicates: print('Warning: predicate "{}" not found in links!'.format( predicate)) continue predicate_link = self.__predicates[predicate] closest_subject = difflib.get_close_matches( subject, self.__entities) closest_object = difflib.get_close_matches( object, self.__entities) if len(closest_subject) < 1 or len(closest_object) < 1: continue subject_link = self.__entities[closest_subject[0]] object_link = self.__entities[closest_object[0]] triple = Triple(sentence_number, closest_subject[0], predicate, closest_object[0], subject_link, predicate_link, object_link) prefixes, classes, properties, mapped, relation = triple.to_turtle( ) self.__prefixed.update(prefixes) self.__classes.update(classes) self.__mapped_relations.update(mapped) self.__properties.update(properties) self.__relations.add(relation) triples_file.close() output_filename = os.path.splitext(triples_filename)[0] output_filename = output_filename[:output_filename.rfind('_' )] + '_kg.ttl' open(output_filename, 'w').close() # Clean the file in case it exists with open(output_filename, 'a') as output_file: for key in self.__prefixed.keys(): output_file.write('@prefix\t{}:\t<{}>\t.\n'.format( self.__prefixed[key], key)) output_file.write('\n#### Classes ####\n\n') for key in self.__classes.keys(): output_file.write('{}\n\n'.format(self.__classes[key])) output_file.write('#### Properties ####\n\n') for key in self.__properties.keys(): output_file.write('{}\n\n'.format(self.__properties[key])) output_file.write('#### Mapped Relations ####\n\n') for mapping in self.__mapped_relations: output_file.write('{}\n'.format(mapping)) output_file.write('\n#### Relations ####\n\n') for relation in self.__relations: output_file.write('{}\n'.format(relation)) output_file.close() print('Linked entities were stored at {}'.format(output_filename)) return output_filename