def scoring(triples: List[Tuple[str, str, str]], n_candidates_threshold: int = 5) -> List[Tuple[str, str, str]]:
    """Replaces predicates with linked predicates (URIs)

    :param triples: A list of triples to be processed [(subject, predicate, object), ...]
        :ex: [["http://dbpedia.org/resource/Barack_Obama", "bear in", "http://dbpedia.org/resource/Hawaii"]]
    :param n_candidates_threshold: A maximum number of candidates to be verified.
        The verification algorithm may take a very long time to verify a large set of candidates
        :ex: 5
    :return: A list of triples with predicates replaced
    """

    verbose.info('Mapping predicates using scoring system', scoring)

    linked = []

    for triple in triples:
        # Rank predicate candidates by their mapping score, best first
        ranked = sorted(_find_candidates(triple[1]), key=lambda c: c[1], reverse=True)

        # Accept the highest-scoring candidate that also passes the domain-range check.
        # Triples with no acceptable candidate are dropped from the result.
        for candidate in ranked[:n_candidates_threshold]:
            if verify.agreement(triple, candidate[0]):
                linked.append((triple[0], candidate[0], triple[2]))
                break

    return linked
def openie(text: str) -> List[Tuple[str, str, str]]:
    """Extracts triples using Stanford CoreNLP OpenIE library via CoreNLPConnector in Java REST API

    :param text: A string to be extracted
        :ex: Barack Obama born in Hawaii
    :return: A list of triples
    """

    client = env.resolve('servers.java')
    verbose.info('Extracting triples using OpenIE at: ' + client['address'], caller=openie)

    # Delegate extraction to the Java service and return its JSON payload unchanged
    url = '%s/openie/triples' % client['address']
    response = requests.get(url, params={'text': text})
    return response.json()
def dbpedia(triple: Tuple[str, str, str], predicate: str) -> bool:
    """Verifies domain-range agreement using DBpedia's ontology

    :param triple: A triple (subject's entity, predicate, object's entity)
        :ex: ["http://dbpedia.org/resource/Barack_Obama", "bear in", "http://dbpedia.org/resource/Hawaii"]
    :param predicate: Entity of the predicate to be verified
        :ex: http://dbpedia.org/ontology/birthPlace
    :return: True if the predicate does not constitute domain-range violation
    """

    # NOTE: Since DBpedia is expected to be the primary service for ontology look-up in this project,
    # removing part of the URI helps reduce redundant information showed on a screen.
    # The following lines of code should be modified when the service endpoint is changed.
    config = env.resolve('database.virtuoso')

    if 'port' in config:
        e = '%s:%d/sparql/' % (config['address'], config['port'])
    else:
        e = '%s/sparql/' % config['address']

    verbose.info(
        "Verifying domain-range of a predicate '%s' with SPARQL server: %s"
        % (predicate.replace('http://dbpedia.org/ontology/', ''), e), dbpedia)

    # Ask whether any relation links the object entity to the predicate's declared range
    query = ('SELECT DISTINCT ?vr WHERE {{ '
             '<{0}> <http://www.w3.org/2000/01/rdf-schema#range> ?rp. '
             '<{1}> ?vr ?rp. }}').format(predicate, triple[2])

    sparql = SPARQLWrapper(e)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)

    try:
        results = sparql.query().convert()
        # A non-empty binding means the object fits the predicate's range
        if any(len(binding['vr']['value']) > 0 for binding in results['results']['bindings']):
            return True
    except QueryBadFormed as error:
        verbose.error(str(error), dbpedia)
        return False

    # Domain-range violation concluded
    return False
def triples(triples: List[Tuple[str, str, str]], entities: Dict[str, str]) -> List[Tuple[str, str, str]]:
    """Replaces subjects and objects in a list of triples with entities

    :param triples: A list of triples [(subject, predicate, object), ...]
        :ex: [["Barack Obama", "bear in", "Hawaii"]]
    :param entities: A dictionary of known entities
        :ex: {"Barack Obama": "http://dbpedia.org/resource/Barack_Obama",
              "Hawaii": "http://dbpedia.org/resource/Hawaii"}
    :return: A list of triples with subjects and objects replaced with their entities
    """

    def replace_entity(term: str) -> str:
        # Single dict lookup via .get (was a double lookup: `in` test + indexing);
        # unknown terms pass through unchanged
        term = term.strip()
        return entities.get(term, term)

    # The parameter shadows this function's name, so the caller reference for
    # logging has to go through globals()
    verbose.info('Aggregating triples', globals()['triples'])

    # Only subjects and objects are linked; predicates are left for the
    # predicate-mapping step
    return [(replace_entity(triple[0]), triple[1], replace_entity(triple[2]))
            for triple in triples]
def dbpedia_spotlight(text: str, endpoint: str = None, confidence: float = None) -> Dict[str, str]:
    """Maps entities from a text

    :param text: A string (to be mapped)
        :ex: Barack Obama born in Hawaii
    :param endpoint: Annotator endpoint (defaults to the configured endpoint)
        :ex: http://model.dbpedia-spotlight.org/en/annotate
    :param confidence: Minimum threshold of confidence value of found entities
        (defaults to the configured confidence)
        :ex: 0.5
    :return: A dictionary of mapped entities (URI)
    """

    config = configs.resolve('knowledge.integration.map.entities.dbpedia')

    # BUG FIX: the old signature defaulted confidence to 0.5, which made the
    # `confidence or config[...]` fallback dead code (0.5 is always truthy) and
    # silently ignored the configured value. Defaulting to None and testing
    # `is None` restores the config fallback and also keeps an explicit 0.0
    # threshold from being overridden.
    if confidence is None:
        confidence = config['confidence']
    if endpoint is None:
        endpoint = config['endpoint']

    verbose.info('Mapping entities with annotation endpoint: %s' % endpoint, dbpedia_spotlight)
    response = requests.post(endpoint,
                             data={'text': text, 'confidence': str(confidence)},
                             headers={'Accept': 'application/json'})

    # Spotlight omits the 'Resources' key entirely when no entities are found,
    # so fall back to an empty list instead of raising KeyError
    entities = {}
    for item in response.json().get('Resources', []):
        entities[item['@surfaceForm']] = item['@URI']

    return entities
def ready(self):
    """Runs start-up checks when the service state is 'running':
    fetches the NLTK wordnet corpus and opens a MongoDB client."""

    if os.environ.get('UWKGM_STATE') == 'running':
        verbose.info('Initializing NLTK wordnet...', KnowledgeConfig)

        # Older Python builds lack ssl._create_unverified_context; when it
        # exists, make it the default so the NLTK download is not blocked by
        # certificate verification
        unverified_context = getattr(ssl, '_create_unverified_context', None)
        if unverified_context is not None:
            ssl._create_default_https_context = unverified_context

        nltk.download('wordnet')

        verbose.info('Testing MongoDB database connection...')
        uri = 'mongodb://%s:%d/' % (env.resolve('database.mongo.address'),
                                    env.resolve('database.mongo.port'))
        pymongo.MongoClient(uri)
        verbose.info('MongoDB connection test pass')