def __call__(self, text, subject_begin, subject_end, object_begin, object_end, **kwargs):
    """Propose DBpedia relation candidates for the text between a subject and an object mention.

    Returns a list of (candidate_uri, confidence) tuples.
    """
    # The candidate-bearing text lies between the two argument spans; note
    # which argument comes first.
    if subject_begin < object_begin:
        span_begin, span_end = subject_end, object_begin
        is_reversed = False  # kept for parity with the original (currently unused)
    else:
        span_begin, span_end = object_end, subject_begin
        is_reversed = True

    analysis = run_task(Task.SPACY_PROCESS, text)
    tokens = analysis['tokens']
    idx = analysis['idx']
    pos = analysis['pos']

    # Select the tokens falling inside the in-between span, skipping punctuation.
    selected_indexes = [
        i for i in self.__tokens_in_range(tokens, span_begin, span_end, idx)
        if pos[i] != 'PUNCT'
    ]
    between_tokens = [tokens[i] for i in selected_indexes]
    between_pos = [pos[i] for i in selected_indexes]
    between_text = ' '.join(between_tokens)  # unused, kept for parity

    # Match lexical patterns, then expand every hit into both DBpedia namespaces.
    pattern_hits = self.pattern_trie.get_value(between_tokens, between_pos)
    candidates = ([DBPEDIA_ONTOLOGY_PREFIX + hit for hit in pattern_hits]
                  + [DBPEDIA_PROPERTY_PREFIX + hit for hit in pattern_hits])

    # Uniform confidence over the namespace-doubled candidate list.
    return [(candidate, 2 / len(candidates)) for candidate in candidates]
def __map_type(self, node: QueryTree.Node) -> None:
    """Resolve KB type resources for *node* via the MAP_TYPE task and store them on the node."""
    type_begin, type_end = self.tree.offset_for_node(node)
    payload = {
        'text': self.question_text,
        'type_begin': type_begin,
        'type_end': type_end,
    }
    node.kb_resources = run_task(Task.MAP_TYPE, payload)
def __map_entity(self, node: QueryTree.Node) -> None:
    """Resolve KB entity resources for *node*; demote it to LITERAL when nothing maps."""
    # TODO: handle type constraints for entities
    node.children = [child for child in node.children
                     if child.type == NodeType.TOKEN]
    entity_begin, entity_end = self.tree.offset_for_node(node)
    node.kb_resources = run_task(Task.MAP_ENTITY, {
        'text': self.question_text,
        'entity_begin': entity_begin,
        'entity_end': entity_end,
    })
    if not node.kb_resources:
        node.type = NodeType.LITERAL
def __call__(self, query_text: str) -> List[dict]:
    """Parse *query_text* into candidate query trees serialized as hierarchical dicts."""
    tokens = run_task(Task.TOKENIZE, query_text)
    self.syntax_validator = SyntaxChecker(GRAMMAR_FILE_PATH)
    self.__prepare_input(tokens)
    self.__run_ncrfpp()
    # Statistically parsed trees might not validate the grammar. Discard invalid trees.
    valid_trees = [tree for tree in self.__decode_labels(tokens)
                   if self.syntax_validator.validate(tree)]
    # print('Produced {}/{} valid candidates!'.format(len(valid_trees), TREE_CANDIDATES_N_BEST))
    return [tree.to_serializable(SerializationFormat.HIERARCHICAL_DICT)
            for tree in valid_trees]
def init(index, text):
    """Set up the annotation canvas for one example: token nodes along the bottom, a ROOT node on top."""
    state['tokens'] = tokens = run_task(Task.TOKENIZE, text.strip())
    state['example_index'] = index
    canvas.bind(
        "<Button-1>",
        lambda event: user_create_node(event.x, event.y, state['node_type']))
    canvas.bind("<Key>", on_key)
    # Lay the token nodes out evenly near the bottom edge of the window.
    # (loop variable renamed so it no longer shadows the `index` parameter)
    for token_index, _ in enumerate(tokens):
        create_node(x=settings['window_width'] * (token_index / len(tokens)),
                    y=settings['window_height'] - 100,
                    node_type=NodeType.TOKEN,
                    token=token_index)
    create_node(x=settings['window_width'] // 2,
                y=100,
                node_type=NodeType.ROOT)
    canvas.focus_set()
return node root = node_from_dict(tree_dict['tree']) # Aggregate unused tokens if len(used_nodes) < len(token_nodes): unused_container_node = QueryTree.Node(NodeType.UNUSED) root.children.append(unused_container_node) for node in token_nodes: if node not in used_nodes: unused_container_node.children.append(node) tree = QueryTree(root, tokens) return tree with open('jimmy.ask', 'w') as output_file: for tree in trees: try: index = int(tree['id']) tokens = run_task(Task.TOKENIZE, questions[index][1]) tokens_to_token(tree['tree']) query_tree = from_dict(tree, tokens) output_file.write( query_tree.to_serializable( SerializationFormat.PREFIX_PARANTHESES)) except: print("failed!")
def generate_prior_candidates(generator, node: QueryTree.Node):
    """Collect KB relation candidates consistent with the constraints accumulated so far for *node*.

    Queries the in-order (subject-object) direction and, where the node type
    permits, the reverse direction too; reversed hits are mapped back through
    the equivalent-relation resolver. Candidates that would reverse a relation
    already mapped on a child node are dropped to avoid cycles.
    """

    def query_direction(reverse: bool):
        # Use copies of the node and generator so the current mapping state
        # is left untouched.
        scratch_node = deepcopy(node)
        scratch_gen = deepcopy(generator)
        NODE_HANDLERS[node.type](gen=scratch_gen,
                                 node=scratch_node,
                                 reverse_relation=reverse)
        query = scratch_gen.generate_query_from_current_state(
            constants.RELATION_EXTRACTION_VARIABLE)
        results = run_task(
            Task.RUN_SPARQL_QUERY, {
                'query_body': query,
                'return_variable':
                constants.RELATION_EXTRACTION_VARIABLE.replace('?', '')
            })
        return [r for r in results
                if r not in constants.RELATION_MAPPING_BLACKLIST]

    prior_candidates = query_direction(reverse=False)

    # The subject-object order of the triple is unknown as well, so gather the
    # reverse-direction candidates too — except for node types that can't be
    # reversed.
    irreversible = {NodeType.ARGMAX, NodeType.ARGMIN, NodeType.ARGNTH, NodeType.TOPN}
    if node.type not in irreversible:
        prior_candidates.extend(
            EQUIVALENT_RELATION_RESOLVER.reverse_relation(candidate)
            for candidate in query_direction(reverse=True))

    # Remove candidates already mapped in reverse on a child node, which would
    # introduce a cycle.
    child_relations = []
    for child in node.collect(RELATION_NODE_TYPES):
        child_relations.extend(child.kb_resources)
    reversed_child_relations = {
        EQUIVALENT_RELATION_RESOLVER.reverse_relation(relation)
        for relation in child_relations
    }
    return [candidate for candidate in prior_candidates
            if candidate not in reversed_child_relations]
def __map_relation(self, node: QueryTree.Node) -> None:
    """Map *node* to its KB relation resources.

    Builds a prior candidate set from the constraints accumulated so far (both
    triple directions), asks the MAP_RELATIONS task to rank the candidates
    against the question, and stores the result on ``node.kb_resources``.

    Fixes over the original: the return annotation said ``bool`` but nothing
    is ever returned (now ``None``), and a missing parent node no longer
    crashes the EXISTS check.
    """

    def generate_prior_candidates(generator, node: QueryTree.Node):
        """Query the KB for all relations consistent with the current constraints."""
        # Use copies of the accumulated constraints to generate a query that
        # retrieves all possible relations for this node without breaking the
        # current state.
        node_copy = deepcopy(node)
        gen = deepcopy(generator)
        NODE_HANDLERS[node.type](gen=gen, node=node_copy, reverse_relation=False)
        in_order_query = gen.generate_query_from_current_state(
            constants.RELATION_EXTRACTION_VARIABLE)
        in_order_candidates = run_task(
            Task.RUN_SPARQL_QUERY, {
                'query_body': in_order_query,
                'return_variable':
                constants.RELATION_EXTRACTION_VARIABLE.replace('?', '')
            })
        prior_candidates = [
            c for c in in_order_candidates
            if c not in constants.RELATION_MAPPING_BLACKLIST
        ]

        # The subject-object order of the triple is also unknown, so gather
        # the reverse-direction candidates too — except for node types that
        # can't be reversed.
        if node.type not in {NodeType.ARGMAX, NodeType.ARGMIN,
                             NodeType.ARGNTH, NodeType.TOPN}:
            node_copy = deepcopy(node)
            gen = deepcopy(generator)
            NODE_HANDLERS[node.type](gen=gen, node=node_copy, reverse_relation=True)
            reverse_order_query = gen.generate_query_from_current_state(
                constants.RELATION_EXTRACTION_VARIABLE)
            reverse_order_candidates = run_task(
                Task.RUN_SPARQL_QUERY, {
                    'query_body': reverse_order_query,
                    'return_variable':
                    constants.RELATION_EXTRACTION_VARIABLE.replace('?', '')
                })
            prior_candidates.extend(
                EQUIVALENT_RELATION_RESOLVER.reverse_relation(candidate)
                for candidate in reverse_order_candidates
                if candidate not in constants.RELATION_MAPPING_BLACKLIST)

        # Remove candidates already mapped in reverse on a child node so as to
        # avoid cycles.
        child_relations = []
        for child in node.collect(RELATION_NODE_TYPES):
            child_relations.extend(child.kb_resources)
        reversed_child_relations = {
            EQUIVALENT_RELATION_RESOLVER.reverse_relation(relation)
            for relation in child_relations
        }
        return [
            relation for relation in prior_candidates
            if relation not in reversed_child_relations
        ]

    parent_node = self.tree.find_parent(node)
    # NOTE(review): guard against a missing parent (e.g. the root node) —
    # the original would raise AttributeError on ``parent_node.type``.
    # Assumes find_parent returns None in that case; confirm against QueryTree.
    is_exists_context = (node.type == NodeType.EXISTSRELATION
                         or (parent_node is not None
                             and parent_node.type == NodeType.EXISTS))
    if is_exists_context:
        # In case of EXISTS we can't consider prior candidates because mapping
        # implies picking the most probable from them — EXISTS would always
        # yield true. Instead, pick the most probable relation from the full
        # relation search space.
        prior_candidates = []
    else:
        prior_candidates = generate_prior_candidates(self, node)
        if not prior_candidates:
            # Relax type constraints when they yield no results (the KB might
            # be inconsistent, the answer might be a string, etc.).
            node.children = [child for child in node.children
                             if child.type != NodeType.TYPE]
            prior_candidates = generate_prior_candidates(self, node)

    relation_mapping_input = self.tree.generate_relation_extraction_sequence(node)
    relation_mapping_input['candidates'] = prior_candidates
    relations = run_task(Task.MAP_RELATIONS, relation_mapping_input)

    if is_exists_context:
        # For existence checking, consider both directions of each relation.
        relations.extend([
            EQUIVALENT_RELATION_RESOLVER.reverse_relation(relation)
            for relation in relations
        ])
    node.kb_resources = relations