def get_statements_from_ldcsc(ldcsc, var_counter=0):
    """Collect the statements encoded by a constituent tree, recursively.

    Args:
        ldcsc: the node to process. Anything that is not a ``Tree``, and any
            tree whose top is 'DATE' or 'NUMBER', yields no statements.
        var_counter: index used to name the variable ('?x<n>') introduced
            by this node.

    Returns:
        A list holding the ``Statement`` built for this node (if any), the
        FILTER strings produced for a DATE argument (if any), and everything
        collected from the children.
    """
    collected = []
    if not isinstance(ldcsc, Tree) or get_top(ldcsc) in ['DATE', 'NUMBER']:
        return collected

    head = ldcsc[0]
    head_is_string = IsString(head)
    if head_is_string and not is_operator(head):
        # The head is a predicate name: relate a fresh variable for this
        # node to its first argument.
        node_var = '?x' + str(var_counter)
        predicate = head.strip('!')
        argument = ldcsc[1]
        if IsString(argument):
            other = argument
        else:
            argument_top = get_top(argument)
            if argument_top == 'DATE':
                other = '?d0'
                collected.extend(get_statements_from_date(argument, '?d0'))
            elif argument_top == 'NUMBER':
                other = get_number_from_constituent(argument)
            else:
                # The argument is a nested tree; refer to it through the
                # variable name of the next counter value.
                other = '?x' + str(var_counter + 1)
        if head.startswith('!'):
            # A leading '!' swaps the two sides of the statement.
            triple = Statement(other, predicate, node_var)
        else:
            triple = Statement(node_var, predicate, other)
        collected.append(triple)
        var_counter += 1

    # A string head is not a subtree, so recursion starts after it.
    first_child = 1 if head_is_string else 0
    for child in ldcsc[first_child:]:
        collected.extend(get_statements_from_ldcsc(child, var_counter))
    return collected
def GetRelevantRules(self, tree, path_state):
    """Return the rules indexed under the node addressed by ``path_state``.

    Args:
        tree: the tree being processed.
        path_state: a ``(path, state)`` pair; ``path`` selects a node of
            ``tree`` through ``tree_index``.

    Returns:
        The entries of ``self.rules`` whose positions are stored in
        ``self.index`` under the key (state, POS of the node, POS of its
        children).
    """
    path, state = path_state
    node = tree_index(tree, path)
    if isinstance(node, NLTKTree):
        # Only children that are trees themselves contribute to the key.
        children_pos = ' '.join(
            [get_top(child) for child in node if isinstance(child, NLTKTree)])
        node_pos = get_top(node)
    else:
        # A non-tree node stands for both its own POS and its children's.
        children_pos = node
        node_pos = node
    key = (state, node_pos, children_pos)
    return [self.rules[rule_id] for rule_id in self.index[key]]
def BuildTiburonRHS(tree, newstates, path=(), quote_tokens=True):
    """Serialize the right-hand side of a rule in Tiburon notation.

    The conversion:
      1. Converts terminals with ``ConvertTokenToTiburon`` (quoted when
         ``quote_tokens`` is true).
      2. Drops the leading '?' and the type of each variable:
         ?x0|NP becomes x0.
      3. Changes the bracketing:
         (NP (DT the) (NN house)) becomes NP(DT(the) NN(house)).
      4. Prefixes each variable with the state assigned to its path:
         (NP (DT ?x0|) ?x1|NN) with {(0,0): 'q1', (1,): 'q2'}
         becomes NP(DT(q1.x0) q2.x1).

    Args:
        tree: a tree or a leaf string.
        newstates: mapping from variable paths (tuples) to state names.
        path: path of ``tree`` within the whole right-hand side.
        quote_tokens: forwarded to ``ConvertTokenToTiburon`` as ``quote``.

    Returns:
        The serialized string.
    """
    if not IsString(tree):
        label = ConvertPOSToTiburon(get_top(tree))
        children = []
        for position, child in enumerate(tree):
            children.append(
                BuildTiburonRHS(child, newstates, path + (position,),
                                quote_tokens=quote_tokens))
        return label + '(' + ' '.join(children) + ')'

    if not IsVariable(tree):
        return ConvertTokenToTiburon(tree, quote=quote_tokens)

    assert tree.startswith('?')
    assert '|' in tree
    assert path in newstates, 'path {0} not in {1}'.format(
        path, newstates)
    # Keep only the variable name: the text between '?' and '|'.
    variable_name = tree[1:tree.index('|')]
    return newstates[path] + '.' + variable_name
def MakeDeletingRule(self):
    """Turn variable-free top-level LHS branches into deleting variables.

    It is not clear in general when lexicalized branches should be replaced
    by a deleting variable (it depends on the application). Here, every
    first-level branch of the LHS that contains no variable is replaced by
    a fresh variable '?xx<n>|' (followed by the top of the branch when the
    branch is not a string), but only when the RHS contains no leaf that is
    not a variable.

    The rule is left untouched when the LHS is a string, when the RHS has a
    non-variable leaf, or when the LHS has no variables at all.

    Returns:
        ``self``; ``self.lhs`` and ``self.lhs_vars_to_paths`` are modified
        in place when branches are replaced.
    """
    if IsString(self.lhs):
        return self
    if IsString(self.rhs):
        if not IsVariable(self.rhs):
            return self
    elif any(not IsVariable(leaf) for leaf in self.rhs.leaves()):
        return self

    # Indices of the first-level branches that lead to some variable.
    branches_with_vars = set(
        var_path[0] for var_path in self.lhs_vars_to_paths.values())
    if not branches_with_vars:
        return self

    # Fresh variables are numbered with a plain counter: the previous
    # generator was limited to 20 names and was advanced with the
    # Python-2-only ``.next()`` method.
    num_fresh = 0
    for i, branch in enumerate(self.lhs):
        if i in branches_with_vars:
            continue
        fresh_var = '?xx%d|' % num_fresh
        num_fresh += 1
        if not IsString(branch):
            fresh_var += get_top(branch)
        self.lhs[i] = fresh_var
        self.lhs_vars_to_paths[self.lhs[i]] = (i, )
    return self
def UnconvertAllPOSFromTiburon(tree):
    """Relabel every non-leaf node of ``tree`` via UnconvertPOSFromTiburon.

    The tree is modified in place and also returned.
    """
    leaf_positions = set(tree.treepositions('leaves'))
    for position in set(tree.treepositions()) - leaf_positions:
        node = tree[position]
        assert not IsString(node)
        node.set_label(UnconvertPOSFromTiburon(get_top(node)))
    return tree
def constituent2dcs(tree):
    '''Convert a constituent structure into a DCS tree.

    Returns a list with a single element:
      * non-tree input is returned unchanged;
      * a NUMBER tree (exactly two children) becomes a 'number' tree;
      * a DATE tree whose first child is a string becomes a 'date' tree
        whose children are the underscore-separated fields when all of them
        are integers, and the unsplit string otherwise;
      * an ID tree with two children becomes a tree labelled with the
        predicate (first child), 'COUNT' being lowercased;
      * any other tree with more than two children is wrapped in 'and';
      * everything else is returned unchanged.
    '''
    if not isinstance(tree, Tree):
        return [tree]
    top = get_top(tree)
    if top == 'NUMBER':
        assert len(tree) == 2
        return [Tree('number', tree[:])]
    if top == 'DATE':
        if not IsString(tree[0]):
            return [Tree(get_top(tree[0]),
                         flatten([constituent2dcs(t) for t in tree[1:]]))]
        # The tree holds a single string with the date fields joined by
        # underscores. We re-establish the list of fields.
        date_info = tree[0].split('_')
        try:
            # Validate eagerly: a bare map() is lazy on Python 3 and would
            # never raise here.
            for field in date_info:
                int(field)
        except ValueError:
            date_info = [tree[0]]
        return [Tree('date', date_info)]
    if top == 'ID' and len(tree) == 2:
        # The first child is the predicate. The rest are the arguments.
        predicate = get_top(tree[0])
        if predicate == 'COUNT':
            predicate = predicate.lower()
        return [Tree(predicate,
                     flatten([constituent2dcs(t) for t in tree[1:]]))]
    if len(tree) > 2:
        # A length greater than 2 is the only signal we have for "and".
        conjunction = Tree(
            'and', flatten([constituent2dcs(t) for t in tree[1:]]))
        return [Tree(get_top(tree[0]), [conjunction])]
    return [tree]
def MakeRuleIndex(self, rules):
    """Index rule positions by state and left-hand-side shape.

    Args:
        rules: sequence of rules exposing ``state`` and ``lhs``.

    Returns:
        A ``defaultdict(list)`` mapping (rule state, POS of the LHS root,
        space-separated POS of the LHS children) to the positions in
        ``rules`` of the rules with that key.
    """
    index = defaultdict(list)
    for rule_id, rule in enumerate(rules):
        lhs = rule.lhs
        if isinstance(lhs, NLTKTree):
            pieces = []
            for child in lhs:
                fields = get_top(child).split('|')
                if len(fields) == 1:
                    # No '|' in the top: use it as is.
                    pieces.append(fields[0].strip())
                elif fields[1] != '':
                    # 'name|TYPE': the type is what gets indexed. A top
                    # with an empty type contributes nothing.
                    pieces.append(fields[1].strip())
            children_pos = ' '.join(pieces).strip()
            root_pos = get_top(lhs)
        else:
            # A non-tree LHS stands for both its own POS and its children's.
            children_pos = lhs
            root_pos = lhs
        index[(rule.state, root_pos, children_pos)].append(rule_id)
    return index
def ObtainTreePattern(tree, path, subpaths):
    """Build the pattern rooted at ``path`` with variables at ``subpaths``.

    Args:
        tree: the whole tree.
        path: path of the subtree that becomes the pattern.
        subpaths: paths (relative to the root of ``tree``) of the nodes to
            replace by variables '?x<i>|' (followed by the top of the node
            when the node is a tree).

    Returns:
        A deep copy of the subtree when ``subpaths`` is empty; the single
        variable '?x0|...' when the first subpath is ``()`` or ``path``
        itself; otherwise a deep copy of the subtree with every subpath
        replaced by its variable.

    Raises:
        ValueError: if the subtree is a string but the subpaths point
            inside it.
    """
    subtree = tree_index(tree, path)
    if not subpaths:
        return deepcopy(subtree)

    subtree_is_tree = isinstance(subtree, NLTKTree)
    if subpaths[0] == () or path == subpaths[0]:
        # The whole subtree is replaced by one variable.
        if subtree_is_tree:
            return '?x0|' + get_top(subtree)
        return '?x0|'
    if not subtree_is_tree:
        # Raise a proper exception instance: raising a tuple loses the
        # message on Python 2 and is a TypeError on Python 3.
        raise ValueError(
            'String {0} cannot be indexed by {1}'.format(subtree, subpaths))

    depth_subtree = len(path)
    tree_pattern = deepcopy(subtree)
    for i, subpath in enumerate(subpaths):
        # Subpaths are relative to the whole tree; drop the prefix that
        # leads to the subtree.
        subpath_relative = subpath[depth_subtree:]
        branch = tree_index(tree, subpath)
        variable = '?x' + str(i) + '|'
        if isinstance(branch, NLTKTree):
            variable += get_top(branch)
        tree_pattern[subpath_relative] = variable
    return tree_pattern
def get_statements_from_date(ldcsc, var):
    """Build the FILTER strings that restrict ``var`` to the year of a DATE.

    Args:
        ldcsc: a DATE tree with exactly one child.
        var: the variable name to constrain.

    Returns:
        Two FILTER strings (``var`` >= year, ``var`` < year + 1), or an
        empty list when the child is not a string or when the text before
        its first underscore is not an integer.
    """
    assert get_top(ldcsc) == 'DATE' and len(ldcsc) == 1
    date_text = ldcsc[0]
    if not IsString(date_text):
        return []
    year_text = date_text.split('_')[0]
    try:
        year = int(year_text)
    except ValueError:
        return []
    lower_bound = \
        'FILTER (xsd:dateTime({0}) >= xsd:dateTime("{1}"^^xsd:datetime)) .'\
        .format(var, year)
    upper_bound = \
        'FILTER (xsd:dateTime({0}) < xsd:dateTime("{1}"^^xsd:datetime)) .'\
        .format(var, year + 1)
    return [lower_bound, upper_bound]
def BuildTiburonLHS(tree, quote_tokens=True):
    """Serialize the left-hand side of a rule in Tiburon notation.

    The conversion:
      1. Converts terminals with ``ConvertTokenToTiburon`` (quoted when
         ``quote_tokens`` is true).
      2. Renames variables: ?x0|NP -> x0:NP
      3. Changes the bracketing:
         (NP (DT the) (NN house)) -> NP(DT(the) NN(house))

    Args:
        tree: a tree or a leaf string.
        quote_tokens: forwarded to ``ConvertTokenToTiburon`` as ``quote``.

    Returns:
        The serialized string.
    """
    if not IsString(tree):
        label = ConvertPOSToTiburon(get_top(tree))
        children = [
            BuildTiburonLHS(child, quote_tokens=quote_tokens)
            for child in tree
        ]
        return label + '(' + ' '.join(children) + ')'
    if IsVariable(tree):
        return ConvertVarToTiburon(tree)
    return ConvertTokenToTiburon(tree, quote=quote_tokens)
def get_number_from_constituent(ldcsc):
    """Return the first child of a NUMBER constituent when it is a string.

    The placeholder '?n0' is returned instead when ``ldcsc`` is itself a
    string or when its first child is not a string.
    """
    assert get_top(ldcsc) == 'NUMBER'
    if not IsString(ldcsc) and IsString(ldcsc[0]):
        return ldcsc[0]
    return '?n0'
def dcs2constituent(dcs):
    '''Convert a DCS tree into a constituent structure.

    Returns a list of constituents. Nodes with an unexpected number of
    children are reported with ``logging.warning`` and contribute an empty
    list. An empty tree yields an empty list without any warning.
    '''
    if not isinstance(dcs, Tree):
        return [dcs]
    if len(dcs) == 0:
        # Malformed (childless) tree.
        return []

    label = dcs.label()

    def discard():
        # Report the malformed node and drop it from the output.
        logging.warning('Invalid {0} tree: {1}'.format(label, dcs))
        return []

    if label == 'and':
        return flatten(map(dcs2constituent, dcs))

    if label == 'count':
        if len(dcs) != 1:
            return discard()
        return [Tree("ID", ["COUNT"] + dcs2constituent(dcs[0]))]

    if label == 'date':
        if len(dcs) != 3:
            return discard()
        return [Tree("DATE", ['_'.join(dcs)])]

    if label == 'number':
        if len(dcs) != 2:
            return discard()
        return [Tree("NUMBER", flatten(map(dcs2constituent, dcs)))]

    if label == '':
        # An empty label must be the application of a lambda expression.
        if len(dcs) != 2:
            logging.warning('Invalid l-application tree: {1}'.format(label, dcs))
            return []
        return dcs2constituent(replaceVariable(dcs[0], dcs[1]))

    if label == 'var':
        # A lambda variable that was not substituted by any argument.
        # There is no known way to handle it, so it is removed either way;
        # only an unexpected shape is worth a warning.
        if len(dcs) != 1 or get_top(dcs[0]) != 'x':
            return discard()
        return []

    if label == 'lambda':
        # A lambda expression that could not be resolved: the abstraction
        # is dropped and only its body is converted.
        if len(dcs) != 2 or get_top(dcs[0]) != 'x':
            return discard()
        return dcs2constituent(dcs[1])

    # Any other label is a predicate with a single argument.
    if len(dcs) != 1:
        return discard()
    return [Tree("ID", [label] + dcs2constituent(dcs[0]))]