示例#1
0
def get_statements_from_ldcsc(ldcsc, var_counter=0):
    """Recursively collect the statements encoded by a tree.

    Non-Tree inputs and trees whose top label is 'DATE' or 'NUMBER'
    contribute nothing. When the first child is a string that is not an
    operator, it is used as a predicate (any '!' stripped from both ends)
    linking the variable '?x<var_counter>' with the second child:
      * a string second child is used as is,
      * a DATE subtree becomes '?d0' plus the date filter statements,
      * a NUMBER subtree becomes whatever get_number_from_constituent
        returns,
      * any other subtree becomes the variable '?x<var_counter + 1>'.
    A predicate starting with '!' swaps subject and object.

    The counter is passed by value, so all sibling subtrees are visited
    with the same counter.

    Args:
      ldcsc: tree (or leaf) to convert.
      var_counter: index used to name the variable for this node.

    Returns:
      A list with the statements found in this node and its subtrees.
    """
    if not isinstance(ldcsc, Tree) or get_top(ldcsc) in ('DATE', 'NUMBER'):
        return []
    collected = []
    head = ldcsc[0]
    head_is_string = IsString(head)
    if head_is_string and not is_operator(head):
        node_var = '?x' + str(var_counter)
        predicate = head.strip('!')
        argument = ldcsc[1]
        if IsString(argument):
            target = argument
        else:
            argument_label = get_top(argument)
            if argument_label == 'DATE':
                target = '?d0'
                collected.extend(get_statements_from_date(argument, '?d0'))
            elif argument_label == 'NUMBER':
                target = get_number_from_constituent(argument)
            else:
                target = '?x' + str(var_counter + 1)
        # A leading '!' marks a reversed relation.
        if head.startswith('!'):
            subject, obj = target, node_var
        else:
            subject, obj = node_var, target
        collected.append(Statement(subject, predicate, obj))
        var_counter += 1
    # A string head is the predicate itself, not a subtree to recurse into.
    first_subtree = 1 if head_is_string else 0
    for child in ldcsc[first_subtree:]:
        collected.extend(get_statements_from_ldcsc(child, var_counter))
    return collected
示例#2
0
 def GetRelevantRules(self, tree, path_state):
   """
   Returns the rules registered in self.index for the subtree of `tree`
   at the given path and state.

   The lookup key is (state, POS of the subtree, POS of its children).
   For a subtree that is not an NLTKTree (a leaf), the leaf itself is used
   in both POS positions. For an NLTKTree, the children POS is the
   space-joined list of the labels of the children that are NLTKTree.

   Args:
     tree: the tree that is indexed by the path.
     path_state: a (path, state) pair.

   Returns:
     A list with the elements of self.rules whose indices are stored under
     the key; an empty list if the key is not present.
   """
   path, state = path_state
   subtree = tree_index(tree, path)
   if isinstance(subtree, NLTKTree):
     tree_pos = get_top(subtree)
     tree_branches_pos = ' '.join(
       [get_top(t) for t in subtree if isinstance(t, NLTKTree)])
   else:
     tree_pos = subtree
     tree_branches_pos = subtree
   # Use .get() instead of subscripting: if the index is a defaultdict
   # (presumably the one built by MakeRuleIndex), subscripting a missing key
   # would insert an empty entry and make the index grow on every miss.
   rules_indices = self.index.get((state, tree_pos, tree_branches_pos), ())
   return [self.rules[i] for i in rules_indices]
示例#3
0
def BuildTiburonRHS(tree, newstates, path=(), quote_tokens=True):
    """
    Serialize the right-hand side of a rule into a Tiburon-style string.

    1. Terminals are converted with ConvertTokenToTiburon (quoted when
       quote_tokens is set).
    2. Variables lose their leading '?' and their type: ?x0|NP becomes x0.
    3. Bracketing changes: (NP (DT the) (NN house)) would
       change into NP(DT(the) NN(house)).
    4. States are applied to variables. (NP (DT ?x0|) ?x1|NN) and
       {(0,0): 'q1', (1,) : 'q2'} would change into
       NP(DT(q1.x0) q2.x1)

    Args:
      tree: a string (token or variable) or a tree.
      newstates: dictionary from the path of each variable to its state.
      path: position of `tree` within the whole right-hand side.
      quote_tokens: forwarded to ConvertTokenToTiburon as `quote`.

    Returns:
      The string representation of `tree`.
    """
    if not IsString(tree):
        label = ConvertPOSToTiburon(get_top(tree))
        rendered_children = [
            BuildTiburonRHS(
                child, newstates, path + (position,),
                quote_tokens=quote_tokens)
            for position, child in enumerate(tree)
        ]
        return label + '(' + ' '.join(rendered_children) + ')'
    if not IsVariable(tree):
        return ConvertTokenToTiburon(tree, quote=quote_tokens)
    # Variables look like ?name|type and need a state for their path.
    assert tree.startswith('?')
    assert '|' in tree
    assert path in newstates, 'path {0} not in {1}'.format(
        path, newstates)
    return newstates[path] + '.' + tree[1:tree.index('|')]
示例#4
0
 def MakeDeletingRule(self):
     """
     Replaces lexicalized level-1 branches of the LHS by fresh variables.

     It is not clear when lexicalized branches should be replaced by a
     deleting variable (it depends on the application). Here we replace
     fully lexicalized branches at level 1 by a deleting variable, only
     when the RHS does not contain any leaf that is not a variable.

     The rule is left untouched when:
       * the LHS is a string,
       * the RHS is a string that is not a variable,
       * the RHS is a tree with at least one leaf that is not a variable,
       * no LHS variable is registered in self.lhs_vars_to_paths.

     Otherwise, every level-1 branch of the LHS that does not contain a
     registered variable is overwritten in place with '?xx<k>|' (followed
     by the label of the branch if it is a tree), and the new variable is
     registered in self.lhs_vars_to_paths with path (i, ).

     Returns:
       self, possibly modified in place.
     """
     if IsString(self.lhs):
         return self
     if IsString(self.rhs):
         if not IsVariable(self.rhs):
             return self
     else:
         if any(not IsVariable(leaf) for leaf in self.rhs.leaves()):
             return self
     # First path component of every LHS variable: those level-1 branches
     # contain a variable and must be kept.
     lhs_paths_prefix_1 = set(
         p[0] for p in self.lhs_vars_to_paths.values())
     if not lhs_paths_prefix_1:
         return self
     # A plain counter names the fresh variables. It avoids xrange and
     # generator.next() (unavailable in Python 3) and the former limit of
     # 20 fresh variables per rule.
     num_new_variables = 0
     for i, branch in enumerate(self.lhs):
         if i in lhs_paths_prefix_1:
             continue
         new_variable = '?xx%d|' % num_new_variables
         num_new_variables += 1
         if not IsString(branch):
             # Keep the label of the replaced branch as the variable type.
             new_variable += get_top(branch)
         self.lhs[i] = new_variable
         self.lhs_vars_to_paths[new_variable] = (i, )
     return self
示例#5
0
def UnconvertAllPOSFromTiburon(tree):
    """Relabel, in place, every non-leaf position of `tree`.

    Each position that is not a leaf position gets its label replaced by
    the result of UnconvertPOSFromTiburon applied to its current label.

    Returns:
      The same `tree` object, after relabeling.
    """
    leaf_positions = set(tree.treepositions('leaves'))
    internal_positions = set(tree.treepositions()).difference(leaf_positions)
    for position in internal_positions:
        node = tree[position]
        assert not IsString(node)
        node.set_label(UnconvertPOSFromTiburon(get_top(node)))
    return tree
示例#6
0
def constituent2dcs(tree):
  '''Convert a constituent structure into a DCS tree.

  Returns a list of nodes:
    * a non-Tree input is returned as a single-element list,
    * NUMBER trees (two children) become a 'number' tree,
    * DATE trees become a 'date' tree (see below),
    * ID trees with two children become a tree labeled with the predicate
      (the label of the first child; 'COUNT' is lowercased) whose children
      are the converted arguments,
    * any other tree with more than two children is read as a conjunction:
      the converted arguments are grouped under an 'and' node,
    * everything else is returned unchanged in a single-element list.
  '''
  if not isinstance(tree, Tree):
    return [tree]
  label = get_top(tree)
  if label == 'NUMBER':
    assert len(tree) == 2
    return [Tree('number', tree[:])]
  if label == 'DATE':
    # tree contains a list with only one element, which is the date
    # joined with underscores. We re-establish the list.
    if IsString(tree[0]):
      date_info = tree[0].split('_')
      try:
        # Convert eagerly: a bare map() is lazy in Python 3 and would never
        # raise, silently skipping this validation.
        for field in date_info:
          int(field)
      except ValueError:
        # Not a purely numeric date: keep the original string as is.
        date_info = [tree[0]]
      return [Tree('date', date_info)]
    arguments = [constituent2dcs(child) for child in tree[1:]]
    return [Tree(get_top(tree[0]), flatten(arguments))]
  if label == 'ID' and len(tree) == 2:
    # The first child is the predicate. The rest are the arguments.
    predicate = get_top(tree[0])
    if predicate == 'COUNT':
      predicate = predicate.lower()
    arguments = [constituent2dcs(child) for child in tree[1:]]
    return [Tree(predicate, flatten(arguments))]
  if len(tree) > 2:
    # A length greater than 2 is the only signal we have for "and".
    arguments = [constituent2dcs(child) for child in tree[1:]]
    return [Tree(get_top(tree[0]), [Tree('and', flatten(arguments))])]
  return [tree]
示例#7
0
 def MakeRuleIndex(self, rules):
   """
   Builds a dictionary from (rule state, POS of the LHS root,
   POS of the LHS children) to the list of positions in `rules`
   of the rules that have that key.

   When the LHS is not an NLTKTree, the LHS itself is used for both POS
   fields. Otherwise, each child label is split on '|': a label without
   '|' contributes itself, a label with a non-empty second field (as in
   ?x0|NP) contributes that second field, and a label with an empty second
   field contributes nothing. Contributions are joined with spaces.
   """
   index_by_key = defaultdict(list)
   for rule_position, rule in enumerate(rules):
     lhs = rule.lhs
     if isinstance(lhs, NLTKTree):
       branch_tags = []
       for branch in lhs:
         fields = get_top(branch).split('|')
         if len(fields) == 1:
           branch_tags.append(fields[0].strip())
         elif fields[1] != '':
           branch_tags.append(fields[1].strip())
       lhs_branches_pos = ' '.join(branch_tags).strip()
       lhs_pos = get_top(lhs)
     else:
       lhs_branches_pos = lhs
       lhs_pos = lhs
     key = (rule.state, lhs_pos, lhs_branches_pos)
     index_by_key[key].append(rule_position)
   return index_by_key
示例#8
0
def ObtainTreePattern(tree, path, subpaths):
    """Build the pattern rooted at `path` with variables at `subpaths`.

    Args:
      tree: the whole tree.
      path: position (in `tree`) of the subtree the pattern is built from.
      subpaths: positions (in `tree`) that are replaced by variables
        ?x0|, ?x1|, ... in the order given. Variables that replace a tree
        carry its label as type (e.g. ?x0|NP).

    Returns:
      * a deep copy of the subtree if `subpaths` is empty,
      * the single variable '?x0|' (plus the subtree label, if the subtree
        is a tree) if the first subpath is () or equals `path`,
      * otherwise a deep copy of the subtree where each subpath has been
        replaced by its variable.

    Raises:
      ValueError: if the subtree is a string but the subpaths point inside
        of it.
    """
    subtree = tree_index(tree, path)
    if not subpaths:
        return deepcopy(subtree)
    subtree_is_tree = isinstance(subtree, NLTKTree)
    first_subpath = subpaths[0]
    if first_subpath == () or path == first_subpath:
        # The whole subtree is substituted by a single variable.
        if subtree_is_tree:
            return '?x0|' + get_top(subtree)
        return '?x0|'
    if not subtree_is_tree:
        # raise(ValueError, msg) would raise a tuple: the message is lost
        # in Python 2 and it is a TypeError in Python 3.
        raise ValueError(
            'String {0} cannot be indexed by {1}'.format(subtree, subpaths))
    depth_subtree = len(path)
    tree_pattern = deepcopy(subtree)
    for i, subpath in enumerate(subpaths):
        # Subpaths are relative to `tree`; make them relative to the subtree.
        subpath_relative = subpath[depth_subtree:]
        branch = tree_index(tree, subpath)
        variable = '?x' + str(i) + '|'
        if isinstance(branch, NLTKTree):
            variable += get_top(branch)
        tree_pattern[subpath_relative] = variable
    return tree_pattern
示例#9
0
def get_statements_from_date(ldcsc, var):
    """Build the two FILTER clauses that restrict `var` to one year.

    The input must be a DATE node with exactly one child. The year is the
    integer before the first underscore of that child.

    Args:
      ldcsc: DATE node.
      var: name of the variable to constrain.

    Returns:
      A list with two strings: `var` >= year and `var` < year + 1.
      An empty list if the child is not a string or the year is not an
      integer.
    """
    assert get_top(ldcsc) == 'DATE' and len(ldcsc) == 1
    date_token = ldcsc[0]
    if not IsString(date_token):
        return []
    year_field = date_token.split('_')[0]
    try:
        year = int(year_field)
    except ValueError:
        return []
    lower_bound = \
      'FILTER (xsd:dateTime({0}) >= xsd:dateTime("{1}"^^xsd:datetime)) .'
    upper_bound = \
      'FILTER (xsd:dateTime({0}) < xsd:dateTime("{1}"^^xsd:datetime)) .'
    return [lower_bound.format(var, year), upper_bound.format(var, year + 1)]
示例#10
0
def BuildTiburonLHS(tree, quote_tokens=True):
    """
    Serialize the left-hand side of a rule into a Tiburon-style string.

    1. Terminals are converted with ConvertTokenToTiburon (quoted when
       quote_tokens is set).
    2. Variables are converted with ConvertVarToTiburon
       (?x0|NP -> x0:NP).
    3. Bracketing changes:
       (NP (DT the) (NN house)) -> NP(DT(the) NN(house))

    Args:
      tree: a string (token or variable) or a tree.
      quote_tokens: forwarded to ConvertTokenToTiburon as `quote`.

    Returns:
      The string representation of `tree`.
    """
    if not IsString(tree):
        label = ConvertPOSToTiburon(get_top(tree))
        rendered_children = [
            BuildTiburonLHS(child, quote_tokens=quote_tokens)
            for child in tree
        ]
        return label + '(' + ' '.join(rendered_children) + ')'
    if IsVariable(tree):
        return ConvertVarToTiburon(tree)
    return ConvertTokenToTiburon(tree, quote=quote_tokens)
示例#11
0
def get_number_from_constituent(ldcsc):
    """Return the number held by a NUMBER constituent.

    The number is the first child when that child is a string. In any
    other case (the constituent itself is a string, or its first child is
    not a string) the placeholder variable '?n0' is returned.
    """
    assert get_top(ldcsc) == 'NUMBER'
    if not IsString(ldcsc):
        first_child = ldcsc[0]
        if IsString(first_child):
            return first_child
    return '?n0'
示例#12
0
def dcs2constituent(dcs):
    '''Convert a DCS tree into a list of constituent structures.

    Non-Tree inputs are returned in a single-element list; empty trees
    yield an empty list. Trees are dispatched on their label:
      * 'and': the converted children, flattened,
      * 'count': an ID tree with "COUNT" and the converted only child,
      * 'date': a DATE tree with the three children joined by '_',
      * 'number': a NUMBER tree with the two converted children,
      * '' (lambda application): the first child with its variable
        replaced by the second child (replaceVariable), then converted,
      * 'var': dropped (unsubstituted lambda variable),
      * 'lambda': unresolved lambda; only its body (second child) is
        converted,
      * any other label: an ID tree with the label and the converted
        only child.
    Trees that do not have the expected number of children (or the
    expected 'x' variable, for 'var' and 'lambda') are logged as warnings
    and yield an empty list.
    '''
    if not isinstance(dcs, Tree):
        return [dcs]
    arity = len(dcs)
    if arity == 0:
        # Malformed tree.
        return []
    label = dcs.label()

    def reject(template='Invalid {0} tree: {1}'):
        # Log the malformed tree and produce the "nothing" result.
        logging.warning(template.format(label, dcs))
        return []

    if label == 'and':
        return flatten(map(dcs2constituent, dcs))
    if label == 'count':
        if arity != 1:
            return reject()
        return [Tree("ID", ["COUNT"] + dcs2constituent(dcs[0]))]
    if label == 'date':
        if arity != 3:
            return reject()
        return [Tree("DATE", ['_'.join(dcs)])]
    if label == 'number':
        if arity != 2:
            return reject()
        return [Tree("NUMBER", flatten(map(dcs2constituent, dcs)))]
    if label == '':
        # Must be a lambda expression application.
        if arity != 2:
            return reject('Invalid l-application tree: {1}')
        return dcs2constituent(replaceVariable(dcs[0], dcs[1]))
    if label == 'var':
        # Variable of a lambda expression that has not been substituted by
        # any argument. It is not known how to handle it, so it is removed
        # (with a warning if it does not look as expected).
        if arity != 1 or get_top(dcs[0]) != 'x':
            reject()
        return []
    if label == 'lambda':
        # Lambda expression that could not be resolved. It is not known how
        # to handle it either: the lambda is removed and its body is kept.
        if arity != 2 or get_top(dcs[0]) != 'x':
            return reject()
        return dcs2constituent(dcs[1])
    if arity != 1:
        return reject()
    daughters = dcs2constituent(dcs[0])
    return [Tree("ID", [label] + daughters)]