Exemplo n.º 1
0
 def MakeDeletingRule(self):
     """
     Replace fully lexicalized level-1 LHS branches by deleting variables.

     If the LHS does not produce any leaf but RHS does, such rule can be
     considered as a leaf-deleting rule. It is not clear when lexicalized
     branches should be replaced by a deleting variable (it depends on the
     application). Here we replace fully lexicalized branches at level 1
     by a deleting variable, only when the RHS does not contain any leaf
     that is not a variable.

     The rule is modified in place (self.lhs and self.lhs_vars_to_paths)
     and returned. It is returned unchanged when:
       * the LHS is a string,
       * the RHS is a string that is not a variable,
       * the RHS is a tree with at least one non-variable leaf,
       * self.lhs_vars_to_paths holds no variable path.
     """
     import itertools

     if IsString(self.lhs):
         return self
     if IsString(self.rhs):
         if not IsVariable(self.rhs):
             return self
     elif any(not IsVariable(leaf) for leaf in self.rhs.leaves()):
         return self
     # Unbounded supply of fresh variable names: ?xx0|, ?xx1|, ...
     # (a fixed-size pool would raise StopIteration on wide LHS trees).
     fresh_variables = ('?xx%d|' % i for i in itertools.count())
     # Indices of the level-1 branches that contain at least one variable.
     lhs_paths_prefix_1 = set(
         p[0] for p in self.lhs_vars_to_paths.values())
     if not lhs_paths_prefix_1:
         return self
     for i, branch in enumerate(self.lhs):
         if i in lhs_paths_prefix_1:
             continue
         # Branch without variables: substitute it by a fresh variable,
         # typed with the branch's top label when the branch is a tree.
         new_variable = next(fresh_variables)
         if not IsString(branch):
             new_variable += get_top(branch)
         self.lhs[i] = new_variable
         self.lhs_vars_to_paths[new_variable] = (i, )
     return self
Exemplo n.º 2
0
def get_statements_from_ldcsc(ldcsc, var_counter=0):
    """
    Recursively collect the statements described by a constituent tree.

    Non-Tree inputs and trees whose top label is 'DATE' or 'NUMBER' yield
    an empty list. When the first child is a string that is not an
    operator, it is taken as a predicate (with any '!' stripped) linking
    the variable '?x<var_counter>' to the second child; a leading '!' on
    the predicate swaps subject and object. The remaining children are
    then processed recursively, all with the same (possibly incremented)
    counter.
    """
    if not isinstance(ldcsc, Tree) or get_top(ldcsc) in ('DATE', 'NUMBER'):
        return []
    collected = []
    head = ldcsc[0]
    head_is_string = IsString(head)
    if head_is_string and not is_operator(head):
        current_var = '?x' + str(var_counter)
        argument = ldcsc[1]
        if IsString(argument):
            other = argument
        else:
            argument_top = get_top(argument)
            if argument_top == 'DATE':
                other = '?d0'
                collected.extend(get_statements_from_date(argument, '?d0'))
            elif argument_top == 'NUMBER':
                other = get_number_from_constituent(argument)
            else:
                other = '?x' + str(var_counter + 1)
        relation = head.strip('!')
        if head.startswith('!'):
            # Reversed predicate: the argument acts as the subject.
            triple = Statement(other, relation, current_var)
        else:
            triple = Statement(current_var, relation, other)
        collected.append(triple)
        var_counter += 1
    # A string head has already been consumed as predicate or operator.
    first_child = 1 if head_is_string else 0
    for child in ldcsc[first_child:]:
        collected.extend(get_statements_from_ldcsc(child, var_counter))
    return collected
Exemplo n.º 3
0
def GetURIsFromRules(rules):
  """
  Gather the non-variable RHS symbols of all rules into a set.

  For a string RHS, the string itself is collected unless it is a variable.
  For any other RHS, every leaf that is not a variable is collected.
  """
  collected = set()
  for rule in rules:
    rhs = rule.rhs
    if IsString(rhs):
      if not IsVariable(rhs):
        collected.add(rhs)
    else:
      for leaf in rhs.leaves():
        if not IsVariable(leaf):
          collected.add(leaf)
  return collected
Exemplo n.º 4
0
 def GetSimilarity(self, tree1, tree2):
     """
     Return one Similarity when both trees have the same textual form.

     Strings are compared as they are; anything else is compared through
     its repr(). An empty list is returned when the two forms differ.
     """
     form1 = tree1 if IsString(tree1) else repr(tree1)
     form2 = tree2 if IsString(tree2) else repr(tree2)
     if not form1 == form2:
         return []
     return [Similarity(self.kCost, self.kDefaultState, tree1, tree2)]
Exemplo n.º 5
0
 def __repr__(self):
     """
     Multi-line description of the rule: state, lhs, rhs, newstates, weight.

     String sides are encoded as UTF-8; other sides are shown via repr().
     """
     if IsString(self.lhs):
         lhs_text = self.lhs.encode('utf-8')
     else:
         lhs_text = repr(self.lhs)
     if IsString(self.rhs):
         rhs_text = self.rhs.encode('utf-8')
     else:
         rhs_text = repr(self.rhs)
     template = ("<rule.\n  state: {0}\n  lhs: {1}\n  rhs: {2}\n"
                 "  newstates: {3}\n  weight: {4}>")
     return template.format(
         self.state, lhs_text, rhs_text, self.newstates, self.weight)
Exemplo n.º 6
0
 def GetSimilarity(self, tree1, tree2):
   """
   Score two trees by the relative difference of their subtree counts.

   A string counts as zero subtrees. The weight is
   |n1 - n2| / max(n1, n2), or 0.0 when both counts are zero. A
   single-element list with the 'nodes_difference' Similarity is returned.
   """
   size1 = tree1.GetNumSubtrees() if not IsString(tree1) else 0
   size2 = tree2.GetNumSubtrees() if not IsString(tree2) else 0
   if size1 == 0 and size2 == 0:
     # Avoid dividing by zero when neither side has subtrees.
     cost = 0.0
   else:
     cost = float(abs(size1 - size2)) / max(size1, size2)
   return [Similarity(cost, 'nodes_difference', tree1, tree2)]
Exemplo n.º 7
0
 def StringifyWithoutWeight(self):
     """
     Return (and cache in self.stringified) the rule text without weight.

     The text is only built when self.stringified is falsy. String sides
     are encoded as UTF-8; other sides are shown via repr().
     """
     if self.stringified:
         return self.stringified
     if IsString(self.lhs):
         lhs_text = self.lhs.encode('utf-8')
     else:
         lhs_text = repr(self.lhs)
     if IsString(self.rhs):
         rhs_text = self.rhs.encode('utf-8')
     else:
         rhs_text = repr(self.rhs)
     template = ("<rule.\n  state: {0}\n  lhs: {1}\n  rhs: {2}\n"
                 "  newstates: {3}>")
     self.stringified = template.format(
         self.state, lhs_text, rhs_text, self.newstates)
     return self.stringified
Exemplo n.º 8
0
 def GetVariables(self, tree):
     """
     List the variables of a tree, depending on the kind of input.

     * TreePattern: delegated to self.MakeVariablesFromTreePattern.
     * NLTKTree: variable names from variables_to_paths(), cut at the
       first '|'.
     * String: [tree] if it starts with '?x', otherwise [].
     * Anything else: None.
     """
     if isinstance(tree, TreePattern):
         return self.MakeVariablesFromTreePattern(tree)
     if isinstance(tree, NLTKTree):
         names = []
         for (var, path) in variables_to_paths(tree):
             names.append(var.split('|')[0])
         return names
     if IsString(tree):
         return [tree] if tree.startswith('?x') else []
     return None
Exemplo n.º 9
0
 def insert_cvt_if_needed(self, tree):
     """
     Wrap the tree in an '(ID !<cvt> ...)' node when its predicate has a CVT.

     The CVT is looked up with self.get_cvt_cached() for the main predicate
     of the tree. Without a CVT the tree is returned unchanged. Otherwise
     the wrapped expression is built as text and parsed by tree_or_string():
     a COUNT tree keeps COUNT outermost, and a tree whose first child is
     not a string has its children spliced directly under the new node.
     """
     cvt = self.get_cvt_cached(get_main_predicate_from_tree(tree))
     if not cvt:
         return tree
     if IsString(tree):
         wrapped = '(ID !{0} {1})'.format(cvt, tree)
     elif tree.label() == u'COUNT':
         wrapped = '(COUNT (ID !{0} {1}))'.format(cvt, tree[0])
     elif not IsString(tree[0]):
         children_text = ' '.join(map(str, tree))
         wrapped = '(ID !{0} {1})'.format(cvt, children_text)
     else:
         wrapped = '(ID !{0} {1})'.format(cvt, tree)
     return tree_or_string(wrapped)
Exemplo n.º 10
0
def _GetURIField(uri, field):
    """
    Dispatch the retrieval of one field of a URI (or word).

    * 'numFound': number of documents found for the URI (for no words at
      all when uri is '<total>'); 0 if the lookup raises ValueError.
    * 'uri_type|<arg>': GetURIType(uri, <arg>).
    * 'role' / 'text': _GetURIRole(uri) / _GetURIText(uri).
    * Any other field: _GetFieldFromURI(uri, field), or None on ValueError.

    Except for the first two cases, None is returned for non-string URIs.
    """
    if field == 'numFound':
        try:
            assert _IsConnectionAlive()
            if uri == '<total>':
                query_words = []
            else:
                query_words = [uri]
            return _GetNumDocsFound(query_words)
        except ValueError:
            return 0
    if field.startswith('uri_type'):
        return GetURIType(uri, field.split('|')[1])
    if not IsString(uri):
        return None
    if field == 'role':
        return _GetURIRole(uri)
    if field == 'text':
        return _GetURIText(uri)
    try:
        return _GetFieldFromURI(uri, field)
    except ValueError:
        return None
Exemplo n.º 11
0
def DecodeInputTree(wrtg, nbest, lambda_dcs_str_list):
    """
    Append to lambda_dcs_str_list the valid transductions of the wRTG.

    lambda_dcs_str_list is an output parameter: it receives the string
    form of every lambda-DCS tree whose query result is neither None nor
    listed in invalid_results. An output parameter is used so that partial
    lists can be retrieved in case of timeouts. Nothing is returned.

    Relies on the names cvt_inserter, query_manager and invalid_results
    being available in the enclosing scope.
    """
    for best_tree, optimal_weight in wrtg.GenerateNBestTreesMax(nbest):
        if cvt_inserter:
            best_tree = cvt_inserter.insert_cvt_if_needed(best_tree)
        if IsString(best_tree):
            constituent_str = best_tree
        else:
            constituent_str = best_tree.pprint(margin=10000)
        query_results = QueryLambdaDCSC(constituent_str, query_manager)
        logging.info('\nConstituent: {0}\nWeight: {1}'.format(
            constituent_str, optimal_weight))
        if query_results is None or query_results in invalid_results:
            continue
        lambda_dcs_str = ConvertConstituent2DCS(constituent_str)
        logging.info('Found. Weight: {0}\tTransduction: {1}'.format(
            optimal_weight, lambda_dcs_str))
        logging.info(u'Answer: {0}'.format(query_results))
        lambda_dcs_str_list.append(str(lambda_dcs_str))
Exemplo n.º 12
0
def GetBestValidDerivations(
  wrtg, cvt_inserter, nbest=1000, nvalid=100, query_manager=None):
  """
  Collect derivations of wRTG wrtg together with their constituent trees.

  The caches of wrtg are cleared first. Derivations are then read in the
  order produced by wrtg.ObtainDerivationsFromNT(). At most nbest
  derivations are explored, and collection stops as soon as nvalid pairs
  have been gathered.

  cvt_inserter and query_manager are kept in the signature for the sake
  of callers, but this implementation neither inserts CVTs nor issues
  queries: every explored derivation is accepted.

  Returns a list of (derivation, constituent) tuples. If no derivation is
  available (or nbest is not positive), returns an empty list.
  """
  valid_derivations = []
  wrtg.ClearCaches()
  for i, derivation in enumerate(wrtg.ObtainDerivationsFromNT()):
    if i >= nbest:
      break
    constituent, _ = TargetProjectionFromDerivation(derivation)
    valid_derivations.append((derivation, constituent))
    if len(valid_derivations) >= nvalid:
      break
  # No fallback to a "first derivation" is needed: every explored
  # derivation is appended, so the list is empty only when nothing was
  # explored, in which case there is nothing to fall back to.
  return valid_derivations
Exemplo n.º 13
0
 def fromldcsc(ldcsc, var_prefs=None):
     """
     Build a Query out of a lambda-DCS constituent tree.

     var_prefs is a list with the prefixes of the other query variable
     instantiations that we also want to retrieve. E.g. if ['p', 'r'], then
     SELECT DISTINCT ?x0 , ?p0, ?p1, ?r0, ?r1 WHERE { ...

     Returns None when ldcsc is a string that neither starts with '(' nor
     ends with ')', or when no statement can be obtained from the tree.
     Raises ValueError for any other input that is not a Tree.
     """
     if not isinstance(ldcsc, Tree):
         if IsString(ldcsc) and not ldcsc.startswith(
                 '(') and not ldcsc.endswith(')'):
             return None
         raise ValueError(
             'This method expects a Tree instance. Got type {0} for instance {1}'
             .format(type(ldcsc), ldcsc))
     try:
         statements = get_statements_from_ldcsc(ldcsc)
     except Exception:
         # Best effort: a malformed tree yields no query. Only ordinary
         # errors are absorbed, so KeyboardInterrupt and SystemExit still
         # propagate.
         logging.warning('Failed to get statements from l-dcsc: {0}'.format(
             str(ldcsc)))
         statements = []
     if not statements:
         return None
     operator = ldcsc[0] if is_operator(ldcsc[0]) else ""
     query_vars = get_query_vars(statements, var_prefs)
     query_str = build_query_str(statements, '?x0', query_vars, operator)
     query = Query(query_str)
     query.query_vars = query_vars
     query.ldcsc = ldcsc
     return query
Exemplo n.º 14
0
def BuildTiburonRHS(tree, newstates, path=(), quote_tokens=True):
    """
    Render a rule RHS in Tiburon notation.

    * Terminals go through ConvertTokenToTiburon (quoted if quote_tokens).
    * A variable at position `path` becomes '<state>.<name>', where the
      state is newstates[path] and the name is the variable without its
      leading '?' and without its '|type' suffix: with
      {(0,0): 'q1', (1,): 'q2'}, (NP (DT ?x0|) ?x1|NN) becomes
      NP(DT(q1.x0) q2.x1).
    * Bracketing changes from (NP (DT the) (NN house)) to
      NP(DT(the) NN(house)), labels going through ConvertPOSToTiburon.
    """
    if not IsString(tree):
        label = ConvertPOSToTiburon(get_top(tree))
        rendered_children = []
        for i, child in enumerate(tree):
            rendered_children.append(
                BuildTiburonRHS(
                    child, newstates, path + (i,), quote_tokens=quote_tokens))
        return label + '(' + ' '.join(rendered_children) + ')'
    if not IsVariable(tree):
        return ConvertTokenToTiburon(tree, quote=quote_tokens)
    assert tree.startswith('?')
    assert '|' in tree
    assert path in newstates, 'path {0} not in {1}'.format(path, newstates)
    variable_name = tree[1:tree.index('|')]
    return newstates[path] + '.' + variable_name
Exemplo n.º 15
0
def is_operator(op):
    """Tell whether op is one of the operator names COUNT, MAX or MIN.

    Trees are never operators; any other input must be a string.
    """
    if isinstance(op, Tree):
        return False
    assert IsString(op)
    return op in ("COUNT", "MAX", "MIN")
Exemplo n.º 16
0
def constituent2dcs(tree):
  '''
  Convert a constituent structure into a DCS tree.

  Always returns a list with a single element:
    * a non-Tree input is returned as is;
    * NUMBER (exactly two children) becomes a 'number' tree with the same
      children;
    * DATE with a string child becomes a 'date' tree whose children are
      the underscore-separated fields when all of them are integers, or
      the original string otherwise;
    * DATE with a tree child, and ID with two children, become a tree
      labelled with the top of the first child over the converted
      remaining children (a COUNT label is lower-cased for ID);
    * any other tree with more than two children becomes the top of the
      first child over a single 'and' node;
    * anything else is returned unchanged.
  '''
  if not isinstance(tree, Tree):
    return [tree]
  top = get_top(tree)
  if top == 'NUMBER':
    assert len(tree) == 2
    return [Tree('number', tree[:])]
  if top == 'DATE':
    # tree contains a list with only one element, which is the data
    # joined with underscores. We re-establish the list.
    if IsString(tree[0]):
      date_info = tree[0].split('_')
      try:
        # Explicit loop: validation must happen eagerly (map() is lazy on
        # Python 3 and would never raise here).
        for date_field in date_info:
          int(date_field)
      except ValueError:
        date_info = [tree[0]]
      return [Tree('date', date_info)]
    return [Tree(get_top(tree[0]),
                 flatten([constituent2dcs(t) for t in tree[1:]]))]
  if top == 'ID' and len(tree) == 2:
    # The first child is the predicate. The rest are the arguments.
    predicate = get_top(tree[0])
    if predicate == 'COUNT':
      predicate = predicate.lower()
    return [Tree(predicate, flatten([constituent2dcs(t) for t in tree[1:]]))]
  if len(tree) > 2:
    # A length greater than 2 is the only signal we have for "and".
    conjunction = Tree('and', flatten([constituent2dcs(t) for t in tree[1:]]))
    return [Tree(get_top(tree[0]), [conjunction])]
  return [tree]
Exemplo n.º 17
0
def UnconvertAllPOSFromTiburon(tree):
    """
    Relabel, in place, every non-leaf node with UnconvertPOSFromTiburon.

    Non-leaf positions are all tree positions minus the leaf positions.
    The same (mutated) tree is returned.
    """
    leaf_positions = set(tree.treepositions('leaves'))
    inner_positions = set(tree.treepositions()) - leaf_positions
    for position in inner_positions:
        node = tree[position]
        assert not IsString(node)
        node.set_label(UnconvertPOSFromTiburon(get_top(node)))
    return tree
Exemplo n.º 18
0
def GetLeavePositions(tree):
  """
  Return the positions of the leaves that do not start with '?x'.

  A bare string counts as a single leaf at position ().
  """
  if IsString(tree):
    return [] if tree.startswith(u'?x') else [()]
  kept_positions = []
  for position in tree.treepositions('leaves'):
    if not tree[position].startswith(u'?x'):
      kept_positions.append(position)
  return kept_positions
Exemplo n.º 19
0
def QueryLambdaDCSC(ldcsc_str, query_manager=None):
    """
    Run the query described by a constituent string and return its results.

    The string is parsed with tree_or_string() and turned into a query by
    Query.fromldcsc(). The first element of every result row is returned,
    or an empty list when no query could be built. query_manager_global
    is used when no query_manager is given.
    """
    assert IsString(ldcsc_str)
    if query_manager is None:
        manager = query_manager_global
    else:
        manager = query_manager
    query = Query.fromldcsc(tree_or_string(ldcsc_str))
    if query is None:
        return []
    return [row[0] for row in query.get_results(manager)]
Exemplo n.º 20
0
 def rule_meets_conds(self, rule, conds):
     """
     Check that the rule satisfies every condition in conds.

     A condition starting with 'lhs:' is checked against rule.lhs, any
     other against rule.rhs. A condition ending in 'is_var' requires the
     side to be a variable; one ending in 'is_str' requires it to be a
     string. With no conditions the rule is accepted.
     """
     for cond in conds or ():
         side = rule.lhs if cond.startswith('lhs:') else rule.rhs
         fails_var = cond.endswith('is_var') and not IsVariable(side)
         if fails_var:
             return False
         fails_str = cond.endswith('is_str') and not IsString(side)
         if fails_str:
             return False
     return True
Exemplo n.º 21
0
def GetNewstatesFromRHSInTiburon(rhs_str):
    """
    Map the paths of the state variables of a Tiburon RHS to their states.

    The RHS string is converted with TiburonToStanford() and parsed with
    tree_or_string(). For every leaf recognized by
    IsTiburonStateVariable(), the text before its first '.' is stored as
    the state under the path of the leaf. A bare state variable is stored
    under the empty path ().
    """
    rhs_nltk = tree_or_string(TiburonToStanford(rhs_str))
    if IsString(rhs_nltk):
        if IsTiburonStateVariable(rhs_nltk):
            return {(): rhs_nltk[:rhs_nltk.index('.')]}
        return {}
    states_by_path = {}
    for path in rhs_nltk.treepositions('leaves'):
        leaf = rhs_nltk[path]
        if IsTiburonStateVariable(leaf):
            states_by_path[path] = leaf[:leaf.index('.')]
    return states_by_path
Exemplo n.º 22
0
def get_entity_label(entity):
  """
  Look up the fb:type.object.name label of an entity.

  Entities containing a space are returned as they are, and so are
  entities for which the query gives no result. When several labels are
  found, a warning is logged and the first one is returned.
  """
  assert IsString(entity)
  if ' ' in entity:
    return entity
  name_query = u'(ID !fb:type.object.name <{0}>)'.format(entity)
  labels = QueryLambdaDCSC(name_query)
  if not labels:
    return entity
  if len(labels) > 1:
    message = u'More than one label results for entity {0} = {1}'.format(
      entity, ', '.join(labels))
    logging.warning(message)
  return labels[0]
Exemplo n.º 23
0
def ConvertConstituent2DCS(constituent_tree):
  """
  Wrapper for constituent2dcs that also accepts a tree given as a string.

  A string input is first parsed with tree_or_string(). constituent2dcs
  must return a one-element list; that element is converted to a string
  and parsed again with tree_or_string(), and the result is returned.
  """
  if IsString(constituent_tree):
    parsed = tree_or_string(constituent_tree)
  else:
    parsed = constituent_tree
  fragments = constituent2dcs(parsed)
  assert isinstance(fragments, list) and len(fragments) == 1
  return tree_or_string(str(fragments[0]))
Exemplo n.º 24
0
def LoadAlignments(alignment_fname):
    """
    Read a UTF-8 file made of consecutive three-line records:
      src_tree
      trg_tree
      alignment
    and return a dictionary indexed by the tuple
    (src_tree_str, trg_tree_str), whose values are Alignment objects built
    from the alignment line and the leaves of both trees (a tree that
    parses to a plain string is its own single leaf).

    The number of lines in the file must be a multiple of 3.
    """
    with codecs.open(alignment_fname, 'r', 'utf-8') as fin:
        lines = fin.readlines()
    assert len(lines) % 3 == 0, \
        'Lines in {0} are not a multiple of 3.'.format(alignment_fname)
    alignments = {}
    for start in range(0, len(lines), 3):
        src_tree_str = lines[start].strip()
        src_tree = tree_or_string(src_tree_str)
        if IsString(src_tree):
            src_leaves = [src_tree]
        else:
            src_leaves = src_tree.leaves()
        trg_tree_str = lines[start + 1].strip()
        trg_tree = tree_or_string(trg_tree_str)
        if IsString(trg_tree):
            trg_leaves = [trg_tree]
        else:
            trg_leaves = trg_tree.leaves()
        alignment_str = lines[start + 2].strip()
        alignments[(src_tree_str, trg_tree_str)] = Alignment(
            alignment_str, src_leaves, trg_leaves)
    return alignments
Exemplo n.º 25
0
def get_statements_from_date(ldcsc, var):
    """
    Build the two FILTER clauses that restrict var to the year of a DATE.

    ldcsc must be a DATE tree with exactly one child. The year is the
    integer before the first '_' of that child. An empty list is returned
    when the child is not a string or the year is not an integer.
    """
    assert get_top(ldcsc) == 'DATE' and len(ldcsc) == 1
    date_leaf = ldcsc[0]
    if not IsString(date_leaf):
        return []
    try:
        year = int(date_leaf.split('_')[0])
    except ValueError:
        return []
    lower_bound = \
        'FILTER (xsd:dateTime({0}) >= xsd:dateTime("{1}"^^xsd:datetime)) .'\
        .format(var, year)
    upper_bound = \
        'FILTER (xsd:dateTime({0}) < xsd:dateTime("{1}"^^xsd:datetime)) .'\
        .format(var, year + 1)
    return [lower_bound, upper_bound]
Exemplo n.º 26
0
def get_main_predicate_from_tree(tree):
    """
    Given a constituent representation of a sparql query, return its main
    predicate (as in lambda-DCS): the left-most leaf, or the leaf right
    after it when the left-most leaf is "count" (in any letter case).
    A bare string is its own predicate.
    """
    if IsString(tree):
        return tree
    leaves = tree.leaves()
    assert leaves
    leftmost = leaves[0]
    if leftmost.lower() == 'count':
        return leaves[1]
    return leftmost
Exemplo n.º 27
0
def GetTreePattern(tree, subpaths):
    """
    Convert a rule LHS or RHS into a TreePattern.

    The tree of the pattern is the LHS or RHS itself, and its root path is
    always (), since the level at which the rule was extracted (or is
    being applied) is not known here. The subpaths of the pattern are the
    given relative paths of the variables, sorted. For a bare string the
    given subpaths are ignored: [()] is used for a variable and [] for
    any other string.
    """
    root_path = ()
    if not IsString(tree):
        return TreePattern(tree, root_path, sorted(subpaths))
    own_subpaths = [()] if IsVariable(tree) else []
    return TreePattern(tree, root_path, own_subpaths)
Exemplo n.º 28
0
def get_query_vars(statements, prefixes):
    """
    Return, sorted and without duplicates, the variables of the statements
    that start with any of the given prefixes.

    String statements are skipped; for the others, the subj, rel and obj
    fields accepted by is_var() are candidates. A prefixes value of None
    is treated as no prefixes, which yields an empty list.
    """
    candidates = []
    for statement in statements:
        if IsString(statement):
            continue
        for term in (statement.subj, statement.rel, statement.obj):
            if is_var(term):
                candidates.append(term)
    wanted = [] if prefixes is None else prefixes
    selected = set()
    for candidate in candidates:
        for prefix in wanted:
            if candidate.startswith(prefix):
                selected.add(candidate)
    return sorted(selected)
Exemplo n.º 29
0
def BuildTiburonLHS(tree, quote_tokens=True):
    """
    Render a rule LHS in Tiburon notation.

    * Terminals go through ConvertTokenToTiburon (quoted if quote_tokens).
    * Variables go through ConvertVarToTiburon (?x0|NP -> x0:NP).
    * Bracketing changes from (NP (DT the) (NN house)) to
      NP(DT(the) NN(house)), labels going through ConvertPOSToTiburon.
    """
    if not IsString(tree):
        label = ConvertPOSToTiburon(get_top(tree))
        rendered_children = []
        for child in tree:
            rendered_children.append(
                BuildTiburonLHS(child, quote_tokens=quote_tokens))
        return label + '(' + ' '.join(rendered_children) + ')'
    if IsVariable(tree):
        return ConvertVarToTiburon(tree)
    return ConvertTokenToTiburon(tree, quote=quote_tokens)
Exemplo n.º 30
0
def get_alignment(pair_tt, entities_lex, predicates_lex):
    """
    Align every leaf of the target tree to the leaves of the source tree.

    @pair_tt is a tuple (src_tree, trg_tree); both elements are parsed
    with tree_or_string(). A tree that parses to a plain string is treated
    as its own single leaf, on either side.
    @entities_lex and @predicates_lex are passed unchanged to align().

    Returns the result of fix_unaligned() applied to the list holding one
    align() result per target leaf, in target-leaf order.
    """
    assert len(pair_tt) == 2
    src_tree, trg_tree = map(tree_or_string, pair_tt)
    # A source that is a bare string has no .leaves(); handle it the same
    # way as the target side.
    src_leaves = [src_tree] if IsString(src_tree) else src_tree.leaves()
    trg_leaves = [trg_tree] if IsString(trg_tree) else trg_tree.leaves()
    # This is a list of lists. Each list will have source word indices.
    # NOTE(review): the element shape is inferred from the original
    # comment; confirm against align().
    alignments = [
        align(trg_leaf, src_leaves, entities_lex, predicates_lex)
        for trg_leaf in trg_leaves
    ]
    return fix_unaligned(alignments, trg_leaves)