Example #1
      def subproc(pid, tasks, results, exits, lockTask):
        setting.runningMode = "mert"
        setting.load(["file_translation_input_tree","file_translation_input_dep","file_translation_output"])
        decoder = GentileDecoder()

        while True:
          lockTask.acquire()
          if not tasks.empty():
            task = tasks.get()
          else:
            task = None
          lockTask.release()
          if not task:
            exits.put(pid)
            return
          tid, lineTree, lineDep = task
          
          hyps = decoder.translateNBest(lineTree, lineDep)
          output = ""
          for hyp in hyps:
            line_output = " ||| ".join([str(tid),hyp.getTranslation(),
                                      " ".join([str(n) for n in hyp.getLambdas()])
                                     ])
            output += line_output + "\n"
          msg = "[%d] Got %d | %s" % (tid,len(hyps),hyps[0].getTranslation())
          results.put((tid, output, msg))
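
Workers like this one (and the one in Example #2 below) pull (tid, lineTree, lineDep) tasks from a shared queue under a lock, decode, and push results back. A minimal sketch of a parent process that could drive such a worker, assuming the subproc and setting shown above and multiprocessing primitives; the driver itself is an illustration, not the project's actual launcher:

    import time
    from multiprocessing import Process, Queue, Lock, cpu_count

    def runWorkers():
      linesTree = open(setting.file_translation_input_tree).readlines()
      linesDep = open(setting.file_translation_input_dep).read().split("\n\n")
      tasks, results, exits = Queue(), Queue(), Queue()
      lockTask = Lock()
      for tid in range(len(linesTree)):
        tasks.put((tid, linesTree[tid].strip(), linesDep[tid].strip()))
      workers = [Process(target=subproc, args=(pid, tasks, results, exits, lockTask))
                 for pid in range(cpu_count())]
      for w in workers:
        w.start()
      outputs, nExited = {}, 0
      while nExited < len(workers):
        # drain results first so workers never block on a full results queue
        while not results.empty():
          tid, output, msg = results.get()
          outputs[tid] = output
          print msg
        while not exits.empty():
          exits.get()
          nExited += 1
        time.sleep(0.1)
      # collect any results that arrived after the last exit notice
      while not results.empty():
        tid, output, msg = results.get()
        outputs[tid] = output
      # write translations back in the original sentence order
      foutput = open(setting.file_translation_output, "w")
      for tid in sorted(outputs):
        foutput.write(outputs[tid])
      foutput.close()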
Example #2
    def subproc(pid, tasks, results, exits, lockTask):
      setting.runningMode = "normal"
      setting.load(["file_translation_input_tree","file_translation_input_dep","file_translation_output","size_cube_pruning"])
      decoder = GentileDecoder()

      while True:
        lockTask.acquire()
        if not tasks.empty():
          task = tasks.get()
        else:
          task = None
        lockTask.release()
        if not task:
          exits.put(pid)
          return
        tid, lineTree, lineDep = task
        
        hyps = decoder.translateNBest(lineTree, lineDep)
        result = hyps[0].getTranslation()
        output = result + "\n"
        msgStream = StringIO.StringIO()
        hyps[0].trace(stream=msgStream)
        print >> msgStream, "[%d]" % tid, result
        
        msg = msgStream.getvalue()
        results.put((tid, output, msg))
Example #3
 def __init__(self):
   """
   Load rule table files.
   """
   setting.load(["rule_table_path","dispersion_tables"])
   print "[GentileRuleTable] Loading rule table handles ..."
   self.ntables = setting.dispersion_tables
   self.ruletables = {}
   for itable in range(setting.dispersion_tables):
     self.ruletables[itable] = open("%s/rules.final.%d" % (setting.rule_table_path, itable))
   print "[GentileRuleTable] Loading index tables ..."
   self.indextables = {}
   for itable in range(setting.dispersion_tables):
     map_index = {}
     path_table = "%s/index.final.%d" % (setting.rule_table_path, itable)
     file_index_table = open(path_table, "r")
     # each line maps a hex source-side hash to a byte offset in the rule file
     for line in file_index_table:
       pair = line.strip().split(" ")
       if len(pair) != 2:
         continue
       hash_src, pos = pair
       map_index[int(hash_src, 16)] = int(pos)
     self.indextables[itable] = map_index
     file_index_table.close()
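
The index tables built here map a hexadecimal source-side hash to a byte position, so rule lookup can seek straight into the matching rules.final.N file instead of scanning it. A sketch of how such a lookup might work, assuming the dispersion key is the hash modulo the number of tables; the real fetch method is not shown in these excerpts:

    def findRulesByHash(self, hash_src):
      # choose the dispersed table, then seek to the recorded byte offset
      itable = hash_src % self.ntables
      pos = self.indextables[itable].get(hash_src)
      if pos is None:
        return None
      ftable = self.ruletables[itable]
      ftable.seek(pos)
      # the line at this offset holds the rules for the hashed source side
      return ftable.readline().strip()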
Example #4
  def build_lex_stack(self,tree,ruletable,model):
    """
    Build a stack for saving first n best lexical translation hypothesis for each node
    this n is usually same as the size of cube pruning
    @type tree: DepTreeStruct
    @type ruletable: NovelDepStrRuleTable
    @type model
    @rtype: object
    """
    stack_lex = {}
    # for each node in the tree
    for id in tree.nodes:
      if id == 0: continue # skip the root node
      word = tree.nodes[id][0]
      # find lexical rules for this node's word, then sort them
      rules = ruletable.findByHeadAndSrcLen(word + ":0", 1)
      if len(rules) == 0:
        # no lexical rules found; fall back to the lexical translation
        # table and construct pseudo rules
        setting.load(["file_lexical_translation_table"])
        if word in self.lexicalTable:
          # pseudo rules for this word are already cached
          rules = self.lexicalTable[word]
        else:
          # not cached yet; scan the lexical translation table
          # and build pseudo rules
          print "[NovelDepStrDecoder] build pseudo lex rule for '%s'" % word
          lines = open(setting.file_lexical_translation_table).xreadlines()
          section_entered = False
          for line in lines:
            if line.startswith(word+" "):
              section_entered = True
              word_src,word_ref,prob = line.strip().split(" ")
              pseudo_rule = "%s:0 ||| %s ||| %s |||  ||| %s ||| %s %s %s %s 2.7182" \
                         % (word,word,word_ref,prob,prob,prob,prob,prob)
              rules.append(pseudo_rule)
            elif section_entered:
              # entries for a word are contiguous in the table, so once the
              # section ends there are no more rules to extract
              break
          if len(rules) == 0:
            # still nothing found; let the word translate to itself (it may be a number)
            pseudo_rule = "%s:0 ||| %s ||| %s |||  ||| 0.001 ||| 0.001 0.001 0.001 0.001 2.7182" \
                         % (word,word,word)
            rules.append(pseudo_rule)
          # pseudo rules built; cache them for reuse
          self.lexicalTable[word] = rules
      rules = model.sortRules(rules,setting.size_cube_pruning)
      # build hypotheses and append them to the lexical stack
      stack_lex[id] = []
      for rule in rules:
        stack_lex[id].append( NovelDepStrHypothesis(model,(id,tree),rule) )

    return stack_lex
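
To make the pseudo-rule format concrete: for a source word inu with target dog and lexical probability 0.5, the table-scanning branch above would produce the following line (a constructed illustration of the format, not an actual table entry):

    inu:0 ||| inu ||| dog |||  ||| 0.5 ||| 0.5 0.5 0.5 0.5 2.7182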
Example #5
 def __init__(self):
   """
   Initiate the language model
   initiate the model with weights in setting file
   """
   setting.load(["max_gram","file_lm","weight_word_penalty","weight_lm","weight_translate_costs"])
   self.maxGram = int(setting.max_gram)
   sys.stderr.write("[NovelDepStrModel] loading language model... \n")
   self.lm = LanguageModel(lm_file=setting.file_lm, n=setting.max_gram)
   self.weight_word_penalty = setting.weight_word_penalty
   self.weight_lm = setting.weight_lm
   self.weight_translate_costs = setting.weight_translate_costs
Example #6
 def __init__(self):
   """
   Initiate the language model
   initiate the model with weights in setting file
   """
   setting.load(["max_gram","file_lm","weights"])
   self.maxGram = int(setting.max_gram)
   sys.stderr.write("[GentileModel] Loading language model... \n")
   self.lm = LanguageModel(lm_file=setting.file_lm, n=setting.max_gram)
   self.weights = setting.weights
   self.lenWeights = len(self.weights)
   self.weightsForRules = self.weights[:-1]
   self.cacheLMProbs = {}
Example #7
 def translateNBest(self, data_tree, data_dep):
   """
   Translate in forest
   @param data_tree:
   @param data_dep:
   @return:
   """
   # Get pare tree.
   #self.model.cacheMode = False
   setting.load(["nbest", "hypothesis_cluster_limit", "head_phrases_limit"])
   tree = SenseTree(data_tree,data_dep)
   tree.rebuildTopNode()
   tree.appendXToTree()
   tree.upMergeAllConjNodes()
   tree.rebuildCommaNodes()
   tree.convertTags()
   tree.separateContiniousNonTerminals()
   tree.buildLevelMap()
   # Prepare for chiropractic process.
   treeForest = [tree]
   resultStack = []
   for tree in treeForest:
     resultStack.append(self.chiropracticTranslation(tree))
Example #8
"""
Gentile Sense-to-string Model.
Estimator for dispersed temperate rule tables.

This will create final rule tables with translation probabilities.

- Raphael 2012.9
"""

import sys, os, math

from abraham.setting import setting

setting.load(["rule_table_path", "dispersion_tables"])

class Estimator:
  """
  A Estimator to calulate rule probabilities then produce final rules.
  """
  nSplittingRuleTable = None
  pathTables = None
  mapTargetCount = None
  stackContextRules = None

  def __init__(self):
    self.mapTargetCount = {}
    self.nSplittingRuleTable = setting.dispersion_tables
    self.pathTables = setting.rule_table_path

  
  def processTargetCountTable(self,path_file):
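
The module docstring above says the estimator produces final rule tables with translation probabilities. The body of processTargetCountTable is truncated in this excerpt, but for rule tables of this kind the standard estimate is plain relative frequency, presumably what mapTargetCount is counting toward (an assumption, since the counting code is not shown):

    P(target | source) = count(source, target) / count(source)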
Example #9
  def translateNBestOLD(self,data_tree,data_dep):
    """
    Translate and return a N-best list
    @type data_tag: string
    @type data_dep: string
    @rtype: list of GentileHypothesis
    """
    # first, we need get the tree of input
    self.model.cacheMode = False
    setting.load(["nbest", "head_phrases_limit"])
    tree = SenseTree(data_tree,data_dep)
    tree.rebuildTopNode()
    tree.appendXToTree()
    tree.upMergeAllConjNodes()
    tree.rebuildCommaNodes()
    tree.convertTags()
    tree.separateContiniousNonTerminals()
    # tree.mergeContinuousNTs()
    fetcher = self.prepareRulesForTranslation(tree)
    # build lexical hypothesis stack
    # { id->[lexical hyp,] }
    # stack_lex = self.buildLexicalStack(fetcher)
    # { id->[lexical hyp,] }
    hypStacks = {}
    # for each fragment (whose head node is not a leaf), in bottom-up order,
    # use the matching rules and basic hypotheses (lexical or normal) to build
    # normal hypotheses for the fragment
    tree.buildLevelMap()
    cur_level = tree.getMaxLevel()
    # A dirty trick: save current sense tree to cross-module global variable.
    __builtin__.currentSenseTree = tree
    # start pruning
    self.model.cacheMode = True
    while cur_level > 0:
      # [head id,]
      nodes_cur_level = tree.getNodesByLevel(cur_level)
      if cur_level == 1:
        self.model.smode = True
      else:
        self.model.smode = False
      for node in nodes_cur_level:
        if node not in fetcher.joints:
          # only prune for joint nodes
          continue
        # get rules
        rules, sitesInvolved = fetcher.mapJointRules[node]
        # the fetched rules may come back in arbitrary order;
        # no need to sort them here (the pruning branch below sorts)
        if not rules:
          # No rules found, force to use CYK.
          rc = Reconstructor(self.ruletable, self.model,
                             tree, hypStacks, node)
          hyps = rc.parse()
        else:
          # Rules found, so run cube pruning.
          # sort rules
          rules = self.model.sortRules(rules)
          # now run cube pruning to get normal hypotheses for the current node
          hyps = separately_prune(self.model, node, rules, sitesInvolved, hypStacks)
        hypStacks[node] = hyps
        self.model.clearCache()
      # end of current node
      cur_level -= 1

    rootNode = tree.getRootNode()
    if rootNode not in hypStacks or len(hypStacks[rootNode])==0:
      # failed
      print "[GentileDecoder]","Translation Failed!!!"
      return []

    # end building normal hypothesis stack
    # hypStacks[rootNode][0].trace()

    return hypStacks[rootNode][:setting.nbest]
Example #10
import os
import sys

from abraham.treestruct import DepTreeStruct
from abraham.setting import setting
from gentile.tagconvertor import convert_tags_for_tokens

setting.load(["x_as_tag"])

PATTERN_SEPARATE_NTS = "NP NP,NP NP NP,NP VP,NP VBZ,NP NP VBZ,S VBZ,DT X".split(",")

class GeneralTree:
  """
  A class represents gernal tree structure.
  """
  nodes = None
  mapParent = None
  mapChildren = None
  root = None

  def __init__(self):
    """
    Initialize members.
    """
    self.nodes, self.mapParent, self.mapChildren = {}, {}, {}

  def node(self, id):
    """
    Get the node by given node id.
    """
    return self.nodes[id] if id in self.nodes else None
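
A toy usage of GeneralTree as defined so far, purely illustrative; real trees are produced by the sense-tree conversion code rather than built by hand:

    tree = GeneralTree()
    tree.nodes = {1: "S", 2: "NP", 3: "VP"}
    tree.root = 1
    tree.mapParent = {2: 1, 3: 1}
    tree.mapChildren = {1: [2, 3]}
    print tree.node(2)   # -> "NP"
    print tree.node(99)  # -> None (missing ids are tolerated)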
Example #11
  def translateStanfordNBest(self,data_tag,data_dep):
    """
    Translate and return a N-best list
    @type data_tag: string
    @type data_dep: string
    @rtype: list of NovelDepStrHypothesis
    """
    # first, we need get the tree of input
    setting.load(["nbest"])
    tree = DepTreeStruct(data_tag,data_dep)
    ruletable = self.ruletable
    model = self.model
    # build lexical hypothesis stack
    # { id->[lexical hyp,] }
    stack_lex = self.build_lex_stack(tree,ruletable,model)
    # { id->[lexical hyp,] }
    stack_normal = {}
    # for each fragment (whose head node is not a leaf), in bottom-up order,
    # use the matching rules and basic hypotheses (lexical or normal) to build
    # normal hypotheses for the fragment
    cur_level = tree.getMaxLevel()-1
    while cur_level > 0:
      # [head id,]
      nodes_cur_level = tree.getNodesByLevel(cur_level)
      for headid in nodes_cur_level:
        # only build normal hypotheses for internal nodes
        if tree.isLeafNode(headid): continue
        # build rule list
        cur_fragment = (headid,tree)
        rules = ruletable.findByFragment(cur_fragment)
        if len(rules) == 0:
          # no rules found; fall back to a pseudo rule
          rules = [ruletable.buildPsuedoRule(cur_fragment)]
          
        # [(rule,cost),...]
        stack_rules = model.sortRules(rules,setting.size_cube_pruning)
        # build hypothesis stacks for cube pruning:
        #   head node: lexical hypotheses
        #   internal nodes: normal hypotheses
        #   leaf nodes: lexical hypotheses
        # [[hyp,...],...]
        stacks_pruning = []
        # append lexical hypothesis stack for head node
        stacks_pruning.append(stack_lex[headid])
        # add other hypothesis stacks
        for nodeid in tree.getChildNodes(headid):
          if tree.isLeafNode(nodeid): # leaf node
            stacks_pruning.append(stack_lex[nodeid])
          else: # internal node
            assert nodeid in stack_normal
            stacks_pruning.append(stack_normal[nodeid])
        # now run cube pruning to get normal hypotheses for the current node
        pruner = AllInOneCubePruner(model,cur_fragment,stack_rules,stacks_pruning,setting.size_cube_pruning)
        list_hyps = pruner.prune()
        # sort pruned hyps
        stack_normal[headid] = list_hyps
      # end of current node
      cur_level -= 1

    if tree.headNodeId not in stack_normal or len(stack_normal[tree.headNodeId])==0:
      # failed
      print "NovelDepStrDecoder","Translation Failed!!!"
      return []

    # end building normal hypothesis stack
    stack_normal[tree.headNodeId][0].trace()

    return stack_normal[tree.headNodeId][:setting.nbest]
Example #12
import os
import sys
import tempfile

sys.path += ["%s/abraham" % os.path.dirname(os.path.abspath(__file__))]
from abraham.setting import setting
from gentile.decoder import GentileDecoder

if __name__ == "__main__":
  #sys.argv.append ("config.yaml")
  #sys.argv.append ("mert")
  arg_length = len(sys.argv)
  if arg_length == 1:
    # abraham.py
    print "usage : python abraham.py config.yaml"

  elif arg_length == 2:
    # abraham.py config.yaml
    setting.runningMode = "normal"
    setting.load(["file_translation_input_tree","file_translation_input_dep","file_translation_output","size_cube_pruning"])
    
    decoder = GentileDecoder()
    

    print "[Gentile]", "Interactive Mode"

    while True:
      sentence = raw_input("[INPUT]")

      sentence = sentence.strip()

      _, pathText = tempfile.mkstemp()
      _, pathCFG = tempfile.mkstemp()
      _, pathDep = tempfile.mkstemp()
Example #13
  the hot X1 went ||| SONO ATSUI X1 NAKU NATTA ||| NN ||| 0-0 1-1 ||| 0.125

- Raphael 2012.8
"""
import sys, os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import re
import math
from heapq import merge
from abraham.setting import setting
from sense import SenseTree, GeneralTree
import itertools



setting.load(["max_merge_nodes", "max_tokens"])

class Extractor:
  """
  Rule extractor for Gentile.
  """

  contentNodeSet = None
  sense = None
  """ @type: SenseTree """
  tree = None
  """ @type: GeneralTree """
  targets = None
  """ @type: list of string """
  mapAlignment = None
  """ @type: dict """
Example #14
"""
Gentile, sense tree-to-string model.
Rule extractor.

- Raphael 2012.8
"""
import sys, os
from abraham.setting import setting
from gentile.extractor import Extractor
from gentile.sense import SenseTree




#sys.argv.append("config.yaml")
setting.load(["file_source_tree","file_source_dep",
  "file_target","file_alignment","rule_table_path"])
if len(sys.argv) == 7:
  _,FILE_SOURCE_TREE,FILE_SOURCE_DEP,FILE_TARGET,FILE_ALIGNMENT,PATH_RULETABLES,_ = sys.argv

else:
  FILE_SOURCE_TREE = setting.file_source_tree
  FILE_SOURCE_DEP = setting.file_source_dep
  FILE_TARGET = setting.file_target
  FILE_ALIGNMENT = setting.file_alignment
  PATH_RULETABLES = setting.rule_table_path

linesTree = open(FILE_SOURCE_TREE).readlines()
linesDep = open(FILE_SOURCE_DEP).read().split("\n\n")
linesTarget = open(FILE_TARGET).readlines()
linesAlignment = open(FILE_ALIGNMENT).readlines()
print "[GENTILE] Extracting ..."
Example #15
import sys

from abraham.setting import setting
from gentile.decoder import GentileDecoder
from chiropractor.decoder import ChiropracticDecoder

if __name__ == "__main__":
  #sys.argv.append ("config.yaml")
  #sys.argv.append ("mert")
  arg_length = len(sys.argv)
  if arg_length == 1:
    # abraham.py
    print "usage : python abraham.py config.yaml"

  elif arg_length == 2:
    # abraham.py config.yaml
    setting.runningMode = "normal"
    setting.load(["enable_chiropractic","file_translation_input_tree","file_translation_input_dep","file_translation_output","size_cube_pruning"])
    linesDep = open(setting.file_translation_input_dep).read().split("\n\n")
    linesTree = open(setting.file_translation_input_tree).readlines()

    decoder = ChiropracticDecoder() if setting.enable_chiropractic else GentileDecoder()
    foutput = open(setting.file_translation_output, "w")

    print "[Abraham]","translate %d sentences..." % (len(linesTree))
    
    for i in range(len(linesTree)):
      lineTree = linesTree[i].strip()
      lineDep = linesDep[i].strip()
      hyps = decoder.translateNBest(lineTree, lineDep)
#      hyps[0].trace()
#      if len(hyps)==0:
#        print "[%d]" % i , "Translation Failed!!!"
Example #16
import sys, os
from abraham.setting import setting

setting.load(["rule_table_path"])

if len(sys.argv) != 2:
  print "python auto-do-all.py [config.yaml]"
  sys.exit()

_, cfg = sys.argv

if not os.path.exists(cfg):
  print "config not exist"
  sys.exit()

def run(cmd):
  print cmd
  os.system(cmd)

desc = cfg.replace("config.", "").replace(".yaml", "")

run("python extractor.gentile.py %s" % cfg)
# Filter extracted rules
ruletable = "%s/rules.extracted" % setting.rule_table_path
run("mv %s %s.withdtcomma" % (ruletable, ruletable))
run("python research/remove-dt-to-comma-rules.py %s.withdtcomma > %s" % (ruletable, ruletable))
run("python estimator.gentile.py --disperse %s" % cfg)
run("python estimator.gentile.py --estimate %s" % cfg)
run("python index.gentile.py %s" % cfg)
#run("python gentile.m.py %s > research/%s.log" % (cfg, desc))
run("date")