Пример #1
0
def getbasenps(pospath, pennpath):
    """Print the base-NP index spans of every sentence read from ``pospath``.

    Sentences from ``posio.posread(pospath)`` are walked in lockstep with the
    symbol stream from ``reader.readsymbols(pennpath)``.  Round and curly
    brackets in each symbol are rewritten to -LRB-/-RRB-/-LCB-/-RCB- before
    the symbol is used.

    For each token, symbols are consumed until one is found whose text before
    its last '/' equals the token's first field.  Every symbol skipped on the
    way is echoed to stdout (preceded, once per token, by the sentence and
    the running sentence counter).  A skipped '[' opens a span at the current
    token index; a skipped ']' closes the most recently opened span.  A span
    still open when the sentence ends is closed at the last token index.

    One list of ``[start, end]`` pairs is printed per sentence.
    """
    symbols = reader.readsymbols(pennpath)

    def next_symbol():
        # Single place for the bracket escaping applied to every symbol read.
        raw = next(symbols)
        return (raw.replace('(', '-LRB-').replace(')', '-RRB-')
                   .replace('{', '-LCB-').replace('}', '-RCB-'))

    # Sentence counter shown in the debug output.
    # NOTE(review): the start value 9 is kept from the original; its reason
    # is not visible here.
    n = 9
    for possent in posio.posread(pospath):
        # [start_index, end_index] pairs; end_index is None while a span is open.
        base_nps = []
        for index, token in enumerate(possent):
            symbol = next_symbol()
            # Reset per token so the sentence header is printed at most once
            # for each token that has skipped symbols.
            lastn = 0
            while symbol[:symbol.rfind('/')] != token[0]:
                if lastn != n:
                    posio.posprint(possent)
                    print(n)
                    lastn = n
                print(symbol)
                if symbol == '[':
                    # A new span may only start once the previous one is closed.
                    assert not base_nps or base_nps[-1][1] is not None
                    base_nps.append([index, None])
                elif symbol == ']':
                    # A closing bracket with no span recorded is ignored.
                    if base_nps:
                        assert base_nps[-1][1] is None
                        base_nps[-1][1] = index
                symbol = next_symbol()
        # ``index`` still holds the last token index here; base_nps can only
        # be non-empty if the sentence had at least one token.
        if base_nps and base_nps[-1][1] is None:
            base_nps[-1][1] = index
        assert not base_nps or base_nps[-1][1] is not None
        print(base_nps)
        n += 1
Пример #2
0
def randompos(path, cutoff):
    """Print the sentences of ``path`` with randomly chosen tags overwritten.

    For every two-field entry of every sentence yielded by
    ``posio.posread(path)``, a fresh ``random.random()`` draw is compared
    with ``cutoff``; when the draw is smaller, the entry's second field is
    replaced by the literal '_NONE-'.  Each sentence is then written with
    ``posio.posprint``.
    """
    for sentence in posio.posread(path):
        # Unpacking keeps the requirement that every entry has two fields.
        for position, (_first, _second) in enumerate(sentence):
            if random.random() < cutoff:
                sentence[position][1] = '_NONE-'
        posio.posprint(sentence)
Пример #3
0
def tagdict(path, dDict):
    """Accumulate nested occurrence counts from ``path`` into ``dDict``.

    For every entry of every sentence yielded by ``posio.posread(path)``,
    ``dDict[entry[0]][entry[1]]`` is incremented, creating the inner dict
    and the counter on first sight.  ``dDict`` is updated in place; nothing
    is returned.
    """
    import posio
    for sentence in posio.posread(path):
        for entry in sentence:
            # Create the per-key table first, then bump the inner counter.
            counts = dDict.setdefault(entry[0], {})
            inner_key = entry[1]
            counts[inner_key] = counts.get(inner_key, 0) + 1
Пример #4
0
def mergeconllpos(conll_path, pos_path):
    """Print the sentences of ``conll_path`` with field 4 taken from ``pos_path``.

    Sentences from ``depio.depread(conll_path)`` are paired, in order, with
    sentences from ``posio.posread(pos_path)``.  Both must have the same
    length, and field 1 of each dependency row must equal field 0 of the
    matching POS entry (both checked with ``assert``).  Field 4 of the row is
    then overwritten with field 1 of the POS entry and the sentence is
    written with ``depio.depprint``.
    """
    tagged_sentences = posio.posread(pos_path)
    for dep_sentence in depio.depread(conll_path):
        tagged = tagged_sentences.next()
        assert len(dep_sentence) == len(tagged)
        for position, row in enumerate(dep_sentence):
            entry = tagged[position]
            assert row[1] == entry[0]
            row[4] = entry[1]
        depio.depprint(dep_sentence)
Пример #5
0
def mergeconllpos(conll_path, pos_path):
    """Copy field 1 of each POS entry into field 4 of the matching CoNLL row.

    Reads ``depio.depread(conll_path)`` and ``posio.posread(pos_path)`` in
    lockstep.  Asserts that paired sentences have equal length and that
    row[1] equals the POS entry's field 0, then prints every updated
    sentence with ``depio.depprint``.
    """
    pos_stream = posio.posread(pos_path)
    for conll_sent in depio.depread(conll_path):
        pos_sent = pos_stream.next()
        size = len(conll_sent)
        assert size == len(pos_sent)
        for k in range(size):
            row, entry = conll_sent[k], pos_sent[k]
            assert row[1] == entry[0]
            row[4] = entry[1]
        depio.depprint(conll_sent)
Пример #6
0
def updatepos(path, pospath, sSep):
    """Print each tree of ``path`` with token names taken from ``pospath``.

    Every line of ``path`` is loaded into a fresh
    ``binarize.CBinarizedTreeNode`` and paired with the next sentence from
    ``posio.posread(pospath, sSep)``.  Nodes whose ``type`` is 'token' get
    their ``name`` replaced by the second field of the next unconsumed
    sentence entry (left child before right child); the tree is then
    printed.

    Raises whatever ``next()`` raises when the POS source runs out before
    the tree file does (StopIteration for a plain iterator).
    """
    def updatenode(node, sent):
        # Each token node consumes the front entry of ``sent``.
        if node.type == 'token':
            node.name = sent.pop(0)[1]
        else:
            updatenode(node.left_child, sent)
            if node.right_child:
                updatenode(node.right_child, sent)

    # The context manager closes the tree file even if a line fails to load;
    # the original left the handle open and shadowed the builtin ``file``.
    with open(path) as tree_file:
        pos_it = posio.posread(pospath, sSep)
        for line in tree_file:
            pos = next(pos_it)
            srcnode = binarize.CBinarizedTreeNode()
            srcnode.load(line)
            updatenode(srcnode, pos)
            print(srcnode)
Пример #7
0
def updatepos(path, pospath, sSep, output):
    """Write each tree of ``path`` to ``output`` with token names from ``pospath``.

    ``path`` is read as UTF-8, one tree per line, each loaded into a fresh
    ``binarize.CBinarizedTreeNode`` and paired with the next sentence from
    ``posio.posread(pospath, sSep)``.  Nodes whose ``type`` is 'token' get
    their ``name`` replaced by the second field of the next unconsumed
    sentence entry (left child before right child).  The result of
    ``srcnode.utf8print()`` plus a newline is written to ``output``.

    Raises whatever ``next()`` raises when the POS source runs out before
    the tree file does (StopIteration for a plain iterator).
    """
    def updatenode(node, sent):
        # Each token node consumes the front entry of ``sent``.
        if node.type == 'token':
            node.name = sent.pop(0)[1]
        else:
            updatenode(node.left_child, sent)
            if node.right_child:
                updatenode(node.right_child, sent)

    # The context manager closes the tree file even if a line fails to load;
    # the original left the handle open and shadowed the builtin ``file``.
    with codecs.open(path, encoding='utf-8') as tree_file:
        pos_it = posio.posread(pospath, sSep)
        for line in tree_file:
            pos = next(pos_it)
            srcnode = binarize.CBinarizedTreeNode()
            srcnode.load(line)
            updatenode(srcnode, pos)
            output.write(srcnode.utf8print() + "\n")
Пример #8
0
def updatepos(path, pospath, sSep):
    """Print the trees in ``path`` after renaming their token nodes.

    Lines of ``path`` and sentences of ``posio.posread(pospath, sSep)`` are
    consumed pairwise.  Each line is loaded into a new
    ``binarize.CBinarizedTreeNode``; walking left child then right child,
    every node of ``type`` 'token' takes its ``name`` from the second field
    of the next sentence entry.  The updated tree is printed.

    Raises whatever ``next()`` raises when the POS source is shorter than
    the tree file (StopIteration for a plain iterator).
    """
    def updatenode(node, sent):
        if node.type == 'token':
            # Consume the front entry so the next token node sees the next one.
            node.name = sent.pop(0)[1]
        else:
            updatenode(node.left_child, sent)
            if node.right_child:
                updatenode(node.right_child, sent)

    # ``with`` guarantees the handle is closed; the original never closed it
    # and bound it to the builtin name ``file``.
    with open(path) as tree_lines:
        pos_it = posio.posread(pospath, sSep)
        for line in tree_lines:
            pos = next(pos_it)
            srcnode = binarize.CBinarizedTreeNode()
            srcnode.load(line)
            updatenode(srcnode, pos)
            print(srcnode)
Пример #9
0
def updatepos(path, pospath, sSep, output):
    """Write the trees in ``path`` to ``output`` after renaming token nodes.

    ``path`` is decoded as UTF-8.  Its lines and the sentences of
    ``posio.posread(pospath, sSep)`` are consumed pairwise.  Each line is
    loaded into a new ``binarize.CBinarizedTreeNode``; walking left child
    then right child, every node of ``type`` 'token' takes its ``name`` from
    the second field of the next sentence entry.  ``srcnode.utf8print()``
    followed by a newline is written to ``output`` for every tree.

    Raises whatever ``next()`` raises when the POS source is shorter than
    the tree file (StopIteration for a plain iterator).
    """
    def updatenode(node, sent):
        if node.type == 'token':
            # Consume the front entry so the next token node sees the next one.
            node.name = sent.pop(0)[1]
        else:
            updatenode(node.left_child, sent)
            if node.right_child:
                updatenode(node.right_child, sent)

    # ``with`` guarantees the handle is closed; the original never closed it
    # and bound it to the builtin name ``file``.
    with codecs.open(path, encoding='utf-8') as tree_lines:
        pos_it = posio.posread(pospath, sSep)
        for line in tree_lines:
            pos = next(pos_it)
            srcnode = binarize.CBinarizedTreeNode()
            srcnode.load(line)
            updatenode(srcnode, pos)
            output.write(srcnode.utf8print() + "\n")
Пример #10
0
 # NOTE(review): fragment of a larger function body; opts, args, verbose,
 # ref_file and out_file are defined outside the visible lines.
 # Path of an optional third file (set by -t); None when not given.
 third = None
 for opt in opts:
    if opt[0] == '-v':
       verbose = True
    if opt[0] == '-t':
       third = opt[1]
       # -t also switches verbose output on
       verbose = True
 if verbose:
 #if v:
    # header for the per-word listing
    print 'Word	correct	system'
    print '='*30
 dic = {}
 words = {}
 # A third positional argument is handed to builddic together with the two
 # (initially empty) dicts -- presumably builddic fills them; confirm.
 if len(args) == 3:
    builddic(args[2], dic, words)
 ref = posio.posread(ref_file)
 out = posio.posread(out_file)
 if third:
    # replace the path by the sentence iterator read from it
    third = posio.posread(third)
 # Running sums of res[0], res[1] and res[2] over all sentences.
 total = 0
 correct = 0
 oov = 0
 sent_third = None
 for sent in out:
    # reference (and optional third) sentences are consumed in lockstep
    sent_ref = ref.next()
    if third: 
       sent_third = third.next()
    # NOTE(review): eval is called with six arguments, so it must be a
    # project function shadowing the builtin -- confirm against the full file.
    res = eval(sent, sent_ref, sent_third, dic, words, verbose)
    total += res[0]
    correct += res[1]
    oov += res[2]
Пример #11
0
 # NOTE(review): fragment of a larger function body; opts, args, verbose,
 # ref_file and out_file are defined outside the visible lines.
 # Path of an optional third file (set by -t); None when not given.
 third = None
 for opt in opts:
     if opt[0] == '-v':
         verbose = True
     if opt[0] == '-t':
         third = opt[1]
         # -t also switches verbose output on
         verbose = True
 if verbose:
     #if v:
     # header for the per-word listing
     print 'Word	correct	system'
     print '=' * 30
 dic = {}
 words = {}
 # A third positional argument is handed to builddic together with the two
 # (initially empty) dicts -- presumably builddic fills them; confirm.
 if len(args) == 3:
     builddic(args[2], dic, words)
 ref = posio.posread(ref_file)
 out = posio.posread(out_file)
 if third:
     # replace the path by the sentence iterator read from it
     third = posio.posread(third)
 # Running sums of res[0], res[1] and res[2] over all sentences.
 total = 0
 correct = 0
 oov = 0
 sent_third = None
 for sent in out:
     # reference (and optional third) sentences are consumed in lockstep
     sent_ref = ref.next()
     if third:
         sent_third = third.next()
     # NOTE(review): eval is called with six arguments, so it must be a
     # project function shadowing the builtin -- confirm against the full file.
     res = eval(sent, sent_ref, sent_third, dic, words, verbose)
     total += res[0]
     correct += res[1]
     oov += res[2]
Пример #12
0
def tagdicttup(path, dDict):
    """Count ``(entry[0], entry[1])`` pairs from ``path`` into ``dDict``.

    Every entry of every sentence yielded by ``posio.posread(path)`` adds
    one to the counter stored under the tuple of its first two fields.
    ``dDict`` is updated in place; nothing is returned.
    """
    import posio
    for sentence in posio.posread(path):
        for entry in sentence:
            pair = (entry[0], entry[1])
            dDict[pair] = dDict.get(pair, 0) + 1
Пример #13
0
import os
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../pos'))

import posio

def evalmt(pos1, pos2):
    """Count entries of ``pos1`` that can be matched one-to-one in ``pos2``.

    An entry matches the first remaining entry of ``pos2`` whose fields 0
    and 1 are both equal to its own.  A matched slot of ``pos2`` is
    overwritten with ``[None, None]`` so it cannot be matched again, which
    means ``pos2`` is modified in place.

    Returns a ``(matched_count, len(pos1))`` tuple.
    """
    hits = 0
    for candidate in pos1:
        for slot, reference in enumerate(pos2):
            if reference[0] == candidate[0] and reference[1] == candidate[1]:
                hits += 1
                # Blank the slot so a later duplicate needs its own partner.
                pos2[slot] = [None, None]
                break
    return hits, len(pos1)

if __name__ == '__main__':
   ref_file = posio.posread(sys.argv[2])
   out_file = posio.posread(sys.argv[1])
   total = 0
   correct = 0
   for sent in out_file:
      ref = ref_file.next()
      retval = evalmt(sent, ref)
      total += retval[1]
      correct += retval[0]
   print 'precision: ', float(correct) / total
Пример #14
0
import sys
import posio
import replace

# Number of leading sentences whose first fields are rewritten.
N = 5000

f = sys.argv[1]
r = replace.CReplace(sys.argv[2])
# Every sentence is printed; only the first N are passed through r.replace.
for i, sent in enumerate(posio.posread(f, '_')):
    if i < N:
        for token in sent:
            token[0] = r.replace(token[0])
    posio.posprint(sent, '_')
Пример #15
0
import sys
import posio
import replace

# Number of leading sentences whose first fields are rewritten.
N = 5000

f = sys.argv[1]
r = replace.CReplace(sys.argv[2])
i = 0
for sent in posio.posread(f, '_'):
    # Decide before counting, so exactly the first N sentences are rewritten.
    rewrite = i < N
    i += 1
    if rewrite:
        for n, token in enumerate(sent):
            sent[n][0] = r.replace(token[0])
    posio.posprint(sent, '_')