示例#1
0
def getalllabels(path):
   """Return the set of labels used in the dependency file at path.

   Every item produced by depio.depread(path) is iterated word by word,
   and the field at index 3 of each word is taken as its label.
   """
   return set(word[3] for dep in depio.depread(path) for word in dep)
示例#2
0
def printStats(path):
   def _ou(s):
      return s
   #   return g_macroNamed[s]
   dLabel = {}
   setPOS = set([])
   for sent in depio.depread(path):
      simplelabelanal(sent, dLabel, setPOS)
   print 'Set of labels'
   print ' '.join(dLabel.keys())
   for label in dLabel:
      print label, '==='
      setDeps = set([])
      dPair = dLabel.get(label, {})
      dDepCount={}
      for head in dPair:
         headCount = 0
         dDep = dPair.get(head, {})
         for key in dDep.keys():
            headCount += dDep[key]
            if not key in dDepCount:
               dDepCount[key] = 0
            dDepCount[key]+=dDep[key]
         print head, '(', headCount, ')', ' : ', ' '.join([key+'('+str(dDep[key])+')' for key in dDep.keys()])
         setDeps = setDeps.union(dDep.keys())
      print
      print 'Set of heads: ', ' '.join(map(_ou, dPair.keys()))
      assert set(dDepCount.keys()) == setDeps
      print 'Set of depcounts: ', ' '.join([_ou(key)+'('+str(dDepCount[key])+')' for key in dDepCount.keys()])
      print 'Set of deps: ', ' '.join(map(_ou, setDeps))
      print
      print 'Set of nonheads: ', ' '.join(map(_ou, (setPOS-set(dPair.keys()))))
      print 'Set of nondeps: ', ' '.join(map(_ou, (setPOS-setDeps)))
      print
示例#3
0
def getalllabels(path):
    """Collect every distinct label (word field 3) read from path.

    The file is read with depio.depread; each item it yields is treated as
    a sequence of words.
    """
    found = set()
    for sentence in depio.depread(path):
        found.update(word[3] for word in sentence)
    return found
示例#4
0
def randomdep(path, cutoff):
    """Randomly detach words from their heads and print each sentence.

    For every sentence from depio.depread(path), each word whose head
    field is not already '-1' has that field (index 2) overwritten with
    '-1' when random.random() falls below cutoff.  The sentence is then
    written with depio.depprint.  Every word must have exactly four fields.
    """
    for sentence in depio.depread(path):
        for position, (_word, _pos, head, _label) in enumerate(sentence):
            # random.random() is only drawn for words that still have a head.
            if head != '-1' and random.random() < cutoff:
                sentence[position][2] = '-1'
        depio.depprint(sentence)
示例#5
0
def randomdep(path, cutoff):
    """Print the sentences of path with some head fields reset to "-1".

    A word keeps its head unless the head is not "-1" and a fresh
    random.random() draw is below cutoff; in that case field 2 of the word
    is replaced by "-1".  Each sentence is emitted with depio.depprint.
    """
    for tree in depio.depread(path):
        for idx, fields in enumerate(tree):
            # Each word must unpack into exactly four fields.
            _, _, head, _ = fields
            if head == "-1":
                continue
            if random.random() < cutoff:
                tree[idx][2] = "-1"
        depio.depprint(tree)
示例#6
0
def mergeconllpos(conll_path, pos_path):
    """Copy POS tags from a tagged file into the sentences of a CoNLL file.

    Sentences from depio.depread(conll_path) are paired, in order, with the
    items obtained from posio.posread(pos_path).  Field 4 of each CoNLL word
    is overwritten with field 1 of the matching tagged word, after which the
    sentence is printed with depio.depprint.

    Raises AssertionError when a sentence pair differs in length or when
    field 1 of a CoNLL word differs from field 0 of its tagged counterpart.
    """
    tagged_sents = posio.posread(pos_path)
    for sent in depio.depread(conll_path):
        tagged = tagged_sents.next()
        assert len(sent) == len(tagged)
        for word, tagged_word in zip(sent, tagged):
            # The two files must describe the same token.
            assert word[1] == tagged_word[0]
            word[4] = tagged_word[1]
        depio.depprint(sent)
示例#7
0
def mergeconllpos(conll_path, pos_path):
   """Overwrite field 4 of every CoNLL word with the tag from pos_path.

   The n-th sentence of depio.depread(conll_path) is matched with the
   n-th item of posio.posread(pos_path).  Lengths must agree and field 1
   of each CoNLL word must equal field 0 of the tagged word, otherwise an
   AssertionError is raised.  Each updated sentence is printed with
   depio.depprint.
   """
   pos_reader = posio.posread(pos_path)
   for sent in depio.depread(conll_path):
      sentpos = pos_reader.next()
      assert len(sent) == len(sentpos)
      for n, conll_word in enumerate(sent):
         pos_word = sentpos[n]
         assert conll_word[1] == pos_word[0]
         conll_word[4] = pos_word[1]
      depio.depprint(sent)
示例#8
0
文件: depop.py 项目: StevenLOL/zpar
def headdepcount(path, counts):
   """Count head/dependent arcs in the dependency file at path.

   For every word whose head index (field 2) is not -1, the tuple
   (head word, dependent word, head POS, dependent POS, direction) is
   used as a key in counts and its value is incremented.  Word and POS
   are fields 0 and 1; direction is 'R' when the head index is greater
   than the dependent's index, otherwise 'L'.

   counts is a dict updated in place; nothing is returned.
   """
   # Read from the path argument.  The earlier code passed the unrelated
   # name `input` to depio.depread and ignored the parameter entirely.
   for deptree in depio.depread(path):
      for index, dep_word in enumerate(deptree):
         headindex = int(dep_word[2])
         if headindex == -1:
            continue
         head_word = deptree[headindex]
         if headindex > index:
            head_direction = 'R'
         else:
            head_direction = 'L'
         key = (head_word[0], dep_word[0], head_word[1], dep_word[1], head_direction)
         counts[key] = counts.get(key, 0) + 1
示例#9
0
def headdepcount(path, counts):
    """Count head/dependent arcs in the dependency file at path.

    For every word whose head index (field 2) is not -1, the tuple
    (head word, dependent word, head POS, dependent POS, direction) is
    used as a key in counts and its value is incremented.  Word and POS
    are fields 0 and 1; direction is 'R' when the head index is greater
    than the dependent's index, otherwise 'L'.

    counts is a dict updated in place; nothing is returned.
    """
    # Read from the path argument.  The earlier code passed the unrelated
    # name `input` to depio.depread and ignored the parameter entirely.
    for deptree in depio.depread(path):
        for index, dep_word in enumerate(deptree):
            headindex = int(dep_word[2])
            if headindex == -1:
                continue
            head_word = deptree[headindex]
            if headindex > index:
                head_direction = 'R'
            else:
                head_direction = 'L'
            key = (head_word[0], dep_word[0], head_word[1], dep_word[1],
                   head_direction)
            counts[key] = counts.get(key, 0) + 1
示例#10
0
def printStats(path):
    def _ou(s):
        return s

    #   return g_macroNamed[s]
    dLabel = {}
    setPOS = set([])
    for sent in depio.depread(path):
        simplelabelanal(sent, dLabel, setPOS)
    print 'Set of labels'
    print ' '.join(dLabel.keys())
    for label in dLabel:
        print label, '==='
        setDeps = set([])
        dPair = dLabel.get(label, {})
        dDepCount = {}
        for head in dPair:
            headCount = 0
            dDep = dPair.get(head, {})
            for key in dDep.keys():
                headCount += dDep[key]
                if not key in dDepCount:
                    dDepCount[key] = 0
                dDepCount[key] += dDep[key]
            print head, '(', headCount, ')', ' : ', ' '.join(
                [key + '(' + str(dDep[key]) + ')' for key in dDep.keys()])
            setDeps = setDeps.union(dDep.keys())
        print
        print 'Set of heads: ', ' '.join(map(_ou, dPair.keys()))
        assert set(dDepCount.keys()) == setDeps
        print 'Set of depcounts: ', ' '.join([
            _ou(key) + '(' + str(dDepCount[key]) + ')'
            for key in dDepCount.keys()
        ])
        print 'Set of deps: ', ' '.join(map(_ou, setDeps))
        print
        print 'Set of nonheads: ', ' '.join(
            map(_ou, (setPOS - set(dPair.keys()))))
        print 'Set of nondeps: ', ' '.join(map(_ou, (setPOS - setDeps)))
        print
示例#11
0
def maxmodifycount(path, condition):
    ret = {}
    for sent in depio.depread(path):
        counts = {}
        for index, word in enumerate(sent):
            head = int(word[2])
            if head != -1 and condition(head, index):
                if not head in counts:
                    counts[head] = 0
                counts[head] += 1
            for head in counts:
                count = counts[head]
                pos = sent[head][1]
                if count > ret.get(pos, 0):
                    ret[pos] = count
    maxcnt = 0
    for pos in ret:
        count = ret[pos]
        print pos, ':', count
        if count > maxcnt:
            maxcnt = count
    print 'Overall', ':', maxcnt
示例#12
0
文件: depop.py 项目: StevenLOL/zpar
def maxmodifycount(path, condition):
   ret = {}
   for sent in depio.depread(path):
      counts = {}
      for index, word in enumerate(sent):
         head = int(word[2])
         if head != -1 and condition(head, index):
            if not head in counts:
               counts[head]=0
            counts[head] += 1
         for head in counts:
            count = counts[head]
            pos=sent[head][1]
            if count > ret.get(pos, 0):
               ret[pos]=count
   maxcnt=0
   for pos in ret:
      count = ret[pos]
      print pos, ':', count
      if count>maxcnt:
         maxcnt = count
   print 'Overall', ':', maxcnt
示例#13
0
def writeCppCode(path):
    """Print C++ source encoding label constraints gathered from path.

    Every sentence from depio.depread(path) is passed to labelanal together
    with dLabel and setPOS; afterwards dLabel is read as
    label -> {(head, dep): count} and setPOS as a set of POS tags.

    The printed code defines canAssignLabel (inside #ifdef LABELED), which
    returns false for a label when the head POS was never counted as a head
    for that label, or the dependent POS was never counted as a dependent
    for it, plus three predicates that always return true.  Output goes to
    standard output; nothing is returned.
    """
    def _label(s):
        # Name of the C++ constant for dependency label s.
        return 'PENN_DEP_' + s.upper()

    def _pos(s):
        # Name of the C++ constant for POS tag s, taken from g_macroNamed.
        return g_macroNamed[s]

    dLabel = {}
    setPOS = set([])
    for sent in depio.depread(path):
        labelanal(sent, dLabel, setPOS)
    # write header
    print '#include "tags.h"'
    print '#ifdef LABELED'
    print '#include "dependency/label/penn.h"'
    print '#endif'
    print
    print 'namespace english {'
    print '#ifdef LABELED'
    print 'inline bool canAssignLabel(const vector< CTaggedWord<CTag,TAG_SEPARATOR> > &sent, const int &head, const int &dep, const CDependencyLabel&label) {'
    print '   assert(head==DEPENDENCY_LINK_NO_HEAD||head>=0); // correct head'
    print '   assert(dep>=0);'
    print '   // if the head word is none, only ROOT'
    print '   if (head==DEPENDENCY_LINK_NO_HEAD) {'
    print '      if (label.code()==PENN_DEP_ROOT) '
    print '         return true;'
    print '      return false;'
    print '   }'
    print '      // for each case'
    print '   const unsigned &head_pos = sent[head].tag.code();'
    print '   const unsigned &dep_pos = sent[dep].tag.code();'
    print '   assert(head!=DEPENDENCY_LINK_NO_HEAD);'
    print '   if (label == PENN_DEP_ROOT) // now head is not DEPENDENCY_LINK_NO_HEAD'
    print '      return false;'
    # for each label
    nTotalRules = 0
    nLabel = 0
    for label in dLabel:
        # print condition: the first label opens the if-chain, the rest
        # continue it with "else if"
        if nLabel == 0:
            print "   if (label==%s) {" % _label(label)
        else:
            print "   else if (label==%s) {" % _label(label)
        nLabel += 1
        # collect statistics
        dHeadCount = {}  # head : count
        dDepCount = {}  # dep : count
        nTotalCount = 0  # arc (accumulated below but not read afterwards)
        dEntry = dLabel.get(label, {})  # head, dep : count
        for key in dEntry:
            head = key[0]
            dep = key[1]
            if not head in dHeadCount:
                dHeadCount[head] = 0
            dHeadCount[head] += dEntry[key]
            if not dep in dDepCount:
                dDepCount[dep] = 0
            dDepCount[dep] += dEntry[key]
            nTotalCount += dEntry[key]
        # write head condition
        # With threshold 1, a POS is listed only when its count is zero.
        threshold = 1  # g_freqCutoff * nTotalCount
        # nCount is shared by the head and dependent loops so that both
        # contribute terms to a single "if ( ... || ... ) return false;".
        nCount = 0
        for pos in setPOS:
            if dHeadCount.get(pos, 0) < threshold:
                if nCount == 0:
                    print "      if ( head_pos==%s" % _pos(pos)
                else:
                    print "           || head_pos==%s" % _pos(pos)
                nCount += 1
        for pos in setPOS:
            if dDepCount.get(pos, 0) < threshold:
                if nCount == 0:
                    print "      if ( dep_pos==%s" % _pos(pos)
                else:
                    print "           || dep_pos==%s" % _pos(pos)
                nCount += 1
        # Close the condition only if at least one term was printed.
        if nCount > 0:
            print '         ) return false;'
        nTotalRules += nCount
        # finish condition
        print "   }"
    # write footer
    print "   // total number of rules are %d." % nTotalRules
    print "   return true;"
    print '}'
    print '#endif'
    print
    print 'inline const bool hasLeftHead(const unsigned &tag) {'
    print '   return true;'
    print '}'
    print
    print 'inline const bool hasRightHead(const unsigned &tag) {'
    print '   return true;'
    print '}'
    print 'inline const bool canBeRoot(const unsigned &tag) {'
    print '   return true;'
    print '}'
    print '}'
示例#14
0
    'VBP': 'PENN_TAG_VERB_PRES',
    'VBZ': 'PENN_TAG_VERB_THIRD_SINGLE',
    'WDT': 'PENN_TAG_WDT',
    'WP': 'PENN_TAG_WP',
    'WP$': 'PENN_TAG_WP_DOLLAR',
    'WRB': 'PENN_TAG_WRB'
}


def _ou(s):
    """Return the entry stored for s in the module-level g_macroNamed table."""
    # Alternative kept for reference: pass s through unchanged.
    #   return s
    return g_macroNamed[s]


if __name__ == "__main__":
    # Script entry point: analyse the dependency file named by sys.argv[1].
    # diranal (not shown here) receives the three accumulators below for
    # every sentence; this block only reads them back afterwards.
    dHead = {}
    dDep = {}
    setRoot = set()
    for sent in depio.depread(sys.argv[1]):
        diranal(sent, dHead, dDep, setRoot)
    # Each dHead value is indexed with [0] and [1]; going by the heading
    # these are the left and right figures respectively.
    print "head POS with left / right dep"
    for head_pos in dHead:
        print head_pos, dHead[head_pos][0], dHead[head_pos][1]
    print
    # Same layout for dDep, keyed by the dependent's POS.
    print "dep POS with left / right head"
    for pos in dDep:
        print pos, dDep[pos][0], dDep[pos][1]
    print
    # All members of setRoot on one space-separated line.
    print "the set root"
    print ' '.join(pos for pos in setRoot)
示例#15
0
def get_set_elements(sent, morph_func):
    """Pair a sentence id with each transformed item, numbering repeats.

    sent is indexed as (identifier, items): sent[0] is attached to every
    element and morph_func is applied to each member of sent[1].  The
    result is a list of (identifier, value) tuples in input order.  When a
    value has already occurred earlier in the same sentence, it is replaced
    by a tuple of its own elements followed by the occurrence number as a
    string ('2' for the second occurrence, '3' for the third, ...), so that
    repeated values stay distinct when the results are placed in a set.

    morph_func must return hashable values, and iterable ones whenever a
    value can repeat.
    """
    sent_id = sent[0]
    # Build a real list up front: the elements are reassigned by index
    # below, which fails on the lazy iterator that zip() returns in
    # Python 3.
    els = [(sent_id, morph_func(item)) for item in sent[1]]
    seen = {}
    for i, (owner, value) in enumerate(els):
        occurrences = seen.get(value, 0)
        if occurrences > 0:
            els[i] = (owner, tuple(list(value) + [str(occurrences + 1)]))
        # Key on the original value so later repeats keep counting up.
        seen[value] = occurrences + 1
    return els


if __name__ == '__main__':
    file_output = list(enumerate(depio.depread(sys.argv[1])))
    file_ref = list(enumerate(depio.depread(sys.argv[2])))
    print '\t'.join([
        'comparison-function', 'gold', 'pred', 'true-positive', 'precision',
        'recall', 'f1', 'em'
    ])
    for morph_func_name, morph_func_opts in morph_funcs:
        morph_func = get_morph_func(morph_func_name)
        output_set = set(
            flatten(map(lambda s: get_set_elements(s, morph_func),
                        file_output)))
        ref_set = set(
            flatten(map(lambda s: get_set_elements(s, morph_func), file_ref)))
        exact_match = Counter(
            map(lambda (x, y): x == y, zip(file_output, file_ref)))
        exact_match_pct = float(exact_match[True]) / float(len(file_ref))
示例#16
0
        dep_pos = word[1]
        if dep_pos == ',' or dep_pos == '.' or dep_pos == '-LRB-' or dep_pos == '-RRB-' or dep_pos == ':' or dep_pos == '``' or dep_pos == '"' or dep_pos == '#' or dep_pos == '$':
            punct.add(dep_word)


if __name__ == "__main__":
    # Command-line entry point: exactly two arguments are required, the
    # option name and the input file.
    if len(sys.argv) != 3:
        print "depanal.py options input >output"
        print "options: countfrag listarc findpunct"
        sys.exit(1)
    option = sys.argv[1]
    # NOTE: this rebinds the builtin name `input` at module level.
    input = sys.argv[2]
    if option == "countfrag":
        # Sum countfrag() over all sentences and report it against the
        # number of sentences read.
        count = 0
        total = 0
        for sent in depio.depread(input):
            count += countfrag(sent)
            total += 1
        print "%d are fragmented from %d sentences." % (count, total)
    elif option == "listarc":
        # listanarc fills `arc`; each key is then printed tab-joined,
        # followed by its stored value.
        arc = {}
        for sent in depio.depread(input):
            listanarc(sent, arc)
        for anarc in arc:
            print '	'.join(anarc), arc[anarc]
    elif option == "findpunct":
        # findpunct fills `punct`; one member is printed per line.
        punct = set()
        for sent in depio.depread(input):
            findpunct(sent, punct)
        for punc in punct:
            print punc
    # Any other option value falls through and prints nothing.
示例#17
0
    def printrawnode(self, node):
        """Return the tokens of the subtree rooted at node as a list.

        Tokens from the left children come first, then node.token, then
        the tokens from the right children; children are visited in the
        order they are stored.
        """
        tokens = []
        for child in node.left_children:
            tokens += self.printrawnode(child)
        tokens.append(node.token)
        for child in node.right_children:
            tokens += self.printrawnode(child)
        return tokens

    def printposnode(self, node):
        """Return [token, pos] pairs for the subtree rooted at node.

        Pairs from the left children come first, then the pair for node
        itself, then the pairs from the right children.
        """
        pairs = []
        for child in node.left_children:
            pairs += self.printposnode(child)
        pairs.append([node.token, node.pos])
        for child in node.right_children:
            pairs += self.printposnode(child)
        return pairs

    def toRaw(self):
        """Return the tokens below self.root joined by single spaces."""
        tokens = self.printrawnode(self.root)
        return ' '.join(tokens)

    def toPOS(self):
        """Return 'token|pos' items for the whole tree, space-separated."""
        pairs = self.printposnode(self.root)
        return ' '.join('|'.join(pair) for pair in pairs)


if __name__ == '__main__':
    # Script entry point: build a CDep from every sentence read from the
    # file named by sys.argv[1] and print it.
    for words in depio.depread(sys.argv[1]):
        dep = CDep(words)
        print dep
示例#18
0
文件: getsent.py 项目: shovalsa/yap
#!/usr/bin/python

import sys
import depio

# Usage: getsent.py <file> <sentence-number>
# Writes the sentence at index <sentence-number> of the sentences read from
# <file> to "<sentence-number>.<file>": one tab-joined word per line,
# followed by an empty line.
sentnum = int(sys.argv[2])
fnames = [sys.argv[1]]
for fname in fnames:
    sents = list(depio.depread(fname))
    # `with` closes the output file even if indexing or writing fails.
    with open("%d.%s" % (sentnum, fname), 'w') as out:
        for outl in sents[sentnum]:
            out.write('\t'.join(outl) + '\n')
        out.write('\n')
示例#19
0
def malt2zpar(path):
   """Print the sentences of path with every head index lowered by one.

   Field 2 of each word is parsed as an integer, decremented, and stored
   back as a string before the sentence is written with depio.depprint.
   """
   for sent in depio.depread(path):
      for word in sent:
         head = int(word[2])
         word[2] = str(head - 1)
      depio.depprint(sent)
示例#20
0
        ref_word = reference[index]
        assert word[0] == ref_word[0]
        if g_reP.match(word[0]):
            continue
        if word[2] == ref_word[2]:
            correct_head += 1
            if word[3] == ref_word[3]:
                correct_label += 1
        else:
            total_uem = 0
        total += 1
    return correct_head, correct_label, total, total_uem


if __name__ == '__main__':
    file_output = depio.depread(sys.argv[1])
    file_ref = depio.depread(sys.argv[2])
    total_sent = 0
    total_uem = 0
    total = 0
    correct_head = 0
    correct_label = 0
    for output in file_output:
        ref = file_ref.next()
        ret = eval(output, ref)
        correct_head += ret[0]
        correct_label += ret[1]
        total += ret[2]
        total_uem += ret[3]
        total_sent += 1
    print float(correct_head) / total, float(correct_label) / total, float(
示例#21
0
import sys
import depio
import replace

# Only the first N sentences have their words replaced.
N = 10000

r = replace.CReplace(sys.argv[2])
for n, sent in enumerate(depio.depread(sys.argv[1])):
    if n < N:
        # Rewrite field 0 of every word through the replacer.
        for word in sent:
            word[0] = r.replace(word[0])
    # Every sentence is printed, whether or not it was rewritten.
    depio.depprint(sent)
示例#22
0
文件: dep.py 项目: StevenLOL/zpar
      return self.printtree(self.root)
   
   def printrawnode(self, node):
      """Flatten the subtree under node into a list of tokens.

      The order is: everything under the left children, node.token,
      everything under the right children.
      """
      flat = [tok
              for child in node.left_children
              for tok in self.printrawnode(child)]
      flat.append(node.token)
      flat.extend(tok
                  for child in node.right_children
                  for tok in self.printrawnode(child))
      return flat

   def printposnode(self, node):
      """Flatten the subtree under node into a list of [token, pos] pairs.

      The order is: everything under the left children, the pair for
      node itself, everything under the right children.
      """
      flat = [pair
              for child in node.left_children
              for pair in self.printposnode(child)]
      flat.append([node.token, node.pos])
      flat.extend(pair
                  for child in node.right_children
                  for pair in self.printposnode(child))
      return flat

   def toRaw(self):
      """Return the tokens below self.root joined by single spaces."""
      tokens = self.printrawnode(self.root)
      return ' '.join(tokens)

   def toPOS(self):
      """Return 'token|pos' items for the whole tree, space-separated."""
      pairs = self.printposnode(self.root)
      return ' '.join('|'.join(pair) for pair in pairs)

if __name__ == '__main__':
   # Script entry point: build a CDep from every sentence read from the
   # file named by sys.argv[1] and print it.
   for words in depio.depread(sys.argv[1]):
      dep = CDep(words)
      print dep
示例#23
0
文件: filterb.py 项目: StevenLOL/zpar
      print 0, 0, dept.toRaw()
   elif count > float(total) * 0.3:
      print 0, dept.toRaw()
   else:
      print dept.toRaw()

#========================================


#========================================

if __name__ == '__main__':
   # No option letters are defined, so getopt only separates out the
   # positional arguments; `opts` is not used afterwards.
   opts, args = getopt.getopt(sys.argv[1:], "")
   if len(args) < 1:
      print 'adapt input align'
      # NOTE: exits with status 0 even though the arguments were missing.
      sys.exit(0)

   # get parameter
   # Exactly two positional arguments are accepted: input and alignment.
   if len(args) != 2:
      print "The alignment file must be provided"
      sys.exit(0)
   sInput = args[0]
   sAlign = args[1]

   # input
   alignFile = brute.readAlign(sAlign) 

   # One alignment is consumed per tree, in file order.
   for tree in depio.depread(sInput):
      align = alignFile.next()
      # `filter` is called with (tree, align), so it is presumably a
      # function of this module shadowing the builtin - confirm.
      filter(tree, align)
示例#24
0
        print 0, 0, dept.toRaw()
    elif count > float(total) * 0.3:
        print 0, dept.toRaw()
    else:
        print dept.toRaw()


#========================================

#========================================

if __name__ == '__main__':
    # No option letters are defined, so getopt only separates out the
    # positional arguments; `opts` is not used afterwards.
    opts, args = getopt.getopt(sys.argv[1:], "")
    if len(args) < 1:
        print 'adapt input align'
        # NOTE: exits with status 0 even though the arguments were missing.
        sys.exit(0)

    # get parameter
    # Exactly two positional arguments are accepted: input and alignment.
    if len(args) != 2:
        print "The alignment file must be provided"
        sys.exit(0)
    sInput = args[0]
    sAlign = args[1]

    # input
    alignFile = brute.readAlign(sAlign)

    # One alignment is consumed per tree, in file order.
    for tree in depio.depread(sInput):
        align = alignFile.next()
        # `filter` is called with (tree, align), so it is presumably a
        # function of this module shadowing the builtin - confirm.
        filter(tree, align)
示例#25
0
import sys
import depio

if __name__ == "__main__":
    if len(sys.argv) != 2:
        print "addindex.py input >output"
        sys.exit(1)
    input = sys.argv[1]
    for sent in depio.depread(input):
        index = 0
        for word in sent:
            print "\t".join([str(index)] + word)
            index += 1
        print
示例#26
0
   alignFile = None
   align = None
   model = None
   if sInput == 'a':
      if len(args) != 3:
         print "The alignment file must be provided with -ia"
         sys.exit(0)
      model = readModel(args[1], True)
      alignFile = readAlign(args[2]) 
   elif sInput == 'c':
      model = None #readModel(args[1], False)
   else:
      print 'The input format is invalid'
      sys.exit(0)

   for tree in depio.depread(args[0]):
      if alignFile:
         align = alignFile.next()
         if align == None:
            depio.depprint(tree)
         assert align != None
      dept = dep.CDep(tree)
      reorder(dept, align, model, bDebug)
      if sOutput == 'd':
         print dept
      elif sOutput == 'p':
         print dept.toPOS()
      elif sOutput == 'r':
         print dept.toRaw()
      elif sOutput == 'i':
         print printOrder(dept)
示例#27
0
def malt2zpar(path):
    """Print the sentences of path with every head index lowered by one.

    Field 2 of each word is parsed as an integer, decremented, and stored
    back as a string before the sentence is written with depio.depprint.
    """
    for sent in depio.depread(path):
        for word in sent:
            head = int(word[2])
            word[2] = str(head - 1)
        depio.depprint(sent)
示例#28
0
def toSuperTag(tree):
   """Return the result of toSuperTagForNode applied to tree.root."""
   return toSuperTagForNode(tree.root)

def encode(hl, hr, dl, dr, rt):
   """Pack five flags into one integer and return it as a decimal string.

   Each argument is converted with int() and shifted left by its position
   (hl by 0, hr by 1, dl by 2, dr by 3, rt by 4); the shifted values are
   OR-ed together.
   """
   code = 0
   for shift, flag in enumerate((hl, hr, dl, dr, rt)):
      code |= int(flag) << shift
   return str(code)

if __name__== '__main__':
   opts, args = getopt.getopt(sys.argv[1:], "o:")
   if len(args) < 1:
      print 'reorder [-o hl|hr|ml|mr|h|m|hm] input'
      sys.exit(0)
   sOutput = 'hm'
   for opt, val in opts:
      if opt == '-o':
         sOutput = val
   for sent in depio.depread(args[0]):
      dept = dep.CDep(sent)
      supertags = toSuperTag(dept)
      if sOutput == 'hm':
         print ' '.join(['|'.join([word[0], word[1], encode(word[2], word[3], word[4], word[5], word[6])]) for word in supertags])
      elif sOutput == 'h':
         print ' '.join(['|'.join([word[0], word[1], encode(word[2], word[3], 0, 0, 0)]) for word in supertags])
      elif sOutput == 'm':
         print ' '.join(['|'.join([word[0], word[1], encode(0, 0, word[4], word[5], 0)]) for word in supertags])
      elif sOutput == 'hl':
         print ' '.join(['|'.join([word[0], word[1], word[2]]) for word in supertags])
      elif sOutput == 'hr':
         print ' '.join(['|'.join([word[0], word[1], word[3]]) for word in supertags])
      elif sOutput == 'ml':
         print ' '.join(['|'.join([word[0], word[1], word[4]]) for word in supertags])
      elif sOutput == 'mr':
示例#29
0
    #assert assert word[1] == ref_word[1]
    #if if g_reP.match( word[1] ) :
    #continue continue
    #if if word[6] == ref_word[6]:
    #correct_head correct_head += 1
    #if if word[7] == ref_word[7]:
    #correct_label correct_label += 1
    #else else:
    #total_uem total_uem = 0
    #total total += 1
    return correct_head, correct_label, total, total_uem, incorrect_head, incorrect_label, missing_head, missing_label, len(
        list(filter(filt_unmapped, reference)))


if __name__ == '__main__':
    file_output = list(depio.depread(sys.argv[1]))
    file_ref = list(depio.depread(sys.argv[2]))
    total_sent = 0
    total_uem = 0
    total = 0
    total_gold = 0
    correct_head = 0
    incorrect_head = 0
    correct_label = 0
    incorrect_label = 0
    missing_head = 0
    missing_label = 0
    for ref, output in zip(file_ref, file_output):
        # ref = file_ref.next()
        ret = eval(output, ref)
        correct_head += ret[0]
示例#30
0
文件: eval.py 项目: StevenLOL/zpar
   for index, word in enumerate(output):
      ref_word = reference[index]
      assert word[0] == ref_word[0]
      if g_reP.match( word[0] ) :
         continue
      if word[2] == ref_word[2]:
         correct_head += 1
         if word[3] == ref_word[3]:
            correct_label += 1
      else:
         total_uem = 0
      total += 1
   return correct_head, correct_label, total, total_uem

if __name__ == '__main__':
   # Script entry point: sys.argv[1] is the output to score and
   # sys.argv[2] the reference; both are read sentence by sentence.
   file_output = depio.depread(sys.argv[1])
   file_ref = depio.depread(sys.argv[2])
   total_sent = 0
   total_uem = 0
   total = 0
   correct_head = 0
   correct_label  =0
   for output in file_output:
      # The reference is advanced in lockstep with the output.
      ref = file_ref.next()
      # `eval` is called with two sentences and its result is indexed
      # 0..3, so it is presumably this module's own scoring function
      # shadowing the builtin - confirm.
      ret = eval(output, ref)
      correct_head += ret[0]
      correct_label += ret[1]
      total += ret[2]
      total_uem += ret[3]
      total_sent += 1
   # Three ratios: correct heads / total, correct labels / total, and
   # total_uem / sentences.  Raises ZeroDivisionError when total or
   # total_sent is 0.
   print float(correct_head)/total, float(correct_label)/total, float(total_uem)/total_sent
示例#31
0
def encode(hl, hr, dl, dr, rt):
    """Pack five flags into one integer and return it as a decimal string.

    Each argument is converted with int() and shifted left by its position
    (hl by 0, hr by 1, dl by 2, dr by 3, rt by 4); the shifted values are
    OR-ed together.
    """
    code = 0
    for shift, flag in enumerate((hl, hr, dl, dr, rt)):
        code |= int(flag) << shift
    return str(code)


if __name__ == '__main__':
    opts, args = getopt.getopt(sys.argv[1:], "o:")
    if len(args) < 1:
        print 'reorder [-o hl|hr|ml|mr|h|m|hm] input'
        sys.exit(0)
    sOutput = 'hm'
    for opt, val in opts:
        if opt == '-o':
            sOutput = val
    for sent in depio.depread(args[0]):
        dept = dep.CDep(sent)
        supertags = toSuperTag(dept)
        if sOutput == 'hm':
            print ' '.join([
                '|'.join([
                    word[0], word[1],
                    encode(word[2], word[3], word[4], word[5], word[6])
                ]) for word in supertags
            ])
        elif sOutput == 'h':
            print ' '.join([
                '|'.join([word[0], word[1],
                          encode(word[2], word[3], 0, 0, 0)])
                for word in supertags
            ])
示例#32
0
def writeCppCode(path):
   """Print C++ source encoding label constraints gathered from path.

   Every sentence from depio.depread(path) is passed to labelanal together
   with dLabel and setPOS; afterwards dLabel is read as
   label -> {(head, dep): count} and setPOS as a set of POS tags.

   The printed code defines canAssignLabel (inside #ifdef LABELED), which
   returns false for a label when the head POS was never counted as a head
   for that label, or the dependent POS was never counted as a dependent
   for it, plus three predicates that always return true.  Output goes to
   standard output; nothing is returned.
   """
   def _label(s):
      # Name of the C++ constant for dependency label s.
      return 'PENN_DEP_'+s.upper()
   def _pos(s):
      # Name of the C++ constant for POS tag s, taken from g_macroNamed.
      return g_macroNamed[s]
   dLabel = {}
   setPOS = set([])
   for sent in depio.depread(path):
      labelanal(sent, dLabel, setPOS)
   # write header
   print '#include "tags.h"'
   print '#ifdef LABELED'
   print '#include "dependency/label/penn.h"'
   print '#endif'
   print
   print 'namespace english {'
   print '#ifdef LABELED'
   print 'inline bool canAssignLabel(const vector< CTaggedWord<CTag,TAG_SEPARATOR> > &sent, const int &head, const int &dep, const CDependencyLabel&label) {'
   print '   assert(head==DEPENDENCY_LINK_NO_HEAD||head>=0); // correct head'
   print '   assert(dep>=0);'
   print '   // if the head word is none, only ROOT'
   print '   if (head==DEPENDENCY_LINK_NO_HEAD) {'
   print '      if (label.code()==PENN_DEP_ROOT) '
   print '         return true;'
   print '      return false;'
   print '   }'
   print '      // for each case'
   print '   const unsigned &head_pos = sent[head].tag.code();'
   print '   const unsigned &dep_pos = sent[dep].tag.code();'
   print '   assert(head!=DEPENDENCY_LINK_NO_HEAD);'
   print '   if (label == PENN_DEP_ROOT) // now head is not DEPENDENCY_LINK_NO_HEAD'
   print '      return false;'
   # for each label
   nTotalRules=0
   nLabel=0
   for label in dLabel:
      # print condition: the first label opens the if-chain, the rest
      # continue it with "else if"
      if nLabel == 0:
         print "   if (label==%s) {" % _label(label)
      else:
         print "   else if (label==%s) {" % _label(label)
      nLabel +=1
      # collect statistics
      dHeadCount={} # head : count
      dDepCount={} # dep : count
      nTotalCount=0 # arc (accumulated below but not read afterwards)
      dEntry = dLabel.get(label, {}) # head, dep : count
      for key in dEntry:
         head = key[0]
         dep = key[1]
         if not head in dHeadCount:
            dHeadCount[head] = 0
         dHeadCount[head] += dEntry[key]
         if not dep in dDepCount:
            dDepCount[dep] = 0
         dDepCount[dep] += dEntry[key]
         nTotalCount += dEntry[key]
      # write head condition
      # With threshold 1, a POS is listed only when its count is zero.
      threshold = 1 # g_freqCutoff * nTotalCount
      # nCount is shared by the head and dependent loops so that both
      # contribute terms to a single "if ( ... || ... ) return false;".
      nCount=0
      for pos in setPOS:
         if dHeadCount.get(pos, 0) < threshold:
            if nCount == 0:
               print "      if ( head_pos==%s" % _pos(pos)
            else:
               print "           || head_pos==%s" % _pos(pos)
            nCount += 1
      for pos in setPOS:
         if dDepCount.get(pos, 0) < threshold:
            if nCount == 0:
               print "      if ( dep_pos==%s" % _pos(pos)
            else:
               print "           || dep_pos==%s" % _pos(pos)
            nCount += 1
      # Close the condition only if at least one term was printed.
      if nCount>0:
         print '         ) return false;'
      nTotalRules+=nCount
      # finish condition
      print "   }"
   # write footer
   print "   // total number of rules are %d." % nTotalRules
   print "   return true;"
   print '}'
   print '#endif'
   print
   print 'inline const bool hasLeftHead(const unsigned &tag) {'
   print '   return true;'
   print '}'
   print
   print 'inline const bool hasRightHead(const unsigned &tag) {'
   print '   return true;'
   print '}'
   print 'inline const bool canBeRoot(const unsigned &tag) {'
   print '   return true;'
   print '}'
   print '}'
示例#33
0
def run():
    """Copy the last field of old-lattice morphemes onto the new lattices.

    Reads every sentence of 'train5k.hebtb.gold.lattices' (old) and
    'train5k.hebtb.truegold.lattices' (new) via ``depio.depread``, pairs the
    sentences positionally, and writes the patched new sentences to
    'train5k.hebtb.truegold_fixed.lattices' via ``depio.depstring``.

    For each pair:

    * if field 2 of every morpheme agrees between the two sentences, the
      whole last column of the old sentence replaces the last column of the
      new one ('proper');
    * otherwise a heuristic aligner (``matchmiss``) walks both sentences and
      copies last fields wherever one of its rules applies ('match' when
      every new morpheme was covered, 'nomatch' otherwise).

    Progress, per-rule diagnostics and the final tally of outcomes are
    printed to stdout.

    NOTE(review): field 2 is treated as the surface form (the original
    variable names call it "forms") and field 4 is compared against values
    such as 'S_PRN', 'IN' and 'PREPOSITION', so it is presumably a POS tag.
    Confirm against the lattice format read by depio.
    """
    old = 'train5k.hebtb.gold.lattices'
    new = 'train5k.hebtb.truegold.lattices'

    osents = list(depio.depread(old))
    nsents = list(depio.depread(new))

    fixtypes = defaultdict(int)
    fget = itemgetter(2)

    # Toggles the per-rule diagnostics printed by matchmiss.
    log = True

    def fixsimple(osent, nsent):
        """Return nsent with its last column replaced by osent's last column.

        Only meaningful when both sentences have the same length. The rows
        of the result are tuples (the sentence is transposed twice).
        """
        if not nsent:
            # Nothing to patch; transposing an empty sentence gives no
            # columns, so indexing the last column would fail.
            return nsent
        # list() keeps the columns indexable on Python 3 as well, where
        # zip() returns an iterator.
        ocols, ncols = list(zip(*osent)), list(zip(*nsent))
        ncols[-1] = ocols[-1]
        return list(zip(*ncols))

    def matchmiss(osent, nsent):
        """Heuristically align nsent with osent and copy last fields over.

        ``i`` walks the new ("true") morphemes and ``j`` the old
        ("predicted") ones. Each rule below recognises one way the two
        segmentations can differ, patches the last field of the true
        morpheme(s) in place, and advances the cursors.

        Returns ``(nsent, success)`` where success is True only when the
        number of patched morphemes equals ``len(nsent)``.
        """
        i = 0
        j = 0
        numchanges = 0
        n_true = len(nsent)
        n_pred = len(osent)
        # Stop as soon as either side is exhausted: without the bound on j
        # a rule that consumes two predicted morphemes could push j past
        # the end and osent[j] would raise IndexError. Leftover true
        # morphemes stay unpatched, so success is False (numchanges <= i).
        while i < n_true and j < n_pred:
            truemorph = nsent[i]
            predmorph = osent[j]
            tform = truemorph[2]
            pform = predmorph[2]
            has_next_true = i + 1 < n_true
            has_two_true = i + 2 < n_true
            has_next_pred = j + 1 < n_pred
            if log:
                print('\tAt %s and %s' % (pform, tform))
            if pform == tform:
                # Identical forms: one-to-one match.
                if log:
                    print('\t\tFixing1 %s with %s' % (tform, pform))
                truemorph[-1] = predmorph[-1]
                j += 1
                i += 1
                numchanges += 1
            elif has_next_pred and ''.join([pform, osent[j+1][2]]) == tform:
                # Two predicted morphemes concatenate to the true one.
                if log:
                    print('\t\tFixing2 %s with %s' % (tform, pform))
                truemorph[-1] = predmorph[-1]
                j += 2
                i += 1
                numchanges += 1
            elif has_next_pred and ''.join([pform, osent[j+1][2][1:]]) == tform:
                # Same, but ignoring the first character of the second
                # predicted morpheme.
                if log:
                    print('\t\tFixing2 %s with %s' % (tform, pform))
                truemorph[-1] = predmorph[-1]
                j += 2
                i += 1
                numchanges += 1
            elif has_next_true and ''.join([tform, nsent[i+1][2]]) == pform:
                # This and the next true morpheme concatenate to the
                # predicted one; j stays put so the next true morpheme is
                # handled on the following iteration.
                if log:
                    print('\t\tFixing3 %s with %s' % (tform, pform))
                truemorph[-1] = predmorph[-1]
                i += 1
                numchanges += 1
            elif i > 0 and ''.join([nsent[i-1][2], tform]) == pform:
                # Second half of a split whose first half was the previous
                # true morpheme.
                if log:
                    print('\t\tFixing4 %s with %s' % (tform, pform))
                truemorph[-1] = predmorph[-1]
                j += 1
                i += 1
                numchanges += 1
            elif tform[:3] == pform[:3] and has_next_pred and \
                    osent[j+1][4] == 'S_PRN':
                if log:
                    print('\t\tFixing6 %s with %s' % (tform, pform))
                truemorph[-1] = predmorph[-1]
                j += 2
                i += 1
                numchanges += 1
            elif tform[:3] == pform[:3] and has_next_true and \
                    nsent[i+1][4] == 'S_PRN':
                if log:
                    print('\t\tFixing8 %s with %s' % (tform, pform))
                    print('\t\tFixing8 %s with %s' % (nsent[i+1][2], pform))
                truemorph[-1] = predmorph[-1]
                nsent[i+1][-1] = predmorph[-1]
                j += 1
                i += 2
                numchanges += 2
            elif has_next_true and has_next_pred and \
                    ''.join([tform, nsent[i+1][2]]) == ''.join([pform, osent[j+1][2]]):
                # Two-to-two: the pairs spell the same string but are split
                # at different points.
                if log:
                    print('\t\tFixing10 %s with %s' % (tform, pform))
                    print('\t\tFixing10 %s with %s' % (nsent[i+1][2], osent[j+1][2]))
                truemorph[-1] = predmorph[-1]
                nsent[i+1][-1] = osent[j+1][-1]
                j += 2
                i += 2
                numchanges += 2
            elif set([tform[:3], pform[:3]]) == set(['EM', 'AT']) and \
                    has_next_pred and has_next_true and \
                    nsent[i+1][4] == 'S_PRN' and osent[j+1][4] == 'S_PRN':
                if log:
                    print('\t\tFixing9 %s with %s' % (tform, pform))
                truemorph[-1] = predmorph[-1]
                j += 1
                i += 1
                numchanges += 1
            elif set([tform, pform]) == set(['ATH', 'AT']) and \
                    truemorph[4] == 'S_PRN':
                # NOTE(review): the original tested truemorph[4] == 'S_PRN'
                # twice; the second test may have been meant for
                # predmorph[4]. Behaviour is kept as it was.
                if log:
                    print('\t\tFixing13 %s with %s' % (tform, pform))
                truemorph[-1] = predmorph[-1]
                j += 1
                i += 1
                numchanges += 1
            elif truemorph[4] == 'IN' and has_next_true and \
                    nsent[i+1][4] == 'S_PRN':
                if log:
                    print('\t\tFixing7 %s with %s' % (tform, pform))
                    print('\t\tFixing7 %s with %s' % (nsent[i+1][-1], pform))
                truemorph[-1] = predmorph[-1]
                nsent[i+1][-1] = predmorph[-1]
                j += 1
                i += 2
                numchanges += 2
            elif tform == 'B' and truemorph[4] == 'PREPOSITION' and \
                    has_next_true and has_next_pred and \
                    predmorph[4] == 'IN' and osent[j+1][4] == 'S_PRN':
                if log:
                    print('\t\tFixing7 %s with %s' % (tform, pform))
                    print('\t\tFixing7 %s with %s' % (nsent[i+1][-1], pform))
                truemorph[-1] = predmorph[-1]
                nsent[i+1][-1] = predmorph[-1]
                j += 2
                i += 2
                numchanges += 2
            elif has_two_true and \
                    pform == ''.join([tform, nsent[i+1][2], nsent[i+2][2]]):
                # Three true morphemes concatenate to the predicted one.
                if log:
                    print('\t\tFixing12 %s with %s' % (tform, pform))
                    print('\t\tFixing12 %s with %s' % (nsent[i+1][-1], pform))
                    print('\t\tFixing12 %s with %s' % (nsent[i+2][-1], pform))
                truemorph[-1] = predmorph[-1]
                # All three morphemes are skipped and counted as changed,
                # so all three must actually be patched.
                nsent[i+1][-1] = predmorph[-1]
                nsent[i+2][-1] = predmorph[-1]
                j += 1
                i += 3
                numchanges += 3
            elif has_two_true and nsent[i+1][4] == 'IN' and \
                    nsent[i+2][4] == 'S_PRN':
                if log:
                    print('\t\tFixing11 %s with %s' % (tform, pform))
                    print('\t\tFixing11 %s with %s' % (nsent[i+1][-1], pform))
                    print('\t\tFixing11 %s with %s' % (nsent[i+2][-1], pform))
                truemorph[-1] = predmorph[-1]
                nsent[i+1][-1] = predmorph[-1]
                nsent[i+2][-1] = predmorph[-1]
                j += 1
                i += 3
                numchanges += 3
            elif has_two_true and ''.join([tform, nsent[i+2][2]]) == pform:
                # First and third true morphemes spell the predicted form.
                if log:
                    print('\t\tFixing7 %s with %s' % (tform, pform))
                    print('\t\tFixing7 %s with %s' % (nsent[i+1][-1], pform))
                    print('\t\tFixing7 %s with %s' % (nsent[i+2][-1], pform))
                truemorph[-1] = predmorph[-1]
                nsent[i+1][-1] = predmorph[-1]
                nsent[i+2][-1] = predmorph[-1]
                j += 1
                i += 3
                numchanges += 3
            elif tform == 'H' and i > 0 and j > 0 and \
                    has_next_true and has_next_pred and \
                    ''.join([osent[j-1][2], pform]) == \
                    ''.join([nsent[i-1][2], nsent[i+1][2]]):
                # An extra 'H' on the true side between two morphemes that
                # otherwise line up with the predicted pair. j > 0 keeps
                # osent[j-1] from wrapping round to the last morpheme.
                if log:
                    print('\t\tFixing5 %s with %s' % (tform, pform))
                truemorph[-1] = predmorph[-1]
                i += 1
                numchanges += 1
            else:
                # No rule applies: leave this true morpheme unpatched.
                i += 1
        return nsent, numchanges == n_true

    # The context manager guarantees the output is flushed and closed even
    # if a sentence raises part-way through.
    with open('train5k.hebtb.truegold_fixed.lattices', 'w') as outfile:
        for num, (osent, nsent) in enumerate(zip(osents, nsents)):
            print('At %s' % str(num))
            # Explicit lists so the comparison also works where map() is
            # lazy; equal lists imply equal lengths.
            oforms = [fget(morph) for morph in osent]
            nforms = [fget(morph) for morph in nsent]
            if oforms == nforms:
                out = fixsimple(osent, nsent)
                fixtypes['proper'] += 1
                success = True
            else:
                out, success = matchmiss(osent, nsent)
                fixtypes['match' if success else 'nomatch'] += 1
            if not success:
                print('Failed at %s' % str(num))
            outfile.write(depio.depstring(out))
    pprint(fixtypes)
    print('Total %s' % str(sum(fixtypes.values())))