def getalllabels(path):
    """Return the set of distinct labels used in the dependency file *path*.

    The label is read from index 3 of every word of every sentence yielded
    by ``depio.depread(path)``.
    """
    return set(word[3]
               for sentence in depio.depread(path)
               for word in sentence)
def printStats(path): def _ou(s): return s # return g_macroNamed[s] dLabel = {} setPOS = set([]) for sent in depio.depread(path): simplelabelanal(sent, dLabel, setPOS) print 'Set of labels' print ' '.join(dLabel.keys()) for label in dLabel: print label, '===' setDeps = set([]) dPair = dLabel.get(label, {}) dDepCount={} for head in dPair: headCount = 0 dDep = dPair.get(head, {}) for key in dDep.keys(): headCount += dDep[key] if not key in dDepCount: dDepCount[key] = 0 dDepCount[key]+=dDep[key] print head, '(', headCount, ')', ' : ', ' '.join([key+'('+str(dDep[key])+')' for key in dDep.keys()]) setDeps = setDeps.union(dDep.keys()) print print 'Set of heads: ', ' '.join(map(_ou, dPair.keys())) assert set(dDepCount.keys()) == setDeps print 'Set of depcounts: ', ' '.join([_ou(key)+'('+str(dDepCount[key])+')' for key in dDepCount.keys()]) print 'Set of deps: ', ' '.join(map(_ou, setDeps)) print print 'Set of nonheads: ', ' '.join(map(_ou, (setPOS-set(dPair.keys())))) print 'Set of nondeps: ', ' '.join(map(_ou, (setPOS-setDeps))) print
def randomdep(path, cutoff):
    """Randomly detach words from their heads and print each sentence.

    For every word whose head field (index 2) is not already ``'-1'``, a
    random draw below *cutoff* rewrites the head to ``'-1'``. Each sentence
    is then emitted with ``depio.depprint``.
    """
    for sent in depio.depread(path):
        for position, (w, p, h, l) in enumerate(sent):
            if h == '-1':
                # Already headless: no random number is drawn for it.
                continue
            if random.random() < cutoff:
                sent[position][2] = '-1'
        depio.depprint(sent)
def randomdep(path, cutoff):
    """Randomly detach words from their heads and print each sentence.

    For every word whose head field (index 2) is not already ``"-1"``, a
    random draw below *cutoff* rewrites the head to ``"-1"``. Each sentence
    is then emitted with ``depio.depprint``.
    """
    for sent in depio.depread(path):
        for position, (w, p, h, l) in enumerate(sent):
            if h == "-1":
                # Already headless: no random number is drawn for it.
                continue
            if random.random() < cutoff:
                sent[position][2] = "-1"
        depio.depprint(sent)
def mergeconllpos(conll_path, pos_path):
    """Overwrite field 4 of each CoNLL word with the tag from a POS file.

    Sentences are read in lockstep from *conll_path* (``depio.depread``) and
    *pos_path* (``posio.posread``). Both must have the same number of words
    per sentence, and field 1 of each CoNLL word must equal element 0 of the
    matching POS entry. Each merged sentence is printed with
    ``depio.depprint``.

    Raises:
        AssertionError: if the two files disagree on length or word form.
        StopIteration: if the POS file has fewer sentences than the CoNLL file.
    """
    pos = posio.posread(pos_path)
    for sent in depio.depread(conll_path):
        # next() works on both Python 2.6+ and Python 3 iterators,
        # unlike the .next() method.
        sentpos = next(pos)
        assert len(sent) == len(sentpos), \
            'sentence length mismatch: %d vs %d' % (len(sent), len(sentpos))
        for token, tagged in zip(sent, sentpos):
            assert token[1] == tagged[0], \
                'word mismatch: %r vs %r' % (token[1], tagged[0])
            token[4] = tagged[1]
        depio.depprint(sent)
def mergeconllpos(conll_path, pos_path):
    """Overwrite field 4 of each CoNLL word with the tag from a POS file.

    Sentences are read in lockstep from *conll_path* (``depio.depread``) and
    *pos_path* (``posio.posread``). Both must have the same number of words
    per sentence, and field 1 of each CoNLL word must equal element 0 of the
    matching POS entry. Each merged sentence is printed with
    ``depio.depprint``.

    Raises:
        AssertionError: if the two files disagree on length or word form.
        StopIteration: if the POS file has fewer sentences than the CoNLL file.
    """
    pos = posio.posread(pos_path)
    for sent in depio.depread(conll_path):
        # next() works on both Python 2.6+ and Python 3 iterators,
        # unlike the .next() method.
        sentpos = next(pos)
        assert len(sent) == len(sentpos), \
            'sentence length mismatch: %d vs %d' % (len(sent), len(sentpos))
        for token, tagged in zip(sent, sentpos):
            assert token[1] == tagged[0], \
                'word mismatch: %r vs %r' % (token[1], tagged[0])
            token[4] = tagged[1]
        depio.depprint(sent)
def headdepcount(path, counts):
    """Accumulate head/dependent arc frequencies from *path* into *counts*.

    For every word with a head (field 2 is not -1) the key
    ``(head_word, dep_word, head_pos, dep_pos, direction)`` is incremented in
    *counts*, where direction is ``'R'`` when the head index is greater than
    the dependent's index and ``'L'`` otherwise. *counts* is updated in
    place; nothing is returned.
    """
    # The file is read from the `path` argument. Previously the parameter was
    # ignored and a global/builtin named `input` was read instead.
    for deptree in depio.depread(path):
        for index, dep_word in enumerate(deptree):
            headindex = int(dep_word[2])
            if headindex == -1:
                continue
            head_word = deptree[headindex]
            head_direction = 'R' if headindex > index else 'L'
            key = (head_word[0], dep_word[0],
                   head_word[1], dep_word[1],
                   head_direction)
            counts[key] = counts.get(key, 0) + 1
def printStats(path): def _ou(s): return s # return g_macroNamed[s] dLabel = {} setPOS = set([]) for sent in depio.depread(path): simplelabelanal(sent, dLabel, setPOS) print 'Set of labels' print ' '.join(dLabel.keys()) for label in dLabel: print label, '===' setDeps = set([]) dPair = dLabel.get(label, {}) dDepCount = {} for head in dPair: headCount = 0 dDep = dPair.get(head, {}) for key in dDep.keys(): headCount += dDep[key] if not key in dDepCount: dDepCount[key] = 0 dDepCount[key] += dDep[key] print head, '(', headCount, ')', ' : ', ' '.join( [key + '(' + str(dDep[key]) + ')' for key in dDep.keys()]) setDeps = setDeps.union(dDep.keys()) print print 'Set of heads: ', ' '.join(map(_ou, dPair.keys())) assert set(dDepCount.keys()) == setDeps print 'Set of depcounts: ', ' '.join([ _ou(key) + '(' + str(dDepCount[key]) + ')' for key in dDepCount.keys() ]) print 'Set of deps: ', ' '.join(map(_ou, setDeps)) print print 'Set of nonheads: ', ' '.join( map(_ou, (setPOS - set(dPair.keys())))) print 'Set of nondeps: ', ' '.join(map(_ou, (setPOS - setDeps))) print
def maxmodifycount(path, condition): ret = {} for sent in depio.depread(path): counts = {} for index, word in enumerate(sent): head = int(word[2]) if head != -1 and condition(head, index): if not head in counts: counts[head] = 0 counts[head] += 1 for head in counts: count = counts[head] pos = sent[head][1] if count > ret.get(pos, 0): ret[pos] = count maxcnt = 0 for pos in ret: count = ret[pos] print pos, ':', count if count > maxcnt: maxcnt = count print 'Overall', ':', maxcnt
def maxmodifycount(path, condition): ret = {} for sent in depio.depread(path): counts = {} for index, word in enumerate(sent): head = int(word[2]) if head != -1 and condition(head, index): if not head in counts: counts[head]=0 counts[head] += 1 for head in counts: count = counts[head] pos=sent[head][1] if count > ret.get(pos, 0): ret[pos]=count maxcnt=0 for pos in ret: count = ret[pos] print pos, ':', count if count>maxcnt: maxcnt = count print 'Overall', ':', maxcnt
def writeCppCode(path): def _label(s): return 'PENN_DEP_' + s.upper() def _pos(s): return g_macroNamed[s] dLabel = {} setPOS = set([]) for sent in depio.depread(path): labelanal(sent, dLabel, setPOS) # write header print '#include "tags.h"' print '#ifdef LABELED' print '#include "dependency/label/penn.h"' print '#endif' print print 'namespace english {' print '#ifdef LABELED' print 'inline bool canAssignLabel(const vector< CTaggedWord<CTag,TAG_SEPARATOR> > &sent, const int &head, const int &dep, const CDependencyLabel&label) {' print ' assert(head==DEPENDENCY_LINK_NO_HEAD||head>=0); // correct head' print ' assert(dep>=0);' print ' // if the head word is none, only ROOT' print ' if (head==DEPENDENCY_LINK_NO_HEAD) {' print ' if (label.code()==PENN_DEP_ROOT) ' print ' return true;' print ' return false;' print ' }' print ' // for each case' print ' const unsigned &head_pos = sent[head].tag.code();' print ' const unsigned &dep_pos = sent[dep].tag.code();' print ' assert(head!=DEPENDENCY_LINK_NO_HEAD);' print ' if (label == PENN_DEP_ROOT) // now head is not DEPENDENCY_LINK_NO_HEAD' print ' return false;' # for each label nTotalRules = 0 nLabel = 0 for label in dLabel: # print condition if nLabel == 0: print " if (label==%s) {" % _label(label) else: print " else if (label==%s) {" % _label(label) nLabel += 1 # collect statistics dHeadCount = {} # head : count dDepCount = {} # dep : count nTotalCount = 0 # arc dEntry = dLabel.get(label, {}) # head, dep : count for key in dEntry: head = key[0] dep = key[1] if not head in dHeadCount: dHeadCount[head] = 0 dHeadCount[head] += dEntry[key] if not dep in dDepCount: dDepCount[dep] = 0 dDepCount[dep] += dEntry[key] nTotalCount += dEntry[key] # write head condition threshold = 1 # g_freqCutoff * nTotalCount nCount = 0 for pos in setPOS: if dHeadCount.get(pos, 0) < threshold: if nCount == 0: print " if ( head_pos==%s" % _pos(pos) else: print " || head_pos==%s" % _pos(pos) nCount += 1 for pos in setPOS: if dDepCount.get(pos, 0) < 
threshold: if nCount == 0: print " if ( dep_pos==%s" % _pos(pos) else: print " || dep_pos==%s" % _pos(pos) nCount += 1 if nCount > 0: print ' ) return false;' nTotalRules += nCount # finish condition print " }" # write footer print " // total number of rules are %d." % nTotalRules print " return true;" print '}' print '#endif' print print 'inline const bool hasLeftHead(const unsigned &tag) {' print ' return true;' print '}' print print 'inline const bool hasRightHead(const unsigned &tag) {' print ' return true;' print '}' print 'inline const bool canBeRoot(const unsigned &tag) {' print ' return true;' print '}' print '}'
'VBP': 'PENN_TAG_VERB_PRES', 'VBZ': 'PENN_TAG_VERB_THIRD_SINGLE', 'WDT': 'PENN_TAG_WDT', 'WP': 'PENN_TAG_WP', 'WP$': 'PENN_TAG_WP_DOLLAR', 'WRB': 'PENN_TAG_WRB' } def _ou(s): # return s return g_macroNamed[s] if __name__ == "__main__": dHead = {} dDep = {} setRoot = set() for sent in depio.depread(sys.argv[1]): diranal(sent, dHead, dDep, setRoot) print "head POS with left / right dep" for head_pos in dHead: print head_pos, dHead[head_pos][0], dHead[head_pos][1] print print "dep POS with left / right head" for pos in dDep: print pos, dDep[pos][0], dDep[pos][1] print print "the set root" print ' '.join(pos for pos in setRoot)
def get_set_elements(sent, morph_func):
    """Return ``(sentence_id, morph)`` pairs that are unique within a sentence.

    Args:
        sent: a pair ``(sentence_id, words)``, as produced by ``enumerate``
            over the sentences of a file.
        morph_func: maps one word to a hashable sequence (e.g. a tuple).

    Returns:
        A list with one ``(sentence_id, morph)`` tuple per word, in order.
        When the same morph occurs again in the sentence, the repeat gets
        its occurrence number appended as a string (``'2'``, ``'3'``, ...),
        so the pairs stay distinct when collected into a set.
    """
    sent_id, words = sent[0], sent[1]
    occurrences = {}
    elements = []
    for word in words:
        morph = morph_func(word)
        seen_before = occurrences.get(morph, 0)
        if seen_before > 0:
            numbered = tuple(list(morph) + [str(seen_before + 1)])
            elements.append((sent_id, numbered))
        else:
            elements.append((sent_id, morph))
        occurrences[morph] = seen_before + 1
    return elements


if __name__ == '__main__':
    # argv[1] is the predicted file, argv[2] the reference file.
    file_output = list(enumerate(depio.depread(sys.argv[1])))
    file_ref = list(enumerate(depio.depread(sys.argv[2])))
    print('\t'.join([
        'comparison-function', 'gold', 'pred', 'true-positive', 'precision',
        'recall', 'f1', 'em'
    ]))
    for morph_func_name, morph_func_opts in morph_funcs:
        morph_func = get_morph_func(morph_func_name)
        output_set = set(
            flatten([get_set_elements(s, morph_func) for s in file_output]))
        ref_set = set(
            flatten([get_set_elements(s, morph_func) for s in file_ref]))
        # Unpack in the generator instead of a tuple-parameter lambda,
        # which Python 3 no longer accepts.
        exact_match = Counter(x == y for x, y in zip(file_output, file_ref))
        exact_match_pct = float(exact_match[True]) / float(len(file_ref))
dep_pos = word[1] if dep_pos == ',' or dep_pos == '.' or dep_pos == '-LRB-' or dep_pos == '-RRB-' or dep_pos == ':' or dep_pos == '``' or dep_pos == '"' or dep_pos == '#' or dep_pos == '$': punct.add(dep_word) if __name__ == "__main__": if len(sys.argv) != 3: print "depanal.py options input >output" print "options: countfrag listarc findpunct" sys.exit(1) option = sys.argv[1] input = sys.argv[2] if option == "countfrag": count = 0 total = 0 for sent in depio.depread(input): count += countfrag(sent) total += 1 print "%d are fragmented from %d sentences." % (count, total) elif option == "listarc": arc = {} for sent in depio.depread(input): listanarc(sent, arc) for anarc in arc: print ' '.join(anarc), arc[anarc] elif option == "findpunct": punct = set() for sent in depio.depread(input): findpunct(sent, punct) for punc in punct: print punc
# NOTE(review): the functions below take `self` and call each other through
# it, so they are presumably methods of a class whose header is outside this
# fragment -- re-indent under that class when splicing back.
def printrawnode(self, node):
    """Return the tokens of the subtree at *node*, left children first."""
    left = [token
            for child in node.left_children
            for token in self.printrawnode(child)]
    right = [token
             for child in node.right_children
             for token in self.printrawnode(child)]
    return left + [node.token] + right


def printposnode(self, node):
    """Return ``[token, pos]`` pairs of the subtree at *node*, left first."""
    left = [pair
            for child in node.left_children
            for pair in self.printposnode(child)]
    right = [pair
             for child in node.right_children
             for pair in self.printposnode(child)]
    return left + [[node.token, node.pos]] + right


def toRaw(self):
    """Return the tokens under ``self.root`` joined by single spaces."""
    tokens = self.printrawnode(self.root)
    return ' '.join(tokens)


def toPOS(self):
    """Return ``token|pos`` items under ``self.root`` joined by spaces."""
    pairs = self.printposnode(self.root)
    return ' '.join('|'.join(pair) for pair in pairs)


if __name__ == '__main__':
    # Build a tree for every sentence of the input file and print it.
    for words in depio.depread(sys.argv[1]):
        dep = CDep(words)
        print(dep)
#!/usr/bin/python
"""Copy one sentence of a dependency file into its own file.

Usage: <script> input sentnum

The sentence at index ``sentnum`` of ``input`` is written, one tab-joined
word per line followed by an empty line, to ``<sentnum>.<input>``.
"""
import sys
import depio

sentnum = int(sys.argv[2])
fnames = [sys.argv[1]]
for fname in fnames:
    sents = list(depio.depread(fname))
    # The context manager closes the output file even if a write fails.
    with open("%d.%s" % (sentnum, fname), 'w') as out:
        for outl in sents[sentnum]:
            out.write('\t'.join(outl) + '\n')
        out.write('\n')
def malt2zpar(path):
    """Print every sentence of *path* with each head index lowered by one.

    The head is field 2 of each word; it is parsed as an integer,
    decremented, and stored back as a string before the sentence is passed
    to ``depio.depprint``.
    """
    for sentence in depio.depread(path):
        for token in sentence:
            shifted_head = int(token[2]) - 1
            token[2] = str(shifted_head)
        depio.depprint(sentence)
ref_word = reference[index] assert word[0] == ref_word[0] if g_reP.match(word[0]): continue if word[2] == ref_word[2]: correct_head += 1 if word[3] == ref_word[3]: correct_label += 1 else: total_uem = 0 total += 1 return correct_head, correct_label, total, total_uem if __name__ == '__main__': file_output = depio.depread(sys.argv[1]) file_ref = depio.depread(sys.argv[2]) total_sent = 0 total_uem = 0 total = 0 correct_head = 0 correct_label = 0 for output in file_output: ref = file_ref.next() ret = eval(output, ref) correct_head += ret[0] correct_label += ret[1] total += ret[2] total_uem += ret[3] total_sent += 1 print float(correct_head) / total, float(correct_label) / total, float(
# Rewrites the word form (field 0) of sentences read from argv[1] using a
# replace.CReplace object built from argv[2], and prints every sentence with
# depio.depprint.
import sys
import depio
import replace

# how many changed
N = 10000
r = replace.CReplace(sys.argv[2])
n = 0
for sent in depio.depread(sys.argv[1]):
    # Only rewrite while the budget N has not been used up; sentences past
    # the budget are printed unchanged.
    if n < N:
        for i in range(len(sent)):
            sent[i][0] = r.replace(sent[i][0])
        # NOTE(review): the original indentation is lost in this view; the
        # counter is assumed to advance once per rewritten sentence -- confirm
        # it was not meant to advance once per word.
        n += 1
    depio.depprint(sent)
return self.printtree(self.root) def printrawnode(self, node): retval = [] for left_child in node.left_children: retval.extend(self.printrawnode(left_child)) retval.append(node.token) for right_child in node.right_children: retval.extend(self.printrawnode(right_child)) return retval def printposnode(self, node): retval = [] for left_child in node.left_children: retval.extend(self.printposnode(left_child)) retval.append([node.token, node.pos]) for right_child in node.right_children: retval.extend(self.printposnode(right_child)) return retval def toRaw(self): return ' '.join(self.printrawnode(self.root)) def toPOS(self): return ' '.join(['|'.join(word) for word in self.printposnode(self.root)]) if __name__ == '__main__': for words in depio.depread(sys.argv[1]): dep = CDep(words) print dep
print 0, 0, dept.toRaw() elif count > float(total) * 0.3: print 0, dept.toRaw() else: print dept.toRaw() #======================================== #======================================== if __name__ == '__main__': opts, args = getopt.getopt(sys.argv[1:], "") if len(args) < 1: print 'adapt input align' sys.exit(0) # get parameter if len(args) != 2: print "The alignment file must be provided" sys.exit(0) sInput = args[0] sAlign = args[1] # input alignFile = brute.readAlign(sAlign) for tree in depio.depread(sInput): align = alignFile.next() filter(tree, align)
import sys import depio if __name__ == "__main__": if len(sys.argv) != 2: print "addindex.py input >output" sys.exit(1) input = sys.argv[1] for sent in depio.depread(input): index = 0 for word in sent: print "\t".join([str(index)] + word) index += 1 print
alignFile = None align = None model = None if sInput == 'a': if len(args) != 3: print "The alignment file must be provided with -ia" sys.exit(0) model = readModel(args[1], True) alignFile = readAlign(args[2]) elif sInput == 'c': model = None #readModel(args[1], False) else: print 'The input format is invalid' sys.exit(0) for tree in depio.depread(args[0]): if alignFile: align = alignFile.next() if align == None: depio.depprint(tree) assert align != None dept = dep.CDep(tree) reorder(dept, align, model, bDebug) if sOutput == 'd': print dept elif sOutput == 'p': print dept.toPOS() elif sOutput == 'r': print dept.toRaw() elif sOutput == 'i': print printOrder(dept)
def malt2zpar(path):
    """Print every sentence of *path* with each head index lowered by one.

    The head is field 2 of each word; it is parsed as an integer,
    decremented, and stored back as a string before the sentence is passed
    to ``depio.depprint``.
    """
    for sentence in depio.depread(path):
        for token in sentence:
            shifted_head = int(token[2]) - 1
            token[2] = str(shifted_head)
        depio.depprint(sentence)
def toSuperTag(tree): return toSuperTagForNode(tree.root) def encode(hl, hr, dl, dr, rt): return str( (int(hl)<<0) | (int(hr)<<1) | (int(dl)<<2) | (int(dr)<<3) | (int(rt)<<4) ) if __name__== '__main__': opts, args = getopt.getopt(sys.argv[1:], "o:") if len(args) < 1: print 'reorder [-o hl|hr|ml|mr|h|m|hm] input' sys.exit(0) sOutput = 'hm' for opt, val in opts: if opt == '-o': sOutput = val for sent in depio.depread(args[0]): dept = dep.CDep(sent) supertags = toSuperTag(dept) if sOutput == 'hm': print ' '.join(['|'.join([word[0], word[1], encode(word[2], word[3], word[4], word[5], word[6])]) for word in supertags]) elif sOutput == 'h': print ' '.join(['|'.join([word[0], word[1], encode(word[2], word[3], 0, 0, 0)]) for word in supertags]) elif sOutput == 'm': print ' '.join(['|'.join([word[0], word[1], encode(0, 0, word[4], word[5], 0)]) for word in supertags]) elif sOutput == 'hl': print ' '.join(['|'.join([word[0], word[1], word[2]]) for word in supertags]) elif sOutput == 'hr': print ' '.join(['|'.join([word[0], word[1], word[3]]) for word in supertags]) elif sOutput == 'ml': print ' '.join(['|'.join([word[0], word[1], word[4]]) for word in supertags]) elif sOutput == 'mr':
#assert assert word[1] == ref_word[1] #if if g_reP.match( word[1] ) : #continue continue #if if word[6] == ref_word[6]: #correct_head correct_head += 1 #if if word[7] == ref_word[7]: #correct_label correct_label += 1 #else else: #total_uem total_uem = 0 #total total += 1 return correct_head, correct_label, total, total_uem, incorrect_head, incorrect_label, missing_head, missing_label, len( list(filter(filt_unmapped, reference))) if __name__ == '__main__': file_output = list(depio.depread(sys.argv[1])) file_ref = list(depio.depread(sys.argv[2])) total_sent = 0 total_uem = 0 total = 0 total_gold = 0 correct_head = 0 incorrect_head = 0 correct_label = 0 incorrect_label = 0 missing_head = 0 missing_label = 0 for ref, output in zip(file_ref, file_output): # ref = file_ref.next() ret = eval(output, ref) correct_head += ret[0]
for index, word in enumerate(output): ref_word = reference[index] assert word[0] == ref_word[0] if g_reP.match( word[0] ) : continue if word[2] == ref_word[2]: correct_head += 1 if word[3] == ref_word[3]: correct_label += 1 else: total_uem = 0 total += 1 return correct_head, correct_label, total, total_uem if __name__ == '__main__': file_output = depio.depread(sys.argv[1]) file_ref = depio.depread(sys.argv[2]) total_sent = 0 total_uem = 0 total = 0 correct_head = 0 correct_label =0 for output in file_output: ref = file_ref.next() ret = eval(output, ref) correct_head += ret[0] correct_label += ret[1] total += ret[2] total_uem += ret[3] total_sent += 1 print float(correct_head)/total, float(correct_label)/total, float(total_uem)/total_sent
def encode(hl, hr, dl, dr, rt):
    """Pack five flags into one integer and return it as a decimal string.

    Each argument is converted with ``int`` and OR-ed in at its own bit
    position: hl at bit 0, hr at bit 1, dl at bit 2, dr at bit 3, rt at
    bit 4.
    """
    code = 0
    for shift, flag in enumerate((hl, hr, dl, dr, rt)):
        code |= int(flag) << shift
    return str(code)


if __name__ == '__main__':
    opts, args = getopt.getopt(sys.argv[1:], "o:")
    if len(args) < 1:
        print('reorder [-o hl|hr|ml|mr|h|m|hm] input')
        sys.exit(0)

    # Output format; 'hm' unless overridden with -o.
    sOutput = 'hm'
    for opt, val in opts:
        if opt == '-o':
            sOutput = val

    for sent in depio.depread(args[0]):
        dept = dep.CDep(sent)
        supertags = toSuperTag(dept)
        if sOutput == 'hm':
            # All five flags of each word go into the code.
            items = [
                '|'.join([word[0], word[1],
                          encode(word[2], word[3], word[4], word[5], word[6])])
                for word in supertags
            ]
            print(' '.join(items))
        elif sOutput == 'h':
            # Only the first two flags; the rest are zeroed.
            items = [
                '|'.join([word[0], word[1],
                          encode(word[2], word[3], 0, 0, 0)])
                for word in supertags
            ]
            print(' '.join(items))
def writeCppCode(path): def _label(s): return 'PENN_DEP_'+s.upper() def _pos(s): return g_macroNamed[s] dLabel = {} setPOS = set([]) for sent in depio.depread(path): labelanal(sent, dLabel, setPOS) # write header print '#include "tags.h"' print '#ifdef LABELED' print '#include "dependency/label/penn.h"' print '#endif' print print 'namespace english {' print '#ifdef LABELED' print 'inline bool canAssignLabel(const vector< CTaggedWord<CTag,TAG_SEPARATOR> > &sent, const int &head, const int &dep, const CDependencyLabel&label) {' print ' assert(head==DEPENDENCY_LINK_NO_HEAD||head>=0); // correct head' print ' assert(dep>=0);' print ' // if the head word is none, only ROOT' print ' if (head==DEPENDENCY_LINK_NO_HEAD) {' print ' if (label.code()==PENN_DEP_ROOT) ' print ' return true;' print ' return false;' print ' }' print ' // for each case' print ' const unsigned &head_pos = sent[head].tag.code();' print ' const unsigned &dep_pos = sent[dep].tag.code();' print ' assert(head!=DEPENDENCY_LINK_NO_HEAD);' print ' if (label == PENN_DEP_ROOT) // now head is not DEPENDENCY_LINK_NO_HEAD' print ' return false;' # for each label nTotalRules=0 nLabel=0 for label in dLabel: # print condition if nLabel == 0: print " if (label==%s) {" % _label(label) else: print " else if (label==%s) {" % _label(label) nLabel +=1 # collect statistics dHeadCount={} # head : count dDepCount={} # dep : count nTotalCount=0 # arc dEntry = dLabel.get(label, {}) # head, dep : count for key in dEntry: head = key[0] dep = key[1] if not head in dHeadCount: dHeadCount[head] = 0 dHeadCount[head] += dEntry[key] if not dep in dDepCount: dDepCount[dep] = 0 dDepCount[dep] += dEntry[key] nTotalCount += dEntry[key] # write head condition threshold = 1 # g_freqCutoff * nTotalCount nCount=0 for pos in setPOS: if dHeadCount.get(pos, 0) < threshold: if nCount == 0: print " if ( head_pos==%s" % _pos(pos) else: print " || head_pos==%s" % _pos(pos) nCount += 1 for pos in setPOS: if dDepCount.get(pos, 0) < threshold: if 
nCount == 0: print " if ( dep_pos==%s" % _pos(pos) else: print " || dep_pos==%s" % _pos(pos) nCount += 1 if nCount>0: print ' ) return false;' nTotalRules+=nCount # finish condition print " }" # write footer print " // total number of rules are %d." % nTotalRules print " return true;" print '}' print '#endif' print print 'inline const bool hasLeftHead(const unsigned &tag) {' print ' return true;' print '}' print print 'inline const bool hasRightHead(const unsigned &tag) {' print ' return true;' print '}' print 'inline const bool canBeRoot(const unsigned &tag) {' print ' return true;' print '}' print '}'
def run(): """bla""" old = 'train5k.hebtb.gold.lattices' new = 'train5k.hebtb.truegold.lattices' osents = list(depio.depread(old)) nsents = list(depio.depread(new)) zipped = zip(osents, nsents) outfile = open('train5k.hebtb.truegold_fixed.lattices', 'w') fixtypes = defaultdict(int) def fixsimple(osent, nsent): """Fix simple""" zosent, znsent = zip(*osent), zip(*nsent) znsent[-1] = zosent[-1] nsent = zip(*znsent) return nsent log = True def matchmiss(osent, nsent): j = 0 i = 0 numchanges = 0 while i < len(nsent): truemorph = nsent[i] predmorph = osent[j] if log: print '\tAt %s and %s' % (predmorph[2], truemorph[2]) if predmorph[2] == truemorph[2]: if log: print '\t\tFixing1 %s with %s' % (truemorph[2], predmorph[2]) truemorph[-1] = predmorph[-1] j += 1 i += 1 numchanges += 1 elif j < len(osent)-1 and ''.join([predmorph[2], osent[j+1][2]]) == truemorph[2]: if log: print '\t\tFixing2 %s with %s' % (truemorph[2], predmorph[2]) truemorph[-1] = predmorph[-1] j += 2 i += 1 numchanges += 1 elif j < len(osent)-1 and ''.join([predmorph[2], osent[j+1][2][1:]]) == truemorph[2]: if log: print '\t\tFixing2 %s with %s' % (truemorph[2], predmorph[2]) truemorph[-1] = predmorph[-1] j += 2 i += 1 numchanges += 1 elif i < len(nsent)-1 and ''.join([truemorph[2], nsent[i+1][2]]) == predmorph[2]: if log: print '\t\tFixing3 %s with %s' % (truemorph[2], predmorph[2]) truemorph[-1] = predmorph[-1] i += 1 numchanges += 1 elif i > 0 and ''.join([nsent[i-1][2], truemorph[2]]) == predmorph[2]: if log: print '\t\tFixing4 %s with %s' % (truemorph[2], predmorph[2]) truemorph[-1] = predmorph[-1] j += 1 i += 1 numchanges += 1 elif truemorph[2][:3] == predmorph[2][:3] and len(osent)>j+1 and \ osent[j+1][4] == 'S_PRN': if log: print '\t\tFixing6 %s with %s' % (truemorph[2], predmorph[2]) truemorph[-1] = predmorph[-1] j += 2 i += 1 numchanges += 1 elif truemorph[2][:3] == predmorph[2][:3] and len(nsent)>i+1 and \ nsent[i+1][4] == 'S_PRN': if log: print '\t\tFixing8 %s with %s' % (truemorph[2], 
predmorph[2]) print '\t\tFixing8 %s with %s' % (nsent[i+1][2], predmorph[2]) truemorph[-1] = predmorph[-1] nsent[i+1][-1] = predmorph[-1] j += 1 i += 2 numchanges += 2 elif len(nsent)> i+1 and len(osent) > j+1 and \ ''.join([truemorph[2], nsent[i+1][2]]) == ''.join([predmorph[2], osent[j+1][2]]): if log: print '\t\tFixing10 %s with %s' % (truemorph[2], predmorph[2]) print '\t\tFixing10 %s with %s' % (nsent[i+1][2], osent[j+1][2]) truemorph[-1] = predmorph[-1] nsent[i+1][-1] = osent[j+1][-1] j += 2 i += 2 numchanges += 2 elif set([truemorph[2][:3], predmorph[2][:3]]) == set(['EM', 'AT']) and \ len(osent)>j+1 and len(nsent)>i+1 and \ nsent[i+1][4] == 'S_PRN' and osent[j+1][4] == 'S_PRN': if log: print '\t\tFixing9 %s with %s' % (truemorph[2], predmorph[2]) truemorph[-1] = predmorph[-1] j += 1 i += 1 numchanges += 1 elif set([truemorph[2], predmorph[2]]) == set(['ATH', 'AT']) and \ truemorph[4] == 'S_PRN' and truemorph[4] == 'S_PRN': if log: print '\t\tFixing13 %s with %s' % (truemorph[2], predmorph[2]) truemorph[-1] = predmorph[-1] j += 1 i += 1 numchanges += 1 elif truemorph[4] == 'IN' and len(nsent) > i+1 and nsent[i+1][4] == 'S_PRN': if log: print '\t\tFixing7 %s with %s' % (truemorph[2], predmorph[2]) print '\t\tFixing7 %s with %s' % (nsent[i+1][-1], predmorph[2]) truemorph[-1] = predmorph[-1] nsent[i+1][-1] = predmorph[-1] j += 1 i += 2 numchanges += 2 elif truemorph[2] == 'B' and truemorph[4] == 'PREPOSITION' and \ len(nsent) > i+1 and len(osent) > j+1 and \ predmorph[4] == 'IN' and osent[j+1][4] == 'S_PRN': if log: print '\t\tFixing7 %s with %s' % (truemorph[2], predmorph[2]) print '\t\tFixing7 %s with %s' % (nsent[i+1][-1], predmorph[2]) truemorph[-1] = predmorph[-1] nsent[i+1][-1] = predmorph[-1] j += 2 i += 2 numchanges += 2 elif len(nsent) > i+2 and predmorph[2] == ''.join([truemorph[2], nsent[i+1][2], nsent[i+2][2]]): if log: print '\t\tFixing12 %s with %s' % (truemorph[2], predmorph[2]) print '\t\tFixing12 %s with %s' % (nsent[i+1][-1], predmorph[2]) 
print '\t\tFixing12 %s with %s' % (nsent[i+2][-1], predmorph[2]) truemorph[-1] = predmorph[-1] j += 1 i += 3 numchanges += 3 elif len(nsent) > i+2 and nsent[i+1][4] == 'IN' and nsent[i+2][4] == 'S_PRN': if log: print '\t\tFixing11 %s with %s' % (truemorph[2], predmorph[2]) print '\t\tFixing11 %s with %s' % (nsent[i+1][-1], predmorph[2]) print '\t\tFixing11 %s with %s' % (nsent[i+2][-1], predmorph[2]) truemorph[-1] = predmorph[-1] nsent[i+1][-1] = predmorph[-1] nsent[i+2][-1] = predmorph[-1] j += 1 i += 3 numchanges += 3 elif len(nsent) > i+2 and ''.join([truemorph[2], nsent[i+2][2]]) == predmorph[2]: if log: print '\t\tFixing7 %s with %s' % (truemorph[2], predmorph[2]) print '\t\tFixing7 %s with %s' % (nsent[i+1][-1], predmorph[2]) print '\t\tFixing7 %s with %s' % (nsent[i+2][-1], predmorph[2]) truemorph[-1] = predmorph[-1] nsent[i+1][-1] = predmorph[-1] nsent[i+2][-1] = predmorph[-1] j += 1 i += 3 numchanges += 3 elif truemorph[2] == 'H' and i>0 and len(nsent)>i+1 and len(osent)>j+1 and \ ''.join([osent[j-1][2], predmorph[2]]) == ''.join([nsent[i-1][2], nsent[i+1][2]]): if log: print '\t\tFixing5 %s with %s' % (truemorph[2], predmorph[2]) i += 1 truemorph[-1] = predmorph[-1] numchanges += 1 else: i += 1 return nsent, numchanges == len(nsent) for num, (osent, nsent) in enumerate(zipped): print 'At %s' % str(num) fget = itemgetter(2) oforms, nforms = map(fget, osent), map(fget, nsent) out = nsent success = False if len(osent) == len(nsent) and oforms == nforms: out = fixsimple(osent, nsent) fixtypes['proper'] += 1 success = True else: out, success = matchmiss(osent, nsent) fixtypes['match' if success else 'nomatch'] += 1 if not success: print 'Failed at %s' % str(num) outfile.write(depio.depstring(out)) pprint(fixtypes) print 'Total %s' % str(sum(fixtypes.values()))