def getbasenps(pospath, pennpath):
    """Print the bracket-delimited index spans found for each sentence.

    Sentences are read from ``posio.posread(pospath)`` and a parallel stream
    of symbols from ``reader.readsymbols(pennpath)``.  While skipping symbols
    that do not correspond to the current word, a ``'['`` symbol opens a
    ``[start, None]`` entry and a ``']'`` symbol fills in the end index of the
    last entry.  The resulting list of ``[start, end]`` pairs is printed once
    per sentence; nothing is returned.

    NOTE(review): the block nesting below was reconstructed from a
    whitespace-collapsed copy of this function -- confirm it against the
    original layout before relying on it.
    """
    symbols = reader.readsymbols(pennpath)
    # Counter printed in the diagnostics below and incremented per sentence;
    # why it starts at 9 is not visible here -- TODO confirm.
    n = 9
    for possent in posio.posread(pospath):
        base_nps = []
        for index in range(len(possent)):
            # possymbol = '/'.join(possent[index])
            # Bracket characters in the symbol are rewritten to
            # -LRB-/-RRB-/-LCB-/-RCB- before it is compared with the word.
            symbol = symbols.next().replace('(', '-LRB-').replace(')', '-RRB-').replace('{', '-LCB-').replace('}', '-RCB-')
            lastn = 0
            # Consume symbols until the text before the symbol's last '/'
            # equals field 0 of the current item of the sentence.
            while symbol[:symbol.rfind('/')] != possent[index][0]:
                # Print the sentence and the counter only for the first
                # skipped symbol at this position.
                if lastn != n:
                    posio.posprint(possent)
                    print n
                    lastn = n
                print symbol
                if symbol == '[':
                    # A new span may only open when the previous one is closed.
                    assert not base_nps or base_nps[-1][1] != None
                    base_nps.append([index, None])
                elif symbol == ']':
                    if base_nps:
                        assert base_nps and base_nps[-1][1] == None
                        base_nps[-1][1] = index
                symbol = symbols.next().replace('(', '-LRB-').replace(')', '-RRB-').replace('{', '-LCB-').replace('}', '-RCB-')
        # A span still open after the last word is closed at the last index.
        if base_nps and base_nps[-1][1] == None:
            # symbol = symbols.next().replace('(', '-LRB-').replace(')', '-RRB-').replace('{', '-LCB-').replace('}', '-RCB-')
            # assert symbol == ']'
            base_nps[-1][1] = index
        assert not base_nps or base_nps[-1][1]!=None
        print base_nps
        n += 1
def randompos(path, cutoff):
    """Print every sentence of *path* with some second fields randomly masked.

    For each two-field item of each sentence read by ``posio.posread(path)``
    one ``random.random()`` draw is made; when the draw is below *cutoff* the
    item's second field is overwritten with ``'_NONE-'``.  Each sentence is
    then written with ``posio.posprint``.  Nothing is returned.
    """
    for sentence in posio.posread(path):
        # The unpacking keeps the requirement that every item has two fields.
        for position, (_first, _second) in enumerate(sentence):
            if random.random() < cutoff:
                sentence[position][1] = '_NONE-'
        posio.posprint(sentence)
def tagdict(path, dDict):
    """Accumulate nested occurrence counts from *path* into *dDict*.

    For every item of every sentence read by ``posio.posread(path)``,
    ``dDict[item[0]][item[1]]`` is increased by one, creating the inner
    dictionary and the counter on first use (fields 0 and 1 are presumably
    word and tag -- confirm against posio).  *dDict* is updated in place;
    nothing is returned.
    """
    import posio
    for sentence in posio.posread(path):
        for entry in sentence:
            per_first = dDict.setdefault(entry[0], {})
            second = entry[1]
            per_first[second] = per_first.get(second, 0) + 1
def mergeconllpos(conll_path, pos_path):
    """Copy the tags of *pos_path* into the sentences of *conll_path*.

    Sentences from ``depio.depread(conll_path)`` are paired, in order, with
    sentences from ``posio.posread(pos_path)``.  Both must have the same
    length and field 1 of each dependency row must equal field 0 of the
    matching tagged item (checked with ``assert``).  Field 4 of the row is
    then replaced by field 1 of the tagged item and the sentence is written
    with ``depio.depprint``.
    """
    tagged_sentences = posio.posread(pos_path)
    for dep_sentence in depio.depread(conll_path):
        tagged = next(tagged_sentences)
        assert len(dep_sentence) == len(tagged)
        for position, row in enumerate(dep_sentence):
            assert row[1] == tagged[position][0]
            row[4] = tagged[position][1]
        depio.depprint(dep_sentence)
def mergeconllpos(conll_path, pos_path):
    """Copy the tags of *pos_path* into the sentences of *conll_path*.

    Sentences from ``depio.depread(conll_path)`` are paired, in order, with
    sentences from ``posio.posread(pos_path)``.  Both must have the same
    length and field 1 of each dependency row must equal field 0 of the
    matching tagged item (checked with ``assert``).  Field 4 of the row is
    then replaced by field 1 of the tagged item and the sentence is written
    with ``depio.depprint``.
    """
    tagged_sentences = posio.posread(pos_path)
    for dep_sentence in depio.depread(conll_path):
        tagged = next(tagged_sentences)
        assert len(dep_sentence) == len(tagged)
        for position, row in enumerate(dep_sentence):
            assert row[1] == tagged[position][0]
            row[4] = tagged[position][1]
        depio.depprint(dep_sentence)
def updatepos(path, pospath, sSep):
    """Print each tree of *path* with its token names taken from *pospath*.

    Every line of *path* is loaded into a ``binarize.CBinarizedTreeNode`` and
    paired with the next sentence from ``posio.posread(pospath, sSep)``.  The
    tree is walked left child first; each node whose ``type`` is ``'token'``
    gets its ``name`` set to field 1 of the next unused item of the sentence.
    The updated tree is then printed.

    Raises StopIteration if *pospath* yields fewer sentences than *path* has
    lines, and IndexError if a tree has more token nodes than its sentence
    has items.
    """
    def updatenode(node, sent):
        # Consumes items from the front of *sent* as token nodes are visited.
        if node.type == 'token':
            node.name = sent[0][1]
            sent.pop(0)
        else:
            updatenode(node.left_child, sent)
            if node.right_child:
                updatenode(node.right_child, sent)

    # The with-block closes the tree file even when loading a line fails;
    # the original left the handle open.
    with open(path) as tree_file:
        pos_it = posio.posread(pospath, sSep)
        for line in tree_file:
            pos = next(pos_it)
            srcnode = binarize.CBinarizedTreeNode()
            srcnode.load(line)
            updatenode(srcnode, pos)
            print(srcnode)
def updatepos(path, pospath, sSep, output):
    """Write each tree of *path* to *output* with token names from *pospath*.

    Every line of *path* (opened as UTF-8) is loaded into a
    ``binarize.CBinarizedTreeNode`` and paired with the next sentence from
    ``posio.posread(pospath, sSep)``.  The tree is walked left child first;
    each node whose ``type`` is ``'token'`` gets its ``name`` set to field 1
    of the next unused item of the sentence.  The result of the tree's
    ``utf8print()`` plus a newline is written to *output*, which only needs a
    ``write`` method.

    Raises StopIteration if *pospath* yields fewer sentences than *path* has
    lines, and IndexError if a tree has more token nodes than its sentence
    has items.
    """
    def updatenode(node, sent):
        # Consumes items from the front of *sent* as token nodes are visited.
        if node.type == 'token':
            node.name = sent[0][1]
            sent.pop(0)
        else:
            updatenode(node.left_child, sent)
            if node.right_child:
                updatenode(node.right_child, sent)

    # The with-block closes the tree file even when loading a line fails;
    # the original left the handle open.
    with codecs.open(path, encoding='utf-8') as tree_file:
        pos_it = posio.posread(pospath, sSep)
        for line in tree_file:
            pos = next(pos_it)
            srcnode = binarize.CBinarizedTreeNode()
            srcnode.load(line)
            updatenode(srcnode, pos)
            output.write(srcnode.utf8print() + "\n")
def updatepos(path, pospath, sSep, output):
    """Write each tree of *path* to *output* with token names from *pospath*.

    Every line of *path* (opened as UTF-8) is loaded into a
    ``binarize.CBinarizedTreeNode`` and paired with the next sentence from
    ``posio.posread(pospath, sSep)``.  The tree is walked left child first;
    each node whose ``type`` is ``'token'`` gets its ``name`` set to field 1
    of the next unused item of the sentence.  The result of the tree's
    ``utf8print()`` plus a newline is written to *output*, which only needs a
    ``write`` method.

    Raises StopIteration if *pospath* yields fewer sentences than *path* has
    lines, and IndexError if a tree has more token nodes than its sentence
    has items.
    """
    def updatenode(node, sent):
        # Consumes items from the front of *sent* as token nodes are visited.
        if node.type == 'token':
            node.name = sent[0][1]
            sent.pop(0)
        else:
            updatenode(node.left_child, sent)
            if node.right_child:
                updatenode(node.right_child, sent)

    # The with-block closes the tree file even when loading a line fails;
    # the original left the handle open.
    with codecs.open(path, encoding='utf-8') as tree_file:
        pos_it = posio.posread(pospath, sSep)
        for line in tree_file:
            pos = next(pos_it)
            srcnode = binarize.CBinarizedTreeNode()
            srcnode.load(line)
            updatenode(srcnode, pos)
            output.write(srcnode.utf8print() + "\n")
# Driver fragment: reads the options, opens the readers and sums the
# per-sentence results of eval().  The names opts, args, ref_file, out_file,
# builddic and eval are expected to be defined before this point.
# NOTE(review): the block nesting below was reconstructed from a
# whitespace-collapsed copy -- in particular, confirm that the second
# "verbose = True" belongs to the '-t' branch.
third = None
for opt in opts:
    if opt[0] == '-v':
        verbose = True
    if opt[0] == '-t':
        # The option value is kept for now and replaced by a reader below.
        third = opt[1]
        verbose = True
if verbose:
    #if v:
    print 'Word correct system'
    print '='*30
dic = {}
words = {}
# With exactly three positional arguments, the third one is handed to
# builddic together with the two dictionaries created above.
if len(args) == 3:
    builddic(args[2], dic, words)
ref = posio.posread(ref_file)
out = posio.posread(out_file)
if third:
    third = posio.posread(third)
total = 0
correct = 0
oov = 0
sent_third = None
# The reference (and the third reader, when given) is advanced once per
# output sentence; sent_third stays None when no '-t' option was given.
for sent in out:
    sent_ref = ref.next()
    if third:
        sent_third = third.next()
    # eval is called with six arguments, so it cannot be the builtin;
    # presumably a function defined elsewhere in this file -- confirm.
    res = eval(sent, sent_ref, sent_third, dic, words, verbose)
    total += res[0]
    correct += res[1]
    oov += res[2]
# Driver fragment: reads the options, opens the readers and sums the
# per-sentence results of eval().  The names opts, args, ref_file, out_file,
# builddic and eval are expected to be defined before this point.
# NOTE(review): the block nesting below was reconstructed from a
# whitespace-collapsed copy -- in particular, confirm that the second
# "verbose = True" belongs to the '-t' branch.
third = None
for opt in opts:
    if opt[0] == '-v':
        verbose = True
    if opt[0] == '-t':
        # The option value is kept for now and replaced by a reader below.
        third = opt[1]
        verbose = True
if verbose:
    #if v:
    print 'Word correct system'
    print '=' * 30
dic = {}
words = {}
# With exactly three positional arguments, the third one is handed to
# builddic together with the two dictionaries created above.
if len(args) == 3:
    builddic(args[2], dic, words)
ref = posio.posread(ref_file)
out = posio.posread(out_file)
if third:
    third = posio.posread(third)
total = 0
correct = 0
oov = 0
sent_third = None
# The reference (and the third reader, when given) is advanced once per
# output sentence; sent_third stays None when no '-t' option was given.
for sent in out:
    sent_ref = ref.next()
    if third:
        sent_third = third.next()
    # eval is called with six arguments, so it cannot be the builtin;
    # presumably a function defined elsewhere in this file -- confirm.
    res = eval(sent, sent_ref, sent_third, dic, words, verbose)
    total += res[0]
    correct += res[1]
    oov += res[2]
def tagdicttup(path, dDict):
    """Count ``(field 0, field 1)`` pairs from *path* into *dDict*.

    For every item of every sentence read by ``posio.posread(path)`` the
    counter stored under the tuple ``(item[0], item[1])`` is increased by
    one, starting from zero for unseen pairs.  *dDict* is updated in place;
    nothing is returned.
    """
    import posio
    for sentence in posio.posread(path):
        for entry in sentence:
            pair = (entry[0], entry[1])
            if pair in dDict:
                dDict[pair] = dDict[pair] + 1
            else:
                dDict[pair] = 1
import os
import sys
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../pos'))
import posio


def evalmt(pos1, pos2):
    """Count the items of *pos1* that can be paired with an item of *pos2*.

    Two items pair up when their fields 0 and 1 are both equal, and every
    item of *pos2* can be used for at most one pairing.  *pos2* itself is
    not modified.

    Returns a tuple ``(number of paired items, len(pos1))``.
    """
    # Work on a shallow copy so the caller's list is left untouched; the
    # original overwrote matched entries of pos2 with [None, None].
    unused = list(pos2)
    correct = 0
    for word1 in pos1:
        for i2, word2 in enumerate(unused):
            if word2[0] == word1[0] and word2[1] == word1[1]:
                correct += 1
                # Remove the entry so it cannot be paired a second time.
                del unused[i2]
                break
    return correct, len(pos1)


if __name__ == '__main__':
    # Usage: <script> OUTPUT_FILE REFERENCE_FILE
    ref_file = posio.posread(sys.argv[2])
    out_file = posio.posread(sys.argv[1])
    total = 0
    correct = 0
    for sent in out_file:
        ref = next(ref_file)
        retval = evalmt(sent, ref)
        total += retval[1]
        correct += retval[0]
    # An empty output file used to raise ZeroDivisionError; report 0.0.
    precision = float(correct) / total if total else 0.0
    # Two spaces after the colon reproduce the output of the original
    # two-argument print statement.
    print('precision:  %s' % precision)
import sys
import posio
import replace

# Usage: <script> TAGGED_FILE REPLACEMENT_FILE
# Field 0 of every item is passed through replace.CReplace for the first N
# sentences of the '_'-separated input; every sentence is printed.

# how many sentences
N = 5000

f = sys.argv[1]
r = replace.CReplace(sys.argv[2])

for i, sent in enumerate(posio.posread(f, '_')):
    if i < N:
        for item in sent:
            item[0] = r.replace(item[0])
    posio.posprint(sent, '_')
import sys
import posio
import replace

# Usage: <script> TAGGED_FILE REPLACEMENT_FILE
# Field 0 of every item is passed through replace.CReplace for the first N
# sentences of the '_'-separated input; every sentence is printed.

# how many sentences
N = 5000

f = sys.argv[1]
r = replace.CReplace(sys.argv[2])

for i, sent in enumerate(posio.posread(f, '_')):
    if i < N:
        for item in sent:
            item[0] = r.replace(item[0])
    posio.posprint(sent, '_')