def freqs(xmlfile, freqsdics):
    from treebankfiles import rhapsodie2Sentences
    print "freqs"
    # NOTE: this constrees scaffolding is built but never used below.
    doc = brandNewXml()
    sample = doc.getElementsByTagName("sample")[0]
    phraseStrus = doc.createElement("constrees")
    phraseStrus.setAttribute("type", "phraseStructure")
    sample.appendChild(phraseStrus)
    sentences, fs = rhapsodie2Sentences(xmlfile)
    # e.g. "Rhap-D2001-Synt.xml" -> "D2001"
    filecode = xmlfile.split("Rhap")[1].split("Synt")[0][1:-1]
    for treeindex, tree in enumerate(sentences):
        #rootindeces = addinfototree(tree, False)
        for i, node in tree.items():
            occurrence = [(filecode, treeindex)]
            # category counts
            freqsdics[0][node["cat"]] = freqsdics[0].get(node["cat"], []) + occurrence
            # function counts
            for f in node["gov"].values():
                freqsdics[1][f] = freqsdics[1].get(f, []) + occurrence
            for gi, f in node["gov"].iteritems():
                if gi:
                    # governor category - function
                    key = "-".join([tree[gi]["cat"], f])
                    freqsdics[2][key] = freqsdics[2].get(key, []) + occurrence
                    # governor category - function - dependent category
                    key = "-".join([tree[gi]["cat"], f, node["cat"]])
                    freqsdics[4][key] = freqsdics[4].get(key, []) + occurrence
                    # function - dependent category
                    key = "-".join([f, node["cat"]])
                    freqsdics[3][key] = freqsdics[3].get(key, []) + occurrence
    return freqsdics
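# A minimal usage sketch (not part of the original pipeline): the file name is
# hypothetical, but it must contain "Rhap...Synt" for the filecode extraction
# above to work. It shows the expected shape of freqsdics: five dicts
# (categories, functions, cat-func, cat-func-cat, func-cat) whose keys map to
# lists of (filecode, treeindex) occurrences.
def _freqsDemo(xmlfile="Rhap-D2001-Synt.xml"):  # hypothetical file name
    freqsdics = freqs(xmlfile, [{}, {}, {}, {}, {}])
    for cat, occurrences in sorted(freqsdics[0].items()):
        print cat, len(occurrences)  # how often each category was seen
    return freqsdics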
def joinRhapsodies(goodfilename, oldfilename, newcombinedname):
    from treebankfiles import rhapsodie2Sentences
    if verbose:
        print "reading the files"
        print "oldfilename", oldfilename
        print "goodfilename", goodfilename
    old, olddoc, oldlexes, oldlexids = joinRead(oldfilename)  # contains time
    dumbolddoc = minidom.parse(oldfilename)  # only for markup
    new, newdoc, newlexes, newlexids = joinRead(goodfilename)  # good for deps!
    if verbose:
        print "finished reading the files"
    lexidtonode = {}
    sentences, fs = rhapsodie2Sentences(goodfilename)
    for s in sentences:
        rootindeces = addinfototree(s)
        for i, node in s.items():
            lexidtonode[node["lexid"]] = node
    brandnewdoc = brandNewXml()
    newtokens = brandnewdoc.createElement("words")
    newtokens.setAttribute("type", "tokens")
    newoldlexid, oldnewlexid = {}, {}
    s = difflib.SequenceMatcher(None, newlexes, oldlexes)
    tokcounter = 0
    for e in new["lexemes"].values():
        for refchild in e.getElementsByTagName('ref'):  # old refs out
            e.removeChild(refchild)
        correctfeatures(e, lexidtonode)
    oldidtoktoidpiv = {}
    for oid, oxtok in old["tokens"].items():
        for xr in oxtok.getElementsByTagName("ref"):
            oldidtoktoidpiv[oid] = oldidtoktoidpiv.get(oid, []) + [xr.getAttribute("idref")]
    for info, n1, n2, o1, o2 in s.get_opcodes():
        if verbose:
            print "\n_____________", info, "_________________ new:", n1, n2, "old:", o1, o2
        if info == "equal":
            # identical stretches: align lexeme by lexeme
            for i, olid in enumerate(oldlexids[o1:o2]):
                newlexid = newlexids[n1:n2][i]  # id
                newlex = newlexes[n1:n2][i]  # str
                oldlex = oldlexes[o1:o2][i]
                newoldlexid[newlexid] = olid  # put in dico
                oldnewlexid[olid] = newlexid  # put in dico
                tokcounter = align(old, new, olddoc, newdoc, olid, newlexid,
                                   oldlex, newlex, newtokens, tokcounter,
                                   oldidtoktoidpiv)
        elif info in ("replace", "insert", "delete"):
            # differing stretches: align by cumulative character-length fractions
            totn = float(len(" ".join(newlexes[n1:n2])))  # complete length of the new lexemes
            toto = float(len(" ".join(oldlexes[o1:o2])))  # complete length of the old lexemes
            if verbose:
                print "newlexes", ", ".join(newlexes[n1:n2])
                print "oldlexes", ", ".join(oldlexes[o1:o2])
            lastlen = 0
            nperc = []  # end fraction of each new lexeme (0 < x <= 1)
            for t in newlexes[n1:n2]:
                nperc += [(len(t) + lastlen) / totn]
                lastlen += len(t) + 1
            lastlen = 0
            operc = []  # end fraction of each old lexeme (0 < x <= 1)
            for t in oldlexes[o1:o2]:
                operc += [(len(t) + lastlen) / toto]
                lastlen += len(t) + 1
            coperc = operc[:]  # copy of operc
            aligndic = {}  # sends the fraction of each new lexeme to the corresponding old fractions
            lasto = None
            for i, p in enumerate(nperc):
                while operc and operc[0] <= p:
                    lasto = operc.pop(0)
                    aligndic[p] = aligndic.get(p, []) + [lasto]
                if operc:
                    if abs(operc[0] - p) < abs(operc[0] - nperc[i + 1]):
                        # the next old fraction is still closer to this new lexeme
                        lasto = operc.pop(0)
                        aligndic[p] = aligndic.get(p, []) + [lasto]
                    if lasto in aligndic.get(p, []):
                        continue  # smaller fractions were just aligned to p
                    aligndic[p] = aligndic.get(p, []) + [operc[0]]
            if verbose:
                print aligndic
            for p, opli in sorted(aligndic.items()):
                for op in opli:
                    olid = oldlexids[o1:o2][coperc.index(op)]
                    newlex = newlexes[n1:n2][nperc.index(p)]
                    oldlex = oldlexes[o1:o2][coperc.index(op)]
                    newlexid = newlexids[n1:n2][nperc.index(p)]  # id
                    newoldlexid[newlexid] = olid  # put in dico
                    oldnewlexid[olid] = newlexid  # put in dico
                    tokcounter = align(old, new, olddoc, newdoc, olid, newlexid,
                                       oldlex, newlex, newtokens, tokcounter,
                                       oldidtoktoidpiv)
        else:
            print info, n1, n2, o1, o2
            raise ValueError("unexpected difflib opcode: " + info)
    if verbose:
        print "_________________________"
    # redirect the old constituency trees to the new lexeme ids
    for c in olddoc.getElementsByTagName("const"):
        if c.getAttribute("ctype") == "lexeme":
            if c.getAttribute("idref") in oldnewlexid:
                c.setAttribute("idref", oldnewlexid[c.getAttribute("idref")])
            else:
                c.parentNode.removeChild(c)
    sample = brandnewdoc.getElementsByTagName("sample")[0]  # the principal node
    for e in newdoc.getElementsByTagName("words"):  # looking at all the word groups
        if e.getAttribute("type") == "lexemes":
            nxlexemes = e
            break
    for e in newdoc.getElementsByTagName("dependencies"):
        if e.getAttribute("type") == "syntax":
            nxdependencies = e
            break
    for e in olddoc.getElementsByTagName("words"):
        if e.getAttribute("type") == "pivot":
            nxpivot = e
            break
    for e in olddoc.getElementsByTagName("constrees"):
        if e.getAttribute("type") == "pile_tree":
            nxpiles = e
        # was "topology_tree" in an earlier version!
        if e.getAttribute("type") == "macrosyntax_tree":
            nxtopos = e
    bxlexemes = brandnewdoc.importNode(nxlexemes, True)
    bnxdependencies = brandnewdoc.importNode(nxdependencies, True)
    xtext = brandnewdoc.importNode(dumbolddoc.getElementsByTagName("markup_text")[0], True)
    # putting it all together:
    sample.appendChild(xtext)
    sample.appendChild(nxpivot)
    sample.appendChild(newtokens)
    sample.appendChild(bxlexemes)
    sample.appendChild(bnxdependencies)
    sample.appendChild(nxpiles)
    sample.appendChild(nxtopos)
    allConstTrees = phraseStructure(goodfilename, brandnewdoc)
    xmlrhaps = codecs.open(newcombinedname, "w", "utf-8")
    xmlrhaps.write(brandnewdoc.toprettyxml())
    xmlrhaps.close()
    return allConstTrees
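# Illustrative sketch (not the original code) of the length-fraction alignment
# used above for the "replace"/"insert"/"delete" opcodes. It is a
# simplification: the greedy pass in joinRhapsodies additionally steals the
# next old fraction when it lies closer to the current new lexeme than to the
# following one. Each lexeme is represented by the fraction of the total
# character length at which it ends, and old fractions are attached to the
# first new fraction they do not exceed.
def _fractionAlignmentDemo(newtoks, oldtoks):
    def endfracs(toks):
        total = float(len(" ".join(toks)))
        fracs, last = [], 0
        for t in toks:
            fracs += [(len(t) + last) / total]
            last += len(t) + 1
        return fracs
    nperc, operc = endfracs(newtoks), endfracs(oldtoks)
    aligndic = {}
    for p in nperc:
        while operc and operc[0] <= p:
            aligndic[p] = aligndic.get(p, []) + [operc.pop(0)]
    for rest in operc:  # leftovers attach to the last new lexeme
        aligndic[nperc[-1]] = aligndic.get(nperc[-1], []) + [rest]
    return aligndic

# _fractionAlignmentDemo(["bonjour"], ["bon", "jour"]) attaches both old
# pieces to the single new lexeme: {1.0: [0.375, 1.0]}.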
def phraseStructure(xmlfile, doc=None):
    from treebankfiles import rhapsodie2Sentences
    if verbose:
        print "making phrase structure"
    if not doc:
        doc = brandNewXml()
    sample = doc.getElementsByTagName("sample")[0]
    phraseStrus = doc.createElement("constrees")
    phraseStrus.setAttribute("type", "phraseStructure")
    sample.appendChild(phraseStrus)
    # category -> phrase label; D, Adv, and Cl are left in automatic mode
    constDic = {
        "V": Tree("(S)"),
        "N": Tree("(NP)"),
        "I": Tree("(IP)"),
        "Adj": Tree("(AP)"),
        "Pre": Tree("(PP)"),
        "CS": Tree("(CP)"),
        "Pro": Tree("(NP)"),
    }
    sentences, fs = rhapsodie2Sentences(xmlfile)
    phrasestructurecounter = 0
    allConstTrees = []
    for treeindex, tree in enumerate(sentences):
        if verbose:
            print "_____________", treeindex
        rootindeces = addinfototree(tree)
        for i, node in tree.items():  # creating the actual phrase structure
            makeConstNode(i, node, constDic, rootindeces, tree)
        alreadyconst = []
        for i, node in tree.items():  # putting the const trees for each node together
            children = sorted(node["children"])
            for j, chi in enumerate(children):
                if chi in alreadyconst:
                    continue
                if chi < i:
                    node["const"].insert(j, tree[chi]["const"])
                    alreadyconst += [chi]
                elif chi > i:  # necessary to kick out autochildren
                    node["const"].insert(len(node["const"]), tree[chi]["const"])
                    alreadyconst += [chi]
        nodesshown = []
        for i in rootindeces:  # output
            if i in nodesshown:
                continue
            ctree = tree[i]["const"]
            allConstTrees += [ctree]
            nodesshown += [t.depindex
                           for t in ctree.subtrees(lambda t: t.height() == 2)]
            phraseStru = doc.createElement("constree")
            phraseStru.setAttribute("id", "phrasestruct" + str(phrasestructurecounter))
            phraseStru.appendChild(doc.createComment(" ".join(list(ctree.leaves()))))
            phraseStrus.appendChild(phraseStru)
            root = doc.createElement("const")
            root.setAttribute("ctype", "phrase")
            root.setAttribute("cat", unicode(ctree.node))
            root.setAttribute("func", "root")
            phraseStru.appendChild(root)
            for child in ctree:
                traverse(child, doc, root)
            phrasestructurecounter += 1
            if verbose:
                print "tree", treeindex, i
            # debugging leftover: draw the deeper trees from sentence 118 on
            if (treeindex, i) >= (118, 0) and ctree.height() > 3:
                print "drawn"
                print "height", ctree.height()
                ctree.draw()
    return allConstTrees
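# Hedged end-to-end sketch; the file names are hypothetical. joinRhapsodies
# merges the dependency annotation of the "good" file with the timing, pivot,
# and markup of the "old" file, writes the combined XML, and returns the
# constituency trees that phraseStructure builds along the way.
if __name__ == "__main__":
    trees = joinRhapsodies("Rhap-D2001-Synt-good.xml",
                           "Rhap-D2001-old.xml",
                           "Rhap-D2001-combined.xml")
    print len(trees), "constituency trees built"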