def freqs(xmlfile, freqsdics):
    from treebankfiles import rhapsodie2Sentences
    print "freqs"
    # NOTE: this constrees scaffolding is built but never used below.
    doc = brandNewXml()
    sample = doc.getElementsByTagName("sample")[0]
    phraseStrus = doc.createElement("constrees")
    phraseStrus.setAttribute("type", "phraseStructure")
    sample.appendChild(phraseStrus)
    sentences, fs = rhapsodie2Sentences(xmlfile)
    # e.g. "Rhap-D2001-Synt.xml" -> "D2001"
    filecode = xmlfile.split("Rhap")[1].split("Synt")[0][1:-1]
    for treeindex, tree in enumerate(sentences):
        #rootindeces = addinfototree(tree, False)
        for i, node in tree.items():
            occurrence = [(filecode, treeindex)]
            # category counts
            freqsdics[0][node["cat"]] = freqsdics[0].get(node["cat"], []) + occurrence
            # function counts
            for f in node["gov"].values():
                freqsdics[1][f] = freqsdics[1].get(f, []) + occurrence
            for gi, f in node["gov"].iteritems():
                if gi:
                    # governor category - function
                    key = "-".join([tree[gi]["cat"], f])
                    freqsdics[2][key] = freqsdics[2].get(key, []) + occurrence
                    # governor category - function - dependent category
                    key = "-".join([tree[gi]["cat"], f, node["cat"]])
                    freqsdics[4][key] = freqsdics[4].get(key, []) + occurrence
                    # function - dependent category
                    key = "-".join([f, node["cat"]])
                    freqsdics[3][key] = freqsdics[3].get(key, []) + occurrence
    return freqsdics
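# A minimal usage sketch (not part of the original pipeline): the file name is
# hypothetical, but it must contain "Rhap...Synt" for the filecode extraction
# above to work. It shows the expected shape of freqsdics: five dicts
# (categories, functions, cat-func, cat-func-cat, func-cat) whose keys map to
# lists of (filecode, treeindex) occurrences.
def _freqsDemo(xmlfile="Rhap-D2001-Synt.xml"):  # hypothetical file name
    freqsdics = freqs(xmlfile, [{}, {}, {}, {}, {}])
    for cat, occurrences in sorted(freqsdics[0].items()):
        print cat, len(occurrences)  # how often each category was seen
    return freqsdics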
def joinRhapsodies(goodfilename, oldfilename, newcombinedname):
    from treebankfiles import rhapsodie2Sentences
    if verbose:
        print "reading the files"
        print "oldfilename", oldfilename
        print "goodfilename", goodfilename
    old, olddoc, oldlexes, oldlexids = joinRead(oldfilename)  # contains time
    dumbolddoc = minidom.parse(oldfilename)  # only for markup
    new, newdoc, newlexes, newlexids = joinRead(goodfilename)  # good for deps!
    if verbose:
        print "finished reading the files"
    lexidtonode = {}
    sentences, fs = rhapsodie2Sentences(goodfilename)
    for s in sentences:
        rootindeces = addinfototree(s)
        for i, node in s.items():
            lexidtonode[node["lexid"]] = node
    brandnewdoc = brandNewXml()
    newtokens = brandnewdoc.createElement("words")
    newtokens.setAttribute("type", "tokens")
    newoldlexid, oldnewlexid = {}, {}
    s = difflib.SequenceMatcher(None, newlexes, oldlexes)
    tokcounter = 0
    for e in new["lexemes"].values():
        for refchild in e.getElementsByTagName('ref'):  # old refs out
            e.removeChild(refchild)
        correctfeatures(e, lexidtonode)
    oldidtoktoidpiv = {}
    for oid, oxtok in old["tokens"].items():
        for xr in oxtok.getElementsByTagName("ref"):
            oldidtoktoidpiv[oid] = oldidtoktoidpiv.get(oid, []) + [xr.getAttribute("idref")]
    for info, n1, n2, o1, o2 in s.get_opcodes():
        if verbose:
            print "\n_____________", info, "_________________ new:", n1, n2, "old:", o1, o2
        if info == "equal":
            # identical stretches: align lexeme by lexeme
            for i, olid in enumerate(oldlexids[o1:o2]):
                newlexid = newlexids[n1:n2][i]  # id
                newlex = newlexes[n1:n2][i]  # str
                oldlex = oldlexes[o1:o2][i]
                newoldlexid[newlexid] = olid  # put in dico
                oldnewlexid[olid] = newlexid  # put in dico
                tokcounter = align(old, new, olddoc, newdoc, olid, newlexid,
                                   oldlex, newlex, newtokens, tokcounter,
                                   oldidtoktoidpiv)
        elif info in ("replace", "insert", "delete"):
            # differing stretches: align by cumulative character-length fractions
            totn = float(len(" ".join(newlexes[n1:n2])))  # complete length of the new lexemes
            toto = float(len(" ".join(oldlexes[o1:o2])))  # complete length of the old lexemes
            if verbose:
                print "newlexes", ", ".join(newlexes[n1:n2])
                print "oldlexes", ", ".join(oldlexes[o1:o2])
            lastlen = 0
            nperc = []  # end fraction of each new lexeme (0 < x <= 1)
            for t in newlexes[n1:n2]:
                nperc += [(len(t) + lastlen) / totn]
                lastlen += len(t) + 1
            lastlen = 0
            operc = []  # end fraction of each old lexeme (0 < x <= 1)
            for t in oldlexes[o1:o2]:
                operc += [(len(t) + lastlen) / toto]
                lastlen += len(t) + 1
            coperc = operc[:]  # copy of operc
            aligndic = {}  # sends the fraction of each new lexeme to the corresponding old fractions
            lasto = None
            for i, p in enumerate(nperc):
                while operc and operc[0] <= p:
                    lasto = operc.pop(0)
                    aligndic[p] = aligndic.get(p, []) + [lasto]
                if operc:
                    if abs(operc[0] - p) < abs(operc[0] - nperc[i + 1]):
                        # the next old fraction is still closer to this new lexeme
                        lasto = operc.pop(0)
                        aligndic[p] = aligndic.get(p, []) + [lasto]
                    if lasto in aligndic.get(p, []):
                        continue  # smaller fractions were just aligned to p
                    aligndic[p] = aligndic.get(p, []) + [operc[0]]
            if verbose:
                print aligndic
            for p, opli in sorted(aligndic.items()):
                for op in opli:
                    olid = oldlexids[o1:o2][coperc.index(op)]
                    newlex = newlexes[n1:n2][nperc.index(p)]
                    oldlex = oldlexes[o1:o2][coperc.index(op)]
                    newlexid = newlexids[n1:n2][nperc.index(p)]  # id
                    newoldlexid[newlexid] = olid  # put in dico
                    oldnewlexid[olid] = newlexid  # put in dico
                    tokcounter = align(old, new, olddoc, newdoc, olid, newlexid,
                                       oldlex, newlex, newtokens, tokcounter,
                                       oldidtoktoidpiv)
        else:
            print info, n1, n2, o1, o2
            raise ValueError("unexpected difflib opcode: " + info)
    if verbose:
        print "_________________________"
    # redirect the old constituency trees to the new lexeme ids
    for c in olddoc.getElementsByTagName("const"):
        if c.getAttribute("ctype") == "lexeme":
            if c.getAttribute("idref") in oldnewlexid:
                c.setAttribute("idref", oldnewlexid[c.getAttribute("idref")])
            else:
                c.parentNode.removeChild(c)
    sample = brandnewdoc.getElementsByTagName("sample")[0]  # the principal node
    for e in newdoc.getElementsByTagName("words"):  # looking at all the word groups
        if e.getAttribute("type") == "lexemes":
            nxlexemes = e
            break
    for e in newdoc.getElementsByTagName("dependencies"):
        if e.getAttribute("type") == "syntax":
            nxdependencies = e
            break
    for e in olddoc.getElementsByTagName("words"):
        if e.getAttribute("type") == "pivot":
            nxpivot = e
            break
    for e in olddoc.getElementsByTagName("constrees"):
        if e.getAttribute("type") == "pile_tree":
            nxpiles = e
        # was "topology_tree" in an earlier version!
        if e.getAttribute("type") == "macrosyntax_tree":
            nxtopos = e
    bxlexemes = brandnewdoc.importNode(nxlexemes, True)
    bnxdependencies = brandnewdoc.importNode(nxdependencies, True)
    xtext = brandnewdoc.importNode(dumbolddoc.getElementsByTagName("markup_text")[0], True)
    # putting it all together:
    sample.appendChild(xtext)
    sample.appendChild(nxpivot)
    sample.appendChild(newtokens)
    sample.appendChild(bxlexemes)
    sample.appendChild(bnxdependencies)
    sample.appendChild(nxpiles)
    sample.appendChild(nxtopos)
    allConstTrees = phraseStructure(goodfilename, brandnewdoc)
    xmlrhaps = codecs.open(newcombinedname, "w", "utf-8")
    xmlrhaps.write(brandnewdoc.toprettyxml())
    xmlrhaps.close()
    return allConstTrees
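# Illustrative sketch (not the original code) of the length-fraction alignment
# used above for the "replace"/"insert"/"delete" opcodes. It is a
# simplification: the greedy pass in joinRhapsodies additionally steals the
# next old fraction when it lies closer to the current new lexeme than to the
# following one. Each lexeme is represented by the fraction of the total
# character length at which it ends, and old fractions are attached to the
# first new fraction they do not exceed.
def _fractionAlignmentDemo(newtoks, oldtoks):
    def endfracs(toks):
        total = float(len(" ".join(toks)))
        fracs, last = [], 0
        for t in toks:
            fracs += [(len(t) + last) / total]
            last += len(t) + 1
        return fracs
    nperc, operc = endfracs(newtoks), endfracs(oldtoks)
    aligndic = {}
    for p in nperc:
        while operc and operc[0] <= p:
            aligndic[p] = aligndic.get(p, []) + [operc.pop(0)]
    for rest in operc:  # leftovers attach to the last new lexeme
        aligndic[nperc[-1]] = aligndic.get(nperc[-1], []) + [rest]
    return aligndic

# _fractionAlignmentDemo(["bonjour"], ["bon", "jour"]) attaches both old
# pieces to the single new lexeme: {1.0: [0.375, 1.0]}.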
def phraseStructure(xmlfile, doc=None):
    from treebankfiles import rhapsodie2Sentences
    if verbose:
        print "making phrase structure"
    if not doc:
        doc = brandNewXml()
    sample = doc.getElementsByTagName("sample")[0]
    phraseStrus = doc.createElement("constrees")
    phraseStrus.setAttribute("type", "phraseStructure")
    sample.appendChild(phraseStrus)
    # category -> phrase label; D, Adv, and Cl are left in automatic mode
    constDic = {
        "V": Tree("(S)"),
        "N": Tree("(NP)"),
        "I": Tree("(IP)"),
        "Adj": Tree("(AP)"),
        "Pre": Tree("(PP)"),
        "CS": Tree("(CP)"),
        "Pro": Tree("(NP)"),
    }
    sentences, fs = rhapsodie2Sentences(xmlfile)
    phrasestructurecounter = 0
    allConstTrees = []
    for treeindex, tree in enumerate(sentences):
        if verbose:
            print "_____________", treeindex
        rootindeces = addinfototree(tree)
        for i, node in tree.items():  # creating the actual phrase structure
            makeConstNode(i, node, constDic, rootindeces, tree)
        alreadyconst = []
        for i, node in tree.items():  # putting the const trees for each node together
            children = sorted(node["children"])
            for j, chi in enumerate(children):
                if chi in alreadyconst:
                    continue
                if chi < i:
                    node["const"].insert(j, tree[chi]["const"])
                    alreadyconst += [chi]
                elif chi > i:  # necessary to kick out autochildren
                    node["const"].insert(len(node["const"]), tree[chi]["const"])
                    alreadyconst += [chi]
        nodesshown = []
        for i in rootindeces:  # output
            if i in nodesshown:
                continue
            ctree = tree[i]["const"]
            allConstTrees += [ctree]
            nodesshown += [t.depindex
                           for t in ctree.subtrees(lambda t: t.height() == 2)]
            phraseStru = doc.createElement("constree")
            phraseStru.setAttribute("id", "phrasestruct" + str(phrasestructurecounter))
            phraseStru.appendChild(doc.createComment(" ".join(list(ctree.leaves()))))
            phraseStrus.appendChild(phraseStru)
            root = doc.createElement("const")
            root.setAttribute("ctype", "phrase")
            root.setAttribute("cat", unicode(ctree.node))
            root.setAttribute("func", "root")
            phraseStru.appendChild(root)
            for child in ctree:
                traverse(child, doc, root)
            phrasestructurecounter += 1
            if verbose:
                print "tree", treeindex, i
            # debugging leftover: draw the deeper trees from sentence 118 on
            if (treeindex, i) >= (118, 0) and ctree.height() > 3:
                print "drawn"
                print "height", ctree.height()
                ctree.draw()
    return allConstTrees
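# Hedged end-to-end sketch; the file names are hypothetical. joinRhapsodies
# merges the dependency annotation of the "good" file with the timing, pivot,
# and markup of the "old" file, writes the combined XML, and returns the
# constituency trees that phraseStructure builds along the way.
if __name__ == "__main__":
    trees = joinRhapsodies("Rhap-D2001-Synt-good.xml",
                           "Rhap-D2001-old.xml",
                           "Rhap-D2001-combined.xml")
    print len(trees), "constituency trees built"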