def exportUniqueSentences(project, mode="lasttree", pattern=False):
    """ exports one tree per sentence: for each distinct token sequence, the newest tree is kept """
    sql = SQL(project)
    db, cursor = sql.open()
    sentences = {}  # toks -> tree
    outdir = os.path.join("..", "projects", project, "export")
    try:
        os.mkdir(outdir)
    except OSError:
        pass
    outfile = os.path.join(outdir, "allSentences.conll")
    if pattern:
        command = """select trees.rowid,userid,max(timestamp) from trees, sentences, texts
                where texts.rowid=sentences.textid and sentences.rowid=trees.sentenceid and textname like "{pattern}"
                group by sentenceid order by trees.rowid;""".format(pattern=pattern)
    else:
        command = """select trees.rowid,userid,max(timestamp) from trees, sentences, texts
                where texts.rowid=sentences.textid and sentences.rowid=trees.sentenceid
                group by sentenceid order by trees.rowid;"""
    for i, (treeid, userid, timestamp) in enumerate(cursor.execute(command).fetchall()):
        tree = sql.gettree(treeid=treeid, indb=db, incursor=cursor)["tree"]
        toks = tuple(tree[k]["t"] for k in tree)
        print "___", i, "\r",
        if toks not in sentences:
            sentences[toks] = tree
    print "writing file with", len(sentences), "sentences..."
    conll.trees2conllFile([sentences[toks] for toks in sorted(sentences)], outfile=outfile, columns=10)
    return outfile
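# A minimal sketch of the tree structure these helpers pass around, inferred
# from the accesses in this file (tree[i]["t"], node["gov"], "lemma", "tag");
# the real trees come from conll.conllFile2trees / sql.gettree, so the exact
# shape may differ. A dict maps 1-based token ids to feature dicts, and "gov"
# maps a governor id (0 for the root, -1 for none) to the function name.
exampleTree = {
    1: {'t': u'the', 'lemma': u'the', 'tag': u'DET', 'gov': {2: u'det'}},
    2: {'t': u'cat', 'lemma': u'cat', 'tag': u'NOUN', 'gov': {3: u'suj'}},
    3: {'t': u'sleeps', 'lemma': u'sleep', 'tag': u'VERB', 'gov': {0: u'root'}},
}
# the token-sequence key that exportUniqueSentences uses to deduplicate:
exampleToks = tuple(exampleTree[k]["t"] for k in sorted(exampleTree))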
def transform(infolder, outfolder, mixOldNew=True):
    createNonExistingFolders(outfolder)
    spaceToks = {}
    for infile in sorted(glob.glob(os.path.join(infolder, "*.conll"))):
        basename = os.path.basename(infile)
        print "reading", basename
        trees = conll.conllFile2trees(infile)
        newtrees = []
        for tree in trees:
            if mixOldNew:
                newtrees += [tree]
            newtree = copy.deepcopy(tree)
            newtree = platinum(newtree)
            newtrees += [newtree]
            findSpaces(spaceToks, tree)
        conll.trees2conllFile(newtrees, os.path.join(outfolder, fixOutname(basename)))
    for i, tok in enumerate(sorted(spaceToks)):
        print i, tok, spaceToks[tok]
def makeConllUfromNaijaFile(infilename):
    trees = []
    tree = {}
    with codecs.open(infilename, "r", "utf-8") as infile:
        for line in infile:
            line = line.strip()
            if line and line[0] != "#":
                cells = line.split('\t')
                nrCells = len(cells)
                if nrCells != 10:
                    print line
                    continue
                nr, t, x, tag = cells[:4]
                nr = int(nr)
                newf = {'id': nr, 't': t, 'tag': tag}
                x = x.strip()
                if "=" in x:
                    mf = dict([(av.split("=")[0], av.split("=")[-1]) for av in x.split("|")])
                    newf = update({"features": mf}, newf)
                elif x != ".":
                    newf = update({"lemma": x}, newf)
                if nr == 1 and tree:  # token nr 1 starts a new sentence: flush the previous one
                    trees += [tree.copy()]
                    tree = {}
                tree[nr] = update(tree.get(nr, {}), newf)
    if tree:  # don't lose the last sentence of the file
        trees += [tree.copy()]
    print len(trees), "trees"
    trees2conllFile(trees, os.path.basename(infilename).split(".")[0] + ".conllu", columns="u")
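# Standalone illustration of the FEATS parsing idiom used above: the third
# cell is split on "|" into attribute=value pairs (values keep everything
# after the last "="); a cell without "=" and different from "." is a lemma.
x = u"Case=Nom|Number=Sing"
mf = dict([(av.split("=")[0], av.split("=")[-1]) for av in x.split("|")])
assert mf == {u'Case': u'Nom', u'Number': u'Sing'}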
def degradeConllfile(conllfile, removeFuncs=["para"], removeDeps=0.4):
    trees = conll.conllFile2trees(conllfile)
    # first pass: count the dependencies that can be removed
    nbgovs = 0
    for arbre in trees:
        for i, node in arbre.iteritems():
            if "gov" in node and node["gov"].keys()[0] != -1 and node["gov"].values()[0] not in removeFuncs:
                nbgovs += 1
    print "removing", int(nbgovs * removeDeps), "dependencies"
    tobeRemoved = sorted(random.sample(range(nbgovs), int(nbgovs * removeDeps)))
    print "nbgovs:", nbgovs, "tobeRemoved:", tobeRemoved
    # second pass: blank the sampled dependencies (0-based running index,
    # matching the range() the sample was drawn from)
    nbgovs = 0
    for arbre in trees:
        for i, node in arbre.iteritems():
            if "gov" in node and node["gov"].keys()[0] != -1:
                if node["gov"].values()[0] in removeFuncs:
                    node["gov"] = {}
                else:
                    if nbgovs in tobeRemoved:
                        node["gov"] = {}
                    nbgovs += 1
    newname = conllfile
    if conllfile.endswith(".conll"):
        newname = conllfile[:-len(".conll")]
    shutil.move(conllfile, newname + ".orig")
    conll.trees2conllFile(trees, newname + ".deg", columns=10)
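# Self-contained sketch of the two-pass degradation scheme above, on a plain
# list instead of the project's trees: count the removable dependency
# functions, draw a fixed share of 0-based indices, then blank the functions
# whose running index was drawn (removeFuncs entries are always blanked).
import random

funcs = ["suj", "obj", "para", "mod", "det", "suj", "obj", "mod"]
removeFuncs, removeDeps = ["para"], 0.4
nbgovs = sum(1 for f in funcs if f not in removeFuncs)            # first pass
tobeRemoved = set(random.sample(range(nbgovs), int(nbgovs * removeDeps)))
degraded, k = [], 0                                               # second pass
for f in funcs:
    if f in removeFuncs:
        degraded.append("")
    else:
        degraded.append("" if k in tobeRemoved else f)
        k += 1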
def lastTreeForAllSamples(project, onlyHuman=True, combine=False):
    outdir = os.path.join("..", "projects", project, "export")
    try:
        os.mkdir(outdir)
    except OSError:
        pass
    sql = SQL(project)
    db, cursor = sql.open()
    if onlyHuman:
        parserid = 0
        for pid, in cursor.execute("select rowid from users where user='******';"):
            parserid = pid
    else:
        parserid = -1
    sents = sorted(cursor.execute(
        "select texts.textname, sentences.rowid, sentences.nr from sentences, texts where texts.rowid=sentences.textid;"
    ).fetchall())
    print "todo:", len(sents), "sentences"
    pbar = tqdm.tqdm(total=len(sents))
    annotators = {}
    if combine:
        trees = []
        getTreesForSents(sents, trees, annotators, parserid, cursor, db, sql, pbar, project=project)
        outfile = os.path.join(outdir, project + ".lastHumanTreeForAllSamples.conllu")
        conll.trees2conllFile(trees, outfile=outfile)
        print "wrote", outfile
    else:
        for tid, textname, nrtokens in list(cursor.execute("select rowid, * from texts;")):
            print tid, textname, nrtokens
            sents = list(cursor.execute("select rowid, * from sentences where textid=?;", (tid, )).fetchall())
            trees = []
            getTreesForSents(sents, trees, annotators, parserid, cursor, db, sql, pbar)
            if textname.endswith(".conll_parse"):
                textname = textname[:-len(".conll_parse")]
            outfile = os.path.join(outdir, textname + ".lastHumanTrees.conllu")
            conll.trees2conllFile(trees, outfile=outfile)
            print "wrote", outfile
    for a in annotators:
        print a, annotators[a]
def exportConllByAnnotators(project, annotators=["prof", "Sy", "parser"]):
    """ exports the complete project: for every sentence, the tree of the first
        annotator in the given order who has one. if no tree is found, an error is raised """
    outfiles = []
    sql = SQL(project)
    db, cursor = sql.open()
    goodTexts = {}
    outdir = os.path.join("..", "projects", project, "export")
    try:
        os.mkdir(outdir)
    except OSError:
        pass
    annotatorIds = tuple(a for (a, ) in [
        list(cursor.execute("select rowid from users where user =?;", (annotator, )))[0]
        for annotator in annotators
    ])
    for textid, textname, nrtokens in list(cursor.execute("select rowid, * from texts;")):  # for each text
        print "doing", textname, "with", nrtokens, "tokens"
        nrutids = {}
        for nr, userid, treeid in list(cursor.execute(
                "select nr,userid,trees.rowid as treeid from trees, sentences where sentenceid=sentences.rowid and userid in {annotatorIds} and textid = ? order by nr;"
                .format(annotatorIds=annotatorIds), (textid, ))):
            nrutids[nr] = nrutids.get(nr, {})
            nrutids[nr][userid] = treeid
        trees = []
        for nr in sorted(nrutids):  # for each sentence
            tree = None
            for aid in annotatorIds:  # for each interesting annotator id
                if aid in nrutids[nr]:
                    tree = sql.gettree(treeid=nrutids[nr][aid], indb=db, incursor=cursor)["tree"]
                    trees += [tree]
                    break
            if not tree:
                print "problem: no tree for nr", nr, "type", type(nr)
                print "annotatorIds", annotatorIds
                raise Exception('no tree', nr)
        if textname.endswith(".conll"):
            textname = textname[:-len(".conll")]
        outfile = os.path.join(outdir, textname)
        conll.trees2conllFile(trees, outfile=outfile, columns=10)
        print len(trees), "trees"
        outfiles += [outfile]
    return outfiles
def exportConllByAnnotators(project, annotators=["prof", "Sy", "parser"], fileExtension=".conllu"):
    """ exports the complete project: for every sentence, the tree of the first
        annotator in the given order who has one. texts without matching trees are skipped """
    outfiles = []
    sql = SQL(project)
    db, cursor = sql.open()
    goodTexts = {}
    outdir = os.path.join("..", "projects", project, "export")
    try:
        os.mkdir(outdir)
    except OSError:
        pass
    try:
        annotatorIds = tuple(a for (a, ) in [
            list(cursor.execute("select rowid from users where user =?;", (annotator, )))[0]
            for annotator in annotators
        ])
    except IndexError:
        print "some required annotator IDs are not in the database"
        return
    print annotators, annotatorIds
    for textid, textname, nrtokens in list(cursor.execute("select rowid, * from texts;")):  # for each text
        print "doing", textname, "with", nrtokens, "tokens"
        nrutids = {}
        # a one-element Python tuple would print as "(id,)" and break the SQL,
        # so it is formatted by hand in that case
        for nr, userid, treeid in list(cursor.execute(
                "select nr,userid,trees.rowid as treeid from trees, sentences where sentenceid=sentences.rowid and userid in {annotatorIds} and textid = ? order by nr;"
                .format(annotatorIds=annotatorIds if len(annotatorIds) > 1 else '(' + str(annotatorIds[0]) + ')'),
                (textid, ))):
            nrutids[nr] = nrutids.get(nr, {})
            nrutids[nr][userid] = treeid
        trees = getSpecificTrees(sql, db, cursor, nrutids, annotatorIds)
        if trees:
            if textname.endswith(".conll"):
                textname = textname[:-len(".conll")]
            if textname.endswith(".conllu"):
                textname = textname[:-len(".conllu")]
            outfile = os.path.join(outdir, textname + fileExtension)
            conll.trees2conllFile(trees, outfile=outfile, columns=10)
            print len(trees), "trees"
            outfiles += [outfile]
        else:
            print "skipped", textname
    return outfiles
def transform(infolder, outfolder, mixOldNew=True):
    createNonExistingFolders(outfolder)
    for infile in sorted(glob.glob(os.path.join(infolder, "*.conll"))):
        basename = os.path.basename(infile)
        print "reading", basename
        trees = conll.conllFile2trees(infile)
        newtrees = []
        for tree in trees:
            if mixOldNew:
                newtrees += [tree]
            newtree = copy.deepcopy(tree)
            newtree = platinum(newtree)
            newtrees += [newtree]
        conll.trees2conllFile(newtrees, os.path.join(outfolder, fixOutname(basename)))
def transform(infolder, outfolder, mixOldNew=False):
    createNonExistingFolders(outfolder)
    for infile in sorted(glob.glob(os.path.join(infolder, "*"))):
        if not os.path.isfile(infile):
            continue
        basename = os.path.basename(infile)
        print "reading", basename
        trees = conll.conllFile2trees(infile)
        newtrees = []
        for tree in trees:
            if mixOldNew:
                newtrees += [tree]
            newtree = copy.deepcopy(tree)
            newtree = correct(newtree)
            newtrees += [newtree]
        conll.trees2conllFile(newtrees, os.path.join(outfolder, fixOutname(basename)))
def search(infolder, fun):
    goodtrees = []
    print "doing", fun.__name__
    for infile in sorted(glob.glob(os.path.join(infolder, "*"))):
        if not os.path.isfile(infile):
            continue
        basename = os.path.basename(infile)
        print "reading", basename
        trees = conll.conllFile2trees(infile)
        for tree in trees:
            if fun(tree):  # e.g. hasVerbalDm or isNonProjective
                goodtrees += [tree]
    print "found", len(goodtrees)
    if goodtrees:
        conll.trees2conllFile(goodtrees, fun.__name__ + ".conll")
def transform(infolder, outfolder, mixOldNew=False):
    createNonExistingFolders(outfolder)
    corrinst = compil('corrinst.txt')
    print len(corrinst), "rules"
    for infile in sorted(glob.glob(os.path.join(infolder, "*"))):
        if not os.path.isfile(infile):
            continue
        basename = os.path.basename(infile)
        print "reading", basename
        trees = conll.conllFile2trees(infile)
        newtrees = []
        for tree in trees:
            if mixOldNew:
                newtrees += [tree]
            newtree = copy.deepcopy(tree)
            newtree = correct(newtree, corrinst)
            newtrees += [newtree]
        conll.trees2conllFile(newtrees, os.path.join(outfolder, fixOutname(basename)))
def retokeniser(nomdufichier, path="", addtoout=""):
    if not path:
        path, _ = os.path.split(nomdufichier)  # take the same path as nomdufichier
    if path and path[-1] != "/":
        path = path + "/"
    trees = conll.conllFile2trees(nomdufichier)  # read the file
    print "the file", nomdufichier, "has", len(trees), "trees"
    newtrees = []
    digitsandnumbers = codecs.open(droporfeo + "lexique/gg", "r", "utf-8").read().split('\n')
    for i, arbre in enumerate(trees):  # loop over the trees
        racines = addinfototree(arbre)
        oldtree = copy.deepcopy(arbre)
        arbre = corrigerNumerotation(arbre)
        arbre = nombresComposes(arbre)
        arbre = digits(arbre, digitsandnumbers)
        arbre = corrigerArbreCompos(arbre)  # decomposition of multiword expressions
        arbre = recomposerMultimots(arbre, expressions_multimots)
        arbre = corrigerNumerotationSplice(arbre)
        arbre = corrigerSegmentationClitiques(arbre, dico_clitiques)
        arbre = corrigerInaudibles(arbre)
        arbre = corrigerClitiques(arbre)
        arbre = retoken(arbre)
        if arbre != oldtree:  # show what changed
            print i
            for ii in arbre:
                if arbre[ii] != oldtree.get(ii, None):
                    print ii, arbre[ii]['t'], arbre[ii], oldtree.get(ii, None)
        newtrees.append(arbre)
    newname = path + os.path.basename(nomdufichier + addtoout)
    conll.trees2conllFile(newtrees, newname, columns=10)
    return newname
def addArbitraryPuncs(infolder, outfolder):
    createNonExistingFolders(outfolder)
    for conllinfile in glob.glob(os.path.join(infolder, '*')):
        print conllinfile
        trees = conll.conllFile2trees(conllinfile)
        for i, tree in enumerate(trees):
            m = max(tree)
            splitcode = ".,!?;:()"
            p = splitcode[i % len(splitcode)]  # cycle through the punctuation signs
            tree[m + 1] = {u'tag': u'PUNC', u'lemma': p, u't': p, 'gov': {0: u'punc'}}
        conll.trees2conllFile(trees, os.path.join(outfolder, os.path.basename(conllinfile)), columns=14)
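# The modulo pick above attaches a different punctuation sign to each tree,
# wrapping around once splitcode is exhausted; standalone:
splitcode = ".,!?;:()"
assert [splitcode[i % len(splitcode)] for i in range(9)] == \
       ['.', ',', '!', '?', ';', ':', '(', ')', '.']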
def degradeConllfile(conllfile, removeFuncs=["para"], removeDeps=0.2):
    trees = conll.conllFile2trees(conllfile)
    # first pass: count the dependencies that can be removed
    nbgovs = 0
    for arbre in trees:
        for i, node in arbre.iteritems():
            if "gov" in node and node["gov"].keys()[0] != -1 and node["gov"].values()[0] not in removeFuncs:
                nbgovs += 1
    print "removing", int(nbgovs * removeDeps), "dependencies"
    tobeRemoved = sorted(random.sample(range(nbgovs), int(nbgovs * removeDeps)))
    print nbgovs, tobeRemoved
    # second pass: blank the sampled dependencies (0-based running index,
    # matching the range() the sample was drawn from)
    nbgovs = 0
    for arbre in trees:
        for i, node in arbre.iteritems():
            if "gov" in node and node["gov"].keys()[0] != -1 and node["gov"].values()[0] not in removeFuncs:
                if nbgovs in tobeRemoved:
                    node["gov"] = {}
                nbgovs += 1
    shutil.move(conllfile, conllfile + ".orig")
    conll.trees2conllFile(trees, conllfile, columns=10)
def reorder(trees, outfile):
    """
    Reorders the trees based on the "nr" sentence feature, adds an updated text
    feature, and renames all sent_ids, numbering from 0. Once this is done,
    the trees are written to a new file.
    input: List(Tree), Str
    does: Writes <outfile>
    output: None
    """
    prefix = "_".join(trees[0].sentencefeatures.get("sent_id").split("_")[:-1])
    sortable = sorted(list([(int(t.sentencefeatures.get("nr")), t) for t in trees]))
    new_trees = list()
    for nr, tree in sortable:
        # adding metadata: rename sent_id, numbering from 0
        tree.sentencefeatures["text"] = tree.sentence()
        tree.sentencefeatures["sent_id"] = prefix + "_" + str(nr - 1)
        # removing now-useless metadata
        del tree.sentencefeatures["nr"]
        new_trees.append(tree)
    conll.trees2conllFile(new_trees, outfile)
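# Standalone sketch of the renumbering rule in reorder(), on plain metadata
# dicts instead of Tree objects (the sent_id values here are hypothetical):
# the sent_id prefix is kept, trees are sorted by their integer "nr" feature,
# and the new id is nr - 1.
metas = [{"sent_id": u"doc_7", "nr": u"3"}, {"sent_id": u"doc_5", "nr": u"1"}]
prefix = u"_".join(metas[0]["sent_id"].split(u"_")[:-1])
for nr, m in sorted((int(m["nr"]), m) for m in metas):
    m["sent_id"] = prefix + u"_" + str(nr - 1)
# -> sent_ids are now u"doc_0" and u"doc_2"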
def split(conllfile, maxi):
    trees = conll.conllFile2trees(conllfile)
    for j, ts in enumerate([trees[i:i + maxi] for i in range(0, len(trees), maxi)]):
        conll.trees2conllFile(ts, conllfile + str(j))
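# The slicing idiom above cuts the tree list into chunks of at most maxi
# trees, written to conllfile + "0", "1", ...; on a plain list:
trees, maxi = list(range(25)), 10
chunks = [trees[i:i + maxi] for i in range(0, len(trees), maxi)]
assert [len(c) for c in chunks] == [10, 10, 5]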
def trainingEvaluationParsing(project=u"OrfeoGold2016",
                              parserType="graph",
                              whoseTrees="validator",
                              evaluationPercent=10,
                              additionnalLexicon=None,
                              resultAnnotator="mate",
                              getFromFolder=False,
                              parseDB=False,
                              memory="40G",
                              stopOnError=False):
    """
    if additionnalLexicon is given, it is joined to the training file for lemmatization and tagging.
    change memory here!
    todo:
        - add function to choose parser type (lang=)
        - create mate.log for progress tracking (end = "Ready.")
    """
    mateLogs("Begin")
    ti = time.time()
    if getFromFolder:
        parseDB = False  # TODO: correct this so that all options are available
    parserType = (parserType or "graph")
    whoseTrees = whoseTrees or "validator"
    evaluationPercent = evaluationPercent or 10
    resultAnnotator = resultAnnotator or "mate"
    try:
        os.chmod("mate/parse.log", 0666)  # just in case...
    except OSError:
        pass
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H:%M')

    #####
    # defining project and creating the save directories
    #####
    basepath = createDailyPath("./mate/", project)
    if parseDB:
        backupbase = backupOldDatabase(project, basepath)
        mateLogs("A copy of the database has been stored in {backupbase}. Getting validated trees...".format(backupbase=backupbase))
    traindir = createDirectory(basepath + "training")
    modeldir = createDirectory(basepath + "models")
    logdir = createDirectory(basepath + "logs")
    parsedir = createDirectory(basepath + "parses")

    #####
    # getting gold trees for training
    #####
    if getFromFolder:  # getFromFolder is the name of a folder containing only conll files
        error = False
        goldtrees = []
        for infile in glob.glob(os.path.join(getFromFolder, '*')):
            if os.path.isfile(infile):
                print "reading", infile
                gtrees = conll.conllFile2trees(infile)
                for tree in gtrees:
                    problemkeys = []
                    for i in tree:
                        for gi in tree[i]["gov"]:
                            if not 0 <= gi <= len(tree):
                                print infile
                                print tree
                                print "has a problematic governor:", gi
                                error = True
                                problemkeys += [i]
                    for problemk in problemkeys:
                        del tree[problemk]
                goldtrees += gtrees
        if error and stopOnError:
            sys.exit()
    else:
        goldtrees = trees2train.getValidatedTrees(project, basepath, whoseTrees)
        mateLogs(u"{nrtrees} validated trees extracted".format(nrtrees=len(goldtrees)))
    lemma = None
    if goldtrees:
        # see whether the first token of the first tree has a lemma;
        # if lemma == None we'll skip lemmatization
        lemma = goldtrees[0][sorted(goldtrees[0])[0]].get("lemma", None)  # just trying to get the first lemma value
        if lemma == "_":
            lemma = None
        print "found lemma in first tree:", lemma
        # TODO: do something here: double tokens as lemmas for chinese, see function makeTrainTestSets
    else:
        print "no trees from:", getFromFolder
        sys.exit()
    print "goldtrees:", len(goldtrees)

    #####
    # creating training files
    #####
    alldeptraining = traindir + "alldeptraining.conll"
    conll.trees2conllFile(goldtrees, alldeptraining, columns=14)
    traintrees = makeTrainTestSets(traindir,
                                   pattern=os.path.basename(alldeptraining),
                                   train="partialdeptrain.conll",
                                   test="test.conll",
                                   empty="emptytest.conll",
                                   testsize=int(evaluationPercent),
                                   lemma=lemma)
    print "traintrees:", len(traintrees)
    if additionnalLexicon:
        lexicontrees = conll.conllFile2trees(additionnalLexicon)
        print "lexicontrees:", len(lexicontrees)
        alllemtagtrain = traindir + "alllemtagtrain.conll"
        conll.trees2conllFile(goldtrees + lexicontrees, alllemtagtrain, columns=14)
        partiallemtagtrain = traindir + "partiallemtagtrain.conll"
        conll.trees2conllFile(traintrees + lexicontrees, partiallemtagtrain, columns=14)
    else:
        alllemtagtrain = alldeptraining
        partiallemtagtrain = traindir + "partialdeptrain.conll"
    mateLogs("trainfiles created")
    if verbose:
        print "just testing whether i can load them..."
        conll.conllFile2trees(traindir + "partialdeptrain.conll")
        conll.conllFile2trees(traindir + "emptytest.conll")
        conll.conllFile2trees(traindir + "test.conll")

    # training on the partial tree file, for evaluation
    mateLogs("training of partial tree file for evaluation... ====")
    lemodelpartial, tagmodelpartial, parsemodelpartial = makeTrainingModels(basepath,
                                                                            lemtagin=partiallemtagtrain,
                                                                            depin=traindir + "partialdeptrain.conll",
                                                                            outfolder=modeldir,
                                                                            memory=memory,
                                                                            testfile=traindir + "emptytest.conll",
                                                                            evalfile=traindir + "test.conll",
                                                                            lemma=lemma,
                                                                            parserType=parserType)
    mateLogs("evaluation...")
    evaluFileName = detailedEvaluation(parserType=parserType,
                                       memory=memory,
                                       testfile=traindir + "emptytest.conll_parse",
                                       evalfile=traindir + "test.conll",
                                       path=logdir,
                                       evaluationPercent=evaluationPercent)
    evalu = unicode(evaluFileName) + "\n"
    with codecs.open(evaluFileName, "r", "utf-8") as f:
        evalu += f.read()

    # full training
    mateLogs("training of full tree file for parsing... ====")
    lemodel, tagmodel, parsemodel = makeTrainingModels(basepath,
                                                       lemtagin=alllemtagtrain,
                                                       depin=alldeptraining,
                                                       outfolder=modeldir,
                                                       memory=memory,
                                                       lemma=lemma,
                                                       parserType=parserType)

    # getting texts to parse
    mateLogs("training and evaluation complete. Starting the parse...\n\n{evalu}".format(evalu=evalu))
    if parseDB:
        filenames = getTextsForParsing.extractConllFiles(project, parsedir)
        # parsing
        for infile in filenames:
            mateLogs("Training and evaluation complete. Starting the parse of {}\n\n".format(infile))
            parsedfile = parsing(infile,
                                 lemodel=lemodel,
                                 tagmodel=tagmodel,
                                 parsemodel=parsemodel,
                                 outfolder=parsedir,
                                 parserType=parserType,
                                 memory=memory)
            # update the database with the parse result
            newname = os.path.basename(parsedfile)
            updateTrees.updateParseResult(project, parsedir, filepattern=newname,
                                          annotatorName=resultAnnotator, removeToGetDB="_parse")

    # make it easy for everyone to erase all this stuff:
    for root, dirs, files in os.walk(basepath):
        for momo in dirs:
            try:
                os.chmod(os.path.join(root, momo), 0777)
            except OSError:
                pass
        for momo in files:
            try:
                os.chmod(os.path.join(root, momo), 0666)
            except OSError:
                pass
    totaltime = (time.time() - ti) / 60
    mateLogs("Ready. It took {totaltime} minutes for the whole process\n\n{evalu}".format(totaltime=round(totaltime, 1), evalu=evalu))
def exportGoodTexts(project, lastHuman=False, onlyValidated=True, pattern=False):
    """
    TODO: add a parameter to select texts, e.g. "UD_ZH_[number]"
    """
    outfiles = []
    sql = SQL(project)
    db, cursor = sql.open()
    goodTexts = {}
    if onlyValidated:
        onlyValidated = "and todos.status=1"
    else:
        onlyValidated = ""
    # take all texts where a validator has validated
    if pattern:  # e.g. pattern = "like 'UD_ZH%'"
        command = "select distinct * from texts, todos, users where texts.rowid=todos.textid and users.rowid=todos.userid and texts.textname {pattern};".format(pattern=pattern)
    else:
        command = "select distinct * from texts, todos, users where texts.rowid=todos.textid and todos.type=1 {onlyValidated} and users.rowid=todos.userid;".format(onlyValidated=onlyValidated)
    for row in cursor.execute(command):
        textname, nrtokens, userid, textid, validator, status, comment, user, realname = row
        goodTexts[textid] = (textname, userid, user)
        print "i'll take", textname, "validated by", user, "with", nrtokens, "tokens"
    sentenceValidationInValidatedText(cursor, sql, db)
    outdir = os.path.join("..", "projects", project, "export")
    try:
        os.mkdir(outdir)
    except OSError:
        pass
    for textid, (textname, userid, user) in goodTexts.iteritems():
        textname = textname.replace("-one-word-per-line.conll14_Parse", "")
        if lastHuman:
            outfile = os.path.join(outdir, textname + ".lastHuman.conll")
        else:
            outfile = os.path.join(outdir, "validated." + textname + "." + user + ".conll")
        print "doing", textname, textid
        trees = []
        if lastHuman:
            snr2all = {}
            for row in cursor.execute("""
                    select sentences.nr as snr, trees.rowid as treeid, users.user, trees.timestamp
                    from sentences, trees, users
                    where sentences.textid=? and sentences.rowid=trees.sentenceid and users.rowid = trees.userid;
                    """, (textid, )):
                snr, treeid, user, timestamp = row
                snr2all[snr] = snr2all.get(snr, []) + [(timestamp, user, treeid)]
            lastpourc = -1
            for c, snr in enumerate(sorted(snr2all)):
                pourc = int(float(c) / len(snr2all) * 100)
                if pourc != lastpourc:
                    print "___{pourc}%___\r".format(pourc=pourc),
                    lastpourc = pourc
                lastusersnotparser = sorted([(timestamp, user, treeid)
                                             for (timestamp, user, treeid) in snr2all[snr]
                                             if user not in ["parser", "mate"]])
                if len(lastusersnotparser) > 0:
                    ts, u, tid = lastusersnotparser[-1]  # last tree by a human
                else:
                    ts, u, tid = sorted(snr2all[snr])[-1]  # last tree by whoever
                trees += [sql.gettree(treeid=tid, indb=db, incursor=cursor)["tree"]]
        else:
            for (treeid, sentencenr, ) in cursor.execute(
                    "select trees.rowid, sentences.nr from texts, trees, sentences where texts.rowid=? and trees.userid=? and trees.sentenceid = sentences.rowid and sentences.textid=texts.rowid order by sentences.nr;",
                    (textid, userid, )).fetchall():
                print "nr", sentencenr, "_____\r",
                trees += [sql.gettree(treeid=treeid, indb=db, incursor=cursor)["tree"]]
        print "exporting", len(trees), "trees into", outfile
        outfiles += [outfile]
        conll.trees2conllFile(trees, outfile, columns=10)
    return outfiles
def fusionForgottenTrees(project="Platinum",
                         fusdir="../projects/OrfeoGold2016/platinum/*",
                         annotators=["admin"]):
    """
    takes trees from the project, ordered by annotators. where they exist,
    they are fused with the files in fusdir.
    the result has the extension "cool.conll"
    """
    sys.path.insert(0, '../tools')
    outfiles = []
    sql = SQL(project)
    db, cursor = sql.open()
    goodTexts = {}
    outdir = os.path.join("..", "projects", project, "exportcool")
    try:
        os.mkdir(outdir)
    except OSError:
        pass
    for annotator in annotators:
        print [list(cursor.execute("select rowid from users where user =?;", (annotator, )))]
    annotatorIds = tuple(a for (a, ) in [
        list(cursor.execute("select rowid from users where user =?;", (annotator, )))[0]
        for annotator in annotators
    ])
    print annotators, annotatorIds
    for textid, textname, nrtokens in list(cursor.execute("select rowid, * from texts;")):  # for each text
        print "\n__________________________doing", textname, "with", nrtokens, "tokens"
        nrutids = {}
        for nr, userid, treeid in list(cursor.execute(
                "select nr,userid,trees.rowid as treeid from trees, sentences where sentenceid=sentences.rowid and userid in {annotatorIds} and textid = ? order by nr;"
                .format(annotatorIds=annotatorIds), (textid, ))):
            nrutids[nr] = nrutids.get(nr, {})
            nrutids[nr][userid] = treeid
        trees = {}
        for nr in sorted(nrutids):  # for each sentence
            for aid in annotatorIds:  # for each interesting annotator id
                if aid in nrutids[nr]:
                    trees[nr] = sql.gettree(treeid=nrutids[nr][aid], indb=db, incursor=cursor)["tree"]
                    break
        print len(trees), "trees from", project
        print textname, textname.split(".")[0]
        btextname = os.path.basename(textname).split(".")[0]
        if btextname.endswith("-one-word-per-line"):
            btextname = btextname[:-len("-one-word-per-line")]
        cooltrees = []
        ptrees, ftrees = 0, 0
        for fi in glob.glob(fusdir):
            if btextname == os.path.basename(fi).split(".")[0]:
                print "yes", btextname
                fustrees = conll.conllFile2trees(fi)
                print len(fustrees), "ftrees", fi
                for nr, ftree in enumerate(fustrees):
                    if nr + 1 in trees:  # take the tree from the database
                        ptree = trees[nr + 1]
                        for iii in ptree:
                            ptree[iii]["tag2"] = "_"
                            if ptree[iii]["lemma"] in lemmacorrection:
                                ptree[iii]["lemma"] = lemmacorrection[ptree[iii]["lemma"]]
                        cooltrees += [ptree]
                        ptrees += 1
                        if ftree.sentence() != u" ".join([ptree[i].get("t", "") for i in sorted(ptree)]):
                            print "\n_________", nr + 1
                            print ftree.sentence()
                            print u" ".join([ptree[i].get("t", "") for i in sorted(ptree)])
                    else:  # take the tree from the fusion folder
                        for iii in ftree:
                            ftree[iii]["tag2"] = "_"
                            if ftree[iii]["lemma"] in lemmacorrection:
                                ftree[iii]["lemma"] = lemmacorrection[ftree[iii]["lemma"]]
                        ftrees += 1
                        cooltrees += [ftree]
                outfile = os.path.join(outdir, textname + ".cool.conll")
                conll.trees2conllFile(cooltrees, outfile=outfile, columns=10)
                print "wrote", outfile
                print ptrees, "ptrees, ", ftrees, "ftrees"
                outfiles += [outfile]
                break
        if len(cooltrees) == 0:
            print "nothing for", btextname
    return outfiles
# translate(u"准许 一 位 人士 入境 的 权力".split())
for conllinfile in glob.glob(os.path.join("corpus/conll/", 'CONV*.*')):
    print conllinfile
    trees = conllFile2trees(conllinfile)
    path, base = os.path.split(conllinfile)
    translateDic = {}
    counter = 0
    for tree in trees:
        for i, node in tree.iteritems():
            node["tag2"] = pinyin.get(node["t"])
            translateDic[node["t"]] = None
        counter += 1
        if not counter % 100:
            print counter, "trees"
    words = sorted(translateDic)
    print len(words), "words"
    trads = translate(words)
    translateDic = dict(zip(words, trads))
    print len(translateDic), "translations"
    for tree in trees:
        for i, node in tree.iteritems():
            node["gloss"] = translateDic[node["t"]]
        counter += 1
        if not counter % 100:
            print counter, "trees"
    trees2conllFile(trees, path + "/" + "UD-" + base[len("CONV-CORREC-"):])