def extractConllFiles(project, outfolder):
    """ creates the empty files with a word per line as a first step to mate parsing

    Reads every token of every sentence of the project database and writes,
    per text, a 14-column CoNLL skeleton file (token index, token form, and
    twelve "_" placeholder columns) into outfolder.

    project   : name of the project database
    outfolder : directory the skeleton files are written into

    Returns the list of written file paths.
    """
    # make sure the folder ends with "/" so plain concatenation builds a valid path
    if outfolder[-1] != "/":
        outfolder = outfolder + "/"
    texts = {}  # textname -> list of (sentence nr, token nr, token form)
    sql = SQL(project)
    db, cursor = sql.open()
    # every token ("t" feature) with its text name, sentence number and token
    # number; the user filter is hard-coded ("******")
    # NOTE(review): "--…" is a SQL line comment and must stay on its own line.
    command = """select distinct texts.textname, sentences.nr, features.nr, features.value
        from features, trees, texts, sentences, users
        where attr = "t"
        and trees.rowid=features.treeid
        --and sentences.textid=13
        and trees.sentenceid=sentences.rowid
        and sentences.textid=texts.rowid
        and users.user="******";"""
    cursor.execute(command)
    a = cursor.fetchall()
    #print a
    for nr, (textname, snr, num, token) in enumerate(a):
        #sql.exportAnnotations(textid, textname, "lastconll")
        # group tokens by text; sorting later restores sentence/token order
        texts[textname] = texts.get(textname, []) + [(snr, num, token)]
    newfiles = []
    for textname in texts:
        print "processing", textname
        f = codecs.open(outfolder + textname, "w", "utf-8")
        for c, (snr, num, tok) in enumerate(sorted(texts[textname])):
            # token numbering restarts at 1 for every sentence:
            # emit a blank line between sentences
            if num == 1 and c > 0:
                f.write('\n')
            # CoNLL line: index, form, and twelve empty columns
            f.write("\t".join([str(num), tok] + ["_"] * 12) + '\n')
        print c + 1, "tokens"
        f.close()
        newfiles += [outfolder + textname]
    return newfiles
def exportUniqueSentences(project, mode="lasttree", pattern=False):
    """ exports one tree per sentences: the first time the sentence is found, the newest tree

    For every sentence of the project (optionally restricted to texts whose
    name matches the SQL LIKE `pattern`) the newest tree is fetched; duplicate
    sentences (identical token sequences) are kept only once.  The surviving
    trees are written to projects/<project>/export/allSentences.conll.

    Returns the path of the written file.
    """
    # NOTE(review): the `mode` parameter is accepted but never used in this body.
    sql = SQL(project)
    db, cursor = sql.open()
    sentences = {}  # toks -> tree
    outdir = os.path.join("..", "projects", project, "export")
    try:
        os.mkdir(outdir)
    except OSError:
        pass  # export directory already exists
    outfile = os.path.join(outdir, "allSentences.conll")
    # newest tree per sentence (max(timestamp) with group by sentenceid)
    if pattern:
        command = """select trees.rowid,userid,max(timestamp) from trees, sentences, texts where texts.rowid=sentences.textid and sentences.rowid=trees.sentenceid and textname like "{pattern}" group by sentenceid order by trees.rowid;""".format(pattern=pattern)
    else:
        command = """select trees.rowid,userid,max(timestamp) from trees, sentences, texts where texts.rowid=sentences.textid and sentences.rowid=trees.sentenceid group by sentenceid order by trees.rowid;"""
    for i, (treeid, userid, timestamp,) in enumerate(cursor.execute(command).fetchall()):
        tree = sql.gettree(treeid=treeid, indb=db, incursor=cursor)["tree"]
        # the tuple of token forms identifies a sentence
        toks = tuple(tree[i]["t"] for i in tree)
        print "___", i, "\r",
        # keep only the first occurrence of each token sequence
        if toks not in sentences:
            sentences[toks] = tree
    print "writing file with", len(sentences), "sentences..."
    conll.trees2conllFile([sentences[toks] for toks in sorted(sentences)], outfile=outfile, columns=10)
    return outfile
def directDatabaseChangeForForgottenCorrection(): from database import SQL sql = SQL("Platinum") db, cursor = sql.open() cursor.execute('update links set function="comp" where function="aff";') db.commit() db.close() print "changed"
def lastTreeForAllSamples(project, onlyHuman=True, combine=False): outdir = os.path.join("..", "projects", project, "export") try: os.mkdir(outdir) except OSError: pass sql = SQL(project) db, cursor = sql.open() if onlyHuman: parserid = 0 for pid, in cursor.execute( "select rowid from users where user='******';"): parserid = pid else: parserid = -1 sents = sorted( cursor.execute( "select texts.textname, sentences.rowid, sentences.nr from sentences, texts where texts.rowid=sentences.textid;" ).fetchall()) print "todo:", len(sents), "sentences" pbar = tqdm.tqdm(total=len(sents)) annotators = {} if combine: trees = [] getTreesForSents(sents, trees, annotators, parserid, cursor, db, sql, pbar, project=project) outfile = os.path.join(outdir, project + ".lastHumanTreeForAllSamples.conllu") conll.trees2conllFile(trees, outfile=outfile) print "wrote", outfile else: for tid, textname, nrtokens in list( cursor.execute("select rowid, * from texts;")): print tid, textname, nrtokens sents = list( cursor.execute( "select rowid, * from sentences where textid=?;", (tid, )).fetchall()) trees = [] getTreesForSents(sents, trees, annotators, parserid, cursor, db, sql, pbar) if textname.endswith(".conll_parse"): textname = textname[:len(".conll_parse")] outfile = os.path.join(outdir, textname + ".lastHumanTrees.conllu") conll.trees2conllFile(trees, outfile=outfile) print "wrote", outfile for a in annotators: print a, annotators[a]
def exportConllByAnnotators(project, annotators=["prof", "Sy", "parser"]): """ exports complete project for every sentence, trees of annotators in given order. if no tree: throw error """ outfiles = [] sql = SQL(project) db, cursor = sql.open() goodTexts = {} outdir = os.path.join("..", "projects", project, "export") try: os.mkdir(outdir) except OSError: pass annotatorIds = tuple(a for (a, ) in [ list( cursor.execute("select rowid from users where user =?;", ( annotator, )))[0] for annotator in annotators ]) #print annotators, annotatorIds for textid, textname, nrtokens in list( cursor.execute("select rowid, * from texts;")): # for each text print "doing", textname, "with", nrtokens, "tokens" nrutids = {} for nr, userid, treeid in list( cursor.execute( "select nr,userid,trees.rowid as treeid from trees, sentences where sentenceid=sentences.rowid and userid in {annotatorIds} and textid = ? order by nr;" .format(annotatorIds=annotatorIds), (textid, ))): nrutids[nr] = nrutids.get(nr, {}) nrutids[nr][userid] = treeid trees = [] for nr in sorted(nrutids): # for each sentence tree = None for aid in annotatorIds: # for each interesting annotator id if aid in nrutids[nr]: tree = sql.gettree(treeid=nrutids[nr][aid], indb=db, incursor=cursor)["tree"] trees += [tree] #print "atree:",tree break if not tree: print "problem: no tree for nr", nr, "type", type(nr) print "annotatorIds", annotatorIds raise Exception('no tree', nr) if textname.endswith(".conll"): textname = textname[:-len(".conll")] outfile = os.path.join(outdir, textname) conll.trees2conllFile(trees, outfile=outfile, columns=10) print len(trees), "trees" outfiles += [outfile] return outfiles
def exportConllByAnnotators(project, annotators=["prof", "Sy", "parser"], fileExtension=".conllu"): """ exports complete project for every sentence, trees of annotators in given order. if no tree: throw error """ outfiles = [] sql = SQL(project) db, cursor = sql.open() goodTexts = {} outdir = os.path.join("..", "projects", project, "export") try: os.mkdir(outdir) except OSError: pass try: annotatorIds = tuple(a for (a, ) in [ list( cursor.execute("select rowid from users where user =?;", ( annotator, )))[0] for annotator in annotators ]) except: print "some required annotator IDs are not in the database" return print annotators, annotatorIds for textid, textname, nrtokens in list( cursor.execute("select rowid, * from texts;")): # for each text print "doing", textname, "with", nrtokens, "tokens" nrutids = {} for nr, userid, treeid in list( cursor.execute( "select nr,userid,trees.rowid as treeid from trees, sentences where sentenceid=sentences.rowid and userid in {annotatorIds} and textid = ? order by nr;" .format(annotatorIds=annotatorIds if len(annotatorIds) > 1 else '(' + str(annotatorIds[0]) + ')'), (textid, ))): nrutids[nr] = nrutids.get(nr, {}) nrutids[nr][userid] = treeid trees = getSpecificTrees(sql, db, cursor, nrutids, annotatorIds) if trees: if textname.endswith(".conll"): textname = textname[:-len(".conll")] if textname.endswith(".conllu"): textname = textname[:-len(".conllu")] outfile = os.path.join(outdir, textname + fileExtension) conll.trees2conllFile(trees, outfile=outfile, columns=10) print len(trees), "trees" outfiles += [outfile] else: print "skipped", textname return outfiles
def printTree(project, treeid): sql = SQL(project) db, cursor = sql.open() dic = sql.gettree( None, None, treeid, indb=db, incursor=cursor) # dic -> get tree == on récupère l'arbre #print dic if dic and dic["tree"]: sentencetree = dic["tree"] #sentencetree=corrigerNumerotation(sentencetree) for i in sorted(sentencetree): print i, sentencetree[i]
def bulkcorrectDB(project, treeids=[], commit=True):
    """ bulk correction of a whole project! very slow! better to do directly in sql, for example:
    #change all functions:
    #update links set function='dep' where function='det';

    Applies correctLowerProperNouns to every tree (or only to the given
    treeids), stores the modified trees back, and commits every 100 trees.

    NOTE(review): this module defines bulkcorrectDB several times; only the
    last definition is visible at import time.
    """
    sql = SQL(project)
    db, cursor = sql.open()
    # restrict to the given tree ids, or take every tree of the project
    if treeids:
        a, v = ["rowid"], treeids
    else:
        a, v = [], []
    allt = list(sql.getall(cursor, "trees", a, v))
    print "nb trees:", len(allt)
    ti = time()  # start time, used for throughput / ETA estimates below
    for nr, (treeid, sid, uid, annotype, status, comment,
             timestamp) in enumerate(allt):
        dic = sql.gettree(None, None, treeid, indb=db, incursor=cursor)
        if dic:
            tree = dic["tree"]
            newtree, changed = correctLowerProperNouns(tree)
            if changed:
                print "________________________________\n"
                ws, sentence, _ = sql.enterTree(cursor,
                                                newtree,
                                                sid,
                                                uid,
                                                tokensChanged=True)
                print sentence
                print "changed", changed
        # commit and report progress every 100 trees
        if not nr % 100:
            print "committing..."
            if commit:
                db.commit()
        if not nr % 100:
            # throughput and a crude remaining-time estimate
            print "_____ treeid", treeid, "nr", nr + 1, "/", len(
                allt), "---", int(float(nr + 1) / (time() - ti)), "trees per second", int(
                    float(len(allt) - nr + 1) /
                    (float(nr + 1) / (time() - ti))), "seconds (", round(
                        float(len(allt) - nr + 1) /
                        (float(nr + 1) / (time() - ti)) / 60,
                        1), "minutes) to go"
    if commit:
        db.commit()
    db.close()
def bulkcorrectDB(project, treeids=[]):
    """ bulk correction of a whole project! very slow! better to do directly in sql, for example:
    #change all functions:
    #update links set function='dep' where function='det';

    This variant applies a hard-coded correctfeatures() correction (forcing
    the features of nodes i1 "est" and i2 "-ce") to every tree, or only to
    the given treeids.

    NOTE(review): this module defines bulkcorrectDB several times; only the
    last definition is visible at import time.
    """
    sql = SQL(project)
    db,cursor=sql.open()
    # restrict to the given tree ids, or take every tree of the project
    if treeids:
        a,v=["rowid"],treeids
    else:
        a,v=[],[]
    # NOTE(review): len(allt) below assumes getall returns a sequence,
    # not a generator — confirm in database.py
    allt=sql.getall(cursor, "trees",a,v)
    ti = time()  # start time, for the throughput / ETA estimates below
    for nr, (treeid,sid,uid,annotype,status,comment,timestamp) in enumerate(allt):
        # progress line: throughput and a crude remaining-time estimate
        print "_____ treeid",treeid,"nr",nr+1,"/",len(allt),"---",float(nr+1)/(time()-ti),"trees per second",float(len(allt)-nr+1)/(float(nr+1)/(time()-ti)),"seconds to go",float(len(allt)-nr+1)/(float(nr+1)/(time()-ti))/60,"minutes to go"
        dic=sql.gettree(None,None,treeid, indb=db,incursor=cursor)
        if dic:
            sentencetree=dic["tree"]
            #newdic, changed = complexbulkcorrectdic(sentencetree)
            #newdic, changed = simplebulkcorrectdic(sentencetree)
            #newdic, changed = correctfeatures(sentencetree)
            #newdic, changed = correctfeatures(sentencetree,4,{})
            # hard-coded correction of nodes i1 ("est") and i2 ("-ce")
            newdic, changed = correctfeatures(sentencetree,4,{
                'i1':{ u'person': u'3', u'number': u'sg', u'cat': u'V', u'lemma': u'\xeatre', u'token': u'est', u'tense': u'present', u'mode': u'indicative', 'gov': {'i0':'root'}, u't': u'A'},
                'i2':{u'cat': u'Cl', u'lemma': u'c', u'token': u'-ce', u't': u'B', 'gov': {'i1': u'sub'}, 'child':None} })
            #break
            if changed:
                print "________________________________\n"
                #for i,node in newdic.iteritems():
                #print i, node["t"], node
                #1/0
                tokensChanged=True
                ws,sentence,_ = sql.enterTree(cursor, newdic, sid, uid,tokensChanged=tokensChanged)
                print sentence
                print "changed"
                db.commit()  # commit after each modified tree
    #db.commit()
    db.close()
def getValidatedTrees(project, folder, whoseTrees="validator"):
    """
    Collects one tree per sentence of the project (via databaseQuery on the
    given table), checks each tree for structural errors, logs the bad ones
    into <folder>/logs/, and returns the list of good trees.
    """
    sql = SQL(project)
    db, cursor = sql.open()
    sentenceValidationInValidatedText(cursor, sql, db)
    # fetch the new trees
    b = databaseQuery(cursor, table=whoseTrees)
    print len(b), u"trees to extract"
    sids2all = {}  # sentence id -> list of (timestamp, textname, user, snr, treeid)
    trees = []
    error_trees = []
    textnames = {}
    for nr, (treeid, textname, user, snr, sid, uid, annotype, status, comment,
             timestamp) in enumerate(b):
        # TODO: remove:
        #if textname.startswith("mandarinParsed"):continue
        sids2all[sid] = sids2all.get(
            sid, []) + [(timestamp, textname, user, snr, treeid)]
        textnames[textname] = None
    #print len(sids2all)
    print u"trees extracted from the samples", ", ".join(sorted(textnames))
    lastpourc = -1
    for c, sid in enumerate(sids2all):
        # simple textual progress indicator
        pourc = int(float(c) / len(sids2all) * 100)
        if pourc != lastpourc:
            sys.stdout.write("{pourc}%\r".format(pourc=pourc))
            sys.stdout.flush()
        # take the entry with the smallest timestamp for this sentence id
        # NOTE(review): textname and user keep their values from the LAST row
        # of the loop above, so the error messages below may name the wrong
        # text / corrector — confirm intended behavior.
        snr, treeid2get = sorted(sids2all[sid])[0][-2:]
        #print treeid2get, type(treeid2get)
        #lknlk
        dic = sql.gettree(None, None, treeid2get, indb=db,
                          incursor=cursor)  # dic -> fetch the tree
        #if treeid2get==9669:
        #print 9669,dic
        if dic:
            sentencetree = dic["tree"]
            sentencetree = corrigerNumerotation(sentencetree)
            trees.append(sentencetree)
            #print " ".join(node["t"] for i,node in sentencetree.iteritems())
            # discard structurally broken trees, keeping a human-readable reason
            if checkTree(sentencetree)[0] == False:
                if checkTree(sentencetree)[1] == "self":
                    error_trees += [
                        "\t".join([
                            textname,
                            str(snr), user, "node " +
                            str(checkTree(sentencetree)[2]) +
                            " points to itself"
                        ])
                    ]
                else:
                    error_trees += [
                        "\t".join([
                            textname,
                            str(snr), user, "no gov at node " +
                            str(checkTree(sentencetree)[2])
                        ])
                    ]
                trees.remove(sentencetree)
        #print "nr arbres",len(trees)
        lastpourc = pourc
    print len(error_trees), "arbre(s) avec erreurs."
    if len(error_trees) > 0:
        # print the error table and write it as a dated tsv log file
        print "\t".join(["Texte", "num phrase", "correcteur", "cause"])
        for x in sorted(list(set(error_trees))):
            print x
        f = codecs.open(
            folder + "logs/log_erreurs." +
            datetime.datetime.now().strftime('%Y-%m-%d') + ".tsv", "w",
            "utf-8")
        f.write("\t".join(["Texte", "num phrase", "correcteur", "cause"]) + '\n')
        for e in error_trees:
            f.write(e + '\n')
        f.close()
        print "Erreurs dans", f.name
    print len(trees), "arbres restants pour entrainement"
    # (log file created above)
    db.commit()
    db.close()
    return trees
def fusionForgottenTrees(project="Platinum",
                         fusdir="../projects/OrfeoGold2016/platinum/*",
                         annotators=["admin"]):
    """ takes trees from project ordered by annotators. if they exist fuse them into the fusdir
    result has the extension "cool.conll"
    ,"Sy","Marion"

    For every text: database trees (by annotator priority) win over the trees
    found in the matching file of fusdir; both get lemma corrections applied
    from the module-level lemmacorrection mapping.  Writes one
    <textname>.cool.conll file per matched text into exportcool/.
    """
    #print lemmacorrection
    sys.path.insert(0, '../tools')
    import difflib
    outfiles = []
    sql = SQL(project)
    db, cursor = sql.open()
    goodTexts = {}
    outdir = os.path.join("..", "projects", project, "exportcool")
    try:
        os.mkdir(outdir)
    except OSError:
        pass  # export directory already exists
    for annotator in annotators:
        print[
            list(
                cursor.execute("select rowid from users where user =?;",
                               (annotator, )))
        ]
    # map annotator names to their database user ids, keeping priority order
    annotatorIds = tuple(a for (a, ) in [
        list(
            cursor.execute("select rowid from users where user =?;", (
                annotator, )))[0] for annotator in annotators
    ])
    print annotators, annotatorIds
    for textid, textname, nrtokens in list(
            cursor.execute("select rowid, * from texts;")):  # for each text
        print "\n__________________________doing", textname, "with", nrtokens, "tokens"
        nrutids = {}  # sentence nr -> {userid: treeid}
        # NOTE(review): a one-element annotatorIds tuple formats as "(id,)",
        # which is invalid SQL — this works only with 2+ annotators; confirm.
        for nr, userid, treeid in list(
                cursor.execute(
                    "select nr,userid,trees.rowid as treeid from trees, sentences where sentenceid=sentences.rowid and userid in {annotatorIds} and textid = ? order by nr;"
                    .format(annotatorIds=annotatorIds), (textid, ))):
            nrutids[nr] = nrutids.get(nr, {})
            nrutids[nr][userid] = treeid
        trees = {}  # sentence nr -> tree of the highest-priority annotator
        for nr in sorted(nrutids):  # for each sentence
            tree = None
            for aid in annotatorIds:  # for each interesting annotator id
                if aid in nrutids[nr]:
                    tree = sql.gettree(treeid=nrutids[nr][aid],
                                       indb=db,
                                       incursor=cursor)["tree"]
                    trees[nr] = tree
                    #print "atree:",tree
                    break
        #if not tree:
        #print "problem: no tree for nr",nr,"type",type(nr)
        #print "annotatorIds",annotatorIds
        #raise Exception('no tree', nr)
        #print trees
        print len(trees), "trees from", project
        print textname, textname.split(".")[0]
        # base name used to match a file in fusdir
        btextname = os.path.basename(textname).split(".")[0]
        if btextname.endswith("-one-word-per-line"):
            btextname = btextname[:-len("-one-word-per-line")]
        #print glob.glob(fusdir),[os.path.basename(fi).split(".")[0] for fi in glob.glob(fusdir)]
        cooltrees = []
        ptrees, ftrees = 0, 0  # counters: trees taken from project db / from file
        for fi in glob.glob(fusdir):
            if btextname == os.path.basename(fi).split(".")[0]:
                print "yes", btextname
                fustrees = conll.conllFile2trees(fi)
                print len(fustrees), "ftrees", fi
                for nr, ftree in enumerate(fustrees):
                    if nr + 1 in trees:
                        # the database tree wins over the file tree
                        #print "added tree",nr+1,"from database"
                        #ptree=platinum(trees[nr+1])
                        ptree = trees[nr + 1]
                        for iii in ptree:
                            ptree[iii]["tag2"] = "_"
                            if ptree[iii]["lemma"] in lemmacorrection:
                                ptree[iii]["lemma"] = lemmacorrection[
                                    ptree[iii]["lemma"]]
                        cooltrees += [ptree]
                        #print nr+1,"tree from",project#,tree
                        ptrees += 1
                        # warn on tokenization mismatch between file and db tree
                        if ftree.sentence() != u" ".join(
                            [ptree[i].get("t", "") for i in sorted(ptree)]):
                            print "\n_________", nr + 1
                            print ftree.sentence()
                            print u" ".join(
                                [ptree[i].get("t", "") for i in sorted(ptree)])
                            #for l in difflib.context_diff(ftree.sentence() ,u" ".join([ptree[i].get("t","") for i in sorted(ptree)])):print l
                            #print "dbtree",platinum(trees[nr+1])
                    else:
                        # no database tree: keep the file tree
                        for iii in ftree:
                            ftree[iii]["tag2"] = "_"
                            if ftree[iii]["lemma"] in lemmacorrection:
                                ftree[iii]["lemma"] = lemmacorrection[
                                    ftree[iii]["lemma"]]
                        #print nr+1,"tree from",fusdir#,tree
                        ftrees += 1
                        cooltrees += [ftree]
                        #print "added tree",nr+1,"from fustrees",fi
                outfile = os.path.join(outdir, textname + ".cool.conll")
                conll.trees2conllFile(cooltrees, outfile=outfile, columns=10)
                print "wrote", outfile
                print ptrees, "ptrees, ", ftrees, "ftrees"
                break
        if len(cooltrees) == 0:
            print "nothing for", btextname
        # NOTE(review): if no file in fusdir ever matched, `outfile` is stale
        # or undefined here — confirm every text has a matching file.
        outfiles += [outfile]
        #qsdf
    return outfiles
def exportGoodTexts(project, lastHuman=False, onlyValidated=True, pattern=False): """ TODO : - ajouter parametre p/selectionner Texte ex : "UD_ZH_[number]" """ outfiles = [] sql = SQL(project) db, cursor = sql.open() goodTexts = {} if onlyValidated: onlyValidated = "and todos.status=1" else: onlyValidated = "" # take all texts where a validator has validated if pattern: command = "select distinct * from texts, todos, users where texts.rowid=todos.textid and users.rowid=todos.userid and texts.textname {pattern};".format( pattern=pattern) # like 'UD_ZH%' else: command = "select distinct * from texts, todos, users where texts.rowid=todos.textid and todos.type=1 {onlyValidated} and users.rowid=todos.userid;".format( onlyValidated=onlyValidated) for row in cursor.execute(command): textname, nrtokens, userid, textid, validator, status, comment, user, realname = row goodTexts[textid] = (textname, userid, user) print "i'll take", textname, "validated by", user, "with", nrtokens, "tokens" sentenceValidationInValidatedText(cursor, sql, db) outdir = os.path.join("..", "projects", project, "export") try: os.mkdir(outdir) except OSError: pass for textid, (textname, userid, user) in goodTexts.iteritems(): textname = textname.replace("-one-word-per-line.conll14_Parse", "") if lastHuman: outfile = os.path.join(outdir, textname + ".lastHuman.conll") else: outfile = os.path.join( outdir, "validated." + textname + "." + user + ".conll") print "doing", textname, textid trees = [] if lastHuman: snr2all = {} for row in cursor.execute( """ select sentences.nr as snr, trees.rowid as treeid, users.user, trees.timestamp from sentences, trees, users where sentences.textid=? 
and sentences.rowid=trees.sentenceid and users.rowid = trees.userid; """, (textid, )): snr, treeid, user, timestamp = row snr2all[snr] = snr2all.get(snr, []) + [(timestamp, user, treeid)] lastpourc = -1 for c, snr in enumerate(sorted(snr2all)): pourc = int(float(c) / len(snr2all) * 100) if pourc != lastpourc: print "___{pourc}%___\r".format(pourc=pourc), lastusersnotparser = sorted([ (timestamp, user, treeid) for (timestamp, user, treeid) in snr2all[snr] if user not in ["parser", "mate"] ]) if len(lastusersnotparser) > 0: time, u, tid = lastusersnotparser[-1] # last tree by human else: time, u, tid = sorted( snr2all[snr])[-1] # last tree by whoever #print "je prends l'arbre de",u trees += [ sql.gettree(treeid=treeid, indb=db, incursor=cursor)["tree"] ] else: for ( treeid, sentencenr, ) in cursor.execute( "select trees.rowid, sentences.nr from texts, trees, sentences where texts.rowid=? and trees.userid=? and trees.sentenceid = sentences.rowid and sentences.textid=texts.rowid order by sentences.nr;", ( textid, userid, )).fetchall(): #print "ooo",sentencenr,"\r", print "nr", sentencenr, "_____\r", trees += [ sql.gettree(treeid=treeid, indb=db, incursor=cursor)["tree"] ] print "exporting", len(trees), "trees into", outfile outfiles += [outfile] conll.trees2conllFile(trees, outfile, columns=10) return outfiles
# NOTE(review): this excerpt starts mid-function — `sortable`, `prefix` and
# `outfile` are bound above this chunk (presumably inside a reorder-style
# helper); the indentation below is reconstructed. TODO confirm against the
# full file.
    new_trees = list()
    for nr, tree in sortable:
        # adding metadatas (rename sent_id, restarting from 0)
        tree.sentencefeatures["text"] = tree.sentence()
        tree.sentencefeatures["sent_id"] = prefix + "_" + str(nr - 1)
        # removing useless metadata
        del tree.sentencefeatures["nr"]
        new_trees.append(tree)
    conll.trees2conllFile(new_trees, outfile)


if __name__ == "__main__":
    ## Open project database
    sql = SQL("NaijaSUD")  # enter the project name here
    db, cursor = sql.open()
    ## Use 2 functions :
    # - exportLastBestAnnotations in lib/database.py -> writes a file with trees and their rank
    # - reorder in lib/yuchen.py -> reorder trees based on their rank, write a file with the output
    # arguments: textid and text name; the textid is visible in the editor URL,
    # e.g. https://arborator.ilpga.fr/editor.cgi?project=NaijaSUD&textid=74&opensentence=1
    users, c = sql.exportLastBestAnnotations(
        115, "P_ABJ_GWA_06_Ugo-lifestory_PRO")
    print(users, c)
    # path of the exported file
    fpath = "E:/TAL/Stage/arborator/projects/NaijaSUD/export/P_ABJ_GWA_06_Ugo.lifestory_PRO.most.recent.trees.with.feats.conllu"
    trees = conll.conllFile2trees(fpath)
    # reorder the conll trees and rename their sent_id
    reorder(trees, fpath + "_reordered")
def bulkcorrectDB(project, treeids=[]):
    """ bulk correction of a whole project! very slow! better to do directly in sql, for example:
    #change all functions:
    #update links set function='dep' where function='det';

    This variant applies a hard-coded correctfeatures() correction (forcing
    the features of nodes i1 "est" and i2 "-ce") to every tree, or only to
    the given treeids.

    NOTE(review): this module defines bulkcorrectDB several times; this last
    definition is the one visible at import time.
    """
    sql = SQL(project)
    db, cursor = sql.open()
    # restrict to the given tree ids, or take every tree of the project
    if treeids:
        a, v = ["rowid"], treeids
    else:
        a, v = [], []
    # NOTE(review): len(allt) below assumes getall returns a sequence,
    # not a generator — confirm in database.py
    allt = sql.getall(cursor, "trees", a, v)
    ti = time()  # start time, for the throughput / ETA estimates below
    for nr, (treeid, sid, uid, annotype, status, comment,
             timestamp) in enumerate(allt):
        # progress line: throughput and a crude remaining-time estimate
        print "_____ treeid", treeid, "nr", nr + 1, "/", len(
            allt
        ), "---", float(nr + 1) / (
            time() - ti), "trees per second", float(len(allt) - nr + 1) / (
                float(nr + 1) /
                (time() - ti)), "seconds to go", float(len(allt) - nr + 1) / (
                    float(nr + 1) / (time() - ti)) / 60, "minutes to go"
        dic = sql.gettree(None, None, treeid, indb=db, incursor=cursor)
        if dic:
            sentencetree = dic["tree"]
            #newdic, changed = complexbulkcorrectdic(sentencetree)
            #newdic, changed = simplebulkcorrectdic(sentencetree)
            #newdic, changed = correctfeatures(sentencetree)
            #newdic, changed = correctfeatures(sentencetree,4,{})
            # hard-coded correction of nodes i1 ("est") and i2 ("-ce")
            newdic, changed = correctfeatures(
                sentencetree, 4, {
                    'i1': {
                        u'person': u'3',
                        u'number': u'sg',
                        u'cat': u'V',
                        u'lemma': u'\xeatre',
                        u'token': u'est',
                        u'tense': u'present',
                        u'mode': u'indicative',
                        'gov': {
                            'i0': 'root'
                        },
                        u't': u'A'
                    },
                    'i2': {
                        u'cat': u'Cl',
                        u'lemma': u'c',
                        u'token': u'-ce',
                        u't': u'B',
                        'gov': {
                            'i1': u'sub'
                        },
                        'child': None
                    }
                })
            #break
            if changed:
                print "________________________________\n"
                #for i,node in newdic.iteritems():
                #print i, node["t"], node
                #1/0
                tokensChanged = True
                ws, sentence, _ = sql.enterTree(cursor,
                                                newdic,
                                                sid,
                                                uid,
                                                tokensChanged=tokensChanged)
                print sentence
                print "changed"
                db.commit()  # commit after each modified tree
    #db.commit()
    db.close()