Exemplo n.º 1
0
def exportUniqueSentences(project, mode="lasttree", pattern=False):
    """
	exports one tree per sentences: the first time the sentence is found, the newest tree
	"""
    sql = SQL(project)
    db, cursor = sql.open()
    sentences = {}  # toks -> tree
    outdir = os.path.join("..", "projects", project, "export")
    try:
        os.mkdir(outdir)
    except OSError:
        pass
    outfile = os.path.join(outdir, "allSentences.conll")
    if pattern:
        command = """select trees.rowid,userid,max(timestamp) from trees, sentences, texts where texts.rowid=sentences.textid and sentences.rowid=trees.sentenceid
		and textname like "{pattern}"
		group by sentenceid order by trees.rowid;""".format(pattern=pattern)
    else:
        command = """select trees.rowid,userid,max(timestamp) from trees, sentences, texts where texts.rowid=sentences.textid and sentences.rowid=trees.sentenceid
		group by sentenceid order by trees.rowid;"""
    for i, (
            treeid,
            userid,
            timestamp,
    ) in enumerate(cursor.execute(command).fetchall()):
        tree = sql.gettree(treeid=treeid, indb=db, incursor=cursor)["tree"]
        toks = tuple(tree[i]["t"] for i in tree)
        print "___", i, "\r",
        if toks not in sentences:
            sentences[toks] = tree
    print "writing file with", len(sentences), "sentences..."
    conll.trees2conllFile([sentences[toks] for toks in sorted(sentences)],
                          outfile=outfile,
                          columns=10)
    return outfile
Exemplo n.º 2
0
def exportConllByAnnotators(project, annotators=["prof", "Sy", "parser"]):
    """
	exports complete project
	for every sentence, trees of annotators in given order.
	if no tree: throw error 
	
	"""
    outfiles = []
    sql = SQL(project)
    db, cursor = sql.open()
    goodTexts = {}
    outdir = os.path.join("..", "projects", project, "export")
    try:
        os.mkdir(outdir)
    except OSError:
        pass

    annotatorIds = tuple(a for (a, ) in [
        list(
            cursor.execute("select rowid from users where user =?;", (
                annotator, )))[0] for annotator in annotators
    ])
    #print annotators, annotatorIds

    for textid, textname, nrtokens in list(
            cursor.execute("select rowid, * from texts;")):  # for each text
        print "doing", textname, "with", nrtokens, "tokens"
        nrutids = {}
        for nr, userid, treeid in list(
                cursor.execute(
                    "select nr,userid,trees.rowid as treeid from trees, sentences where sentenceid=sentences.rowid and userid in {annotatorIds} and  textid = ? order by nr;"
                    .format(annotatorIds=annotatorIds), (textid, ))):
            nrutids[nr] = nrutids.get(nr, {})
            nrutids[nr][userid] = treeid
        trees = []
        for nr in sorted(nrutids):  # for each sentence
            tree = None
            for aid in annotatorIds:  # for each interesting annotator id
                if aid in nrutids[nr]:
                    tree = sql.gettree(treeid=nrutids[nr][aid],
                                       indb=db,
                                       incursor=cursor)["tree"]
                    trees += [tree]
                    #print "atree:",tree
                    break
            if not tree:
                print "problem: no tree for nr", nr, "type", type(nr)
                print "annotatorIds", annotatorIds
                raise Exception('no tree', nr)

        if textname.endswith(".conll"): textname = textname[:-len(".conll")]
        outfile = os.path.join(outdir, textname)
        conll.trees2conllFile(trees, outfile=outfile, columns=10)
        print len(trees), "trees"
        outfiles += [outfile]
    return outfiles
Exemplo n.º 3
0
def printTree(project, treeid):
    sql = SQL(project)
    db, cursor = sql.open()
    dic = sql.gettree(
        None, None, treeid, indb=db,
        incursor=cursor)  # dic -> get tree == on récupère l'arbre
    #print dic
    if dic and dic["tree"]:
        sentencetree = dic["tree"]
        #sentencetree=corrigerNumerotation(sentencetree)
        for i in sorted(sentencetree):
            print i, sentencetree[i]
def bulkcorrectDB(project, treeids=[], commit=True):
    """
	bulk correction of a whole project! very slow!
	
	better to do directly in sql, for example:
	#change all functions:
		#update links set function='dep' where function='det';

	"""

    sql = SQL(project)
    db, cursor = sql.open()

    if treeids: a, v = ["rowid"], treeids
    else: a, v = [], []

    allt = list(sql.getall(cursor, "trees", a, v))
    print "nb trees:", len(allt)
    ti = time()

    for nr, (treeid, sid, uid, annotype, status, comment,
             timestamp) in enumerate(allt):

        dic = sql.gettree(None, None, treeid, indb=db, incursor=cursor)
        if dic:
            tree = dic["tree"]
            newtree, changed = correctLowerProperNouns(tree)

            if changed:
                print "________________________________\n"
                ws, sentence, _ = sql.enterTree(cursor,
                                                newtree,
                                                sid,
                                                uid,
                                                tokensChanged=True)
                print sentence
                print "changed", changed
                if not nr % 100:
                    print "committing..."
                    if commit: db.commit()
            if not nr % 100:
                print "_____ treeid", treeid, "nr", nr + 1, "/", len(
                    allt), "---", int(float(nr + 1) /
                                      (time() - ti)), "trees per second", int(
                                          float(len(allt) - nr + 1) /
                                          (float(nr + 1) /
                                           (time() - ti))), "seconds (", round(
                                               float(len(allt) - nr + 1) /
                                               (float(nr + 1) /
                                                (time() - ti)) / 60,
                                               1), "minutes) to go"
    if commit: db.commit()
    db.close()
Exemplo n.º 5
0
def bulkcorrectDB(project, treeids=[]):
	"""
	bulk correction of a whole project! very slow!
	
	better to do directly in sql, for example:
	#change all functions:
		#update links set function='dep' where function='det';

	"""
	
	sql = SQL(project)
	db,cursor=sql.open()
	
	if treeids:	a,v=["rowid"],treeids
	else:		a,v=[],[]
	
	allt=sql.getall(cursor, "trees",a,v)
	
	ti = time()
	
	for nr, (treeid,sid,uid,annotype,status,comment,timestamp) in enumerate(allt):
		
		print "_____ treeid",treeid,"nr",nr+1,"/",len(allt),"---",float(nr+1)/(time()-ti),"trees per second",float(len(allt)-nr+1)/(float(nr+1)/(time()-ti)),"seconds to go",float(len(allt)-nr+1)/(float(nr+1)/(time()-ti))/60,"minutes to go"
		
		dic=sql.gettree(None,None,treeid, indb=db,incursor=cursor)
		if dic:
			sentencetree=dic["tree"]
			
			#newdic, changed = complexbulkcorrectdic(sentencetree)
			#newdic, changed = simplebulkcorrectdic(sentencetree)
			#newdic, changed = correctfeatures(sentencetree)
			#newdic, changed = correctfeatures(sentencetree,4,{})
			newdic, changed = correctfeatures(sentencetree,4,{	
				'i1':{ u'person': u'3', u'number': u'sg', u'cat': u'V', u'lemma': u'\xeatre', u'token': u'est', u'tense': u'present', u'mode': u'indicative', 'gov': {'i0':'root'}, u't': u'A'},
				'i2':{u'cat': u'Cl', u'lemma': u'c', u'token': u'-ce', u't': u'B', 'gov': {'i1': u'sub'}, 'child':None}
			})
			
	
			
			#break
			if changed:
				print "________________________________\n"
				#for i,node in newdic.iteritems():
					#print i, node["t"], node
				#1/0
				tokensChanged=True
				ws,sentence,_ = sql.enterTree(cursor, newdic, sid, uid,tokensChanged=tokensChanged)
				print sentence
				print "changed"
				db.commit()
	#db.commit()
	db.close()
Exemplo n.º 6
0
def getValidatedTrees(project, folder, whoseTrees="validator"):
    sql = SQL(project)
    db, cursor = sql.open()
    sentenceValidationInValidatedText(cursor, sql, db)
    #on récupère les nouveaux arbres
    b = databaseQuery(cursor, table=whoseTrees)
    print len(b), u"trees to extract"
    sids2all = {}
    trees = []
    error_trees = []
    textnames = {}
    for nr, (treeid, textname, user, snr, sid, uid, annotype, status, comment,
             timestamp) in enumerate(b):
        # TODO: remove:
        #if textname.startswith("mandarinParsed"):continue
        sids2all[sid] = sids2all.get(
            sid, []) + [(timestamp, textname, user, snr, treeid)]
        textnames[textname] = None
    #print len(sids2all)
    print u"trees extracted from the samples", ", ".join(sorted(textnames))
    lastpourc = -1
    for c, sid in enumerate(sids2all):
        pourc = int(float(c) / len(sids2all) * 100)
        if pourc != lastpourc:
            sys.stdout.write("{pourc}%\r".format(pourc=pourc))
        sys.stdout.flush()

        snr, treeid2get = sorted(sids2all[sid])[0][-2:]
        #print treeid2get, type(treeid2get)
        #lknlk
        dic = sql.gettree(None, None, treeid2get, indb=db,
                          incursor=cursor)  # dic -> get tree
        #if treeid2get==9669:
        #print 9669,dic

        if dic:
            sentencetree = dic["tree"]
            sentencetree = corrigerNumerotation(sentencetree)
            trees.append(sentencetree)
            #print " ".join(node["t"] for i,node in sentencetree.iteritems())
            if checkTree(sentencetree)[0] == False:
                if checkTree(sentencetree)[1] == "self":
                    error_trees += [
                        "\t".join([
                            textname,
                            str(snr), user,
                            "node " + str(checkTree(sentencetree)[2]) +
                            " points to itself"
                        ])
                    ]
                else:
                    error_trees += [
                        "\t".join([
                            textname,
                            str(snr), user, "no gov at node " +
                            str(checkTree(sentencetree)[2])
                        ])
                    ]
                trees.remove(sentencetree)
                #print "nr arbres",len(trees)
        lastpourc = pourc
    print len(error_trees), "arbre(s) avec erreurs."
    if len(error_trees) > 0:
        print "\t".join(["Texte", "num phrase", "correcteur", "cause"])
        for x in sorted(list(set(error_trees))):
            print x
        f = codecs.open(
            folder + "logs/log_erreurs." +
            datetime.datetime.now().strftime('%Y-%m-%d') + ".tsv", "w",
            "utf-8")
        f.write("\t".join(["Texte", "num phrase", "correcteur", "cause"]) +
                '\n')
        for e in error_trees:
            f.write(e + '\n')
        f.close()
        print "Erreurs dans", f.name
    print len(trees), "arbres restants pour entrainement"
    #Creation d'un fichier log
    db.commit()
    db.close()
    return trees
Exemplo n.º 7
0
def fusionForgottenTrees(project="Platinum",
                         fusdir="../projects/OrfeoGold2016/platinum/*",
                         annotators=["admin"]):
    """
	takes trees from project ordered by annotators. if they exist fuse them into the fusdir
	result has the extension "cool.conll"
	,"Sy","Marion"
	"""

    #print lemmacorrection
    sys.path.insert(0, '../tools')
    import difflib
    outfiles = []
    sql = SQL(project)
    db, cursor = sql.open()
    goodTexts = {}
    outdir = os.path.join("..", "projects", project, "exportcool")
    try:
        os.mkdir(outdir)
    except OSError:
        pass
    for annotator in annotators:
        print[
            list(
                cursor.execute("select rowid from users where user =?;",
                               (annotator, )))
        ]
    annotatorIds = tuple(a for (a, ) in [
        list(
            cursor.execute("select rowid from users where user =?;", (
                annotator, )))[0] for annotator in annotators
    ])
    print annotators, annotatorIds

    for textid, textname, nrtokens in list(
            cursor.execute("select rowid, * from texts;")):  # for each text
        print "\n__________________________doing", textname, "with", nrtokens, "tokens"
        nrutids = {}
        for nr, userid, treeid in list(
                cursor.execute(
                    "select nr,userid,trees.rowid as treeid from trees, sentences where sentenceid=sentences.rowid and userid in {annotatorIds} and  textid = ? order by nr;"
                    .format(annotatorIds=annotatorIds), (textid, ))):
            nrutids[nr] = nrutids.get(nr, {})
            nrutids[nr][userid] = treeid
        trees = {}
        for nr in sorted(nrutids):  # for each sentence
            tree = None
            for aid in annotatorIds:  # for each interesting annotator id
                if aid in nrutids[nr]:
                    tree = sql.gettree(treeid=nrutids[nr][aid],
                                       indb=db,
                                       incursor=cursor)["tree"]
                    trees[nr] = tree
                    #print "atree:",tree
                    break
            #if not tree:
            #print "problem: no tree for nr",nr,"type",type(nr)
            #print "annotatorIds",annotatorIds
            #raise Exception('no tree', nr)
        #print trees
        print len(trees), "trees from", project
        print textname, textname.split(".")[0]
        btextname = os.path.basename(textname).split(".")[0]
        if btextname.endswith("-one-word-per-line"):
            btextname = btextname[:-len("-one-word-per-line")]
        #print glob.glob(fusdir),[os.path.basename(fi).split(".")[0] for fi in glob.glob(fusdir)]
        cooltrees = []
        ptrees, ftrees = 0, 0
        for fi in glob.glob(fusdir):
            if btextname == os.path.basename(fi).split(".")[0]:
                print "yes", btextname
                fustrees = conll.conllFile2trees(fi)
                print len(fustrees), "ftrees", fi
                for nr, ftree in enumerate(fustrees):
                    if nr + 1 in trees:
                        #print "added tree",nr+1,"from database"
                        #ptree=platinum(trees[nr+1])
                        ptree = trees[nr + 1]
                        for iii in ptree:
                            ptree[iii]["tag2"] = "_"
                            if ptree[iii]["lemma"] in lemmacorrection:
                                ptree[iii]["lemma"] = lemmacorrection[
                                    ptree[iii]["lemma"]]
                        cooltrees += [ptree]
                        #print nr+1,"tree from",project#,tree
                        ptrees += 1
                        if ftree.sentence() != u" ".join(
                            [ptree[i].get("t", "") for i in sorted(ptree)]):
                            print "\n_________", nr + 1
                            print ftree.sentence()
                            print u" ".join(
                                [ptree[i].get("t", "") for i in sorted(ptree)])
                            #for l in difflib.context_diff(ftree.sentence() ,u" ".join([ptree[i].get("t","") for i in sorted(ptree)])):print l

                        #print "dbtree",platinum(trees[nr+1])
                    else:
                        for iii in ftree:
                            ftree[iii]["tag2"] = "_"
                            if ftree[iii]["lemma"] in lemmacorrection:
                                ftree[iii]["lemma"] = lemmacorrection[
                                    ftree[iii]["lemma"]]
                        #print nr+1,"tree from",fusdir#,tree
                        ftrees += 1
                        cooltrees += [ftree]
                        #print "added tree",nr+1,"from fustrees",fi
                outfile = os.path.join(outdir, textname + ".cool.conll")
                conll.trees2conllFile(cooltrees, outfile=outfile, columns=10)
                print "wrote", outfile
                print ptrees, "ptrees, ", ftrees, "ftrees"
                break
        if len(cooltrees) == 0: print "nothing for", btextname
        outfiles += [outfile]
        #qsdf
    return outfiles
Exemplo n.º 8
0
def exportGoodTexts(project,
                    lastHuman=False,
                    onlyValidated=True,
                    pattern=False):
    """
	TODO :
	- ajouter parametre p/selectionner Texte
	ex : "UD_ZH_[number]"
	"""
    outfiles = []
    sql = SQL(project)
    db, cursor = sql.open()
    goodTexts = {}
    if onlyValidated: onlyValidated = "and todos.status=1"
    else: onlyValidated = ""
    # take all texts where a validator has validated
    if pattern:
        command = "select distinct * from texts, todos, users where texts.rowid=todos.textid and users.rowid=todos.userid and texts.textname {pattern};".format(
            pattern=pattern)  # like 'UD_ZH%'
    else:
        command = "select distinct * from texts, todos, users where texts.rowid=todos.textid and todos.type=1 {onlyValidated} and users.rowid=todos.userid;".format(
            onlyValidated=onlyValidated)
    for row in cursor.execute(command):
        textname, nrtokens, userid, textid, validator, status, comment, user, realname = row
        goodTexts[textid] = (textname, userid, user)
        print "i'll take", textname, "validated by", user, "with", nrtokens, "tokens"
    sentenceValidationInValidatedText(cursor, sql, db)
    outdir = os.path.join("..", "projects", project, "export")
    try:
        os.mkdir(outdir)
    except OSError:
        pass

    for textid, (textname, userid, user) in goodTexts.iteritems():
        textname = textname.replace("-one-word-per-line.conll14_Parse", "")

        if lastHuman:
            outfile = os.path.join(outdir, textname + ".lastHuman.conll")
        else:
            outfile = os.path.join(
                outdir, "validated." + textname + "." + user + ".conll")
        print "doing", textname, textid
        trees = []

        if lastHuman:
            snr2all = {}
            for row in cursor.execute(
                    """
			select sentences.nr as snr, trees.rowid as treeid, users.user, trees.timestamp 
			from sentences, trees, users 
			where sentences.textid=? 
			and sentences.rowid=trees.sentenceid 
			and users.rowid = trees.userid; """, (textid, )):
                snr, treeid, user, timestamp = row
                snr2all[snr] = snr2all.get(snr,
                                           []) + [(timestamp, user, treeid)]
            lastpourc = -1
            for c, snr in enumerate(sorted(snr2all)):
                pourc = int(float(c) / len(snr2all) * 100)
                if pourc != lastpourc:
                    print "___{pourc}%___\r".format(pourc=pourc),

                lastusersnotparser = sorted([
                    (timestamp, user, treeid)
                    for (timestamp, user, treeid) in snr2all[snr]
                    if user not in ["parser", "mate"]
                ])
                if len(lastusersnotparser) > 0:
                    time, u, tid = lastusersnotparser[-1]  # last tree by human
                else:
                    time, u, tid = sorted(
                        snr2all[snr])[-1]  # last tree by whoever
                #print "je prends l'arbre de",u
                trees += [
                    sql.gettree(treeid=treeid, indb=db,
                                incursor=cursor)["tree"]
                ]

        else:

            for (
                    treeid,
                    sentencenr,
            ) in cursor.execute(
                    "select trees.rowid, sentences.nr from texts, trees, sentences where texts.rowid=? and trees.userid=? and trees.sentenceid = sentences.rowid and sentences.textid=texts.rowid order by sentences.nr;",
                (
                    textid,
                    userid,
                )).fetchall():
                #print "ooo",sentencenr,"\r",
                print "nr", sentencenr, "_____\r",
                trees += [
                    sql.gettree(treeid=treeid, indb=db,
                                incursor=cursor)["tree"]
                ]

        print "exporting", len(trees), "trees into", outfile
        outfiles += [outfile]
        conll.trees2conllFile(trees, outfile, columns=10)
    return outfiles
Exemplo n.º 9
0
def bulkcorrectDB(project, treeids=[]):
    """
	bulk correction of a whole project! very slow!
	
	better to do directly in sql, for example:
	#change all functions:
		#update links set function='dep' where function='det';

	"""

    sql = SQL(project)
    db, cursor = sql.open()

    if treeids: a, v = ["rowid"], treeids
    else: a, v = [], []

    allt = sql.getall(cursor, "trees", a, v)

    ti = time()

    for nr, (treeid, sid, uid, annotype, status, comment,
             timestamp) in enumerate(allt):

        print "_____ treeid", treeid, "nr", nr + 1, "/", len(
            allt
        ), "---", float(nr + 1) / (
            time() - ti), "trees per second", float(len(allt) - nr + 1) / (
                float(nr + 1) /
                (time() - ti)), "seconds to go", float(len(allt) - nr + 1) / (
                    float(nr + 1) / (time() - ti)) / 60, "minutes to go"

        dic = sql.gettree(None, None, treeid, indb=db, incursor=cursor)
        if dic:
            sentencetree = dic["tree"]

            #newdic, changed = complexbulkcorrectdic(sentencetree)
            #newdic, changed = simplebulkcorrectdic(sentencetree)
            #newdic, changed = correctfeatures(sentencetree)
            #newdic, changed = correctfeatures(sentencetree,4,{})
            newdic, changed = correctfeatures(
                sentencetree, 4, {
                    'i1': {
                        u'person': u'3',
                        u'number': u'sg',
                        u'cat': u'V',
                        u'lemma': u'\xeatre',
                        u'token': u'est',
                        u'tense': u'present',
                        u'mode': u'indicative',
                        'gov': {
                            'i0': 'root'
                        },
                        u't': u'A'
                    },
                    'i2': {
                        u'cat': u'Cl',
                        u'lemma': u'c',
                        u'token': u'-ce',
                        u't': u'B',
                        'gov': {
                            'i1': u'sub'
                        },
                        'child': None
                    }
                })

            #break
            if changed:
                print "________________________________\n"
                #for i,node in newdic.iteritems():
                #print i, node["t"], node
                #1/0
                tokensChanged = True
                ws, sentence, _ = sql.enterTree(cursor,
                                                newdic,
                                                sid,
                                                uid,
                                                tokensChanged=tokensChanged)
                print sentence
                print "changed"
                db.commit()
    #db.commit()
    db.close()