def extractConllFiles(project, outfolder):
    """ creates the empty files with a word per line as a first step to mate parsing

    Reads every token of every sentence of the project database and writes,
    per text, a 14-column CoNLL skeleton file (token index, token form, and
    twelve "_" placeholder columns) into outfolder.

    project   : name of the project database
    outfolder : directory the skeleton files are written into

    Returns the list of written file paths.
    """
    # make sure the folder ends with "/" so plain concatenation builds a valid path
    if outfolder[-1] != "/":
        outfolder = outfolder + "/"
    texts = {}  # textname -> list of (sentence nr, token nr, token form)
    sql = SQL(project)
    db, cursor = sql.open()
    # every token ("t" feature) with its text name, sentence number and token
    # number; the user filter is hard-coded ("******")
    # NOTE(review): "--…" is a SQL line comment and must stay on its own line.
    command = """select distinct texts.textname, sentences.nr, features.nr, features.value
        from features, trees, texts, sentences, users
        where attr = "t"
        and trees.rowid=features.treeid
        --and sentences.textid=13
        and trees.sentenceid=sentences.rowid
        and sentences.textid=texts.rowid
        and users.user="******";"""
    cursor.execute(command)
    a = cursor.fetchall()
    #print a
    for nr, (textname, snr, num, token) in enumerate(a):
        #sql.exportAnnotations(textid, textname, "lastconll")
        # group tokens by text; sorting later restores sentence/token order
        texts[textname] = texts.get(textname, []) + [(snr, num, token)]
    newfiles = []
    for textname in texts:
        print "processing", textname
        f = codecs.open(outfolder + textname, "w", "utf-8")
        for c, (snr, num, tok) in enumerate(sorted(texts[textname])):
            # token numbering restarts at 1 for every sentence:
            # emit a blank line between sentences
            if num == 1 and c > 0:
                f.write('\n')
            # CoNLL line: index, form, and twelve empty columns
            f.write("\t".join([str(num), tok] + ["_"] * 12) + '\n')
        print c + 1, "tokens"
        f.close()
        newfiles += [outfolder + textname]
    return newfiles
def exportUniqueSentences(project, mode="lasttree", pattern=False):
    """ exports one tree per sentences: the first time the sentence is found, the newest tree

    For every sentence of the project (optionally restricted to texts whose
    name matches the SQL LIKE `pattern`) the newest tree is fetched; duplicate
    sentences (identical token sequences) are kept only once.  The surviving
    trees are written to projects/<project>/export/allSentences.conll.

    Returns the path of the written file.
    """
    # NOTE(review): the `mode` parameter is accepted but never used in this body.
    sql = SQL(project)
    db, cursor = sql.open()
    sentences = {}  # toks -> tree
    outdir = os.path.join("..", "projects", project, "export")
    try:
        os.mkdir(outdir)
    except OSError:
        pass  # export directory already exists
    outfile = os.path.join(outdir, "allSentences.conll")
    # newest tree per sentence (max(timestamp) with group by sentenceid)
    if pattern:
        command = """select trees.rowid,userid,max(timestamp) from trees, sentences, texts where texts.rowid=sentences.textid and sentences.rowid=trees.sentenceid and textname like "{pattern}" group by sentenceid order by trees.rowid;""".format(pattern=pattern)
    else:
        command = """select trees.rowid,userid,max(timestamp) from trees, sentences, texts where texts.rowid=sentences.textid and sentences.rowid=trees.sentenceid group by sentenceid order by trees.rowid;"""
    for i, (treeid, userid, timestamp,) in enumerate(cursor.execute(command).fetchall()):
        tree = sql.gettree(treeid=treeid, indb=db, incursor=cursor)["tree"]
        # the tuple of token forms identifies a sentence
        toks = tuple(tree[i]["t"] for i in tree)
        print "___", i, "\r",
        # keep only the first occurrence of each token sequence
        if toks not in sentences:
            sentences[toks] = tree
    print "writing file with", len(sentences), "sentences..."
    conll.trees2conllFile([sentences[toks] for toks in sorted(sentences)], outfile=outfile, columns=10)
    return outfile
def directDatabaseChangeForForgottenCorrection(): from database import SQL sql = SQL("Platinum") db, cursor = sql.open() cursor.execute('update links set function="comp" where function="aff";') db.commit() db.close() print "changed"
def lastTreeForAllSamples(project, onlyHuman=True, combine=False): outdir = os.path.join("..", "projects", project, "export") try: os.mkdir(outdir) except OSError: pass sql = SQL(project) db, cursor = sql.open() if onlyHuman: parserid = 0 for pid, in cursor.execute( "select rowid from users where user='******';"): parserid = pid else: parserid = -1 sents = sorted( cursor.execute( "select texts.textname, sentences.rowid, sentences.nr from sentences, texts where texts.rowid=sentences.textid;" ).fetchall()) print "todo:", len(sents), "sentences" pbar = tqdm.tqdm(total=len(sents)) annotators = {} if combine: trees = [] getTreesForSents(sents, trees, annotators, parserid, cursor, db, sql, pbar, project=project) outfile = os.path.join(outdir, project + ".lastHumanTreeForAllSamples.conllu") conll.trees2conllFile(trees, outfile=outfile) print "wrote", outfile else: for tid, textname, nrtokens in list( cursor.execute("select rowid, * from texts;")): print tid, textname, nrtokens sents = list( cursor.execute( "select rowid, * from sentences where textid=?;", (tid, )).fetchall()) trees = [] getTreesForSents(sents, trees, annotators, parserid, cursor, db, sql, pbar) if textname.endswith(".conll_parse"): textname = textname[:len(".conll_parse")] outfile = os.path.join(outdir, textname + ".lastHumanTrees.conllu") conll.trees2conllFile(trees, outfile=outfile) print "wrote", outfile for a in annotators: print a, annotators[a]
def exportConllByAnnotators(project, annotators=["prof", "Sy", "parser"]): """ exports complete project for every sentence, trees of annotators in given order. if no tree: throw error """ outfiles = [] sql = SQL(project) db, cursor = sql.open() goodTexts = {} outdir = os.path.join("..", "projects", project, "export") try: os.mkdir(outdir) except OSError: pass annotatorIds = tuple(a for (a, ) in [ list( cursor.execute("select rowid from users where user =?;", ( annotator, )))[0] for annotator in annotators ]) #print annotators, annotatorIds for textid, textname, nrtokens in list( cursor.execute("select rowid, * from texts;")): # for each text print "doing", textname, "with", nrtokens, "tokens" nrutids = {} for nr, userid, treeid in list( cursor.execute( "select nr,userid,trees.rowid as treeid from trees, sentences where sentenceid=sentences.rowid and userid in {annotatorIds} and textid = ? order by nr;" .format(annotatorIds=annotatorIds), (textid, ))): nrutids[nr] = nrutids.get(nr, {}) nrutids[nr][userid] = treeid trees = [] for nr in sorted(nrutids): # for each sentence tree = None for aid in annotatorIds: # for each interesting annotator id if aid in nrutids[nr]: tree = sql.gettree(treeid=nrutids[nr][aid], indb=db, incursor=cursor)["tree"] trees += [tree] #print "atree:",tree break if not tree: print "problem: no tree for nr", nr, "type", type(nr) print "annotatorIds", annotatorIds raise Exception('no tree', nr) if textname.endswith(".conll"): textname = textname[:-len(".conll")] outfile = os.path.join(outdir, textname) conll.trees2conllFile(trees, outfile=outfile, columns=10) print len(trees), "trees" outfiles += [outfile] return outfiles
def exportConllByAnnotators(project, annotators=["prof", "Sy", "parser"], fileExtension=".conllu"): """ exports complete project for every sentence, trees of annotators in given order. if no tree: throw error """ outfiles = [] sql = SQL(project) db, cursor = sql.open() goodTexts = {} outdir = os.path.join("..", "projects", project, "export") try: os.mkdir(outdir) except OSError: pass try: annotatorIds = tuple(a for (a, ) in [ list( cursor.execute("select rowid from users where user =?;", ( annotator, )))[0] for annotator in annotators ]) except: print "some required annotator IDs are not in the database" return print annotators, annotatorIds for textid, textname, nrtokens in list( cursor.execute("select rowid, * from texts;")): # for each text print "doing", textname, "with", nrtokens, "tokens" nrutids = {} for nr, userid, treeid in list( cursor.execute( "select nr,userid,trees.rowid as treeid from trees, sentences where sentenceid=sentences.rowid and userid in {annotatorIds} and textid = ? order by nr;" .format(annotatorIds=annotatorIds if len(annotatorIds) > 1 else '(' + str(annotatorIds[0]) + ')'), (textid, ))): nrutids[nr] = nrutids.get(nr, {}) nrutids[nr][userid] = treeid trees = getSpecificTrees(sql, db, cursor, nrutids, annotatorIds) if trees: if textname.endswith(".conll"): textname = textname[:-len(".conll")] if textname.endswith(".conllu"): textname = textname[:-len(".conllu")] outfile = os.path.join(outdir, textname + fileExtension) conll.trees2conllFile(trees, outfile=outfile, columns=10) print len(trees), "trees" outfiles += [outfile] else: print "skipped", textname return outfiles
def printTree(project, treeid): sql = SQL(project) db, cursor = sql.open() dic = sql.gettree( None, None, treeid, indb=db, incursor=cursor) # dic -> get tree == on récupère l'arbre #print dic if dic and dic["tree"]: sentencetree = dic["tree"] #sentencetree=corrigerNumerotation(sentencetree) for i in sorted(sentencetree): print i, sentencetree[i]
def bulkcorrectDB(project, treeids=[], commit=True):
    """ bulk correction of a whole project! very slow! better to do directly in sql, for example:
    #change all functions:
    #update links set function='dep' where function='det';

    Applies correctLowerProperNouns to every tree (or only to the given
    treeids), stores the modified trees back, and commits every 100 trees.

    NOTE(review): this module defines bulkcorrectDB several times; only the
    last definition is visible at import time.
    """
    sql = SQL(project)
    db, cursor = sql.open()
    # restrict to the given tree ids, or take every tree of the project
    if treeids:
        a, v = ["rowid"], treeids
    else:
        a, v = [], []
    allt = list(sql.getall(cursor, "trees", a, v))
    print "nb trees:", len(allt)
    ti = time()  # start time, used for throughput / ETA estimates below
    for nr, (treeid, sid, uid, annotype, status, comment,
             timestamp) in enumerate(allt):
        dic = sql.gettree(None, None, treeid, indb=db, incursor=cursor)
        if dic:
            tree = dic["tree"]
            newtree, changed = correctLowerProperNouns(tree)
            if changed:
                print "________________________________\n"
                ws, sentence, _ = sql.enterTree(cursor,
                                                newtree,
                                                sid,
                                                uid,
                                                tokensChanged=True)
                print sentence
                print "changed", changed
        # commit and report progress every 100 trees
        if not nr % 100:
            print "committing..."
            if commit:
                db.commit()
        if not nr % 100:
            # throughput and a crude remaining-time estimate
            print "_____ treeid", treeid, "nr", nr + 1, "/", len(
                allt), "---", int(float(nr + 1) / (time() - ti)), "trees per second", int(
                    float(len(allt) - nr + 1) /
                    (float(nr + 1) / (time() - ti))), "seconds (", round(
                        float(len(allt) - nr + 1) /
                        (float(nr + 1) / (time() - ti)) / 60,
                        1), "minutes) to go"
    if commit:
        db.commit()
    db.close()
def bulkcorrectDB(project, treeids=[]):
    """ bulk correction of a whole project! very slow! better to do directly in sql, for example:
    #change all functions:
    #update links set function='dep' where function='det';

    This variant applies a hard-coded correctfeatures() correction (forcing
    the features of nodes i1 "est" and i2 "-ce") to every tree, or only to
    the given treeids.

    NOTE(review): this module defines bulkcorrectDB several times; only the
    last definition is visible at import time.
    """
    sql = SQL(project)
    db,cursor=sql.open()
    # restrict to the given tree ids, or take every tree of the project
    if treeids:
        a,v=["rowid"],treeids
    else:
        a,v=[],[]
    # NOTE(review): len(allt) below assumes getall returns a sequence,
    # not a generator — confirm in database.py
    allt=sql.getall(cursor, "trees",a,v)
    ti = time()  # start time, for the throughput / ETA estimates below
    for nr, (treeid,sid,uid,annotype,status,comment,timestamp) in enumerate(allt):
        # progress line: throughput and a crude remaining-time estimate
        print "_____ treeid",treeid,"nr",nr+1,"/",len(allt),"---",float(nr+1)/(time()-ti),"trees per second",float(len(allt)-nr+1)/(float(nr+1)/(time()-ti)),"seconds to go",float(len(allt)-nr+1)/(float(nr+1)/(time()-ti))/60,"minutes to go"
        dic=sql.gettree(None,None,treeid, indb=db,incursor=cursor)
        if dic:
            sentencetree=dic["tree"]
            #newdic, changed = complexbulkcorrectdic(sentencetree)
            #newdic, changed = simplebulkcorrectdic(sentencetree)
            #newdic, changed = correctfeatures(sentencetree)
            #newdic, changed = correctfeatures(sentencetree,4,{})
            # hard-coded correction of nodes i1 ("est") and i2 ("-ce")
            newdic, changed = correctfeatures(sentencetree,4,{
                'i1':{ u'person': u'3', u'number': u'sg', u'cat': u'V', u'lemma': u'\xeatre', u'token': u'est', u'tense': u'present', u'mode': u'indicative', 'gov': {'i0':'root'}, u't': u'A'},
                'i2':{u'cat': u'Cl', u'lemma': u'c', u'token': u'-ce', u't': u'B', 'gov': {'i1': u'sub'}, 'child':None} })
            #break
            if changed:
                print "________________________________\n"
                #for i,node in newdic.iteritems():
                #print i, node["t"], node
                #1/0
                tokensChanged=True
                ws,sentence,_ = sql.enterTree(cursor, newdic, sid, uid,tokensChanged=tokensChanged)
                print sentence
                print "changed"
                db.commit()  # commit after each modified tree
    #db.commit()
    db.close()
def getValidatedTrees(project, folder, whoseTrees="validator"):
    """
    Collects one tree per sentence of the project (via databaseQuery on the
    given table), checks each tree for structural errors, logs the bad ones
    into <folder>/logs/, and returns the list of good trees.
    """
    sql = SQL(project)
    db, cursor = sql.open()
    sentenceValidationInValidatedText(cursor, sql, db)
    # fetch the new trees
    b = databaseQuery(cursor, table=whoseTrees)
    print len(b), u"trees to extract"
    sids2all = {}  # sentence id -> list of (timestamp, textname, user, snr, treeid)
    trees = []
    error_trees = []
    textnames = {}
    for nr, (treeid, textname, user, snr, sid, uid, annotype, status, comment,
             timestamp) in enumerate(b):
        # TODO: remove:
        #if textname.startswith("mandarinParsed"):continue
        sids2all[sid] = sids2all.get(
            sid, []) + [(timestamp, textname, user, snr, treeid)]
        textnames[textname] = None
    #print len(sids2all)
    print u"trees extracted from the samples", ", ".join(sorted(textnames))
    lastpourc = -1
    for c, sid in enumerate(sids2all):
        # simple textual progress indicator
        pourc = int(float(c) / len(sids2all) * 100)
        if pourc != lastpourc:
            sys.stdout.write("{pourc}%\r".format(pourc=pourc))
            sys.stdout.flush()
        # take the entry with the smallest timestamp for this sentence id
        # NOTE(review): textname and user keep their values from the LAST row
        # of the loop above, so the error messages below may name the wrong
        # text / corrector — confirm intended behavior.
        snr, treeid2get = sorted(sids2all[sid])[0][-2:]
        #print treeid2get, type(treeid2get)
        #lknlk
        dic = sql.gettree(None, None, treeid2get, indb=db,
                          incursor=cursor)  # dic -> fetch the tree
        #if treeid2get==9669:
        #print 9669,dic
        if dic:
            sentencetree = dic["tree"]
            sentencetree = corrigerNumerotation(sentencetree)
            trees.append(sentencetree)
            #print " ".join(node["t"] for i,node in sentencetree.iteritems())
            # discard structurally broken trees, keeping a human-readable reason
            if checkTree(sentencetree)[0] == False:
                if checkTree(sentencetree)[1] == "self":
                    error_trees += [
                        "\t".join([
                            textname,
                            str(snr), user, "node " +
                            str(checkTree(sentencetree)[2]) +
                            " points to itself"
                        ])
                    ]
                else:
                    error_trees += [
                        "\t".join([
                            textname,
                            str(snr), user, "no gov at node " +
                            str(checkTree(sentencetree)[2])
                        ])
                    ]
                trees.remove(sentencetree)
        #print "nr arbres",len(trees)
        lastpourc = pourc
    print len(error_trees), "arbre(s) avec erreurs."
    if len(error_trees) > 0:
        # print the error table and write it as a dated tsv log file
        print "\t".join(["Texte", "num phrase", "correcteur", "cause"])
        for x in sorted(list(set(error_trees))):
            print x
        f = codecs.open(
            folder + "logs/log_erreurs." +
            datetime.datetime.now().strftime('%Y-%m-%d') + ".tsv", "w",
            "utf-8")
        f.write("\t".join(["Texte", "num phrase", "correcteur", "cause"]) + '\n')
        for e in error_trees:
            f.write(e + '\n')
        f.close()
        print "Erreurs dans", f.name
    print len(trees), "arbres restants pour entrainement"
    # (log file created above)
    db.commit()
    db.close()
    return trees
def fusionForgottenTrees(project="Platinum",
                         fusdir="../projects/OrfeoGold2016/platinum/*",
                         annotators=["admin"]):
    """ takes trees from project ordered by annotators. if they exist fuse them into the fusdir
    result has the extension "cool.conll"
    ,"Sy","Marion"

    For every text: database trees (by annotator priority) win over the trees
    found in the matching file of fusdir; both get lemma corrections applied
    from the module-level lemmacorrection mapping.  Writes one
    <textname>.cool.conll file per matched text into exportcool/.
    """
    #print lemmacorrection
    sys.path.insert(0, '../tools')
    import difflib
    outfiles = []
    sql = SQL(project)
    db, cursor = sql.open()
    goodTexts = {}
    outdir = os.path.join("..", "projects", project, "exportcool")
    try:
        os.mkdir(outdir)
    except OSError:
        pass  # export directory already exists
    for annotator in annotators:
        print[
            list(
                cursor.execute("select rowid from users where user =?;",
                               (annotator, )))
        ]
    # map annotator names to their database user ids, keeping priority order
    annotatorIds = tuple(a for (a, ) in [
        list(
            cursor.execute("select rowid from users where user =?;", (
                annotator, )))[0] for annotator in annotators
    ])
    print annotators, annotatorIds
    for textid, textname, nrtokens in list(
            cursor.execute("select rowid, * from texts;")):  # for each text
        print "\n__________________________doing", textname, "with", nrtokens, "tokens"
        nrutids = {}  # sentence nr -> {userid: treeid}
        # NOTE(review): a one-element annotatorIds tuple formats as "(id,)",
        # which is invalid SQL — this works only with 2+ annotators; confirm.
        for nr, userid, treeid in list(
                cursor.execute(
                    "select nr,userid,trees.rowid as treeid from trees, sentences where sentenceid=sentences.rowid and userid in {annotatorIds} and textid = ? order by nr;"
                    .format(annotatorIds=annotatorIds), (textid, ))):
            nrutids[nr] = nrutids.get(nr, {})
            nrutids[nr][userid] = treeid
        trees = {}  # sentence nr -> tree of the highest-priority annotator
        for nr in sorted(nrutids):  # for each sentence
            tree = None
            for aid in annotatorIds:  # for each interesting annotator id
                if aid in nrutids[nr]:
                    tree = sql.gettree(treeid=nrutids[nr][aid],
                                       indb=db,
                                       incursor=cursor)["tree"]
                    trees[nr] = tree
                    #print "atree:",tree
                    break
        #if not tree:
        #print "problem: no tree for nr",nr,"type",type(nr)
        #print "annotatorIds",annotatorIds
        #raise Exception('no tree', nr)
        #print trees
        print len(trees), "trees from", project
        print textname, textname.split(".")[0]
        # base name used to match a file in fusdir
        btextname = os.path.basename(textname).split(".")[0]
        if btextname.endswith("-one-word-per-line"):
            btextname = btextname[:-len("-one-word-per-line")]
        #print glob.glob(fusdir),[os.path.basename(fi).split(".")[0] for fi in glob.glob(fusdir)]
        cooltrees = []
        ptrees, ftrees = 0, 0  # counters: trees taken from project db / from file
        for fi in glob.glob(fusdir):
            if btextname == os.path.basename(fi).split(".")[0]:
                print "yes", btextname
                fustrees = conll.conllFile2trees(fi)
                print len(fustrees), "ftrees", fi
                for nr, ftree in enumerate(fustrees):
                    if nr + 1 in trees:
                        # the database tree wins over the file tree
                        #print "added tree",nr+1,"from database"
                        #ptree=platinum(trees[nr+1])
                        ptree = trees[nr + 1]
                        for iii in ptree:
                            ptree[iii]["tag2"] = "_"
                            if ptree[iii]["lemma"] in lemmacorrection:
                                ptree[iii]["lemma"] = lemmacorrection[
                                    ptree[iii]["lemma"]]
                        cooltrees += [ptree]
                        #print nr+1,"tree from",project#,tree
                        ptrees += 1
                        # warn on tokenization mismatch between file and db tree
                        if ftree.sentence() != u" ".join(
                            [ptree[i].get("t", "") for i in sorted(ptree)]):
                            print "\n_________", nr + 1
                            print ftree.sentence()
                            print u" ".join(
                                [ptree[i].get("t", "") for i in sorted(ptree)])
                            #for l in difflib.context_diff(ftree.sentence() ,u" ".join([ptree[i].get("t","") for i in sorted(ptree)])):print l
                            #print "dbtree",platinum(trees[nr+1])
                    else:
                        # no database tree: keep the file tree
                        for iii in ftree:
                            ftree[iii]["tag2"] = "_"
                            if ftree[iii]["lemma"] in lemmacorrection:
                                ftree[iii]["lemma"] = lemmacorrection[
                                    ftree[iii]["lemma"]]
                        #print nr+1,"tree from",fusdir#,tree
                        ftrees += 1
                        cooltrees += [ftree]
                        #print "added tree",nr+1,"from fustrees",fi
                outfile = os.path.join(outdir, textname + ".cool.conll")
                conll.trees2conllFile(cooltrees, outfile=outfile, columns=10)
                print "wrote", outfile
                print ptrees, "ptrees, ", ftrees, "ftrees"
                break
        if len(cooltrees) == 0:
            print "nothing for", btextname
        # NOTE(review): if no file in fusdir ever matched, `outfile` is stale
        # or undefined here — confirm every text has a matching file.
        outfiles += [outfile]
        #qsdf
    return outfiles
def exportGoodTexts(project, lastHuman=False, onlyValidated=True, pattern=False): """ TODO : - ajouter parametre p/selectionner Texte ex : "UD_ZH_[number]" """ outfiles = [] sql = SQL(project) db, cursor = sql.open() goodTexts = {} if onlyValidated: onlyValidated = "and todos.status=1" else: onlyValidated = "" # take all texts where a validator has validated if pattern: command = "select distinct * from texts, todos, users where texts.rowid=todos.textid and users.rowid=todos.userid and texts.textname {pattern};".format( pattern=pattern) # like 'UD_ZH%' else: command = "select distinct * from texts, todos, users where texts.rowid=todos.textid and todos.type=1 {onlyValidated} and users.rowid=todos.userid;".format( onlyValidated=onlyValidated) for row in cursor.execute(command): textname, nrtokens, userid, textid, validator, status, comment, user, realname = row goodTexts[textid] = (textname, userid, user) print "i'll take", textname, "validated by", user, "with", nrtokens, "tokens" sentenceValidationInValidatedText(cursor, sql, db) outdir = os.path.join("..", "projects", project, "export") try: os.mkdir(outdir) except OSError: pass for textid, (textname, userid, user) in goodTexts.iteritems(): textname = textname.replace("-one-word-per-line.conll14_Parse", "") if lastHuman: outfile = os.path.join(outdir, textname + ".lastHuman.conll") else: outfile = os.path.join( outdir, "validated." + textname + "." + user + ".conll") print "doing", textname, textid trees = [] if lastHuman: snr2all = {} for row in cursor.execute( """ select sentences.nr as snr, trees.rowid as treeid, users.user, trees.timestamp from sentences, trees, users where sentences.textid=? 
and sentences.rowid=trees.sentenceid and users.rowid = trees.userid; """, (textid, )): snr, treeid, user, timestamp = row snr2all[snr] = snr2all.get(snr, []) + [(timestamp, user, treeid)] lastpourc = -1 for c, snr in enumerate(sorted(snr2all)): pourc = int(float(c) / len(snr2all) * 100) if pourc != lastpourc: print "___{pourc}%___\r".format(pourc=pourc), lastusersnotparser = sorted([ (timestamp, user, treeid) for (timestamp, user, treeid) in snr2all[snr] if user not in ["parser", "mate"] ]) if len(lastusersnotparser) > 0: time, u, tid = lastusersnotparser[-1] # last tree by human else: time, u, tid = sorted( snr2all[snr])[-1] # last tree by whoever #print "je prends l'arbre de",u trees += [ sql.gettree(treeid=treeid, indb=db, incursor=cursor)["tree"] ] else: for ( treeid, sentencenr, ) in cursor.execute( "select trees.rowid, sentences.nr from texts, trees, sentences where texts.rowid=? and trees.userid=? and trees.sentenceid = sentences.rowid and sentences.textid=texts.rowid order by sentences.nr;", ( textid, userid, )).fetchall(): #print "ooo",sentencenr,"\r", print "nr", sentencenr, "_____\r", trees += [ sql.gettree(treeid=treeid, indb=db, incursor=cursor)["tree"] ] print "exporting", len(trees), "trees into", outfile outfiles += [outfile] conll.trees2conllFile(trees, outfile, columns=10) return outfiles
# NOTE(review): this excerpt starts mid-function — `sortable`, `prefix` and
# `outfile` are bound above this chunk (presumably inside a reorder-style
# helper); the indentation below is reconstructed. TODO confirm against the
# full file.
    new_trees = list()
    for nr, tree in sortable:
        # adding metadatas (rename sent_id, restarting from 0)
        tree.sentencefeatures["text"] = tree.sentence()
        tree.sentencefeatures["sent_id"] = prefix + "_" + str(nr - 1)
        # removing useless metadata
        del tree.sentencefeatures["nr"]
        new_trees.append(tree)
    conll.trees2conllFile(new_trees, outfile)


if __name__ == "__main__":
    ## Open project database
    sql = SQL("NaijaSUD")  # enter the project name here
    db, cursor = sql.open()
    ## Use 2 functions :
    # - exportLastBestAnnotations in lib/database.py -> writes a file with trees and their rank
    # - reorder in lib/yuchen.py -> reorder trees based on their rank, write a file with the output
    # arguments: textid and text name; the textid is visible in the editor URL,
    # e.g. https://arborator.ilpga.fr/editor.cgi?project=NaijaSUD&textid=74&opensentence=1
    users, c = sql.exportLastBestAnnotations(
        115, "P_ABJ_GWA_06_Ugo-lifestory_PRO")
    print(users, c)
    # path of the exported file
    fpath = "E:/TAL/Stage/arborator/projects/NaijaSUD/export/P_ABJ_GWA_06_Ugo.lifestory_PRO.most.recent.trees.with.feats.conllu"
    trees = conll.conllFile2trees(fpath)
    # reorder the conll trees and rename their sent_id
    reorder(trees, fpath + "_reordered")
def bulkcorrectDB(project, treeids=[]):
    """ bulk correction of a whole project! very slow! better to do directly in sql, for example:
    #change all functions:
    #update links set function='dep' where function='det';

    This variant applies a hard-coded correctfeatures() correction (forcing
    the features of nodes i1 "est" and i2 "-ce") to every tree, or only to
    the given treeids.

    NOTE(review): this module defines bulkcorrectDB several times; this last
    definition is the one visible at import time.
    """
    sql = SQL(project)
    db, cursor = sql.open()
    # restrict to the given tree ids, or take every tree of the project
    if treeids:
        a, v = ["rowid"], treeids
    else:
        a, v = [], []
    # NOTE(review): len(allt) below assumes getall returns a sequence,
    # not a generator — confirm in database.py
    allt = sql.getall(cursor, "trees", a, v)
    ti = time()  # start time, for the throughput / ETA estimates below
    for nr, (treeid, sid, uid, annotype, status, comment,
             timestamp) in enumerate(allt):
        # progress line: throughput and a crude remaining-time estimate
        print "_____ treeid", treeid, "nr", nr + 1, "/", len(
            allt
        ), "---", float(nr + 1) / (
            time() - ti), "trees per second", float(len(allt) - nr + 1) / (
                float(nr + 1) /
                (time() - ti)), "seconds to go", float(len(allt) - nr + 1) / (
                    float(nr + 1) / (time() - ti)) / 60, "minutes to go"
        dic = sql.gettree(None, None, treeid, indb=db, incursor=cursor)
        if dic:
            sentencetree = dic["tree"]
            #newdic, changed = complexbulkcorrectdic(sentencetree)
            #newdic, changed = simplebulkcorrectdic(sentencetree)
            #newdic, changed = correctfeatures(sentencetree)
            #newdic, changed = correctfeatures(sentencetree,4,{})
            # hard-coded correction of nodes i1 ("est") and i2 ("-ce")
            newdic, changed = correctfeatures(
                sentencetree, 4, {
                    'i1': {
                        u'person': u'3',
                        u'number': u'sg',
                        u'cat': u'V',
                        u'lemma': u'\xeatre',
                        u'token': u'est',
                        u'tense': u'present',
                        u'mode': u'indicative',
                        'gov': {
                            'i0': 'root'
                        },
                        u't': u'A'
                    },
                    'i2': {
                        u'cat': u'Cl',
                        u'lemma': u'c',
                        u'token': u'-ce',
                        u't': u'B',
                        'gov': {
                            'i1': u'sub'
                        },
                        'child': None
                    }
                })
            #break
            if changed:
                print "________________________________\n"
                #for i,node in newdic.iteritems():
                #print i, node["t"], node
                #1/0
                tokensChanged = True
                ws, sentence, _ = sql.enterTree(cursor,
                                                newdic,
                                                sid,
                                                uid,
                                                tokensChanged=tokensChanged)
                print sentence
                print "changed"
                db.commit()  # commit after each modified tree
    #db.commit()
    db.close()