Exemplo n.º 1
0
 def __init__(self, doc, export_fname, corpus_name):
     """Initialise the reader on top of the corpus named *corpus_name*.

     *doc* and *export_fname* are handed unchanged to
     ``ExportCorpusReader.__init__``.  The corpus is opened with
     ``get_corpus`` and its sentence attribute, word list, optional
     ``deprel``/``attach``/``lemma`` attributes and ``discourse``
     collection are cached on the instance.  ``self.spans`` ends up as the
     sorted, de-duplicated list of span tuples of all tasks that could be
     looked up, and ``self.span_idx`` starts at 0.
     """
     ExportCorpusReader.__init__(self, doc, export_fname)
     corpus_db = get_corpus(corpus_name)
     self.db = corpus_db
     self.sentences = corpus_db.corpus.attribute("s", 's')
     self.words = corpus_db.words

     # Dependency attributes are all-or-nothing: if either lookup raises
     # KeyError, both are recorded as missing.
     try:
         deprel = corpus_db.corpus.attribute("deprel", "p")
         attach = corpus_db.corpus.attribute("attach", "p")
     except KeyError:
         deprel = attach = None
     self.deprel = deprel
     self.attach = attach

     # Lemmas are optional as well.
     try:
         lemmas = corpus_db.corpus.attribute('lemma', 'p')
     except KeyError:
         lemmas = None
     self.lemmas = lemmas

     self.discourse = corpus_db.db.discourse

     # NOTE(review): task_names is not a parameter of this method; it is
     # resolved from an enclosing/module scope that is not visible here.
     tasks = [self.db.get_task(name) for name in task_names]
     unique_spans = set()
     for task in tasks:
         if task is None:
             # get_task returned nothing for this name; ignore it.
             continue
         unique_spans.update(tuple(span) for span in task.spans)
     self.spans = sorted(unique_spans)
     sys.stderr.write("%d spans found" % (len(self.spans), ) + "\n")
     self.span_idx = 0
Exemplo n.º 2
0
 def __init__(self, corpus_name):
     """Open the corpus named *corpus_name* and prepare iteration state.

     Caches the sentence attribute, the word list and the optional
     ``lemma`` attribute, collects the sorted, de-duplicated span tuples
     of all tasks that could be looked up, and - when ``want_wsd`` is
     true - loads the ``wsdgold`` WSD annotations ordered by the first
     element of their ``span``.  All cursor indices start at 0.
     """
     corpus_db = get_corpus(corpus_name)
     self.db = corpus_db
     self.sentences = corpus_db.corpus.attribute("s", 's')
     self.words = corpus_db.words

     # Lemmas are optional: a KeyError from the lookup means "not there".
     try:
         lemmas = corpus_db.corpus.attribute('lemma', 'p')
     except KeyError:
         lemmas = None
     self.lemmas = lemmas

     # NOTE(review): task_names and want_wsd are not parameters; they are
     # resolved from an enclosing/module scope that is not visible here.
     tasks = [self.db.get_task(name) for name in task_names]
     unique_spans = set()
     for task in tasks:
         if task is None:
             # get_task returned nothing for this name; ignore it.
             continue
         unique_spans.update(tuple(span) for span in task.spans)
     self.spans = sorted(unique_spans)
     sys.stderr.write("%d spans found" % (len(self.spans), ) + "\n")
     self.span_idx = 0

     if want_wsd:
         wsd_query = {'level': 'wsd', 'annotator': 'wsdgold'}
         wsd_annotations = self.db.db.annotation.find(wsd_query)
         self.wsd = sorted(wsd_annotations,
                           key=lambda anno: anno['span'][0])
         sys.stderr.write(
             "%d WSD annotations found" % (len(self.wsd), ) + "\n")
     else:
         self.wsd = []
     self.wsd_idx = 0
     self.sent_start = 0
Exemplo n.º 3
0
 def corpus(self):
     """Return the corpus selected for this request.

     The name comes from the ``corpus`` cookie; when the cookie is missing,
     empty, or not listed in ``allowed_corpora_admin`` the
     ``default_database`` is used instead.  A ``force_corpus`` request
     argument replaces that choice if it names a different corpus that is
     listed in ``allowed_corpora``.  The resulting name is resolved with
     ``get_corpus``.
     """
     name = self.cookies.get('corpus')
     if not (name and name in allowed_corpora_admin):
         name = default_database
     if 'force_corpus' in self.args:
         forced = self.args['force_corpus']
         if forced in allowed_corpora and name != forced:
             name = forced
     return get_corpus(name)
Exemplo n.º 4
0
def archive_user(user):
    """Re-key all discourse documents of *user* to an archived user name.

    For every corpus in ``allowed_corpora_nologin``, each document of the
    ``discourse`` collection whose ``_user`` equals *user* gets
    ``_user = user + '*old'`` and ``_id = '<_docno>~<user>*old'``.

    MongoDB treats ``_id`` as immutable, so a replacement-style ``update``
    of the old document with a different ``_id`` is rejected (or silently
    dropped when writes are unacknowledged).  The document is therefore
    stored under its new ``_id`` first and the old one is removed
    afterwards.

    Args:
        user: name of the user whose documents are archived.
    """
    from annodb.database import get_corpus
    new_name = user + '*old'
    for corpus_name in allowed_corpora_nologin:
        coll = get_corpus(corpus_name).db.discourse
        # Snapshot the matches: the loop inserts into and removes from the
        # very collection the cursor would otherwise still be reading.
        for doc in list(coll.find({'_user': user})):
            old_id = doc['_id']
            new_id = '%s~%s' % (doc['_docno'], new_name)
            doc['_user'] = new_name
            doc['_id'] = new_id
            if new_id == old_id:
                # Key is unchanged, so an in-place replacement is legal.
                coll.update({'_id': old_id}, doc)
            else:
                # Insert before removing so the original survives if the
                # insert raises (e.g. the archived id already exists).
                coll.insert(doc)
                coll.remove({'_id': old_id})
Exemplo n.º 5
0
def index(request):
    """Render the start page listing the tasks of the selected corpus.

    The corpus name is taken from the ``corpus`` cookie and overridden by a
    ``corpus`` request argument when present; anything empty or not in
    ``allowed_corpora_admin`` falls back to ``default_database``.

    Without a logged-in user all tasks are listed (sorted by ``_id``) and
    the corpus choice offered is ``allowed_corpora_nologin``.  For a
    logged-in user the user's tasks are split by ``task.get_status(user)``
    into ready (truthy) and remaining tasks, and the corpus choice is
    ``allowed_corpora`` or, for members of ``ADMINS``,
    ``allowed_corpora_admin``.

    The chosen corpus name is stored in the ``corpus`` cookie, expiring 30
    days from now, and the rendered response is returned.
    """
    corpus_name = request.cookies.get('corpus')
    try:
        corpus_name = request.args['corpus']
    except KeyError:
        # No explicit choice in the query string; keep the cookie value.
        pass
    # NOTE(review): the name is validated against the admin list for every
    # visitor, logged in or not - confirm that this is intended.
    if not (corpus_name and corpus_name in allowed_corpora_admin):
        corpus_name = default_database
    db = get_corpus(corpus_name)

    user = request.user
    tasks = []
    tasks_ready = []
    if user:
        for task in sorted(db.get_tasks(user), key=lambda t: t._id):
            bucket = tasks_ready if task.get_status(user) else tasks
            bucket.append(task)
        corpora = allowed_corpora_admin if user in ADMINS else allowed_corpora
    else:
        tasks = sorted(db.get_tasks(), key=lambda t: t._id)
        corpora = allowed_corpora_nologin

    response = render_template(
        'index.html',
        user=user,
        tasks=tasks,
        tasks_ready=tasks_ready,
        corpus_name=corpus_name,
        corpora=corpora)
    # Remember the selection for 30 days.
    expire_date = datetime.datetime.now() + datetime.timedelta(days=30)
    response.set_cookie('corpus', corpus_name, expires=expire_date)
    return response
Exemplo n.º 6
0
            anno2=self.db2.db.annotation.find_one({'_id':anno['_id']})
            if force or anno2 is None:
                self.db2.db.annotation.save(anno)

# Command-line options.  Both are boolean flags that default to False and
# are switched on by their mere presence.
oparse = optparse.OptionParser()
oparse.add_option(
    '-f', '--force',
    action='store_true',
    dest="force",
    default=False,
    help="overwrite existing annotations")
oparse.add_option(
    '-l', '--list',
    action='store_true',
    dest="list",
    default=False,
    help="list annotations, don't do anything")

if __name__=='__main__':
    opts,args=oparse.parse_args()
    print opts, args
    db1=get_corpus(args[0])
    db2=get_corpus(args[1])
    if len(args)>=3:
        task_re=args[2]
    else:
        task_re=None
    all_names=[]
    if task_re is None:
        all_names=[x['_id'] for x in db1.db.tasks.find()]
    else:
        all_names=[x['_id'] for x in db1.db.tasks.find({'_id':{'$regex':task_re}})]
    if opts.list:
        print >>sys.stderr, "Affected tasks: (RE=%s, n=%s)"%(task_re,len(all_names))
        for x in sorted(all_names):
            print x
    else:
Exemplo n.º 7
0
            doc2['_id'] = doc['_id']
            print doc2
            db.db.discourse.save(doc2)
        return None
    else:
        old_id = doc['_id']
        print "doc %s does not match text %s" % (old_id, t_id)
        for offset in [-1, 1, -2, 2, -3, 3]:
            similar, same = matches_doc(db, doc, t_id + offset)
            if similar:
                print "is really %s" % (t_id + offset, )
                (part1, annotator) = old_id.split('~')
                print 'id => %s~%s' % (t_id + offset, annotator)
                #db.db.discourse.remove({'_id':old_id})
                doc['_docno'] = t_id + offset
                doc['_id'] = '%s~%s' % (t_id + offset, annotator)
                #db.db.discourse.save(doc)
                break


if __name__ == '__main__':
    # argv[1] names the corpus whose discourse documents are read,
    # argv[2] the corpus that is handed to check_doc for each of them.
    db = get_corpus(sys.argv[1])
    db2 = get_corpus(sys.argv[2])
    # The cursor is materialised up front so that the whole result set is
    # fetched before any document is checked.
    for doc in list(db.db.discourse.find()):
        try:
            check_doc(db2, doc)
        except KeyError as e:
            # Report the offending key and carry on with the next document.
            # we could actually delete these?
            print(e)
Exemplo n.º 8
0
                pass
            else:
                edu_markable = Edu()
                edu_markable.span = (start + ctx_start, end + ctx_start)
                edu_markable.xml_id = 'edu_%s_%d_%d' % (t_id, next_sent,
                                                        sub_edu)
                text_markable.edus['%d.%d' %
                                   (next_sent, sub_edu)] = edu_markable
                edu_markable.edu_idx = len(text_markable.edu_list)
                text_markable.edu_list.append(edu_markable)
                ctx.register_object(edu_markable)
        parse_relations(doc['relations'], text_markable, ctx)


if __name__ == '__main__':
    db = database.get_corpus('TUEBA4')
    text_ids = db.corpus.attribute(
        corpus_d_sattr.get(db.corpus_name, 'text_id'), 's')
    if len(sys.argv) > 1:
        annotator = sys.argv[1]
    else:
        annotator = '*gold*'
    results = db.db.discourse.find({'_user': annotator})
    doc = make_implicit_doc()
    print '<?xml version="1.0" encoding="ISO-8859-15"?>'
    print '<exml-doc>'
    reader = DiscourseReader(doc, db)
    # do the actual conversion
    for r in results:
        try:
            docid = int(r['_docno'])
Exemplo n.º 9
0
def action_list_empty_tasks(dbname='R9PRE1'):
    for task in get_corpus(dbname).get_tasks():
        if not task.annotators:
            print task._id,
    print
Exemplo n.º 10
0
def action_remove_task(dbname='xxx', taskname='task1'):
    """Remove the task called *taskname* from the corpus named *dbname*."""
    corpus = get_corpus(dbname)
    corpus.remove_task(taskname)