示例#1
0
def discourse_rels(request):
    """Render an overview of the discourse relations annotated by the user.

    Every discourse record stored for ``request.user`` that has a ``_docno``
    contributes its relations, grouped by ``isolate_relations``, to

    * a per-relation instance count,
    * a per-relation list of ``(docid, label, instances)`` tuples, and
    * the overall instance total.

    The label combines the third element of the document's text-attribute
    entry with the five words starting at the entry's first element.
    Records without a ``_docno`` key are skipped.

    Returns the rendered ``discourse_rels.html`` template; its ``results``
    value is a list of ``(relation, count, occurrences)`` tuples ordered by
    descending count.
    """
    corpus_db = request.corpus
    tokens = corpus_db.words
    text_attr = corpus_db.corpus.attribute(
        corpus_d_sattr.get(corpus_db.corpus_name, 'text_id'), 's')
    counts = defaultdict(int)
    occurrences = defaultdict(list)
    total = 0
    for record in corpus_db.db.discourse.find({'_user': request.user}):
        try:
            docid = int(record['_docno'])
        except KeyError:
            # No document number, so the relations cannot be attributed.
            continue
        entry = text_attr[docid]
        first = entry[0]
        label = "%s: %s" % (entry[2], ' '.join(tokens[first:first + 5]))
        label = label.decode('ISO-8859-15')
        grouped = isolate_relations(record['relations'])
        for name in grouped:
            instances = grouped[name]
            total += len(instances)
            counts[name] += len(instances)
            occurrences[name].append((docid, label, instances))
    # Most frequent relation first; ties keep their iteration order.
    ranked = [(name, counts[name], occurrences[name])
              for name in sorted(counts, key=lambda name: -counts[name])]
    return render_template('discourse_rels.html',
                           corpus_name=corpus_db.corpus_name,
                           results=ranked,
                           sum_all=total)
示例#2
0
 def __init__(self, ctx, db):
     """Keep the context and cache the corpus attributes read from ``db``.

     ``texts`` is the ``'s'`` attribute whose name is looked up in
     ``corpus_d_sattr`` for this corpus (falling back to ``'text_id'``),
     ``sentences`` is the ``'s'`` attribute named ``"s"``, ``words`` is
     ``db.words`` and ``postags`` is the ``'p'`` attribute named ``"pos"``.
     """
     corpus = db.corpus
     text_attr_name = corpus_d_sattr.get(db.corpus_name, 'text_id')
     self.ctx = ctx
     self.texts = corpus.attribute(text_attr_name, 's')
     self.sentences = corpus.attribute("s", 's')
     self.words = db.words
     self.postags = corpus.attribute("pos", 'p')
示例#3
0
def list_discourse(request):
    """Render the list of documents that have discourse annotations.

    The document numbers are collected from every discourse record owned by
    ``request.user`` or by ``'*gold*'`` that carries a ``_docno``. For each
    document one row ``(user, docid, label, annotators)`` is produced, where
    ``label`` combines the third element of the document's text-attribute
    entry with the five words starting at the entry's first element.

    ``annotators`` lists the ``_user`` of every record stored for the
    document when the requesting user is in ``ADMINS``. Otherwise it only
    lists ``'*gold*'``, the user, and names starting with ``<user>*``.

    Returns the rendered ``discourse_list.html`` template.
    """
    corpus_db = request.corpus
    tokens = corpus_db.words
    text_attr = corpus_db.corpus.attribute(
        corpus_d_sattr.get(corpus_db.corpus_name, 'text_id'), 's')
    discourse = corpus_db.db.discourse

    own_or_gold = {'_user': {'$in': [request.user, '*gold*']}}
    docids = sorted(set(
        record['_docno'] for record in discourse.find(own_or_gold)
        if '_docno' in record))

    rows = []
    for docid in docids:
        entry = text_attr[docid]
        first = entry[0]
        label = "%s: %s" % (entry[2], ' '.join(tokens[first:first + 5]))
        sees_everything = request.user in ADMINS
        annotators = []
        for record in discourse.find({'_docno': docid}):
            owner = record['_user']
            if (sees_everything
                    or owner in ['*gold*', request.user]
                    or (request.user is not None
                        and owner.startswith(request.user + '*'))):
                annotators.append(owner)
        rows.append(
            (request.user, docid, label.decode('ISO-8859-15'), annotators))
    return render_template('discourse_list.html',
                           corpus_name=corpus_db.corpus_name,
                           user=request.user,
                           results=rows)
示例#4
0
def render_discourse(request, disc_no):
    """Render the discourse annotation page for one document.

    ``disc_no`` is converted to an integer and used both to fetch the
    discourse document for ``request.user`` and to index the text attribute.
    The number of the sentence at the text's start position is passed to the
    template as ``sent_id``.

    The document fields ``sentences``, ``edus``, ``tokens`` and ``indent``
    are required. ``relations``, ``nonedu``, ``uedus`` and ``topics`` fall
    back to an empty string, dict, dict and list respectively. The corpus
    name and all document fields are handed over as JSON strings.

    Returns the non-cached ``discourse.html`` response after the corpus
    cookie has been set on it.
    """
    corpus_db = request.corpus
    cwb = corpus_db.corpus
    text_no = int(disc_no)
    doc = corpus_db.get_discourse(text_no, request.user)
    text_attr = cwb.attribute(
        corpus_d_sattr.get(corpus_db.corpus_name, 'text_id'), 's')
    sentence_attr = cwb.attribute("s", 's')
    first_cpos, _last_cpos, _text_attrs = text_attr[text_no]

    context = {
        'sent_id': sentence_attr.cpos2struc(first_cpos),
        'disc_id': disc_no,
    }
    context['corpus_name'] = json.dumps(request.corpus.corpus_name)
    # Mandatory fields: a missing key raises KeyError.
    for field in ('sentences', 'edus', 'tokens', 'indent'):
        context[field] = json.dumps(doc[field])
    # Optional fields with their fallback values.
    for field, fallback in (('relations', ''), ('nonedu', {}),
                            ('uedus', {}), ('topics', [])):
        context[field] = json.dumps(doc.get(field, fallback))

    response = render_template_nocache('discourse.html', **context)
    request.set_corpus_cookie(response)
    return response
示例#5
0
 def __init__(self, ctx, db):
     """Keep ``ctx`` and ``db`` and cache the corpus attributes read from ``db``.

     ``texts`` is the ``'s'`` attribute whose name is looked up in
     ``corpus_d_sattr`` for this corpus (falling back to ``'text_id'``) and
     ``sentences`` is the ``'s'`` attribute named ``"s"``. ``words`` is
     ``db.words``; ``postags``, ``morph``, ``deprel``, ``attach`` and
     ``lemma`` are the ``'p'`` attributes ``"pos"``, ``"morph"``,
     ``"deprel"``, ``"attach"`` and ``"lemma"``.
     """
     corpus = db.corpus
     text_attr_name = corpus_d_sattr.get(db.corpus_name, 'text_id')
     self.ctx = ctx
     self.texts = corpus.attribute(text_attr_name, 's')
     self.sentences = corpus.attribute("s", 's')
     self.words = db.words
     self.postags = corpus.attribute("pos", 'p')
     self.morph = corpus.attribute("morph", 'p')
     self.deprel = corpus.attribute("deprel", "p")
     self.attach = corpus.attribute("attach", "p")
     self.lemma = corpus.attribute("lemma", "p")
     self.db = db
示例#6
0
def render_sentence(request, sent_no):
    """Render the page for one sentence without any annotation panels.

    ``sent_no`` is the 1-based sentence number; internally the 0-based index
    ``int(sent_no) - 1`` is used. The sentence text is built from the words
    between the sentence's start and end positions (inclusive), decoded from
    ISO-8859-1 and joined with single spaces.

    The text URL comes from the ``corpus_urls`` entry for the corpus, or is
    ``'#'`` when there is none. ``parses_html`` is always empty and
    ``has_gold`` is always ``False``.

    Returns the rendered ``sentence.tmpl`` response after the corpus cookie
    has been set on it.
    """
    db = request.corpus
    cwb = db.corpus
    index = int(sent_no) - 1
    words = db.words
    sentences = db.sentences
    text_attr = cwb.attribute(
        corpus_sattr.get(db.corpus_name, 'text_id'), 's')
    disc_text_attr = cwb.attribute(
        corpus_d_sattr.get(db.corpus_name, 'text_id'), 's')

    first, last = sentences[index][:2]
    tokens = [words[pos].decode('ISO-8859-1')
              for pos in xrange(first, last + 1)]

    text_no = text_attr.cpos2struc(last - 1)
    disc_no = disc_text_attr.cpos2struc(last - 1)
    _text_start, _text_end, text_info = text_attr[text_no]

    text_url = (corpus_urls[db.corpus_name](text_info, db.corpus_name)
                if db.corpus_name in corpus_urls else '#')

    # The results are not displayed by this view; the lookups are kept so the
    # request goes through the same database accessors as before.
    db.get_parses(index)
    db.get_alignments(index)

    context = dict(
        sent_id=index + 1,
        sent_text=' '.join(tokens),
        parses_html='',
        text_id=text_info,
        text_url=text_url,
        prev_sent='/pycwb/sentence/%d' % (index, ),
        next_sent='/pycwb/sentence/%d' % (index + 2, ),
        disc_id=disc_no,
        corpus_name=request.corpus.corpus_name,
        has_gold=False)
    response = render_template('sentence.tmpl', **context)
    request.set_corpus_cookie(response)
    return response
示例#7
0
                edu_markable = Edu()
                edu_markable.span = (start + ctx_start, end + ctx_start)
                edu_markable.xml_id = 'edu_%s_%d_%d' % (t_id, next_sent,
                                                        sub_edu)
                text_markable.edus['%d.%d' %
                                   (next_sent, sub_edu)] = edu_markable
                edu_markable.edu_idx = len(text_markable.edu_list)
                text_markable.edu_list.append(edu_markable)
                ctx.register_object(edu_markable)
        parse_relations(doc['relations'], text_markable, ctx)


if __name__ == '__main__':
    # Script entry point: prints an XML header to stdout and walks over the
    # discourse records of one annotator in the TUEBA4 corpus.
    db = database.get_corpus('TUEBA4')
    text_ids = db.corpus.attribute(
        corpus_d_sattr.get(db.corpus_name, 'text_id'), 's')
    # The annotator is taken from the first command-line argument; without
    # one, the '*gold*' annotations are selected.
    if len(sys.argv) > 1:
        annotator = sys.argv[1]
    else:
        annotator = '*gold*'
    results = db.db.discourse.find({'_user': annotator})
    doc = make_implicit_doc()
    print '<?xml version="1.0" encoding="ISO-8859-15"?>'
    print '<exml-doc>'
    reader = DiscourseReader(doc, db)
    # do the actual conversion
    for r in results:
        try:
            docid = int(r['_docno'])
        except KeyError:
            # Records without a '_docno' key are ignored.
            pass
示例#8
0
def render_sentence(request, sent_no):
    """Render the page for one sentence together with its stored annotations.

    ``sent_no`` is the 1-based sentence number; internally the 0-based index
    ``int(sent_no) - 1`` is used. The sentence text is built from the words
    between the sentence's start and end positions (inclusive), decoded from
    ISO-8859-1.

    When the sentence has parses, alignments, coreference data or
    ``'*gold*'`` annotations, an HTML fragment is generated with one tab and
    one panel per parse, alignment, coreference layer and annotation level
    (in that order). Otherwise ``parses_html`` is empty. ``has_gold`` tells
    the template whether a ``'*gold*'`` discourse document was found.

    Returns the rendered ``sentence.tmpl`` response after the corpus cookie
    has been set on it.
    """
    db = request.corpus
    tueba_corpus = db.corpus
    sno = int(sent_no) - 1
    words = db.words
    sents = db.sentences
    texts = tueba_corpus.attribute(corpus_sattr.get(db.corpus_name, 'text_id'),
                                   's')
    texts_d = tueba_corpus.attribute(
        corpus_d_sattr.get(db.corpus_name, 'text_id'), 's')
    start, end = sents[sno][:2]
    tokens = [words[i].decode('ISO-8859-1') for i in xrange(start, end + 1)]
    t_id = texts.cpos2struc(end - 1)
    coref = db.db.referential.find_one({'_id': t_id})
    # NOTE(review): this lookup uses t_id (from corpus_sattr) while the
    # template receives disc_id=t_id_d (from corpus_d_sattr). If the two
    # attributes differ for a corpus, the id below may need t_id_d - confirm.
    discourse = db.db.discourse.find_one({'_id': '%s~*gold*' % (t_id, )})
    t_id_d = texts_d.cpos2struc(end - 1)
    unused_start, unused_end, t_attrs = texts[t_id]
    if db.corpus_name in corpus_urls:
        text_url = corpus_urls[db.corpus_name](t_attrs, db.corpus_name)
    else:
        text_url = '#'
    parses = db.get_parses(sno)
    alignments = db.get_alignments(sno)
    trees_out = StringIO()
    # '_id' is the record key, not an annotation layer.
    names_parses = sorted(k for k in parses.iterkeys() if k != '_id')
    names_alignments = sorted(k for k in alignments.iterkeys() if k != '_id')
    if coref is not None:
        names_coref = sorted(k for k in coref.iterkeys() if k != '_id')
    else:
        names_coref = []
    annotations = db.find_annotations([start, end], '*gold*')
    # names_coref is part of the condition so that a sentence whose only
    # stored layer is coreference still gets its tab and panel.
    if names_parses or names_alignments or names_coref or annotations:
        # Tab headers, in the same order as the panels written further down.
        print >> trees_out, '<div id="parses-tabs">'
        print >> trees_out, '<ul class="nav nav-tabs">'
        for k in names_parses:
            print >> trees_out, '<li><a href="#parses-%s">%s (parse)</a></li>' % (
                k, k)
        for k in names_alignments:
            print >> trees_out, '<li><a href="#alignments-%s">%s (align)</a></li>' % (
                k, k)
        for k in names_coref:
            print >> trees_out, '<li><a href="#coref-%s">%s (coref)</a></li>' % (
                k, k)
        # Annotations are rendered into one buffer per level first, because
        # the level names are needed for the tab headers.
        levels = defaultdict(StringIO)
        for anno in annotations:
            level = anno['level']
            schema.schemas[level].make_display(anno, db, levels[level], None)
        names = sorted(levels.iterkeys())
        for k in names:
            print >> trees_out, '<li><a href="#level-tabs-%s">%s</a></li>' % (
                k, k)
        print >> trees_out, '</ul>'
        # Panels.
        for k in names_parses:
            v = parses[k]
            print >> trees_out, '<div id="parses-%s">' % (k, )
            t = export.from_json(v)
            csstree.write_html(t, trees_out, _id='tree-' + k)
            print >> trees_out, '</div>'
        for k in names_alignments:
            v = alignments[k]
            print >> trees_out, '<div id="alignments-%s">' % (k, )
            write_alignment(v, trees_out)
            print >> trees_out, '</div>'
        for k in names_coref:
            v = coref[k]
            print >> trees_out, '<div id="coref-%s">' % (k, )
            write_coref(db, v, trees_out, start, end + 1)
            print >> trees_out, '</div>'
        for k in names:
            print >> trees_out, '<div id="level-tabs-%s">' % (k, )
            trees_out.write(levels[k].getvalue())
            print >> trees_out, "</div>"
        print >> trees_out, '</div>'
        parses_html = trees_out.getvalue().decode('ISO-8859-15')
    else:
        parses_html = ''
    response = render_template('sentence.tmpl',
                               sent_id=sno + 1,
                               sent_text=' '.join(tokens),
                               parses_html=parses_html,
                               text_id=t_attrs,
                               text_url=text_url,
                               prev_sent='/pycwb/sentence/%d' % (sno, ),
                               next_sent='/pycwb/sentence/%d' % (sno + 2, ),
                               disc_id=t_id_d,
                               corpus_name=request.corpus.corpus_name,
                               has_gold=(discourse is not None))
    request.set_corpus_cookie(response)
    return response