Example #1
def post_mturk_upload():
    data = request.get_json()
    text = data['text']
    doc_type = data['doc_type']

    if 'turker_id' in data:
        turker_id = data['turker_id']

        g.user.turker_id = turker_id
        g.user.save()

    from nltk.tokenize import sent_tokenize
    sents = sent_tokenize(text)

    doc = Doc(title='', text=text, source='mturk', type=doc_type)
    if 'source_url' in data:
        doc.source = data['source_url']
    doc.save()

    res = {
        'doc_id': str(doc.id),
        'sents': list(),
        'seq': doc.seq,
        'title': doc.title,
        'created_at': doc.created_at.isoformat(),
    }
    for index, sent_text in enumerate(sents):
        sent = Sent(index=index, text=sent_text, doc=doc).save()
        res['sents'].append(sent.dump())

    return json.dumps(res)
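
For context, a client call for this upload handler might look like the sketch below. The route path and host are assumptions (the snippet does not show the URL rule); the JSON keys mirror the ones the handler reads: text, doc_type, and the optional turker_id and source_url.

import requests

# Hypothetical endpoint URL; only the payload keys are taken from the handler above.
payload = {
    'text': 'First sentence. Second sentence.',
    'doc_type': 'mturk_v3',                        # illustrative type value
    'turker_id': 'A1B2C3D4E5',                     # optional, stored on the current user
    'source_url': 'https://example.com/article',   # optional, overrides doc.source
}
resp = requests.post('http://localhost:5000/mturk/upload', json=payload)
print(resp.text)  # JSON string with doc_id, sents, seq, title, created_at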
Example #2
def delete_doc(doc_id):
    doc = Doc.objects().get(id=doc_id)
    sents = Sent.objects(doc=doc).order_by('index')
    for sent in sents:
        sent.delete()
    # delete the annotations attached to the document as well
    annotations = Annotation.objects(doc=doc)
    for annotation in annotations:
        annotation.delete()
    doc.delete()
Example #3
def post_annotation():
    data = request.get_json()

    doc = data['doc']
    target_text = data['target_text']
    index = data['index']
    anchor_offset = data['anchor_offset']
    focus_offset = data['focus_offset']
    type = data['type']
    basket = data['basket']

    doc = Doc.objects().get(id=doc)
    sent = Sent.objects().get(doc=doc, index=index)
    user = g.user

    target_sent = sent  # same sentence fetched above; no need for a second query

    # For sentence-level annotations, the anchor offset is not part of the lookup
    if type == 'sentence':
        annotations = Annotation.objects.filter(doc=doc,
                                                sent=sent,
                                                index=index,
                                                user=g.user,
                                                type=type)
    else:
        annotations = Annotation.objects.filter(doc=doc,
                                                sent=sent,
                                                index=index,
                                                user=g.user,
                                                type=type,
                                                anchor_offset=anchor_offset)

    if annotations.count() > 0:
        annotation = annotations[0]
    else:
        annotation = Annotation(doc=doc,
                                sent=sent,
                                user=user,
                                index=index,
                                type=type,
                                anchor_offset=anchor_offset)

    annotation.anchor_offset = anchor_offset
    annotation.focus_offset = focus_offset
    annotation.entire_text = target_sent.text
    annotation.target_text = target_text
    annotation.basket = basket
    annotation.ip = request.remote_addr

    annotation.save()

    return json.dumps({
        'annotation': annotation.dump(),
    })
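
The shape of the request body can be read off the keys the handler pulls from request.get_json(); the values below are illustrative only, a sketch of what the annotation UI might send.

# Illustrative payload for the annotation handler above; every value is made up.
payload = {
    'doc': '60f7c2a9e13823a1b4d4c0aa',   # Doc id as a string
    'index': 3,                          # sentence index within the document
    'target_text': 'selected span',      # the text the user highlighted
    'anchor_offset': 10,                 # selection start within the sentence
    'focus_offset': 24,                  # selection end within the sentence
    'type': 'sentence',                  # non-sentence types are also matched on anchor_offset
    'basket': {'label': 'example'},      # structure depends on the client UI
}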
Example #4
def duplicate_doc(from_type='v2', to_type='v3'):
    docs = Doc.objects(type=from_type).all()
    for doc in tqdm(docs):
        title = doc.title.replace('TARGET_ONLY', to_type)
        new_doc = Doc(title=title,
                      text=doc.text,
                      source=doc.source,
                      type=to_type)
        new_doc.seq = Doc.objects.count() + 1
        new_doc.save()

        sents = Sent.objects(doc=doc).all()
        for sent in sents:
            Sent(index=sent.index, text=sent.text, doc=new_doc).save()
Example #5
def review_index_page(user_id):
    try:
        user = User.objects.get(id=user_id)
    except Exception:
        return redirect('/404')

    doc_map = dict()
    annotations = Annotation.objects(user=user).order_by('-created_at')

    for annotation in annotations:
        try:
            # for situation in which the annotated document was deleted
            doc = annotation.doc
        except Exception:
            continue

        if doc.id not in doc_map:
            sent_total = Sent.objects(doc=doc).count()
            annotation_sent_total = Annotation.objects(doc=doc, user=user, type='sentence').count()
            doc_map[doc.id] = {
                'doc': doc,
                'sent_total': sent_total,
                'annotation_sent_total': annotation_sent_total,
                'progress': annotation_sent_total / sent_total * 100 if sent_total else 0,
                'annotation_total': Annotation.objects(doc=doc, user=user).count(),
                'review_total': AnnotationReview.objects(doc=doc, user=g.user).count(),
            }

    return render_template('review/index.html', doc_map=doc_map, user=user, g=g)
Example #6
def index_v2_page(doc_type):
    item_per_page = 50
    page = int(request.args.get('p', 1))

    total = Doc.objects.filter(type=doc_type).count()
    total_page = math.ceil(total / item_per_page)
    paginator = Pagination(Doc.objects(type=doc_type).order_by('seq'), page, item_per_page)
    docs = paginator.items

    docs_data = []
    for doc in docs:
        item = doc.dump()
        item['sent_total'] = Sent.objects(doc=doc).count()
        item['progress'] = Annotation.objects(doc=doc, user=g.user, type='sentence').count()

        docs_data.append(item)

    pagination = {
        'page': page,
        'total_page': total_page,
        'left': max(1, page - 5),
        'right': min(page + 5, total_page),
    }

    return render_template('index.html', type=doc_type, docs=docs_data, g=g, pagination=pagination)
Example #7
def insert_doc(title, text, source):
    try:
        Doc.objects.get(title=title)
        print('document already exists -> skipping')
        return
    except Doc.DoesNotExist:
        pass

    doc = Doc(title=title, text=text, source=source, type='v2')
    total = Doc.objects.count()
    doc.seq = total + 1
    doc.save()

    import re
    regex = re.compile(r'\(Sent\d{1,4}\)')

    # from nltk import sent_tokenize
    for line in text.split('\n'):
        if len(line) == 0:
            continue

        # each non-empty line carries an index marker like "(Sent12)"
        index_str = regex.findall(line)[0]
        line = line.replace(index_str, '').strip()
        index = int(index_str.replace('(Sent', '').replace(')', ''))

        Sent(index=index, text=line, doc=doc).save()
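
The regex above expects each non-empty line to begin with a marker of the form (SentN). A minimal sketch of that parsing step on a single made-up line:

import re

regex = re.compile(r'\(Sent\d{1,4}\)')

line = '(Sent12) The quick brown fox jumps over the lazy dog.'
index_str = regex.findall(line)[0]                             # '(Sent12)'
index = int(index_str.replace('(Sent', '').replace(')', ''))   # 12
text = line.replace(index_str, '').strip()                     # sentence without the marker
print(index, text)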
Example #8
def generate_encrypted_file(seq_id):
    from itertools import cycle

    def str_xor(s1, s2):
        result = []
        for (c1, c2) in zip(s1, cycle(s2)):
            result.append(str(ord(c1) ^ ord(c2)))
        return ",".join(result)

    try:
        doc = Doc.objects().get(seq=seq_id)
        sents = Sent.objects(doc=doc).order_by('index')
    except Exception:
        return

    data = {
        'doc_id': str(doc.id),
        'title': doc.title,
        'seq': doc.seq,
        'sents': [],
    }

    for sent in sents:
        data['sents'].append(sent.dump())

    data = json.dumps(data)
    data = str_xor(data, config.Config.ENCRYPTION_KEY)
    file_path = os.path.abspath(
        os.path.dirname(__file__) +
        '/../data/encrypted/#{}_{}'.format(seq_id, doc.title))
    with open(file_path, 'w') as f:
        f.write(data)
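
Since XOR is symmetric, reading the file back only requires reversing str_xor: split the comma-separated codes, XOR each one with the cycled key, and parse the resulting JSON. A minimal sketch, assuming the same config.Config.ENCRYPTION_KEY is available:

import json
from itertools import cycle

def str_xor_decrypt(encrypted, key):
    # `encrypted` is the comma-joined list of ints written by str_xor above
    chars = []
    for (code, k) in zip(encrypted.split(','), cycle(key)):
        chars.append(chr(int(code) ^ ord(k)))
    return ''.join(chars)

# with open(file_path) as f:
#     data = json.loads(str_xor_decrypt(f.read(), config.Config.ENCRYPTION_KEY))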
Example #9
def target_migration():
    docs = Doc.objects().all()
    for doc in tqdm(docs):
        doc.text = doc.text.replace('<<TARGET>>', '(TARGET)')
        doc.save()

    sents = Sent.objects()
    for sent in tqdm(sents):
        sent.text = sent.text.replace('<<TARGET>>', '(TARGET)')
        sent.save()
Example #10
def get_doc(doc_id):
    doc = Doc.objects.get(id=doc_id)
    sents = Sent.objects(doc=doc).order_by('index')

    sents_data = []
    for sent in sents:
        sents_data.append(sent.dump())

    return json.dumps({
        'sents': sents_data,
    })
Example #11
def export_dataset_v3():
    docs = Doc.objects.filter(type='mturk_v3')

    data = []
    for doc in tqdm(docs):
        sents = Sent.objects(doc=doc)
        annotations = Annotation.objects(doc=doc)

        if sents.count() != annotations.count():
            continue

        source = doc.source
        if not any(name in source for name in ('aljazeera', 'foxnews', 'theguardian')):
            continue

        if not is_ok(annotations):
            continue

        for annotation in annotations:
            data.append({
                'annotator': annotation.user.username,
                'version': doc.type,
                'turker_id': annotation.user.turker_id,
                'doc_id': str(doc.id),
                'sentence_index': annotation.index,
                'sentence': annotation.entire_text,
                'basket': annotation.basket,
                'source': doc.source,
                'created_at': annotation.created_at,
            })

    dataset_path = os.path.abspath(
        os.path.dirname(__file__) + '/../data/dataset_AMT_v3.json')
    data_json = json.dumps(data, default=json_util.default)
    with open(dataset_path, 'w', encoding='utf-8') as f:
        f.write(data_json)