def post_mturk_upload():
    data = request.get_json()
    text = data['text']
    doc_type = data['doc_type']

    # Associate the MTurk worker id with the current user if it was provided.
    if 'turker_id' in data:
        g.user.turker_id = data['turker_id']
        g.user.save()

    from nltk.tokenize import sent_tokenize
    sents = sent_tokenize(text)

    doc = Doc(title='', text=text, source='mturk', type=doc_type)
    if 'source_url' in data:
        doc.source = data['source_url']
    doc.save()

    res = {
        'doc_id': str(doc.id),
        'sents': [],
        'seq': doc.seq,
        'title': doc.title,
        'created_at': doc.created_at.isoformat(),
    }
    # Persist one Sent per tokenized sentence, preserving its order in the document.
    for index, sent_text in enumerate(sents):
        sent = Sent(index=index, text=sent_text, doc=doc).save()
        res['sents'].append(sent.dump())
    return json.dumps(res)
def delete_doc(doc_id):
    doc = Doc.objects().get(id=doc_id)

    # Remove the document's sentences and annotations before the document itself.
    for sent in Sent.objects(doc=doc).order_by('index'):
        sent.delete()
    for annotation in Annotation.objects(doc=doc):
        annotation.delete()
    doc.delete()
def post_annotation():
    data = request.get_json()
    doc_id = data['doc']
    target_text = data['target_text']
    index = data['index']
    anchor_offset = data['anchor_offset']
    focus_offset = data['focus_offset']
    type = data['type']
    basket = data['basket']

    doc = Doc.objects().get(id=doc_id)
    sent = Sent.objects().get(doc=doc, index=index)
    user = g.user

    # Sentence-level annotations are unique per (doc, sent, index, user, type);
    # span-level annotations are additionally keyed by anchor_offset.
    # TODO: the filter logic for the sentence-level case may need to change.
    if type == 'sentence':
        annotations = Annotation.objects.filter(
            doc=doc, sent=sent, index=index, user=g.user, type=type)
    else:
        annotations = Annotation.objects.filter(
            doc=doc, sent=sent, index=index, user=g.user, type=type,
            anchor_offset=anchor_offset)

    if annotations.count() > 0:
        annotation = annotations[0]
    else:
        annotation = Annotation(doc=doc, sent=sent, user=user, index=index,
                                type=type, anchor_offset=anchor_offset)

    annotation.anchor_offset = anchor_offset
    annotation.focus_offset = focus_offset
    annotation.entire_text = sent.text
    annotation.target_text = target_text
    annotation.basket = basket
    annotation.ip = request.remote_addr
    annotation.save()

    return json.dumps({
        'annotation': annotation.dump(),
    })
def duplicate_doc(from_type='v2', to_type='v3'):
    # Copy every document of `from_type`, together with its sentences, into a new
    # document of `to_type`.
    docs = Doc.objects(type=from_type).all()
    for doc in tqdm(docs):
        title = doc.title.replace('TARGET_ONLY', to_type)
        new_doc = Doc(title=title, text=doc.text, source=doc.source, type=to_type)
        new_doc.seq = Doc.objects.count() + 1
        new_doc.save()

        for sent in Sent.objects(doc=doc).all():
            Sent(index=sent.index, text=sent.text, doc=new_doc).save()
def review_index_page(user_id):
    try:
        user = User.objects.get(id=user_id)
    except Exception:
        return redirect('/404')

    doc_map = dict()
    annotations = Annotation.objects(user=user).order_by('-created_at')
    for annotation in annotations:
        try:
            doc = annotation.doc
        except Exception:
            # The annotated document may have been deleted; skip its annotations.
            continue
        if doc.id not in doc_map:
            sent_total = Sent.objects(doc=doc).count()
            annotation_sent_total = Annotation.objects(
                doc=doc, user=user, type='sentence').count()
            doc_map[doc.id] = {
                'doc': doc,
                'sent_total': sent_total,
                'annotation_sent_total': annotation_sent_total,
                'progress': annotation_sent_total / sent_total * 100,
                'annotation_total': Annotation.objects(doc=doc, user=user).count(),
                'review_total': AnnotationReview.objects(doc=doc, user=g.user).count(),
            }
    return render_template('review/index.html', doc_map=doc_map, user=user, g=g)
def index_v2_page(doc_type):
    item_per_page = 50
    page = int(request.args.get('p', 1))

    total = Doc.objects.filter(type=doc_type).count()
    total_page = math.ceil(total / item_per_page)
    paginator = Pagination(Doc.objects(type=doc_type).order_by('seq'), page, item_per_page)

    docs_data = []
    for doc in paginator.items:
        item = doc.dump()
        item['sent_total'] = Sent.objects(doc=doc).count()
        item['progress'] = Annotation.objects(doc=doc, user=g.user, type='sentence').count()
        docs_data.append(item)

    pagination = {
        'page': page,
        'total_page': total_page,
        'left': max(1, page - 5),
        'right': min(page + 5, total_page),
    }
    return render_template('index.html', type=doc_type, docs=docs_data, g=g,
                           pagination=pagination)
def insert_doc(title, text, source):
    # Skip documents that are already stored under the same title.
    try:
        Doc.objects.get(title=title)
        print('already exists -> skip')
        return
    except Doc.DoesNotExist:
        pass

    doc = Doc(title=title, text=text, source=source, type='v2')
    doc.seq = Doc.objects.count() + 1
    doc.save()

    import re
    # Each non-empty line carries its own index marker, e.g. "(Sent12) some sentence".
    regex = re.compile(r'\(Sent\d{1,4}\)')
    for line in text.split('\n'):
        if len(line) == 0:
            continue
        index_str = regex.findall(line)[0]
        line = line.replace(index_str, '').strip()
        index = int(index_str.replace('(Sent', '').replace(')', ''))
        Sent(index=index, text=line, doc=doc).save()
def generate_encrypted_file(seq_id):
    from itertools import cycle

    def str_xor(s1, s2):
        # XOR each character of s1 with the cycled key s2 and join the resulting
        # code points as a comma-separated string.
        result = []
        for (c1, c2) in zip(s1, cycle(s2)):
            result.append(str(ord(c1) ^ ord(c2)))
        return ",".join(result)

    try:
        doc = Doc.objects().get(seq=seq_id)
        sents = Sent.objects(doc=doc).order_by('index')
    except Exception:
        return

    data = {
        'doc_id': str(doc.id),
        'title': doc.title,
        'seq': doc.seq,
        'sents': [sent.dump() for sent in sents],
    }
    data = json.dumps(data)
    data = str_xor(data, config.Config.ENCRYPTION_KEY)

    file_path = os.path.abspath(
        os.path.dirname(__file__) + '/../data/encrypted/#{}_{}'.format(seq_id, doc.title))
    with open(file_path, 'w') as f:
        f.write(data)
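
# Sketch of a matching decoder (not part of the original code): it assumes the file
# was produced by str_xor above, i.e. comma-separated code points XOR-ed against a
# cycled ENCRYPTION_KEY, and simply reverses that transform.
def str_unxor(encoded, key):
    from itertools import cycle
    codes = (int(c) for c in encoded.split(','))
    return ''.join(chr(code ^ ord(k)) for code, k in zip(codes, cycle(key)))
    # Usage (hypothetical): json.loads(str_unxor(open(file_path).read(),
    #                                            config.Config.ENCRYPTION_KEY))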
def target_migration():
    # Rewrite the legacy '<<TARGET>>' marker to '(TARGET)' in every document and sentence.
    for doc in tqdm(Doc.objects().all()):
        doc.text = doc.text.replace('<<TARGET>>', '(TARGET)')
        doc.save()

    for sent in tqdm(Sent.objects()):
        sent.text = sent.text.replace('<<TARGET>>', '(TARGET)')
        sent.save()
def get_doc(doc_id):
    doc = Doc.objects.get(id=doc_id)
    sents = Sent.objects(doc=doc).order_by('index')
    return json.dumps({
        'sents': [sent.dump() for sent in sents],
    })
def export_dataset_v3():
    docs = Doc.objects.filter(type='mturk_v3')
    data = []
    for doc in tqdm(docs):
        sents = Sent.objects(doc=doc)
        annotations = Annotation.objects(doc=doc)

        # Keep only fully annotated documents from the whitelisted news sources.
        if sents.count() != annotations.count():
            continue
        source = doc.source
        if ('aljazeera' not in source and 'foxnews' not in source
                and 'theguardian' not in source):
            continue
        if not is_ok(annotations):
            continue

        for annotation in annotations:
            data.append({
                'annotator': annotation.user.username,
                'version': doc.type,
                'turker_id': annotation.user.turker_id,
                'doc_id': str(doc.id),
                'sentence_index': annotation.index,
                'sentence': annotation.entire_text,
                'basket': annotation.basket,
                'source': doc.source,
                'created_at': annotation.created_at,
            })

    dataset_path = os.path.abspath(
        os.path.dirname(__file__) + '/../data/dataset_AMT_v3.json')
    data_json = json.dumps(data, default=json_util.default)
    with open(dataset_path, 'w', encoding='utf-8') as f:
        f.write(data_json)