def upload_and_annotate_pdfs():
    # uploads a bunch of PDFs to the database,
    # then sends a Celery task for the worker to do the
    # RobotReviewer annotation and save the PDFs + annotations
    # to the database
    # returns the report run uuid
    report_uuid = rand_id()
    uploaded_files = request.files.getlist("file")
    c = rr_sql_conn.cursor()

    blobs = [f.read() for f in uploaded_files]
    pdf_hashes = [hashlib.md5(blob).hexdigest() for blob in blobs]
    filenames = [f.filename for f in uploaded_files]
    pdf_uuids = [rand_id() for _ in filenames]

    for pdf_uuid, pdf_hash, filename, blob in zip(pdf_uuids, pdf_hashes, filenames, blobs):
        c.execute("INSERT INTO doc_queue (report_uuid, pdf_uuid, pdf_hash, pdf_filename, pdf_file, timestamp) VALUES (?, ?, ?, ?, ?, ?)",
                  (report_uuid, pdf_uuid, pdf_hash, filename, sqlite3.Binary(blob), datetime.now()))
        rr_sql_conn.commit()
    c.close()

    # send async request to Celery
    celery_tasks['pdf_annotate'].apply_async((report_uuid, ), task_id=report_uuid)
    return json.dumps({"report_uuid": report_uuid})
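
# Hedged usage sketch (not from this file): how a client might call the endpoint above,
# assuming it is exposed as a Flask route at /upload_and_annotate_pdfs on a local dev
# server. The base URL, port, and route path are assumptions.
import os
import requests

def upload_pdfs_example(paths, base_url="http://localhost:5000"):
    # request.files.getlist("file") on the server expects repeated "file" parts,
    # so send one ("file", ...) tuple per PDF
    files = []
    for path in paths:
        with open(path, "rb") as f:
            files.append(("file", (os.path.basename(path), f.read(), "application/pdf")))
    resp = requests.post(base_url + "/upload_and_annotate_pdfs", files=files)
    resp.raise_for_status()
    return resp.json()["report_uuid"]
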
def upload_and_annotate():
    # uploads a bunch of PDFs, does the RobotReviewer annotation,
    # and saves the PDFs + annotations to the database
    # returns the report run uuid + list of article uuids
    report_uuid = rand_id()
    pdf_uuids = []
    uploaded_files = request.files.getlist("file")
    c = rr_sql_conn.cursor()

    blobs = [f.read() for f in uploaded_files]
    filenames = [f.filename for f in uploaded_files]
    articles = pdf_reader.convert_batch(blobs)

    # tokenize full texts here
    parsed_articles = []
    for doc in nlp.pipe((d.get('text', u'') for d in articles), batch_size=1,
                        n_threads=config.SPACY_THREADS, tag=True, parse=True,
                        entity=False):
        parsed_articles.append(doc)

    # adjust the tag, parse, and entity values if these are needed later
    for article, parsed_text in zip(articles, parsed_articles):
        article._spacy['parsed_text'] = parsed_text

    for filename, blob, data in zip(filenames, blobs, articles):
        pdf_hash = hashlib.md5(blob).hexdigest()
        pdf_uuid = rand_id()
        pdf_uuids.append(pdf_uuid)
        data = annotate(data, bot_names=["pubmed_bot", "bias_bot", "pico_bot",
                                         "rct_bot", "pico_viz_bot"])
        data.gold['pdf_uuid'] = pdf_uuid
        data.gold['filename'] = filename
        c.execute("INSERT INTO article (report_uuid, pdf_uuid, pdf_hash, pdf_file, annotations, timestamp, dont_delete) VALUES(?, ?, ?, ?, ?, ?, ?)",
                  (report_uuid, pdf_uuid, pdf_hash, sqlite3.Binary(blob), data.to_json(), datetime.now(), config.DONT_DELETE))
        rr_sql_conn.commit()
    c.close()
    return json.dumps({"report_uuid": report_uuid, "pdf_uuids": pdf_uuids})
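
# Hedged sketch: the INSERT in upload_and_annotate() implies an `article` table shaped
# roughly like the one below. The column types are assumptions inferred from the bound
# values (text ids and hashes, a binary PDF blob, a JSON annotation string, a datetime,
# and a keep/delete flag); the schema actually used by rr_sql_conn may differ.
def create_article_table_sketch(conn):
    conn.execute("""
        CREATE TABLE IF NOT EXISTS article (
            report_uuid TEXT,
            pdf_uuid    TEXT,
            pdf_hash    TEXT,
            pdf_file    BLOB,
            annotations TEXT,
            timestamp   TIMESTAMP,
            dont_delete INTEGER
        )
    """)
    conn.commit()

# e.g. create_article_table_sketch(sqlite3.connect(":memory:"))
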
def queue_documents(body):
    # queues raw uploaded JSON for annotation by the Celery worker
    # returns the report run uuid
    report_uuid = rand_id()
    c = rr_sql_conn.cursor()
    c.execute("INSERT INTO api_queue (report_uuid, uploaded_data, timestamp) VALUES (?, ?, ?)",
              (report_uuid, json.dumps(body), datetime.now()))
    rr_sql_conn.commit()
    c.close()

    # send async request to Celery
    celery_tasks['api_annotate'].apply_async((report_uuid, ), task_id=report_uuid)
    return json.dumps({"report_id": report_uuid})
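
# Hedged sketch: because the task above is dispatched with task_id=report_uuid, a caller
# can poll its progress with Celery's AsyncResult using that same id.
# `celery_tasks['api_annotate']` is assumed to be a registered Celery task object, as in
# queue_documents() above; the status payload returned here is illustrative.
def annotation_status_sketch(report_uuid):
    result = celery_tasks['api_annotate'].AsyncResult(report_uuid)
    # result.state is one of PENDING / STARTED / SUCCESS / FAILURE
    # (plus any custom states the worker reports)
    return json.dumps({"report_id": report_uuid, "state": result.state})
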