Пример #1
0
def upload_and_annotate_pdfs():
    """Persist uploaded PDFs and enqueue asynchronous annotation.

    Reads the multipart ``file`` field from the current Flask request,
    stores each PDF blob in the ``doc_queue`` table under a fresh report
    uuid (one row per PDF, with an MD5 content hash and its own uuid),
    then dispatches the ``pdf_annotate`` Celery task keyed by that
    report uuid so a worker performs the RobotReviewer annotation.

    Returns:
        str: JSON document ``{"report_uuid": <uuid>}``.
    """
    report_uuid = rand_id()

    uploaded_files = request.files.getlist("file")

    blobs = [f.read() for f in uploaded_files]
    pdf_hashes = [hashlib.md5(blob).hexdigest() for blob in blobs]
    filenames = [f.filename for f in uploaded_files]
    pdf_uuids = [rand_id() for _ in filenames]

    # One timestamp for the whole upload batch.
    now = datetime.now()
    rows = [(report_uuid, pdf_uuid, pdf_hash, filename,
             sqlite3.Binary(blob), now)
            for pdf_uuid, pdf_hash, filename, blob
            in zip(pdf_uuids, pdf_hashes, filenames, blobs)]

    c = rr_sql_conn.cursor()
    try:
        # executemany + a single commit: one transaction for the batch
        # instead of a commit (fsync) per row, and no half-written
        # batches left behind if an insert fails mid-way.
        c.executemany(
            "INSERT INTO doc_queue (report_uuid, pdf_uuid, pdf_hash, pdf_filename, pdf_file, timestamp) VALUES (?, ?, ?, ?, ?, ?)",
            rows)
        rr_sql_conn.commit()
    finally:
        # Release the cursor even if the insert raised.
        c.close()

    # send async request to Celery
    celery_tasks['pdf_annotate'].apply_async((report_uuid, ),
                                             task_id=report_uuid)

    return json.dumps({"report_uuid": report_uuid})
Пример #2
0
def upload_and_annotate():
    """Annotate uploaded PDFs synchronously and persist the results.

    Reads the multipart ``file`` field from the current Flask request,
    converts each PDF to an article via ``pdf_reader.convert_batch``,
    tokenizes the full texts with spaCy, runs the RobotReviewer bots on
    each article, and stores blob + annotations in the ``article``
    table, one row (and one commit) per PDF.

    Returns:
        str: JSON document
        ``{"report_uuid": <uuid>, "pdf_uuids": [<uuid>, ...]}``.
    """
    report_uuid = rand_id()
    pdf_uuids = []

    uploaded_files = request.files.getlist("file")

    blobs = [f.read() for f in uploaded_files]
    filenames = [f.filename for f in uploaded_files]

    articles = pdf_reader.convert_batch(blobs)

    # Tokenize the full texts here; articles with no extracted text
    # fall back to an empty string so nlp.pipe still yields a doc.
    parsed_articles = list(
        nlp.pipe((d.get('text', u'') for d in articles),
                 batch_size=1,
                 n_threads=config.SPACY_THREADS,
                 tag=True,
                 parse=True,
                 entity=False))

    # nlp.pipe preserves input order, so zip pairs each article with
    # its own parse. Adjust tag/parse/entity above if needed later.
    for article, parsed_text in zip(articles, parsed_articles):
        article._spacy['parsed_text'] = parsed_text

    c = rr_sql_conn.cursor()
    try:
        for filename, blob, data in zip(filenames, blobs, articles):
            pdf_hash = hashlib.md5(blob).hexdigest()
            pdf_uuid = rand_id()
            pdf_uuids.append(pdf_uuid)
            data = annotate(data,
                            bot_names=[
                                "pubmed_bot", "bias_bot", "pico_bot",
                                "rct_bot", "pico_viz_bot"
                            ])
            data.gold['pdf_uuid'] = pdf_uuid
            data.gold['filename'] = filename

            c.execute(
                "INSERT INTO article (report_uuid, pdf_uuid, pdf_hash, pdf_file, annotations, timestamp, dont_delete) VALUES(?, ?, ?, ?, ?, ?, ?)",
                (report_uuid, pdf_uuid, pdf_hash, sqlite3.Binary(blob),
                 data.to_json(), datetime.now(), config.DONT_DELETE))
            # Commit per article so finished annotations survive a
            # failure later in the batch (annotation is the slow step).
            rr_sql_conn.commit()
    finally:
        # Release the cursor even if annotation or an insert raised.
        c.close()

    return json.dumps({"report_uuid": report_uuid, "pdf_uuids": pdf_uuids})
Пример #3
0
def queue_documents(body):
    """Queue an API annotation request and dispatch it to a worker.

    Stores the JSON-serializable *body* in the ``api_queue`` table under
    a fresh report uuid, then enqueues the ``api_annotate`` Celery task
    keyed by that uuid.

    Args:
        body: JSON-serializable payload describing the documents to
            annotate (schema defined by the caller — TODO confirm).

    Returns:
        str: JSON document ``{"report_id": <uuid>}``.
    """
    report_uuid = rand_id()
    c = rr_sql_conn.cursor()
    try:
        c.execute(
            "INSERT INTO api_queue (report_uuid, uploaded_data, timestamp) VALUES (?, ?, ?)",
            (report_uuid, json.dumps(body), datetime.now()))
        rr_sql_conn.commit()
    finally:
        # Release the cursor even if the insert raised.
        c.close()

    # send async request to Celery
    celery_tasks['api_annotate'].apply_async((report_uuid, ),
                                             task_id=report_uuid)

    # NOTE(review): response key is "report_id" while the sibling upload
    # endpoints return "report_uuid" — kept as-is for API compatibility,
    # but worth unifying if no external clients depend on it.
    return json.dumps({"report_id": report_uuid})