def force_do_multi_preprocessing(docs, user_input, conn):
    """Annotate *docs*, record each citation's annotation status, then
    extract sentence/token counts from the annotated documents.

    docs: list of document-info dicts to push through the annotator.
    user_input: the PMID whose citations are being processed.
    conn: open database connection.

    Returns a list of dicts [{pmid, lemmas, nes, sent_count, token_count}].
    """
    start = time.time()
    multiprocess(docs)  # no-op when docs is an empty list

    # Record which citations are now annotated.
    for check in annotation_check(user_input, conn):  # {"pmcid": ..., "annotated": ['yes']}
        logging.info("updating the annotation checks in the db")
        stmt = (citations.update()
                .where(citations.c.pmcid == str(check["pmcid"]))
                .where(citations.c.citesPmid == user_input)
                .values(annotated=str(check["annotated"][0])))
        conn.execute(stmt)

    # Pull the annotated documents and extract their content statistics.
    biodoc_data = loadBioDoc(retrieveBioDocs(user_input, conn))

    # Persist sentence/token counts for each annotated document.
    for doc in biodoc_data:
        update_annotations(doc, user_input, conn)

    logging.info("Execute everything: done in %0.3fs." % (time.time() - start))
    return biodoc_data
# Exemplo n.º 2
# 0
def update_annotations(b, user_input, conn):
    """Store the sentence and token counts from biodoc dict *b* on the
    citations row matching (pmcid, citesPmid=user_input)."""
    stmt = (citations.update()
            .where(citations.c.pmcid == str(b["pmcid"]))
            .where(citations.c.citesPmid == user_input)
            .values(dict(sents=b["num_sentences"], tokens=b["num_tokens"])))
    conn.execute(stmt)
def run_IR_not_db(user_input, conn):
    """Information retrieval for a PMID not yet in the inputPapers database.

    Fetches the paper's own metadata, its citing PMCIDs, and the content of
    each citation, writing everything to the inputPapers/citations tables.

    Returns the number of citations found; returns 0 when retrieval fails
    (best-effort contract: no exception escapes to the caller).
    """
    logging.info('PMID is NOT in the inputPapers database')
    self_info = getMainInfo(user_input)
    try:
        pmc_ids = getCitationIDs(user_input)
        num_citations = len(pmc_ids)
        logging.info("Writing self_info to inputPapers db")
        # Hoisted out of the loop: one datestamp for the whole batch.
        date = str(arrow.now().format('YYYY-MM-DD'))
        # Write self_info to the "inputPapers" db.
        for tup in self_info:
            title = tup[0]
            author = str(', '.join(tup[1]))
            journal = tup[2]
            pubdate = tup[3]
            url = tup[4]

            insert = inputPapers.insert().\
             values(dict(datestamp=date, pmid=user_input, title=title, author=author, journal=journal, pubdate=pubdate,
                url=url, num_citations=num_citations))
            conn.execute(insert)

        # Retrieve the input paper if available and update the db.
        scrape_and_write_Input(user_input, conn)

        # Now retrieve citations.
        logging.info("Get basic info about the citations")
        # Previously unseen pmcids only in allCitationsInfo.
        # Previously seen pmcids are copied to db for new pmid in getCitedInfo
        allCitationsInfo = getCitedInfo(
            pmc_ids, user_input, conn
        )  #output: list of dictionaries [{pmid: 1234, author: human, ...}]
        logging.info("Write basic citation info to citations db")
        for citation in allCitationsInfo:
            logging.info(citation)
            new_or_copy_db(citation, conn)

        # Get content and update the citations db.
        contentDictList = getContentPMC(pmc_ids, user_input, conn)
        for citation in contentDictList:
            up = citations.update().\
             where(citations.c.pmcid == str(citation["pmcid"])).\
             where(citations.c.citesPmid == str(citation["citesPmid"])).\
             values(dict(abstract=str(citation["all_abstract_check"][0]),
                         whole_article=str(citation["all_article_check"][0])))
            conn.execute(up)
        return num_citations
    except Exception:
        logging.info("probably no papers!")
        # Record the actual error and traceback instead of swallowing it
        # silently; callers still receive 0, preserving the old contract.
        logging.exception("citation retrieval failed for PMID %s", user_input)
        return 0
def run_IR_in_db(user_input, conn):
    """Refresh citations for a PMID that is already in the database.

    Compares the locally stored citation count against Entrez; when new
    citations exist, writes their basic info and content to the citations
    db and bumps the input paper's citation count.

    Returns 'yes' when new citations were found, otherwise 'no'.
    """
    logging.info('PMID is in the database')
    # Check for new papers:
    num_in_db = db_input_citations_count(user_input, conn)  # checks MY db
    pmc_ids = getCitationIDs(user_input)  # checks ENTREZ DB
    num_current = len(pmc_ids)

    # Guard clause: nothing new upstream, nothing to do.
    # TODO change this back to > after i've fixed authors problem
    if int(num_current) <= int(num_in_db):
        logging.info("no new papers, nothing to do here folks")
        return 'no'

    # There are new citations.
    logging.info("num_in_db: " + str(num_in_db))
    logging.info("num_current: " + str(num_current))
    # Update the number of citations in the inputPapers db.
    stmt = (inputPapers.update()
            .where(inputPapers.c.pmid == user_input)
            .values(num_citations=num_current))
    conn.execute(stmt)

    # Now get the new citation info.
    # Output: list of dictionaries [{pmid: 1234, author: human, ...}]; skips duplicates.
    allCitationsInfo = getCitedInfo(pmc_ids, user_input, conn)
    logging.info("Write basic citation info to citations db for new papers")
    for citation in allCitationsInfo:
        new_or_copy_db(citation, conn)

    # Get content and update the citations db.
    logging.info("now get the content for the new stuff")
    for citation in getContentPMC(pmc_ids, user_input, conn):
        stmt = (citations.update()
                .where(citations.c.pmcid == str(citation["pmcid"]))
                .where(citations.c.citesPmid == str(citation["citesPmid"]))
                .values(dict(abstract=str(citation["all_abstract_check"][0]),
                             whole_article=str(citation["all_article_check"][0]))))
        conn.execute(stmt)

    return 'yes'