def force_do_multi_preprocessing(docs, user_input, conn):
    """Annotate ``docs``, sync annotation state to the db, and load the results.

    Steps, in order:
      1. Run ``multiprocess`` over ``docs`` (a list of doc-info dicts).
      2. For every entry returned by ``annotation_check``, write its
         ``annotated`` flag to the ``citations`` row matching that pmcid
         and ``user_input`` (the cited pmid).
      3. Load the annotated documents via ``retrieveBioDocs`` and
         ``loadBioDoc``.
      4. Hand each loaded record to ``update_annotations``.

    Args:
        docs: list of document-info dicts to annotate.
        user_input: pmid of the input paper; matched against
            ``citations.c.citesPmid``.
        conn: connection object used to execute the update statements.

    Returns:
        The value returned by ``loadBioDoc``.
    """
    started = time.time()

    # Original author's note: an empty ``docs`` list is fine here,
    # ``multiprocess`` simply does nothing in that case.
    multiprocess(docs)

    # Refresh the "annotated" column.
    # Each check looks like {"pmcid": pmcid, "annotated": ['yes']}.
    checks = annotation_check(user_input, conn)
    for check in checks:
        logging.info("updating the annotation checks in the db")
        row_pmcid = str(check["pmcid"])
        annotated_flag = str(check["annotated"][0])
        stmt = (
            citations.update()
            .where(citations.c.pmcid == row_pmcid)
            .where(citations.c.citesPmid == user_input)
            .values(annotated=annotated_flag)
        )
        conn.execute(stmt)

    # Pull the information out of the annotated documents.
    biodocs = retrieveBioDocs(user_input, conn)
    records = loadBioDoc(biodocs)

    # Store sentence/token counts; update_annotations reads "pmcid",
    # "num_sentences" and "num_tokens" from each record.
    for record in records:
        update_annotations(record, user_input, conn)

    elapsed = time.time() - started
    logging.info("Execute everything: done in %0.3fs." % elapsed)
    return records
def update_annotations(b, user_input, conn):
    """Write one document's sentence and token counts to ``citations``.

    Args:
        b: dict providing ``"pmcid"``, ``"num_sentences"`` and
            ``"num_tokens"``.
        user_input: pmid of the input paper; matched against
            ``citations.c.citesPmid``.
        conn: connection object used to execute the update statement.
    """
    row_pmcid = str(b["pmcid"])
    sentence_count = b["num_sentences"]
    token_count = b["num_tokens"]

    # Target the row for this (citing pmcid, cited pmid) pair.
    stmt = (
        citations.update()
        .where(citations.c.pmcid == row_pmcid)
        .where(citations.c.citesPmid == user_input)
        .values({"sents": sentence_count, "tokens": token_count})
    )
    conn.execute(stmt)
def run_IR_not_db(user_input, conn):
    """Ingest an input paper (and its citations) that is not yet in the db.

    Steps, in order:
      1. Fetch the paper's own metadata with ``getMainInfo``.
      2. Fetch the citing PMC ids with ``getCitationIDs``.
      3. Insert one ``inputPapers`` row per metadata tuple.
      4. Call ``scrape_and_write_Input`` for the input paper itself.
      5. Write basic citation info via ``getCitedInfo`` / ``new_or_copy_db``.
      6. Fetch content via ``getContentPMC`` and store the abstract and
         whole-article flags on the matching ``citations`` rows.

    Args:
        user_input: pmid of the input paper.
        conn: connection object used to execute the statements.

    Returns:
        The number of citing PMC ids on success, or ``0`` if any step
        after ``getMainInfo`` raised. The failure is logged with its
        traceback rather than propagated.
    """
    logging.info('PMID is NOT in the inputPapers database')
    self_info = getMainInfo(user_input)
    try:
        pmc_ids = getCitationIDs(user_input)
        num_citations = len(pmc_ids)

        # Write self_info to the "inputPapers" table.
        logging.info("Writing self_info to inputPapers db")
        for tup in self_info:
            # tup is indexed as (title, authors, journal, pubdate, url).
            title = tup[0]
            author = str(', '.join(tup[1]))
            journal = tup[2]
            pubdate = tup[3]
            url = tup[4]
            date = str(arrow.now().format('YYYY-MM-DD'))
            insert_stmt = inputPapers.insert().values(
                dict(
                    datestamp=date,
                    pmid=user_input,
                    title=title,
                    author=author,
                    journal=journal,
                    pubdate=pubdate,
                    url=url,
                    num_citations=num_citations,
                )
            )
            conn.execute(insert_stmt)

        # Retrieve the input paper if available and update the db.
        scrape_and_write_Input(user_input, conn)

        # Now retrieve citations.
        # Original author's note: only previously unseen pmcids come back in
        # allCitationsInfo; previously seen pmcids are copied to the db for
        # the new pmid inside getCitedInfo.
        logging.info("Get basic info about the citations")
        # Output: list of dictionaries [{pmid: 1234, author: human, ...}]
        allCitationsInfo = getCitedInfo(pmc_ids, user_input, conn)

        logging.info("Write basic citation info to citations db")
        for citation in allCitationsInfo:
            logging.info(citation)
            new_or_copy_db(citation, conn)

        # Get content and update the citations table.
        contentDictList = getContentPMC(pmc_ids, user_input, conn)
        for citation in contentDictList:
            pmcid = str(citation["pmcid"])
            citesPmid = str(citation["citesPmid"])
            abstract = str(citation["all_abstract_check"][0])
            whole_article = str(citation["all_article_check"][0])
            up = (
                citations.update()
                .where(citations.c.pmcid == pmcid)
                .where(citations.c.citesPmid == citesPmid)
                .values(dict(abstract=abstract, whole_article=whole_article))
            )
            conn.execute(up)

        return num_citations
    except Exception:
        # Deliberately best-effort: callers receive 0 (not False, not an
        # exception). The original assumption is that failures here mean
        # "no citing papers", but the traceback is logged so genuine
        # database or parsing errors are not silently hidden.
        logging.exception("probably no papers!")
        return 0
def run_IR_in_db(user_input, conn):
    """Refresh citation data for an input paper that is already in the db.

    Compares the citation count from ``db_input_citations_count`` with the
    number of ids returned by ``getCitationIDs``. When the latter is
    larger, the stored ``num_citations`` is updated, basic citation info is
    written via ``getCitedInfo`` / ``new_or_copy_db``, and abstract /
    whole-article flags from ``getContentPMC`` are stored on the matching
    ``citations`` rows.

    Args:
        user_input: pmid of the input paper.
        conn: connection object used to execute the statements.

    Returns:
        ``'yes'`` if new citations were found and processed, else ``'no'``.
    """
    logging.info('PMID is in the database')

    # Check for new papers: local db count vs. the ids Entrez reports now.
    stored_count = db_input_citations_count(user_input, conn)
    pmc_ids = getCitationIDs(user_input)
    current_count = len(pmc_ids)

    # TODO (from original): change this back to > after the authors problem
    # is fixed.
    if current_count <= int(stored_count):
        logging.info("no new papers, nothing to do here folks")
        return 'no'

    logging.info("num_in_db: " + str(stored_count))
    logging.info("num_current: " + str(current_count))

    # Update the number of citations on the inputPapers row.
    count_stmt = (
        inputPapers.update()
        .where(inputPapers.c.pmid == user_input)
        .values(num_citations=current_count)
    )
    conn.execute(count_stmt)

    # Now get the new citation info (original note: skips duplicates).
    # Output: list of dictionaries [{pmid: 1234, author: human, ...}]
    new_citations = getCitedInfo(pmc_ids, user_input, conn)
    logging.info(
        "Write basic citation info to citations db for new papers")
    for entry in new_citations:
        new_or_copy_db(entry, conn)

    # Get content and update the citations table.
    logging.info("now get the content for the new stuff")
    for item in getContentPMC(pmc_ids, user_input, conn):
        row_pmcid = str(item["pmcid"])
        row_cites = str(item["citesPmid"])
        abstract_flag = str(item["all_abstract_check"][0])
        article_flag = str(item["all_article_check"][0])
        content_stmt = (
            citations.update()
            .where(citations.c.pmcid == row_pmcid)
            .where(citations.c.citesPmid == row_cites)
            .values({"abstract": abstract_flag, "whole_article": article_flag})
        )
        conn.execute(content_stmt)

    return 'yes'