def get_TermsFrequency(query_terms):
    """Count the frequency of MESH terms being used in all abstracts.

    Fetches the abstracts matching `query_terms` through eUtils, then tallies
    how often each MeSH term (node size) and each unordered pair of MeSH
    terms (edge weight) occurs across all abstracts.

    Returns a tuple `(sorted_MESH_frequency, sorted_MESH_bifrequency)` of
    (key, count) lists, both sorted by decreasing count; bi-term pairs that
    occur in only one abstract are filtered out.
    """
    abst_data = eUtils.fetch_abstr(query_terms)
    # node size: freq of term in all abstracts
    MESH_frequency = defaultdict(int)
    # edge weights: freq of bi-terms in all abstracts
    MESH_bifrequency = defaultdict(int)
    for abst in abst_data:
        MESH = abst['MESH']
        # Sorting makes combinations() emit each pair in a canonical order,
        # so (a, b) and (b, a) collapse onto the same dict key.
        MESH_terms = sorted([i.strip(' ') for i in str(MESH).split(',')])
        for term in MESH_terms:
            MESH_frequency[term] += 1
        # all bi-terms combinations
        for biterm in itertools.combinations(MESH_terms, 2):
            MESH_bifrequency[biterm] += 1
    # NOTE: .items() (not the Python-2-only .iteritems()) keeps this code
    # working on both Python 2 and Python 3.
    sorted_MESH_frequency = sorted(MESH_frequency.items(),
                                   key=operator.itemgetter(1), reverse=True)
    # filter the biterms that occur in 2 or more abstracts
    filter_MESH_bifrequency = dict((k, v) for k, v in MESH_bifrequency.items()
                                   if v > 1)
    sorted_MESH_bifrequency = sorted(filter_MESH_bifrequency.items(),
                                     key=operator.itemgetter(1), reverse=True)
    return (sorted_MESH_frequency, sorted_MESH_bifrequency)
def try_to_update_term(data, term):
    """Validate `term`, try to build its micro-corpus, and persist the result.

    The normalized term and a validity flag are always written back to
    `data`; returns True when the micro-corpus could be built, False when
    PubMed errored or returned no hit. Raises TermException on a term that
    contains a forbidden token.
    """
    # Spaces cause eUtils queries to fail, so normalize them to '+'.
    term = term.replace('\n', '').replace(' ', '+').upper()
    # Minimal sanity screen for term inconsistencies.
    for bad_token in ['/', ' ', 'CRDT', 'CRDAT']:
        if bad_token in term:
            raise TermException(bad_token)
    success = False
    try:
        # If the micro-corpus can be created with the new term, the update
        # goes through; any PubMed failure leaves `success` False.
        sample = eUtils.fetch_abstr(
            term = term,
            retmax = config.RETMAX,
            email = config.ADMAIL
        )
        mu_corpus = dict(
            (entry['pmid'], tfidf.preprocess(entry['text']))
            for entry in sample
        )
        data.mu_corpus = zlib.compress(json.dumps(mu_corpus))
    except (eUtils.PubMedException, eUtils.NoHitException):
        # PubMed error or no hit.
        success = False
    else:
        success = True
    data.term_valid = success
    data.term = term
    data.put()
    return success
def get_TermsFrequency(query_terms):
    """Count the frequency of MESH terms being used in all abstracts.

    Fetches the abstracts matching `query_terms` through eUtils, then tallies
    how often each MeSH term (node size) and each unordered pair of MeSH
    terms (edge weight) occurs across all abstracts.

    Returns a tuple `(sorted_MESH_frequency, sorted_MESH_bifrequency)` of
    (key, count) lists, both sorted by decreasing count; bi-term pairs that
    occur in only one abstract are filtered out.
    """
    abst_data = eUtils.fetch_abstr(query_terms)
    # node size: freq of term in all abstracts
    MESH_frequency = defaultdict(int)
    # edge weights: freq of bi-terms in all abstracts
    MESH_bifrequency = defaultdict(int)
    for abst in abst_data:
        MESH = abst['MESH']
        # Sorting makes combinations() emit each pair in a canonical order,
        # so (a, b) and (b, a) collapse onto the same dict key.
        MESH_terms = sorted([i.strip(' ') for i in str(MESH).split(',')])
        for term in MESH_terms:
            MESH_frequency[term] += 1
        # all bi-terms combinations
        for biterm in itertools.combinations(MESH_terms, 2):
            MESH_bifrequency[biterm] += 1
    # NOTE: .items() (not the Python-2-only .iteritems()) keeps this code
    # working on both Python 2 and Python 3.
    sorted_MESH_frequency = sorted(MESH_frequency.items(),
                                   key=operator.itemgetter(1), reverse=True)
    # filter the biterms that occur in 2 or more abstracts
    filter_MESH_bifrequency = dict((k, v) for k, v in MESH_bifrequency.items()
                                   if v > 1)
    sorted_MESH_bifrequency = sorted(filter_MESH_bifrequency.items(),
                                     key=operator.itemgetter(1), reverse=True)
    return (sorted_MESH_frequency, sorted_MESH_bifrequency)
def test_fetch_abstr(self):
    """This also tests 'SAXmed.eFetchResultHandler'."""
    query = u'nature[journal]+AND+2012/12/21[crdt]'
    # PMIDs of the Nature entries created on 2012/12/21, newest first.
    expected_pmids = [
        u'23254940', u'23254938', u'23254936', u'23254935',
        u'23254933', u'23254931', u'23254930', u'23254929'
    ]
    fetched = eUtils.fetch_abstr(query)
    self.assertEqual(len(fetched), 8)
    self.assertEqual([entry['pmid'] for entry in fetched], expected_pmids)
    # NOTE(review): this chunk starts inside an argparse.ArgumentParser(...)
    # call whose opening line is outside this view — presumably
    # `parser = argparse.ArgumentParser(`; confirm against the full file.
    prog = 'getPubMedAbs_NLP_summarize.py',
    description = """ Get the abstract from pubmed query and the important sentences being bolded""",
    formatter_class=RawTextHelpFormatter
)
# Single optional positional-style query flag; defaults to '.'.
parser.add_argument(
    '-q', '--query',
    metavar = 'q',
    type = str,
    nargs = '?',
    default = '.',
    help = 'pubmed query'
)
args = parser.parse_args()
# Load in output from blogs_and_nlp__get_feed.py
abst_data = eUtils.fetch_abstr(args.query)
#f = open('output/results_summary.html', 'w')
# Emit a minimal HTML report on stdout: one linked title plus a bolded
# summary per abstract.
sys.stdout.write('<html><head><meta charset="utf-8"></head><body>\n')
sys.stdout.write('<h1>There are %d abstracts available </h1>' % (len(abst_data)))
# NOTE(review): `abs` shadows the builtin abs(); harmless here but worth
# renaming if this script is ever touched again.
for abs in abst_data:
    sys.stdout.write('<br/><h4><a href="http://www.ncbi.nlm.nih.gov/pubmed/%s">' % abs['pmid'])
    # .encode('utf-8') yields bytes — assumes Python 2 stdout semantics;
    # TODO confirm before running under Python 3.
    sys.stdout.write(abs['title'].encode('utf-8') + '</a></h4><br/>')
    summary = summarize(abs['text'])
    summary = [i.encode('utf-8') for i in summary]
    sys.stdout.write(' '.join(summary))
    sys.stdout.write('<br/>-----------<br/>')
sys.stdout.write('\n</body></html>')
def get_hits_and_send_mail(data):
    """Routine to fetch user's hits and send them the results."""
    # We query PubMed for the entries created yesterday.
    # There is a bit of variability on the update time,
    # so one might miss the entries of today if they are
    # put after the cron time.
    yesterday = date.today() - timedelta(1)
    the_day_before = yesterday - timedelta(1)
    one_year_ago = yesterday - timedelta(365)
    term = str(data.term)
    # Query restricted to items created yesterday (crdt = creation date).
    # The %% survives strftime and reaches PubMed as a literal %2F ('/').
    term_yesterday = "(" + term + ")" + yesterday.strftime("+AND+(%Y%%2F%m%%2F%d[crdt])")
    # Same term over the window [one year ago, the day before yesterday].
    # NOTE(review): term_older is built but never used in this function —
    # confirm whether it is dead code or meant for a later query.
    term_older = (
        "(" + term + ")"
        + one_year_ago.strftime("+AND+(%Y%%2F%m%%2F%d:")
        + the_day_before.strftime("%Y%%2F%m%%2F%d[crdt])")
    )
    # Fetch the abstracts.
    abstr_list = []
    try:
        abstr_list = eUtils.fetch_abstr(
            term=term_yesterday,
            # Limit on all queries, to keep it light.
            retmax=config.RETMAX,
            email=config.ADMAIL,
        )
    except eUtils.NoHitException:
        # Nothing new yesterday: nothing to mail.
        return
    except eUtils.PubMedException as e:
        # Log and fall through; abstr_list stays empty.
        logging.warn("%s: %s" % (data.user.email(), str(e)))
    # Can be empty. No big deal, just return.
    if not abstr_list:
        return
    # Feedback counts only when BOTH relevant and irrelevant docs exist.
    # NOTE(review): decrypt is called again below for the same fields —
    # presumably cheap, but could be hoisted; verify utils.decrypt cost.
    user_gave_relevance_feedback = utils.decrypt(data, "relevant_docs") and utils.decrypt(data, "irrelevant_docs")
    if not user_gave_relevance_feedback:
        # No relevance feedback: set all scores to 0 and move on.
        for abstr in abstr_list:
            abstr["score"] = 0.0
    else:
        # User gave feedback: recall their data and compute scores.
        relevant_docs = utils.decrypt(data, "relevant_docs")
        irrelevant_docs = utils.decrypt(data, "irrelevant_docs")
        mu_corpus = utils.decrypt(data, "mu_corpus")
        # Write the scores in place and sort (best score first).
        Classify.update_score_inplace(abstr_list, relevant_docs, irrelevant_docs, mu_corpus)
        abstr_list = sorted(abstr_list, key=lambda x: x.get("score", 0.0), reverse=True)
    # Set a limit on hit number.
    nhits = len(abstr_list)
    if nhits < config.MAXHITS + 1:
        maxhit_exceeded = ""
    else:
        # Send the top of the sorted list and notify the user.
        maxhit_exceeded = "Showing only the top %d hits." % config.MAXHITS
        abstr_list = abstr_list[: config.MAXHITS]
    ## Alchemy test.
    # Keyword extraction enabled for a single hard-coded test account only.
    if data.user.email() == "*****@*****.**":
        for abstr in abstr_list:
            query = json.loads(alchemy_keyword_query(abstr.get("text")))
            abstr["keywords"] = [kw["text"] for kw in query["keywords"]]
    # Make a security checksum.
    # 1. Concatenate the PMIDs (sorted, so order of hits doesn't matter).
    pmids = "".join(sorted([a["pmid"] for a in abstr_list]))
    # 2. Add the random salt, and compute the SHA1 digest.
    checksum = sha1(pmids + data.salt).hexdigest()
    template_vals = {
        "nhits": nhits,
        "maxhit_exceeded": maxhit_exceeded,
        "uid": data.user.user_id(),
        "checksum": checksum,
        "abstr_list": abstr_list,
    }
    # Create the hits email message and send.
    msg = mail.EmailMessage()
    msg.initialize(
        to=data.user.email(),
        sender="*****@*****.**",
        subject="Recently on PubMed",
        body="Message in HTML format.",
        html=utils.render("mail.html", template_vals),
    )
    msg.send()
    logging.warn("mail sent to %s" % data.user.email())
    return