import os
import pprint

import pubmed_parser as pp


def read_one_from_web():
    # Fetch a single article from the PubMed API and keep the raw XML.
    pmid = "11250746"
    d = pp.parse_xml_web(pmid, save_xml=True)
    xml = d["xml"]
    d.pop("xml", None)
    pprint.pprint(d)
    print()
    # Save the raw XML next to the parsed output for later reuse.
    os.makedirs("xml", exist_ok=True)
    with open("xml/{}.xml".format(pmid), "wb") as f:
        f.write(xml)
import os
import time

import numpy as np
import pandas as pd
import pubmed_parser as pp
from tqdm import tqdm


def download_api(ids, outfile, save_dir):
    # If there are more than 1000 ids, split them into batches of roughly 1000.
    if len(ids) > 1000:
        chunks = np.array_split(ids, int(np.ceil(len(ids) / 1000)))
    else:
        chunks = [ids]

    # Run in chunks
    print("Downloading from API in {} chunks".format(len(chunks)))
    i = 1
    for chunk in chunks:
        # Lists to collect parsed articles and outgoing citations
        articles = []
        citations = []
        for pmid in tqdm(chunk):
            try:
                # Article metadata
                time.sleep(0.25)
                articles.append(pp.parse_xml_web(pmid, save_xml=False))
                # Outgoing citations
                time.sleep(0.25)
                d_ = pp.parse_outgoing_citation_web(pmid, id_type='PMID')
                if d_:
                    for bpmid in d_['pmid_cited']:
                        citations.append({'apmid': d_['doc_id'], 'bpmid': bpmid})
            except TypeError:
                pass

        # Write the chunk results to pickle files
        df_data = pd.DataFrame(articles)
        df_data.to_pickle(os.path.join(save_dir, 'df_data_' + str(i) + '.pkl'))
        df_ref = pd.DataFrame(citations)
        df_ref.to_pickle(os.path.join(save_dir, 'df_ref_' + str(i) + '.pkl'))

        # Log the PMIDs downloaded in this chunk
        if len(df_data) != 0:
            for pmid in df_data['pmid'].tolist():
                outfile.write(pmid + '\n')
        i += 1
    print("All done!")
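# Hypothetical driver for download_api, added as a usage sketch (not part of the
# original source): the PMID list, log file name, and output directory below are
# illustrative assumptions.
if __name__ == "__main__":
    example_pmids = ["11250746", "27231887"]
    os.makedirs("downloads", exist_ok=True)
    with open(os.path.join("downloads", "downloaded_pmids.log"), "w") as log:
        download_api(example_pmids, log, "downloads")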
import pandas as PD
import pubmed_parser as PP


def make_csv(catfile):
    headers = [
        'pmid', 'title', 'abstract', 'journal', 'affiliation', 'authors',
        'year', 'keywords', 'category'
    ]
    csv_db = PD.DataFrame(columns=headers, index=None)
    labelled = PD.read_csv(catfile)
    pmids = labelled.loc[:, 'pubmed_id']
    pmid_cat = labelled.loc[:, 'paper_category']
    for n, c in zip(pmids, pmid_cat):
        # Fetch the article from the PubMed API and append one row per PMID.
        paper = PP.parse_xml_web(n)
        paper_row = [n, paper['title'], paper['abstract'], paper['journal'],
                     paper['affiliation'], paper['authors'], paper['year'],
                     paper['keywords'], c]
        csv_db.loc[len(csv_db) + 1] = paper_row
    outfile_name = catfile[:-4] + '_processed.csv'
    csv_db.to_csv(outfile_name, index=None)
    return outfile_name
import pandas as PD
import pubmed_parser as PP


def make_csv_(catfile, outfile_name):
    headers = [
        'pmid', 'title', 'abstract', 'journal', 'affiliation', 'authors',
        'year', 'keywords', 'category'
    ]
    csv_db = PD.DataFrame(columns=headers, index=None)
    labelled = catfile
    pmids = labelled.loc[:, 'id']
    pmid_cat = labelled.loc[:, 'label']
    counter = 0
    for n, c in zip(pmids, pmid_cat):
        paper = PP.parse_xml_web(n)
        paper_row = [n, paper['title'], paper['abstract'], paper['journal'],
                     paper['affiliation'], paper['authors'], paper['year'],
                     paper['keywords'], c]
        csv_db.loc[len(csv_db) + 1] = paper_row
        counter += 1
        # Print a simple progress counter every 10 articles.
        if counter % 10 == 0:
            print(counter, end=' ')
    csv_db.to_csv(outfile_name, index=None)
    return outfile_name
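# Usage sketch for make_csv_ (an assumption, not from the original source): a
# small labelled DataFrame with 'id' and 'label' columns and an illustrative
# output file name.
labelled_df = PD.DataFrame({
    "id": ["11250746", "27231887"],
    "label": ["included", "excluded"],
})
make_csv_(labelled_df, "labelled_processed.csv")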
    'B-Intervention': 7,
    'I-Intervention': 8,
    'B-Comparator': 9,
    'I-Comparator': 10,
    'B-Outcome': 11,
    'I-Outcome': 12
}
idx2tag = {idx: tag for tag, idx in tag2idx.items()}

#%% App
st.header('PICO extraction for in vivo abstract')
# pmid = st.text_input('Input one PMID: ', 23326526)
pmid = st.text_input('Input one PMID: ', 27231887)

try:
    xml = pubmed_parser.parse_xml_web(pmid)
    title = xml['title']
    text = xml['abstract']
    if text == "":
        st.text("No abstract available!")
    else:
        # st.write(title)
        # st.write(text)
        ## Extract pico text
        text = sent_detect(text, sent_pth)
        # Extract pico phrases
        tup, _, _ = pico_extract(text, pth_path, idx2tag)
        res = tup2dict(tup)
        st.write("""### Extracted PICO text ### """)
import pubmed_parser as pp


def pmid2text(pmid):
    # Fetch an article from the PubMed API and return its title and abstract.
    outputs = pp.parse_xml_web(pmid, save_xml=False)
    return outputs["title"], outputs["abstract"]
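# Quick check of pmid2text, added for illustration; the PMID is the one used in
# the Streamlit example above.
title, abstract = pmid2text("27231887")
print(title)
print(abstract[:200])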
def get_article_info_dict_from_pubmed(self):
    """
    Uses self.pmid_list, self.store_file_name and self.update.

    :return: a dictionary keyed by pmid; each value is a dictionary describing
        the referred article, with the keys below:
            title       : title
            abstract    : abstract
            journal     : journal
            affiliation : affiliations of the first author (list, split on ';')
            authors     : author names (list, split on ';')
            year        : publication year
            keywords    : keywords or MeSH terms of the article (list)
        store path: self.store_file_name + 'articles.npy'
    """
    print('------------------------------------------------------------')
    print('Get article info:')

    def clean(s):
        # Strip markup characters, then surrounding whitespace and leading punctuation.
        for ch in ("*", "¶", "†", "§", "-", "<", ">", "#"):
            s = s.replace(ch, " ")
        s = s.strip()
        if s == '':
            return ''
        if s[0] == ',' or s[0] == '.':
            s = s[1:]
        if s == '':
            return ''
        # if s[-1] == ')':
        #     s = s[:-1]
        return s

    begin_time = time.time()
    store_file_name = self.store_file_name + 'articles' + '.npy'
    article_info_dict = dict()
    if os.path.exists(store_file_name):
        # TODO(lyj): why isn't flat[0] used here?
        article_info_dict = np.load(store_file_name, allow_pickle=True).flat[0]
        if not self.update:
            return article_info_dict

    delete_list = article_info_dict.keys() - self.pmid_list
    add_list = self.pmid_list - article_info_dict.keys()
    print("need to delete %d articles" % len(delete_list))
    print("need to add %d articles" % len(add_list))
    for pmid in delete_list:
        del article_info_dict[pmid]

    i = 0
    for pmid in add_list:
        # Periodically checkpoint the dictionary to disk.
        if i % 20 == 0:
            np.save(store_file_name, article_info_dict)
            print('\n%d new articles are saved' % i)
        try:
            article_info = pp.parse_xml_web(pmid, save_xml=False)
        except KeyboardInterrupt:
            # do the saving here
            exit(-1)  # exit the program with the return code -1
        except Exception:
            print('\nerror in ', pmid)
        else:
            # Authors: split on ';' and clean each name.
            temp_list = []
            for item in article_info['authors'].split(';'):
                item = clean(item)
                if item == '':
                    continue
                temp_list.append(item)
            article_info['authors'] = temp_list

            # Affiliations: split on ';' and clean each entry.
            temp_list = []
            for item in article_info['affiliation'].split(';'):
                item = clean(item)
                if item == '':
                    continue
                temp_list.append(item)
            article_info['affiliation'] = temp_list

            # Keywords: split on ';' (and ',' inside an entry), lower-case single terms.
            temp_list = []
            for item in article_info['keywords'].split(';'):
                # item = item.split(':')
                # item = item[1] if len(item) > 1 else item[0]
                item = clean(item)
                if item == '':
                    continue
                if "," in item:
                    temp_list += item.split(',')
                else:
                    temp_list.append(item.lower())
            article_info['keywords'] = temp_list

            article_info_dict[pmid] = article_info
            i += 1
            print(i, end=' ')

    np.save(store_file_name, article_info_dict)
    print('\n%d new articles are saved' % i)
    print('Get article info done, used time:', time.time() - begin_time)
    print('------------------------------------------------------------')
    return article_info_dict