def process_file(date_update):
    """Process downloaded MEDLINE folder to parquet file"""
    print("Process MEDLINE file to parquet")
    # remove output folders if they still exist; pass the expanded glob to rm,
    # since subprocess without a shell does not expand wildcard patterns
    if glob(os.path.join(save_dir, 'medline_*.parquet')):
        subprocess.call(['rm', '-rf'] + glob(os.path.join(save_dir, 'medline_*.parquet')))
    date_update_str = date_update.strftime("%Y_%m_%d")
    path_rdd = sc.parallelize(glob(os.path.join(download_dir, 'medline*.xml.gz')),
                              numSlices=1000)
    parse_results_rdd = path_rdd.\
        flatMap(lambda x: [Row(file_name=os.path.basename(x), **publication_dict)
                           for publication_dict in pp.parse_medline_xml(x)])
    medline_df = parse_results_rdd.toDF()
    medline_df.write.parquet(os.path.join(save_dir, 'medline_raw_%s.parquet' % date_update_str),
                             mode='overwrite')

    # keep only the latest, non-deleted view of each PMID
    window = Window.partitionBy(['pmid']).orderBy(desc('file_name'))
    windowed_df = medline_df.select(
        max('delete').over(window).alias('is_deleted'),
        rank().over(window).alias('pos'),
        '*')
    windowed_df.\
        where('is_deleted = False and pos = 1').\
        write.parquet(os.path.join(save_dir, 'medline_lastview_%s.parquet' % date_update_str),
                      mode='overwrite')

    # parse grant database
    parse_grant_rdd = path_rdd.flatMap(lambda x: pp.parse_medline_grant_id(x))\
        .filter(lambda x: x is not None)\
        .map(lambda x: Row(**x))
    grant_df = parse_grant_rdd.toDF()
    grant_df.write.parquet(os.path.join(save_dir, 'medline_grant_%s.parquet' % date_update_str),
                           mode='overwrite')
def parse_medline_articles(path='medline', saved_path='parsed_articles',
                           year_start=2000, year_stop=2018):
    """
    Given a ``path`` to a folder containing .xml.gz files of MEDLINE articles,
    parse them and save the parsed articles to ``saved_path``

    Input
    =====
    path: str, path to folder with all .xml.gz files
    saved_path: str, path to save parsed articles to
    year_start: int, first year of parsed articles to save
    year_stop: int, last year of parsed articles to save
    """
    paths = glob(os.path.join(path, '*.xml.gz'))
    # create the output directory if it is not there yet
    if not os.path.isdir(saved_path):
        os.mkdir(saved_path)
    for i, path in enumerate(paths):
        all_parsed_papers = []
        parsed_papers = pp.parse_medline_xml(path)
        for paper in parsed_papers:
            try:
                if year_start <= int(paper['pubdate']) <= year_stop:
                    all_parsed_papers.append(paper)
            except (KeyError, TypeError, ValueError):
                # skip records whose pubdate cannot be parsed as a year
                pass
        save_json(all_parsed_papers, os.path.join(saved_path, 'parsed_%d.json' % i))
    print('done!')
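A minimal usage sketch for the function above, assuming its imports (glob, os, pp, save_json) are in scope; the folder names here are hypothetical:

# hypothetical folders; keep only articles published 2010-2015
parse_medline_articles(path='medline_baseline',
                       saved_path='parsed_2010_2015',
                       year_start=2010, year_stop=2015)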
def medline2txt(xml_in, file):
    analyze_out = pp.parse_medline_xml(xml_in)
    bcnt = 0
    # print 'Medline2Txt', xml_in
    for paper in analyze_out:
        title = paper['title']
        # transliterate unicode to ASCII; plain byte strings are utf-8 encoded
        if isinstance(title, unicode):
            title = unicodedata.normalize('NFKD', title).encode('ascii', 'ignore')
        else:
            title = title.encode('utf-8')
        title = re.sub(r'\s\s+', ' ', title.replace('\n', ' '))
        abstract = paper['abstract']
        if isinstance(abstract, unicode):
            abstract = unicodedata.normalize('NFKD', abstract).encode('ascii', 'ignore')
        else:
            abstract = abstract.encode('utf-8')
        abstract = re.sub(r'\s\s+', ' ', abstract.replace('\n', ' '))
        text = '%s %s\n' % (title, abstract)  # PWTEES format
        file.write(text)
        bcnt = bcnt + 1
        if bcnt % 10000 == 0:
            print bcnt, 'medline records inserted.'
def article_stream(path, batch_size):
    # read PMIDs, one per line, and join each batch into a comma-separated list
    lines = [line.strip() for line in open(path, "r")]
    pmid_lists = [",".join(batch) for batch in make_batches(lines, batch_size)]
    for pmid_list in pmid_lists:
        print("Doing " + pmid_list.replace(",", ", ") + "...")
        api_url = build_api_url(pmid_list, retmode="xml")
        res = requests.get(api_url)
        if res.status_code != 200:
            raise requests.HTTPError(res.reason)
        with open("xml/tmp.xml", "w") as f:
            f.write(res.text)
        medline_json = pp.parse_medline_xml("xml/tmp.xml")
        for article in medline_json:
            yield article
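Since article_stream is a generator, results can be consumed lazily without holding every batch in memory; a sketch, assuming a file with one PMID per line (the file name is hypothetical):

# pull titles from articles fetched 100 PMIDs at a time
titles = [article["title"]
          for article in article_stream("pmids.txt", batch_size=100)]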
def medline2txt(xml_in, pmids, job_size, output_dir):
    analyze_out = pp.parse_medline_xml(xml_in)
    bcnt = 0
    print 'Medline2Txt', xml_in
    for paper in analyze_out:
        pmid = paper['pmid']
        if pmid not in pmids:
            continue
        # shard output files into job_size sub-directories by PMID
        sub_dir = '%s/%d' % (output_dir, int(pmid) % job_size)
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)
        title = paper['title'].encode('utf-8').replace('\n', ' ')
        title = u2a_convert(pmid, title, 'title')
        abstract = ''
        if paper['abstract'] is not None:
            abstract = paper['abstract'].encode('utf-8').replace('\n', ' ')
            abstract = u2a_convert(pmid, abstract, 'abstract')
        else:
            print 'Cannot find abstract for PMID %s' % pmid
        f_tmp_in_fn = '%s/%s.txt' % (sub_dir, pmid)
        f_tmp_in = open(f_tmp_in_fn, 'w')
        # text = '%s|t|%s\n%s|a|%s\n' % (pmid, title, pmid, abstract)  # PubTator format
        text = '%s %s' % (title, abstract)  # PWTEES format
        f_tmp_in.write(text)
        f_tmp_in.close()
        bcnt = bcnt + 1
        if bcnt % 1000 == 0:
            print bcnt, 'medline records inserted.'
def medline2txt(xml_in, pmids, job_size):
    analyze_out = pp.parse_medline_xml(xml_in)
    bcnt = 0
    print 'Medline2Txt', xml_in
    sub_dir = 'input_w/'
    if not os.path.exists(sub_dir):
        os.makedirs(sub_dir)
    f_tmp_in_fn = '%s/medline.txt' % sub_dir
    f_tmp_in = open(f_tmp_in_fn, 'w')
    for paper in analyze_out:
        if paper['abstract'] is None:
            # print "abisnone: ", paper['pmid']
            continue
        abstract = paper['abstract']
        # transliterate unicode to ASCII; plain byte strings are utf-8 encoded
        if isinstance(abstract, unicode):
            abstract = unicodedata.normalize('NFKD', abstract).encode('ascii', 'ignore')
        else:
            abstract = abstract.encode('utf-8')
        abstract = abstract.replace('\n', ' ')
        # text = '%s|t|%s\n%s|a|%s\n' % (pmid, title, pmid, abstract)  # PubTator format
        text = '%s\n' % abstract  # PWTEES format
        f_tmp_in.write(text)
        # bcnt = bcnt + 1
        # if bcnt % 1000 == 0:
        #     print bcnt, 'medline records inserted.'
    f_tmp_in.close()
def download_from_pmid_list(path, batch_size=1):
    # read PMIDs, one per line, and join each batch into a comma-separated list
    lines = [line.strip() for line in open(path, "r")]
    pmid_lists = [",".join(batch) for batch in make_batches(lines, batch_size)]
    # ensure output directories exist (xml/ is removed at the end of each run)
    os.makedirs("xml", exist_ok=True)
    os.makedirs("json", exist_ok=True)
    print()
    for pmid_list in pmid_lists:
        print("———————" + pmid_list + "———————")
        print()
        api_url = build_api_url(pmid_list, retmode="xml")
        res = requests.get(api_url)
        if res.status_code != 200:
            raise HTTPError(res.reason)
        d = xmltodict.parse(res.text)
        # with open("xml/tmp.xml", "w") as f:
        #     f.write(res.text)
        # medline_json = pp.parse_medline_xml("xml/tmp.xml")
        # for article in medline_json:
        #     with open("json/{}.json".format(article["pmid"]), "w") as f:
        #         f.write(json.dumps(article, indent=2))
        articles = d["PubmedArticleSet"]["PubmedArticle"]
        # When using xmltodict to convert a multiple-article request, the
        # PubmedArticle tag becomes a list instead of a dict.
        if isinstance(articles, dict):
            articles = [articles]
        for article in articles:
            out = {"PubmedArticleSet": {"PubmedArticle": article}}
            pmid = article["MedlineCitation"]["PMID"]["#text"]
            xml = xmltodict.unparse(out, pretty=True, full_document=False)
            # todo: write to /tmp/ instead?
            with open("xml/{}.xml".format(pmid), "w") as f:
                f.write(xml)
            medline_json = pp.parse_medline_xml("xml/{}.xml".format(pmid))
            with open("json/{}.json".format(pmid), "w") as f:
                f.write(json.dumps(medline_json, indent=2))
    shutil.rmtree("xml")
def sents2redis(xml_in, redis_server):
    analyze_out = pp.parse_medline_xml(xml_in)
    bcnt = 0
    r = redis.StrictRedis(host='%s' % redis_server, port=6379, db=0)
    pipe = r.pipeline()
    print 'Medline2Redis', xml_in
    for paper in analyze_out:
        pmid = paper['pmid']
        title = paper['title'].encode('utf-8').replace('\n', ' ')
        title = u2a_convert(pmid, title, 'title')
        abstract = ''
        if paper['abstract'] is not None:
            abstract = paper['abstract'].encode('utf-8').replace('\n', ' ')
            abstract = u2a_convert(pmid, abstract, 'abstract')
        else:
            print 'Cannot find abstract for PMID %s' % pmid
        # affiliation: corresponding author's affiliation
        # authors: authors, each separated by ;
        # mesh_terms: list of MeSH terms, each separated by ;
        # keywords: list of keywords, each separated by ;
        # pubdate: publication date; defaults to year information only
        year = paper['pubdate']
        author = paper['author']
        keywords = paper['keywords']
        mesh_terms = paper['mesh_terms']
        affiliation = paper['affiliation']
        journal = paper['journal']
        pipe.set('%s:title' % pmid, '%s' % title)
        pipe.set('%s:abstract' % pmid, '%s' % abstract)
        pipe.set('%s:pubtator' % pmid,
                 '%s|t|%s\n%s|a|%s' % (pmid, title, pmid, abstract))
        pipe.set('%s:pubdate' % pmid, year)
        pipe.set('%s:author' % pmid, author)
        pipe.set('%s:mesh_terms' % pmid, mesh_terms)
        pipe.set('%s:keywords' % pmid, keywords)
        pipe.set('%s:affiliation' % pmid, affiliation)
        pipe.set('%s:journal' % pmid, journal)
        txt_str = '%s %s' % (title, abstract)
        sents = get_sentences_by_geniass(pmid, txt_str)
        scnt = 0
        for sent in sents:
            pipe.set('%s:sentence:%d' % (pmid, scnt), sent)
            scnt += 1
        bcnt = bcnt + 1
        if bcnt % 100 == 0:
            print bcnt, 'cnv medline records inserted.'
    pipe.execute()
def stuff(t):
    with open("xml/tmp.xml", "w") as f:
        f.write(t)
    medline_json = pp.parse_medline_xml("xml/tmp.xml")
    for article in medline_json:
        with open("json/{}.json".format(article["pmid"]), "w") as f:
            f.write(json.dumps(article, indent=2))
def read_and_index_articles_file(self, infile_):
    infile = str(infile_)
    print("Reading %s " % infile)
    if infile.endswith(".xml.gz"):
        f = gzip.open(infile, 'rb')
    elif infile.endswith(".xml"):
        f = open(infile, 'rb')
    else:
        print("Ignoring '%s': filename does not end with '.xml' or '.xml.gz'"
              % infile)
        return
    articles = pp.parse_medline_xml(f)
    listattrs = [
        'authors', 'mesh_terms', 'publication_types', 'chemical_list',
        'keywords', 'references', 'affiliations'
    ]
    ids = set()
    deletedrecords, deletedpmids = list(), list()
    for i, ar in enumerate(articles):
        if ar['delete']:
            # DeleteCitation entries at the end of the xml archive files
            # are parsed to an object with field values set to float NaN
            deletedrecords.append(i)
            deletedpmids.append(ar['pmid'])
            continue
        try:
            num(ar, 'pmc')
        except ValueError:
            ar['pmc'] = 2000
        ar['_id'] = num(ar, 'pmid')
        ids.add(ar['_id'])
        try:
            ar['pubdate'] = datetime.datetime(int(ar['pubdate']), 1, 1)
        except ValueError:
            print(ar['pubdate'])
            ar['pubdate'] = datetime.datetime(2000, 1, 1)
        for listattr in listattrs:
            if len(ar[listattr]) == 0:
                del ar[listattr]
            else:
                spr = ';' if listattr in ['authors', 'references'] else '; '
                ar[listattr] = ar[listattr].split(spr)
    for i in reversed(deletedrecords):
        del articles[i]
    self.qry.deletepubmedids(deletedpmids)
    if self.db == "Elasticsearch":
        if not self.qry.checkpubmedidsindexed(list(ids)):
            self.es_index(articles)
        else:
            print("Records in %s look to have been indexed, skipping" % infile)
    else:  # assume MongoDB
        self.mdb_index(articles)
def read_xml_to_dict(folder_to_xmls, all_xml_files=None, keys_to_parse=None):
    '''
    Read xml data into dicts and store the desired dict values corresponding
    to the keys specified in the list keys_to_parse

    Input
    -----
    folder_to_xmls: pathlib.PosixPath object denoting the path to the folder
        containing all the .xml files to be read
    all_xml_files: list of str, denoting the file names to be read in
        folder_to_xmls. Can be obtained from get_files_in_folder()
    keys_to_parse: list of str, denoting the keys of the dictionary holding
        all the xml data from each file. The dictionary is created from
        parse_medline_xml(), part of the pubmed_parser package

    Output
    ------
    all_values: list of len(keys_to_parse) lists L
        all_values[i] contains a list L with all the values corresponding to
        key=keys_to_parse[i]. len(L) depends on the data contained in the
        .xml files that will be read.
    '''
    xml_file = []  # keep here the xml file name from which the data are read
    # Initialize a list of N empty lists with N=len(keys_to_parse).
    # This is where we store all the values of the keys_to_parse keys for
    # every dict. Note that [[]] * N aliases the same empty list N times;
    # the else branch below rebinds each slot to a fresh list before
    # anything is ever appended to it, so the aliasing is harmless here.
    all_values = [[]] * len(keys_to_parse)
    for current_xml in all_xml_files:
        print('\nIterating file...:', current_xml)
        # Normally parse_pubmed_xml() would be the choice here (since we get
        # data from PubMed), but that does not appear to handle these files.
        # parse_medline_xml() gets the desired info from the xml files
        # instead. To be further checked.
        dicts_out = pp.parse_medline_xml(str(folder_to_xmls / current_xml))
        for d in dicts_out:
            for i, key in enumerate(keys_to_parse):
                try:
                    if all_values[i]:
                        all_values[i].append(d[key])
                    else:
                        # Store the value of key in a list so it can be
                        # further appended in the loop
                        all_values[i] = [d[key]]
                    xml_file.append(current_xml)  # keep xml file name
                except KeyError:
                    print('\nKey ', key, ' not found!')
    return all_values, xml_file
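A hedged usage sketch for read_xml_to_dict, unpacking one list per requested key; the folder and file names are hypothetical, and the keys are standard parse_medline_xml fields:

from pathlib import Path

folder = Path('pubmed_xmls')                  # hypothetical folder
files = ['batch_0001.xml', 'batch_0002.xml']  # hypothetical file names
keys = ['pmid', 'title', 'abstract']
all_values, xml_file = read_xml_to_dict(folder, files, keys)
pmids, titles, abstracts = all_values         # one list per requested key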
def medline2redis(xml_in, pmids, redis_server):
    analyze_out = pp.parse_medline_xml(xml_in)
    cnt = 0
    bcnt = 0
    r = redis.StrictRedis(host='%s' % redis_server, port=6379, db=0)
    pipe = r.pipeline()
    print 'Medline2Redis', xml_in
    for paper in analyze_out:
        cnt = cnt + 1
        if cnt % 1000 == 0:
            print cnt, "medline records processed"
        pmid = paper['pmid']
        if pmid not in pmids:
            continue
        title = paper['title'].encode('utf-8').replace('\n', ' ')
        title = u2a_convert(pmid, title, "title")
        abstract = paper['abstract'].encode('utf-8').replace('\n', ' ')
        abstract = u2a_convert(pmid, abstract, "abstract")
        # affiliation: corresponding author's affiliation
        # authors: authors, each separated by ;
        # mesh_terms: list of MeSH terms, each separated by ;
        # keywords: list of keywords, each separated by ;
        # pubdate: publication date; defaults to year information only
        year = paper['pubdate']
        author = paper['author']
        keywords = paper['keywords']
        mesh_terms = paper['mesh_terms']
        affiliation = paper['affiliation']
        journal = paper['journal']
        pipe.set('%s:title' % pmid, '%s' % title)
        pipe.set('%s:abstract' % pmid, '%s' % abstract)
        pipe.set('%s:pubtator' % pmid,
                 '%s|t|%s\n%s|a|%s' % (pmid, title, pmid, abstract))
        pipe.set('%s:pubdate' % pmid, year)
        pipe.set('%s:author' % pmid, author)
        pipe.set('%s:mesh_terms' % pmid, mesh_terms)
        pipe.set('%s:keywords' % pmid, keywords)
        pipe.set('%s:affiliation' % pmid, affiliation)
        pipe.set('%s:journal' % pmid, journal)
        bcnt = bcnt + 1
        if bcnt % 100 == 0:
            print bcnt, 'cnv medline records inserted.'
    pipe.execute()
def parse_medline_xml(xml_file, output_file):
    """Import a MEDLINE XML file into the prophet database."""
    # For medline
    import pubmed_parser as pp
    dicts_out = pp.parse_medline_xml(xml_file,
                                     year_info_only=False,
                                     nlm_category=True,
                                     author_list=True,
                                     reference_list=True)
    with open(output_file, 'w') as fp:
        json.dump(dicts_out, fp, cls=DateEncoder)
def main(psql=True):
    input_file_path = "data/pubmed/pubmed20n0340.xml.gz"
    logging.info(f"Processing articles from {input_file_path}.")
    article_dicts = pp.parse_medline_xml(input_file_path,
                                         year_info_only=False,
                                         author_list=False,
                                         reference_list=True)
    logging.info(f"Loaded articles from {input_file_path}.")
    global_init()
    for ad in article_dicts:
        logging.info(f"Processing article {ad['pmid']}.")
        article = Article()
        article.id = "PMID:" + ad["pmid"]
        article.version = "v1"
        article.source = "PubMed"
        article.journal = ad["journal"]
        article.article_type = "postprint"
        article.title = ad["title"]
        pubdate = ad["pubdate"]
        pubdate_dashes = pubdate.count("-")
        if pubdate_dashes == 2:
            # format of parsed pubdate is YYYY-MM-DD
            article.publication_date = date.fromisoformat(pubdate)
        elif pubdate_dashes == 1:
            # format of parsed pubdate is YYYY-MM
            article.publication_date = date.fromisoformat(pubdate + "-01")
        else:
            # format of parsed pubdate is YYYY
            article.publication_date = date.fromisoformat(pubdate + "-01-01")
        article.update_date = date.today()
        article.modified_date = datetime.now()
        article.link = "https://pubmed.ncbi.nlm.nih.gov/" + ad["pmid"]
        article.pmid = ad["pmid"]
        article.doi = ad["doi"]
        article.summary = ad["abstract"]
        article.full_text = ""
        article.authors = [x.strip() for x in ad["authors"].split(";") if x != ""]
        article.affiliations = [ad["affiliations"]]
        article.language = ""
        article.keywords = [x.strip() for x in ad["keywords"].split(";") if x != ""]
        article.references = [r["pmid"] for r in ad["references"]]
        article.tags = [
            x.strip()
            for k in ["mesh_terms", "publication_types", "chemical_list"]
            for x in ad[k].split(";")
            if x != ""
        ]
        if psql:
            with session_scope() as sess:  # TODO move to outer for loop
                sess.add(article)
def write_articles(articles):
    for article in articles:
        out = {"PubmedArticleSet": {"PubmedArticle": article}}
        pmid = article["MedlineCitation"]["PMID"]["#text"]
        xml = xmltodict.unparse(out, pretty=True, full_document=False)
        # todo: write to /tmp/ instead?
        with open("xml/{}.xml".format(pmid), "w") as f:
            f.write(xml)
        medline_json = pp.parse_medline_xml("xml/{}.xml".format(pmid))
        with open("json/{}.json".format(pmid), "w") as f:
            f.write(json.dumps(medline_json, indent=2))
def parse_results_map(key):
    """Parse MEDLINE XML file"""
    # Extract name of file from key
    key_name = key.name.encode('utf-8')
    data_file = os.path.basename(key_name)
    # Download file from S3 bucket
    key.get_contents_to_filename(data_file)
    # Parse file
    temp = [
        Row(file_name=os.path.basename(data_file), **publication_dict)
        for publication_dict in pp.parse_medline_xml(data_file)
    ]
    # Delete file from local directory
    subprocess.call(['rm', '-rf', data_file])
    return temp
def test_parse_medline_xml():
    """
    Test parsing MEDLINE XML
    """
    expected_title = "Monitoring of bacteriological contamination and as"
    expected_abstract = "Two hundred and sixty nine beef, 230 sheep and 165"
    parsed_medline = pp.parse_medline_xml(
        os.path.join("data", "pubmed20n0014.xml.gz"))
    assert isinstance(parsed_medline, list)
    assert len(parsed_medline) == 30000, "Expect to have 30000 records"
    assert len([p for p in parsed_medline if len(p["title"]) > 0]) == 30000, \
        "Expect every record to have a title"
    assert parsed_medline[0]["title"][0:50] == expected_title
    assert parsed_medline[0]["abstract"][0:50] == expected_abstract
    assert parsed_medline[0]["pmid"] == "399296"
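The test above also documents the return shape: a list of flat dicts keyed by strings such as 'pmid', 'title' and 'abstract'. A quick exploration sketch, under the same assumption that the sample archive exists locally:

records = pp.parse_medline_xml(os.path.join("data", "pubmed20n0014.xml.gz"))
print(sorted(records[0].keys()))                     # available fields
print(records[0]["pmid"], records[0]["title"][:50])  # "399296 Monitoring of ..."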
def _download_data(api_url: str):
    res = requests.get(api_url)
    if res.status_code != 200:
        raise requests.HTTPError(res.reason)
    with open("{}/medline.xml".format(_tmp_dir), "w") as f:
        f.write(res.text)
    medline_json_list = pp.parse_medline_xml("{}/medline.xml".format(_tmp_dir))
    # Map PMID to article
    new_data = {}
    for article in medline_json_list:
        new_data[article["pmid"]] = article
    return new_data
def process_open_xml(proc_id, xml_files, output_dir):
    import pubmed_parser as pp

    def filter_mesh(string):
        # drop the MeSH descriptor codes, keeping only the term names
        return " ".join(
            map(lambda y: y[0],
                map(lambda x: x.split(";"), string.split(":")[1:])))

    print("[Process-{}] Started".format(proc_id))
    articles = []
    for file_name in xml_files:
        print(proc_id, file_name)
        try:
            articles.extend(
                pp.parse_medline_xml(file_name,
                                     year_info_only=False,
                                     nlm_category=False))
        except etree.XMLSyntaxError:
            print("Error on File " + file_name)
        gc.collect()
    articles_filter = filter(
        lambda x: (x["abstract"] is not None and len(x["abstract"]) > 0
                   and x["pubdate"] != ""), articles)
    articles_mapped = list(
        map(
            lambda x: {
                "id": x["pmid"],
                "title": x["title"],
                "abstract": x["abstract"],
                "keywords": x["keywords"],
                "pubdate": x["pubdate"],
                "mesh_terms": filter_mesh(x["mesh_terms"]),
                "delete": x["delete"]
            }, articles_filter))
    file_name = output_dir + "/pubmed_2019_{0:03}.p".format(proc_id)
    print("[Process-{}]: Store {}".format(proc_id, file_name))
    with open(file_name, "wb") as f:
        pickle.dump(articles_mapped, f)
    del articles
    print("[Process-{}] Ended".format(proc_id))
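The nested filter_mesh is the least obvious part of the snippet above. Restated standalone with a made-up mesh_terms string to show the effect:

def filter_mesh(string):
    return " ".join(
        map(lambda y: y[0],
            map(lambda x: x.split(";"), string.split(":")[1:])))

# descriptor codes are dropped, term names kept
print(filter_mesh("D000818:Animals; D001824:Body Weight"))
# -> Animals Body Weight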
def parseXMLToDF(self):
    """
    Read XML files and parse them into a dataframe.

    Returns: Dataframe containing parsed papers
    """
    medline_files_rdd = self.SPARK_SESSION.sparkContext.parallelize(
        glob(self.xmlPath + "/*.xml"), self.numSlices)
    parse_results_rdd = medline_files_rdd.\
        flatMap(lambda x: [Row(file_name=os.path.basename(x), **publication_dict)
                           for publication_dict in pp.parse_medline_xml(x)])
    medline_df = parse_results_rdd.toDF()
    return medline_df
def merge(self):
    print('PubMed path:', self.pubmed_path)
    with open(self.output_filename, mode='w', newline='\n') as ofile:
        # PubMed
        for filename in glob.glob(os.path.join(self.pubmed_path, '**/*.xml'),
                                  recursive=self.recursive):
            print('file:', filename)
            dicts_out = pmp.parse_medline_xml(filename)
            self.write_dicts(dicts_out, 'abstract', ofile, 'title',
                             'pubmed_abstract')
        # PMC
        for filename in glob.glob(os.path.join(self.pubmed_path, '**/*.nxml'),
                                  recursive=self.recursive):
            print('file:', filename)
            # OA abstract
            try:
                dicts_out = [pmp.parse_pubmed_xml(filename)]
                self.write_dicts(dicts_out, 'abstract', ofile, 'full_title',
                                 'pmc_oa_abstract')
            except Exception:
                pass
            # OA image caption
            try:
                dicts_out = pmp.parse_pubmed_caption(filename)
                self.write_dicts(dicts_out, 'fig_caption', ofile, 'fig_label',
                                 'pmc_oa_image-caption')
            except Exception:
                pass
            # OA paragraph
            try:
                dicts_out = pmp.parse_pubmed_paragraph(filename,
                                                       all_paragraph=True)
                self.write_dicts(dicts_out, 'text', ofile, 'reference_ids',
                                 'pmc_oa_paragraph')
            except Exception:
                pass
def sents2txt(xml_in, output_file):
    analyze_out = pp.parse_medline_xml(xml_in)
    bcnt = 0
    print 'sents2txt ', xml_in
    sents = []
    for paper in analyze_out:
        pmid = paper['pmid']
        if paper['title'] is None:
            print '%s: title empty!' % pmid
            continue
        title = paper['title'].encode('utf-8').replace('\n', ' ')
        title = u2a_convert(pmid, title, 'title')
        abstract = ''
        if paper['abstract'] is not None:
            abstract = paper['abstract'].encode('utf-8').replace('\n', ' ')
            abstract = u2a_convert(pmid, abstract, 'abstract')
        else:
            print 'Cannot find abstract for PMID %s' % pmid
        txt_str = '%s %s' % (title, abstract)
        sub_sents = get_sentences_by_geniass(pmid, txt_str)
        sents += sub_sents
        # write out in chunks to keep memory usage bounded
        if len(sents) > 1000:
            f_output = open(output_file, 'a')
            for sent in sents:
                f_output.write('%s\n' % sent)
            f_output.close()
            sents = []
        bcnt = bcnt + 1
        if bcnt % 100 == 0:
            print bcnt, 'medline records processed.'
    # flush any remaining sentences from the last partial chunk
    if sents:
        f_output = open(output_file, 'a')
        for sent in sents:
            f_output.write('%s\n' % sent)
        f_output.close()
    print 'Sentences written to file %s' % output_file
def merge(self):
    print('PubMed path:', self.pubmed_path)
    with open(self.output_filename, mode='w', newline='\n') as ofile:
        for filename in glob.glob(self.pubmed_path + '/*.xml',
                                  recursive=self.recursive):
            print('file:', filename)
            dicts_out = pmp.parse_medline_xml(filename)
            for dict_out in dicts_out:
                if not dict_out['abstract']:
                    continue
                try:
                    for line in dict_out['abstract'].splitlines():
                        # skip very short lines
                        if len(line) < 30:
                            continue
                        ofile.write(line.strip() + " ")
                    ofile.write("\n\n")
                except Exception:
                    ofile.write("\n\n")
                    continue
def get_Pubtator_from_medline_xml(xml_in, pmids, txt_out):
    analyze_out = pp.parse_medline_xml(xml_in)
    pubtator_out = open(txt_out, 'w')
    cnt = 0
    for paper in analyze_out:
        cnt = cnt + 1
        if cnt % 1000 == 0:
            print cnt, "medline records processed"
        if paper['pmid'] not in pmids:
            continue
        pubtator_out.write("%s|t|%s" % (paper['pmid'],
                                        paper['title'].encode('utf-8')))
        pubtator_out.write('\n')
        pubtator_out.write("%s|a|%s" % (paper['pmid'],
                                        paper['abstract'].encode('utf-8')))
        pubtator_out.write('\n')
    pubtator_out.close()
def parse_single_doc(f):
    """parse a single document in medline"""
    # set file path
    file_name = "pubmed19n{:04d}.xml.gz".format(f)
    file_name = "../MEDLINE/" + file_name
    # dicts_out is a list of dictionaries
    dicts_out = pp.parse_medline_xml(file_name,
                                     year_info_only=False,
                                     nlm_category=False,
                                     author_list=False,
                                     reference_list=False)
    # keep abstracts that are non-empty
    texts = []
    for dict_ in dicts_out:
        abs_text = dict_['abstract']
        if len(abs_text) > 0:
            texts.append(abs_text.strip())
    return texts
def build_df_and_save_file_from_medline_xml(filename):
    # print(f"loading {filename}...")
    output_pickle_filename = filename + '.pickle.xz'
    try:
        # try loading the file, and make sure it has at least five rows
        # and an abstract (these lookups raise if the pickle is incomplete)
        df = pd.read_pickle(output_pickle_filename)
        assert len(df.loc[5, 'abstract']) > 10
        df.iloc[5]
        print(f"ALREADY PROCESSED, SKIPPING\t{filename}...")
        return pd.DataFrame()  # returning None makes Spark crash
    except Exception:
        # if we can't load the processed pickle file, generate it from the xml
        pubmed_dict = pp.parse_medline_xml(filename)  # dictionary output
        print(f"loaded {filename}\tcontains {len(pubmed_dict)} entries.")
        tmp_df = pd.DataFrame()
        tmp_df['year'] = [d['pubdate'] for d in pubmed_dict]
        tmp_df['abstract'] = [d['abstract'] for d in pubmed_dict]
        tmp_df['abstract'] = tmp_df['abstract'].str.lower()
        tmp_df['abstract_nchar'] = [len(t) for t in tmp_df['abstract']]
        # remove abstracts that are too short
        tmp_df = tmp_df[tmp_df.abstract_nchar > 100]
        tmp_df.reset_index(inplace=True, drop=True)
        tmp_df.to_pickle(output_pickle_filename, compression='xz')
        return tmp_df
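Since the function above returns an empty DataFrame rather than None for Spark's benefit, one way to drive it over many archives is a plain Spark map; a sketch assuming a SparkContext named sc and a hypothetical folder of .xml.gz files:

files = glob('pubmed_xml/*.xml.gz')  # hypothetical folder
dfs = sc.parallelize(files) \
        .map(build_df_and_save_file_from_medline_xml) \
        .collect()
combined = pd.concat(dfs, ignore_index=True)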
def medline2txt(xml_in, pmids, job_size):
    analyze_out = pp.parse_medline_xml(xml_in)
    cnt = 0
    bcnt = 0
    print 'Medline2Txt', xml_in
    for paper in analyze_out:
        cnt = cnt + 1
        if cnt % 1000 == 0:
            print cnt, "medline records processed"
        pmid = paper['pmid']
        if pmid not in pmids:
            continue
        # shard output files into job_size sub-directories by PMID
        sub_dir = 'input/%d' % (int(pmid) % job_size)
        title = paper['title'].encode('utf-8').replace('\n', ' ')
        title = u2a_convert(pmid, title, "title")
        abstract = paper['abstract'].encode('utf-8').replace('\n', ' ')
        abstract = u2a_convert(pmid, abstract, "abstract")
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)
        f_tmp_in_fn = '%s/%s.txt' % (sub_dir, pmid)
        f_tmp_in = open(f_tmp_in_fn, 'w')
        # text = '%s|t|%s\n%s|a|%s\n' % (pmid, title, pmid, abstract)  # PubTator format
        text = '%s %s' % (title, abstract)  # PWTEES format
        f_tmp_in.write(text)
        f_tmp_in.close()
        bcnt = bcnt + 1
        if bcnt % 100 == 0:
            print bcnt, 'cnv medline records inserted.'
def process_file(date_update):
    """Process downloaded MEDLINE folder to parquet file"""
    print("Process MEDLINE file to parquet")
    # remove output folders if they still exist; pass the expanded glob to rm,
    # since subprocess without a shell does not expand wildcard patterns
    if glob(os.path.join(save_dir, 'medline_*.parquet')):
        subprocess.call(['rm', '-rf'] + glob(os.path.join(save_dir, 'medline_*.parquet')))
    date_update_str = date_update.strftime("%Y_%m_%d")
    path_rdd = sc.parallelize(glob(
        os.path.join(download_dir, 'medline*.xml.gz')), numSlices=1000)
    parse_results_rdd = path_rdd.\
        flatMap(lambda x: [Row(file_name=os.path.basename(x), **publication_dict)
                           for publication_dict in pp.parse_medline_xml(x)])
    medline_df = parse_results_rdd.toDF()
    medline_df.write.parquet(os.path.join(
        save_dir, 'medline_raw_%s.parquet' % date_update_str),
        compression='gzip')

    # keep only the latest, non-deleted view of each PMID
    window = Window.partitionBy(['pmid']).orderBy(desc('file_name'))
    windowed_df = medline_df.select(
        max('delete').over(window).alias('is_deleted'),
        rank().over(window).alias('pos'),
        '*')
    windowed_df.\
        where('is_deleted = False and pos = 1').\
        write.parquet(os.path.join(save_dir,
                                   'medline_lastview_%s.parquet' % date_update_str),
                      compression='gzip')

    # parse grant database
    parse_grant_rdd = path_rdd.flatMap(lambda x: pp.parse_medline_grant_id(x))\
        .filter(lambda x: x is not None)\
        .map(lambda x: Row(**x))
    grant_df = parse_grant_rdd.toDF()
    grant_df.write.parquet(os.path.join(
        save_dir, 'medline_grant_%s.parquet' % date_update_str),
        compression='gzip')
            tempStr[:-1])  # append new string with boundaries
        index += 1
    return synTable


# print matches in sample dictionary to console
def found(pubmed_dict, synTable):
    count = 1  # for testing, to see how many matches
    for article in pubmed_dict:  # iterate through articles
        for index in range(len(synTable)):  # iterate through each search string
            find = synTable[index][3]  # string with regex word boundaries
            text = article['abstract']  # abstract to search
            # find regex string, ignore case sensitivity
            regex = re.findall(find, text, re.IGNORECASE)
            if regex:  # if found
                print(article['pmid'] + '\t' + synTable[index][0])
                print(regex)
                print(count)
                count += 1


synFile = "synonyms.txt"
pubmed_dict = pp.parse_medline_xml('pubmedsample18n0001.xml')
synTable = createSynTable(synFile)  # create synonym table
synTable = addRegexString(synTable)  # add new regex string to table
found(pubmed_dict, synTable)
import pubmed_parser as pp
import pandas as pd
import pymysql

pubmed_data = pp.parse_medline_xml("medsample1.xml")
db = pymysql.connect(host="127.0.0.1", user="******",
                     passwd="dehradun123", db="pubmed")
curr = db.cursor()
print(pubmed_data[5].keys())
print(pubmed_data[5])
try:
    curr.execute("""create table pubmed_article (
        pmid varchar(100) primary key,
        pmc varchar(100),
        issn_linking varchar(100),
        pubdate varchar(100),
        nlm_id varchar(50),
        title text,
        deleted varchar(50),
        abstract text,
        affiliation varchar(1000),
        journal varchar(1000),
        medline_ta varchar(100),
        country varchar(500),
        other_id varchar(200)
    );
    """)
except Exception as e:
    print("article", e)
def parse_abstracts(x):
    arr = []
    for publication_dict in pp.parse_medline_xml(x):
        if publication_dict['abstract'] != "":
            arr.append(Row(abstract=publication_dict['abstract']))
    return arr
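Because parse_abstracts returns a list of Row objects, it slots directly into flatMap on an RDD of file paths, in the same style as the Spark pipelines earlier in this section; a sketch assuming an active SparkSession named spark and a hypothetical folder:

paths = glob(os.path.join("medline", "*.xml.gz"))  # hypothetical folder
abstracts_df = spark.sparkContext.parallelize(paths) \
    .flatMap(parse_abstracts) \
    .toDF()
abstracts_df.show(5, truncate=80)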
import pubmed_parser as pp
import pandas as pd
import pymysql

pubmed_data = pp.parse_medline_xml("medsample1.xml")
db = pymysql.connect(host="127.0.0.1", user="******",
                     passwd="dehradun123", db="pubmed")
curr = db.cursor()
print(pubmed_data[5].keys())
print(pubmed_data[5])
try:
    curr.execute("""create table pubmed_article (
        pmid varchar(100) primary key,
        pmc varchar(100),
        issn_linking varchar(100),
        pubdate varchar(100),
        nlm_id varchar(50),
        title text,
        deleted varchar(50),
        abstract text,
        affiliation varchar(1000),
        journal varchar(1000),
        medline_ta varchar(100),
        country varchar(500),
        other_id varchar(200)
    );
    """)
except Exception as e:
    print("article", e)
try:
    curr.execute("""create table authors (