def parse_medline_grant_id(path):
    """Collect the grant records of every citation in a Medline XML file.

    Parameters
    ----------
    path: str
        The path to the XML with the information

    Returns
    -------
    grant_id_list: list
        A flat list of dictionaries, one per grant found in the file.
        Each dictionary has the keys 'pmid', 'grant_id', 'grant_acronym',
        'country', and 'agency'

    Examples
    --------
    >>> pubmed_parser.parse_medline_grant_id('data/pubmed20n0014.xml.gz')
    [{
        'pmid': '399300',
        'grant_id': 'HL17731',
        'grant_acronym': 'HL',
        'country': 'United States',
        'agency': 'NHLBI NIH HHS'
    }, ...
    ]
    """
    root = read_xml(path)
    citations = root.findall("//MedlineCitationSet/MedlineCitation")
    if not citations:
        # Nothing found under a MedlineCitationSet: use PubmedArticle elements.
        citations = root.findall("//PubmedArticle")
    per_citation = [parse_grant_id(citation) for citation in citations]
    # Flatten the per-citation results into a single list.
    return list(chain.from_iterable(per_citation))
def get_medline_tree(path, to_string=False, encoding='utf-8'):
    """Read a Medline XML file and return its article elements.

    Parameters
    ----------
    path: str
        The path to the XML file
    to_string: bool
        If True, each element is serialized with `lxml.etree.tostring`
        instead of being returned as an element
    encoding: str
        Encoding passed to `lxml.etree.tostring` when `to_string=True`

    Returns
    -------
    medline_citations: list
        A list of lxml.etree._Element, each being a pubmed article, or a
        list of their serialized forms when `to_string=True`
    """
    root = read_xml(path)
    articles = root.findall("//MedlineCitationSet/MedlineCitation")
    if not articles:
        # Nothing found under a MedlineCitationSet: use PubmedArticle elements.
        articles = root.findall("//PubmedArticle")
    if not to_string:
        return articles
    return [
        lxml.etree.tostring(article, encoding=encoding)
        for article in articles
    ]
def parse_medline_xml(path, year_info_only=True, nlm_category=False):
    """Parse a file in the Medline XML format.

    The format is the one available at
    ftp://ftp.nlm.nih.gov/nlmdata/.medleasebaseline/gz/

    Parameters
    ----------
    path: str
        The path to the XML file
    year_info_only: bool
        if True, only the year is extracted from PubDate.
        if False, all available PubDate information is harvested, giving
        'YYYY-MM' when only year and month are available and 'YYYY-MM-DD'
        when the day is available as well.
        NOTE: the resolution of PubDate information in the Medline(R)
        database varies between articles.
        Defaults to True.
    nlm_category: bool, default False
        Forwarded to `parse_article_info`; selects whether sections of a
        structured abstract are labelled by their original Label or by
        their NLM category (see `parse_article_info` for the mapping).

    Returns
    -------
    article_list: list
        List of dictionaries containing information about articles in NLM
        format (see `parse_article_info`). Articles that have been deleted
        are appended with no information other than their `pmid` and the
        field `delete` being `True`
    """
    # Keys of a deleted-article record, in output order; 'pmid' and 'delete'
    # are overwritten below and every other field stays None.
    record_fields = (
        'title', 'abstract', 'journal', 'author', 'affiliation', 'pubdate',
        'pmid', 'other_id', 'pmc', 'mesh_terms', 'keywords', 'delete',
        'medline_ta', 'nlm_unique_id', 'issn_linking', 'country',
    )

    root = read_xml(path)
    citations = root.findall('//MedlineCitationSet/MedlineCitation')
    if not citations:
        citations = root.findall('//MedlineCitation')
    article_list = [
        parse_article_info(citation, year_info_only, nlm_category)
        for citation in citations
    ]

    for pmid_node in root.findall('//DeleteCitation/PMID'):
        record = dict.fromkeys(record_fields)
        record['pmid'] = pmid_node.text
        record['delete'] = True
        article_list.append(record)
    return article_list
def parse_medline_xml(path, year_info_only=True):
    """Parse a file in the Medline XML format.

    The format is the one available at
    ftp://ftp.nlm.nih.gov/nlmdata/.medleasebaseline/gz/

    Parameters
    ----------
    path: str
        The path to the XML file
    year_info_only: bool
        if True, only the year is extracted from PubDate.
        if False, all available PubDate information is harvested, giving
        'YYYY-MM' when only year and month are available and 'YYYY-MM-DD'
        when the day is available as well.
        NOTE: the resolution of PubDate information in the Medline(R)
        database varies between articles.
        Defaults to True.

    Returns
    -------
    article_list: list
        List of dictionaries containing information about articles in NLM
        format (see `parse_article_info`). Articles that have been deleted
        are appended with no information other than their `pmid` and the
        field `delete` being `True`
    """
    # Keys of a deleted-article record, in output order; 'pmid' and 'delete'
    # are overwritten below and every other field stays None.
    record_fields = (
        'title', 'abstract', 'journal', 'author', 'affiliation', 'pubdate',
        'pmid', 'other_id', 'pmc', 'mesh_terms', 'keywords', 'delete',
    )

    root = read_xml(path)
    citations = root.findall('//MedlineCitationSet/MedlineCitation')
    if not citations:
        citations = root.findall('//MedlineCitation')
    article_list = [
        parse_article_info(citation, year_info_only)
        for citation in citations
    ]

    for pmid_node in root.findall('//DeleteCitation/PMID'):
        record = dict.fromkeys(record_fields)
        record['pmid'] = pmid_node.text
        record['delete'] = True
        article_list.append(record)
    return article_list
def parse_medline_grant_id(path):
    """Parse grant id from Medline XML file

    Parameters
    ----------
    path: str
        The path to the XML with the information

    Returns
    -------
    grant_id_list: list
        Flat list of dictionaries, one per grant found in the file at
        `path`. Each dictionary holds the information returned by
        `parse_grant_id`
    """
    tree = read_xml(path)
    medline_citations = tree.xpath('//MedlineCitationSet/MedlineCitation')
    if not medline_citations:
        # Files whose citations are not direct children of a
        # MedlineCitationSet would otherwise silently produce an empty
        # result; look for MedlineCitation elements anywhere in the tree.
        medline_citations = tree.xpath('//MedlineCitation')
    # Flatten the per-citation results into a single list.
    return list(chain.from_iterable(map(parse_grant_id, medline_citations)))
def parse_medline_grant_id(path):
    """Collect the grant records of every citation in a Medline XML file.

    Parameters
    ----------
    path: str
        The path to the XML with the information

    Returns
    -------
    grant_id_list: list
        A flat list of dictionaries, one per grant found in the file.
        Each dictionary has the keys 'pmid', 'grant_id', 'grant_acronym',
        'country', and 'agency'
    """
    root = read_xml(path)
    citations = (
        root.findall("//MedlineCitationSet/MedlineCitation")
        # Nothing found under a MedlineCitationSet: use PubmedArticle elements.
        or root.findall("//PubmedArticle")
    )
    grants = []
    for citation in citations:
        # Each citation contributes zero or more grant dictionaries.
        grants.extend(parse_grant_id(citation))
    return grants
def parse_medline_grant_id(path):
    """Collect the grant records of every citation in a Medline XML file.

    Parameters
    ----------
    path: str
        The path to the XML with the information

    Returns
    -------
    grant_id_list: list
        Flat list of dictionaries, one per grant found in the file at
        `path`. Each dictionary holds the information returned by
        `parse_grant_id`
    """
    root = read_xml(path)
    citations = root.findall('//MedlineCitationSet/MedlineCitation')
    if not citations:
        # Nothing found under a MedlineCitationSet: search for
        # MedlineCitation elements anywhere in the tree.
        citations = root.findall('//MedlineCitation')
    # Flatten the per-citation results into a single list.
    return [
        grant
        for grants in [parse_grant_id(citation) for citation in citations]
        for grant in grants
    ]
def parse_medline_xml(
    path,
    year_info_only=True,
    nlm_category=False,
    author_list=False,
    reference_list=False,
):
    """Parse a file in the Medline XML format.

    The format is the one available at
    ftp://ftp.nlm.nih.gov/nlmdata/.medleasebaseline/gz/

    Parameters
    ----------
    path: str
        The path to the XML file
    year_info_only: bool
        if True, only the year is extracted from PubDate.
        if False, all available PubDate information is harvested, giving
        'YYYY-MM' when only year and month are available and 'YYYY-MM-DD'
        when the day is available as well.
        NOTE: the resolution of PubDate information in the Medline(R)
        database varies between articles.
        default: True
    nlm_category: bool
        Forwarded to `parse_article_info`; selects whether sections of a
        structured abstract are labelled by their original Label or by
        their NLM category (see `parse_article_info` for the mapping).
        default: False
    author_list: bool
        if True, return parsed author output as a list of authors
        if False, return parsed author output as a string of authors
        concatenated with ``;``
        default: False
    reference_list: bool
        if True, parse reference list as an output
        if False, return string of PMIDs concatenated with ;
        default: False

    Returns
    -------
    article_list: list
        A list of dictionaries containing information about articles in NLM
        format (see `parse_article_info`). Articles that have been deleted
        are appended with no information other than their `pmid` and the
        field `delete` being `True`
    """
    # Keys of a deleted-article record, in output order; 'pmid' and 'delete'
    # are overwritten per record and every other field stays NaN.
    record_fields = (
        "title", "abstract", "journal", "authors", "affiliations", "pubdate",
        "pmid", "doi", "other_id", "pmc", "mesh_terms", "keywords",
        "publication_types", "chemical_list", "delete", "medline_ta",
        "nlm_unique_id", "issn_linking", "country", "references",
    )

    root = read_xml(path)
    citations = root.findall("//MedlineCitationSet/MedlineCitation")
    if not citations:
        # Nothing found under a MedlineCitationSet: use PubmedArticle elements.
        citations = root.findall("//PubmedArticle")
    article_list = [
        parse_article_info(
            citation, year_info_only, nlm_category, author_list, reference_list
        )
        for citation in citations
    ]

    deleted_pmids = root.findall("//DeleteCitation/PMID")
    blank_record = dict.fromkeys(record_fields, np.nan)
    for pmid_node in deleted_pmids:
        # Copy the placeholder so key order is kept and records stay distinct.
        article_list.append(
            dict(blank_record, pmid=pmid_node.text.strip(), delete=True)
        )
    return article_list