def adate_extractor(article, year_info_only=False):
    """Extract ArticleDate information from an Article in the Medline dataset.

    Parameters
    ----------
    article: Element
        The 'Article' field in the Medline dataset
    year_info_only: bool
        if True, this tool will only attempt to extract year information
        from ArticleDate.
        if False, an attempt will be made to harvest all available
        ArticleDate information.
        If only year and month information is available, this will yield
        a date of the form 'YYYY-MM'.
        If year, month and day information is available, a date of the
        form 'YYYY-MM-DD' will be returned.

    Returns
    -------
    ArticleDate: str or None
        Date extracted from the article's 'ArticleDate' element.
        None is returned when the article has no 'ArticleDate' element or
        that element has no children.
        An empty string is returned when the element carries neither a
        'Year' nor a 'MedlineDate' with a four-digit year in it.
        Note: If year_info_only is False and a month could not be
        extracted this falls back to year automatically.
    """
    issue_date = article.find("ArticleDate")
    # Explicit checks instead of ``not issue_date``: the truth value of an
    # Element reflects its number of children and testing it is deprecated.
    # ``len(...) == 0`` keeps the historical "childless element -> None".
    if issue_date is None or len(issue_date) == 0:
        return None

    day = None
    month = None
    year_node = issue_date.find("Year")
    if year_node is not None:
        year = year_node.text
        if not year_info_only:
            month_node = issue_date.find("Month")
            if month_node is not None:
                month = month_or_day_formater(month_node.text)
                # A day is only meaningful (and only ever returned) when
                # a month is present as well.
                day_node = issue_date.find("Day")
                if day_node is not None:
                    day = month_or_day_formater(day_node.text)
    else:
        medline_node = issue_date.find("MedlineDate")
        if medline_node is not None:
            # MedlineDate is free text; take the first 4-digit run as the
            # year. ``or ""`` guards against an element without text.
            years = re.findall(r"\d{4}", medline_node.text or "")
            year = years[0] if years else ""
        else:
            year = ""

    if year_info_only or month is None:
        return year
    return "-".join(str(x) for x in filter(None, [year, month, day]))
def date_extractor(journal, year_info_only):
    """Extract PubDate information from a Journal in the Medline dataset.

    Parameters
    ----------
    journal: Element
        The 'Journal' field in the Medline dataset
    year_info_only: bool
        if True, this tool will only attempt to extract year information
        from PubDate.
        if False, an attempt will be made to harvest all available PubDate
        information.
        If only year and month information is available, this will yield
        a date of the form 'YYYY-MM'.
        If year, month and day information is available, a date of the
        form 'YYYY-MM-DD' will be returned.

    Returns
    -------
    PubDate: str
        PubDate extracted from 'JournalIssue/PubDate'.
        An empty string is returned when 'JournalIssue' or 'PubDate' is
        missing, or when PubDate carries neither a 'Year' nor a
        'MedlineDate' containing a four-digit year.
        Note: If year_info_only is False and a month could not be
        extracted this falls back to year automatically.
    """
    # NOTE(review): this file defines ``date_extractor`` a second time right
    # after this definition; Python binds the later one.

    # Function-scope import so the block does not depend on the file's
    # import section.
    import re

    # ``find`` returns the first matching child or None, so a missing
    # JournalIssue/PubDate yields "" instead of raising.
    issue = journal.find('JournalIssue')
    issue_date = issue.find('PubDate') if issue is not None else None
    if issue_date is None:
        return ""

    day = None
    month = None
    year_node = issue_date.find('Year')
    if year_node is not None:
        year = year_node.text
        if not year_info_only:
            month_node = issue_date.find('Month')
            if month_node is not None:
                month = month_or_day_formater(month_node.text)
                # A day is only ever returned together with a month.
                day_node = issue_date.find('Day')
                if day_node is not None:
                    day = month_or_day_formater(day_node.text)
    else:
        medline_node = issue_date.find('MedlineDate')
        if medline_node is not None:
            # MedlineDate is free text ("1998 Dec-1999 Jan", "Spring 2000",
            # "1976-1977"); the first whitespace-separated token is not
            # always a year, so pick the first 4-digit run instead.
            years = re.findall(r"\d{4}", medline_node.text or "")
            year = years[0] if years else ""
        else:
            year = ""

    if year_info_only or month is None:
        return year
    return "-".join(str(x) for x in filter(None, [year, month, day]))
def date_extractor(journal, year_info_only):
    """Extract PubDate information from a Journal in the Medline dataset.

    Parameters
    ----------
    journal: Element
        The 'Journal' field in the Medline dataset
    year_info_only: bool
        if True, this tool will only attempt to extract year information
        from PubDate.
        if False, an attempt will be made to harvest all available PubDate
        information.
        If only year and month information is available, this will yield
        a date of the form 'YYYY-MM'.
        If year, month and day information is available, a date of the
        form 'YYYY-MM-DD' will be returned.

    Returns
    -------
    PubDate: str
        PubDate extracted from 'JournalIssue/PubDate'.
        An empty string is returned when 'JournalIssue' or 'PubDate' is
        missing, or when PubDate carries neither a 'Year' nor a
        'MedlineDate' containing a four-digit year.
        Note: If year_info_only is False and a month could not be
        extracted this falls back to year automatically.
    """
    # NOTE(review): a definition with the same name appears immediately
    # before this one in the file; this later definition is the one Python
    # keeps bound.

    # Function-scope import so the block does not depend on the file's
    # import section.
    import re

    # ``find`` returns the first matching child or None, so a missing
    # JournalIssue/PubDate yields "" instead of raising.
    issue = journal.find('JournalIssue')
    if issue is None:
        return ""
    issue_date = issue.find('PubDate')
    if issue_date is None:
        return ""

    year = ""
    month = None
    day = None

    year_elem = issue_date.find('Year')
    medline_elem = issue_date.find('MedlineDate') if year_elem is None else None

    if year_elem is not None:
        year = year_elem.text
        if not year_info_only:
            month_elem = issue_date.find('Month')
            if month_elem is not None:
                month = month_or_day_formater(month_elem.text)
                # A day is only ever returned together with a month.
                day_elem = issue_date.find('Day')
                if day_elem is not None:
                    day = month_or_day_formater(day_elem.text)
    elif medline_elem is not None:
        # MedlineDate is free text ("1998 Dec-1999 Jan", "Spring 2000",
        # "1976-1977"); the first whitespace-separated token is not always
        # a year, so pick the first 4-digit run instead.
        found = re.findall(r"\d{4}", medline_elem.text or "")
        if found:
            year = found[0]

    if year_info_only or month is None:
        return year
    return "-".join(str(x) for x in filter(None, [year, month, day]))
def parse_daterevised(medline):
    """Parse the DateRevised field of a Medline document.

    Parameters
    ----------
    medline: Element
        The lxml node pointing to a medline document

    Returns
    -------
    DateRevised: str
        The available 'Year', 'Month' and 'Day' parts of the 'DateRevised'
        element joined with '-'. Month and day are passed through
        ``month_or_day_formater``; parts that are missing or empty are
        left out, and the day is left out whenever no month is available.
        An empty string is returned when the document has no
        'DateRevised' element.
    """
    date_revised = medline.find("DateRevised")
    if date_revised is None:
        return ""

    # ``findtext`` yields None for a missing child instead of raising
    # AttributeError the way ``find(...).text`` does.
    year = date_revised.findtext("Year")
    month_text = date_revised.findtext("Month")
    day_text = date_revised.findtext("Day")

    month = month_or_day_formater(month_text) if month_text else None
    # Without a month a trailing day would read as 'YYYY-DD', which is
    # indistinguishable from 'YYYY-MM'; drop it in that case.
    day = month_or_day_formater(day_text) if month and day_text else None

    return "-".join(str(part) for part in (year, month, day) if part)