def scrape(abstract_url):
    """Scrape an NPG-hosted abstract page (EMBO Journal / EMBO reports / Oncogene).

    Delegates the generic work to ``sn.scrape`` and then fills in the
    journal-specific abstract element, whose CSS class differs per title.
    Returns the article dict; ``abstract`` stays ``None`` when the element
    is not found (best-effort).
    """
    tree, urls, page_text = get_tree(abstract_url)
    article = sn.scrape(abstract_url)
    article['abstract'] = None

    # Both EMBO titles share identical markup (the original duplicated the
    # branch); Oncogene uses a different class on the abstract paragraph.
    if article['journal'] in ('The EMBO Journal', 'EMBO reports'):
        abstract_xpath = "//p[@class='lead']"
    elif article['journal'] == 'Oncogene':
        abstract_xpath = "//p[@class='abs lead']"
    else:
        abstract_xpath = None

    if abstract_xpath is not None:
        try:
            article['abstract'] = tree.xpath(abstract_xpath)[0].text_content()
        except IndexError:
            # Element missing on the page; keep abstract as None.
            pass
    return article
def scrape(abstract_url):
    """Scrape a Rockefeller University Press ('rup') abstract page.

    Pulls bibliographic data from the Dublin Core / citation meta tags and
    the abstract from the page body, returning a populated article dict.
    """
    tree, urls, page_text = get_tree(abstract_url)

    article = make_blank_article()
    article['scraper'] = 'rup'
    article['source_urls'] = [uri for _, uri in urls]

    # Core bibliographic metadata from <meta> tags.
    article['title'] = get_meta('DC.Title', tree)
    article['journal'] = get_meta('citation_journal_title', tree)
    article['publisher'] = get_meta('DC.Publisher', tree)
    article['author_names'] = get_meta_list('DC.Contributor', tree)

    # Full abstract lives in the page body rather than a meta tag.
    abstract_node = tree.xpath("//div[@class='section abstract']/p")[0]
    article['abstract'] = strip_space(abstract_node.text_content())

    citation = article['citation']
    citation['journal'] = get_meta('citation_journal_abbrev', tree)
    citation['volume'] = get_meta('citation_volume', tree)
    citation['page'] = get_meta('citation_firstpage', tree)

    article['ids'] = {'doi': get_meta('DC.Identifier', tree)}

    # DC.Date is presumably YYYY-MM-DD — make_datestamp takes (day, month, year).
    pub_date = get_meta('DC.Date', tree)
    if pub_date:
        parts = pub_date.split('-')
        article['date_published'] = make_datestamp(parts[2], parts[1], parts[0])
        citation['year'] = parts[0]
    return article
def scrape(abstract_url):
    """Scrape a Royal Society of Chemistry ('rsc') abstract page.

    Returns a populated article dict.  Journal name and abstract are
    best-effort: they are left unset when their elements are missing.
    """
    tree, urls, page_text = get_tree(abstract_url)
    article = make_blank_article()
    article['scraper'] = 'rsc'
    article['source_urls'] = [uri for _, uri in urls]
    article['ids'] = {'doi': get_meta('DC.Identifier', tree)}
    article['title'] = get_meta('DC.title', tree)
    article['publisher'] = get_meta('DC.publisher', tree)
    article['author_names'] = get_meta_list('DC.Creator', tree)

    # The journal name only appears as the title attribute of a banner image.
    # Narrowed from a bare except: only a missing element is expected here.
    try:
        article['journal'] = tree.xpath("//img[@id='imgLoader']/@title")[0]
    except IndexError:
        pass  # banner missing; leave journal unset

    try:
        article['abstract'] = tree.xpath(
            "//p[@xmlns='http://www.rsc.org/schema/rscart38']"
        )[0].text_content()
    except IndexError:
        pass  # abstract element missing; leave unset

    article['citation']['journal'] = get_meta('citation_journal_title', tree)
    article['citation']['volume'] = get_meta('citation_volume', tree)
    article['citation']['page'] = get_meta('citation_firstpage', tree)

    # citation_publication_date is presumably YYYY-MM-DD;
    # make_datestamp takes (day, month, year).
    pub_date = get_meta('citation_publication_date', tree)
    if pub_date:
        split = pub_date.split('-')
        article['date_published'] = make_datestamp(split[2], split[1], split[0])
        article['citation']['year'] = split[0]
    return article
def scrape(abstract_url):
    """Scrape an 'AA' abstract page by text-parsing the //div[@id='head'] block.

    The page exposes little structured metadata, so dates, abstract and
    citation details are recovered by splitting the head text on known
    markers ("Abstract", "Key words", "Accepted: ").  Any deviation from
    the expected layout raises (KeyError/IndexError/ValueError), which
    doubles as format validation.
    """
    tree, urls, page_text = utils.get_tree(abstract_url)
    article = make_blank_article()
    article['scraper'] = 'AA'
    article['source_urls'] = [uri for _, uri in urls]

    try:
        article['title'] = tree.xpath("//div[@id='head']/h2")[0].text_content()
    except IndexError:
        pass  # title heading missing; leave unset (best-effort)

    article['author_names'] = tree.xpath("//div[@id='head']/p")[0].text_content()
    # First keyword doubles as the journal name on these pages.
    article['journal'] = tree.xpath("//meta[@name='keywords']/@content")[0].split(',')[0]

    # Parse the publication date out of the history paragraph.
    # NOTE: this value is overwritten below by the "Accepted:" date, but the
    # parse is kept so a malformed history line still fails loudly here.
    info = tree.xpath("//div[@id='head']//p[@class='history']")[0].text_content()
    pubdate = info.split(' ')[4:]
    pubdate[1] = months[pubdate[1]]  # month name -> month number
    day, month, year = int(pubdate[0]), int(pubdate[1]), int(pubdate[2])
    pubdateuni = time.mktime(datetime.date(year, month, day).timetuple())
    article['date_published'] = pubdateuni

    # The abstract is the text between "Abstract" and "Key words".
    info = tree.xpath("//div[@id='head']")[0].text_content()
    ab1 = info.split("Abstract")[1]
    ab2 = ab1.split("Key words")[0]
    article['abstract'] = ab2

    # Use the acceptance date as the published date (overrides the above).
    rec1 = info.split("Accepted: ")[1]
    rec2 = rec1.split("\nAbstract")[0]
    day, month, year = rec2.split(' ')
    article['date_published'] = make_datestamp(day, months[month], year)
    article['citation']['year'] = year

    # Citation details precede the title in the head text.
    issueinfo = info.split(article['title'])[0]
    jour, vol, num, yea = issueinfo.split(' ')[0], issueinfo.split(' ')[1],\
        issueinfo.split(' ')[2], issueinfo.split(' ')[3]
    article['citation']['journal'] = jour
    article['citation']['volume'] = vol.split(',')[0]

    # DOI is embedded in the source URL query string.
    doi = article['source_urls'][0].split('doi=')[1]
    doi2 = doi.split('&')[0]
    # BUG FIX: the original stored the bare DOI string; every other scraper
    # stores ids as a dict keyed by scheme, so do the same here.
    article['ids'] = {'doi': doi2}
    return article
def scrape(abstract_url):
    """Scrape *abstract_url* and validate the result.

    Returns the article dict produced by ``scrape_tree``.

    Raises:
        FailedToScrape: if any field listed in NECESSARY_FIELDS is absent
            or empty, after printing the offending field name.
    """
    tree, urls, page_text = get_tree(abstract_url)
    article = scrape_tree(tree, urls, page_text)
    for field in NECESSARY_FIELDS:
        # get() covers both "key missing" and "value falsy" in one lookup.
        if not article.get(field):
            print(field)  # parenthesized: valid as both Py2 statement and Py3 call
            raise FailedToScrape
    return article
def scrape(abstract_url):
    """Scrape a Cell Press abstract page.

    Delegates metadata extraction to the generic meta scraper, then
    recovers the abstract from the page body: everything after the first
    "Summary" marker inside div#load.
    """
    tree, urls, page_text = utils.get_tree(abstract_url)
    article = meta_scrape(abstract_url)
    body_text = tree.xpath("//div[@id='load']")[0].text_content()
    article['abstract'] = body_text.split("Summary")[1]
    article['scraper'] = 'cell'
    return article
def scrape(abstract_url):
    """Scrape an MIT Press journal abstract page.

    Returns a populated article dict with title, authors, ids, abstract,
    and citation details parsed from the meta tags and page body.
    """
    tree, urls, page_text = get_tree(abstract_url)
    article = make_blank_article()
    article['scraper'] = 'MIT'
    article['source_urls'] = [uri for _, uri in urls]

    article['title'] = get_meta('dc.Title', tree)
    article['publisher'] = get_meta('dc.Publisher', tree).strip()
    article['author_names'] = get_meta_list('dc.Creator', tree)

    # Two identifier schemes used --- do we want both?
    article['ids'] = dict(zip(['publisher-id', 'doi'],
                              get_meta_list('dc.Identifier', tree)))

    article['journal'] = tree.xpath("//h1[@class='journalTitle']/a/img/@alt")[0]

    # dc.Description is present, but contains an abbreviated abstract
    # --- this gets the full abstract.
    article['abstract'] = tree.xpath("//div[@class='abstractSection']/p/text()")[0]

    article['citation']['journal'] = article['journal']

    # Citation details (volume, number, pages) given as text immediately
    # following the h1 tag, e.g. "December 2012, Vol. 24, No. 2, Pages 1-35".
    citation_text = tree.xpath(
        "//h1[@class='journalTitle']/following-sibling::text()")[0]
    pub_year, volume, number, page_first, page_last = re.findall(r'\d+', citation_text)

    # BUG FIX: the original rebound article['citation'] to a fresh {} here,
    # silently discarding the 'journal' key set above.  Update the existing
    # citation dict in place instead.
    article['citation']['volume'] = volume
    article['citation']['number'] = number
    article['citation']['page_first'] = page_first
    article['citation']['page_last'] = page_last

    # dc.Date is presumably YYYY-MM-DD — TODO confirm; make_datestamp takes
    # (day, month, year).  str.split always yields at least one element, so
    # the guard mirrors the original's (always-true) check.
    date = get_meta('dc.Date', tree).split('-')
    if date:
        article['date_published'] = make_datestamp(date[2], date[1], date[0])
        article['citation']['year'] = date[0]
    return article