def scrape(abstract_url):
    """Build an article record for the 'rup' scraper from an abstract page.

    The page at ``abstract_url`` is loaded with ``get_tree``. Most fields are
    looked up by name through ``get_meta`` / ``get_meta_list``; the abstract
    is the text of the first ``<p>`` under a ``<div>`` whose class attribute
    is exactly ``section abstract``, passed through ``strip_space``.

    Args:
        abstract_url: address of the abstract page to scrape.

    Returns:
        The record produced by ``make_blank_article()`` with the scraped
        fields filled in.

    Raises:
        IndexError: if the page has no matching abstract paragraph, or if
            the ``DC.Date`` value has fewer than three '-'-separated parts.
    """
    # The third element returned by get_tree is not needed by this scraper.
    tree, urls, _page_text = get_tree(abstract_url)

    record = make_blank_article()
    record['scraper'] = 'rup'
    record['source_urls'] = [link for _, link in urls]

    # Top-level fields that map one-to-one onto a named metadata entry.
    for field, meta_name in (
            ('title', 'DC.Title'),
            ('journal', 'citation_journal_title'),
            ('publisher', 'DC.Publisher')):
        record[field] = get_meta(meta_name, tree)
    record['author_names'] = get_meta_list('DC.Contributor', tree)

    abstract_paragraphs = tree.xpath("//div[@class='section abstract']/p")
    record['abstract'] = strip_space(abstract_paragraphs[0].text_content())

    # Citation sub-record, filled the same way.
    citation = record['citation']
    for field, meta_name in (
            ('journal', 'citation_journal_abbrev'),
            ('volume', 'citation_volume'),
            ('page', 'citation_firstpage')):
        citation[field] = get_meta(meta_name, tree)

    record['ids'] = {'doi': get_meta('DC.Identifier', tree)}

    published = get_meta('DC.Date', tree)
    if published:
        # NOTE(review): presumably 'YYYY-MM-DD', with make_datestamp taking
        # (day, month, year) -- confirm against make_datestamp.
        parts = published.split('-')
        record['date_published'] = make_datestamp(parts[2], parts[1], parts[0])
        citation['year'] = parts[0]

    return record
def scrape(abstract_url):
    """Scrape article metadata from an abstract page for the 'rsc' scraper.

    The page is loaded with ``get_tree`` and a blank record from
    ``make_blank_article()`` is filled from named metadata entries
    (``get_meta`` / ``get_meta_list``) plus two XPath lookups.

    The journal title and the abstract are optional: when the page has no
    matching node, the value already present in the blank record is left
    untouched.

    Args:
        abstract_url: address of the abstract page to scrape.

    Returns:
        The article record with the scraped fields filled in.

    Raises:
        IndexError: if ``citation_publication_date`` has fewer than three
            '-'-separated parts.
    """
    # page_text is returned by get_tree but not used by this scraper.
    tree, urls, page_text = get_tree(abstract_url)

    article = make_blank_article()
    article['scraper'] = 'rsc'
    article['source_urls'] = [uri for _, uri in urls]

    article['ids'] = {'doi': get_meta('DC.Identifier', tree)}

    article['title'] = get_meta('DC.title', tree)
    article['publisher'] = get_meta('DC.publisher', tree)
    article['author_names'] = get_meta_list('DC.Creator', tree)

    # Explicit emptiness checks instead of a bare ``except: pass`` so that
    # unrelated errors (and KeyboardInterrupt) are no longer swallowed.
    journal_titles = tree.xpath("//img[@id='imgLoader']/@title")
    if journal_titles:
        article['journal'] = journal_titles[0]

    abstract_nodes = tree.xpath(
        "//p[@xmlns='http://www.rsc.org/schema/rscart38']")
    if abstract_nodes:
        article['abstract'] = abstract_nodes[0].text_content()

    article['citation']['journal'] = get_meta('citation_journal_title', tree)
    article['citation']['volume'] = get_meta('citation_volume', tree)
    article['citation']['page'] = get_meta('citation_firstpage', tree)

    pub_date = get_meta('citation_publication_date', tree)
    if pub_date:
        # NOTE(review): presumably 'YYYY-MM-DD', with make_datestamp taking
        # (day, month, year) -- confirm against make_datestamp.
        split = pub_date.split('-')
        article['date_published'] = make_datestamp(split[2], split[1], split[0])
        article['citation']['year'] = split[0]

    return article
def scrape(abstract_url):
    """Scrape article metadata from an abstract page for the 'MIT' scraper.

    The page is loaded with ``get_tree`` and a blank record from
    ``make_blank_article()`` is filled from named metadata entries
    (``get_meta`` / ``get_meta_list``), from XPath lookups for the journal
    name and full abstract, and from the free-text citation line that
    follows the journal ``<h1>``.

    Args:
        abstract_url: address of the abstract page to scrape.

    Returns:
        The article record with the scraped fields filled in.

    Raises:
        IndexError: if the journal image, the abstract paragraph or the
            citation text is missing from the page, or if ``dc.Date`` has
            fewer than three '-'-separated parts.
        ValueError: if the citation text does not contain exactly five
            runs of digits.
    """
    # page_text is returned by get_tree but not used by this scraper.
    tree, urls, page_text = get_tree(abstract_url)

    article = make_blank_article()
    article['scraper'] = 'MIT'
    article['source_urls'] = [uri for _, uri in urls]

    article['title'] = get_meta('dc.Title', tree)

    # Only strip when a publisher was found; a missing tag stays as-is
    # instead of raising AttributeError on None.
    publisher = get_meta('dc.Publisher', tree)
    article['publisher'] = publisher.strip() if publisher else publisher

    article['author_names'] = get_meta_list('dc.Creator', tree)

    # Two identifier schemes used --- do we want both?
    # ``or []`` keeps zip() from failing when no identifiers are present.
    identifiers = get_meta_list('dc.Identifier', tree) or []
    article['ids'] = dict(zip(['publisher-id', 'doi'], identifiers))

    article['journal'] = tree.xpath("//h1[@class='journalTitle']/a/img/@alt")[0]

    # dc.Description is present, but contains an abbreviated abstract
    # --- this gets the full abstract
    article['abstract'] = tree.xpath("//div[@class='abstractSection']/p/text()")[0]

    # Citation details (volume, number, pages) given as text
    # immediately following the h1 tag.
    # example: December 2012, Vol. 24, No. 2, Pages 1-35
    citation_text = tree.xpath(
        "//h1[@class='journalTitle']/following-sibling::text()")[0]
    # The leading year is parsed but not used; citation year comes from
    # dc.Date below.
    _pub_year, volume, number, page_first, page_last = re.findall(
        r'\d+', citation_text)

    # Update the existing citation dict in place. The original replaced it
    # with a fresh {} here, which discarded the journal name set for it.
    citation = article['citation']
    citation['journal'] = article['journal']
    citation['volume'] = volume
    citation['number'] = number
    citation['page_first'] = page_first
    citation['page_last'] = page_last

    pub_date = get_meta('dc.Date', tree)
    if pub_date:
        # NOTE(review): presumably 'YYYY-MM-DD', with make_datestamp taking
        # (day, month, year) -- confirm against make_datestamp.
        date = pub_date.split('-')
        article['date_published'] = make_datestamp(date[2], date[1], date[0])
        citation['year'] = date[0]

    return article
def scrape(abstract_url):
    """Scrape article metadata from an abstract page for the 'npg' scraper.

    Unlike a plain ``get_tree`` call, this fetches the page itself so that a
    gzip-encoded response body can be decompressed before it is decoded as
    UTF-8 and parsed with ``lxml.html``.

    Title, publisher, authors and publication date are looked up under a
    ``DC.``-prefixed name first and a ``dc.``-prefixed name second. The
    abstract comes from the ``description`` metadata entry, falling back to
    two page-body XPath lookups, and is finally overridden by a
    journal-specific lookup for a few named journals.

    Args:
        abstract_url: address of the abstract page to scrape.

    Returns:
        The article record from ``make_blank_article()`` with the scraped
        fields filled in.

    Raises:
        IndexError: if the page has no ``citation_doi`` meta element, or if
            the publication date has fewer than three '-'-separated parts.
    """
    req = urllib2.Request(abstract_url, headers=utils.headers)
    urls, response = utils.get_response_chain(req)

    if response.info().get('Content-Encoding') == 'gzip':
        buf = StringIO(response.read())
        f = gzip.GzipFile(fileobj=buf)
        data = f.read()
    else:
        data = response.read()

    page_text = data.decode('utf-8')
    tree = lxml.html.fromstring(page_text)

    article = make_blank_article()
    article['scraper'] = 'npg'
    article['source_urls'] = [uri for _, uri in urls]

    article['title'] = get_meta('DC.title', tree)
    if article['title'] is None:
        article['title'] = get_meta('dc.title', tree)

    article['publisher'] = get_meta('DC.publisher', tree)
    if article['publisher'] is None:
        article['publisher'] = get_meta('dc.publisher', tree)

    article['author_names'] = get_meta_list('DC.creator', tree)
    if article['author_names'] is None:
        article['author_names'] = get_meta_list('dc.creator', tree)

    # Abstract: metadata entry first, then the first paragraph matched by
    # each fallback expression in turn. Emptiness is checked explicitly
    # rather than relying on a bare ``except: pass`` around ``[0]``.
    article['abstract'] = get_meta('description', tree)
    for fallback_xpath in ("//div[@class='content']/p", "//div[@id='abs']/p"):
        if article['abstract']:
            break
        paragraphs = tree.xpath(fallback_xpath)
        if paragraphs:
            article['abstract'] = paragraphs[0].text_content()

    article['citation']['journal'] = get_meta('citation_journal_title', tree)
    article['citation']['volume'] = get_meta('prism.volume', tree)
    article['citation']['page'] = get_meta('prism.startingPage', tree)

    article['journal'] = get_meta('prism.publicationName', tree)

    year = get_meta('citation_date', tree)
    if year:
        article['citation']['year'] = year[0:4]

    # The first four characters of the content attribute are dropped.
    # NOTE(review): presumably a 'doi:' prefix -- confirm against live pages.
    doi_content = tree.xpath("//meta[@name='citation_doi']/@content")[0]
    article['ids'] = {'doi': doi_content[4:]}

    pub_date = get_meta('DC.date', tree)
    if pub_date is None:
        pub_date = get_meta('dc.date', tree)

    if pub_date:
        # NOTE(review): presumably 'YYYY-MM-DD', with make_datestamp taking
        # (day, month, year) -- confirm against make_datestamp.
        split = pub_date.split('-')
        article['date_published'] = make_datestamp(split[2], split[1], split[0])

    # Specific abstract scrapers for subsidiary journals: when the journal
    # matches and the paragraph exists, it replaces whatever abstract was
    # found above.
    journal_abstract_xpaths = {
        'The EMBO Journal': "//p[@class='lead']",
        'EMBO reports': "//p[@class='lead']",
        'Oncogene': "//p[@class='abs lead']",
    }
    journal_xpath = journal_abstract_xpaths.get(article['journal'])
    if journal_xpath is not None:
        lead_paragraphs = tree.xpath(journal_xpath)
        if lead_paragraphs:
            article['abstract'] = lead_paragraphs[0].text_content()

    return article
def scrape_tree(tree, urls, page_text):
    """Build an article record for the 'default' scraper from a parsed page.

    Every field is looked up with ``get_meta`` / ``get_meta_list`` using
    lookup keys held in names defined outside this function (``title``,
    ``authors``, ``authors_list``, ``abstract``, ``doi``, ``journal``,
    ``publisher``, ``volume``, ``firstpage``, ``lastpage``, ``keywords``,
    ``publication_date``).

    Args:
        tree: parsed page, passed straight through to the lookup helpers.
        urls: iterable of 2-item pairs; the second item of each pair is
            recorded as a source URL.
        page_text: accepted for interface compatibility; not used here.

    Returns:
        A new dict holding the scraped fields. ``author_names`` stays
        ``None`` when neither author lookup finds anything.
    """
    article = {}

    article['source_urls'] = [uri for _, uri in urls]
    article['scraper'] = 'default'

    article['title'] = get_meta(title, tree)

    article['author_names'] = get_meta_list(authors, tree)
    if article['author_names'] is None:
        # Fall back to the single ';'-separated value, but only split it
        # when it exists: calling .split on None raised AttributeError.
        joined_authors = get_meta(authors_list, tree)
        if joined_authors is not None:
            article['author_names'] = joined_authors.split(';')

    article['abstract'] = get_meta(abstract, tree)

    article_doi = get_meta(doi, tree)
    if article_doi:
        # Reduce a resolver URL to the bare DOI.
        article_doi = article_doi.replace('http://dx.doi.org/', '')
    article['ids'] = {'doi': article_doi}

    article['journal'] = get_meta(journal, tree)
    article['publisher'] = get_meta(publisher, tree)

    article['citation'] = {}
    article['citation']['journal'] = article['journal']
    article['citation']['volume'] = get_meta(volume, tree)
    article['citation']['page_first'] = get_meta(firstpage, tree)
    article['citation']['page_last'] = get_meta(lastpage, tree)

    article['keywords'] = get_meta(keywords, tree)

    date = get_meta(publication_date, tree)
    if date:
        date = parse_date(date)
        # time.mktime interprets the time tuple as local time.
        article['date_published'] = time.mktime(date.timetuple())
        article['citation']['year'] = date.year

    return article