def scrape(url):
    # The plain-text abstract lives behind the same URL with ?fmt=txt appended.
    if "?fmt=txt" not in url:
        url = url + "?fmt=txt"
    urls, response = get_response_chain(url)
    abstxt = response.read()
    paper = build_paper(abstxt)
    # Record the whole redirect chain plus the canonical (non-txt) URL.
    paper["source_urls"] = [uri for _, uri in urls] + [url.replace("?fmt=txt", "")]
    return paper
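# A minimal usage sketch, assuming get_response_chain and build_paper are the
# module-level helpers used above; the URL below is purely hypothetical.
if __name__ == '__main__':
    paper = scrape('http://example.org/abs/1234.5678')
    print paper['source_urls']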
import gzip
import urllib2
from StringIO import StringIO

import lxml.html

import utils
# get_meta, get_meta_list, make_blank_article and make_datestamp are
# module-level helpers.


def scrape(abstract_url):
    req = urllib2.Request(abstract_url, headers=utils.headers)
    urls, response = utils.get_response_chain(req)

    # Some responses arrive gzip-compressed; decompress if necessary.
    if response.info().get('Content-Encoding') == 'gzip':
        data = gzip.GzipFile(fileobj=StringIO(response.read())).read()
    else:
        data = response.read()
    page_text = data.decode('utf-8')
    tree = lxml.html.fromstring(page_text)

    article = make_blank_article()
    article['scraper'] = 'npg'
    article['source_urls'] = [uri for _, uri in urls]

    # NPG pages are inconsistent about the case of their Dublin Core tags, so
    # fall back to the lowercase variant whenever the uppercase one is missing.
    article['title'] = get_meta('DC.title', tree)
    if article['title'] is None:
        article['title'] = get_meta('dc.title', tree)

    article['publisher'] = get_meta('DC.publisher', tree)
    if article['publisher'] is None:
        article['publisher'] = get_meta('dc.publisher', tree)

    article['author_names'] = get_meta_list('DC.creator', tree)
    if article['author_names'] is None:
        article['author_names'] = get_meta_list('dc.creator', tree)

    # Prefer the description meta tag for the abstract, then try two known
    # abstract containers in the page body.
    article['abstract'] = get_meta('description', tree)
    if not article['abstract']:
        try:
            article['abstract'] = tree.xpath("//div[@class='content']/p")[0].text_content()
        except IndexError:
            pass
    if not article['abstract']:
        try:
            article['abstract'] = tree.xpath("//div[@id='abs']/p")[0].text_content()
        except IndexError:
            pass

    article['citation']['journal'] = get_meta('citation_journal_title', tree)
    article['citation']['volume'] = get_meta('prism.volume', tree)
    article['citation']['page'] = get_meta('prism.startingPage', tree)
    article['journal'] = get_meta('prism.publicationName', tree)

    year = get_meta('citation_date', tree)
    if year:
        article['citation']['year'] = year[0:4]

    # The citation_doi value carries a 'doi:' prefix; strip the first four characters.
    article['ids'] = {'doi': tree.xpath("//meta[@name='citation_doi']/@content")[0][4:]}

    pub_date = get_meta('DC.date', tree)
    if pub_date is None:
        pub_date = get_meta('dc.date', tree)
    if pub_date:
        # DC.date is YYYY-MM-DD; make_datestamp takes day, month, year.
        split = pub_date.split('-')
        article['date_published'] = make_datestamp(split[2], split[1], split[0])

    # Specific abstract scrapers for subsidiary journals.
    if article['journal'] in ('The EMBO Journal', 'EMBO reports'):
        try:
            article['abstract'] = tree.xpath("//p[@class='lead']")[0].text_content()
        except IndexError:
            pass
    elif article['journal'] == 'Oncogene':
        try:
            article['abstract'] = tree.xpath("//p[@class='abs lead']")[0].text_content()
        except IndexError:
            pass

    return article
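# For reference, a plausible sketch of the get_meta and get_meta_list helpers
# the scraper above relies on. This is an assumption: the real module defines
# them elsewhere, and only the signatures and the return-None-on-miss
# behaviour are inferred from the call sites.
def get_meta(name, tree):
    # Return the content attribute of the first matching <meta> tag, or None.
    values = tree.xpath("//meta[@name=$name]/@content", name=name)
    return values[0] if values else None

def get_meta_list(name, tree):
    # Return the content of every matching <meta> tag, or None if none match.
    values = tree.xpath("//meta[@name=$name]/@content", name=name)
    return values if values else None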
import datetime
import re
import time
import urllib2

import lxml.html

import utils
# headers, parse_citation and recognise_journal are module-level helpers.


def scrape(abstract_url):
    req = urllib2.Request(abstract_url, headers=headers)
    urls, page = utils.get_response_chain(req)

    # Parse the HTML into a tree we can query.
    page_text = page.read().decode('utf-8')
    tree = lxml.html.fromstring(page_text, base_url=abstract_url)

    # XPath queries: the first <h1> holds the article title, and the second
    # <h2> holds the "cite as" line.
    title = tree.xpath('//h1')[0].text_content().strip()
    cite_as = tree.xpath('//h2')[1].text.strip()

    # Scrub the citation: collapse runs of whitespace and drop bracketed cruft.
    cite_as = re.sub(r'\s+', ' ', cite_as)
    cite_as = re.sub(r' \[.*?\]', '', cite_as)

    # Make our article object.
    article = {}
    article['scraper'] = 'pr'
    article['title'] = title
    article['cite'] = cite_as
    try:
        article['citation'] = parse_citation(cite_as)
    except Exception:
        pass

    # Grab all links inside the <div> with id='aps-authors' and take their
    # text as the author list.
    article['author_names'] = [author.text.strip()
                               for author in tree.xpath("//div[@id='aps-authors']//a")]
    try:
        article['author_names'].remove('Hide All Authors/Affiliations')
    except ValueError:
        pass

    # Find the div with class 'aps-abstractbox' and take the text of the
    # first <p> within it as the abstract.
    try:
        article['abstract'] = tree.xpath("//div[@class='aps-abstractbox']/p")[0].text_content()
    except IndexError:
        pass

    months = {'January': 1, 'February': 2, 'March': 3, 'April': 4,
              'May': 5, 'June': 6, 'July': 7, 'August': 8,
              'September': 9, 'October': 10, 'November': 11, 'December': 12}

    # e.g. "Received 21 December 2011; revised 18 February 2012; published 9 April 2012"
    date_received = re.findall(r'Received\s+([0-9]+)\s+([A-Za-z]+)\s+([0-9]+)', page_text)
    date_revised = re.findall(r'revised\s+([0-9]+)\s+([A-Za-z]+)\s+([0-9]+)', page_text)
    date_published = re.findall(r'published\s+([0-9]+)\s+([A-Za-z]+)\s+([0-9]+)', page_text)

    def make_datestamp(date_tuple):
        day, month_name, year = date_tuple
        return time.mktime(datetime.date(int(year), months[month_name], int(day)).timetuple())

    if date_received:
        article['date_received'] = make_datestamp(date_received[0])
    if date_revised:
        article['date_revised'] = make_datestamp(date_revised[0])
    if date_published:
        article['date_published'] = make_datestamp(date_published[0])

    # In the <div> with id 'aps-article-info', pair each bold label cell with
    # the value cell beside it to build the map of id types to ids.
    article['ids'] = dict(zip(
        [e.text.strip().lower().replace(':', '')
         for e in tree.xpath("//div[@id='aps-article-info']//div[@class='table-cell bold']")],
        [e.text.strip()
         for e in tree.xpath("//div[@id='aps-article-info']//div[@class='table-cell']")]))

    if 'subject areas' in article['ids']:
        article['pr_subject_areas'] = article['ids']['subject areas']
        del article['ids']['subject areas']

    article['journal'] = recognise_journal(page.geturl())
    article['source_urls'] = [uri for _, uri in urls]

    # PACS will be recognised as an id, even though it's actually a list of
    # categories. Split them out into their own custom field and delete from ids.
    if 'pacs' in article['ids']:
        article['categories'] = {'PACS': [c.strip() for c in article['ids']['pacs'].split(',')]}
        del article['ids']['pacs']

    return article
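# A minimal usage sketch for the scraper above, assuming headers,
# parse_citation and recognise_journal exist in this module; the URL is
# purely hypothetical.
if __name__ == '__main__':
    article = scrape('http://prl.aps.org/abstract/PRL/v108/i1/e010001')
    print article['title']
    print article['ids']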