Пример #1
0
def scrape(abstract_url):
  """Scrape article metadata for the 'rup' scraper.

  Fetches ``abstract_url`` through ``get_tree`` and fills a blank article
  record from the page's meta tags and its abstract section.

  Args:
    abstract_url: URL of the article's abstract page.

  Returns:
    The record created by ``make_blank_article`` with the scraped fields
    filled in. Fields whose source is missing from the page keep the
    value supplied by the lookup helper or by the blank record.
  """
  tree, urls, _page_text = get_tree(abstract_url)

  article = make_blank_article()
  article['scraper'] = 'rup'
  article['source_urls'] = [uri for _, uri in urls]

  article['title'] = get_meta('DC.Title', tree)
  article['journal'] = get_meta('citation_journal_title', tree)
  article['publisher'] = get_meta('DC.Publisher', tree)
  article['author_names'] = get_meta_list('DC.Contributor', tree)

  # A page without an abstract section must not abort the whole scrape:
  # indexing an empty xpath result would raise IndexError, so only the
  # first paragraph is used when one is actually present.
  abstract_paragraphs = tree.xpath("//div[@class='section abstract']/p")
  if abstract_paragraphs:
    article['abstract'] = strip_space(abstract_paragraphs[0].text_content())

  article['citation']['journal'] = get_meta('citation_journal_abbrev', tree)
  article['citation']['volume'] = get_meta('citation_volume', tree)
  article['citation']['page'] = get_meta('citation_firstpage', tree)

  article['ids'] = {'doi': get_meta('DC.Identifier', tree)}

  # The date is treated as '-'-separated with the year first
  # (presumably YYYY-MM-DD -- confirm against real pages).
  pub_date = get_meta('DC.Date', tree)
  if pub_date:
    date_parts = pub_date.split('-')
    article['citation']['year'] = date_parts[0]
    # A full datestamp needs year, month and day; partial dates such as
    # '2012' or '2012-05' only provide the citation year.
    if len(date_parts) >= 3:
      article['date_published'] = make_datestamp(
          date_parts[2], date_parts[1], date_parts[0])

  return article
Пример #2
0
def scrape(abstract_url):
  """Scrape article metadata for the 'rsc' scraper.

  Fetches ``abstract_url`` through ``get_tree`` and fills a blank article
  record from the page's meta tags, the journal logo's title attribute
  and the abstract paragraph.

  Args:
    abstract_url: URL of the article's abstract page.

  Returns:
    The record created by ``make_blank_article`` with the scraped fields
    filled in. The journal name and abstract are best-effort: when the
    page has no matching element the blank record's value is kept.
  """
  tree, urls, _page_text = get_tree(abstract_url)

  article = make_blank_article()
  article['scraper'] = 'rsc'
  article['source_urls'] = [uri for _, uri in urls]
  article['ids'] = {'doi': get_meta('DC.Identifier', tree)}

  article['title'] = get_meta('DC.title', tree)
  article['publisher'] = get_meta('DC.publisher', tree)
  article['author_names'] = get_meta_list('DC.Creator', tree)

  # Best-effort lookups: test for an empty result explicitly instead of
  # hiding every possible error behind a bare ``except``.
  journal_titles = tree.xpath("//img[@id='imgLoader']/@title")
  if journal_titles:
    article['journal'] = journal_titles[0]

  abstract_paragraphs = tree.xpath(
      "//p[@xmlns='http://www.rsc.org/schema/rscart38']")
  if abstract_paragraphs:
    article['abstract'] = abstract_paragraphs[0].text_content()

  article['citation']['journal'] = get_meta('citation_journal_title', tree)
  article['citation']['volume'] = get_meta('citation_volume', tree)
  article['citation']['page'] = get_meta('citation_firstpage', tree)

  # The date is treated as '-'-separated with the year first
  # (presumably YYYY-MM-DD -- confirm against real pages).
  pub_date = get_meta('citation_publication_date', tree)
  if pub_date:
    date_parts = pub_date.split('-')
    article['citation']['year'] = date_parts[0]
    # Partial dates ('2012', '2012-05') cannot form a full datestamp and
    # would otherwise raise IndexError.
    if len(date_parts) >= 3:
      article['date_published'] = make_datestamp(
          date_parts[2], date_parts[1], date_parts[0])

  return article
Пример #3
0
def scrape(abstract_url):
  """Scrape article metadata for the 'MIT' scraper.

  Fetches ``abstract_url`` through ``get_tree`` and fills a blank article
  record from the page's meta tags, the journal header image, the
  abstract section and the free-text citation line after the header.

  Args:
    abstract_url: URL of the article's abstract page.

  Returns:
    The record created by ``make_blank_article`` with the scraped fields
    filled in. Fields whose source is missing from the page keep the
    value supplied by the lookup helper or by the blank record.
  """
  tree, urls, _page_text = get_tree(abstract_url)

  article = make_blank_article()
  article['scraper'] = 'MIT'
  article['source_urls'] = [uri for _, uri in urls]

  article['title'] = get_meta('dc.Title', tree)
  # Only strip when a value was found; calling .strip() on a missing
  # publisher would raise AttributeError.
  publisher = get_meta('dc.Publisher', tree)
  article['publisher'] = publisher.strip() if publisher else publisher
  article['author_names'] = get_meta_list('dc.Creator', tree)

  # Two identifier schemes used --- do we want both?
  # ``or []`` keeps zip() from failing when no identifiers are found.
  identifiers = get_meta_list('dc.Identifier', tree) or []
  article['ids'] = dict(zip(['publisher-id', 'doi'], identifiers))

  journal_names = tree.xpath("//h1[@class='journalTitle']/a/img/@alt")
  if journal_names:
    article['journal'] = journal_names[0]
    # Stored on the blank record's citation dict, which is no longer
    # replaced further down (the replacement used to drop this value).
    article['citation']['journal'] = journal_names[0]

  # dc.Description is present, but contains an abbreviated abstract
  # --- this gets the full abstract
  abstract_texts = tree.xpath("//div[@class='abstractSection']/p/text()")
  if abstract_texts:
    article['abstract'] = abstract_texts[0]

  # Citation details (volume, number, pages) given as text
  # immediately following the h1 tag.
  # example: December 2012, Vol. 24, No. 2, Pages 1-35
  citation_texts = tree.xpath(
      "//h1[@class='journalTitle']/following-sibling::text()")
  numbers = re.findall(r'\d+', citation_texts[0]) if citation_texts else []

  # The positional meaning of the numbers is only known for the
  # five-number layout shown above; any other layout is left unparsed
  # rather than raising ValueError on the unpack.
  if len(numbers) == 5:
    pub_year, volume, number, page_first, page_last = numbers
    article['citation']['volume'] = volume
    article['citation']['number'] = number
    article['citation']['page_first'] = page_first
    article['citation']['page_last'] = page_last
    # Fallback year; overridden below when dc.Date is available.
    article['citation']['year'] = pub_year

  # Check the raw value before splitting: str.split() always returns a
  # non-empty list, so testing the split result can never be false.
  pub_date = get_meta('dc.Date', tree)
  if pub_date:
    date_parts = pub_date.split('-')
    article['citation']['year'] = date_parts[0]
    # A full datestamp needs year, month and day.
    if len(date_parts) >= 3:
      article['date_published'] = make_datestamp(
          date_parts[2], date_parts[1], date_parts[0])

  return article
Пример #4
0
def scrape(abstract_url):
  """Scrape article metadata for the 'npg' scraper.

  Requests ``abstract_url`` (following the response chain via
  ``utils.get_response_chain``), transparently un-gzips the body, parses
  it with lxml and fills a blank article record from the page's meta
  tags. Both ``DC.*`` and ``dc.*`` spellings of the Dublin Core tags are
  tried.

  Args:
    abstract_url: URL of the article's abstract page.

  Returns:
    The record created by ``make_blank_article`` with the scraped fields
    filled in. The abstract is best-effort, taken from the first source
    that yields one; ``ids['doi']`` is None when the page has no
    ``citation_doi`` meta tag.
  """
  req = urllib2.Request(abstract_url, headers=utils.headers)
  urls, response = utils.get_response_chain(req)

  if response.info().get('Content-Encoding') == 'gzip':
    buf = StringIO(response.read())
    f = gzip.GzipFile(fileobj=buf)
    data = f.read()
  else:
    data = response.read()

  page_text = data.decode('utf-8')
  tree = lxml.html.fromstring(page_text)

  article = make_blank_article()
  article['scraper'] = 'npg'
  article['source_urls'] = [uri for _, uri in urls]

  # Each Dublin Core tag is looked up under its 'DC.' spelling first and
  # under the lower-case 'dc.' spelling when that yields nothing.
  article['title'] = get_meta('DC.title', tree)
  if article['title'] is None:
    article['title'] = get_meta('dc.title', tree)

  article['publisher'] = get_meta('DC.publisher', tree)
  if article['publisher'] is None:
    article['publisher'] = get_meta('dc.publisher', tree)

  article['author_names'] = get_meta_list('DC.creator', tree)
  if article['author_names'] is None:
    article['author_names'] = get_meta_list('dc.creator', tree)

  # Abstract: meta description first, then the page-body fallbacks in
  # order, stopping at the first one that produces text.
  article['abstract'] = get_meta('description', tree)
  for fallback_xpath in ("//div[@class='content']/p", "//div[@id='abs']/p"):
    if article['abstract']:
      break
    paragraphs = tree.xpath(fallback_xpath)
    if paragraphs:
      article['abstract'] = paragraphs[0].text_content()

  article['citation']['journal'] = get_meta('citation_journal_title', tree)
  article['citation']['volume'] = get_meta('prism.volume', tree)
  article['citation']['page'] = get_meta('prism.startingPage', tree)

  article['journal'] = get_meta('prism.publicationName', tree)

  year = get_meta('citation_date', tree)
  if year:
    article['citation']['year'] = year[0:4]

  # Strip a leading 'doi:' only when it is really there; slicing four
  # characters unconditionally would truncate a bare DOI.
  doi_values = tree.xpath("//meta[@name='citation_doi']/@content")
  doi = doi_values[0] if doi_values else None
  if doi and doi[:4].lower() == 'doi:':
    doi = doi[4:]
  article['ids'] = {'doi': doi}

  pub_date = get_meta('DC.date', tree)
  if pub_date is None:
    pub_date = get_meta('dc.date', tree)

  if pub_date:
    date_parts = pub_date.split('-')
    # A full datestamp needs year, month and day; partial dates would
    # otherwise raise IndexError.
    if len(date_parts) >= 3:
      article['date_published'] = make_datestamp(
          date_parts[2], date_parts[1], date_parts[0])

  # Specific abstract scrapers for subsidiary journals: when the journal
  # is listed and its element exists, it overrides the generic abstract.
  journal_abstract_xpaths = {
    'The EMBO Journal': "//p[@class='lead']",
    'EMBO reports': "//p[@class='lead']",
    'Oncogene': "//p[@class='abs lead']",
  }
  specific_xpath = journal_abstract_xpaths.get(article['journal'])
  if specific_xpath:
    lead_paragraphs = tree.xpath(specific_xpath)
    if lead_paragraphs:
      article['abstract'] = lead_paragraphs[0].text_content()

  return article
Пример #5
0
def scrape_tree(tree, urls, page_text):
    """Build an article record from an already-parsed page ('default' scraper).

    The lookup keys (``title``, ``authors``, ``authors_list``,
    ``abstract``, ``doi``, ``journal``, ``publisher``, ``volume``,
    ``firstpage``, ``lastpage``, ``keywords``, ``publication_date``) are
    names defined outside this function and are passed straight to
    ``get_meta`` / ``get_meta_list``.

    Args:
        tree: Parsed page handed to the meta-tag lookup helpers.
        urls: Iterable of pairs whose second item is a source URL.
        page_text: Unused here; kept so the signature stays unchanged.

    Returns:
        A plain dict with the scraped fields. ``date_published`` and
        ``citation['year']`` are only present when a publication date
        was found.
    """
    article = {}
    article['source_urls'] = [uri for _, uri in urls]
    article['scraper'] = 'default'

    article['title'] = get_meta(title, tree)
    article['author_names'] = get_meta_list(authors, tree)

    if article['author_names'] is None:
        # Fall back to the single ';'-separated author tag. It may be
        # missing as well, in which case author_names stays None instead
        # of raising AttributeError on None.split().
        joined_authors = get_meta(authors_list, tree)
        if joined_authors:
            # Trim the whitespace around each name and skip empty
            # entries such as the one produced by a trailing ';'.
            article['author_names'] = [
                name.strip()
                for name in joined_authors.split(';')
                if name.strip()
            ]

    article['abstract'] = get_meta(abstract, tree)

    article_doi = get_meta(doi, tree)
    if article_doi:
        # Reduce a resolver URL to the bare DOI; the https and doi.org
        # forms are handled alongside the original http://dx.doi.org/.
        for resolver_prefix in ('https://dx.doi.org/', 'http://dx.doi.org/',
                                'https://doi.org/', 'http://doi.org/'):
            article_doi = article_doi.replace(resolver_prefix, '')
    article['ids'] = {'doi': article_doi}

    article['journal'] = get_meta(journal, tree)
    article['publisher'] = get_meta(publisher, tree)

    article['citation'] = {}
    article['citation']['journal'] = article['journal']
    article['citation']['volume'] = get_meta(volume, tree)
    article['citation']['page_first'] = get_meta(firstpage, tree)
    article['citation']['page_last'] = get_meta(lastpage, tree)

    article['keywords'] = get_meta(keywords, tree)
    date = get_meta(publication_date, tree)
    if date:
        date = parse_date(date)
        # NOTE(review): time.mktime interprets the time tuple in the
        # local timezone -- confirm that is the intended convention.
        article['date_published'] = time.mktime(date.timetuple())
        article['citation']['year'] = date.year

    return article