Пример #1
0
def scrape(abstract_url):
  """Scrape article metadata for the 'iop' scraper.

  Fetches the page through ``get_tree`` and fills a blank article record
  from the page's meta tags.

  Args:
    abstract_url: URL of the abstract page to scrape.

  Returns:
    The record created by ``make_blank_article`` with the scraped fields
    filled in. A field whose meta tag is absent holds whatever
    ``get_meta`` / ``get_meta_list`` returned for it.
  """
  tree, urls, _page_text = get_tree(abstract_url)

  article = make_blank_article()
  article['scraper'] = 'iop'
  article['source_urls'] = [uri for _, uri in urls]

  article['publisher'] = get_meta('citation_publisher', tree)

  # Fall back to the alternative capitalisation of the tag name.
  article['title'] = get_meta('dc.title', tree)
  if article['title'] is None:
    article['title'] = get_meta('dc.Title', tree)

  article['author_names'] = get_meta_list('dc.creator', tree)
  if article['author_names'] is None:
    article['author_names'] = get_meta_list('dc.contributor', tree)

  article['abstract'] = get_meta('dc.description', tree)

  article['journal'] = get_meta('citation_journal_title', tree)

  article['citation']['journal'] = get_meta('citation_journal_abbrev', tree)
  article['citation']['volume'] = get_meta('citation_volume', tree)
  article['citation']['page'] = get_meta('citation_firstpage', tree)

  article['ids'] = {'doi': get_meta('citation_doi', tree)}

  pub_date = get_meta('citation_publication_date', tree)
  if pub_date:
    parts = pub_date.split('-')
    # The date is read as three '-'-separated components with the year
    # first. A shorter value cannot be turned into a datestamp, so the
    # blank defaults are kept instead of raising IndexError.
    if len(parts) >= 3:
      article['date_published'] = make_datestamp(parts[2], parts[1], parts[0])
      article['citation']['year'] = parts[0]

  return article
Пример #2
0
def scrape(abstract_url):
  """Scrape article metadata for the 'rup' scraper.

  Fetches the page through ``get_tree`` and fills a blank article record
  from the page's meta tags and its abstract section.

  Args:
    abstract_url: URL of the abstract page to scrape.

  Returns:
    The record created by ``make_blank_article`` with the scraped fields
    filled in. The abstract and the publication date keep their blank
    defaults when the page does not provide them in the expected form.
  """
  tree, urls, _page_text = get_tree(abstract_url)

  article = make_blank_article()
  article['scraper'] = 'rup'
  article['source_urls'] = [uri for _, uri in urls]

  article['title'] = get_meta('DC.Title', tree)
  article['journal'] = get_meta('citation_journal_title', tree)
  article['publisher'] = get_meta('DC.Publisher', tree)
  article['author_names'] = get_meta_list('DC.Contributor', tree)

  # A page without an abstract paragraph must not abort the whole scrape.
  abstract_nodes = tree.xpath("//div[@class='section abstract']/p")
  if abstract_nodes:
    article['abstract'] = strip_space(abstract_nodes[0].text_content())

  article['citation']['journal'] = get_meta('citation_journal_abbrev', tree)
  article['citation']['volume'] = get_meta('citation_volume', tree)
  article['citation']['page'] = get_meta('citation_firstpage', tree)

  article['ids'] = {'doi': get_meta('DC.Identifier', tree)}

  pub_date = get_meta('DC.Date', tree)
  if pub_date:
    parts = pub_date.split('-')
    # The date is read as three '-'-separated components with the year
    # first; anything shorter is skipped rather than raising IndexError.
    if len(parts) >= 3:
      article['date_published'] = make_datestamp(parts[2], parts[1], parts[0])
      article['citation']['year'] = parts[0]

  return article
Пример #3
0
def scrape(abstract_url):
  """Scrape article metadata for the 'rsc' scraper.

  Fetches the page through ``get_tree`` and fills a blank article record
  from the page's meta tags plus two XPath lookups (journal title and
  abstract).

  Args:
    abstract_url: URL of the abstract page to scrape.

  Returns:
    The record created by ``make_blank_article`` with the scraped fields
    filled in. Journal, abstract and publication date keep their blank
    defaults when the page does not provide them.
  """
  tree, urls, _page_text = get_tree(abstract_url)

  article = make_blank_article()
  article['scraper'] = 'rsc'
  article['source_urls'] = [uri for _, uri in urls]
  article['ids'] = {'doi': get_meta('DC.Identifier', tree)}

  article['title'] = get_meta('DC.title', tree)
  article['publisher'] = get_meta('DC.publisher', tree)
  article['author_names'] = get_meta_list('DC.Creator', tree)

  # Check the result list explicitly instead of swallowing every
  # exception: only a missing element is an expected condition here.
  journal_titles = tree.xpath("//img[@id='imgLoader']/@title")
  if journal_titles:
    article['journal'] = journal_titles[0]

  abstract_nodes = tree.xpath("//p[@xmlns='http://www.rsc.org/schema/rscart38']")
  if abstract_nodes:
    article['abstract'] = abstract_nodes[0].text_content()

  article['citation']['journal'] = get_meta('citation_journal_title', tree)
  article['citation']['volume'] = get_meta('citation_volume', tree)
  article['citation']['page'] = get_meta('citation_firstpage', tree)

  pub_date = get_meta('citation_publication_date', tree)
  if pub_date:
    parts = pub_date.split('-')
    # The date is read as three '-'-separated components with the year
    # first; anything shorter is skipped rather than raising IndexError.
    if len(parts) >= 3:
      article['date_published'] = make_datestamp(parts[2], parts[1], parts[0])
      article['citation']['year'] = parts[0]

  return article
Пример #4
0
def scrape(abstract_url):
  """Scrape article metadata for the 'wiley' scraper.

  The URL is first normalised with ``fix_wiley_url``, then the page is
  fetched through ``get_tree`` and a blank article record is filled from
  its meta tags and abstract section.

  Args:
    abstract_url: URL of the abstract page to scrape.

  Returns:
    The record created by ``make_blank_article`` with the scraped fields
    filled in. Fields the page does not provide keep their blank
    defaults; a missing abstract or date no longer aborts the scrape.
  """
  abstract_url = fix_wiley_url(abstract_url)
  tree, urls, _page_text = get_tree(abstract_url)

  article = make_blank_article()
  article['scraper'] = 'wiley'
  article['source_urls'] = [uri for _, uri in urls]

  # Best-effort metadata: a lookup that fails leaves the blank default in
  # place. ``Exception`` (rather than a bare ``except``) keeps
  # KeyboardInterrupt / SystemExit propagating.
  lookups = (
    ('journal', lambda: get_meta('citation_journal_title', tree)),
    ('title', lambda: get_meta('citation_title', tree)),
    ('ids', lambda: {'doi': get_meta('citation_doi', tree)}),
    ('author_names', lambda: get_meta_list('citation_author', tree)),
  )
  for field, lookup in lookups:
    try:
      article[field] = lookup()
    except Exception:
      pass

  # Prefer the regular abstract, fall back to the graphical abstract, and
  # keep the blank default when the page has neither.
  abstract_nodes = (tree.xpath("//div[@id='abstract']/div/p")
                    or tree.xpath("//div[@id='graphicalAbstract']/div/p"))
  if abstract_nodes:
    article['abstract'] = abstract_nodes[0].text_content()

  pub_date = get_meta('citation_publication_date', tree)
  if pub_date is None:
    pub_date = get_meta('citation_online_date', tree)

  # The date is read as exactly three '/'-separated parts: year, month, day.
  year = None
  if pub_date:
    parts = pub_date.split('/')
    if len(parts) == 3:
      year, month, day = parts
      article['date_published'] = make_datestamp(day, month, year)

  article['citation']['journal'] = article['journal']
  article['citation']['volume'] = get_meta('citation_volume', tree)
  if year is not None:
    article['citation']['year'] = year

  # A page range is recorded only when both ends are known; a literal '0'
  # is treated the same as a missing value.
  first_page = get_meta('citation_firstpage', tree)
  last_page = get_meta('citation_lastpage', tree)
  if first_page not in (None, '0') and last_page not in (None, '0'):
    article['citation']['page'] = first_page + '-' + last_page

  return article
Пример #5
0
def scrape(abstract_url):
  """Scrape article metadata for the 'MIT' scraper.

  Fetches the page through ``get_tree`` and fills a blank article record
  from its meta tags, the journal title image, the abstract section and
  the free-text citation line that follows the journal title heading.

  Args:
    abstract_url: URL of the abstract page to scrape.

  Returns:
    The record created by ``make_blank_article`` with the scraped fields
    filled in. The citation dict from the blank record is kept (it is no
    longer replaced by an empty dict), so the citation journal set here
    survives. Pieces the page does not provide keep their blank defaults.
  """
  tree, urls, _page_text = get_tree(abstract_url)

  article = make_blank_article()
  article['scraper'] = 'MIT'
  article['source_urls'] = [uri for _, uri in urls]

  article['title'] = get_meta('dc.Title', tree)
  publisher = get_meta('dc.Publisher', tree)
  article['publisher'] = publisher.strip() if publisher else publisher
  article['author_names'] = get_meta_list('dc.Creator', tree)

  # Two identifier schemes used --- do we want both?
  article['ids'] = dict(zip(['publisher-id', 'doi'], get_meta_list('dc.Identifier', tree)))

  journal_titles = tree.xpath("//h1[@class='journalTitle']/a/img/@alt")
  if journal_titles:
    article['journal'] = journal_titles[0]
    article['citation']['journal'] = journal_titles[0]

  # dc.Description is present, but contains an abbreviated abstract
  # --- this gets the full abstract
  abstract_texts = tree.xpath("//div[@class='abstractSection']/p/text()")
  if abstract_texts:
    article['abstract'] = abstract_texts[0]

  # Citation details (volume, number, pages) given as text
  # immediately following the h1 tag.
  # example: December 2012, Vol. 24, No. 2, Pages 1-35
  citation_texts = tree.xpath("//h1[@class='journalTitle']/following-sibling::text()")
  numbers = re.findall(r'\d+', citation_texts[0]) if citation_texts else []

  # Only the five-number layout shown above (year, volume, number, first
  # page, last page) can be mapped onto fields unambiguously.
  if len(numbers) == 5:
    _pub_year, volume, number, page_first, page_last = numbers
    article['citation']['volume'] = volume
    article['citation']['number'] = number
    article['citation']['page_first'] = page_first
    article['citation']['page_last'] = page_last

  pub_date = get_meta('dc.Date', tree)
  if pub_date:
    parts = pub_date.split('-')
    # The date is read as three '-'-separated components with the year
    # first; anything shorter is skipped rather than raising IndexError.
    if len(parts) >= 3:
      article['date_published'] = make_datestamp(parts[2], parts[1], parts[0])
      article['citation']['year'] = parts[0]

  return article
Пример #6
0
def scrape(abstract_url):
    """Scrape article metadata for the 'acs' scraper.

    Fetches the page through ``utils.get_tree`` and fills a blank article
    record from its meta tags and several XPath lookups.

    Args:
        abstract_url: URL of the abstract page to scrape.

    Returns:
        The record created by ``make_blank_article`` with the scraped
        fields filled in. Pieces the page does not provide (or provides
        in an unexpected form) keep their blank defaults.
    """
    tree, urls, _page_text = utils.get_tree(abstract_url)

    article = make_blank_article()
    article["scraper"] = "acs"
    article["source_urls"] = [uri for _, uri in urls]

    article["title"] = utils.get_meta("dc.Title", tree)
    article["publisher"] = utils.get_meta("dc.Publisher", tree)
    article["author_names"] = utils.get_meta_list("dc.Creator", tree)

    article["ids"] = {"doi": utils.get_meta("dc.Identifier", tree)}

    # Each optional element is looked up and checked explicitly instead
    # of hiding every possible error behind a bare ``except``.
    journal_titles = tree.xpath("//div[@id='journalTop']/div/a/img/@alt")
    if journal_titles:
        article["journal"] = journal_titles[0]

    abstract_nodes = tree.xpath("//div[@id='abstractBox']/p")
    if abstract_nodes:
        article["abstract"] = abstract_nodes[0].text_content()

    cite_nodes = tree.xpath("//div[@id='citation']/cite")
    if cite_nodes:
        article["citation"]["journal"] = cite_nodes[0].text

    volume_nodes = tree.xpath("//span[@class='citation_volume']")
    if volume_nodes:
        article["citation"]["volume"] = volume_nodes[0].text

    page_cite = tree.xpath("//div[@id='citation']")
    if page_cite:
        page = re.findall(r"pp\s([0-9]+)", page_cite[0].text_content())
        if page:
            article["citation"]["page"] = page[0]

    # The date is read as three whitespace-separated tokens: a key of
    # ``months``, a day token whose last character is dropped (presumably
    # a trailing comma, e.g. "December 12, 2012" -- TODO confirm), and
    # the year.
    pub_date = utils.get_meta("dc.Date", tree)
    date = pub_date.split() if pub_date else []
    if len(date) >= 3 and date[0] in months:
        article["date_published"] = utils.make_datestamp(date[1][:-1], months[date[0]], date[2])
        article["citation"]["year"] = date[2]

    return article
Пример #7
0
 def scrape_article(self, page_text=None):
     """Build an article record from the HTML text of a page.

     Returns None when no page text is given. Otherwise the text is
     parsed with ``html.fromstring`` and a blank article from
     ``self.make_blank_article`` is filled in: title, author names and
     abstract come from the page's meta data, the scraper label is
     "jhu", and every other field set here is an empty string.
     """
     if page_text is None:
         return None

     tree = html.fromstring(page_text)
     article = self.make_blank_article()

     # Top-level fields, assigned in a fixed order.
     top_level_fields = (
         ("scraper", "jhu"),
         ("source_urls", ""),
         ("publisher", ""),
         ("title", get_meta("article-title", tree)),
         ("author_names", get_meta_list("contrib", tree)),
         ("abstract", get_meta("abstract", tree)),
         ("journal", ""),
         ("date_published", ""),
     )
     for field_name, field_value in top_level_fields:
         article[field_name] = field_value

     # No citation details are extracted; every entry is blanked.
     citation = article["citation"]
     for citation_key in ("journal", "volume", "page", "year"):
         citation[citation_key] = ""

     article["ids"] = ""
     return article
Пример #8
0
def scrape_tree(tree, urls, page_text):
    """Build an article record from an already-parsed page.

    The meta tag selectors (``title``, ``authors``, ``authors_list``,
    ``abstract``, ``doi``, ``journal``, ``publisher``, ``volume``,
    ``firstpage``, ``lastpage``, ``keywords``, ``publication_date``) are
    names defined outside this function.

    Args:
        tree: Parsed page passed on to ``get_meta`` / ``get_meta_list``.
        urls: Iterable of pairs whose second item is stored as a source URL.
        page_text: Raw page text; not used by this function.

    Returns:
        A dict describing the article, with 'scraper' set to 'default'.
    """
    article = {}
    article['source_urls'] = [uri for _, uri in urls]
    article['scraper'] = 'default'

    article['title'] = get_meta(title, tree)
    article['author_names'] = get_meta_list(authors, tree)

    if article['author_names'] is None:
        # Fall back to a single ';'-separated author string. That tag may
        # be missing as well, so only split a value that is really there.
        joined_authors = get_meta(authors_list, tree)
        if joined_authors:
            article['author_names'] = joined_authors.split(';')

    article['abstract'] = get_meta(abstract, tree)

    article_doi = get_meta(doi, tree)
    if article_doi:
        article_doi = article_doi.replace('http://dx.doi.org/', '')
    article['ids'] = {'doi': article_doi}

    article['journal'] = get_meta(journal, tree)
    article['publisher'] = get_meta(publisher, tree)

    article['citation'] = {}
    article['citation']['journal'] = article['journal']
    article['citation']['volume'] = get_meta(volume, tree)
    article['citation']['page_first'] = get_meta(firstpage, tree)
    article['citation']['page_last'] = get_meta(lastpage, tree)

    article['keywords'] = get_meta(keywords, tree)
    date = get_meta(publication_date, tree)
    if date:
        date = parse_date(date)
        article['date_published'] = time.mktime(date.timetuple())
        article['citation']['year'] = date.year

    return article
Пример #9
0
def scrape(abstract_url):
  """Scrape article metadata for the 'npg' scraper.

  Requests the page with ``utils.headers``, transparently un-gzips the
  body when the response says it is gzip-encoded, parses it with lxml and
  fills a blank article record from its meta tags.

  Args:
    abstract_url: URL of the abstract page to scrape.

  Returns:
    The record created by ``make_blank_article`` with the scraped fields
    filled in. The DOI is None when the page has no ``citation_doi`` tag;
    a missing or malformed date keeps the blank default.
  """
  req = urllib2.Request(abstract_url, headers=utils.headers)
  urls, response = utils.get_response_chain(req)

  data = response.read()
  if response.info().get('Content-Encoding') == 'gzip':
    data = gzip.GzipFile(fileobj=StringIO(data)).read()

  page_text = data.decode('utf-8')
  tree = lxml.html.fromstring(page_text)

  article = make_blank_article()
  article['scraper'] = 'npg'
  article['source_urls'] = [uri for _, uri in urls]

  # Dublin Core tags are tried with both 'DC.' and 'dc.' prefixes.
  article['title'] = get_meta('DC.title', tree)
  if article['title'] is None:
    article['title'] = get_meta('dc.title', tree)

  article['publisher'] = get_meta('DC.publisher', tree)
  if article['publisher'] is None:
    article['publisher'] = get_meta('dc.publisher', tree)

  article['author_names'] = get_meta_list('DC.creator', tree)
  if article['author_names'] is None:
    article['author_names'] = get_meta_list('dc.creator', tree)

  # Abstract: meta description first, then two page layouts in turn.
  article['abstract'] = get_meta('description', tree)
  for abstract_xpath in ("//div[@class='content']/p", "//div[@id='abs']/p"):
    if article['abstract']:
      break
    nodes = tree.xpath(abstract_xpath)
    if nodes:
      article['abstract'] = nodes[0].text_content()

  article['citation']['journal'] = get_meta('citation_journal_title', tree)
  article['citation']['volume'] = get_meta('prism.volume', tree)
  article['citation']['page'] = get_meta('prism.startingPage', tree)

  article['journal'] = get_meta('prism.publicationName', tree)

  year = get_meta('citation_date', tree)
  if year:
    article['citation']['year'] = year[0:4]

  # The first four characters of the tag value are dropped (presumably a
  # "doi:" prefix -- TODO confirm). A page without the tag yields None
  # instead of raising IndexError.
  doi_values = tree.xpath("//meta[@name='citation_doi']/@content")
  article['ids'] = {'doi': doi_values[0][4:] if doi_values else None}

  pub_date = get_meta('DC.date', tree)
  if pub_date is None:
    pub_date = get_meta('dc.date', tree)

  if pub_date:
    parts = pub_date.split('-')
    # The date is read as three '-'-separated components with the year
    # first; anything shorter is skipped rather than raising IndexError.
    if len(parts) >= 3:
      article['date_published'] = make_datestamp(parts[2], parts[1], parts[0])

  # Specific abstract scrapers for subsidiary journals
  journal_abstract_xpaths = {
    'The EMBO Journal': "//p[@class='lead']",
    'EMBO reports': "//p[@class='lead']",
    'Oncogene': "//p[@class='abs lead']",
  }
  override_xpath = journal_abstract_xpaths.get(article['journal'])
  if override_xpath:
    nodes = tree.xpath(override_xpath)
    if nodes:
      article['abstract'] = nodes[0].text_content()

  return article
Пример #10
0
def get_meta(names, tree):
    """Return the first value ``get_meta_list(names, tree)`` yields.

    ``names`` and ``tree`` are passed to ``get_meta_list`` unchanged. When
    that call returns something falsy (for example None or an empty
    list), the visible code falls through without returning a value.
    """
    attributes = get_meta_list(names, tree)
    # Truthiness check: guards against both None and an empty result.
    if attributes:
        return attributes[0]