def scrape(abstract_url):
    """Scrape an NPG-hosted abstract page (EMBO Journal / EMBO reports / Oncogene).

    Delegates the generic work to ``sn.scrape`` and then fills in the
    journal-specific abstract element, whose CSS class differs per title.
    Returns the article dict; ``abstract`` stays ``None`` when the element
    is not found (best-effort).
    """
    tree, urls, page_text = get_tree(abstract_url)
    article = sn.scrape(abstract_url)
    article['abstract'] = None

    # Both EMBO titles share identical markup (the original duplicated the
    # branch); Oncogene uses a different class on the abstract paragraph.
    if article['journal'] in ('The EMBO Journal', 'EMBO reports'):
        abstract_xpath = "//p[@class='lead']"
    elif article['journal'] == 'Oncogene':
        abstract_xpath = "//p[@class='abs lead']"
    else:
        abstract_xpath = None

    if abstract_xpath is not None:
        try:
            article['abstract'] = tree.xpath(abstract_xpath)[0].text_content()
        except IndexError:
            # Element missing on the page; keep abstract as None.
            pass
    return article
def scrape(abstract_url):
    """Scrape a Rockefeller University Press ('rup') abstract page.

    Pulls bibliographic data from the Dublin Core / citation meta tags and
    the abstract from the page body, returning a populated article dict.
    """
    tree, urls, page_text = get_tree(abstract_url)

    article = make_blank_article()
    article['scraper'] = 'rup'
    article['source_urls'] = [uri for _, uri in urls]

    # Core bibliographic metadata from <meta> tags.
    article['title'] = get_meta('DC.Title', tree)
    article['journal'] = get_meta('citation_journal_title', tree)
    article['publisher'] = get_meta('DC.Publisher', tree)
    article['author_names'] = get_meta_list('DC.Contributor', tree)

    # Full abstract lives in the page body rather than a meta tag.
    abstract_node = tree.xpath("//div[@class='section abstract']/p")[0]
    article['abstract'] = strip_space(abstract_node.text_content())

    citation = article['citation']
    citation['journal'] = get_meta('citation_journal_abbrev', tree)
    citation['volume'] = get_meta('citation_volume', tree)
    citation['page'] = get_meta('citation_firstpage', tree)

    article['ids'] = {'doi': get_meta('DC.Identifier', tree)}

    # DC.Date is presumably YYYY-MM-DD — make_datestamp takes (day, month, year).
    pub_date = get_meta('DC.Date', tree)
    if pub_date:
        parts = pub_date.split('-')
        article['date_published'] = make_datestamp(parts[2], parts[1], parts[0])
        citation['year'] = parts[0]
    return article
def scrape(abstract_url):
    """Scrape a Royal Society of Chemistry ('rsc') abstract page.

    Returns a populated article dict.  Journal name and abstract are
    best-effort: they are left unset when their elements are missing.
    """
    tree, urls, page_text = get_tree(abstract_url)
    article = make_blank_article()
    article['scraper'] = 'rsc'
    article['source_urls'] = [uri for _, uri in urls]
    article['ids'] = {'doi': get_meta('DC.Identifier', tree)}
    article['title'] = get_meta('DC.title', tree)
    article['publisher'] = get_meta('DC.publisher', tree)
    article['author_names'] = get_meta_list('DC.Creator', tree)

    # The journal name only appears as the title attribute of a banner image.
    # Narrowed from a bare except: only a missing element is expected here.
    try:
        article['journal'] = tree.xpath("//img[@id='imgLoader']/@title")[0]
    except IndexError:
        pass  # banner missing; leave journal unset

    try:
        article['abstract'] = tree.xpath(
            "//p[@xmlns='http://www.rsc.org/schema/rscart38']"
        )[0].text_content()
    except IndexError:
        pass  # abstract element missing; leave unset

    article['citation']['journal'] = get_meta('citation_journal_title', tree)
    article['citation']['volume'] = get_meta('citation_volume', tree)
    article['citation']['page'] = get_meta('citation_firstpage', tree)

    # citation_publication_date is presumably YYYY-MM-DD;
    # make_datestamp takes (day, month, year).
    pub_date = get_meta('citation_publication_date', tree)
    if pub_date:
        split = pub_date.split('-')
        article['date_published'] = make_datestamp(split[2], split[1], split[0])
        article['citation']['year'] = split[0]
    return article
def scrape(abstract_url):
    """Scrape an 'AA' abstract page by text-parsing the //div[@id='head'] block.

    The page exposes little structured metadata, so dates, abstract and
    citation details are recovered by splitting the head text on known
    markers ("Abstract", "Key words", "Accepted: ").  Any deviation from
    the expected layout raises (KeyError/IndexError/ValueError), which
    doubles as format validation.
    """
    tree, urls, page_text = utils.get_tree(abstract_url)
    article = make_blank_article()
    article['scraper'] = 'AA'
    article['source_urls'] = [uri for _, uri in urls]

    try:
        article['title'] = tree.xpath("//div[@id='head']/h2")[0].text_content()
    except IndexError:
        pass  # title heading missing; leave unset (best-effort)

    article['author_names'] = tree.xpath("//div[@id='head']/p")[0].text_content()
    # First keyword doubles as the journal name on these pages.
    article['journal'] = tree.xpath("//meta[@name='keywords']/@content")[0].split(',')[0]

    # Parse the publication date out of the history paragraph.
    # NOTE: this value is overwritten below by the "Accepted:" date, but the
    # parse is kept so a malformed history line still fails loudly here.
    info = tree.xpath("//div[@id='head']//p[@class='history']")[0].text_content()
    pubdate = info.split(' ')[4:]
    pubdate[1] = months[pubdate[1]]  # month name -> month number
    day, month, year = int(pubdate[0]), int(pubdate[1]), int(pubdate[2])
    pubdateuni = time.mktime(datetime.date(year, month, day).timetuple())
    article['date_published'] = pubdateuni

    # The abstract is the text between "Abstract" and "Key words".
    info = tree.xpath("//div[@id='head']")[0].text_content()
    ab1 = info.split("Abstract")[1]
    ab2 = ab1.split("Key words")[0]
    article['abstract'] = ab2

    # Use the acceptance date as the published date (overrides the above).
    rec1 = info.split("Accepted: ")[1]
    rec2 = rec1.split("\nAbstract")[0]
    day, month, year = rec2.split(' ')
    article['date_published'] = make_datestamp(day, months[month], year)
    article['citation']['year'] = year

    # Citation details precede the title in the head text.
    issueinfo = info.split(article['title'])[0]
    jour, vol, num, yea = issueinfo.split(' ')[0], issueinfo.split(' ')[1],\
        issueinfo.split(' ')[2], issueinfo.split(' ')[3]
    article['citation']['journal'] = jour
    article['citation']['volume'] = vol.split(',')[0]

    # DOI is embedded in the source URL query string.
    doi = article['source_urls'][0].split('doi=')[1]
    doi2 = doi.split('&')[0]
    # BUG FIX: the original stored the bare DOI string; every other scraper
    # stores ids as a dict keyed by scheme, so do the same here.
    article['ids'] = {'doi': doi2}
    return article
def scrape(abstract_url):
    """Scrape *abstract_url* and validate the result.

    Returns the article dict produced by ``scrape_tree``.

    Raises:
        FailedToScrape: if any field listed in NECESSARY_FIELDS is absent
            or empty, after printing the offending field name.
    """
    tree, urls, page_text = get_tree(abstract_url)
    article = scrape_tree(tree, urls, page_text)
    for field in NECESSARY_FIELDS:
        # get() covers both "key missing" and "value falsy" in one lookup.
        if not article.get(field):
            print(field)  # parenthesized: valid as both Py2 statement and Py3 call
            raise FailedToScrape
    return article
def scrape(abstract_url):
    """Scrape a Cell Press abstract page.

    Delegates metadata extraction to the generic meta scraper, then
    recovers the abstract from the page body: everything after the first
    "Summary" marker inside div#load.
    """
    tree, urls, page_text = utils.get_tree(abstract_url)
    article = meta_scrape(abstract_url)
    body_text = tree.xpath("//div[@id='load']")[0].text_content()
    article['abstract'] = body_text.split("Summary")[1]
    article['scraper'] = 'cell'
    return article
def scrape(abstract_url):
    """Scrape an MIT Press journal abstract page.

    Returns a populated article dict with title, authors, ids, abstract,
    and citation details parsed from the meta tags and page body.
    """
    tree, urls, page_text = get_tree(abstract_url)
    article = make_blank_article()
    article['scraper'] = 'MIT'
    article['source_urls'] = [uri for _, uri in urls]

    article['title'] = get_meta('dc.Title', tree)
    article['publisher'] = get_meta('dc.Publisher', tree).strip()
    article['author_names'] = get_meta_list('dc.Creator', tree)

    # Two identifier schemes used --- do we want both?
    article['ids'] = dict(zip(['publisher-id', 'doi'],
                              get_meta_list('dc.Identifier', tree)))

    article['journal'] = tree.xpath("//h1[@class='journalTitle']/a/img/@alt")[0]

    # dc.Description is present, but contains an abbreviated abstract
    # --- this gets the full abstract.
    article['abstract'] = tree.xpath("//div[@class='abstractSection']/p/text()")[0]

    article['citation']['journal'] = article['journal']

    # Citation details (volume, number, pages) given as text immediately
    # following the h1 tag, e.g. "December 2012, Vol. 24, No. 2, Pages 1-35".
    citation_text = tree.xpath(
        "//h1[@class='journalTitle']/following-sibling::text()")[0]
    pub_year, volume, number, page_first, page_last = re.findall(r'\d+', citation_text)

    # BUG FIX: the original rebound article['citation'] to a fresh {} here,
    # silently discarding the 'journal' key set above.  Update the existing
    # citation dict in place instead.
    article['citation']['volume'] = volume
    article['citation']['number'] = number
    article['citation']['page_first'] = page_first
    article['citation']['page_last'] = page_last

    # dc.Date is presumably YYYY-MM-DD — TODO confirm; make_datestamp takes
    # (day, month, year).  str.split always yields at least one element, so
    # the guard mirrors the original's (always-true) check.
    date = get_meta('dc.Date', tree).split('-')
    if date:
        article['date_published'] = make_datestamp(date[2], date[1], date[0])
        article['citation']['year'] = date[0]
    return article