Example #1
def scrape_doi(doi, doc_id=None):
    records_doi = db_store.view('index/ids', key='doi:' + doi, include_docs='true').rows

    url = resolve_doi(doi)
    records_source = db_store.view('index/sources', key=url, include_docs='true').rows
    
    # Scrape if we were given an explicit doc_id, or if neither the DOI
    # nor its resolved source url is already in the database
    if doc_id is not None or not (records_doi and records_source):
        if doc_id:
            article = db_store[doc_id]
            rev_id = article.rev
        else:
            article = {}

        try:
            scraped_article = resolve_and_scrape(url)

            # If we haven't raised by this point, clear the current article
            # and replace its contents with the freshly scraped data
            article.clear()
            article.update(scraped_article)

            # Add the id and revision back in since we just cleared the doc. Awkward.
            if doc_id:
                article['_id'] = doc_id
                article['_rev'] = rev_id
        except Exception as e:
            # Make a doc to remember to rescrape later
            article['error'] = str(e)
            article['rescrape'] = True
            article['source_urls'] = [url]

        if article:
            doc_id, _ = db_store.save(article)
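
This snippet relies on helpers that aren't shown on this page: db_store, resolve_doi, and resolve_and_scrape. A minimal sketch of how the first two might be wired up, assuming CouchDB via the couchdb-python package and DOI resolution through the public https://doi.org/ redirect; the project's actual setup may differ:

import couchdb
import requests

# Assumed setup: a CouchDB database named 'store' holding the scraped articles
server = couchdb.Server('http://localhost:5984/')
db_store = server['store']

def resolve_doi(doi):
    # Hypothetical helper: follow the doi.org redirect to the publisher's
    # landing page and return the final url
    response = requests.head('https://doi.org/' + doi, allow_redirects=True)
    return response.url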
Example #2
def scrape_journal(url, doc_id=None, base_article=None):
    """Find the paper in the database and then add or merge as
    necessary."""

    # Avoid a mutable default argument; treat a missing base_article as empty
    if base_article is None:
        base_article = {}

    # TODO: Make sure that if doc_id is not None, it does actually
    # refer to a document in the database.

    # Always scrape if we're given an explicit doc_id; otherwise only
    # scrape if the source url hasn't already been scraped
    if doc_id is not None or check_source(url):
        if doc_id:
            article = db_store[doc_id]
            rev_id = article.rev
        else:
            article = {}
           
        scraped_article = resolve_and_scrape(url)

        # Clear the current article and rebuild it from the base article
        # plus the freshly scraped data
        article.clear()
        article.update(base_article)
        article.update(scraped_article)

        # Add the id and revision back in since we just cleared the
        # doc. Awkward.
        if doc_id:
            article['_id'] = doc_id
            article['_rev'] = rev_id

        # If we haven't explicitly asked for the article to be scraped
        # by providing a doc_id, then check that it hasn't been
        # inadvertently scraped already before we go ahead and save
        if doc_id is not None or check_source(article['source_urls'][-1]):
            doc_id, _ = db_store.save(article)
    else:
        # we've already scraped this url. there should only be one
        # such doc.
        rows = db_store.view('index/sources', key=url, include_docs='true').rows
        article = rows[0].doc
        doc_id = article.id

    resolve_merges()

    return doc_id
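
Both examples call check_source(url) to decide whether a url still needs scraping. Its definition isn't included here, but the else branch above suggests it reduces to a lookup in the same 'index/sources' view. A plausible sketch, offered as an assumption rather than the project's actual code:

def check_source(url):
    # True when no document in the database has emitted this source url,
    # i.e. the url has not been scraped yet
    rows = db_store.view('index/sources', key=url, include_docs='true').rows
    return not rows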