def add_feed_items(scraper_module, feedhandler, feed_url):
    """Queue scrape tasks for feed items not already in the database.

    scraper_module -- key into scraper_factory.should_scrape selecting the scraper
    feedhandler -- feed-item key (or list of candidate keys) that holds the item URL
    feed_url -- URL of the RSS/Atom feed to fetch

    Raises Exception if none of the candidate feedhandler keys is present
    in an item.
    """
    # TODO: should be smarter here, e.g. use If-Modified-Since
    feed = feedparser.parse(feed_url, agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50')
    should_scrape = scraper_factory.should_scrape[scraper_module]
    for item in feed['items']:
        base_article = {}
        # isinstance instead of `type(...) is list` so list subclasses work too
        if isinstance(feedhandler, list):
            # Try each candidate tag; the for/else raises if none matched
            for feedhandler_ in feedhandler:
                if feedhandler_ in item:
                    item_url = item[feedhandler_]
                    break
            else:
                raise Exception("Feed tag(s) not valid.")
        else:
            item_url = item[feedhandler]
        check_exists = db_store.view('index/sources', key=item_url, include_docs='false')
        try:
            check_exists.rows
        except Exception:
            # Narrowed from a bare `except:` (which would also swallow
            # KeyboardInterrupt/SystemExit).  Presumably a transient couch
            # error; retry the view once -- TODO confirm the failure mode.
            check_exists = db_store.view('index/sources', key=item_url, include_docs='false')
        if not check_exists.rows:
            # Not seen before: scrape the journal page directly, or hand
            # the raw RSS item to the per-module scraper.
            if should_scrape:
                scrape_journal.delay(item_url, base_article=base_article)
            else:
                scrape_rss.delay(scraper_module, item)
def scrape_doi(doi, doc_id=None):
    """Scrape the article behind *doi* and create/update its couch doc.

    doi -- the DOI string (without the 'doi:' prefix)
    doc_id -- existing document id to overwrite; forces a re-scrape

    On scrape failure the doc is saved with error/rescrape markers so a
    later pass can retry it.
    """
    records_doi = db_store.view('index/ids', key='doi:' + doi, include_docs='true').rows
    url = resolve_doi(doi)
    records_source = db_store.view('index/sources', key=url, include_docs='true').rows
    # Scrape when explicitly asked (doc_id given) or when the article is
    # missing from an index.  NOTE(review): `not (a and b)` fires if
    # *either* index lacks the article -- confirm that is intended rather
    # than `not (a or b)`.
    if doc_id is not None or not (records_doi and records_source):
        # source url isn't in db
        if doc_id:
            article = db_store[doc_id]
            rev_id = article.rev
        else:
            article = {}
        try:
            scraped_article = resolve_and_scrape(url)
            # If we haven't excepted at this point, clear the current
            # article and save it
            article.clear()
            article.update(scraped_article)
            # Add the id and revision back in since we just cleared the
            # doc. Awkward.
            if doc_id:
                article['_id'] = doc_id
                article['_rev'] = rev_id
        except Exception as e:
            # `as e` instead of the Python-2-only `, e` spelling.
            # Make a doc to remember to rescrape later
            article['error'] = str(e)
            article['rescrape'] = True
            article['source_urls'] = [url]
        if article:
            doc_id, _ = db_store.save(article)
def check_source(url):
    """Return True if no stored document lists *url* as a source."""
    # Empty row list is falsy; no need for the explicit len()==0 branch.
    return not db_store.view('index/sources', key=url).rows
def scrape_rss(scraper_module, item): s = scrapers.module_names[scraper_module] d = s.scrape_rss(item) if 'journal' in d: d['journal_id'] = resolve_journal(d['journal']) if check_source(d['source_urls'][0]): doc_id, _ = db_store.save(d) else: print "Already got this one" rows = db_store.view('index/sources', key=d['source_urls'][0], include_docs='true').rows article = rows[0].doc doc_id = article.id return doc_id
def scrape_journal(url, doc_id=None, base_article=None):
    """Find the paper in the database and then add or merge as necessary.

    url -- source URL to scrape
    doc_id -- existing doc id; when given, always re-scrape that document
    base_article -- optional dict of seed fields merged in before the
        scraped fields (scraped values win on key collisions)

    Returns the id of the saved or pre-existing article document.
    """
    # base_article defaulted to the mutable `{}` before; use None + a
    # fresh dict per call to avoid the shared-default pitfall.
    if base_article is None:
        base_article = {}
    # TODO: Make sure that if doc_id is not None, it does actually
    # refer to a document in the database.
    # Scrape if we have a doc_id or it hasn't already been scraped;
    # always scrape if we're given a doc_id
    if doc_id is not None or check_source(url):
        # source url isn't in db
        if doc_id:
            article = db_store[doc_id]
            rev_id = article.rev
        else:
            article = {}
        scraped_article = resolve_and_scrape(url)
        # clear the current article and save it
        article.clear()
        article.update(base_article)
        article.update(scraped_article)
        # Add the id and revision back in since we just cleared the
        # doc. Awkward.
        if doc_id:
            article['_id'] = doc_id
            article['_rev'] = rev_id
        # If we haven't explicitly asked for the article to be scraped
        # by providing a doc_id, then check that it hasn't been
        # inadvertantly scraped already before we go
        if doc_id is not None or check_source(article['source_urls'][-1]):
            doc_id, _ = db_store.save(article)
    else:
        # we've already scraped this url. there should only be one
        # such doc.
        rows = db_store.view('index/sources', key=url, include_docs='true').rows
        article = rows[0].doc
        doc_id = article.id
    resolve_merges()
    return doc_id
from akorn.celery.couch import db_journals, db_store for journal_id in db_journals: rows = db_store.view("index/journal_id", key=journal_id).rows if len(rows) == 0: journal = db_journals[journal_id] try: print "No articles for {}".format(journal["name"]) except: print journal
def rescrape_articles():
    """Queue a scrape task for every article flagged for re-scraping."""
    records = db_store.view('rescrape/rescrape', include_docs='true').rows
    for record in records:
        doc = record.doc
        # NOTE(review): the error docs written by scrape_doi store a
        # plural 'source_urls' list, while this read the singular
        # 'source_url' and would KeyError on those docs.  Accept both --
        # confirm which key the rescrape view actually emits.
        if 'source_url' in doc:
            url = doc['source_url']
        else:
            url = doc['source_urls'][0]
        scrape_journal.delay(url, doc.id)
from akorn.celery.couch import db_store, db_journals print "Hello" journal_id_map = {} def make_journal(journal_name): doc = {'name': journal_name, 'aliases': [journal_name],} doc_id, doc_rev = db_journals.save(doc) journal_id_map[journal_name] = doc_id return doc_id for row in db_store.view('missing/journal_id', include_docs=True).rows: doc = row.doc if 'journal' in doc: print doc['journal'] if doc['journal'] in journal_id_map: doc['journal_id'] = journal_id_map[doc['journal']] print "Re-using" else: doc['journal_id'] = make_journal(doc['journal']) print "Making new" db_store.save(doc)