def scrape_boards(query, board_collection, user_collection, user_limit=500):
    """
    Find all users with a board matching the query, and scrape all of
    their boards, inserting them into board_collection.

    If a user is already present in user_collection, it means that we have
    scraped their boards, and so do not need to scrape again -- but we do
    need to update their record with this query.

    Args:
        query: search text appended to the Pinterest board-search URL.
            NOTE(review): it is concatenated without URL-escaping --
            confirm callers only pass URL-safe text.
        board_collection: collection that receives one document per
            scraped board; duplicate-key errors on insert are skipped.
        user_collection: collection of per-user records with the keys
            'username', 'num_boards' and 'query' (list of queries that
            surfaced this user).
        user_limit: passed through unchanged to
            get_usernames_from_query_results_page.

    NOTE(review): the driver returned by browser_open() is never released
    here -- confirm whether the caller or the helper owns its lifetime.
    """
    log.info(u'scrape_boards: on {}'.format(query))
    driver = browser_open()
    url = 'http://www.pinterest.com/search/boards/?q=' + query
    usernames = get_usernames_from_query_results_page(driver, url, user_limit)

    # For each user, scrape their boards page, or update their record.
    for username in usernames:
        # One lookup both tests for existence and fetches the queries
        # already recorded for this user (previously a count() followed
        # by a second find_one() for the same document).
        user_record = user_collection.find_one({'username': username})

        if user_record is not None:
            # We've already scraped this user's boards, so we don't need
            # to do that again; we only note that this query saw them too.
            if query in user_record['query']:
                log.info('Already ran query {} for user {}'.format(
                    query, username))
            else:
                log.info("Already ran a different query for user {}".format(
                    username))
                # $push appends to the existing 'query' array; it does not
                # replace the list.
                user_collection.update(
                    {'username': username}, {'$push': {'query': query}})
            continue

        # Otherwise, we scrape the user's boards.
        boards = scrape_user_boards(
            driver, username, 'query: {} '.format(query))
        if len(boards) > 0:
            user_collection.insert({
                'username': username,
                'num_boards': len(boards),
                'query': [query]
            })
            for board in boards:
                try:
                    board_collection.insert(board)
                except pymongo.errors.DuplicateKeyError:
                    # Board is already stored; keep going with the rest.
                    continue
            log.info('Inserted {} from {} with query: {}'.format(
                len(boards), username, query))
def get_pinss(db, queries):
    """
    Scrape pins for every board returned by get_boards for each query.

    A board is scraped only when vislab.util.zero_results reports, for
    db['pins'], a truthy result on that board's 'board_name'/'username'
    pair (presumably meaning no pins are stored for it yet -- confirm
    against the helper). Other boards are logged and skipped.

    Args:
        db: mapping-like database handle; only db['pins'] is used.
        queries: iterable of query values handed to get_boards one by one.
    """
    pins = db['pins']
    driver = browser_open()

    for query in queries:
        for board in get_boards(driver, query):
            board_key = {
                'board_name': board['board_name'],
                'username': board['username']
            }

            # Guard clause: nothing to do when the helper says pins exist.
            if not vislab.util.zero_results(pins, board_key):
                log.info("Not scraping {}".format(board_key))
                continue

            log.info("Scraping: {}".format(board['board_url']))
            started = time.time()
            scrape_pins(driver, board, pins)
            elapsed = time.time() - started
            log.info("...took {:.2f} s".format(elapsed))

        log.info('Done scraping pins for {}'.format(query))