Exemplo n.º 1
0
def scrape_boards(query, board_collection, user_collection, user_limit=500):
    """
    Find all users with a board matching the query, and scrape all of
    their boards, inserting them into board_collection.

    If a user is already present in user_collection, it means that we
    have scraped their boards, and so do not need to scrape again -- but
    we do record this query on their existing record if it is new.

    Args:
        query: search string, appended as-is to the board search URL.
        board_collection: collection receiving one document per board.
        user_collection: collection with one document per scraped user,
            holding 'username', 'num_boards' and a 'query' list.
        user_limit: passed through to
            get_usernames_from_query_results_page.

    Users for whom scrape_user_boards returns no boards are not
    recorded in user_collection.
    """
    log.info(u'scrape_boards: on {}'.format(query))

    driver = browser_open()
    # NOTE(review): query is concatenated without URL-escaping; confirm
    # that callers pass a URL-safe string.
    url = 'http://www.pinterest.com/search/boards/?q=' + query
    usernames = get_usernames_from_query_results_page(driver, url, user_limit)

    # For each user, scrape their boards page, or update their record.
    for username in usernames:
        # A single lookup both tells us whether the user was already
        # scraped and gives us the queries recorded for them, so there is
        # no window between an existence check and a second fetch.
        user_record = user_collection.find_one({'username': username})

        if user_record is not None:
            # Boards were scraped on an earlier run; only note the query.
            if query in user_record['query']:
                log.info('Already ran query {} for user {}'.format(
                    query, username))
            else:
                log.info("Already ran a different query for user {}".format(
                    username))
                # $push appends to the existing 'query' array; it does
                # not overwrite it.
                user_collection.update(
                    {'username': username}, {'$push': {'query': query}})
            continue

        # Otherwise, we scrape the user's boards.
        boards = scrape_user_boards(
            driver, username, 'query: {} '.format(query))
        num_boards = len(boards)
        if num_boards == 0:
            continue

        user_collection.insert({
            'username': username,
            'num_boards': num_boards,
            'query': [query]
        })
        for board in boards:
            try:
                board_collection.insert(board)
            except pymongo.errors.DuplicateKeyError:
                # Board is already stored; skipping it is intentional.
                continue
        log.info('Inserted {} from {} with query: {}'.format(
            num_boards, username, query))
Exemplo n.º 2
0
def get_pinss(db, queries):
    """
    Scrape pins for the boards found for each query into db['pins'].

    For every query, the boards yielded by get_boards are visited in
    order. A board is scraped with scrape_pins only when
    vislab.util.zero_results is truthy for its (board_name, username)
    document against the pins collection; otherwise it is skipped and
    the skip is logged.

    Args:
        db: mapping-like database handle; its 'pins' entry is used as
            the pin collection.
        queries: iterable of queries passed one at a time to get_boards.
    """
    pins = db['pins']
    browser = browser_open()

    for query in queries:
        for board in get_boards(browser, query):
            # Identifies the board within the pins collection.
            board_key = {
                'board_name': board['board_name'],
                'username': board['username']
            }

            if not vislab.util.zero_results(pins, board_key):
                log.info("Not scraping {}".format(board_key))
                continue

            log.info("Scraping: {}".format(board['board_url']))
            started_at = time.time()
            scrape_pins(browser, board, pins)
            elapsed = time.time() - started_at
            log.info("...took {:.2f} s".format(elapsed))

        log.info('Done scraping pins for {}'.format(query))