示例#1
0
""" Core scraper for bitcointalk.org. """
import bitcointalk
import logging
import memoizer
import os
import sys
import traceback

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s:%(message)s',
                    datefmt='%m/%d/%Y %I:%M:%S %p')

# Make sure we don't rescrape information already in the DB
memoizer.remember()

# Topic IDs to scrape are read from data.txt, one per line.  Surrounding
# whitespace (including the trailing newline kept by line iteration) is
# stripped and blank lines are skipped, so each ID is passed on clean.
# The `with` block closes the file even if reading fails.
with open('data.txt') as f:
    data = [line.strip() for line in f if line.strip()]

for topicId in data:
    logging.info(">Starting scrape of topic ID {0}...".format(topicId))
    try:
        topic = memoizer.scrapeTopic(topicId)
    except Exception:
        # Best-effort scrape: report the failure with its traceback and
        # move on to the next topic instead of aborting the whole run.
        # Single-argument print(...) behaves identically on Python 2 and 3.
        print('-' * 60)
        print("Could not request URL for topic {0}:".format(topicId))
        print(traceback.format_exc())
        print('-' * 60)
        logging.info(">Could not request URL for topic {0}:".format(topicId))
        continue
import bitcointalk
import logging
import memoizer
import os
import sys
import traceback

# ID of the board to scrape; passed to memoizer.scrapeBoard below.
boardId = 74

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s:%(message)s',
    datefmt='%m/%d/%Y %I:%M:%S %p')

# Make sure we don't rescrape information already in the DB
memoizer.remember()

# The format string needs a {0} placeholder; without it .format(boardId)
# silently discarded the board ID from the log line.
logging.info("Beginning scrape of board ID {0}...".format(boardId))
board = memoizer.scrapeBoard(boardId)
logging.info("Found {0} topic pages in board...".format(
    board['num_pages']))
for boardPageNum in range(1, board['num_pages'] + 1):
    logging.info(">Scraping page {0}...".format(boardPageNum))
    topicIds = memoizer.scrapeTopicIds(boardId, boardPageNum)
    for topicId in topicIds:
        logging.info(">>Starting scrape of topic ID {0}...".format(topicId))
        try:
            topic = memoizer.scrapeTopic(topicId)
        except Exception as e:
            print '-'*60
            print "Could not request URL for topic {0}:".format(topicId)