示例#1
0
def mine(db, mined_from=None, entry_count=200):
    """Mine subreddit entries and store the new ones, with articles, in ``db``.

    Listing pages are requested through ``SubredditParser.parse_entries`` in
    steps of ``entry_count // 10``, continuing after the last entry handled.
    Each entry whose ``reddit_id`` is not yet in ``db["entries"]`` has its
    article fetched with ``EntryParser.get_content`` and is then inserted.
    The loop runs until ``entry_count`` entries have been inserted.

    Failures while fetching a listing page or an article are logged and
    retried after a random 10-20 second pause.  One retry counter is shared
    by both kinds of failure and is never reset; on the eighth failure the
    function calls ``thread.exit()``.

    Args:
        db: Database handle; ``db["entries"]`` must offer
            ``find_one(reddit_id=...)`` and ``insert(entry)``.
        mined_from: Handed to ``SubredditParser`` and echoed in log messages.
        entry_count: Number of entries to insert before stopping.
    """
    subreddit_parser = SubredditParser(mined_from)
    entry_parser = EntryParser()

    ids = set()
    last_id = ""

    # Floor division keeps the page size an integer on Python 2 and 3 alike.
    step_size = entry_count // 10
    # NOTE(review): count only appears in log messages and is never advanced
    # in this function as shown.
    count = 0
    accepted = 0

    retries = 0
    while accepted < entry_count:
        entries = None
        while entries is None:
            try:
                entries = subreddit_parser.parse_entries(step_size, last_id)
            except Exception as error:
                retries += 1
                if retries < 8:
                    logging.error("Timeout: %s %s %s", mined_from, count, error)
                    sleep(randint(10, 20))
                else:
                    thread.exit()

        # NOTE(review): both flags are set below but never read in this
        # function as shown; an empty or fully "unchanged" page therefore
        # makes the outer loop request the same page again -- confirm whether
        # a termination check is missing here.
        unchanged = False
        skipped = False
        for i, entry in enumerate(entries):
            reddit_id = entry["reddit_id"]
            if reddit_id in ids:
                # Already handled during this run: abandon the rest of the page.
                unchanged = True
                logging.info("Unchanged: entries %d-%d in %s",
                             i, count + len(entries) - 1, mined_from)
                break

            last_id = reddit_id
            ids.add(reddit_id)

            saved_entry = db["entries"].find_one(reddit_id=reddit_id)
            if saved_entry is not None:
                # Already stored: abandon the rest of the page.
                skipped = True
                logging.info("Skipped: %d-%d in %s",
                             i, count + len(entries) - 1, mined_from)
                break

            entry["article"] = None
            while entry["article"] is None:
                try:
                    entry["article"] = entry_parser.get_content(entry["link"])
                    db["entries"].insert(entry)
                    accepted += 1
                    # Stop after a successful insert even when get_content
                    # returned None; otherwise the same entry would be
                    # inserted again on every pass of this loop.
                    break
                except Exception as error:
                    retries += 1
                    if retries < 8:
                        logging.error("Error: %s %s %s", mined_from, count, error)
                        sleep(randint(10, 20))
                    else:
                        thread.exit()

            # Short random pause between article downloads.
            sleep(randint(1, 3))
示例#2
0
import json
import logging
from os import listdir
from random import shuffle

from helpers.db import setup_db
from helpers.EntryParser import EntryParser


# Collect the entries of every file under ./json into one list (each file is
# expected to hold a JSON list of entry dicts) and shuffle it so the entries
# are processed in random order.
parsed = []
for f in listdir('./json'):
    with open('./json/%s' % f) as data:
        parsed.extend(json.load(data))
shuffle(parsed)


db = setup_db()
entry_parser = EntryParser()


logging.basicConfig(
    format='%(asctime)s %(message)s',
    datefmt='%m/%d/%Y %I:%M:%S %p',
    filename='miner.log',
    level=logging.DEBUG,
)

# Fetch and store the article of every entry that is not in the database yet.
for entry in parsed:
    saved_entry = db['entries'].find_one(reddit_id=entry['reddit_id'])

    if saved_entry is None:
        entry['article'] = None
        try:
            entry['article'] = entry_parser.get_content(entry['link'])
            db['entries'].insert(entry)
        except Exception as error:
            # Best effort: log the failure and carry on with the next entry.
            logging.error('Error: %s %s %s',
                          entry['mined_from'], entry['link'], error)