def run_scrape_function_with_retries(scrape_function, date):
    num_attempts = 0

    while True:
        num_attempts += 1

        status = scrape_function(date)

        if status == 200:
            log.info('successful')
            break
        elif status == 404:
            log.info('file not found')
            break
        elif status == 'leaderboard updated':
            log.warning('the leaderboard was updated after this script was started, so re-run this script')
            break
        else:
            if num_attempts < 3:
                log.info('Status was %s, retrying', status)
            else:
                log.error('reached 3 attempts, aborting')
                break

    return status
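
# A minimal usage sketch (assumed wiring): in the original script the scrape
# functions and `log` are module-level names; stand-ins are defined here so the
# retry wrapper above can be exercised on its own.
import datetime
import logging

logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)

def fake_scrape_leaderboard(date):
    # Pretend the remote leaderboard for this date is missing.
    return 404

if __name__ == '__main__':
    status = run_scrape_function_with_retries(fake_scrape_leaderboard,
                                              datetime.date.today())
    assert status == 404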
Example #2
def main():
    utils.ensure_exists(output_directory)

    date_of_last_cached_leaderboard = get_date_of_last_cached_leaderboard()
    log.info('date of the last cached leaderboard is %s', date_of_last_cached_leaderboard)

    date_of_last_goko_leaderboard = datetime.date.today()

    one_day_delta = datetime.timedelta(1)
    date = date_of_last_cached_leaderboard + one_day_delta

    while date <= datetime.date.today():
        log.info('Processing %s', date)

        if date == datetime.date.today():
            log.info('scraping from goko')
            status = run_scrape_function_with_retries(scrape_leaderboard_from_goko, date)
        else:
            log.info('scraping from councilroom')
            status = run_scrape_function_with_retries(scrape_leaderboard_from_councilroom, date)

            if status != 200 and date <= datetime.date(2013, 1, 1):
                log.info('scraping from bggdl')
                status = run_scrape_function_with_retries(scrape_leaderboard_from_bggdl, date)

        if status == 200:
            pass
        elif status == 404:
            log.warning('file not found, so we will assume that it does not exist, and go to the next day')
        else:
            log.warning('Unexpected status of %s, please try again later', status)
            break

        date += one_day_delta
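
# A small illustration (toy dates, hypothetical helper name) of the day-by-day
# walk used in main above: start the day after the last cached leaderboard and
# stop at the most recent date to scrape.
import datetime

def dates_to_process(last_cached, last_available):
    one_day = datetime.timedelta(1)
    date = last_cached + one_day
    while date <= last_available:
        yield date
        date += one_day

if __name__ == '__main__':
    dates = list(dates_to_process(datetime.date(2013, 5, 1),
                                  datetime.date(2013, 5, 4)))
    assert dates == [datetime.date(2013, 5, 2),
                     datetime.date(2013, 5, 3),
                     datetime.date(2013, 5, 4)]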
Example #3
def run_scrape_function_with_retries(scrape_function, date):
    num_attempts = 0

    while True:
        num_attempts += 1

        status = scrape_function(date)

        if status == 200:
            log.info('successful')
            break
        elif status == 404:
            log.info('file not found')
            break
        elif status == 'leaderboard updated':
            log.warning('the leaderboard was updated after this script was started, so re-run this script')
            break
        else:
            if num_attempts < 3:
                log.info('Status was %s, retrying', status)
            else:
                log.error('reached 3 attempts, aborting')
                break

    return status
Example #4
def main(parsed_args):
    db = utils.get_mongo_database()
    goal_db = db.goals
    gstats_db = db.goal_stats
    all_goals = goals.goal_check_funcs.keys()
    total_pcount = collections.defaultdict(int)
    goal_scanner = incremental_scanner.IncrementalScanner('goals', db)
    stat_scanner = incremental_scanner.IncrementalScanner('goal_stats', db)

    if not parsed_args.incremental:
        log.warning('resetting scanner and db')
        stat_scanner.reset()
        gstats_db.remove()

    log.info("Starting run: %s", stat_scanner.status_msg())

    # TODO: The following logic doesn't work now that goal calculation doesn't happen with a scanner.
    # if goal_scanner.get_max_game_id() == stat_scanner.get_max_game_id():
    #     log.info("Stats already set! Skip")
    #     exit(0)

    log.info('all_goals %s', all_goals)
    for goal_name in all_goals:
        log.info("Working on %s", goal_name)
        found_goals_cursor = goal_db.find({'goals.goal_name': goal_name},
                                          {'goals.player': 1, '_id': 0})
        total = found_goals_cursor.count()
        log.info("Found %d instances of %s", total, goal_name)

        pcount = collections.defaultdict(int)
        for goal in found_goals_cursor:
            player = goal['goals'][0]['player']
            pcount[player] += 1
            total_pcount[player] += 1

        psorted = sorted(pcount.iteritems(), key=operator.itemgetter(1), 
                         reverse=True)
        top = []
        leaders = 0
        i = 0
        while leaders < 3 and i < len(psorted):
            (player, count) = psorted[i]
            players = []
            if player not in AIs.names:
                players = [player]
            i += 1
            while i < len(psorted) and psorted[i][1] == count:
                if psorted[i][0] not in AIs.names:
                    players.append(psorted[i][0])
                i += 1
            leaders += len(players)
            if len(players) > 0:
                top.append((players, count))

        mongo_val = {'_id': goal_name, 'count': total, 'top': top}
        gstats_db.save(mongo_val)

    stat_scanner.set_max_game_id(goal_scanner.get_max_game_id())
    stat_scanner.save()
    log.info("Ending run: %s", stat_scanner.status_msg())
Example #5
def main(parsed_args):
    """ Scan and update buy data"""
    start = time.time()
    db = utils.get_mongo_database()
    games = db.games
    output_db = db

    overall_stats = DeckBuyStats()

    scanner = incremental_scanner.IncrementalScanner(BUYS_COL_NAME, output_db)
    buy_collection = output_db[BUYS_COL_NAME]

    if not parsed_args.incremental:
        log.warning('resetting scanner and db')
        scanner.reset()
        buy_collection.drop()

    start_size = scanner.get_num_games()
    log.info("Starting run: %s", scanner.status_msg())
    do_scan(scanner, games, overall_stats, parsed_args.max_games)
    log.info("Ending run: %s", scanner.status_msg())
    end_size = scanner.get_num_games()

    if parsed_args.incremental:
        existing_overall_data = DeckBuyStats()
        utils.read_object_from_db(existing_overall_data, buy_collection, '')
        overall_stats.merge(existing_overall_data)
        def deck_freq(data_set):
            return data_set[dominioncards.Estate].available.frequency()
        log.info('existing %s decks', deck_freq(existing_overall_data))
        log.info('after merge %s decks', deck_freq(overall_stats))

    utils.write_object_to_db(overall_stats, buy_collection, '')

    scanner.save()
Example #6
def main(parsed_args):
    db = utils.get_mongo_database()
    goal_db = db.goals
    gstats_db = db.goal_stats
    all_goals = goals.goal_check_funcs.keys()
    total_pcount = collections.defaultdict(int)
    goal_scanner = incremental_scanner.IncrementalScanner('goals', db)
    stat_scanner = incremental_scanner.IncrementalScanner('goal_stats', db)

    if not parsed_args.incremental:
        log.warning('resetting scanner and db')
        stat_scanner.reset()
        gstats_db.remove()

    log.info("Starting run: %s", stat_scanner.status_msg())

    # TODO: The following logic doesn't work now that goal calculation doesn't happen with a scanner.
    # if goal_scanner.get_max_game_id() == stat_scanner.get_max_game_id():
    #     log.info("Stats already set! Skip")
    #     exit(0)

    log.info('all_goals %s', all_goals)
    for goal_name in all_goals:
        log.info("Working on %s", goal_name)
        found_goals_cursor = goal_db.find({'goals.goal_name': goal_name}, {
            'goals.player': 1,
            '_id': 0
        })
        total = found_goals_cursor.count()
        log.info("Found %d instances of %s", total, goal_name)

        pcount = collections.defaultdict(int)
        for goal in found_goals_cursor:
            player = goal['goals'][0]['player']
            pcount[player] += 1
            total_pcount[player] += 1

        psorted = sorted(pcount.iteritems(),
                         key=operator.itemgetter(1),
                         reverse=True)
        top = []
        leaders = 0
        i = 0
        while leaders < 3 and i < len(psorted):
            (player, count) = psorted[i]
            players = [player]
            i += 1
            while i < len(psorted) and psorted[i][1] == count:
                players.append(psorted[i][0])
                i += 1
            leaders += len(players)
            top.append((players, count))

        mongo_val = {'_id': goal_name, 'count': total, 'top': top}
        gstats_db.save(mongo_val)

    stat_scanner.set_max_game_id(goal_scanner.get_max_game_id())
    stat_scanner.save()
    log.info("Ending run: %s", stat_scanner.status_msg())
Example #7
def save_file(date, data, is_gzipped):
    if is_gzipped:
        f = gzip.GzipFile(fileobj=StringIO.StringIO(data))
        try:
            decompressed_data = f.read()
            data = decompressed_data
        except IOError as ioe:
            log.warning('Received data was not in gzip format')
        f.close()
Example #8
def save_file(date, data, is_gzipped):
    if is_gzipped:
        f = gzip.GzipFile(fileobj=StringIO.StringIO(data))
        try:
            decompressed_data = f.read()
            data = decompressed_data
        except IOError as ioe:
            log.warning('Received data was not in gzip format')
        f.close()
Example #9
def main(args):
    """ Update analysis statistics.  By default, do so incrementally, unless
    --noincremental argument is given."""

    commit_after = 25000

    database = utils.get_mongo_database()
    games = database.games

    output_collection_name = 'analysis'
    output_collection = database[output_collection_name]
    game_analysis = GamesAnalysis()

    scanner = incremental_scanner.IncrementalScanner(output_collection_name,
                                                     database)

    if args.incremental:
        utils.read_object_from_db(game_analysis, output_collection, '')
    else:
        log.warning('resetting scanner and db')
        scanner.reset()

    output_file_name = 'static/output/all_games_card_stats.js'

    if not os.path.exists('static/output'):
        os.makedirs('static/output')

    log.info("Starting run: %s", scanner.status_msg())

    for idx, raw_game in enumerate(
            utils.progress_meter(scanner.scan(games, {}))):
        try:
            game_analysis.analyze_game(Game(raw_game))

            if args.max_games >= 0 and idx >= args.max_games:
                log.info("Reached max_games of %d", args.max_games)
                break

            if idx % commit_after == 0 and idx > 0:
                start = time.time()
                game_analysis.max_game_id = scanner.get_max_game_id()
                game_analysis.num_games = scanner.get_num_games()
                utils.write_object_to_db(game_analysis, output_collection, '')
                scanner.save()
                log.info("Committed calculations to the DB in %5.2fs",
                         time.time() - start)

        except Exception as exception:
            log.exception('Exception occurred for %s in raw game %s',
                          Game(raw_game).isotropic_url(), raw_game)
            raise
Example #10
def save_file(date, data, is_gzipped):
    if is_gzipped:
        f = gzip.GzipFile(fileobj=StringIO.StringIO(data))
        try:
            decompressed_data = f.read()
            data = decompressed_data
        except IOError as ioe:
            log.warning('Received data was not in gzip format')
        f.close()

    data = bz2.compress(data)

    f = open(output_directory + str(date) + '.html.bz2', 'wb')
    f.write(data)
    f.close()
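
# A minimal round-trip sketch (assumed setup): `output_directory` and `log` are
# module-level names in the original script; temporary stand-ins are used here so
# save_file above can be exercised and its .html.bz2 output read back.
import bz2
import datetime
import gzip
import logging
import StringIO
import tempfile

log = logging.getLogger(__name__)
output_directory = tempfile.mkdtemp() + '/'

if __name__ == '__main__':
    payload = '<html>leaderboard snapshot</html>'

    # Gzip the payload in memory, as the scraped responses sometimes are.
    buf = StringIO.StringIO()
    gz = gzip.GzipFile(fileobj=buf, mode='wb')
    gz.write(payload)
    gz.close()

    today = datetime.date.today()
    save_file(today, buf.getvalue(), is_gzipped=True)

    # The stored file should hold the bz2-compressed, un-gzipped payload.
    stored = bz2.BZ2File(output_directory + str(today) + '.html.bz2').read()
    assert stored == payload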
Example #11
def main(args):
    """ Update analysis statistics.  By default, do so incrementally, unless
    --noincremental argument is given."""

    commit_after = 25000

    database = utils.get_mongo_database()
    games = database.games

    output_collection_name = 'analysis'
    output_collection = database[output_collection_name]
    game_analysis = GamesAnalysis()

    scanner = incremental_scanner.IncrementalScanner(output_collection_name,
                                                     database)
 
    if args.incremental:
        utils.read_object_from_db(game_analysis, output_collection, '')
    else:
        log.warning('resetting scanner and db')
        scanner.reset()

    output_file_name = 'static/output/all_games_card_stats.js'

    if not os.path.exists('static/output'):
        os.makedirs('static/output')

    log.info("Starting run: %s", scanner.status_msg())

    for idx, raw_game in enumerate(utils.progress_meter(scanner.scan(games, {}))):
        try:
            game_analysis.analyze_game(Game(raw_game))

            if args.max_games >= 0 and idx >= args.max_games:
                log.info("Reached max_games of %d", args.max_games)
                break

            if idx % commit_after == 0 and idx > 0:
                start = time.time()
                game_analysis.max_game_id = scanner.get_max_game_id()
                game_analysis.num_games = scanner.get_num_games()
                utils.write_object_to_db(game_analysis, output_collection, '')
                scanner.save()
                log.info("Committed calculations to the DB in %5.2fs", time.time() - start)

        except Exception as exception:
            log.exception('Exception occurred for %s in raw game %s', Game(raw_game).isotropic_url(), raw_game)
            raise
Example #12
def watch_and_log(signature, log_interval=15, timeout=600):
    """Invoke the celery task via the passed signature, wait for it an
    all its children to complete, and log progress along the way.

    log_interval: number of seconds between checking and logging the
    status

    timeout: number of seconds after which to return, when there have
    been no subtask status updates"""
    task_name = signature.task
    log.info("Calling background task %s", task_name)

    async_result = signature.apply_async()

    all_done = False
    last_status_summary = None
    last_status_update = time.time()
    while not all_done:
        # Wait for the log_interval, then check the status
        time.sleep(log_interval)

        c = collections.Counter()
        try:
            # Setting intermediate to False should cause the
            # IncompleteStream exception to be thrown if the task and
            # its children aren't all complete.
            for parent, child in async_result.iterdeps(intermediate=False):
                c[child.state] += 1
            all_done = True
        except celery.exceptions.IncompleteStream:
            status_summary = summarize_task_status(c)
            log.info("Waiting for %s: %s", task_name, status_summary)

            # Check on timeout condition
            if (last_status_summary is not None
                and status_summary == last_status_summary
                and (time.time() - last_status_update) > timeout):
                break
            else:
                last_status_summary = status_summary
                last_status_update = time.time()

    if all_done:
        log.info("Done with background task %s: %s", task_name, summarize_task_status(c))
    else:
        log.warning("Returning due to timeout during background task %s: %s", task_name, summarize_task_status(c))
    return async_result
Example #13
def watch_and_log(signature, log_interval=15, timeout=600):
    """Invoke the celery task via the passed signature, wait for it an
    all its children to complete, and log progress along the way.

    log_interval: number of seconds between checking and logging the
    status

    timeout: number of seconds after which to return, when there have
    been no subtask status updates"""
    task_name = signature.task
    log.info("Calling background task %s", task_name)

    async_result = signature.apply_async()

    all_done = False
    last_status_summary = None
    last_status_update = time.time()
    while not all_done:
        # Wait for the log_interval, then check the status
        time.sleep(log_interval)

        c = collections.Counter()
        try:
            # Setting intermediate to False should cause the
            # IncompleteStream exception to be thrown if the task and
            # its children aren't all complete.
            for parent, child in async_result.iterdeps(intermediate=False):
                c[child.state] += 1
            all_done = True
        except celery.exceptions.IncompleteStream:
            status_summary = summarize_task_status(c)
            log.info("Waiting for %s: %s", task_name, status_summary)

            # Check on timeout condition
            if (last_status_summary is not None
                and status_summary == last_status_summary
                and (time.time() - last_status_update) > timeout):
                break
            else:
                last_status_summary = status_summary
                last_status_update = time.time()

    if all_done:
        log.info("Done with background task %s: %s", task_name, summarize_task_status(c))
    else:
        log.warning("Returning due to timeout during background task %s: %s", task_name, summarize_task_status(c))
    return async_result
Example #14
def main(args):
    commit_after = 25000
    database = utils.get_mongo_database()
    games = database.games
    collection = database.optimal_card_ratios
    db_tracker = None

    scanner = incremental_scanner.IncrementalScanner('optimal_card_ratios',
                                                     database)

    if not args.incremental:
        log.warning('resetting scanner and db')
        scanner.reset()

    log.info("Starting run: %s", scanner.status_msg())

    for ind, game in enumerate(utils.progress_meter(scanner.scan(games, {}))):
        if not db_tracker:
            log.debug("Initializing db tracker manager")
            db_tracker = DBCardRatioTrackerManager(collection,
                                                   args.incremental)
            log.debug("DB tracker manager initialized")

        result = process_game(Game(game))
        for final_ratio_dict, progressive_ratio_dict, win_points in result:
            db_tracker.integrate_results('final', final_ratio_dict, win_points)
            db_tracker.integrate_results('progressive', progressive_ratio_dict,
                                         win_points)

        if args.max_games >= 0 and ind >= args.max_games:
            log.info("Reached max_games of %d", args.max_games)
            break

        if ind % commit_after == 0 and ind > 0:
            start = time.time()
            db_tracker.save()
            scanner.save()
            log.info("Committed calculations to the DB in %5.2fs",
                     time.time() - start)

    log.info("Ending run: %s", scanner.status_msg())

    if db_tracker:
        db_tracker.save()
    scanner.save()
Example #15
def main(args):
    commit_after = 25000
    database = utils.get_mongo_database()
    games = database.games
    collection = database.optimal_card_ratios
    db_tracker = None

    scanner = incremental_scanner.IncrementalScanner('optimal_card_ratios', database)

    if not args.incremental:
        log.warning('resetting scanner and db')
        scanner.reset()

    log.info("Starting run: %s", scanner.status_msg())

    for ind, game in enumerate(
        utils.progress_meter(scanner.scan(games, {}))):
        if not db_tracker:
            log.debug("Initializing db tracker manager")
            db_tracker = DBCardRatioTrackerManager(collection, args.incremental)
            log.debug("DB tracker manager initialized")

        result = process_game(Game(game))
        for final_ratio_dict, progressive_ratio_dict, win_points in result:
            db_tracker.integrate_results('final', final_ratio_dict, win_points)
            db_tracker.integrate_results('progressive', progressive_ratio_dict, win_points)

        if args.max_games >= 0 and ind >= args.max_games:
            log.info("Reached max_games of %d", args.max_games)
            break

        if ind % commit_after == 0 and ind > 0:
            start = time.time()
            db_tracker.save()
            scanner.save()
            log.info("Committed calculations to the DB in %5.2fs", time.time() - start)

    log.info("Ending run: %s", scanner.status_msg())

    if db_tracker:
        db_tracker.save()
    scanner.save()
Example #16
def main(args):
    db = utils.get_mongo_database()
    scanner = incremental_scanner.IncrementalScanner('analyze2', db)

    if not args.incremental:
        log.warning('resetting scanner and db')
        scanner.reset()
        for collection_name, _ in event_detectors:
            db[collection_name].drop()

    log.info("Starting run: %s", scanner.status_msg())
    games_stream = analysis_util.games_stream(scanner, db.games)
    accumulator = EventAccumulator()
    accumulate_card_stats(games_stream, accumulator, args.max_games)

    log.info('saving to database')
    log.debug('saving accumulated stats')
    accumulator.update_db(db)
    log.info('saving the game scanner state')
    scanner.save()
    log.info("Ending run: %s", scanner.status_msg())
Example #17
def main(args):
    db = utils.get_mongo_database()
    scanner = incremental_scanner.IncrementalScanner('analyze2', db)

    if not args.incremental:
        log.warning('resetting scanner and db')
        scanner.reset()
        for collection_name, _ in event_detectors:
            db[collection_name].drop()

    log.info("Starting run: %s", scanner.status_msg())
    games_stream = analysis_util.games_stream(scanner, db.games)
    accumulator = EventAccumulator()
    accumulate_card_stats(games_stream, accumulator, args.max_games)

    log.info('saving to database')
    log.debug('saving accumulated stats')
    accumulator.update_db(db)
    log.info('saving the game scanner state')
    scanner.save()
    log.info("Ending run: %s", scanner.status_msg())
Example #18
def run_trueskill_openings(args, db, log, commit_after=25000):
    games = db.games


    collection = db.trueskill_openings
    player_collection = db.trueskill_players
    # player_collection.remove()
    # collection.remove()
    setup_openings_collection(collection)
    # setup_openings_collection(player_collection)

    opening_skill_table = DbBackedSkillTable(collection)
    # player_skill_table = DbBackedSkillTable(player_collection)

    scanner = incremental_scanner.IncrementalScanner('trueskill', db)
    log.info("Starting run: %s", scanner.status_msg())
    if not args.incremental:
        log.warning('resetting scanner and db')
        scanner.reset()
        collection.drop()

    for ind, game in enumerate(
        utils.progress_meter(scanner.scan(db.games, {}))):
        if (len(game[DECKS]) >= 2 and len(game[DECKS][1][TURNS]) >= 5 and
            (RATING_SYSTEM not in game or
             (RATING_SYSTEM in game and
              'adventure' not in game[RATING_SYSTEM] and
              'unknown' not in game[RATING_SYSTEM]))):
            update_skills_for_game(game, opening_skill_table)
        if ind == args.max_games:
            break

        if ind % commit_after == 0 and ind > 0:
            start = time.time()
            #player_skill_table.save()
            opening_skill_table.save()
            scanner.save()
            log.info("Committed calculations to the DB in %5.2fs", time.time() - start)

    #player_skill_table.save()
    opening_skill_table.save()
    scanner.save()
    log.info("Ending run: %s", scanner.status_msg())
Example #19
def main(parsed_args):
    """ Scan and update buy data"""
    start = time.time()
    db = utils.get_mongo_database()
    games = db.games
    output_db = db

    overall_stats = DeckBuyStats()

    scanner = incremental_scanner.IncrementalScanner(BUYS_COL_NAME, output_db)
    buy_collection = output_db[BUYS_COL_NAME]

    if not parsed_args.incremental:
        log.warning('resetting scanner and db')
        scanner.reset()
        buy_collection.drop()

    start_size = scanner.get_num_games()
    log.info("Starting run: %s", scanner.status_msg())
    do_scan(scanner, games, overall_stats, parsed_args.max_games)
    log.info("Ending run: %s", scanner.status_msg())
    end_size = scanner.get_num_games()

    if parsed_args.incremental:
        existing_overall_data = DeckBuyStats()
        utils.read_object_from_db(existing_overall_data, buy_collection, '')
        overall_stats.merge(existing_overall_data)

        def deck_freq(data_set):
            return data_set[dominioncards.Estate].available.frequency()

        log.info('existing %s decks', deck_freq(existing_overall_data))
        log.info('after merge %s decks', deck_freq(overall_stats))

    utils.write_object_to_db(overall_stats, buy_collection, '')

    scanner.save()
Example #20
def main():
    utils.ensure_exists(output_directory)

    date_of_last_cached_leaderboard = get_date_of_last_cached_leaderboard()
    log.info('date of the last cached leaderboard is %s', date_of_last_cached_leaderboard)

    date_of_current_isotropic_leaderboard = get_date_of_current_isotropic_leaderboard()
    if date_of_current_isotropic_leaderboard is None:
        log.warning('could not determine the date of the current isotropic leaderboard, so please try again later')
        return
    log.info('date of the current isotropic leaderboard is %s', date_of_current_isotropic_leaderboard)

    one_day_delta = datetime.timedelta(1)
    date = date_of_last_cached_leaderboard + one_day_delta

    while date <= date_of_current_isotropic_leaderboard:
        log.info('Processing %s', date)

        if date == date_of_current_isotropic_leaderboard:
            log.info('scraping from isotropic')
            status = run_scrape_function_with_retries(scrape_leaderboard_from_isotropic, date)
        else:
            log.info('scraping from councilroom')
            status = run_scrape_function_with_retries(scrape_leaderboard_from_councilroom, date)

            if status != 200:
                log.info('scraping from bggdl')
                status = run_scrape_function_with_retries(scrape_leaderboard_from_bggdl, date)

        if status == 200:
            pass
        elif status == 404:
            log.warning('file not found, so we will assume that it does not exist, and go to the next day')
        else:
            log.warning('Unexpected status of %s, please try again later', status)
            break

        date += one_day_delta
Example #21
def main():
    filename_pattern = re.compile(r'^(?P<date>\d\d\d\d-\d\d-\d\d)\.html\.bz2$')
    iso_leaderboard_pattern = re.compile(r'<td>(?P<skill_mean>-?\d+\.\d+) &plusmn; ' + \
                                     r'(?P<skill_error>-?\d+\.\d+)</td><td class=c2>' + \
                                     r'(?P<rank>\d+)</td><td class=c>' + \
                                     r'(?P<eligible_games_played>\d+)</td><td>' + \
                                     r'(?P<nickname>[^<]*) <')
    goko_leaderboard_pattern = re.compile(
        r'\s+<td class="leaders-table-item table-item-rank">(?P<rank>\d+)</td>\s*\n' + \
        r'\s*<td class="leaders-table-item table-item-name"><img [^>]*>(?P<nickname>.*)</td>\s*\n' + \
        r'\s*<td class="leaders-table-item table-item-points">(?P<skill_mean>\d+)</td>')

    database = utils.get_mongo_database()
    history_collection = database.leaderboard_history
    scanner_collection = database.scanner

    db_val = scanner_collection.find_one({'_id': 'leaderboard_history'})
    last_date = db_val['last_date'] if db_val else '0000-00-00'

    directory = 'static/leaderboard/'

    filenames = os.listdir(directory)
    filenames.sort()

    bad_leaderboard_dates = utils.get_bad_leaderboard_dates()

    for filename in filenames:
        match = filename_pattern.search(filename)
        if not match:
            continue

        date = match.group('date')

        if date in bad_leaderboard_dates:
            # don't load data from when the leaderboard was messed up
            log.warning("Skipping %s because the leaderboard was messed up", date)
            continue

        if date <= last_date:
            log.warning("Date %s is less than last date %s", date, last_date)
            continue

        log.info('Processing %s', date)

        file_obj = bz2.BZ2File(directory + filename)
        content = file_obj.read().decode('utf-8')
        file_obj.close()

        nickname_to_entry = {}
        num_matches = 0
        last_rank = -1

        pos = 0
        while True:
            match = iso_leaderboard_pattern.search(content, pos)
            if not match:
                break

            num_matches += 1
            skill_mean = float(match.group('skill_mean'))
            skill_error = float(match.group('skill_error'))
            rank = int(match.group('rank'))
            eligible_games_played = int(match.group('eligible_games_played'))
            nickname = match.group('nickname')

            normed_nickname = name_merger.norm_name(nickname)

            if normed_nickname not in nickname_to_entry:
                nickname_to_entry[normed_nickname] = [date, skill_mean, skill_error, rank, eligible_games_played]
            else:
                log.info('normed nickname %s already exists for %s', normed_nickname, date)

            last_rank = rank
            pos = match.end()

        pos = 0
        while True:
            match = goko_leaderboard_pattern.search(content, pos)
            if not match:
                break

            num_matches += 1
            skill_mean = float(match.group('skill_mean'))
            skill_error = 0
            rank = int(match.group('rank'))
            eligible_games_played = 0
            nickname = match.group('nickname')

            normed_nickname = nickname

            if normed_nickname not in nickname_to_entry:
                nickname_to_entry[normed_nickname] = [date, skill_mean, skill_error, rank, eligible_games_played]
            else:
                log.info('normed nickname %s already exists for %s', normed_nickname, date)

            last_rank = rank
            pos = match.end()
        log.info('%d entries matched', num_matches)

        if num_matches == 0:
            log.error('No entries found, so the regex is probably not doing its job anymore.')
            break

        if num_matches != last_rank:
            log.error('ERROR: # entries does not match last rank, so the regex is probably not doing its job anymore.')
            break

        for nickname, data in nickname_to_entry.iteritems():
            history_collection.update({'_id': nickname}, {'$push': {'history': data}}, upsert=True)

        log.info('%d player histories updated', len(nickname_to_entry))

        last_date = date

    scanner_collection.update({'_id': 'leaderboard_history'}, {'$set': {'last_date': last_date}}, upsert=True)
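
# A small, self-contained sketch of how the isotropic leaderboard regex above
# pulls fields out of one table row; the sample row is synthetic.
import re

iso_row = re.compile(r'<td>(?P<skill_mean>-?\d+\.\d+) &plusmn; ' + \
                     r'(?P<skill_error>-?\d+\.\d+)</td><td class=c2>' + \
                     r'(?P<rank>\d+)</td><td class=c>' + \
                     r'(?P<eligible_games_played>\d+)</td><td>' + \
                     r'(?P<nickname>[^<]*) <')

if __name__ == '__main__':
    sample = '<td>35.71 &plusmn; 1.19</td><td class=c2>1</td><td class=c>2543</td><td>somePlayer <a>'
    match = iso_row.search(sample)
    assert match.group('nickname') == 'somePlayer'
    assert int(match.group('rank')) == 1
    assert float(match.group('skill_mean')) == 35.71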
Example #22
def main():
    filename_pattern = re.compile(r'^(?P<date>\d\d\d\d-\d\d-\d\d)\.html\.bz2$')
    leaderboard_pattern = re.compile(r'<td>(?P<skill_mean>-?\d+\.\d+) &plusmn; ' + \
                                     r'(?P<skill_error>-?\d+\.\d+)</td><td class=c2>' + \
                                     r'(?P<rank>\d+)</td><td class=c>' + \
                                     r'(?P<eligible_games_played>\d+)</td><td>' + \
                                     r'(?P<nickname>[^<]*) <')

    database = utils.get_mongo_database()
    history_collection = database.leaderboard_history
    scanner_collection = database.scanner

    db_val = scanner_collection.find_one({'_id': 'leaderboard_history'})
    last_date = db_val['last_date'] if db_val else '0000-00-00'

    directory = 'static/leaderboard/'

    filenames = os.listdir(directory)
    filenames.sort()

    bad_leaderboard_dates = utils.get_bad_leaderboard_dates()

    for filename in filenames:
        match = filename_pattern.search(filename)
        if not match:
            continue

        date = match.group('date')

        if date in bad_leaderboard_dates:
            # don't load data from when the leaderboard was messed up
            log.warning("Skipping %s because the leaderboard was messed up",
                        date)
            continue

        if date <= last_date:
            log.warning("Date %s is less than last date %s", date, last_date)
            continue

        log.info('Processing %s', date)

        file_obj = bz2.BZ2File(directory + filename)
        content = file_obj.read().decode('utf-8')
        file_obj.close()

        nickname_to_entry = {}
        num_matches = 0
        last_rank = -1

        pos = 0
        while True:
            match = leaderboard_pattern.search(content, pos)
            if not match:
                break

            num_matches += 1
            skill_mean = float(match.group('skill_mean'))
            skill_error = float(match.group('skill_error'))
            rank = int(match.group('rank'))
            eligible_games_played = int(match.group('eligible_games_played'))
            nickname = match.group('nickname')

            normed_nickname = name_merger.norm_name(nickname)

            if normed_nickname not in nickname_to_entry:
                nickname_to_entry[normed_nickname] = [
                    date, skill_mean, skill_error, rank, eligible_games_played
                ]
            else:
                log.info('normed nickname %s already exists for %s',
                         normed_nickname, date)

            last_rank = rank
            pos = match.end()

        log.info('%d entries matched', num_matches)

        if num_matches == 0:
            log.error(
                'No entries found, so the regex is probably not doing its job anymore.'
            )
            break

        if num_matches != last_rank:
            log.error(
                'ERROR: # entries does not match last rank, so the regex is probably not doing its job anymore.'
            )
            break

        for nickname, data in nickname_to_entry.iteritems():
            history_collection.update({'_id': nickname},
                                      {'$push': {
                                          'history': data
                                      }},
                                      upsert=True)

        log.info('%d player histories updated', len(nickname_to_entry))

        last_date = date

    scanner_collection.update({'_id': 'leaderboard_history'},
                              {'$set': {
                                  'last_date': last_date
                              }},
                              upsert=True)