def run_scrape_function_with_retries(scrape_function, date):
    num_attempts = 0
    while True:
        num_attempts += 1
        status = scrape_function(date)
        if status == 200:
            log.info('successful')
            break
        elif status == 404:
            log.info('file not found')
            break
        elif status == 'leaderboard updated':
            log.warning('the leaderboard was updated after this script was '
                        'started, so re-run this script')
            break
        else:
            if num_attempts < 3:
                log.info('Status was %s, retrying', status)
            else:
                log.error('reached 3 attempts, aborting')
                break
    return status
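# A minimal sketch of the scrape-function contract the retry wrapper above
# assumes: a callable that takes a datetime.date and returns either an HTTP
# status code (200, 404, ...) or the 'leaderboard updated' sentinel string.
# The URL and urllib2 usage here are illustrative assumptions, not the
# project's actual scraper.
def example_scrape_leaderboard(date):
    import urllib2
    url = 'http://example.com/leaderboard/%s.html' % date  # hypothetical URL
    try:
        urllib2.urlopen(url).read()
        return 200
    except urllib2.HTTPError as error:
        return error.code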
def main():
    utils.ensure_exists(output_directory)

    date_of_last_cached_leaderboard = get_date_of_last_cached_leaderboard()
    log.info('date of the last cached leaderboard is %s',
             date_of_last_cached_leaderboard)

    date_of_last_goko_leaderboard = datetime.date.today()
    one_day_delta = datetime.timedelta(1)
    date = date_of_last_cached_leaderboard + one_day_delta

    while date <= date_of_last_goko_leaderboard:
        log.info('Processing %s', date)

        if date == date_of_last_goko_leaderboard:
            log.info('scraping from goko')
            status = run_scrape_function_with_retries(
                scrape_leaderboard_from_goko, date)
        else:
            log.info('scraping from councilroom')
            status = run_scrape_function_with_retries(
                scrape_leaderboard_from_councilroom, date)
            if status != 200 and date <= datetime.date(2013, 1, 1):
                log.info('scraping from bggdl')
                status = run_scrape_function_with_retries(
                    scrape_leaderboard_from_bggdl, date)

        if status == 200:
            pass
        elif status == 404:
            log.warning('file not found, so we will assume that it does not '
                        'exist, and go to the next day')
        else:
            # status may be a string sentinel, so log with %s rather than %d
            log.warning('Unexpected status of %s, please try again later',
                        status)
            break

        date += one_day_delta
def main(parsed_args):
    db = utils.get_mongo_database()
    goal_db = db.goals
    gstats_db = db.goal_stats
    all_goals = goals.goal_check_funcs.keys()
    total_pcount = collections.defaultdict(int)
    goal_scanner = incremental_scanner.IncrementalScanner('goals', db)
    stat_scanner = incremental_scanner.IncrementalScanner('goal_stats', db)

    if not parsed_args.incremental:
        log.warning('resetting scanner and db')
        stat_scanner.reset()
        gstats_db.remove()

    log.info("Starting run: %s", stat_scanner.status_msg())

    # TODO: The following logic doesn't work now that goal calculation
    # doesn't happen with a scanner.
    # if goal_scanner.get_max_game_id() == stat_scanner.get_max_game_id():
    #     log.info("Stats already set! Skip")
    #     exit(0)

    log.info('all_goals %s', all_goals)
    for goal_name in all_goals:
        log.info("Working on %s", goal_name)
        found_goals_cursor = goal_db.find({'goals.goal_name': goal_name},
                                          {'goals.player': 1, '_id': 0})
        total = found_goals_cursor.count()
        log.info("Found %d instances of %s", total, goal_name)

        pcount = collections.defaultdict(int)
        for goal in found_goals_cursor:
            player = goal['goals'][0]['player']
            pcount[player] += 1
            total_pcount[player] += 1

        psorted = sorted(pcount.iteritems(), key=operator.itemgetter(1),
                         reverse=True)
        top = []
        leaders = 0
        i = 0
        while leaders < 3 and i < len(psorted):
            (player, count) = psorted[i]
            players = []
            if player not in AIs.names:
                players = [player]
            i += 1
            while i < len(psorted) and psorted[i][1] == count:
                if psorted[i][0] not in AIs.names:
                    players.append(psorted[i][0])
                i += 1
            leaders += len(players)
            if len(players) > 0:
                top.append((players, count))

        mongo_val = {'_id': goal_name, 'count': total, 'top': top}
        gstats_db.save(mongo_val)

    stat_scanner.set_max_game_id(goal_scanner.get_max_game_id())
    stat_scanner.save()
    log.info("Ending run: %s", stat_scanner.status_msg())
def main(parsed_args): """ Scan and update buy data""" start = time.time() db = utils.get_mongo_database() games = db.games output_db = db overall_stats = DeckBuyStats() scanner = incremental_scanner.IncrementalScanner(BUYS_COL_NAME, output_db) buy_collection = output_db[BUYS_COL_NAME] if not parsed_args.incremental: log.warning('resetting scanner and db') scanner.reset() buy_collection.drop() start_size = scanner.get_num_games() log.info("Starting run: %s", scanner.status_msg()) do_scan(scanner, games, overall_stats, parsed_args.max_games) log.info("Ending run: %s", scanner.status_msg()) end_size = scanner.get_num_games() if parsed_args.incremental: existing_overall_data = DeckBuyStats() utils.read_object_from_db(existing_overall_data, buy_collection, '') overall_stats.merge(existing_overall_data) def deck_freq(data_set): return data_set[dominioncards.Estate].available.frequency() log.info('existing %s decks', deck_freq(existing_overall_data)) log.info('after merge %s decks', deck_freq(overall_stats)) utils.write_object_to_db(overall_stats, buy_collection, '') scanner.save()
def main(args): """ Update analysis statistics. By default, do so incrementally, unless --noincremental argument is given.""" commit_after = 25000 database = utils.get_mongo_database() games = database.games output_collection_name = 'analysis' output_collection = database[output_collection_name] game_analysis = GamesAnalysis() scanner = incremental_scanner.IncrementalScanner(output_collection_name, database) if args.incremental: utils.read_object_from_db(game_analysis, output_collection, '') else: log.warning('resetting scanner and db') scanner.reset() output_file_name = 'static/output/all_games_card_stats.js' if not os.path.exists('static/output'): os.makedirs('static/output') log.info("Starting run: %s", scanner.status_msg()) for idx, raw_game in enumerate( utils.progress_meter(scanner.scan(games, {}))): try: game_analysis.analyze_game(Game(raw_game)) if args.max_games >= 0 and idx >= args.max_games: log.info("Reached max_games of %d", args.max_games) break if idx % commit_after == 0 and idx > 0: start = time.time() game_analysis.max_game_id = scanner.get_max_game_id() game_analysis.num_games = scanner.get_num_games() utils.write_object_to_db(game_analysis, output_collection, '') scanner.save() log.info("Committed calculations to the DB in %5.2fs", time.time() - start) except int, exception: log.exception('Exception occurred for %s in raw game %s', Game(raw_game).isotropic_url(), raw_game) raise
def save_file(date, data, is_gzipped):
    if is_gzipped:
        f = gzip.GzipFile(fileobj=StringIO.StringIO(data))
        try:
            decompressed_data = f.read()
            data = decompressed_data
        except IOError as ioe:
            log.warning('Received data was not in gzip format')
        f.close()

    # re-compress with bz2 for the local cache; open in binary mode since
    # the payload is compressed bytes
    data = bz2.compress(data)
    f = open(output_directory + str(date) + '.html.bz2', 'wb')
    f.write(data)
    f.close()
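# Usage sketch for save_file, assuming output_directory is a module-level
# path ending in '/'. The Content-Encoding check is one illustrative way to
# detect a gzipped body from urllib2, not necessarily how the scraper does it.
def example_fetch_and_cache(date):
    import urllib2
    url = 'http://example.com/leaderboard/%s.html' % date  # hypothetical URL
    response = urllib2.urlopen(url)
    data = response.read()
    is_gzipped = response.info().get('Content-Encoding') == 'gzip'
    save_file(date, data, is_gzipped)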
def main(args): """ Update analysis statistics. By default, do so incrementally, unless --noincremental argument is given.""" commit_after = 25000 database = utils.get_mongo_database() games = database.games output_collection_name = 'analysis' output_collection = database[output_collection_name] game_analysis = GamesAnalysis() scanner = incremental_scanner.IncrementalScanner(output_collection_name, database) if args.incremental: utils.read_object_from_db(game_analysis, output_collection, '') else: log.warning('resetting scanner and db') scanner.reset() output_file_name = 'static/output/all_games_card_stats.js' if not os.path.exists('static/output'): os.makedirs('static/output') log.info("Starting run: %s", scanner.status_msg()) for idx, raw_game in enumerate(utils.progress_meter(scanner.scan(games, {}))): try: game_analysis.analyze_game(Game(raw_game)) if args.max_games >= 0 and idx >= args.max_games: log.info("Reached max_games of %d", args.max_games) break if idx % commit_after == 0 and idx > 0: start = time.time() game_analysis.max_game_id = scanner.get_max_game_id() game_analysis.num_games = scanner.get_num_games() utils.write_object_to_db(game_analysis, output_collection, '') scanner.save() log.info("Committed calculations to the DB in %5.2fs", time.time() - start) except int, exception: log.exception('Exception occurred for %s in raw game %s', Game(raw_game).isotropic_url(), raw_game) raise
def watch_and_log(signature, log_interval=15, timeout=600):
    """Invoke the celery task via the passed signature, wait for it and all
    of its children to complete, and log progress along the way.

    log_interval: number of seconds between checking and logging the status

    timeout: number of seconds after which to return, when there have been
    no subtask status updates
    """
    task_name = signature.task

    log.info("Calling background task %s", task_name)
    async_result = signature.apply_async()

    all_done = False
    last_status_summary = None
    last_status_update = time.time()

    while not all_done:
        # Wait for the log_interval, then check the status
        time.sleep(log_interval)

        c = collections.Counter()
        try:
            # Setting intermediate to False should cause the
            # IncompleteStream exception to be thrown if the task and
            # its children aren't all complete.
            for parent, child in async_result.iterdeps(intermediate=False):
                c[child.state] += 1
            all_done = True
        except celery.exceptions.IncompleteStream:
            status_summary = summarize_task_status(c)
            log.info("Waiting for %s: %s", task_name, status_summary)

            # Check on timeout condition
            if (last_status_summary is not None
                and status_summary == last_status_summary
                and (time.time() - last_status_update) > timeout):
                break
            else:
                last_status_summary = status_summary
                last_status_update = time.time()

    if all_done:
        log.info("Done with background task %s: %s", task_name,
                 summarize_task_status(c))
    else:
        log.warning("Returning due to timeout during background task %s: %s",
                    task_name, summarize_task_status(c))

    return async_result
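# watch_and_log expects a celery signature, e.g. some_task.s(args); the task
# name below is a hypothetical example:
#
#   result = watch_and_log(parse_games.s('2013-01-01'), log_interval=30)
#
# summarize_task_status is assumed to render the Counter of child-task states
# as a readable string; a minimal sketch under that assumption:
def summarize_task_status(c):
    """Return a string like 'PENDING: 2, SUCCESS: 10' from a state Counter."""
    return ', '.join('%s: %d' % (state, count)
                     for state, count in sorted(c.items()))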
def main(args):
    commit_after = 25000
    database = utils.get_mongo_database()
    games = database.games
    collection = database.optimal_card_ratios
    db_tracker = None

    scanner = incremental_scanner.IncrementalScanner('optimal_card_ratios',
                                                     database)

    if not args.incremental:
        log.warning('resetting scanner and db')
        scanner.reset()

    log.info("Starting run: %s", scanner.status_msg())

    for ind, game in enumerate(
        utils.progress_meter(scanner.scan(games, {}))):
        if not db_tracker:
            log.debug("Initializing db tracker manager")
            db_tracker = DBCardRatioTrackerManager(collection,
                                                   args.incremental)
            log.debug("DB tracker manager initialized")

        result = process_game(Game(game))
        for final_ratio_dict, progressive_ratio_dict, win_points in result:
            db_tracker.integrate_results('final', final_ratio_dict,
                                         win_points)
            db_tracker.integrate_results('progressive',
                                         progressive_ratio_dict, win_points)

        if args.max_games >= 0 and ind >= args.max_games:
            log.info("Reached max_games of %d", args.max_games)
            break

        if ind % commit_after == 0 and ind > 0:
            start = time.time()
            db_tracker.save()
            scanner.save()
            log.info("Committed calculations to the DB in %5.2fs",
                     time.time() - start)

    log.info("Ending run: %s", scanner.status_msg())
    if db_tracker:
        db_tracker.save()
    scanner.save()
def main(args):
    db = utils.get_mongo_database()
    scanner = incremental_scanner.IncrementalScanner('analyze2', db)

    if not args.incremental:
        log.warning('resetting scanner and db')
        scanner.reset()
        for collection_name, _ in event_detectors:
            db[collection_name].drop()

    log.info("Starting run: %s", scanner.status_msg())

    games_stream = analysis_util.games_stream(scanner, db.games)
    accumulator = EventAccumulator()
    accumulate_card_stats(games_stream, accumulator, args.max_games)

    log.info('saving to database')
    log.debug('saving accumulated stats')
    accumulator.update_db(db)

    log.info('saving the game scanner state')
    scanner.save()
    log.info("Ending run: %s", scanner.status_msg())
def run_trueskill_openings(args, db, log, commit_after=25000):
    games = db.games
    collection = db.trueskill_openings
    player_collection = db.trueskill_players
    # player_collection.remove()
    # collection.remove()
    setup_openings_collection(collection)
    # setup_openings_collection(player_collection)

    opening_skill_table = DbBackedSkillTable(collection)
    # player_skill_table = DbBackedSkillTable(player_collection)

    scanner = incremental_scanner.IncrementalScanner('trueskill', db)
    log.info("Starting run: %s", scanner.status_msg())

    if not args.incremental:
        log.warning('resetting scanner and db')
        scanner.reset()
        collection.drop()

    for ind, game in enumerate(
        utils.progress_meter(scanner.scan(db.games, {}))):
        if (len(game[DECKS]) >= 2
            and len(game[DECKS][1][TURNS]) >= 5
            and (RATING_SYSTEM not in game
                 or (RATING_SYSTEM in game
                     and 'adventure' not in game[RATING_SYSTEM]
                     and 'unknown' not in game[RATING_SYSTEM]))):
            update_skills_for_game(game, opening_skill_table)

        if ind == args.max_games:
            break

        if ind % commit_after == 0 and ind > 0:
            start = time.time()
            # player_skill_table.save()
            opening_skill_table.save()
            scanner.save()
            log.info("Committed calculations to the DB in %5.2fs",
                     time.time() - start)

    # player_skill_table.save()
    opening_skill_table.save()
    scanner.save()
    log.info("Ending run: %s", scanner.status_msg())
def main():
    utils.ensure_exists(output_directory)

    date_of_last_cached_leaderboard = get_date_of_last_cached_leaderboard()
    log.info('date of the last cached leaderboard is %s',
             date_of_last_cached_leaderboard)

    date_of_current_isotropic_leaderboard = \
        get_date_of_current_isotropic_leaderboard()
    if date_of_current_isotropic_leaderboard is None:
        log.warning('could not determine the date of the current isotropic '
                    'leaderboard, so please try again later')
        return
    log.info('date of the current isotropic leaderboard is %s',
             date_of_current_isotropic_leaderboard)

    one_day_delta = datetime.timedelta(1)
    date = date_of_last_cached_leaderboard + one_day_delta

    while date <= date_of_current_isotropic_leaderboard:
        log.info('Processing %s', date)

        if date == date_of_current_isotropic_leaderboard:
            log.info('scraping from isotropic')
            status = run_scrape_function_with_retries(
                scrape_leaderboard_from_isotropic, date)
        else:
            log.info('scraping from councilroom')
            status = run_scrape_function_with_retries(
                scrape_leaderboard_from_councilroom, date)
            if status != 200:
                log.info('scraping from bggdl')
                status = run_scrape_function_with_retries(
                    scrape_leaderboard_from_bggdl, date)

        if status == 200:
            pass
        elif status == 404:
            log.warning('file not found, so we will assume that it does not '
                        'exist, and go to the next day')
        else:
            # status may be the 'leaderboard updated' sentinel, so use %s
            log.warning('Unexpected status of %s, please try again later',
                        status)
            break

        date += one_day_delta
def main():
    filename_pattern = re.compile(r'^(?P<date>\d\d\d\d-\d\d-\d\d)\.html\.bz2$')
    iso_leaderboard_pattern = re.compile(
        r'<td>(?P<skill_mean>-?\d+\.\d+) ± '
        r'(?P<skill_error>-?\d+\.\d+)</td><td class=c2>'
        r'(?P<rank>\d+)</td><td class=c>'
        r'(?P<eligible_games_played>\d+)</td><td>'
        r'(?P<nickname>[^<]*) <')
    goko_leaderboard_pattern = re.compile(
        r'\s+<td class="leaders-table-item table-item-rank">(?P<rank>\d+)</td>\s*\n'
        r'\s*<td class="leaders-table-item table-item-name"><img [^>]*>(?P<nickname>.*)</td>\s*\n'
        r'\s*<td class="leaders-table-item table-item-points">(?P<skill_mean>\d+)</td>')

    database = utils.get_mongo_database()
    history_collection = database.leaderboard_history
    scanner_collection = database.scanner

    db_val = scanner_collection.find_one({'_id': 'leaderboard_history'})
    last_date = db_val['last_date'] if db_val else '0000-00-00'

    directory = 'static/leaderboard/'
    filenames = os.listdir(directory)
    filenames.sort()

    bad_leaderboard_dates = utils.get_bad_leaderboard_dates()

    for filename in filenames:
        match = filename_pattern.search(filename)
        if not match:
            continue

        date = match.group('date')

        if date in bad_leaderboard_dates:
            # don't load data from when the leaderboard was messed up
            log.warning("Skipping %s because the leaderboard was messed up",
                        date)
            continue

        if date <= last_date:
            log.warning("Date %s is less than last date %s", date, last_date)
            continue

        log.info('Processing %s', date)

        file_obj = bz2.BZ2File(directory + filename)
        content = file_obj.read().decode('utf-8')
        file_obj.close()

        nickname_to_entry = {}
        num_matches = 0
        last_rank = -1

        pos = 0
        while True:
            match = iso_leaderboard_pattern.search(content, pos)
            if not match:
                break

            num_matches += 1
            skill_mean = float(match.group('skill_mean'))
            skill_error = float(match.group('skill_error'))
            rank = int(match.group('rank'))
            eligible_games_played = int(match.group('eligible_games_played'))
            nickname = match.group('nickname')
            normed_nickname = name_merger.norm_name(nickname)

            if normed_nickname not in nickname_to_entry:
                nickname_to_entry[normed_nickname] = [date, skill_mean,
                                                      skill_error, rank,
                                                      eligible_games_played]
            else:
                log.info('normed nickname %s already exists for %s',
                         normed_nickname, date)

            last_rank = rank
            pos = match.end()

        pos = 0
        while True:
            match = goko_leaderboard_pattern.search(content, pos)
            if not match:
                break

            num_matches += 1
            skill_mean = float(match.group('skill_mean'))
            skill_error = 0
            rank = int(match.group('rank'))
            eligible_games_played = 0
            nickname = match.group('nickname')
            normed_nickname = nickname

            if normed_nickname not in nickname_to_entry:
                nickname_to_entry[normed_nickname] = [date, skill_mean,
                                                      skill_error, rank,
                                                      eligible_games_played]
            else:
                log.info('normed nickname %s already exists for %s',
                         normed_nickname, date)

            last_rank = rank
            pos = match.end()

        log.info('%d entries matched', num_matches)

        if num_matches == 0:
            log.error('No entries found, so the regex is probably not doing '
                      'its job anymore.')
            break

        if num_matches != last_rank:
            log.error('ERROR: # entries does not match last rank, so the '
                      'regex is probably not doing its job anymore.')
            break

        for nickname, data in nickname_to_entry.iteritems():
            history_collection.update({'_id': nickname},
                                      {'$push': {'history': data}},
                                      upsert=True)
        log.info('%d player histories updated', len(nickname_to_entry))

        last_date = date

    scanner_collection.update({'_id': 'leaderboard_history'},
                              {'$set': {'last_date': last_date}},
                              upsert=True)
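# A quick sanity check for the isotropic row regex above, using a hand-written
# row in the format the pattern expects (the sample HTML is an assumption
# derived from the regex, not a captured page):
def _demo_iso_pattern():
    pattern = re.compile(r'<td>(?P<skill_mean>-?\d+\.\d+) ± '
                         r'(?P<skill_error>-?\d+\.\d+)</td><td class=c2>'
                         r'(?P<rank>\d+)</td><td class=c>'
                         r'(?P<eligible_games_played>\d+)</td><td>'
                         r'(?P<nickname>[^<]*) <')
    sample = ('<td>25.31 ± 0.87</td><td class=c2>1</td>'
              '<td class=c>1234</td><td>somePlayer <')
    match = pattern.search(sample)
    assert match.group('nickname') == 'somePlayer'
    assert int(match.group('rank')) == 1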