def scrape_games(): parser = utils.incremental_date_range_cmd_line_parser() utils.ensure_exists('static/scrape_data') os.chdir('static/scrape_data') args = parser.parse_args() last_month = '' for cur_date in utils.daterange(datetime.date(2010, 10, 15), datetime.date.today()): str_date = time.strftime("%Y%m%d", cur_date.timetuple()) if not utils.includes_day(args, str_date): if DEBUG: print 'skipping', str_date, 'because not in cmd line arg daterange' continue mon = time.strftime("%b%y", cur_date.timetuple()) if mon != last_month: print print mon, cur_date.day*" ", sys.stdout.flush() last_month = mon ret = scrape_date(str_date, cur_date, passive=args.passive) if ret==DOWNLOADED: print 'o', elif ret==REPACKAGED: print 'O', elif ret==ERROR: print '!', elif ret==MISSING: print '_', else: print '.', sys.stdout.flush() print os.chdir('../..')
def main(args, log):
    """Parse raw games out of MongoDB into JSON files under parsed_out/.

    Iterates over day_status documents whose raw games are loaded, skipping
    days outside the requested range and (in incremental mode) days already
    parsed.  Each successfully parsed day is marked with BEEN_PARSED_KEY.

    Args:
        args: parsed command-line namespace (startdate, enddate, incremental).
        log: logger used for progress and error reporting.
    """
    BEEN_PARSED_KEY = 'day_analyzed'
    if args.incremental:
        log.info("Performing incremental parsing from %s to %s",
                 args.startdate, args.enddate)
    else:
        log.info("Performing non-incremental (re)parsing from %s to %s",
                 args.startdate, args.enddate)

    connection = pymongo.MongoClient()
    db = connection.test  # RT: changed.
    raw_games = db.raw_games
    raw_games.ensure_index('game_date')
    utils.ensure_exists('parsed_out')

    day_status_col = db.day_status
    days = day_status_col.find({'raw_games_loaded': True})
    for day in days:
        year_month_day = day['_id']
        if not utils.includes_day(args, year_month_day):
            log.debug(
                "Raw games for %s available in the database but not in date range, skipping",
                year_month_day)
            continue
        if BEEN_PARSED_KEY not in day:
            # Older documents predate the flag; initialize it to False.
            day[BEEN_PARSED_KEY] = False
            day_status_col.save(day)
        if day[BEEN_PARSED_KEY] and args.incremental:
            log.debug(
                "Raw games for %s have been parsed, and we're running incrementally, skipping",
                year_month_day)
            continue
        try:
            log.info("Parsing %s", year_month_day)
            convert_to_json(log, raw_games, year_month_day)
            # BUG FIX: a stray `continue` here made the next two lines
            # unreachable, so days were never recorded as parsed and
            # incremental runs reparsed everything.
            day[BEEN_PARSED_KEY] = True
            day_status_col.save(day)
        except ParseTurnHeaderError as e:
            log.error("ParseTurnHeaderError occurred while parsing %s: %s",
                      year_month_day, e)
            return
        except Exception as e:
            log.error("Exception occurred while parsing %s: %s",
                      year_month_day, e)
            return
def main(args, log):
    """Parse raw games out of MongoDB into JSON files under parsed_out/.

    Iterates over day_status documents whose raw games are loaded, skipping
    days outside the requested range and (in incremental mode) days already
    parsed.  Each successfully parsed day is marked with BEEN_PARSED_KEY.

    Args:
        args: parsed command-line namespace (startdate, enddate, incremental).
        log: logger used for progress and error reporting.
    """
    BEEN_PARSED_KEY = 'day_analyzed'
    if args.incremental:
        log.info("Performing incremental parsing from %s to %s",
                 args.startdate, args.enddate)
    else:
        log.info("Performing non-incremental (re)parsing from %s to %s",
                 args.startdate, args.enddate)

    # Consistency fix: use MongoClient as the sibling version of this
    # function already does (pymongo.Connection was removed in pymongo 3).
    connection = pymongo.MongoClient()
    db = connection.test
    raw_games = db.raw_games
    raw_games.ensure_index('game_date')
    utils.ensure_exists('parsed_out')

    day_status_col = db.day_status
    days = day_status_col.find({'raw_games_loaded': True})
    for day in days:
        year_month_day = day['_id']
        if not utils.includes_day(args, year_month_day):
            log.debug(
                "Raw games for %s available in the database but not in date range, skipping",
                year_month_day)
            continue
        if BEEN_PARSED_KEY not in day:
            # Older documents predate the flag; initialize it to False.
            day[BEEN_PARSED_KEY] = False
            day_status_col.save(day)
        if day[BEEN_PARSED_KEY] and args.incremental:
            log.debug(
                "Raw games for %s have been parsed, and we're running incrementally, skipping",
                year_month_day)
            continue
        try:
            log.info("Parsing %s", year_month_day)
            convert_to_json(log, raw_games, year_month_day)
            # BUG FIX: a stray `continue` here made the next two lines
            # unreachable, so days were never recorded as parsed and
            # incremental runs reparsed everything.
            day[BEEN_PARSED_KEY] = True
            day_status_col.save(day)
        except ParseTurnHeaderError as e:
            log.error("ParseTurnHeaderError occurred while parsing %s: %s",
                      year_month_day, e)
            return
        except Exception as e:
            log.error("Exception occurred while parsing %s: %s",
                      year_month_day, e)
            return
def main(args, log):
    """Load parsed game files from parsed_out/ into the MongoDB games table.

    Files are processed in sorted (chronological) order; a file's first
    eight characters are its YYYYMMDD date, used for range filtering.
    """
    if args.incremental:
        log.info("Performing incremental parsing from %s to %s",
                 args.startdate, args.enddate)
    else:
        log.info("Performing non-incremental (re)parsing from %s to %s",
                 args.startdate, args.enddate)

    games_table = pymongo.Connection().test.games
    # Index the fields that queries hit most often.
    for indexed_field in (PLAYERS, SUPPLY):
        games_table.ensure_index(indexed_field)

    for filename in sorted(os.listdir("parsed_out")):
        yyyymmdd = filename[:8]
        if utils.includes_day(args, yyyymmdd):
            process_file(filename, args.incremental, games_table, log)
        else:
            log.debug(
                "Parsed games for %s available in the filesystem but not in date range, skipping",
                yyyymmdd)
def main(): args = parser.parse_args() connection = utils.get_mongo_connection() games_table = connection.test.games games_table.ensure_index('players') games_table.ensure_index('supply') data_files_to_load = os.listdir('parsed_out') data_files_to_load.sort() find_id = re.compile('game-.*.html') done = set() for fn in data_files_to_load: yyyymmdd = fn[:8] print yyyymmdd if not utils.includes_day(args, yyyymmdd): print 'skipping', fn, 'because not in range' continue if args.incremental: if yyyymmdd in done: print 'skipping', fn, 'because done' continue contents = open('parsed_out/' + fn, 'r').read(100) if contents.strip() == '[]': print "empty contents (make parser not dump empty files?)", \ fn continue first_game_id_match = find_id.search(contents) assert first_game_id_match is not None, ( 'could not get id from %s in file %s' % (contents, fn)) first_game_id = first_game_id_match.group(0) query = {'_id': first_game_id} if games_table.find(query).count(): done.add(yyyymmdd) print 'skipping', yyyymmdd, 'and marking as done' continue else: print first_game_id, str(query), 'not in db, importing' cmd = ('mongoimport -h localhost parsed_out/%s -c ' 'games --jsonArray' % fn) print cmd os.system(cmd)
def main(): # print AnnotateGame(codecs.open(fn, 'r', encoding='utf-8').read()).encode( # 'utf-8') # return args = utils.incremental_date_range_cmd_line_parser().parse_args() print args days = os.listdir("static/scrape_data") days.sort() for year_month_day in days: if not utils.includes_day(args, year_month_day): continue if args.incremental and os.path.exists("parsed_out/%s-0.json" % year_month_day): print "skipping", year_month_day, "because already done" continue try: print "trying", year_month_day convert_to_json(year_month_day) except ParseTurnHeaderError, e: print e return
def main(args, log):
    """Bulk-load parsed game files from parsed_out/ into MongoDB.

    Each file name begins with its YYYYMMDD date; files outside the
    requested date range are skipped with a debug message.
    """
    if args.incremental:
        log.info("Performing incremental parsing from %s to %s",
                 args.startdate, args.enddate)
    else:
        log.info("Performing non-incremental (re)parsing from %s to %s",
                 args.startdate, args.enddate)

    games_table = pymongo.Connection().test.games
    games_table.ensure_index(PLAYERS)
    games_table.ensure_index(SUPPLY)

    filenames = os.listdir('parsed_out')
    filenames.sort()
    for name in filenames:
        day = name[:8]
        if not utils.includes_day(args, day):
            log.debug(
                "Parsed games for %s available in the filesystem but not in date range, skipping",
                day)
            continue
        process_file(name, args.incremental, games_table, log)
def main(): #print AnnotateGame(codecs.open(fn, 'r', encoding='utf-8').read()).encode( # 'utf-8') #return args = utils.incremental_date_range_cmd_line_parser().parse_args() print args days = os.listdir('static/scrape_data') days.sort() for year_month_day in days: if not utils.includes_day(args, year_month_day): continue if args.incremental and os.path.exists( 'parsed_out/%s-0.json' % year_month_day): print 'skipping', year_month_day, 'because already done' continue try: print 'trying', year_month_day convert_to_json(year_month_day) except ParseTurnHeaderError, e: print e return
def scrape_games(): parser = utils.incremental_date_range_cmd_line_parser() utils.ensure_exists('static/scrape_data') os.chdir('static/scrape_data') args = parser.parse_args() last_month = '' yesterday = datetime.date.today() - datetime.timedelta(days=1) #Goko updates logs in real time; wait a day so the list is finalized. for cur_date in utils.daterange(default_startdate, yesterday, reverse=True): str_date = time.strftime("%Y%m%d", cur_date.timetuple()) if not utils.includes_day(args, str_date): if DEBUG: print 'skipping', str_date, 'because not in cmd line arg daterange' continue mon = time.strftime("%b%y", cur_date.timetuple()) if mon != last_month: print print mon, cur_date.day*" ", sys.stdout.flush() last_month = mon ret = scrape_date(str_date, cur_date, passive=args.passive) if ret==DOWNLOADED: print 'o', elif ret==REPACKAGED: print 'O', elif ret==ERROR: print '!', elif ret==MISSING: print '_', else: print '.', sys.stdout.flush() print os.chdir('../..')
def http_error_default(self, *args, **kwargs): urllib.URLopener.http_error_default(self, *args, **kwargs) if __name__ == '__main__': parser = utils.incremental_date_range_cmd_line_parser() args = parser.parse_args() utils.ensure_exists('static/scrape_data') os.chdir('static/scrape_data') for cur_date in utils.daterange(datetime.date(2010, 10, 15), datetime.date.today()): str_date = time.strftime("%Y%m%d", cur_date.timetuple()) if not utils.includes_day(args, str_date): print 'skipping', str_date, 'because not in cmd line arg daterange' continue directory = str_date print str_date games_short_name = str_date + '.all.tar.bz2' saved_games_bundle = directory + '/' + games_short_name if utils.at_least_as_big_as(saved_games_bundle, SMALL_FILE_SIZE): print 'skipping because exists', str_date, saved_games_bundle, \ 'and not small (size=', os.stat(saved_games_bundle).st_size, ')' continue if not os.path.exists(directory): os.mkdir(directory) RemoveSmallFileIfExists(saved_games_bundle) url = IsotropicGamesCollectionUrl(cur_date)
host = 'http://councilroom.com/static/scrape_data/' return host + FormatDate(COUNCILROOM_FORMAT, cur_date) def RemoveSmallFileIfExists(fn): if (os.path.exists(fn) and os.stat(fn).st_size <= SMALL_FILE_SIZE): print 'removing small existing file', fn os.unlink(fn) args = parser.parse_args() for cur_date in utils.daterange(datetime.date(2010, 10, 15), datetime.date.today()): str_date = time.strftime("%Y%m%d", cur_date.timetuple()) if not utils.includes_day(args, str_date): print 'skipping', str_date, 'because not in cmd line arg daterange' continue directory = str_date print str_date games_short_name = str_date + '.all.tar.bz2' saved_games_bundle = directory + '/' + games_short_name if utils.at_least_as_big_as(saved_games_bundle, SMALL_FILE_SIZE): print 'skipping because exists', str_date, saved_games_bundle, \ 'and not small (size=', os.stat(saved_games_bundle).st_size, ')' else: if not os.path.exists(directory): os.mkdir(directory) RemoveSmallFileIfExists(saved_games_bundle) urls_by_priority = [