def scrape_games(): parser = utils.incremental_date_range_cmd_line_parser() utils.ensure_exists('static/scrape_data') os.chdir('static/scrape_data') args = parser.parse_args() last_month = '' for cur_date in utils.daterange(datetime.date(2010, 10, 15), datetime.date.today()): str_date = time.strftime("%Y%m%d", cur_date.timetuple()) if not utils.includes_day(args, str_date): if DEBUG: print 'skipping', str_date, 'because not in cmd line arg daterange' continue mon = time.strftime("%b%y", cur_date.timetuple()) if mon != last_month: print print mon, cur_date.day*" ", sys.stdout.flush() last_month = mon ret = scrape_date(str_date, cur_date, passive=args.passive) if ret==DOWNLOADED: print 'o', elif ret==REPACKAGED: print 'O', elif ret==ERROR: print '!', elif ret==MISSING: print '_', else: print '.', sys.stdout.flush() print os.chdir('../..')
def main(): # print AnnotateGame(codecs.open(fn, 'r', encoding='utf-8').read()).encode( # 'utf-8') # return args = utils.incremental_date_range_cmd_line_parser().parse_args() print args days = os.listdir("static/scrape_data") days.sort() for year_month_day in days: if not utils.includes_day(args, year_month_day): continue if args.incremental and os.path.exists("parsed_out/%s-0.json" % year_month_day): print "skipping", year_month_day, "because already done" continue try: print "trying", year_month_day convert_to_json(year_month_day) except ParseTurnHeaderError, e: print e return
def main(): #print AnnotateGame(codecs.open(fn, 'r', encoding='utf-8').read()).encode( # 'utf-8') #return args = utils.incremental_date_range_cmd_line_parser().parse_args() print args days = os.listdir('static/scrape_data') days.sort() for year_month_day in days: if not utils.includes_day(args, year_month_day): continue if args.incremental and os.path.exists( 'parsed_out/%s-0.json' % year_month_day): print 'skipping', year_month_day, 'because already done' continue try: print 'trying', year_month_day convert_to_json(year_month_day) except ParseTurnHeaderError, e: print e return
def scrape_games(): parser = utils.incremental_date_range_cmd_line_parser() utils.ensure_exists('static/scrape_data') os.chdir('static/scrape_data') args = parser.parse_args() last_month = '' yesterday = datetime.date.today() - datetime.timedelta(days=1) #Goko updates logs in real time; wait a day so the list is finalized. for cur_date in utils.daterange(default_startdate, yesterday, reverse=True): str_date = time.strftime("%Y%m%d", cur_date.timetuple()) if not utils.includes_day(args, str_date): if DEBUG: print 'skipping', str_date, 'because not in cmd line arg daterange' continue mon = time.strftime("%b%y", cur_date.timetuple()) if mon != last_month: print print mon, cur_date.day*" ", sys.stdout.flush() last_month = mon ret = scrape_date(str_date, cur_date, passive=args.passive) if ret==DOWNLOADED: print 'o', elif ret==REPACKAGED: print 'O', elif ret==ERROR: print '!', elif ret==MISSING: print '_', else: print '.', sys.stdout.flush() print os.chdir('../..')
#!/usr/bin/python import logging import logging.handlers import os import os.path import pymongo import re import sys import utils from keys import * parser = utils.incremental_date_range_cmd_line_parser() find_id = re.compile("game-.*.html") def process_file(filename, incremental, games_table, log): yyyymmdd = filename[:8] if incremental: contents = open("parsed_out/" + filename, "r").read() if contents.strip() == "[]": log.warning("empty contents in %s (make parser not dump empty files?)", filename) return assert find_id.search(contents), "could not get id from %s in file %s" % (contents[:100], filename) found_all = True for match in find_id.finditer(contents):
#!/usr/bin/python import os import pymongo import re import sys import argparse import utils parser = utils.incremental_date_range_cmd_line_parser() def main(): args = parser.parse_args() games_table = pymongo.Connection().test.games games_table.ensure_index('players') games_table.ensure_index('supply') data_files_to_load = os.listdir('parsed_out') data_files_to_load.sort() find_id = re.compile('game-.*.html') done = set() for fn in data_files_to_load: yyyymmdd = fn[:8] print yyyymmdd if not utils.includes_day(args, yyyymmdd): print 'skipping', fn, 'because not in range' continue if args.incremental: if yyyymmdd in done: print 'skipping', fn, 'because done'
#!/usr/bin/python import logging import logging.handlers import os import os.path import pymongo import re import sys import utils from keys import * parser = utils.incremental_date_range_cmd_line_parser() find_id = re.compile('game-.*.html') def process_file(filename, incremental, games_table, log): yyyymmdd = filename[:8] if incremental: contents = open('parsed_out/' + filename, 'r').read() if contents.strip() == '[]': log.warning( "empty contents in %s (make parser not dump empty files?)", filename) return assert find_id.search(contents), ( 'could not get id from %s in file %s' % (contents[:100], filename))