Example #1
def scrape_games():
    parser = utils.incremental_date_range_cmd_line_parser()
    utils.ensure_exists('static/scrape_data')
    os.chdir('static/scrape_data')

    args = parser.parse_args()
    last_month = ''

    for cur_date in utils.daterange(datetime.date(2010, 10, 15), 
                                    datetime.date.today()):
        str_date = time.strftime("%Y%m%d", cur_date.timetuple())
        if not utils.includes_day(args, str_date):
            if DEBUG:
                print 'skipping', str_date, 'because not in cmd line arg daterange'
            continue
        mon = time.strftime("%b%y", cur_date.timetuple())
        if mon != last_month:
            print
            print mon, cur_date.day*"  ",
            sys.stdout.flush()
            last_month = mon
        ret = scrape_date(str_date, cur_date, passive=args.passive)
        if ret==DOWNLOADED:
            print 'o',
        elif ret==REPACKAGED:
            print 'O',
        elif ret==ERROR:
            print '!',
        elif ret==MISSING:
            print '_',
        else:
            print '.',
        sys.stdout.flush()
    print
    os.chdir('../..')
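
Example #1 leans on several helpers from the project's utils module that are
not shown on this page. A minimal sketch of what they might look like,
inferred from the call sites above and in the later examples (a hypothetical
reconstruction, not the project's actual module):

import datetime
import os


def daterange(start, end, reverse=False):
    # Yield each date from start (inclusive) to end (exclusive).
    days = [start + datetime.timedelta(days=i)
            for i in range((end - start).days)]
    return reversed(days) if reverse else days


def includes_day(args, yyyymmdd):
    # True when the YYYYMMDD string falls inside the parsed date range.
    return args.startdate <= yyyymmdd <= args.enddate


def ensure_exists(directory):
    # Create the directory if it is not already present.
    if not os.path.exists(directory):
        os.makedirs(directory)
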
Example #2
def main(args, log):
    BEEN_PARSED_KEY = 'day_analyzed'

    if args.incremental:
        log.info("Performing incremental parsing from %s to %s",
                 args.startdate, args.enddate)
    else:
        log.info("Performing non-incremental (re)parsing from %s to %s",
                 args.startdate, args.enddate)

    connection = pymongo.MongoClient()
    db = connection.test  # use the 'test' database
    raw_games = db.raw_games
    raw_games.ensure_index('game_date')

    utils.ensure_exists('parsed_out')

    day_status_col = db.day_status
    days = day_status_col.find({'raw_games_loaded': True})

    for day in days:
        year_month_day = day['_id']

        if not utils.includes_day(args, year_month_day):
            log.debug(
                "Raw games for %s available in the database but not in date range, skipping",
                year_month_day)
            continue

        if BEEN_PARSED_KEY not in day:
            day[BEEN_PARSED_KEY] = False
            day_status_col.save(day)

        if day[BEEN_PARSED_KEY] and args.incremental:
            log.debug(
                "Raw games for %s have been parsed, and we're running incrementally, skipping",
                year_month_day)
            continue

        try:
            log.info("Parsing %s", year_month_day)
            convert_to_json(log, raw_games, year_month_day)
            day[BEEN_PARSED_KEY] = True
            day_status_col.save(day)
        except ParseTurnHeaderError as e:
            log.error("ParseTurnHeaderError occurred while parsing %s: %s",
                      year_month_day, e)
            return
        except Exception as e:
            log.error("Exception occurred while parsing %s: %s",
                      year_month_day, e)
            return
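
ensure_index and save in this example come from the PyMongo 2.x API and were
removed in PyMongo 3. A minimal sketch of the modern equivalents, assuming
the same collection handles:

raw_games.create_index('game_date')
day_status_col.replace_one({'_id': day['_id']}, day, upsert=True)
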
Example #3
def main(args, log):
    BEEN_PARSED_KEY = 'day_analyzed'

    if args.incremental:
        log.info("Performing incremental parsing from %s to %s", args.startdate, args.enddate)
    else:
        log.info("Performing non-incremental (re)parsing from %s to %s", args.startdate, args.enddate)

    connection = pymongo.Connection()
    db = connection.test
    raw_games = db.raw_games
    raw_games.ensure_index('game_date')

    utils.ensure_exists('parsed_out')

    day_status_col = db.day_status
    days = day_status_col.find({'raw_games_loaded': True})

    for day in days:
        year_month_day = day['_id']

        if not utils.includes_day(args, year_month_day):
            log.debug("Raw games for %s available in the database but not in date range, skipping", year_month_day)
            continue

        if BEEN_PARSED_KEY not in day:
            day[BEEN_PARSED_KEY] = False
            day_status_col.save(day)

        if day[BEEN_PARSED_KEY] and args.incremental:
            log.debug("Raw games for %s have been parsed, and we're running incrementally, skipping", year_month_day)
            continue

        try:
            log.info("Parsing %s", year_month_day)
            convert_to_json(log, raw_games, year_month_day)
            day[BEEN_PARSED_KEY] = True
            day_status_col.save(day)
        except ParseTurnHeaderError as e:
            log.error("ParseTurnHeaderError occurred while parsing %s: %s", year_month_day, e)
            return
        except Exception as e:
            log.error("Exception occurred while parsing %s: %s", year_month_day, e)
            return
Example #4
def main(args, log):

    if args.incremental:
        log.info("Performing incremental parsing from %s to %s", args.startdate, args.enddate)
    else:
        log.info("Performing non-incremental (re)parsing from %s to %s", args.startdate, args.enddate)

    games_table = pymongo.Connection().test.games
    games_table.ensure_index(PLAYERS)
    games_table.ensure_index(SUPPLY)
    data_files_to_load = os.listdir("parsed_out")
    data_files_to_load.sort()

    for fn in data_files_to_load:
        yyyymmdd = fn[:8]
        if not utils.includes_day(args, yyyymmdd):
            log.debug("Parsed games for %s available in the filesystem but not in date range, skipping", yyyymmdd)
            continue
        process_file(fn, args.incremental, games_table, log)
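
process_file is defined elsewhere in the project. A hypothetical sketch that
is consistent with the call above, assuming each parsed_out file holds a
JSON array of game documents keyed by _id:

import json


def process_file(fn, incremental, games_table, log):
    # Load one day's worth of parsed games and write them to MongoDB.
    with open('parsed_out/' + fn) as f:
        games = json.load(f)
    loaded = 0
    for game in games:
        if incremental and games_table.find_one({'_id': game['_id']}):
            continue  # already in the table; skip under incremental mode
        games_table.save(game)  # PyMongo 2.x upsert by _id
        loaded += 1
    log.info("Loaded %d of %d games from %s", loaded, len(games), fn)
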
Example #5
def main():
    args = parser.parse_args()
    connection = utils.get_mongo_connection()
    games_table = connection.test.games
    games_table.ensure_index('players')
    games_table.ensure_index('supply')
    data_files_to_load = os.listdir('parsed_out')
    data_files_to_load.sort()
    find_id = re.compile(r'game-.*\.html')
    done = set()
    for fn in data_files_to_load:
        yyyymmdd = fn[:8]
        print yyyymmdd
        if not utils.includes_day(args, yyyymmdd):
            print 'skipping', fn, 'because not in range'
            continue

        if args.incremental:
            if yyyymmdd in done:
                print 'skipping', fn, 'because done'
                continue
            contents = open('parsed_out/' + fn, 'r').read(100)
            if contents.strip() == '[]':
                print "empty contents (make parser not dump empty files?)", \
                      fn
                continue
            first_game_id_match = find_id.search(contents)
            assert first_game_id_match is not None, (
                'could not get id from %s in file %s' % (contents, fn))
            first_game_id = first_game_id_match.group(0)
            query = {'_id': first_game_id}
            if games_table.find(query).count():
                done.add(yyyymmdd)
                print 'skipping', yyyymmdd, 'and marking as done'
                continue
            else:
                print first_game_id, str(query), 'not in db, importing'

        cmd = ('mongoimport -h localhost parsed_out/%s -c '
               'games --jsonArray' % fn)
        print cmd
        os.system(cmd)
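
os.system interpolates fn directly into a shell string. A safer variant with
the same flags (a sketch, assuming mongoimport is on the PATH) passes an
explicit argument list so unusual filenames cannot break the command:

import subprocess

subprocess.check_call(['mongoimport', '-h', 'localhost',
                       'parsed_out/%s' % fn,
                       '-c', 'games', '--jsonArray'])
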
Example #7
def main():
    # print AnnotateGame(codecs.open(fn, 'r', encoding='utf-8').read()).encode(
    #    'utf-8')
    # return
    args = utils.incremental_date_range_cmd_line_parser().parse_args()
    print args
    days = os.listdir("static/scrape_data")
    days.sort()
    for year_month_day in days:
        if not utils.includes_day(args, year_month_day):
            continue

        if args.incremental and os.path.exists("parsed_out/%s-0.json" % year_month_day):
            print "skipping", year_month_day, "because already done"
            continue

        try:
            print "trying", year_month_day
            convert_to_json(year_month_day)
        except ParseTurnHeaderError as e:
            print e
            return
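
utils.incremental_date_range_cmd_line_parser is shared by most of these
scripts. A hypothetical sketch based only on the attributes the examples
read from the parsed arguments (startdate, enddate, incremental, passive):

import argparse


def incremental_date_range_cmd_line_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument('--startdate', default='00000000',
                        help='YYYYMMDD lower bound, inclusive')
    parser.add_argument('--enddate', default='99999999',
                        help='YYYYMMDD upper bound, inclusive')
    parser.add_argument('--incremental', action='store_true',
                        help='skip days that are already done')
    parser.add_argument('--passive', action='store_true',
                        help='passed through to scrape_date')
    return parser
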
Example #8
def main(args, log):

    if args.incremental:
        log.info("Performing incremental parsing from %s to %s",
                 args.startdate, args.enddate)
    else:
        log.info("Performing non-incremental (re)parsing from %s to %s",
                 args.startdate, args.enddate)

    games_table = pymongo.Connection().test.games
    games_table.ensure_index(PLAYERS)
    games_table.ensure_index(SUPPLY)
    data_files_to_load = os.listdir('parsed_out')
    data_files_to_load.sort()

    for fn in data_files_to_load:
        yyyymmdd = fn[:8]
        if not utils.includes_day(args, yyyymmdd):
            log.debug(
                "Parsed games for %s available in the filesystem but not in date range, skipping",
                yyyymmdd)
            continue
        process_file(fn, args.incremental, games_table, log)
Example #9
def main():
    #print AnnotateGame(codecs.open(fn, 'r', encoding='utf-8').read()).encode(
    #    'utf-8')
    #return
    args = utils.incremental_date_range_cmd_line_parser().parse_args()
    print args
    days = os.listdir('static/scrape_data')
    days.sort()
    for year_month_day in days:
        if not utils.includes_day(args, year_month_day):
            continue

        if args.incremental and os.path.exists(
                'parsed_out/%s-0.json' % year_month_day):
            print 'skipping', year_month_day, 'because already done'
            continue

        try:
            print 'trying', year_month_day
            convert_to_json(year_month_day)
        except ParseTurnHeaderError as e:
            print e
            return
Example #10
def scrape_games():
    parser = utils.incremental_date_range_cmd_line_parser()
    utils.ensure_exists('static/scrape_data')
    os.chdir('static/scrape_data')

    args = parser.parse_args()
    last_month = ''
    
    yesterday = datetime.date.today() - datetime.timedelta(days=1)
    # Goko updates logs in real time; wait a day so the list is finalized.

    for cur_date in utils.daterange(default_startdate, yesterday, reverse=True):
        str_date = time.strftime("%Y%m%d", cur_date.timetuple())
        if not utils.includes_day(args, str_date):
            if DEBUG:
                print 'skipping', str_date, 'because not in cmd line arg daterange'
            continue
        mon = time.strftime("%b%y", cur_date.timetuple())
        if mon != last_month:
            print
            print mon, cur_date.day*"  ",
            sys.stdout.flush()
            last_month = mon
        ret = scrape_date(str_date, cur_date, passive=args.passive)
        if ret==DOWNLOADED:
            print 'o',
        elif ret==REPACKAGED:
            print 'O',
        elif ret==ERROR:
            print '!',
        elif ret==MISSING:
            print '_',
        else:
            print '.',
        sys.stdout.flush()
    print
    os.chdir('../..')
Example #11
    def http_error_default(self, *args, **kwargs):
        # Defer to urllib.URLopener's default handler, which raises
        # IOError on HTTP errors instead of returning the error page.
        urllib.URLopener.http_error_default(self, *args, **kwargs)


if __name__ == '__main__':
    parser = utils.incremental_date_range_cmd_line_parser()
    args = parser.parse_args()

    utils.ensure_exists('static/scrape_data')
    os.chdir('static/scrape_data')

    for cur_date in utils.daterange(datetime.date(2010, 10, 15),
                                    datetime.date.today()):
        str_date = time.strftime("%Y%m%d", cur_date.timetuple())
        if not utils.includes_day(args, str_date):
            print 'skipping', str_date, 'because not in cmd line arg daterange'
            continue
        directory = str_date
        print str_date
        games_short_name = str_date + '.all.tar.bz2'
        saved_games_bundle = directory + '/' + games_short_name
        if utils.at_least_as_big_as(saved_games_bundle, SMALL_FILE_SIZE):
            print 'skipping because exists', str_date, saved_games_bundle, \
                'and not small (size=', os.stat(saved_games_bundle).st_size, ')'
            continue
        if not os.path.exists(directory):
            os.mkdir(directory)
        RemoveSmallFileIfExists(saved_games_bundle)

        url = IsotropicGamesCollectionUrl(cur_date)
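
utils.at_least_as_big_as is not shown either. A plausible one-liner matching
its call sites here and in Example #12 (hypothetical):

import os


def at_least_as_big_as(fn, min_file_size):
    # True when fn exists and is at least min_file_size bytes.
    return os.path.exists(fn) and os.stat(fn).st_size >= min_file_size
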
Example #12
    host = 'http://councilroom.com/static/scrape_data/'
    return host + FormatDate(COUNCILROOM_FORMAT, cur_date)


def RemoveSmallFileIfExists(fn):
    if (os.path.exists(fn) and os.stat(fn).st_size <= SMALL_FILE_SIZE):
        print 'removing small existing file', fn
        os.unlink(fn)


args = parser.parse_args()

for cur_date in utils.daterange(datetime.date(2010, 10, 15),
                                datetime.date.today()):
    str_date = time.strftime("%Y%m%d", cur_date.timetuple())
    if not utils.includes_day(args, str_date):
        print 'skipping', str_date, 'because not in cmd line arg daterange'
        continue
    directory = str_date
    print str_date
    games_short_name = str_date + '.all.tar.bz2'
    saved_games_bundle = directory + '/' + games_short_name
    if utils.at_least_as_big_as(saved_games_bundle, SMALL_FILE_SIZE):
        print 'skipping because exists', str_date, saved_games_bundle, \
            'and not small (size=', os.stat(saved_games_bundle).st_size, ')'
    else:
        if not os.path.exists(directory):
            os.mkdir(directory)
        RemoveSmallFileIfExists(saved_games_bundle)

        urls_by_priority = [