def mmap_csv(target_path, df, outputDir, args):
    haibrid_chess_utils.printWithDate(f"Loading: {target_path}")
    name = os.path.basename(target_path).split('.')[0]

    df_blunder = df[df['is_blunder_mean']]
    haibrid_chess_utils.printWithDate(f"Found {len(df_blunder)} blunders")
    df_blunder = df_blunder.sample(frac=1).reset_index(drop=True)

    df_non_blunder = df[df['is_blunder_mean'].eq(False)]
    haibrid_chess_utils.printWithDate(
        f"Found {len(df_non_blunder)} non blunders")
    df_non_blunder = df_non_blunder.sample(frac=1).reset_index(drop=True)
    del df
    haibrid_chess_utils.printWithDate(
        f"Keeping {len(df_non_blunder)} non blunders")

    haibrid_chess_utils.printWithDate("Starting mmaping")
    os.makedirs(outputDir, exist_ok=True)
    mmaps = {}
    mmaps['blunder'] = make_df_mmaps(
        df_blunder, name, os.path.join(outputDir, 'blunder'))
    del df_blunder
    mmaps['nonblunder'] = make_df_mmaps(
        df_non_blunder, name, os.path.join(outputDir, 'nonblunder'))
    return mmaps
def main():
    parser = argparse.ArgumentParser(
        description='Create new csv with select columns',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help='input CSV')
    parser.add_argument('outputDir', help='output CSV dir')
    args = parser.parse_args()

    haibrid_chess_utils.printWithDate(
        f"Starting CSV conversion of {args.input} writing to {args.outputDir}")
    haibrid_chess_utils.printWithDate(
        f"Collecting {', '.join(target_columns)}")

    name = os.path.basename(args.input).split('.')[0]
    outputName = os.path.join(args.outputDir, f"{name}_trimmed.csv.bz2")
    haibrid_chess_utils.printWithDate(f"Created output name {outputName}")
    os.makedirs(args.outputDir, exist_ok=True)

    haibrid_chess_utils.printWithDate("Starting read")
    with bz2.open(args.input, 'rt') as f:
        df = pandas.read_csv(f, usecols=target_columns)
    haibrid_chess_utils.printWithDate("Starting write")
    with bz2.open(outputName, 'wt') as f:
        df.to_csv(f, index=False)
def writerWorker(outputFile, inputQueue, num_readers, name):
    i = -1
    num_kill_remaining = num_readers
    tstart = time.time()
    haibrid_chess_utils.printWithDate("Writer created")
    with bz2.open(outputFile, 'wb') as f:
        haibrid_chess_utils.printWithDate(f"Created: {outputFile}")
        f.write((','.join(haibrid_chess_utils.full_csv_header) + '\n').encode('utf8'))
        tLast = time.time()
        while True:
            try:
                dat = inputQueue.get()
            except queue.Empty:
                # Should never happen, get() blocks until data arrives
                break
            try:
                f.write(dat)
            except TypeError:
                # Writing a str to a binary file raises TypeError,
                # so the 'kill' sentinel lands here
                if dat == 'kill':
                    num_kill_remaining -= 1
                    if num_kill_remaining <= 0:
                        break
                else:
                    raise
            else:
                i += 1
                if i % 1000 == 0 and time.time() - tLast > logging_delay:
                    tLast = time.time()
                    haibrid_chess_utils.printWithDate(
                        f"{name} Written {i} games in {humanize.naturaldelta(time.time() - tstart)}, "
                        f"doing {(i + 1) / (time.time() - tstart):.0f} games a second",
                        flush=True)
    haibrid_chess_utils.printWithDate("Received shutdown signal to writer")
    haibrid_chess_utils.printWithDate(
        f"Done a total of {i} games in {humanize.naturaldelta(time.time() - tstart)}")
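# These workers read a couple of module-level globals that are not defined in
# this section. A minimal sketch of plausible definitions; the names come from
# the code above, but the exact values here are assumptions.
logging_delay = 30  # assumed: minimum seconds between progress log lines
game_per_put = 100  # assumed: games bundled per queue put in readerWorker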
def cleanup(pgnReaders, gameReaders, writers):
    pgnReaders.get()
    haibrid_chess_utils.printWithDate("Done reading")
    time.sleep(10)
    for r in gameReaders:
        r.get()
    haibrid_chess_utils.printWithDate("Done processing")
    writers.get()
def gamesConverter(inputQueue, outputQueue):
    while True:
        try:
            dat = inputQueue.get()
        except queue.Empty:
            break
        if dat == 'kill':
            # Forward the sentinel so the writer can count shutdowns
            outputQueue.put('kill', True, 1000)
            break
        else:
            try:
                s = haibrid_chess_utils.gameToCSVlines(dat)
            except haibrid_chess_utils.NoStockfishEvals:
                pass
            except:
                haibrid_chess_utils.printWithDate('error:')
                haibrid_chess_utils.printWithDate(dat)
                haibrid_chess_utils.printWithDate(traceback.format_exc())
                raise
            else:
                if len(s) > 0:
                    lines = '\n'.join(s) + '\n'
                    outputQueue.put(lines.encode('utf8'), True, 1000)
    haibrid_chess_utils.printWithDate("Received shutdown signal to Converter",
                                      flush=True)
def main():
    parser = argparse.ArgumentParser(
        description='Make mmapped version of csv',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('inputs', nargs='+', help='input csv')
    parser.add_argument('outputDir', help='output dir of mmapped files')
    parser.add_argument('--nrows', type=int,
                        help='number of rows to read in, FOR TESTING',
                        default=None)
    args = parser.parse_args()

    haibrid_chess_utils.printWithDate(
        f"Starting mmap of {', '.join(args.inputs)} writing to {args.outputDir} "
        f"with {', '.join(mmap_columns)}")

    mmaps = {}
    for path in args.inputs:
        mmaps[path] = mmap_csv(
            path,
            load_csv(path, args.nrows),
            args.outputDir,
            args,
        )
    haibrid_chess_utils.printWithDate("All mmapped")
    # Stay alive so the mmaps remain available until interrupted
    try:
        while True:
            haibrid_chess_utils.printWithDate("Still alive", end='\r')
            time.sleep(10 * 60)
    except KeyboardInterrupt:
        print()
        haibrid_chess_utils.printWithDate("Exiting")
def cleanup(pgnReaders, gameReaders, writers):
    while len(pgnReaders) > 0:
        for i in list(pgnReaders.keys()):
            if pgnReaders[i].ready():
                haibrid_chess_utils.printWithDate(f"{i} Done reading")
                pgnReaders[i].get()
                del pgnReaders[i]
        time.sleep(10)
    for i in list(gameReaders.keys()):
        for w in gameReaders[i]:
            w.get()
        haibrid_chess_utils.printWithDate(f"{i} Done processing")
    for i in list(writers.keys()):
        writers[i].get()
def main():
    parser = argparse.ArgumentParser(
        description='process PGN file with stockfish annotations into a csv file',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('inputs', nargs='+', help='input PGNs')
    parser.add_argument('outputDir', help='output CSVs dir')
    parser.add_argument('--pool', type=int,
                        help='number of simultaneous jobs running per file',
                        default=20)
    #parser.add_argument('--readers', type=int, help='number of simultaneous reader running per inputfile', default = 24)
    parser.add_argument('--queueSize', type=int,
                        help='Max number of games to cache', default=1000)
    args = parser.parse_args()

    haibrid_chess_utils.printWithDate(
        f"Starting CSV conversion of {', '.join(args.inputs)} writing to {args.outputDir}")

    os.makedirs(args.outputDir, exist_ok=True)

    names = {}
    for n in args.inputs:
        name = os.path.basename(n).split('.')[0]
        outputName = os.path.join(args.outputDir, f"{name}.csv.bz2")
        names[n] = (name, outputName)
        haibrid_chess_utils.printWithDate(f"Loading file: {name}")

    haibrid_chess_utils.printWithDate("Starting main loop")
    tstart = time.time()
    pgnReaders = {}
    gameReaders = {}
    writers = {}
    queues = {}
    with multiprocessing.Manager() as manager:
        with multiprocessing.Pool(args.pool * len(names)) as workers_pool, \
             multiprocessing.Pool(len(names) * 2 + 4) as io_pool:
            for p, (i, o) in names.items():
                pgnReader, gameReader, writer, unproccessedQueue, resultsQueue = processPGN(
                    p, i, o, args.queueSize, args.pool,
                    manager, workers_pool, io_pool)
                pgnReaders[i] = pgnReader
                gameReaders[i] = gameReader
                writers[i] = writer
                queues[i] = (unproccessedQueue, resultsQueue)
            haibrid_chess_utils.printWithDate(
                f"Done loading Queues in {humanize.naturaldelta(time.time() - tstart)}, "
                "waiting for reading to finish")
            cleanup(pgnReaders, gameReaders, writers)
    haibrid_chess_utils.printWithDate(
        f"Done everything in {humanize.naturaldelta(time.time() - tstart)}, exiting")
def readerWorker(inputPath, unproccessedQueue, resultsQueue, name, num_readers):
    tstart = time.time()
    gamesFile = haibrid_chess_utils.LightGamesFile(inputPath, just_games=True)
    try:
        tLast = time.time()
        for i, (_, gs) in enumerate(gamesFile):
            unproccessedQueue.put(gs, True, 1000)
            if i % 1000 == 0 and time.time() - tLast > logging_delay:
                tLast = time.time()
                haibrid_chess_utils.printWithDate(
                    f"{name} Loaded {i} games, input queue depth: {unproccessedQueue.qsize()}, "
                    f"output queue depth: {resultsQueue.qsize()}",
                    flush=True)
    except EOFError:
        pass
    haibrid_chess_utils.printWithDate(
        f"{name} Done loading Queue in {humanize.naturaldelta(time.time() - tstart)}, sending kills")
    # One kill sentinel per converter so they all shut down
    for i in range(num_readers):
        unproccessedQueue.put('kill', True, 100)
def runGroupBys(df_working, outputDir, fname):
    for opList in operations:
        mStart = time.time()
        opgroups = [groupbyColumns[n] for n in opList]
        opName = 'on-' + '_'.join(opList)
        df_c = df_working.groupby(opgroups).count().reset_index()
        df_m = df_working.groupby(opgroups).mean().reset_index()
        df_m['count'] = df_c['clock']
        df_m.to_csv(os.path.join(outputDir, f"{fname}_{opName}.csv"))
        haibrid_chess_utils.printWithDate(
            f"{fname} groupby: {opName} done in {humanize.naturaldelta(time.time() - mStart)}",
            colour='pink', flush=True)
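# runGroupBys indexes two module-level tables that are defined elsewhere in
# the repo. A hypothetical sketch of their shape, inferred from the usage
# above: groupbyColumns maps a short name to a dataframe column, and each
# entry of operations lists the short names for one groupby run. The actual
# names and combinations here are assumptions.
groupbyColumns = {
    'elo': 'binned_active_elo',
    'cp': 'binned_cp',
    'clock': 'binned_clock',
}
operations = [
    ['elo'],
    ['elo', 'cp'],
    ['elo', 'clock'],
]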
def main():
    parser = argparse.ArgumentParser(
        description='Group by board',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('inputs', nargs='+', help='input CSVs')
    parser.add_argument('outputDir', help='output CSVs dir')
    parser.add_argument('--nrows', type=int,
                        help='number of rows to read in, FOR TESTING',
                        default=None)
    args = parser.parse_args()
    os.makedirs(args.outputDir, exist_ok=True)
    haibrid_chess_utils.printWithDate(
        f"Starting groupby on {','.join(args.inputs)} writing to {args.outputDir}")

    with multiprocessing.Manager() as manager, \
         multiprocessing.Pool(len(args.inputs) + 1) as pool:
        # The semaphore keeps the five queues in lockstep, one entry per shard
        queues = {
            'semaphore': manager.Semaphore(),
            'maps': manager.Queue(),
            'counts': manager.Queue(),
            'blunders': manager.Queue(),
            'nonblunders': manager.Queue(),
            'target_columns': manager.Queue(),
        }
        board_dicts_ret = pool.starmap_async(
            get_boards_dict,
            ((p, args.outputDir, queues, args.nrows) for p in args.inputs))
        final_fname = os.path.join(args.outputDir, 'all_months.csv.bz2')
        haibrid_chess_utils.printWithDate("Done dispatching all months, merging")
        merge_dicts(queues, board_dicts_ret, final_fname)
def cleanup(pgnReader, gameParsers, writer):
    # Surface worker errors early while the reader is still running
    while not pgnReader.ready():
        if writer.ready() and not pgnReader.ready():
            haibrid_chess_utils.printWithDate(writer.get())
        for w in gameParsers:
            if w.ready() and not pgnReader.ready():
                haibrid_chess_utils.printWithDate(w.get())
        time.sleep(1)
    pgnReader.get()
    haibrid_chess_utils.printWithDate("Done reading")
    for w in gameParsers:
        w.get()
    haibrid_chess_utils.printWithDate("Done parsing")
    writer.get()
    haibrid_chess_utils.printWithDate("Done writing")
def merge_dicts(queues, rets, final_fname):
    counts_dict_com = {}
    maps_dict_com = {}
    blunder_moves_com = {}
    nonblunder_moves_com = {}
    target_columns_dicts_com = {c: {} for c in target_columns}
    count = 0
    while not rets.ready() or not queues['maps'].empty():
        if queues['maps'].empty():
            time.sleep(1)
        else:
            # The semaphore keeps the five queues in lockstep
            queues['semaphore'].acquire()
            maps_dict = queues['maps'].get()
            counts_dict = queues['counts'].get()
            blunder_dict = queues['blunders'].get()
            nonblunder_dict = queues['nonblunders'].get()
            columns_dict = queues['target_columns'].get()
            queues['semaphore'].release()
            count += 1
            haibrid_chess_utils.printWithDate(
                f"Merging {count}, {len(maps_dict)} total boards")
            for h_board, c in counts_dict.items():
                try:
                    counts_dict_com[h_board] += c
                except KeyError:
                    # First time this board is seen, copy everything over
                    maps_dict_com[h_board] = maps_dict[h_board]
                    counts_dict_com[h_board] = c
                    blunder_moves_com[h_board] = blunder_dict[h_board]
                    nonblunder_moves_com[h_board] = nonblunder_dict[h_board]
                    for col in target_columns:
                        target_columns_dicts_com[col][h_board] = columns_dict[col][h_board]
                else:
                    blunder_moves_com[h_board] += blunder_dict[h_board]
                    nonblunder_moves_com[h_board] += nonblunder_dict[h_board]
                    for col in target_columns:
                        target_columns_dicts_com[col][h_board] += columns_dict[col][h_board]
    haibrid_chess_utils.printWithDate(
        f"Done merging, writing to {final_fname}")
    write_dicts(final_fname, maps_dict_com, counts_dict_com,
                blunder_moves_com, nonblunder_moves_com,
                target_columns_dicts_com)
def main():
    parser = argparse.ArgumentParser(
        description='process PGN file with stockfish annotations into a csv file',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help='input PGN')
    parser.add_argument('outputDir', help='output CSVs dir')
    parser.add_argument('--pool', type=int,
                        help='number of simultaneous jobs running per file',
                        default=30)
    parser.add_argument('--allow_non_sf',
                        help='Allow games with no stockfish info',
                        default=False, action="store_true")
    #parser.add_argument('--debug', help='DEBUG MODE', default = False, action="store_true")
    #parser.add_argument('--readers', type=int, help='number of simultaneous reader running per inputfile', default = 24)
    parser.add_argument('--queueSize', type=int,
                        help='Max number of games to cache', default=1000)
    args = parser.parse_args()

    haibrid_chess_utils.printWithDate(
        f"Starting CSV conversion of {args.input} writing to {args.outputDir}")

    os.makedirs(args.outputDir, exist_ok=True)

    name = os.path.basename(args.input).split('.')[0]
    outputName = os.path.join(args.outputDir, f"{name}.csv.bz2")
    haibrid_chess_utils.printWithDate(f"Loading file: {name}")

    haibrid_chess_utils.printWithDate("Starting main loop")
    tstart = time.time()
    with multiprocessing.Manager() as manager:
        with multiprocessing.Pool(args.pool) as workers_pool, \
             multiprocessing.Pool(3) as io_pool:
            pgnReader, gameReader, writer, unproccessedQueue, resultsQueue = processPGN(
                args.input, name, outputName, args.queueSize, args.pool,
                args.allow_non_sf, manager, workers_pool, io_pool)
            haibrid_chess_utils.printWithDate(
                f"Done loading Queues in {humanize.naturaldelta(time.time() - tstart)}, "
                "waiting for reading to finish")
            cleanup(pgnReader, gameReader, writer)
def pgnParser(inputQueue, outputQueue, num_splits, min_elo, max_elo,
              allow_negative_loss, allow_low_time):
    while True:
        try:
            dats = inputQueue.get()
        except queue.Empty:
            break
        if dats == 'kill':
            # Forward the sentinel so the writer can count shutdowns
            outputQueue.put('kill', True)
            break
        else:
            for dat in dats:
                try:
                    b_vals, nb_vals = parseLines(dat, num_splits, min_elo,
                                                 max_elo, allow_negative_loss,
                                                 allow_low_time)
                except KeyboardInterrupt:
                    raise
                except:
                    haibrid_chess_utils.printWithDate('error:')
                    haibrid_chess_utils.printWithDate(dat)
                    raise
                if len(b_vals) > 0 or len(nb_vals) > 0:
                    outputQueue.put((b_vals, nb_vals), True)
def processTarget(target, outputDir, nrows, queue_out):
    tstart = time.time()
    os.makedirs(outputDir, exist_ok=True)
    tname = os.path.basename(target).split('.')[0]
    haibrid_chess_utils.printWithDate(f"Starting on {tname}",
                                      colour='green', flush=True)
    with bz2.open(target, 'rt') as f:
        df = pandas.read_csv(f, index_col=None, nrows=nrows, usecols=[
            'cp_rel', 'active_elo', 'opponent_elo', 'clock', 'opp_clock',
            'move_ply', 'type', 'low_time', 'active_won'
        ])
    haibrid_chess_utils.printWithDate(
        f"{tname} loaded: {len(df)} lines in {humanize.naturaldelta(time.time() - tstart)}",
        colour='green', flush=True)

    # Bin the continuous columns for the groupbys; binClocks and binPly
    # are sketched below
    df['binned_cp'] = df['cp_rel'].apply(lambda x: x * 100 // 10 / 10)
    df['binned_active_elo'] = df['active_elo'].apply(lambda x: (x // 100) * 100)
    df['binned_opponent_elo'] = df['opponent_elo'].apply(lambda x: (x // 100) * 100)
    df['elo_delta'] = (df['active_elo'] - df['opponent_elo']).apply(
        lambda x: (x // 100) * 100)
    df['binned_clock'] = df['clock'].apply(binClocks)
    df['opp_binned_clock'] = df['opp_clock'].apply(binClocks)
    df['game_phase'] = df['move_ply'].apply(binPly)

    # Ship the dataframe back to the parent in million-row chunks,
    # ending with a "done" marker
    for df_sub in np.split(df, range(10**6, len(df), 10**6), axis=0):
        queue_out.put(df_sub.copy())
    queue_out.put("done")

    runGroupBys(df, outputDir, tname)
    haibrid_chess_utils.printWithDate(
        f"{tname} done everything in {humanize.naturaldelta(time.time() - tstart)}",
        colour='green', flush=True)
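# binClocks and binPly are helpers defined elsewhere in the repo; from the
# usage above they bucket clock readings and move plies into coarse bins.
# A minimal sketch, the exact bin edges and labels are assumptions.
def binClocks(seconds):
    # assumed bucketing: round each clock reading up to a coarse band
    for edge in (10, 30, 60, 180, 300, 600):
        if seconds < edge:
            return edge
    return 1200


def binPly(ply):
    # assumed phase split by move ply
    if ply < 20:
        return 'opening'
    elif ply < 60:
        return 'midgame'
    return 'endgame'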
def main():
    parser = argparse.ArgumentParser(
        description='Make train test split of grouped boards',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help='input CSV')
    parser.add_argument('outputDir', help='output CSVs dir')
    parser.add_argument('--nrows', type=int,
                        help='number of rows to read in, FOR TESTING',
                        default=None)
    parser.add_argument('--blunder_ratio', type=float,
                        help='ratio to declare a positive class',
                        default=.1)
    parser.add_argument('--min_count', type=int,
                        help='minimum count needed to keep a board',
                        default=10)
    args = parser.parse_args()

    os.makedirs(os.path.join(args.outputDir, 'test'), exist_ok=True)
    os.makedirs(os.path.join(args.outputDir, 'train'), exist_ok=True)

    haibrid_chess_utils.printWithDate(f"Loading: {args.input}")
    df = pandas.read_csv(args.input)

    haibrid_chess_utils.printWithDate(f"Filtering: {args.input}")
    df = df[df['count'] >= args.min_count].copy()
    df['is_blunder_mean'] = df['is_blunder_wr_mean'] > args.blunder_ratio
    df['board_extended'] = df['board'].apply(lambda x: x + ' KQkq - 0 1')
    df['white_active'] = df['board'].apply(lambda x: x.endswith('w'))
    df['has_nonblunder_move'] = df['top_nonblunder'].notna()
    df['has_blunder_move'] = df['top_blunder'].notna()
    # Note: blunder_ratio doubles as the test-set fraction here
    df['is_test'] = [
        np.random.random() < args.blunder_ratio for i in range(len(df))
    ]

    haibrid_chess_utils.printWithDate(f"Writing to: {args.outputDir}")
    df[~df['is_test']].to_csv(
        os.path.join(args.outputDir, 'train', 'grouped_fens.csv.bz2'),
        compression='bz2')
    df[df['is_test']].to_csv(
        os.path.join(args.outputDir, 'test', 'grouped_fens.csv.bz2'),
        compression='bz2')
def main():
    parser = argparse.ArgumentParser(
        description='Create two new csvs with select columns split by is_blunder_wr',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('targets', nargs='+', help='input CSVs')
    parser.add_argument('outputDir', help='output CSV dir')
    parser.add_argument('--min_elo', type=int, help='min active elo', default=1000)
    parser.add_argument('--max_elo', type=int, help='max active elo', default=9999999999)
    parser.add_argument('--allow_negative_loss',
                        help='allow winrate losses below 0',
                        default=False, action='store_true')
    parser.add_argument('--allow_low_time',
                        help='Include low time moves',
                        default=False, action='store_true')
    parser.add_argument('--min_ply', type=int,
                        help='min move ply to consider', default=6)
    parser.add_argument('--pool', type=int,
                        help='number of worker processes', default=64)
    #parser.add_argument('--shuffleSize', type=int, help='Shuffle buffer size', default = 1000)
    parser.add_argument('--nrows', type=int,
                        help='number of rows to read in', default=None)
    parser.add_argument('--nb_to_b_ratio', type=float,
                        help='ratio of non blunders to blunders in dataset',
                        default=1.5)
    args = parser.parse_args()

    haibrid_chess_utils.printWithDate(
        f"Starting CSVs split of {len(args.targets)} targets writing to {args.outputDir}")
    haibrid_chess_utils.printWithDate(
        f"Collecting {', '.join(target_columns)}")

    # Output names derived from the first target, one blunder/nonblunder pair
    name = os.path.basename(args.targets[0]).split('.')[0]
    outputBlunder = os.path.join(args.outputDir, f"{name}_blunder.csv.bz2")
    outputNonBlunder = os.path.join(args.outputDir, f"{name}_nonblunder.csv.bz2")
    haibrid_chess_utils.printWithDate(
        f"Created outputs named {outputBlunder} and {outputNonBlunder}")

    os.makedirs(args.outputDir, exist_ok=True)
    haibrid_chess_utils.printWithDate("Starting read")
    with multiprocessing.Pool(args.pool) as pool:
        # the per-target read/filter/split work runs in this pool
        pass
def readerWorker(inputPath, unproccessedQueue, resultsQueue, stopLoadingQueue,
                 num_readers, testing):
    tstart = time.time()
    gamesFile = haibrid_chess_utils.LightGamesFile(inputPath, just_games=True)
    haibrid_chess_utils.printWithDate("Reader created", flush=True)
    games_bundle = []
    try:
        tLast = time.time()
        for i, (_, gs) in enumerate(gamesFile):
            games_bundle.append(gs)
            if len(games_bundle) >= game_per_put:
                unproccessedQueue.put(games_bundle, True)
                games_bundle = []
            if i % 100 == 0 and time.time() - tLast > logging_delay:
                tLast = time.time()
                haibrid_chess_utils.printWithDate(
                    f"Loaded {i} games, input queue depth: {unproccessedQueue.qsize()}, "
                    f"output queue depth: {resultsQueue.qsize()}",
                    flush=True)
            if testing and i > 100000:
                break
            try:
                stopLoading = stopLoadingQueue.get_nowait()
            except queue.Empty:
                pass
            else:
                if stopLoading == 'kill':
                    haibrid_chess_utils.printWithDate(
                        f"Killed by max_bs, ending after {i} games loaded")
                    break
                else:
                    raise RuntimeError(f"{stopLoading} in wrong queue")
    except EOFError:
        pass
    if len(games_bundle) > 0:
        unproccessedQueue.put(games_bundle, True)
    haibrid_chess_utils.printWithDate(
        f"Done loading Queue in {humanize.naturaldelta(time.time() - tstart)}, sending kills")
    # One kill sentinel per parser so they all shut down
    for i in range(num_readers):
        unproccessedQueue.put('kill', True, 100)
def processPGN(gamesPath, inputName, outputName, queueSize, poolSize, manager,
               workers_pool, io_pool):
    unproccessedQueue = manager.Queue(queueSize)
    resultsQueue = manager.Queue(queueSize)

    readers = []
    for _ in range(poolSize - 1):
        reader = workers_pool.apply_async(gamesConverter,
                                          (unproccessedQueue, resultsQueue))
        readers.append(reader)
    haibrid_chess_utils.printWithDate(
        f"{inputName} Started {len(readers)} readers", flush=True)

    pgnReader = io_pool.apply_async(
        readerWorker,
        (gamesPath, unproccessedQueue, resultsQueue, inputName, len(readers)))
    haibrid_chess_utils.printWithDate(f"{inputName} loader created")

    writer = io_pool.apply_async(
        writerWorker, (outputName, resultsQueue, len(readers), inputName))
    haibrid_chess_utils.printWithDate(
        f"{inputName} Started writer for: {inputName}", flush=True)

    return pgnReader, readers, writer, unproccessedQueue, resultsQueue
def main():
    parser = argparse.ArgumentParser(
        description='Convert PGN file to zip of the binaries',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help='input PGN')
    parser.add_argument('output', help='output zip')
    parser.add_argument('--pool', type=int,
                        help='number of simultaneous jobs processing the file, reading and writing are two more',
                        default=48)
    parser.add_argument('--num_boards', type=int,
                        help='Max number of boards of each class to take from each game',
                        default=5)
    parser.add_argument('--batch_size', type=int,
                        help='Number of boards per batch', default=100)
    parser.add_argument('--max_bs', type=int,
                        help='Max number of blunder files to create',
                        default=9999999999)
    parser.add_argument('--queueSize', type=int,
                        help='Max number of games to cache', default=1000)
    parser.add_argument('--min_elo', type=int, help='min active elo', default=1000)
    parser.add_argument('--max_elo', type=int, help='max active elo', default=9999999999)
    parser.add_argument('--allow_negative_loss',
                        help='allow winrate losses below 0',
                        default=False, action='store_true')
    parser.add_argument('--allow_low_time',
                        help='Include low time moves',
                        default=False, action='store_true')
    parser.add_argument('--testing',
                        help='Make run on only the first 100000 games',
                        default=False, action='store_true')
    args = parser.parse_args()

    if args.testing:
        haibrid_chess_utils.printWithDate("TESTING RUN", colour='red')

    haibrid_chess_utils.printWithDate(
        f"Starting PGN conversion of {args.input} writing to {args.output}")

    tstart = time.time()
    with multiprocessing.Manager() as manager:
        with multiprocessing.Pool(args.pool) as worker_pool, \
             multiprocessing.Pool(3) as io_pool:
            pgnReader, gameParsers, writer, unproccessedQueue, resultsQueue, stopQueue = setupProcessors(
                args.input, args.output, manager, worker_pool, io_pool,
                args.pool, args.num_boards, args.batch_size, args.queueSize,
                args.min_elo, args.max_elo, args.allow_negative_loss,
                args.allow_low_time, args.max_bs, args.testing)
            haibrid_chess_utils.printWithDate(
                f"Done setting up Queues in {humanize.naturaldelta(time.time() - tstart)}, "
                "waiting for reading to finish")
            cleanup(pgnReader, gameParsers, writer)
    haibrid_chess_utils.printWithDate(
        f"Done everything in {humanize.naturaldelta(time.time() - tstart)}, exiting")
def cleanup(pgnReaders, gameReaders, writers):
    while len(gameReaders) > 0:
        for i in range(len(gameReaders)):
            try:
                gameReaders[i].get(1)
            except multiprocessing.TimeoutError:
                pass
            else:
                del gameReaders[i]
                break
    haibrid_chess_utils.printWithDate("Done processing")
    pgnReaders.get()
    haibrid_chess_utils.printWithDate("Done reading")
    writers.get()
    haibrid_chess_utils.printWithDate("Done cleanup")
def setupProcessors(gamesPath, output_path, manager, worker_pool, io_pool,
                    poolSize, num_boards, batch_size, queueSize, min_elo,
                    max_elo, allow_negative_loss, allow_low_time, max_bs,
                    testing):
    unproccessedQueue = manager.Queue(queueSize)
    resultsQueue = manager.Queue(queueSize)
    stopLoadingQueue = manager.Queue()

    parsers = []
    for _ in range(poolSize):
        parser = worker_pool.apply_async(
            pgnParser,
            (unproccessedQueue, resultsQueue, num_boards, min_elo, max_elo,
             allow_negative_loss, allow_low_time))
        parsers.append(parser)
    haibrid_chess_utils.printWithDate(f"Started {len(parsers)} parsers")

    pgnReader = io_pool.apply_async(
        readerWorker,
        (gamesPath, unproccessedQueue, resultsQueue, stopLoadingQueue,
         len(parsers), testing))
    haibrid_chess_utils.printWithDate("loader created")

    writer = io_pool.apply_async(
        writerWorker,
        (output_path, resultsQueue, stopLoadingQueue, len(parsers),
         num_boards, batch_size, max_bs))
    haibrid_chess_utils.printWithDate("Started writer")

    return pgnReader, parsers, writer, unproccessedQueue, resultsQueue, stopLoadingQueue
def make_df_mmaps(df, name, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    outputName = os.path.join(output_dir, name)
    mmaps = {}
    haibrid_chess_utils.printWithDate(
        f"Making y_vals mmaps for: {name} done:", end=' ')
    for y_name in mmap_columns:
        make_var_mmap(y_name, outputName, mmaps, df)
        print(y_name, end=' ', flush=True)
    haibrid_chess_utils.printWithDate(f"Making move array mmaps for: {name}")
    make_move_mmap(outputName, mmaps, 'top_blunder', df)
    make_move_mmap(outputName, mmaps, 'top_nonblunder', df)
    haibrid_chess_utils.printWithDate(f"Making boards array mmaps for: {name}")
    make_board_mmap(outputName, mmaps, df)
    return mmaps
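# make_var_mmap, make_move_mmap, and make_board_mmap are not shown in this
# section. A sketch of what make_var_mmap could look like, assuming one
# float32 numpy memmap per column; the file naming and dtype are guesses.
import numpy as np


def make_var_mmap(y_name, outputName, mmaps, df):
    # hypothetical: dump one column into an on-disk memmap and register it
    vals = df[y_name].to_numpy(dtype=np.float32)
    mm = np.memmap(f"{outputName}_{y_name}.mm", dtype=np.float32,
                   mode='w+', shape=vals.shape)
    mm[:] = vals
    mm.flush()
    mmaps[y_name] = mm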
def main():
    parser = argparse.ArgumentParser(
        description='Run groupby on the csv files',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('inputs', nargs='+', help='input CSVs')
    parser.add_argument('outputDir', help='output dir name')
    parser.add_argument('--pool', type=int,
                        help='number of simultaneous jobs running', default=32)
    parser.add_argument('--nrows', type=int,
                        help='For debugging, limit the number of lines read',
                        default=None)
    args = parser.parse_args()

    haibrid_chess_utils.printWithDate(
        f"Starting CSV groupby of {', '.join(args.inputs)} writing to {args.outputDir}")

    os.makedirs(args.outputDir, exist_ok=True)
    tstart = time.time()
    with multiprocessing.Manager() as manager:
        dfs_queue = manager.Queue()
        with multiprocessing.Pool(args.pool) as pool:
            mResult = pool.starmap_async(
                processTarget,
                [(tname,
                  os.path.join(args.outputDir,
                               os.path.basename(tname).split('.')[0]),
                  args.nrows, dfs_queue) for tname in args.inputs])
            dfs = []
            dones = []
            # Collect the chunked dataframes, each worker sends "done" last
            while len(dones) < len(args.inputs):
                try:
                    r = dfs_queue.get(True, 10)
                    if isinstance(r, str):
                        dones.append(r)
                    else:
                        dfs.append(r)
                except queue.Empty:
                    pass
            tname = "lichess_db_standard_rated_all"
            haibrid_chess_utils.printWithDate(f"All {len(dfs)} dfs collected")
            df = pandas.concat(dfs, ignore_index=True)
            haibrid_chess_utils.printWithDate(
                f"Full dataset loaded: {len(df)} lines")
            runGroupBys(df, args.outputDir, tname)
            mResult.get()
            haibrid_chess_utils.printWithDate(
                f"Done all single months in {humanize.naturaldelta(time.time() - tstart)}")
    haibrid_chess_utils.printWithDate(
        f"{tname} done everything in {humanize.naturaldelta(time.time() - tstart)}, exiting")
def load_csv(target_path, nrows):
    haibrid_chess_utils.printWithDate(f"Loading: {target_path}", flush=True)
    return pandas.read_csv(target_path, usecols=target_columns, nrows=nrows)
def get_boards_dict(fPath, outputDir, queues, nrows):
    name = os.path.basename(fPath).split('.')[0]
    haibrid_chess_utils.printWithDate(f"Starting: {name}")
    maps_dict = {}
    counts_dict = {}
    blunder_moves = {}
    nonblunder_moves = {}
    target_columns_dicts = {c: {} for c in target_columns}
    with bz2.open(fPath, 'rt') as f:
        reader = csv.DictReader(f)
        tstart = time.time()
        for i, line in enumerate(reader):
            if line['low_time'] == 'True':
                continue
            if nrows is not None and i >= nrows:
                break
            # Key each position by its piece placement and side to move
            board_s = ' '.join(line['board'].split(' ')[:2])
            board_hash = hash(board_s)
            try:
                counts_dict[board_hash] += 1
            except KeyError:
                # First sighting of this board, create its entries
                counts_dict[board_hash] = 1
                maps_dict[board_hash] = board_s
                if line['is_blunder_wr'] == 'True':
                    blunder_moves[board_hash] = [line['move']]
                    nonblunder_moves[board_hash] = []
                else:
                    nonblunder_moves[board_hash] = [line['move']]
                    blunder_moves[board_hash] = []
                for c in target_columns:
                    target_columns_dicts[c][board_hash] = [val_to_float(line[c])]
            else:
                if line['is_blunder_wr'] == 'True':
                    blunder_moves[board_hash].append(line['move'])
                else:
                    nonblunder_moves[board_hash].append(line['move'])
                for c in target_columns:
                    target_columns_dicts[c][board_hash].append(val_to_float(line[c]))
    haibrid_chess_utils.printWithDate(f"{name} done, now writing")
    outputName = os.path.join(outputDir, f"{name}.csv.bz2")

    # Drop boards seen only once
    for k in list(maps_dict.keys()):
        if counts_dict[k] < 2:
            del maps_dict[k]
            del counts_dict[k]
            del blunder_moves[k]
            del nonblunder_moves[k]
            for c in target_columns:
                del target_columns_dicts[c][k]
    write_dicts(outputName, maps_dict, counts_dict, blunder_moves,
                nonblunder_moves, target_columns_dicts)

    # Shard the dicts by hash into 8 buckets so the merger can consume
    # them incrementally
    sub_dicts = {
        i: {
            'maps': {},
            'counts': {},
            'blunders': {},
            'nonblunders': {},
            'target_columns': {c: {} for c in target_columns},
        }
        for i in range(8)
    }
    for k in maps_dict.keys():
        s_d = sub_dicts[k % 8]
        s_d['maps'][k] = maps_dict[k]
        s_d['counts'][k] = counts_dict[k]
        s_d['blunders'][k] = blunder_moves[k]
        s_d['nonblunders'][k] = nonblunder_moves[k]
        for c in target_columns:
            s_d['target_columns'][c][k] = target_columns_dicts[c][k]
    for k in range(8):
        queues['semaphore'].acquire()
        queues['maps'].put(sub_dicts[k]['maps'])
        queues['counts'].put(sub_dicts[k]['counts'])
        queues['blunders'].put(sub_dicts[k]['blunders'])
        queues['nonblunders'].put(sub_dicts[k]['nonblunders'])
        queues['target_columns'].put(sub_dicts[k]['target_columns'])
        queues['semaphore'].release()
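# val_to_float is referenced but not shown. Since csv.DictReader yields raw
# strings, it presumably coerces each field to a float; a minimal sketch,
# with the handling of booleans and blanks as assumptions.
def val_to_float(val):
    if val in ('True', 'False'):
        return 1.0 if val == 'True' else 0.0
    try:
        return float(val)
    except ValueError:
        return float('nan')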
def writerWorker(outputFile, inputQueue, stopLoadingQueue, num_readers,
                 num_splits, batch_size, max_bs):
    i = -1
    num_kill_remaining = num_readers
    tstart = time.time()
    haibrid_chess_utils.printWithDate("Writer created")
    last_b_batch = 0
    last_nb_batch = 0
    num_bs = 0
    b_bins = {}
    nb_bins = {}
    if not os.path.isdir(os.path.dirname(outputFile)) and len(os.path.dirname(outputFile)) > 0:
        os.makedirs(os.path.dirname(outputFile), exist_ok=True)
    with zipfile.ZipFile(outputFile, 'w') as myzip:
        haibrid_chess_utils.printWithDate(f"Created: {outputFile}")
        tLast = time.time()
        while True:
            unload_b_bins = False
            unload_nb_bins = False
            # Top up the bins, unload_bin_dict removes the full ones
            while len(b_bins) < num_splits:
                b_bins[last_b_batch] = []
                last_b_batch += 1
            while len(nb_bins) < num_splits:
                nb_bins[last_nb_batch] = []
                last_nb_batch += 1
            try:
                dat_vals = inputQueue.get()
            except queue.Empty:
                # Should never happen, get() blocks until data arrives
                break
            try:
                b_vals, nb_vals = dat_vals
                # Deal the boards round-robin across the bins
                for _, a in b_bins.items():
                    try:
                        a.append(b_vals.pop())
                        if len(a) >= batch_size:
                            unload_b_bins = True
                    except IndexError:
                        break
                for _, a in nb_bins.items():
                    try:
                        a.append(nb_vals.pop())
                        if len(a) >= batch_size:
                            unload_nb_bins = True
                    except IndexError:
                        break
            except (TypeError, ValueError):
                # The 'kill' sentinel cannot be unpacked into two values
                if dat_vals == 'kill':
                    num_kill_remaining -= 1
                    if num_kill_remaining <= 0:
                        break
                else:
                    raise
            else:
                if unload_nb_bins:
                    unload_bin_dict(nb_bins, batch_size, myzip)
                if unload_b_bins:
                    num_bs += unload_bin_dict(b_bins, batch_size, myzip, blunder=True)
                i += 1
                if num_bs >= max_bs:
                    haibrid_chess_utils.printWithDate(
                        f"Max bs hit stopping after {i} games")
                    stopLoadingQueue.put('kill')
                    break
                if i % 100 == 0 and time.time() - tLast > logging_delay:
                    tLast = time.time()
                    haibrid_chess_utils.printWithDate(
                        f"Written {i} games {last_b_batch} b batches {last_nb_batch} nb batches "
                        f"in {humanize.naturaldelta(time.time() - tstart)}, "
                        f"doing {(i + 1) / (time.time() - tstart):.0f} games a second",
                        flush=True)
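# unload_bin_dict is not shown in this section. From the caller above it
# drains every full bin into the open ZipFile, drops the bin so the top of
# the loop re-creates a fresh one, and returns how many batch files it wrote
# (num_bs is compared against max_bs, the blunder-file cap). A hedged sketch,
# with the pickle serialization and entry naming as assumptions.
import pickle


def unload_bin_dict(bins, batch_size, myzip, blunder=False):
    written = 0
    prefix = 'blunder' if blunder else 'nonblunder'
    for batch_id in list(bins.keys()):
        if len(bins[batch_id]) >= batch_size:
            myzip.writestr(f"{prefix}_{batch_id}.pkl",
                           pickle.dumps(bins[batch_id]))
            del bins[batch_id]
            written += 1
    return written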
def main():
    parser = argparse.ArgumentParser(
        description='Create two new csvs with select columns split by is_blunder_wr',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help='input CSV')
    parser.add_argument('outputDir', help='output CSV dir')
    parser.add_argument('--min_elo', type=int, help='min active elo', default=1000)
    parser.add_argument('--max_elo', type=int, help='max active elo', default=9999999999)
    parser.add_argument('--allow_negative_loss',
                        help='allow winrate losses below 0',
                        default=False, action='store_true')
    parser.add_argument('--allow_low_time',
                        help='Include low time moves',
                        default=False, action='store_true')
    parser.add_argument('--min_ply', type=int,
                        help='min move ply to consider', default=6)
    #parser.add_argument('--shuffleSize', type=int, help='Shuffle buffer size', default = 1000)
    parser.add_argument('--nrows', type=int,
                        help='number of rows to read in', default=None)
    parser.add_argument('--nb_to_b_ratio', type=float,
                        help='ratio of non blunders to blunders in dataset',
                        default=1.5)
    args = parser.parse_args()

    haibrid_chess_utils.printWithDate(
        f"Starting CSV split of {args.input} writing to {args.outputDir}")
    haibrid_chess_utils.printWithDate(
        f"Collecting {', '.join(target_columns)}")

    name = os.path.basename(args.input).split('.')[0]
    outputBlunder = os.path.join(args.outputDir, f"{name}_blunder.csv.bz2")
    outputNonBlunder = os.path.join(args.outputDir, f"{name}_nonblunder.csv.bz2")
    haibrid_chess_utils.printWithDate(
        f"Created outputs named {outputBlunder} and {outputNonBlunder}")
    os.makedirs(args.outputDir, exist_ok=True)

    haibrid_chess_utils.printWithDate("Starting read")
    with bz2.open(args.input, 'rt') as f:
        df = pandas.read_csv(f, usecols=target_columns, nrows=args.nrows)

    haibrid_chess_utils.printWithDate(
        f"Filtering data starting at {len(df)} rows")
    df = df[df['move_ply'] >= args.min_ply]
    if not args.allow_low_time:
        df = df[df['low_time'].eq(False)]
    if not args.allow_negative_loss:
        df = df[df['winrate_loss'] > 0]
    df = df[df['active_elo'] > args.min_elo]
    df = df[df['active_elo'] < args.max_elo]
    df = df.dropna()
    haibrid_chess_utils.printWithDate(f"Filtered down to {len(df)} rows")

    df_blunder = df[df['is_blunder_wr']]
    haibrid_chess_utils.printWithDate(f"Found {len(df_blunder)} blunders")
    df_blunder = df_blunder.sample(frac=1).reset_index(drop=True)

    df_non_blunder = df[df['is_blunder_wr'].eq(False)]
    haibrid_chess_utils.printWithDate(
        f"Found {len(df_non_blunder)} non blunders")
    df_non_blunder = df_non_blunder.sample(frac=1).reset_index(
        drop=True).iloc[:int(len(df_blunder) * args.nb_to_b_ratio)]
    haibrid_chess_utils.printWithDate(
        f"Reduced to {len(df_non_blunder)} non blunders")

    haibrid_chess_utils.printWithDate("Starting writing")
    with bz2.open(outputNonBlunder, 'wt') as fnb:
        df_non_blunder.to_csv(fnb, index=False)
    with bz2.open(outputBlunder, 'wt') as fb:
        df_blunder.to_csv(fb, index=False)
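# target_columns is module-level state shared by these scripts; a plausible
# definition inferred from the columns the code above actually touches. The
# exact list is an assumption.
target_columns = [
    'board', 'move', 'move_ply', 'low_time', 'winrate_loss',
    'active_elo', 'opponent_elo', 'is_blunder_wr', 'clock',
]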
os.makedirs(args.outputDir, exist_ok=True)
haibrid_chess_utils.printWithDate("Starting read")
with bz2.open(args.input, 'rt') as f:
    df = pandas.read_csv(f, usecols=target_columns, nrows=args.nrows)

haibrid_chess_utils.printWithDate(f"Filtering data starting at {len(df)} rows")
df = df[df['move_ply'] >= args.min_ply]
if not args.allow_low_time:
    df = df[df['low_time'].eq(False)]
if not args.allow_negative_loss:
    df = df[df['winrate_loss'] > 0]
df = df[df['active_elo'] > args.min_elo]
df = df[df['active_elo'] < args.max_elo]
df = df.dropna()
haibrid_chess_utils.printWithDate(f"Filtered down to {len(df)} rows")