Example #1
def mmap_csv(target_path, df, outputDir, args):
    haibrid_chess_utils.printWithDate(f"Loading: {target_path}")
    name = os.path.basename(target_path).split('.')[0]

    df_blunder = df[df['is_blunder_mean']]
    haibrid_chess_utils.printWithDate(f"Found {len(df_blunder)} blunders")

    df_blunder = df_blunder.sample(frac=1).reset_index(drop=True)

    df_non_blunder = df[df['is_blunder_mean'].eq(False)]
    haibrid_chess_utils.printWithDate(
        f"Found {len(df_non_blunder)} non blunders")

    df_non_blunder = df_non_blunder.sample(frac=1).reset_index(drop=True)

    del df

    haibrid_chess_utils.printWithDate(
        f"Reduced to {len(df_non_blunder)} non blunders")

    haibrid_chess_utils.printWithDate(f"Starting mmaping")

    os.makedirs(outputDir, exist_ok=True)

    mmaps = {}

    mmaps['blunder'] = make_df_mmaps(df_blunder, name,
                                     os.path.join(outputDir, 'blunder'))

    del df_blunder
    mmaps['nonblunder'] = make_df_mmaps(df_non_blunder, name,
                                        os.path.join(outputDir, 'nonblunder'))
    return mmaps
Example #2
def main():
    parser = argparse.ArgumentParser(
        description='Create new csv with select columns',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('input', help='input CSV')
    parser.add_argument('outputDir', help='output CSV')

    args = parser.parse_args()

    haibrid_chess_utils.printWithDate(
        f"Starting CSV conversion of {args.input} writing to {args.outputDir}")
    haibrid_chess_utils.printWithDate(
        f"Collecting {', '.join(target_columns)}")

    name = os.path.basename(args.input).split('.')[0]
    outputName = os.path.join(args.outputDir, f"{name}_trimmed.csv.bz2")

    haibrid_chess_utils.printWithDate(f"Created output name {outputName}")

    os.makedirs(args.outputDir, exist_ok=True)

    haibrid_chess_utils.printWithDate(f"Starting read")
    with bz2.open(args.input, 'rt') as f:
        df = pandas.read_csv(f, usecols=target_columns)

    haibrid_chess_utils.printWithDate(f"Starting write")
    with bz2.open(outputName, 'wt') as f:
        df.to_csv(f, index=False)
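
This script, like several others below, reads a module-level target_columns list that is not included in these examples. A purely illustrative placeholder, assuming column names pieced together from the other examples (the source's actual list may differ):

# Hypothetical target_columns list assumed by the trimming script above; the
# exact names are guesses based on columns the other examples reference.
target_columns = [
    'game_id', 'move_ply', 'board', 'move',
    'active_elo', 'opponent_elo', 'clock', 'low_time',
    'winrate_loss', 'is_blunder_wr',
]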
Example #3
def writerWorker(outputFile, inputQueue, num_readers, name):
    i = -1
    num_kill_remaining = num_readers
    tstart = time.time()
    haibrid_chess_utils.printWithDate("Writer created")
    with bz2.open(outputFile, 'wb') as f:
        haibrid_chess_utils.printWithDate(f"Created: {outputFile}")
        f.write((','.join(haibrid_chess_utils.full_csv_header) + '\n').encode('utf8'))
        tLast = time.time()
        while True:
            try:
                dat = inputQueue.get()
            except queue.Empty:
                #Should never happen
                break
            try:
                f.write(dat)
            except TypeError:
                if dat == 'kill':
                    num_kill_remaining -= 1
                    if num_kill_remaining <= 0:
                        break
                else:
                    raise
            else:
                i += 1
                if i % 1000 == 0 and  time.time() - tLast > logging_delay:
                    tLast = time.time()
                    haibrid_chess_utils.printWithDate(f"{name} Written {i} games in {humanize.naturaldelta(time.time() - tstart)}, doing {(i + 1) /(time.time() - tstart):.0f} games a second", flush = True)
    haibrid_chess_utils.printWithDate("Received shutdown signal to writer")
    haibrid_chess_utils.printWithDate(f"Done a total of {i} games in {humanize.naturaldelta(time.time() - tstart)}")
Example #4
def cleanup(pgnReaders, gameReaders, writers):
    pgnReaders.get()
    haibrid_chess_utils.printWithDate(f"Done reading")
    time.sleep(10)
    for r in gameReaders:
        r.get()
    haibrid_chess_utils.printWithDate(f"Done processing")

    writers.get()
Example #5
def gamesConverter(inputQueue, outputQueue):
    #haibrid_chess_utils.printWithDate("Converter created")
    while True:
        try:
            #print('qsize', inputQueue.qsize())
            dat = inputQueue.get()
        except queue.Empty:
            break
        if dat == 'kill':
            outputQueue.put('kill', True, 1000)
            break
        else:
            try:
                s = haibrid_chess_utils.gameToCSVlines(dat)
            except haibrid_chess_utils.NoStockfishEvals:
                pass
            except:
                haibrid_chess_utils.printWithDate('error:')
                haibrid_chess_utils.printWithDate(dat)
                haibrid_chess_utils.printWithDate(traceback.format_exc())
                raise
            else:
                if len(s) > 0:
                    lines = '\n'.join(s) + '\n'
                    outputQueue.put(lines.encode('utf8'), True, 1000)
    haibrid_chess_utils.printWithDate("Received shutdown signal to Converter", flush = True)
Example #6
def main():
    parser = argparse.ArgumentParser(
        description='Make mmapped version of csv',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('inputs', nargs='+', help='input csv')
    parser.add_argument('outputDir', help='output dir of mmapped files')
    parser.add_argument('--nrows',
                        type=int,
                        help='number of rows to read in, FOR TESTING',
                        default=None)

    args = parser.parse_args()

    haibrid_chess_utils.printWithDate(
        f"Starting mmap of {', '.join(args.inputs)} writing to {args.outputDir} with {', '.join(mmap_columns)}"
    )

    mmaps = {}
    for path in args.inputs:
        mmaps[path] = mmap_csv(
            path,
            load_csv(path, args.nrows),
            args.outputDir,
            args,
        )
    haibrid_chess_utils.printWithDate("All mmapped")
    try:
        while True:
            haibrid_chess_utils.printWithDate("Still alive", end='\r')
            time.sleep(10 * 60)
    except KeyboardInterrupt:
        print()
        haibrid_chess_utils.printWithDate("Exiting")
Example #7
def cleanup(pgnReaders, gameReaders, writers):
    while len(pgnReaders) > 0:
        for i in list(pgnReaders.keys()):
            if pgnReaders[i].ready():
                haibrid_chess_utils.printWithDate(f"{i} Done reading")
                pgnReaders[i].get()
                del pgnReaders[i]
        time.sleep(10)

    for i in list(gameReaders.keys()):
        for w in gameReaders[i]:
            w.get()
        haibrid_chess_utils.printWithDate(f"{i} Done processing")

    for i in list(writers.keys()):
        writers[i].get()
Example #8
def main():
    parser = argparse.ArgumentParser(description='process PGN file with stockfish annotations into a csv file', formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('inputs', nargs = '+', help='input PGNs')
    parser.add_argument('outputDir', help='output CSVs dir')

    parser.add_argument('--pool', type=int, help='number of simultaneous jobs running per file', default = 20)
    #parser.add_argument('--readers', type=int, help='number of simultaneous reader running per inputfile', default = 24)
    parser.add_argument('--queueSize', type=int, help='Max number of games to cache', default = 1000)

    args = parser.parse_args()

    haibrid_chess_utils.printWithDate(f"Starting CSV conversion of {', '.join(args.inputs)} writing to {args.outputDir}")

    os.makedirs(args.outputDir, exist_ok=True)

    names = {}
    for n in args.inputs:
        name = os.path.basename(n).split('.')[0]
        outputName = os.path.join(args.outputDir, f"{name}.csv.bz2")
        names[n] = (name, outputName)


        haibrid_chess_utils.printWithDate(f"Loading file: {name}")
    haibrid_chess_utils.printWithDate(f"Starting main loop")

    tstart = time.time()

    pgnReaders = {}
    gameReaders = {}
    writers = {}
    queues = {}

    with multiprocessing.Manager() as manager:
        with multiprocessing.Pool(args.pool * len(names)) as workers_pool, multiprocessing.Pool(len(names) * 2 + 4) as io_pool:
            for p, (i, o) in names.items():
                pgnReader, gameReader, writer, unproccessedQueue, resultsQueue = processPGN(p, i, o, args.queueSize, args.pool, manager, workers_pool, io_pool)
                pgnReaders[i] = pgnReader
                gameReaders[i] = gameReader
                writers[i] = writer
                queues[i] = (unproccessedQueue, resultsQueue)

            haibrid_chess_utils.printWithDate(f"Done loading Queues in {humanize.naturaldelta(time.time() - tstart)}, waiting for reading to finish")

            cleanup(pgnReaders, gameReaders, writers)

    haibrid_chess_utils.printWithDate(f"Done everything in {humanize.naturaldelta(time.time() - tstart)}, exiting")
Example #9
def readerWorker(inputPath, unproccessedQueue, resultsQueue, name, num_readers):
    tstart = time.time()
    gamesFile = haibrid_chess_utils.LightGamesFile(inputPath, just_games = True)
    try:
        tLast = time.time()
        for i, (_, gs) in enumerate(gamesFile):
            unproccessedQueue.put(gs, True, 1000)
            if i % 1000 == 0 and  time.time() - tLast > logging_delay:
                tLast = time.time()
                haibrid_chess_utils.printWithDate(f"{name} Loaded {i} games, input queue depth: {unproccessedQueue.qsize()}, output queue depth: {resultsQueue.qsize()}", flush = True)
    except EOFError:
        pass

    haibrid_chess_utils.printWithDate(f"{name} Done loading Queue in {humanize.naturaldelta(time.time() - tstart)}, sending kills")
    for i in range(num_readers):
        #haibrid_chess_utils.printWithDate(f"Putting kill number {i} in queue")
        unproccessedQueue.put('kill', True, 100)
Example #10
def runGroupBys(df_working, outputDir, fname):
    for opList in operations:
        mStart = time.time()
        opgroups = [groupbyColumns[n] for n in opList]

        opName = 'on-' + '_'.join(opList)

        df_c = df_working.groupby(opgroups).count().reset_index()
        df_m = df_working.groupby(opgroups).mean().reset_index()
        df_m['count'] = df_c['clock']

        df_m.to_csv(os.path.join(outputDir, f"{fname}_{opName}.csv"))

        haibrid_chess_utils.printWithDate(
            f"{fname} groupby: {opName} done in {humanize.naturaldelta(time.time() - mStart)}",
            colour='pink',
            flush=True)
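
runGroupBys depends on two module-level names that are not shown in these examples, operations and groupbyColumns. A minimal sketch of the shape they would need, with illustrative keys and combinations rather than the project's actual definitions:

# Hypothetical configuration assumed by runGroupBys above. groupbyColumns maps
# short names to the binned columns produced in processTarget (Example #16);
# each entry of operations names one groupby to run.
groupbyColumns = {
    'elo': 'binned_active_elo',
    'cp': 'binned_cp',
    'clock': 'binned_clock',
    'phase': 'game_phase',
}

operations = [
    ['elo'],
    ['elo', 'cp'],
    ['elo', 'clock', 'phase'],
]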
Example #11
def main():
    parser = argparse.ArgumentParser(
        description='Group by board',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('inputs', nargs='+', help='input CSVs')
    parser.add_argument('outputDir', help='output CSVs dir')
    parser.add_argument('--nrows',
                        type=int,
                        help='number of rows to read in, FOR TESTING',
                        default=None)

    args = parser.parse_args()
    os.makedirs(args.outputDir, exist_ok=True)

    haibrid_chess_utils.printWithDate(
        f"Starting groupby on {','.join(args.inputs)} writing to {args.outputDir}"
    )
    board_dicts = []
    #for p in args.inputs:
    #    board_dicts.append(get_boards_dict(p, args.outputDir))
    with multiprocessing.Manager() as manager, multiprocessing.Pool(
            len(args.inputs) + 1) as pool:
        sem = manager.Semaphore()
        q_m = manager.Queue()
        q_c = manager.Queue()
        q_b = manager.Queue()
        q_n = manager.Queue()
        q_t = manager.Queue()
        queues = {
            'semaphore': sem,
            'maps': q_m,
            'counts': q_c,
            'blunders': q_b,
            'nonblunders': q_n,
            'target_columns': q_t,
        }
        board_dicts_ret = pool.starmap_async(
            get_boards_dict,
            ((p, args.outputDir, queues, args.nrows) for p in args.inputs))
        final_fname = os.path.join(args.outputDir, 'all_months.csv.bz2')

        haibrid_chess_utils.printWithDate(f"Done all months, merging")

        merge_dicts(queues, board_dicts_ret, final_fname)
Example #12
def cleanup(pgnReader, gameParsers, writer):
    while not pgnReader.ready():
        if writer.ready() and not pgnReader.ready():
            haibrid_chess_utils.printWithDate(writer.get())
        for w in gameParsers:
            if w.ready() and  not pgnReader.ready():
                haibrid_chess_utils.printWithDate(w.get())
        time.sleep(1)

    pgnReader.get()
    haibrid_chess_utils.printWithDate(f"Done reading")

    for w in gameParsers:
        w.get()
    haibrid_chess_utils.printWithDate(f"Done parsing")

    writer.get()
    haibrid_chess_utils.printWithDate(f"Done writing")
Example #13
def merge_dicts(queues, rets, final_fname):
    dicts_lst = []
    counts_dict_com = {}
    maps_dict_com = {}
    blunder_moves_com = {}
    nonblunder_moves_com = {}
    target_columns_dicts_com = {c: {} for c in target_columns}
    count = 0
    while not rets.ready() or not queues['maps'].empty():
        if queues['maps'].empty():
            time.sleep(1)
        else:
            queues['semaphore'].acquire()
            maps_dict = queues['maps'].get()
            counts_dict = queues['counts'].get()
            blunder_dict = queues['blunders'].get()
            nonblunder_dict = queues['nonblunders'].get()
            columns_dict = queues['target_columns'].get()
            queues['semaphore'].release()
            count += 1
            haibrid_chess_utils.printWithDate(
                f"Merging {count}, {len(maps_dict)} total boards")
            for h_board, c in counts_dict.items():
                try:
                    counts_dict_com[h_board] += c
                except KeyError:
                    maps_dict_com[h_board] = maps_dict[h_board]
                    counts_dict_com[h_board] = c
                    blunder_moves_com[h_board] = blunder_dict[h_board]
                    nonblunder_moves_com[h_board] = nonblunder_dict[h_board]
                    for c in target_columns:
                        target_columns_dicts_com[c][h_board] = columns_dict[c][
                            h_board]
                else:
                    blunder_moves_com[h_board] += blunder_dict[h_board]
                    nonblunder_moves_com[h_board] += nonblunder_dict[h_board]
                    for c in target_columns:
                        target_columns_dicts_com[c][h_board] += columns_dict[
                            c][h_board]
    haibrid_chess_utils.printWithDate(
        f"Done merging, writing to {final_fname}")
    write_dicts(final_fname, maps_dict_com, counts_dict_com, blunder_moves_com,
                nonblunder_moves_com, target_columns_dicts_com)
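
Both merge_dicts and get_boards_dict (Example #27) finish by calling write_dicts, which is not shown. A rough sketch under the assumption of a flat CSV with semicolon-joined move lists and a mean per target column; the real layout is not given here:

import bz2
import csv

# Hypothetical sketch of write_dicts; column layout, move encoding and the
# per-column aggregation are assumptions, not the source's actual format.
def write_dicts(fname, maps_dict, counts_dict, blunder_moves,
                nonblunder_moves, target_columns_dicts):
    with bz2.open(fname, 'wt') as f:
        writer = csv.writer(f)
        writer.writerow(['board', 'count', 'blunders', 'nonblunders']
                        + [f"{c}_mean" for c in target_columns_dicts])
        for h_board, board in maps_dict.items():
            row = [
                board,
                counts_dict[h_board],
                ';'.join(blunder_moves[h_board]),
                ';'.join(nonblunder_moves[h_board]),
            ]
            for vals in target_columns_dicts.values():
                col = vals[h_board]
                row.append(sum(col) / len(col) if col else '')
            writer.writerow(row)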
Example #14
def main():

    parser = argparse.ArgumentParser(
        description=
        'process PGN file with stockfish annotations into a csv file',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('input', help='input PGNs')
    parser.add_argument('outputDir', help='output CSVs dir')

    parser.add_argument('--pool',
                        type=int,
                        help='number of simultaneous jobs running per file',
                        default=30)
    parser.add_argument('--allow_non_sf',
                        help='Allow games with no stockfish info',
                        default=False,
                        action="store_true")
    #parser.add_argument('--debug', help='DEBUG MODE', default = False, action="store_true")
    #parser.add_argument('--readers', type=int, help='number of simultaneous reader running per inputfile', default = 24)
    parser.add_argument('--queueSize',
                        type=int,
                        help='Max number of games to cache',
                        default=1000)

    args = parser.parse_args()

    haibrid_chess_utils.printWithDate(
        f"Starting CSV conversion of {args.input} writing to {args.outputDir}")

    os.makedirs(args.outputDir, exist_ok=True)

    name = os.path.basename(args.input).split('.')[0]
    outputName = os.path.join(args.outputDir, f"{name}.csv.bz2")
    #names[n] = (name, outputName)

    haibrid_chess_utils.printWithDate(f"Loading file: {name}")
    haibrid_chess_utils.printWithDate(f"Starting main loop")

    tstart = time.time()
    with multiprocessing.Manager() as manager:
        with multiprocessing.Pool(
                args.pool) as workers_pool, multiprocessing.Pool(3) as io_pool:
            pgnReader, gameReader, writer, unproccessedQueue, resultsQueue = processPGN(
                args.input, name, outputName, args.queueSize, args.pool,
                args.allow_non_sf, manager, workers_pool, io_pool)

            haibrid_chess_utils.printWithDate(
                f"Done loading Queues in {humanize.naturaldelta(time.time() - tstart)}, waiting for reading to finish"
            )

            cleanup(pgnReader, gameReader, writer)
Example #15
def pgnParser(inputQueue, outputQueue, num_splits, min_elo, max_elo, allow_negative_loss, allow_low_time):
    while True:
        try:
            dats = inputQueue.get()
        except queue.Empty:
            break
        if dats == 'kill':
            outputQueue.put('kill', True)
            break
        else:
            for dat in dats:
                try:
                    b_vals, nb_vals = parseLines(dat, num_splits, min_elo, max_elo, allow_negative_loss, allow_low_time)
                except KeyboardInterrupt:
                    raise
                except:
                    haibrid_chess_utils.printWithDate('error:')
                    haibrid_chess_utils.printWithDate(dat)
                    raise
                if len(b_vals) > 0 or len(nb_vals) > 0:
                    outputQueue.put((b_vals, nb_vals), True)
Example #16
def processTarget(target, outputDir, nrows, queue_out):
    tstart = time.time()
    os.makedirs(outputDir, exist_ok=True)
    tname = os.path.basename(target).split('.')[0]
    haibrid_chess_utils.printWithDate(f"Starting on {tname}",
                                      colour='green',
                                      flush=True)

    with bz2.open(target, 'rt') as f:
        df = pandas.read_csv(f,
                             index_col=None,
                             nrows=nrows,
                             usecols=[
                                 'cp_rel', 'active_elo', 'opponent_elo',
                                 'clock', 'opp_clock', 'move_ply', 'type',
                                 'low_time', 'active_won'
                             ])

    haibrid_chess_utils.printWithDate(
        f"{tname} loaded: {len(df)} lines in {humanize.naturaldelta(time.time() - tstart)}",
        colour='green',
        flush=True)

    #df['mul_factor'] = df['white_active'].apply(lambda x : 1 if x else -1)
    #df['cp_rel'] = df['cp'] * df['mul_factor']

    df['binned_cp'] = df['cp_rel'].apply(lambda x: x * 100 // 10 / 10)
    df['binned_active_elo'] = df['active_elo'].apply(lambda x:
                                                     (x // 100) * 100)
    df['binned_opponent_elo'] = df['opponent_elo'].apply(lambda x:
                                                         (x // 100) * 100)

    df['elo_delta'] = (df['active_elo'] -
                       df['opponent_elo']).apply(lambda x: (x // 100) * 100)

    df['binned_clock'] = df['clock'].apply(binClocks)
    df['opp_binned_clock'] = df['opp_clock'].apply(binClocks)
    df['game_phase'] = df['move_ply'].apply(binPly)
    #df['low_time'] = df['clock'].apply(isLowTime)

    for df_sub in np.split(df, range(10**6, len(df), 10**6), axis=0):
        queue_out.put(df_sub.copy())

    queue_out.put("done")

    runGroupBys(df, outputDir, tname)

    haibrid_chess_utils.printWithDate(
        f"{tname} done everything in {humanize.naturaldelta(time.time() - tstart)}",
        colour='green',
        flush=True)
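
binClocks and binPly are small binning helpers defined elsewhere in the source. Hypothetical versions with made-up cutoffs, just to show the kind of mapping processTarget expects:

# Illustrative binning helpers assumed by processTarget above; the real
# thresholds and labels live in the source module and may differ.
def binClocks(seconds):
    # Bucket remaining clock time (in seconds) into coarse ranges.
    if seconds < 30:
        return 0
    elif seconds < 60:
        return 30
    elif seconds < 300:
        return 60
    return 300


def binPly(move_ply):
    # Map a move's ply number to a rough game phase label.
    if move_ply < 20:
        return 'opening'
    elif move_ply < 60:
        return 'midgame'
    return 'endgame'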
Example #17
def main():
    parser = argparse.ArgumentParser(
        description='Make train test split of grouped boards',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('input', help='input CSV')
    parser.add_argument('outputDir', help='output CSVs dir')
    parser.add_argument('--nrows',
                        type=int,
                        help='number of rows to read in, FOR TESTING',
                        default=None)
    parser.add_argument('--blunder_ratio',
                        type=float,
                        help='ratio to declare a positive class',
                        default=.1)
    parser.add_argument('--min_count',
                        type=int,
                        help='minimum count needed to keep a board',
                        default=10)

    args = parser.parse_args()
    os.makedirs(os.path.join(args.outputDir, 'test'), exist_ok=True)
    os.makedirs(os.path.join(args.outputDir, 'train'), exist_ok=True)

    haibrid_chess_utils.printWithDate(f"Loading: {args.input}")

    df = pandas.read_csv(args.input)

    haibrid_chess_utils.printWithDate(f"Filtering: {args.input}")
    df = df[df['count'] >= args.min_count].copy()

    df['is_blunder_mean'] = df['is_blunder_wr_mean'] > args.blunder_ratio
    df['board_extended'] = df['board'].apply(lambda x: x + ' KQkq - 0 1')
    df['white_active'] = df['board'].apply(lambda x: x.endswith('w'))
    df['has_nonblunder_move'] = df['top_nonblunder'].isna() == False
    df['has_blunder_move'] = df['top_blunder'].isna() == False
    df['is_test'] = [
        np.random.random() < args.blunder_ratio for i in range(len(df))
    ]

    haibrid_chess_utils.printWithDate(f"Writing to: {args.outputDir}")
    df[df['is_test'] == False].to_csv(os.path.join(args.outputDir, 'train',
                                                   'grouped_fens.csv.bz2'),
                                      compression='bz2')
    df[df['is_test'] == True].to_csv(os.path.join(args.outputDir, 'test',
                                                  'grouped_fens.csv.bz2'),
                                     compression='bz2')
Example #18
def main():
    parser = argparse.ArgumentParser(description='Create two new csvs with select columns split by is_blunder_wr', formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('targets', nargs = '+', help='input CSVs')
    parser.add_argument('outputDir', help='output CSV')

    parser.add_argument('--min_elo', type=int, help='min active elo', default = 1000)
    parser.add_argument('--max_elo', type=int, help='max active elo', default = 9999999999)
    parser.add_argument('--allow_negative_loss', type=bool, help='allow winrate losses below 0', default = False)
    parser.add_argument('--allow_low_time', type=bool, help='Include low time moves', default = False)

    parser.add_argument('--min_ply', type=int, help='min move ply to consider', default = 6)
    parser.add_argument('--pool', type=int, help='number of simultaneous jobs running', default = 64)

    #parser.add_argument('--shuffleSize', type=int, help='Shuffle buffer size', default = 1000)
    parser.add_argument('--nrows', type=int, help='number of rows to read in', default = None)

    parser.add_argument('--nb_to_b_ratio', type=float, help='ratio of non blunders to blunders in dataset', default = 1.5)

    args = parser.parse_args()

    haibrid_chess_utils.printWithDate(f"Starting CSV split of {len(args.targets)} targets writing to {args.outputDir}")
    haibrid_chess_utils.printWithDate(f"Collecting {', '.join(target_columns)}")

    name = os.path.basename(args.targets[0]).split('.')[0]
    outputBlunder = os.path.join(args.outputDir, f"{name}_blunder.csv.bz2")
    outputNonBlunder = os.path.join(args.outputDir, f"{name}_nonblunder.csv.bz2")

    haibrid_chess_utils.printWithDate(f"Created outputs named {outputBlunder} and {outputNonBlunder}")



    os.makedirs(args.outputDir, exist_ok = True)

    haibrid_chess_utils.printWithDate(f"Starting read")

    with multiprocessing.Pool(args.pool) as pool
Example #19
def readerWorker(inputPath, unproccessedQueue, resultsQueue, stopLoadingQueue, num_readers, testing):
    tstart = time.time()
    gamesFile = haibrid_chess_utils.LightGamesFile(inputPath, just_games = True)
    haibrid_chess_utils.printWithDate(f"Reader created", flush = True)
    try:
        tLast = time.time()
        games_bundle = []
        for i, (_, gs) in enumerate(gamesFile):
            games_bundle.append(gs)
            if len(games_bundle) >= game_per_put:
                unproccessedQueue.put(games_bundle, True)
                games_bundle = []
            if i % 100 == 0 and  time.time() - tLast > logging_delay:
                tLast = time.time()
                haibrid_chess_utils.printWithDate(f"Loaded {i} games, input queue depth: {unproccessedQueue.qsize()}, output queue depth: {resultsQueue.qsize()}", flush = True)
                if testing and i > 100000:
                    break
                try:
                    stopLoading = stopLoadingQueue.get_nowait()
                except queue.Empty:
                    pass
                else:
                    if stopLoading == 'kill':
                        haibrid_chess_utils.printWithDate(f"Killed by max_bs, ending after {i} games loaded")
                        break
                    else:
                        raise RuntimeError(f"{stopLoading} in wrong queue")
    except EOFError:
        pass
    if len(games_bundle) > 0:
        unproccessedQueue.put(games_bundle, True)

    haibrid_chess_utils.printWithDate(f"Done loading Queue in {humanize.naturaldelta(time.time() - tstart)}, sending kills")
    for i in range(num_readers):
        #haibrid_chess_utils.printWithDate(f"Putting kill number {i} in queue")
        unproccessedQueue.put('kill', True, 100)
Example #20
def processPGN(gamesPath, inputName, outputName, queueSize, poolSize, manager, workers_pool, io_pool):
    unproccessedQueue = manager.Queue(queueSize)
    resultsQueue = manager.Queue(queueSize)

    readers = []
    for _ in range(poolSize - 1):
        reader = workers_pool.apply_async(gamesConverter, (unproccessedQueue, resultsQueue))
        readers.append(reader)
    haibrid_chess_utils.printWithDate(f"{inputName} Started {len(readers)} readers", flush = True)

    pgnReader = io_pool.apply_async(readerWorker, (gamesPath, unproccessedQueue, resultsQueue, inputName, len(readers)))
    haibrid_chess_utils.printWithDate(f"{inputName} loader created")

    writer = io_pool.apply_async(writerWorker, (outputName, resultsQueue, len(readers), inputName))
    haibrid_chess_utils.printWithDate(f"{inputName} Started writer for: {inputName}", flush = True)

    return pgnReader, readers, writer, unproccessedQueue, resultsQueue
Example #21
def main():
    parser = argparse.ArgumentParser(description='Convert PGN file to zip of the binaries', formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('input', help='input PGNs')
    parser.add_argument('output', help='output zip')

    parser.add_argument('--pool', type=int, help='number of simultaneous jobs processing the file, reading and writing are two more', default = 48)
    parser.add_argument('--num_boards', type=int, help='Max number of boards of each class to take from each game', default = 5)
    parser.add_argument('--batch_size', type=int, help='Number of boards per batch', default = 100)
    parser.add_argument('--max_bs', type=int, help='Max number of blunder files to create', default = 9999999999)
    parser.add_argument('--queueSize', type=int, help='Max number of games to cache', default = 1000)

    parser.add_argument('--min_elo', type=int, help='min active elo', default = 1000)
    parser.add_argument('--max_elo', type=int, help='max active elo', default = 9999999999)
    parser.add_argument('--allow_negative_loss', type=bool, help='allow winrate losses below 0', default = False)
    parser.add_argument('--allow_low_time', type=bool, help='Include low time moves', default = False)

    parser.add_argument('--testing', help='Make run on only first 1000', default = False, action='store_true')

    args = parser.parse_args()

    if args.testing:
        haibrid_chess_utils.printWithDate(f"TESTING RUN", colour = 'red')

    haibrid_chess_utils.printWithDate(f"Starting PGN conversion of {args.input} writing to {args.output}")

    tstart = time.time()
    with multiprocessing.Manager() as manager:
        with multiprocessing.Pool(args.pool) as worker_pool, multiprocessing.Pool(3) as io_pool:
            pgnReader, gameParsers, writer, unproccessedQueue, resultsQueue, stopQueue = setupProcessors(args.input, args.output, manager, worker_pool, io_pool, args.pool, args.num_boards, args.batch_size, args.queueSize, args.min_elo, args.max_elo, args.allow_negative_loss, args.allow_low_time, args.max_bs, args.testing)

            haibrid_chess_utils.printWithDate(f"Done setting up Queues in {humanize.naturaldelta(time.time() - tstart)}, waiting for reading to finish")

            cleanup(pgnReader, gameParsers, writer)

    haibrid_chess_utils.printWithDate(f"Done everything in {humanize.naturaldelta(time.time() - tstart)}, exiting")
Example #22
def cleanup(pgnReaders, gameReaders, writers):

    #time.sleep(10)
    while len(gameReaders) > 0:
        for i in range(len(gameReaders)):
            #haibrid_chess_utils.printWithDate(f"Checking {i} of {len(gameReaders)}", flush = True)
            try:
                gameReaders[i].get(1)
            except multiprocessing.TimeoutError:
                pass
            else:
                del gameReaders[i]
                break
    haibrid_chess_utils.printWithDate(f"Done processing")
    pgnReaders.get()
    haibrid_chess_utils.printWithDate(f"Done reading")
    writers.get()
    haibrid_chess_utils.printWithDate(f"Done cleanup")
Example #23
def setupProcessors(gamesPath, ouput_path, manager, worker_pool, io_pool, poolSize, num_boards, batch_size, queueSize, min_elo, max_elo, allow_negative_loss, allow_low_time, max_bs, testing):
    unproccessedQueue = manager.Queue(queueSize)
    resultsQueue = manager.Queue(queueSize)
    stopLoadingQueue = manager.Queue()

    parsers = []
    for _ in range(poolSize):
        parser = worker_pool.apply_async(pgnParser, (unproccessedQueue, resultsQueue, num_boards, min_elo, max_elo, allow_negative_loss, allow_low_time))
        parsers.append(parser)
    haibrid_chess_utils.printWithDate(f"Started {len(parsers)} parsers")

    pgnReader = io_pool.apply_async(readerWorker, (gamesPath, unproccessedQueue, resultsQueue, stopLoadingQueue, len(parsers), testing))
    haibrid_chess_utils.printWithDate(f"loader created")

    writer = io_pool.apply_async(writerWorker, (ouput_path, resultsQueue, stopLoadingQueue, len(parsers), num_boards, batch_size, max_bs))
    haibrid_chess_utils.printWithDate(f"Started writer")

    return pgnReader, parsers, writer, unproccessedQueue, resultsQueue, stopLoadingQueue
Example #24
def make_df_mmaps(df, name, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    outputName = os.path.join(output_dir, name)

    mmaps = {}
    haibrid_chess_utils.printWithDate(f"Making y_vals mmaps for: {name} done:",
                                      end=' ')
    for y_name in mmap_columns:
        make_var_mmap(y_name, outputName, mmaps, df)
        print(y_name, end=' ', flush=True)

    haibrid_chess_utils.printWithDate(f"Making move array mmaps for: {name}")

    make_move_mmap(outputName, mmaps, 'top_blunder', df)
    make_move_mmap(outputName, mmaps, 'top_nonblunder', df)

    haibrid_chess_utils.printWithDate(f"Making boards array mmaps for: {name}")

    make_board_mmap(outputName, mmaps, df)

    return mmaps
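
make_var_mmap, make_move_mmap and make_board_mmap are not included in these examples. A minimal sketch of the first, assuming numpy memmaps and a simple file-naming scheme (dtype handling and naming may differ in the source):

import numpy as np

# Hypothetical sketch of make_var_mmap used by make_df_mmaps above; file
# naming, dtype handling and the mmaps registry format are assumptions.
def make_var_mmap(y_name, outputName, mmaps, df):
    values = df[y_name].values
    mm = np.memmap(f"{outputName}_{y_name}.mm",
                   dtype=values.dtype,
                   mode='w+',
                   shape=values.shape)
    mm[:] = values  # copy the column into the memory-mapped file
    mm.flush()
    mmaps[y_name] = mm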
Example #25
def main():
    parser = argparse.ArgumentParser(
        description='Run groupby on the csv files',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('inputs', nargs='+', help='input CSVs')
    parser.add_argument('outputDir', help='output dir name')

    parser.add_argument('--pool',
                        type=int,
                        help='number of simultaneous jobs running',
                        default=32)

    parser.add_argument('--nrows',
                        type=int,
                        help='For debugging, limit the number of lines read',
                        default=None)

    args = parser.parse_args()

    haibrid_chess_utils.printWithDate(
        f"Starting CSV groupby of {', '.join(args.inputs)} writing to {args.outputDir}"
    )

    os.makedirs(args.outputDir, exist_ok=True)

    tstart = time.time()

    with multiprocessing.Manager() as manager:
        dfs_queue = manager.Queue()

        with multiprocessing.Pool(args.pool) as pool:
            mResult = pool.starmap_async(
                processTarget,
                [(tname,
                  os.path.join(args.outputDir,
                               os.path.basename(tname).split('.')[0]),
                  args.nrows, dfs_queue) for tname in args.inputs])

            dfs = []
            dones = []
            while len(dones) < len(args.inputs):
                try:
                    r = dfs_queue.get(True, 10)
                    if isinstance(r, str):
                        dones.append(r)
                    else:
                        dfs.append(r)
                except queue.Empty:
                    pass

            tname = "lichess_db_standard_rated_all"

            haibrid_chess_utils.printWithDate(f"All {len(dfs)} dfs collected")
            df = pandas.concat(dfs, ignore_index=True)

            haibrid_chess_utils.printWithDate(
                f"Full dataset loaded: {len(df)} lines")

            runGroupBys(df, args.outputDir, tname)

            mJoined = mResult.get()
            haibrid_chess_utils.printWithDate(
                f"Done all single months in {humanize.naturaldelta(time.time() - tstart)}"
            )

    haibrid_chess_utils.printWithDate(
        f"{tname} done everything in {humanize.naturaldelta(time.time() - tstart)}, exiting"
    )
Example #26
def load_csv(target_path, nrows):
    haibrid_chess_utils.printWithDate(f"Loading: {target_path}", flush=True)
    return pandas.read_csv(target_path, usecols=target_columns, nrows=nrows)
Example #27
def get_boards_dict(fPath, outputDir, queues, nrows):
    name = os.path.basename(fPath).split('.')[0]
    haibrid_chess_utils.printWithDate(f"Starting: {name}")
    maps_dict = {}
    counts_dict = {}
    blunder_moves = {}
    nonblunder_moves = {}
    target_columns_dicts = {c: {} for c in target_columns}
    with bz2.open(fPath, 'rt') as f:
        reader = csv.DictReader(f)
        tstart = time.time()
        for i, line in enumerate(reader):
            #import pdb; pdb.set_trace()
            if line['low_time'] == 'True':
                continue
            if nrows is not None and i >= nrows:
                break
            #if i % 10000 == 0:
            #    haibrid_chess_utils.printWithDate(f"{name} {i} rows dones, {i /(time.time() - tstart):.2f} rows per second", end ='\r')
            #if i > 100000:
            #    break
            #import pdb; pdb.set_trace()
            board_s = ' '.join(line['board'].split(' ')[:2])

            #, colour, castling = haibrid_chess_utils.preproc_fen(line['board'])
            board_hash = hash(board_s)
            try:
                counts_dict[board_hash] += 1
            except KeyError:
                counts_dict[board_hash] = 1
                maps_dict[board_hash] = board_s
                if line['is_blunder_wr'] == 'True':
                    blunder_moves[board_hash] = [line['move']]
                    nonblunder_moves[board_hash] = []
                else:
                    nonblunder_moves[board_hash] = [line['move']]
                    blunder_moves[board_hash] = []
                for c in target_columns:
                    target_columns_dicts[c][board_hash] = [
                        val_to_float(line[c])
                    ]
            else:
                if line['is_blunder_wr'] == 'True':
                    blunder_moves[board_hash].append(line['move'])
                else:
                    nonblunder_moves[board_hash].append(line['move'])
                for c in target_columns:
                    target_columns_dicts[c][board_hash].append(
                        val_to_float(line[c]))
    haibrid_chess_utils.printWithDate(f"{name} done, now writing")
    #name = os.path.basename(fPath).split('.')[0]
    outputName = os.path.join(outputDir, f"{name}.csv.bz2")
    for k in list(maps_dict.keys()):
        if counts_dict[k] < 2:
            del maps_dict[k]
            del counts_dict[k]
            del blunder_moves[k]
            del nonblunder_moves[k]
            for c in target_columns:
                del target_columns_dicts[c][k]

    write_dicts(outputName, maps_dict, counts_dict, blunder_moves,
                nonblunder_moves, target_columns_dicts)
    sub_dicts = {
        i: {
            'maps': {},
            'counts': {},
            'blunders': {},
            'nonblunders': {},
            'target_columns': {c: {}
                               for c in target_columns},
        }
        for i in range(8)
    }

    for k in list(maps_dict.keys()):
        if counts_dict[k] < 2:
            pass
        else:
            s_d = sub_dicts[k % 8]
            s_d['maps'][k] = maps_dict[k]
            s_d['counts'][k] = counts_dict[k]
            s_d['blunders'][k] = blunder_moves[k]
            s_d['nonblunders'][k] = nonblunder_moves[k]
            for c in target_columns:
                s_d['target_columns'][c][k] = target_columns_dicts[c][k]

    for k in range(8):
        queues['semaphore'].acquire()
        queues['maps'].put(sub_dicts[k]['maps'])
        queues['counts'].put(sub_dicts[k]['counts'])
        queues['blunders'].put(sub_dicts[k]['blunders'])
        queues['nonblunders'].put(sub_dicts[k]['nonblunders'])
        queues['target_columns'].put(sub_dicts[k]['target_columns'])
        queues['semaphore'].release()
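
val_to_float is a small conversion helper not shown above. Since the rows come from csv.DictReader every field is a string, so booleans and numbers both need converting; a plausible sketch (missing-value handling is an assumption):

# Hypothetical val_to_float helper assumed by get_boards_dict above.
def val_to_float(val):
    if val == 'True':
        return 1.0
    if val == 'False':
        return 0.0
    try:
        return float(val)
    except ValueError:
        return float('nan')  # empty or unparseable field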
Example #28
def writerWorker(outputFile, inputQueue, stopLoadingQueue, num_readers, num_splits, batch_size, max_bs):
    i = -1
    num_kill_remaining = num_readers
    tstart = time.time()
    haibrid_chess_utils.printWithDate("Writer created")

    last_b_batch = 0
    last_nb_batch = 0
    num_bs = 0
    b_bins = {}
    nb_bins = {}

    if not os.path.isdir(os.path.dirname(outputFile)) and len(os.path.dirname(outputFile)) > 0:
        os.makedirs(os.path.dirname(outputFile), exist_ok=True)

    with zipfile.ZipFile(outputFile, 'w') as myzip:
        haibrid_chess_utils.printWithDate(f"Created: {outputFile}")
        tLast = time.time()
        while True:
            unload_b_bins = False
            unload_nb_bins = False
            while len(b_bins) < num_splits:
                b_bins[last_b_batch] = []
                last_b_batch += 1
            while len(nb_bins) < num_splits:
                nb_bins[last_nb_batch] = []
                last_nb_batch += 1
            try:
                dat_vals = inputQueue.get()
            except queue.Empty:
                #Should never happen
                break
            try:
                b_vals, nb_vals = dat_vals
                for _, a in b_bins.items():
                    try:
                        a.append(b_vals.pop())
                        if len(a) >= batch_size:
                            unload_b_bins = True
                    except IndexError:
                        break
                for _, a in nb_bins.items():
                    try:
                        a.append(nb_vals.pop())
                        if len(a) >= batch_size:
                            unload_nb_bins = True
                    except IndexError:
                        break
            except:
                if dat_vals == 'kill':
                    num_kill_remaining -= 1
                    if num_kill_remaining <= 0:
                        break
                else:
                    raise
            else:
                if unload_nb_bins:
                    unload_bin_dict(nb_bins, batch_size, myzip)
                if unload_b_bins:
                    num_bs += unload_bin_dict(b_bins, batch_size, myzip, blunder = True)
                i += 1
                if num_bs >= max_bs:
                    haibrid_chess_utils.printWithDate(f"Max bs hit stopping after {i} games")
                    stopLoadingQueue.put('kill')
                    break
                if i % 100 == 0 and  time.time() - tLast > logging_delay:
                    tLast = time.time()
                    haibrid_chess_utils.printWithDate(f"Written {i} games {last_b_batch} b batches {last_nb_batch} nb batches in {humanize.naturaldelta(time.time() - tstart)}, doing {(i + 1) /(time.time() - tstart):.0f} games a second", flush = True)
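
unload_bin_dict is not included in these examples. A rough sketch of what it might do, assuming each full bin is flushed as a .npy entry in the zip and that the return value counts the batches written; entry naming and the tuple layout inside each bin are assumptions:

import io
import numpy as np

# Hypothetical sketch of unload_bin_dict used by writerWorker above.
def unload_bin_dict(bins, batch_size, myzip, blunder=False):
    prefix = 'blunder' if blunder else 'nonblunder'
    written = 0
    for batch_num, vals in bins.items():
        if len(vals) < batch_size:
            continue
        # Assume the first element of each parsed tuple is the board array.
        boards = np.stack([v[0] for v in vals[:batch_size]])
        buf = io.BytesIO()
        np.save(buf, boards)
        myzip.writestr(f"{prefix}_{batch_num}.npy", buf.getvalue())
        del vals[:batch_size]
        written += 1
    return written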
Example #29
def main():
    parser = argparse.ArgumentParser(
        description=
        'Create two new csvs with select columns split by is_blunder_wr',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('input', help='input CSV')
    parser.add_argument('outputDir', help='output CSV')

    parser.add_argument('--min_elo',
                        type=int,
                        help='min active elo',
                        default=1000)
    parser.add_argument('--max_elo',
                        type=int,
                        help='max active elo',
                        default=9999999999)
    parser.add_argument('--allow_negative_loss',
                        type=bool,
                        help='allow winrate losses below 0',
                        default=False)
    parser.add_argument('--allow_low_time',
                        type=bool,
                        help='Include low time moves',
                        default=False)

    parser.add_argument('--min_ply',
                        type=int,
                        help='min move ply to consider',
                        default=6)

    #parser.add_argument('--shuffleSize', type=int, help='Shuffle buffer size', default = 1000)
    parser.add_argument('--nrows',
                        type=int,
                        help='number of rows to read in',
                        default=None)

    parser.add_argument('--nb_to_b_ratio',
                        type=float,
                        help='ratio of non blunders to blunders in dataset',
                        default=1.5)

    args = parser.parse_args()

    haibrid_chess_utils.printWithDate(
        f"Starting CSV split of {args.input} writing to {args.outputDir}")
    haibrid_chess_utils.printWithDate(
        f"Collecting {', '.join(target_columns)}")

    name = os.path.basename(args.input).split('.')[0]
    outputBlunder = os.path.join(args.outputDir, f"{name}_blunder.csv.bz2")
    outputNonBlunder = os.path.join(args.outputDir,
                                    f"{name}_nonblunder.csv.bz2")

    haibrid_chess_utils.printWithDate(
        f"Created outputs named {outputBlunder} and {outputNonBlunder}")

    os.makedirs(args.outputDir, exist_ok=True)

    haibrid_chess_utils.printWithDate(f"Starting read")
    with bz2.open(args.input, 'rt') as f:
        df = pandas.read_csv(f, usecols=target_columns, nrows=args.nrows)

    haibrid_chess_utils.printWithDate(
        f"Filtering data starting at {len(df)} rows")

    df = df[df['move_ply'] >= args.min_ply]

    if not args.allow_low_time:
        df = df[df['low_time'].eq(False)]

    if not args.allow_negative_loss:
        df = df[df['winrate_loss'] > 0]

    df = df[df['active_elo'] > args.min_elo]
    df = df[df['active_elo'] < args.max_elo]

    df = df.dropna()

    haibrid_chess_utils.printWithDate(f"Filtering down data to {len(df)} rows")

    df_blunder = df[df['is_blunder_wr']]
    haibrid_chess_utils.printWithDate(f"Found {len(df_blunder)} blunders")

    df_blunder = df_blunder.sample(frac=1).reset_index(drop=True)

    df_non_blunder = df[df['is_blunder_wr'].eq(False)]
    haibrid_chess_utils.printWithDate(
        f"Found {len(df_non_blunder)} non blunders")

    df_non_blunder = df_non_blunder.sample(frac=1).reset_index(
        drop=True).iloc[:int(len(df_blunder) * args.nb_to_b_ratio)]

    haibrid_chess_utils.printWithDate(
        f"Reduced to {len(df_non_blunder)} non blunders")

    haibrid_chess_utils.printWithDate(f"Starting writing")

    with bz2.open(outputNonBlunder, 'wt') as fnb:
        df_non_blunder.to_csv(fnb, index=False)
    with bz2.open(outputBlunder, 'wt') as fb:
        df_blunder.to_csv(fb, index=False)