Example #1
def main():
    parser = OptionParser()

    parser.add_option("-p", "--protocol", type="string", default="betaori_closed_hand")

    parser.add_option("-i", "--input", type="string", help="The input directory name")

    parser.add_option("-e", "--epochs", type="int", default=16)

    parser.add_option("--load", type="int", help="What epoch to load", default=0)

    parser.add_option(
        "--print", action="store_true", help="Whether to print predictions", default=False,
    )

    parser.add_option("--visualize", action="store_true", default=False)

    opts, _ = parser.parse_args()

    load_epoch = opts.load
    epochs = opts.epochs
    protocol_string = opts.protocol
    visualize = opts.visualize
    input_directory_name = opts.input
    print_predictions = opts.print

    root_dir = os.path.dirname(os.path.realpath(__file__))
    data_dir = os.path.join(root_dir, "..", "processed_data", input_directory_name)
    if not os.path.exists(data_dir):
        print("Directory with data is not exists. Run prepare_data.py")
        return

    if not os.listdir(data_dir):
        print("Directory with data is empty. Run prepare_data.py")
        return

    protocols = {
        "betaori_closed_hand": BetaoriClosedHandModel,
        "betaori_open_hand": BetaoriOpenHandModel,
        "hand_cost_open": OpenHandCostModel,
        "hand_cost_closed": ClosedHandCostModel,
    }

    protocol = protocols.get(protocol_string)

    if not protocol:
        parser.error("Possible values for protocol are: {}.".format(", ".join(protocols.keys())))

    set_up_logging("training_{}".format(protocol_string))

    model = protocol(input_directory_name, data_dir, print_predictions, epochs, visualize, load_epoch)

    model.run()
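A minimal, runnable sketch of the optparse pattern these examples share; the option values passed to parse_args below are illustrative:

from optparse import OptionParser

parser = OptionParser()
parser.add_option("-p", "--protocol", type="string", default="betaori_closed_hand")
parser.add_option("-e", "--epochs", type="int", default=16)
parser.add_option("--print", action="store_true", default=False)

# parse_args accepts an explicit argv list instead of reading sys.argv
opts, _ = parser.parse_args(["-p", "hand_cost_open", "-e", "32", "--print"])
assert opts.protocol == "hand_cost_open"
assert opts.epochs == 32
assert opts.print is True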
Example #2
def main():
    parser = OptionParser()

    parser.add_option("-p", "--protocol", type="string")
    parser.add_option("-e", "--epochs", type="int", default=16)
    parser.add_option("--load",
                      type="int",
                      help="What epoch to load",
                      default=0)
    parser.add_option(
        "--print",
        action="store_true",
        help="Do we need to print predictions or not",
        default=False,
    )
    parser.add_option("--visualize", action="store_true", default=False)

    opts, _ = parser.parse_args()

    load_epoch = opts.load
    epochs = opts.epochs
    protocol_string = opts.protocol
    visualize = opts.visualize
    print_predictions = opts.print

    data_dir = pathlib.Path(
        __file__).parent / ".." / "processed_data" / protocol_string
    if not os.path.exists(data_dir):
        print("Directory with data doesn't exist. Run prepare_data.py")
        return

    if not os.listdir(data_dir):
        print("Directory with data is empty. Run prepare_data.py")
        return

    protocols = {
        "agari_riichi_cost": AgariRiichiCostModel,
    }
    model_class = protocols.get(protocol_string)

    if not model_class:
        parser.error(
            f"Possible values for protocol are: {', '.join(protocols.keys())}."
        )

    set_up_logging("training_{}".format(protocol_string))

    model = model_class(protocol_string, data_dir, print_predictions, epochs,
                        visualize, load_epoch)
    model.run()
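The pathlib construction above builds the data directory relative to the script; Path objects can be passed straight to os.path functions, which is why the os.path.exists and os.listdir checks work unchanged. A small sketch (directory names illustrative):

import os
import pathlib

data_dir = pathlib.Path(__file__).parent / ".." / "processed_data" / "agari_riichi_cost"
print(os.path.exists(data_dir))   # os.path accepts Path objects directly
print(data_dir.resolve())         # resolve() collapses the ".." component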
Example #3
def main():
    parser = OptionParser()

    parser.add_option("-p",
                      "--protocol",
                      type="string",
                      default="betaori_closed_hand")

    parser.add_option("-o",
                      "--output",
                      type="string",
                      help="The output directory name")

    parser.add_option("-d",
                      "--train-path",
                      type="string",
                      help="Path to .csv with train data.")

    parser.add_option("-t",
                      "--test-path",
                      type="string",
                      help="Path to .csv with test data.")

    parser.add_option("--chunk", type="int", help="chunk size", default=100000)

    parser.add_option("--test-chunk",
                      type="int",
                      help="test file chunk size",
                      default=50000)

    parser.add_option("--percentage",
                      type="int",
                      help="test data percentage",
                      default=20)

    opts, _ = parser.parse_args()

    data_path = opts.train_path
    test_path = opts.test_path
    chunk_size = opts.chunk
    test_file_chunk_size = opts.test_chunk
    test_data_percentage = opts.percentage
    output_directory_name = opts.output

    if not data_path:
        parser.error("Path to .csv with train data is not given.")

    if not test_path:
        parser.error("Path to .csv with test data is not given.")

    protocol_string = opts.protocol
    protocols = {
        "betaori_closed_hand": BetaoriClosedHandProtocol,
        "betaori_open_hand": BetaoriOpenHandProtocol,
        "hand_cost_open": OpenHandCostProtocol,
        "hand_cost_closed": ClosedHandCostProtocol,
    }

    protocol = protocols.get(protocol_string)

    if not protocol:
        parser.error("Possible values for protocol are: {}".format(", ".join(
            protocols.keys())))

    set_up_logging("prepare_data")

    logger.info("{} protocol will be used.".format(protocol_string))
    logger.info("Chunk size: {}. Test data percentage: {}".format(
        chunk_size, test_data_percentage))

    root_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..",
                            "processed_data")
    if not os.path.exists(root_dir):
        os.mkdir(root_dir)

    data_dir = os.path.join(root_dir, output_directory_name)
    if os.path.exists(data_dir):
        logger.info("Data directory already exists. It was deleted.")
        shutil.rmtree(data_dir)

    os.mkdir(data_dir)

    total_count = line_count(data_path)
    test_count = int((total_count / 100.0) * test_data_percentage)

    logger.info("Train data size: {}".format(total_count))
    logger.info("Test data size: {}".format(test_count))

    # test data has to be in a separate file
    header = CSVExporter.header()
    # test_data = pd.read_csv(test_path, names=header, nrows=test_count)
    # test_data = test_data.replace([None, np.nan, 'None', 'NaN', 'nan'], '')
    #
    # protocol.parse_new_data(test_data.iterrows())

    for i, chunk in enumerate(
            pd.read_csv(test_path,
                        chunksize=test_file_chunk_size,
                        names=header,
                        nrows=test_count)):
        file_name = "test_chunk_{:03}.hkl".format(i)
        logger.info("Processing {}...".format(file_name))

        protocol = protocols[protocol_string]()

        chunk = chunk.replace([None, np.nan, "None", "NaN", "nan"], "")
        protocol.parse_new_data(chunk.iterrows())

        # use a separate name so the input csv path in test_path is not clobbered
        chunk_path = os.path.join(data_dir, file_name)
        hickle.dump(protocol, chunk_path, mode="w")

        gc.collect()

    logger.info("")
    logger.info("Processing train data...")

    for i, chunk in enumerate(
            pd.read_csv(data_path, chunksize=chunk_size, names=header)):
        file_name = "chunk_{:03}.h5".format(i)
        logger.info("Processing {}...".format(file_name))

        protocol = protocols[protocol_string]()

        chunk = chunk.replace([None, np.nan, "None", "NaN", "nan"], "")
        protocol.parse_new_data(chunk.iterrows())

        with h5py.File(os.path.join(data_dir, file_name), "w") as f:
            f.create_dataset("input_data",
                             data=protocol.input_data,
                             dtype="float32")
            f.create_dataset("output_data",
                             data=protocol.output_data,
                             dtype="float32")

        logger.info("Data size = {}".format(len(protocol.input_data)))

        gc.collect()
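A minimal sketch of the chunked-CSV-to-HDF5 pattern used in the train loop above, with tiny synthetic data; file names are hypothetical:

import h5py
import pandas as pd

pd.DataFrame({"a": range(10), "b": range(10)}).to_csv("toy.csv", index=False, header=False)

for i, chunk in enumerate(pd.read_csv("toy.csv", chunksize=4, names=["a", "b"])):
    with h5py.File("chunk_{:03}.h5".format(i), "w") as f:
        f.create_dataset("input_data", data=chunk.to_numpy(), dtype="float32")

with h5py.File("chunk_000.h5", "r") as f:
    print(f["input_data"][:])  # read one chunk back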
Example #4
def main():
    """
    Walk through tenhou logs and extract information needed for training.
    :return:
    """
    parser = OptionParser()

    parser.add_option("-p",
                      "--protocol",
                      type="string",
                      help="The output protocol")

    parser.add_option("-o", "--output", type="string", help="The output file")

    parser.add_option("-d",
                      "--data",
                      type="string",
                      help="Path to .sqlite3 db with logs content")

    parser.add_option("--limit",
                      type="int",
                      help="How many logs to load",
                      default=None)

    parser.add_option("--offset",
                      type="int",
                      help="Point from where to load logs",
                      default=0)

    opts, _ = parser.parse_args()

    db_path = opts.data
    limit = opts.limit
    offset = opts.offset
    output_format = opts.protocol
    output_file = opts.output

    if not db_path:
        parser.error("Path to db is not given.")

    allowed_outputs = {
        "betaori_closed_hand": BetaoriClosedHandParser(),
        "betaori_open_hand": BetaoriOpenHandParser(),
    }

    log_parser = allowed_outputs.get(output_format)
    if not log_parser:
        parser.error("Incorrect output format. Available options: {}".format(
            ", ".join(allowed_outputs.keys())))

    set_up_logging("parser")

    if os.path.exists(output_file):
        os.remove(output_file)
        logger.warning(f"File {output_file} already existed; it was removed")

    logger.info("Data file: {}".format(db_path))
    logger.info("{} protocol will be used".format(output_format))
    logger.info("Loading and decompressing logs content...")
    logs = load_logs(db_path, limit, offset)

    logs_count = 0
    samples_count = 0
    count_of_logs = len(logs)
    logger.info("Starting processing {} logs...".format(count_of_logs))

    bar = tqdm(logs)
    for log_data in bar:
        if logs_count > 0 and logs_count % 1000 == 0:
            logger.info("Processed logs: {}/{}".format(logs_count,
                                                       count_of_logs))
            logger.info(f"Samples: {samples_count}")

        game = log_parser.get_game_rounds(log_data["log_content"],
                                          log_data["log_id"])
        records = log_parser.parse_game_rounds(game)
        samples_count += len(records)

        with open(output_file, "a") as f:
            writer = csv.writer(f)
            writer.writerow(CSVExporter.header())
            for record in records:
                writer.writerow(record)

        logs_count += 1
        bar.set_description(f"Samples: {samples_count}")

    logger.info("Shuffle output file")
    # subprocess.run(
    #     "shuf -o {} < {}".format(os.path.abspath(output_file), os.path.abspath(output_file)), shell=True,
    # )

    logger.info("End")
    logger.info("Total samples:  {}".format(samples_count))
Example #5
def main():
    parser = OptionParser()

    parser.add_option('-p',
                      '--protocol',
                      type='string',
                      default='betaori_closed_hand')

    parser.add_option('-i',
                      '--input',
                      type='string',
                      help='The input directory name')

    parser.add_option('-e', '--epochs', type='int', default=16)

    parser.add_option('--load',
                      type='int',
                      help='What epoch to load',
                      default=0)

    parser.add_option('--print',
                      action='store_true',
                      help='Whether to print predictions',
                      default=False)

    parser.add_option('--visualize', action='store_true', default=False)

    opts, _ = parser.parse_args()

    load_epoch = opts.load
    epochs = opts.epochs
    protocol_string = opts.protocol
    visualize = opts.visualize
    input_directory_name = opts.input
    print_predictions = opts.print

    root_dir = os.path.dirname(os.path.realpath(__file__))
    data_dir = os.path.join(root_dir, '..', 'processed_data',
                            input_directory_name)
    if not os.path.exists(data_dir):
        print('Directory with data does not exist. Run prepare_data.py')
        return

    if not os.listdir(data_dir):
        print('Directory with data is empty. Run prepare_data.py')
        return

    protocols = {
        'betaori_closed_hand': BetaoriClosedHandModel,
        'betaori_open_hand': BetaoriOpenHandModel,
        'hand_cost_open': OpenHandCostModel,
        'hand_cost_closed': ClosedHandCostModel,
    }

    protocol = protocols.get(protocol_string)

    if not protocol:
        parser.error('Possible values for protocol are: {}.'.format(', '.join(
            protocols.keys())))

    set_up_logging('training_{}'.format(protocol_string))

    model = protocol(input_directory_name, data_dir, print_predictions, epochs,
                     visualize, load_epoch)

    model.run()
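The protocols dict maps a string to a model class and instantiates whichever one was requested; a minimal sketch of this dispatch with stand-in classes (names hypothetical):

class ClosedHandModel:
    def run(self):
        print('closed hand')

class OpenHandModel:
    def run(self):
        print('open hand')

protocols = {'closed': ClosedHandModel, 'open': OpenHandModel}

model_class = protocols.get('open')
if model_class is None:
    raise SystemExit('unknown protocol')
model_class().run()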
Example #6
def main():
    parser = OptionParser()

    parser.add_option("-p", "--protocol", type="string")
    parser.add_option("-d",
                      "--train-path",
                      type="string",
                      help="Path to .csv with train data.")
    parser.add_option("-t",
                      "--test-path",
                      type="string",
                      help="Path to .csv with test data.")
    parser.add_option("-c",
                      "--chunk",
                      type="int",
                      help="chunk size",
                      default=100000)

    opts, _ = parser.parse_args()

    data_path = opts.train_path
    test_path = opts.test_path
    chunk_size = opts.chunk

    if not data_path:
        parser.error("Path to .csv with train data is not given.")

    if not test_path:
        parser.error("Path to .csv with test data is not given.")

    protocol_string = opts.protocol
    protocols = {
        "agari_riichi_cost": AgariRiichiCostProtocol,
    }

    protocol_class = protocols.get(protocol_string)

    if not protocol_class:
        parser.error(
            f"Possible values for protocol are: {', '.join(protocols.keys())}")

    set_up_logging("prepare_data")

    logger.info(f"{protocol_class.__name__} protocol will be used.")
    logger.info(f"Chunk size: {chunk_size}")

    processed_folder = pathlib.Path(__file__).parent / ".." / "processed_data"
    if not processed_folder.exists():
        os.mkdir(processed_folder)

    data_dir = processed_folder / protocol_string
    if data_dir.exists():
        logger.info("Data directory already exists. It was deleted.")
        shutil.rmtree(data_dir)

    os.mkdir(data_dir)

    for i, chunk in enumerate(
            pd.read_csv(test_path,
                        chunksize=chunk_size,
                        names=protocol_class.CSV_HEADER)):
        file_name = f"test_chunk_{i:03}.hkl"
        logger.info(f"Processing {file_name}...")

        protocol = protocol_class()

        chunk = chunk.replace([None, np.nan, "None", "NaN", "nan"], "")
        protocol.parse_new_data(chunk.iterrows())

        # use a separate name so the input csv path in test_path is not clobbered
        chunk_path = os.path.join(data_dir, file_name)
        hickle.dump(
            {
                "input_data": protocol.input_data,
                "output_data": protocol.output_data,
                "verification_data": protocol.verification_data,
            },
            chunk_path,
            mode="w",
        )

        logger.info(f"Test size = {len(protocol.input_data)}")

        del protocol
        gc.collect()

    logger.info("")
    logger.info("Processing train data...")

    for i, chunk in enumerate(
            pd.read_csv(data_path,
                        chunksize=chunk_size,
                        names=protocol_class.CSV_HEADER)):
        file_name = f"chunk_{i:03}.hkl"
        logger.info(f"Processing {file_name}...")

        protocol = protocol_class()

        chunk = chunk.replace([None, np.nan, "None", "NaN", "nan"], "")
        protocol.parse_new_data(chunk.iterrows())

        with h5py.File(os.path.join(data_dir, file_name), "w") as f:
            f.create_dataset("input_data",
                             data=protocol.input_data,
                             dtype="float32")
            f.create_dataset("output_data",
                             data=protocol.output_data,
                             dtype="float32")

        logger.info(f"Data size = {len(protocol.input_data)}")

        del protocol
        gc.collect()
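hickle serializes plain Python and numpy structures to HDF5 files; a minimal round-trip sketch of the dump call used above, assuming hickle and numpy are installed:

import numpy as np
import hickle

payload = {
    "input_data": np.zeros((2, 3), dtype="float32"),
    "output_data": np.ones((2, 1), dtype="float32"),
}
hickle.dump(payload, "test_chunk_000.hkl", mode="w")

restored = hickle.load("test_chunk_000.hkl")
print(restored["input_data"].shape)  # (2, 3)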
Example #7
def main():
    parser = OptionParser()

    parser.add_option('-p',
                      '--protocol',
                      type='string',
                      default='betaori_closed_hand')

    parser.add_option('-o',
                      '--output',
                      type='string',
                      help='The output directory name')

    parser.add_option('-d',
                      '--train-path',
                      type='string',
                      help='Path to .csv with train data.')

    parser.add_option('-t',
                      '--test-path',
                      type='string',
                      help='Path to .csv with test data.')

    parser.add_option('--chunk', type='int', help='chunk size', default=100000)

    parser.add_option('--test-chunk',
                      type='int',
                      help='test file chunk size',
                      default=50000)

    parser.add_option('--percentage',
                      type='int',
                      help='test data percentage',
                      default=20)

    opts, _ = parser.parse_args()

    data_path = opts.train_path
    test_path = opts.test_path
    chunk_size = opts.chunk
    test_file_chunk_size = opts.test_chunk
    test_data_percentage = opts.percentage
    output_directory_name = opts.output

    if not data_path:
        parser.error('Path to .csv with train data is not given.')

    if not test_path:
        parser.error('Path to .csv with test data is not given.')

    protocol_string = opts.protocol
    protocols = {
        'betaori_closed_hand': BetaoriClosedHandProtocol,
        'betaori_open_hand': BetaoriOpenHandProtocol,
        'hand_cost_open': OpenHandCostProtocol,
        'hand_cost_closed': ClosedHandCostProtocol,
    }

    protocol = protocols.get(protocol_string)

    if not protocol:
        parser.error('Possible values for protocol are: {}'.format(', '.join(
            protocols.keys())))

    set_up_logging('prepare_data')

    logger.info('{} protocol will be used.'.format(protocol_string))
    logger.info('Chunk size: {}. Test data percentage: {}'.format(
        chunk_size, test_data_percentage))

    root_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
                            'processed_data')
    if not os.path.exists(root_dir):
        os.mkdir(root_dir)

    data_dir = os.path.join(root_dir, output_directory_name)
    if os.path.exists(data_dir):
        logger.info('Data directory already exists. It will be deleted.')
        shutil.rmtree(data_dir)

    os.mkdir(data_dir)

    total_count = line_count(data_path)
    test_count = int((total_count / 100.0) * test_data_percentage)

    logger.info('Train data size: {}'.format(total_count))
    logger.info('Test data size: {}'.format(test_count))

    # test data has to be in a separate file
    header = CSVExporter.header()
    # test_data = pd.read_csv(test_path, names=header, nrows=test_count)
    # test_data = test_data.replace([None, np.nan, 'None', 'NaN', 'nan'], '')
    #
    # protocol.parse_new_data(test_data.iterrows())

    for i, chunk in enumerate(
            pd.read_csv(test_path,
                        chunksize=test_file_chunk_size,
                        names=header,
                        nrows=test_count)):
        file_name = 'test_chunk_{:03}.hkl'.format(i)
        logger.info('Processing {}...'.format(file_name))

        protocol = protocols[protocol_string]()

        chunk = chunk.replace([None, np.nan, 'None', 'NaN', 'nan'], '')
        protocol.parse_new_data(chunk.iterrows())

        # use a separate name so the input csv path in test_path is not clobbered
        chunk_path = os.path.join(data_dir, file_name)
        hickle.dump(protocol, chunk_path, mode='w')

        gc.collect()

    logger.info('')
    logger.info('Processing train data...')

    for i, chunk in enumerate(
            pd.read_csv(data_path, chunksize=chunk_size, names=header)):
        file_name = 'chunk_{:03}.h5'.format(i)
        logger.info('Processing {}...'.format(file_name))

        protocol = protocols[protocol_string]()

        chunk = chunk.replace([None, np.nan, 'None', 'NaN', 'nan'], '')
        protocol.parse_new_data(chunk.iterrows())

        with h5py.File(os.path.join(data_dir, file_name), 'w') as f:
            f.create_dataset('input_data',
                             data=protocol.input_data,
                             dtype='float32')
            f.create_dataset('output_data',
                             data=protocol.output_data,
                             dtype='float32')

        logger.info('Data size = {}'.format(len(protocol.input_data)))

        gc.collect()
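The test split above reserves a fixed percentage of the input rows; a short worked check with illustrative numbers:

total_count = 1_000_000
test_data_percentage = 20
test_count = int((total_count / 100.0) * test_data_percentage)
assert test_count == 200_000  # the first 200k rows feed the test chunks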
Example #8
def main():
    parser = OptionParser()

    parser.add_option('-p',
                      '--protocol',
                      type='string',
                      help='The output protocol')

    parser.add_option('-o', '--output', type='string', help='The output file')

    parser.add_option('-d',
                      '--data',
                      type='string',
                      help='Path to .sqlite3 db with logs content')

    parser.add_option('-l',
                      '--limit',
                      type='string',
                      help='For debugging',
                      default='unlimited')

    opts, _ = parser.parse_args()

    db_path = opts.data
    limit = opts.limit
    output_format = opts.protocol
    output_file = opts.output

    if not db_path:
        parser.error('Path to db is not given.')

    allowed_outputs = {
        'closed_hand': BetaoriClosedHandParser(),
        'open_hand': BetaoriOpenHandParser(),
    }

    log_parser = allowed_outputs.get(output_format)
    if not log_parser:
        parser.error('Incorrect output format. Available options: {}'.format(
            ', '.join(allowed_outputs.keys())))

    set_up_logging('parser')

    if os.path.exists(output_file):
        logger.warning(
            'File {} already exists! New data will be appended to it.'.format(
                output_file))

    logger.info('Data file: {}'.format(db_path))
    logger.info('{} protocol will be used'.format(output_format))
    logger.info('Loading and decompressing logs content...')
    logs = load_logs(db_path, limit)

    logs_count = 0
    samples_count = 0
    count_of_logs = len(logs)
    logger.info('Starting processing {} logs...'.format(count_of_logs))

    for log_data in logs:
        if logs_count > 0 and logs_count % 1000 == 0:
            logger.info('Processed logs: {}/{}'.format(logs_count,
                                                       count_of_logs))
            logger.info('Samples: {}'.format(samples_count))

        game = log_parser.get_game_rounds(log_data['log_content'],
                                          log_data['log_id'])
        records = log_parser.parse_game_rounds(game)
        samples_count += len(records)

        with open(output_file, 'a') as f:
            writer = csv.writer(f)
            for record in records:
                writer.writerow(record)

        logs_count += 1

    logger.info('Shuffle output file')
    subprocess.run('shuf -o {} < {}'.format(os.path.abspath(output_file),
                                            os.path.abspath(output_file)),
                   shell=True)

    logger.info('End')
    logger.info('Total samples: {}'.format(samples_count))
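shuf -o FILE < FILE is safe here because shuf reads all of its input before opening the output file. A pure-Python alternative that shuffles in memory, only suitable for files that fit in RAM (path illustrative):

import random

path = 'output.csv'
with open(path) as f:
    lines = f.readlines()
random.shuffle(lines)
with open(path, 'w') as f:
    f.writelines(lines)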