Example no. 1
def preprocess(*data_sets, processed_dir="processed_data"):
    '''
    Preprocess the SGF files. This takes all positions in the SGF files and extracts features for each position,
    as well as recording the correct next move. These positions are then split into chunks,
    with one test chunk and the remainder as training chunks.
    This step may take a while, and must be repeated if you change the feature extraction steps in features.py.
    :param data_sets: directories of downloaded SGF files
    :param processed_dir: name of the output directory for the processed chunks
    :return: None
    '''
    # os.path.join(): join the working directory with a relative path to form an absolute path
    # os.getcwd(): get the current working directory
    # os.path.isdir(): check whether the path exists and is a directory
    # os.mkdir(): create the directory
    processed_dir = os.path.join(os.getcwd(), processed_dir)
    if not os.path.isdir(processed_dir):
        os.mkdir(processed_dir)

    test_chunk, training_chunks = parse_data_sets(*data_sets)
    print("Allocating %s positions as test; remainder as training" %
          len(test_chunk),
          file=sys.stderr)

    print("Writing test chunk")
    test_dataset = DataSet.from_positions_w_context(test_chunk, is_test=True)
    test_filename = os.path.join(processed_dir, "test.chunk.gz")
    test_dataset.write(test_filename)

    training_datasets = map(DataSet.from_positions_w_context, training_chunks)
    for i, train_dataset in enumerate(training_datasets):
        if i % 10 == 0:
            print("Writing training chunk %s" % i)
        train_filename = os.path.join(processed_dir, "train%s.chunk.gz" % i)
        train_dataset.write(train_filename)
    print("%s chunks written" % (i + 1))
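This example assumes module-level imports of os and sys, plus DataSet and parse_data_sets from the project's own data-loading module. A minimal, hypothetical invocation might look like the sketch below; the SGF directory name is a placeholder, and the commented import mirrors the MuGo-style layout these examples appear to come from.

# Hypothetical usage sketch: the directory name is a placeholder, and
# DataSet / parse_data_sets are assumed to come from the project's
# data-loading module (load_data_sets in MuGo-style projects).
import os
import sys
# from load_data_sets import DataSet, parse_data_sets  # assumed project import

preprocess("data/kgs_games", processed_dir="processed_data")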
Example no. 2
def preprocess(dataset_root,
               processed_dir="processed_data",
               desired_test_size=10**5):
    if not os.path.exists(processed_dir):
        os.makedirs(processed_dir)
    test_chunk, train_chunk = parse_data_sets(dataset_root, desired_test_size)
    print("=====Test # %s, Train # %s.=====" %
          (len(test_chunk), len(train_chunk)))

    print("=====Writing test chunk=====")
    test_dataset = DataSet.from_positions_w_context(test_chunk, is_test=True)
    test_filename = os.path.join(processed_dir, "test.chunk.gz")
    test_dataset.write(test_filename)

    print("=====Writing training chunk=====")
    train_dataset = DataSet.from_positions_w_context(train_chunk)  # the original passed is_test=True here, which looks like a copy-paste slip; training data should not be flagged as test
    train_filename = os.path.join(processed_dir, "train.chunk.gz")
    train_dataset.write(train_filename)
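The .chunk.gz files written above are gzip-compressed serialized batches. The project's actual on-disk format lives in DataSet.write, so the following is only an illustrative sketch of the generic gzip round-trip pattern, using pickle for serialization:

# Illustrative sketch only: DataSet.write/read use a project-specific
# format; this just shows the gzip round-trip these files rely on.
import gzip
import pickle

def write_chunk(filename, obj):
    with gzip.open(filename, "wb") as f:
        pickle.dump(obj, f)

def read_chunk(filename):
    with gzip.open(filename, "rb") as f:
        return pickle.load(f)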
Example no. 3
def preprocess(*datasets, processed_dir="processed_data"):    # a single-star (*) parameter collects extra positional arguments into a tuple → (2, 3, 4); a double-star (**) parameter collects keyword arguments into a dict → {'a': 2, 'b': 3}
    print("Running preprocess module")
    processed_dir = os.path.join(os.getcwd(), processed_dir)  # output path
    if not os.path.isdir(processed_dir):
        os.mkdir(processed_dir)
    # Here a "chunk" is simply a batch of parsed positions, not a TensorFlow memory structure.
    test_chunk, training_chunks = parse_data_sets(*datasets)    # parse the SGF files
    print("Allocating %s positions as the test chunk; the remainder as training chunks" % len(test_chunk), file=sys.stderr)

    test_dataset = DataSet.from_positions_w_context(test_chunk, is_test=True)  # build the test DataSet from the parsed positions
    test_filename = os.path.join(processed_dir, "test.chunk.gz")
    print("Writing test chunk")
    test_dataset.write(test_filename)

    training_datasets = map(DataSet.from_positions_w_context, training_chunks)
    for i, train_dataset in tqdm.tqdm(enumerate(training_datasets)):
        train_filename = os.path.join(processed_dir, "train%s.chunk.gz" % i)
        print("Writing training chunk")
        train_dataset.write(train_filename)
    print("Wrote %s training chunks" % (i + 1))
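As the comment on the signature notes, a single-star parameter gathers positional arguments into a tuple and a double-star parameter gathers keyword arguments into a dict. A quick self-contained illustration:

# Demonstrates *args (tuple of positionals) and **kwargs (dict of keywords).
def show(*args, **kwargs):
    print(args)    # (2, 3, 4)
    print(kwargs)  # {'a': 2, 'b': 3}

show(2, 3, 4, a=2, b=3)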
Example no. 4
def preprocess(*data_sets, processed_dir="processed_data"):
    processed_dir = os.path.join(os.getcwd(), processed_dir)
    if not os.path.isdir(processed_dir):
        os.mkdir(processed_dir)

    test_chunk, training_chunks = parse_data_sets(*data_sets)
    print("Allocating %s positions as test; remainder as training" % len(test_chunk), file=sys.stderr)

    print("Writing test chunk")
    test_dataset = DataSet.from_positions_w_context(test_chunk, is_test=True)
    test_filename = os.path.join(processed_dir, "test.chunk.gz")
    test_dataset.write(test_filename)

    training_datasets = map(DataSet.from_positions_w_context, training_chunks)
    for i, train_dataset in enumerate(training_datasets):
        if i % 10 == 0:
            print("Writing training chunk %s" % i)
        train_filename = os.path.join(processed_dir, "train%s.chunk.gz" % i)
        train_dataset.write(train_filename)
    print("%s chunks written" % (i + 1))
Example no. 5
File: main.py Project: brilee/MuGo
def preprocess(*data_sets, processed_dir="processed_data"):
    processed_dir = os.path.join(os.getcwd(), processed_dir)
    if not os.path.isdir(processed_dir):
        os.mkdir(processed_dir)

    test_chunk, training_chunks = parse_data_sets(*data_sets)
    print("Allocating %s positions as test; remainder as training" % len(test_chunk), file=sys.stderr)

    print("Writing test chunk")
    test_dataset = DataSet.from_positions_w_context(test_chunk, is_test=True)
    test_filename = os.path.join(processed_dir, "test.chunk.gz")
    test_dataset.write(test_filename)

    training_datasets = map(DataSet.from_positions_w_context, training_chunks)
    for i, train_dataset in enumerate(training_datasets):
        if i % 10 == 0:
            print("Writing training chunk %s" % i)
        train_filename = os.path.join(processed_dir, "train%s.chunk.gz" % i)
        train_dataset.write(train_filename)
    print("%s chunks written" % (i+1))
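The train%s.chunk.gz naming scheme lets a later training stage collect the chunks back in order. A hypothetical helper (chunk_files is not part of the project; note the numeric sort, since 'train10' sorts before 'train2' lexically):

# Hypothetical helper, not part of the project: list written training
# chunks in numeric order for a later training pass.
import glob
import os
import re

def chunk_files(processed_dir="processed_data"):
    paths = glob.glob(os.path.join(processed_dir, "train*.chunk.gz"))
    return sorted(paths, key=lambda p: int(re.search(r"train(\d+)", p).group(1)))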
Example no. 6
def preprocess(*data_sets, processed_dir='processed_data'):
    processed_dir = os.path.join(os.getcwd(), processed_dir)
    if not os.path.isdir(processed_dir):
        os.mkdir(processed_dir)

    print(processed_dir)
    test_chunk, training_chunks = parse_data_sets(*data_sets)
    print('Allocating %s positions as test, remainder as training' %
          len(test_chunk),
          file=sys.stderr)

    print('Writing test chunk')
    test_dataset = DataSet.from_positions_w_context(test_chunk, is_test=True)
    test_filename = os.path.join(processed_dir, 'test.chunk.gz')
    test_dataset.write(test_filename)

    print('\nWriting training chunks')
    training_datasets = map(DataSet.from_positions_w_context, training_chunks)
    for i, dataset in tqdm.tqdm(enumerate(training_datasets)):
        train_filename = os.path.join(processed_dir, 'train%s.chunk.gz' % i)
        dataset.write(train_filename)
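map() is lazy in Python 3, so each training chunk is only converted to a DataSet when the loop reaches it, and tqdm cannot show a total over a bare enumerate. Below is a sketch of the same loop with an explicit total, assuming training_chunks is a sized sequence such as a list:

# Sketch: same loop as above, but with a progress total. Assumes
# training_chunks supports len(); map() stays lazy either way.
training_datasets = map(DataSet.from_positions_w_context, training_chunks)
for i, dataset in tqdm.tqdm(enumerate(training_datasets),
                            total=len(training_chunks)):
    dataset.write(os.path.join(processed_dir, 'train%s.chunk.gz' % i))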