def preprocess(*data_sets, processed_dir="processed_data"):
    """Split *data_sets* into one test chunk and many training chunks on disk.

    Args:
        *data_sets: Forwarded verbatim to ``parse_data_sets``.
        processed_dir: Output directory, resolved relative to the current
            working directory; receives ``test.chunk.gz`` and
            ``train<N>.chunk.gz`` files.

    NOTE(review): a second ``preprocess`` definition appears later in this
    file and shadows this one at import time — confirm which is intended.
    """
    processed_dir = os.path.join(os.getcwd(), processed_dir)
    # makedirs with exist_ok creates missing parent directories too and
    # avoids the check-then-create race of isdir() followed by mkdir().
    os.makedirs(processed_dir, exist_ok=True)

    test_chunk, training_chunks = parse_data_sets(*data_sets)
    print("Allocating %s positions as test; remainder as training"
          % len(test_chunk), file=sys.stderr)

    print("Writing test chunk")
    test_dataset = DataSet.from_positions_w_context(test_chunk, is_test=True)
    test_filename = os.path.join(processed_dir, "test.chunk.gz")
    test_dataset.write(test_filename)

    # map() is lazy: each training chunk is converted just before it is
    # written, so only one dataset is held in memory at a time.
    training_datasets = map(DataSet.from_positions_w_context, training_chunks)
    chunks_written = 0  # the original used i+1 here, which raised NameError
                        # when training_chunks was empty
    for i, train_dataset in enumerate(training_datasets):
        if i % 10 == 0:
            print("Writing training chunk %s" % i)
        train_filename = os.path.join(processed_dir, "train%s.chunk.gz" % i)
        train_dataset.write(train_filename)
        chunks_written = i + 1
    print("%s chunks written" % chunks_written)
def preprocess(*data_sets, processed_dir=r"..\go_data\pre_data"):
    """Split *data_sets* into one test chunk and many training chunks on disk.

    Args:
        *data_sets: Forwarded verbatim to ``parse_data_sets``.
        processed_dir: Output directory, resolved relative to the current
            working directory; receives ``test.chunk.gz`` and
            ``train<N>.chunk.gz``. The default is a Windows-style relative
            path — raw string preserves the original value while avoiding
            the invalid ``\\g``/``\\p`` escape-sequence warning.

    NOTE(review): this definition shadows the earlier ``preprocess`` in the
    same file — confirm which is intended.
    """
    processed_dir = os.path.join(os.getcwd(), processed_dir)
    # makedirs with exist_ok creates missing parent directories too and
    # avoids the check-then-create race of isdir() followed by mkdir().
    os.makedirs(processed_dir, exist_ok=True)

    test_chunk, training_chunks = parse_data_sets(*data_sets)
    # User-facing messages are intentionally kept in Chinese, byte-for-byte.
    print("%s的数据作为test(测试集),剩下的数据作为训练集" % len(test_chunk))

    print("制作test chunk(测试集)")
    test_dataset = DataSet.from_positions_w_context(test_chunk, is_test=True)
    test_filename = os.path.join(processed_dir, "test.chunk.gz")
    test_dataset.write(test_filename)

    print("制作train chunk(训练集)")
    # map() is lazy: each training chunk is converted just before it is
    # written, so only one dataset is held in memory at a time.
    training_datasets = map(DataSet.from_positions_w_context, training_chunks)
    for i, train_dataset in enumerate(training_datasets):
        if i % 10 == 0:
            # Progress message every 10 chunks.
            print("已经制作了%s训练集" % (i + 1))
        train_filename = os.path.join(processed_dir, "train%s.chunk.gz" % i)
        train_dataset.write(train_filename)