def preprocess(*data_sets, processed_dir="processed_data"):
    """Parse raw data sets and write gzipped chunk files to disk.

    Splits the inputs into one test chunk and many training chunks via
    ``parse_data_sets``, converts each chunk to a ``DataSet``, and writes
    ``test.chunk.gz`` plus ``train<i>.chunk.gz`` files under
    ``processed_dir`` (created if missing).

    Args:
        *data_sets: Arguments forwarded verbatim to ``parse_data_sets``
            (presumably paths to raw game records — confirm with caller).
        processed_dir: Output directory name, resolved relative to the
            current working directory.
    """
    processed_dir = os.path.join(os.getcwd(), processed_dir)
    # makedirs(exist_ok=True) replaces the isdir/mkdir pair: it avoids the
    # check-then-create race and also works if processed_dir is nested.
    os.makedirs(processed_dir, exist_ok=True)

    test_chunk, training_chunks = parse_data_sets(*data_sets)

    print("写 chunk")
    test_dataset = DataSet.from_positions_w_context(test_chunk, is_test=True)
    test_filename = os.path.join(processed_dir, "test.chunk.gz")
    test_dataset.write(test_filename)

    # map() keeps conversion lazy: each training chunk is materialized as a
    # DataSet only when the loop reaches it.
    training_datasets = map(DataSet.from_positions_w_context, training_chunks)
    # Track the count explicitly: the original read the loop variable `i`
    # after the loop, which raises NameError when training_chunks is empty.
    num_chunks = 0
    for i, train_dataset in enumerate(training_datasets):
        if i % 10 == 0:
            print("写chunk %s" % i)
        train_filename = os.path.join(processed_dir, "train%s.chunk.gz" % i)
        train_dataset.write(train_filename)
        num_chunks = i + 1
    print("%s chunks " % num_chunks)