# Standard-library imports used by the functions below; project-level helpers
# (create_neuralnet, create_optimizer, SupervisedDataSet, download_file) are
# assumed to be imported elsewhere in this module.
import ConfigParser
import cPickle
import json
import os
import time
from random import shuffle

import numpy


def test_dnn_for_big_data(config_file):
    config = ConfigParser.ConfigParser()
    config.read(config_file)

    hadoop_bin = config.get("hadoop", 'bin')
    temp_dir = config.get('temp', 'temp_dir')
    sample_file_list = config.get("input", 'sample_file_list')
    frame_name = config.get("input", 'data_frame_name')
    chunk_size = int(config.get("input", 'chunk_size'))
    model_file_path = config.get("model", 'model_file_path')
    predict_file_path = config.get("output", 'predict_file_path')

    # Load the pickled model: the dict holds the network architecture plus a
    # "parameter" entry with the trained weights.
    with open(model_file_path, 'rb') as model_file:
        model_data = cPickle.load(model_file)
    parameter = model_data["parameter"]
    del model_data["parameter"]

    neuralnet = create_neuralnet(model_data)
    neuralnet.set_parameter(parameter)

    # Read the list of sample files, one path per line (local or HDFS).
    sample_file_paths = []
    with open(sample_file_list, 'r') as f:
        for line in f:
            line = line.strip()
            if line:
                sample_file_paths.append(line)

    predict_file = open(predict_file_path, 'w')
    for file_path in sample_file_paths:
        # Pull HDFS files down to a local temp dir before reading them.
        if file_path.startswith("hdfs:"):
            local_file_path = download_file(hadoop_bin, file_path, temp_dir)
        else:
            local_file_path = file_path

        train_data_set = SupervisedDataSet(local_file_path, frame_name=frame_name)
        print time.ctime() + ":\tbegin predict with sample : " + file_path

        for idx, (train_X, train_y_) in enumerate(train_data_set.sample_batches(batch_size=chunk_size)):
            predict_y = neuralnet.predict(train_X)
            # Write one "label(s) <tab> prediction(s)" row per sample in the chunk.
            output_val = numpy.concatenate((train_y_, predict_y), axis=1)
            predict_file.write("\n".join("\t".join(str(v) for v in row) for row in output_val))
            predict_file.write("\n")

        # Clean up the local copy of a downloaded HDFS file.
        if file_path.startswith("hdfs:"):
            os.remove(local_file_path)
    predict_file.close()
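# For illustration only: a minimal sketch of the kind of INI config file that
# test_dnn_for_big_data above and train_dnn_for_big_data below expect. Section
# and option names come from the config.get calls in this module; every concrete
# value (paths, sizes, JSON payloads) is a made-up placeholder, not something
# shipped with this project.
#
#   [hadoop]
#   bin = /usr/local/hadoop/bin/hadoop
#
#   [temp]
#   temp_dir = /tmp/dnn_samples
#
#   [input]
#   sample_file_list = /data/sample_files.txt
#   data_frame_name = samples
#   chunk_size = 10000
#
#   [model]
#   model_file_path = /data/model/dnn_model_9.dat
#
#   [output]
#   predict_file_path = /data/output/predictions.tsv
#   output_model_prefix = /data/model/dnn_model
#
#   [network]
#   architecture = { ... JSON understood by create_neuralnet ... }
#
#   [train]
#   max_epoches = 10
#   chunk_size = 10000
#   optim_settings = { ... JSON understood by create_optimizer ... }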
def train_dnn_for_big_data(config_file):
    config = ConfigParser.ConfigParser()
    config.read(config_file)

    hadoop_bin = config.get("hadoop", 'bin')
    temp_dir = config.get('temp', 'temp_dir')
    sample_file_list = config.get("input", 'sample_file_list')
    frame_name = config.get("input", 'data_frame_name')
    output_model_prefix = config.get("output", 'output_model_prefix')

    # The network architecture is stored as a JSON string in the config file.
    try:
        network_arch = json.loads(config.get("network", "architecture"))
    except ValueError:
        # json.loads raises ValueError on malformed JSON; echo the raw value
        # to make the configuration error easier to spot, then re-raise.
        print config.get("network", "architecture")
        raise

    max_epoches = int(config.get("train", 'max_epoches'))
    chunk_size = int(config.get("train", 'chunk_size'))
    optim_settings = json.loads(config.get("train", 'optim_settings'))

    neuralnet = create_neuralnet(network_arch)
    optimizer = create_optimizer(optim_settings)
    optimizer.work_for(neuralnet)

    # Read the list of sample files, one path per line (local or HDFS).
    sample_file_paths = []
    with open(sample_file_list, 'r') as f:
        for line in f:
            line = line.strip()
            if line:
                sample_file_paths.append(line)

    for i in range(max_epoches):
        print time.ctime() + ":\tbegin epoch :", i
        # Shuffle the file order each epoch so chunks arrive in a different order.
        shuffle(sample_file_paths)

        for file_path in sample_file_paths:
            # Pull HDFS files down to a local temp dir before reading them.
            if file_path.startswith("hdfs:"):
                local_file_path = download_file(hadoop_bin, file_path, temp_dir)
            else:
                local_file_path = file_path

            train_data_set = SupervisedDataSet(local_file_path, frame_name=frame_name)
            print time.ctime() + ":\tbegin training with sample : " + file_path

            try:
                # Stream the file chunk by chunk: each chunk is handed to the
                # optimizer, which returns updated parameters for the network.
                for idx, (train_X, train_y) in enumerate(train_data_set.sample_batches(batch_size=chunk_size)):
                    print time.ctime() + ":\tbegin new chunk : ", idx, "@epoch : ", i
                    optimizer.update_chunk(train_X, train_y)
                    new_param = optimizer.optimize(neuralnet.get_parameter())
                    neuralnet.set_parameter(new_param)
            except Exception as e:
                print e

            # Clean up the local copy of a downloaded HDFS file.
            if file_path.startswith("hdfs:"):
                os.remove(local_file_path)

        # Checkpoint after each epoch: dump the architecture plus the current
        # parameters; copy the dict so the in-memory architecture is not mutated.
        with open(output_model_prefix + "_" + str(i) + ".dat", 'wb') as f:
            content = dict(network_arch)
            content["parameter"] = neuralnet.get_parameter()
            cPickle.dump(content, f, protocol=cPickle.HIGHEST_PROTOCOL)
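# A minimal, hypothetical entry point showing how these two functions could be
# driven from the command line. The original project may wire them up
# differently; the "train"/"test" mode names here are illustrative only.
if __name__ == "__main__":
    import sys

    if len(sys.argv) != 3 or sys.argv[1] not in ("train", "test"):
        print "usage: python %s train|test <config_file>" % sys.argv[0]
        sys.exit(1)

    if sys.argv[1] == "train":
        train_dnn_for_big_data(sys.argv[2])
    else:
        test_dnn_for_big_data(sys.argv[2])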