Example No. 1
import datetime
import os
import pickle
import gzip
from os import path
import tool_set

# Training set file path
dir_name = r''
model_save_path = r"model/"
if not os.path.isdir(model_save_path):
    os.makedirs(model_save_path)  # create the output directory
model_fn = path.join(model_save_path, 'save_net.ckpt')

start_time = datetime.datetime.now()
print("startTime: ", start_time)

# Load the pickled data; each row holds features + label
data = tool_set.read_and_decode(dir_name + "new.pkl")

isize = 10
img_channel = 3

img_pixel = isize
'''
# CNN: complete program for training the model
'''
# Parameters
training_epochs = 200
batch_size = 128
display_step = 10
channels = img_channel
per_process_gpu_memory_fraction = 1
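
The fragment stops before the fraction is actually consumed. A minimal sketch, assuming a TensorFlow 1.x session (everything except the variable defined above is an assumption):

import tensorflow as tf

# Cap the share of GPU memory this process may allocate (1 = all of it).
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=per_process_gpu_memory_fraction)
config = tf.ConfigProto(gpu_options=gpu_options)

with tf.Session(config=config) as sess:
    pass  # build and run the training graph here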
Example No. 2
def writeMNIST(sc, dir_name, img_pixel, channels, output, format, num_partitions):  # adapt to your own data format
  '''
  :remark  Convert the original data into the required format and store it on HDFS
  :param sc:  SparkContext (leave unchanged)
  :param dir_name:  input image directory, e.g. containing folders 0 and 1 (2 classes)
  :param img_pixel:   image size (e.g. 28 for MNIST)
  :param channels:    number of image bands (e.g. 1 for MNIST)
  :param output:    HDFS output path for the converted data
  :param format:    target data format
  :param num_partitions:  actual number of image classes; MNIST has 10
  :return: HDFS
  '''
  """Writes MNIST image/label vectors into parallelized files on HDFS"""
  '''
  # load MNIST gzip into memory
  with open(input_images, 'rb') as f:
    images = numpy.array(mnist.extract_images(f))  # extract all original image data into a numpy array

  with open(input_labels, 'rb') as f:  # extract all original label data into a numpy array
    if format == "csv2":  # data format
      labels = numpy.array(mnist.extract_labels(f, one_hot=False))  # array
    else:
      labels = numpy.array(mnist.extract_labels(f, one_hot=True))  # array

  '''

  # tool_set.create_pickle_train(dir_name,img_pixel,channels)

  # data=tool_set.read_and_decode(dir_name+"/train_data.pkl",img_pixel,channels)

  '''
  Read the images directly:
  data=tool_set.create_pickle_train(dir_name,img_pixel,channels)
  '''
  # read the pickled data
  data = tool_set.read_and_decode(dir_name, img_pixel, channels)


  ## image --> numpy array
  # data = create_pickle_train(dir_name,img_pixel,channels) #(image+label)

  # shuffle the data by row
  index = [i for i in range(len(data))]  # len(data) gives the number of rows
  np.random.shuffle(index)  # shuffle the indices
  data = data[index]
  del index


  labels_dense = data[:, -1]  # extract the label column
  if format == "csv2":  # data format
    labels = labels_dense
  else:
    # convert to one-hot
    labels = tool_set.dense_to_one_hot2(labels_dense, num_partitions)
  del labels_dense

  images_ = data[:, 0:img_pixel * img_pixel * channels]
  images = images_.reshape((-1, img_pixel, img_pixel, channels))
  del data

  # labels ---> float, image data --> int
  # The casts below are mandatory; without them later training fails with
  # assorted errors because the dtypes do not match what TensorFlowOnSpark expects.
  labels = labels.astype(np.float16)
  images = images.astype(np.uint8)

  # To convert your own data to HDFS, modify the two open() calls above so that your data is loaded into numpy arrays.

  shape = images.shape  # number of images x 28 x 28 x 1 (bands)
  print("images.shape: {0}".format(shape))          # 60000 x 28 x 28 for MNIST (28x28x1, 10 classes: 0-9)
  print("labels.shape: {0}".format(labels.shape))   # 60000 x 10

  # create RDDs of vectors
  imageRDD = sc.parallelize(images.reshape(shape[0], shape[1] * shape[2] * shape[3]), num_partitions)  # [-1, 28*28*1]
  # imageRDD = sc.parallelize(images.reshape(shape[0], shape[1] * shape[2] * nBands), num_partitions)  # nBands = number of image bands

  labelRDD = sc.parallelize(labels, num_partitions)

  output_images = output + "/images"  # output path
  output_labels = output + "/labels"  # output path

  # save RDDs as specific format
  if format == "pickle":
    imageRDD.saveAsPickleFile(output_images) #保存成Pickle
    labelRDD.saveAsPickleFile(output_labels) #
  elif format == "csv":
    imageRDD.map(toCSV).saveAsTextFile(output_images) # 转成csv 再转成 Text
    labelRDD.map(toCSV).saveAsTextFile(output_labels) # 转成csv 再转成 Text
  elif format == "csv2":
    imageRDD.map(toCSV).zip(labelRDD).map(lambda x: str(x[1]) + "|" + x[0]).saveAsTextFile(output) # image + label 放在一个文件转成 text
  else: # format == "tfr":
    tfRDD = imageRDD.zip(labelRDD).map(lambda x: (bytearray(toTFExample(x[0], x[1])), None)) # 转成 .tfrecord
    # requires: --jars tensorflow-hadoop-1.0-SNAPSHOT.jar
    tfRDD.saveAsNewAPIHadoopFile(output, "org.tensorflow.hadoop.io.TFRecordFileOutputFormat",
                                keyClass="org.apache.hadoop.io.BytesWritable",
                                valueClass="org.apache.hadoop.io.NullWritable")
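
A minimal usage sketch for the function above, assuming it runs as a Spark driver script (the app name, paths, and argument values below are placeholders, not from the original):

from pyspark import SparkContext

if __name__ == "__main__":
    sc = SparkContext(appName="write_mnist_example")
    # Convert 10x10, 3-band training images under dir_name into pickle files on HDFS.
    writeMNIST(sc, dir_name="data/train/", img_pixel=10, channels=3,
               output="output/train", format="pickle", num_partitions=2)
    sc.stop()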
Example No. 3
import pyximport
pyximport.install()

import tool_set
import numpy as np
from datetime import datetime
if __name__ == "__main__":
    # path="/home/wu/Water_extract/data/0_1.tif"
    # img=tool_set.Multiband2Array(path)
    # print(img.shape)
    # pass

    dir_name = "/home/wu/Water_extract/data/data/"

    start = datetime.now()

    tool_set.create_pickle_train(dir_name, 10, 4)
    #
    data0 = tool_set.read_and_decode(dir_name + 'train_data.pkl', 10, 4)
    print(data0.shape)

    data1 = tool_set.read_and_decode(dir_name + 'train_data_1.pkl', 10, 4)
    print(data1.shape)

    # data=np.vstack((data0,data1))
    # print(data.shape)

    print(datetime.now() - start)
Example No. 4
import datetime
from os import path
import tool_set

# Training set file path
dir_name = 'F:/water_detect/pkl/'

# The trained model will be saved under the model/ folder
model_save_path = "model/"

# # Output file path settings
fpa_path = path.join(dir_name, 'train_output.txt')
fpa = open(fpa_path, "a")  # this file does not seem to be used -- by xjxf
# # fpa.close()

start_time = datetime.datetime.now()
print("startTime: ", start_time)

# Load the pickled data; each row holds features + label
data = tool_set.read_and_decode(dir_name + "train_data_64.pkl", 64)
isize = 9
img_channel = 4
img_pixel = isize
'''
# CNN: complete program for training the model
'''
# Parameters
training_epochs = 500
batch_size = 920

display_step = 10
channels = img_channel

# Network Parameters
img_size = isize * isize * channels  # data input (img shape: 9*9*4)
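
The excerpt ends at the network parameters. A minimal sketch, assuming a TensorFlow 1.x graph, of how img_size typically feeds the input placeholders (num_classes is a hypothetical name, not from the original):

import tensorflow as tf

num_classes = 2                      # hypothetical: number of output classes
img_size = 9 * 9 * 4                 # isize * isize * channels, as defined above

x = tf.placeholder(tf.float32, [None, img_size])     # flattened image vectors
y = tf.placeholder(tf.float32, [None, num_classes])  # one-hot labels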
Example No. 5
# Define loss and optimizer
cost = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=pred))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

# Evaluate model
correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# initialize all the ops
init = tf.global_variables_initializer()

if __name__ == '__main__':

    # Load the pickled data; each row holds features + label
    data = tool_set.read_and_decode(dir_name + "train_data.pkl", img_pixel,
                                    channels)
    # data_1 = tool_set.read_and_decode(dir_name_1 + "train_data.pkl", img_pixel, channels)

    # data=np.vstack((data,data_1))  # merge the two datasets by row

    saver = tf.train.Saver()  # by default all variables are saved

    with tf.Session() as sess:
        sess.run(init)

        total_batch = int(img_nums / batch_size)

        for epoch in range(training_epochs):

            # At the start of each training epoch, shuffle the original data by row
            index = [i for i in range(len(data))]  # len(data) gives the number of rows
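
The example is cut off right after the shuffle index is built. A minimal, self-contained sketch of how such an epoch typically continues (the helper name and the feed pattern are assumptions, not the original code):

import numpy as np

def iterate_minibatches(data, batch_size):
    """Shuffle the rows of data and yield (features, dense_labels) mini-batches."""
    index = np.arange(len(data))
    np.random.shuffle(index)
    shuffled = data[index]
    for start in range(0, len(shuffled) - batch_size + 1, batch_size):
        batch = shuffled[start:start + batch_size]
        yield batch[:, :-1], batch[:, -1]  # feature columns, label column

# Each batch would then be fed to the optimizer inside the session, e.g.:
#     sess.run(optimizer, feed_dict={x: batch_x, y: batch_y})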
Example No. 6
import datetime
import os
import pickle
import gzip
from os import path
import tool_set

# Training set file path
dir_name = r''
model_save_path = r"model/"
if not os.path.isdir(model_save_path):
    os.makedirs(model_save_path)  # create the output directory
model_fn = path.join(model_save_path, 'save_net.ckpt')  # stores the mask imagery

start_time = datetime.datetime.now()
print("startTime: ", start_time)

# Load the pickled data; each row holds features + label
data = tool_set.read_and_decode(dir_name + "04_pool_50p.pkl")

isize = 2
img_channel = 4

img_pixel = isize
'''
# CNN: complete program for training the model
'''
# Parameters
training_epochs = 50
batch_size = 128
display_step = 10
channels = img_channel

# Network Parameters
Example No. 7
import datetime
from os import path
import tool_set

dir_name = ''
dir_summary_name = ''

# The trained model will be saved under the model/ folder
model_save_path = "model/"

# # Output file path settings
fpa_path = path.join(dir_name, 'train_output.txt')
fpa = open(fpa_path, "a")  # this file does not seem to be used -- by xjxf
# # fpa.close()

start_time = datetime.datetime.now()
print("startTime: ", start_time)

# Load the pickled data; each row holds features + label
data = tool_set.read_and_decode(dir_name + "train_data_400_all.pkl", 3)

isize = 400
img_channel = 3

img_pixel = isize
'''
# CNN: complete program for training the model
'''
# Parameters
training_epochs = 2500
batch_size = 1
display_step = 1
channels = img_channel

# Network Parameters