from glob import glob

import pandas as pd
import tensorflow as tf
from tqdm import tqdm

from tfDataIngest import tfDataSetParquet as tfDsParquet
# Assumed module path for the annotation helper, mirroring the tfDataSetParquet import above.
from tfDataIngest import tfDataSetParquetAnnotation as tfDsParquetAnnotation

# seed, inputDataDir and valIds are expected to be defined elsewhere in this
# script (not shown in this section).

def constructAllSamplesDs():
    # Builds the full training dataset: parquet samples annotated with labels from train.csv.
    ds = tfDsParquet.create_parquet_dataset(dataFileNames)
    ds = tfDsParquetAnnotation.annotate(ds, trainLabelsFileName)
    return ds
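# A minimal sketch of what tfDsParquetAnnotation.annotate is assumed to do,
# inferred from its call sites: look each sample id up in train.csv and attach
# the corresponding label to the dataset element. The helper name
# annotate_sketch, the label column name, and the (sampleId, pixels, label)
# output layout are illustrative assumptions, not the repo's actual code.
import numpy as np

def annotate_sketch(ds, labelsCsvPath, labelColumn="label"):
    labelsDf = pd.read_csv(labelsCsvPath)
    # Map sample id (first column) to its label.
    idToLabel = dict(zip(labelsDf.iloc[:, 0].astype(str), labelsDf[labelColumn]))

    def lookup(sampleId):
        # Runs eagerly inside tf.py_function, so .numpy() is available.
        return np.int32(idToLabel[sampleId.numpy().decode("utf-8")])

    def attachLabel(sampleId, pixels):
        label = tf.py_function(lookup, [sampleId], tf.int32)
        label.set_shape(())
        return sampleId, pixels, label

    return ds.map(attachLabel)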
if __name__ == "__main__": tf.random.set_seed(seed + 563) print("Data dir is {0}".format(inputDataDir)) dataFileNames = glob("{0}/train*.parquet".format(inputDataDir)) trainLabelsFileName = "{0}/train.csv".format(inputDataDir) N = len(pd.read_csv(trainLabelsFileName)) #N = 1000 print("There are {0} training samples in total".format(N)) print("Parquet files count is {0}".format(len(dataFileNames))) print("First is {0}".format(dataFileNames[0])) ds = tfDsParquet.create_parquet_dataset(dataFileNames) ds = tfDsParquetAnnotation.annotate(ds, trainLabelsFileName) ds = ds.take(N) ds = ds.cache() print("Caching all DS") for element in tqdm(ds.as_numpy_iterator(), total=N, ascii=True): () def inValidationIndicator(ident): identBytes = ident.numpy() identStr = identBytes.decode('utf-8') res = 0 if identStr in valIds: res = 1 return res
import os
import sys
from glob import glob

import png

# Make the repo root importable (parent of this script's directory).
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from tfDataIngest import tfDataSetParquet as tfDsParquet

inputDataDir = sys.argv[1]
outputDir = sys.argv[2]

# test app
if __name__ == "__main__":
    files = glob(os.path.join(inputDataDir, "train*.parquet"))
    print("Found {0} parquet files in input dir {1}".format(len(files), inputDataDir))
    print("First is {0}".format(files[0]))

    ds = tfDsParquet.create_parquet_dataset([files[0]])
    for element in ds.as_numpy_iterator():
        sampleId, pixels = element
        sampleId = sampleId.decode("utf-8")
        fileName = os.path.join(outputDir, "{0}.png".format(sampleId))
        # Save the grayscale pixel matrix as an 8-bit PNG.
        png.from_array(pixels, mode="L").save(fileName)
    print("Done")
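# A minimal sketch of what tfDataSetParquet.create_parquet_dataset is assumed
# to provide, judging from how the scripts above consume it: a tf.data.Dataset
# of (sample_id, pixels) pairs, one per parquet row. The column layout (an id
# column followed by flattened pixel columns) and the 137x236 image shape are
# illustrative assumptions, not the repo's actual implementation.
import numpy as np
import pandas as pd
import tensorflow as tf

def create_parquet_dataset_sketch(fileNames, height=137, width=236):
    def gen():
        for fileName in fileNames:
            df = pd.read_parquet(fileName)
            ids = df.iloc[:, 0].astype(str).values
            pixels = df.iloc[:, 1:].values.astype(np.uint8)
            for sampleId, flatRow in zip(ids, pixels):
                yield sampleId, flatRow.reshape(height, width)

    return tf.data.Dataset.from_generator(
        gen,
        output_signature=(
            tf.TensorSpec(shape=(), dtype=tf.string),
            tf.TensorSpec(shape=(height, width), dtype=tf.uint8)))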