def constructAllSamplesDs():
    """Build the dataset of all samples with annotations attached.

    Reads the module-level ``dataFileNames`` (passed to
    ``tfDsParquet.create_parquet_dataset``) and ``trainLabelsFileName``
    (passed to ``tfDsParquetAnnotation.annotate``); both must be assigned
    before this function is called.

    Returns:
        The object returned by ``tfDsParquetAnnotation.annotate``.
    """
    rawSamples = tfDsParquet.create_parquet_dataset(dataFileNames)
    return tfDsParquetAnnotation.annotate(rawSamples, trainLabelsFileName)
if __name__ == "__main__":
    # Seed TensorFlow's global random generator (base seed plus a fixed offset).
    tf.random.set_seed(seed + 563)

    print("Data dir is {0}".format(inputDataDir))
    dataFileNames = glob("{0}/train*.parquet".format(inputDataDir))
    trainLabelsFileName = "{0}/train.csv".format(inputDataDir)

    # The row count of the labels CSV is used as the total number of samples.
    N = len(pd.read_csv(trainLabelsFileName))
    print("There are {0} training samples in total".format(N))

    print("Parquet files count is {0}".format(len(dataFileNames)))
    if not dataFileNames:
        # Fail with a clear message instead of an IndexError on the next line.
        raise FileNotFoundError(
            "No train*.parquet files found in {0}".format(inputDataDir))
    print("First is {0}".format(dataFileNames[0]))

    # Reuse the shared builder; it reads dataFileNames / trainLabelsFileName
    # assigned above.
    ds = constructAllSamplesDs()
    ds = ds.take(N)
    ds = ds.cache()

    print("Caching all DS")
    # Iterate the whole dataset once, discarding the elements, so that the
    # cache() stage above is filled before any later use.
    for _ in tqdm(ds.as_numpy_iterator(), total=N, ascii=True):
        pass

    def inValidationIndicator(ident):
        """Return 1 if the sample id is in ``valIds``, otherwise 0.

        ``ident`` must provide ``.numpy()`` returning UTF-8 encoded bytes.
        ``valIds`` is not defined in this part of the file; it is looked up
        when the function is called.
        """
        identStr = ident.numpy().decode('utf-8')
        return int(identStr in valIds)
# Example #3
# 0
from glob import glob
import png
sys.path.append(os.path.join(__file__, '..', '..'))

from tfDataIngest import tfDataSetParquet as tfDsParquet

# Command-line arguments: the directory searched for train*.parquet files and
# the directory the PNG images are written to.
inputDataDir = sys.argv[1]
outputDir = sys.argv[2]

# test app
if __name__ == "__main__":
    files = glob(os.path.join(inputDataDir, "train*.parquet"))
    print("Found {0} parquet files in input dir {1}".format(
        len(files), inputDataDir))
    if not files:
        # Fail with a clear message instead of an IndexError on files[0].
        raise FileNotFoundError(
            "No train*.parquet files found in {0}".format(inputDataDir))
    print("First is {0}".format(files[0]))

    # Make sure the destination exists; saving a PNG does not create it.
    os.makedirs(outputDir, exist_ok=True)

    # Only the first matched parquet file is exported.
    ds = tfDsParquet.create_parquet_dataset([files[0]])

    # Each element is an (id bytes, pixel array) pair; one PNG is written per
    # sample, named after the decoded id.
    for sampleId, pixels in ds.as_numpy_iterator():
        sampleId = sampleId.decode("utf-8")
        fileName = os.path.join(outputDir, "{0}.png".format(sampleId))
        # NOTE(review): mode "L" should be 8-bit greyscale in pypng - confirm.
        png.from_array(pixels, mode="L").save(fileName)
    print("Done")