def test_tf_column_filter(self):
    """InputMode.TENSORFLOW TFEstimator saving temporary TFRecords, filtered by input_mapping columns"""
    # create a Spark DataFrame of training examples (features, labels)
    trainDF = self.spark.createDataFrame(self.train_examples, ['col1', 'col2'])

    # and add some extra columns that should NOT survive the input_mapping filter
    df = trainDF.withColumn('extra1', trainDF.col1)
    df = df.withColumn('extra2', trainDF.col2)
    # use assertEqual: the assertEquals alias is deprecated and removed in Python 3.12
    self.assertEqual(len(df.columns), 4)

    # train model
    args = {}
    estimator = TFEstimator(self.get_function('tf/train'), args, export_fn=self.get_function('tf/export')) \
        .setInputMapping({'col1': 'x', 'col2': 'y_'}) \
        .setInputMode(TFCluster.InputMode.TENSORFLOW) \
        .setModelDir(self.model_dir) \
        .setExportDir(self.export_dir) \
        .setTFRecordDir(self.tfrecord_dir) \
        .setClusterSize(self.num_workers) \
        .setNumPS(1) \
        .setBatchSize(10)
    # fit() is invoked for its side effects (model + temporary TFRecords on disk);
    # the returned model is not needed by this test
    estimator.fit(df)
    self.assertTrue(os.path.isdir(self.model_dir))
    self.assertTrue(os.path.isdir(self.tfrecord_dir))

    # the temporary TFRecords must contain only the mapped columns, not extra1/extra2
    df_tmp = dfutil.loadTFRecords(self.sc, self.tfrecord_dir)
    self.assertEqual(df_tmp.columns, ['col1', 'col2'])
def test_dfutils(self):
    """Round-trip a DataFrame through TFRecords via dfutil and verify values and loaded-DF tracking."""
    # create a DataFrame of a single row consisting of standard types
    # (binary, int, int_array, float, float_array, binary)
    row1 = (bytearray(b'text string'), 1, [2, 3, 4, 5], -1.1, [-2.2, -3.3, -4.4, -5.5], bytearray(b'\xff\xfe\xfd\xfc'))
    rdd = self.sc.parallelize([row1])
    df1 = self.spark.createDataFrame(rdd, ['a', 'b', 'c', 'd', 'e', 'f'])
    print("schema: {}".format(df1.schema))

    # save the DataFrame as TFRecords
    dfutil.saveAsTFRecords(df1, self.tfrecord_dir)
    self.assertTrue(hdfs_util.isdir(self.tfrecord_dir))

    # reload the DataFrame from exported TFRecords; 'a' and 'f' must be decoded as raw bytes
    df2 = dfutil.loadTFRecords(self.sc, self.tfrecord_dir, binary_features=['a', 'f'])
    row2 = df2.take(1)[0]
    print("row_saved: {}".format(row1))
    print("row_loaded: {}".format(row2))

    # confirm loaded values match original/saved values
    self.assertEqual(row1[0], row2['a'])
    self.assertEqual(row1[1], row2['b'])
    self.assertEqual(row1[2], row2['c'])
    # floats survive a float32 round-trip only approximately
    self.assertAlmostEqual(row1[3], row2['d'], 6)
    # compare element-wise; assert lengths first so zip cannot silently truncate
    self.assertEqual(len(row1[4]), len(row2['e']))
    for saved, loaded in zip(row1[4], row2['e']):
        self.assertAlmostEqual(saved, loaded, 6)
    print("type(f): {}".format(type(row2['f'])))
    self.assertEqual(len(row1[5]), len(row2['f']))
    for saved, loaded in zip(row1[5], row2['f']):
        self.assertEqual(saved, loaded)

    # check origin of each DataFrame
    self.assertFalse(dfutil.isLoadedDF(df1))
    self.assertTrue(dfutil.isLoadedDF(df2))

    # references are equivalent
    df_ref = df2
    self.assertTrue(dfutil.isLoadedDF(df_ref))

    # mutated DFs are not equal, even if contents are identical
    df3 = df2.filter(df2.a == 'string_label')
    self.assertFalse(dfutil.isLoadedDF(df3))

    # re-used/re-assigned variables are not equal
    df2 = df3
    self.assertFalse(dfutil.isLoadedDF(df2))
help="path to export saved_model", default="mnist_export") parser.add_argument("--output", help="HDFS path to save predictions", type=str, default="predictions") parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true") args = parser.parse_args() print("args:", args) if args.format == 'tfr': # load TFRecords as a DataFrame df = dfutil.loadTFRecords(sc, args.images_labels) else: # args.format == 'csv': # create RDD of input data def parse(ln): vec = [int(x) for x in ln.split(',')] return (vec[1:], vec[0]) images_labels = sc.textFile(args.images_labels).map(parse) df = spark.createDataFrame(images_labels, ['image', 'label']) df.show() if args.mode == 'train': estimator = TFEstimator(main_fun, args) \ .setInputMapping({'image': 'image', 'label': 'label'}) \ .setModelDir(args.model_dir) \
type=str) parser.add_argument("--train_data", help="HDFS path to training data", type=str) parser.add_argument("--validation_data", help="HDFS path to validation data", type=str) (args, rem) = parser.parse_known_args() input_mode = TFCluster.InputMode.SPARK if args.input_mode == 'spark' else TFCluster.InputMode.TENSORFLOW print("{0} ===== Start".format(datetime.now().isoformat())) df = dfutil.loadTFRecords(sc, args.train_data, binary_features=['image/encoded']) estimator = TFEstimator(main_fun, sys.argv, export_fn=inception_export.export) \ .setModelDir(args.train_dir) \ .setExportDir(args.export_dir) \ .setTFRecordDir(args.tfrecord_dir) \ .setClusterSize(args.cluster_size) \ .setNumPS(args.num_ps) \ .setInputMode(TFCluster.InputMode.TENSORFLOW) \ .setTensorboard(args.tensorboard) \ print("{0} ===== Train".format(datetime.now().isoformat())) model = estimator.fit(df) print("{0} ===== Inference".format(datetime.now().isoformat())) df = dfutil.loadTFRecords(sc,