def testPandasFeeding(self):
    """Check that `PandasSource` serves a `pandas.DataFrame` in ordered batches.

    A 32-row frame is read `iterations` times in batches of `batch_size`.
    Every batch is expected to hold consecutive rows, wrapping around to the
    first row once the end of the frame is passed. The first fetched value is
    compared with the frame's index and the remaining values with the frame's
    columns, in column order.

    Returns immediately, without checking anything, when `HAS_PANDAS` is
    false.
    """
    if not HAS_PANDAS:
        return
    batch_size = 3
    iterations = 1000
    index = np.arange(100, 132)
    a = np.arange(32)
    b = np.arange(32, 64)
    dataframe = pd.DataFrame({"a": a, "b": b}, index=index)
    # 32 rows is not a multiple of the batch size, so some batches straddle
    # the wrap-around point.
    num_rows = dataframe.shape[0]
    pandas_source = in_memory_source.PandasSource(dataframe, batch_size)
    pandas_columns = pandas_source()
    cache = {}
    with tf.Graph().as_default():
        pandas_tensors = [col.build(cache) for col in pandas_columns]
        with tf.Session() as sess:
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            try:
                for i in range(iterations):
                    # Row positions expected in the i-th batch.
                    indices = [
                        j % num_rows
                        for j in range(batch_size * i, batch_size * (i + 1))
                    ]
                    expected_df_indices = dataframe.index[indices]
                    expected_rows = dataframe.iloc[indices]
                    actual_value = sess.run(pandas_tensors)
                    # Element 0 is compared with the index; the columns follow.
                    np.testing.assert_array_equal(expected_df_indices,
                                                  actual_value[0])
                    for col_num, col in enumerate(dataframe.columns):
                        np.testing.assert_array_equal(
                            expected_rows[col].values,
                            actual_value[col_num + 1])
            finally:
                # Stop and join the queue-runner threads even when an
                # assertion or `sess.run` fails, so they are not left running.
                coord.request_stop()
                coord.join(threads)
def from_pandas(cls,
                pandas_dataframe,
                batch_size=None,
                shuffle=True,
                queue_capacity=None,
                min_after_dequeue=None,
                seed=None):
    """Build a `tf.learn.DataFrame` backed by a `pandas.DataFrame`.

    Args:
      pandas_dataframe: the `pandas.DataFrame` to read data from.
      batch_size: desired batch size.
      shuffle: whether records should be shuffled. Defaults to true.
      queue_capacity: capacity of the queue that will store parsed `Example`s.
      min_after_dequeue: minimum number of elements that can be left by a
        dequeue operation. Only used if `shuffle` is true.
      seed: passed to random shuffle operations. Only used if `shuffle` is
        true.

    Returns:
      A `tf.learn.DataFrame` that contains batches drawn from the given
      `pandas_dataframe`.
    """
    # Arguments are forwarded positionally; note that `queue_capacity` is
    # passed before `shuffle`, unlike the order in this signature.
    source = in_memory_source.PandasSource(
        pandas_dataframe, batch_size, queue_capacity, shuffle,
        min_after_dequeue, seed)
    result = cls()
    # Calling the source yields a namedtuple-like object; its fields become
    # the keyword arguments to `assign`.
    named_columns = source()._asdict()
    result.assign(**named_columns)
    return result
def from_pandas(cls,
                pandas_dataframe,
                num_threads=None,
                enqueue_size=None,
                batch_size=None,
                queue_capacity=None,
                min_after_dequeue=None,
                shuffle=True,
                seed=None,
                data_name="pandas_data"):
    """Build a `tf.learn.DataFrame` backed by a `pandas.DataFrame`.

    Args:
      pandas_dataframe: the `pandas.DataFrame` to read data from.
      num_threads: the number of threads to use for enqueueing.
      enqueue_size: the number of rows to enqueue per step.
      batch_size: desired batch size.
      queue_capacity: capacity of the queue that will store parsed `Example`s.
      min_after_dequeue: minimum number of elements that can be left by a
        dequeue operation. Only used if `shuffle` is true.
      shuffle: whether records should be shuffled. Defaults to true.
      seed: passed to random shuffle operations. Only used if `shuffle` is
        true.
      data_name: a scope name identifying the data.

    Returns:
      A `tf.learn.DataFrame` that contains batches drawn from the given
      `pandas_dataframe`.
    """
    # Every option is handed to PandasSource by keyword.
    source_options = {
        "num_threads": num_threads,
        "enqueue_size": enqueue_size,
        "batch_size": batch_size,
        "queue_capacity": queue_capacity,
        "shuffle": shuffle,
        "min_after_dequeue": min_after_dequeue,
        "seed": seed,
        "data_name": data_name,
    }
    source = in_memory_source.PandasSource(pandas_dataframe, **source_options)
    result = cls()
    # Calling the source yields a namedtuple-like object; its fields become
    # the keyword arguments to `assign`.
    named_columns = source()._asdict()
    result.assign(**named_columns)
    return result