def recordio_protobuf_to_dmatrix(string_like):  # type: (bytes) -> xgb.DMatrix
    """Build an XGBoost DMatrix from in-memory RecordIO-Protobuf bytes.

    Only the 'values' field of each batch is read; any labels in the
    payload are ignored, so the resulting DMatrix carries features only.

    Args:
        string_like (bytes): RecordIO-Protobuf encoded payload.

    Returns:
        (xgb.DMatrix): feature-only DMatrix built from the payload.
    """
    payload = bytes(string_like)
    store = mlio.InMemoryStore(payload)
    params = mlio.DataReaderParams(dataset=[store], batch_size=100)
    reader = mlio.RecordIOProtobufReader(params)

    # The first example decides the representation for the whole payload:
    # dense batches become numpy arrays, anything else goes through
    # to_coo_matrix.
    dense = type(reader.peek_example()['values']) is mlio.DenseTensor
    convert = as_numpy if dense else to_coo_matrix

    # Labels, if present, are deliberately not read.
    batches = [convert(batch['values']) for batch in reader]

    if dense:
        stacked = np.vstack(batches)
    else:
        stacked = scipy_vstack(batches).tocsr()

    return xgb.DMatrix(stacked)
def to_tf(tensor):
    """Convert a tensor into its TensorFlow counterpart.

    A ``DenseTensor`` is converted through ``as_numpy`` into a regular
    TensorFlow tensor. Any other input is passed to ``to_coo_matrix`` and
    returned as a ``tf.SparseTensor``.

    :param tensor: ``DenseTensor`` or a tensor accepted by ``to_coo_matrix``
                   (presumably a sparse COO tensor -- confirm against callers)
    :return: ``tf.Tensor`` for dense input, ``tf.SparseTensor`` otherwise
    """
    if isinstance(tensor, DenseTensor):
        return tf.convert_to_tensor(as_numpy(tensor))

    # Going through CSR merges duplicate coordinates and yields row-major
    # ordering, which is the ordering tf.SparseTensor expects.
    csr = to_coo_matrix(tensor).tocsr()

    # Take rows, columns and values from the *same* COO view. The previous
    # implementation paired csr.nonzero() (which drops explicitly stored
    # zeros) with csr.data (which keeps them), so any stored zero produced
    # index and value arrays of different lengths.
    coo = csr.tocoo()

    # Shape (nnz, 2); tf.SparseTensor requires int64 indices.
    indices = np.column_stack((coo.row, coo.col)).astype(np.int64)

    return tf.SparseTensor(indices, coo.data, coo.shape)
def get_recordio_protobuf_dmatrix(path, is_pipe=False):
    """Load recordio-protobuf data into an XGBoost DMatrix with labels.

    :param path: Location of the recordio-protobuf data: a directory or file
                 when is_pipe is False, or a SageMaker pipe path (a single
                 path or a list of paths) when is_pipe is True
    :param is_pipe: Boolean to indicate if data is being read in pipe mode
    :return: xgb.DMatrix holding features and labels, or None when the
             reader has no example to offer
    :raises exc.UserError: if anything fails while reading or converting
    """
    try:
        if is_pipe:
            pipe_paths = path if isinstance(path, list) else [path]
            dataset = [mlio.SageMakerPipe(pipe) for pipe in pipe_paths]
        else:
            dataset = mlio.list_files(path)

        reader = mlio.RecordIOProtobufReader(
            mlio.DataReaderParams(dataset=dataset, batch_size=BATCH_SIZE))

        # Nothing to read: signal it with None rather than an empty DMatrix.
        if reader.peek_example() is None:
            return None

        # recordio-protobuf tensor may be dense (use numpy) or sparse (use scipy)
        dense = type(reader.peek_example()['values']) is mlio.DenseTensor
        convert = as_numpy if dense else to_coo_matrix

        feature_chunks = []
        label_chunks = []
        for batch in reader:
            feature_chunks.append(convert(batch['values']))
            label_chunks.append(as_numpy(batch['label_values']))

        if dense:
            features = np.vstack(feature_chunks)
        else:
            features = scipy_vstack(feature_chunks).tocsr()

        # axis=None flattens every label batch into one 1-D array.
        labels = np.concatenate(label_chunks, axis=None)

        return xgb.DMatrix(features, label=labels)
    except Exception as err:
        raise exc.UserError(
            "Failed to load recordio-protobuf data with exception:\n{}".format(
                err))