def _get_parquet_dmatrix_pipe_mode(pipe_path):
    """Get Data Matrix from parquet data in pipe mode.

    :param pipe_path: SageMaker pipe path where parquet formatted training data is piped
    :return: xgb.DMatrix or None if the pipe yielded no records
    :raises exc.UserError: if reading or parsing the piped parquet data fails
    """
    try:
        f = mlio.SageMakerPipe(pipe_path)
        examples = []
        with f.open_read() as strm:
            reader = mlio.ParquetRecordReader(strm)
            for record in reader:
                table = pq.read_table(as_arrow_file(record))
                array = table.to_pandas()
                # Use isinstance (idiomatic and subclass-aware) instead of an
                # exact `type(...) is` check before converting to a ndarray.
                if isinstance(array, pd.DataFrame):
                    array = array.to_numpy()
                examples.append(array)

        if not examples:
            return None

        data = np.vstack(examples)
        del examples  # release the per-record arrays before building the DMatrix

        # First column is the label; remaining columns are the features.
        return xgb.DMatrix(data[:, 1:], label=data[:, 0])
    except Exception as e:
        raise exc.UserError("Failed to load parquet data with exception:\n{}".format(e))
def pipe_iterator(self, fifo_id=0):
    """Open the next FIFO of this SageMaker pipe and return a protobuf record reader.

    :param fifo_id: nominally the FIFO index to open; NOTE(review): this
        argument is immediately overwritten by ``self.increment()`` below, so
        callers cannot actually choose a FIFO — confirm whether the parameter
        should be honored or removed.
    :return: an mlio.RecordIOProtobufReader over the freshly opened pipe
    """
    # Claim the next fifo id from the shared counter (ignores the argument).
    fifo_id = self.increment()
    print(f"opening pipe iterator {self.path}:{fifo_id}")
    pipe = mlio.SageMakerPipe(self.path, fifo_id=fifo_id)
    # self.reader_params supplies the remaining DataReaderParams kwargs
    # (e.g. batch size) configured on this object.
    reader_params = mlio.DataReaderParams(dataset=[pipe], **self.reader_params)
    reader = mlio.RecordIOProtobufReader(reader_params)
    return reader
def pipe_iterator(self):
    """Open the next FIFO of this pipe and return a chunked example iterator.

    :return: result of chunk_iterable over the examples read from the pipe
    """
    # Atomically claim the next fifo id; the lock guards the shared counter.
    with self.lock:
        next_id = self.count.value
        self.count.value = next_id + 1

    pipe = mlio.SageMakerPipe(self.path, fifo_id=next_id)
    # 'error' makes chunk_iterable raise if the final chunk is short.
    return chunk_iterable(read_examples(pipe), self.size, last='error')
def get_recordio_protobuf_dmatrix(path, is_pipe=False, subsample_ratio_on_read=None):
    """Get Data Matrix from recordio-protobuf data.

    :param path: Path where recordio-protobuf formatted training data resides,
        either directory, file, or SageMaker pipe
    :param is_pipe: Boolean to indicate if data is being read in pipe mode
    :param subsample_ratio_on_read: None or a value in (0, 1) to indicate how much of the dataset should
            be read into memory.
    :return: xgb.DMatrix or None if the dataset is empty
    :raises exc.UserError: if reading or parsing the data fails
    """
    try:
        # Only the dataset source differs between the two modes; build the
        # reader once instead of duplicating its construction per branch.
        if is_pipe:
            dataset = [mlio.SageMakerPipe(path)]
        else:
            dataset = mlio.list_files(path)
        reader = mlio.RecordIOProtobufReader(
            dataset=dataset,
            batch_size=BATCH_SIZE,
            subsample_ratio=subsample_ratio_on_read)

        exm = reader.peek_example()
        if exm is None:
            return None

        # Recordio-protobuf tensor may be dense (use numpy) or sparse (use scipy)
        if isinstance(exm['values'], mlio.DenseTensor):
            to_matrix = as_numpy
            vstack = np.vstack
        else:
            to_matrix = to_coo_matrix
            vstack = scipy_vstack

        all_values = []
        all_labels = []
        for example in reader:
            all_values.append(to_matrix(example['values']))
            all_labels.append(as_numpy(example['label_values']).squeeze())

        all_values = vstack(all_values)
        all_labels = np.concatenate(all_labels)
        return xgb.DMatrix(all_values, label=all_labels)
    except Exception as e:
        raise exc.UserError(
            "Failed to load recordio-protobuf data with exception:\n{}".format(
                e))
def get_recordio_protobuf_dmatrix(path, is_pipe=False):
    """Get Data Matrix from recordio-protobuf data.

    :param path: Path where recordio-protobuf formatted training data resides,
        either directory, file, or SageMaker pipe (or a list of pipe paths)
    :param is_pipe: Boolean to indicate if data is being read in pipe mode
    :return: xgb.DMatrix or None if the dataset is empty
    :raises exc.UserError: if reading or parsing the data fails
    """
    try:
        if is_pipe:
            pipes_path = path if isinstance(path, list) else [path]
            dataset = [mlio.SageMakerPipe(pipe_path)
                       for pipe_path in pipes_path]
        else:
            dataset = mlio.list_files(path)

        reader_params = mlio.DataReaderParams(dataset=dataset,
                                              batch_size=BATCH_SIZE)
        reader = mlio.RecordIOProtobufReader(reader_params)

        # Peek once and reuse the result (the original peeked twice).
        first_example = reader.peek_example()
        if first_example is None:
            return None

        # recordio-protobuf tensor may be dense (use numpy) or sparse (use scipy)
        is_dense_tensor = isinstance(first_example['values'], mlio.DenseTensor)

        all_features = []
        all_labels = []
        for example in reader:
            features = as_numpy(example['values']) if is_dense_tensor \
                else to_coo_matrix(example['values'])
            all_features.append(features)
            all_labels.append(as_numpy(example['label_values']))

        all_features = np.vstack(all_features) if is_dense_tensor \
            else scipy_vstack(all_features).tocsr()
        all_labels = np.concatenate(all_labels, axis=None)
        return xgb.DMatrix(all_features, label=all_labels)
    except Exception as e:
        raise exc.UserError(
            "Failed to load recordio-protobuf data with exception:\n{}".format(
                e))
def _get_csv_dmatrix_pipe_mode(pipe_path, csv_weights):
    """Get Data Matrix from CSV data in pipe mode.

    :param pipe_path: SageMaker pipe path where CSV formatted training data is piped
        (a single path or a list of paths)
    :param csv_weights: 1 if instance weights are in second column of CSV data; else 0
    :return: xgb.DMatrix or None if the pipe yielded no data
    :raises exc.UserError: if reading or parsing the CSV data fails
    """
    try:
        paths = pipe_path if isinstance(pipe_path, list) else [pipe_path]
        pipes = [mlio.SageMakerPipe(p) for p in paths]
        reader = mlio.CsvReader(
            mlio.DataReaderParams(dataset=pipes, batch_size=BATCH_SIZE),
            mlio.CsvParams(header_row_index=None))

        # Nothing in the pipe: signal "no data" to the caller.
        if reader.peek_example() is None:
            return None

        batches = []
        for example in reader:
            # Each feature in the example is one CSV column; collect them
            # into a single array of per-column vectors.
            columns = np.array([as_numpy(feature).squeeze()
                                for feature in example])
            if columns.ndim > 1:
                # Columns came out as rows; transpose back to row-per-record.
                batch = columns.T
            else:
                # A single-record batch squeezes to 1-D; restore matrix shape.
                batch = np.reshape(columns, (1, columns.shape[0]))
            batches.append(batch)

        data = np.vstack(batches)
        del batches

        if csv_weights == 1:
            # Layout: label, weight, features...
            return xgb.DMatrix(data[:, 2:], label=data[:, 0], weight=data[:, 1])
        # Layout: label, features...
        return xgb.DMatrix(data[:, 1:], label=data[:, 0])
    except Exception as e:
        raise exc.UserError(
            "Failed to load csv data with exception:\n{}".format(e))
def _get_csv_dmatrix_pipe_mode(pipe_path, csv_weights, subsample_ratio_on_read):
    """Get Data Matrix from CSV data in pipe mode.

    :param pipe_path: SageMaker pipe path where CSV formatted training data is piped
    :param csv_weights: 1 if instance weights are in second column of CSV data; else 0
    :param subsample_ratio_on_read: None or a value in (0, 1) to indicate how much of the dataset should
            be read into memory.
    :return: xgb.DMatrix or None if the pipe yielded no data
    :raises exc.UserError: if reading or parsing the CSV data fails
    """
    try:
        dataset = [mlio.SageMakerPipe(pipe_path, fifo_id=0)]
        reader = mlio.CsvReader(dataset=dataset,
                                batch_size=BATCH_SIZE,
                                header_row_index=None,
                                subsample_ratio=subsample_ratio_on_read)

        # Check if data is present in reader
        if reader.peek_example() is None:
            return None

        batches = []
        for example in reader:
            # Each feature is one CSV column; stack columns side by side.
            batch = np.column_stack([as_numpy(f) for f in example])
            batches.append(batch)

        data = np.vstack(batches)
        del batches  # release per-batch arrays before building the DMatrix

        if csv_weights == 1:
            # Fix: the DMatrix keyword is `weight`, not `weights` — the old
            # spelling raised TypeError on every weighted-CSV run.
            dmatrix = xgb.DMatrix(data[:, 2:], label=data[:, 0],
                                  weight=data[:, 1])
        else:
            dmatrix = xgb.DMatrix(data[:, 1:], label=data[:, 0])

        return dmatrix
    except Exception as e:
        raise exc.UserError(
            "Failed to load csv data with exception:\n{}".format(e))
def _get_data(source):
    """Determine the input mode of the source and return a matching mlio object.

    If source is a python buffer, a list with one mlio.core.InMemoryStore is
    returned. If the SM_INPUT_DATA_CONFIG environment variable is not defined,
    source is assumed to be a file or directory and the result of
    mlio.list_files is returned. If SM_INPUT_DATA_CONFIG is defined, source can
    be the name of a channel in SM_INPUT_DATA_CONFIG; if the source is a path,
    the basename of the path is taken as the channel name. The kind of mlio
    object returned is then based on the channel's "TrainingInputMode".

    Here is an example of SM_INPUT_DATA_CONFIG with two channels
    ("code" and "train"):
    SM_INPUT_DATA_CONFIG=
    {
        "code": {
            "ContentType": "application/x-code",
            "RecordWrapperType": "None",
            "S3DistributionType": "FullyReplicated",
            "TrainingInputMode": "File"
        },
        "train": {
            "ContentType": "text/csv",
            "RecordWrapperType": "None",
            "S3DistributionType": "ShardedByS3Key",
            "TrainingInputMode": "File"
        }
    }

    Parameters
    ----------
    source: str or bytes
        Name of the SageMaker Channel, File, or directory from which the data
        is being read, or the Python buffer object from which the data is
        being read.

    Returns
    -------
    list of mlio.core.File:
        Returned (via mlio.list_files) for a file or directory `source`, and
        for a channel whose 'TrainingInputMode' is 'File'.
    list of mlio.core.SageMakerPipe:
        Returned when the channel's 'TrainingInputMode' (read from
        SM_INPUT_DATA_CONFIG) is 'Pipe'.
    list of mlio.core.InMemoryStore:
        Returned when `source` is a Python buffer (bytes).

    Raises
    ------
    KeyError:
        If the channel is missing from SM_INPUT_DATA_CONFIG, or its entry has
        no 'TrainingInputMode'.
    """
    if isinstance(source, bytes):
        return [mlio.InMemoryStore(source)]

    # An mlio File already knows its own path; unwrap it to a string.
    if isinstance(source, mlio.core.File):
        source = source.id

    config = os.environ.get("SM_INPUT_DATA_CONFIG")
    if config is None:
        # Outside SageMaker (no input-data config): treat source as a path.
        return mlio.list_files(source, pattern="*")

    channels = json.loads(config)
    source_channel_name = os.path.basename(source)
    try:
        channel_config = channels[source_channel_name]
    except KeyError:
        # Suppress the uninformative original KeyError chain.
        raise KeyError(
            "Configuration for channel name {} is not provided in SM_INPUT_DATA_CONFIG."
            .format(source_channel_name)) from None

    try:
        data_config_input_mode = channel_config["TrainingInputMode"]
    except KeyError:
        raise KeyError(
            "SM_INPUT_DATA_CONFIG is malformed. TrainingInputMode is "
            "not found for channel name {}".format(source_channel_name)) from None

    if data_config_input_mode == "Pipe":
        return [mlio.SageMakerPipe(source)]

    return mlio.list_files(source, pattern="*")  # 'File' mode