def loadSeriesLocal(self, dataFilePath, inputFormat='npy', minPartitions=None, keyFilePath=None, varName=None):
    """
    Load a Series object from a local file (either npy or MAT format).

    File should contain a 1d or 2d matrix, where each row of the input
    matrix is a record.

    Keys can be provided in a separate file (with variable name 'keys',
    for MAT files). If not provided, linear indices will be used for keys.

    Parameters
    ----------
    dataFilePath : str
        File to import

    inputFormat : {'npy', 'mat'}, optional, default = 'npy'
        Format of the file to import (compared case-insensitively)

    minPartitions : int, optional, default = None
        Number of partitions for RDD

    keyFilePath : str, optional, default = None
        File containing the keys for each record as another 1d or 2d array

    varName : str, optional, default = None
        Variable name to load (for MAT files only)

    Raises
    ------
    ValueError
        If `inputFormat` is 'mat' but no `varName` was provided.
    """
    checkParams(inputFormat, ['mat', 'npy'])

    from thunder.rdds.fileio.seriesloader import SeriesLoader
    loader = SeriesLoader(self._sc, minPartitions=minPartitions)

    if inputFormat.lower() == 'mat':
        # MAT files can hold several variables, so the caller must name one.
        # ValueError (a subclass of Exception) replaces the original generic
        # Exception: it is the idiomatic type for a bad argument and remains
        # backward compatible for callers catching Exception.
        if varName is None:
            raise ValueError('Must provide variable name for loading MAT files')
        data = loader.fromMatLocal(dataFilePath, varName, keyFilePath)
    else:
        data = loader.fromNpyLocal(dataFilePath, keyFilePath)

    return data
def loadSeriesLocal(self, datafile, inputformat='npy', minPartitions=None, keyfile=None, varname=None):
    """
    Load a Series object from a local file (either npy or MAT format).

    The file should contain a 1d or 2d matrix where each row of the input
    matrix is a record. Keys may be supplied in a separate file (with
    variable name 'keys', for MAT files); when absent, linear indices are
    used as keys.

    Parameters
    ----------
    datafile : str
        File to import

    varname : str, optional, default = None
        Variable name to load (for MAT files only)

    keyfile : str, optional, default = None
        File containing the keys for each record as another 1d or 2d array

    minPartitions : Int, optional, default = 1
        Number of partitions for RDD
    """
    checkparams(inputformat, ['mat', 'npy'])

    from thunder.rdds.fileio.seriesloader import SeriesLoader
    loader = SeriesLoader(self._sc, minPartitions=minPartitions)

    # Only two formats pass the check above; anything that is not MAT is npy.
    if inputformat.lower() != 'mat':
        return loader.fromNpyLocal(datafile, keyfile)

    # MAT files require an explicit variable name to select what to load.
    if varname is None:
        raise Exception('Must provide variable name for loading MAT files')
    return loader.fromMatLocal(datafile, varname, keyfile)
def loadSeries(self, dataPath, nkeys=None, nvalues=None, inputFormat='binary', minPartitions=None,
               confFilename='conf.json', keyType=None, valueType=None, keyPath=None, varName=None):
    """
    Loads a Series object from data stored as binary, text, npy, or mat.

    For binary and text, supports single files or multiple files stored on a
    local file system, a networked file system (mounted and available on all
    cluster nodes), Amazon S3, or HDFS. For local formats (npy and mat) only
    local file systems are currently supported.

    Parameters
    ----------
    dataPath : string
        Path to data files or directory, as either a local filesystem path or
        a URI. May include a single '*' wildcard in the filename. Examples of
        valid dataPaths include 'local/directory/*.stack",
        "s3n:///my-s3-bucket/data/", or "file:///mnt/another/directory/".

    nkeys : int, optional (required if `inputFormat` is 'text'), default = None
        Number of keys per record (e.g. 3 for (x, y, z) coordinate keys).
        Must be specified for text data; can be specified here or in a
        configuration file for binary data.

    nvalues : int, optional (required if `inputFormat` is 'text')
        Number of values per record. Must be specified here or in a
        configuration file for binary data.

    inputFormat : {'text', 'binary', 'npy', 'mat'}, optional, default = 'binary'
        inputFormat of data to be read (compared case-insensitively).

    minPartitions : int, optional, default = SparkContext.minParallelism
        Minimum number of Spark partitions to use, only for text.

    confFilename : string, optional, default 'conf.json'
        Path to JSON file with configuration options including 'nkeys',
        'nvalues', 'keyType', and 'valueType'. If a file is not found at the
        given path, then the base directory in 'dataPath' will be checked.
        Parameters will override the conf file.

    keyType : string or numpy dtype, optional, default = None
        Numerical type of keys, will override conf file.

    valueType : string or numpy dtype, optional, default = None
        Numerical type of values, will override conf file.

    keyPath : string, optional, default = None
        Path to file with keys when loading from npy or mat.

    varName : str, optional, default = None
        Variable name to load (for MAT files only)

    Returns
    -------
    data : thunder.rdds.Series
        A Series object, wrapping an RDD, with (n-tuples of ints) :
        (numpy array) pairs

    Raises
    ------
    ValueError
        If 'text' is requested without `nkeys`, or 'mat' without `varName`.
    """
    checkParams(inputFormat, ['text', 'binary', 'npy', 'mat'])

    from thunder.rdds.fileio.seriesloader import SeriesLoader
    loader = SeriesLoader(self._sc, minPartitions=minPartitions)

    # Hoist the case-normalization: the original recomputed .lower() in
    # every branch comparison.
    fmt = inputFormat.lower()

    # ValueError (a subclass of Exception) replaces the original generic
    # Exception in both error branches: idiomatic for bad arguments and
    # backward compatible for callers catching Exception.
    if fmt == 'binary':
        data = loader.fromBinary(dataPath, confFilename=confFilename, nkeys=nkeys, nvalues=nvalues,
                                 keyType=keyType, valueType=valueType)
    elif fmt == 'text':
        # Text records carry no self-describing header, so the key count
        # must come from the caller.
        if nkeys is None:
            raise ValueError('Must provide number of keys per record for loading from text')
        data = loader.fromText(dataPath, nkeys=nkeys)
    elif fmt == 'npy':
        data = loader.fromNpyLocal(dataPath, keyPath)
    else:
        # Remaining format after checkParams is 'mat'.
        if varName is None:
            raise ValueError('Must provide variable name for loading MAT files')
        data = loader.fromMatLocal(dataPath, varName, keyPath)

    return data