def _local_read_sparse_mm(array, ex, fn, data_begin):
  '''Read one worker's portion of a sparse matrix stored in Matrix Market format.

  1. Note that the Matrix Market format doesn't require (row, col) entries to
     be sorted. If the file is sorted (by either row or col), each worker
     returns only a part of the array. If the file is unsorted, each worker
     may return a very large but sparser sub-array of the original array; in
     the worst case, the sub-array can be as large as the original array,
     only sparser.
  2. We can't know how many lines the file has without reading all of it, so
     we simply decide the byte region this worker should read based on the
     file size.
  '''
  data_size = os.path.getsize(fn) - data_begin
  array_size = np.product(array.shape)

  # Map this worker's extent onto a byte range of the data section.
  begin = extent.ravelled_pos(ex.ul, array.shape)
  begin = int(math.ceil(((begin * 1.0) / array_size) * data_size)) + data_begin
  end = extent.ravelled_pos([(i - 1) for i in ex.lr], array.shape)
  end = int(math.floor(((end * 1.0) / array_size) * data_size)) + data_begin

  ul = [array.shape[0], array.shape[1]]
  lr = [0, 0]
  rows = []
  cols = []
  data = []
  with open(fn) as fp:
    fp.seek(begin)
    if begin != data_begin:
      # If we landed in the middle of a line, skip ahead to the next full line.
      fp.seek(begin - 1)
      a = fp.read(1)
      if a != '\n':
        line = fp.readline()

    pos = fp.tell()
    for line in fp:
      if pos > end + 1:  # +1 in case end falls on a '\n'
        break
      pos += len(line)

      (_row, _col), val = _extract_mm_coordinate(line)
      # Matrix Market indices are 1-based; convert to 0-based.
      _row -= 1
      _col -= 1
      rows.append(_row)
      cols.append(_col)
      data.append(float(val))
      ul[0] = _row if _row < ul[0] else ul[0]
      ul[1] = _col if _col < ul[1] else ul[1]
      lr[0] = _row if _row > lr[0] else lr[0]
      lr[1] = _col if _col > lr[1] else lr[1]

  # Adjust rows and cols based on the ul of this submatrix.
  for i in xrange(len(rows)):
    rows[i] -= ul[0]
    cols[i] -= ul[1]

  new_ex = extent.create(ul, [lr[0] + 1, lr[1] + 1], array.shape)
  new_array = sp.coo_matrix((data, (rows, cols)), new_ex.shape)
  return new_ex, sparse.convert_sparse_array(new_array)

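# `_extract_mm_coordinate` is used above but not defined in this section. The
# following is a minimal sketch of what such a helper might look like, assuming
# each Matrix Market data line is a whitespace-separated "row col value" triple
# with 1-based indices; the name and behavior here are inferred from the call
# site, not taken from the project's actual implementation.
def _extract_mm_coordinate(line):
  # Split "row col value" and return ((row, col), value); the value is left as
  # a string (the caller converts it to float) and indices stay 1-based.
  parts = line.split()
  return (int(parts[0]), int(parts[1])), parts[2]
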
def _local_read_sparse_npy(array, ex, fn):
  '''Read one worker's portion of a coo_matrix saved as row/col/data .npy files.

  1. Note that the coo_matrix format doesn't require row[] or col[] to be
     sorted. If either row[] or col[] is sorted, each worker returns only a
     part of the array. If neither is sorted, each worker may return a very
     large but sparser sub-array of the original array; in the worst case,
     the sub-array can be as large as the original array, only sparser.
  2. For the numpy format, we can evenly distribute the items to read among
     the workers.
  '''
  attr = {'data_begin': {}, 'dtype': {}, 'shape': None,
          'read_next': {}, 'fn': {}}
  types = ['row', 'col', 'data']
  dtype_name = {'float64': 'd', 'float32': 'f', 'int64': 'q', 'int32': 'i'}
  for i in types:
    _fn = '%s_%s.npy' % (fn, i)
    attr['fn'][i] = _fn
    _shape, attr['dtype'][i], attr['data_begin'][i] = _parse_npy_header(_fn)
    # All three files must describe the same number of nonzero items.
    if attr['shape'] is not None:
      assert attr['shape'] == _shape
    else:
      attr['shape'] = _shape

  # Evenly split the nonzero items among workers, proportional to this extent.
  item_count = np.product(array.shape)
  begin_item = extent.ravelled_pos(ex.ul, array.shape)
  begin_item = int(math.ceil(((begin_item * 1.0) / item_count) * attr['shape'][0]))
  end_item = extent.ravelled_pos([(i - 1) for i in ex.lr], array.shape)
  end_item = int(math.floor((end_item * 1.0) / item_count * attr['shape'][0])) + 1
  end_item = attr['shape'][0] if end_item > attr['shape'][0] else end_item

  ul = [array.shape[0], array.shape[1]]
  lr = [0, 0]
  rows = []
  cols = []
  data = []
  with FileHelper(row=open(attr['fn']['row'], 'rb'),
                  col=open(attr['fn']['col'], 'rb'),
                  data=open(attr['fn']['data'], 'rb')) as fp:
    for k in types:
      _dtype = attr['dtype'][k]
      _dtype_size = _dtype.itemsize
      _fp = getattr(fp, k)
      _fp.seek(attr['data_begin'][k] + begin_item * _dtype_size)
      attr['read_next'][k] = _bulk_read(_fp, _dtype_size)
      attr['dtype'][k] = dtype_name[_dtype.name]

    for i in xrange(begin_item, end_item):
      _row = struct.unpack(attr['dtype']['row'],
                           attr['read_next']['row'].next())[0]
      rows.append(_row)
      _col = struct.unpack(attr['dtype']['col'],
                           attr['read_next']['col'].next())[0]
      cols.append(_col)
      _data = struct.unpack(attr['dtype']['data'],
                            attr['read_next']['data'].next())[0]
      data.append(_data)
      ul[0] = _row if _row < ul[0] else ul[0]
      ul[1] = _col if _col < ul[1] else ul[1]
      lr[0] = _row if _row > lr[0] else lr[0]
      lr[1] = _col if _col > lr[1] else lr[1]

  # Adjust rows and cols based on the ul of this submatrix.
  for i in xrange(len(rows)):
    rows[i] -= ul[0]
    cols[i] -= ul[1]

  new_ex = extent.create(ul, [lr[0] + 1, lr[1] + 1], array.shape)
  new_array = sp.coo_matrix((data, (rows, cols)), new_ex.shape)
  return new_ex, sparse.convert_sparse_array(new_array)

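# `_parse_npy_header` is used above but not defined in this section. A possible
# sketch follows, assuming it returns (shape, dtype, offset of the first data
# byte) for a .npy file; it leans on numpy.lib.format, and the exact signature
# is an assumption based on how the function is called above.
from numpy.lib import format as _npformat

def _parse_npy_header(fn):
  # Read the .npy magic string and header, then report where the raw data
  # starts so callers can seek directly to a given item offset.
  with open(fn, 'rb') as fp:
    _npformat.read_magic(fp)  # consume the magic/version bytes
    shape, fortran_order, dtype = _npformat.read_array_header_1_0(fp)
    return shape, dtype, fp.tell()
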
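# `_bulk_read` and `FileHelper` are also used above without being defined in
# this section. The sketches below are assumptions inferred from their call
# sites: `_bulk_read` yields fixed-size records from an open file, and
# `FileHelper` exposes the row/col/data file objects as attributes and closes
# them when the `with` block exits.
def _bulk_read(fp, item_size, buffer_items=8192):
  # Yield one `item_size`-byte record at a time, reading the file in larger
  # buffered chunks to avoid one system call per record.
  while True:
    buf = fp.read(item_size * buffer_items)
    if not buf:
      return
    for i in xrange(0, len(buf), item_size):
      yield buf[i:i + item_size]


class FileHelper(object):
  # Bundle several open files under attribute names and close them all on exit.
  def __init__(self, **files):
    self._files = files
    for name, fp in files.iteritems():
      setattr(self, name, fp)

  def __enter__(self):
    return self

  def __exit__(self, exc_type, exc_value, traceback):
    for fp in self._files.itervalues():
      fp.close()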