Exemplo n.º 1
0
def _local_read_sparse_mm(array, ex, fn, data_begin):
  '''Read this worker's share of a Matrix Market coordinate file.

  1. Noted that Matrix Market format doesn't require (row, col) to be sorted.
     If the file is sorted (by either row or col), each worker will return
     only a part of the array. If the file is unsorted, each worker may
     return a very big and sparser sub-array of the original array. In the
     worst case, the sub-array can be as large as the original array but
     sparser.
  2. We can't know how many lines without reading the whole file. So we simply
     decide the region this worker should read based on the file size.

  Args:
    array: the distributed array being loaded (only ``shape`` is used).
    ex: the extent assigned to this worker.
    fn: path of the Matrix Market file.
    data_begin: byte offset where the data section starts (past the header).

  Returns:
    (new_extent, sparse_array) covering the entries actually read.
  '''
  data_size = os.path.getsize(fn) - data_begin
  array_size = np.product(array.shape)
  # Map the extent's ravelled element range onto a proportional byte range
  # of the data section.  int() matches the npy reader and keeps seek()
  # offsets integral (math.ceil/floor return floats on Python 2).
  begin = extent.ravelled_pos(ex.ul, array.shape)
  begin = int(math.ceil(((begin * 1.0) / array_size) * data_size)) + data_begin
  end = extent.ravelled_pos([(i - 1) for i in ex.lr], array.shape)
  end = int(math.floor(((end * 1.0) / array_size) * data_size)) + data_begin

  ul = [array.shape[0], array.shape[1]]  # running min of (row, col)
  lr = [0, 0]                            # running max of (row, col)
  rows = []
  cols = []
  data = []
  with open(fn) as fp:
    fp.seek(begin)
    if begin != data_begin:
      # If begin lands mid-line, skip the partial line; the worker that owns
      # the preceding byte range reads it in full.
      fp.seek(begin - 1)
      if fp.read(1) != '\n':
        fp.readline()

    pos = fp.tell()
    for line in fp:
      if pos > end + 1:  # +1 in case end locates on \n
        break
      pos += len(line)
      (_row, _col), val = _extract_mm_coordinate(line)
      # Matrix Market indices are 1-based; convert to 0-based.
      _row -= 1
      _col -= 1
      rows.append(_row)
      cols.append(_col)
      data.append(float(val))
      ul[0] = min(ul[0], _row)
      ul[1] = min(ul[1], _col)
      lr[0] = max(lr[0], _row)
      lr[1] = max(lr[1], _col)

  # Adjust rows and cols so they are relative to the ul of this submatrix.
  for i in xrange(len(rows)):
    rows[i] -= ul[0]
    cols[i] -= ul[1]

  new_ex = extent.create(ul, [lr[0] + 1, lr[1] + 1], array.shape)
  new_array = sp.coo_matrix((data, (rows, cols)), new_ex.shape)
  return new_ex, sparse.convert_sparse_array(new_array)
Exemplo n.º 2
0
def _local_read_sparse_mm(array, ex, fn, data_begin):
    '''Read this worker's share of a Matrix Market coordinate file.

  Returns (new_extent, sparse_array) covering the entries actually read.

  1. Noted that Matrix Market format doesn't require (row, col) to be sorted.
     If the file is sorted (by either row or col), each worker will return
     only a part of the array. If the file is unsorted, each worker may
     return a very big and sparser sub-array of the original array. In the
     worst case, the sub-array can be as large as the original array but
     sparser.
  2. We can't know how many lines without reading the whole file. So we simply
     decide the region this worker should read based on the file size.
  '''
    # Map the extent's ravelled element range onto a proportional byte
    # range [begin, end] of the data section of the file.
    data_size = os.path.getsize(fn) - data_begin
    array_size = np.product(array.shape)
    begin = extent.ravelled_pos(ex.ul, array.shape)
    begin = math.ceil(((begin * 1.0) / array_size) * data_size) + data_begin
    end = extent.ravelled_pos([(i - 1) for i in ex.lr], array.shape)
    end = math.floor(((end * 1.0) / array_size) * data_size) + data_begin

    # ul/lr track the min/max (row, col) seen, i.e. the bounding box of
    # the entries this worker reads.
    ul = [array.shape[0], array.shape[1]]
    lr = [0, 0]
    rows = []
    cols = []
    data = []
    with open(fn) as fp:
        fp.seek(begin)
        if begin != data_begin:
            # If begin lands mid-line, skip the partial line; the worker
            # that owns the preceding byte range reads it in full.
            fp.seek(begin - 1)
            a = fp.read(1)
            if a != '\n':
                line = fp.readline()

        pos = fp.tell()
        for line in fp:
            if pos > end + 1:  # +1 in case end locates on \n
                break
            pos += len(line)
            (_row, _col), val = _extract_mm_coordinate(line)
            # Matrix Market indices are 1-based; convert to 0-based.
            _row -= 1
            _col -= 1
            rows.append(_row)
            cols.append(_col)
            data.append(float(val))
            ul[0] = _row if _row < ul[0] else ul[0]
            ul[1] = _col if _col < ul[1] else ul[1]
            lr[0] = _row if _row > lr[0] else lr[0]
            lr[1] = _col if _col > lr[1] else lr[1]

    # Adjust rows and cols based on the ul of this submatrix.
    for i in xrange(len(rows)):
        rows[i] -= ul[0]
        cols[i] -= ul[1]

    new_ex = extent.create(ul, [lr[0] + 1, lr[1] + 1], array.shape)
    new_array = sp.coo_matrix((data, (rows, cols)), new_ex.shape)
    return new_ex, sparse.convert_sparse_array(new_array)
Exemplo n.º 3
0
def _local_read_sparse_npy(array, ex, fn):
  '''Read this worker's share of a coo_matrix stored as three .npy files.

  The matrix is stored as ``<fn>_row.npy``, ``<fn>_col.npy`` and
  ``<fn>_data.npy``; all three must have the same (1-D) shape.

  1. Noted that coo_matrix format doesn't require row[] or col[] to be sorted.
     If one of row[] or col[] is sorted (by either row or col), each worker will
     return only a part of the array. If the file is unsorted, each worker may
     return a very big and sparser sub-array of the original array. In the worst
     case, the sub-array can be as large as the original array but sparser.
  2. For numpy format, we can evenly distribute the files we need to read to
     workers.

  Args:
    array: the distributed array being loaded (only ``shape`` is used).
    ex: the extent assigned to this worker.
    fn: path prefix of the three component .npy files.

  Returns:
    (new_extent, sparse_array) covering the entries actually read.
  '''
  attr = {'data_begin': {}, 'dtype': {}, 'shape': None,
          'read_next': {}, 'fn': {}}
  types = ['row', 'col', 'data']
  # numpy dtype name -> struct format character for unpacking raw items.
  dtype_name = {'float64': 'd', 'float32': 'f', 'int64': 'q', 'int32': 'i'}

  for i in types:
    _fn = '%s_%s.npy' % (fn, i)
    attr['fn'][i] = _fn
    _shape, attr['dtype'][i], attr['data_begin'][i] = _parse_npy_header(_fn)
    if attr['shape'] is not None:
      # All three component files must contain the same number of items.
      assert attr['shape'] == _shape
    else:
      attr['shape'] = _shape

  # Map this extent's ravelled element range to an item range
  # [begin_item, end_item) of the stored triplets.
  item_count = np.product(array.shape)
  begin_item = extent.ravelled_pos(ex.ul, array.shape)
  begin_item = int(math.ceil(((begin_item * 1.0) / item_count) * attr['shape'][0]))
  end_item = extent.ravelled_pos([(i - 1) for i in ex.lr], array.shape)
  end_item = int(math.floor((end_item * 1.0) / item_count * attr['shape'][0])) + 1
  end_item = min(end_item, attr['shape'][0])

  ul = [array.shape[0], array.shape[1]]  # running min of (row, col)
  lr = [0, 0]                            # running max of (row, col)
  rows = []
  cols = []
  data = []
  with FileHelper(row=open(attr['fn']['row'], 'rb'),
                  col=open(attr['fn']['col'], 'rb'),
                  data=open(attr['fn']['data'], 'rb')) as fp:
    for k in types:
      _dtype = attr['dtype'][k]
      _dtype_size = _dtype.itemsize
      _fp = getattr(fp, k)

      # Seek straight to this worker's first item and set up an iterator
      # yielding one raw item per call.
      _fp.seek(attr['data_begin'][k] + begin_item * _dtype_size)
      attr['read_next'][k] = _bulk_read(_fp, _dtype_size)
      attr['dtype'][k] = dtype_name[_dtype.name]

    for i in xrange(begin_item, end_item):
      _row = struct.unpack(attr['dtype']['row'], attr['read_next']['row'].next())[0]
      rows.append(_row)
      _col = struct.unpack(attr['dtype']['col'], attr['read_next']['col'].next())[0]
      cols.append(_col)
      _data = struct.unpack(attr['dtype']['data'], attr['read_next']['data'].next())[0]
      data.append(_data)

      ul[0] = min(ul[0], _row)
      ul[1] = min(ul[1], _col)
      lr[0] = max(lr[0], _row)
      lr[1] = max(lr[1], _col)

  # Adjust rows and cols so they are relative to the ul of this submatrix.
  for i in xrange(len(rows)):
    rows[i] -= ul[0]
    cols[i] -= ul[1]

  new_ex = extent.create(ul, [lr[0] + 1, lr[1] + 1], array.shape)
  new_array = sp.coo_matrix((data, (rows, cols)), new_ex.shape)
  return new_ex, sparse.convert_sparse_array(new_array)
Exemplo n.º 4
0
def _local_read_sparse_npy(array, ex, fn):
    '''Read this worker's share of a coo_matrix stored as three .npy files
  (``<fn>_row.npy``, ``<fn>_col.npy``, ``<fn>_data.npy``).

  Returns (new_extent, sparse_array) covering the entries actually read.

  1. Noted that coo_matrix format doesn't require row[] or col[] to be sorted.
     If one of row[] or col[] is sorted (by either row or col), each worker will
     return only a part of the array. If the file is unsorted, each worker may
     return a very big and sparser sub-array of the original array. In the worst
     case, the sub-array can be as large as the original array but sparser.
  2. For numpy format, we can evenly distribute the files we need to read to
     workers.
  '''
    #data_begin = {}
    #dtype = {}
    #dtype_size = {}
    #shape = {}
    #fp = {}
    #read_next = {}
    # Per-component (row/col/data) file metadata, keyed by component name.
    attr = {
        'data_begin': {},
        'dtype': {},
        'shape': None,
        'read_next': {},
        'fn': {}
    }
    types = ['row', 'col', 'data']
    # numpy dtype name -> struct format character for unpacking raw items.
    dtype_name = {'float64': 'd', 'float32': 'f', 'int64': 'q', 'int32': 'i'}

    for i in types:
        _fn = '%s_%s.npy' % (fn, i)
        attr['fn'][i] = _fn
        _shape, attr['dtype'][i], attr['data_begin'][i] = _parse_npy_header(
            _fn)
        # All three component files must contain the same number of items.
        if attr['shape'] != None:
            assert attr['shape'] == _shape
        else:
            attr['shape'] = _shape
    #shape['row'], dtype['row'], data_begin['row'] = _parse_npy_header(fn + '_row.npy')
    #shape['col'], dtype['col'], data_begin['col'] = _parse_npy_header(fn + '_col.npy')
    #shape['data'], dtype['data'], data_begin['data'] = _parse_npy_header(fn + '_data.npy')

    # Map this extent's ravelled element range to an item range
    # [begin_item, end_item) of the stored triplets.
    item_count = np.product(array.shape)
    begin_item = extent.ravelled_pos(ex.ul, array.shape)
    begin_item = int(
        math.ceil(((begin_item * 1.0) / item_count) * attr['shape'][0]))
    end_item = extent.ravelled_pos([(i - 1) for i in ex.lr], array.shape)
    end_item = int(math.floor(
        (end_item * 1.0) / item_count * attr['shape'][0])) + 1
    end_item = attr['shape'][0] if end_item > attr['shape'][0] else end_item

    # ul/lr track the min/max (row, col) seen, i.e. the bounding box of
    # the entries this worker reads.
    ul = [array.shape[0], array.shape[1]]
    lr = [0, 0]
    rows = []
    cols = []
    data = []
    with FileHelper(row=open(attr['fn']['row'], 'rb'),
                    col=open(attr['fn']['col'], 'rb'),
                    data=open(attr['fn']['data'], 'rb')) as fp:
        for k in types:
            _dtype = attr['dtype'][k]
            _dtype_size = _dtype.itemsize
            _fp = getattr(fp, k)

            # Seek straight to this worker's first item and set up an
            # iterator yielding one raw item per call.
            _fp.seek(attr['data_begin'][k] + begin_item * _dtype_size)
            attr['read_next'][k] = _bulk_read(_fp, _dtype_size)
            attr['dtype'][k] = dtype_name[_dtype.name]

        for i in xrange(begin_item, end_item):
            _row = struct.unpack(attr['dtype']['row'],
                                 attr['read_next']['row'].next())[0]
            rows.append(_row)
            _col = struct.unpack(attr['dtype']['col'],
                                 attr['read_next']['col'].next())[0]
            cols.append(_col)
            _data = struct.unpack(attr['dtype']['data'],
                                  attr['read_next']['data'].next())[0]
            data.append(_data)

            ul[0] = _row if _row < ul[0] else ul[0]
            ul[1] = _col if _col < ul[1] else ul[1]
            lr[0] = _row if _row > lr[0] else lr[0]
            lr[1] = _col if _col > lr[1] else lr[1]

    # Adjust rows and cols so they are relative to the ul of this submatrix.
    for i in xrange(len(rows)):
        rows[i] -= ul[0]
        cols[i] -= ul[1]

    new_ex = extent.create(ul, [lr[0] + 1, lr[1] + 1], array.shape)
    new_array = sp.coo_matrix((data, (rows, cols)), new_ex.shape)
    return new_ex, sparse.convert_sparse_array(new_array)