def _lsmat_to_matrix(cls, fh, delimiter):
    """Parse a labeled square matrix from ``fh`` and build a ``cls`` from it.

    The header line supplies the IDs; every following data row must carry
    the matching ID as its label, in header order, followed by exactly one
    value per ID.

    Parameters
    ----------
    cls : callable
        Invoked as ``cls(data, ids)``; its result is returned.
    fh : file-like
        Handle passed to ``_find_header`` and ``_parse_data``.
    delimiter : str
        Field separator forwarded to ``_parse_header`` and ``_parse_data``.

    Raises
    ------
    LSMatFormatError
        If no header is found, if there are more or fewer data rows than
        IDs, if a row does not hold one value per ID, or if a row label
        differs from the ID expected at that position.

    """
    # np.loadtxt is deliberately avoided: it uses far too much memory (e.g. a
    # 2GB matrix eats up 10GB, which is not released once parsing is done).
    # See:
    # http://mail.scipy.org/pipermail/numpy-tickets/2012-August/006749.html
    #
    # Instead, the full-size array is allocated once and filled row by row.
    header_line = _find_header(fh)
    if header_line is None:
        raise LSMatFormatError(
            "Could not find a header line containing IDs in the "
            "dissimilarity matrix file. Please verify that the file is "
            "not empty.")

    ids = _parse_header(header_line, delimiter)
    size = len(ids)
    matrix = np.empty((size, size), dtype=np.float64)

    # Number of rows stored so far; doubles as the index of the next row.
    rows_filled = 0
    for label, values in _parse_data(fh, delimiter):
        if rows_filled >= size:
            # The matrix is already full, yet another nonempty line showed
            # up. Extra data must not be silently dropped.
            raise LSMatFormatError(
                "Encountered extra row(s) without corresponding IDs in "
                "the header.")

        value_count = len(values)
        if value_count != size:
            raise LSMatFormatError(
                "There are %d value(s) in row %d, which is not equal to the "
                "number of ID(s) in the header (%d)." %
                (value_count, rows_filled + 1, size))

        wanted_id = ids[rows_filled]
        if label != wanted_id:
            raise LSMatFormatError(
                "Encountered mismatched IDs while parsing the "
                "dissimilarity matrix file. Found '%s' but expected "
                "'%s'. Please ensure that the IDs match between the "
                "dissimilarity matrix header (first row) and the row "
                "labels (first column)." % (label, wanted_id))

        matrix[rows_filled, :] = np.asarray(values, dtype=float)
        rows_filled += 1

    if rows_filled != size:
        raise LSMatFormatError("Expected %d row(s) of data, but found %d." %
                               (size, rows_filled))

    return cls(matrix, ids)
def _parse_header(header, delimiter): tokens = header.rstrip().split(delimiter) if tokens[0]: raise LSMatFormatError("Header must start with delimiter %r." % delimiter) return [e.strip() for e in tokens[1:]]