예제 #1
0
def _lsmat_to_matrix(cls, fh, delimiter):
    """Read a labeled square matrix from ``fh`` and build a ``cls`` from it.

    The header line supplies the IDs; every following data row must carry
    the ID expected at that position plus exactly one value per header ID.

    Parameters
    ----------
    cls : callable
        Called as ``cls(data, ids)`` to build the returned object.
    fh : iterable
        Input handle passed to ``_find_header`` and ``_parse_data``.
    delimiter : str
        Field separator passed to ``_parse_header`` and ``_parse_data``.

    Returns
    -------
    The result of ``cls(data, ids)``, where ``data`` is a float64 ndarray
    of shape ``(len(ids), len(ids))``.

    Raises
    ------
    LSMatFormatError
        If no header is found, if there are more or fewer data rows than
        header IDs, if a row has the wrong number of values, or if a row
        label differs from the header ID at the same position.
    """
    # np.loadtxt is deliberately avoided: it uses *way* too much memory
    # (e.g., a 2GB matrix eats up 10GB, which then isn't freed after parsing
    # has finished). See:
    # http://mail.scipy.org/pipermail/numpy-tickets/2012-August/006749.html
    #
    # Instead, the array is allocated once from the header's ID count and
    # filled one row at a time as the file is streamed.

    header = _find_header(fh)
    if header is None:
        raise LSMatFormatError(
            "Could not find a header line containing IDs in the "
            "dissimilarity matrix file. Please verify that the file is "
            "not empty.")

    ids = _parse_header(header, delimiter)
    size = len(ids)
    matrix = np.empty((size, size), dtype=np.float64)

    # Number of data rows successfully stored; rows are numbered from 1 so
    # the counter can be used directly in the error messages below.
    rows_read = 0
    rows = enumerate(_parse_data(fh, delimiter), start=1)
    for row_number, (label, values) in rows:
        if row_number > size:
            # A nonempty line showed up after the matrix was already full.
            # Extra data must not be silently ignored.
            raise LSMatFormatError(
                "Encountered extra row(s) without corresponding IDs in "
                "the header.")

        value_count = len(values)
        if value_count != size:
            raise LSMatFormatError(
                "There are %d value(s) in row %d, which is not equal to the "
                "number of ID(s) in the header (%d)." %
                (value_count, row_number, size))

        target = row_number - 1
        expected = ids[target]
        if label != expected:
            raise LSMatFormatError(
                "Encountered mismatched IDs while parsing the "
                "dissimilarity matrix file. Found '%s' but expected "
                "'%s'. Please ensure that the IDs match between the "
                "dissimilarity matrix header (first row) and the row "
                "labels (first column)." % (label, expected))

        matrix[target, :] = np.asarray(values, dtype=float)
        rows_read = row_number

    if rows_read != size:
        raise LSMatFormatError("Expected %d row(s) of data, but found %d." %
                               (size, rows_read))

    return cls(matrix, ids)
예제 #2
0
def _parse_header(header, delimiter):
    tokens = header.rstrip().split(delimiter)

    if tokens[0]:
        raise LSMatFormatError("Header must start with delimiter %r." %
                               delimiter)

    return [e.strip() for e in tokens[1:]]