def get_distribution(distribution_string, state_name, valid_states):
    """
    Parse user-supplied text into a normalized state distribution.
    Every valid state must be given exactly one weight;
    the returned weights are rescaled so that they sum to one.
    @param distribution_string: multi-line text, one entry per line,
    where each line looks something like 'A : 0.245'
    @param state_name: a description of the state type used in
    error messages, something like 'amino acid'
    @param valid_states: the set of valid states
    @return: a dictionary mapping each state to a probability
    @raise HandlingError: if no text was given, a state is repeated,
    fewer states than valid states received a weight,
    or the weights sum to zero
    """
    if not distribution_string:
        raise HandlingError(
                'no %s distribution was specified' % state_name)

    # Collect the raw weights; each line is parsed by get_weight_pair.
    raw_weights = {}
    lines = smallutil.stripped_lines(StringIO(distribution_string))
    for text in lines:
        state, weight = get_weight_pair(text, state_name, valid_states)
        if state in raw_weights:
            raise HandlingError(
                    'duplicate %s: %s' % (state_name, state))
        raw_weights[state] = weight

    # Duplicates were rejected above, so a short dictionary
    # means that some valid state never appeared in the input.
    if len(raw_weights) < len(valid_states):
        raise HandlingError(
                'one or more %s was not assigned a weight' % state_name)

    # A zero total cannot be normalized.
    total = float(sum(raw_weights.values()))
    if not total:
        raise HandlingError('each %s weight is zero' % state_name)

    return {state: weight / total for state, weight in raw_weights.items()}
def multiline_state_to_ndarray(multiline_state):
    """
    Convert multi-line text of 0/1 characters into a numpy array.
    Each line yielded by stripped_lines becomes one row,
    and each character of that line becomes one integer entry.
    @param multiline_state: a string whose lines consist of the
    characters '0' and '1'
    @return: a numpy array built from the list of integer rows
    @raise ValueError: if a character is not an integer literal
    (raised by int) or is an integer other than 0 or 1
    """
    def parse_allele(symbol):
        # Validate one character at a time so that the first
        # offending character determines the error that is raised.
        allele = int(symbol)
        if allele != 0 and allele != 1:
            raise ValueError('invalid allele')
        return allele

    rows = [
            [parse_allele(symbol) for symbol in text]
            for text in stripped_lines(multiline_state.splitlines())]
    return np.array(rows)