def read_universal_counts(feature_names, state_names, file, file_type, feature_states_file):
    """ This is a helper function to import global counts of each category of each feature,
        which then define dirichlet distributions that are used as a prior for p_global.

        Args:
            feature_names (dict): the feature names (internal and external)
            state_names (dict): the category names (internal and external)
            file (str): the file location of the pseudocounts for the prior
            file_type (str): Two options:
                ´counts_file´: the counts are given in aggregate form (per feature state)
                ´features_file´: the counts need to be extracted from a features file
            feature_states_file (str): the csv file containing the possible state values per feature

        Returns:
            np.array, str: the counts for each state in each feature and a log message
    """
    # Read the global counts from csv
    if file_type == 'counts_file':
        counts, feature_names_file, state_names_file = read_feature_occurrence_from_csv(file, feature_states_file)
    else:
        _, _, features, feature_names_file, state_names_file, *_ = read_features_from_csv(file, feature_states_file)
        counts = np.sum(features, axis=0)

    # # # Sanity checks # # #

    # The ´counts´ matrix has the right shape
    n_features = len(feature_names_file['external'])
    n_states = max(len(f_states) for f_states in state_names_file['external'])
    assert counts.shape == (n_features, n_states)

    # Are the data count data?
    if not all(float(y).is_integer() for y in np.nditer(counts)):
        out = f"The data in {file} must be count data."
        raise ValueError(out)

    # Same number of features?
    assert len(feature_names['external']) == len(feature_names_file['external'])
    assert len(feature_names['internal']) == len(feature_names_file['internal'])

    # Same feature names?
    for f in range(len(feature_names['external'])):
        assert feature_names['external'][f] == feature_names_file['external'][f]
        assert feature_names['internal'][f] == feature_names_file['internal'][f]

    # Same number of categories?
    assert len(state_names['external']) == len(state_names_file['external'])
    assert len(state_names['internal']) == len(state_names_file['internal'])

    # Same category names?
    for f in range(len(state_names['external'])):
        assert state_names['external'][f] == state_names_file['external'][f]
        assert state_names['internal'][f] == state_names_file['internal'][f]

    # Return with log message
    log = f"Read universal counts from {file}"
    return counts, log
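
# Usage sketch (illustrative only): the file paths below are hypothetical and the
# ´feature_names´/´state_names´ dicts are assumed to come from a prior call to
# read_features_from_csv().
#
#   universal_counts, log = read_universal_counts(
#       feature_names=feature_names,
#       state_names=state_names,
#       file='priors/universal_counts.csv',
#       file_type='counts_file',
#       feature_states_file='data/feature_states.csv',
#   )
#   print(log)
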
def read_inheritance_counts(family_names, feature_names, state_names, files, file_type, feature_states_file):
    """ This is a helper function to import the counts of each feature in the families,
        which define dirichlet distributions that are used as a prior for p_family.

        Args:
            family_names (dict): the names of the families (internal and external)
            feature_names (dict): the feature names (internal and external)
            state_names (dict): the category names (internal and external)
            files (dict): path to the file locations
            file_type (str): Two options:
                ´counts_file´: the counts are given in aggregate form (per feature state)
                ´features_file´: the counts need to be extracted from a features file
            feature_states_file (str): the csv file containing the possible state values per feature

        Returns:
            np.array, str: the counts per family for each state in each feature, and a log message
    """
    n_families = len(family_names['external'])
    n_features = len(feature_names['external'])
    n_states = max(len(s) for s in state_names['external'])

    counts_all = np.zeros([n_families, n_features, n_states])
    log = ""

    for fam_idx in range(n_families):
        fam_name = family_names['external'][fam_idx]
        if fam_name not in files:
            log += f"No prior information for {fam_name}. Uniform prior used instead.\n"
            continue

        # Load counts for family ´fam_name´
        file = files[fam_name]
        if file_type == 'counts_file':
            counts, feature_names_file, state_names_file = read_feature_occurrence_from_csv(
                file, feature_states_file)
        else:
            _, _, features, feature_names_file, state_names_file, *_ = read_features_from_csv(
                file, feature_states_file)
            counts = np.sum(features, axis=0)

        counts_all[fam_idx, :, :] = counts
        log += f"Read counts for {fam_name} from {file}\n"

        # # # Sanity checks # # #

        # Are the data count data?
        if not all(float(y).is_integer() for y in np.nditer(counts)):
            out = f"The data in {file} must be count data."
            raise ValueError(out)

        # Same number of features?
        if len(feature_names['external']) != len(feature_names_file['external']) or \
                len(feature_names['internal']) != len(feature_names_file['internal']):
            out = f"Different number of features in {file} than in features."
            raise ValueError(out)

        # Same feature names?
        for f in range(len(feature_names['external'])):
            if feature_names['external'][f] != feature_names_file['external'][f]:
                out = f"The external feature name {f+1} in {file} differs from the one used in features."
                raise ValueError(out)
            if feature_names['internal'][f] != feature_names_file['internal'][f]:
                out = f"The internal feature name {f+1} in {file} differs from the one used in features."
                raise ValueError(out)

        # Same number of categories?
        if len(state_names['external']) != len(state_names_file['external']) or \
                len(state_names['internal']) != len(state_names_file['internal']):
            out = f"Different number of categories in {file} than in features."
            raise ValueError(out)

        # Same category names?
        for f in range(len(state_names['external'])):
            if state_names['external'][f] != state_names_file['external'][f]:
                out = f"The external category names for feature {f+1} in {file} differ from those used in features."
                raise ValueError(out)
            if state_names['internal'][f] != state_names_file['internal'][f]:
                out = f"The internal category names for feature {f+1} in {file} differ from those used in features."
                raise ValueError(out)

    return counts_all.astype(int), log
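
# Usage sketch (illustrative only): ´files´ maps external family names to per-family
# count files; the family names and paths below are hypothetical. Families without an
# entry in ´files´ fall back to a uniform prior, as noted in the returned log.
#
#   family_files = {
#       'FamilyA': 'priors/family_a_counts.csv',
#       'FamilyB': 'priors/family_b_counts.csv',
#   }
#   family_counts, log = read_inheritance_counts(
#       family_names=family_names,
#       feature_names=feature_names,
#       state_names=state_names,
#       files=family_files,
#       file_type='counts_file',
#       feature_states_file='data/feature_states.csv',
#   )
#   print(log)
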