def main(file1=None, file2=None, base_directory=None, configuration_file=None,
         config_base_directory=None, file_out='input_data.txt.gz',
         row_wise_out=None, col_wise_out=None):
    # Load configuration for I/O (or automatically generate if not provided as an input)
    io_config = io.load_io_config(file1_path=file1, file2_path=file2, base_dir=base_directory,
                                  config_path=configuration_file, config_base_dir=config_base_directory)

    # Load and merge datasets
    data = io.load_datasets_from_config(io_config)

    # Get data row-wise and column-wise in json format
    if row_wise_out is not None:
        row_wise_data = preprocess.tabular2json(data, data.index, data.columns, by_col=False)
        io.write_json(row_wise_data, row_wise_out)
    if col_wise_out is not None:
        col_wise_data = preprocess.tabular2json(data, data.index, data.columns, by_col=True)
        io.write_json(col_wise_data, col_wise_out)

    # Write merged data sets to output
    data.to_csv(file_out, sep='\t', index=True, header=True)

    return data
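
# io.write_json is a project helper that is not shown in this section. Given the .json and
# .json.gz output targets used throughout these scripts, a minimal, hypothetical sketch
# (assuming compression is inferred from the file extension) might look like the following;
# the project's real helper may differ.
import gzip
import json

def write_json_sketch(obj, path):
    """Serialize obj to JSON, gzip-compressing when the path ends in '.gz'."""
    if path.endswith('.gz'):
        with gzip.open(path, 'wt') as f:
            json.dump(obj, f)
    else:
        with open(path, 'w') as f:
            json.dump(obj, f)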
def main(file1=None, file2=None, base_directory=None, configuration_file=None,
         config_base_directory=None):
    # Load configuration for I/O (or automatically generate if not provided as an input)
    io_config = io.load_io_config(file1_path=file1, file2_path=file2, base_dir=base_directory,
                                  config_path=configuration_file, config_base_dir=config_base_directory)

    # Load and merge datasets
    data = io.load_datasets_from_config(io_config)

    # Filter out low-information columns
    data = preprocess.eliminate_low_information_columns(data, 0.01)

    # Identify, separate, and save numeric and categorical columns. Numeric data will be used for
    # clustering and categorical (attribute) data will be used for enrichment analysis. Note that
    # categorical data is saved as occurrence counts (in json format)
    data_numeric, data_object = preprocess.preprocess_and_split(data, fill_na=True)

    # Save numeric data to file in row-wise and column-wise formats
    data_numeric.to_csv('rows_numeric_data.txt.gz', sep='\t', header=False)    # Row-wise
    data_numeric.T.to_csv('cols_numeric_data.txt.gz', sep='\t', header=False)  # Column-wise

    # Save categorical (attribute) data
    data_object.to_json('cols_attribute_data.json.gz', orient='columns')

    # Save counts of value occurrences (in json format)
    data_object = preprocess.tabular2json(data_object.values, data_object.index, data_object.columns,
                                          by_col=True, pad_rows=False)
    data_object = preprocess.generate_occurrence_counts(data_object)
    io.write_json(data_object, 'cols_attribute_counts.json.gz')
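
# preprocess.preprocess_and_split is not shown here. As a rough, hypothetical illustration of the
# numeric/categorical split described in the comments above (the project's real implementation may
# fill missing values and preprocess columns differently), one could write:
import pandas as pd

def split_numeric_object_sketch(df, fill_na=True):
    """Illustrative only: separate numeric columns from object (categorical) columns."""
    data_numeric = df.select_dtypes(include='number')
    data_object = df.select_dtypes(exclude='number')
    if fill_na:
        data_numeric = data_numeric.fillna(data_numeric.mean())  # one possible fill strategy
    return data_numeric, data_object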
def main(file, sep=None, comment=None, index_col=None, header_row=None,
         rows_out='rows_out.json.gz', cols_out='cols_out.json.gz'):
    assert isinstance(file, str)

    # Read data as a data frame
    data = pd.read_csv(file, sep=sep, comment=comment, index_col=index_col, header=header_row)

    # Identify numerical and categorical columns and record the classification
    num_cols, cat_cols = identify_types(data)
    with open('col_types.json', 'w') as f:
        json.dump({'categorical': cat_cols, 'numerical': num_cols}, f, sort_keys=True, indent='\t')

    # Drop columns with missing data
    data.dropna(axis=1, inplace=True)

    # Get row and column labels as strings
    row_labels = [str(x) for x in data.index]
    col_labels = [str(x) for x in data.columns]

    # Convert to list of lists for subsequent processing
    data = data.values

    # Get data row-wise and column-wise in json format
    rowwise_data = preprocess.tabular2json(data, row_labels, col_labels, by_col=False, pad_rows=False)
    colwise_data = preprocess.tabular2json(data, row_labels, col_labels, by_col=True, pad_rows=True)

    # Write row-wise and column-wise json
    io.write_json(rowwise_data, rows_out)
    io.write_json(colwise_data, cols_out)

    return rowwise_data, colwise_data
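
# identify_types is defined elsewhere in the project. Judging from the call above, it returns the
# names of the numerical and of the categorical columns; a hypothetical pandas-based version could be:
import pandas as pd

def identify_types_sketch(df):
    """Illustrative only: classify columns as numerical or categorical by dtype."""
    num_cols = [str(c) for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]
    cat_cols = [str(c) for c in df.columns if not pd.api.types.is_numeric_dtype(df[c])]
    return num_cols, cat_cols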
def main(file, out='hist_out.json'):
    assert isinstance(file, str)

    # Load data from .json or .json.gz file
    data = io.load_json(file)

    # Generate "histogram" of occurrence counts as a dictionary
    data_counts = preprocess.generate_occurrence_counts(
        data, to_lower=True, replace_whitespace='-', collapse_singletons=True)

    # Write data to .json or .json.gz file format
    io.write_json(data_counts, out)

    return data_counts
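
# preprocess.generate_occurrence_counts is part of this project; the sketch below is only a guess at
# its general shape, based on the parameter names in the call above (collapse_singletons is omitted
# because its exact behaviour is not clear from this call site).
from collections import Counter

def occurrence_counts_sketch(colwise_data, to_lower=True, replace_whitespace='-'):
    """Illustrative only: count value occurrences per column of a {column: [values]} mapping."""
    counts = {}
    for col, values in colwise_data.items():
        normalized = []
        for v in values:
            s = str(v)
            if to_lower:
                s = s.lower()
            if replace_whitespace:
                s = replace_whitespace.join(s.split())
            normalized.append(s)
        counts[col] = dict(Counter(normalized))
    return counts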
def main(file, length=100, out='fp_out.json'):
    assert isinstance(file, str)

    # Load data from .json or .json.gz file
    data = io.load_json(file)

    # Calculate fingerprints
    data_fp = make.encode_fp(data, length)

    # Convert numpy arrays to lists for conversion to json
    data_fp = {k: v.tolist() for k, v in data_fp.items()}

    # Write data to .json or .json.gz file format
    io.write_json(data_fp, out)

    return data_fp
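
# Example invocation (file names are illustrative), fingerprinting the column-wise json written by
# the tabular-to-json step above, assuming make.encode_fp accepts that {column: [values]} structure:
# data_fp = main('cols_out.json.gz', length=128, out='fp_out.json.gz')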
def main(file, method='ward', criterion='distance',
         cl_labels_out='cluster_labels.txt.gz', cl_members_out='cluster_members.json.gz'):
    assert isinstance(file, str)

    # Load data to be clustered
    data = pd.read_table(file, index_col=0, header=None)

    # Normalize data down columns (i.e. features or attributes)
    data = (data - data.mean(axis=0)) / data.std(axis=0)

    # Use data index as labels
    labels = np.array(data.index)

    # Perform clustering
    linkage_table = shc.linkage(data.values, method=method)

    # Identify members of all clusters of size 2 or greater
    cluster_members = cluster.get_cluster_membership(linkage_table, labels)

    # Write cluster membership to file
    io.write_json(cluster_members, cl_members_out)

    # Generate a list of dendrogram cutoff values
    distances, num_distances = linkage_table[:, 2], len(linkage_table[:, 2])
    cutoff_values = [np.mean(distances[i:min(i + 2, num_distances)]) for i in range(num_distances)]

    # Generate cluster assignments at different cutoff values
    cluster_assignments = [fcluster(linkage_table, c, criterion=criterion) for c in reversed(cutoff_values)]

    # Write cluster labels to file
    index = list(range(len(cluster_assignments)))
    cluster_assignments = pd.DataFrame(cluster_assignments, index=index, columns=labels)
    cluster_assignments.to_csv(cl_labels_out, sep='\t', index=True)

    return cluster_assignments
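
# Small worked example of the cutoff computation above: consecutive merge distances are averaged
# pairwise, and the largest distance is kept as the final cutoff.
import numpy as np

distances_example = np.array([1.0, 2.0, 4.0])  # toy merge heights from a linkage table
n = len(distances_example)
cutoffs_example = [float(np.mean(distances_example[i:min(i + 2, n)])) for i in range(n)]
print(cutoffs_example)  # [1.5, 3.0, 4.0]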
def main(file, link_table_in='rows_hier_linkage.txt.gz', cl_members_out='cluster_members.json.gz'):
    assert isinstance(file, str)

    # Load the labels from the PCA output file - as strings
    labels = np.genfromtxt(fname=file, delimiter='\t', dtype='str', usecols=range(0, 1))

    # Load hierarchical clustering linkage table
    linkage_table = pd.read_table(link_table_in, index_col=False, header=None)

    # Identify members of all clusters of size 2 or greater
    cluster_members = cluster.get_cluster_membership(linkage_table.values, labels)

    # Write cluster membership to file
    io.write_json(cluster_members, cl_members_out)

    return cluster_members
def main(source_file, cl_members_file, counts_out_file):
    assert isinstance(source_file, str)

    # Load original data in json format
    data = io.load_json(source_file)
    data = pd.DataFrame(data)

    # Load cluster members and labels data
    cl_members_data = io.load_json(cl_members_file)

    # Get occurrence counts for all attributes and parent/children trios at all branch points
    # in the cluster hierarchy
    all_occurrence_counts = {}
    for i in range(len(cl_members_data) - 1):
        parent_cnts, child1_cnts, child2_cnts = \
            enrichment.get_parent_child_occurrence_counts(str(i), data, cl_members_data)
        all_counts = {'parent': parent_cnts, 'child1': child1_cnts, 'child2': child2_cnts}
        all_occurrence_counts[str(i)] = all_counts

    # Write occurrence count data to file
    io.write_json(all_occurrence_counts, counts_out_file)
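
# Example invocation, chaining the outputs of the earlier steps (file names follow the defaults used
# above and are otherwise illustrative):
# main('cols_attribute_data.json.gz', 'cluster_members.json.gz', 'cluster_attribute_counts.json.gz')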
        # (tail of the pairwise correlation loop: `blob` holds one column pair's statistics)
    }
    pairs.append(blob)

# Report the number of numerical columns and tests, then the significant pairs
print("#numerical", N, sep="\t")
print("#tests_done", tests_done, sep="\t")
print('variableA', 'variableB', 'N', 'rho', 'pval', sep="\t")

# Bonferroni correction: keep a pair only if its corrected p-value stays at or below 0.05
significant = []
for pair in pairs:
    corrected = pair['pval'] * tests_done
    if corrected <= 0.05:
        pair['pval'] = format(corrected, '.2e')
        significant.append(pair)
        print(pair['A'], pair['B'], pair['N'], pair['rho'], pair['pval'], sep="\t")

# Summarize the Spearman correlation results and write them to file
result = {
    'relationship': 'is_correlated_to',
    'test_type': 'Spearman correlation',
    'correction': 'Bonferroni',
    'numerical_columns': N,
    'tests_done': tests_done,
    'tests_passed': len(significant),
    'tests': significant
}
io.write_json(result, 'num_assoc.json.gz')
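
# The pairwise loop that builds `pairs`, `N`, and `tests_done` above is not shown in this section.
# A plausible sketch, assuming scipy.stats.spearmanr over all pairs of numerical columns of a
# DataFrame `data` (names and structure are assumptions, not the project's actual code):
from itertools import combinations
from scipy.stats import spearmanr

def spearman_pairs_sketch(data, num_cols):
    """Illustrative only: Spearman correlation for every pair of numerical columns."""
    pairs, tests_done = [], 0
    for col_a, col_b in combinations(num_cols, 2):
        sub = data[[col_a, col_b]].dropna()
        rho, pval = spearmanr(sub[col_a], sub[col_b])
        tests_done += 1
        pairs.append({'A': col_a, 'B': col_b, 'N': len(sub), 'rho': rho, 'pval': pval})
    return pairs, tests_done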