def collect_studies_unique_comparisons(
    bin_studies=None,
):
    """
    Gathers the sorted, unique names of comparison columns across all studies.

    Every column of a study's data other than "identifier" and "name" counts
    as a comparison.

    arguments:
        bin_studies (dict): collection of information about each study; the
            entry "data" of each study provides the columns

    raises:

    returns:
        (list<str>): unique identifiers of comparisons from all studies

    """

    columns_excluded = ["identifier", "name"]
    names_all = []
    for entry in bin_studies.values():
        names_study = [
            column for column in entry["data"].columns.to_list()
            if column not in columns_excluded
        ]
        names_all.extend(names_study)
    names_unique = utility.collect_unique_elements(
        elements_original=names_all
    )
    return sorted(names_unique)
def collect_orphan_gene_set(
    sets=None,
    genes_query=None,
):
    """
    Collects the query genes that do not belong to the union of gene sets.

    The function does not modify the original collection of sets. It returns
    a deep copy with the additional entry "orphan".

    arguments:
        sets (dict<list<str>>): sets of genes, including the entry "union"
        genes_query (list<str>): identifiers of genes in original enrichment
            query

    raises:
        KeyError: if the collection of sets does not have an entry "union"

    returns:
        (dict<list<str>>): sets of genes with the additional entry "orphan"

    """

    sets = copy.deepcopy(sets)
    # Build the lookup once, since membership tests on a list are linear.
    genes_union = set(sets["union"])
    orphans_raw = [gene for gene in genes_query if gene not in genes_union]
    sets["orphan"] = utility.collect_unique_elements(
        elements_original=orphans_raw
    )
    # Return information.
    return sets
def collect_report_ontology_parentage_orphan_genes(
    cluster_reports=None,
    genes_query=None,
    report=None,
):
    """
    Reports counts of genes with parentage in ontology sets and of orphan
    genes from the query.

    Parentage genes are the unique genes from the field "Genes" across the
    reports of all clusters. Orphan genes are the genes from the query that
    are not parentage genes.

    This function names the index of each cluster's report "set" in place on
    the object that the caller provides. It does not return any information.

    arguments:
        cluster_reports (dict): reports for each cluster; the entry "report"
            of each cluster provides the sets and their genes
        genes_query (list<str>): identifiers of genes in original enrichment
            query
        report (bool): whether to print reports

    raises:

    returns:

    """

    # Collect genes from all sets of all clusters.
    genes_collection = list()
    for cluster in cluster_reports.values():
        data_report = cluster["report"]
        data_report.rename_axis(
            index="set",
            axis="index",
            copy=False,
            inplace=True,
        )
        records = utility.convert_dataframe_to_records(data=data_report)
        for record in records:
            # Field "Genes" is a single string with delimiter ", ".
            genes_collection.extend(record["Genes"].split(", "))
    # Collect unique genes with parentage.
    genes_parentage = utility.collect_unique_elements(
        elements_original=genes_collection,
    )
    # Collect orphan genes.
    # Build the lookup once, since membership tests on a list are linear.
    parentage_lookup = set(genes_parentage)
    genes_orphan = [
        gene for gene in genes_query if gene not in parentage_lookup
    ]
    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print("unique parentage and orphan genes")
        print("parentage genes: " + str(len(genes_parentage)))
        print("orphan genes: " + str(len(genes_orphan)))
        utility.print_terminal_partition(level=2)
def collect_studies_unique_gene_identifiers(
    bin_studies=None,
    report=None,
):
    """
    Gathers the unique, valid identifiers of genes across all studies.

    An identifier is valid if its text representation contains "ENSG".

    arguments:
        bin_studies (dict): collection of information about each study; the
            entry "data" of each study provides the column "identifier"
        report (bool): whether to print reports

    raises:

    returns:
        (list<str>): unique identifiers of genes from all studies

    """

    def is_valid(identifier):
        # Text conversion tolerates identifiers that are not strings.
        return "ENSG" in str(identifier)

    if report:
        utility.print_terminal_partition(level=2)
        print("unique genes from each study")
    identifiers_all = []
    for study, entry in bin_studies.items():
        identifiers_study = utility.collect_unique_elements(
            elements_original=entry["data"]["identifier"].to_list()
        )
        # The count of valid identifiers in the study is only for the report.
        identifiers_study_valid = [
            identifier for identifier in identifiers_study
            if is_valid(identifier)
        ]
        identifiers_all.extend(identifiers_study)
        if report:
            print(
                "study " + study + " : " + str(len(identifiers_study_valid))
            )
    # Keep only valid identifiers from the collection across studies.
    identifiers_valid = [
        identifier for identifier in identifiers_all if is_valid(identifier)
    ]
    return utility.collect_unique_elements(
        elements_original=identifiers_valid
    )
def translate_study_comparisons_identifiers(
    bin_studies=None,
    report=None,
):
    """
    Gives the comparisons of each study names that are specific to the study.

    The new name of a comparison is the study's identifier, an underscore,
    and the original name of the comparison. The function works on a deep
    copy of the collection. To each study it adds the entries "comparisons"
    and "comparisons_translation", and it renames the comparison columns in
    the study's "data".

    arguments:
        bin_studies (dict): collection of information about each study
        report (bool): whether to print reports

    raises:

    returns:
        (dict): collection of information about each study with unique
            comparisons in each study

    """

    studies = copy.deepcopy(bin_studies)
    columns_excluded = ["identifier", "name"]
    for entry in studies.values():
        prefix = entry["identifier"]
        table = entry["data"]
        # Every column other than the exclusions is a comparison.
        candidates = [
            column for column in table.columns.to_list()
            if column not in columns_excluded
        ]
        names_original = sorted(
            utility.collect_unique_elements(elements_original=candidates)
        )
        entry["comparisons"] = names_original
        # Map each original name to its name with the study's prefix.
        entry["comparisons_translation"] = {
            original: str(prefix + "_" + original)
            for original in names_original
        }
        # Translate comparison columns.
        table.rename(
            columns=entry["comparisons_translation"],
            inplace=True,
        )
        entry["data"] = table
        if report:
            print(table)
    return studies
def collect_union_gene_set(
    sets=None,
):
    """
    Adds to a collection of sets the union of the elements of all its sets.

    The function does not modify the original collection. It returns a deep
    copy with the additional entry "union".

    arguments:
        sets (dict<dict<list<str>>>): sets of genes

    raises:

    returns:
        (dict<dict<list<str>>>): sets of genes

    """

    collection = copy.deepcopy(sets)
    # Pool the members of every set before the reduction to unique elements.
    members_all = []
    for members in collection.values():
        members_all.extend(members)
    collection["union"] = utility.collect_unique_elements(
        elements_original=members_all
    )
    return collection
def select_genes_by_modality_measures_ranks(
    genes=None,
    proportion_least=None,
    proportion_greatest=None,
    measures=None,
    data_distribution_report=None,
    report=None,
):
    """
    Selects genes with least and greatest values of measures of modality.

    For each measure, the function sorts the genes in ascending order of the
    measure, and it selects a proportion of genes from each extreme. Counts
    and positions derive from the genes that actually have records in the
    report, such that genes missing from the report do not shift the
    selection. A selection count of zero gives an empty list of genes and a
    missing value (NaN) for the threshold.

    arguments:
        genes (list<str>): identifiers of genes for which to consider
            modalities
        proportion_least (float): proportion of genes to select from those
            with least values of modality measures
        proportion_greatest (float): proportion of genes to select from those
            with greatest values of modality measures
        measures (list<str>): measures of modality
        data_distribution_report (object): Pandas data frame of information
            about genes and their measures of modality, with genes'
            identifiers in the index and a column "name"
        report (bool): whether to print reports

    raises:

    returns:
        (dict): information about selection of genes; for each measure, the
            entries "least" and "greatest" each have a "threshold" and a list
            of "genes"; the entry "measures_thresholds" has each measure's
            threshold for greatest values

    """

    # Organize data.
    genes = copy.deepcopy(genes)
    data_report = data_distribution_report.copy(deep=True)
    data_report_genes = data_report.loc[data_report.index.isin(genes), :]
    # Calculate count of genes to select from least and greatest extremes.
    # Count the records that are available, since positions in the sorted
    # table depend on the count of records and not on the count of queries.
    count_total = data_report_genes.shape[0]
    count_least = min(
        max(round(proportion_least * count_total), 0), count_total
    )
    count_greatest = min(
        max(round(proportion_greatest * count_total), 0), count_total
    )
    # Report.
    if report:
        print(
            "selection percentage least: " +
            str(round((proportion_least * 100), 2))
        )
        print("selection count least: " + str(count_least))
        utility.print_terminal_partition(level=3)
        print(
            "selection percentage greatest: " +
            str(round((proportion_greatest * 100), 2))
        )
        print("selection count greatest: " + str(count_greatest))
    # Iterate on measures of modality.
    selections = dict()
    for measure in measures:
        # Sort a new table by values of the measure.
        # NOTE: the default sort places missing values last, so any missing
        # values fall among the greatest.
        data_measure = data_report_genes.loc[:, ["name", measure]].sort_values(
            by=[measure],
            axis="index",
            ascending=True,
        )
        # Select least and greatest genes.
        # Column at position 1 is the measure.
        # Threshold for least is the last value within the selection.
        # Threshold for greatest is the first value within the selection.
        position_greatest = (count_total - count_greatest)
        data_least = data_measure.iloc[:count_least]
        data_greatest = data_measure.iloc[position_greatest:]
        if count_least > 0:
            threshold_least = data_measure.iat[(count_least - 1), 1]
        else:
            threshold_least = float("nan")
        if count_greatest > 0:
            threshold_greatest = data_measure.iat[position_greatest, 1]
        else:
            threshold_greatest = float("nan")
        genes_least = utility.collect_unique_elements(
            elements_original=data_least.index.to_list()
        )
        genes_greatest = utility.collect_unique_elements(
            elements_original=data_greatest.index.to_list()
        )
        # Collect information.
        selections[measure] = {
            "least": {
                "threshold": threshold_least,
                "genes": genes_least,
            },
            "greatest": {
                "threshold": threshold_greatest,
                "genes": genes_greatest,
            },
        }
    # Organize measures' thresholds for plot.
    selections["measures_thresholds"] = {
        measure: selections[measure]["greatest"]["threshold"]
        for measure in measures
    }
    # Return information.
    return selections
def determine_selection_distribution_genes_valid_modalities(
    genes_selection=None,
    genes_distribution=None,
    data_distribution_report=None,
    report=None,
):
    """
    Determines selection genes with valid modalities from distribution
    procedure.

    Only genes with adequate signal coverage across tissues and persons have
    valid modalities from distribution procedure. A gene has valid modalities
    if its record has no missing value in any of the columns "gene",
    "coefficient", "mixture", or "dip" that the report includes.

    arguments:
        genes_selection (list<str>): identifiers of genes from selection
        genes_distribution (list<str>): identifiers of genes from
            distribution procedure; only the report of counts uses these
        data_distribution_report (object): Pandas data frame of information
            about genes and their measures of modality, with genes'
            identifiers in the index
        report (bool): whether to print reports

    raises:

    returns:
        (dict): information; entry "data_distribution_report" is the report
            for genes with valid modalities, and entry
            "genes_selection_distribution" is the list of selection genes
            with valid modalities

    """

    # Organize data.
    genes_selection = copy.deepcopy(genes_selection)
    data_report = data_distribution_report.copy(deep=True)
    # Select genes with valid distribution modalities.
    # Remove missing values in a new table, since removal in place on a
    # selection from another table provokes the chained assignment warning.
    columns_modality = ["gene", "coefficient", "mixture", "dip"]
    data_valid = data_report.loc[
        :, data_report.columns.isin(columns_modality)
    ].dropna(
        axis="index",
        how="any",
    )
    genes_distribution_valid = utility.collect_unique_elements(
        elements_original=data_valid.index.to_list()
    )
    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print(
            "count of all distribution genes: " +
            str(len(genes_distribution))
        )
        utility.print_terminal_partition(level=4)
        print(
            "count of all distribution genes with valid modalities: " +
            str(len(genes_distribution_valid))
        )
        utility.print_terminal_partition(level=2)
    # Select genes from selection procedure with valid distribution
    # modalities.
    # NOTE(review): presumably keeps the elements common to both lists;
    # confirm against the utility module.
    genes_selection_distribution_valid = utility.filter_common_elements(
        list_one=genes_selection,
        list_two=genes_distribution_valid,
    )
    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print(
            "count of all selection genes: " +
            str(len(genes_selection))
        )
        utility.print_terminal_partition(level=4)
        print(
            "count of all selection genes with valid modalities: " +
            str(len(genes_selection_distribution_valid))
        )
        utility.print_terminal_partition(level=2)
    # Compile information.
    information = {
        "data_distribution_report": data_valid,
        "genes_selection_distribution": genes_selection_distribution_valid,
    }
    # Return information.
    return information
def split_report_write_genes_signals(
    cohort=None,
    persons=None,
    data_samples_tissues_persons=None,
    data_gene_signal=None,
    path_directory=None,
    report=None,
):
    """
    Drives the split procedure for a single cohort of persons.

    The function copies both data frames, passes them with the persons to
    "select_samples_signals_persons", passes the selection of genes' signals
    to "split_genes_signals", and passes the genes and their signals to
    "write_product".

    arguments:
        cohort (str): cohort of persons--selection, respiration, or
            ventilation
        persons (list<str>): identifiers of persons
        data_samples_tissues_persons (object): Pandas data frame of persons
            and tissues across samples
        data_gene_signal (object): Pandas data frame of genes' signals across
            samples
        path_directory (str): path to directory for product directories and
            files
        report (bool): whether to print reports about the selection

    raises:

    returns:

    """

    if report:
        utility.print_terminal_partition(level=1)
        title = "... Split procedure for: " + str(cohort) + " persons..."
        print(title)
        count_persons = len(persons)
        print("Count persons: " + str(count_persons))
        utility.print_terminal_partition(level=2)

    # Work on copies so that the caller's data frames stay intact.
    samples_copy = data_samples_tissues_persons.copy(deep=True)
    signals_copy = data_gene_signal.copy(deep=True)

    # Select samples for relevant persons.
    selection = select_samples_signals_persons(
        persons=persons,
        data_samples_tissues_persons=samples_copy,
        data_gene_signal=signals_copy,
    )
    # Split genes' signals across tissues and persons by gene.
    # The split receives the complete copy of samples, tissues, and persons
    # along with the selection of genes' signals.
    signals_by_gene = split_genes_signals(
        data_samples_tissues_persons=samples_copy,
        data_gene_signal=selection["data_gene_signal"],
    )
    # Organize genes' identifiers.
    # Format of genes' identifiers needs to be readable by Bash as an array.
    identifiers = list(signals_by_gene.keys())
    genes_unique = utility.collect_unique_elements(
        elements_original=identifiers
    )
    # Summarize information for demonstration.
    if report:
        summarize_genes_samples_signals(
            genes_samples_signals=signals_by_gene,
        )
    # Write product information to file.
    # Separate files for each gene conserve memory in the parallel pipeline.
    product = dict()
    product["genes"] = genes_unique
    product["genes_samples_signals"] = signals_by_gene
    write_product(
        information=product,
        path_directory=path_directory,
    )
def collect_ontology_enrichment_cluster_gene_sets(
    cluster_reports=None,
    report=None,
):
    """
    Gathers the unique genes of all children sets within each parent cluster.

    The genes of each child set come from the field "Genes", which is a
    single string with delimiter ", ". This function names the index of each
    cluster's report "set" in place on the object that the caller provides.

    arguments:
        cluster_reports (dict): reports for each cluster; each cluster has
            entries "name" and "report"
        report (bool): whether to print reports

    raises:

    returns:
        (dict<list<str>>): identifiers of genes in each parent set

    """

    genes_by_parent = {}
    for cluster in cluster_reports.values():
        parent = cluster["name"]
        table = cluster["report"]
        table.rename_axis(
            index="set",
            axis="index",
            copy=False,
            inplace=True,
        )
        children = utility.convert_dataframe_to_records(data=table)
        # Pool the genes from every child set of the parent.
        genes_pool = []
        for child in children:
            genes_pool += child["Genes"].split(", ")
        genes_by_parent[parent] = utility.collect_unique_elements(
            elements_original=genes_pool,
        )

    if report:
        utility.print_terminal_partition(level=2)
        print("unique genes from each parent set")
        for cluster in cluster_reports.values():
            utility.print_terminal_partition(level=3)
            parent = cluster["name"]
            count_sets = cluster["report"].shape[0]
            count_genes = len(genes_by_parent[parent])
            print("parent: " + parent)
            print("count of children sets: " + str(count_sets))
            print("count of children genes: " + str(count_genes))
        utility.print_terminal_partition(level=2)

    return genes_by_parent
def select_covid19_genes_by_studies_fold_directions(
    data_genes_comparisons_studies=None,
    genes_selection=None,
    threshold_studies=None,
    report=None,
):
    """
    Collects and organizes genes that show differential expression in
    multiple studies and comparisons.

    The function considers only genes from the selection that have records
    in the data, and of those only genes with at least the threshold count of
    studies. It sorts these genes by the direction of their changes.
        "up": accumulations exceed depletions by more than one
        "down": depletions exceed accumulations by more than one
        "mix": accumulations and depletions are equal or differ by one

    arguments:
        data_genes_comparisons_studies (object): Pandas data frame of genes'
            differential expression in studies, with genes' identifiers in
            the index and columns "studies", "accumulations", and
            "depletions"
        genes_selection (list<str>): identifiers of genes
        threshold_studies (int): minimal count of studies
        report (bool): whether to print reports

    raises:

    returns:
        (dict<list<str>>): sets of genes, with entries "any", "up", "down",
            and "mix"

    """

    # Copy data.
    data = data_genes_comparisons_studies.copy(deep=True)
    # Select data for genes that match selection for study.
    data = data.loc[data.index.isin(genes_selection), :]
    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print(
            "Count of differential expression genes that match selection " +
            "of genes for study."
        )
        print("selection genes DE COVID-19: " + str(data.shape[0]))
    # Select data for genes that match threshold.
    data_studies = data.loc[data["studies"] >= threshold_studies, :]
    genes_any = utility.collect_unique_elements(
        elements_original=data_studies.index.to_list()
    )
    # Select data for genes that show accumulation in majority of studies.
    data_accumulation = data_studies.loc[
        data_studies["accumulations"] > (data_studies["depletions"] + 1), :
    ]
    genes_accumulation = utility.collect_unique_elements(
        elements_original=data_accumulation.index.to_list()
    )
    # Select data for genes that show depletion in majority of studies.
    data_depletion = data_studies.loc[
        data_studies["depletions"] > (data_studies["accumulations"] + 1), :
    ]
    genes_depletion = utility.collect_unique_elements(
        elements_original=data_depletion.index.to_list()
    )
    # Select data for genes that show a mix of accumulation and depletion
    # without a majority of more than one in either direction.
    data_mix = data_studies.loc[
        lambda datum:
            (datum["accumulations"] == datum["depletions"]) |
            (datum["accumulations"] == (datum["depletions"] + 1)) |
            (datum["depletions"] == (datum["accumulations"] + 1))
    ]
    genes_mix = utility.collect_unique_elements(
        elements_original=data_mix.index.to_list()
    )
    # Compile information.
    sets_genes = {
        "any": genes_any,
        "up": genes_accumulation,
        "down": genes_depletion,
        "mix": genes_mix,
    }
    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print("Count of differential expression genes that match threshold.")
        print("threshold count of studies: " + str(threshold_studies))
        print("genes DE COVID-19 any direction: " + str(len(genes_any)))
        print("genes by accumulation: " + str(len(genes_accumulation)))
        print("genes by depletion: " + str(len(genes_depletion)))
        print("genes by mix of folds: " + str(len(genes_mix)))
    # Return information.
    return sets_genes
def collect_genes_annotations_studies_comparisons_valid_change(
    genes_identifiers=None,
    bin_studies=None,
    data_gene_annotation=None,
    warn=None,
):
    """
    Collects unique studies in which each gene has a valid fold change.

    The function skips genes whose identifiers do not contain "ENSG". For
    each other gene, it considers every comparison of every study that
    mentions the gene. A comparison counts if its value for the gene is not
    missing (NaN). A value of one or greater counts as an accumulation, and a
    value less than one counts as a depletion.

    Counts of comparisons, accumulations, and depletions are counts of unique
    names of comparisons, so comparisons that share a name across studies
    count once.

    arguments:
        genes_identifiers (list<str>): unique identifiers of genes in all
            studies
        bin_studies (dict): collection of information about each study; the
            entry "data" of each study has a column "identifier" and columns
            for comparisons
        data_gene_annotation (object): Pandas data frame of genes'
            annotations
        warn (bool): whether to print warnings

    raises:

    returns:
        (object): Pandas data frame of studies for each gene, with genes'
            identifiers in the index

    """

    # Organize each study's information a single time.
    # Identifiers and comparisons of a study do not vary across genes, and a
    # set supports rapid determination of whether a study mentions a gene.
    studies_information = dict()
    for study in bin_studies.keys():
        data_study = bin_studies[study]["data"]
        studies_information[study] = {
            "data": data_study,
            "identifiers": set(data_study["identifier"].to_list()),
            "comparisons": [
                column for column in data_study.columns.to_list()
                if column not in ["identifier", "name"]
            ],
        }
    # Iterate across genes.
    records = list()
    for gene_identifier in genes_identifiers:
        # Determine whether gene has a valid identifier.
        if "ENSG" not in str(gene_identifier):
            continue
        # Collect studies and comparisons for the gene.
        studies = list()
        comparisons_gene = list()
        accumulations = list()
        depletions = list()
        for study, information in studies_information.items():
            # Determine whether the study mentions current gene.
            if gene_identifier not in information["identifiers"]:
                continue
            # Consider each study comparison for the gene.
            for comparison in information["comparisons"]:
                value = determine_gene_study_comparison_value(
                    gene_identifier=gene_identifier,
                    comparison=comparison,
                    data_study=information["data"],
                    warn=warn,
                )
                # Determine whether the gene has a valid fold change.
                if math.isnan(value):
                    continue
                studies.append(study)
                comparisons_gene.append(comparison)
                # NOTE(review): presumably the value is a ratio for which one
                # represents no change; confirm against the source of values.
                if value >= 1:
                    accumulations.append(comparison)
                elif value < 1:
                    depletions.append(comparison)
        # Collect unique studies and comparisons.
        studies_unique = sorted(
            utility.collect_unique_elements(elements_original=studies)
        )
        comparisons_gene_unique = sorted(
            utility.collect_unique_elements(
                elements_original=comparisons_gene
            )
        )
        accumulations_unique = sorted(
            utility.collect_unique_elements(elements_original=accumulations)
        )
        depletions_unique = sorted(
            utility.collect_unique_elements(elements_original=depletions)
        )
        # Organize record.
        record = dict()
        record["identifier"] = gene_identifier
        record["studies"] = len(studies_unique)
        record["comparisons"] = len(comparisons_gene_unique)
        record["reference"] = ";".join(studies_unique)
        record["accumulations"] = len(accumulations_unique)
        record["depletions"] = len(depletions_unique)
        annotations = assembly.access_gene_contextual_annotations(
            gene_identifier=gene_identifier,
            data_gene_annotation=data_gene_annotation,
        )
        record.update(annotations)
        records.append(record)
    # Organize data.
    data = pandas.DataFrame(data=records)
    data.set_index(
        "identifier",
        drop=True,
        inplace=True,
    )
    return data