Exemplo n.º 1
0
def check_zero_genes(data=None):
    """
    Reports rows of the data whose values equal zero in every column.

    Prints the shape of the original table, the shape of the table after
    exclusion of rows that consist entirely of zeros, and a preview (first ten
    rows and columns) of the rows that consist entirely of zeros. The function
    only prints; it does not change or return the data.

    arguments:
        data (object): Pandas data frame of genes' signals for all samples;
            the row-wise tests presume genes across rows (confirm against
            caller)

    raises:

    returns:

    """

    utility.print_terminal_partition(level=2)
    print("Check for genes with values of 0 for signals across all samples.")
    print("These genes are undetectable.")
    print(f"shape of original data frame: {data.shape}")
    # A row remains after exclusion if any of its values differs from zero.
    rows_any_signal = (data != 0).any(axis="columns")
    shape_detectable = data.loc[rows_any_signal, :].shape
    print(f"shape of data frame without zero genes: {shape_detectable}")
    print("Now printing a summary of data for genes with all zero signals.")
    # A row is undetectable if every one of its values equals zero.
    rows_all_zero = (data == 0).all(axis="columns")
    data_signal_zero = data.loc[rows_all_zero, :]
    print(data_signal_zero.iloc[0:10, 0:10])
Exemplo n.º 2
0
def filter_samples_by_signal_threshold(
    data=None,
    threshold=None,
):
    """
    Keeps only the columns (samples) in which at least one value is greater
    than or equal to the threshold.

    Data format should have samples across columns and genes across rows.
    Prints the threshold and the dimensions of the data before and after the
    filter.

    arguments:
        data (object): Pandas data frame of genes' signals across samples
        threshold (float): minimal signal that a sample must reach in at least
            one gene in order to pass the filter

    raises:

    returns:
        (object): Pandas data frame of genes' signals across samples

    """

    utility.print_terminal_partition(level=2)
    print(
        "Filter samples to keep only those with signals beyond threshold in "
        "at least one gene. \n"
        "Data format should have samples across columns and genes across rows."
    )
    print(f"signal threshold: {threshold}")
    print(f"data dimensions before filter: {data.shape}")
    # Reduce down each column: a sample passes if any of its values passes.
    samples_pass = (data >= threshold).any(axis="index")
    data_detection = data.loc[:, samples_pass]
    print(f"data dimensions after filter: {data_detection.shape}")
    utility.print_terminal_partition(level=3)
    return data_detection
Exemplo n.º 3
0
def check_redundancy_genes(data=None):
    """
    Reports whether any row of the data duplicates an earlier row.

    The index is first moved into columns so that it takes part in the
    comparison of rows. The function only prints; it does not change or
    return the caller's data.

    arguments:
        data (object): Pandas data frame of genes' signals for all samples.

    raises:

    returns:

    """

    utility.print_terminal_partition(level=2)
    print("Check for redundant genes in genes' signals.")
    print("Consider names of genes.")
    # "reset_index" returns a new frame with the index levels as columns.
    data_flat = data.reset_index()
    print(data_flat.iloc[0:10, 0:10])
    # Flag every row that repeats an earlier row across all columns.
    flags_duplicate = data_flat.duplicated(subset=None, keep="first").to_list()
    verdict = "Yes" if any(flags_duplicate) else "No"
    print("Redundancy in genes: " + verdict)
Exemplo n.º 4
0
def execute_procedure(path_dock=None):
    """
    Function to execute module's main behavior.

    Prints the dock path and a version marker, pauses for five seconds,
    initializes directories, reads source tables, organizes the regression
    tables between polygenic estimates of metabolites and phenotypes, and
    writes the organized tables to file.

    arguments:
        path_dock (str): path to dock directory for source and product
            directories and files

    raises:

    returns:

    """

    # Outline of procedure (from original notes).
    # 3. read in Coombes' metabolite regression tables (all)
    # 4. iterate on Coombes' metabolite regression tables
    # 5. match metabolite names to Shin 2014 metabolite identifiers
    # 6. merge Coombes' metabolite regression information with metabolite
    #    heritabilities
    # 7. filter metabolites by whether identifiable and SNP-heritability > 0.05
    # 8. calculate Benjamini-Hochberg False-Discovery Rates

    # Report version.
    utility.print_terminal_partition(level=1)
    print(path_dock)
    print("version check: 1")
    # Pause procedure.
    time.sleep(5.0)

    # Initialize directories without restoring (restore=False).
    paths = initialize_directories(restore=False, path_dock=path_dock)
    # Read source information from file.
    source = read_source(path_dock=path_dock, report=True)

    # Read and organize tables for regressions between polygenic estimate
    # metabolites and phenotypes.
    path_source_directory = paths["coombes_polygene"]
    table_reference = source["table_reference_shin_2014"]
    table_heritabilities = source["table_metabolite_heritabilities"]
    pail = read_organize_polygenic_metabolite_phenotype_regression_tables(
        threshold_metabolite_heritability=0.05,  # metabolite heritability
        path_source_directory=path_source_directory,
        table_reference_shin_2014=table_reference,
        table_metabolite_heritabilities=table_heritabilities,
        report=True,
    )

    # Collect information and write product information to file.
    write_product(paths=paths, information={"tables": pail})
Exemplo n.º 5
0
def standardize_gene_signal(data_gene_signal=None):
    """
    Transforms values of genes' signals to standard or z-score space and
    prints summary statistics before and after the transformation.

    Data has genes across rows and samples across columns.

           sample_1 sample_2 sample_3 sample_4 sample_5
    gene_1 ...      ...      ...      ...      ...
    gene_2 ...      ...      ...      ...      ...
    gene_3 ...      ...      ...      ...      ...
    gene_4 ...      ...      ...      ...      ...
    gene_5 ...      ...      ...      ...      ...

    arguments:
        data_gene_signal (object): Pandas data frame of genes' signals across
            samples

    raises:

    returns:
        (object): Pandas data frame of genes' signals across samples

    """

    def report_summary(title=None, data=None):
        # Print the row-wise mean and standard deviation of the first ten
        # rows of the data under the given title.
        utility.print_terminal_partition(level=3)
        print(title)
        means = data.apply(lambda row: row.mean(), axis="columns")
        print("Mean")
        print(means.iloc[0:10])
        deviations = data.apply(lambda row: row.std(), axis="columns")
        print("Standard deviation")
        print(deviations.iloc[0:10])

    # Transform signals to standard score space.
    data_standard = calculate_standard_score_gene_signal_by_gene(
        data_gene_signal=data_gene_signal
    )
    print(data_standard.iloc[0:10, 0:10])
    # Compare summary statistics before and after transformation.
    report_summary(
        title="Summary statistics for gene signals before standardization.",
        data=data_gene_signal,
    )
    report_summary(
        title="Summary statistics for gene signals after standardization.",
        data=data_standard,
    )
    return data_standard
Exemplo n.º 6
0
def filter_genes_by_bimodality_thresholds(
    measures=None,
    thresholds=None,
    data_genes_distributions=None,
    direction=None,
):
    """
    Filters genes by a threshold on each measure of bimodality and collects
    the identifiers of genes that pass for each measure.

    For each measure, the function prints the count of genes that pass.

    arguments:
        measures (list<str>): measures of bimodality
        thresholds (dict<dict<float>>): values of thresholds for measures of
            bimodality, with lookup first by direction and then by measure
        data_genes_distributions (object): Pandas data frame of information
            about genes and their measures of bimodality
        direction (str): direction of distribution from which to select, lesser
            or greater

    raises:

    returns:
        (dict<list<str>>): identifiers of genes that pass filtration by
            thresholds on each measure of bimodality

    """

    utility.print_terminal_partition(level=1)
    print(
        "count of genes filtered by probabilities of each bimodality "
        "measurement"
    )

    # Collect genes from filtration by each measurement of bimodality.
    entries = dict()
    for measure in measures:
        # Copy minimal data for the measure and filter genes by the
        # threshold that corresponds to the direction and measure.
        data_filter = filter_genes_by_bimodality_threshold(
            data=copy_split_minimal_gene_data(
                measure=measure,
                data_genes_distributions=data_genes_distributions,
            ),
            measure=measure,
            threshold=thresholds[direction][measure],
            direction=direction,
        )
        # Extract and report genes' identifiers.
        identifiers = data_filter["identifier"].tolist()
        utility.print_terminal_partition(level=3)
        print(measure + ": " + str(len(identifiers)))
        entries[measure] = identifiers
    return entries
Exemplo n.º 7
0
def extract_genes_modality_sets(
    direction=None,
    measures=None,
    selection=None,
):
    """
    Extracts identifiers of genes from selection by modality measures and
    selects those that belong to at least one, two, or three of the sets
    (as determined by "utility.select_elements_by_sets").

    arguments:
        direction (str): direction of distribution from which to select, lesser
            or greater
        measures (list<str>): measures of modality
        selection (dict): selections of genes, with lookup by measure, then
            direction, then "genes"

    raises:

    returns:
        (dict): genes selected by counts of sets ("measures_1", "measures_2",
            "measures_3") and the sets of genes for each measure
            ("sets_genes_measures")

    """

    # Organize sets of genes.
    sets = {
        measure: selection[measure][direction]["genes"]
        for measure in measures
    }

    # Select genes that pass filters by multiple measures of bimodality.
    counts = (1, 2, 3)
    genes_by_count = dict()
    for count in counts:
        genes_by_count[count] = utility.select_elements_by_sets(
            names=measures,
            sets=sets,
            count=count,
        )

    # Summarize information.
    utility.print_terminal_partition(level=2)
    print("Selection of genes by: " + direction)
    for count in counts:
        print(
            "... any " + str(count) + " sets: " +
            str(len(genes_by_count[count]))
        )

    # Collect and return information.
    return {
        "measures_1": genes_by_count[1],
        "measures_2": genes_by_count[2],
        "measures_3": genes_by_count[3],
        "sets_genes_measures": sets,
    }
Exemplo n.º 8
0
def collect_report_ontology_parentage_orphan_genes(
    cluster_reports=None,
    genes_query=None,
    report=None,
):
    """
    Collects genes from the sets in all cluster reports and counts the genes
    from the query that do not appear in any set (orphans).

    The function only prints a report; it does not return the collections.
    Notice that the index of each cluster's report table is renamed to "set"
    in place.

    arguments:
        cluster_reports (dict): reports for each cluster; each entry has a
            Pandas data frame under key "report" with a "Genes" field of
            identifiers separated by ", "
        genes_query (list<str>): identifiers of genes in original enrichment
            query
        report (bool): whether to print reports

    raises:

    returns:

    """

    # Collect genes across the sets of all clusters.
    genes_collection = list()
    for cluster in cluster_reports.values():
        data_report = cluster["report"]
        data_report.rename_axis(
            index="set",
            axis="index",
            copy=False,
            inplace=True,
        )
        records = utility.convert_dataframe_to_records(data=data_report)
        for record in records:
            # Split the text list of the set's genes into identifiers.
            genes_collection.extend(record["Genes"].split(", "))
    # Collect unique genes from parent.
    genes_parentage = utility.collect_unique_elements(
        elements_original=genes_collection,
    )
    # Collect orphan genes: those in the query that belong to no set.
    genes_orphan = [
        gene for gene in genes_query if gene not in genes_parentage
    ]
    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print("unique parentage and orphan genes")
        print(f"parentage genes: {len(genes_parentage)}")
        print(f"orphan genes: {len(genes_orphan)}")
        utility.print_terminal_partition(level=2)
Exemplo n.º 9
0
def organize_cohort_gene_components(
    cohort=None,
    paths=None,
    report=None,
):
    """
    Reads source information for a cohort and organizes genes' signals
    across persons in preparation for principal component analysis.

    The calculation of principal components and the write of products to
    file are currently disabled (branch under "if False").

    arguments:
        cohort (str): cohort of persons--selection, respiration, or ventilation
        paths (dict<str>): collection of paths to directories for procedure's
            files
        report (bool): whether to print reports

    raises:

    returns:

    """

    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print("cohort: " + cohort)

    # Read source information from file.
    source = read_source_cohort_gene_components(
        cohort=cohort,
        dock=paths["dock"],
    )
    # Organize data for principal component analysis.
    data_signals_genes_persons = organize_data_cohort_multimodal_genes_signals(
        data_signals_genes_persons=source["data_signals_genes_persons"],
        report=report,
    )

    # Disabled: calculation and storage of principal components.
    if False:
        # Calculate principal components on genes across persons.
        pail = calculate_multimodal_genes_signals_persons_components(
            genes=source["genes_candidacy"]["multimodal"],
            data_signals_genes_persons=data_signals_genes_persons,
            report=report,
        )
        # Compile information.
        information = {
            "data_persons_genes_components": (
                pail["data_observations_components"]
            ),
            "data_persons_genes_variances": (
                pail["data_components_variances"]
            ),
        }
        # Write information to file.
        write_product_cohort_gene_components(
            cohort=cohort,
            information=information,
            paths=paths,
        )
Exemplo n.º 10
0
def organize_differential_expression_data_sets(
    comparisons=None,
    data_samples_tissues_patients=None,
    data_gene_count=None,
):
    """
    Organizes one data set for differential expression analysis for each
    major category of tissue in the comparisons.

    Prints the "tissue" entry of each data set after collection.

    arguments:
        comparisons (dict<list<str>>): Minor categories to compare for each
            major category of tissue
        data_samples_tissues_patients (object): Pandas data frame of patients
            and tissues for all samples
        data_gene_count (object): Pandas data frame of genes' counts for all
            samples

    raises:

    returns:
        (list<dict>): Collections of data sets for differential expression
            analyses

    """

    # Report.
    utility.print_terminal_partition(level=2)
    print(
        "Organization of data sets for differential gene expression "
        "comparison of minor categories of tissues."
    )

    # Collect one data set for each major tissue.
    sets = [
        organize_differential_expression_data_set(
            tissue_major=tissue_major,
            tissues_minor=comparisons[tissue_major],
            data_samples_tissues_patients=data_samples_tissues_patients,
            data_gene_count=data_gene_count,
        )
        for tissue_major in list(comparisons.keys())
    ]

    # Report.
    utility.print_terminal_partition(level=2)
    print("Data sets by major tissues:")
    for data_set in sets:
        print(data_set["tissue"])
    return sets
Exemplo n.º 11
0
def determine_gene_study_comparison_value(
    gene_identifier=None,
    comparison=None,
    data_study=None,
    warn=None,
):
    """
    Determines a gene's value of fold change for a comparison in a study.

    The function that calls this function already verifies that the study
    includes the gene and the comparison.

    If the gene has multiple distinct records with valid values, then the
    value is the mean of those values. If the gene has no valid value, then
    the value is missing (nan). The study's table is not modified.

    arguments:
        gene_identifier (str): unique identifier of a gene
        comparison (str): name of a comparison
        data_study (object): Pandas data frame of information about comparisons
            across genes in a study
        warn (bool): whether to print warnings

    raises:

    returns:
        (float): gene's value of fold change for a comparison in a study

    """

    # Select study's information for gene.
    # Selection by a Boolean mask and removal of duplicates both return new
    # data frames, so it is not necessary to copy the entire study's table
    # for every gene.
    data_study_gene = data_study.loc[
        data_study["identifier"] == gene_identifier, :
    ].drop_duplicates(
        subset=None,
        keep="first",
    )
    # Determine valid, non null values of the gene's fold change.
    values_valid = [
        value for value in data_study_gene[comparison].to_list()
        if not math.isnan(value)
    ]
    if len(values_valid) > 1:
        value = statistics.mean(values_valid)
        if warn:
            utility.print_terminal_partition(level=3)
            print(
                "warning: gene has multiple fold change values for a single " +
                "study and comparison: " + gene_identifier)
    elif len(values_valid) == 1:
        value = values_valid[0]
    else:
        value = float("nan")
    # Return information.
    return value
Exemplo n.º 12
0
def read_source(
    path_dock=None,
    report=None,
):
    """
    Reads source tables from tab-delimited files within the dock directory.

    Reads the reference table of metabolites (Shin 2014) and the table of
    metabolites' heritabilities, each with its first line as header.

    arguments:
        path_dock (str): path to dock directory for source and product
            directories and files
        report (bool): whether to print reports

    raises:

    returns:
        (dict<object>): Pandas data frames under keys
            "table_reference_shin_2014" and "table_metabolite_heritabilities"

    """

    # Specify directories and files.
    directory_reference = os.path.join(
        path_dock, "metabolite_reference", "24816252_shin_2014"
    )
    path_reference = os.path.join(
        directory_reference, "table_metabolite_reference.tsv"
    )
    directory_heritability = os.path.join(
        path_dock, "heritability_correlation_2021-04-12"
    )
    path_heritabilities = os.path.join(
        directory_heritability, "table_shin_2014_heritabilities.tsv"
    )

    # Read information from file.
    table_reference = pandas.read_csv(path_reference, sep="\t", header=0)
    table_heritabilities = pandas.read_csv(
        path_heritabilities, sep="\t", header=0
    )

    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print("report from read_source()")
        print(table_reference)
        utility.print_terminal_partition(level=2)
    # Compile and return information.
    pail = dict()
    pail["table_reference_shin_2014"] = table_reference
    pail["table_metabolite_heritabilities"] = table_heritabilities
    return pail
Exemplo n.º 13
0
def execute_procedure(dock=None, count=None):
    """
    Function to execute module's main behavior.

    Removes the previous "shuffle" directory within the dock, reads source
    information, creates shuffle indices, and writes them to file.

    arguments:
        dock (str): path to root or dock directory for source and product
            directories and files
        count (int): count of shuffles to create and store

    raises:

    returns:

    """

    # Remove previous files to avoid version or batch confusion.
    utility.remove_directory(path=os.path.join(dock, "shuffle"))

    # Read source information from file.
    source = read_source(dock=dock)
    tissues_selection = source["tissues_selection"]
    persons_selection = source["persons_selection"]

    # Report.
    utility.print_terminal_partition(level=3)
    print(
        f"Creating {count!s} shuffles for matrices of dimension zero: "
        f"{tissues_selection!s} by dimension one: {persons_selection!s}. "
        "Notice that shuffles occur across dimension one (tissues for each "
        "person)."
    )
    print(
        "Hence, values will stay matched to their respective tissues, but "
        "they will be shuffled with respect to persons."
    )
    utility.print_terminal_partition(level=3)

    # Create shuffle indices.
    shuffles = create_shuffle_indices(
        count=count,
        dimension_zero=tissues_selection,
        dimension_one=persons_selection,
    )

    # Compile information and write product information to file.
    write_product(dock=dock, information={"shuffles": shuffles})
Exemplo n.º 14
0
def organize_genes_heritability_data(
    data_genes_heritability=None,
    report=None,
):
    """
    Organize data summarizing genes' heritabilities.

    Selects and orders the columns of interest and sorts records by ascending
    probability. The original data frame is not modified.

    arguments:
        data_genes_heritability (object): Pandas data frame of genes'
            heritabilities
        report (bool): whether to print reports

    raises:

    returns:
        (object): Pandas data frame of genes' heritabilities

    """

    # Specify columns of interest in their order for the product.
    columns = [
        "name",
        "proportion",
        "count",
        "probability",
        "probability_log",
        "discovery",
        "discovery_log",
        "significance",
        "error",
        "confidence_95_interval",
        "confidence_95_low",
        "confidence_95_high",
        "residual",
        "genotype",
        "phenotype",
    ]
    # Selection of columns and sort both return new data frames.
    # Sorting in place on the selection would raise Pandas'
    # "SettingWithCopyWarning", and a preliminary deep copy of the whole
    # table is unnecessary.
    data = data_genes_heritability[columns].sort_values(
        by=["probability"],
        axis="index",
        ascending=True,
    )
    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print("data after organization of columns")
        print(data)
    return data
Exemplo n.º 15
0
def organize_cohort_components_regressions(
    cohort=None,
    paths=None,
    report=None,
):
    """
    Reads source information for a cohort, regresses across components with
    the cohort's variables from the hypothesis model, and writes the product
    to file.

    Notice that the regression procedure always receives "report=True",
    regardless of this function's "report" argument.

    arguments:
        cohort (str): cohort of persons--selection, respiration, or ventilation
        paths (dict<str>): collection of paths to directories for procedure's
            files
        report (bool): whether to print reports

    raises:

    returns:

    """

    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print("cohort: " + cohort)
    # Read source information from file.
    source = read_source_cohort_components_regressions(
        cohort=cohort,
        dock=paths["dock"],
    )
    # Define variables for regression models.
    variables_regression = (
        selection.define_variables()[cohort]["model_hypothesis"]
    )
    # Organize data and regress across components.
    bin_regression = organize_data_regress_cases_report(
        variables_regression=variables_regression,
        data_persons_properties=source["data_persons_properties"],
        data_persons_genes_components=source["data_persons_genes_components"],
        data_persons_genes_variances=source["data_persons_genes_variances"],
        threshold_discovery=0.05,
        discovery=False,
        report=True,
    )
    # Write information to file.
    write_product_cohort_components_regressions(
        cohort=cohort,
        information=bin_regression,
        paths=paths,
    )
Exemplo n.º 16
0
def validate_report_selection_thresholds(
    measures=None,
    selection=None,
    genes_scores=None,
):
    """
    Validates thresholds from selection of genes with least and greatest values
        of measures of modality.

    For each measure and direction, the validation value is the most extreme
    score among selected genes on the side of the threshold: the maximum for
    "least" and the minimum for "greatest". The value is stored in the
    selection (in place) under "threshold_validation" and printed alongside
    the threshold.

    arguments:
        measures (list<str>): measures of modality
        selection (dict): selections of genes
        genes_scores (dict): information about genes' measures of modality

    raises:
        (ValueError): if a selection of genes is empty

    returns:
        (dict): information about selection of genes

    """

    utility.print_terminal_partition(level=2)
    print(
        "Validation of thresholds for selection of unimodal and multimodal "
        "genes."
    )
    # Pair each direction with the function that finds the selected score
    # nearest to the threshold.
    extremes = (("least", max), ("greatest", min))
    # Iterate on measures of modality.
    for measure in measures:
        for direction, find_extreme in extremes:
            bin_direction = selection[measure][direction]
            # Collect values of measure for selection of genes.
            values = [
                genes_scores[gene][measure] for gene in bin_direction["genes"]
            ]
            validation = find_extreme(values)
            bin_direction["threshold_validation"] = validation
            threshold = bin_direction["threshold"]
            utility.print_terminal_partition(level=3)
            print("measure: " + measure)
            print("direction: " + direction)
            print(f"threshold: {round(threshold, 5)}")
            print(f"validation: {round(validation, 5)}")
    # Return information.
    return selection
Exemplo n.º 17
0
def select_samples(tissues=None, persons=None, data_gene_signal=None):
    """
    Selects samples of interest for further analyses.

    Selects records first by persons and then by tissues of interest, and
    prints the count of records after each selection. The original data frame
    is not modified; every step produces a new data frame.

    arguments:
        tissues (list<str>): Tissues of interest.
        persons (list<str>): persons of interest.
        data_gene_signal (object): Pandas data frame of genes' signals for all
            samples, tissues, and persons, with index levels "person",
            "tissue", and "sample".

    raises:

    returns:
        (object): Pandas data frame of genes' signals for samples from persons
            and tissues of interest, with index levels "person", "tissue",
            and "sample".

    """

    # Select samples from persons and tissues of interest.
    utility.print_terminal_partition(level=2)
    print("Selection of samples from persons and tissues of interest.")
    print("count of samples, original: " + str(data_gene_signal.shape[0]))
    # Avoid operations in place so that the caller's data frame keeps its
    # original index.
    data = data_gene_signal.reset_index(level=["person", "tissue", "sample"])
    # Select by persons.
    data = data.set_index(["person"], append=False, drop=True)
    data = data.loc[persons, :]
    print("count of samples from persons of interest: " + str(data.shape[0]))
    data = data.reset_index(level=["person"])
    # Select by tissues.
    data = data.set_index(["tissue"], append=False, drop=True)
    data = data.loc[tissues, :]
    print("count of samples from tissues of interest: " + str(data.shape[0]))
    data = data.reset_index(level=["tissue"])
    # Restore the original levels of the index.
    data = data.set_index(
        ["person", "tissue", "sample"],
        append=False,
        drop=True,
    )
    return data
Exemplo n.º 18
0
def find_intersection_heritability_genes(
    genes_selection=None,
    genes_distribution=None,
    genes_heritability_complete=None,
    path_genes=None,
    report=None,
):
    """
    Determines genes from selection that also have valid heritability
    measurements.

    arguments:
        genes_selection (list<str>): identifiers of genes from selection
            procedure
        genes_distribution (list<str>): identifiers of genes with valid
            pan-tissue signal distributions (not used within this function)
        genes_heritability_complete (list<str>): identifiers of genes for which
            the heritability procedure completed
        path_genes (str): path to heritability genes directory
        report (bool): whether to print reports

    raises:

    returns:
        (list<str>): identifiers of genes of interest from selection that also
            have valid heritability measurements

    """

    # Determine genes for which heritability analysis converged successfully.
    genes_valid = collect_successful_genes(
        genes=genes_heritability_complete,
        path_genes=path_genes,
    )
    # Determine intersection genes of interest.
    genes_interest = utility.filter_common_elements(
        list_one=genes_selection,
        list_two=genes_valid,
    )
    # Return early when there is no report to print.
    if not report:
        return genes_interest
    # Report.
    utility.print_terminal_partition(level=2)
    print(
        f"genes of interest with valid heritabilities: {len(genes_interest)}"
    )
    utility.print_terminal_partition(level=2)
    return genes_interest
Exemplo n.º 19
0
def select_samples_genes(
    persons=None,
    tissues=None,
    data_gene_annotation=None,
    data_gene_signal=None,
):
    """
    Selects samples and genes of interest for further analyses.

    Applies three selections in sequence: samples from persons and tissues of
    interest, genes with detectable signal, and genes that encode proteins.

    arguments:
        persons (list<str>): persons of interest.
        tissues (list<str>): Tissues of interest.
        data_gene_annotation (object): Pandas data frame of genes' annotations.
        data_gene_signal (object): Pandas data frame of genes' signals for all
            samples, tissues, and persons.

    raises:

    returns:
        (object): Pandas data frame of genes' signals for all samples, tissues,
            and persons.

    """

    utility.print_terminal_partition(level=1)
    print("Selection of samples and genes of interest.")

    # Select samples from persons and tissues of interest.
    data_samples = select_samples(
        persons=persons,
        tissues=tissues,
        data_gene_signal=data_gene_signal,
    )
    # Select genes with detectable, non-zero signal in tissues and persons of
    # interest.
    data_detection = select_genes_detection(data_gene_signal=data_samples)
    # Select genes that encode proteins.
    data_protein = select_genes_protein(
        data_gene_annotation=data_gene_annotation,
        data_gene_signal=data_detection,
    )
    return data_protein
Exemplo n.º 20
0
def check_missing_values(data=None):
    """
    Checks data for missing values and prints reports.

    Prints the shape of the original data and the shapes that remain after
    dropping missing values along axis 0 and along axis 1. The function only
    prints; it does not change or return the data.

    arguments:
        data (object): Pandas data frame of genes' signals for all samples.

    raises:

    returns:

    """

    utility.print_terminal_partition(level=2)
    print("Check for missing values in genes' signals.")
    print(f"shape of original data frame: {data.shape}")
    for axis in (0, 1):
        shape_complete = data.dropna(axis=axis).shape
        print(f"shape without missing axis {axis}: {shape_complete}")
Exemplo n.º 21
0
def collect_studies_unique_gene_identifiers(
    bin_studies=None,
    report=None,
):
    """
    Collects unique identifiers of genes from all studies.

    Only identifiers that contain the text "ENSG" are valid. The report
    prints the count of valid unique identifiers for each study.

    arguments:
        bin_studies (dict): collection of information about each study; each
            entry has a Pandas data frame under key "data" with an
            "identifier" column
        report (bool): whether to print reports

    raises:

    returns:
        (list<str>): unique identifiers of genes from all studies

    """

    def select_valid(identifiers=None):
        # Keep identifiers whose text representation contains "ENSG".
        return [
            identifier for identifier in identifiers
            if "ENSG" in str(identifier)
        ]

    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print("unique genes from each study")

    genes_collection = []
    for study, bin_study in bin_studies.items():
        genes_study = utility.collect_unique_elements(
            elements_original=bin_study["data"]["identifier"].to_list()
        )
        genes_study_valid = select_valid(identifiers=genes_study)
        # Collect all of the study's identifiers; the filter for valid
        # identifiers applies again to the whole collection.
        genes_collection.extend(genes_study)
        # Report.
        if report:
            print("study " + study + " : " + str(len(genes_study_valid)))
    # Select valid identifiers and collect unique identifiers.
    genes_unique = utility.collect_unique_elements(
        elements_original=select_valid(identifiers=genes_collection)
    )
    return genes_unique
Exemplo n.º 22
0
def organize_multimodal_genes_signals_persons_components(
    genes=None,
    data_signals_genes_persons=None,
    report=None,
):
    """
    Organizes a principal components analysis on genes' pan-tissue signals as
    features across persons as instances.

    Only columns whose names occur in "genes" enter the analysis. The count of
    components is the smallest of the count of requested genes, the count of
    persons (rows), and the count of genes actually present in the data
    (columns), since a decomposition cannot yield more components than it has
    features or instances.

    arguments:
        genes (list<str>): identifiers of genes
        data_signals_genes_persons (object): Pandas data frame of genes'
            pan-tissue signals across persons, with genes across columns and
            persons across rows
        report (bool): whether to print reports

    raises:

    returns:
        (object): result of utility.calculate_principal_components() for the
            selection of genes (NOTE(review): structure is defined by that
            function, which is not visible here)

    """

    # Copy data.
    data_signals = data_signals_genes_persons.copy(deep=True)
    # Select genes of interest.
    data_selection = data_signals.loc[:, data_signals.columns.isin(genes)]
    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print("Selection of genes with pan-tissue signals across persons.")
        utility.print_terminal_partition(level=3)
        print(data_selection)
    # Reduce dimensionality.
    # Requested genes that are absent from the data (or repeated in the list)
    # do not contribute features, so bound the count of components by the
    # actual dimensions of the selection and not only by the length of the
    # list.
    count_persons, count_genes = data_selection.shape
    components = min(int(len(genes)), int(count_persons), int(count_genes))
    result = utility.calculate_principal_components(
        data=data_selection,
        components=components,
        report=report,
    )
    # Return information.
    return result
Exemplo n.º 23
0
def select_heritable_genes(
    data_genes_heritability=None,
    threshold_proportion=None,
    threshold_probability=None,
    report=None,
):
    """
    Selects genes whose heritability estimates pass both thresholds.

    A gene passes when its "proportion" is at least the proportion threshold
    and its "probability" is at most the probability threshold. The source
    data frame is not modified.

    arguments:
        data_genes_heritability (object): Pandas data frame of genes'
            heritabilities, with genes' identifiers in the index and columns
            "proportion" and "probability"
        threshold_proportion (float): threshold by proportion of phenotypic
            variance attributable to genotype
        threshold_probability (float): threshold by probability of heritability
            estimate
        report (bool): whether to print reports

    raises:

    returns:
        (list<str>): identifiers of heritable genes

    """

    # Selection by mask returns a new data frame and nothing here writes to
    # the source, so a defensive deep copy is unnecessary.
    data = data_genes_heritability
    # Set thresholds.
    pass_proportion = (data["proportion"] >= threshold_proportion)
    pass_probability = (data["probability"] <= threshold_probability)
    # Extract identifiers of genes.
    genes = data.loc[pass_proportion & pass_probability].index.to_list()
    # Report.
    if report:
        count_total = data.shape[0]
        # An empty table has no meaningful percentage; report zero instead of
        # dividing by zero.
        if count_total > 0:
            percentage = round((len(genes) / count_total) * 100, 2)
        else:
            percentage = 0.0
        utility.print_terminal_partition(level=2)
        print("count of 'heritable' genes': " + str(len(genes)) + " (" +
              str(percentage) + " %)")
    # Return information.
    return genes
Exemplo n.º 24
0
def filter_heritabilities_confidence(
    data_genes_heritability=None,
    threshold=None,
    report=None,
):
    """
    Filters genes' heritabilities by the confidence interval of the estimate.

    Each value in column "confidence_95_interval" is evaluated by
    determine_confidence_threshold_pass(); only rows for which that evaluation
    equals True remain. The result carries an additional column "threshold"
    with the outcome of the evaluation. The source data frame is not modified.

    arguments:
        data_genes_heritability (object): Pandas data frame of genes'
            heritabilities
        threshold (float): maximal confidence interval
        report (bool): whether to print reports

    raises:

    returns:
        (object): Pandas data frame of genes' heritabilities

    """

    def evaluate(value):
        return determine_confidence_threshold_pass(
            value=value,
            threshold=threshold,
        )

    # Copy data so that the new column does not reach the source.
    data = data_genes_heritability.copy(deep=True)
    # Organize data.
    data["threshold"] = data["confidence_95_interval"].apply(evaluate)
    # Compare explicitly to True so that any value other than True (such as a
    # missing value) excludes the row.
    selection = (data["threshold"] == True)
    data_confidence = data.loc[selection, :]
    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print("data after filter by confidence interval")
        print(data_confidence)
        utility.print_terminal_partition(level=2)
        print("count of candidate genes': " + str(data_confidence.shape[0]))
    # Return information.
    return data_confidence
Exemplo n.º 25
0
def translate_study_genes_identifiers(
    bin_studies=None,
    data_gene_annotation=None,
    translations_genes=None,
    report=None,
):
    """
    Translates genes' names from all studies to Ensembl identifiers.

    Works on a deep copy of the collection of studies. Any study whose data
    lack a column "identifier" receives one, derived from column "name" by
    assembly.translate_gene_name_to_identifier(). Studies that already have
    the column pass through without change.

    arguments:
        bin_studies (dict): collection of information about each study
        data_gene_annotation (object): Pandas data frame of genes' annotations
        translations_genes (dict<str>): pairwise custom translations of genes'
            names to Ensembl identifiers, see
            assembly.read_source_gene_name_identifier_translations()
        report (bool): whether to print each study's data after translation

    raises:

    returns:
        (dict): collection of information about each study with genes'
            identifiers

    """

    def translate(gene_name):
        return assembly.translate_gene_name_to_identifier(
            name=gene_name,
            data_gene_annotation=data_gene_annotation,
            translations_genes=translations_genes,
        )

    # These partitions and messages print regardless of the report flag.
    # NOTE(review): presumably the translation function prints the names that
    # do not match between these messages -- confirm in module "assembly".
    utility.print_terminal_partition(level=2)
    print("translating genes' names to Ensembl identifiers...")
    print("following genes' names do not match...")
    utility.print_terminal_partition(level=2)
    # Copy the collection so that the source studies stay intact.
    studies = copy.deepcopy(bin_studies)
    # Iterate on studies.
    for record in studies.values():
        data_study = record["data"]
        # Skip any study that already includes genes' identifiers.
        if "identifier" in data_study.columns.to_list():
            continue
        data_study["identifier"] = data_study["name"].apply(translate)
        record["data"] = data_study
        # Report.
        if report:
            print(data_study)
    utility.print_terminal_partition(level=2)
    print("end translation...")
    utility.print_terminal_partition(level=2)
    return studies
Exemplo n.º 26
0
def check_zero_samples(data=None):
    """
    Reports how many samples have a signal of 0 for every gene.

    Prints the shape of the original data frame and the shape that remains
    after keeping only the columns (samples) with at least one value different
    from 0. The data frame itself is not changed.

    arguments:
        data (object): Pandas data frame of genes' signals for all samples.

    raises:

    returns:

    """

    utility.print_terminal_partition(level=2)
    print("Check for samples with values of 0 for all genes' signals.")
    print("shape of original data frame: {}".format(data.shape))
    # A sample is detectable when any value down its column differs from 0.
    samples_detected = data.ne(0).any(axis="index")
    shape_detected = data.loc[:, samples_detected].shape
    print(
        "shape of data frame without zero samples: {}".format(shape_detected)
    )
Exemplo n.º 27
0
def summarize_measures_thresholds(
    measures=None,
    scores=None,
    thresholds=None,
):
    """
    Prints a summary of the distribution and thresholds for each measure of
    bimodality.

    arguments:
        measures (list<str>): measures of bimodality
        scores (dict<dict>): information about genes' measures of bimodality,
            with entries "mean" and "deviation" for each measure
        thresholds (dict<dict>): values of thresholds for genes' measures of
            bimodality, with entries "lesser" and "greater" that each hold a
            value for each measure

    raises:

    returns:

    """

    utility.print_terminal_partition(level=2)
    for measure in measures:
        utility.print_terminal_partition(level=3)
        print("measure: " + str(measure))
        summary = scores[measure]
        for statistic in ("mean", "deviation"):
            print(statistic + ": " + str(summary[statistic]))
        for side in ("lesser", "greater"):
            print("threshold " + side + ": " + str(thresholds[side][measure]))
    utility.print_terminal_partition(level=2)
Exemplo n.º 28
0
def summarize_genes_samples_signals(
    genes_samples_signals=None,
    gene="ENSG00000231925",
):
    """
    Summarizes information about a single gene's samples and signals.

    Prints the count of genes in the collection and the data for the gene of
    interest, then counts the distinct groups of that gene's data by "person"
    and by "tissue_major".

    arguments:
        genes_samples_signals (dict<object>): collection, with genes'
            identifiers as keys, of Pandas data frames of each gene's signals
            across samples
        gene (str): identifier of the gene to summarize; defaults to the
            example gene that this summary has always used

    raises:
        (KeyError): if the collection has no entry for the gene

    returns:
        (dict): counts of persons and tissues

    """

    # Report.
    utility.print_terminal_partition(level=2)
    print("Count of data by genes: " + str(len(genes_samples_signals.keys())))
    print("Access data for a single gene.")
    utility.print_terminal_partition(level=2)
    data = genes_samples_signals[gene]
    print(data)
    utility.print_terminal_partition(level=2)
    print("Determine counts of persons and tissues.")
    print("Split gene's signals by person.")
    persons = len(data.groupby("person"))
    print("Count of groups by person: " + str(persons))
    print("Split gene's signals by major tissue category.")
    tissues = len(data.groupby("tissue_major"))
    print("Count of groups by tissue: " + str(tissues))
    # Return the counts that the contract of this function promises.
    return {
        "persons": persons,
        "tissues": tissues,
    }
Exemplo n.º 29
0
def drop_undetectable_genes(data=None):
    """
    Removes genes whose signal is 0 in every sample.

    A gene (row) remains when at least one value across its columns differs
    from 0. Prints a preview of at most the first ten rows and columns of the
    result along with its dimensions.

    arguments:
        data (object): Pandas data frame of genes' signals for all samples.

    raises:

    returns:
        (object): Pandas data frame of genes' signals for all samples.

    """

    utility.print_terminal_partition(level=2)
    print("Drop genes that are undetectable.")
    # A gene is detectable when any value across its row differs from 0.
    genes_detected = data.ne(0).any(axis="columns")
    data_signal = data.loc[genes_detected, :]
    print("Data without undetectable genes.")
    preview = data_signal.iloc[:10, :10]
    print(preview)
    print("data dimensions: {}".format(data_signal.shape))
    return data_signal
Exemplo n.º 30
0
def calculate_report_gene_sample_principal_components(
    data=None,
    data_samples_factors=None,
    components=None,
):
    """
    Calculates the principal components for genes as features and samples as
    observations, and associates the samples' components to their factors.

    Signals pass through normalize_standardize_gene_signal() before the
    analysis. Intermediate data and results print to the terminal.

    arguments:
        data (object): Pandas data frame of signals with features across rows
            and observations across columns
        data_samples_factors (object): Pandas data frame of factors for each
            sample
        components (int): count of principal components; 10 if not specified

    raises:

    returns:
        (dict<object>): collection of information with entries
            "data_component_variance", "data_sample_component", and
            "data_factor_component"

    """

    # Honor the requested count of components. Fall back to the count of 10
    # that this function used before it respected the argument.
    count_components = 10 if components is None else components

    # Describe variance across categories of tissues.
    # Normalize and standardize genes' signals for principal components.
    data_normal_standard = normalize_standardize_gene_signal(
        data_gene_signal=data
    )
    print("Data after normalization and standardization.")
    print(data_normal_standard)

    report = calculate_principal_components(
        data=data_normal_standard,
        components=count_components
    )
    utility.print_terminal_partition(level=2)
    print("Report from principal component analysis...")
    print("Explained variance by each principal component...")
    print(report["data_component_variance"])
    utility.print_terminal_partition(level=3)
    print(report["data_sample_component"])
    # Associate samples to major and minor tissue types.
    data_factor_component = assembly.associate_samples_persons_tissues(
        data_samples_tissues_persons=data_samples_factors,
        data_gene_sample=report["data_sample_component"],
    )
    utility.print_terminal_partition(level=3)
    print(data_factor_component)

    # Compile information.
    information = {
        "data_component_variance": report["data_component_variance"],
        "data_sample_component": report["data_sample_component"],
        "data_factor_component": data_factor_component,
    }
    # Return information.
    return information