Пример #1
0
def write_summary(out_path, genes_by_merged_signature, meta_genes,
                  global_variables, mde_dict):
    """Write the tab-delimited signature summary to ``Signature_summary.csv``.

    The file is created inside ``out_path``. It starts with a header line
    (``signature``, ``signature_size``, then the sample names returned by
    ``get_samples_ordered_by_order_list``), followed by one line per key of
    ``meta_genes``: a ``signature_<n>`` label, the number of genes stored for
    that key in ``genes_by_merged_signature`` and the meta gene values.
    Lines are ordered by gene count, largest first, and labels are numbered
    in that order starting at 1.

    The header and rows are written explicitly in sequence rather than being
    stored in a dict keyed by line number, so the output order does not
    depend on dict iteration order or on the Python 2 only ``iteritems``.
    """

    # collects (number of genes, meta gene values) for every signature
    rows = []
    for key in meta_genes:
        rows.append((len(genes_by_merged_signature[key]), meta_genes[key]))

    # largest signatures first; the sort is stable, so signatures with equal
    # sizes keep the order in which meta_genes yielded them
    rows.sort(key=lambda row: row[0], reverse=True)

    # creates the header
    sample_names = get_samples_ordered_by_order_list(
        mde_dict["order_list"], global_variables["samples_by_sample_groups"])
    header = "signature\tsignature_size\t" + "\t".join(sample_names) + "\n"

    # writes the summary: header first, then the signatures in sorted order
    with open(os.path.join(out_path, "Signature_summary.csv"), "w") as f:
        f.write(header)
        for index, (total_genes, meta_gene_values) in enumerate(rows, 1):
            f.write("signature_" + str(index) + "\t" + str(total_genes) +
                    "\t" + "\t".join(str(x) for x in meta_gene_values) +
                    "\n")
Пример #2
0
def add_ne_specific_parameters(global_variables, pr_dictionary):
    """Add the normalised expression workflow entries to ``pr_dictionary``.

    The ``ne_workflow.txt`` section header is registered through
    ``add_subsection_r``; the sample lists and the R code strings are then
    derived from ``global_variables`` and stored on the dictionary, which is
    returned.
    """

    def as_r_vector(items):
        # renders the items as an R character vector literal: c("a","b")
        return "c(\"" + "\",\"".join(items) + "\")"

    # adds the r subsection workflow type
    header_path = os.path.join(pr_dictionary["r_bin_path"], "section_header",
                               "ne_workflow.txt")
    pr_dictionary = add_subsection_r(header_path,
                                     "subsection_r_workflow_type",
                                     "section_header/ne_workflow.txt",
                                     pr_dictionary)

    # sample and sample group information for ne
    group_order = global_variables["sample_groups_default_order"]
    groups_by_column = global_variables["sample_groups_by_column"]
    samples_by_group = global_variables["samples_by_sample_groups"]
    column_names = global_variables["sample_sheet_column_names"]

    # only as many leading groups as the first sample sheet column holds
    first_column_size = len(groups_by_column[0])
    ordered_samples = get_samples_ordered_by_order_list(
        group_order[0:first_column_size], samples_by_group)

    # every (key, value) pair to store; values are evaluated top to bottom
    entries = (
        ("workflow_ID", "Normalised Expression"),
        ("sample_sheet_column_names", column_names),
        ("order_list", group_order),
        ("samples_ordered", ordered_samples),
        ("samples_r_string", as_r_vector(ordered_samples)),
        ("sample_groups_r_string", as_r_vector(group_order)),
        ("sample_groupings_r_string",
         get_r_string_sample_groupings(group_order, samples_by_group)),
        ("samples_by_sample_group_r_string",
         get_r_string_samples_by_sample_group(group_order,
                                              samples_by_group)),
        ("sample_groups_by_SS_column_r_string",
         get_r_string_sample_groups_by_SS_column(group_order,
                                                 groups_by_column)),
        ("sample_groupings_by_SS_column_r_string",
         get_r_string_sample_groupings_by_SS_column(
             group_order, groups_by_column, samples_by_group)),
        ("default_sample_colours_by_SS_column_r_string",
         get_r_string_default_samples_colours_by_SS_column(
             samples_by_group, group_order, groups_by_column)),
        ("default_sample_group_colours_by_SS_column_r_string",
         get_r_string_default_sample_group_colours_by_SS_column(
             samples_by_group, group_order, groups_by_column)),
        ("sample_sheet_column_names_r_string", as_r_vector(column_names)),
    )

    # updates the pr dictionary
    for entry_key, entry_value in entries:
        pr_dictionary[entry_key] = entry_value

    return pr_dictionary
Пример #3
0
def differential_expression_signature(global_variables, infile, out_path, pde_IDs, mpde_dict):
    """Build merged differential expression signatures and write them out.

    Reads every line of ``infile``, groups genes into signatures with
    ``get_genes_by_signature``, attaches expression data for the ordered
    samples, merges the signatures with ``merge_signatures`` and writes the
    results through ``write_data`` and ``write_summary``.

    ``mpde_dict["de_signatures"]`` is set to the list ``1..N``, where N is
    the number of merged signatures, and the updated ``mpde_dict`` is
    returned.
    """

    # reads the data file; the context manager closes the handle afterwards
    with open(infile) as handle:
        data = handle.readlines()

    # gets a dictionary of genes by signature
    genes_by_signature, signatures_by_gene = get_genes_by_signature(data, pde_IDs)

    # adds the expression data to the genes by signatures
    sample_list = get_samples_ordered_by_order_list(mpde_dict["order_list"], global_variables["samples_by_sample_groups"])
    genes_by_signature = get_expression_data(data, sample_list, genes_by_signature, signatures_by_gene)

    # merges the signatures
    genes_by_merged_signature, meta_genes = merge_signatures(genes_by_signature, mpde_dict, sample_list)

    # gets the signature numbers (for the report); list() keeps this a
    # concrete list on Python 3, where range() alone is a lazy object
    mpde_dict["de_signatures"] = list(range(1, len(genes_by_merged_signature) + 1))

    # writes the data out
    write_data(out_path, genes_by_merged_signature)
    write_summary(out_path, genes_by_merged_signature, meta_genes, global_variables, mpde_dict)

    # returns the updated mpde dict
    return mpde_dict
Пример #4
0
def add_Mde_specific_parameters(global_variables, pr_dictionary,
                                workflow_parameter_dict):
    """Add the Mde workflow entries to ``pr_dictionary`` and return it.

    The ``mde_workflow.txt`` section header is registered through
    ``add_subsection_r``. Sample lists, comparisons, signature numbers and
    the R code strings are then stored on the dictionary. When
    ``global_variables["ora_flag"]`` is truthy, the per-gene-set values from
    ``global_variables["ora_parameters"]`` are stored as parallel lists.
    """

    # adds the r subsection workflow type
    header_path = os.path.join(pr_dictionary["r_bin_path"], "section_header",
                               "mde_workflow.txt")
    pr_dictionary = add_subsection_r(header_path,
                                     "subsection_r_workflow_type",
                                     "section_header/mde_workflow.txt",
                                     pr_dictionary)

    # sample and sample group information for Mde
    group_order = workflow_parameter_dict["order_list"]
    samples_by_group = global_variables["samples_by_sample_groups"]
    ordered_samples = get_samples_ordered_by_order_list(
        group_order, samples_by_group)
    comparison_ids = workflow_parameter_dict["de_IDs"]

    # the signature numbers
    signature_numbers = workflow_parameter_dict["de_signatures"]

    # the R code strings
    r_samples = get_r_string_samples(ordered_samples)
    r_sample_groups = get_r_string_sample_groups(group_order)
    r_sample_groupings = get_r_string_sample_groupings(
        group_order, samples_by_group)
    r_samples_by_group = get_r_string_samples_by_sample_group(
        group_order, samples_by_group)
    r_default_colours = get_r_string_default_sample_colours(
        group_order, samples_by_group)
    r_comparisons = "c(\"" + "\",\"".join(comparison_ids) + "\")"

    # values copied straight from the workflow parameters
    for pr_key, source_key in (("workflow_ID", "mde_ID"),
                               ("signatures_scc", "signatures_scc")):
        pr_dictionary[pr_key] = workflow_parameter_dict[source_key]

    # values derived above
    for pr_key, value in (
            ("order_list", group_order),
            ("samples_ordered", ordered_samples),
            ("comparisons", comparison_ids),
            ("comparisons_r_string", r_comparisons),
            ("samples_by_sample_groups", samples_by_group),
            ("sample_groups_r_string", r_sample_groups),
            ("samples_r_string", r_samples),
            ("sample_groupings_r_string", r_sample_groupings),
            ("samples_by_sample_group_r_string", r_samples_by_group),
            ("default_samples_colours_r_string", r_default_colours),
            ("de_signatures", signature_numbers)):
        pr_dictionary[pr_key] = value

    # nothing more to add unless the hypergeometric gene sets are enabled
    if not global_variables["ora_flag"]:
        return pr_dictionary

    # (pr_dictionary key, gene set parameter key, collected values)
    ora_columns = [(pr_key, parameter_key, []) for pr_key, parameter_key in (
        ("hypergeom_gene_set_types", "type"),
        ("hypergeom_gene_set_min_set_sizes", "min_set_size"),
        ("hypergeom_gene_set_max_set_sizes", "max_set_size"),
        ("hypergeom_gene_set_p_thresholds", "p_threshold"),
        ("hypergeom_gene_set_fold_thresholds", "fold_threshold"),
        ("hypergeom_gene_set_network_overlap_ratios",
         "network_overlap_ratio"))]

    # one pass over the gene set parameters, filling every column
    for gene_set_parameters in global_variables["ora_parameters"]:
        for _, parameter_key, values in ora_columns:
            values.append(gene_set_parameters[parameter_key])

    for pr_key, _, values in ora_columns:
        pr_dictionary[pr_key] = values

    return pr_dictionary
Пример #5
0
def add_de_specific_parameters(global_variables, pr_dictionary,
                               workflow_parameter_dict):
    """Add the de workflow entries to ``pr_dictionary`` and return it.

    The ``de_workflow.txt`` section header is registered through
    ``add_subsection_r``. The comparison settings, sample lists, R code
    strings and the chromosome list read from ``de_annotated.csv`` are then
    stored on the dictionary. When ``global_variables["ora_flag"]`` or
    ``global_variables["ura_flag"]`` is truthy, the matching parameter dicts
    are stored as parallel lists.
    """

    def store_columns(target, parameter_dicts, fields):
        # for each (target key, parameter key) pair, collects that parameter
        # from every parameter dict in one pass and stores the list on target
        columns = [(target_key, parameter_key, [])
                   for target_key, parameter_key in fields]
        for parameter_dict in parameter_dicts:
            for _, parameter_key, values in columns:
                values.append(parameter_dict[parameter_key])
        for target_key, _, values in columns:
            target[target_key] = values

    # adds the r subsection workflow type
    header_path = os.path.join(pr_dictionary["r_bin_path"], "section_header",
                               "de_workflow.txt")
    pr_dictionary = add_subsection_r(header_path,
                                     "subsection_r_workflow_type",
                                     "section_header/de_workflow.txt",
                                     pr_dictionary)

    # sample and sample group information for de
    group_order = workflow_parameter_dict["order_list"]
    samples_by_group = global_variables["samples_by_sample_groups"]
    ordered_samples = get_samples_ordered_by_order_list(
        group_order, samples_by_group)

    # the R code strings
    r_samples = get_r_string_samples(ordered_samples)
    r_sample_groups = get_r_string_sample_groups(group_order)
    r_sample_groupings = get_r_string_sample_groupings(
        group_order, samples_by_group)
    r_samples_by_group = get_r_string_samples_by_sample_group(
        group_order, samples_by_group)
    r_default_colours = get_r_string_default_sample_colours(
        group_order, samples_by_group)
    r_comparisons = "c(\"" + workflow_parameter_dict["de_ID"] + "\")"

    # the list of chromosomes
    annotated_path = os.path.join(pr_dictionary["workflow_outpath"], "data",
                                  "de_annotated.csv")
    chromosomes = get_chromosome_list(annotated_path)

    # values copied straight from the workflow parameters
    for pr_key, source_key in (
            ("workflow_ID", "de_ID"),
            ("de_p_threshold", "p_threshold"),
            ("de_fold_threshold", "fold_threshold"),
            ("de_numerator_group", "numerator_group"),
            ("de_denominator_group", "denominator_group"),
            ("de_file_path", "de_file_path"),
            ("differential_expression_set_size",
             "differential_expression_set_size")):
        pr_dictionary[pr_key] = workflow_parameter_dict[source_key]

    # values derived above
    for pr_key, value in (
            ("order_list", group_order),
            ("samples_ordered", ordered_samples),
            ("comparisons_r_string", r_comparisons),
            ("samples_by_sample_groups", samples_by_group),
            ("sample_groups_r_string", r_sample_groups),
            ("samples_r_string", r_samples),
            ("sample_groupings_r_string", r_sample_groupings),
            ("samples_by_sample_group_r_string", r_samples_by_group),
            ("default_samples_colours_r_string", r_default_colours),
            ("chromosome_list", chromosomes)):
        pr_dictionary[pr_key] = value

    # the hypergeometric gene set types
    if global_variables["ora_flag"]:
        store_columns(pr_dictionary, global_variables["ora_parameters"], (
            ("hypergeom_gene_set_types", "type"),
            ("hypergeom_gene_set_min_set_sizes", "min_set_size"),
            ("hypergeom_gene_set_max_set_sizes", "max_set_size"),
            ("hypergeom_gene_set_p_thresholds", "p_threshold"),
            ("hypergeom_gene_set_fold_thresholds", "fold_threshold"),
            ("hypergeom_gene_set_network_overlap_ratios",
             "network_overlap_ratio")))

    # the ipa ureg types
    if global_variables["ura_flag"]:
        store_columns(pr_dictionary, global_variables["ura_parameters"], (
            ("ura_types", "type"),
            ("ura_min_set_sizes", "min_set_size"),
            ("ura_max_set_sizes", "max_set_size"),
            ("ura_zscore_thresholds", "zscore_threshold"),
            ("ura_p_thresholds", "p_threshold"),
            ("ura_fold_thresholds", "fold_threshold"),
            ("ura_overlap_ratios", "network_overlap_ratio")))

    return pr_dictionary