def out_validation(out_path_parameter, global_variables):
    """Validate the ``out`` command-line parameter.

    ``out_path_parameter`` is a comma separated list of ``name=value``
    sub-parameters. Every sub-parameter must contain exactly one ``=``, and a
    ``path=<directory>`` sub-parameter (name matched case-insensitively) is
    required. The directory is handed to ``new_directory`` as a validity
    check.

    Any problem is reported on stderr and the process exits with status 1.
    ``global_variables`` is accepted for interface compatibility and is not
    read here. Nothing is returned.
    """
    # required inputs
    out_path = None

    # checks the sub params
    for sub_param in out_path_parameter.split(","):

        # Tests if there are two parts to the sub-parameter
        parts = sub_param.split("=")
        if len(parts) != 2:
            sys.stderr.write(
                "Error: the out parameter is not in a valid format.\n")
            sys.exit(1)

        # Tests the path sub-parameter
        if sub_param.upper().startswith("PATH="):
            out_path = parts[1]

            # makes the outpath and checks if its valid. Exception (rather
            # than a bare except) so Ctrl-C is not reported as a bad path.
            try:
                new_directory(out_path)
            except Exception:
                sys.stderr.write(
                    "Error: the outpath is not valid. Does it look right to you?\n"
                )
                sys.exit(1)

    # tests if the required inputs have been supplied
    if out_path is None:
        sys.stderr.write(
            "Error: the out parameter is not in a valid format.\n")
        sys.exit(1)

    print("validated the out parameter")
def copy_shiny_files(global_variables):
    """Copy the static Shiny app files into ``<out_path>/shiny``.

    Reads ``SL_path`` and ``out_path`` from ``global_variables``. ``ui.r`` and
    ``global.r`` are copied from ``<SL_path>/shiny/app`` first, then the
    ``www`` folder is created and its three assets are copied.
    """
    app_dir = os.path.join(global_variables["SL_path"], "shiny", "app")
    shiny_dir = os.path.join(global_variables["out_path"], "shiny")

    # Adds the UI and the global script
    for script_name in ("ui.r", "global.r"):
        copyfile(
            os.path.join(app_dir, script_name),
            os.path.join(shiny_dir, script_name))

    # Adds the www folder and its assets
    new_directory(os.path.join(shiny_dir, "www"))
    for asset_name in ("sl2.gif", "sl2.png", "style.css"):
        copyfile(
            os.path.join(app_dir, "www", asset_name),
            os.path.join(shiny_dir, "www", asset_name))
def sub_directories(out_path, type, out_path_tag):
    """Create the folder tree that ends in a ``network_data`` folder.

    The tree is ``<out_path>/<type>/<out_path_tag>/network_data``. When
    ``out_path_tag`` is the literal string ``"NONE"`` the tag level is left
    out and ``network_data`` sits directly under ``<out_path>/<type>``.
    """
    new_directory(out_path)

    parent = os.path.join(out_path, type)
    new_directory(parent)

    # a real tag adds one more level above network_data
    if out_path_tag != "NONE":
        parent = os.path.join(parent, out_path_tag)
        new_directory(parent)

    new_directory(os.path.join(parent, "network_data"))
def biotype_folders(global_variables):
    """Create the per-biotype result folders under ``out_path``.

    ``all_genes`` is always created. When ``biotypes_flag`` is truthy, one
    more folder is created for each key of ``biotypes_dict``.
    """
    root = global_variables["out_path"]

    # makes the all genes directory
    new_directory(os.path.join(root, "all_genes"))

    if not global_variables["biotypes_flag"]:
        return

    # one directory per biotype
    for biotype_name in global_variables["biotypes_dict"]:
        new_directory(os.path.join(root, biotype_name))
def _copy_workflow_rdata(out_path, biotype, workflow_parts, missing_warning):
    """Copy one workflow's ``workflow.rdata`` into the Shiny rdata tree.

    ``workflow_parts`` is the list of path components that identifies the
    workflow below ``<out_path>/<biotype>``. ``missing_warning`` is printed
    when the copy fails with a file-system error.
    """
    shiny_workflow_dir = os.path.join(out_path, "shiny", "rdata", biotype,
                                      *workflow_parts)
    new_directory(shiny_workflow_dir)

    rdata_in_path = os.path.join(out_path, biotype, *(
        list(workflow_parts) + ["plots", "workflow.rdata"]))
    rdata_out_path = os.path.join(shiny_workflow_dir, "workflow.rdata")

    # Best effort: a missing file only produces a warning. EnvironmentError
    # covers IOError/OSError/shutil.Error on Python 2 and is OSError on 3.
    try:
        copyfile(rdata_in_path, rdata_out_path)
    except EnvironmentError:
        print(missing_warning)


def copy_rdata(biotype, global_variables):
    """Collect the workflow Rdata files of one biotype for the Shiny app.

    Creates ``<out_path>/shiny/rdata/<biotype>`` and then, for each enabled
    workflow kind (``normexp_flag``, ``pde_workflows_flag``,
    ``mpde_workflows_flag`` in ``global_variables``), copies
    ``<out_path>/<biotype>/<workflow>/plots/workflow.rdata`` to
    ``<out_path>/shiny/rdata/<biotype>/<workflow>/workflow.rdata``.
    A file that cannot be copied is reported with a warning and skipped.
    """
    out_path = global_variables["out_path"]
    new_directory(os.path.join(out_path, "shiny", "rdata", biotype))

    if global_variables["normexp_flag"]:
        _copy_workflow_rdata(
            out_path, biotype, ["normexp_workflow"],
            "Warning: the normexp workflow Rdata file is missing. It will be omitted from Shiny."
        )

    if global_variables["pde_workflows_flag"]:
        for pde_parameter_dict in global_variables["pde_parameters"]:
            pde_ID = pde_parameter_dict["pde_ID"]
            # folder names use underscores instead of spaces
            pde_ID_no_spaces = pde_ID.replace(" ", "_")
            _copy_workflow_rdata(
                out_path, biotype, ["pde_workflows", pde_ID_no_spaces],
                "Warning: the PDE workflow " + pde_ID +
                " Rdata file is missing. It will be omitted from Shiny.")

    if global_variables["mpde_workflows_flag"]:
        for mpde_dict in global_variables["mpde_parameters"]:
            mpde_ID = mpde_dict["mpde_ID"]
            # NOTE(review): unlike PDE IDs, MPDE IDs are used as folder names
            # without replacing spaces - confirm this matches the writer.
            _copy_workflow_rdata(
                out_path, biotype, ["mpde_workflows", mpde_ID],
                "Warning: the MPDE workflow " + mpde_ID +
                " Rdata file is missing. It will be omitted from Shiny.")
def get_rdata(global_variables):
    """Gather the Rdata files for the Shiny app.

    Creates ``<out_path>/shiny/rdata`` and runs ``copy_rdata`` for
    ``"all_genes"``. When ``biotypes_flag`` is truthy and ``biotypes_dict``
    holds more than one key, ``copy_rdata`` is also run for every biotype in
    sorted key order.
    """
    rdata_root = os.path.join(global_variables["out_path"], "shiny", "rdata")

    # does the work for "all genes"
    new_directory(rdata_root)
    copy_rdata("all_genes", global_variables)

    # does the work for the biotypes
    if not global_variables["biotypes_flag"]:
        return
    biotype_names = global_variables["biotypes_dict"].keys()
    if len(biotype_names) <= 1:
        return
    for biotype_name in sorted(biotype_names):
        copy_rdata(biotype_name, global_variables)
def build_server(global_variables, shiny_info):
    """Write ``<out_path>/shiny/server.r``.

    The file consists of a fixed server header, then ``shiny_info`` (a string
    written verbatim), then the contents of
    ``<SL_path>/shiny/app/server_end.r``. ``SL_path`` and ``out_path`` are
    read from ``global_variables``.
    """
    SL_path = global_variables["SL_path"]
    out_path = global_variables["out_path"]

    # builds the server
    shiny_dir = os.path.join(out_path, "shiny")
    new_directory(shiny_dir)

    with open(os.path.join(SL_path, "shiny", "app",
                           "server_end.r")) as server_end_file:
        server_in_file = server_end_file.readlines()

    # The with-block guarantees the script is flushed and closed on return
    # instead of whenever the handle happens to be garbage collected.
    with open(os.path.join(shiny_dir, "server.r"), "w") as server_out_file:
        server_out_file.write("###--- SERVER ---####\n")
        server_out_file.write("options(shiny.maxRequestSize=30*1024^2)\n")
        server_out_file.write("server <- function(input, output, session)\n")
        server_out_file.write("{\n\n")
        server_out_file.write(shiny_info)
        server_out_file.writelines(server_in_file)
def write_data(out_path, genes_by_merged_signature):
    """Write the gene IDs and gene symbols of every signature to text files.

    ``genes_by_merged_signature`` maps a signature to an iterable of
    ``"<gene id>\\t<gene symbol>"`` strings. Signatures are numbered from 1 in
    order of decreasing gene count. Signature *n* produces
    ``gene_IDs/signature_<n>_IDs.txt`` and
    ``gene_symbols/signature_<n>_symbols.txt`` under ``out_path``, with one
    entry per line.
    """
    ids_dir = os.path.join(out_path, "gene_IDs")
    symbols_dir = os.path.join(out_path, "gene_symbols")
    new_directory(out_path)
    new_directory(ids_dir)
    new_directory(symbols_dir)

    # sort the signatures by number of genes
    sorted_on_number_of_genes = sorted(
        genes_by_merged_signature,
        key=lambda k: len(genes_by_merged_signature[k]),
        reverse=True)

    # prints the results for each signature
    for signature_count, signature in enumerate(sorted_on_number_of_genes, 1):
        signature_ids = []
        signature_symbols = []
        for gene in genes_by_merged_signature[signature]:
            gene_fields = gene.split("\t")
            signature_ids.append(gene_fields[0] + "\n")
            signature_symbols.append(gene_fields[1] + "\n")

        file_stem = "signature_" + str(signature_count)
        # with-blocks close (and so flush) each file before the next is opened
        with open(os.path.join(symbols_dir, file_stem + "_symbols.txt"),
                  "w") as symbols_file:
            symbols_file.writelines(signature_symbols)
        with open(os.path.join(ids_dir, file_stem + "_IDs.txt"),
                  "w") as ids_file:
            ids_file.writelines(signature_ids)
def core_sub_directories(global_variables, out_path):
    """Create the standard ``data`` and ``plots`` folder layout in ``out_path``.

    ``global_variables`` is not read by this function.
    """
    new_directory(out_path)

    data_root = os.path.join(out_path, "data")
    new_directory(data_root)
    for data_folder in ("gene_IDs", "gene_symbols", "statistical_analysis",
                        "deciles", "quartiles"):
        new_directory(os.path.join(data_root, data_folder))

    new_directory(os.path.join(out_path, "plots"))
def _write_gene_list(file_path, genes):
    """Write ``genes`` to ``file_path``, newline separated, and close the file."""
    with open(file_path, "w") as gene_list_file:
        gene_list_file.write("\n".join(genes))


def undirectional_overlaps(mde_file_path, out_path, de_IDs,
                           overlap_statistics_list):
    """Write direction-agnostic overlaps for every ordered pair of DEs.

    ``mde_file_path`` is a tab-separated file with one header row; column 0
    holds the gene ID and column 1 the gene symbol. The significance column
    of each DE is located with ``get_Mde_columns_from_file`` and compared
    against the strings ``"True"`` / ``"False"``.

    For every ordered pair ``(de1, de2)`` with ``de1 != de2`` the folder
    ``<out_path>/undirectional/<de1>/<de2>`` (spaces in IDs replaced by
    underscores) receives six gene lists: IDs and symbols of the genes
    significant only in de1, only in de2, and in both.

    One statistics row per unordered pair (A vs B is the same as B vs A) is
    appended to ``overlap_statistics_list``, which is also returned.
    """
    # pairs that already have a row in the overlap stats
    reported_pairs = set()

    # infile
    with open(mde_file_path) as mde_handle:
        mde_file = mde_handle.readlines()

    # The rows do not depend on the pair being compared, so split them once.
    gene_rows = [line.rstrip().split("\t") for line in mde_file[1:]]
    background_size = len(mde_file) - 1

    # iterates through the pairwise combinations of des
    for de1_id in de_IDs:
        for de2_id in de_IDs:

            # excludes self comparing
            if de1_id == de2_id:
                continue

            de1_id_parsed = de1_id.replace(" ", "_")
            de2_id_parsed = de2_id.replace(" ", "_")

            # makes the directory
            pair_dir = os.path.join(out_path, "undirectional", de1_id_parsed,
                                    de2_id_parsed)
            new_directory(pair_dir)

            # gets the significance columns for the des
            _, _, _, de1_sig, _ = get_Mde_columns_from_file(mde_file, de1_id)
            _, _, _, de2_sig, _ = get_Mde_columns_from_file(mde_file, de2_id)

            # stores the genes in each group
            de1_unique_genes_IDs = []
            de1_unique_genes_symbols = []
            de2_unique_genes_IDs = []
            de2_unique_genes_symbols = []
            overlapping_genes_IDs = []
            overlapping_genes_symbols = []

            # gets the genes in each group
            for line_split in gene_rows:
                de1_flag = line_split[de1_sig]
                de2_flag = line_split[de2_sig]
                if de1_flag == "True" and de2_flag == "False":
                    de1_unique_genes_IDs.append(line_split[0])
                    de1_unique_genes_symbols.append(line_split[1])
                elif de1_flag == "False" and de2_flag == "True":
                    de2_unique_genes_IDs.append(line_split[0])
                    de2_unique_genes_symbols.append(line_split[1])
                elif de1_flag == "True" and de2_flag == "True":
                    overlapping_genes_IDs.append(line_split[0])
                    overlapping_genes_symbols.append(line_split[1])

            # outputs the gene lists (each file is closed as soon as written)
            _write_gene_list(
                os.path.join(pair_dir,
                             "IDs_" + de1_id_parsed + "_unique_genes.txt"),
                de1_unique_genes_IDs)
            _write_gene_list(
                os.path.join(pair_dir,
                             "symbols_" + de1_id_parsed + "_unique_genes.txt"),
                de1_unique_genes_symbols)
            _write_gene_list(
                os.path.join(pair_dir,
                             "IDs_" + de2_id_parsed + "_unique_genes.txt"),
                de2_unique_genes_IDs)
            _write_gene_list(
                os.path.join(pair_dir,
                             "symbols_" + de2_id_parsed + "_unique_genes.txt"),
                de2_unique_genes_symbols)
            _write_gene_list(
                os.path.join(pair_dir, "IDs_overlapping_genes.txt"),
                overlapping_genes_IDs)
            _write_gene_list(
                os.path.join(pair_dir, "symbols_overlapping_genes.txt"),
                overlapping_genes_symbols)

            # gets the overlap stats
            overlap_size = len(overlapping_genes_IDs)
            candidate_size = len(de1_unique_genes_IDs) + overlap_size
            gene_set_size = len(de2_unique_genes_IDs) + overlap_size
            obs_vs_exp, p_Pos, p_Neg = hypergeometric_test(
                background_size, candidate_size, gene_set_size, overlap_size)

            # updates the overlap stats (considers A vs B the same as B vs A)
            sorted_de = "\t".join(sorted([de1_id, de2_id]))
            if sorted_de not in reported_pairs:
                overlap_statistics_list.append([
                    de1_id, de2_id, background_size, candidate_size,
                    gene_set_size,
                    len(de1_unique_genes_IDs),
                    len(de2_unique_genes_IDs), overlap_size, obs_vs_exp, p_Neg
                ])
                reported_pairs.add(sorted_de)

    return overlap_statistics_list
# (template tag, pr_dictionary key) pairs, applied in this order.
_PARSE_LINE_VALUE_TAGS = (
    ("<*comparisons_list*>", "comparisons_r_string"),
    ("<*sample_sheet_column_names_list*>",
     "sample_sheet_column_names_r_string"),
    ("<*sample_groups_by_SS_column_list*>",
     "sample_groups_by_SS_column_r_string"),
    ("<*sample_groupings_by_SS_column_list*>",
     "sample_groupings_by_SS_column_r_string"),
    ("<*samples_list*>", "samples_r_string"),
    ("<*sample_group_list*>", "sample_groups_r_string"),
    ("<*sample_groupings_list*>", "sample_groupings_r_string"),
    ("<*samples_by_sample_group_list*>", "samples_by_sample_group_r_string"),
    ("<*default_sample_colours_list*>", "default_samples_colours_r_string"),
    ("<*default_sample_colours_by_SS_column_list*>",
     "default_sample_colours_by_SS_column_r_string"),
    ("<*default_sample_group_colours_by_SS_column_list*>",
     "default_sample_group_colours_by_SS_column_r_string"),
    ("<*working_directory*>", "workflow_outpath"),
)

# A line containing any of these section markers is replaced by "".
_PARSE_LINE_SECTION_TAGS = (
    "<*per_hypergeometric_gene_set*>",
    "<*/per_hypergeometric_gene_set*>",
    "<*per_ipa_ureg*>",
    "<*/per_ipa_ureg*>",
    "<*per_de_signature_hyper_gs*>",
    "<*/per_de_signature_hyper_gs*>",
)


def parse_line(line, pr_dictionary):
    """Expand the template tags in one line and return the result.

    * Value tags such as ``<*samples_list*>`` are replaced by the matching
      ``pr_dictionary`` entry. An entry is looked up only when its tag occurs
      in the line, so entries for unused tags may be absent.
    * A line containing a ``per_...`` section marker becomes ``""``.
    * For the first ``<*path*>a/b/c<*/path*>`` tag, the ``/`` separated path
      is re-joined with ``os.path.join``, and its parent folder is created
      below ``pr_dictionary["workflow_outpath"]`` via ``new_directory``.
    """
    # detects general tags and replaces them with the appropriate string
    for tag, dictionary_key in _PARSE_LINE_VALUE_TAGS:
        if tag in line:
            line = line.replace(tag, pr_dictionary[dictionary_key])

    # section markers only delimit template blocks; they are never emitted
    for tag in _PARSE_LINE_SECTION_TAGS:
        if tag in line:
            return ""

    # detects a path tag and converts to os friendly version, and makes a new folder.
    if "<*path*>" in line and "<*/path*>" in line:
        path = line.split("<*path*>")[1].split("<*/path*>")[0]
        path_parts = path.split("/")
        parsed_path = os.path.join(*path_parts)
        line = line.replace("<*path*>" + path + "<*/path*>", parsed_path)
        # Unpacking the parent components straight into this join also covers
        # a bare file name: there are none, so the folder is workflow_outpath
        # itself. Joining them separately first raised TypeError in that case.
        new_directory(
            os.path.join(pr_dictionary["workflow_outpath"], *path_parts[:-1]))

    return line
def sub_directories(out_path, type, out_path_tag):
    """Create ``<out_path>/<type>/<out_path_tag>/network_data`` level by level.

    ``new_directory`` is called once per level, outermost first.
    """
    current_dir = out_path
    new_directory(current_dir)
    for level_name in (type, out_path_tag, "network_data"):
        current_dir = os.path.join(current_dir, level_name)
        new_directory(current_dir)
def _enrichment_bias(observed, chromosome_background, total_background,
                     total_observed):
    """Return ``[log2fold, p]`` for one chromosome, or ``["NA", "NA"]``.

    The log2 fold compares ``observed`` with the count expected from the
    chromosome's share of the background (0.001 is added to both before the
    logarithm). The p-value comes from a two-sided Fisher exact test on
    ``[[total_background, chromosome_background],
    [total_observed, observed]]``.
    """
    try:
        log2fold = math.log(float(observed) + 0.001, 2) - math.log(
            (float(chromosome_background) / float(total_background)) *
            float(total_observed) + 0.001, 2)
        odds, p_value = scipy.stats.fisher_exact(
            [[float(total_background),
              float(chromosome_background)],
             [float(total_observed), float(observed)]],
            alternative='two-sided')
    except Exception:
        # e.g. a zero background; both cells are reported as unavailable
        return ["NA", "NA"]
    return [round(log2fold, 2), p_value]


def _direction_bias(significant, upregulated, downregulated,
                    ratio_upregulated, ratio_downregulated):
    """Return ``[swing, p]`` for one chromosome, or ``["NA", "NA"]``.

    ``swing`` is the excess of up-regulated genes over the number expected
    from the genome-wide up-regulated ratio, as a percentage of the
    chromosome's significant genes (a string ending in ``%``).
    """
    try:
        expected_upregulated = ratio_upregulated * float(significant)
        expected_downregulated = ratio_downregulated * float(significant)
        swing_difference = float(upregulated) - expected_upregulated
        swing = str(round(swing_difference / float(significant) * 100,
                          2)) + "%"
        odds, p_value = scipy.stats.fisher_exact(
            [[expected_upregulated, float(upregulated)],
             [expected_downregulated,
              float(downregulated)]],
            alternative='two-sided')
    except Exception:
        # e.g. no significant genes on this chromosome
        return ["NA", "NA"]
    return [swing, p_value]


def spatial_enrichment(global_variables, in_path, sample_groups, out_path,
                       type):
    """Write per-gene and per-chromosome spatial enrichment tables.

    ``in_path`` is a tab-separated file with one header row and the gene ID
    in column 0. The sample, coordinate and (for ``type == "PDE"``) PDE
    columns are located with the ``get_*_columns_from_file`` helpers.
    ``global_variables`` supplies ``samples_by_sample_groups`` and
    ``normexp_threshold``; a gene counts as expressed when its mean
    expression over the samples of ``sample_groups`` is >= that threshold.

    Two tab-separated files are written into ``out_path``:
    ``spatial_enrichment_gene_data.csv`` (one row per gene) and
    ``spatial_enrichment_summary.csv`` (one row per chromosome). Headers are
    written only for ``type`` ``"NORMEXP"`` or ``"PDE"``.

    Fixes relative to the previous implementation:

    * ``midpoint_coordinate`` is the floored mean of start and stop; it used
      to be ``(stop - start) / 2``, i.e. half the gene length.
    * NORMEXP summary rows contain exactly the five header columns (they used
      to carry the unused PDE counters and "NA" cells as well).
    * A failing statistic yields exactly two "NA" cells, so rows can no
      longer gain an extra column.
    * All files are closed deterministically; both outputs are written only
      after every row has been computed.
    """
    # positions of the per-chromosome counters
    TOTAL, EXPRESSED, VALID, POSITIVE, NEGATIVE, SIGNIFICANT, UP, DOWN = range(
        8)

    # makes the out folder:
    new_directory(out_path)

    # reads the infile
    with open(in_path) as in_handle:
        in_file = in_handle.readlines()

    # prepares the headers:
    genes_header = None
    summary_header = None
    if type == "NORMEXP":
        genes_header = [
            "gene_id", "mean_expression", "chromosome", "midpoint_coordinate"
        ]
        summary_header = [
            "chromosome", "total_genes", "expressed_genes",
            "expressed_genes_bias_log2fold", "expressed_genes_bias_p"
        ]
    elif type == "PDE":
        genes_header = [
            "gene_id", "mean_expression", "chromosome", "midpoint_coordinate",
            "log2fold", "p", "significant", "pde_valid"
        ]
        summary_header = [
            "chromosome", "total_genes", "expressed_genes", "pde_valid_genes",
            "positive_fold_genes", "negative_fold_genes", "significant_genes",
            "upregulated_genes", "downregulated_genes",
            "expressed_genes_bias_log2fold", "expressed_genes_bias_p",
            "significant_genes_bias_log2fold", "significant_genes_bias_p",
            "direction_bias_swing", "direction_bias_p"
        ]

    # gets a dictionary of the samples
    samples_by_sample_groups = global_variables["samples_by_sample_groups"]
    samples_dict = {}
    for sample_group in sample_groups:
        for sample in samples_by_sample_groups[sample_group]:
            samples_dict[sample] = True

    # gets the expression threshold
    expressed_threshold = global_variables["normexp_threshold"]

    # gets the column information for the infile
    sample_columns = get_sample_columns_from_file(samples_dict, in_file)
    coordinate_columns = get_coordinate_columns_from_file(in_file)
    if type == "PDE":
        pde_columns = get_PDE_columns_from_file(in_file)

    # gets the gene and summary information (skipping the header line)
    gene_data_dictionary = {}
    summary_dictionary = {}
    for line in in_file[1:]:
        line_split = line.rstrip().split("\t")

        # gets the mean expression
        mean_expression = get_mean_expression(line_split, sample_columns)

        # gets the coordinates; // keeps the midpoint an integer on Python 3
        chromosome = line_split[coordinate_columns["CHROMOSOME"]]
        start = int(line_split[coordinate_columns["START"]])
        stop = int(line_split[coordinate_columns["STOP"]])
        mid_point = (start + stop) // 2

        gene_data = [str(mean_expression), chromosome, str(mid_point)]
        counts = summary_dictionary.setdefault(chromosome, [0] * 8)
        counts[TOTAL] += 1

        # tests for an expressed gene
        if mean_expression >= expressed_threshold:
            counts[EXPRESSED] += 1

        # updates the results with the PDE information
        if type == "PDE":
            log2fold_text = line_split[pde_columns["LOG2FOLD"]]
            significant_text = line_split[pde_columns["SIG"]]
            pde_valid_text = line_split[pde_columns["PDE_VALID"]]
            gene_data.extend([
                log2fold_text, line_split[pde_columns["P"]], significant_text,
                pde_valid_text
            ])

            # NOTE(review): significance is counted only for PDE-valid genes
            # here; the original layout was ambiguous - confirm.
            if pde_valid_text == "True":
                counts[VALID] += 1
                log2fold = float(log2fold_text)
                if log2fold > 0:
                    counts[POSITIVE] += 1
                elif log2fold < 0:
                    counts[NEGATIVE] += 1

                if significant_text == "True":
                    counts[SIGNIFICANT] += 1
                    if log2fold > 0:
                        counts[UP] += 1
                    elif log2fold < 0:
                        counts[DOWN] += 1

        # updates the results
        gene_data_dictionary[line_split[0]] = gene_data

    # genome-wide totals used as the background of the statistics
    all_counts = list(summary_dictionary.values())
    total_genes = sum(counts[TOTAL] for counts in all_counts)
    total_expressed_genes = sum(counts[EXPRESSED] for counts in all_counts)
    total_pde_valid_genes = sum(counts[VALID] for counts in all_counts)
    total_significant = sum(counts[SIGNIFICANT] for counts in all_counts)
    total_upregulated_genes = sum(counts[UP] for counts in all_counts)
    total_downregulated_genes = sum(counts[DOWN] for counts in all_counts)

    if total_significant > 0:
        ratio_upregulated = float(total_upregulated_genes) / float(
            total_significant)
        ratio_downregulated = float(total_downregulated_genes) / float(
            total_significant)
    else:
        ratio_upregulated = 0.0
        ratio_downregulated = 0.0

    # stats: builds one summary row per chromosome
    summary_lines = []
    for chromosome in summary_dictionary:
        counts = summary_dictionary[chromosome]
        expressed_bias = _enrichment_bias(counts[EXPRESSED], counts[TOTAL],
                                          total_genes, total_expressed_genes)
        if type == "NORMEXP":
            # only the columns named in the NORMEXP header
            row = [counts[TOTAL], counts[EXPRESSED]] + expressed_bias
        else:
            significant_bias = _enrichment_bias(
                counts[SIGNIFICANT], counts[VALID], total_pde_valid_genes,
                total_significant)
            direction_bias = _direction_bias(
                counts[SIGNIFICANT], counts[UP], counts[DOWN],
                ratio_upregulated, ratio_downregulated)
            row = counts + expressed_bias + significant_bias + direction_bias
        summary_lines.append(chromosome + "\t" + "\t".join(map(str, row)) +
                             "\n")

    # outputs the gene data results:
    with open(os.path.join(out_path, "spatial_enrichment_gene_data.csv"),
              "w") as genes_out_file:
        if genes_header is not None:
            genes_out_file.write("\t".join(genes_header) + "\n")
        for gene in gene_data_dictionary:
            genes_out_file.write(gene + "\t" +
                                 "\t".join(gene_data_dictionary[gene]) + "\n")

    # outputs the summary:
    with open(os.path.join(out_path, "spatial_enrichment_summary.csv"),
              "w") as summary_out_file:
        if summary_header is not None:
            summary_out_file.write("\t".join(summary_header) + "\n")
        summary_out_file.writelines(summary_lines)