def make_name_dict(data_file, rose_folder, names_list=None, enhancer_type="super"):
    """For each name, check for the presence of an enriched file or allEnhancer table.

    These are the files required for enhancer clustering.
    """
    data_dict = pipeline_utils.load_data_table(data_file)

    # draw the parent folder from the data_file
    parent_folder = utils.format_folder(
        os.path.dirname(os.path.abspath(data_file)), False)
    if parent_folder.count("data_tables") == 1:
        parent_folder = parent_folder.replace("data_tables/", "")
    print("Using {} as the parent folder".format(parent_folder))

    # check to see if a rose folder exists already
    if utils.format_folder(rose_folder, False):
        rose_exists = True
        rose_folder = utils.format_folder(rose_folder, False)
    else:
        rose_exists = False
        rose_folder = utils.format_folder(rose_folder, True)

    # check names_list to see if datasets exist
    if not names_list:
        # if no names_list is given, this filters out WCE and INPUT datasets
        names_list = [
            name for name in data_dict
            if name.upper().count("WCE") == 0 and name.upper().count("INPUT") == 0
        ]

    # now check that all of the datasets at a minimum have a rose output OR enriched region file
    name_dict = defaultdict(dict)
    for name in names_list:
        # check if each dataset has a background
        background_name = data_dict[name]["background"]
        if background_name in data_dict:
            name_dict[name]["background"] = True
        else:
            name_dict[name]["background"] = False

        # assumes standard folder structure for enriched file
        enriched_file = os.path.join(parent_folder, "macsEnriched",
                                     data_dict[name]["enrichedMacs"])
        print("Looking for macs output at {}".format(enriched_file))
        try:
            open(enriched_file, "r").close()
            name_dict[name]["enriched_file"] = enriched_file
        except (IOError, FileNotFoundError):
            name_dict[name]["enriched_file"] = ""

        # roseOutput looks for standard format rose output
        # need an allEnhancers table and a region table to proceed
        # if the rose folder doesn't exist, don't bother
        if rose_exists:
            try:
                rose_output_files = os.listdir(
                    os.path.join(rose_folder, "{}_ROSE".format(name)))
                if enhancer_type == "super":
                    enhancer_string = "AllEnhancers.table.txt"
                if enhancer_type == "stretch":
                    enhancer_string = "AllEnhancers_Length.table.txt"
                if enhancer_type == "superstretch":
                    enhancer_string = "AllEnhancers_SuperStretch.table.txt"

                all_enhancer_file_list = [
                    x for x in rose_output_files
                    if x.count(enhancer_string) == 1 and x[0] != "."
                ]  # no weird hidden or temp files
                if all_enhancer_file_list:
                    name_dict[name]["enhancer_file"] = os.path.join(
                        rose_folder, "{}_ROSE".format(name),
                        all_enhancer_file_list[0])
                else:
                    name_dict[name]["enhancer_file"] = ""
            except (OSError, FileNotFoundError):
                name_dict[name]["enhancer_file"] = ""
        else:
            name_dict[name]["enhancer_file"] = ""

        if (name_dict[name]["enhancer_file"] == ""
                and name_dict[name]["enriched_file"] == ""):
            print(
                "INSUFFICIENT DATA TO RUN ENHANCER ANALYSIS ON {}. PLEASE MAKE SURE ROSE OUTPUT "
                "OR MACS ENRICHED REGION PEAKS FILE EXISTS".format(name))
            print(name_dict[name])
            sys.exit()

    return name_dict
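# For reference, make_name_dict returns one entry per dataset shaped roughly
# like the following (dataset name and paths below are hypothetical):
#
#     {
#         "H3K27AC_A": {
#             "background": True,
#             "enriched_file": "/project/macsEnriched/H3K27AC_A_peaks.bed",
#             "enhancer_file": "/rose/H3K27AC_A_ROSE/"
#                              "H3K27AC_A_peaks_AllEnhancers.table.txt",
#         },
#     }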
def main():
    """Main function call."""
    parser = argparse.ArgumentParser()

    # required flags
    parser.add_argument(
        "-d",
        "--data",
        dest="data",
        default=None,
        required=True,
        help="Enter a data file for datasets to be processed",
    )
    parser.add_argument(
        "-o",
        "--output",
        dest="output",
        default=None,
        required=True,
        help="specify an output folder to write results to",
    )

    # additional args
    parser.add_argument(
        "-i",
        "--input",
        dest="input",
        required=False,
        help="Enter a comma separated list of names to analyze. Default will be all datasets",
    )
    parser.add_argument(
        "-n",
        "--name",
        dest="name",
        required=False,
        help="Enter a name for the analysis",
    )
    parser.add_argument(
        "-r",
        "--rose",
        dest="rose",
        required=False,
        help="Enter a folder to detect or write rose output",
    )
    parser.add_argument(
        "-a",
        "--all",
        dest="all",
        action="store_true",
        default=False,
        help="flag to run analysis on ALL enhancers (this is much slower)",
    )
    parser.add_argument(
        "-s",
        "--stitch",
        dest="stitch",
        default="",
        help=("specify a fixed stitch distance for all datasets, otherwise will compute stitching "
              "automatically on each dataset"),
    )
    parser.add_argument(
        "-e",
        "--enhancer-type",
        dest="enhancer_type",
        default="super",
        help="specify type of enhancer to analyze: super, stretch, superStretch",
    )
    parser.add_argument(
        "-t",
        "--tss",
        dest="tss",
        default=2500,
        help="specify a tss exclusion window. default is 2500bp",
    )
    parser.add_argument(
        "--mask",
        dest="mask",
        required=False,
        help="Create a mask set of regions to filter out of analysis. must be .bed or .gff format",
    )

    args = parser.parse_args()
    print(args)

    # pull in the data_file and create a data_dict
    data_file = args.data

    # now the output folder
    output_folder = utils.format_folder(args.output, True)  # check and create the output folder

    # now the rose folder
    if args.rose:
        rose_folder = args.rose
    else:
        rose_folder = os.path.join(output_folder, "rose")

    if args.input:
        names_list = args.input.split(",")
    else:
        names_list = []

    # get the genome
    data_dict = pipeline_utils.load_data_table(data_file)
    genome = data_dict[list(data_dict.keys())[0]]["genome"]

    # check if using only supers
    if args.all:
        super_only = False
    else:
        super_only = True

    # get the analysis name
    if args.name:
        analysis_name = args.name
    else:
        analysis_name = "enhancers"

    # check for a stitching parameter
    if len(str(args.stitch)) > 0:
        stitch = str(args.stitch)
    else:
        stitch = ""

    # check for the tss parameter
    tss_distance = int(args.tss)

    # check enhancer type
    enhancer_type = args.enhancer_type.lower()
    if ["super", "superstretch", "stretch"].count(enhancer_type) == 0:
        print("ERROR: unsupported enhancer type {}".format(enhancer_type))
        sys.exit()

    # see if there's a mask
    if args.mask:
        mask_file = args.mask
    else:
        mask_file = ""

    # =====================================================
    # =================SUMMARIZE INPUTS====================
    # =====================================================

    print("WORKING IN GENOME {}".format(genome))
    print("DRAWING DATA FROM {} AND ROSE FOLDER {}".format(data_file, rose_folder))
    print("USING {} AS THE OUTPUT FOLDER".format(output_folder))

    # =====================================================
    # ==============ESTABLISH ALL WORKING FILES============
    # =====================================================

    print("\n\n\nESTABLISHING WORKING FILES")
    name_dict = make_name_dict(data_file, rose_folder, names_list, enhancer_type)
    print(name_dict)

    print("STARTING ANALYSIS ON THE FOLLOWING DATASETS:")
    print(list(name_dict.keys()))

    for name in name_dict:
        if len(name_dict[name]["enhancer_file"]) == 0:
            print("NO ROSE OUTPUT FOR {}".format(name))

    # =====================================================
    # ==============LAUNCH ENHANCER MAPPING================
    # =====================================================

    print("\n\n\nLAUNCHING ENHANCER MAPPING (IF NECESSARY)")
    name_dict = launch_enhancer_mapping(
        data_file,
        name_dict,
        output_folder,
        rose_folder,
        stitch,
        tss_distance,
        enhancer_type,
        mask_file,
    )
    print(name_dict)

    # =====================================================
    # ====================GET MEDIAN SIGNAL================
    # =====================================================

    print("\n\n\nGETTING MEDIAN ENHANCER SIGNAL FROM EACH SAMPLE")
    median_dict = make_median_dict(name_dict)
    print(median_dict)

    # =====================================================
    # ====================MERGING ENHANCERS================
    # =====================================================

    print("\n\n\nIDENTIFYING CONSENSUS ENHANCER REGIONS")
    merged_gff_file = os.path.join(
        output_folder, "{}_{}_-0_+0.gff".format(genome, analysis_name))
    merged_gff_file = merge_collections(name_dict, analysis_name,
                                        merged_gff_file, super_only)

    # =====================================================
    # ===============MAP TO MERGED REGIONS=================
    # =====================================================

    print("\n\n\nMAPPING DATA TO CONSENSUS ENHANCER REGIONS")
    merged_region_map = map_merged_gff(data_file, name_dict, merged_gff_file,
                                       analysis_name, output_folder, mask_file)

    # =====================================================
    # ==============CORRECT FOR MEDIAN SIGNAL==============
    # =====================================================

    print("\n\n\nCREATING ENHANCER SIGNAL TABLE")
    signal_table_file = make_enhancer_signal_table(name_dict,
                                                   merged_region_map,
                                                   median_dict, analysis_name,
                                                   genome, output_folder)

    # =====================================================
    # ===============CALL CLUSTERING R SCRIPT==============
    # =====================================================

    print("\n\n\nGENERATING CLUSTERING OUTPUT")
    cluster_table_file = call_r_script(genome, output_folder, analysis_name,
                                       signal_table_file)
    # output should be:
    # png of cluster gram with rows as genes
    # png of cluster gram of samples w/ tree
    # ordered table w/ cluster assignment
    # similarity matrix for samples

    # =====================================================
    # =============GENE MAPPING BY CLUSTER=================
    # =====================================================

    cmd = "ROSE2_geneMapper -g {} -i {}".format(genome, cluster_table_file)
    os.system(cmd)

    print("FINISHED")
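# Example invocation, assuming this module is exposed as a command line entry
# point named clusterEnhancer (paths and names are hypothetical):
#
#     clusterEnhancer -d data_table.txt -o ./cluster_output -r ./rose \
#         -n my_analysis -e super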
def launch_enhancer_mapping(
    data_file,
    name_dict,
    output_folder,
    rose_folder,
    stitch,
    tss_distance,
    enhancer_type,
    mask_file="",
):
    """Launches enhancer mapping if needed from enriched region files."""
    names_list = list(name_dict.keys())

    # check to see if everything is good; if so, return the name_dict and call it a day
    if len([x for x in names_list
            if len(name_dict[x]["enhancer_file"]) > 0]) == len(names_list):
        print("ENHANCER FILE OUTPUT FOUND FOR ALL DATASETS")
        return name_dict

    # if not, have to call rose
    rose_output_folder = utils.format_folder(rose_folder, True)

    queue_list = []
    for name in names_list:
        # check to see if we need to call rose
        if name_dict[name]["enhancer_file"] == "":
            # get the enriched file
            enriched_file = name_dict[name]["enriched_file"]
            # call rose
            print("CALLING ROSE FOR {}".format(name))
            bash_file_name = pipeline_utils.call_rose2(
                data_file,
                "",
                rose_output_folder,
                [name],
                [],
                enriched_file,
                tss_distance,
                stitch,
                mask=mask_file,
            )
            print(bash_file_name)
            os.system("bash {}".format(bash_file_name))

            # add name to queue list
            queue_list.append(name)

    # define the enhancer type
    if enhancer_type == "super":
        enhancer_string = "AllEnhancers.table.txt"
    if enhancer_type == "stretch":
        enhancer_string = "AllEnhancers_Length.table.txt"
    if enhancer_type == "superstretch":
        enhancer_string = "AllEnhancers_SuperStretch.table.txt"

    # now check for completion of datasets
    for name in queue_list:
        # check for the AllEnhancers table
        enhancer_file = os.path.join(
            rose_output_folder,
            "{}_ROSE".format(name),
            "{}_peaks_{}".format(name, enhancer_string),
        )

        print("CHECKING FOR {} ROSE OUTPUT IN {}".format(name, enhancer_file))
        if utils.check_output(enhancer_file, 1, 10):
            print("FOUND ENHANCER OUTPUT FOR {}".format(name))
            name_dict[name]["enhancer_file"] = enhancer_file
        else:
            # try finding it w/ a different name
            # this will bug out if nothing is there
            rose_folder = os.path.join(rose_output_folder, "{}_ROSE".format(name))
            rose_file_list = [
                x for x in os.listdir(rose_folder) if x[0] != "."
            ]  # no hidden files
            if not rose_file_list:
                print("No files found in {}".format(rose_folder))
                sys.exit()
            enhancer_file = pipeline_utils.get_file(enhancer_string,
                                                    rose_file_list, rose_folder)
            name_dict[name]["enhancer_file"] = enhancer_file

    return name_dict
def map_merged_gff(data_file, name_dict, merged_gff_file, analysis_name,
                   output_folder, mask_file):
    """Call rose on the merged_gff_file for all datasets."""
    data_dict = pipeline_utils.load_data_table(data_file)
    rose_parent_folder = os.path.join(output_folder, "rose")
    utils.format_folder(rose_parent_folder, True)
    gff_name = os.path.basename(merged_gff_file).split(".")[0]
    bash_file_name = os.path.join(output_folder, "rose",
                                  "{}_roseCall.sh".format(analysis_name))

    # names_list is just the first dataset
    # extra_map will have to have all other datasets + their backgrounds
    names_list = list(name_dict.keys())
    names_list.sort()
    extra_map = []
    for name in names_list[1:]:
        if name_dict[name]["background"]:
            background_name = data_dict[name]["background"]
            if background_name in data_dict:
                extra_map += [name, background_name]
            else:
                print("ERROR: UNABLE TO FIND LISTED BACKGROUND DATASET {} FOR {}"
                      .format(background_name, name))
                sys.exit()
        else:
            extra_map += [name]

    print(extra_map)

    # first check to see if this has already been done
    merged_region_map = os.path.join(
        output_folder,
        "rose",
        "{}_ROSE".format(names_list[0]),
        "{}_0KB_STITCHED_ENHANCER_REGION_MAP.txt".format(gff_name),
    )
    print("LOOKING FOR REGION MAP AT {}".format(merged_region_map))

    if utils.check_output(merged_region_map, 1, 1):
        print("FOUND PREVIOUS REGION MAP")
        return merged_region_map

    bash_file_name = pipeline_utils.call_rose2(
        data_file,
        "",
        rose_parent_folder,
        [names_list[0]],
        extra_map,
        merged_gff_file,
        0,
        0,
        bash_file_name,
        mask=mask_file,
    )

    bash_command = "bash {}".format(bash_file_name)
    print("Running enhancer mapping command:\n{}".format(bash_command))
    os.system(bash_command)

    if utils.check_output(merged_region_map, 5, 60):
        return merged_region_map
    else:
        print("UNABLE TO CALL ROSE ENHANCER MAPPING ON CONSENSUS ENHANCER FILE {}.\nEXITING NOW"
              .format(merged_gff_file))
        sys.exit()
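# NOTE: utils.check_output(path, wait, timeout) is used throughout to poll for
# output files. A minimal self-contained sketch of the assumed behavior (both
# times in minutes) looks like this; it is illustrative, not the actual helper:
def _wait_for_file_sketch(path, wait_minutes, timeout_minutes):
    """Poll until a non-empty file appears or the timeout elapses."""
    import os
    import time

    elapsed = 0.0
    while True:
        if os.path.exists(path) and os.path.getsize(path) > 0:
            return True
        if elapsed >= timeout_minutes:
            return False
        time.sleep(wait_minutes * 60)
        elapsed += wait_minutes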
def main():
    """Main run function."""
    parser = argparse.ArgumentParser()

    # required flags
    parser.add_argument(
        "-b",
        "--bam",
        dest="bam",
        nargs="*",
        help="Enter a comma/space separated list of .bam files to be processed.",
        required=True,
    )
    parser.add_argument(
        "-i",
        "--input",
        dest="input",
        type=str,
        help="Enter .gff or genomic region e.g. chr1:+:1-1000.",
        required=True,
    )
    parser.add_argument(
        "-g",
        "--genome",
        dest="genome",
        type=str,
        help="specify a genome, HG18,HG19,MM8,MM9,MM10 are currently supported",
        required=True,
    )

    # output flag
    parser.add_argument(
        "-o",
        "--output",
        dest="output",
        type=str,
        help="Enter the output folder.",
        required=True,
    )

    # additional options
    parser.add_argument(
        "--stretch-input",
        dest="stretch_input",
        default=None,
        type=int,
        help=(
            "Stretch the input regions to a minimum length in bp, e.g. 10000 (for"
            " 10kb)"
        ),
    )
    parser.add_argument(
        "-c",
        "--color",
        dest="color",
        default=None,
        nargs="*",
        help=(
            "Enter a colon or space separated list of colors e.g. "
            "255,0,0:255,125,0, default samples the rainbow"
        ),
    )
    parser.add_argument(
        "-s",
        "--sense",
        dest="sense",
        default="both",
        help="Map to '+','-' or 'both' strands. Default maps to both.",
    )
    parser.add_argument(
        "-e",
        "--extension",
        dest="extension",
        default=200,
        help="Extends reads by n bp. Default value is 200bp",
    )
    parser.add_argument(
        "-r",
        "--rpm",
        dest="rpm",
        action="store_true",
        default=False,
        help="Normalizes density to reads per million (rpm) Default is False",
    )
    parser.add_argument(
        "-y",
        "--yScale",
        dest="y_scale",
        default="relative",
        help=(
            "Choose either relative or uniform y axis scaling. options = "
            "'relative,uniform' Default is relative scaling"
        ),
    )
    parser.add_argument(
        "-n",
        "--names",
        dest="names",
        default=None,
        nargs="*",
        help="Enter a comma or space separated list of names for your bams",
    )
    parser.add_argument(
        "-p",
        "--plot",
        dest="plot",
        default="MULTIPLE",
        help=(
            "Choose either all lines on a single plot or multiple plots. options "
            "= 'SINGLE,MULTIPLE,MERGE'"
        ),
    )
    parser.add_argument(
        "-t",
        "--title",
        dest="title",
        default="",
        help=(
            "Specify a title for the output plot(s), default will be the "
            "coordinate region"
        ),
    )
    parser.add_argument(
        "-q",
        "--skip-cache",
        dest="skip_cache",
        action="store_true",
        default=False,
        help="Toggles option to skip loading annotation cache file",
    )
    parser.add_argument(
        "--scale",
        dest="scale",
        default=None,
        nargs="*",
        help=(
            "Enter a comma or space separated list of scaling factors for your "
            "bams. Default is none"
        ),
    )
    parser.add_argument(
        "--bed",
        dest="bed",
        nargs="*",
        help="Add a comma-delimited or space-delimited list of bed files to plot",
    )
    parser.add_argument(
        "--multi-page",
        dest="multi",
        action="store_true",
        default=False,
        help="If flagged will create a new pdf for each region",
    )

    # DEBUG OPTION TO SAVE TEMP FILES
    parser.add_argument(
        "--save-temp",
        dest="save",
        action="store_true",
        default=False,
        help="If flagged will save temporary files made by bamPlot",
    )

    args = parser.parse_args()

    print(args)

    if args.bam and args.input and args.genome and args.output:
        # Support a legacy mode where multiple files are ',' delimited
        bam_file_list = args.bam
        if len(bam_file_list) == 1:
            bam_file_list = bam_file_list[0].split(",")

        # Make sure these are actually files & readable (!)
        for filename in bam_file_list:
            assert os.access(filename, os.R_OK)

        # bringing in any beds
        if args.bed:
            bed_file_list = args.bed
            if len(bed_file_list) == 1:
                bed_file_list = bed_file_list[0].split(",")
            print(bed_file_list)
            bed_collection = make_bed_collection(bed_file_list)
        else:
            bed_collection = utils.LocusCollection([], 50)

        # Load the input for graphing. One of:
        # - A .gff
        # - A .bed
        # - a specific input region (e.g. chr10:.:93150000-93180000)
        valid_sense_options = {"+", "-", "."}
        if os.access(args.input, os.R_OK):
            if args.input.endswith(".bed"):
                # Uniquely graph every input of this bed
                parsed_input_bed = utils.parse_table(args.input, "\t")
                gff_name = os.path.basename(args.input)  # Graph title
                gff = None
                try:
                    if parsed_input_bed[0][5] in valid_sense_options:
                        # This .bed might have a sense parameter
                        gff = [
                            [e[0], "", args.input, e[1], e[2], "", e[5], "", ""]
                            for e in parsed_input_bed
                        ]
                except IndexError:
                    pass

                if gff is None:
                    print(
                        "Your bed doesn't have a valid sense parameter. Defaulting to both "
                        "strands, '.'"
                    )
                    # We only take chr/start/stop and ignore everything else.
                    gff = [
                        [e[0], "", args.input, e[1], e[2], "", ".", "", ""]
                        for e in parsed_input_bed
                    ]
            else:
                # Default to .gff, since that's the original behavior
                gff = utils.parse_table(args.input, "\t")
                gff_name = os.path.basename(args.input).split(".")[0]
        else:
            # means a coordinate line has been given e.g. chr1:+:1-100
            chrom_line = args.input.split(":")
            try:
                chrom = chrom_line[0]
                sense = chrom_line[1]
            except IndexError:
                print("Invalid input line or inaccessible file. Try: chr1:.:1-5000")
                exit()
            assert sense in valid_sense_options
            [start, end] = chrom_line[2].split("-")
            if chrom[0:3] != "chr":
                print("ERROR: UNRECOGNIZED GFF OR CHROMOSOME LINE INPUT")
                exit()
            gff_line = [chrom, "", args.input, start, end, "", sense, "", ""]
            gff_name = "{}_{}_{}_{}".format(chrom, sense, start, end)
            gff = [gff_line]

        # Consider stretching the regions to a fixed minimum size
        if args.stretch_input:
            print(
                "Stretching inputs to a minimum of: {} bp".format(
                    str(args.stretch_input)
                )
            )
            min_length = args.stretch_input
            stretch_gff = []
            for e in gff:
                difference = int(e[4]) - int(e[3])
                if difference < min_length:
                    pad = int((min_length - difference) / 2)
                    stretch_gff.append(
                        [
                            e[0],
                            e[1],
                            e[2],
                            int(e[3]) - pad,
                            int(e[4]) + pad,
                            e[5],
                            e[6],
                            e[7],
                            e[8],
                        ]
                    )
                else:
                    stretch_gff.append(e)

            gff = stretch_gff

        # Sanity test the gff object
        assert all([e[6] in valid_sense_options for e in gff])  # All strands are sane

        # bring in the genome
        genome = args.genome.upper()
        if not ["HG18", "HG19", "HG19_RIBO", "HG38", "MM9", "MM10", "RN4", "RN6"].count(
            genome
        ):
            print(
                "ERROR: UNSUPPORTED GENOME TYPE {}. SUPPORTED: HG18, HG19, HG19_RIBO, "
                "HG38, MM9, MM10, RN4, RN6".format(genome)
            )
            parser.print_help()
            exit()

        # bring in the rest of the options

        # output
        root_folder = args.output
        try:
            os.listdir(root_folder)
        except OSError:
            print("ERROR: UNABLE TO FIND OUTPUT DIRECTORY {}".format(root_folder))
            exit()

        # Get analysis title
        if not args.title:
            title = gff_name
        else:
            title = args.title

        # make a temp folder
        temp_folder = os.path.join(root_folder, title)
        print("CREATING TEMP FOLDER {}".format(temp_folder))
        utils.format_folder(temp_folder, create=True)

        # colors
        if args.color:
            color_list = args.color
            if len(color_list) == 1:
                color_list = color_list[0].split(":")
            color_list = [x.split(",") for x in color_list]
            if len(color_list) < len(bam_file_list):
                print("WARNING: FEWER COLORS THAN BAMS SPECIFIED. COLORS WILL BE RECYCLED")
                # recycling the color list
                color_list += color_list * (len(bam_file_list) // len(color_list))
                color_list = color_list[: len(bam_file_list)]
        else:
            # cycles through the colors of the rainbow
            color_list = taste_the_rainbow(len(bam_file_list))

        # sense
        sense = args.sense

        extension = int(args.extension)

        rpm = args.rpm

        scale = args.scale
        if scale:
            if len(scale) == 1:
                scale = scale[0].split(",")

        y_scale = args.y_scale.upper()

        # names
        if args.names:
            names = args.names
            if len(names) == 1:
                names = names[0].split(",")

            if len(names) != len(bam_file_list):
                print("ERROR: NUMBER OF NAMES AND NUMBER OF BAMS DO NOT CORRESPOND")
                parser.print_help()
                exit()
        else:
            names = [os.path.basename(x) for x in bam_file_list]

        # plot style
        plot_style = args.plot.upper()
        if not ["SINGLE", "MULTIPLE", "MERGE"].count(plot_style):
            print("ERROR: PLOT STYLE {} NOT AN OPTION".format(plot_style))
            parser.print_help()
            exit()

        # number of bins to divide each region into for read density
        # (assumed default; n_bins was otherwise undefined before its use below)
        n_bins = 200

        # now run!
        summary_table_file_name = make_bam_plot_tables(
            gff,
            genome,
            bam_file_list,
            color_list,
            n_bins,
            sense,
            extension,
            rpm,
            temp_folder,
            names,
            title,
            bed_collection,
            scale,
        )
        print("{} is the summary table".format(summary_table_file_name))

        # running the R command to plot
        multi = args.multi
        out_file = os.path.join(root_folder, "{}_plots.pdf".format(title))
        r_cmd = call_r_plot(
            summary_table_file_name, out_file, y_scale, plot_style, multi
        )

        # open a bash file
        bash_file_name = os.path.join(temp_folder, "{}_Rcmd.sh".format(title))
        with open(bash_file_name, "w") as bash_file:
            bash_file.write("#!/usr/bin/bash\n")
            bash_file.write(r_cmd)
        print("Wrote R command to {}".format(bash_file_name))
        os.system("bash {}".format(bash_file_name))

        # delete temp files
        if not args.save:
            if utils.check_output(out_file, 1, 10):
                # This is super dangerous (!). Add some sanity checks.
                assert " " not in temp_folder
                assert temp_folder != "/"
                print("Removing temp folder: {}".format(temp_folder))
                shutil.rmtree(temp_folder)
            else:
                print("ERROR: NO OUTPUT FILE {} DETECTED".format(out_file))
    else:
        parser.print_help()
        sys.exit()
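# Example invocation, assuming this module is installed as a bamPlot command
# line entry point (file names are hypothetical):
#
#     bamPlot -b sample1.bam,sample2.bam -i chr1:.:1-50000 -g HG19 \
#         -o ./plots -r -n S1,S2 -t my_region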
def main():
    """Main run call."""
    debug = False

    parser = argparse.ArgumentParser()

    # required flags
    parser.add_argument(
        "-i",
        "--i",
        dest="input",
        required=True,
        help=("Enter a comma separated list of .gff or .bed files of binding sites used to make "
              "enhancers"),
    )
    parser.add_argument(
        "-r",
        "--rankby",
        dest="rankby",
        required=True,
        help="Enter a comma separated list of bams to rank by",
    )
    parser.add_argument("-o",
                        "--out",
                        dest="out",
                        required=True,
                        help="Enter an output folder")
    parser.add_argument(
        "-g",
        "--genome",
        dest="genome",
        required=True,
        help="Enter the genome build (MM9,MM8,HG18,HG19)",
    )

    # optional flags
    parser.add_argument(
        "-n",
        "--name",
        dest="name",
        required=False,
        help="Provide a name for the analysis otherwise ROSE will guess",
    )
    parser.add_argument(
        "-c",
        "--control",
        dest="control",
        required=False,
        help=("Enter a comma separated list of control bams. Can either provide a single control "
              "bam for all rankby bams, or provide a control bam for each individual bam"),
    )
    parser.add_argument(
        "-s",
        "--stitch",
        dest="stitch",
        default="",
        help=("Enter a max linking distance for stitching. Default will determine optimal stitching"
              " parameter"),
    )
    parser.add_argument(
        "-t",
        "--tss",
        dest="tss",
        default=0,
        help="Enter a distance from TSS to exclude. 0 = no TSS exclusion",
    )
    parser.add_argument(
        "--mask",
        dest="mask",
        required=False,
        help="Mask a set of regions from analysis. Provide a .bed or .gff of masking regions",
    )

    # RETRIEVING FLAGS
    args = parser.parse_args()

    # making the out folder if it doesn't exist
    out_folder = utils.format_folder(args.out, True)

    # figuring out folder schema
    gff_folder = utils.format_folder(os.path.join(out_folder, "gff"), True)
    mapped_folder = utils.format_folder(os.path.join(out_folder, "mappedGFF"), True)

    # GETTING INPUT FILE(s)
    input_list = [
        input_file for input_file in args.input.split(",") if len(input_file) > 1
    ]

    # converting all input files into GFFs and moving into the GFF folder
    input_gf_list = []
    for input_file in input_list:
        # GETTING INPUT FILE
        if input_file.split(".")[-1] == "bed":
            # CONVERTING A BED TO GFF
            input_gff_name = os.path.basename(input_file)[0:-4]
            input_gff_file = os.path.join(gff_folder, "{}.gff".format(input_gff_name))
            utils.bed_to_gff(input_file, input_gff_file)
        elif input_file.split(".")[-1] == "gff":
            # COPY THE INPUT GFF TO THE GFF FOLDER
            input_gff_file = input_file
            copyfile(
                input_gff_file,
                os.path.join(gff_folder, os.path.basename(input_gff_file)),
            )
        else:
            print(
                "WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT"
            )
            # COPY THE INPUT GFF TO THE GFF FOLDER
            input_gff_file = input_file
            copyfile(
                input_gff_file,
                os.path.join(gff_folder, os.path.basename(input_gff_file)),
            )
        input_gf_list.append(input_gff_file)

    # GETTING THE LIST OF BAM FILES TO PROCESS
    # either same number of bams for rankby and control
    # or only 1 control
    # or none!
    # bam_file_list should be all rankby bams followed by control bams
    bam_file_list = []
    if args.control:
        control_bam_list = [bam for bam in args.control.split(",") if len(bam) > 0]
        rankby_bam_list = [bam for bam in args.rankby.split(",") if len(bam) > 0]
        if len(control_bam_list) == len(rankby_bam_list):
            # case where an equal number of backgrounds are given
            bam_file_list = rankby_bam_list + control_bam_list
        elif len(control_bam_list) == 1:
            # case where a universal background is applied
            bam_file_list = rankby_bam_list + control_bam_list * len(rankby_bam_list)
        else:
            print("ERROR: EITHER PROVIDE A SINGLE CONTROL BAM FOR ALL SAMPLES, OR ONE CONTROL BAM"
                  " FOR EACH SAMPLE")
            sys.exit()
    else:
        bam_file_list = [bam for bam in args.rankby.split(",") if len(bam) > 0]

    # Stitch parameter
    if args.stitch == "":
        stitch_window = ""
    else:
        stitch_window = int(args.stitch)

    # tss args
    tss_window = int(args.tss)
    if tss_window != 0:
        remove_tss = True
    else:
        remove_tss = False

    # GETTING THE GENOME
    genome = args.genome.upper()
    print("USING {} AS THE GENOME".format(genome))

    # GETTING THE CORRECT ANNOT FILE
    try:
        annot_file = rose2_utils.genome_dict[genome]
    except KeyError:
        print("ERROR: UNSUPPORTED GENOME TYPE {}".format(genome))
        sys.exit()

    # FINDING THE ANALYSIS NAME
    if args.name:
        input_name = args.name
    else:
        input_name = os.path.basename(input_gf_list[0]).split(".")[0]
    print("USING {} AS THE ANALYSIS NAME".format(input_name))

    print("FORMATTING INPUT REGIONS")
    # MAKING THE RAW INPUT FILE FROM THE INPUT GFFs
    # use a simpler unique region naming system
    if len(input_gf_list) == 1:
        input_gff = utils.parse_table(input_gf_list[0], "\t")
    else:
        input_loci = []
        for gff_file in input_gf_list:
            print("\tprocessing {}".format(gff_file))
            gff = utils.parse_table(gff_file, "\t")
            gff_collection = utils.gff_to_locus_collection(gff, 50)
            input_loci += gff_collection.get_loci()

        input_collection = utils.LocusCollection(input_loci, 50)
        # stitches to produce unique regions
        input_collection = input_collection.stitch_collection()
        input_gff = utils.locus_collection_to_gff(input_collection)

    formatted_gff = []
    # now number things appropriately
    for i, line in enumerate(input_gff):
        # use the coordinates to make a new id: input_name_chr_sense_start_stop
        chrom = line[0]
        coords = [int(line[3]), int(line[4])]
        sense = line[6]

        line_id = "{}_{}".format(input_name, str(i + 1))  # 1 indexing

        new_line = [
            chrom,
            line_id,
            line_id,
            min(coords),
            max(coords),
            "",
            sense,
            "",
            line_id,
        ]
        formatted_gff.append(new_line)

    # name of the master input gff file
    master_gff_file = os.path.join(
        gff_folder, "{}_{}_ALL_-0_+0.gff".format(genome, input_name))
    utils.unparse_table(formatted_gff, master_gff_file, "\t")

    print("USING {} AS THE INPUT GFF".format(master_gff_file))

    # GET CHROMS FOUND IN THE BAMS
    print("GETTING CHROMS IN BAM FILES")
    bam_chrom_list = rose2_utils.get_bam_chrom_list(bam_file_list)
    print("USING THE FOLLOWING CHROMS")
    print(bam_chrom_list)

    # LOADING IN THE GFF AND FILTERING BY CHROM
    print("LOADING AND FILTERING THE GFF")
    input_gff = rose2_utils.filter_gff(master_gff_file, bam_chrom_list)

    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print("LOADING IN GFF REGIONS")
    reference_collection = utils.gff_to_locus_collection(input_gff)

    print("CHECKING REFERENCE COLLECTION:")
    rose2_utils.check_ref_collection(reference_collection)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if args.mask:
        mask_file = args.mask
        # if it's a bed file
        if mask_file.split(".")[-1].upper() == "BED":
            mask_gff = utils.bed_to_gff(mask_file)
        elif mask_file.split(".")[-1].upper() == "GFF":
            mask_gff = utils.parse_table(mask_file, "\t")
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit()
        mask_collection = utils.gff_to_locus_collection(mask_gff)

        # now mask the reference loci
        reference_loci = reference_collection.get_loci()
        filtered_loci = [
            locus for locus in reference_loci
            if len(mask_collection.get_overlap(locus, "both")) == 0
        ]
        print("FILTERED OUT {} LOCI THAT WERE MASKED IN {}".format(
            len(reference_loci) - len(filtered_loci), mask_file))
        reference_collection = utils.LocusCollection(filtered_loci, 50)

    # NOW STITCH REGIONS
    print("STITCHING REGIONS TOGETHER")
    stitched_collection, debug_output, stitch_window = rose2_utils.region_stitching(
        reference_collection,
        input_name,
        out_folder,
        stitch_window,
        tss_window,
        annot_file,
        remove_tss,
    )

    # NOW MAKE A STITCHED COLLECTION GFF
    print("MAKING GFF FROM STITCHED COLLECTION")
    stitched_gff = utils.locus_collection_to_gff(stitched_collection)

    print(stitch_window)
    print(type(stitch_window))
    if not remove_tss:
        stitched_gff_file = os.path.join(
            gff_folder,
            "{}_{}KB_STITCHED.gff".format(input_name, str(stitch_window // 1000)),
        )
        stitched_gff_name = "{}_{}KB_STITCHED".format(input_name,
                                                      str(stitch_window // 1000))
        debug_out_file = os.path.join(
            gff_folder,
            "{}_{}KB_STITCHED.debug".format(input_name, str(stitch_window // 1000)),
        )
    else:
        stitched_gff_file = os.path.join(
            gff_folder,
            "{}_{}KB_STITCHED_TSS_DISTAL.gff".format(input_name,
                                                     str(stitch_window // 1000)),
        )
        stitched_gff_name = "{}_{}KB_STITCHED_TSS_DISTAL".format(
            input_name, str(stitch_window // 1000))
        debug_out_file = os.path.join(
            gff_folder,
            "{}_{}KB_STITCHED_TSS_DISTAL.debug".format(input_name,
                                                       str(stitch_window // 1000)),
        )

    # WRITING DEBUG OUTPUT TO DISK
    if debug:
        print("WRITING DEBUG OUTPUT TO DISK AS {}".format(debug_out_file))
        utils.unparse_table(debug_output, debug_out_file, "\t")

    # WRITE THE GFF TO DISK
    print("WRITING STITCHED GFF TO DISK AS {}".format(stitched_gff_file))
    utils.unparse_table(stitched_gff, stitched_gff_file, "\t")

    # SETTING UP THE OVERALL OUTPUT FILE
    output_file1 = os.path.join(
        out_folder, "{}_ENHANCER_REGION_MAP.txt".format(stitched_gff_name))
    print("OUTPUT WILL BE WRITTEN TO {}".format(output_file1))

    # MAPPING TO THE NON STITCHED (ORIGINAL GFF)
    # MAPPING TO THE STITCHED GFF
    bam_file_list_unique = list(bam_file_list)
    bam_file_list_unique = utils.uniquify(bam_file_list_unique)
    # prevent redundant mapping
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bam_file_list_unique)
    for bam_file in bam_file_list_unique:
        bam_file_name = os.path.basename(bam_file)

        # MAPPING TO THE STITCHED GFF
        mapped_out1_folder = os.path.join(
            mapped_folder, "{}_{}_MAPPED".format(stitched_gff_name, bam_file_name))
        mapped_out1_file = os.path.join(
            mapped_folder,
            "{}_{}_MAPPED".format(stitched_gff_name, bam_file_name),
            "matrix.txt",
        )
        if utils.check_output(mapped_out1_file, 0.2, 0.2):
            print("FOUND {} MAPPING DATA FOR BAM: {}".format(
                stitched_gff_file, mapped_out1_file))
        else:
            cmd1 = "bamliquidator_batch --sense . -e 200 --match_bamToGFF -r {} -o {} {}".format(
                stitched_gff_file,
                mapped_out1_folder,
                bam_file,
            )
            print(cmd1)
            os.system(cmd1)
            if utils.check_output(mapped_out1_file, 0.2, 5):
                print("SUCCESSFULLY MAPPED TO {} FROM BAM: {}".format(
                    stitched_gff_file, bam_file_name))
            else:
                print("ERROR: FAILED TO MAP {} FROM BAM: {}".format(
                    stitched_gff_file, bam_file_name))
                sys.exit()

    print("BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS")
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    rose2_utils.map_collection(
        stitched_collection,
        reference_collection,
        bam_file_list,
        mapped_folder,
        output_file1,
        ref_name=stitched_gff_name,
    )

    print("FINDING AVERAGE SIGNAL AMONGST BAMS")
    meta_output_file = collapse_region_map(output_file1,
                                           input_name + "_MERGED_SIGNAL",
                                           control_bams=args.control)

    # now try the merging
    print("CALLING AND PLOTTING SUPER-ENHANCERS")
    control_name = "NONE"
    cmd = "Rscript {} {} {} {} {}".format(
        os.path.join(ROOT_DIR, "scripts", "ROSE2_callSuper.R"),
        out_folder + "/",  # TODO: fix R script so it does not require '/'
        meta_output_file,
        input_name,
        control_name,
    )
    print(cmd)
    os.system(cmd)

    # calling the gene mapper
    print("CALLING GENE MAPPING")

    super_table_file = "{}_SuperEnhancers.table.txt".format(input_name)
    # for now don't use ranking bam to call top genes
    cmd = "ROSE2_geneMapper -g {} -i {} -f".format(
        genome, os.path.join(out_folder, super_table_file))
    print(cmd)
    os.system(cmd)

    stretch_table_file = "{}_StretchEnhancers.table.txt".format(input_name)
    cmd = "ROSE2_geneMapper -g {} -i {} -f".format(
        genome, os.path.join(out_folder, stretch_table_file))
    print(cmd)
    os.system(cmd)

    superstretch_table_file = "{}_SuperStretchEnhancers.table.txt".format(input_name)
    cmd = "ROSE2_geneMapper -g {} -i {} -f".format(
        genome, os.path.join(out_folder, superstretch_table_file))
    print(cmd)
    os.system(cmd)
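# Example invocation, assuming this module is installed as a ROSE2_META command
# line entry point (file names are hypothetical):
#
#     ROSE2_META -i peaks1.bed,peaks2.bed -r rank1.bam,rank2.bam \
#         -c control.bam -g HG19 -o ./rose_meta_out -t 2500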
def main():
    """Main run call."""
    debug = False

    parser = argparse.ArgumentParser()

    # required flags
    parser.add_argument(
        "-i",
        "--i",
        dest="input",
        required=True,
        help="Enter a .gff or .bed file of binding sites used to make enhancers",
    )
    parser.add_argument(
        "-r",
        "--rankby",
        dest="rankby",
        required=True,
        help="bam_file to rank enhancers by",
    )
    parser.add_argument(
        "-o", "--out", dest="out", required=True, help="Enter an output folder"
    )
    parser.add_argument(
        "-g",
        "--genome",
        dest="genome",
        required=True,
        help="Enter the genome build (MM9,MM8,HG18,HG19)",
    )

    # optional flags
    parser.add_argument(
        "-b",
        "--bams",
        dest="bams",
        required=False,
        help="Enter a comma separated list of additional bam files to map to",
    )
    parser.add_argument(
        "-c",
        "--control",
        dest="control",
        required=False,
        help="Control bam_file to normalize against",
    )
    parser.add_argument(
        "-s",
        "--stitch",
        dest="stitch",
        default="",
        help=(
            "Enter a max linking distance for stitching. Default will determine optimal stitching"
            " parameter"
        ),
    )
    parser.add_argument(
        "-t",
        "--tss",
        dest="tss",
        default=0,
        help="Enter a distance from TSS to exclude. 0 = no TSS exclusion",
    )
    parser.add_argument(
        "--mask",
        dest="mask",
        required=False,
        help="Mask a set of regions from analysis. Provide a .bed or .gff of masking regions",
    )

    # RETRIEVING FLAGS
    args = parser.parse_args()

    # making the out folder if it doesn't exist
    out_folder = utils.format_folder(args.out, True)

    # figuring out folder schema
    gff_folder = utils.format_folder(os.path.join(out_folder, "gff"), True)
    mapped_folder = utils.format_folder(os.path.join(out_folder, "mapped_gff"), True)

    # GETTING INPUT FILE
    if args.input.split(".")[-1] == "bed":
        # CONVERTING A BED TO GFF
        input_gff_name = os.path.basename(args.input)[0:-4]
        input_gff_file = os.path.join(gff_folder, "{}.gff".format(input_gff_name))
        utils.bed_to_gff(args.input, input_gff_file)
    elif args.input.split(".")[-1] == "gff":
        # COPY THE INPUT GFF TO THE GFF FOLDER
        input_gff_file = args.input
        copyfile(
            input_gff_file, os.path.join(gff_folder, os.path.basename(input_gff_file))
        )
    else:
        print(
            "WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT"
        )
        # COPY THE INPUT GFF TO THE GFF FOLDER
        input_gff_file = args.input
        copyfile(
            input_gff_file, os.path.join(gff_folder, os.path.basename(input_gff_file))
        )

    # GETTING THE LIST OF BAM FILES TO PROCESS
    if args.control:
        bam_file_list = [args.rankby, args.control]
    else:
        bam_file_list = [args.rankby]

    if args.bams:
        bam_file_list += args.bams.split(",")
    # bam_file_list = utils.uniquify(bam_file_list)
    # makes sad when you have the same control bam over and over again

    # optional args

    # Stitch parameter
    if args.stitch == "":
        stitch_window = ""
    else:
        stitch_window = int(args.stitch)

    # tss args
    tss_window = int(args.tss)
    if tss_window != 0:
        remove_tss = True
    else:
        remove_tss = False

    # GETTING THE BOUND REGION FILE USED TO DEFINE ENHANCERS
    print("USING {} AS THE INPUT GFF".format(input_gff_file))
    input_name = os.path.basename(input_gff_file).split(".")[0]

    # GETTING THE GENOME
    genome = args.genome
    print("USING {} AS THE GENOME".format(genome))
    annot_file = rose2_utils.genome_dict[genome.upper()]

    # GET CHROMS FOUND IN THE BAMS
    print("GETTING CHROMS IN BAM FILES")
    bam_chrom_list = rose2_utils.get_bam_chrom_list(bam_file_list)
    print("USING THE FOLLOWING CHROMS")
    print(bam_chrom_list)

    # LOADING IN THE GFF AND FILTERING BY CHROM
    print("LOADING AND FILTERING THE GFF")
    input_gff = rose2_utils.filter_gff(input_gff_file, bam_chrom_list)

    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print("LOADING IN GFF REGIONS")
    reference_collection = utils.gff_to_locus_collection(input_gff)

    print("STARTING WITH {} INPUT REGIONS".format(len(reference_collection)))
    print("CHECKING REFERENCE COLLECTION:")
    rose2_utils.check_ref_collection(reference_collection)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if args.mask:
        mask_file = args.mask
        print("USING MASK FILE {}".format(mask_file))
        # if it's a bed file
        if mask_file.split(".")[-1].upper() == "BED":
            mask_gff = utils.bed_to_gff(mask_file)
        elif mask_file.split(".")[-1].upper() == "GFF":
            mask_gff = utils.parse_table(mask_file, "\t")
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit()
        mask_collection = utils.gff_to_locus_collection(mask_gff)
        print("LOADING {} MASK REGIONS".format(str(len(mask_collection))))

        # now mask the reference loci
        reference_loci = reference_collection.get_loci()
        filtered_loci = [
            locus for locus in reference_loci
            if len(mask_collection.get_overlap(locus, "both")) == 0
        ]
        print(
            "FILTERED OUT {} LOCI THAT WERE MASKED IN {}".format(
                str(len(reference_loci) - len(filtered_loci)), mask_file
            )
        )
        reference_collection = utils.LocusCollection(filtered_loci, 50)

    # NOW STITCH REGIONS
    print("STITCHING REGIONS TOGETHER")
    stitched_collection, debug_output, stitch_window = rose2_utils.region_stitching(
        reference_collection,
        input_name,
        out_folder,
        stitch_window,
        tss_window,
        annot_file,
        remove_tss,
    )

    # NOW MAKE A STITCHED COLLECTION GFF
    print("MAKING GFF FROM STITCHED COLLECTION")
    stitched_gff = utils.locus_collection_to_gff(stitched_collection)

    # making sure start/stop ordering is correct
    for i in range(len(stitched_gff)):
        line = stitched_gff[i]
        start = int(line[3])
        stop = int(line[4])
        if start > stop:
            line[3] = stop
            line[4] = start

    print(stitch_window)
    print(type(stitch_window))
    if not remove_tss:
        stitched_gff_file = os.path.join(
            gff_folder,
            "{}_{}KB_STITCHED.gff".format(input_name, str(stitch_window // 1000)),
        )
        stitched_gff_name = "{}_{}KB_STITCHED".format(
            input_name, str(stitch_window // 1000)
        )
        debug_out_file = os.path.join(
            gff_folder,
            "{}_{}KB_STITCHED.debug".format(input_name, str(stitch_window // 1000)),
        )
    else:
        stitched_gff_file = os.path.join(
            gff_folder,
            "{}_{}KB_STITCHED_TSS_DISTAL.gff".format(
                input_name, str(stitch_window // 1000)
            ),
        )
        stitched_gff_name = "{}_{}KB_STITCHED_TSS_DISTAL".format(
            input_name, str(stitch_window // 1000)
        )
        debug_out_file = os.path.join(
            gff_folder,
            "{}_{}KB_STITCHED_TSS_DISTAL.debug".format(
                input_name, str(stitch_window // 1000)
            ),
        )

    # WRITING DEBUG OUTPUT TO DISK
    if debug:
        print("WRITING DEBUG OUTPUT TO DISK AS {}".format(debug_out_file))
        utils.unparse_table(debug_output, debug_out_file, "\t")

    # WRITE THE GFF TO DISK
    print("WRITING STITCHED GFF TO DISK AS {}".format(stitched_gff_file))
    utils.unparse_table(stitched_gff, stitched_gff_file, "\t")

    # SETTING UP THE OVERALL OUTPUT FILE
    output_file1 = os.path.join(
        out_folder, "{}_ENHANCER_REGION_MAP.txt".format(stitched_gff_name)
    )
    print("OUTPUT WILL BE WRITTEN TO {}".format(output_file1))

    # MAPPING TO THE NON STITCHED (ORIGINAL GFF)
    # MAPPING TO THE STITCHED GFF
    bam_file_list_unique = list(bam_file_list)
    bam_file_list_unique = utils.uniquify(bam_file_list_unique)
    # prevent redundant mapping
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bam_file_list_unique)
    for bam_file in bam_file_list_unique:

        bam_file_name = os.path.basename(bam_file)

        # MAPPING TO THE STITCHED GFF
        mapped_out1_folder = os.path.join(
            mapped_folder, "{}_{}_MAPPED".format(stitched_gff_name, bam_file_name)
        )
        mapped_out1_file = os.path.join(
            mapped_folder,
            "{}_{}_MAPPED".format(stitched_gff_name, bam_file_name),
            "matrix.txt",
        )
        if utils.check_output(mapped_out1_file, 0.2, 0.2):
            print(
                "FOUND {} MAPPING DATA FOR BAM: {}".format(
                    stitched_gff_file, mapped_out1_file
                )
            )
        else:
            cmd1 = "bamliquidator_batch --sense . -e 200 --match_bamToGFF -r {} -o {} {}".format(
                stitched_gff_file,
                mapped_out1_folder,
                bam_file,
            )
            print(cmd1)
            os.system(cmd1)
            if utils.check_output(mapped_out1_file, 0.2, 5):
                print(
                    "SUCCESSFULLY MAPPED TO {} FROM BAM: {}".format(
                        stitched_gff_file, bam_file_name
                    )
                )
            else:
                print(
                    "ERROR: FAILED TO MAP {} FROM BAM: {}".format(
                        stitched_gff_file, bam_file_name
                    )
                )
                sys.exit()

    print("BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS")
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    rose2_utils.map_collection(
        stitched_collection,
        reference_collection,
        bam_file_list,
        mapped_folder,
        output_file1,
        ref_name=stitched_gff_name,
    )

    print("CALLING AND PLOTTING SUPER-ENHANCERS")
    if args.control:
        control_name = os.path.basename(args.control)
    else:
        control_name = "NONE"

    cmd = "Rscript {} {} {} {} {}".format(
        os.path.join(ROOT_DIR, "scripts", "ROSE2_callSuper.R"),
        out_folder + "/",  # TODO: fix R script so it does not require '/'
        output_file1,
        input_name,
        control_name,
    )
    print(cmd)
    os.system(cmd)

    # calling the gene mapper
    time.sleep(20)
    super_table_file = "{}_SuperEnhancers.table.txt".format(input_name)
    if args.control:
        cmd = "ROSE2_geneMapper -g {} -r {} -c {} -i {}".format(
            genome,
            args.rankby,
            args.control,
            os.path.join(out_folder, super_table_file),
        )
    else:
        cmd = "ROSE2_geneMapper -g {} -r {} -i {}".format(
            genome, args.rankby, os.path.join(out_folder, super_table_file)
        )
    os.system(cmd)

    stretch_table_file = "{}_StretchEnhancers.table.txt".format(input_name)
    if args.control:
        cmd = "ROSE2_geneMapper -g {} -r {} -c {} -i {}".format(
            genome,
            args.rankby,
            args.control,
            os.path.join(out_folder, stretch_table_file),
        )
    else:
        cmd = "ROSE2_geneMapper -g {} -r {} -i {}".format(
            genome, args.rankby, os.path.join(out_folder, stretch_table_file)
        )
    os.system(cmd)

    superstretch_table_file = "{}_SuperStretchEnhancers.table.txt".format(input_name)
    if args.control:
        cmd = "ROSE2_geneMapper -g {} -r {} -c {} -i {}".format(
            genome,
            args.rankby,
            args.control,
            os.path.join(out_folder, superstretch_table_file),
        )
    else:
        cmd = "ROSE2_geneMapper -g {} -r {} -i {}".format(
            genome, args.rankby, os.path.join(out_folder, superstretch_table_file)
        )
    os.system(cmd)
def tf_edge_delta_out(
    crc_folder,
    bam_list,
    analysis_name,
    edge_table_path_1,
    edge_table_path_2,
    group1_list,
    group2_list,
    output="",
):
    """Calculates changes in group out degree at each predicted motif occurrence (by subpeaks)."""
    crc_folder = utils.format_folder(crc_folder, True)
    edge_path = merge_edge_tables(
        edge_table_path_1,
        edge_table_path_2,
        os.path.join(crc_folder, "{}_EDGE_TABLE.txt".format(analysis_name)),
    )

    # make a gff of the edge table
    edge_table = utils.parse_table(edge_path, "\t")
    edge_gff = []
    for line in edge_table[1:]:
        gff_line = [
            line[2],
            "{}_{}".format(line[0], line[1]),
            "",
            line[3],
            line[4],
            "",
            ".",
            "",
            "{}_{}".format(line[0], line[1]),
        ]
        edge_gff.append(gff_line)

    edge_gff_path = os.path.join(crc_folder, "{}_EDGE_TABLE.gff".format(analysis_name))
    utils.unparse_table(edge_gff, edge_gff_path, "\t")

    # direct the output to the crc folder
    signal_path = os.path.join(
        crc_folder, "{}_EDGE_TABLE_signal.txt".format(analysis_name))

    all_group_list = group1_list + group2_list
    if not utils.check_output(signal_path, 0, 0):
        signal_table_list = pipeline_utils.map_regions(
            bam_list,
            [edge_gff_path],
            crc_folder,
            crc_folder,
            all_group_list,
            True,
            signal_path,
            extend_reads_to=100,
        )
        print(signal_table_list)
    else:
        print("Found previous signal table at {}".format(signal_path))

    # now bring in the signal table as a dictionary using the locus line as the id
    print("making log2 group1 vs group2 signal table at edges")
    signal_table = utils.parse_table(signal_path, "\t")

    # figure out columns for group1 and group2
    group1_columns = [signal_table[0].index(name) for name in group1_list]
    group2_columns = [signal_table[0].index(name) for name in group2_list]
    group1_signal_vector = []
    group2_signal_vector = []
    for line in signal_table[1:]:
        group1_signal = numpy.mean([float(line[col]) for col in group1_columns])
        group2_signal = numpy.mean([float(line[col]) for col in group2_columns])

        group1_signal_vector.append(group1_signal)
        group2_signal_vector.append(group2_signal)

    group1_median = numpy.median(group1_signal_vector)
    group2_median = numpy.median(group2_signal_vector)

    print("group1 median signal")
    print(group1_median)
    print("group2 median signal")
    print(group2_median)

    # now that we have the medians, keep edges where at least one group's signal
    # is above its median and both are above zero, and generate a new table w/
    # the log2 fold change
    signal_filtered_path = signal_path.replace(".txt", "_filtered.txt")
    if utils.check_output(signal_filtered_path, 0, 0):
        print("Found filtered signal table for edges at {}".format(
            signal_filtered_path))
        signal_table_filtered = utils.parse_table(signal_filtered_path, "\t")
    else:
        signal_table_filtered = [
            signal_table[0] +
            ["GROUP1_MEAN", "GROUP2_MEAN", "GROUP1_vs_GROUP2_LOG2"]
        ]
        for line in signal_table[1:]:
            group1_signal = numpy.mean([float(line[col]) for col in group1_columns])
            group2_signal = numpy.mean([float(line[col]) for col in group2_columns])

            if (group1_signal > group1_median or group2_signal > group2_median
                ) and min(group1_signal, group2_signal) > 0:
                delta = numpy.log2(group1_signal / group2_signal)
                new_line = line + [group1_signal, group2_signal, delta]
                signal_table_filtered.append(new_line)

        utils.unparse_table(signal_table_filtered, signal_filtered_path, "\t")

    # now get a list of all TFs in the system
    tf_list = utils.uniquify(
        [line[0].split("_")[0] for line in signal_table_filtered[1:]])
    tf_list.sort()

    print(tf_list)

    out_degree_table = [[
        "TF_NAME",
        "EDGE_COUNT",
        "DELTA_MEAN",
        "DELTA_MEDIAN",
        "DELTA_STD",
        "DELTA_SEM",
    ]]

    for tf_name in tf_list:
        print(tf_name)
        edge_vector = [
            float(line[-1]) for line in signal_table_filtered[1:]
            if line[0].split("_")[0] == tf_name
        ]

        edge_count = len(edge_vector)
        delta_mean = round(numpy.mean(edge_vector), 4)
        delta_median = round(numpy.median(edge_vector), 4)
        delta_std = round(numpy.std(edge_vector), 4)
        delta_sem = round(stats.sem(edge_vector), 4)
        tf_out_line = [
            tf_name,
            edge_count,
            delta_mean,
            delta_median,
            delta_std,
            delta_sem,
        ]
        out_degree_table.append(tf_out_line)

    # set final output
    if not output:
        output_path = os.path.join(
            crc_folder, "{}_EDGE_DELTA_OUT.txt".format(analysis_name))
    else:
        output_path = output

    utils.unparse_table(out_degree_table, output_path, "\t")
    print(output_path)
    return output_path
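# A toy illustration of the per-TF summary statistics assembled above; the
# edge deltas below are made up:
#
#     import numpy
#     from scipy import stats
#
#     edge_vector = [0.8, 1.2, -0.4, 0.1]  # hypothetical log2(group1/group2) values
#     numpy.mean(edge_vector)    # DELTA_MEAN   -> 0.425
#     numpy.median(edge_vector)  # DELTA_MEDIAN -> 0.45
#     numpy.std(edge_vector)     # DELTA_STD    -> ~0.618 (population std, ddof=0)
#     stats.sem(edge_vector)     # DELTA_SEM    -> ~0.3568 (scipy sem uses ddof=1)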
def main():
    """Main run method for enhancer promoter contribution tool."""
    parser = argparse.ArgumentParser()

    # required flags
    parser.add_argument(
        "-b",
        "--bam",
        dest="bam",
        nargs="*",
        help="Enter a space separated list of .bam files for the main factor",
        required=True,
    )
    parser.add_argument(
        "-i",
        "--input",
        dest="input",
        type=str,
        help="Enter .gff or .bed file of regions to analyze",
        required=True,
    )
    parser.add_argument(
        "-g",
        "--genome",
        dest="genome",
        type=str,
        help=("specify a genome, HG18,HG19,HG38,MM8,MM9,MM10,RN6 are currently "
              "supported"),
        required=True,
    )
    parser.add_argument(
        "-p",
        "--chrom-path",
        dest="chrom_path",
        type=str,
        help=("Provide path to a folder with a separate fasta file for each "
              "chromosome"),
        required=True,
    )

    # output flag
    parser.add_argument(
        "-o",
        "--output",
        dest="output",
        type=str,
        help="Enter the output folder.",
        required=True,
    )

    # additional options flags and optional arguments
    parser.add_argument(
        "-a",
        "--activity",
        dest="activity",
        type=str,
        help=("specify a table where first column represents a list of active "
              "refseq genes"),
        required=False,
    )
    parser.add_argument(
        "-c",
        "--control",
        dest="control",
        nargs="*",
        help=("Enter a space separated list of .bam files for background. If "
              "flagged, will perform background subtraction"),
        required=False,
    )
    parser.add_argument(
        "-t",
        "--tss",
        dest="tss",
        type=int,
        help="Define the TSS area +/- the TSS. Default is 1kb",
        required=False,
        default=1000,
    )
    parser.add_argument(
        "-d",
        "--distal",
        dest="distal",
        type=int,
        help="Enter a window to assign distal enhancer signal. Default is 50kb",
        required=False,
        default=50000,
    )
    parser.add_argument(
        "--other-bams",
        dest="other",
        nargs="*",
        help="enter a space separated list of other bams to map to",
        required=False,
    )
    parser.add_argument(
        "--name",
        dest="name",
        type=str,
        help=("enter a root name for the analysis, otherwise will try to find the "
              "name from the input file"),
        required=False,
    )
    parser.add_argument(
        "--top",
        dest="top",
        type=int,
        help=("Run the analysis on the top N genes by total signal. Default is 5000"),
        required=False,
        default=5000,
    )
    parser.add_argument(
        "--tads",
        dest="tads",
        type=str,
        help=("Include a .bed of tad regions to restrict enhancer/gene association"),
        required=False,
        default=None,
    )
    parser.add_argument(
        "--mask",
        dest="mask",
        default=None,
        help=("Mask a set of regions from analysis. Provide a .bed or .gff of "
              "masking regions"),
    )

    args = parser.parse_args()
    print(args)

    # =====================================================================================
    # ===============================I. PARSING ARGUMENTS=================================
    # =====================================================================================

    print("\n\n#======================================\n#===========I. DATA SUMMARY============\n#="
          "=====================================\n")
    # top analysis subset
    top = args.top

    # input genome
    genome = args.genome.upper()
    print("PERFORMING ANALYSIS ON {} GENOME BUILD".format(genome))

    # set of bams
    bam_file_list = args.bam

    # bring in the input path
    input_path = args.input

    # try to get the input name or use the name argument
    if args.name:
        analysis_name = args.name
    else:
        analysis_name = os.path.basename(input_path).split(".")[0]

    print("USING {} AS ANALYSIS NAME".format(analysis_name))
    # setting up the output folder
    parent_folder = utils.format_folder(args.output, True)
    output_folder = utils.format_folder(
        os.path.join(parent_folder, analysis_name), True)

    print("WRITING OUTPUT TO {}".format(output_folder))

    if input_path.split(".")[-1] == "bed":
        # type is bed
        print("input in bed format, converting to gff")
        input_gff = utils.bed_to_gff(input_path)
    else:
        input_gff = utils.parse_table(input_path, "\t")

    # the tss window for proximal signal assignment
    tss_window = int(args.tss)

    # the distal window for assigning nearby enhancer signal
    distal_window = int(args.distal)

    # activity path
    if args.activity:
        activity_path = args.activity
        activity_table = utils.parse_table(activity_path, "\t")
        ref_col = 0
        # try to find the column for refseq id
        for i in range(len(activity_table[2])):  # use an internal row in case of header
            if str(activity_table[1][i]).count("NM_") or str(
                    activity_table[1][i]).count("NR_"):
                ref_col = i

            # now check for header
            if not str(activity_table[0][i]).count("NM_") and not str(
                    activity_table[0][i]).count("NR_"):
                print("REMOVING HEADER FROM GENE TABLE:")
                print(activity_table[0])
                activity_table.pop(0)

        # this needs to be the REFSEQ NM ID
        gene_list = [line[ref_col] for line in activity_table]
        print("IDENTIFIED {} ACTIVE GENES".format(len(gene_list)))
    else:
        gene_list = []

    # check if tads are being invoked
    if args.tads:
        print("LOADING TAD LOCATIONS FROM {}".format(args.tads))
        tads_path = args.tads
    else:
        tads_path = ""

    print("LOADING ANNOTATION DATA FOR GENOME {}".format(genome))
    genome_dir = args.chrom_path

    # making a chrom_list of all chroms with sequence
    chrom_list = utils.uniquify(
        [name.split(".")[0] for name in os.listdir(genome_dir) if name])

    # important here to define the window
    start_dict, tss_collection, mouse_convert_dict = load_annot_file(
        genome,
        tss_window,
        gene_list,
    )

    print("FILTERING THE INPUT GFF FOR GOOD CHROMOSOMES")
    print(chrom_list)
    filtered_gff = [line for line in input_gff if chrom_list.count(line[0])]

    print("{} of INITIAL {} REGIONS ARE IN GOOD CHROMOSOMES".format(
        str(len(filtered_gff)),
        str(len(input_gff)),
    ))

    # =====================================================================================
    # ================II. IDENTIFYING TSS PROXIMAL AND DISTAL ELEMENTS====================
    # =====================================================================================

    print("\n\n#======================================\n#==II. MAPPING TO TSS/DISTAL REGIONS===\n#="
          "=====================================\n")
    # now we need to split the input region
    print("SPLITTING THE INPUT GFF USING A WINDOW OF {}".format(tss_window))
    split_gff = split_regions(filtered_gff, tss_collection, mask_file=args.mask)
    print(len(filtered_gff))
    print(len(split_gff))

    split_gff_path = os.path.join(output_folder, "{}_SPLIT.gff".format(analysis_name))
    utils.unparse_table(split_gff, split_gff_path, "\t")
    print("WRITING TSS SPLIT GFF OUT TO {}".format(split_gff_path))

    # now you have to map the bams to the gff
    print("MAPPING TO THE SPLIT GFF")
    mapped_folder = utils.format_folder(
        os.path.join(output_folder, "bam_mapping"), True)

    signal_table = map_bams(bam_file_list, split_gff_path, analysis_name,
                            mapped_folder)
    signal_table_path = os.path.join(
        output_folder, "{}_signal_table.txt".format(analysis_name))
    utils.unparse_table(signal_table, signal_table_path, "\t")

    if args.control:
        control_bam_file_list = args.control
        control_signal_table = map_bams(
            control_bam_file_list,
            split_gff_path,
            analysis_name,
            mapped_folder,
        )
        control_signal_table_path = os.path.join(
            output_folder,
            "{}_control_signal_table.txt".format(analysis_name),
        )
        utils.unparse_table(control_signal_table, control_signal_table_path, "\t")

    # now create the background subtracted summarized average table
    print("CREATING AN AVERAGE SIGNAL TABLE")
    average_table = make_average_table(
        output_folder,
        analysis_name,
        use_background=args.control,  # TODO: fix to True or False
    )
    average_table_path = os.path.join(
        output_folder, "{}_average_table.txt".format(analysis_name))
    utils.unparse_table(average_table, average_table_path, "\t")

    # now load up all of the cpg and other parameters to make the actual peak table
    # first check if this has already been done
    peak_table_path = os.path.join(output_folder,
                                   "{}_PEAK_TABLE.txt".format(analysis_name))
    if utils.check_output(peak_table_path, 0.1, 0.1):
        print("PEAK TABLE OUTPUT ALREADY EXISTS")
        peak_table = utils.parse_table(peak_table_path, "\t")
    else:
        peak_table = make_peak_table(
            param_dict,
            split_gff_path,
            average_table_path,
            start_dict,
            gene_list,
            genome_dir,
            tss_window,
            distal_window,
            tads_path,
        )
        utils.unparse_table(peak_table, peak_table_path, "\t")

    gene_table = make_gene_table(peak_table, analysis_name)

    gene_table_path = os.path.join(output_folder,
                                   "{}_GENE_TABLE.txt".format(analysis_name))
    utils.unparse_table(gene_table, gene_table_path, "\t")

    # if mouse, need to convert genes over
    if genome.count("MM") == 1:
        print("CONVERTING MOUSE NAMES TO HUMAN HOMOLOGS FOR GSEA")
        converted_gene_table_path = os.path.join(
            output_folder,
            "{}_GENE_TABLE_CONVERTED.txt".format(analysis_name),
        )

        converted_gene_table = [gene_table[0]]
        for line in gene_table[1:]:
            converted_name = mouse_convert_dict[line[0]]
            if converted_name:
                converted_gene_table.append([converted_name] + line[1:])

        utils.unparse_table(converted_gene_table, converted_gene_table_path, "\t")

        gene_table_path = converted_gene_table_path
        gene_table = converted_gene_table

    # =====================================================================================
    # ===================================III. PLOTTING ===================================
    # =====================================================================================

    print("\n\n#======================================\n#===III. PLOTTING ENHANCER/PROMOTER===\n#=="
          "====================================\n")
    # if there are fewer genes in the gene table than the top genes, only run on all
    if len(gene_table) < int(top):
        print("WARNING: ONLY {} GENES WITH SIGNAL AT EITHER PROMOTERS OR ENHANCERS. NOT ENOUGH "
              "TO RUN ANALYSIS ON TOP {}".format(str(len(gene_table) - 1), str(top)))
        top = 0

    # now call the R code
    print("CALLING R PLOTTING SCRIPTS")
    call_r_waterfall(gene_table_path, output_folder, analysis_name, top)
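# Example invocation, assuming this module is installed as an enhancerPromoter
# command line entry point (paths are hypothetical):
#
#     enhancerPromoter -b factor.bam -i regions.bed -g HG19 \
#         -p /genomes/hg19/chrom_fasta/ -o ./ep_out --name my_factor --top 5000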
def main():
    """Main run function."""
    parser = argparse.ArgumentParser()

    # required flags
    parser.add_argument(
        "-g",
        "--genome",
        dest="genome",
        required=True,
        help="Enter the genome build (HG18,HG19,MM9,RN4,RN6) for the project",
    )
    parser.add_argument(
        "-d",
        "--data",
        dest="data",
        required=True,
        help="Enter the data file for the project",
    )
    parser.add_argument(
        "-r",
        "--rose",
        dest="rose",
        required=True,
        help="Enter a comma separated list of rose folders",
    )
    parser.add_argument(
        "-o",
        "--output",
        dest="output",
        required=True,
        help="Enter the output folder for the project",
    )
    parser.add_argument(
        "-n",
        "--names",
        dest="names",
        required=True,
        help="Enter a comma separated list of names to go with the datasets",
    )

    # additional args
    parser.add_argument(
        "-p",
        "--plot",
        dest="plot",
        action="store_true",
        default=False,
        help="If flagged, will plot differential regions",
    )
    parser.add_argument(
        "-a",
        "--all",
        dest="all",
        action="store_true",
        default=False,
        help="If flagged, will run analysis for all enhancers and not just supers.",
    )
    parser.add_argument(
        "-m",
        "--median",
        dest="median",
        action="store_true",
        default=False,
        help="If flagged, will use median enhancer scaling",
    )
    parser.add_argument(
        "-e",
        "--enhancer-type",
        dest="enhancer_type",
        default="super",
        help="specify type of enhancer to analyze: super, stretch, superStretch",
    )

    args = parser.parse_args()
    print(args)

    genome = args.genome.upper()
    data_file = args.data

    rose_folder_string = args.rose
    rose_folder1, rose_folder2 = rose_folder_string.split(",")
    parent_folder = utils.format_folder(args.output, True)

    name_string = args.names
    name1, name2 = name_string.split(",")

    merge_name = "{}_{}_merged".format(name1, name2)

    # option for median scaling
    median_scale = args.median

    plot_bam = args.plot
    if args.all:
        super_only = False
    else:
        super_only = True

    if super_only and plot_bam:
        print("Running dynamic enhancer analysis on all super enhancers in {} and {} and plotting "
              "output to {}".format(name1, name2, parent_folder))
    if super_only and not plot_bam:
        print("Running dynamic enhancer analysis on all super enhancers in {} and {} and writing "
              "output to {}".format(name1, name2, parent_folder))
    if not super_only and plot_bam:
        print("Running dynamic enhancer analysis on all enhancers in {} and {} and plotting output "
              "to {}. WARNING: Plotting all differential enhancers could take a while"
              .format(name1, name2, parent_folder))
WARNING: Plotting all differential enhancers could take a while" .format(name1, name2, parent_folder)) if not super_only and not plot_bam: print( "Running dynamic enhancer analysis on all enhancers in {} and {} and writing output " "to {}.".format(name1, name2, parent_folder)) # part 1 print("PART1: analyzing ROSE output from {} and {}".format(name1, name2)) # start with the all enhancer tables from the initial rose calls rose_folder1 = utils.format_folder(rose_folder1, False) rose_folder2 = utils.format_folder(rose_folder2, False) rose_dict1 = make_rose_dict(rose_folder1) rose_dict2 = make_rose_dict(rose_folder2) # choosing the type of enhancer to analyze enhancer_call_type = args.enhancer_type.lower() if super_only: print("ANALYZING ENHANCER TYPE: {}".format(enhancer_call_type.upper())) super_file1 = rose_dict1[enhancer_call_type] super_file2 = rose_dict2[enhancer_call_type] all_file1 = rose_dict1["AllEnhancer"] all_file2 = rose_dict2["AllEnhancer"] print("\tMERGING ENHANCERS AND CALLING ROSE") if super_only: if len(super_file1) == 0: print("ERROR: UNABLE TO FIND {} FILES IN {}".format( enhancer_call_type, rose_folder1)) sys.exit() if len(super_file2) == 0: print("ERROR: UNABLE TO FIND {} FILES IN {}".format( enhancer_call_type, rose_folder2)) sys.exit() rose_output = call_merge_supers( data_file, super_file1, super_file2, name1, name2, merge_name, genome, parent_folder, ) else: rose_output = call_merge_supers( data_file, all_file1, all_file2, name1, name2, merge_name, genome, parent_folder, ) print("\tCALCULATING ENHANCER DELTA AND MAKING PLOTS") # part2 is the R script merged_gff_file = os.path.join( parent_folder, "{}_{}_MERGED_REGIONS_-0_+0.gff".format(genome, merge_name)) rcmd = call_delta_r_script( merged_gff_file, parent_folder, data_file, name1, name2, all_file1, all_file2, median_scale, ) print(rcmd) os.system(rcmd) time.sleep(30) call_rose_gene_mapper(merged_gff_file, genome, parent_folder, name1) # rank the genes # part 3 # rank the delta print("PART 3: assinging ranks to differential enhancers") print("\tASSIGNING SUPER RANK TO MERGED ENHANCERS") gff_name = "{}_{}_MERGED_REGIONS_-0_+0".format(genome, merge_name) enhancer_to_gene_file = os.path.join( parent_folder, "{}_ROSE".format(name1), "{}_0KB_STITCHED_ENHANCER_DELTA_ENHANCER_TO_GENE_100KB.txt".format( gff_name), ) if utils.check_output(enhancer_to_gene_file): rank_output = os.path.join( parent_folder, "{}_ROSE".format(name1), "{}_0KB_STITCHED_ENHANCER_DELTA_ENHANCER_TO_GENE_100KB_RANK.txt". format(gff_name), ) assign_enhancer_rank(enhancer_to_gene_file, all_file1, all_file2, name1, name2, rank_output) else: print("ERROR: DELTA SCRIPT OR ROSE GENE MAPPER FAILED TO RUN") sys.exit() # make the rank plot print("MAKING RANK PLOTS") if utils.check_output(rank_output): rcmd = call_rank_r_script(rank_output, name1, name2, super_file1, super_file2) print(rcmd) os.system(rcmd) else: print("ERROR: RANK PLOT SCRIPT FAILED TO RUN") sys.exit() time.sleep(30) print("FINISHING OUTPUT") finish_rank_output( data_file, rank_output, genome, parent_folder, merge_name, name1, name2, 1, 100000, super_only, plot_bam, )
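# Example invocation (a sketch -- the script filename "dynamic_enhancer.py" and the
# data/rose paths are hypothetical placeholders, not files from this project):
#
#     python dynamic_enhancer.py -g HG19 -d chip_data_table.txt \
#         -r rose_dmso/,rose_treat/ -o dynamic_out/ -n DMSO,TREAT -p -m
#
# -r and -n each take a comma separated pair (one entry per condition); -p turns on
# bamPlot_turbo plotting of differential regions and -m switches to median enhancer
# scaling, per the argparse flags defined above.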
def finish_rank_output(
        data_file,
        rank_output,
        genome,
        merge_folder,
        merge_name,
        name1,
        name2,
        cut_off=1.5,
        window=100000,
        super_only=True,
        plot_bam=True,
):
    """Finish rank output.

    Clean up the rank output table.
    Make a gff of all of the gained/lost supers beyond a certain cut_off w/ a window.
    Make a list of gained genes and lost genes.
    Make a bed of the gained/lost/conserved regions.
    """
    data_dict = pipeline_utils.load_data_table(data_file)

    # making sure window and cut_off are int/float
    cut_off = float(cut_off)
    window = int(window)
    genome = genome.upper()

    # make the output folder
    output_folder = utils.format_folder(os.path.join(merge_folder, "output"),
                                        True)

    # bring in the old rank table
    rank_enhancer_table = utils.parse_table(rank_output, "\t")

    # make a new formatted table
    header = rank_enhancer_table[0]
    header[-4] = "DELTA RANK"
    header[-3] = "IS_SUPER"
    formatted_rank_table = [header]

    # the gffs
    gained_gff = []
    lost_gff = []

    gained_window_gff = []
    lost_window_gff = []

    if super_only:
        enhancer_type = "SUPERS"
    else:
        enhancer_type = "ENHANCERS"

    # the beds
    if super_only:
        gained_track_header = (
            'track name="{} {} only SEs" description="{} super enhancers that are found only in '
            '{} vs {}" itemRGB=On color=255,0,0'.format(
                genome, name2, genome, name2, name1))
        gained_bed = [[gained_track_header]]
        conserved_track_header = (
            'track name="{} {} and {} SEs" description="{} super enhancers that are found in '
            'both {} and {}" itemRGB=On color=0,0,0'.format(
                genome, name1, name2, genome, name1, name2))
        conserved_bed = [[conserved_track_header]]
        lost_track_header = (
            'track name="{} {} only SEs" description="{} super enhancers that are found only in '
            '{} vs {}" itemRGB=On color=0,255,0'.format(
                genome, name1, genome, name1, name2))
        lost_bed = [[lost_track_header]]
    else:
        gained_track_header = (
            'track name="{} {} only enhancers" description="{} enhancers that are found only in '
            '{} vs {}" itemRGB=On color=255,0,0'.format(
                genome, name2, genome, name2, name1))
        gained_bed = [[gained_track_header]]
        conserved_track_header = (
            'track name="{} {} and {} enhancers" description="{} enhancers that are found in '
            'both {} and {}" itemRGB=On color=0,0,0'.format(
                genome, name1, name2, genome, name1, name2))
        conserved_bed = [[conserved_track_header]]
        lost_track_header = (
            'track name="{} {} only enhancers" description="{} enhancers that are found only in '
            '{} vs {}" itemRGB=On color=0,255,0'.format(
                genome, name1, genome, name1, name2))
        lost_bed = [[lost_track_header]]

    # the genes
    gene_table = [[
        "GENE",
        "ENHANCER_ID",
        "ENHANCER_CHROM",
        "ENHANCER_START",
        "ENHANCER_STOP",
        header[6],
        header[7],
        header[8],
        "STATUS",
    ]]

    for line in rank_enhancer_table[1:]:
        # fixing the enhancer ID
        line[0] = line[0].replace("_lociStitched", "")
        formatted_rank_table.append(line)

        # getting the genes
        gene_list = []
        gene_list += line[9].split(",")
        gene_list += line[10].split(",")
        gene_list += line[11].split(",")
        gene_list = [x for x in gene_list if len(x) > 0]
        gene_list = utils.uniquify(gene_list)
        gene_string = ",".join(gene_list)

        bed_line = [line[1], line[2], line[3], line[0], line[-4]]

        # for gained
        if float(line[6]) > cut_off:
            gff_line = [
                line[1], line[0], "",
                line[2], line[3],
                "", ".", "", gene_string,
            ]
            gff_window_line = [
                line[1], line[0], "",
                int(line[2]) - window,
                int(line[3]) + window,
                "", ".", "", gene_string,
            ]
            gained_gff.append(gff_line)
            gained_window_gff.append(gff_window_line)
            gene_status = name2
            gained_bed.append(bed_line)
        # for lost
        elif float(line[6]) < (-1 * cut_off):
            gff_line = [
                line[1], line[0], "",
                line[2], line[3],
                "", ".", "", gene_string,
            ]
            gff_window_line = [
                line[1], line[0], "",
                int(line[2]) - window,
                int(line[3]) + window,
                "", ".", "", gene_string,
            ]
            lost_gff.append(gff_line)
            lost_window_gff.append(gff_window_line)
            gene_status = name1
            lost_bed.append(bed_line)
        # for conserved
        else:
            gene_status = "CONSERVED"
            conserved_bed.append(bed_line)

        # now fill in the gene table
        for gene in gene_list:
            gene_table_line = [
                gene,
                line[0],
                line[1],
                line[2],
                line[3],
                line[6],
                line[7],
                line[8],
                gene_status,
            ]
            gene_table.append(gene_table_line)

    # concat the bed
    full_bed = gained_bed + conserved_bed + lost_bed

    # start writing the output:
    # the two gffs, the bed, the formatted table, and the gene table

    # formatted table
    formatted_filename = os.path.join(
        output_folder,
        "{}_{}_MERGED_{}_RANK_TABLE.txt".format(genome, merge_name,
                                                enhancer_type),
    )
    utils.unparse_table(formatted_rank_table, formatted_filename, "\t")

    # gffs
    gff_folder = utils.format_folder(os.path.join(output_folder, "gff"), True)
    gff_filename_gained = os.path.join(
        gff_folder,
        "{}_{}_{}_ONLY_{}_-0_+0.gff".format(genome, merge_name, name2.upper(),
                                            enhancer_type),
    )
    gff_filename_window_gained = os.path.join(
        gff_folder,
        "{}_{}_{}_ONLY_{}_-{}KB_+{}KB.gff".format(
            genome,
            merge_name,
            name2.upper(),
            enhancer_type,
            str(window // 1000),
            str(window // 1000),
        ),
    )

    gff_filename_lost = os.path.join(
        gff_folder,
        "{}_{}_{}_ONLY_{}_-0_+0.gff".format(genome, merge_name, name1.upper(),
                                            enhancer_type),
    )
    gff_filename_window_lost = os.path.join(
        gff_folder,
        "{}_{}_{}_ONLY_{}_-{}KB_+{}KB.gff".format(
            genome,
            merge_name,
            name1.upper(),
            enhancer_type,
            str(window // 1000),
            str(window // 1000),
        ),
    )

    utils.unparse_table(gained_gff, gff_filename_gained, "\t")
    utils.unparse_table(gained_window_gff, gff_filename_window_gained, "\t")
    utils.unparse_table(lost_gff, gff_filename_lost, "\t")
    utils.unparse_table(lost_window_gff, gff_filename_window_lost, "\t")

    # bed
    bed_filename = os.path.join(
        output_folder,
        "{}_{}_MERGED_{}.bed".format(genome, merge_name, enhancer_type))
    utils.unparse_table(full_bed, bed_filename, "\t")

    # gene_table
    gene_filename = os.path.join(
        output_folder,
        "{}_{}_MERGED_{}_GENE_TABLE.txt".format(genome, merge_name,
                                                enhancer_type),
    )
    utils.unparse_table(gene_table, gene_filename, "\t")

    # finally, copy the plots to the output folder
    # (assumes the ROSE folder holds exactly one delta pdf and one rank plot png)
    copyfile(
        glob.glob(os.path.join(merge_folder, "{}_ROSE".format(name1),
                               "*.pdf"))[0],
        os.path.join(
            output_folder,
            "{}_{}_MERGED_{}_DELTA.pdf".format(genome, merge_name,
                                               enhancer_type),
        ),
    )
    copyfile(
        glob.glob(os.path.join(merge_folder, "{}_ROSE".format(name1),
                               "*RANK_PLOT.png"))[0],
        os.path.join(
            output_folder,
            "{}_{}_MERGED_{}_RANK_PLOT.png".format(genome, merge_name,
                                                   enhancer_type),
        ),
    )

    # now execute the bamPlot_turbo commands
    if plot_bam:
        bam1 = data_dict[name1]["bam"]
        bam2 = data_dict[name2]["bam"]
        bam_string = "{} {}".format(bam1, bam2)
        name_string = "{} {}".format(name1, name2)
        color_string = "0,0,0:100,100,100"

        if len(gained_gff) > 0:
            # gained command
            plot_title = "{}_ONLY_SE".format(name2)
            cmd = (
                "bamPlot_turbo -g {} -b {} -i {} -o {} -n {} -c {} -t {} -r -y UNIFORM -p "
                "MULTIPLE".format(
                    genome,
                    bam_string,
                    gff_filename_gained,
                    output_folder,
                    name_string,
                    color_string,
                    plot_title,
                ))
            os.system(cmd)

            # gained window command
            plot_title = "{}_ONLY_SE_{}KB_WINDOW".format(
                name2, str(window // 1000))
            cmd = (
                "bamPlot_turbo -g {} -b {} -i {} -o {} -n {} -c {} -t {} -r -y UNIFORM -p "
                "MULTIPLE".format(
                    genome,
                    bam_string,
                    gff_filename_window_gained,
                    output_folder,
                    name_string,
                    color_string,
                    plot_title,
                ))
            os.system(cmd)

        if len(lost_gff) > 0:
            # lost command
            plot_title = "{}_ONLY_SE".format(name1)
            cmd = (
                "bamPlot_turbo -g {} -b {} -i {} -o {} -n {} -c {} -t {} -r -y UNIFORM -p "
                "MULTIPLE".format(
                    genome,
                    bam_string,
                    gff_filename_lost,
                    output_folder,
                    name_string,
                    color_string,
                    plot_title,
                ))
            os.system(cmd)

            # lost window command
            plot_title = "{}_ONLY_SE_{}KB_WINDOW".format(
                name1, str(window // 1000))
            cmd = (
                "bamPlot_turbo -g {} -b {} -i {} -o {} -n {} -c {} -t {} -r -y UNIFORM -p "
                "MULTIPLE".format(
                    genome,
                    bam_string,
                    gff_filename_window_lost,
                    output_folder,
                    name_string,
                    color_string,
                    plot_title,
                ))
            os.system(cmd)

    return
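# Entry point guard -- an assumption added here so main() runs when the script is
# invoked directly; the original call site is not shown in this excerpt:
if __name__ == "__main__":
    main()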