def make_bed_collection(bed_file_list):
    """Build one LocusCollection out of the usable rows of several bed files.

    Every file is parsed as a tab-delimited table. A row is kept only when it
    has at least three columns, its first column begins with "chr", and its
    second and third columns convert to integers; any other row is skipped
    without a message. Each kept row becomes a "." sense locus spanning the
    smaller to the larger coordinate, with the bed file's base name (up to
    the first ".") as its ID.

    Args:
        bed_file_list: Iterable of bed file paths.

    Returns:
        A utils.LocusCollection (built with a window argument of 50) holding
        the loci gathered from all files.
    """
    collected = []
    print("MAKING BED COLLECTION FOR:")
    for bed_path in bed_file_list:
        label = os.path.basename(bed_path).split(".")[0]
        print(label)
        for row in utils.parse_table(bed_path, "\t"):
            # Skip rows that are too short or do not name a chromosome.
            if len(row) < 3 or not row[0].startswith("chr"):
                continue
            try:
                first, second = int(row[1]), int(row[2])
                collected.append(
                    utils.Locus(
                        row[0], min(first, second), max(first, second), ".", label
                    )
                )
            except ValueError:
                # Non-numeric coordinates: the row is silently ignored.
                pass

        # Running total across all files processed so far.
        print("IDENTIFIED {} BED REGIONS".format(len(collected)))

    return utils.LocusCollection(collected, 50)
def merge_collections(name_dict, analysis_name, output="", super_only=True):
    """Pool the enhancers of several datasets into stitched consensus regions.

    For every dataset in ``name_dict`` the enhancer table named by its
    "enhancer_file" entry is loaded with make_se_collection. All loci are
    pooled, stitched, ordered with utils.order on their lengths
    (decreasing=True) and renamed "merged_<analysis_name>_<rank>" with ranks
    starting at 1.

    Args:
        name_dict: Mapping of dataset name to a dict that has an
            "enhancer_file" key.
        analysis_name: Label embedded in every consensus region ID.
        output: Path to write the gff table to; an empty string means the
            table is returned instead of written.
        super_only: Forwarded to make_se_collection; also selects which
            progress message is printed.

    Returns:
        The gff rows (list of lists) when ``output`` is empty, otherwise
        ``output`` after the table has been written there.
    """
    if super_only:
        count_message = "DATASET: {} HAS {} SUPERENHANCERS"
    else:
        count_message = "DATASET: {} HAS {} ENHANCERS"

    pooled_loci = []
    for dataset_name, dataset in name_dict.items():
        dataset_collection = make_se_collection(
            dataset["enhancer_file"], dataset_name, super_only
        )
        print(count_message.format(dataset_name, len(dataset_collection)))
        pooled_loci.extend(dataset_collection.get_loci())

    print(len(pooled_loci))

    # Stitch the pooled loci together into consensus regions.
    consensus_loci = (
        utils.LocusCollection(pooled_loci, 50).stitch_collection().get_loci()
    )
    print("IDENTIFIED {} CONSENSUS ENHANCER REGIONS".format(len(consensus_loci)))

    # Order by size and hand out a unique, rank-based ID.
    ranking = utils.order(
        [region.len() for region in consensus_loci], decreasing=True
    )
    ranked_loci = [consensus_loci[index] for index in ranking]
    for rank, region in enumerate(ranked_loci, start=1):
        region.id = "merged_{}_{}".format(analysis_name, rank)

    merged_gff = [
        [
            region.chr,
            region.id,
            "",
            region.start,
            region.end,
            "",
            region.sense,
            "",
            region.id,
        ]
        for region in ranked_loci
    ]

    if len(output) > 0:
        print("writing merged gff to {}".format(output))
        utils.unparse_table(merged_gff, output, "\t")
        return output

    return merged_gff
def merge_collections(super_file1, super_file2, name1, name2, output=""):
    """Stitch the enhancers of two datasets and flag the shared regions.

    Both enhancer tables are loaded with make_se_collection, their loci are
    pooled and stitched. A stitched locus that overlaps loci of both source
    collections is renamed "CONSERVED_<n>" (n counts from 1); every other
    stitched locus keeps its ID minus the first two characters.

    Args:
        super_file1: Enhancer table of the first dataset.
        super_file2: Enhancer table of the second dataset.
        name1: Name used to label loci from the first dataset.
        name2: Name used to label loci from the second dataset.
        output: Path to write the gff to; an empty string means the gff is
            returned instead of written.

    Returns:
        The gff produced by utils.locus_collection_to_gff when ``output`` is
        empty, otherwise ``output`` after the gff has been written there.
    """
    first_collection = make_se_collection(super_file1, name1)
    second_collection = make_se_collection(super_file2, name2)

    # Pool both datasets and stitch them into one set of regions.
    pooled = utils.LocusCollection(
        first_collection.get_loci() + second_collection.get_loci(), 50
    )
    stitched_loci = pooled.stitch_collection().get_loci()

    renamed_loci = []
    conserved_count = 0
    for locus in stitched_loci:
        present_in_both = (
            len(first_collection.get_overlap(locus)) > 0
            and len(second_collection.get_overlap(locus)) > 0
        )
        if present_in_both:
            conserved_count += 1
            locus.id = "CONSERVED_{}".format(conserved_count)
        else:
            # NOTE(review): dropping exactly two characters presumably removes
            # a one-digit "<count>_" stitching prefix - confirm this holds for
            # IDs with a multi-digit count.
            locus.id = locus.id[2:]
        renamed_loci.append(locus)

    # Turn the renamed loci into a gff and either return it or write it out.
    gff = utils.locus_collection_to_gff(utils.LocusCollection(renamed_loci, 50))

    if len(output) > 0:
        print("writing merged gff to {}".format(output))
        utils.unparse_table(gff, output, "\t")
        return output

    return gff
def make_se_collection(enhancer_file, name, super_only=True):
    """Load an enhancer table into a LocusCollection.

    Rows whose first field starts with "#" or "R" are skipped. Every other
    row becomes a "." sense locus built from columns 2-4 (chromosome, start,
    stop) with the ID "<name>_<first column>".

    Args:
        enhancer_file: Path to the tab-delimited enhancer table.
        name: Prefix for every locus ID.
        super_only: When true, reading stops at the first data row whose last
            column converts to the integer 0, so only the rows before it are
            kept.

    Returns:
        A utils.LocusCollection (window argument 50) of the loaded loci.
    """
    loaded = []
    for row in utils.parse_table(enhancer_file, "\t"):
        if row[0][0] in ("#", "R"):
            continue
        if super_only and int(row[-1]) == 0:
            # First row flagged 0 in its last column: stop reading here.
            break
        locus_id = "{}_{}".format(name, row[0])
        loaded.append(utils.Locus(row[1], row[2], row[3], ".", locus_id))

    return utils.LocusCollection(loaded, 50)
def load_annot_file(genome, tss_window, gene_list=None):
    """Load the refseq annotation of a genome build.

    Creates a start dict and a TSS locus collection for a set of refseq IDs,
    plus a lookup table read from the "HMD_HumanPhenotype.rpt" annotation
    file.

    Args:
        genome: Genome build name, matched case-insensitively against the
            builds listed in ``genome_dict`` below.
        tss_window: Window size handed to utils.make_tss_locus for both of
            its window arguments.
        gene_list: Optional list of gene IDs. When it is None or empty, every
            key of the start dict is used.

    Returns:
        A tuple ``(start_dict, tss_collection, mouse_convert_dict)`` where
        ``mouse_convert_dict`` is a defaultdict(str) mapping column 5 of each
        conversion-table row to its column 1.

    Raises:
        KeyError: If ``genome`` is not one of the supported builds.
    """
    # Use None as the default rather than a shared mutable list, so that no
    # state can leak between calls. Downstream code still receives a list.
    if gene_list is None:
        gene_list = []

    annotation_folder = os.path.join(ROOT_DIR, "annotation")
    genome_dict = {
        "HG18": os.path.join(annotation_folder, "hg18_refseq.ucsc"),
        "MM9": os.path.join(annotation_folder, "mm9_refseq.ucsc"),
        "MM10": os.path.join(annotation_folder, "mm10_refseq.ucsc"),
        "HG19": os.path.join(annotation_folder, "hg19_refseq.ucsc"),
        "HG19_RIBO": os.path.join(annotation_folder, "hg19_refseq.ucsc"),
        "RN4": os.path.join(annotation_folder, "rn4_refseq.ucsc"),
        "RN6": os.path.join(annotation_folder, "rn6_refseq.ucsc"),
        "HG38": os.path.join(annotation_folder, "hg38_refseq.ucsc"),
    }

    # Making a dictionary for mouse to human conversion.
    mouse_convert_file = os.path.join(annotation_folder, "HMD_HumanPhenotype.rpt")
    mouse_convert_dict = defaultdict(str)
    for line in utils.parse_table(mouse_convert_file, "\t"):
        mouse_convert_dict[line[4]] = line[0]

    try:
        annot_file = genome_dict[genome.upper()]
    except KeyError:
        # Same exception type as before, but with an actionable message.
        raise KeyError(
            "UNSUPPORTED GENOME {}; expected one of: {}".format(
                genome, ", ".join(sorted(genome_dict))
            )
        ) from None

    start_dict = utils.make_start_dict(annot_file, gene_list)
    if not gene_list:
        gene_list = [*start_dict]

    tss_loci = [
        utils.make_tss_locus(gene, start_dict, tss_window, tss_window)
        for gene in gene_list
    ]
    tss_collection = utils.LocusCollection(tss_loci, 50)

    return start_dict, tss_collection, mouse_convert_dict
def make_se_collection(enhancer_file, name, top=0):
    """Load the leading rows of an enhancer table into a LocusCollection.

    Rows whose first field starts with "#" or "R" are skipped and do not
    count towards ``top``. Every other row becomes a "." sense locus built
    from columns 2-4 with the ID "<name>_<first column>".

    Args:
        enhancer_file: Path to the tab-delimited enhancer table.
        name: Prefix for every locus ID.
        top: Number of data rows to load. With the default of 0 the limit is
            never reached, so all data rows are loaded.

    Returns:
        A utils.LocusCollection (window argument 50) of the loaded loci.
    """
    loaded = []
    for row in utils.parse_table(enhancer_file, "\t"):
        if row[0][0] in ("#", "R"):
            continue
        locus_id = "{}_{}".format(name, row[0])
        loaded.append(utils.Locus(row[1], row[2], row[3], ".", locus_id))
        # One locus is appended per data row, so the list length is the
        # number of data rows consumed so far.
        if len(loaded) == top:
            break

    return utils.LocusCollection(loaded, 50)
def map_collection(
    stitched_collection,
    reference_collection,
    bam_file_list,
    mapped_folder,
    output,
    ref_name,
):
    """Write a table of per-bam signal for every stitched locus.

    The table has one row per stitched locus that is not on chrY, in the
    order returned by utils.order on the locus lengths (decreasing=True).
    Each row holds the locus ID, chromosome, start, stop, the number of loci
    stitched together (the integer before the first "_" of the ID, or 1 when
    that prefix is not an integer) and the summed length of the reference
    loci overlapping it. One signal column is then appended per entry of
    ``bam_file_list``.

    Args:
        stitched_collection: Collection of stitched loci to tabulate.
        reference_collection: Collection of the original (constituent) loci.
        bam_file_list: Bam paths; each adds one column, read from
            ``<mapped_folder>/<ref_name>_<bam basename>_MAPPED/matrix.txt``.
        mapped_folder: Folder holding the per-bam mapping output.
        output: Path the finished table is written to (tab-delimited).
        ref_name: Name of the mapped region set, used to locate matrix.txt.

    Returns:
        None. The table is written to ``output``.
    """
    print("FORMATTING TABLE")

    # Strip out any loci on chrY. A new list is built because removing items
    # from a list while iterating over it skips the element that follows each
    # removal, which let consecutive chrY loci slip through.
    loci = [
        locus for locus in stitched_collection.get_loci() if locus.chr != "chrY"
    ]

    locus_table = [
        ["REGION_ID", "CHROM", "START", "STOP", "NUM_LOCI", "CONSTITUENT_SIZE"]
    ]

    loci_len_list = [locus.len() for locus in loci]
    len_order = utils.order(loci_len_list, decreasing=True)

    for ticker, i in enumerate(len_order, start=1):
        if ticker % 1000 == 0:
            print(ticker)
        locus = loci[i]

        # Size of the enriched (reference) regions within the stitched locus.
        ref_enrich_size = sum(
            ref_locus.len()
            for ref_locus in reference_collection.get_overlap(locus, "both")
        )

        try:
            stitch_count = int(locus.id.split("_")[0])
        except ValueError:
            # The ID carries no numeric prefix: count it as a single locus.
            stitch_count = 1

        coords = [int(x) for x in locus.coords()]
        locus_table.append(
            [
                locus.id,
                locus.chr,
                min(coords),
                max(coords),
                stitch_count,
                ref_enrich_size,
            ]
        )

    print("GETTING MAPPED DATA")
    print("USING A bam_file LIST:")
    print(bam_file_list)

    for bam_file in bam_file_list:
        bam_file_name = os.path.basename(bam_file)

        print("GETTING MAPPING DATA FOR {}".format(bam_file))
        # Assumes the standard naming convention for mapped region folders.
        mapped_gff_file = os.path.join(
            mapped_folder,
            "{}_{}_MAPPED".format(ref_name, bam_file_name),
            "matrix.txt",
        )
        print("OPENING {}".format(mapped_gff_file))

        mapped_gff = utils.parse_table(mapped_gff_file, "\t")

        # Missing IDs read as 0.0, so regions without signal add nothing.
        signal_dict = defaultdict(float)
        print("MAKING SIGNAL DICT FOR {}".format(bam_file))
        mapped_loci = []
        for line in mapped_gff[1:]:
            # Column 2 is parsed as "<chrom>(...):<start>-<end>".
            chrom = line[1].split("(")[0]
            span = line[1].split(":")[-1].split("-")
            start = int(span[0])
            end = int(span[1])
            mapped_loci.append(utils.Locus(chrom, start, end, ".", line[0]))
            try:
                # Column 3 is multiplied by the region width.
                signal_dict[line[0]] = float(line[2]) * (abs(end - start))
            except ValueError:
                print("WARNING NO SIGNAL FOR LINE:")
                print(line)

        mapped_collection = utils.LocusCollection(mapped_loci, 500)
        locus_table[0].append(bam_file_name)

        for row in locus_table[1:]:
            row_locus = utils.Locus(row[1], row[2], row[3], ".")
            overlapping_regions = mapped_collection.get_overlap(
                row_locus, sense="both"
            )
            signal = 0.0
            for region in overlapping_regions:
                signal += signal_dict[region.id]
            row.append(signal)

    utils.unparse_table(locus_table, output, "\t")
def region_stitching(
    reference_collection,
    name,
    out_folder,
    stitch_window,
    tss_window,
    annot_file,
    remove_tss=True,
):
    """Stitch nearby bound regions together, optionally excluding TSSs.

    With ``remove_tss`` set, loci contained by a +/- ``tss_window`` TSS locus
    are first removed from ``reference_collection`` (in place). The remaining
    loci are stitched with ``stitch_window``; when that is the empty string
    the window is obtained from optimize_stitching, run on a deep copy of the
    collection. With ``remove_tss`` set, any stitched locus that overlaps the
    +/- 50 TSS loci of more than two distinct gene names is then replaced by
    the original loci it overlaps.

    Args:
        reference_collection: Collection of bound regions; modified in place
            when ``remove_tss`` is true.
        name: Analysis name, forwarded to optimize_stitching.
        out_folder: Output folder, forwarded to optimize_stitching.
        stitch_window: Stitching distance, or "" to have it determined.
        tss_window: Size of the TSS exclusion window.
        annot_file: Annotation file used to build the start dict.
        remove_tss: Whether to apply the two TSS-based filters.

    Returns:
        A tuple ``(collection, debug_output, stitch_window)``: the resulting
        locus collection, a list of ``[locus string, locus id, reason]`` rows
        for every locus dropped or replaced, and the stitch window used.
    """
    print("PERFORMING REGION STITCHING")
    debug_output = []

    if remove_tss:
        print(
            "REMOVING TSS FROM REGIONS USING AN EXCLUSION WINDOW OF {}BP".format(
                str(tss_window)
            )
        )
        start_dict = utils.make_start_dict(annot_file)

        # One locus of +/- tss_window around the TSS of every gene.
        exclusion_collection = utils.LocusCollection(
            [
                utils.make_tss_locus(gene_id, start_dict, tss_window, tss_window)
                for gene_id in list(start_dict)
            ],
            50,
        )

        # Drop every bound region that lies inside an exclusion zone. The
        # loci are copied into a list first because the collection is
        # modified during the loop.
        contained_count = 0
        for bound_locus in list(reference_collection.get_loci()):
            if len(exclusion_collection.get_containers(bound_locus, "both")) > 0:
                reference_collection.remove(bound_locus)
                debug_output.append([str(bound_locus), bound_locus.id, "CONTAINED"])
                contained_count += 1

        print(
            "REMOVED {} LOCI BECAUSE THEY WERE CONTAINED BY A TSS".format(
                contained_count
            )
        )

    if stitch_window == "":
        print("DETERMINING OPTIMUM STITCHING PARAMTER")
        # Optimise on a copy so the reference collection is left untouched.
        stitch_window = optimize_stitching(
            copy.deepcopy(reference_collection), name, out_folder, step_size=500
        )

    print("USING A STITCHING PARAMETER OF {}".format(stitch_window))
    stitched_collection = reference_collection.stitch_collection(
        stitch_window, "both"
    )

    if not remove_tss:
        return stitched_collection, debug_output, stitch_window

    # Narrow (+/- 50) TSS loci, used to count the genes a stitched locus spans.
    narrow_tss_collection = utils.LocusCollection(
        [
            utils.make_tss_locus(gene_id, start_dict, 50, 50)
            for gene_id in list(start_dict)
        ],
        50,
    )

    fixed_loci = []
    replaced_count = 0
    restored_count = 0
    for stitched_locus in stitched_collection.get_loci():
        spanned_tss = narrow_tss_collection.get_overlap(stitched_locus, "both")
        gene_names = utils.uniquify(
            [start_dict[tss_locus.id]["name"] for tss_locus in spanned_tss]
        )

        if len(gene_names) <= 2:
            fixed_loci.append(stitched_locus)
            continue

        # Too many genes spanned: fall back to the original constituent loci.
        original_loci = reference_collection.get_overlap(stitched_locus, "both")
        restored_count += len(original_loci)
        fixed_loci.extend(original_loci)
        debug_output.append([str(stitched_locus), stitched_locus.id, "MULTIPLE_TSS"])
        replaced_count += 1

    print(
        "REMOVED {} STITCHED LOCI BECAUSE THEY OVERLAPPED MULTIPLE TSSs".format(
            replaced_count
        )
    )
    print("ADDED BACK {} ORIGINAL LOCI".format(restored_count))

    return utils.LocusCollection(fixed_loci, 50), debug_output, stitch_window
def main():
    """Run the multi-sample ROSE2 analysis from the command line.

    Steps, in order: parse the arguments; convert/copy every input region
    file into the gff folder; build the bam list (rankby bams followed by
    control bams); write a master gff with uniform region IDs; filter it by
    the chromosomes found in the bams; optionally mask regions; stitch the
    regions; map every unique bam to the stitched gff with
    bamliquidator_batch; build the region signal table; collapse it to a
    merged signal; call the super-enhancer R script; and run
    ROSE2_geneMapper on the three resulting enhancer tables.

    The process exits with status 1 when the inputs are inconsistent, the
    genome is unsupported, the mask has an unknown extension or a bam fails
    to map.
    """
    debug = False

    parser = argparse.ArgumentParser()

    # required flags
    parser.add_argument(
        "-i",
        "--i",
        dest="input",
        required=True,
        help=(
            "Enter a comma separated list of .gff or .bed file of binding sites used to make "
            "enhancers"
        ),
    )
    parser.add_argument(
        "-r",
        "--rankby",
        dest="rankby",
        required=True,
        help="Enter a comma separated list of bams to rank by",
    )
    parser.add_argument(
        "-o", "--out", dest="out", required=True, help="Enter an output folder"
    )
    parser.add_argument(
        "-g",
        "--genome",
        dest="genome",
        required=True,
        help="Enter the genome build (MM9,MM8,HG18,HG19)",
    )

    # optional flags
    parser.add_argument(
        "-n",
        "--name",
        dest="name",
        required=False,
        help="Provide a name for the analysis otherwise ROSE will guess",
    )
    parser.add_argument(
        "-c",
        "--control",
        dest="control",
        required=False,
        help=(
            "Enter a comma separated list of control bams. Can either provide a single control "
            "bam for all rankby bams, or provide a control bam for each individual bam"
        ),
    )
    parser.add_argument(
        "-s",
        "--stitch",
        dest="stitch",
        default="",
        help=(
            "Enter a max linking distance for stitching. Default will determine optimal stitching"
            " parameter"
        ),
    )
    parser.add_argument(
        "-t",
        "--tss",
        dest="tss",
        default=0,
        help="Enter a distance from TSS to exclude. 0 = no TSS exclusion",
    )
    parser.add_argument(
        "--mask",
        dest="mask",
        required=False,
        help="Mask a set of regions from analysis. Provide a .bed or .gff of masking regions",
    )

    # RETRIEVING FLAGS
    args = parser.parse_args()

    # making the out folder if it doesn't exist
    out_folder = utils.format_folder(args.out, True)

    # figuring out folder schema
    gff_folder = utils.format_folder(os.path.join(out_folder, "gff"), True)
    mapped_folder = utils.format_folder(os.path.join(out_folder, "mappedGFF"), True)

    # GETTING INPUT FILE(s)
    input_list = [
        input_file for input_file in args.input.split(",") if len(input_file) > 1
    ]
    if not input_list:
        print("ERROR: NO INPUT FILES PROVIDED")
        sys.exit(1)

    # Converting all input files into GFFs and moving them into the GFF
    # folder. Each file is inspected on its own: testing the whole
    # comma-separated argument would only ever look at the last extension and
    # would hand the full list string to the converter.
    input_gf_list = []
    for input_file in input_list:
        extension = input_file.split(".")[-1]
        if extension == "bed":
            # CONVERTING A BED TO GFF
            input_gff_name = os.path.basename(input_file)[0:-4]
            input_gff_file = os.path.join(gff_folder, "{}.gff".format(input_gff_name))
            utils.bed_to_gff(input_file, input_gff_file)
        else:
            if extension != "gff":
                print(
                    "WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT"
                )
            # COPY THE INPUT GFF TO THE GFF FOLDER
            input_gff_file = input_file
            copyfile(
                input_gff_file,
                os.path.join(gff_folder, os.path.basename(input_gff_file)),
            )
        input_gf_list.append(input_gff_file)

    # GETTING THE LIST OF bam_fileS TO PROCESS
    # Either the same number of bams for rankby and control, only one
    # control, or none. The list is all rankby bams followed by control bams.
    rankby_bam_list = [bam for bam in args.rankby.split(",") if len(bam) > 0]
    if args.control:
        control_bam_list = [bam for bam in args.control.split(",") if len(bam) > 0]
        if len(control_bam_list) == len(rankby_bam_list):
            # case where an equal number of backgrounds are given
            bam_file_list = rankby_bam_list + control_bam_list
        elif len(control_bam_list) == 1:
            # case where a universal background is applied
            bam_file_list = rankby_bam_list + control_bam_list * len(rankby_bam_list)
        else:
            print(
                "ERROR: EITHER PROVIDE A SINGLE CONTROL BAM FOR ALL SAMPLES, OR ONE CONTROL BAM"
                " FOR EACH SAMPLE"
            )
            sys.exit(1)
    else:
        bam_file_list = rankby_bam_list

    # Stitch parameter: "" asks region_stitching to determine it.
    stitch_window = "" if args.stitch == "" else int(args.stitch)

    # tss args
    tss_window = int(args.tss)
    remove_tss = tss_window != 0

    # GETTING THE GENOME
    genome = args.genome.upper()
    print("USING {} AS THE GENOME".format(genome))

    # GETTING THE CORRECT ANNOT FILE
    try:
        annot_file = rose2_utils.genome_dict[genome]
    except KeyError:
        print("ERROR: UNSUPPORTED GENOMES TYPE {}".format(genome))
        sys.exit(1)

    # FINDING THE ANALYSIS NAME
    if args.name:
        input_name = args.name
    else:
        input_name = os.path.basename(input_gf_list[0]).split(".")[0]
    print("USING {} AS THE ANALYSIS NAME".format(input_name))

    print("FORMATTING INPUT REGIONS")
    # MAKING THE RAW INPUT FILE FROM THE INPUT GFFs
    # use a simpler unique region naming system
    if len(input_gf_list) == 1:
        input_gff = utils.parse_table(input_gf_list[0], "\t")
    else:
        input_loci = []
        for gff_file in input_gf_list:
            print("\tprocessing {}".format(gff_file))
            gff = utils.parse_table(gff_file, "\t")
            gff_collection = utils.gff_to_locus_collection(gff, 50)
            input_loci += gff_collection.get_loci()

        # stitches to produce unique regions
        input_collection = utils.LocusCollection(input_loci, 50)
        input_collection = input_collection.stitch_collection()
        input_gff = utils.locus_collection_to_gff(input_collection)

    # now number things appropriately
    formatted_gff = []
    for i, line in enumerate(input_gff):
        chrom = line[0]
        coords = [int(line[3]), int(line[4])]
        sense = line[6]

        line_id = "{}_{}".format(input_name, str(i + 1))  # 1 indexing

        formatted_gff.append(
            [
                chrom,
                line_id,
                line_id,
                min(coords),
                max(coords),
                "",
                sense,
                "",
                line_id,
            ]
        )

    # name of the master input gff file
    master_gff_file = os.path.join(
        gff_folder, "{}_{}_ALL_-0_+0.gff".format(genome, input_name)
    )
    utils.unparse_table(formatted_gff, master_gff_file, "\t")

    print("USING {} AS THE INPUT GFF".format(master_gff_file))

    # GET CHROMS FOUND IN THE BAMS
    print("GETTING CHROMS IN bam_fileS")
    bam_chrom_list = rose2_utils.get_bam_chrom_list(bam_file_list)
    print("USING THE FOLLOWING CHROMS")
    print(bam_chrom_list)

    # LOADING IN THE GFF AND FILTERING BY CHROM
    print("LOADING AND FILTERING THE GFF")
    input_gff = rose2_utils.filter_gff(master_gff_file, bam_chrom_list)

    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print("LOADING IN GFF REGIONS")
    reference_collection = utils.gff_to_locus_collection(input_gff)

    print("CHECKING REFERENCE COLLECTION:")
    rose2_utils.check_ref_collection(reference_collection)

    # MASKING REFERENCE COLLECTION
    if args.mask:
        mask_file = args.mask
        mask_extension = mask_file.split(".")[-1].upper()

        if mask_extension == "BED":
            # NOTE(review): the input files above are converted with
            # utils.bed_to_gff; confirm utils really exposes bedToGFF.
            mask_gff = utils.bedToGFF(mask_file)
        elif mask_extension == "GFF":
            mask_gff = utils.parse_table(mask_file, "\t")
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit(1)
        mask_collection = utils.gff_to_locus_collection(mask_gff)

        # now mask the reference loci
        reference_loci = reference_collection.get_loci()
        filtered_loci = [
            locus
            for locus in reference_loci
            if len(mask_collection.get_overlap(locus, "both")) == 0
        ]
        print(
            "FILTERED OUT {} LOCI THAT WERE MASKED IN {}".format(
                len(reference_loci) - len(filtered_loci), mask_file
            )
        )
        reference_collection = utils.LocusCollection(filtered_loci, 50)

    # NOW STITCH REGIONS
    print("STITCHING REGIONS TOGETHER")
    stitched_collection, debug_output, stitch_window = rose2_utils.region_stitching(
        reference_collection,
        input_name,
        out_folder,
        stitch_window,
        tss_window,
        annot_file,
        remove_tss,
    )

    # NOW MAKE A STITCHED COLLECTION GFF
    print("MAKING GFF FROM STITCHED COLLECTION")
    stitched_gff = utils.locus_collection_to_gff(stitched_collection)

    print(stitch_window)
    print(type(stitch_window))

    # The gff, its debug file and the mapped folders all share one base name.
    if remove_tss:
        stitched_gff_name = "{}_{}KB_STITCHED_TSS_DISTAL".format(
            input_name, str(stitch_window // 1000)
        )
    else:
        stitched_gff_name = "{}_{}KB_STITCHED".format(
            input_name, str(stitch_window // 1000)
        )
    stitched_gff_file = os.path.join(gff_folder, "{}.gff".format(stitched_gff_name))
    debug_out_file = os.path.join(gff_folder, "{}.debug".format(stitched_gff_name))

    # WRITING DEBUG OUTPUT TO DISK
    if debug:
        print("WRITING DEBUG OUTPUT TO DISK AS {}".format(debug_out_file))
        utils.unparse_table(debug_output, debug_out_file, "\t")

    # WRITE THE GFF TO DISK
    print("WRITING STITCHED GFF TO DISK AS {}".format(stitched_gff_file))
    utils.unparse_table(stitched_gff, stitched_gff_file, "\t")

    # SETTING UP THE OVERALL OUTPUT FILE
    output_file1 = os.path.join(
        out_folder, "{}_ENHANCER_REGION_MAP.txt".format(stitched_gff_name)
    )
    print("OUTPUT WILL BE WRITTEN TO {}".format(output_file1))

    # MAPPING TO THE STITCHED GFF
    # A universal control appears several times in bam_file_list; map it once.
    bam_file_list_unique = utils.uniquify(list(bam_file_list))
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bam_file_list_unique)
    for bam_file in bam_file_list_unique:
        bam_file_name = os.path.basename(bam_file)

        mapped_out1_folder = os.path.join(
            mapped_folder, "{}_{}_MAPPED".format(stitched_gff_name, bam_file_name)
        )
        mapped_out1_file = os.path.join(mapped_out1_folder, "matrix.txt")

        if utils.check_output(mapped_out1_file, 0.2, 0.2):
            print(
                "FOUND {} MAPPING DATA FOR BAM: {}".format(
                    stitched_gff_file, mapped_out1_file
                )
            )
            continue

        cmd1 = "bamliquidator_batch --sense . -e 200 --match_bamToGFF -r {} -o {} {}".format(
            stitched_gff_file, mapped_out1_folder, bam_file,
        )
        print(cmd1)
        os.system(cmd1)

        if utils.check_output(mapped_out1_file, 0.2, 5):
            print(
                "SUCCESSFULLY MAPPED TO {} FROM BAM: {}".format(
                    stitched_gff_file, bam_file_name
                )
            )
        else:
            print(
                "ERROR: FAILED TO MAP {} FROM BAM: {}".format(
                    stitched_gff_file, bam_file_name
                )
            )
            sys.exit(1)

    print("BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS")

    # CALCULATE DENSITY BY REGION
    # The full (non-unique) bam list is used so that the table gets one
    # column per rankby bam followed by one per control bam.
    rose2_utils.map_collection(
        stitched_collection,
        reference_collection,
        bam_file_list,
        mapped_folder,
        output_file1,
        ref_name=stitched_gff_name,
    )

    print("FINDING AVERAGE SIGNAL AMONGST BAMS")
    meta_output_file = collapse_region_map(
        output_file1, input_name + "_MERGED_SIGNAL", control_bams=args.control
    )

    # now try the merging
    print("CALLING AND PLOTTING SUPER-ENHANCERS")

    control_name = "NONE"
    cmd = "Rscript {} {} {} {} {}".format(
        os.path.join(ROOT_DIR, "scripts", "ROSE2_callSuper.R"),
        out_folder + "/",  # TODO: fix R script so it does not require '/'
        meta_output_file,
        input_name,
        control_name,
    )
    print(cmd)
    os.system(cmd)

    # calling the gene mapper
    # For now the ranking bam is not used to call top genes. All three tables
    # go through the same command; the last one used to receive the bare
    # output folder as its -i argument because the format string had two
    # placeholders for three values.
    print("CALLING GENE MAPPING")
    for table_kind in ("SuperEnhancers", "StretchEnhancers", "SuperStretchEnhancers"):
        table_file = "{}_{}.table.txt".format(input_name, table_kind)
        cmd = "ROSE2_geneMapper -g {} -i {} -f".format(
            genome, os.path.join(out_folder, table_file)
        )
        print(cmd)
        os.system(cmd)
def map_gff_line_to_annot(
    gff_line, out_folder, n_bins, gene_dict, tx_collection, sense="both", header=""
):
    """Write the gene-diagram tables for one gff region.

    Two tab-delimited files are written to ``out_folder``:
    "<label>_diagramTemp.txt" with one ``[x0, y0, x1, y1]`` rectangle per
    transcript line, transcript exon and coding exon, and
    "<label>_nameTemp.txt" with one ``[name, x, y]`` row per gene. The label
    is ``header`` when given, otherwise "<chrom>_<sense>_<start>_<end>" taken
    from the gff line. X positions are distances from the region's reference
    point (its end for "-" regions, its start otherwise) scaled by
    ``n_bins / region length``.

    Args:
        gff_line: A gff row; columns 1, 2, 4, 5 and 7 are read.
        out_folder: Folder the two tables are written to.
        n_bins: Number of bins the region is scaled to.
        gene_dict: Mapping of gene ID to gene object.
        tx_collection: Collection of transcript loci, queried for overlaps.
        sense: Accepted but not used by this function; overlaps are always
            looked up on both strands.
        header: Optional label for the output file names.

    Returns:
        None.
    """
    if header:
        gff_string = header
    else:
        gff_string = "{}_{}_{}_{}".format(
            gff_line[0], gff_line[6], gff_line[3], gff_line[4]
        )

    # Both tables start with an all-zero placeholder row.
    diagram_table = [[0, 0, 0, 0]]
    name_table = [["", 0, 0]]

    region = utils.Locus(
        gff_line[0], int(gff_line[3]), int(gff_line[4]), gff_line[6], gff_line[1],
    )

    scale_factor = n_bins / region.len()
    # Padding around each transcript, used to stagger nearby genes.
    plot_buffer = int(region.len() / n_bins * 20)

    gene_ids = [hit.id for hit in tx_collection.get_overlap(region, sense="both")]

    ref_point = int(gff_line[4]) if gff_line[6] == "-" else int(gff_line[3])

    def to_plot_units(position):
        """Convert a genomic position to a scaled distance from ref_point."""
        return abs(position - ref_point) * scale_factor

    placed_genes = utils.LocusCollection([], 500)
    for gene_id in gene_ids:
        gene = gene_dict[gene_id]

        print(gene.common_name())
        name = gene.common_name() if len(gene.common_name()) > 1 else gene_id

        # Each already-placed gene overlapping this one pushes it 4 units down.
        offset = 4 * len(placed_genes.get_overlap(gene.tx_locus()))
        placed_genes.append(
            utils.make_search_locus(gene.tx_locus(), plot_buffer, plot_buffer)
        )

        # The gene name is anchored at the transcript's 5' end.
        if gene.sense() == "+":
            anchor = gene.tx_locus().start
        else:
            anchor = gene.tx_locus().end
        name_table.append([name, to_plot_units(anchor), -2 - offset])

        # A thin line across the entire transcript.
        [start, stop] = [to_plot_units(x) for x in gene.tx_locus().coords()]
        diagram_table.append([start, -0.01 - offset, stop, 0.01 - offset])

        # Medium boxes for all transcript exons.
        if gene.tx_exons():
            for tx_exon in gene.tx_exons():
                [start, stop] = [to_plot_units(x) for x in tx_exon.coords()]
                diagram_table.append([start, -0.5 - offset, stop, 0.5 - offset])

        # Tall boxes for the coding exons, if any.
        if gene.cd_exons():
            for cd_exon in gene.cd_exons():
                [start, stop] = [to_plot_units(x) for x in cd_exon.coords()]
                diagram_table.append([start, -1 - offset, stop, 1 - offset])

    diagram_path = os.path.join(out_folder, "{}_diagramTemp.txt".format(gff_string))
    utils.unparse_table(diagram_table, diagram_path, "\t")

    name_path = os.path.join(out_folder, "{}_nameTemp.txt".format(gff_string))
    utils.unparse_table(name_table, name_path, "\t")
def main():
    """Run bamplot from the command line.

    Parses the arguments, loads the regions to plot (a .bed file, a .gff
    file, or a single "chrom:sense:start-end" string), optionally stretches
    them to a minimum length, builds the plotting tables with
    make_bam_plot_tables, writes the R command returned by call_r_plot to a
    bash script in a temp folder and runs it. The temp folder is removed
    afterwards unless --save-temp is given or no plot file was produced.

    Invalid input is reported on stdout and ends the process with status 1.
    Validation uses explicit checks rather than ``assert`` so that it still
    runs under ``python -O`` - this matters most for the guard in front of
    the recursive temp-folder removal.
    """
    parser = argparse.ArgumentParser()

    # required flags
    parser.add_argument(
        "-b",
        "--bam",
        dest="bam",
        nargs="*",
        help="Enter a comma/space separated list of .bam files to be processed.",
        required=True,
    )
    parser.add_argument(
        "-i",
        "--input",
        dest="input",
        type=str,
        help="Enter .gff or genomic region e.g. chr1:+:1-1000.",
        required=True,
    )
    parser.add_argument(
        "-g",
        "--genome",
        dest="genome",
        type=str,
        help="specify a genome, HG18,HG19,MM8,MM9,MM10 are currently supported",
        required=True,
    )

    # output flag
    parser.add_argument(
        "-o",
        "--output",
        dest="output",
        type=str,
        help="Enter the output folder.",
        required=True,
    )

    # additional options
    parser.add_argument(
        "--stretch-input",
        dest="stretch_input",
        default=None,
        type=int,
        help=(
            "Stretch the input regions to a minimum length in bp, e.g. 10000 (for"
            " 10kb)"
        ),
    )
    parser.add_argument(
        "-c",
        "--color",
        dest="color",
        default=None,
        nargs="*",
        help=(
            "Enter a colon or space separated list of colors e.g. "
            "255,0,0:255,125,0, default samples the rainbow"
        ),
    )
    parser.add_argument(
        "-s",
        "--sense",
        dest="sense",
        default="both",
        help="Map to '+','-' or 'both' strands. Default maps to both.",
    )
    parser.add_argument(
        "-e",
        "--extension",
        dest="extension",
        default=200,
        help="Extends reads by n bp. Default value is 200bp",
    )
    parser.add_argument(
        "-r",
        "--rpm",
        dest="rpm",
        action="store_true",
        default=False,
        help="Normalizes density to reads per million (rpm) Default is False",
    )
    parser.add_argument(
        "-y",
        "--yScale",
        dest="y_scale",
        default="relative",
        help=(
            "Choose either relative or uniform y axis scaling. options = "
            "'relative,uniform' Default is relative scaling"
        ),
    )
    parser.add_argument(
        "-n",
        "--names",
        dest="names",
        default=None,
        nargs="*",
        help="Enter a comma or space separated list of names for your bams",
    )
    parser.add_argument(
        "-p",
        "--plot",
        dest="plot",
        default="MULTIPLE",
        help=(
            "Choose either all lines on a single plot or multiple plots. options "
            "= 'SINGLE,MULTIPLE,MERGE'"
        ),
    )
    parser.add_argument(
        "-t",
        "--title",
        dest="title",
        default="",
        help=(
            "Specify a title for the output plot(s), default will be the "
            "coordinate region"
        ),
    )
    parser.add_argument(
        "-q",
        "--skip-cache",
        dest="skip_cache",
        action="store_true",
        default=False,
        help="Toggles option to skip loading annotation cache file",
    )
    parser.add_argument(
        "--scale",
        dest="scale",
        default=None,
        nargs="*",
        help=(
            "Enter a comma or space separated list of scaling factors for your "
            "bams. Default is none"
        ),
    )
    parser.add_argument(
        "--bed",
        dest="bed",
        nargs="*",
        help="Add a comma-delimited or space-delimited list of bed files to plot",
    )
    parser.add_argument(
        "--multi-page",
        dest="multi",
        action="store_true",
        default=False,
        help="If flagged will create a new pdf for each region",
    )

    # DEBUG OPTION TO SAVE TEMP FILES
    parser.add_argument(
        "--save-temp",
        dest="save",
        action="store_true",
        default=False,
        help="If flagged will save temporary files made by bamPlot",
    )

    args = parser.parse_args()

    print(args)

    # The flags are required, but they can still be empty (e.g. "-b" with no
    # values yields an empty list).
    if not (args.bam and args.input and args.genome and args.output):
        parser.print_help()
        sys.exit(1)

    # Support a legacy mode where a ',' delimited list of files is given
    bam_file_list = args.bam
    if len(bam_file_list) == 1:
        bam_file_list = bam_file_list[0].split(",")

    # Make sure these are actually files & readable (!)
    for filename in bam_file_list:
        if not os.access(filename, os.R_OK):
            print("ERROR: UNABLE TO READ BAM FILE {}".format(filename))
            sys.exit(1)

    # bringing in any beds
    if args.bed:
        bed_file_list = args.bed
        if len(bed_file_list) == 1:
            bed_file_list = bed_file_list[0].split(",")
        print(bed_file_list)
        bed_collection = make_bed_collection(bed_file_list)
    else:
        bed_collection = utils.LocusCollection([], 50)

    # Load the input for graphing. One of:
    # - A .gff
    # - A .bed
    # - a specific input region (e.g. chr10:.:93150000-93180000)
    valid_sense_options = {"+", "-", "."}
    if os.access(args.input, os.R_OK):
        if args.input.endswith(".bed"):
            # Uniquely graph every input of this bed
            parsed_input_bed = utils.parse_table(args.input, "\t")
            gff_name = os.path.basename(args.input)  # Graph title
            gff = None
            try:
                if parsed_input_bed[0][5] in valid_sense_options:
                    # This .bed might have a sense parameter
                    gff = [
                        [e[0], "", args.input, e[1], e[2], "", e[5], "", ""]
                        for e in parsed_input_bed
                    ]
            except IndexError:
                # Fewer than six columns (or no rows): handled just below.
                pass

            if gff is None:
                print(
                    "Your bed doesn't have a valid sense parameter. Defaulting to both "
                    "strands, '.'"
                )
                # We only take chr/start/stop and ignore everything else.
                gff = [
                    [e[0], "", args.input, e[1], e[2], "", ".", "", ""]
                    for e in parsed_input_bed
                ]
        else:
            # Default to .gff, since that's the original behavior
            gff = utils.parse_table(args.input, "\t")
            gff_name = os.path.basename(args.input).split(".")[0]
    else:
        # means a coordinate line has been given e.g. chr1:+:1-100
        chrom_line = args.input.split(":")
        try:
            chrom = chrom_line[0]
            sense = chrom_line[1]
            # Raises IndexError when the range is missing and ValueError when
            # it does not split into exactly a start and an end.
            [start, end] = chrom_line[2].split("-")
        except (IndexError, ValueError):
            print("Invalid input line or inaccessible file. Try: chr1:.:1-5000")
            sys.exit(1)
        if sense not in valid_sense_options:
            print("ERROR: UNRECOGNIZED STRAND {} IN INPUT, USE '+', '-' OR '.'".format(sense))
            sys.exit(1)
        if chrom[0:3] != "chr":
            print("ERROR: UNRECOGNIZED GFF OR CHROMOSOME LINE INPUT")
            sys.exit(1)
        gff_line = [chrom, "", args.input, start, end, "", sense, "", ""]
        gff_name = "{}_{}_{}_{}".format(chrom, sense, start, end)
        gff = [gff_line]

    # Consider stretching the regions to a fixed minimum size
    if args.stretch_input:
        print(
            "Stretching inputs to a minimum of: {} bp".format(str(args.stretch_input))
        )
        min_length = args.stretch_input
        stretch_gff = []
        for e in gff:
            difference = int(e[4]) - int(e[3])
            if difference < min_length:
                # Pad both sides equally to reach the minimum length.
                pad = int((min_length - difference) / 2)
                stretch_gff.append(
                    [
                        e[0],
                        e[1],
                        e[2],
                        int(e[3]) - pad,
                        int(e[4]) + pad,
                        e[5],
                        e[6],
                        e[7],
                        e[8],
                    ]
                )
            else:
                stretch_gff.append(e)

        gff = stretch_gff

    # Sanity test the gff object: all strands must be sane
    if not all(e[6] in valid_sense_options for e in gff):
        print("ERROR: INPUT REGIONS CONTAIN AN INVALID STRAND, USE '+', '-' OR '.'")
        sys.exit(1)

    # bring in the genome
    genome = args.genome.upper()
    supported_genomes = (
        "HG18",
        "HG19",
        "HG19_RIBO",
        "HG38",
        "MM9",
        "MM10",
        "RN4",
        "RN6",
    )
    if genome not in supported_genomes:
        print(
            "ERROR: UNSUPPORTED GENOME TYPE {}. USE HG19,HG18, RN4, MM9, or MM10".format(
                genome,
            )
        )
        parser.print_help()
        sys.exit(1)

    # bring in the rest of the options

    # output
    root_folder = args.output
    try:
        os.listdir(root_folder)
    except OSError:
        print("ERROR: UNABLE TO FIND OUTPUT DIRECTORY {}".format(root_folder))
        sys.exit(1)

    # Get analysis title
    title = args.title if args.title else gff_name

    # make a temp folder
    temp_folder = os.path.join(root_folder, title)
    print("CREATING TEMP FOLDER {}".format(temp_folder))
    utils.format_folder(temp_folder, create=True)

    # colors
    if args.color:
        color_list = args.color
        if len(color_list) == 1:
            color_list = color_list[0].split(":")
        color_list = [x.split(",") for x in color_list]
        if len(color_list) < len(bam_file_list):
            print("WARNING: FEWER COLORS THAN BAMS SPECIFIED. COLORS WILL BE RECYCLED")
            # recycling the color list
            color_list += color_list * (len(bam_file_list) // len(color_list))
            color_list = color_list[: len(bam_file_list)]
    else:
        # cycles through the colors of the rainbow
        color_list = taste_the_rainbow(len(bam_file_list))

    # sense
    sense = args.sense

    extension = int(args.extension)

    rpm = args.rpm

    scale = args.scale
    if scale:
        if len(scale) == 1:
            scale = scale[0].split(",")

    y_scale = args.y_scale.upper()

    # names
    if args.names:
        names = args.names
        if len(names) == 1:
            names = names[0].split(",")

        if len(names) != len(bam_file_list):
            print("ERROR: NUMBER OF NAMES AND NUMBER OF BAMS DO NOT CORRESPOND")
            parser.print_help()
            sys.exit(1)
    else:
        names = [os.path.basename(x) for x in bam_file_list]

    # plot style
    plot_style = args.plot.upper()
    if plot_style not in ("SINGLE", "MULTIPLE", "MERGE"):
        print("ERROR: PLOT STYLE {} NOT AN OPTION".format(plot_style))
        parser.print_help()
        sys.exit(1)

    # now run!
    # NOTE(review): n_bins is not assigned anywhere in this function; it is
    # presumably a module-level constant - confirm.
    summary_table_file_name = make_bam_plot_tables(
        gff,
        genome,
        bam_file_list,
        color_list,
        n_bins,
        sense,
        extension,
        rpm,
        temp_folder,
        names,
        title,
        bed_collection,
        scale,
    )
    print("{} is the summary table".format(summary_table_file_name))

    # running the R command to plot
    multi = args.multi
    out_file = os.path.join(root_folder, "{}_plots.pdf".format(title))
    r_cmd = call_r_plot(summary_table_file_name, out_file, y_scale, plot_style, multi)

    # open a bash file
    bash_file_name = os.path.join(temp_folder, "{}_Rcmd.sh".format(title))
    with open(bash_file_name, "w") as bash_file:
        bash_file.write("#!/usr/bin/bash\n")
        bash_file.write(r_cmd)
    print("Wrote R command to {}".format(bash_file_name))
    os.system("bash {}".format(bash_file_name))

    # delete temp files
    if not args.save:
        if utils.check_output(out_file, 1, 10):
            # Recursive removal is dangerous, so refuse obviously wrong
            # targets. These are real checks, not asserts, so they cannot be
            # optimised away.
            if " " in temp_folder or temp_folder == "/":
                print(
                    "ERROR: REFUSING TO REMOVE SUSPICIOUS TEMP FOLDER {}".format(
                        temp_folder
                    )
                )
                sys.exit(1)
            shutil.rmtree(temp_folder)
            print("Removing temp folder: {}".format(temp_folder))
        else:
            print("ERROR: NO OUTPUT FILE {} DETECTED".format(out_file))
def main():
    """Run the ROSE2 enhancer stitching / ranking workflow from the command line.

    The function parses the command-line flags and then, in order:

    1. creates the output folder plus its ``gff`` and ``mapped_gff`` sub-folders;
    2. converts a ``.bed`` input to GFF, or copies a GFF input into the ``gff``
       folder;
    3. filters the input regions to the chromosomes found in the bams and,
       when ``--mask`` is given, drops every region overlapping a mask region;
    4. stitches the regions with ``rose2_utils.region_stitching`` and writes
       the stitched GFF to disk;
    5. maps every unique bam to the stitched GFF with ``bamliquidator_batch``
       (skipped when a ``matrix.txt`` output is already present);
    6. builds the region map with ``rose2_utils.map_collection``;
    7. calls the ``ROSE2_callSuper.R`` script and then ``ROSE2_geneMapper`` on
       the super, stretch and super-stretch enhancer tables.

    The process exits with status 1 when the mask file has an unsupported
    extension or when a bam could not be mapped.
    """
    debug = False

    parser = argparse.ArgumentParser()
    # required flags
    parser.add_argument(
        "-i",
        "--i",
        dest="input",
        required=True,
        help="Enter a .gff or .bed file of binding sites used to make enhancers",
    )
    parser.add_argument(
        "-r",
        "--rankby",
        dest="rankby",
        required=True,
        help="bam_file to rank enhancer by",
    )
    parser.add_argument(
        "-o", "--out", dest="out", required=True, help="Enter an output folder"
    )
    parser.add_argument(
        "-g",
        "--genome",
        dest="genome",
        required=True,
        help="Enter the genome build (MM9,MM8,HG18,HG19)",
    )

    # optional flags
    parser.add_argument(
        "-b",
        "--bams",
        dest="bams",
        required=False,
        help="Enter a comma separated list of additional bam files to map to",
    )
    parser.add_argument(
        "-c",
        "--control",
        dest="control",
        required=False,
        # The previous help text was a copy of the --rankby description.
        help="Enter a control bam_file to use alongside the rankby bam_file",
    )
    parser.add_argument(
        "-s",
        "--stitch",
        dest="stitch",
        default="",
        help=(
            "Enter a max linking distance for stitching. Default will determine optimal stitching"
            " parameter"
        ),
    )
    parser.add_argument(
        "-t",
        "--tss",
        dest="tss",
        default=0,
        help="Enter a distance from TSS to exclude. 0 = no TSS exclusion",
    )
    parser.add_argument(
        "--mask",
        dest="mask",
        required=False,
        help=(
            "Mask a set of regions from analysis. "
            "Provide a .bed or .gff of masking regions"
        ),
    )

    # RETRIEVING FLAGS
    args = parser.parse_args()

    # making the out folder if it doesn't exist
    out_folder = utils.format_folder(args.out, True)

    # figuring out folder schema
    gff_folder = utils.format_folder(os.path.join(out_folder, "gff"), True)
    mapped_folder = utils.format_folder(os.path.join(out_folder, "mapped_gff"), True)

    # GETTING INPUT FILE
    input_extension = args.input.split(".")[-1]
    if input_extension == "bed":
        # CONVERTING A BED TO GFF (strip the 4-character ".bed" suffix)
        input_gff_name = os.path.basename(args.input)[0:-4]
        input_gff_file = os.path.join(gff_folder, "{}.gff".format(input_gff_name))
        utils.bed_to_gff(args.input, input_gff_file)
    else:
        if input_extension != "gff":
            print(
                "WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT"
            )
        # COPY THE INPUT GFF TO THE GFF FOLDER
        input_gff_file = args.input
        copyfile(
            input_gff_file, os.path.join(gff_folder, os.path.basename(input_gff_file))
        )

    # GETTING THE LIST OF bam_fileS TO PROCESS
    # Duplicates are kept on purpose here: the same control bam may be listed
    # repeatedly. Redundant mapping is avoided further down instead.
    bam_file_list = [args.rankby]
    if args.control:
        bam_file_list.append(args.control)
    if args.bams:
        bam_file_list += args.bams.split(",")

    # optional args
    # Stitch parameter ("" is handed to region_stitching when no value is given)
    stitch_window = "" if args.stitch == "" else int(args.stitch)

    # tss args
    tss_window = int(args.tss)
    remove_tss = tss_window != 0

    # GETTING THE BOUND REGION FILE USED TO DEFINE ENHANCERS
    print("USING {} AS THE INPUT GFF".format(input_gff_file))
    input_name = os.path.basename(input_gff_file).split(".")[0]

    # GETTING THE GENOME
    genome = args.genome
    print("USING {} AS THE GENOME".format(genome))
    annot_file = rose2_utils.genome_dict[genome.upper()]

    # GET CHROMS FOUND IN THE BAMS
    print("GETTING CHROMS IN bam_fileS")
    bam_chrom_list = rose2_utils.get_bam_chrom_list(bam_file_list)
    print("USING THE FOLLOWING CHROMS")
    print(bam_chrom_list)

    # LOADING IN THE GFF AND FILTERING BY CHROM
    print("LOADING AND FILTERING THE GFF")
    input_gff = rose2_utils.filter_gff(input_gff_file, bam_chrom_list)

    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print("LOADING IN GFF REGIONS")
    reference_collection = utils.gff_to_locus_collection(input_gff)
    print("STARTING WITH {} INPUT REGIONS".format(len(reference_collection)))
    print("CHECKING REFERENCE COLLECTION:")
    rose2_utils.check_ref_collection(reference_collection)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if args.mask:
        mask_file = args.mask
        print("USING MASK FILE {}".format(mask_file))
        mask_extension = mask_file.split(".")[-1].upper()
        if mask_extension == "BED":
            mask_gff = utils.bed_to_gff(mask_file)
        elif mask_extension == "GFF":
            mask_gff = utils.parse_table(mask_file, "\t")
        else:
            # Stop here: continuing would fail on the undefined mask table.
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit(1)

        mask_collection = utils.gff_to_locus_collection(mask_gff)
        print("LOADING {} MASK REGIONS".format(str(len(mask_collection))))

        # now mask the reference loci
        reference_loci = reference_collection.get_loci()
        filtered_loci = [
            locus
            for locus in reference_loci
            if len(mask_collection.get_overlap(locus, "both")) == 0
        ]
        print(
            "FILTERED OUT {} LOCI THAT WERE MASKED IN {}".format(
                str(len(reference_loci) - len(filtered_loci)), mask_file
            )
        )
        reference_collection = utils.LocusCollection(filtered_loci, 50)

    # NOW STITCH REGIONS
    print("STITCHING REGIONS TOGETHER")
    stitched_collection, debug_output, stitch_window = rose2_utils.region_stitching(
        reference_collection,
        input_name,
        out_folder,
        stitch_window,
        tss_window,
        annot_file,
        remove_tss,
    )

    # NOW MAKE A STITCHED COLLECTION GFF
    print("MAKING GFF FROM STITCHED COLLECTION")
    stitched_gff = utils.locus_collection_to_gff(stitched_collection)

    # making sure start/stop ordering are correct
    for line in stitched_gff:
        start = int(line[3])
        stop = int(line[4])
        if start > stop:
            line[3], line[4] = stop, start

    print(stitch_window)
    print(type(stitch_window))

    # All stitched outputs share one base name; TSS exclusion adds a suffix.
    stitched_gff_name = "{}_{}KB_STITCHED".format(
        input_name, str(stitch_window // 1000)
    )
    if remove_tss:
        stitched_gff_name += "_TSS_DISTAL"
    stitched_gff_file = os.path.join(gff_folder, "{}.gff".format(stitched_gff_name))
    debug_out_file = os.path.join(gff_folder, "{}.debug".format(stitched_gff_name))

    # WRITING DEBUG OUTPUT TO DISK
    if debug:
        print("WRITING DEBUG OUTPUT TO DISK AS {}".format(debug_out_file))
        utils.unparse_table(debug_output, debug_out_file, "\t")

    # WRITE THE GFF TO DISK
    print("WRITING STITCHED GFF TO DISK AS {}".format(stitched_gff_file))
    utils.unparse_table(stitched_gff, stitched_gff_file, "\t")

    # SETTING UP THE OVERALL OUTPUT FILE
    output_file1 = os.path.join(
        out_folder, "{}_ENHANCER_REGION_MAP.txt".format(stitched_gff_name)
    )
    print("OUTPUT WILL BE WRITTEN TO {}".format(output_file1))

    # MAPPING TO THE STITCHED GFF
    # prevent redundant mapping of bams that were listed more than once
    bam_file_list_unique = utils.uniquify(list(bam_file_list))
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bam_file_list_unique)
    for bam_file in bam_file_list_unique:
        bam_file_name = os.path.basename(bam_file)

        mapped_out1_folder = os.path.join(
            mapped_folder, "{}_{}_MAPPED".format(stitched_gff_name, bam_file_name)
        )
        mapped_out1_file = os.path.join(mapped_out1_folder, "matrix.txt")
        if utils.check_output(mapped_out1_file, 0.2, 0.2):
            print(
                "FOUND {} MAPPING DATA FOR BAM: {}".format(
                    stitched_gff_file, mapped_out1_file
                )
            )
            continue

        cmd1 = _shell_join(
            [
                "bamliquidator_batch",
                "--sense",
                ".",
                "-e",
                "200",
                "--match_bamToGFF",
                "-r",
                stitched_gff_file,
                "-o",
                mapped_out1_folder,
                bam_file,
            ]
        )
        print(cmd1)
        os.system(cmd1)
        if utils.check_output(mapped_out1_file, 0.2, 5):
            print(
                "SUCCESSFULLY MAPPED TO {} FROM BAM: {}".format(
                    stitched_gff_file, bam_file_name
                )
            )
        else:
            print(
                "ERROR: FAILED TO MAP {} FROM BAM: {}".format(
                    stitched_gff_file, bam_file_name
                )
            )
            # Non-zero status so calling scripts can detect the failure.
            sys.exit(1)

    print("BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS")
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    rose2_utils.map_collection(
        stitched_collection,
        reference_collection,
        bam_file_list,
        mapped_folder,
        output_file1,
        ref_name=stitched_gff_name,
    )

    print("CALLING AND PLOTTING SUPER-ENHANCERS")

    control_name = os.path.basename(args.control) if args.control else "NONE"
    cmd = _shell_join(
        [
            "Rscript",
            os.path.join(ROOT_DIR, "scripts", "ROSE2_callSuper.R"),
            out_folder + "/",  # TODO: fix R script so it does not require '/'
            output_file1,
            input_name,
            control_name,
        ]
    )
    print(cmd)
    os.system(cmd)

    # calling the gene mapper
    # NOTE(review): presumably this pause gives the R script's output tables
    # time to appear on disk -- confirm whether it is still needed.
    time.sleep(20)
    for table_kind in ("SuperEnhancers", "StretchEnhancers", "SuperStretchEnhancers"):
        table_file = os.path.join(
            out_folder, "{}_{}.table.txt".format(input_name, table_kind)
        )
        mapper_args = ["ROSE2_geneMapper", "-g", genome, "-r", args.rankby]
        if args.control:
            mapper_args += ["-c", args.control]
        mapper_args += ["-i", table_file]
        os.system(_shell_join(mapper_args))


def _shell_join(arguments):
    """Return a shell command line with every argument quoted for the shell."""
    # Local import: the module-level import block is outside this block.
    import shlex

    # shlex.quote leaves ordinary paths untouched and only adds quotes when
    # an argument contains whitespace or shell metacharacters.
    return " ".join(shlex.quote(str(argument)) for argument in arguments)
def split_regions(input_gff, tss_collection, mask_file=None):
    """Split GFF regions at the boundaries of the TSS loci they overlap.

    Each line of ``input_gff`` is handled as follows:

    * when ``mask_file`` is given and the region overlaps any mask region,
      the line is dropped;
    * when the region overlaps no locus in ``tss_collection``, column 7 of
      the line is set to 0 (the input line is modified in place) and the
      line is kept as is;
    * otherwise the region is cut at every start/end coordinate of the
      overlapping TSS loci. Pieces shorter than 50 are discarded; each
      remaining piece that overlaps the original region becomes a new line
      whose ID is ``<region_id>_<n>`` and whose column 7 is 1 when the piece
      overlaps one of those TSS loci and 0 otherwise.

    Args:
        input_gff: Iterable of GFF rows (lists); columns 0, 1, 3 and 4 are
            read as chromosome, region ID, start and end.
        tss_collection: Locus collection queried with ``get_overlap``.
        mask_file: Optional path to a ``.bed`` or ``.gff`` file of regions
            to exclude.

    Returns:
        A list of GFF rows.

    Raises:
        ValueError: If ``mask_file`` does not end in ``.bed`` or ``.gff``.
    """
    # create mask regions collection
    mask_collection = None
    if mask_file:
        print("USING MASK FILE {}".format(mask_file))
        mask_extension = mask_file.split(".")[-1].upper()
        if mask_extension == "BED":
            mask_gff = utils.bed_to_gff(mask_file)
        elif mask_extension == "GFF":
            mask_gff = utils.parse_table(mask_file, "\t")
        else:
            # Fail with a clear error instead of continuing with an
            # undefined mask table.
            raise ValueError(
                "MASK MUST BE A .gff or .bed FILE: {}".format(mask_file)
            )

        mask_collection = utils.gff_to_locus_collection(mask_gff)
        print("LOADING {} MASK REGIONS".format(len(mask_collection)))

    split_gff = []
    for line in input_gff:
        chrom = line[0]
        region_id = line[1]
        line_locus = utils.Locus(line[0], line[3], line[4], ".")

        # mask regions
        if mask_collection is not None and mask_collection.get_overlap(
            line_locus, "both"
        ):
            continue

        overlapping_loci = tss_collection.get_overlap(line_locus)
        if not overlapping_loci:
            # no TSS overlap: keep the region whole and flag it as non-TSS
            line[7] = 0
            split_gff.append(line)
            continue

        # case where a tss overlap
        # identify the parts of the line locus that are contained
        local_tss_collection = utils.LocusCollection(overlapping_loci, 50)

        # Copy so that extending the list can never touch the locus itself.
        overlapping_coords = list(line_locus.coords())
        for tss_locus in overlapping_loci:
            overlapping_coords += tss_locus.coords()

        overlapping_coords = utils.uniquify(overlapping_coords)
        overlapping_coords.sort()

        # add 1 to the last coordinate so that the "- 1" applied to every
        # segment stop below does not shorten the final segment
        overlapping_coords[-1] += 1

        region_ticker = 1
        # walk over consecutive coordinate pairs
        for lower, upper in zip(overlapping_coords, overlapping_coords[1:]):
            start = int(lower)
            stop = int(upper) - 1
            if (stop - start) < 50:  # this eliminates really tiny regions
                continue

            split_locus = utils.Locus(chrom, start + 1, stop, ".")
            if not line_locus.overlaps(split_locus):
                continue

            new_id = "{}_{}".format(region_id, region_ticker)
            tss_status = 1 if local_tss_collection.get_overlap(split_locus) else 0
            split_gff.append(
                [chrom, new_id, new_id, start, stop, "", ".", tss_status, new_id]
            )
            region_ticker += 1

    return split_gff
def make_peak_table(
    param_dict,
    split_gff_path,
    average_table_path,
    start_dict,
    gene_list,
    genome_directory,
    tss_window,
    distal_window,
    tads_path="",
):
    """Build the final peak table with CpG, sequence, signal, e-box and gene info.

    One row is produced per line of the split GFF, preceded by a header row.
    Row ``i`` of the GFF is paired with row ``i + 1`` of the signal table, so
    the signal table must carry one leading header row and list the regions
    in the same order as the GFF.

    Args:
        param_dict: Dict providing the CpG island bed path under ``"cpg_path"``.
        split_gff_path: Path to the tab-delimited GFF of peak regions; column
            7 is copied into the ``TSS`` column.
        average_table_path: Path to the tab-delimited signal table; values
            from the third column onward are used as the signal.
        start_dict: Dict keyed by gene ID; each entry provides ``"name"`` and
            is passed to ``utils.make_tss_locus``.
        gene_list: Gene IDs to consider; all keys of ``start_dict`` when empty.
        genome_directory: Passed to ``utils.fetch_seq`` to get region sequence.
        tss_window: Flank (up and down) of the proximal TSS loci.
        distal_window: Flank (up and down) of the distal TSS loci.
        tads_path: Optional path to TAD regions. When given, gene assignment
            for a region is restricted to genes sharing a TAD with it, unless
            no such gene exists, in which case no restriction is applied.

    Returns:
        The peak table as a list of rows, header first.
        NOTE(review): the header has a single ``SIGNAL`` column while every
        value from the signal row is appended -- confirm one signal column.
    """
    peak_table = [[
        "REGION_ID",
        "CHROM",
        "START",
        "STOP",
        "LENGTH",
        "TSS",
        "CPG",
        "CPG_FRACTION",
        "GC_FREQ",
        "SIGNAL",
        "CANON_EBOX_COUNT",
        "NON_CANON_EBOX_COUNT",
        "TOTAL_EBOX_COUNT",
        "OVERLAPPING_GENES",
        "PROXIMAL_GENES",
    ]]

    print("LOADING PEAK REGIONS")
    peak_gff = utils.parse_table(split_gff_path, "\t")

    print("LOADING BINDING DATA")
    signal_table = utils.parse_table(average_table_path, "\t")

    print("LOADING CPGS ISLANDS")
    cpg_bed = utils.parse_table(param_dict["cpg_path"], "\t")
    cpg_loci = [
        utils.Locus(line[0], line[1], line[2], ".", line[-1]) for line in cpg_bed
    ]
    cpg_collection = utils.LocusCollection(cpg_loci, 50)

    print("MAKING TSS COLLECTIONS")
    if not gene_list:
        gene_list = list(start_dict)

    # proximal (tss_window) and distal (distal_window) flanking collections
    tss_prox_loci = [
        utils.make_tss_locus(ref_id, start_dict, tss_window, tss_window)
        for ref_id in gene_list
    ]
    tss_distal_loci = [
        utils.make_tss_locus(ref_id, start_dict, distal_window, distal_window)
        for ref_id in gene_list
    ]
    tss_prox_collection = utils.LocusCollection(tss_prox_loci, 50)
    tss_distal_collection = utils.LocusCollection(tss_distal_loci, 50)

    use_tads = bool(tads_path)
    if use_tads:
        print("LOADING TADS FROM {}".format(tads_path))
        tad_collection = utils.import_bound_region(tads_path, "tad")

        # building a tad dict keyed by tad ID w/ genes in that tad provided
        tad_dict = defaultdict(list)
        for tss_locus in tss_prox_loci:
            for tad_locus in tad_collection.get_overlap(tss_locus, "both"):
                tad_dict[tad_locus.id].append(tss_locus.id)

    print("CLASSIFYING PEAKS")
    ebox_pattern = re.compile("CA..TG")  # compiled once, reused per region
    tad_assigned_count = 0
    for i, gff_line in enumerate(peak_gff):
        if not i % 1000:
            print(i)

        # getting the particulars of the region
        peak_id = gff_line[1]
        chrom = gff_line[0]
        start = int(gff_line[3])
        stop = int(gff_line[4])
        line_locus = utils.Locus(chrom, start, stop, ".", peak_id)

        # getting the mapped signal (row 0 of the signal table is skipped)
        signal_line = signal_table[i + 1]
        signal_vector = [float(x) for x in signal_line[2:]]

        # setting up the new line
        new_line = [peak_id, chrom, start, stop, line_locus.len()]

        # get the tss status from the gff itself
        new_line.append(gff_line[7])

        # check cpg status; the same overlap query also feeds the fraction
        overlapping_cpg_loci = cpg_collection.get_overlap(line_locus, "both")
        new_line.append(1 if overlapping_cpg_loci else 0)

        # now do fractional cpg overlap
        overlapping_bases = sum(
            min(locus.end, line_locus.end) - max(locus.start, line_locus.start)
            for locus in overlapping_cpg_loci
        )
        overlap_fraction = float(overlapping_bases) / line_locus.len()
        new_line.append(round(overlap_fraction, 2))

        # now get the seq
        line_seq = utils.fetch_seq(genome_directory, chrom, start, stop, True).upper()
        if not line_seq:
            print("UH OH")
            print(line_seq)
            print(gff_line)
            print(i)
            print(chrom)
            print(start)
            print(stop)
            # Fatal: report failure to the caller with a non-zero status.
            sys.exit(1)

        # "GC" plus "CG" dinucleotide occurrences per base of sequence
        gc_freq = float(line_seq.count("GC") + line_seq.count("CG")) / len(line_seq)
        new_line.append(gc_freq)

        # this is where we add the ChIP-seq signal
        new_line += signal_vector

        # e-box counts; findall returns non-overlapping matches
        ebox_match_list = ebox_pattern.findall(line_seq)
        total_count = len(ebox_match_list)
        canon_count = ebox_match_list.count("CACGTG")
        new_line += [canon_count, total_count - canon_count, total_count]

        # now find the overlapping and proximal genes
        # collect the genes that share a TAD with this region
        tad_genes = set()
        if use_tads:
            for tad_locus in tad_collection.get_overlap(line_locus, "both"):
                tad_genes.update(tad_dict[tad_locus.id])
            if tad_genes:
                tad_assigned_count += 1

        prox_hits = tss_prox_collection.get_overlap(line_locus, "both")
        distal_hits = tss_distal_collection.get_overlap(line_locus, "both")
        if tad_genes:
            # restrict to genes in the same TAD(s); with no TAD genes the
            # unrestricted overlaps are used
            prox_hits = [locus for locus in prox_hits if locus.id in tad_genes]
            distal_hits = [locus for locus in distal_hits if locus.id in tad_genes]

        # here the proximal tss locus overlaps the peak
        overlapping_genes = utils.uniquify(
            [start_dict[locus.id]["name"] for locus in prox_hits]
        )

        # here the distal tss locus overlaps the peak
        # overlap takes priority over proximal
        overlapping_gene_set = set(overlapping_genes)
        proximal_genes = utils.uniquify([
            name
            for name in (start_dict[locus.id]["name"] for locus in distal_hits)
            if name not in overlapping_gene_set
        ])

        new_line += [",".join(overlapping_genes), ",".join(proximal_genes)]
        peak_table.append(new_line)

    # The header row is not a region. The second number is the count of
    # regions for which at least one gene was found in an overlapping TAD
    # (the previous code printed the count of regions WITHOUT such genes).
    print("Out of {} regions, {} were assigned to at least 1 tad".format(
        str(len(peak_table) - 1),
        str(tad_assigned_count),
    ))
    return peak_table