def map_gff_line_to_bed(gff_line, out_folder, n_bins, bed_collection, header=""):
    """Write the bed-track rectangle and label tables for one GFF region.

    Each bed locus overlapping the region becomes one diagram row
    ``[start, y_low, stop, y_high]`` whose x values are distances from the
    region's reference point multiplied by ``n_bins / region length``.  Loci
    are grouped by ``locus.id``; the sorted distinct ids each get their own
    track, 2 units below the previous one, and one row in the name table.

    Args:
        gff_line: GFF-style row; fields 0, 1, 3, 4 and 6 are read
            (presumably chrom, id, start, stop, strand -- confirm with caller).
        out_folder: Directory that receives the two output tables.
        n_bins: Number of bins the region is scaled to.
        bed_collection: Object whose ``get_overlap(locus, sense=...)`` yields
            the overlapping bed loci.
        header: Optional file-name stem; when empty it is built from
            ``gff_line`` fields 0, 6, 3 and 4.
    """
    file_stem = header if header else "{}_{}_{}_{}".format(
        gff_line[0], gff_line[6], gff_line[3], gff_line[4]
    )

    strand = gff_line[6]
    region = utils.Locus(
        gff_line[0], int(gff_line[3]), int(gff_line[4]), strand, gff_line[1],
    )
    bins_per_bp = n_bins / region.len()

    hits = bed_collection.get_overlap(region, sense="both")
    print(
        "IDENTIFIED {} OVERLAPPING BED LOCI FOR REGION {}".format(
            str(len(hits)), gff_line,
        )
    )

    # Beds come from several sources: give every source id its own track.
    track_names = utils.uniquify([hit.id for hit in hits])
    track_names.sort()
    track_offset = {track: 2 * row for row, track in enumerate(track_names)}

    # Distances are measured from the region end on the minus strand,
    # otherwise from the region start.
    anchor = int(gff_line[4]) if strand == "-" else int(gff_line[3])

    name_table = [["", 0, 0]]
    name_table.extend(
        [track, 0, 0.0 - track_offset[track]] for track in track_names
    )

    diagram_table = [[0, 0, 0, 0]]
    for hit in hits:
        shift = track_offset[hit.id]
        left, right = [abs(pos - anchor) * bins_per_bp for pos in hit.coords()]
        diagram_table.append([left, -0.5 - shift, right, 0.5 - shift])

    utils.unparse_table(
        diagram_table,
        os.path.join(out_folder, "{}_bedDiagramTemp.txt".format(file_stem)),
        "\t",
    )
    utils.unparse_table(
        name_table,
        os.path.join(out_folder, "{}_bedNameTemp.txt".format(file_stem)),
        "\t",
    )
def make_bed_collection(bed_file_list):
    """Merge several bed files into a single locus collection.

    Every accepted row becomes a locus whose ID is the bed file's base name
    up to the first ``"."``.  Rows are skipped when they have fewer than
    three fields, when the first field does not start with ``"chr"``, or
    when building the locus raises ``ValueError`` (e.g. non-integer
    coordinates).

    Args:
        bed_file_list: Iterable of bed file paths.

    Returns:
        ``utils.LocusCollection`` built from all accepted loci with a
        window argument of 50.
    """
    print("MAKING BED COLLECTION FOR:")
    collected = []
    for bed_path in bed_file_list:
        source_name = os.path.basename(bed_path).split(".")[0]
        print(source_name)
        for row in utils.parse_table(bed_path, "\t"):
            if len(row) < 3 or not row[0].startswith("chr"):
                continue
            try:
                first, second = int(row[1]), int(row[2])
                collected.append(
                    utils.Locus(
                        row[0],
                        min(first, second),
                        max(first, second),
                        ".",
                        source_name,
                    )
                )
            except ValueError:
                # Deliberate best effort: unparsable rows are ignored.
                continue

    print("IDENTIFIED {} BED REGIONS".format(str(len(collected))))
    return utils.LocusCollection(collected, 50)
def assign_enhancer_rank(enhancer_to_gene_file, enhancer_file1, enhancer_file2, name1, name2, rank_output=""):
    """Append the best overlapping enhancer rank from two tables to each row.

    The header row of the enhancer-to-gene table gains ``<name1>_rank`` and
    ``<name2>_rank`` columns.  For every other row a locus is built from
    fields 1-3 (ID from field 0) and the smallest ``"rank"`` among the
    overlapping enhancers of each table is appended.  A row that overlaps
    nothing in a table receives that collection's length, i.e. it is ranked
    dead last.

    Args:
        enhancer_to_gene_file: Path of the tab-delimited table to annotate.
        enhancer_file1: Path of the first enhancer table.
        enhancer_file2: Path of the second enhancer table.
        name1: Label for the first table (column name and ID prefix).
        name2: Label for the second table.
        rank_output: Output path; when empty the table is returned instead
            of being written.

    Returns:
        The annotated table when ``rank_output`` is empty, otherwise ``None``.
    """
    table = utils.parse_table(enhancer_to_gene_file, "\t")

    collection_a = make_se_collection(enhancer_file1, name1, False)
    collection_b = make_se_collection(enhancer_file2, name2, False)
    lookup_a = make_se_dict(enhancer_file1, name1, False)
    lookup_b = make_se_dict(enhancer_file2, name2, False)

    def best_rank(query, collection, lookup):
        # Smallest rank among the hits; collection size when there are none.
        hits = collection.get_overlap(query, "both")
        if len(hits) == 0:
            return len(collection)
        return min([lookup[hit.id]["rank"] for hit in hits])

    # The parsed table is updated in place.
    table[0] += ["{}_rank".format(name1), "{}_rank".format(name2)]
    for row in table[1:]:
        query = utils.Locus(row[1], row[2], row[3], ".", row[0])
        rank_a = best_rank(query, collection_a, lookup_a)
        rank_b = best_rank(query, collection_b, lookup_b)
        row += [rank_a, rank_b]

    if len(rank_output) == 0:
        return table
    utils.unparse_table(table, rank_output, "\t")
def make_se_collection(enhancer_file, name, super_only=True):
    """Return a locus collection built from a super-enhancer table.

    Rows whose first field starts with ``"#"`` or ``"R"`` are skipped.
    Each remaining row becomes a locus from fields 1-3 with ID
    ``"<name>_<field 0>"``.

    NOTE(review): a second ``make_se_collection`` taking ``top`` instead of
    ``super_only`` is defined later in this file; being defined later, it
    replaces this definition when the module is loaded.

    Args:
        enhancer_file: Path of the tab-delimited enhancer table.
        name: Prefix used when building locus IDs.
        super_only: When true, reading stops at the first data row whose
            last field parses to 0 (assumes the flagged rows come first --
            confirm against the table format).

    Returns:
        ``utils.LocusCollection`` of the kept loci with a window argument
        of 50.
    """
    rows = utils.parse_table(enhancer_file, "\t")
    kept = []
    for row in rows:
        leading_char = row[0][0]
        if leading_char in ("#", "R"):
            continue
        if super_only and int(row[-1]) == 0:
            break
        locus_id = "{}_{}".format(name, row[0])
        kept.append(utils.Locus(row[1], row[2], row[3], ".", locus_id))

    return utils.LocusCollection(kept, 50)
def make_se_collection(enhancer_file, name, top=0):
    """Return a locus collection from the first ``top`` rows of a super table.

    Rows whose first field starts with ``"#"`` or ``"R"`` are skipped and do
    not count towards ``top``.  Each remaining row becomes a locus from
    fields 1-3 with ID ``"<name>_<field 0>"``.

    NOTE(review): this redefines an earlier ``make_se_collection`` in this
    file whose third parameter was ``super_only``; this later definition is
    the one in effect once the module is loaded.

    Args:
        enhancer_file: Path of the tab-delimited enhancer table.
        name: Prefix used when building locus IDs.
        top: Number of data rows to keep.  With the default 0 every row is
            kept, because the running count is at least 1 when compared.

    Returns:
        ``utils.LocusCollection`` of the kept loci with a window argument
        of 50.
    """
    rows = utils.parse_table(enhancer_file, "\t")
    kept = []
    for row in rows:
        if row[0][0] in ("#", "R"):
            continue
        locus_id = "{}_{}".format(name, row[0])
        kept.append(utils.Locus(row[1], row[2], row[3], ".", locus_id))
        # One locus is appended per counted row, so the list length is the count.
        if len(kept) == top:
            break

    return utils.LocusCollection(kept, 50)
def map_collection(
    stitched_collection,
    reference_collection,
    bam_file_list,
    mapped_folder,
    output,
    ref_name,
):
    """Write a table of per-BAM signal for every stitched locus.

    The table has one row per stitched locus (chrY loci excluded), ordered
    by locus length as returned by ``utils.order(..., decreasing=True)``.
    Each row holds the locus id, chromosome, start, stop, the stitch count
    parsed from the id prefix (1 when the prefix is not an integer) and the
    summed length of overlapping reference loci.  One signal column is then
    appended per BAM file.

    Args:
        stitched_collection: Collection exposing ``get_loci()``.
        reference_collection: Collection of constituent loci, queried with
            ``get_overlap(locus, "both")``.
        bam_file_list: BAM paths; only their base names are used, to locate
            the mapped matrices and to label columns.
        mapped_folder: Folder containing
            ``<ref_name>_<bam base name>_MAPPED/matrix.txt``.
        output: Path the finished table is written to.
        ref_name: Prefix of the mapped-matrix folder names.
    """
    print("FORMATTING TABLE")

    # Strip out chrY by building a new list.  Removing items from the list
    # while iterating over it skips the element after each removal, so runs
    # of consecutive chrY loci were only partially removed.
    loci = [
        locus for locus in stitched_collection.get_loci() if locus.chr != "chrY"
    ]

    locus_table = [[
        "REGION_ID", "CHROM", "START", "STOP", "NUM_LOCI", "CONSTITUENT_SIZE"
    ]]

    loci_len_list = [locus.len() for locus in loci]
    len_order = utils.order(loci_len_list, decreasing=True)

    for ticker, i in enumerate(len_order, start=1):
        if ticker % 1000 == 0:
            print(ticker)
        locus = loci[i]

        # Size of the enriched (reference) regions within the stitched locus.
        ref_enrich_size = sum(
            ref_locus.len()
            for ref_locus in reference_collection.get_overlap(locus, "both")
        )

        try:
            stitch_count = int(locus.id.split("_")[0])
        except ValueError:
            stitch_count = 1

        coords = [int(x) for x in locus.coords()]
        locus_table.append([
            locus.id,
            locus.chr,
            min(coords),
            max(coords),
            stitch_count,
            ref_enrich_size,
        ])

    print("GETTING MAPPED DATA")
    print("USING A bam_file LIST:")
    print(bam_file_list)

    for bam_file in bam_file_list:
        bam_file_name = os.path.basename(bam_file)

        print("GETTING MAPPING DATA FOR {}".format(bam_file))
        # Assumes the standard naming convention for mapped enriched-region
        # GFFs: <ref_name>_<bam name>_MAPPED/matrix.txt
        mapped_gff_file = os.path.join(
            mapped_folder,
            "{}_{}_MAPPED".format(ref_name, bam_file_name),
            "matrix.txt",
        )
        print("OPENING {}".format(mapped_gff_file))
        mapped_gff = utils.parse_table(mapped_gff_file, "\t")

        # Missing ids read back as 0.0, so rows without signal add nothing.
        signal_dict = defaultdict(float)
        print("MAKING SIGNAL DICT FOR {}".format(bam_file))
        mapped_loci = []
        for line in mapped_gff[1:]:
            # Field 1 looks like "<chrom>(<...>):<start>-<end>".
            chrom = line[1].split("(")[0]
            span = line[1].split(":")[-1].split("-")
            start = int(span[0])
            end = int(span[1])
            mapped_loci.append(utils.Locus(chrom, start, end, ".", line[0]))
            try:
                # Density times region length gives total signal.
                signal_dict[line[0]] = float(line[2]) * (abs(end - start))
            except ValueError:
                print("WARNING NO SIGNAL FOR LINE:")
                print(line)
                continue

        mapped_collection = utils.LocusCollection(mapped_loci, 500)
        locus_table[0].append(bam_file_name)

        for row in locus_table[1:]:
            row_locus = utils.Locus(row[1], row[2], row[3], ".")
            signal = 0.0
            for region in mapped_collection.get_overlap(row_locus, sense="both"):
                signal += signal_dict[region.id]
            row.append(signal)

    utils.unparse_table(locus_table, output, "\t")
def map_gff_line_to_annot(
    gff_line, out_folder, n_bins, gene_dict, tx_collection, sense="both", header=""
):
    """Write the gene-annotation rectangle and label tables for a GFF region.

    For every transcript locus overlapping the region the diagram table
    gets a thin line across the whole transcript (+/-0.01), a box per
    transcript exon (+/-0.5) and a taller box per coding exon (+/-1).  The
    name table gets the gene label positioned at the transcript start.  A
    gene whose transcript overlaps already placed genes is shifted down by
    4 units per overlap.

    Args:
        gff_line: GFF-style row; fields 0, 1, 3, 4 and 6 are read.
        out_folder: Directory that receives the two output tables.
        n_bins: Number of bins the region is scaled to.
        gene_dict: Mapping from transcript-locus id to a gene object
            exposing ``common_name()``, ``sense()``, ``tx_locus()``,
            ``tx_exons()`` and ``cd_exons()``.
        tx_collection: Collection queried for overlapping transcripts.
        sense: Accepted for interface compatibility; the overlap query
            always uses ``"both"`` and never reads this argument.
        header: Optional file-name stem; when empty it is built from
            ``gff_line`` fields 0, 6, 3 and 4.
    """
    file_stem = header if header else "{}_{}_{}_{}".format(
        gff_line[0], gff_line[6], gff_line[3], gff_line[4]
    )

    region = utils.Locus(
        gff_line[0], int(gff_line[3]), int(gff_line[4]), gff_line[6], gff_line[1],
    )
    bins_per_bp = n_bins / region.len()
    # Plotting buffer for diagrams: the width of 20 bins.
    label_pad = int(region.len() / n_bins * 20)

    gene_ids = [hit.id for hit in tx_collection.get_overlap(region, sense="both")]

    # Distances are measured from the region end on the minus strand,
    # otherwise from the region start.
    anchor = int(gff_line[4]) if gff_line[6] == "-" else int(gff_line[3])

    def to_bins(locus):
        # Scaled distance of both locus coordinates from the anchor.
        return [abs(pos - anchor) * bins_per_bp for pos in locus.coords()]

    diagram_table = [[0, 0, 0, 0]]
    name_table = [["", 0, 0]]
    placed = utils.LocusCollection([], 500)

    for gene_id in gene_ids:
        gene = gene_dict[gene_id]
        print(gene.common_name())
        label = gene.common_name() if len(gene.common_name()) > 1 else gene_id

        # Push the gene down one lane per already placed overlapping gene.
        shift = 4 * len(placed.get_overlap(gene.tx_locus()))
        placed.append(
            utils.make_search_locus(gene.tx_locus(), label_pad, label_pad)
        )

        # Write the name of the gene at the transcript start.
        if gene.sense() == "+":
            tx_edge = gene.tx_locus().start
        else:
            tx_edge = gene.tx_locus().end
        name_table.append([label, abs(tx_edge - anchor) * bins_per_bp, -2 - shift])

        # A line across the entire transcript locus.
        left, right = to_bins(gene.tx_locus())
        diagram_table.append([left, -0.01 - shift, right, 0.01 - shift])

        # Thin boxes for all transcript exons.
        for exon in gene.tx_exons() or []:
            left, right = to_bins(exon)
            diagram_table.append([left, -0.5 - shift, right, 0.5 - shift])

        # Fat boxes for the coding exons, if any.
        for exon in gene.cd_exons() or []:
            left, right = to_bins(exon)
            diagram_table.append([left, -1 - shift, right, 1 - shift])

    utils.unparse_table(
        diagram_table,
        os.path.join(out_folder, "{}_diagramTemp.txt".format(file_stem)),
        "\t",
    )
    utils.unparse_table(
        name_table,
        os.path.join(out_folder, "{}_nameTemp.txt".format(file_stem)),
        "\t",
    )
def map_bam_to_gff_line(
    bam_file, mmr, name, gff_line, color, n_bins, sense="both", extension=200
):
    """Map reads from a BAM onto one GFF region with ``bamliquidator``.

    Args:
        bam_file: Path of the BAM file.
        mmr: Scaling denominator applied to every bin value.
        name: Label stored in the third field of the result.
        gff_line: GFF-style row; fields 0, 1, 3, 4 and 6 are read.
        color: List of color fields appended after the four leading fields.
        n_bins: Number of bins requested from ``bamliquidator``.
        sense: ``"+"`` keeps the region's strand, ``"-"`` flips it, any
            other value queries both strands (``"."``).
        extension: Read extension passed to ``bamliquidator``.

    Returns:
        ``[bam base name, region id, name, str(region)] + color`` followed
        by the per-bin values ``count / bin_size / mmr`` rounded to four
        decimals, or by ``n_bins`` ``"NA"`` entries when the region is
        shorter than ``n_bins``.

    Raises:
        FileNotFoundError: If the ``bamliquidator`` executable is not found.
    """
    print("using a MMR/scaling denominator value of {}".format(mmr))

    line = gff_line[0:9]
    gff_locus = utils.Locus(line[0], int(line[3]), int(line[4]), line[6], line[1])

    # Setting up the output cluster line.
    bam_name = os.path.basename(bam_file)
    cluster_line = [bam_name, gff_locus.id, name, str(gff_locus)] + color

    bin_size = gff_locus.len() // n_bins
    # Some regions are too short to get info on; kick these back as NA.
    if not bin_size:
        cluster_line += ["NA"] * int(n_bins)
        return cluster_line

    # Flip the strand if sense is negative ("." is mapped to "+").
    if sense == "-":
        bam_sense = gff_locus.sense.translate(str.maketrans("-+.", "+-+"))
    elif sense == "+":
        bam_sense = gff_locus.sense
    else:
        bam_sense = "."

    # Pass the arguments as a list and bypass the shell, so BAM paths with
    # spaces or shell metacharacters reach bamliquidator unchanged.
    bam_command = [
        "bamliquidator",
        str(bam_file),
        str(gff_locus.chr),
        str(gff_locus.start),
        str(gff_locus.end),
        str(bam_sense),
        str(n_bins),
        str(extension),
    ]
    get_reads = subprocess.Popen(
        bam_command,
        stdin=subprocess.PIPE,
        stderr=subprocess.PIPE,
        stdout=subprocess.PIPE,
    )
    stdout_data, _ = get_reads.communicate()
    # One value per line; the piece after the final newline is dropped.
    den_list = stdout_data.decode("utf-8").split("\n")[:-1]

    # Flip the density list if the actual GFF region is on the minus strand.
    if gff_locus.sense == "-":
        den_list = den_list[::-1]

    # Converting from total bp of read sequence per bin to rpm/bp.
    cluster_line += [round(float(x) / bin_size / mmr, 4) for x in den_list]
    return cluster_line
def split_regions(input_gff, tss_collection, mask_file=None):
    """Split GFF regions at the boundaries of overlapping TSS loci.

    A region with no overlapping TSS locus is passed through with field 7
    set to 0 (the input row itself is modified).  Otherwise the region and
    TSS coordinates are merged and sorted, and every interval of at least
    50 bp between neighbouring coordinates that still overlaps the region
    becomes a new row with id ``"<region id>_<n>"`` and field 7 set to 1
    when it overlaps a TSS locus, else 0.  Regions overlapping any mask
    locus are dropped.

    Args:
        input_gff: Iterable of GFF-style rows; fields 0, 1, 3 and 4 are
            read and field 7 may be written.
        tss_collection: Collection of TSS loci, queried with
            ``get_overlap(locus)``.
        mask_file: Optional path of a ``.bed`` or ``.gff`` file of regions
            to exclude.

    Returns:
        List of GFF-style rows.

    Raises:
        ValueError: If ``mask_file`` is neither a ``.bed`` nor a ``.gff``.
    """
    mask_collection = None
    if mask_file:
        print("USING MASK FILE {}".format(mask_file))
        mask_extension = mask_file.split(".")[-1].upper()
        if mask_extension == "BED":
            mask_gff = utils.bed_to_gff(mask_file)
        elif mask_extension == "GFF":
            mask_gff = utils.parse_table(mask_file, "\t")
        else:
            # Fail here with a clear error instead of reaching the lines
            # below with ``mask_gff`` unbound.
            raise ValueError(
                "MASK MUST BE A .gff or .bed FILE: {}".format(mask_file)
            )
        mask_collection = utils.gff_to_locus_collection(mask_gff)
        print("LOADING {} MASK REGIONS".format(len(mask_collection)))

    split_gff = []
    for line in input_gff:
        chrom = line[0]
        region_id = line[1]
        line_locus = utils.Locus(line[0], line[3], line[4], ".")

        # Drop masked regions entirely.
        if mask_collection is not None and mask_collection.get_overlap(
            line_locus, "both"
        ):
            continue

        overlapping_loci = tss_collection.get_overlap(line_locus)
        if not overlapping_loci:
            line[7] = 0
            split_gff.append(line)
            continue

        # A TSS overlaps: find the parts of the region that are contained.
        local_tss_collection = utils.LocusCollection(overlapping_loci, 50)

        # Copy first so a list owned by the locus is never extended in place.
        boundaries = list(line_locus.coords())
        for tss_locus in overlapping_loci:
            boundaries.extend(tss_locus.coords())
        boundaries = utils.uniquify(boundaries)
        boundaries.sort()
        # Add 1 to the last coordinate so the final interval, after the
        # "- 1" below, still ends on the original last coordinate.
        boundaries[-1] += 1

        region_ticker = 1
        for lower, upper in zip(boundaries, boundaries[1:]):
            start = int(lower)
            stop = int(upper) - 1
            if (stop - start) < 50:  # this eliminates really tiny regions
                continue

            split_locus = utils.Locus(chrom, start + 1, stop, ".")
            if not line_locus.overlaps(split_locus):
                continue

            new_id = "{}_{}".format(region_id, region_ticker)
            tss_status = 1 if local_tss_collection.get_overlap(split_locus) else 0
            split_gff.append([
                chrom,
                new_id,
                new_id,
                start,
                stop,
                "",
                ".",
                tss_status,
                new_id,
            ])
            region_ticker += 1

    return split_gff
def make_peak_table(
    param_dict,
    split_gff_path,
    average_table_path,
    start_dict,
    gene_list,
    genome_directory,
    tss_window,
    distal_window,
    tads_path="",
):
    """Make the final peak table with CpG, GC, signal, E-box and gene info.

    Row ``i`` of the split GFF is paired with row ``i + 1`` of the signal
    table (the signal table's first row is skipped as a header).

    Args:
        param_dict: Mapping; only ``param_dict["cpg_path"]`` is read.
        split_gff_path: Path of the split-region GFF (field 7 carries the
            TSS status).
        average_table_path: Path of the signal table; fields from index 2
            onward are parsed as floats.
        start_dict: Mapping from gene/transcript id to a record with a
            ``"name"`` key; also passed to ``utils.make_tss_locus``.
        gene_list: Ids to build TSS loci for; when empty, all keys of
            ``start_dict`` are used.
        genome_directory: Passed to ``utils.fetch_seq``.
        tss_window: Flank used for the "overlapping genes" TSS loci.
        distal_window: Flank used for the "proximal genes" TSS loci.
        tads_path: Optional path of TAD regions.  When given, a region's
            gene assignments are restricted to genes sharing a TAD with it,
            unless no such gene exists.

    Returns:
        List of rows; the first row is the header.

    Raises:
        SystemExit: With status 1 when no sequence can be fetched for a
            region.
    """
    peak_table = [[
        "REGION_ID",
        "CHROM",
        "START",
        "STOP",
        "LENGTH",
        "TSS",
        "CPG",
        "CPG_FRACTION",
        "GC_FREQ",
        "SIGNAL",
        "CANON_EBOX_COUNT",
        "NON_CANON_EBOX_COUNT",
        "TOTAL_EBOX_COUNT",
        "OVERLAPPING_GENES",
        "PROXIMAL_GENES",
    ]]

    print("LOADING PEAK REGIONS")
    peak_gff = utils.parse_table(split_gff_path, "\t")

    print("LOADING BINDING DATA")
    signal_table = utils.parse_table(average_table_path, "\t")

    print("LOADING CPGS ISLANDS")
    cpg_bed = utils.parse_table(param_dict["cpg_path"], "\t")
    cpg_loci = [
        utils.Locus(line[0], line[1], line[2], ".", line[-1]) for line in cpg_bed
    ]
    cpg_collection = utils.LocusCollection(cpg_loci, 50)

    print("MAKING TSS COLLECTIONS")
    if not gene_list:
        gene_list = [*start_dict]

    tss_prox_loci = [
        utils.make_tss_locus(ref_id, start_dict, tss_window, tss_window)
        for ref_id in gene_list
    ]
    tss_distal_loci = [
        utils.make_tss_locus(ref_id, start_dict, distal_window, distal_window)
        for ref_id in gene_list
    ]
    tss_prox_collection = utils.LocusCollection(tss_prox_loci, 50)
    tss_distal_collection = utils.LocusCollection(tss_distal_loci, 50)

    use_tads = bool(tads_path)
    if use_tads:
        print("LOADING TADS FROM {}".format(tads_path))
        tad_collection = utils.import_bound_region(tads_path, "tad")

        # TAD id -> ids of the TSS loci (genes) that fall in that TAD.
        tad_dict = defaultdict(list)
        for tss_locus in tss_prox_loci:
            for tad_locus in tad_collection.get_overlap(tss_locus, "both"):
                tad_dict[tad_locus.id].append(tss_locus.id)

    print("CLASSIFYING PEAKS")
    no_tad_count = 0
    for i, gff_line in enumerate(peak_gff):
        if not i % 1000:
            print(i)

        # Getting the particulars of the region.
        peak_id = gff_line[1]
        chrom = gff_line[0]
        start = int(gff_line[3])
        stop = int(gff_line[4])
        line_locus = utils.Locus(chrom, start, stop, ".", peak_id)

        # Getting the mapped signal (offset by one to skip the header row).
        signal_line = signal_table[i + 1]
        signal_vector = [float(x) for x in signal_line[2:]]

        new_line = [peak_id, chrom, start, stop, line_locus.len()]

        # The TSS status was written into the split GFF earlier.
        new_line.append(gff_line[7])

        # CpG status and fractional CpG overlap from a single query.
        overlapping_cpg_loci = cpg_collection.get_overlap(line_locus, "both")
        new_line.append(1 if overlapping_cpg_loci else 0)

        overlapping_bases = 0
        for locus in overlapping_cpg_loci:
            cpg_start = max(locus.start, line_locus.start)
            cpg_end = min(locus.end, line_locus.end)
            overlapping_bases += cpg_end - cpg_start
        overlap_fraction = float(overlapping_bases) / line_locus.len()
        new_line.append(round(overlap_fraction, 2))

        # Now get the sequence.
        line_seq = utils.fetch_seq(genome_directory, chrom, start, stop, True).upper()
        if not line_seq:
            print("UH OH")
            print(line_seq)
            print(gff_line)
            print(i)
            print(chrom)
            print(start)
            print(stop)
            # Fatal: exit non-zero so calling scripts see the failure.
            sys.exit(1)

        # Counts of the "GC" and "CG" dinucleotides over sequence length.
        gc_freq = float(line_seq.count("GC") + line_seq.count("CG")) / len(line_seq)
        new_line.append(gc_freq)

        # This is where the ChIP-seq signal is added.
        new_line += signal_vector

        # Non-overlapping CA..TG matches; CACGTG is the canonical E-box.
        ebox_match_list = re.findall("CA..TG", line_seq)
        total_count = len(ebox_match_list)
        canon_count = ebox_match_list.count("CACGTG")
        new_line += [canon_count, total_count - canon_count, total_count]

        # Genes sharing a TAD with this region (empty when TADs are unused).
        tad_genes = []
        if use_tads:
            for tad_locus in tad_collection.get_overlap(line_locus, "both"):
                tad_genes += tad_dict[tad_locus.id]
            if not tad_genes:
                no_tad_count += 1

        prox_hits = tss_prox_collection.get_overlap(line_locus, "both")
        distal_hits = tss_distal_collection.get_overlap(line_locus, "both")
        if tad_genes:
            # Restrict both gene sets to genes in the region's TAD(s).
            tad_gene_ids = set(tad_genes)
            prox_hits = [locus for locus in prox_hits if locus.id in tad_gene_ids]
            distal_hits = [
                locus for locus in distal_hits if locus.id in tad_gene_ids
            ]

        overlapping_genes = utils.uniquify(
            [start_dict[locus.id]["name"] for locus in prox_hits]
        )

        # Overlap takes priority over proximal.
        already_overlapping = set(overlapping_genes)
        proximal_genes = utils.uniquify([
            gene
            for gene in (start_dict[locus.id]["name"] for locus in distal_hits)
            if gene not in already_overlapping
        ])

        new_line += [",".join(overlapping_genes), ",".join(proximal_genes)]
        peak_table.append(new_line)

    # Exclude the header row, and report the regions that DID get TAD genes
    # (no_tad_count holds the ones that did not).  Without TADs nothing is
    # assigned.
    region_count = len(peak_table) - 1
    assigned_count = region_count - no_tad_count if use_tads else 0
    print("Out of {} regions, {} were assigned to at least 1 tad".format(
        str(region_count),
        str(assigned_count),
    ))

    return peak_table