def find_valleys(gene_to_enhancer_dict, bam_file_list, project_name, project_folder, cutoff=0.2):
    """Score enhancer regions for valleys and write them all to one BED file.

    Every region of every gene is scored with ``score_valley``.  Each entry
    of the returned score array that is strictly greater than ``cutoff``
    becomes a locus spanning ``region.start + 10 * index`` to
    ``region.start + 10 * (index + 1)`` (presumably one score per 10 bp
    bin -- confirm against ``score_valley``).  The loci collected for a gene
    are merged with ``stitch_valleys`` and appended to the BED table.

    Args:
        gene_to_enhancer_dict: Mapping of gene name to a list of region loci.
        bam_file_list: Paths of the BAM files handed to ``utils.Bam``.
        project_name: Prefix of the output file name.
        project_folder: Output location; concatenated directly with
            ``project_name``, so it must already end with a path separator.
        cutoff: Minimum (exclusive) score for a bin to count as a valley.

    Returns:
        The path of the written BED file,
        ``project_folder + project_name + "_all_valleys.bed"``.
    """
    # Wrap every BAM path and find the longest read length among them.
    bams = [utils.Bam(path) for path in bam_file_list]
    longest_read = max(bam.get_read_lengths()[0] for bam in bams)

    ordered_genes = sorted(gene_to_enhancer_dict.keys())

    bed_rows = []
    valleys_by_gene = {}
    regions_seen = 0
    print("number of regions processed:")
    for gene in ordered_genes:
        raw_valleys = []
        for region in gene_to_enhancer_dict[gene]:
            # Progress report every 100 regions (including the very first).
            if regions_seen % 100 == 0:
                print(regions_seen)
            regions_seen += 1

            bin_scores = score_valley(region, bams, longest_read)
            raw_valleys.extend(
                [
                    utils.Locus(
                        region.chr,
                        region.start + bin_index * 10,
                        region.start + (bin_index + 1) * 10,
                        ".",
                    )
                    for bin_index, bin_score in enumerate(bin_scores)
                    if bin_score > cutoff
                ]
            )

        # Merge the per-bin loci of this gene before reporting them.
        merged = stitch_valleys(raw_valleys)
        bed_rows.extend([locus.chr, locus.start, locus.end] for locus in merged)
        valleys_by_gene[gene] = merged

    bed_path = project_folder + project_name + "_all_valleys.bed"
    utils.unparse_table(bed_rows, bed_path, "\t")

    return bed_path
def generate_subpeak_fasta(gene_to_enhancer_dict, subpeaks, genome, project_name, const_extension):
    """Build BED rows and FASTA lines for the subpeaks inside each gene's regions.

    The constituent table at ``subpeaks`` is loaded into a locus collection.
    For every region of every gene, the overlapping constituents are extended
    by ``const_extension`` on both sides, stitched together, and recorded both
    as BED rows and as per-gene subpeaks.  A sequence is then fetched for each
    recorded subpeak.

    Args:
        gene_to_enhancer_dict: Mapping of gene name to a list of region loci.
        subpeaks: Path to a tab-delimited table whose first three columns are
            chromosome, start and end.
        genome: Object providing ``directory()``, the genome sequence location
            passed to ``utils.fetch_seq``.
        project_name: Name written into the BED track line.
        const_extension: Amount by which each overlapping constituent is
            extended on either side before stitching.

    Returns:
        A ``(subpeak_bed, fasta)`` tuple: the BED table (track line first) and
        a flat list alternating ``>gene|chr|start|end`` titles with upper-cased
        sequences.
    """
    genome_dir = genome.directory()

    bed_rows = [["track name=" + project_name + " color=204,0,204"]]

    constituent_loci = [
        utils.Locus(row[0], int(row[1]), int(row[2]), ".")
        for row in utils.parse_table(subpeaks, "\t")
    ]
    constituents = utils.LocusCollection(constituent_loci, 50)

    stitched_by_gene = {}
    for gene, regions in gene_to_enhancer_dict.items():
        gene_subpeaks = []
        stitched_by_gene[gene] = gene_subpeaks
        for region in regions:
            # Pad every overlapping constituent, then merge the padded loci.
            padded = [
                utils.make_search_locus(hit, const_extension, const_extension)
                for hit in constituents.get_overlap(region)
            ]
            stitched = utils.LocusCollection(padded, 50).stitch_collection()

            for locus in stitched.get_loci():
                bed_rows.append([locus.chr, locus.start, locus.end])
                gene_subpeaks.append(locus)

    fasta_lines = []
    for gene, gene_subpeaks in stitched_by_gene.items():
        for locus in gene_subpeaks:
            title = "|".join([gene, locus.chr, str(locus.start), str(locus.end)])
            # Both coordinates are shifted by one for the sequence lookup
            # (presumably a 0-based to 1-based conversion -- confirm against
            # utils.fetch_seq).
            sequence = utils.fetch_seq(
                genome_dir,
                locus.chr,
                int(locus.start + 1),
                int(locus.end + 1),
            )
            fasta_lines.append(">" + title)
            fasta_lines.append(sequence.upper())

    return bed_rows, fasta_lines
def collapse_fimo(fimo_output, candidate_tf_list, output_folder, analysis_name, motif_convert_file):
    """Collapse FIMO motif hits into stitched binding sites per TF and target.

    For each source node (a candidate TF) and each target node (the first
    ``|``-separated field of the FIMO region column), the motif instances are
    gathered and overlapping ones are stitched into single loci, so a TF with
    many motifs that keep finding the same site is not over-counted.  One BED
    file per TF is written to ``<output_folder>motif_beds/<tf>_motifs.bed``
    and a combined BED to ``<output_folder><analysis_name>_all_motifs.bed``.

    Args:
        fimo_output: Path to the tab-delimited FIMO output.  The first row is
            skipped as a header.
        candidate_tf_list: Names of the TFs to use as source nodes.
        output_folder: Output location; used as a raw string prefix, so it
            must already end with a path separator.
        analysis_name: Name used in the track lines and the combined BED name.
        motif_convert_file: Path to a tab-delimited table mapping motif id
            (column 0) to gene name (column 1).

    Returns:
        dict: ``edge_dict[tf][target]`` is the list of stitched motif loci of
        ``tf`` inside ``target``.  Every candidate TF has an entry, possibly
        empty.
    """
    # Motif id -> gene names.  A motif can map to multiple genes.
    motif_database = utils.parse_table(motif_convert_file, '\t')
    motif_database_dict = defaultdict(list)
    for line in motif_database:
        motif_database_dict[line[0]].append(line[1])

    # Make the folder to store motif beds
    utils.format_folder('{}motif_beds/'.format(output_folder), True)

    # First layer: source nodes.  The second layer (targets) is filled in
    # from the FIMO output below.
    edge_dict = {tf: defaultdict(list) for tf in candidate_tf_list}

    fimo_table = utils.parse_table(fimo_output, '\t')
    print(fimo_output)
    fimo_rows = fimo_table[1:]

    region_index = start_col = stop_col = None
    if fimo_rows:
        # FIMO puts the region in either the second or third column; the
        # region field is recognised by its '|' separators.  The motif
        # start/stop offsets sit in the two columns right after it.
        region_index = 1 if '|' in fimo_rows[0][1] else 2
        start_col = region_index + 1
        stop_col = region_index + 2
        print('USING COLUMN {} OF FIMO OUTPUT FOR REGION'.format(region_index))
    else:
        # A header-only (or empty) table means no motif instances were
        # reported; carry on and emit header-only BED files instead of
        # failing with an IndexError.
        print('No motif instances found in {}'.format(fimo_output))

    for line in fimo_rows:
        for source in motif_database_dict[line[0]]:  # line[0] is the motif id
            # Only candidate TFs are source nodes; edge_dict holds exactly
            # those, so this is an O(1) membership test.
            if source not in edge_dict:
                continue

            region = line[region_index].split('|')
            target = region[0]
            region_start = int(region[2])
            # Motif offsets are relative to the region start.
            target_locus = utils.Locus(
                region[1],
                region_start + int(line[start_col]),
                region_start + int(line[stop_col]),
                '.',
            )
            # What's missing here is the enhancer id of the target locus
            edge_dict[source][target].append(target_locus)

    # Now collapse this down in a meaningful way: overlapping motifs count as
    # a single binding site.
    all_bed = []
    all_bed_path = '{}{}_all_motifs.bed'.format(output_folder, analysis_name)
    for tf in candidate_tf_list:
        print(tf)
        bed_header = [
            'track name = "{}" description="{} motifs in {}"'.format(
                tf, tf, analysis_name)
        ]
        all_bed.append(bed_header)
        target_bed = [bed_header]
        target_bed_path = '{}motif_beds/{}_motifs.bed'.format(output_folder, tf)

        # Snapshot the keys because the values are replaced while looping.
        for target in list(edge_dict[tf]):
            edge_collection = utils.LocusCollection(edge_dict[tf][target], 50)
            edge_loci = edge_collection.stitch_collection().get_loci()
            edge_dict[tf][target] = edge_loci
            for locus in edge_loci:
                bed_line = [locus.chr, locus.start, locus.end, target, '', '+']
                target_bed.append(bed_line)
                all_bed.append(bed_line)

        utils.unparse_table(target_bed, target_bed_path, '\t')

    # Now the loci are all stitched up
    utils.unparse_table(all_bed, all_bed_path, '\t')

    return edge_dict
def gene_to_enhancer(genome, enhancer_file, activity_path):
    """Link enhancer regions to their assigned genes and flag candidate TFs.

    Gene assignment is assumed to have been done already: the genes of each
    enhancer are read from the ``CLOSEST_GENE``, ``PROXIMAL_GENES`` and
    ``OVERLAP_GENES`` columns of the enhancer table.  Gene names are
    upper-cased and, when an activity table is given, restricted to genes
    listed in it.  A gene is a candidate TF when it appears in the genome's
    TF table, has a motif in the motif conversion table and (if an activity
    table is given) is active.

    Args:
        genome: Object whose ``return_feature('tf_file')`` and
            ``return_feature('motif_convert')`` give the paths of the TF and
            motif conversion tables.
        enhancer_file: Path to the tab-delimited enhancer table (id, chrom,
            start, stop in the first four columns, plus the three gene
            columns named above in its header).
        activity_path: Path to a tab-delimited gene activity table, or a
            falsy value to skip activity filtering.

    Returns:
        A 7-tuple ``(gene_region_table, gene_tf_region_table,
        enhancer_region_table, enhancer_tf_region_table, gene_summary_table,
        candidate_tf_list, gene_to_enhancer_dict)``.  The first five are
        tables with a header row; ``candidate_tf_list`` is the sorted list of
        TF genes that have at least one enhancer; ``gene_to_enhancer_dict``
        maps each gene to its list of enhancer loci.

    Raises:
        ValueError: If the activity table is empty or none of its columns
            looks like a common gene name, or if the enhancer table header
            lacks one of the required gene columns.
    """
    print(
        'Identifying enhancers and target genes from {}'.format(enhancer_file))

    # First load the TF lists
    tf_table = utils.parse_table(genome.return_feature('tf_file'), '\t')
    motif_table = utils.parse_table(genome.return_feature('motif_convert'),
                                    '\t')

    # All TFs that have a motif; only used for membership tests, so a set.
    motif_tfs = {line[1] for line in motif_table}

    # Intersect w/ the activity table
    active_genes = None
    if activity_path:
        activity_table = utils.parse_table(activity_path, '\t')
        if not activity_table:
            raise ValueError(
                'gene activity table {} is empty'.format(activity_path))

        # Figure out the right column for actual gene names: the first field
        # of the first row that is not a refseq id (NM/NR) and not a numeral.
        gene_col = None
        for i, field in enumerate(activity_table[0]):
            if field[0:2] not in ('NM', 'NR') and not field.isdigit():
                gene_col = i
                break
        if gene_col is None:
            raise ValueError(
                'could not find a common gene name column in {}'.format(
                    activity_path))

        print('using column {} of {} gene activity table for common names'
              ''.format(gene_col + 1, activity_path))

        active_genes = {line[gene_col].upper() for line in activity_table}

        tf_list_name = utils.uniquify([
            line[1] for line in tf_table
            if line[1] in active_genes and line[1] in motif_tfs
        ])
    else:
        tf_list_name = [line[1] for line in tf_table if line[1] in motif_tfs]

    print('Identified {} TFs from {} that have motifs'
          ''.format(len(tf_list_name), genome.return_feature('tf_file')))

    # Set view of the TF names for O(1) membership tests in the loops below.
    tf_names = set(tf_list_name)

    # Keyed by gene with loci objects in the list
    gene_to_enhancer_dict = defaultdict(list)

    # Assuming id,chrom,start,stop w/ gene names in the last 3 columns per
    # standard ROSE output
    enhancer_table = utils.parse_table(enhancer_file, '\t')
    print('Analyzing {} cis-regulatory regions'.format(len(enhancer_table)))

    # Output tables, by region and then by gene
    enhancer_region_table = [[
        'ENHANCER_ID', 'CHROM', 'START', 'STOP', 'GENE_LIST'
    ]]
    enhancer_tf_region_table = [[
        'ENHANCER_ID', 'CHROM', 'START', 'STOP', 'GENE_LIST'
    ]]
    gene_region_table = [[
        'GENE', 'TF', 'CHROM', 'START', 'STOP', 'ENHANCER_ID'
    ]]
    gene_tf_region_table = [['GENE', 'CHROM', 'START', 'STOP', 'ENHANCER_ID']]
    gene_summary_table = [['GENE', 'TF', 'ENHANCER_LIST']]

    # Will need to track which ones are TFs
    candidate_tf_list = []

    # Find the columns for gene assignment
    header = enhancer_table[0]
    header_length = len(header)
    gene_columns = (
        header.index('CLOSEST_GENE'),
        header.index('PROXIMAL_GENES'),
        header.index('OVERLAP_GENES'),
    )

    for line in enhancer_table[1:]:
        # Don't bother trying to figure out lines w/o target genes
        if len(line) != header_length:
            continue

        enhancer_locus = utils.Locus(line[1], line[2], line[3], '.', line[0])

        # Closest, proximal and overlapping genes, in that order; an empty
        # field contributes nothing.
        all_gene_list = []
        for col in gene_columns:
            if line[col]:
                all_gene_list.extend(line[col].split(','))
        all_gene_list = [gene.upper() for gene in all_gene_list]

        # Unique list of assigned genes, restricted to active ones if known
        if activity_path:
            all_gene_list = utils.uniquify(
                [gene for gene in all_gene_list if gene in active_genes])
        else:
            all_gene_list = utils.uniquify(all_gene_list)

        candidate_gene_list = utils.uniquify(
            [gene for gene in all_gene_list if gene in tf_names])

        for gene in all_gene_list:
            gene_to_enhancer_dict[gene].append(enhancer_locus)

        # Joining an empty list gives '', so regions without genes get an
        # empty GENE_LIST field.
        enhancer_region_table.append(line[0:4] + [','.join(all_gene_list)])

        if candidate_gene_list:
            enhancer_tf_region_table.append(
                line[0:4] + [','.join(candidate_gene_list)])

    # Now iterate through each gene and list the enhancers
    gene_list = list(gene_to_enhancer_dict.keys())
    print(gene_list)
    gene_list.sort()
    for gene in gene_list:
        tf_status = 1 if gene in tf_names else 0
        if tf_status == 1:
            candidate_tf_list.append(gene)

        enhancer_loci = gene_to_enhancer_dict[gene]
        enhancer_string = ','.join([enhancer.id for enhancer in enhancer_loci])
        gene_summary_table.append([gene, tf_status, enhancer_string])

        for enhancer in enhancer_loci:
            gene_region_table.append([
                gene,
                tf_status,
                enhancer.chr,
                enhancer.start,
                enhancer.end,
                enhancer.id,
            ])

            if tf_status == 1:
                gene_tf_region_table.append([
                    gene,
                    enhancer.chr,
                    enhancer.start,
                    enhancer.end,
                    enhancer.id,
                ])

    return (
        gene_region_table,
        gene_tf_region_table,
        enhancer_region_table,
        enhancer_tf_region_table,
        gene_summary_table,
        candidate_tf_list,
        gene_to_enhancer_dict,
    )