import sys
from collections import defaultdict

import networkx as nx

import utils  # project-local utilities module (Locus, LocusCollection, parse_table, etc.)


def generate_subpeak_fasta(gene_to_enhancer_dict, subpeaks, genome, project_name,
                           const_extension):
    """Generate a subpeak FASTA.

    From a BED file of constituents, generate a FASTA for the constituents
    contained within the candidate super-enhancers.
    """
    genome_directory = genome.directory()
    subpeak_dict = {}
    subpeak_bed = [["track name=" + project_name + " color=204,0,204"]]
    subpeak_table = utils.parse_table(subpeaks, "\t")

    subpeak_loci = [
        utils.Locus(l[0], int(l[1]), int(l[2]), ".") for l in subpeak_table
    ]
    subpeak_collection = utils.LocusCollection(subpeak_loci, 50)

    for gene in gene_to_enhancer_dict:
        subpeak_dict[gene] = []
        for region in gene_to_enhancer_dict[gene]:
            overlaps = subpeak_collection.get_overlap(region)
            extended_overlaps = [
                utils.make_search_locus(x, const_extension, const_extension)
                for x in overlaps
            ]

            overlap_collection_temp = utils.LocusCollection(extended_overlaps, 50)
            overlap_collection = overlap_collection_temp.stitch_collection()
            for overlap in overlap_collection.get_loci():
                subpeak_bed.append([overlap.chr, overlap.start, overlap.end])
                subpeak_dict[gene].append(overlap)

    fasta = []
    for gene in subpeak_dict:
        for subpeak in subpeak_dict[gene]:
            fasta_title = "|".join(
                [gene, subpeak.chr, str(subpeak.start), str(subpeak.end)])
            fasta_line = utils.fetch_seq(
                genome_directory,
                subpeak.chr,
                int(subpeak.start + 1),
                int(subpeak.end + 1),
            )
            fasta.append(">" + fasta_title)
            fasta.append(fasta_line.upper())

    return subpeak_bed, fasta
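
# A minimal usage sketch, not part of the pipeline: the paths and project
# name are hypothetical, and `genome` stands in for whatever genome object
# the caller holds (anything exposing `.directory()` as used above):
#
#     subpeak_bed, fasta = generate_subpeak_fasta(
#         gene_to_enhancer_dict, 'subpeaks.bed', genome, 'MY_PROJECT', 100)
#     with open('MY_PROJECT_subpeaks.fasta', 'w') as fasta_file:
#         fasta_file.write('\n'.join(fasta) + '\n')
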
def stitch_valleys(valley_list):
    """Return a stitched, deduplicated list of valleys to extract sequence from."""
    valley_collection = utils.LocusCollection(valley_list, 1)
    stitched_valley_collection = valley_collection.stitch_collection()

    loci = []
    regions = []
    for valley in stitched_valley_collection.get_loci():
        if [valley.chr, valley.start, valley.end] not in regions:
            loci.append(valley)
            regions.append([valley.chr, valley.start, valley.end])
    return loci
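
# Illustrative sketch with made-up coordinates (assumes utils.Locus takes
# (chrom, start, end, sense) as in the calls above, and that
# stitch_collection merges overlapping loci, as its name suggests):
#
#     valleys = [utils.Locus('chr1', 100, 200, '.'),
#                utils.Locus('chr1', 150, 300, '.')]
#     stitched = stitch_valleys(valleys)
#     # expected: a single locus spanning chr1:100-300
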
def build_graph(edge_dict, gene_to_enhancer_dict, output_folder, analysis_name,
                cutoff=1):
    """Build a target graph from the collapsed edge dictionary.

    Require at least n motifs to constitute an edge, where n is set by
    ``cutoff`` (default 1).
    """
    node_list = list(edge_dict.keys())
    node_list.sort()

    # The graph holds only edges between TFs
    graph = nx.DiGraph(name=analysis_name)
    graph.add_nodes_from(node_list)

    # The table stores ALL edges identified by motifs
    edge_table = [[
        'SOURCE', 'TARGET', 'CHROM', 'START', 'STOP', 'REGION_ID',
        'TF_INTERACTION'
    ]]
    edge_output = '{}{}_EDGE_TABLE.txt'.format(output_folder, analysis_name)

    for source in node_list:
        print(source)
        target_list = list(edge_dict[source].keys())
        target_list.sort()
        for target in target_list:
            # Find which target enhancer regions the source's motifs overlap
            target_regions = gene_to_enhancer_dict[target]
            target_collection = utils.LocusCollection(target_regions, 50)

            # Get the edges hitting that target
            edge_loci = edge_dict[source][target]
            tf_interaction = 1 if target in node_list else 0

            # Only add to the graph if this is a TF/TF interaction
            if len(edge_loci) >= cutoff and tf_interaction:
                graph.add_edge(source, target)

            # For each edge, add a row to the table
            for edge_locus in edge_loci:
                region_string = ','.join([
                    locus.id
                    for locus in target_collection.get_overlap(edge_locus)
                ])
                edge_line = [
                    source,
                    target,
                    edge_locus.chr,
                    edge_locus.start,
                    edge_locus.end,
                    region_string,
                    tf_interaction,
                ]
                edge_table.append(edge_line)

    utils.unparse_table(edge_table, edge_output, '\t')
    return graph
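
# Usage sketch (hypothetical folder and analysis name; edge_dict comes from
# collapse_fimo below, gene_to_enhancer_dict from upstream steps):
#
#     graph = build_graph(edge_dict, gene_to_enhancer_dict,
#                         './output/', 'MY_ANALYSIS', cutoff=2)
#     in_degree = dict(graph.in_degree())  # motif-supported regulators per TF
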
def collapse_fimo(fimo_output, candidate_tf_list, output_folder, analysis_name,
                  motif_convert_file):
    """Collapse motifs from FIMO.

    For each source node (TF) and each target node (gene enhancer regions),
    collapse motif instances, then write one BED per TF and a single
    collapsed BED containing all motifs.
    """
    # First build up the motif name conversion database
    motif_database = utils.parse_table(motif_convert_file, '\t')
    motif_database_dict = defaultdict(list)

    # The reverse of the other dict, from motif name to gene name
    # A motif can map to multiple genes
    for line in motif_database:
        motif_database_dict[line[0]].append(line[1])

    # Make the folder to store motif beds
    utils.format_folder('{}motif_beds/'.format(output_folder), True)

    edge_dict = {}
    # First layer are source nodes
    for tf in candidate_tf_list:
        edge_dict[tf] = defaultdict(list)

    # Next layer are target nodes, which are derived from the fimo output
    fimo_table = utils.parse_table(fimo_output, '\t')
    print(fimo_output)

    # fimo sometimes puts the region in either the first or second column
    fimo_line = fimo_table[1]
    if fimo_line[1].count('|') > 0:
        region_index = 1
    else:
        region_index = 2
    print('USING COLUMN {} OF FIMO OUTPUT FOR REGION'.format(region_index))

    for line in fimo_table[1:]:
        source_tfs = motif_database_dict[line[0]]  # motif id
        for source in source_tfs:
            if source not in candidate_tf_list:
                continue
            region = line[region_index].split('|')
            target = region[0]
            if region_index == 2:
                target_locus = utils.Locus(region[1],
                                           int(region[2]) + int(line[3]),
                                           int(region[2]) + int(line[4]), '.')
            else:
                target_locus = utils.Locus(region[1],
                                           int(region[2]) + int(line[2]),
                                           int(region[2]) + int(line[3]), '.')

            # What's missing here is the enhancer id of the target locus
            try:
                edge_dict[source][target].append(target_locus)
            except KeyError:
                print('This motif is not in the network')
                print(line)
                sys.exit()

    # Now collapse this down in a meaningful way: overlapping motifs count as
    # a single binding site, so a TF with many redundant motifs that find the
    # same site over and over does not get overcounted
    all_bed = []
    all_bed_path = '{}{}_all_motifs.bed'.format(output_folder, analysis_name)
    for tf in candidate_tf_list:
        print(tf)
        target_nodes = edge_dict[tf].keys()
        bed_header = [
            'track name = "{}" description="{} motifs in {}"'.format(
                tf, tf, analysis_name)
        ]
        all_bed.append(bed_header)
        target_bed = [bed_header]
        target_bed_path = '{}motif_beds/{}_motifs.bed'.format(output_folder, tf)
        for target in target_nodes:
            edge_collection = utils.LocusCollection(edge_dict[tf][target], 50)
            edge_collection = edge_collection.stitch_collection()
            edge_loci = edge_collection.get_loci()
            edge_dict[tf][target] = edge_loci
            for locus in edge_loci:
                bed_line = [locus.chr, locus.start, locus.end, target, '', '+']
                target_bed.append(bed_line)
                all_bed.append(bed_line)

        utils.unparse_table(target_bed, target_bed_path, '\t')

    # Now the loci are all stitched up
    utils.unparse_table(all_bed, all_bed_path, '\t')
    return edge_dict
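
# End-to-end sketch tying the two steps together (all paths hypothetical;
# the conversion file maps motif names to gene names, one pair per row, as
# parsed above):
#
#     edge_dict = collapse_fimo('fimo_output.txt', candidate_tf_list,
#                               './output/', 'MY_ANALYSIS',
#                               'motif_to_gene_table.txt')
#     graph = build_graph(edge_dict, gene_to_enhancer_dict,
#                         './output/', 'MY_ANALYSIS')
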