Пример #1
0
def generate_subpeak_fasta(gene_to_enhancer_dict, subpeaks, genome,
                           project_name, const_extension):
    """Collect constituent regions per gene and emit them as BED rows and FASTA lines.

    The constituent table at ``subpeaks`` is read as tab-delimited rows whose
    first three columns are used as chromosome, start and end. For every
    enhancer region of every gene, the constituents overlapping the region
    are passed through ``utils.make_search_locus`` with ``const_extension``
    for both extension arguments, then stitched together.

    Args:
        gene_to_enhancer_dict: Mapping of gene name to an iterable of
            enhancer regions accepted by ``LocusCollection.get_overlap``.
        subpeaks: Path to the tab-delimited constituent table.
        genome: Object exposing ``directory()``; the result is handed to
            ``utils.fetch_seq``.
        project_name: String used in the BED track header line.
        const_extension: Value forwarded twice to ``utils.make_search_locus``
            (presumably the up/downstream padding -- confirm in utils).

    Returns:
        A ``(subpeak_bed, fasta)`` tuple. ``subpeak_bed`` is a list of rows
        starting with the track header, followed by ``[chr, start, end]`` for
        each stitched constituent. ``fasta`` is a flat list alternating
        ``">gene|chr|start|end"`` title lines and upper-cased sequences.

    """
    genome_directory = genome.directory()
    subpeak_bed = [["track name=" + project_name + " color=204,0,204"]]

    # Index every constituent so overlaps with enhancer regions can be queried.
    constituent_loci = [
        utils.Locus(row[0], int(row[1]), int(row[2]), ".")
        for row in utils.parse_table(subpeaks, "\t")
    ]
    constituent_collection = utils.LocusCollection(constituent_loci, 50)

    # Phase 1: gather the stitched, extended constituents for each gene.
    subpeak_dict = {}
    for gene, regions in gene_to_enhancer_dict.items():
        gene_subpeaks = []
        subpeak_dict[gene] = gene_subpeaks
        for region in regions:
            padded_hits = [
                utils.make_search_locus(hit, const_extension, const_extension)
                for hit in constituent_collection.get_overlap(region)
            ]
            stitched = utils.LocusCollection(padded_hits,
                                             50).stitch_collection()
            for locus in stitched.get_loci():
                subpeak_bed.append([locus.chr, locus.start, locus.end])
                gene_subpeaks.append(locus)

    # Phase 2: fetch the sequence of every collected constituent.
    fasta = []
    for gene, gene_subpeaks in subpeak_dict.items():
        for locus in gene_subpeaks:
            title = "|".join(
                [gene, locus.chr,
                 str(locus.start),
                 str(locus.end)])
            # Both coordinates are shifted by one before the lookup; the
            # title keeps the unshifted values.
            sequence = utils.fetch_seq(
                genome_directory,
                locus.chr,
                int(locus.start + 1),
                int(locus.end + 1),
            )
            fasta.extend([">" + title, sequence.upper()])

    return subpeak_bed, fasta
Пример #2
0
def stitch_valleys(valley_list):
    """Return the stitched valleys to extract sequence from, without duplicates.

    The valleys are loaded into a ``utils.LocusCollection`` (window size 1)
    and stitched. The stitched loci are then filtered so that only the first
    locus for each distinct ``(chr, start, end)`` triple is kept, in the order
    the collection yields them.

    Args:
        valley_list: Iterable of loci accepted by ``utils.LocusCollection``.

    Returns:
        List of stitched loci with unique ``(chr, start, end)`` coordinates.

    """
    stitched_valley_collection = utils.LocusCollection(
        valley_list, 1).stitch_collection()

    loci = []
    # A set of coordinate tuples gives constant-time duplicate checks; the
    # previous list-of-lists lookup rescanned every region seen so far.
    seen_regions = set()
    for valley in stitched_valley_collection.get_loci():
        region = (valley.chr, valley.start, valley.end)
        if region in seen_regions:
            continue
        seen_regions.add(region)
        loci.append(valley)
    return loci
Пример #3
0
def build_graph(edge_dict,
                gene_to_enhancer_dict,
                output_folder,
                analysis_name,
                cutoff=1):
    """Build a target graph from the collapsed edge dictionary.

    Require at least n motifs to constitute an edge where n is set by cutoff.
    Default is 1.

    Args:
        edge_dict: Mapping ``source -> target -> list of edge loci``. Its
            top-level keys are the graph nodes (the TFs).
        gene_to_enhancer_dict: Mapping of target name to its enhancer
            regions; must contain every target that appears in ``edge_dict``.
        output_folder: Prefix for the edge table path. It is concatenated
            directly with ``analysis_name``, so it needs its own trailing
            separator.
        analysis_name: Name of the graph and prefix of the output file.
        cutoff: Minimum number of edge loci needed for a graph edge.

    Returns:
        A ``networkx.DiGraph`` containing every source node and the TF -> TF
        edges that reach ``cutoff``.

    Side effects:
        Prints each source as it is processed and writes
        ``<output_folder><analysis_name>_EDGE_TABLE.txt`` through
        ``utils.unparse_table``. The table lists ALL edges, including those
        whose target is not a TF.

    """
    node_list = sorted(edge_dict)
    # Membership is tested once per (source, target) pair; a set keeps that
    # constant-time instead of rescanning the node list.
    node_set = set(node_list)

    # This is only edges between TFs
    graph = nx.DiGraph(name=analysis_name)
    graph.add_nodes_from(node_list)

    # This stores ALL edges identified by motifs
    edge_table = [[
        'SOURCE', 'TARGET', 'CHROM', 'START', 'STOP', 'REGION_ID',
        'TF_INTERACTION'
    ]]
    edge_output = '{}{}_EDGE_TABLE.txt'.format(output_folder, analysis_name)

    for source in node_list:
        print(source)
        for target in sorted(edge_dict[source]):
            # Regions of the target, used to label which enhancer each
            # edge locus falls into.
            target_collection = utils.LocusCollection(
                gene_to_enhancer_dict[target], 50)

            # Get the edges hitting that target
            edge_loci = edge_dict[source][target]

            target_is_tf = target in node_set
            tf_interaction = 1 if target_is_tf else 0

            # Only add to the graph if this is a TF/TF interaction
            if target_is_tf and len(edge_loci) >= cutoff:
                graph.add_edge(source, target)

            # Now for each edge, add to the table
            for edge_locus in edge_loci:
                region_string = ','.join([
                    locus.id
                    for locus in target_collection.get_overlap(edge_locus)
                ])
                edge_table.append([
                    source,
                    target,
                    edge_locus.chr,
                    edge_locus.start,
                    edge_locus.end,
                    region_string,
                    tf_interaction,
                ])

    utils.unparse_table(edge_table, edge_output, '\t')
    return graph
Пример #4
0
def collapse_fimo(fimo_output, candidate_tf_list, output_folder, analysis_name,
                  motif_convert_file):
    """Collapse FIMO motif hits into per-TF, per-target binding sites.

    For each source node (TF) and each target node (gene enhancer regions),
    motif instances are gathered and then stitched, so that overlapping
    motifs count as a single binding site. A TF with many motifs that keep
    finding the same site is therefore not over counted.

    Args:
        fimo_output: Path to the tab-delimited FIMO output. Row 0 is skipped
            as a header. The region column holds ``target|chrom|start|...``
            and the motif start/stop columns follow it directly.
        candidate_tf_list: TF names used as source nodes; hits for motifs
            that map to any other gene are ignored.
        output_folder: Prefix for all output paths. It is concatenated
            directly, so it needs its own trailing separator.
        analysis_name: Prefix of the combined BED file and part of the track
            descriptions.
        motif_convert_file: Path to a tab-delimited table whose first column
            is the motif name and second column a gene name. One motif may
            map to several genes.

    Returns:
        Dict mapping every candidate TF to a ``defaultdict(list)`` of
        ``target -> stitched loci`` (whatever ``get_loci()`` returns).

    Side effects:
        Prints progress, prepares ``<output_folder>motif_beds/`` through
        ``utils.format_folder``, and writes one ``<tf>_motifs.bed`` per TF
        plus ``<output_folder><analysis_name>_all_motifs.bed``.

    """
    # Motif name -> gene names (the reverse of the gene -> motif direction).
    motif_database_dict = defaultdict(list)
    for line in utils.parse_table(motif_convert_file, '\t'):
        motif_database_dict[line[0]].append(line[1])

    # Make the folder to store motif beds
    utils.format_folder('{}motif_beds/'.format(output_folder), True)

    # First layer are source nodes; the second layer (targets) is derived
    # from the fimo output below.
    edge_dict = {tf: defaultdict(list) for tf in candidate_tf_list}
    # Set for constant-time filtering of motif hits by candidate TF.
    candidate_tfs = set(candidate_tf_list)

    fimo_table = utils.parse_table(fimo_output, '\t')
    print(fimo_output)

    motif_rows = fimo_table[1:]

    # fimo puts the region in either the second or third column (index 1 or
    # 2); the first data row decides. With no data rows there is nothing to
    # inspect, so report it and fall through to writing empty BED tables
    # rather than failing on the missing row.
    region_index = 2
    if motif_rows:
        if '|' in motif_rows[0][1]:
            region_index = 1
        print('USING COLUMN {} OF FIMO OUTPUT FOR REGION'.format(region_index))
    else:
        print('NO MOTIF ROWS FOUND IN FIMO OUTPUT')
    # The motif start and stop offsets sit right after the region column.
    start_index = region_index + 1
    stop_index = region_index + 2

    for line in motif_rows:
        for source in motif_database_dict.get(line[0], ()):  # motifId
            if source not in candidate_tfs:
                continue
            region = line[region_index].split('|')
            target = region[0]

            # Motif offsets are relative to the region, so shift them by the
            # region start to get coordinates on the chromosome.
            region_start = int(region[2])
            target_locus = utils.Locus(region[1],
                                       region_start + int(line[start_index]),
                                       region_start + int(line[stop_index]),
                                       '.')

            # What's missing here is the enhancer id of the target locus.
            # `source` is a key of edge_dict (it passed the candidate filter)
            # and the inner mapping is a defaultdict, so this cannot miss.
            edge_dict[source][target].append(target_locus)

    # Now collapse: overlapping motifs are stitched into one binding site.
    all_bed = []
    all_bed_path = '{}{}_all_motifs.bed'.format(output_folder, analysis_name)
    for tf in candidate_tf_list:
        print(tf)
        bed_header = [
            'track name = "{}" description="{} motifs in {}"'.format(
                tf, tf, analysis_name)
        ]
        all_bed.append(bed_header)
        target_bed = [bed_header]
        target_bed_path = '{}motif_beds/{}_motifs.bed'.format(
            output_folder, tf)

        tf_targets = edge_dict[tf]
        # Snapshot the keys since the values are replaced while looping.
        for target in list(tf_targets):
            edge_collection = utils.LocusCollection(tf_targets[target], 50)
            edge_loci = edge_collection.stitch_collection().get_loci()
            tf_targets[target] = edge_loci
            for locus in edge_loci:
                bed_line = [locus.chr, locus.start, locus.end, target, '', '+']
                target_bed.append(bed_line)
                all_bed.append(bed_line)

        utils.unparse_table(target_bed, target_bed_path, '\t')

    # Now the loci are all stitched up
    utils.unparse_table(all_bed, all_bed_path, '\t')
    return edge_dict