Пример #1
0
def find_valleys(gene_to_enhancer_dict,
                 bam_file_list,
                 project_name,
                 project_folder,
                 cutoff=0.2):
    """Find valleys in each gene's regions and write them to one BED file.

    Every region of every gene is scored with ``score_valley``; each position of the returned
    score array whose score exceeds ``cutoff`` becomes a 10 bp locus. The loci of a gene are
    merged with ``stitch_valleys`` and the stitched valleys of all genes are written, tab
    delimited, to ``<project_folder><project_name>_all_valleys.bed``.

    Args:
        gene_to_enhancer_dict: Mapping of gene name to a list of region loci (objects exposing
            ``chr`` and ``start``).
        bam_file_list: Paths of the BAM files used for scoring; must not be empty.
        project_name: Prefix of the output file name.
        project_folder: Output folder; concatenated as-is, so it must end with a path separator.
        cutoff: Scores strictly greater than this value are called as valleys.

    Returns:
        Path of the BED file that was written.

    """
    # Width of one score bin in bp.
    # NOTE(review): presumably must match the bin width used by score_valley -- confirm.
    bin_size = 10

    # Start w/ a bam_file_list and make a list of bam type objects
    bam_list = [utils.Bam(bam_path) for bam_path in bam_file_list]
    max_read_length = max(bam.get_read_lengths()[0] for bam in bam_list)

    all_valley_bed = []
    ticker = 0
    print("number of regions processed:")
    # Genes are processed in sorted order so the BED output is deterministic
    for gene in sorted(gene_to_enhancer_dict):
        gene_valleys = []

        for region in gene_to_enhancer_dict[gene]:
            # Progress report every 100 regions
            if ticker % 100 == 0:
                print(ticker)
            ticker += 1
            score_array = score_valley(
                region,
                bam_list,
                max_read_length,
            )
            for index, score in enumerate(score_array):
                if score > cutoff:
                    gene_valleys.append(
                        utils.Locus(
                            region.chr,
                            region.start + index * bin_size,
                            region.start + (index + 1) * bin_size,
                            ".",
                        ))

        # Stitch once per gene, then record every stitched valley
        for valley in stitch_valleys(gene_valleys):
            all_valley_bed.append([valley.chr, valley.start, valley.end])

    all_bed_path = project_folder + project_name + "_all_valleys.bed"
    utils.unparse_table(all_valley_bed, all_bed_path, "\t")

    return all_bed_path
Пример #2
0
def generate_subpeak_fasta(gene_to_enhancer_dict, subpeaks, genome,
                           project_name, const_extension):
    """Build BED rows and FASTA records for the subpeaks inside each gene's regions.

    Subpeaks are read from the tab-delimited ``subpeaks`` file (chrom, start, end in the first
    three columns). For every region of every gene, the overlapping subpeaks are passed through
    ``utils.make_search_locus`` with ``const_extension`` as both extension arguments, stitched,
    and recorded. The sequence of each recorded locus is then fetched from the genome directory.

    Returns:
        A ``(subpeak_bed, fasta)`` tuple. ``subpeak_bed`` is a list of rows starting with a
        track line; ``fasta`` is a flat list alternating ``>gene|chrom|start|end`` titles with
        upper-cased sequences.

    """
    genome_directory = genome.directory()

    rows = utils.parse_table(subpeaks, "\t")
    all_subpeaks = utils.LocusCollection(
        [utils.Locus(row[0], int(row[1]), int(row[2]), ".") for row in rows],
        50,
    )

    subpeak_bed = [["track name=" + project_name + " color=204,0,204"]]
    stitched_by_gene = {}

    # Pass 1: collect the extended, stitched subpeaks for every gene
    for gene, regions in gene_to_enhancer_dict.items():
        gene_subpeaks = []
        stitched_by_gene[gene] = gene_subpeaks
        for region in regions:
            widened = [
                utils.make_search_locus(hit, const_extension, const_extension)
                for hit in all_subpeaks.get_overlap(region)
            ]
            stitched = utils.LocusCollection(widened, 50).stitch_collection()
            for locus in stitched.get_loci():
                subpeak_bed.append([locus.chr, locus.start, locus.end])
                gene_subpeaks.append(locus)

    # Pass 2: fetch the sequence of every recorded subpeak
    fasta = []
    for gene, loci in stitched_by_gene.items():
        for locus in loci:
            title = "|".join(
                (gene, locus.chr, str(locus.start), str(locus.end)))
            # Coordinates are shifted by one before fetching, as in the BED-to-sequence lookup
            sequence = utils.fetch_seq(
                genome_directory,
                locus.chr,
                int(locus.start + 1),
                int(locus.end + 1),
            )
            fasta.extend((">" + title, sequence.upper()))

    return subpeak_bed, fasta
Пример #3
0
def collapse_fimo(fimo_output, candidate_tf_list, output_folder, analysis_name,
                  motif_convert_file):
    """Collapse FIMO motif hits into stitched binding sites per TF and target.

    For each source node (a TF from ``candidate_tf_list``) and each target node (the first
    ``|``-separated field of the FIMO region name), overlapping motif instances are stitched
    into single loci so a TF with many similar motifs is not over counted. One BED file per TF
    is written to ``<output_folder>motif_beds/`` and a combined BED to
    ``<output_folder><analysis_name>_all_motifs.bed``.

    Args:
        fimo_output: Path of the tab-delimited FIMO output; the first row is skipped as a header.
        candidate_tf_list: TF names that act as source nodes.
        output_folder: Output folder; concatenated as-is, so it must end with a path separator.
        analysis_name: Name used in the track lines and the combined BED file name.
        motif_convert_file: Path of a tab-delimited table mapping motif id (column 0) to gene
            name (column 1). One motif may map to several genes.

    Returns:
        Dict mapping each candidate TF to a ``defaultdict`` of target name -> list of stitched
        loci. A FIMO file without hit rows yields empty inner dicts and header-only BED files.

    """
    # First build up the motif name conversion database (motif id -> gene names)
    motif_database_dict = defaultdict(list)
    for line in utils.parse_table(motif_convert_file, '\t'):
        motif_database_dict[line[0]].append(line[1])

    # Make the folder to store motif beds
    utils.format_folder('{}motif_beds/'.format(output_folder), True)

    # First layer are source nodes; target nodes are filled in from the fimo output
    edge_dict = {tf: defaultdict(list) for tf in candidate_tf_list}
    # Set for O(1) membership tests while scanning the hits
    candidate_tfs = set(candidate_tf_list)

    fimo_table = utils.parse_table(fimo_output, '\t')
    print(fimo_output)

    hit_lines = fimo_table[1:]
    if hit_lines:
        # fimo sometimes puts the region in either the first or second column
        region_index = 1 if '|' in hit_lines[0][1] else 2
        print('USING COLUMN {} OF FIMO OUTPUT FOR REGION'.format(region_index))
        # The motif start/stop offsets are the two columns right after the region column
        start_col = region_index + 1
        stop_col = region_index + 2

    for line in hit_lines:
        for source in motif_database_dict.get(line[0], ()):  # motifId
            if source not in candidate_tfs:
                continue
            # Region names look like target|chrom|start...
            region = line[region_index].split('|')
            target = region[0]
            region_start = int(region[2])
            # Motif offsets are relative to the region start
            target_locus = utils.Locus(region[1],
                                       region_start + int(line[start_col]),
                                       region_start + int(line[stop_col]), '.')

            # What's missing here is the enhancer id of the target locus.
            # source is guaranteed to be a key of edge_dict and the inner dict is a
            # defaultdict, so this cannot raise KeyError.
            edge_dict[source][target].append(target_locus)

    # Now we actually want to collapse this down in a meaningful way
    # Overlapping motifs count as a single binding site. This way a TF with tons of motifs
    # that finds the same site over and over again doesn't get over counted
    all_bed = []
    all_bed_path = '{}{}_all_motifs.bed'.format(output_folder, analysis_name)
    for tf in candidate_tf_list:
        print(tf)
        bed_header = [
            'track name = "{}" description="{} motifs in {}"'.format(
                tf, tf, analysis_name)
        ]
        all_bed.append(bed_header)
        target_bed = [bed_header]
        target_bed_path = '{}motif_beds/{}_motifs.bed'.format(
            output_folder, tf)
        # Snapshot the keys because the values are replaced while looping
        for target in list(edge_dict[tf]):
            edge_collection = utils.LocusCollection(edge_dict[tf][target], 50)
            edge_loci = edge_collection.stitch_collection().get_loci()
            edge_dict[tf][target] = edge_loci
            for locus in edge_loci:
                bed_line = [locus.chr, locus.start, locus.end, target, '', '+']
                target_bed.append(bed_line)
                all_bed.append(bed_line)

        utils.unparse_table(target_bed, target_bed_path, '\t')
    # Now the loci are all stitched up
    utils.unparse_table(all_bed, all_bed_path, '\t')
    return edge_dict
Пример #4
0
def gene_to_enhancer(genome, enhancer_file, activity_path):
    """Link enhancer regions to their assigned genes and flag candidate TFs.

    Gene assignment is assumed to be done already: the enhancer table must carry
    ``CLOSEST_GENE``, ``PROXIMAL_GENES`` and ``OVERLAP_GENES`` columns (comma-separated gene
    names). Gene names are upper-cased. When ``activity_path`` is given, only genes listed in
    the activity table are kept. A gene counts as a TF when it appears in the genome's
    ``tf_file`` and has a motif in ``motif_convert``.

    Args:
        genome: Object whose ``return_feature`` yields the ``tf_file`` and ``motif_convert`` paths.
        enhancer_file: Path of the tab-delimited enhancer table (id, chrom, start, stop first).
        activity_path: Optional path of a tab-delimited gene activity table; falsy to skip
            activity filtering.

    Returns:
        A 7-tuple ``(gene_region_table, gene_tf_region_table, enhancer_region_table,
        enhancer_tf_region_table, gene_summary_table, candidate_tf_list,
        gene_to_enhancer_dict)``. The tables are lists of rows that start with a header row;
        ``candidate_tf_list`` is the sorted list of genes that are TFs;
        ``gene_to_enhancer_dict`` maps gene name to its list of enhancer loci.

    Raises:
        ValueError: If no gene-name column can be found in the activity table, or if the
            enhancer table header lacks one of the required gene columns.

    """
    print(
        'Identifying enhancers and target genes from {}'.format(enhancer_file))
    # Should this do gene assignment????
    # For now assume gene assignment has been done
    # Can later toggle to do gene assignment

    # First load the TF lists
    tf_table = utils.parse_table(genome.return_feature('tf_file'), '\t')

    motif_table = utils.parse_table(genome.return_feature('motif_convert'),
                                    '\t')

    # All tfs that have a motif, as a set for O(1) membership tests
    motif_tfs = {line[1] for line in motif_table}

    # Intersect w/ the activity table
    active_genes = None
    if activity_path:
        activity_table = utils.parse_table(activity_path, '\t')

        # Figure out the right column for actual gene names
        # (basically not NM or NR and not a numeral). Assumes refseq.
        gene_col = next(
            (i for i, value in enumerate(activity_table[0])
             if not value.startswith(('NM', 'NR')) and not value.isdigit()),
            None,
        )
        if gene_col is None:
            raise ValueError(
                'Could not find a gene name column in gene activity table {}'
                ''.format(activity_path))
        print('using column {} of {} gene activity table for common names'
              ''.format(gene_col + 1, activity_path))

        active_genes = {line[gene_col].upper() for line in activity_table}

        tf_list_name = utils.uniquify([
            line[1] for line in tf_table
            if line[1] in active_genes and line[1] in motif_tfs
        ])
    else:
        tf_list_name = [line[1] for line in tf_table if line[1] in motif_tfs]

    print('Identified {} TFs from {} that have motifs'
          ''.format(len(tf_list_name), genome.return_feature('tf_file')))
    tf_names = set(tf_list_name)

    # Keyed by gene with loci objects in the list
    gene_to_enhancer_dict = defaultdict(list)

    # Assuming id,chrom,start,stop w/ gene names in the last 3 columns per standard ROSE output
    enhancer_table = utils.parse_table(enhancer_file, '\t')
    print('Analyzing {} cis-regulatory regions'.format(len(enhancer_table)))

    # Now let's make the enhancer table by region and then by gene
    enhancer_region_table = [[
        'ENHANCER_ID', 'CHROM', 'START', 'STOP', 'GENE_LIST'
    ]]
    enhancer_tf_region_table = [[
        'ENHANCER_ID', 'CHROM', 'START', 'STOP', 'GENE_LIST'
    ]]
    gene_region_table = [[
        'GENE', 'TF', 'CHROM', 'START', 'STOP', 'ENHANCER_ID'
    ]]
    gene_tf_region_table = [['GENE', 'CHROM', 'START', 'STOP', 'ENHANCER_ID']]
    gene_summary_table = [['GENE', 'TF', 'ENHANCER_LIST']]

    # Find the columns for gene assignment
    header = enhancer_table[0]
    header_length = len(header)
    gene_columns = (
        header.index('CLOSEST_GENE'),
        header.index('PROXIMAL_GENES'),
        header.index('OVERLAP_GENES'),
    )
    for line in enhancer_table[1:]:
        # Don't bother trying to figure out lines w/o target genes
        if len(line) != header_length:
            continue
        enhancer_locus = utils.Locus(line[1], line[2], line[3], '.', line[0])

        # Closest, proximal, then overlapping genes; empty cells contribute nothing
        all_gene_list = []
        for col in gene_columns:
            if line[col]:
                all_gene_list.extend(
                    gene.upper() for gene in line[col].split(','))

        # Keep only active genes (when an activity table was given), without duplicates
        if active_genes is not None:
            all_gene_list = [
                gene for gene in all_gene_list if gene in active_genes
            ]
        all_gene_list = utils.uniquify(all_gene_list)

        candidate_gene_list = utils.uniquify(
            [gene for gene in all_gene_list if gene in tf_names])

        for gene in all_gene_list:
            gene_to_enhancer_dict[gene].append(enhancer_locus)
        # Joining an empty list gives '', so regions without genes get an empty gene field
        enhancer_region_table.append(line[0:4] + [','.join(all_gene_list)])

        if candidate_gene_list:
            enhancer_tf_region_table.append(
                line[0:4] + [','.join(candidate_gene_list)])

    # Now iterate through each gene and list the enhancers
    # Will need to track which ones are TFs
    candidate_tf_list = []
    gene_list = list(gene_to_enhancer_dict.keys())
    print(gene_list)
    gene_list.sort()
    for gene in gene_list:
        tf_status = 1 if gene in tf_names else 0
        if tf_status:
            candidate_tf_list.append(gene)

        enhancer_loci = gene_to_enhancer_dict[gene]
        enhancer_string = ','.join([enhancer.id for enhancer in enhancer_loci])
        gene_summary_table.append([gene, tf_status, enhancer_string])
        for enhancer in enhancer_loci:
            gene_region_table.append([
                gene,
                tf_status,
                enhancer.chr,
                enhancer.start,
                enhancer.end,
                enhancer.id,
            ])
            if tf_status:
                gene_tf_region_table.append([
                    gene, enhancer.chr, enhancer.start, enhancer.end,
                    enhancer.id
                ])

    return (
        gene_region_table,
        gene_tf_region_table,
        enhancer_region_table,
        enhancer_tf_region_table,
        gene_summary_table,
        candidate_tf_list,
        gene_to_enhancer_dict,
    )