Example No. 1
def call_merge_supers(data_file, super_file1, super_file2, name1, name2,
                      merge_name, genome, parent_folder):
    """Call ROSE2 on merged super enhancers."""
    merged_gff_file = "%s%s_%s_MERGED_REGIONS_-0_+0.gff" % (
        parent_folder,
        genome.upper(),
        merge_name,
    )

    # check to make sure this hasn't been done yet
    rose_output = os.path.join(
        parent_folder,
        "{}_ROSE".format(name1),
        "{}_{}_MERGED_REGIONS_-0_+0_SuperEnhancers_ENHANCER_TO_GENE.txt".
        format(genome.upper(), merge_name),
    )

    try:
        utils.parse_table(rose_output, "\t")
        print("ROSE OUTPUT ALREADY FOUND HERE {}".format(rose_output))
        return rose_output
    except (FileNotFoundError, IOError):
        print("MERGING ENHANCER REGIONS FROM {} and {}".format(
            super_file1, super_file2))
        merged_gff = merge_collections(super_file1, super_file2, name1, name2,
                                       merged_gff_file)

        # call rose on the merged collection
        rose_bash_file = call_rose_merged(data_file, merged_gff, name1, name2,
                                          parent_folder)
        print(rose_bash_file)

        # run the bash command
        os.system("bash {}".format(rose_bash_file))

        # check for and return output
        if utils.check_output(rose_output, 1, 10):
            return rose_output
        else:
            # try finding it w/ a different name
            # this will bug out if nothing is there
            rose_folder = os.path.join(parent_folder, "{}_ROSE".format(name1))
            rose_file_list = [
                x for x in os.listdir(rose_folder) if x[0] != "."
            ]  # no hidden files
            if not rose_file_list:
                print("No files found in {}".format(rose_folder))
                sys.exit()

            return pipeline_utils.get_file("_SuperEnhancers_ENHANCER_TO_GENE.txt",
                                           rose_file_list, rose_folder)
Example No. 2
def make_bed_collection(bed_file_list):
    """Takes in a list of bed files and makes a single huge collection.

    Each locus has as its ID the name of the bed file.

    """
    bed_loci = []
    print("MAKING BED COLLECTION FOR:")
    for bed_file in bed_file_list:

        bed_name = os.path.basename(bed_file).split(".")[0]
        print(bed_name)
        bed = utils.parse_table(bed_file, "\t")
        for line in bed:
            if len(line) >= 3:
                # check that line[0]
                if line[0][0:3] == "chr":
                    try:
                        coords = [int(line[1]), int(line[2])]
                        bed_locus = utils.Locus(
                            line[0], min(coords), max(coords), ".", bed_name
                        )
                        bed_loci.append(bed_locus)
                    except ValueError:
                        pass

        print("IDENTIFIED {} BED REGIONS".format(str(len(bed_loci))))

    return utils.LocusCollection(bed_loci, 50)
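
# Usage sketch (hedged): the bed paths below are hypothetical and `utils` is the
# same helper module already used by these examples.
import utils

bed_collection = make_bed_collection([
    "/data/beds/h3k27ac_peaks.bed",
    "/data/beds/atac_peaks.bed",
])

# query the merged collection for overlaps with a region of interest
query = utils.Locus("chr1", 1000000, 1050000, ".", "query_region")
for locus in bed_collection.get_overlap(query, "both"):
    print(locus.id)  # the ID carries the name of the source bed file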
Example No. 3
def collapse_region_map(region_map_file, name="", control_bams=False):
    """Take a region_map file and collapse signal into a single column.

    Also fix any start/stop sorting issues, taking into account whether or not
    controls were used.

    """
    region_map = utils.parse_table(region_map_file, "\t")

    for n, line in enumerate(region_map):
        if n == 0:
            # new header
            if len(name) == 0:
                name = "MERGED_SIGNAL"
            region_map[n] = line[0:6] + [name]

        else:
            new_line = list(line[0:6])
            if control_bams:
                signal_line = [float(x) for x in line[6:]]
                rankby_indexes = range(0, len(signal_line) // 2, 1)
                control_indexes = range(
                    len(signal_line) // 2, len(signal_line), 1)
                meta_vector = []
                for i, j in zip(rankby_indexes, control_indexes):
                    # min signal is 0
                    meta_vector.append(max(0, signal_line[i] - signal_line[j]))
                meta_signal = numpy.mean(meta_vector)
            else:
                meta_signal = numpy.mean([float(x) for x in line[6:]])
            region_map[n] = new_line + [meta_signal]

    output_file = region_map_file.replace("REGION", "META")
    utils.unparse_table(region_map, output_file, "\t")
    return output_file
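
# Usage sketch (hedged): the region map path is hypothetical; columns 0-5 hold the
# region metadata and columns 6+ hold per-bam signal, as produced by the mapping step.
region_map_file = "/data/rose/HG19_MERGED_ENHANCER_REGION_MAP.txt"

# with control_bams=True the second half of the signal columns is treated as
# background and subtracted (floored at 0) before averaging
meta_map_file = collapse_region_map(region_map_file, name="MERGED_H3K27AC",
                                    control_bams=True)
print("collapsed signal written to {}".format(meta_map_file))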
Example No. 4
def make_enhancer_signal_table(name_dict, merged_region_map, median_dict,
                               analysis_name, genome, output_folder):
    """Makes a signal table.

    Each row is an enhancer and each column is the background-corrected signal normalized to
    that dataset's median.

    """
    # load in the region map
    region_map = utils.parse_table(merged_region_map, "\t")
    names_list = list(name_dict.keys())
    names_list.sort()
    signal_table = [[
        "REGION_ID", "CHROM", "START", "STOP", "NUM_LOCI", "CONSTITUENT_SIZE"
    ] + names_list]

    print("len of {} for names_list".format(len(names_list)))
    print(names_list)

    for line in region_map[1:]:
        new_line = line[0:6]
        # a little tricky here to add datasets sequentially
        i = 6  # start w/ the first column w/ data
        for name in names_list:
            if name_dict[name]["background"] is True:
                enhancer_index = int(i)
                i += 1
                control_index = int(i)
                i += 1
                try:
                    enhancer_signal = float(line[enhancer_index]) - float(
                        line[control_index])
                except IndexError:
                    print(line)
                    print(len(line))
                    print(enhancer_index)
                    print(control_index)
                    sys.exit()

            else:
                enhancer_index = int(i)
                i += 1
                enhancer_signal = float(line[enhancer_index])

            if enhancer_signal < 0:
                enhancer_signal = 0
            enhancer_signal = enhancer_signal / median_dict[name]
            new_line.append(enhancer_signal)

        signal_table.append(new_line)

    output_file = os.path.join(
        output_folder, "{}_{}_signal_table.txt".format(genome, analysis_name))
    print("WRITING MEDIAN NORMALIZED SIGNAL TABLE TO {}".format(output_file))
    utils.unparse_table(signal_table, output_file, "\t")

    return output_file
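
# Usage sketch (hedged): name_dict entries need at least an "enhancer_file" path and a
# "background" flag; all names and paths below are hypothetical, and make_median_dict
# is the helper shown in Example No. 11 below.
name_dict = {
    "SAMPLE_A": {"enhancer_file": "/data/rose/SAMPLE_A_AllEnhancers.table.txt",
                 "background": True},
    "SAMPLE_B": {"enhancer_file": "/data/rose/SAMPLE_B_AllEnhancers.table.txt",
                 "background": False},
}
median_dict = make_median_dict(name_dict)

signal_table_file = make_enhancer_signal_table(
    name_dict,
    "/data/rose/HG19_MERGED_META_MAP.txt",  # hypothetical merged region map
    median_dict,
    "my_analysis",
    "hg19",
    "/data/output/",
)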
Example No. 5
def make_signal_table(
    names_list, gff_file, mapped_folder, median_norm=False, output=""
):
    """For each sample, make a dictionary keyed by locus ID."""
    signal_dict = {}
    for name in names_list:
        signal_dict[name] = defaultdict(float)

    # now start filling in the signal dict
    gff_name = os.path.basename(gff_file).split(".")[0]
    print(gff_name)
    for name in names_list:
        print("MAKING SIGNAL DICT FOR %s" % (name))

        # open the batch mapping output
        mapped_file = os.path.join(
            mapped_folder, gff_name, "{}_{}.txt".format(gff_name, name)
        )
        if utils.check_output(mapped_file, 0.02, 0.02):
            print("FOUND MAPPED FILE FOR {} AT {}".format(name, mapped_file))
        else:
            print("ERROR NO MAPPED FILE FOUND FOR {}".format(name))
            sys.exit()

        mapped_table = utils.parse_table(mapped_file, "\t")
        if median_norm:
            median_signal = numpy.median([float(line[2]) for line in mapped_table[1:]])
        else:
            median_signal = 1

        for line in mapped_table[1:]:
            signal_dict[name][line[1]] = float(line[2]) / median_signal

    # now make the signal table
    signal_table = []
    header = ["GENE_ID", "locusLine"] + names_list
    signal_table.append(header)

    for line in mapped_table[1:]:
        locus_id = line[1]
        sig_line = line[0:2] + [signal_dict[name][locus_id] for name in names_list]
        signal_table.append(sig_line)

    if not output:
        return signal_table
    else:
        utils.unparse_table(signal_table, output, "\t")
        return signal_table
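
# Usage sketch (hedged): assumes the batch-mapped files sit at
# <mapped_folder>/<gff_name>/<gff_name>_<name>.txt; names and paths are hypothetical.
names_list = ["SAMPLE_A", "SAMPLE_B"]
signal_table = make_signal_table(
    names_list,
    gff_file="/data/gff/HG19_enhancers_-0_+0.gff",
    mapped_folder="/data/mapped_gff/",
    median_norm=True,   # divide each sample's signal by its median
    output="/data/output/HG19_enhancers_signal.txt",
)
print(signal_table[0])  # ['GENE_ID', 'locusLine', 'SAMPLE_A', 'SAMPLE_B']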
Example No. 6
def format_data_table(data_file):
    """Formats the data_file and rewrite.

    First 3 columns are required for every line. If they aren't there the line is deleted.

    """
    print("reformatting data table")

    data_table = utils.parse_table(data_file, "\t")

    new_data_table = [
        [
            "FILE_PATH",
            "UNIQUE_ID",
            "GENOME",
            "NAME",
            "BACKGROUND",
            "ENRICHED_REGION",
            "ENRICHED_MACS",
            "COLOR",
            "FASTQ_FILE",
        ]
    ]
    # first check to make sure the table is formatted correctly
    for line in data_table[1:]:
        if len(line) < 3:
            continue
        # this spots header lines that may be out of place
        if line[0] == "FILE_PATH":
            continue
        # check if it at least has the first 3 columns filled in
        if len(line[0]) == 0 or len(line[1]) == 0 or len(line[2]) == 0:
            print("ERROR required fields missing in line")
            print(line)
        # if the first three are filled in, check to make sure there are 8 columns
        else:
            if len(line) > 3 and len(line) < 9:
                new_line = line + (8 - len(line)) * [""] + ["NA"]
                new_data_table.append(new_line)
            elif len(line) >= 9:
                new_line = line[0:9]
                new_data_table.append(new_line)

    # lower case all of the genomes
    # make the color 0,0,0 for blank lines and strip out any " marks
    for i in range(1, len(new_data_table)):
        new_data_table[i][2] = new_data_table[i][2].lower()
        color = new_data_table[i][7]
        if len(color) == 0:
            new_data_table[i][7] = "0,0,0"
    utils.unparse_table(new_data_table, data_file, "\t")

    return new_data_table
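
# Usage sketch (hedged): a minimal data table that format_data_table would accept and
# rewrite in place; every path, ID and name below is invented.
import utils

example_table = [
    ["FILE_PATH", "UNIQUE_ID", "GENOME", "NAME", "BACKGROUND",
     "ENRICHED_REGION", "ENRICHED_MACS", "COLOR", "FASTQ_FILE"],
    ["/data/bams/", "A1", "HG19", "SAMPLE_A", "SAMPLE_A_WCE",
     "/data/macs/SAMPLE_A_peaks.bed", "", "255,0,0", "NA"],
    ["/data/bams/", "A2", "HG19", "SAMPLE_A_WCE", "NONE", "", "", "", "NA"],
]
utils.unparse_table(example_table, "/data/tables/example_data_table.txt", "\t")

# pads short lines, lowercases the genome column and fills in a default 0,0,0 color
format_data_table("/data/tables/example_data_table.txt")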
Example No. 7
def assign_enhancer_rank(enhancer_to_gene_file,
                         enhancer_file1,
                         enhancer_file2,
                         name1,
                         name2,
                         rank_output=""):
    """Assign enhancer rank to genes.

    For all genes in the enhancer_to_gene table, assign the highest overlapping ranked enhancer
    in the other tables.

    """
    enhancer_to_gene = utils.parse_table(enhancer_to_gene_file, "\t")

    enhancer_collection1 = make_se_collection(enhancer_file1, name1, False)
    enhancer_collection2 = make_se_collection(enhancer_file2, name2, False)

    enhancer_dict1 = make_se_dict(enhancer_file1, name1, False)
    enhancer_dict2 = make_se_dict(enhancer_file2, name2, False)

    # we're going to update the enhancer_to_gene_table
    enhancer_to_gene[0] += ["{}_rank".format(name1), "{}_rank".format(name2)]
    for i in range(1, len(enhancer_to_gene)):
        line = enhancer_to_gene[i]
        locus_line = utils.Locus(line[1], line[2], line[3], ".", line[0])

        # if the enhancer doesn't exist, its ranking is dead last on the enhancer list
        enhancer1_overlap = enhancer_collection1.get_overlap(
            locus_line, "both")
        if len(enhancer1_overlap) == 0:
            enhancer1_rank = len(enhancer_collection1)
        else:
            rank_list1 = [
                enhancer_dict1[x.id]["rank"] for x in enhancer1_overlap
            ]
            enhancer1_rank = min(rank_list1)

        enhancer2_overlap = enhancer_collection2.get_overlap(
            locus_line, "both")
        if len(enhancer2_overlap) == 0:
            enhancer2_rank = len(enhancer_collection2)
        else:
            rank_list2 = [
                enhancer_dict2[x.id]["rank"] for x in enhancer2_overlap
            ]
            enhancer2_rank = min(rank_list2)
        enhancer_to_gene[i] += [enhancer1_rank, enhancer2_rank]

    if len(rank_output) == 0:
        return enhancer_to_gene
    else:
        utils.unparse_table(enhancer_to_gene, rank_output, "\t")
Example No. 8
def make_se_collection(enhancer_file, name, super_only=True):
    """Return a locus collection from a super table."""
    enhancer_table = utils.parse_table(enhancer_file, "\t")
    enhancer_loci = []
    for line in enhancer_table:
        if line[0][0] == "#" or line[0][0] == "R":
            continue
        else:
            if super_only and int(line[-1]) == 0:
                break
            enhancer_loci.append(
                utils.Locus(line[1], line[2], line[3], ".",
                            "{}_{}".format(name, line[0])))

    return utils.LocusCollection(enhancer_loci, 50)
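
# Usage sketch (hedged): the AllEnhancers table path is hypothetical. With
# super_only=True the loop stops at the first row whose super flag is 0, so only
# super-enhancers are kept; pass super_only=False to keep every enhancer.
se_collection = make_se_collection(
    "/data/rose/SAMPLE_A_AllEnhancers.table.txt", "SAMPLE_A", super_only=True
)
print("{} super-enhancer loci loaded".format(len(se_collection)))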
Example No. 9
def get_median_signal(enhancer_file, name, data_file):
    """Return the median enhancer signal of a file."""
    data_dict = pipeline_utils.load_data_table(data_file)
    enhancer_table = utils.parse_table(enhancer_file, "\t")
    background_name = data_dict[name]["background"]
    if background_name in data_dict:
        enhancer_vector = [
            float(line[6]) - float(line[7]) for line in enhancer_table[6:]
        ]
    else:
        enhancer_vector = [float(line[6]) for line in enhancer_table[6:]]

    median = numpy.median(enhancer_vector)

    return median
Example No. 10
def filter_gff(gff_file, chrom_list):
    """Take in a gff and filter out all lines that don't belong to a chrom in the chrom_list."""
    gff = utils.parse_table(gff_file, "\t")
    filtered_gff = []
    exclude_list = []
    for line in gff:
        if chrom_list.count(line[0]) == 1:
            filtered_gff.append(line)
        else:
            exclude_list.append(line[0])

    exclude_list = utils.uniquify(exclude_list)
    if len(exclude_list) > 0:
        print("EXCLUDED GFF REGIONS FROM THE FALLING CHROMS: {}".format(
            ",".join(exclude_list)))

    return filtered_gff
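
# Usage sketch (hedged): chrom_list would normally come from
# rose2_utils.get_bam_chrom_list(bam_file_list), as in the main() example below;
# the gff path is hypothetical.
chrom_list = ["chr1", "chr2", "chrX"]
filtered_gff = filter_gff("/data/gff/HG19_enhancers_-0_+0.gff", chrom_list)
print("{} regions kept after chrom filtering".format(len(filtered_gff)))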
Example No. 11
def make_median_dict(name_dict):
    """For each dataset returns the median background subtracted enhancer signal."""
    median_dict = {}
    for name in name_dict:
        # open up the allenhancer_table
        enhancer_table = utils.parse_table(name_dict[name]["enhancer_file"],
                                           "\t")

        if name_dict[name]["background"] is True:
            # assume header ends after line 5
            enhancer_vector = [
                float(line[6]) - float(line[7]) for line in enhancer_table[6:]
            ]
        else:
            enhancer_vector = [float(line[6]) for line in enhancer_table[6:]]

        median_dict[name] = numpy.median(enhancer_vector)

    return median_dict
Example No. 12
def load_annot_file(genome, tss_window, gene_list=[]):
    """Load in the annotation.

    Create a start_dict and tss collection for a set of refseq IDs for a given genome.

    """
    annotation_folder = os.path.join(ROOT_DIR, "annotation")
    genome_dict = {
        "HG18": os.path.join(annotation_folder, "hg18_refseq.ucsc"),
        "MM9": os.path.join(annotation_folder, "mm9_refseq.ucsc"),
        "MM10": os.path.join(annotation_folder, "mm10_refseq.ucsc"),
        "HG19": os.path.join(annotation_folder, "hg19_refseq.ucsc"),
        "HG19_RIBO": os.path.join(annotation_folder, "hg19_refseq.ucsc"),
        "RN4": os.path.join(annotation_folder, "rn4_refseq.ucsc"),
        "RN6": os.path.join(annotation_folder, "rn6_refseq.ucsc"),
        "HG38": os.path.join(annotation_folder, "hg38_refseq.ucsc"),
    }

    mouse_convert_file = os.path.join(annotation_folder,
                                      "HMD_HumanPhenotype.rpt")

    # making a dictionary for mouse to human conversion
    mouse_convert_dict = defaultdict(str)

    mouse_convert_table = utils.parse_table(mouse_convert_file, "\t")
    for line in mouse_convert_table:
        mouse_convert_dict[line[4]] = line[0]

    annot_file = genome_dict[genome.upper()]

    start_dict = utils.make_start_dict(annot_file, gene_list)
    tss_loci = []
    if not gene_list:
        gene_list = [*start_dict]
    for gene in gene_list:
        tss_loci.append(
            utils.make_tss_locus(gene, start_dict, tss_window, tss_window))

    tss_collection = utils.LocusCollection(tss_loci, 50)

    return start_dict, tss_collection, mouse_convert_dict
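
# Usage sketch (hedged): assumes the packaged refseq annotation files exist under
# ROOT_DIR/annotation; `utils` is the same helper module used above.
import utils

start_dict, tss_collection, mouse_convert_dict = load_annot_file("hg19", 1000)

# start_dict is keyed by refseq ID; tss_collection supports overlap queries
query = utils.Locus("chr1", 1000000, 1001000, ".")
print("{} TSS loci overlap the query".format(
    len(tss_collection.get_overlap(query, "both"))))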
Example No. 13
def make_se_dict(enhancer_file, name, super_only=True):
    """Make an attribute dict for enhancers keyed by uniqueID."""
    se_dict = {}
    enhancer_table = utils.parse_table(enhancer_file, "\t")
    for line in enhancer_table:
        if line[0][0] == "#":
            continue
        if line[0][0] == "R":
            header = line
            sup_column = header.index("isSuper")
            continue
        if super_only:
            if int(line[sup_column]) == 1:
                rank = int(line[-2])
                enhancer_id = "{}_{}".format(name, line[0])
                se_dict[enhancer_id] = {"rank": rank}
        else:
            rank = int(line[-2])
            enhancer_id = "{}_{}".format(name, line[0])
            se_dict[enhancer_id] = {"rank": rank}

    return se_dict
Example No. 14
def make_se_collection(enhancer_file, name, top=0):
    """Return a locus collection from a super table.

    Top gives the number of top-ranked rows to keep; 0 keeps all rows.

    """
    enhancer_table = utils.parse_table(enhancer_file, "\t")
    super_loci = []

    ticker = 0
    for line in enhancer_table:
        if line[0][0] == "#" or line[0][0] == "R":
            continue
        else:
            ticker += 1
            super_loci.append(
                utils.Locus(line[1], line[2], line[3], ".",
                            "{}_{}".format(name, line[0])))

            if ticker == top:
                break

    return utils.LocusCollection(super_loci, 50)
Example No. 15
def load_data_table(data_file):
    """Load the master data table."""

    if isinstance(data_file, str):
        data_table = utils.parse_table(data_file, "\t")
    else:
        data_table = list(data_file)
    # first check to make sure the table is formatted correctly
    for line in data_table:
        # print(line)
        if len(line) != 9:
            print("this line did not pass")
            print(line)
            data_table = format_data_table(data_file)
            break

    data_dict = defaultdict(dict)
    for line in data_table[1:]:
        data_dict[line[3]]["folder"] = utils.format_folder(line[0], False)
        data_dict[line[3]]["uniqueID"] = line[1]
        data_dict[line[3]]["genome"] = line[2].upper()
        genome = line[2]

        data_dict[line[3]]["sam"] = "".join([line[0], line[1], ".", genome, ".bwt.sam"])
        data_dict[line[3]]["ylf"] = "".join([line[0], line[1], ".", genome, ".bwt.ylf"])
        data_dict[line[3]]["enriched"] = line[5]
        data_dict[line[3]]["background"] = line[4]
        data_dict[line[3]]["enrichedMacs"] = line[6]
        color_string = line[7].replace('"', "")
        data_dict[line[3]]["color"] = color_string
        data_dict[line[3]]["fastq"] = line[8]

        # figure out which bam convention we are using
        # default will be new convention
        # look in the bam_folder for all bams that might fit the bill
        bam_folder = str(line[0])
        bam_file_list = [
            x for x in os.listdir(bam_folder) if len(x) > 0 and x[0] != "."
        ]

        bam_file_candidates = [
            x
            for x in bam_file_list
            if x.count(line[1]) == 1
            and x.split(".")[-1] == "bam"
            and x.count("bai") == 0
        ]
        if not bam_file_candidates:
            print(
                "UNABLE TO FIND A BAM FILE IN {} WITH UNIQUE ID {}".format(
                    bam_folder, line[1]
                )
            )
            full_bam_path = ""
        elif len(bam_file_candidates) > 1:
            print(
                "MUTLIPLE BAM FILES IN {} WITH UNIQUE ID {}. NO BAM ASISGNED".format(
                    bam_folder, line[1]
                )
            )
            print(bam_file_candidates)
            full_bam_path = ""
        else:
            bam_file = bam_file_candidates[0]
            full_bam_path = os.path.abspath(os.path.join(bam_folder, bam_file))
            full_bai_path = full_bam_path + ".bai"

        if full_bam_path:
            try:
                open(full_bam_path, "r").close()
            except (IOError, FileNotFoundError):
                print("ERROR: BAM FILE {} DOES NOT EXIST".format(full_bam_path))
                full_bam_path = ""
            try:
                open(full_bai_path, "r").close()
            except (IOError, FileNotFoundError):
                print(
                    "ERROR: BAM FILE {} DOES NOT HAVE BAI INDEX".format(full_bam_path)
                )
                full_bam_path = ""

        data_dict[line[3]]["bam"] = full_bam_path

    return data_dict
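
# Usage sketch (hedged): path and dataset name are hypothetical; data_dict is keyed
# by the NAME column of the data table.
data_dict = load_data_table("/data/tables/example_data_table.txt")

sample = data_dict["SAMPLE_A"]
print(sample["bam"])         # full path to the matching .bam ("" if missing/unindexed)
print(sample["background"])  # NAME of the background dataset, e.g. "SAMPLE_A_WCE"
print(sample["genome"], sample["enriched"], sample["color"])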
Example No. 16
def main():
    """Main run call."""
    debug = False
    parser = argparse.ArgumentParser()
    # required flags
    parser.add_argument(
        "-i",
        "--i",
        dest="input",
        required=True,
        help="Enter a .gff or .bed file of binding sites used to make enhancers",
    )
    parser.add_argument(
        "-r",
        "--rankby",
        dest="rankby",
        required=True,
        help="bam_file to rank enhancer by",
    )
    parser.add_argument(
        "-o", "--out", dest="out", required=True, help="Enter an output folder"
    )
    parser.add_argument(
        "-g",
        "--genome",
        dest="genome",
        required=True,
        help="Enter the genome build (MM9,MM8,HG18,HG19)",
    )

    # optional flags
    parser.add_argument(
        "-b",
        "--bams",
        dest="bams",
        required=False,
        help="Enter a comma separated list of additional bam files to map to",
    )
    parser.add_argument(
        "-c",
        "--control",
        dest="control",
        required=False,
        help="bam_file to rank enhancer by",
    )
    parser.add_argument(
        "-s",
        "--stitch",
        dest="stitch",
        default="",
        help=(
            "Enter a max linking distance for stitching. Default will determine optimal stitching"
            " parameter"
        ),
    )
    parser.add_argument(
        "-t",
        "--tss",
        dest="tss",
        default=0,
        help="Enter a distance from TSS to exclude. 0 = no TSS exclusion",
    )

    parser.add_argument(
        "--mask",
        dest="mask",
        required=False,
        help="Mask a set of regions from analysis.  Provide a .bed or .gff of masking regions",
    )

    # RETRIEVING FLAGS
    args = parser.parse_args()

    # making the out folder if it doesn't exist
    out_folder = utils.format_folder(args.out, True)

    # figuring out folder schema
    gff_folder = utils.format_folder(os.path.join(out_folder, "gff"), True)
    mapped_folder = utils.format_folder(os.path.join(out_folder, "mapped_gff"), True)

    # GETTING INPUT FILE
    if args.input.split(".")[-1] == "bed":
        # CONVERTING A BED TO GFF
        input_gff_name = args.input.split("/")[-1][0:-4]
        input_gff_file = os.path.join(gff_folder, "{}.gff".format(input_gff_name))
        utils.bed_to_gff(args.input, input_gff_file)
    elif args.input.split(".")[-1] == "gff":
        # COPY THE INPUT GFF TO THE GFF FOLDER
        input_gff_file = args.input
        copyfile(
            input_gff_file, os.path.join(gff_folder, os.path.basename(input_gff_file))
        )

    else:
        print(
            "WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT"
        )
        # COPY THE INPUT GFF TO THE GFF FOLDER
        input_gff_file = args.input
        copyfile(
            input_gff_file, os.path.join(gff_folder, os.path.basename(input_gff_file))
        )

    # GETTING THE LIST OF bam_fileS TO PROCESS
    if args.control:
        bam_file_list = [args.rankby, args.control]

    else:
        bam_file_list = [args.rankby]

    if args.bams:
        bam_file_list += args.bams.split(",")
        # bam_file_list = utils.uniquify(bam_file_list) # makes sad when you have the same control
        # bam over and over again
    # optional args

    # Stitch parameter
    if args.stitch == "":
        stitch_window = ""
    else:
        stitch_window = int(args.stitch)

    # tss args
    tss_window = int(args.tss)
    if tss_window != 0:
        remove_tss = True
    else:
        remove_tss = False

    # GETTING THE BOUND REGION FILE USED TO DEFINE ENHANCERS
    print("USING {} AS THE INPUT GFF".format(input_gff_file))
    input_name = os.path.basename(input_gff_file).split(".")[0]

    # GETTING THE GENOME
    genome = args.genome
    print("USING {} AS THE GENOME".format(genome))

    annot_file = rose2_utils.genome_dict[genome.upper()]

    # GET CHROMS FOUND IN THE BAMS
    print("GETTING CHROMS IN bam_fileS")
    bam_chrom_list = rose2_utils.get_bam_chrom_list(bam_file_list)
    print("USING THE FOLLOWING CHROMS")
    print(bam_chrom_list)

    # LOADING IN THE GFF AND FILTERING BY CHROM
    print("LOADING AND FILTERING THE GFF")
    input_gff = rose2_utils.filter_gff(input_gff_file, bam_chrom_list)
    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print("LOADING IN GFF REGIONS")
    reference_collection = utils.gff_to_locus_collection(input_gff)
    print("STARTING WITH {} INPUT REGIONS".format(len(reference_collection)))
    print("CHECKING REFERENCE COLLECTION:")
    rose2_utils.check_ref_collection(reference_collection)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if args.mask:
        mask_file = args.mask
        print("USING MASK FILE {}".format(mask_file))
        # if it's a bed file
        if mask_file.split(".")[-1].upper() == "BED":
            mask_gff = utils.bed_to_gff(mask_file)
        elif mask_file.split(".")[-1].upper() == "GFF":
            mask_gff = utils.parse_table(mask_file, "\t")
        else:
            print("MASK MUST BE A .gff or .bed FILE")

        mask_collection = utils.gff_to_locus_collection(mask_gff)
        print("LOADING {} MASK REGIONS".format(str(len(mask_collection))))
        # now mask the reference loci
        reference_loci = reference_collection.get_loci()
        filtered_loci = [
            locus
            for locus in reference_loci
            if len(mask_collection.get_overlap(locus, "both")) == 0
        ]
        print(
            "FILTERED OUT {} LOCI THAT WERE MASKED IN {}".format(
                str(len(reference_loci) - len(filtered_loci)), mask_file
            )
        )
        reference_collection = utils.LocusCollection(filtered_loci, 50)

    # NOW STITCH REGIONS
    print("STITCHING REGIONS TOGETHER")
    stitched_collection, debug_output, stitch_window = rose2_utils.region_stitching(
        reference_collection,
        input_name,
        out_folder,
        stitch_window,
        tss_window,
        annot_file,
        remove_tss,
    )
    # NOW MAKE A STITCHED COLLECTION GFF
    print("MAKING GFF FROM STITCHED COLLECTION")
    stitched_gff = utils.locus_collection_to_gff(stitched_collection)
    # making sure start/stop ordering are correct
    for i in range(len(stitched_gff)):

        line = stitched_gff[i]
        start = int(line[3])
        stop = int(line[4])
        if start > stop:
            line[3] = stop
            line[4] = start

    print(stitch_window)
    print(type(stitch_window))
    if not remove_tss:
        stitched_gff_file = os.path.join(
            gff_folder,
            "{}_{}KB_STITCHED.gff".format(input_name, str(stitch_window // 1000)),
        )
        stitched_gff_name = "{}_{}KB_STITCHED".format(
            input_name, str(stitch_window // 1000)
        )
        debug_out_file = os.path.join(
            gff_folder,
            "{}_{}KB_STITCHED.debug".format(input_name, str(stitch_window // 1000)),
        )
    else:
        stitched_gff_file = os.path.join(
            gff_folder,
            "{}_{}KB_STITCHED_TSS_DISTAL.gff".format(
                input_name, str(stitch_window // 1000)
            ),
        )
        stitched_gff_name = "{}_{}KB_STITCHED_TSS_DISTAL".format(
            input_name, str(stitch_window // 1000)
        )
        debug_out_file = os.path.join(
            gff_folder,
            "{}_{}KB_STITCHED_TSS_DISTAL.debug".format(
                input_name, str(stitch_window // 1000)
            ),
        )

    # WRITING DEBUG OUTPUT TO DISK
    if debug:
        print("WRITING DEBUG OUTPUT TO DISK AS {}".format(debug_out_file))
        utils.unparse_table(debug_output, debug_out_file, "\t")

    # WRITE THE GFF TO DISK
    print("WRITING STITCHED GFF TO DISK AS {}".format(stitched_gff_file))
    utils.unparse_table(stitched_gff, stitched_gff_file, "\t")

    # SETTING UP THE OVERALL OUTPUT FILE
    output_file1 = os.path.join(
        out_folder, "{}_ENHANCER_REGION_MAP.txt".format(stitched_gff_name)
    )
    print("OUTPUT WILL BE WRITTEN TO  {}".format(output_file1))

    # MAPPING TO THE NON STITCHED (ORIGINAL GFF)
    # MAPPING TO THE STITCHED GFF

    bam_file_list_unique = list(bam_file_list)
    bam_file_list_unique = utils.uniquify(bam_file_list_unique)
    # prevent redundant mapping
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bam_file_list_unique)
    for bam_file in bam_file_list_unique:

        bam_file_name = os.path.basename(bam_file)

        # MAPPING TO THE STITCHED GFF
        mapped_out1_folder = os.path.join(
            mapped_folder, "{}_{}_MAPPED".format(stitched_gff_name, bam_file_name)
        )
        mapped_out1_file = os.path.join(
            mapped_folder,
            "{}_{}_MAPPED".format(stitched_gff_name, bam_file_name),
            "matrix.txt",
        )
        if utils.check_output(mapped_out1_file, 0.2, 0.2):
            print(
                "FOUND {} MAPPING DATA FOR BAM: {}".format(
                    stitched_gff_file, mapped_out1_file
                )
            )
        else:
            cmd1 = "bamliquidator_batch --sense . -e 200 --match_bamToGFF -r {} -o {} {}".format(
                stitched_gff_file, mapped_out1_folder, bam_file,
            )
            print(cmd1)

            os.system(cmd1)
            if utils.check_output(mapped_out1_file, 0.2, 5):
                print(
                    "SUCCESSFULLY MAPPED TO {} FROM BAM: {}".format(
                        stitched_gff_file, bam_file_name
                    )
                )
            else:
                print(
                    "ERROR: FAILED TO MAP {} FROM BAM: {}".format(
                        stitched_gff_file, bam_file_name
                    )
                )
                sys.exit()

    print("BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS")
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    rose2_utils.map_collection(
        stitched_collection,
        reference_collection,
        bam_file_list,
        mapped_folder,
        output_file1,
        ref_name=stitched_gff_name,
    )

    print("CALLING AND PLOTTING SUPER-ENHANCERS")

    if args.control:
        control_name = os.path.basename(args.control)
    else:
        control_name = "NONE"
    cmd = "Rscript {} {} {} {} {}".format(
        os.path.join(ROOT_DIR, "scripts", "ROSE2_callSuper.R"),
        out_folder + "/",  # TODO: fix R script so it does not require '/'
        output_file1,
        input_name,
        control_name,
    )
    print(cmd)

    os.system(cmd)

    # calling the gene mapper
    time.sleep(20)
    super_table_file = "{}_SuperEnhancers.table.txt".format(input_name)
    if args.control:
        cmd = "ROSE2_geneMapper -g {} -r {} -c {} -i {}".format(
            genome,
            args.rankby,
            args.control,
            os.path.join(out_folder, super_table_file),
        )
    else:
        cmd = "ROSE2_geneMapper -g {} -r {} -i {}".format(
            genome, args.rankby, os.path.join(out_folder, super_table_file)
        )
    os.system(cmd)

    stretch_table_file = "{}_StretchEnhancers.table.txt".format(input_name)
    if args.control:
        cmd = "ROSE2_geneMapper -g {} -r {} -c {} -i {}".format(
            genome,
            args.rankby,
            args.control,
            os.path.join(out_folder, stretch_table_file),
        )
    else:
        cmd = "ROSE2_geneMapper -g {} -r {} -i {}".format(
            genome, args.rankby, os.path.join(out_folder, stretch_table_file)
        )
    os.system(cmd)

    superstretch_table_file = "{}_SuperStretchEnhancers.table.txt".format(input_name)
    if args.control:
        cmd = "ROSE2_geneMapper -g {} -r {} -c {} -i {}".format(
            genome,
            args.rankby,
            args.control,
            os.path.join(out_folder, superstretch_table_file),
        )
    else:
        cmd = "ROSE2_geneMapper -g {} -r {} -i {}".format(
            genome, args.rankby, os.path.join(out_folder, superstretch_table_file)
        )
    os.system(cmd)
Example No. 17
def map_bams(bam_file_list, split_gff_path, analysis_name, mapped_folder):
    """Map bams to a GFF."""
    print("MAPPING TO THE FOLLOWING BAMS:")

    for bam_file in bam_file_list:
        print(bam_file)
        bam_file_name = os.path.basename(bam_file)

        # MAPPING TO THE STITCHED GFF
        mapped_out_folder = os.path.join(
            mapped_folder,
            "{}_{}_MAPPED".format(analysis_name, bam_file_name),
        )
        mapped_out_file = os.path.join(mapped_out_folder, "matrix.txt")
        if utils.check_output(mapped_out_file, 0.2, 0.2):
            print("FOUND {} MAPPING DATA FOR BAM: {}".format(
                split_gff_path, mapped_out_file))
        else:
            cmd = "bamliquidator_batch --sense . -e 200 --match_bamToGFF -r {} -o {} {}".format(
                split_gff_path,
                mapped_out_folder,
                bam_file,
            )
            print(cmd)

            os.system(cmd)
            if utils.check_output(mapped_out_file, 0.2, 5):
                print("SUCCESSFULLY MAPPED TO {} FROM BAM: {}".format(
                    split_gff_path,
                    bam_file_name,
                ))
            else:
                print("ERROR: FAILED TO MAP {} FROM BAM: {}".format(
                    split_gff_path,
                    bam_file_name,
                ))
                sys.exit()

    print("BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS")

    # now we make a signal table
    # set up the table using the first bam
    if len(bam_file_list) > 1:

        # set up the first pass at the table
        signal_table = [
            ["REGION_ID", "locusLine"] +
            [name.split("/")[-1] for name in bam_file_list],
        ]
        bam_file_name = bam_file_list[0].split("/")[-1]
        mapped_table = utils.parse_table(
            os.path.join(
                mapped_folder,
                "{}_{}_MAPPED".format(analysis_name, bam_file_name),
                "matrix.txt",
            ),
            "\t",
        )
        # append the rows from the first bam beneath the header
        signal_table += mapped_table[1:]

        for bam_file in bam_file_list[1:]:
            bam_file_name = bam_file.split("/")[-1]

            mapped_table = utils.parse_table(
                os.path.join(
                    mapped_folder,
                    "{}_{}_MAPPED".format(analysis_name, bam_file_name),
                    "matrix.txt",
                ),
                "\t",
            )

            for i in range(1, len(mapped_table)):
                map_signal = mapped_table[i][2]
                signal_table[i].append(map_signal)
    else:
        bam_file_name = bam_file_list[0].split("/")[-1]
        signal_table = utils.parse_table(
            os.path.join(
                mapped_folder,
                "{}_{}_MAPPED".format(analysis_name, bam_file_name),
                "matrix.txt",
            ),
            "\t",
        )

    return signal_table
Example No. 18
def finish_rank_output(
    data_file,
    rank_output,
    genome,
    merge_folder,
    merge_name,
    name1,
    name2,
    cut_off=1.5,
    window=100000,
    super_only=True,
    plot_bam=True,
):
    """Finish rank output.

    Clean up the rank output table. Make a gff of all of the gained/lost supers beyond a certain
    cut_off w/ a window, a list of gained and lost genes, and a bed of gained/lost/conserved regions.

    """
    data_dict = pipeline_utils.load_data_table(data_file)
    # making sure window and cut_off are int/float
    cut_off = float(cut_off)
    window = int(window)
    genome = genome.upper()

    # make the output folder
    output_folder = utils.format_folder(os.path.join(merge_folder, "output"),
                                        True)

    # bring in the old rank table
    rank_enhancer_table = utils.parse_table(rank_output, "\t")

    # make a new formatted table
    header = rank_enhancer_table[0]
    header[-4] = "DELTA RANK"
    header[-3] = "IS_SUPER"
    formatted_rank_table = [header]

    # the gffs
    gained_gff = []
    lost_gff = []

    gained_window_gff = []
    lost_window_gff = []

    if super_only:
        enhancer_type = "SUPERS"
    else:
        enhancer_type = "ENHANCERS"

    # the beds
    if super_only:
        gained_track_header = (
            'track name="{} {} only SEs" description="{} super enhancers that are found only in '
            '{} vs {}" itemRGB=On color=255,0,0'.format(
                genome, name2, genome, name2, name1))
        gained_bed = [[gained_track_header]]
        conserved_track_header = (
            'track name="{} {} and {} SEs" description="{} super enhancers that are found in both'
            ' {} vs {}" itemRGB=On color=0,0,0'.format(genome, name1, name2,
                                                       genome, name1, name2))
        conserved_bed = [[conserved_track_header]]

        lost_track_header = (
            'track name="{} {} only SEs" description="{} super enhancers that are found only in '
            '{} vs {}" itemRGB=On color=0,255,0'.format(
                genome, name1, genome, name1, name2))
        lost_bed = [[lost_track_header]]
    else:
        gained_track_header = (
            'track name="{} {} only enhancers" description="{} enhancers that are found only in '
            '{} vs {}" itemRGB=On color=255,0,0'.format(
                genome, name2, genome, name2, name1))
        gained_bed = [[gained_track_header]]
        conserved_track_header = (
            'track name="{} {} and {} enhancers" description="{} enhancers that are found in both'
            ' {} vs {}" itemRGB=On color=0,0,0'.format(genome, name1, name2,
                                                       genome, name1, name2))
        conserved_bed = [[conserved_track_header]]

        lost_track_header = (
            'track name="{} {} only enhancers" description="{} enhancers that are found only in '
            '{} vs {}" itemRGB=On color=0,255,0'.format(
                genome, name1, genome, name1, name2))
        lost_bed = [[lost_track_header]]

    # the genes
    gene_table = [[
        "GENE",
        "ENHANCER_ID",
        "ENHANCER_CHROM",
        "ENHANCER_START",
        "ENHANCER_STOP",
        header[6],
        header[7],
        header[8],
        "STATUS",
    ]]

    for line in rank_enhancer_table[1:]:
        # fixing the enhancer ID
        line[0] = line[0].replace("_lociStitched", "")
        formatted_rank_table.append(line)

        # getting the genes
        gene_list = []
        gene_list += line[9].split(",")
        gene_list += line[10].split(",")
        gene_list += line[11].split(",")
        gene_list = [x for x in gene_list if len(x) > 0]
        gene_list = utils.uniquify(gene_list)
        gene_string = ",".join(gene_list)

        bed_line = [line[1], line[2], line[3], line[0], line[-4]]

        # for gained
        if float(line[6]) > cut_off:
            gff_line = [
                line[1],
                line[0],
                "",
                line[2],
                line[3],
                "",
                ".",
                "",
                gene_string,
            ]
            gff_window_line = [
                line[1],
                line[0],
                "",
                int(line[2]) - window,
                int(line[3]) + window,
                "",
                ".",
                "",
                gene_string,
            ]
            gained_gff.append(gff_line)
            gained_window_gff.append(gff_window_line)
            gene_status = name2
            gained_bed.append(bed_line)
        # for lost
        elif float(line[6]) < (-1 * cut_off):
            gff_line = [
                line[1],
                line[0],
                "",
                line[2],
                line[3],
                "",
                ".",
                "",
                gene_string,
            ]
            gff_window_line = [
                line[1],
                line[0],
                "",
                int(line[2]) - window,
                int(line[3]) + window,
                "",
                ".",
                "",
                gene_string,
            ]
            lost_gff.append(gff_line)
            lost_window_gff.append(gff_window_line)
            gene_status = name1
            lost_bed.append(bed_line)
        # for conserved
        else:
            gene_status = "CONSERVED"
            conserved_bed.append(bed_line)

        # now fill in the gene Table
        for gene in gene_list:
            gene_table_line = [
                gene,
                line[0],
                line[1],
                line[2],
                line[3],
                line[6],
                line[7],
                line[8],
                gene_status,
            ]
            gene_table.append(gene_table_line)

    # concat the bed
    full_bed = gained_bed + conserved_bed + lost_bed

    # start writing the output
    # there's the two gffs, the bed,the formatted table, the gene table

    # formatted table
    formatted_filename = os.path.join(
        output_folder,
        "{}_{}_MERGED_{}_RANK_TABLE.txt".format(genome, merge_name,
                                                enhancer_type),
    )
    utils.unparse_table(formatted_rank_table, formatted_filename, "\t")

    # gffs
    gff_folder = utils.format_folder(os.path.join(output_folder, "gff"), True)
    gff_filename_gained = os.path.join(
        gff_folder,
        "{}_{}_{}_ONLY_{}_-0_+0.gff".format(genome, merge_name, name2.upper(),
                                            enhancer_type),
    )
    gff_filename_window_gained = os.path.join(
        gff_folder,
        "{}_{}_{}_ONLY_{}_-{}KB_+{}KB.gff".format(
            genome,
            merge_name,
            name2.upper(),
            enhancer_type,
            str(window // 1000),
            str(window // 1000),
        ),
    )

    gff_filename_lost = os.path.join(
        gff_folder,
        "{}_{}_{}_ONLY_{}_-0_+0.gff".format(genome, merge_name, name1.upper(),
                                            enhancer_type),
    )
    gff_filename_window_lost = os.path.join(
        gff_folder,
        "{}_{}_{}_ONLY_{}_-{}KB_+{}KB.gff".format(
            genome,
            merge_name,
            name1.upper(),
            enhancer_type,
            str(window // 1000),
            str(window // 1000),
        ),
    )

    utils.unparse_table(gained_gff, gff_filename_gained, "\t")
    utils.unparse_table(gained_window_gff, gff_filename_window_gained, "\t")

    utils.unparse_table(lost_gff, gff_filename_lost, "\t")
    utils.unparse_table(lost_window_gff, gff_filename_window_lost, "\t")

    # bed
    bed_filename = os.path.join(
        output_folder, "{}_{}_MERGED_{}.bed".format(genome, merge_name,
                                                    enhancer_type))
    utils.unparse_table(full_bed, bed_filename, "\t")

    # gene_table
    gene_filename = os.path.join(
        output_folder,
        "{}_{}_MERGED_{}_GENE_TABLE.txt".format(genome, merge_name,
                                                enhancer_type),
    )
    utils.unparse_table(gene_table, gene_filename, "\t")

    # finally, move all of the plots to the output folder
    copyfile(
        glob.glob(os.path.join(merge_folder, "{}_ROSE".format(name1),
                               "*.pdf"))[0],
        os.path.join(
            output_folder,
            "{}_{}_MERGED_{}_DELTA.pdf".format(genome, merge_name,
                                               enhancer_type),
        ),
    )

    copyfile(
        glob.glob(
            os.path.join(merge_folder, "{}_ROSE".format(name1),
                         "*RANK_PLOT.png"))[0],
        os.path.join(
            output_folder,
            "{}_{}_MERGED_{}_RANK_PLOT.png".format(genome, merge_name,
                                                   enhancer_type),
        ),
    )

    # now execute the bamPlot_turbo commands
    if plot_bam:
        bam1 = data_dict[name1]["bam"]
        bam2 = data_dict[name2]["bam"]
        bam_string = "{} {}".format(bam1, bam2)
        name_string = "{} {}".format(name1, name2)
        color_string = "0,0,0:100,100,100"

        if len(gained_gff) > 0:
            # gained command
            plot_title = "{}_ONLY_SE".format(name2)
            cmd = (
                "bamPlot_turbo -g {} -b {} -i {} -o {} -n {} -c {} -t {} -r -y UNIFORM -p "
                "MULTIPLE".format(
                    genome,
                    bam_string,
                    gff_filename_gained,
                    output_folder,
                    name_string,
                    color_string,
                    plot_title,
                ))
            os.system(cmd)

            # gained window command
            plot_title = "{}_ONLY_SE_{}KB_WINDOW".format(
                name2, str(window // 1000))
            cmd = (
                "bamPlot_turbo -g {} -b {} -i {} -o {} -n {} -c {} -t {} -r -y UNIFORM -p "
                "MULTIPLE".format(
                    genome,
                    bam_string,
                    gff_filename_window_gained,
                    output_folder,
                    name_string,
                    color_string,
                    plot_title,
                ))
            os.system(cmd)

        if len(lost_gff) > 0:
            # lost command
            plot_title = "{}_ONLY_SE".format(name1)
            cmd = (
                "bamPlot_turbo -g {} -b {} -i {} -o {} -n {} -c {} -t {} -r -y UNIFORM -p "
                "MULTIPLE".format(
                    genome,
                    bam_string,
                    gff_filename_lost,
                    output_folder,
                    name_string,
                    color_string,
                    plot_title,
                ))
            os.system(cmd)

            # lost command
            plot_title = "{}_ONLY_SE_{}KB_WINDOW".format(
                name1, str(window // 1000))
            cmd = (
                "bamPlot_turbo -g {} -b {} -i {} -o {} -n {} -c {} -t {} -r -y UNIFORM -p "
                "MULTIPLE".format(
                    genome,
                    bam_string,
                    gff_filename_window_lost,
                    output_folder,
                    name_string,
                    color_string,
                    plot_title,
                ))
            os.system(cmd)

    return
Example No. 19
def make_average_table(output_folder, analysis_name, use_background=False):
    """Makes a signal table that is the average background subtracted signal for each region.

    If background is present, negative background-subtracted values are zeroed before
    averaging (i.e. no negative regions allowed).

    """
    signal_table_path = os.path.join(
        output_folder,
        "{}_signal_table.txt".format(analysis_name),
    )
    signal_table = utils.parse_table(signal_table_path, "\t")

    average_table = [[
        "GENE_ID", "locusLine", "{}_signal".format(analysis_name)
    ]]

    # first the easy case with no background
    if not use_background:
        for line in signal_table[1:]:
            new_line = line[0:2]
            avg_signal = round(numpy.mean([float(x) for x in line[2:]]), 4)
            new_line.append(avg_signal)
            average_table.append(new_line)

    # now the condition w/ background
    else:
        control_table_path = os.path.join(
            output_folder,
            "{}_control_signal_table.txt".format(analysis_name),
        )
        control_table = utils.parse_table(control_table_path, "\t")

        # checking to make sure the # of backgrounds = number of signal bams
        # otherwise throw an error
        signal_n_col = len(signal_table[0])
        control_n_col = len(control_table[0])

        if signal_n_col != control_n_col:
            print("ERROR: MUST PROVIDE SAME NUMBER OF CONTROL BAMS")
            sys.exit()

        signal_n_rows = len(signal_table)
        control_n_rows = len(control_table)

        if signal_n_rows != control_n_rows:
            print("ERROR: MAPPED FILES ARE NOT THE SAME LENGTH")
            sys.exit()

        for i in range(1, len(signal_table)):
            signal_line = signal_table[i]
            control_line = control_table[i]
            if signal_line[0:2] != control_line[0:2]:
                print("ERROR: REGIONS ON LINE {} DO NOT CORRESPOND".format(
                    str(i)))
                sys.exit()

            new_line = signal_line[0:2]

            signal_values = [float(x) for x in signal_line[2:]]
            control_values = [float(x) for x in control_line[2:]]

            subtracted_values = [
                signal_values[x] - control_values[x]
                for x in range(len(signal_values))
            ]
            subtracted_values = [max(0, x) for x in subtracted_values
                                 ]  # now make negative numbers 0
            avg_signal = round(numpy.mean(subtracted_values), 4)
            new_line.append(avg_signal)
            average_table.append(new_line)

    return average_table
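
# Usage sketch (hedged): assumes <analysis_name>_signal_table.txt (and, with
# use_background=True, <analysis_name>_control_signal_table.txt) already exist in the
# output folder; names and paths are hypothetical.
average_table = make_average_table("/data/output/my_analysis/", "my_analysis",
                                   use_background=True)
print(average_table[0])  # ['GENE_ID', 'locusLine', 'my_analysis_signal']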
Example No. 20
def map_collection(
    stitched_collection,
    reference_collection,
    bam_file_list,
    mapped_folder,
    output,
    ref_name,
):
    """Makes a table of factor density in a stitched locus.

    The table is ordered by stitched locus size (largest first).

    """
    print("FORMATTING TABLE")
    loci = list(stitched_collection.get_loci())

    locus_table = [[
        "REGION_ID", "CHROM", "START", "STOP", "NUM_LOCI", "CONSTITUENT_SIZE"
    ]]

    loci_len_list = []

    # strip out any loci on chrY
    loci = [locus for locus in loci if locus.chr != "chrY"]

    for locus in loci:
        # numLociList.append(int(stitchLocus.id.split('_')[1]))
        loci_len_list.append(locus.len())
        # numOrder = order(numLociList,decreasing=True)
    len_order = utils.order(loci_len_list, decreasing=True)
    ticker = 0
    for i in len_order:
        ticker += 1
        if ticker % 1000 == 0:
            print(ticker)
        locus = loci[i]

        # First get the size of the enriched regions within the stitched locus
        ref_enrich_size = 0
        ref_overlapping_loci = reference_collection.get_overlap(locus, "both")
        for ref_locus in ref_overlapping_loci:
            ref_enrich_size += ref_locus.len()

        try:
            stitch_count = int(locus.id.split("_")[0])
        except ValueError:
            stitch_count = 1
        coords = [int(x) for x in locus.coords()]

        locus_table.append([
            locus.id,
            locus.chr,
            min(coords),
            max(coords),
            stitch_count,
            ref_enrich_size,
        ])

    print("GETTING MAPPED DATA")
    print("USING A bam_file LIST:")
    print(bam_file_list)
    for bam_file in bam_file_list:

        bam_file_name = os.path.basename(bam_file)

        print("GETTING MAPPING DATA FOR  {}".format(bam_file))
        # assumes standard convention for naming enriched region gffs

        # opening up the mapped GFF
        mapped_gff_file = os.path.join(
            mapped_folder, "{}_{}_MAPPED".format(ref_name, bam_file_name),
            "matrix.txt")
        print("OPENING {}".format(mapped_gff_file))

        mapped_gff = utils.parse_table(mapped_gff_file, "\t")

        signal_dict = defaultdict(float)
        print("MAKING SIGNAL DICT FOR {}".format(bam_file))
        mapped_loci = []
        for line in mapped_gff[1:]:

            chrom = line[1].split("(")[0]
            start = int(line[1].split(":")[-1].split("-")[0])
            end = int(line[1].split(":")[-1].split("-")[1])
            mapped_loci.append(utils.Locus(chrom, start, end, ".", line[0]))
            try:
                signal_dict[line[0]] = float(line[2]) * (abs(end - start))
            except ValueError:
                print("WARNING NO SIGNAL FOR LINE:")
                print(line)
                continue

        mapped_collection = utils.LocusCollection(mapped_loci, 500)
        locus_table[0].append(bam_file_name)

        for i in range(1, len(locus_table)):
            signal = 0.0
            line = locus_table[i]
            line_locus = utils.Locus(line[1], line[2], line[3], ".")
            overlapping_regions = mapped_collection.get_overlap(line_locus,
                                                                sense="both")
            for region in overlapping_regions:
                signal += signal_dict[region.id]
            locus_table[i].append(signal)

    utils.unparse_table(locus_table, output, "\t")
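
# Usage sketch (hedged): every path, name and locus below is hypothetical, and the
# mapped matrices are assumed to already exist at
# <mapped_folder>/<ref_name>_<bam_name>_MAPPED/matrix.txt (see map_bams above).
import utils

stitched_collection = utils.LocusCollection(
    [utils.Locus("chr1", 1000000, 1050000, ".", "2_chr1_1000000_1050000_lociStitched")], 50)
reference_collection = utils.LocusCollection(
    [utils.Locus("chr1", 1000000, 1012000, ".", "peak_1"),
     utils.Locus("chr1", 1040000, 1050000, ".", "peak_2")], 50)

map_collection(
    stitched_collection,
    reference_collection,
    ["/data/bams/sample_a.hg19.bwt.sorted.bam"],
    "/data/output/mapped_gff/",
    "/data/output/HG19_enhancers_ENHANCER_REGION_MAP.txt",
    ref_name="HG19_enhancers_12KB_STITCHED",
)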
Example No. 21
def main():
    """Main run method for enhancer promoter contribution tool."""
    parser = argparse.ArgumentParser()

    # required flags
    parser.add_argument(
        "-b",
        "--bam",
        dest="bam",
        nargs="*",
        help="Enter a space separated list of .bam files for the main factor",
        required=True,
    )
    parser.add_argument(
        "-i",
        "--input",
        dest="input",
        type=str,
        help="Enter .gff or .bed file of regions to analyze",
        required=True,
    )
    parser.add_argument(
        "-g",
        "--genome",
        dest="genome",
        type=str,
        help=(
            "specify a genome, HG18,HG19,HG38,MM8,MM9,MM10,RN6 are currently "
            "supported"),
        required=True,
    )
    parser.add_argument(
        "-p",
        "--chrom-path",
        dest="chrom_path",
        type=str,
        help=("Provide path to a folder with a seperate fasta file for each "
              "chromosome"),
        required=True,
    )
    # output flag
    parser.add_argument(
        "-o",
        "--output",
        dest="output",
        type=str,
        help="Enter the output folder.",
        required=True,
    )

    # additional options flags and optional arguments
    parser.add_argument(
        "-a",
        "--activity",
        dest="activity",
        type=str,
        help=("specify a table where first column represents a list of active "
              "refseq genes"),
        required=False,
    )
    parser.add_argument(
        "-c",
        "--control",
        dest="control",
        nargs="*",
        help=("Enter a space separated list of .bam files for background. If "
              "flagged, will perform background subtraction"),
        required=False,
    )
    parser.add_argument(
        "-t",
        "--tss",
        dest="tss",
        type=int,
        help="Define the TSS area +/- the TSS. Default is 1kb",
        required=False,
        default=1000,
    )
    parser.add_argument(
        "-d",
        "--distal",
        dest="distal",
        type=int,
        help="Enter a window to assign distal enhancer signal. Default is 50kb",
        required=False,
        default=50000,
    )
    parser.add_argument(
        "--other-bams",
        dest="other",
        nargs="*",
        help="enter a space separated list of other bams to map to",
        required=False,
    )
    parser.add_argument(
        "--name",
        dest="name",
        type=str,
        help=
        ("enter a root name for the analysis, otherwise will try to find the "
         "name from the input file"),
        required=False,
    )
    parser.add_argument(
        "--top",
        dest="top",
        type=int,
        help=
        ("Run the analysis on the top N genes by total signal. Default is 5000"
         ),
        required=False,
        default=5000,
    )
    parser.add_argument(
        "--tads",
        dest="tads",
        type=str,
        help=
        ("Include a .bed of tad regions to restrict enhancer/gene association"
         ),
        required=False,
        default=None,
    )
    parser.add_argument(
        "--mask",
        dest="mask",
        default=None,
        help=(
            "Mask a set of regions from analysis.  Provide a .bed or .gff of "
            "masking regions"),
    )

    args = parser.parse_args()

    print(args)

    # =====================================================================================
    # ===============================I. PARSING ARGUMENTS==================================
    # =====================================================================================

    print(
        "\n\n#======================================\n#===========I. DATA SUMMARY============\n#="
        "=====================================\n")

    # top analysis subset
    top = args.top

    # input genome
    genome = args.genome.upper()
    print("PERFORMING ANALYSIS ON {} GENOME BUILD".format(genome))

    # set of bams
    bam_file_list = args.bam

    # bring in the input path
    input_path = args.input

    # try to get the input name or use the name argument
    if args.name:
        analysis_name = args.name
    else:
        analysis_name = os.path.basename(input_path).split(".")[0]

    print("USING {} AS ANALYSIS NAME".format(analysis_name))
    # setting up the output folder
    parent_folder = utils.format_folder(args.output, True)
    output_folder = utils.format_folder(
        os.path.join(parent_folder, analysis_name), True)

    print("WRITING OUTPUT TO {}".format(output_folder))

    if input_path.split(".")[-1] == "bed":
        # type is bed
        print("input in bed format, converting to gff")
        input_gff = utils.bed_to_gff(input_path)
    else:
        input_gff = utils.parse_table(input_path, "\t")

    # the tss window for proximal signal assignment
    tss_window = int(args.tss)

    # the distal window for assigning nearby enhancer signal
    distal_window = int(args.distal)

    # activity path
    if args.activity:
        activity_path = args.activity
        activity_table = utils.parse_table(activity_path, "\t")
        ref_col = 0
        # try to find the column for the refseq ID
        # use an internal data row in case the table has a header
        for i in range(len(activity_table[1])):
            if str(activity_table[1][i]).count("NM_") or str(
                    activity_table[1][i]).count("NR_"):
                ref_col = i

        # now check for a header row
        if not str(activity_table[0][ref_col]).count("NM_") and not str(
                activity_table[0][ref_col]).count("NR_"):
            print("REMOVING HEADER FROM GENE TABLE:")
            print(activity_table[0])
            activity_table.pop(0)

        gene_list = [line[ref_col] for line in activity_table
                     ]  # this needs to be REFSEQ NM ID
        print("IDENTIFIED {} ACTIVE GENES".format(len(gene_list)))

    else:
        gene_list = []

    # check if tads are being invoked
    if args.tads:
        print("LOADING TAD LOCATIONS FROM {}".format(args.tads))
        tads_path = args.tads
    else:
        tads_path = ""

    print("LOADING ANNOTATION DATA FOR GENOME {}".format(genome))

    genome_dir = args.chrom_path

    # making a chrom_dict that is a list of all chroms with sequence
    chrom_list = utils.uniquify(
        [name.split(".")[0] for name in os.listdir(genome_dir) if name])

    # important here to define the window
    start_dict, tss_collection, mouse_convert_dict = load_annot_file(
        genome,
        tss_window,
        gene_list,
    )

    print("FILTERING THE INPUT GFF FOR GOOD CHROMOSOMES")

    print(chrom_list)
    filtered_gff = [line for line in input_gff if chrom_list.count(line[0])]

    print("{} of INITIAL {} REGIONS ARE IN GOOD CHROMOSOMES".format(
        str(len(filtered_gff)),
        str(len(input_gff)),
    ))

    # =====================================================================================
    # ================II. IDENTIFYING TSS PROXIMAL AND DISTAL ELEMENTS=====================
    # =====================================================================================

    print(
        "\n\n#======================================\n#==II. MAPPING TO TSS/DISTAL REGIONS===\n#="
        "=====================================\n")

    # now we need to split the input region
    print("SPLITTING THE INPUT GFF USING A WINDOW OF {}".format(tss_window))
    split_gff = split_regions(filtered_gff,
                              tss_collection,
                              mask_file=args.mask)
    print(len(filtered_gff))
    print(len(split_gff))

    split_gff_path = os.path.join(output_folder,
                                  "{}_SPLIT.gff".format(analysis_name))
    utils.unparse_table(split_gff, split_gff_path, "\t")
    print("WRITING TSS SPLIT GFF OUT TO {}".format(split_gff_path))

    # now you have to map the bams to the gff
    print("MAPPING TO THE SPLIT GFF")
    mapped_folder = utils.format_folder(
        os.path.join(output_folder, "bam_mapping"), True)

    signal_table = map_bams(bam_file_list, split_gff_path, analysis_name,
                            mapped_folder)
    signal_table_path = os.path.join(
        output_folder, "{}_signal_table.txt".format(analysis_name))
    utils.unparse_table(signal_table, signal_table_path, "\t")

    if args.control:
        control_bam_file_list = args.control
        control_signal_table = map_bams(
            control_bam_file_list,
            split_gff_path,
            analysis_name,
            mapped_folder,
        )
        control_signal_table_path = os.path.join(
            output_folder,
            "{}_control_signal_table.txt".format(analysis_name),
        )
        utils.unparse_table(control_signal_table, control_signal_table_path,
                            "\t")

    # now create the background subtracted summarized average table
    print("CREATING AN AVERAGE SIGNAL TABLE")
    average_table = make_average_table(
        output_folder,
        analysis_name,
        use_background=bool(args.control),
    )
    average_table_path = os.path.join(
        output_folder, "{}_average_table.txt".format(analysis_name))
    utils.unparse_table(average_table, average_table_path, "\t")

    # now load up all of the cpg and other parameters to make the actual peak table

    # first check if this has already been done
    peak_table_path = os.path.join(output_folder,
                                   "{}_PEAK_TABLE.txt".format(analysis_name))
    if utils.check_output(peak_table_path, 0.1, 0.1):
        print("PEAK TABLE OUTPUT ALREADY EXISTS")
        peak_table = utils.parse_table(peak_table_path, "\t")
    else:
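        # NOTE: param_dict is not built in this snippet; it is assumed to be a
        # genome-keyed dict of annotation paths defined elsewhere (e.g.
        # param_dict["cpg_path"] used by make_peak_table).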
        peak_table = make_peak_table(
            param_dict,
            split_gff_path,
            average_table_path,
            start_dict,
            gene_list,
            genome_dir,
            tss_window,
            distal_window,
            tads_path,
        )
        utils.unparse_table(peak_table, peak_table_path, "\t")

    gene_table = make_gene_table(peak_table, analysis_name)

    gene_table_path = os.path.join(output_folder,
                                   "{}_GENE_TABLE.txt".format(analysis_name))
    utils.unparse_table(gene_table, gene_table_path, "\t")

    # if mouse, need to convert genes over
    if genome.count("MM") == 1:
        print("CONVERTING MOUSE NAMES TO HUMAN HOMOLOGS FOR GSEA")
        converted_gene_table_path = os.path.join(
            output_folder,
            "{}_GENE_TABLE_CONVERTED.txt".format(analysis_name),
        )

        converted_gene_table = [gene_table[0]]
        for line in gene_table[1:]:
            converted_name = mouse_convert_dict[line[0]]
            if converted_name:
                converted_gene_table.append([converted_name] + line[1:])

        # write the converted table once, after the loop completes
        utils.unparse_table(converted_gene_table,
                            converted_gene_table_path, "\t")

        gene_table_path = converted_gene_table_path
        gene_table = converted_gene_table

    # =====================================================================================
    # ===================================III. PLOTTING ====================================
    # =====================================================================================

    print(
        "\n\n#======================================\n#===III. PLOTTING ENHANCER/PROMOTER===\n#=="
        "====================================\n")

    # if there are fewer genes in the gene table than the top genes, only run on all
    if len(gene_table) < int(top):
        print(
            "WARNING: ONLY {} GENES WITH SIGNAL AT EITHER PROMOTERS OR ENHANCERS. NOT ENOUGH "
            "TO RUN ANALYSIS ON TOP {}".format(str(len(gene_table) - 1),
                                               str(top)))
        top = 0

    # now call the R code
    print("CALLING R PLOTTING SCRIPTS")
    call_r_waterfall(gene_table_path, output_folder, analysis_name, top)
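

# A minimal usage sketch for the CLI defined in main() above. The script name and
# all file paths are hypothetical placeholders; only the flags come from the
# argparse definitions:
#
#   python enhancer_promoter.py \
#       -b sample1.bam sample2.bam \
#       -c control1.bam control2.bam \
#       -i enhancers.gff \
#       -g HG19 \
#       -p /path/to/chrom_fastas/ \
#       -o ep_output/ \
#       -a active_genes.txt \
#       --name example_analysis --tss 1000 --distal 50000 --top 5000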
Exemplo n.º 22
0
def make_peak_table(
    param_dict,
    split_gff_path,
    average_table_path,
    start_dict,
    gene_list,
    genome_directory,
    tss_window,
    distal_window,
    tads_path="",
):
    """Makes the final peak table with ebox info."""
    peak_table = [[
        "REGION_ID",
        "CHROM",
        "START",
        "STOP",
        "LENGTH",
        "TSS",
        "CPG",
        "CPG_FRACTION",
        "GC_FREQ",
        "SIGNAL",
        "CANON_EBOX_COUNT",
        "NON_CANON_EBOX_COUNT",
        "TOTAL_EBOX_COUNT",
        "OVERLAPPING_GENES",
        "PROXIMAL_GENES",
    ]]

    print("LOADING PEAK REGIONS")
    peak_gff = utils.parse_table(split_gff_path, "\t")

    print("LOADING BINDING DATA")
    signal_table = utils.parse_table(average_table_path, "\t")

    print("LOADING CPGS ISLANDS")
    cpg_bed = utils.parse_table(param_dict["cpg_path"], "\t")
    cpg_loci = []
    for line in cpg_bed:
        cpg_loci.append(utils.Locus(line[0], line[1], line[2], ".", line[-1]))
    cpg_collection = utils.LocusCollection(cpg_loci, 50)

    print("MAKING TSS COLLECTIONS")
    if not gene_list:
        gene_list = [*start_dict]

    tss_prox_loci = []
    tss_distal_loci = []
    for ref_id in gene_list:
        tss_prox_loci.append(
            utils.make_tss_locus(ref_id, start_dict, tss_window, tss_window))
        tss_distal_loci.append(
            utils.make_tss_locus(
                ref_id,
                start_dict,
                distal_window,
                distal_window,
            ))

    # make a 1kb flanking and 50kb flanking collection
    tss_prox_collection = utils.LocusCollection(tss_prox_loci, 50)
    tss_distal_collection = utils.LocusCollection(tss_distal_loci, 50)

    if tads_path:
        print("LOADING TADS FROM {}".format(tads_path))
        tad_collection = utils.import_bound_region(tads_path, "tad")
        use_tads = True

        # building a tad dict keyed by tad ID w/ genes in that tad provided
        tad_dict = defaultdict(list)
        for tss_locus in tss_prox_loci:
            overlapping_tads = tad_collection.get_overlap(tss_locus, "both")
            for tad_locus in overlapping_tads:
                tad_dict[tad_locus.id].append(tss_locus.id)
    else:
        use_tads = False

    print("CLASSIFYING PEAKS")
    ticker = 0

    no_tad_count = 0
    for i in range(len(peak_gff)):
        if not ticker % 1000:
            print(ticker)
        ticker += 1

        # getting the particulars of the region
        gff_line = peak_gff[i]
        peak_id = gff_line[1]
        chrom = gff_line[0]
        start = int(gff_line[3])
        stop = int(gff_line[4])
        line_locus = utils.Locus(chrom, start, stop, ".", peak_id)

        # getting the mapped signal
        signal_line = signal_table[(i + 1)]
        signal_vector = [float(x) for x in signal_line[2:]]

        # setting up the new line
        new_line = [peak_id, chrom, start, stop, line_locus.len()]

        # get the tss status from the gff itself
        # (we are able to do this nicely from the split gff code earlier)
        new_line.append(gff_line[7])

        # check cpg status
        if cpg_collection.get_overlap(line_locus, "both"):
            new_line.append(1)
        else:
            new_line.append(0)

        # now do fractional cpgoverlap
        overlapping_cpg_loci = cpg_collection.get_overlap(line_locus, "both")
        overlapping_bases = 0
        for locus in overlapping_cpg_loci:
            cpg_start = max(locus.start, line_locus.start)
            cpg_end = min(locus.end, line_locus.end)
            overlapping_bases += cpg_end - cpg_start
        overlap_fraction = float(overlapping_bases) / line_locus.len()

        new_line.append(round(overlap_fraction, 2))

        # now get the seq
        line_seq = utils.fetch_seq(genome_directory, chrom, start, stop,
                                   True).upper()
        if not line_seq:
            print("UH OH")
            print(line_seq)
            print(gff_line)
            print(i)
            print(chrom)
            print(start)
            print(stop)
            sys.exit()

        gc_freq = float(line_seq.count("GC") +
                        line_seq.count("CG")) / len(line_seq)
        new_line.append(gc_freq)

        # this is where we add the ChIP-seq signal
        new_line += signal_vector

        ebox_match_list = re.findall("CA..TG", line_seq)
        if not ebox_match_list:
            new_line += [0] * 3
        else:
            total_count = len(ebox_match_list)
            canon_count = ebox_match_list.count("CACGTG")
            other_count = total_count - canon_count
            new_line += [canon_count, other_count, total_count]

        # now find the overlapping and proximal genes
        # here each overlapping gene the tss prox locus overlaps the peak

        if use_tads:
            tad_loci = tad_collection.get_overlap(line_locus, "both")

            tad_id_list = [tad_locus.id for tad_locus in tad_loci]
            tad_genes = []
            for tad_id in tad_id_list:
                tad_genes += tad_dict[tad_id]
            if not tad_genes:
                no_tad_count += 1
        else:
            tad_genes = []

        if tad_genes:
            overlapping_genes = [
                start_dict[locus.id]["name"]
                for locus in tss_prox_collection.get_overlap(
                    line_locus, "both") if tad_genes.count(locus.id)
            ]
            proximal_genes = [
                start_dict[locus.id]["name"]
                for locus in tss_distal_collection.get_overlap(
                    line_locus, "both") if tad_genes.count(locus.id)
            ]
        else:
            overlapping_genes = [
                start_dict[locus.id]["name"]
                for locus in tss_prox_collection.get_overlap(
                    line_locus, "both")
            ]
            proximal_genes = [
                start_dict[locus.id]["name"]
                for locus in tss_distal_collection.get_overlap(
                    line_locus, "both")
            ]

        overlapping_genes = utils.uniquify(overlapping_genes)
        # here the tss 50kb locus overlaps the peak
        # overlap takes priority over proximal
        proximal_genes = [
            gene for gene in proximal_genes
            if not overlapping_genes.count(gene)
        ]
        proximal_genes = utils.uniquify(proximal_genes)

        overlapping_string = ",".join(overlapping_genes)
        proximal_string = ",".join(proximal_genes)

        new_line += [overlapping_string, proximal_string]

        peak_table.append(new_line)

    print("Out of {} regions, {} were assigned to at least 1 tad".format(
        str(len(peak_table)),
        str(no_tad_count),
    ))
    return peak_table
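

# A minimal standalone sketch of the E-box counting performed inside
# make_peak_table above. The function name and example sequence are hypothetical;
# the regex and the canonical/non-canonical/total counts mirror the
# CANON_EBOX_COUNT / NON_CANON_EBOX_COUNT / TOTAL_EBOX_COUNT columns. Assumes
# `re` is imported at module level, as in make_peak_table.
def count_eboxes_sketch(line_seq):
    """Return (canonical, non-canonical, total) E-box counts for a sequence."""
    ebox_match_list = re.findall("CA..TG", line_seq.upper())
    total_count = len(ebox_match_list)
    canon_count = ebox_match_list.count("CACGTG")
    return canon_count, total_count - canon_count, total_count


# e.g. count_eboxes_sketch("AACACGTGTTCAGGTGAA") -> (1, 1, 2)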
Exemplo n.º 23
0
def main():
    """Main run call."""
    debug = False

    parser = argparse.ArgumentParser()
    # required flags
    parser.add_argument(
        "-i",
        "--i",
        dest="input",
        required=True,
        help=
        ("Enter a comma separated list of .gff or .bed file of binding sites used to make "
         "enhancers"),
    )
    parser.add_argument(
        "-r",
        "--rankby",
        dest="rankby",
        required=True,
        help="Enter a comma separated list of bams to rank by",
    )
    parser.add_argument("-o",
                        "--out",
                        dest="out",
                        required=True,
                        help="Enter an output folder")
    parser.add_argument(
        "-g",
        "--genome",
        dest="genome",
        required=True,
        help="Enter the genome build (MM9,MM8,HG18,HG19)",
    )

    # optional flags
    parser.add_argument(
        "-n",
        "--name",
        dest="name",
        required=False,
        help="Provide a name for the analysis otherwise ROSE will guess",
    )
    parser.add_argument(
        "-c",
        "--control",
        dest="control",
        required=False,
        help=
        ("Enter a comma separated list of control bams. Can either provide a single control "
         "bam for all rankby bams, or provide a control bam for each individual bam"
         ),
    )
    parser.add_argument(
        "-s",
        "--stitch",
        dest="stitch",
        default="",
        help=
        ("Enter a max linking distance for stitching. Default will determine optimal stitching"
         " parameter"),
    )
    parser.add_argument(
        "-t",
        "--tss",
        dest="tss",
        default=0,
        help="Enter a distance from TSS to exclude. 0 = no TSS exclusion",
    )

    parser.add_argument(
        "--mask",
        dest="mask",
        required=False,
        help=
        "Mask a set of regions from analysis.  Provide a .bed or .gff of masking regions",
    )

    # RETRIEVING FLAGS
    args = parser.parse_args()

    # making the out folder if it doesn't exist
    out_folder = utils.format_folder(args.out, True)

    # figuring out folder schema
    gff_folder = utils.format_folder(os.path.join(out_folder, "gff"), True)
    mapped_folder = utils.format_folder(os.path.join(out_folder, "mappedGFF"),
                                        True)

    # GETTING INPUT FILE(s)
    input_list = [
        input_file for input_file in args.input.split(",")
        if len(input_file) > 1
    ]

    # converting all input files into GFFs and moving into the GFF folder
    input_gf_list = []
    for input_file in input_list:
        # GETTING INPUT FILE
        if input_file.split(".")[-1] == "bed":
            # CONVERTING A BED TO GFF
            input_gff_name = os.path.basename(input_file)[0:-4]
            input_gff_file = os.path.join(gff_folder,
                                          "{}.gff".format(input_gff_name))
            utils.bed_to_gff(input_file, input_gff_file)
        elif input_file.split(".")[-1] == "gff":
            # COPY THE INPUT GFF TO THE GFF FOLDER
            input_gff_file = input_file
            copyfile(
                input_gff_file,
                os.path.join(gff_folder, os.path.basename(input_gff_file)),
            )
        else:
            print(
                "WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT"
            )
            # COPY THE INPUT GFF TO THE GFF FOLDER
            input_gff_file = input_file
            copyfile(
                input_gff_file,
                os.path.join(gff_folder, os.path.basename(input_gff_file)),
            )

        input_gf_list.append(input_gff_file)

    # GETTING THE LIST OF bam_fileS TO PROCESS
    # either same number of bams for rankby and control
    # or only 1 control #or none!
    # bamlist should be all rankby bams followed by control bams

    bam_file_list = []
    if args.control:
        control_bam_list = [
            bam for bam in args.control.split(",") if len(bam) > 0
        ]
        rankby_bam_list = [
            bam for bam in args.rankby.split(",") if len(bam) > 0
        ]

        if len(control_bam_list) == len(rankby_bam_list):
            # case where an equal number of backgrounds are given
            bam_file_list = rankby_bam_list + control_bam_list
        elif len(control_bam_list) == 1:
            # case where a universal background is applied
            bam_file_list = rankby_bam_list + control_bam_list * len(
                rankby_bam_list)
        else:
            print(
                "ERROR: EITHER PROVIDE A SINGLE CONTROL BAM FOR ALL SAMPLES, OR ONE CONTROL BAM"
                " FOR EACH SAMPLE")
            sys.exit()
    else:
        bam_file_list = [bam for bam in args.rankby.split(",") if len(bam) > 0]

    # Stitch parameter
    if args.stitch == "":
        stitch_window = ""
    else:
        stitch_window = int(args.stitch)

    # tss args
    tss_window = int(args.tss)
    if tss_window != 0:
        remove_tss = True
    else:
        remove_tss = False

    # GETTING THE GENOME
    genome = args.genome.upper()
    print("USING {} AS THE GENOME".format(genome))

    # GETTING THE CORRECT ANNOT FILE
    try:
        annot_file = rose2_utils.genome_dict[genome]
    except KeyError:
        print("ERROR: UNSUPPORTED GENOMES TYPE {}".format(genome))
        sys.exit()

    # FINDING THE ANALYSIS NAME
    if args.name:
        input_name = args.name
    else:
        input_name = os.path.basename(input_gf_list[0]).split(".")[0]
    print("USING {} AS THE ANALYSIS NAME".format(input_name))

    print("FORMATTING INPUT REGIONS")
    # MAKING THE RAW INPUT FILE FROM THE INPUT GFFs
    # use a simpler unique region naming system
    if len(input_gf_list) == 1:
        input_gff = utils.parse_table(input_gf_list[0], "\t")
    else:
        input_loci = []
        for gff_file in input_gf_list:
            print("\tprocessing {}".format(gff_file))
            gff = utils.parse_table(gff_file, "\t")
            gff_collection = utils.gff_to_locus_collection(gff, 50)
            input_loci += gff_collection.get_loci()

        input_collection = utils.LocusCollection(input_loci, 50)
        input_collection = (input_collection.stitch_collection()
                            )  # stitches to produce unique regions

        input_gff = utils.locus_collection_to_gff(input_collection)

    formatted_gff = []
    # now number things appropriately
    for i, line in enumerate(input_gff):

        # use the coordinates to make a new id input_name_chr_sense_start_stop
        chrom = line[0]
        coords = [int(line[3]), int(line[4])]
        sense = line[6]

        line_id = "{}_{}".format(input_name, str(i + 1))  # 1 indexing

        new_line = [
            chrom,
            line_id,
            line_id,
            min(coords),
            max(coords),
            "",
            sense,
            "",
            line_id,
        ]
        formatted_gff.append(new_line)

    # name of the master input gff file
    master_gff_file = os.path.join(
        gff_folder, "{}_{}_ALL_-0_+0.gff".format(genome, input_name))
    utils.unparse_table(formatted_gff, master_gff_file, "\t")

    print("USING {} AS THE INPUT GFF".format(master_gff_file))

    # GET CHROMS FOUND IN THE BAMS
    print("GETTING CHROMS IN bam_fileS")
    bam_chrom_list = rose2_utils.get_bam_chrom_list(bam_file_list)
    print("USING THE FOLLOWING CHROMS")
    print(bam_chrom_list)

    # LOADING IN THE GFF AND FILTERING BY CHROM
    print("LOADING AND FILTERING THE GFF")
    input_gff = rose2_utils.filter_gff(master_gff_file, bam_chrom_list)
    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print("LOADING IN GFF REGIONS")
    reference_collection = utils.gff_to_locus_collection(input_gff)

    print("CHECKING REFERENCE COLLECTION:")
    rose2_utils.check_ref_collection(reference_collection)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if args.mask:
        mask_file = args.mask
        # if it's a bed file
        if mask_file.split(".")[-1].upper() == "BED":
            mask_gff = utils.bed_to_gff(mask_file)
        elif mask_file.split(".")[-1].upper() == "GFF":
            mask_gff = utils.parse_table(mask_file, "\t")
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit()
        mask_collection = utils.gff_to_locus_collection(mask_gff)

        # now mask the reference loci
        reference_loci = reference_collection.get_loci()
        filtered_loci = [
            locus for locus in reference_loci
            if len(mask_collection.get_overlap(locus, "both")) == 0
        ]
        print("FILTERED OUT {} LOCI THAT WERE MASKED IN {}".format(
            len(reference_loci) - len(filtered_loci), mask_file))
        reference_collection = utils.LocusCollection(filtered_loci, 50)

    # NOW STITCH REGIONS
    print("STITCHING REGIONS TOGETHER")
    stitched_collection, debug_output, stitch_window = rose2_utils.region_stitching(
        reference_collection,
        input_name,
        out_folder,
        stitch_window,
        tss_window,
        annot_file,
        remove_tss,
    )

    # NOW MAKE A STITCHED COLLECTION GFF
    print("MAKING GFF FROM STITCHED COLLECTION")
    stitched_gff = utils.locus_collection_to_gff(stitched_collection)

    print(stitch_window)
    print(type(stitch_window))
    if not remove_tss:
        stitched_gff_file = os.path.join(
            gff_folder,
            "{}_{}KB_STITCHED.gff".format(input_name,
                                          str(stitch_window // 1000)),
        )
        stitched_gff_name = "{}_{}KB_STITCHED".format(
            input_name, str(stitch_window // 1000))
        debug_out_file = os.path.join(
            gff_folder,
            "{}_{}KB_STITCHED.debug".format(input_name,
                                            str(stitch_window // 1000)),
        )
    else:
        stitched_gff_file = os.path.join(
            gff_folder,
            "{}_{}KB_STITCHED_TSS_DISTAL.gff".format(
                input_name, str(stitch_window // 1000)),
        )
        stitched_gff_name = "{}_{}KB_STITCHED_TSS_DISTAL".format(
            input_name, str(stitch_window // 1000))
        debug_out_file = os.path.join(
            gff_folder,
            "{}_{}KB_STITCHED_TSS_DISTAL.debug".format(
                input_name, str(stitch_window // 1000)),
        )

    # WRITING DEBUG OUTPUT TO DISK

    if debug:
        print("WRITING DEBUG OUTPUT TO DISK AS {}".format(debug_out_file))
        utils.unparse_table(debug_output, debug_out_file, "\t")

    # WRITE THE GFF TO DISK
    print("WRITING STITCHED GFF TO DISK AS {}".format(stitched_gff_file))
    utils.unparse_table(stitched_gff, stitched_gff_file, "\t")

    # SETTING UP THE OVERALL OUTPUT FILE
    output_file1 = os.path.join(
        out_folder, "{}_ENHANCER_REGION_MAP.txt".format(stitched_gff_name))
    print("OUTPUT WILL BE WRITTEN TO  {}".format(output_file1))

    # MAPPING TO THE NON STITCHED (ORIGINAL GFF)
    # MAPPING TO THE STITCHED GFF

    bam_file_list_unique = list(bam_file_list)
    bam_file_list_unique = utils.uniquify(bam_file_list_unique)
    # prevent redundant mapping
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bam_file_list_unique)
    for bam_file in bam_file_list_unique:

        bam_file_name = os.path.basename(bam_file)

        # MAPPING TO THE STITCHED GFF
        mapped_out1_folder = os.path.join(
            mapped_folder, "{}_{}_MAPPED".format(stitched_gff_name,
                                                 bam_file_name))
        mapped_out1_file = os.path.join(
            mapped_folder,
            "{}_{}_MAPPED".format(stitched_gff_name, bam_file_name),
            "matrix.txt",
        )
        if utils.check_output(mapped_out1_file, 0.2, 0.2):
            print("FOUND {} MAPPING DATA FOR BAM: {}".format(
                stitched_gff_file, mapped_out1_file))
        else:
            cmd1 = "bamliquidator_batch --sense . -e 200 --match_bamToGFF -r {} -o {} {}".format(
                stitched_gff_file,
                mapped_out1_folder,
                bam_file,
            )
            print(cmd1)

            os.system(cmd1)
            if utils.check_output(mapped_out1_file, 0.2, 5):
                print("SUCCESSFULLY MAPPED TO {} FROM BAM: {}".format(
                    stitched_gff_file, bam_file_name))
            else:
                print("ERROR: FAILED TO MAP {} FROM BAM: {}".format(
                    stitched_gff_file, bam_file_name))
                sys.exit()

    print("BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS")
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    rose2_utils.map_collection(
        stitched_collection,
        reference_collection,
        bam_file_list,
        mapped_folder,
        output_file1,
        ref_name=stitched_gff_name,
    )

    print("FINDING AVERAGE SIGNAL AMONGST BAMS")
    meta_output_file = collapse_region_map(output_file1,
                                           input_name + "_MERGED_SIGNAL",
                                           control_bams=args.control)

    # now try the merging

    print("CALLING AND PLOTTING SUPER-ENHANCERS")

    control_name = "NONE"
    cmd = "Rscript {} {} {} {} {}".format(
        os.path.join(ROOT_DIR, "scripts", "ROSE2_callSuper.R"),
        out_folder + "/",  # TODO: fix R script so it does not require '/'
        meta_output_file,
        input_name,
        control_name,
    )
    print(cmd)

    os.system(cmd)

    # calling the gene mapper
    print("CALLING GENE MAPPING")

    super_table_file = "{}_SuperEnhancers.table.txt".format(input_name)

    # for now don't use ranking bam to call top genes
    cmd = "ROSE2_geneMapper -g {} -i {} -f".format(
        genome, os.path.join(out_folder, super_table_file))
    print(cmd)
    os.system(cmd)

    stretch_table_file = "{}_StretchEnhancers.table.txt".format(input_name)

    cmd = "ROSE2_geneMapper -g {} -i {} -f".format(
        genome, os.path.join(out_folder, stretch_table_file))
    print(cmd)
    os.system(cmd)

    superstretch_table_file = "{}_SuperStretchEnhancers.table.txt".format(
        input_name)

    cmd = "ROSE2_geneMapper.py -g {} -i {} -f".format(genome, out_folder,
                                                      superstretch_table_file)
    os.system(cmd)
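

# A minimal usage sketch for the ROSE2 CLI defined in main() above. The script
# name and all file paths are hypothetical placeholders; only the flags come
# from the argparse definitions:
#
#   python rose2_main.py \
#       -i peaks.gff \
#       -r sample1.bam,sample2.bam \
#       -c control.bam \
#       -g HG19 \
#       -o rose2_output/ \
#       -s 12500 -t 2500 \
#       --mask blacklist.bed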
Exemplo n.º 24
0
def split_regions(input_gff, tss_collection, mask_file=None):
    """Split regions if even a single coordinate is shared with the +/-1kb."""
    # create mask regions collection
    if mask_file:
        print("USING MASK FILE {}".format(mask_file))
        # if it's a bed file
        if mask_file.split(".")[-1].upper() == "BED":
            mask_gff = utils.bed_to_gff(mask_file)
        elif mask_file.split(".")[-1].upper() == "GFF":
            mask_gff = utils.parse_table(mask_file, "\t")
        else:
            print("MASK MUST BE A .gff or .bed FILE")

        mask_collection = utils.gff_to_locus_collection(mask_gff)
        print("LOADING {} MASK REGIONS".format(len(mask_collection)))

    split_gff = []
    for line in input_gff:
        chrom = line[0]
        region_id = line[1]
        line_locus = utils.Locus(line[0], line[3], line[4], ".")

        # mask regions
        if mask_file:
            if mask_collection.get_overlap(line_locus, "both"):
                continue

        overlapping_loci = tss_collection.get_overlap(line_locus)
        if overlapping_loci:  # case where a tss overlap
            # identify the parts of the line locus that are contained
            local_tss_collection = utils.LocusCollection(overlapping_loci, 50)
            overlapping_coords = line_locus.coords()
            for tss_locus in overlapping_loci:
                overlapping_coords += tss_locus.coords()

            overlapping_coords = utils.uniquify(overlapping_coords)
            overlapping_coords.sort()

            # you need to hack and slash add 1 to the last coordinate of the overlapping_coords
            overlapping_coords[-1] += 1

            i = 0
            region_ticker = 1
            while i < (len(overlapping_coords) - 1):
                start = int(overlapping_coords[i])
                stop = int(overlapping_coords[(i + 1)]) - 1
                if (stop - start) < 50:  # this eliminates really tiny regions
                    i += 1
                    continue
                split_locus = utils.Locus(chrom, start + 1, stop, ".")

                if line_locus.overlaps(split_locus):
                    new_id = "{}_{}".format(region_id, region_ticker)
                    tss_status = 0
                    if local_tss_collection.get_overlap(split_locus):
                        tss_status = 1
                    split_gff_line = [
                        chrom,
                        new_id,
                        new_id,
                        start,
                        stop,
                        "",
                        ".",
                        tss_status,
                        new_id,
                    ]

                    split_gff.append(split_gff_line)
                    region_ticker += 1
                i += 1
        else:
            line[7] = 0
            split_gff.append(line)

    return split_gff
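

# A minimal standalone sketch (independent of utils) of the coordinate splitting
# performed in split_regions above: the region boundaries and the boundaries of
# every overlapping TSS window are pooled, sorted, and each adjacent pair of
# coordinates becomes a candidate sub-region; fragments shorter than 50 bp or
# falling outside the original region are dropped. The function name and the
# coordinates in the example are hypothetical.
def split_coords_sketch(region, tss_windows, min_size=50):
    """region and tss_windows are (start, stop) tuples; returns sub-regions."""
    coords = sorted(set(region) | {c for window in tss_windows for c in window})
    coords[-1] += 1  # same end-padding trick as split_regions
    sub_regions = []
    for left, right in zip(coords, coords[1:]):
        start, stop = left, right - 1
        if (stop - start) < min_size:  # eliminates really tiny regions
            continue
        split = (start + 1, stop)
        if split[0] <= region[1] and split[1] >= region[0]:  # overlaps region
            sub_regions.append(split)
    return sub_regions


# e.g. split_coords_sketch((1000, 5000), [(900, 2900)]) -> [(1001, 2899), (2901, 5000)]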
Exemplo n.º 25
0
def main():
    """Main run function."""
    parser = argparse.ArgumentParser()

    # required flags
    parser.add_argument(
        "-b",
        "--bam",
        dest="bam",
        nargs="*",
        help="Enter a comma/space separated list of .bam files to be processed.",
        required=True,
    )
    parser.add_argument(
        "-i",
        "--input",
        dest="input",
        type=str,
        help="Enter .gff or genomic region e.g. chr1:+:1-1000.",
        required=True,
    )
    parser.add_argument(
        "-g",
        "--genome",
        dest="genome",
        type=str,
        help="specify a genome, HG18,HG19,MM8,MM9,MM10 are currently supported",
        required=True,
    )

    # output flag
    parser.add_argument(
        "-o",
        "--output",
        dest="output",
        type=str,
        help="Enter the output folder.",
        required=True,
    )

    # additional options
    parser.add_argument(
        "--stretch-input",
        dest="stretch_input",
        default=None,
        type=int,
        help=(
            "Stretch the input regions to a minimum length in bp, e.g. 10000 (for"
            " 10kb)"
        ),
    )
    parser.add_argument(
        "-c",
        "--color",
        dest="color",
        default=None,
        nargs="*",
        help=(
            "Enter a colon or space separated list of colors e.g. "
            "255,0,0:255,125,0, default samples the rainbow"
        ),
    )
    parser.add_argument(
        "-s",
        "--sense",
        dest="sense",
        default="both",
        help="Map to '+','-' or 'both' strands. Default maps to both.",
    )
    parser.add_argument(
        "-e",
        "--extension",
        dest="extension",
        default=200,
        help="Extends reads by n bp. Default value is 200bp",
    )
    parser.add_argument(
        "-r",
        "--rpm",
        dest="rpm",
        action="store_true",
        default=False,
        help="Normalizes density to reads per million (rpm) Default is False",
    )
    parser.add_argument(
        "-y",
        "--yScale",
        dest="y_scale",
        default="relative",
        help=(
            "Choose either relative or uniform y axis scaling. options = "
            "'relative,uniform' Default is relative scaling"
        ),
    )
    parser.add_argument(
        "-n",
        "--names",
        dest="names",
        default=None,
        nargs="*",
        help="Enter a comma or space separated list of names for your bams",
    )
    parser.add_argument(
        "-p",
        "--plot",
        dest="plot",
        default="MULTIPLE",
        help=(
            "Choose either all lines on a single plot or multiple plots. options "
            "= 'SINGLE,MULTIPLE,MERGE'"
        ),
    )
    parser.add_argument(
        "-t",
        "--title",
        dest="title",
        default="",
        help=(
            "Specify a title for the output plot(s), default will be the "
            "coordinate region"
        ),
    )
    parser.add_argument(
        "-q",
        "--skip-cache",
        dest="skip_cache",
        action="store_true",
        default=False,
        help="Toggles option to skip loading annotation cache file",
    )

    parser.add_argument(
        "--scale",
        dest="scale",
        default=None,
        nargs="*",
        help=(
            "Enter a comma or space separated list of scaling factors for your "
            "bams. Default is none"
        ),
    )
    parser.add_argument(
        "--bed",
        dest="bed",
        nargs="*",
        help="Add a comma-delimited or space-delimited list of bed files to plot",
    )
    parser.add_argument(
        "--multi-page",
        dest="multi",
        action="store_true",
        default=False,
        help="If flagged will create a new pdf for each region",
    )

    # DEBUG OPTION TO SAVE TEMP FILES
    parser.add_argument(
        "--save-temp",
        dest="save",
        action="store_true",
        default=False,
        help="If flagged will save temporary files made by bamPlot",
    )

    args = parser.parse_args()

    print(args)

    if args.bam and args.input and args.genome and args.output:

        # Support a legacy mode where a ',' delimited multiple files
        bam_file_list = args.bam
        if len(bam_file_list) == 1:
            bam_file_list = bam_file_list[0].split(",")

        # Make sure these are actually files & readable (!)
        for filename in bam_file_list:
            assert os.access(filename, os.R_OK)

        # bringing in any beds
        if args.bed:
            bed_file_list = args.bed
            if len(bed_file_list) == 1:
                bed_file_list = bed_file_list[0].split(",")
            print(bed_file_list)
            bed_collection = make_bed_collection(bed_file_list)
        else:
            bed_collection = utils.LocusCollection([], 50)

        # Load the input for graphing. One of:
        # - A .gff
        # - A .bed
        # - a specific input region (e.g. chr10:.:93150000-93180000)

        valid_sense_options = {"+", "-", "."}
        if os.access(args.input, os.R_OK):
            if args.input.endswith(".bed"):
                # Uniquely graph every input of this bed
                parsed_input_bed = utils.parse_table(args.input, "\t")
                gff_name = os.path.basename(args.input)  # Graph title
                gff = None
                try:
                    if parsed_input_bed[0][5] in valid_sense_options:
                        # This .bed might have a sense parameter
                        gff = [
                            [e[0], "", args.input, e[1], e[2], "", e[5], "", ""]
                            for e in parsed_input_bed
                        ]
                except IndexError:
                    pass

                if gff is None:
                    print(
                        "Your bed doesn't have a valid sense parameter. Defaulting to both "
                        "strands, '.'"
                    )
                    # We only take chr/start/stop and ignore everything else.
                    gff = [
                        [e[0], "", args.input, e[1], e[2], "", ".", "", ""]
                        for e in parsed_input_bed
                    ]
            else:
                # Default to .gff, since that's the original behavior
                gff = utils.parse_table(args.input, "\t")
                gff_name = os.path.basename(args.input).split(".")[0]
        else:
            # means a coordinate line has been given e.g. chr1:+:1-100
            chrom_line = args.input.split(":")
            try:
                chrom = chrom_line[0]
                sense = chrom_line[1]
            except IndexError:
                print("Invalid input line or inaccessible file. Try: chr1:.:1-5000")
                exit()
            assert sense in valid_sense_options
            [start, end] = chrom_line[2].split("-")
            if chrom[0:3] != "chr":
                print("ERROR: UNRECOGNIZED GFF OR CHROMOSOME LINE INPUT")
                exit()
            gff_line = [chrom, "", args.input, start, end, "", sense, "", ""]
            gff_name = "{}_{}_{}_{}".format(chrom, sense, start, end)
            gff = [gff_line]

        # Consider stretching the regions to a fixed minimum size
        if args.stretch_input:
            print(
                "Stretching inputs to a minimum of: {} bp".format(
                    str(args.stretch_input)
                )
            )
            min_length = args.stretch_input
            stretch_gff = []
            for e in gff:
                difference = int(e[4]) - int(e[3])
                if difference < min_length:
                    pad = int((min_length - difference) / 2)
                    stretch_gff.append(
                        [
                            e[0],
                            e[1],
                            e[2],
                            int(e[3]) - pad,
                            int(e[4]) + pad,
                            e[5],
                            e[6],
                            e[7],
                            e[8],
                        ]
                    )
                else:
                    stretch_gff.append(e)

            gff = stretch_gff

        # Sanity test the gff object
        assert all([e[6] in valid_sense_options for e in gff])  # All strands are sane

        # bring in the genome
        genome = args.genome.upper()
        if not ["HG18", "HG19", "HG19_RIBO", "HG38", "MM9", "MM10", "RN4", "RN6"].count(
            genome
        ):
            print(
                "ERROR: UNSUPPORTED GENOME TYPE {}. USE HG18, HG19, HG19_RIBO, HG38, MM9, MM10, RN4, or RN6".format(
                    genome,
                )
            )
            parser.print_help()
            exit()

        # bring in the rest of the options

        # output
        root_folder = args.output
        try:
            os.listdir(root_folder)
        except OSError:
            print("ERROR: UNABLE TO FIND OUTPUT DIRECTORY {}".format(root_folder))
            exit()

        # Get analysis title
        if not args.title:
            title = gff_name
        else:
            title = args.title

        # make a temp folder
        temp_folder = os.path.join(root_folder, title)
        print("CREATING TEMP FOLDER {}".format(temp_folder))
        utils.format_folder(temp_folder, create=True)

        # colors
        if args.color:
            color_list = args.color
            if len(color_list) == 1:
                color_list = color_list[0].split(":")
            color_list = [x.split(",") for x in color_list]
            if len(color_list) < len(bam_file_list):
                print(
                    "WARNING: FEWER COLORS THAN BAMS SPECIFIED. COLORS WILL BE RECYCLED"
                )
                # recycling the color list
                color_list += color_list * (len(bam_file_list) // len(color_list))
                color_list = color_list[: len(bam_file_list)]

        else:
            # cycles through the colors of the rainbow
            color_list = taste_the_rainbow(len(bam_file_list))

        # sense
        sense = args.sense

        extension = int(args.extension)

        rpm = args.rpm

        scale = args.scale
        if scale:
            if len(scale) == 1:
                scale = scale[0].split(",")

        y_scale = args.y_scale.upper()

        # names
        if args.names:
            names = args.names
            if len(names) == 1:
                names = names[0].split(",")

            if len(names) != len(bam_file_list):
                print("ERROR: NUMBER OF NAMES AND NUMBER OF BAMS DO NOT CORRESPOND")
                parser.print_help()
                exit()
        else:
            names = [os.path.basename(x) for x in bam_file_list]

        # plot style
        plot_style = args.plot.upper()
        if not ["SINGLE", "MULTIPLE", "MERGE"].count(plot_style):
            print("ERROR: PLOT STYLE {} NOT AN OPTION".format(plot_style))
            parser.print_help()
            exit()

        # now run!
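        # NOTE: n_bins (the number of bins each region is divided into for
        # plotting) is assumed to be defined elsewhere in this script, e.g. as
        # a module-level constant; it is not set in this snippet.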
        summary_table_file_name = make_bam_plot_tables(
            gff,
            genome,
            bam_file_list,
            color_list,
            n_bins,
            sense,
            extension,
            rpm,
            temp_folder,
            names,
            title,
            bed_collection,
            scale,
        )
        print("{} is the summary table".format(summary_table_file_name))

        # running the R command to plot
        multi = args.multi
        out_file = os.path.join(root_folder, "{}_plots.pdf".format(title))
        r_cmd = call_r_plot(
            summary_table_file_name, out_file, y_scale, plot_style, multi
        )

        # open a bash file
        bash_file_name = os.path.join(temp_folder, "{}_Rcmd.sh".format(title))
        with open(bash_file_name, "w") as bash_file:
            bash_file.write("#!/usr/bin/bash\n")
            bash_file.write(r_cmd)
        print("Wrote R command to {}".format(bash_file_name))
        os.system("bash {}".format(bash_file_name))

        # delete temp files
        if not args.save:
            if utils.check_output(out_file, 1, 10):
                # This is super dangerous (!). Add some sanity checks.
                assert " " not in temp_folder
                assert temp_folder != "/"
                shutil.rmtree(temp_folder)
                print("Removing temp folder: {}".format(temp_folder))
            else:
                print("ERROR: NO OUTPUT FILE {} DETECTED".format(out_file))

    else:
        parser.print_help()
        sys.exit()
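

# A minimal usage sketch for the bamPlot CLI defined in main() above. The script
# name and all file paths are hypothetical placeholders; only the flags come
# from the argparse definitions:
#
#   python bam_plot.py \
#       -b sample1.bam sample2.bam \
#       -i chr1:.:1000000-1100000 \
#       -g HG19 \
#       -o plots/ \
#       -n SAMPLE1 SAMPLE2 \
#       -c 255,0,0 0,0,255 \
#       -r -t example_region \
#       --bed peaks.bed --multi-page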
Exemplo n.º 26
0
def make_bam_plot_tables(
    gff,
    genome,
    bam_file_list,
    color_list,
    n_bins,
    sense,
    extension,
    rpm,
    out_folder,
    names,
    title,
    bed_collection,
    scale=None,
):
    """Makes a plot table for each line of the gff mapped against all the bams in the bamList."""
    # load in the gff
    if isinstance(gff, str):
        gff = utils.parse_table(gff, "\t")

    # load in the annotation
    print("loading in annotation for {}".format(genome))
    gene_dict, tx_collection = load_annot_file(genome)

    # make an MMR dict so MMRs are only computed once
    print("Getting information about read depth in bams")
    mmr_dict = {}

    if scale:
        print("Applying scaling factors")
        scale_list = [float(x) for x in scale]
    else:
        scale_list = [1] * len(bam_file_list)

    # now iterate through the bam files
    for i, bam_file in enumerate(bam_file_list):
        # millionMappedReads
        idx_cmd = "samtools idxstats {}".format(bam_file)

        idx_pipe = subprocess.Popen(
            idx_cmd,
            stdin=subprocess.PIPE,
            stderr=subprocess.PIPE,
            stdout=subprocess.PIPE,
            shell=True,
        )  # TODO: this does not produce an error if samtools is not installed
        idx_stats = idx_pipe.communicate()
        idx_stats = idx_stats[0].decode("utf-8").split("\n")
        idx_stats = [line.split("\t") for line in idx_stats]
        raw_count = sum([int(line[2]) for line in idx_stats[:-1]])

        # implement scaling
        read_scale_factor = scale_list[i]

        if rpm:
            mmr = round(raw_count / 1000000 / read_scale_factor, 4)
        else:
            mmr = round(1 / read_scale_factor, 4)
        mmr_dict[bam_file] = mmr

    ticker = 1
    # go line by line in the gff
    summary_table = [
        [
            "DIAGRAM_TABLE",
            "NAME_TABLE",
            "BED_DIAGRAM_TABLE",
            "BED_NAME_TABLE",
            "PLOT_TABLE",
            "CHROM",
            "ID",
            "SENSE",
            "START",
            "END",
        ]
    ]
    for gff_line in gff:
        gff_string = "line_{}_{}_{}_{}_{}_{}".format(
            ticker, gff_line[0], gff_line[1], gff_line[6], gff_line[3], gff_line[4],
        )
        ticker += 1
        print("writing the gene diagram table for region {}".format(gff_line[1]))
        map_gff_line_to_annot(
            gff_line,
            out_folder,
            n_bins,
            gene_dict,
            tx_collection,
            sense="both",
            header=gff_string,
        )
        map_gff_line_to_bed(
            gff_line, out_folder, n_bins, bed_collection, header=gff_string,
        )
        out_table = []

        out_table.append(
            ["BAM", "GENE_ID", "NAME", "LOCUSLINE", "COLOR1", "COLOR2", "COLOR3"]
            + ["bin_" + str(n) for n in range(1, int(n_bins) + 1, 1)]
        )

        for i, bam_file in enumerate(bam_file_list):
            name = names[i]
            color = color_list[i]
            print(
                "getting data for location {} in dataset {}".format(
                    gff_line[1], bam_file
                )
            )
            mmr = mmr_dict[bam_file]
            new_line = map_bam_to_gff_line(
                bam_file, mmr, name, gff_line, color, n_bins, sense, extension,
            )
            out_table.append(new_line)

        # get the gene name
        if gff_line[1] in gene_dict:
            gene_name = gene_dict[gff_line[1]].common_name()
        else:
            gene_name = gff_line[1]
        utils.unparse_table(
            out_table,
            os.path.join(out_folder, "{}_plotTemp.txt".format(gff_string)),
            "\t",
        )
        diagram_table = os.path.join(
            out_folder, "{}_diagramTemp.txt".format(gff_string)
        )
        plot_table = os.path.join(out_folder, "{}_plotTemp.txt".format(gff_string))
        name_table = os.path.join(out_folder, "{}_nameTemp.txt".format(gff_string))
        bed_name_table = os.path.join(
            out_folder, "{}_bedNameTemp.txt".format(gff_string)
        )
        bed_diagram_table = os.path.join(
            out_folder, "{}_bedDiagramTemp.txt".format(gff_string)
        )
        summary_table.append(
            [
                diagram_table,
                name_table,
                bed_diagram_table,
                bed_name_table,
                plot_table,
                gff_line[0],
                gene_name,
                gff_line[6],
                gff_line[3],
                gff_line[4],
            ]
        )
    summary_table_file_name = os.path.join(out_folder, "{}_summary.txt".format(title))
    utils.unparse_table(summary_table, summary_table_file_name, "\t")
    return summary_table_file_name
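

# A minimal standalone sketch of the per-bam normalization factor computed in
# make_bam_plot_tables above: total mapped reads are read from
# `samtools idxstats` and converted into a million-mapped-reads divisor,
# optionally adjusted by a user scale factor. The function name is hypothetical
# and samtools is assumed to be on the PATH.
def million_mapped_reads_sketch(bam_file, rpm=True, read_scale_factor=1.0):
    """Return the divisor applied to read densities for one bam."""
    import subprocess

    out = subprocess.run(
        ["samtools", "idxstats", bam_file],
        capture_output=True, text=True, check=True,
    ).stdout
    # idxstats columns: ref_name, ref_length, mapped_reads, unmapped_reads
    raw_count = sum(int(line.split("\t")[2]) for line in out.splitlines() if line)
    if rpm:
        return round(raw_count / 1000000 / read_scale_factor, 4)
    return round(1 / read_scale_factor, 4)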
Exemplo n.º 27
0
def tf_edge_delta_out(
    crc_folder,
    bam_list,
    analysis_name,
    edge_table_path_1,
    edge_table_path_2,
    group1_list,
    group2_list,
    output="",
):
    """Calculates changes in group out degree at each predicted motif occurrence (by subpeaks)."""
    crc_folder = utils.format_folder(crc_folder, True)
    edge_path = merge_edge_tables(
        edge_table_path_1,
        edge_table_path_2,
        os.path.join(crc_folder, "{}_EDGE_TABLE.txt".format(analysis_name)),
    )

    # make a gff of the edge table
    edge_table = utils.parse_table(edge_path, "\t")
    edge_gff = []
    for line in edge_table[1:]:
        gff_line = [
            line[2],
            "{}_{}".format(line[0], line[1]),
            "",
            line[3],
            line[4],
            "",
            ".",
            "",
            "{}_{}".format(line[0], line[1]),
        ]
        edge_gff.append(gff_line)

    edge_gff_path = os.path.join(crc_folder,
                                 "{}_EDGE_TABLE.gff".format(analysis_name))
    utils.unparse_table(edge_gff, edge_gff_path, "\t")

    # direct the output to the crc folder
    signal_path = os.path.join(
        crc_folder, "{}_EDGE_TABLE_signal.txt".format(analysis_name))

    all_group_list = group1_list + group2_list
    if not utils.check_output(signal_path, 0, 0):
        signal_table_list = pipeline_utils.map_regions(
            bam_list,
            [edge_gff_path],
            crc_folder,
            crc_folder,
            all_group_list,
            True,
            signal_path,
            extend_reads_to=100,
        )
        print(signal_table_list)
    else:
        print("Found previous signal table at {}".format(signal_path))

    # now bring in the signal table as a dictionary using the locus line as the id
    print("making log2 group1 vs group2 signal table at edges")
    signal_table = utils.parse_table(signal_path, "\t")

    # figure out columns for group1 and group2
    group1_columns = [signal_table[0].index(name) for name in group1_list]
    group2_columns = [signal_table[0].index(name) for name in group2_list]
    group1_signal_vector = []
    group2_signal_vector = []
    for line in signal_table[1:]:
        group1_signal = numpy.mean(
            [float(line[col]) for col in group1_columns])
        group2_signal = numpy.mean(
            [float(line[col]) for col in group2_columns])

        group1_signal_vector.append(group1_signal)
        group2_signal_vector.append(group2_signal)

    group1_median = numpy.median(group1_signal_vector)
    group2_median = numpy.median(group2_signal_vector)

    print("group1 median signal")
    print(group1_median)
    print("group2 median signal")
    print(group2_median)

    # now that we have the median, we can take edges where at least 1 edge is above the median
    # and both are above zero and generate a new table w/ the fold change
    signal_filtered_path = signal_path.replace(".txt", "_filtered.txt")
    if utils.check_output(signal_filtered_path, 0, 0):
        print("Found filtered signal table for edges at {}".format(
            signal_filtered_path))
        signal_table_filtered = utils.parse_table(signal_filtered_path, "\t")
    else:
        signal_table_filtered = [
            signal_table[0] +
            ["GROUP1_MEAN", "GROUP2_MEAN", "GROUP1_vs_GROUP2_LOG2"]
        ]
        for line in signal_table[1:]:
            group1_signal = numpy.mean(
                [float(line[col]) for col in group1_columns])
            group2_signal = numpy.mean(
                [float(line[col]) for col in group2_columns])

            if (group1_signal > group1_median or group2_signal > group2_median
                ) and min(group1_signal, group2_signal) > 0:
                delta = numpy.log2(group1_signal / group2_signal)
                new_line = line + [group1_signal, group2_signal, delta]
                signal_table_filtered.append(new_line)

        utils.unparse_table(signal_table_filtered, signal_filtered_path, "\t")

    # now get a list of all TFs in the system
    tf_list = utils.uniquify(
        [line[0].split("_")[0] for line in signal_table_filtered[1:]])
    tf_list.sort()
    print(tf_list)

    out_degree_table = [[
        "TF_NAME",
        "EDGE_COUNT",
        "DELTA_MEAN",
        "DELTA_MEDIAN",
        "DELTA_STD",
        "DELTA_SEM",
    ]]

    for tf_name in tf_list:
        print(tf_name)
        edge_vector = [
            float(line[-1]) for line in signal_table_filtered[1:]
            if line[0].split("_")[0] == tf_name
        ]

        edge_count = len(edge_vector)
        delta_mean = round(numpy.mean(edge_vector), 4)
        delta_median = round(numpy.median(edge_vector), 4)
        delta_std = round(numpy.std(edge_vector), 4)
        delta_sem = round(stats.sem(edge_vector), 4)
        tf_out_line = [
            tf_name,
            edge_count,
            delta_mean,
            delta_median,
            delta_std,
            delta_sem,
        ]
        out_degree_table.append(tf_out_line)

    # set final output
    if not output:
        output_path = os.path.join(
            crc_folder, "{}_EDGE_DELTA_OUT.txt".format(analysis_name))
    else:
        output_path = output

    utils.unparse_table(out_degree_table, output_path, "\t")
    print(output_path)
    return output_path
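

# A minimal standalone sketch of the per-TF summary computed at the end of
# tf_edge_delta_out above: given one TF's vector of log2(group1/group2) edge
# deltas, it builds the EDGE_COUNT / DELTA_MEAN / DELTA_MEDIAN / DELTA_STD /
# DELTA_SEM row. The function name is hypothetical; numpy and scipy.stats
# (as `stats`) are assumed to be imported at module level, as in the function
# above.
def tf_delta_row_sketch(tf_name, edge_vector):
    """Summarize one TF's edge deltas in the same format as out_degree_table."""
    return [
        tf_name,
        len(edge_vector),
        round(numpy.mean(edge_vector), 4),
        round(numpy.median(edge_vector), 4),
        round(numpy.std(edge_vector), 4),
        round(stats.sem(edge_vector), 4),
    ]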