Example #1
def make_name_dict(data_file,
                   rose_folder,
                   names_list=None,  # avoid a mutable default argument
                   enhancer_type="super"):
    """For each name, check for the presence of an enriched file or allEnhancer table.

    These are the files required for enhancer clustering.

    """
    data_dict = pipeline_utils.load_data_table(data_file)

    # draw the parent folder from the data_file
    parent_folder = utils.format_folder(
        os.path.dirname(os.path.abspath(data_file)), False)
    if parent_folder.count("data_tables") == 1:
        parent_folder = parent_folder.replace("data_tables/", "")
    print("Using {} as the parent folder".format(parent_folder))

    # check to see if a rose folder exists already
    if utils.format_folder(rose_folder, False):
        rose_exists = True
        rose_folder = utils.format_folder(rose_folder, False)
    else:
        rose_exists = False
        rose_folder = utils.format_folder(rose_folder, True)

    # check names_list to see if datasets exist
    if not names_list:
        names_list = [
            name for name in data_dict if name.upper().count("WCE") == 0
            and name.upper().count("INPUT") == 0
        ]
        # if no names_list is given, this filters out WCE and INPUT controls

    # now check that all of the datasets at a minimum have a rose output OR enriched region file

    name_dict = defaultdict(dict)
    for name in names_list:
        # check if each dataset has a background
        background_name = data_dict[name]["background"]
        if background_name in data_dict:
            name_dict[name]["background"] = True
        else:
            name_dict[name]["background"] = False

        # assumes standard folder structure for enriched file
        enriched_file = os.path.join(parent_folder, "macsEnriched",
                                     data_dict[name]["enrichedMacs"])

        print("Looking for macs output at {}".format(enriched_file))

        try:
            open(enriched_file, "r").close()
            name_dict[name]["enriched_file"] = enriched_file
        except (IOError, FileNotFoundError):
            name_dict[name]["enriched_file"] = ""

        # roseOutput looks for standard format rose output
        # need an allEnhancers table and a region table to proceed
        # if the rose folder doesn't exist, don't bother
        if rose_exists:
            try:
                rose_output_files = os.listdir(
                    os.path.join(rose_folder, "{}_ROSE".format(name)))
                if enhancer_type == "super":
                    enhancer_string = "AllEnhancers.table.txt"
                elif enhancer_type == "stretch":
                    enhancer_string = "AllEnhancers_Length.table.txt"
                elif enhancer_type == "superstretch":
                    enhancer_string = "AllEnhancers_SuperStretch.table.txt"
                else:
                    raise ValueError(
                        "unsupported enhancer type {}".format(enhancer_type))

                all_enhancer_file_list = [
                    x for x in rose_output_files
                    if x.count(enhancer_string) == 1 and x[0] != "."
                ]  # no weird hidden or temp files
                if all_enhancer_file_list:
                    name_dict[name]["enhancer_file"] = os.path.join(
                        rose_folder, "{}_ROSE".format(name),
                        all_enhancer_file_list[0])
                else:
                    name_dict[name]["enhancer_file"] = ""
            except (OSError, FileNotFoundError):
                name_dict[name]["enhancer_file"] = ""
        else:
            name_dict[name]["enhancer_file"] = ""

        if (name_dict[name]["enhancer_file"] == ""
                and name_dict[name]["enriched_file"] == ""):
            print(
                "INSUFFICIENT DATA TO RUN ENAHNCER ANALYSIS ON {}. PLEASE MAKE SURE ROSE OUTPUT "
                "OR MACS ENRICHED REGION PEAKS FILE EXISTS".format(name))
            print(name_dict[name])
            sys.exit()

    return name_dict
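
# Usage sketch with hypothetical inputs: the data table path, ROSE folder, and
# dataset name below are placeholders, and `utils`/`pipeline_utils` are the
# pipeline's own helper modules assumed to be importable alongside this
# function.
if __name__ == "__main__":
    demo_name_dict = make_name_dict(
        "data_tables/chip_data_table.txt",  # hypothetical data table
        "./rose",                           # ROSE folder (created if missing)
        names_list=["EXAMPLE_H3K27AC"],     # hypothetical dataset name
        enhancer_type="super",
    )
    print(demo_name_dict["EXAMPLE_H3K27AC"])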
Example #2
def main():
    """Main function call."""
    parser = argparse.ArgumentParser()
    # required flags
    parser.add_argument(
        "-d",
        "--data",
        dest="data",
        default=None,
        required=True,
        help="Enter a data file for datasets to be processed",
    )
    parser.add_argument(
        "-o",
        "--output",
        dest="output",
        default=None,
        required=True,
        help="specify an output folder to write results to",
    )

    # additional args
    parser.add_argument(
        "-i",
        "--input",
        dest="input",
        required=False,
        help=
        "Enter a comma separated list of names to analyze. Default will be all datasets",
    )

    parser.add_argument(
        "-n",
        "--name",
        dest="name",
        required=False,
        help="Enter a name for the analysis",
    )

    parser.add_argument(
        "-r",
        "--rose",
        dest="rose",
        required=False,
        help="Enter a folder to detect or write rose output",
    )

    parser.add_argument(
        "-a",
        "--all",
        dest="all",
        action="store_true",
        default=False,
        help="flag to run analysis on ALL enhancers (this is much slower)",
    )
    parser.add_argument(
        "-s",
        "--stitch",
        dest="stitch",
        default="",
        help=
        ("specify a fixed stitch distance for all datasets, otherwise will compute stitching "
         "automatically on each dataset"),
    )
    parser.add_argument(
        "-e",
        "--enhancer-type",
        dest="enhancer_type",
        default="super",
        help=
        "specify type of enhancer to analyze: super, stretch, superStretch",
    )

    parser.add_argument(
        "-t",
        "--tss",
        dest="tss",
        default=2500,
        help="specify a tss exclusion window. default is 2500bp",
    )

    parser.add_argument(
        "--mask",
        dest="mask",
        required=False,
        help=
        "Create a mask set of regions to filter out of analysis. must be .bed or .gff format",
    )

    args = parser.parse_args()
    print(args)

    # pull in the data_file and create a data_dict
    data_file = args.data

    # now the output folder
    output_folder = utils.format_folder(
        args.output, True)  # check and create the output folder
    # now the rose folder
    if args.rose:
        rose_folder = args.rose
    else:
        rose_folder = os.path.join(output_folder, "rose")

    if args.input:
        names_list = args.input.split(",")
    else:
        names_list = []

    # get the genome
    data_dict = pipeline_utils.load_data_table(data_file)
    genome = data_dict[list(data_dict.keys())[0]]["genome"]

    # check if using only supers
    if args.all:
        super_only = False
    else:
        super_only = True

    # get the analysis name
    if args.name:
        analysis_name = args.name
    else:
        analysis_name = "enhancers"

    # check for a stitching parameter
    if len(str(args.stitch)) > 0:
        stitch = str(args.stitch)
    else:
        stitch = ""

    # check for the tss parameter
    tss_distance = int(args.tss)

    # check enhancer type
    enhancer_type = args.enhancer_type.lower()
    if ["super", "superstretch", "stretch"].count(enhancer_type) == 0:
        print("ERROR: unsupported enhancer type {}".format(enhancer_type))
        sys.exit()

    # see if there's a mask
    if args.mask:
        mask_file = args.mask
    else:
        mask_file = ""

    # =====================================================
    # =================SUMMARIZE INPUTS====================
    # =====================================================

    print("WORKING IN GENOME {}".format(genome))
    print("DRAWING DATA FROM {} AND ROSE FOLDER {}".format(
        data_file, rose_folder))
    print("USING {} AS THE OUTPUT FOLDER".format(output_folder))

    # =====================================================
    # ==============ESTABLISH ALL WORKING FILES============
    # =====================================================

    print("\n\n\nESTABLISHING WORKING FILES")
    name_dict = make_name_dict(data_file, rose_folder, names_list,
                               enhancer_type)

    print(name_dict)

    print("STARTING ANALYSIS ON THE FOLLOWING DATASETS:")
    print(list(name_dict.keys()))

    for name in name_dict:
        if len(name_dict[name]["enhancer_file"]) == 0:
            print("NO ROSE OUTPUT FOR {}".format(name))

    # =====================================================
    # ==============LAUNCH ENHANCER MAPPING================
    # =====================================================

    print("\n\n\nLAUNCHING ENHANCER MAPPING (IF NECESSARY)")
    name_dict = launch_enhancer_mapping(
        data_file,
        name_dict,
        output_folder,
        rose_folder,
        stitch,
        tss_distance,
        enhancer_type,
        mask_file,
    )
    print(name_dict)

    # =====================================================
    # ====================GET MEDIAN SIGNAL================
    # =====================================================

    print("\n\n\nGETTING MEDIAN ENHANCER SIGNAL FROM EACH SAMPLE")
    median_dict = make_median_dict(name_dict)

    print(median_dict)

    # =====================================================
    # ====================MERGING ENHANCERS================
    # =====================================================

    print("\n\n\nIDENTIFYING CONSENSUS ENHANCER REGIONS")

    merged_gff_file = os.path.join(
        output_folder, "{}_{}_-0_+0.gff".format(genome, analysis_name))
    merged_gff_file = merge_collections(name_dict, analysis_name,
                                        merged_gff_file, super_only)

    # =====================================================
    # ===============MAP TO MERGED REGIONS=================
    # =====================================================

    print("\n\n\nMAPPING DATA TO CONSENSUS ENHANCER REGIONS")
    merged_region_map = map_merged_gff(data_file, name_dict, merged_gff_file,
                                       analysis_name, output_folder, mask_file)

    # =====================================================
    # ==============CORRECT FOR MEDIAN SIGNAL==============
    # =====================================================

    print("\n\n\nCREATING ENHANCER SIGNAL TABLE")
    signal_table_file = make_enhancer_signal_table(name_dict,
                                                   merged_region_map,
                                                   median_dict, analysis_name,
                                                   genome, output_folder)

    # =====================================================
    # ===============CALL CLUSTERING R SCRIPT==============
    # =====================================================

    print("\n\n\nGENERATING CLUSTERING OUTPUT")
    cluster_table_file = call_r_script(genome, output_folder, analysis_name,
                                       signal_table_file)
    # output should be
    # png of cluster gram with rows as genes
    # png of cluster gram of samples w/ tree
    # ordered table w/ cluster assignment
    # similarity matrix for samples

    # =====================================================
    # =============GENE MAPPING BY CLUSTER=================
    # =====================================================

    cmd = "ROSE2_geneMapper -g {} -i {}".format(genome, cluster_table_file)
    os.system(cmd)

    print("FINISHED")
Example #3
def launch_enhancer_mapping(
    data_file,
    name_dict,
    output_folder,
    rose_folder,
    stitch,
    tss_distance,
    enhancer_type,
    mask_file="",
):
    """Launches enhancer mapping if needed from enriched region files."""
    names_list = list(name_dict.keys())

    # check to see if everything is good; if so, return the name_dict and call it a day
    if len([x for x in names_list
            if len(name_dict[x]["enhancer_file"]) > 0]) == len(names_list):
        print("ENHANCER FILE OUTPUT FOUND FOR ALL DATASETS")
        return name_dict

    # if not, have to call rose
    rose_output_folder = utils.format_folder(rose_folder, True)

    queue_list = []
    for name in names_list:
        # check to see if we need to call rose
        if name_dict[name]["enhancer_file"] == "":
            # get the enriched file
            enriched_file = name_dict[name]["enriched_file"]
            # call rose
            print("CALLING ROSE FOR {}".format(name))
            bash_file_name = pipeline_utils.call_rose2(
                data_file,
                "",
                rose_output_folder,
                [name],
                [],
                enriched_file,
                tss_distance,
                stitch,
                mask=mask_file,
            )
            print(bash_file_name)
            os.system("bash {}".format(bash_file_name))
            # add name to queue list
            queue_list.append(name)

    # define the enhancer type
    if enhancer_type == "super":
        enhancer_string = "AllEnhancers.table.txt"
    elif enhancer_type == "stretch":
        enhancer_string = "AllEnhancers_Length.table.txt"
    elif enhancer_type == "superstretch":
        enhancer_string = "AllEnhancers_SuperStretch.table.txt"
    else:
        raise ValueError("unsupported enhancer type {}".format(enhancer_type))

    # now check for completion of datasets
    for name in queue_list:
        # check for the AllEnhancers table
        enhancer_file = os.path.join(
            rose_output_folder,
            "{}_ROSE".format(name),
            "{}_peaks_{}".format(name, enhancer_string),
        )

        print("CHECKING FOR {} ROSE OUTPUT IN {}".format(name, enhancer_file))
        if utils.check_output(enhancer_file, 1, 10):

            print("FOUND ENHANCER OUTPUT FOR {}".format(name))
            name_dict[name]["enhancer_file"] = enhancer_file
        else:
            # try finding it w/ a different name
            # this will bug out if nothing is there
            rose_folder = os.path.join(rose_output_folder,
                                       "{}_ROSE".format(name))
            rose_file_list = [
                x for x in os.listdir(rose_folder) if x[0] != "."
            ]  # no hidden files
            if not rose_file_list:
                print("No files found in {}".format(rose_folder))
                sys.exit()
            enhancer_file = pipeline_utils.get_file(enhancer_string,
                                                    rose_file_list,
                                                    rose_folder)
            name_dict[name]["enhancer_file"] = enhancer_file

    return name_dict
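
# Usage sketch with a hand-built name_dict for one hypothetical dataset; the
# shape mirrors what make_name_dict produces (per-dataset "background",
# "enriched_file", and "enhancer_file" entries). All paths are placeholders.
if __name__ == "__main__":
    demo_name_dict = {
        "EXAMPLE_H3K27AC": {
            "background": True,
            "enriched_file": "macsEnriched/EXAMPLE_H3K27AC_peaks.bed",  # hypothetical
            "enhancer_file": "",  # empty string forces a ROSE call
        }
    }
    launch_enhancer_mapping(
        "data_tables/chip_data_table.txt",  # hypothetical data table
        demo_name_dict,
        "./cluster_output",
        "./rose",
        "",      # stitch: "" lets ROSE determine the stitching distance
        2500,    # tss_distance in bp
        "super",
    )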
Example #4
def map_merged_gff(data_file, name_dict, merged_gff_file, analysis_name,
                   output_folder, mask_file):
    """Call rose on the merged_gff_file for all datasets."""
    data_dict = pipeline_utils.load_data_table(data_file)
    rose_parent_folder = os.path.join(output_folder, "rose")
    utils.format_folder(rose_parent_folder, True)
    gff_name = os.path.basename(merged_gff_file).split(".")[0]
    bash_file_name = os.path.join(output_folder, "rose",
                                  "{}_roseCall.sh".format(analysis_name))
    # names_list is just the first dataset
    # extra_map will have to have all other datasets + their backgrounds

    names_list = list(name_dict.keys())
    names_list.sort()
    extra_map = []
    for name in names_list[1:]:
        if name_dict[name]["background"]:
            background_name = data_dict[name]["background"]
            if background_name in data_dict:
                extra_map += [name, background_name]
            else:
                print(
                    "ERROR: UNABLE TO FIND LISTED BACKGROUND DATASET {} FOR {}"
                    .format(background_name, name))
                sys.exit()
        else:
            extra_map += [name]

    print(extra_map)

    # first check to see if this has already been done
    merged_region_map = os.path.join(
        output_folder,
        "rose",
        "{}_ROSE".format(names_list[0]),
        "{}_0KB_STITCHED_ENHANCER_REGION_MAP.txt".format(gff_name),
    )
    print("LOOKING FOR REGION MAP AT {}".format(merged_region_map))

    if utils.check_output(merged_region_map, 1, 1):
        print("FOUND PREVIOUS REGION MAP")

        return merged_region_map

    bash_file_name = pipeline_utils.call_rose2(
        data_file,
        "",
        rose_parent_folder,
        [names_list[0]],
        extra_map,
        merged_gff_file,
        0,
        0,
        bash_file_name,
        mask=mask_file,
    )

    bash_command = "bash {}".format(bash_file_name)
    os.system(bash_command)
    print("Running enhancer mapping command:\n{}".format(bash_command))

    if utils.check_output(merged_region_map, 5, 60):
        return merged_region_map
    else:
        print(
            "UNABLE TO CALL ROSE ENHANCER MAPPING ON CONSENSUS ENHANCER FILE {}.\nEXITING NOW"
            "".format(merged_gff_file))
        sys.exit()
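
# Usage sketch (hypothetical paths): ranks the first dataset in name_dict on
# the merged consensus .gff and passes the remaining datasets (plus their
# backgrounds) through ROSE's extra-map list.
if __name__ == "__main__":
    demo_region_map = map_merged_gff(
        "data_tables/chip_data_table.txt",            # hypothetical data table
        {"EXAMPLE_H3K27AC": {"background": False}},   # minimal name_dict
        "./cluster_output/hg19_enhancers_-0_+0.gff",  # hypothetical merged gff
        "enhancers",
        "./cluster_output",
        "",  # no mask file
    )
    print(demo_region_map)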
Example #5
def main():
    """Main run function."""
    parser = argparse.ArgumentParser()

    # required flags
    parser.add_argument(
        "-b",
        "--bam",
        dest="bam",
        nargs="*",
        help="Enter a comma/space separated list of .bam files to be processed.",
        required=True,
    )
    parser.add_argument(
        "-i",
        "--input",
        dest="input",
        type=str,
        help="Enter .gff or genomic region e.g. chr1:+:1-1000.",
        required=True,
    )
    parser.add_argument(
        "-g",
        "--genome",
        dest="genome",
        type=str,
        help="specify a genome, HG18,HG19,MM8,MM9,MM10 are currently supported",
        required=True,
    )

    # output flag
    parser.add_argument(
        "-o",
        "--output",
        dest="output",
        type=str,
        help="Enter the output folder.",
        required=True,
    )

    # additional options
    parser.add_argument(
        "--stretch-input",
        dest="stretch_input",
        default=None,
        type=int,
        help=(
            "Stretch the input regions to a minimum length in bp, e.g. 10000 (for"
            " 10kb)"
        ),
    )
    parser.add_argument(
        "-c",
        "--color",
        dest="color",
        default=None,
        nargs="*",
        help=(
            "Enter a colon or space separated list of colors e.g. "
            "255,0,0:255,125,0, default samples the rainbow"
        ),
    )
    parser.add_argument(
        "-s",
        "--sense",
        dest="sense",
        default="both",
        help="Map to '+','-' or 'both' strands. Default maps to both.",
    )
    parser.add_argument(
        "-e",
        "--extension",
        dest="extension",
        default=200,
        help="Extends reads by n bp. Default value is 200bp",
    )
    parser.add_argument(
        "-r",
        "--rpm",
        dest="rpm",
        action="store_true",
        default=False,
        help="Normalizes density to reads per million (rpm) Default is False",
    )
    parser.add_argument(
        "-y",
        "--yScale",
        dest="y_scale",
        default="relative",
        help=(
            "Choose either relative or uniform y axis scaling. options = "
            "'relative,uniform' Default is relative scaling"
        ),
    )
    parser.add_argument(
        "-n",
        "--names",
        dest="names",
        default=None,
        nargs="*",
        help="Enter a comma or space separated list of names for your bams",
    )
    parser.add_argument(
        "-p",
        "--plot",
        dest="plot",
        default="MULTIPLE",
        help=(
            "Choose either all lines on a single plot or multiple plots. options "
            "= 'SINGLE,MULTIPLE,MERGE'"
        ),
    )
    parser.add_argument(
        "-t",
        "--title",
        dest="title",
        default="",
        help=(
            "Specify a title for the output plot(s), default will be the "
            "coordinate region"
        ),
    )
    parser.add_argument(
        "-q",
        "--skip-cache",
        dest="skip_cache",
        action="store_true",
        default=False,
        help="Toggles option to skip loading annotation cache file",
    )

    parser.add_argument(
        "--scale",
        dest="scale",
        default=None,
        nargs="*",
        help=(
            "Enter a comma or space separated list of scaling factors for your "
            "bams. Default is none"
        ),
    )
    parser.add_argument(
        "--bed",
        dest="bed",
        nargs="*",
        help="Add a comma-delimited or space-delimited list of bed files to plot",
    )
    parser.add_argument(
        "--multi-page",
        dest="multi",
        action="store_true",
        default=False,
        help="If flagged will create a new pdf for each region",
    )

    # DEBUG OPTION TO SAVE TEMP FILES
    parser.add_argument(
        "--save-temp",
        dest="save",
        action="store_true",
        default=False,
        help="If flagged will save temporary files made by bamPlot",
    )

    args = parser.parse_args()

    print(args)

    if args.bam and args.input and args.genome and args.output:

        # Support a legacy mode where a ',' delimits multiple files
        bam_file_list = args.bam
        if len(bam_file_list) == 1:
            bam_file_list = bam_file_list[0].split(",")

        # Make sure these are actually files & readable (!)
        for filename in bam_file_list:
            assert os.access(filename, os.R_OK)

        # bringing in any beds
        if args.bed:
            bed_file_list = args.bed
            if len(bed_file_list) == 1:
                bed_file_list = bed_file_list[0].split(",")
            print(bed_file_list)
            bed_collection = make_bed_collection(bed_file_list)
        else:
            bed_collection = utils.LocusCollection([], 50)

        # Load the input for graphing. One of:
        # - A .gff
        # - A .bed
        # - a specific input region (e.g. chr10:.:93150000-93180000)

        valid_sense_options = {"+", "-", "."}
        if os.access(args.input, os.R_OK):
            if args.input.endswith(".bed"):
                # Uniquely graph every input of this bed
                parsed_input_bed = utils.parse_table(args.input, "\t")
                gff_name = os.path.basename(args.input)  # Graph title
                gff = None
                try:
                    if parsed_input_bed[0][5] in valid_sense_options:
                        # This .bed might have a sense parameter
                        gff = [
                            [e[0], "", args.input, e[1], e[2], "", e[5], "", ""]
                            for e in parsed_input_bed
                        ]
                except IndexError:
                    pass

                if gff is None:
                    print(
                        "Your bed doesn't have a valid sense parameter. Defaulting to both "
                        "strands, '.'"
                    )
                    # We only take chr/start/stop and ignore everything else.
                    gff = [
                        [e[0], "", args.input, e[1], e[2], "", ".", "", ""]
                        for e in parsed_input_bed
                    ]
            else:
                # Default to .gff, since that's the original behavior
                gff = utils.parse_table(args.input, "\t")
                gff_name = os.path.basename(args.input).split(".")[0]
        else:
            # means a coordinate line has been given e.g. chr1:+:1-100
            chrom_line = args.input.split(":")
            try:
                chrom = chrom_line[0]
                sense = chrom_line[1]
            except IndexError:
                print("Invalid input line or inaccessible file. Try: chr1:.:1-5000")
                exit()
            assert sense in valid_sense_options
            [start, end] = chrom_line[2].split("-")
            if chrom[0:3] != "chr":
                print("ERROR: UNRECOGNIZED GFF OR CHROMOSOME LINE INPUT")
                exit()
            gff_line = [chrom, "", args.input, start, end, "", sense, "", ""]
            gff_name = "{}_{}_{}_{}".format(chrom, sense, start, end)
            gff = [gff_line]

        # Consider stretching the regions to a fixed minimum size
        if args.stretch_input:
            print(
                "Stretching inputs to a minimum of: {} bp".format(
                    str(args.stretch_input)
                )
            )
            min_length = args.stretch_input
            stretch_gff = []
            for e in gff:
                difference = int(e[4]) - int(e[3])
                if difference < min_length:
                    pad = int((min_length - difference) / 2)
                    stretch_gff.append(
                        [
                            e[0],
                            e[1],
                            e[2],
                            int(e[3]) - pad,
                            int(e[4]) + pad,
                            e[5],
                            e[6],
                            e[7],
                            e[8],
                        ]
                    )
                else:
                    stretch_gff.append(e)

            gff = stretch_gff

        # Sanity test the gff object
        assert all([e[6] in valid_sense_options for e in gff])  # All strands are sane

        # bring in the genome
        genome = args.genome.upper()
        if not ["HG18", "HG19", "HG19_RIBO", "HG38", "MM9", "MM10", "RN4", "RN6"].count(
            genome
        ):
            print(
                "ERROR: UNSUPPORTED GENOME TYPE {}. USE HG19,HG18, RN4, MM9, or MM10".format(
                    genome,
                )
            )
            parser.print_help()
            exit()

        # bring in the rest of the options

        # output
        root_folder = args.output
        try:
            os.listdir(root_folder)
        except OSError:
            print("ERROR: UNABLE TO FIND OUTPUT DIRECTORY {}".format(root_folder))
            exit()

        # Get analysis title
        if not args.title:
            title = gff_name
        else:
            title = args.title

        # make a temp folder
        temp_folder = os.path.join(root_folder, title)
        print("CREATING TEMP FOLDER {}".format(temp_folder))
        utils.format_folder(temp_folder, create=True)

        # colors
        if args.color:
            color_list = args.color
            if len(color_list) == 1:
                color_list = color_list[0].split(":")
            color_list = [x.split(",") for x in color_list]
            if len(color_list) < len(bam_file_list):
                print(
                    "WARNING: FEWER COLORS THAN BAMS SPECIFIED. COLORS WILL BE RECYCLED"
                )
                # recycling the color list
                color_list += color_list * (len(bam_file_list) // len(color_list))
                color_list = color_list[: len(bam_file_list)]

        else:
            # cycles through the colors of the rainbow
            color_list = taste_the_rainbow(len(bam_file_list))

        # sense
        sense = args.sense

        extension = int(args.extension)

        rpm = args.rpm

        scale = args.scale
        if scale:
            if len(scale) == 1:
                scale = scale[0].split(",")

        y_scale = args.y_scale.upper()

        # names
        if args.names:
            names = args.names
            if len(names) == 1:
                names = names[0].split(",")

            if len(names) != len(bam_file_list):
                print("ERROR: NUMBER OF NAMES AND NUMBER OF BAMS DO NOT CORRESPOND")
                parser.print_help()
                exit()
        else:
            names = [os.path.basename(x) for x in bam_file_list]

        # plot style
        plot_style = args.plot.upper()
        if not ["SINGLE", "MULTIPLE", "MERGE"].count(plot_style):
            print("ERROR: PLOT STYLE {} NOT AN OPTION".format(plot_style))
            parser.print_help()
            exit()

        # now run!
        # n_bins is assumed here; it is not defined anywhere else in this
        # snippet (200 bins per region is a plausible default resolution)
        n_bins = 200
        summary_table_file_name = make_bam_plot_tables(
            gff,
            genome,
            bam_file_list,
            color_list,
            n_bins,
            sense,
            extension,
            rpm,
            temp_folder,
            names,
            title,
            bed_collection,
            scale,
        )
        print("{} is the summary table".format(summary_table_file_name))

        # running the R command to plot
        multi = args.multi
        out_file = os.path.join(root_folder, "{}_plots.pdf".format(title))
        r_cmd = call_r_plot(
            summary_table_file_name, out_file, y_scale, plot_style, multi
        )

        # open a bash file
        bash_file_name = os.path.join(temp_folder, "{}_Rcmd.sh".format(title))
        with open(bash_file_name, "w") as bash_file:
            bash_file.write("#!/usr/bin/bash\n")
            bash_file.write(r_cmd)
        print("Wrote R command to {}".format(bash_file_name))
        os.system("bash {}".format(bash_file_name))

        # delete temp files
        if not args.save:
            if utils.check_output(out_file, 1, 10):
                # This is super dangerous (!). Add some sanity checks.
                assert " " not in temp_folder
                assert temp_folder != "/"
                print("Removing temp folder: {}".format(temp_folder))
                shutil.rmtree(temp_folder)
            else:
                print("ERROR: NO OUTPUT FILE {} DETECTED".format(out_file))

    else:
        parser.print_help()
        sys.exit()
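
# Example invocation (hypothetical script and file names):
#
#   python bamPlot.py -b sample1.bam,sample2.bam -i chr1:.:1000000-1100000 \
#       -g HG19 -o ./plots -n sample1,sample2 -p MULTIPLE -y relative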
Example #6
def main():
    """Main run call."""
    debug = False

    parser = argparse.ArgumentParser()
    # required flags
    parser.add_argument(
        "-i",
        "--i",
        dest="input",
        required=True,
        help=
        ("Enter a comma separated list of .gff or .bed file of binding sites used to make "
         "enhancers"),
    )
    parser.add_argument(
        "-r",
        "--rankby",
        dest="rankby",
        required=True,
        help="Enter a comma separated list of bams to rank by",
    )
    parser.add_argument("-o",
                        "--out",
                        dest="out",
                        required=True,
                        help="Enter an output folder")
    parser.add_argument(
        "-g",
        "--genome",
        dest="genome",
        required=True,
        help="Enter the genome build (MM9,MM8,HG18,HG19)",
    )

    # optional flags
    parser.add_argument(
        "-n",
        "--name",
        dest="name",
        required=False,
        help="Provide a name for the analysis otherwise ROSE will guess",
    )
    parser.add_argument(
        "-c",
        "--control",
        dest="control",
        required=False,
        help=
        ("Enter a comma separated list of control bams. Can either provide a single control "
         "bam for all rankby bams, or provide a control bam for each individual bam"
         ),
    )
    parser.add_argument(
        "-s",
        "--stitch",
        dest="stitch",
        default="",
        help=
        ("Enter a max linking distance for stitching. Default will determine optimal stitching"
         " parameter"),
    )
    parser.add_argument(
        "-t",
        "--tss",
        dest="tss",
        default=0,
        help="Enter a distance from TSS to exclude. 0 = no TSS exclusion",
    )

    parser.add_argument(
        "--mask",
        dest="mask",
        required=False,
        help=
        "Mask a set of regions from analysis.  Provide a .bed or .gff of masking regions",
    )

    # RETRIEVING FLAGS
    args = parser.parse_args()

    # making the out folder if it doesn't exist
    out_folder = utils.format_folder(args.out, True)

    # figuring out folder schema
    gff_folder = utils.format_folder(os.path.join(out_folder, "gff"), True)
    mapped_folder = utils.format_folder(os.path.join(out_folder, "mappedGFF"),
                                        True)

    # GETTING INPUT FILE(s)
    input_list = [
        input_file for input_file in args.input.split(",")
        if len(input_file) > 1
    ]

    # converting all input files into GFFs and moving into the GFF folder
    input_gf_list = []
    for input_file in input_list:
        # GETTING INPUT FILE
        if input_file.split(".")[-1] == "bed":
            # CONVERTING A BED TO GFF
            input_gff_name = os.path.basename(input_file)[0:-4]
            input_gff_file = os.path.join(gff_folder,
                                          "{}.gff".format(input_gff_name))
            utils.bed_to_gff(input_file, input_gff_file)
        elif input_file.split(".")[-1] == "gff":
            # COPY THE INPUT GFF TO THE GFF FOLDER
            input_gff_file = input_file
            copyfile(
                input_gff_file,
                os.path.join(gff_folder, os.path.basename(input_gff_file)),
            )
        else:
            print(
                "WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT"
            )
            # COPY THE INPUT GFF TO THE GFF FOLDER
            input_gff_file = input_file
            copyfile(
                input_gff_file,
                os.path.join(gff_folder, os.path.basename(input_gff_file)),
            )

        input_gf_list.append(input_gff_file)

    # GETTING THE LIST OF BAM FILES TO PROCESS
    # either same number of bams for rankby and control
    # or only 1 control #or none!
    # bamlist should be all rankby bams followed by control bams

    bam_file_list = []
    if args.control:
        control_bam_list = [
            bam for bam in args.control.split(",") if len(bam) > 0
        ]
        rankby_bam_list = [
            bam for bam in args.rankby.split(",") if len(bam) > 0
        ]

        if len(control_bam_list) == len(rankby_bam_list):
            # case where an equal number of backgrounds are given
            bam_file_list = rankby_bam_list + control_bam_list
        elif len(control_bam_list) == 1:
            # case where a universal background is applied
            bam_file_list = rankby_bam_list + control_bam_list * len(
                rankby_bam_list)
        else:
            print(
                "ERROR: EITHER PROVIDE A SINGLE CONTROL BAM FOR ALL SAMPLES, OR ONE CONTROL BAM"
                " FOR EACH SAMPLE")
            sys.exit()
    else:
        bam_file_list = [bam for bam in args.rankby.split(",") if len(bam) > 0]

    # Stitch parameter
    if args.stitch == "":
        stitch_window = ""
    else:
        stitch_window = int(args.stitch)

    # tss args
    tss_window = int(args.tss)
    if tss_window != 0:
        remove_tss = True
    else:
        remove_tss = False

    # GETTING THE GENOME
    genome = args.genome.upper()
    print("USING {} AS THE GENOME".format(genome))

    # GETTING THE CORRECT ANNOT FILE
    try:
        annot_file = rose2_utils.genome_dict[genome]
    except KeyError:
        print("ERROR: UNSUPPORTED GENOMES TYPE {}".format(genome))
        sys.exit()

    # FINDING THE ANALYSIS NAME
    if args.name:
        input_name = args.name
    else:
        input_name = os.path.basename(input_gf_list[0]).split(".")[0]
    print("USING {} AS THE ANALYSIS NAME".format(input_name))

    print("FORMATTING INPUT REGIONS")
    # MAKING THE RAW INPUT FILE FROM THE INPUT GFFs
    # use a simpler unique region naming system
    if len(input_gf_list) == 1:
        input_gff = utils.parse_table(input_gf_list[0], "\t")
    else:
        input_loci = []
        for gff_file in input_gf_list:
            print("\tprocessing {}".format(gff_file))
            gff = utils.parse_table(gff_file, "\t")
            gff_collection = utils.gff_to_locus_collection(gff, 50)
            input_loci += gff_collection.get_loci()

        input_collection = utils.LocusCollection(input_loci, 50)
        # stitch the collection to produce unique regions
        input_collection = input_collection.stitch_collection()

        input_gff = utils.locus_collection_to_gff(input_collection)

    formatted_gff = []
    # now number things appropriately
    for i, line in enumerate(input_gff):

        # use the coordinates to make a new id input_name_chr_sense_start_stop
        chrom = line[0]
        coords = [int(line[3]), int(line[4])]
        sense = line[6]

        line_id = "{}_{}".format(input_name, str(i + 1))  # 1 indexing

        new_line = [
            chrom,
            line_id,
            line_id,
            min(coords),
            max(coords),
            "",
            sense,
            "",
            line_id,
        ]
        formatted_gff.append(new_line)

    # name of the master input gff file
    master_gff_file = os.path.join(
        gff_folder, "{}_{}_ALL_-0_+0.gff".format(genome, input_name))
    utils.unparse_table(formatted_gff, master_gff_file, "\t")

    print("USING {} AS THE INPUT GFF".format(master_gff_file))

    # GET CHROMS FOUND IN THE BAMS
    print("GETTING CHROMS IN bam_fileS")
    bam_chrom_list = rose2_utils.get_bam_chrom_list(bam_file_list)
    print("USING THE FOLLOWING CHROMS")
    print(bam_chrom_list)

    # LOADING IN THE GFF AND FILTERING BY CHROM
    print("LOADING AND FILTERING THE GFF")
    input_gff = rose2_utils.filter_gff(master_gff_file, bam_chrom_list)
    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print("LOADING IN GFF REGIONS")
    reference_collection = utils.gff_to_locus_collection(input_gff)

    print("CHECKING REFERENCE COLLECTION:")
    rose2_utils.check_ref_collection(reference_collection)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if args.mask:
        mask_file = args.mask
        # if it's a bed file
        if mask_file.split(".")[-1].upper() == "BED":
            mask_gff = utils.bed_to_gff(mask_file)
        elif mask_file.split(".")[-1].upper() == "GFF":
            mask_gff = utils.parse_table(mask_file, "\t")
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit()
        mask_collection = utils.gff_to_locus_collection(mask_gff)

        # now mask the reference loci
        reference_loci = reference_collection.get_loci()
        filtered_loci = [
            locus for locus in reference_loci
            if len(mask_collection.get_overlap(locus, "both")) == 0
        ]
        print("FILTERED OUT {} LOCI THAT WERE MASKED IN {}".format(
            len(reference_loci) - len(filtered_loci), mask_file))
        reference_collection = utils.LocusCollection(filtered_loci, 50)

    # NOW STITCH REGIONS
    print("STITCHING REGIONS TOGETHER")
    stitched_collection, debug_output, stitch_window = rose2_utils.region_stitching(
        reference_collection,
        input_name,
        out_folder,
        stitch_window,
        tss_window,
        annot_file,
        remove_tss,
    )

    # NOW MAKE A STITCHED COLLECTION GFF
    print("MAKING GFF FROM STITCHED COLLECTION")
    stitched_gff = utils.locus_collection_to_gff(stitched_collection)

    print(stitch_window)
    print(type(stitch_window))
    if not remove_tss:
        stitched_gff_file = os.path.join(
            gff_folder,
            "{}_{}KB_STITCHED.gff".format(input_name,
                                          str(stitch_window // 1000)),
        )
        stitched_gff_name = "{}_{}KB_STITCHED".format(
            input_name, str(stitch_window // 1000))
        debug_out_file = os.path.join(
            gff_folder,
            "{}_{}KB_STITCHED.debug".format(input_name,
                                            str(stitch_window // 1000)),
        )
    else:
        stitched_gff_file = os.path.join(
            gff_folder,
            "{}_{}KB_STITCHED_TSS_DISTAL.gff".format(
                input_name, str(stitch_window // 1000)),
        )
        stitched_gff_name = "{}_{}KB_STITCHED_TSS_DISTAL".format(
            input_name, str(stitch_window // 1000))
        debug_out_file = os.path.join(
            gff_folder,
            "{}_{}KB_STITCHED_TSS_DISTAL.debug".format(
                input_name, str(stitch_window // 1000)),
        )

    # WRITING DEBUG OUTPUT TO DISK

    if debug:
        print("WRITING DEBUG OUTPUT TO DISK AS {}".format(debug_out_file))
        utils.unparse_table(debug_output, debug_out_file, "\t")

    # WRITE THE GFF TO DISK
    print("WRITING STITCHED GFF TO DISK AS {}".format(stitched_gff_file))
    utils.unparse_table(stitched_gff, stitched_gff_file, "\t")

    # SETTING UP THE OVERALL OUTPUT FILE
    output_file1 = os.path.join(
        out_folder, "{}_ENHANCER_REGION_MAP.txt".format(stitched_gff_name))
    print("OUTPUT WILL BE WRITTEN TO  {}".format(output_file1))

    # MAPPING TO THE NON STITCHED (ORIGINAL GFF)
    # MAPPING TO THE STITCHED GFF

    bam_file_list_unique = list(bam_file_list)
    bam_file_list_unique = utils.uniquify(bam_file_list_unique)
    # prevent redundant mapping
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bam_file_list_unique)
    for bam_file in bam_file_list_unique:

        bam_file_name = os.path.basename(bam_file)

        # MAPPING TO THE STITCHED GFF
        mapped_out1_folder = os.path.join(
            mapped_folder, "{}_{}_MAPPED".format(stitched_gff_name,
                                                 bam_file_name))
        mapped_out1_file = os.path.join(
            mapped_folder,
            "{}_{}_MAPPED".format(stitched_gff_name, bam_file_name),
            "matrix.txt",
        )
        if utils.check_output(mapped_out1_file, 0.2, 0.2):
            print("FOUND {} MAPPING DATA FOR BAM: {}".format(
                stitched_gff_file, mapped_out1_file))
        else:
            cmd1 = "bamliquidator_batch --sense . -e 200 --match_bamToGFF -r {} -o {} {}".format(
                stitched_gff_file,
                mapped_out1_folder,
                bam_file,
            )
            print(cmd1)

            os.system(cmd1)
            if utils.check_output(mapped_out1_file, 0.2, 5):
                print("SUCCESSFULLY MAPPED TO {} FROM BAM: {}".format(
                    stitched_gff_file, bam_file_name))
            else:
                print("ERROR: FAILED TO MAP {} FROM BAM: {}".format(
                    stitched_gff_file, bam_file_name))
                sys.exit()

    print("BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS")
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    rose2_utils.map_collection(
        stitched_collection,
        reference_collection,
        bam_file_list,
        mapped_folder,
        output_file1,
        ref_name=stitched_gff_name,
    )

    print("FINDING AVERAGE SIGNAL AMONGST BAMS")
    meta_output_file = collapse_region_map(output_file1,
                                           input_name + "_MERGED_SIGNAL",
                                           control_bams=args.control)

    # now try the merging

    print("CALLING AND PLOTTING SUPER-ENHANCERS")

    control_name = "NONE"
    cmd = "Rscript {} {} {} {} {}".format(
        os.path.join(ROOT_DIR, "scripts", "ROSE2_callSuper.R"),
        out_folder + "/",  # TODO: fix R script so it does not require '/'
        meta_output_file,
        input_name,
        control_name,
    )
    print(cmd)

    os.system(cmd)

    # calling the gene mapper
    print("CALLING GENE MAPPING")

    super_table_file = "{}_SuperEnhancers.table.txt".format(input_name)

    # for now don't use ranking bam to call top genes
    cmd = "ROSE2_geneMapper -g {} -i {} -f".format(
        genome, os.path.join(out_folder, super_table_file))
    print(cmd)
    os.system(cmd)

    stretch_table_file = "{}_StretchEnhancers.table.txt".format(input_name)

    cmd = "ROSE2_geneMapper -g {} -i {} -f".format(
        genome, os.path.join(out_folder, stretch_table_file))
    print(cmd)
    os.system(cmd)

    superstretch_table_file = "{}_SuperStretchEnhancers.table.txt".format(
        input_name)

    cmd = "ROSE2_geneMapper.py -g {} -i {} -f".format(genome, out_folder,
                                                      superstretch_table_file)
    os.system(cmd)
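
# Example invocation (hypothetical script and file names; -i takes a comma
# separated list of region files, -r the bams to rank by):
#
#   python ROSE2_META.py -i peaks_rep1.bed,peaks_rep2.bed \
#       -r rep1.bam,rep2.bam -c control.bam -g HG19 -o ./rose_meta -t 2500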
Example #7
def main():
    """Main run call."""
    debug = False
    parser = argparse.ArgumentParser()
    # required flags
    parser.add_argument(
        "-i",
        "--i",
        dest="input",
        required=True,
        help="Enter a .gff or .bed file of binding sites used to make enhancers",
    )
    parser.add_argument(
        "-r",
        "--rankby",
        dest="rankby",
        required=True,
        help="bam_file to rank enhancer by",
    )
    parser.add_argument(
        "-o", "--out", dest="out", required=True, help="Enter an output folder"
    )
    parser.add_argument(
        "-g",
        "--genome",
        dest="genome",
        required=True,
        help="Enter the genome build (MM9,MM8,HG18,HG19)",
    )

    # optional flags
    parser.add_argument(
        "-b",
        "--bams",
        dest="bams",
        required=False,
        help="Enter a comma separated list of additional bam files to map to",
    )
    parser.add_argument(
        "-c",
        "--control",
        dest="control",
        required=False,
        help="bam_file to rank enhancer by",
    )
    parser.add_argument(
        "-s",
        "--stitch",
        dest="stitch",
        default="",
        help=(
            "Enter a max linking distance for stitching. Default will determine optimal stitching"
            " parameter"
        ),
    )
    parser.add_argument(
        "-t",
        "--tss",
        dest="tss",
        default=0,
        help="Enter a distance from TSS to exclude. 0 = no TSS exclusion",
    )

    parser.add_argument(
        "--mask",
        dest="mask",
        required=False,
        help="Mask a set of regions from analysis.  Provide a .bed or .gff of masking regions",
    )

    # RETRIEVING FLAGS
    args = parser.parse_args()

    # making the out folder if it doesn't exist
    out_folder = utils.format_folder(args.out, True)

    # figuring out folder schema
    gff_folder = utils.format_folder(os.path.join(out_folder, "gff"), True)
    mapped_folder = utils.format_folder(os.path.join(out_folder, "mapped_gff"), True)

    # GETTING INPUT FILE
    if args.input.split(".")[-1] == "bed":
        # CONVERTING A BED TO GFF
        input_gff_name = args.input.split("/")[-1][0:-4]
        input_gff_file = os.path.join(gff_folder, "{}.gff".format(input_gff_name))
        utils.bed_to_gff(args.input, input_gff_file)
    elif args.input.split(".")[-1] == "gff":
        # COPY THE INPUT GFF TO THE GFF FOLDER
        input_gff_file = args.input
        copyfile(
            input_gff_file, os.path.join(gff_folder, os.path.basename(input_gff_file))
        )

    else:
        print(
            "WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT"
        )
        # COPY THE INPUT GFF TO THE GFF FOLDER
        input_gff_file = args.input
        copyfile(
            input_gff_file, os.path.join(gff_folder, os.path.basename(input_gff_file))
        )

    # GETTING THE LIST OF BAM FILES TO PROCESS
    if args.control:
        bam_file_list = [args.rankby, args.control]

    else:
        bam_file_list = [args.rankby]

    if args.bams:
        bam_file_list += args.bams.split(",")
        # bam_file_list = utils.uniquify(bam_file_list) # makes sad when you have the same control
        # bam over and over again
    # optional args

    # Stitch parameter
    if args.stitch == "":
        stitch_window = ""
    else:
        stitch_window = int(args.stitch)

    # tss args
    tss_window = int(args.tss)
    if tss_window != 0:
        remove_tss = True
    else:
        remove_tss = False

    # GETTING THE BOUND REGION FILE USED TO DEFINE ENHANCERS
    print("USING {} AS THE INPUT GFF".format(input_gff_file))
    input_name = os.path.basename(input_gff_file).split(".")[0]

    # GETTING THE GENOME
    genome = args.genome
    print("USING {} AS THE GENOME".format(genome))

    annot_file = rose2_utils.genome_dict[genome.upper()]

    # GET CHROMS FOUND IN THE BAMS
    print("GETTING CHROMS IN bam_fileS")
    bam_chrom_list = rose2_utils.get_bam_chrom_list(bam_file_list)
    print("USING THE FOLLOWING CHROMS")
    print(bam_chrom_list)

    # LOADING IN THE GFF AND FILTERING BY CHROM
    print("LOADING AND FILTERING THE GFF")
    input_gff = rose2_utils.filter_gff(input_gff_file, bam_chrom_list)
    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print("LOADING IN GFF REGIONS")
    reference_collection = utils.gff_to_locus_collection(input_gff)
    print("STARTING WITH {} INPUT REGIONS".format(len(reference_collection)))
    print("CHECKING REFERENCE COLLECTION:")
    rose2_utils.check_ref_collection(reference_collection)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if args.mask:
        mask_file = args.mask
        print("USING MASK FILE {}".format(mask_file))
        # if it's a bed file
        if mask_file.split(".")[-1].upper() == "BED":
            mask_gff = utils.bed_to_gff(mask_file)
        elif mask_file.split(".")[-1].upper() == "GFF":
            mask_gff = utils.parse_table(mask_file, "\t")
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit()

        mask_collection = utils.gff_to_locus_collection(mask_gff)
        print("LOADING {} MASK REGIONS".format(str(len(mask_collection))))
        # now mask the reference loci
        reference_loci = reference_collection.get_loci()
        filtered_loci = [
            locus
            for locus in reference_loci
            if len(mask_collection.get_overlap(locus, "both")) == 0
        ]
        print(
            "FILTERED OUT {} LOCI THAT WERE MASKED IN {}".format(
                str(len(reference_loci) - len(filtered_loci)), mask_file
            )
        )
        reference_collection = utils.LocusCollection(filtered_loci, 50)

    # NOW STITCH REGIONS
    print("STITCHING REGIONS TOGETHER")
    stitched_collection, debug_output, stitch_window = rose2_utils.region_stitching(
        reference_collection,
        input_name,
        out_folder,
        stitch_window,
        tss_window,
        annot_file,
        remove_tss,
    )
    # NOW MAKE A STITCHED COLLECTION GFF
    print("MAKING GFF FROM STITCHED COLLECTION")
    stitched_gff = utils.locus_collection_to_gff(stitched_collection)
    # making sure start/stop ordering are correct
    for line in stitched_gff:
        start = int(line[3])
        stop = int(line[4])
        if start > stop:
            line[3] = stop
            line[4] = start

    print(stitch_window)
    print(type(stitch_window))
    if not remove_tss:
        stitched_gff_file = os.path.join(
            gff_folder,
            "{}_{}KB_STITCHED.gff".format(input_name, str(stitch_window // 1000)),
        )
        stitched_gff_name = "{}_{}KB_STITCHED".format(
            input_name, str(stitch_window // 1000)
        )
        debug_out_file = os.path.join(
            gff_folder,
            "{}_{}KB_STITCHED.debug".format(input_name, str(stitch_window // 1000)),
        )
    else:
        stitched_gff_file = os.path.join(
            gff_folder,
            "{}_{}KB_STITCHED_TSS_DISTAL.gff".format(
                input_name, str(stitch_window // 1000)
            ),
        )
        stitched_gff_name = "{}_{}KB_STITCHED_TSS_DISTAL".format(
            input_name, str(stitch_window // 1000)
        )
        debug_out_file = os.path.join(
            gff_folder,
            "{}_{}KB_STITCHED_TSS_DISTAL.debug".format(
                input_name, str(stitch_window // 1000)
            ),
        )

    # WRITING DEBUG OUTPUT TO DISK
    if debug:
        print("WRITING DEBUG OUTPUT TO DISK AS {}".format(debug_out_file))
        utils.unparse_table(debug_output, debug_out_file, "\t")

    # WRITE THE GFF TO DISK
    print("WRITING STITCHED GFF TO DISK AS {}".format(stitched_gff_file))
    utils.unparse_table(stitched_gff, stitched_gff_file, "\t")

    # SETTING UP THE OVERALL OUTPUT FILE
    output_file1 = os.path.join(
        out_folder, "{}_ENHANCER_REGION_MAP.txt".format(stitched_gff_name)
    )
    print("OUTPUT WILL BE WRITTEN TO  {}".format(output_file1))

    # MAPPING TO THE NON STITCHED (ORIGINAL GFF)
    # MAPPING TO THE STITCHED GFF

    bam_file_list_unique = list(bam_file_list)
    bam_file_list_unique = utils.uniquify(bam_file_list_unique)
    # prevent redundant mapping
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bam_file_list_unique)
    for bam_file in bam_file_list_unique:

        bam_file_name = os.path.basename(bam_file)

        # MAPPING TO THE STITCHED GFF
        mapped_out1_folder = os.path.join(
            mapped_folder, "{}_{}_MAPPED".format(stitched_gff_name, bam_file_name)
        )
        mapped_out1_file = os.path.join(
            mapped_folder,
            "{}_{}_MAPPED".format(stitched_gff_name, bam_file_name),
            "matrix.txt",
        )
        if utils.check_output(mapped_out1_file, 0.2, 0.2):
            print(
                "FOUND {} MAPPING DATA FOR BAM: {}".format(
                    stitched_gff_file, mapped_out1_file
                )
            )
        else:
            cmd1 = "bamliquidator_batch --sense . -e 200 --match_bamToGFF -r {} -o {} {}".format(
                stitched_gff_file, mapped_out1_folder, bam_file,
            )
            print(cmd1)

            os.system(cmd1)
            if utils.check_output(mapped_out1_file, 0.2, 5):
                print(
                    "SUCCESSFULLY MAPPED TO {} FROM BAM: {}".format(
                        stitched_gff_file, bam_file_name
                    )
                )
            else:
                print(
                    "ERROR: FAILED TO MAP {} FROM BAM: {}".format(
                        stitched_gff_file, bam_file_name
                    )
                )
                sys.exit()

    print("BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS")
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    rose2_utils.map_collection(
        stitched_collection,
        reference_collection,
        bam_file_list,
        mapped_folder,
        output_file1,
        ref_name=stitched_gff_name,
    )

    print("CALLING AND PLOTTING SUPER-ENHANCERS")

    if args.control:
        control_name = os.path.basename(args.control)
    else:
        control_name = "NONE"
    cmd = "Rscript {} {} {} {} {}".format(
        os.path.join(ROOT_DIR, "scripts", "ROSE2_callSuper.R"),
        out_folder + "/",  # TODO: fix R script so it does not require '/'
        output_file1,
        input_name,
        control_name,
    )
    print(cmd)

    os.system(cmd)

    # calling the gene mapper
    time.sleep(20)
    super_table_file = "{}_SuperEnhancers.table.txt".format(input_name)
    if args.control:
        cmd = "ROSE2_geneMapper -g {} -r {} -c {} -i {}".format(
            genome,
            args.rankby,
            args.control,
            os.path.join(out_folder, super_table_file),
        )
    else:
        cmd = "ROSE2_geneMapper -g {} -r {} -i {}".format(
            genome, args.rankby, os.path.join(out_folder, super_table_file)
        )
    os.system(cmd)

    stretch_table_file = "{}_StretchEnhancers.table.txt".format(input_name)
    if args.control:
        cmd = "ROSE2_geneMapper -g {} -r {} -c {} -i {}".format(
            genome,
            args.rankby,
            args.control,
            os.path.join(out_folder, stretch_table_file),
        )
    else:
        cmd = "ROSE2_geneMapper -g {} -r {} -i {}".format(
            genome, args.rankby, os.path.join(out_folder, stretch_table_file)
        )
    os.system(cmd)

    superstretch_table_file = "{}_SuperStretchEnhancers.table.txt".format(input_name)
    if args.control:
        cmd = "ROSE2_geneMapper -g {} -r {} -c {} -i {}".format(
            genome,
            args.rankby,
            args.control,
            os.path.join(out_folder, superstretch_table_file),
        )
    else:
        cmd = "ROSE2_geneMapper -g {} -r {} -i {}".format(
            genome, args.rankby, os.path.join(out_folder, superstretch_table_file)
        )
    os.system(cmd)
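
# Example invocation (hypothetical script and file names):
#
#   python ROSE2.py -i peaks.bed -r sample.bam -c control.bam \
#       -g HG19 -o ./rose_output -t 2500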
Example #8
def tf_edge_delta_out(
    crc_folder,
    bam_list,
    analysis_name,
    edge_table_path_1,
    edge_table_path_2,
    group1_list,
    group2_list,
    output="",
):
    """Calculates changes in group out degree at each predicted motif occurrence (by subpeaks)."""
    crc_folder = utils.format_folder(crc_folder, True)
    edge_path = merge_edge_tables(
        edge_table_path_1,
        edge_table_path_2,
        os.path.join(crc_folder, "{}_EDGE_TABLE.txt".format(analysis_name)),
    )

    # make a gff of the edge table
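    # gff columns used here: chrom, edge name, <blank>, start, stop, <blank>,
    # strand ('.'), <blank>, unique edge ID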
    edge_table = utils.parse_table(edge_path, "\t")
    edge_gff = []
    for line in edge_table[1:]:
        gff_line = [
            line[2],
            "{}_{}".format(line[0], line[1]),
            "",
            line[3],
            line[4],
            "",
            ".",
            "",
            "{}_{}".format(line[0], line[1]),
        ]
        edge_gff.append(gff_line)

    edge_gff_path = os.path.join(crc_folder,
                                 "{}_EDGE_TABLE.gff".format(analysis_name))
    utils.unparse_table(edge_gff, edge_gff_path, "\t")

    # direct the output to the crc folder
    signal_path = os.path.join(
        crc_folder, "{}_EDGE_TABLE_signal.txt".format(analysis_name))

    all_group_list = group1_list + group2_list
    if not utils.check_output(signal_path, 0, 0):
        signal_table_list = pipeline_utils.map_regions(
            bam_list,
            [edge_gff_path],
            crc_folder,
            crc_folder,
            all_group_list,
            True,
            signal_path,
            extend_reads_to=100,
        )
        print(signal_table_list)
    else:
        print("Found previous signal table at {}".format(signal_path))

    # now bring in the signal table and find the columns for each group
    print("making log2 group1 vs group2 signal table at edges")
    signal_table = utils.parse_table(signal_path, "\t")

    # figure out columns for group1 and group2
    group1_columns = [signal_table[0].index(name) for name in group1_list]
    group2_columns = [signal_table[0].index(name) for name in group2_list]
    group1_signal_vector = []
    group2_signal_vector = []
    for line in signal_table[1:]:
        group1_signal = numpy.mean(
            [float(line[col]) for col in group1_columns])
        group2_signal = numpy.mean(
            [float(line[col]) for col in group2_columns])

        group1_signal_vector.append(group1_signal)
        group2_signal_vector.append(group2_signal)

    group1_median = numpy.median(group1_signal_vector)
    group2_median = numpy.median(group2_signal_vector)

    print("group1 median signal")
    print(group1_median)
    print("group2 median signal")
    print(group2_median)

    # now that we have the medians, keep edges where at least one group's signal
    # is above its median and both are above zero, and compute the log2 fold change
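    # e.g., group1_signal = 4.0 and group2_signal = 1.0 (both > 0, group1 above
    # its median) is kept with delta = log2(4.0 / 1.0) = 2.0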
    signal_filtered_path = signal_path.replace(".txt", "_filtered.txt")
    if utils.check_output(signal_filtered_path, 0, 0):
        print("Found filtered signal table for edges at {}".format(
            signal_filtered_path))
        signal_table_filtered = utils.parse_table(signal_filtered_path, "\t")
    else:
        signal_table_filtered = [
            signal_table[0] +
            ["GROUP1_MEAN", "GROUP2_MEAN", "GROUP1_vs_GROUP2_LOG2"]
        ]
        for line in signal_table[1:]:
            group1_signal = numpy.mean(
                [float(line[col]) for col in group1_columns])
            group2_signal = numpy.mean(
                [float(line[col]) for col in group2_columns])

            if ((group1_signal > group1_median or group2_signal > group2_median)
                    and min(group1_signal, group2_signal) > 0):
                delta = numpy.log2(group1_signal / group2_signal)
                new_line = line + [group1_signal, group2_signal, delta]
                signal_table_filtered.append(new_line)

        utils.unparse_table(signal_table_filtered, signal_filtered_path, "\t")

    # now get a list of all TFs in the system
    tf_list = utils.uniquify(
        [line[0].split("_")[0] for line in signal_table_filtered[1:]])
    tf_list.sort()
    print(tf_list)

    out_degree_table = [[
        "TF_NAME",
        "EDGE_COUNT",
        "DELTA_MEAN",
        "DELTA_MEDIAN",
        "DELTA_STD",
        "DELTA_SEM",
    ]]

    for tf_name in tf_list:
        print(tf_name)
        edge_vector = [
            float(line[-1]) for line in signal_table_filtered[1:]
            if line[0].split("_")[0] == tf_name
        ]

        edge_count = len(edge_vector)
        delta_mean = round(numpy.mean(edge_vector), 4)
        delta_median = round(numpy.median(edge_vector), 4)
        delta_std = round(numpy.std(edge_vector), 4)
        delta_sem = round(stats.sem(edge_vector), 4)
        tf_out_line = [
            tf_name,
            edge_count,
            delta_mean,
            delta_median,
            delta_std,
            delta_sem,
        ]
        out_degree_table.append(tf_out_line)

    # set final output
    if not output:
        output_path = os.path.join(
            crc_folder, "{}_EDGE_DELTA_OUT.txt".format(analysis_name))
    else:
        output_path = output

    utils.unparse_table(out_degree_table, output_path, "\t")
    print(output_path)
    return output_path
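A hypothetical call to tf_edge_delta_out; every path, BAM file, and sample name below is a placeholder:

# hypothetical usage sketch; all names below are placeholders
delta_table_path = tf_edge_delta_out(
    crc_folder="crc_analysis/",
    bam_list=["g1_rep1.bam", "g1_rep2.bam", "g2_rep1.bam", "g2_rep2.bam"],
    analysis_name="group1_vs_group2",
    edge_table_path_1="crc_analysis/group1_EDGE_TABLE.txt",
    edge_table_path_2="crc_analysis/group2_EDGE_TABLE.txt",
    group1_list=["g1_rep1", "g1_rep2"],  # must match signal table column names
    group2_list=["g2_rep1", "g2_rep2"],
)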
Example #9
def main():
    """Main run method for enhancer promoter contribution tool."""
    parser = argparse.ArgumentParser()

    # required flags
    parser.add_argument(
        "-b",
        "--bam",
        dest="bam",
        nargs="*",
        help="Enter a space separated list of .bam files for the main factor",
        required=True,
    )
    parser.add_argument(
        "-i",
        "--input",
        dest="input",
        type=str,
        help="Enter .gff or .bed file of regions to analyze",
        required=True,
    )
    parser.add_argument(
        "-g",
        "--genome",
        dest="genome",
        type=str,
        help=(
            "specify a genome, HG18,HG19,HG38,MM8,MM9,MM10,RN6 are currently "
            "supported"),
        required=True,
    )
    parser.add_argument(
        "-p",
        "--chrom-path",
        dest="chrom_path",
        type=str,
        help=("Provide path to a folder with a seperate fasta file for each "
              "chromosome"),
        required=True,
    )
    # output flag
    parser.add_argument(
        "-o",
        "--output",
        dest="output",
        type=str,
        help="Enter the output folder.",
        required=True,
    )

    # additional options flags and optional arguments
    parser.add_argument(
        "-a",
        "--activity",
        dest="activity",
        type=str,
        help=("specify a table where first column represents a list of active "
              "refseq genes"),
        required=False,
    )
    parser.add_argument(
        "-c",
        "--control",
        dest="control",
        nargs="*",
        help=("Enter a space separated list of .bam files for background. If "
              "flagged, will perform background subtraction"),
        required=False,
    )
    parser.add_argument(
        "-t",
        "--tss",
        dest="tss",
        type=int,
        help="Define the TSS area +/- the TSS. Default is 1kb",
        required=False,
        default=1000,
    )
    parser.add_argument(
        "-d",
        "--distal",
        dest="distal",
        type=int,
        help="Enter a window to assign distal enhancer signal. Default is 50kb",
        required=False,
        default=50000,
    )
    parser.add_argument(
        "--other-bams",
        dest="other",
        nargs="*",
        help="enter a space separated list of other bams to map to",
        required=False,
    )
    parser.add_argument(
        "--name",
        dest="name",
        type=str,
        help=
        ("enter a root name for the analysis, otherwise will try to find the "
         "name from the input file"),
        required=False,
    )
    parser.add_argument(
        "--top",
        dest="top",
        type=int,
        help=
        ("Run the analysis on the top N genes by total signal. Default is 5000"
         ),
        required=False,
        default=5000,
    )
    parser.add_argument(
        "--tads",
        dest="tads",
        type=str,
        help=
        ("Include a .bed of tad regions to restrict enhancer/gene association"
         ),
        required=False,
        default=None,
    )
    parser.add_argument(
        "--mask",
        dest="mask",
        default=None,
        help=(
            "Mask a set of regions from analysis.  Provide a .bed or .gff of "
            "masking regions"),
    )

    args = parser.parse_args()

    print(args)

    # =====================================================================================
    # ===============================I. PARSING ARGUMENTS==================================
    # =====================================================================================

    print(
        "\n\n#======================================\n#===========I. DATA SUMMARY============\n#="
        "=====================================\n")

    # top analysis subset
    top = args.top

    # input genome
    genome = args.genome.upper()
    print("PERFORMING ANALYSIS ON {} GENOME BUILD".format(genome))

    # set of bams
    bam_file_list = args.bam

    # bring in the input path
    input_path = args.input

    # try to get the input name or use the name argument
    if args.name:
        analysis_name = args.name
    else:
        analysis_name = os.path.basename(input_path).split(".")[0]

    print("USING {} AS ANALYSIS NAME".format(analysis_name))
    # setting up the output folder
    parent_folder = utils.format_folder(args.output, True)
    output_folder = utils.format_folder(
        os.path.join(parent_folder, analysis_name), True)

    print("WRITING OUTPUT TO {}".format(output_folder))

    if input_path.split(".")[-1] == "bed":
        # type is bed
        print("input in bed format, converting to gff")
        input_gff = utils.bed_to_gff(input_path)
    else:
        input_gff = utils.parse_table(input_path, "\t")

    # the tss window for proximal signal assignment
    tss_window = int(args.tss)

    # the distal window for assigning nearby enhancer signal
    distal_window = int(args.distal)

    # activity path
    if args.activity:
        activity_path = args.activity
        activity_table = utils.parse_table(activity_path, "\t")
        ref_col = 0
        # try to find the column holding the RefSeq IDs; use an internal row
        # (row 1) in case the table has a header
        for i in range(len(activity_table[1])):
            if str(activity_table[1][i]).count("NM_") or str(
                    activity_table[1][i]).count("NR_"):
                ref_col = i

        # now check the same column in the first row for a header
        if not str(activity_table[0][ref_col]).count("NM_") and not str(
                activity_table[0][ref_col]).count("NR_"):
            print("REMOVING HEADER FROM GENE TABLE:")
            print(activity_table[0])
            activity_table.pop(0)

        # this column needs to contain RefSeq NM/NR IDs
        gene_list = [line[ref_col] for line in activity_table]
        print("IDENTIFIED {} ACTIVE GENES".format(len(gene_list)))

    else:
        gene_list = []

    # check if tads are being invoked
    if args.tads:
        print("LOADING TAD LOCATIONS FROM {}".format(args.tads))
        tads_path = args.tads
    else:
        tads_path = ""

    print("LOADING ANNOTATION DATA FOR GENOME {}".format(genome))

    genome_dir = args.chrom_path

    # make a list of all chroms that have a sequence file
    chrom_list = utils.uniquify(
        [name.split(".")[0] for name in os.listdir(genome_dir) if name])

    # important here to define the window
    start_dict, tss_collection, mouse_convert_dict = load_annot_file(
        genome,
        tss_window,
        gene_list,
    )

    print("FILTERING THE INPUT GFF FOR GOOD CHROMOSOMES")

    print(chrom_list)
    filtered_gff = [line for line in input_gff if chrom_list.count(line[0])]

    print("{} of INITIAL {} REGIONS ARE IN GOOD CHROMOSOMES".format(
        str(len(filtered_gff)),
        str(len(input_gff)),
    ))

    # =====================================================================================
    # ================II. IDENTIFYING TSS PROXIMAL AND DISTAL ELEMENTS=====================
    # =====================================================================================

    print(
        "\n\n#======================================\n#==II. MAPPING TO TSS/DISTAL REGIONS===\n#="
        "=====================================\n")

    # now we need to split the input region
    print("SPLITTING THE INPUT GFF USING A WINDOW OF {}".format(tss_window))
    split_gff = split_regions(filtered_gff,
                              tss_collection,
                              mask_file=args.mask)
    print(len(filtered_gff))
    print(len(split_gff))

    split_gff_path = os.path.join(output_folder,
                                  "{}_SPLIT.gff".format(analysis_name))
    utils.unparse_table(split_gff, split_gff_path, "\t")
    print("WRITING TSS SPLIT GFF OUT TO {}".format(split_gff_path))

    # now you have to map the bams to the gff
    print("MAPPING TO THE SPLIT GFF")
    mapped_folder = utils.format_folder(
        os.path.join(output_folder, "bam_mapping"), True)

    signal_table = map_bams(bam_file_list, split_gff_path, analysis_name,
                            mapped_folder)
    signal_table_path = os.path.join(
        output_folder, "{}_signal_table.txt".format(analysis_name))
    utils.unparse_table(signal_table, signal_table_path, "\t")

    if args.control:
        control_bam_file_list = args.control
        control_signal_table = map_bams(
            control_bam_file_list,
            split_gff_path,
            analysis_name,
            mapped_folder,
        )
        control_signal_table_path = os.path.join(
            output_folder,
            "{}_control_signal_table.txt".format(analysis_name),
        )
        utils.unparse_table(control_signal_table, control_signal_table_path,
                            "\t")

    # now create the background subtracted summarized average table
    print("CREATING AN AVERAGE SIGNAL TABLE")
    average_table = make_average_table(
        output_folder,
        analysis_name,
        use_background=bool(args.control),  # pass an explicit True/False rather than the bam list
    )
    average_table_path = os.path.join(
        output_folder, "{}_average_table.txt".format(analysis_name))
    utils.unparse_table(average_table, average_table_path, "\t")

    # now load up all of the cpg and other parameters to make the actual peak table

    # first check if this has already been done
    peak_table_path = os.path.join(output_folder,
                                   "{}_PEAK_TABLE.txt".format(analysis_name))
    if utils.check_output(peak_table_path, 0.1, 0.1):
        print("PEAK TABLE OUTPUT ALREADY EXISTS")
        peak_table = utils.parse_table(peak_table_path, "\t")
    else:
        # NOTE: param_dict (the cpg and other parameters mentioned above) is
        # assumed to be defined upstream; it is not constructed in this excerpt
        peak_table = make_peak_table(
            param_dict,
            split_gff_path,
            average_table_path,
            start_dict,
            gene_list,
            genome_dir,
            tss_window,
            distal_window,
            tads_path,
        )
        utils.unparse_table(peak_table, peak_table_path, "\t")

    gene_table = make_gene_table(peak_table, analysis_name)

    gene_table_path = os.path.join(output_folder,
                                   "{}_GENE_TABLE.txt".format(analysis_name))
    utils.unparse_table(gene_table, gene_table_path, "\t")

    # if mouse, need to convert genes over
    if genome.count("MM") == 1:
        print("CONVERTING MOUSE NAMES TO HUMAN HOMOLOGS FOR GSEA")
        converted_gene_table_path = os.path.join(
            output_folder,
            "{}_GENE_TABLE_CONVERTED.txt".format(analysis_name),
        )

        converted_gene_table = [gene_table[0]]
        for line in gene_table[1:]:
            converted_name = mouse_convert_dict[line[0]]
            if converted_name:
                converted_gene_table.append([converted_name] + line[1:])

        # write the converted table once, after the loop completes
        utils.unparse_table(converted_gene_table,
                            converted_gene_table_path, "\t")

        gene_table_path = converted_gene_table_path
        gene_table = converted_gene_table

    # =====================================================================================
    # ===================================III. PLOTTING ====================================
    # =====================================================================================

    print(
        "\n\n#======================================\n#===III. PLOTTING ENHANCER/PROMOTER===\n#=="
        "====================================\n")

    # if there are fewer genes in the gene table than the top genes, only run on all
    if len(gene_table) < int(top):
        print(
            "WARNING: ONLY {} GENES WITH SIGNAL AT EITHER PROMOTERS OR ENHANCERS. NOT ENOUGH TO "
            "RUN ANALYSIS ON TOP {}".format(str(len(gene_table) - 1),
                                            str(top)))
        top = 0

    # now call the R code
    print("CALLING R PLOTTING SCRIPTS")
    call_r_waterfall(gene_table_path, output_folder, analysis_name, top)
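Assuming this main() is wired up as a command-line script (the script name here is hypothetical), an invocation could look like:

# hypothetical invocation; script name and all paths are placeholders
python enhancer_promoter.py -b treated_rep1.bam treated_rep2.bam \
    -i peaks.bed -g HG19 -p /genomes/hg19/chrom_fasta/ \
    -o ./ep_output/ --name treated_example --top 5000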
Example #10
def main():
    """Main run function."""
    parser = argparse.ArgumentParser()
    # required flags
    parser.add_argument(
        "-g",
        "--genome",
        dest="genome",
        required=True,
        help="Enter the genome build (HG18,HG19,MM9,RN4,RN6) for the project",
    )
    parser.add_argument(
        "-d",
        "--data",
        dest="data",
        required=True,
        help="Enter the data file for the project",
    )
    parser.add_argument(
        "-r",
        "--rose",
        dest="rose",
        required=True,
        help="Enter a comma separated list of rose folder",
    )
    parser.add_argument(
        "-o",
        "--output",
        dest="output",
        required=True,
        help="Enter the output folder for the project",
    )
    parser.add_argument(
        "-n",
        "--names",
        dest="names",
        required=True,
        help="Enter a comma separated list of names to go with the datasets",
    )

    # additional args
    parser.add_argument(
        "-p",
        "--plot",
        dest="plot",
        action="store_true",
        default=False,
        help="If flagged, will plot differential regions",
    )
    parser.add_argument(
        "-a",
        "--all",
        dest="all",
        action="store_true",
        default=False,
        help=
        "If flagged, will run analysis for all enhancers and not just supers.",
    )
    parser.add_argument(
        "-m",
        "--median",
        dest="median",
        action="store_true",
        default=False,
        help="If flagged, will use median enhancer scaling",
    )
    parser.add_argument(
        "-e",
        "--enhancer-type",
        dest="enhancer_type",
        default="super",
        help=
        "specify type of enhancer to analyze: super, stretch, superStretch",
    )

    args = parser.parse_args()

    print(args)

    genome = args.genome.upper()
    data_file = args.data

    rose_folder_string = args.rose
    rose_folder1, rose_folder2 = rose_folder_string.split(",")
    parent_folder = utils.format_folder(args.output, True)

    name_string = args.names
    name1, name2 = name_string.split(",")

    merge_name = "{}_{}_merged".format(name1, name2)

    # option for median scaling
    median_scale = args.median

    plot_bam = args.plot
    if args.all:
        super_only = False
    else:
        super_only = True

    if super_only and plot_bam:
        print(
            "Running dynamic enhancer analysis on all super enhancers in {} and {} and plotting "
            "output to {}".format(name1, name2, parent_folder))
    if super_only and not plot_bam:
        print(
            "Running dynamic enhancer analysis on all super enhancers in {} and {} and writing "
            "output to {}".format(name1, name2, parent_folder))
    if not super_only and plot_bam:
        print(
            "Running dynamic enhancer analysis on all enhancers in {} and {} and plotting output "
            "to {}. WARNING: Plotting all differential enhancers could take a while"
            .format(name1, name2, parent_folder))
    if not super_only and not plot_bam:
        print(
            "Running dynamic enhancer analysis on all enhancers in {} and {} and writing output "
            "to {}.".format(name1, name2, parent_folder))

    # part 1
    print("PART1: analyzing ROSE output from {} and {}".format(name1, name2))
    # start with the all enhancer tables from the initial rose calls

    rose_folder1 = utils.format_folder(rose_folder1, False)
    rose_folder2 = utils.format_folder(rose_folder2, False)

    rose_dict1 = make_rose_dict(rose_folder1)
    rose_dict2 = make_rose_dict(rose_folder2)

    # choosing the type of enhancer to analyze
    enhancer_call_type = args.enhancer_type.lower()
    if super_only:
        print("ANALYZING ENHANCER TYPE: {}".format(enhancer_call_type.upper()))

    super_file1 = rose_dict1[enhancer_call_type]
    super_file2 = rose_dict2[enhancer_call_type]

    all_file1 = rose_dict1["AllEnhancer"]
    all_file2 = rose_dict2["AllEnhancer"]

    print("\tMERGING ENHANCERS AND CALLING ROSE")
    if super_only:
        if len(super_file1) == 0:
            print("ERROR: UNABLE TO FIND {} FILES IN {}".format(
                enhancer_call_type, rose_folder1))
            sys.exit()
        if len(super_file2) == 0:
            print("ERROR: UNABLE TO FIND {} FILES IN {}".format(
                enhancer_call_type, rose_folder2))
            sys.exit()
        rose_output = call_merge_supers(
            data_file,
            super_file1,
            super_file2,
            name1,
            name2,
            merge_name,
            genome,
            parent_folder,
        )

    else:
        rose_output = call_merge_supers(
            data_file,
            all_file1,
            all_file2,
            name1,
            name2,
            merge_name,
            genome,
            parent_folder,
        )

    print("\tCALCULATING ENHANCER DELTA AND MAKING PLOTS")

    # part2 is the R script
    merged_gff_file = os.path.join(
        parent_folder,
        "{}_{}_MERGED_REGIONS_-0_+0.gff".format(genome, merge_name))
    rcmd = call_delta_r_script(
        merged_gff_file,
        parent_folder,
        data_file,
        name1,
        name2,
        all_file1,
        all_file2,
        median_scale,
    )
    print(rcmd)
    os.system(rcmd)

    time.sleep(30)
    call_rose_gene_mapper(merged_gff_file, genome, parent_folder, name1)

    # rank the genes

    # part 3
    # rank the delta
    print("PART 3: assinging ranks to differential enhancers")
    print("\tASSIGNING SUPER RANK TO MERGED ENHANCERS")

    gff_name = "{}_{}_MERGED_REGIONS_-0_+0".format(genome, merge_name)
    enhancer_to_gene_file = os.path.join(
        parent_folder,
        "{}_ROSE".format(name1),
        "{}_0KB_STITCHED_ENHANCER_DELTA_ENHANCER_TO_GENE_100KB.txt".format(
            gff_name),
    )
    if utils.check_output(enhancer_to_gene_file):
        rank_output = os.path.join(
            parent_folder,
            "{}_ROSE".format(name1),
            "{}_0KB_STITCHED_ENHANCER_DELTA_ENHANCER_TO_GENE_100KB_RANK.txt".
            format(gff_name),
        )
        assign_enhancer_rank(enhancer_to_gene_file, all_file1, all_file2,
                             name1, name2, rank_output)
    else:
        print("ERROR: DELTA SCRIPT OR ROSE GENE MAPPER FAILED TO RUN")
        sys.exit()

    # make the rank plot
    print("MAKING RANK PLOTS")
    if utils.check_output(rank_output):
        rcmd = call_rank_r_script(rank_output, name1, name2, super_file1,
                                  super_file2)
        print(rcmd)
        os.system(rcmd)
    else:
        print("ERROR: RANK PLOT SCRIPT FAILED TO RUN")
        sys.exit()

    time.sleep(30)

    print("FINISHING OUTPUT")
    finish_rank_output(
        data_file,
        rank_output,
        genome,
        parent_folder,
        merge_name,
        name1,
        name2,
        1,
        100000,
        super_only,
        plot_bam,
    )
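Assuming this main() is exposed as a command-line script (script name hypothetical), an invocation could look like:

# hypothetical invocation; script name, data table, and sample names are placeholders
python dynamic_enhancer.py -g HG19 -d data_table.txt \
    -r sample1_ROSE/,sample2_ROSE/ -o ./dynamic_out/ -n SAMPLE1,SAMPLE2 --plot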
Example #11
def finish_rank_output(
    data_file,
    rank_output,
    genome,
    merge_folder,
    merge_name,
    name1,
    name2,
    cut_off=1.5,
    window=100000,
    super_only=True,
    plot_bam=True,
):
    """Finish rank output.

    Clean up the rank output table. Make a gff of all of the gained/lost supers beyond a certain
    cut_off w/ a window. Make a list of gained genes and lost genes. Make a bed of gained loss.

    """
    data_dict = pipeline_utils.load_data_table(data_file)
    # making sure window and cut_off are int/float
    cut_off = float(cut_off)
    window = int(window)
    genome = genome.upper()

    # make the output folder
    output_folder = utils.format_folder(os.path.join(merge_folder, "output"),
                                        True)

    # bring in the old rank table
    rank_enhancer_table = utils.parse_table(rank_output, "\t")

    # make a new formatted table
    header = rank_enhancer_table[0]
    header[-4] = "DELTA RANK"
    header[-3] = "IS_SUPER"
    formatted_rank_table = [header]

    # the gffs
    gained_gff = []
    lost_gff = []

    gained_window_gff = []
    lost_window_gff = []

    if super_only:
        enhancer_type = "SUPERS"
    else:
        enhancer_type = "ENHANCERS"

    # the beds
    if super_only:
        gained_track_header = (
            'track name="{} {} only SEs" description="{} super enhancers that are found only in '
            '{} vs {}" itemRGB=On color=255,0,0'.format(
                genome, name2, genome, name2, name1))
        gained_bed = [[gained_track_header]]
        conserved_track_header = (
            'track name="{} {} and {} SEs" description="{} super enhancers that are found in both'
            ' {} vs {}" itemRGB=On color=0,0,0'.format(genome, name1, name2,
                                                       genome, name1, name2))
        conserved_bed = [[conserved_track_header]]

        lost_track_header = (
            'track name="{} {} only SEs" description="{} super enhancers that are found only in '
            '{} vs {}" itemRGB=On color=0,255,0'.format(
                genome, name1, genome, name1, name2))
        lost_bed = [[lost_track_header]]
    else:
        gained_track_header = (
            'track name="{} {} only enhancers" description="{} enhancers that are found only in '
            '{} vs {}" itemRGB=On color=255,0,0'.format(
                genome, name2, genome, name2, name1))
        gained_bed = [[gained_track_header]]
        conserved_track_header = (
            'track name="{} {} and {} enhancers" description="{} enhancers that are found in both'
            ' {} vs {}" itemRGB=On color=0,0,0'.format(genome, name1, name2,
                                                       genome, name1, name2))
        conserved_bed = [[conserved_track_header]]

        lost_track_header = (
            'track name="{} {} only enhancers" description="{} enhancers that are found only in '
            '{} vs {}" itemRGB=On color=0,255,0'.format(
                genome, name1, genome, name1, name2))
        lost_bed = [[lost_track_header]]

    # the genes
    gene_table = [[
        "GENE",
        "ENHANCER_ID",
        "ENHANCER_CHROM",
        "ENHANCER_START",
        "ENHANCER_STOP",
        header[6],
        header[7],
        header[8],
        "STATUS",
    ]]

    for line in rank_enhancer_table[1:]:
        # fixing the enhancer ID
        line[0] = line[0].replace("_lociStitched", "")
        formatted_rank_table.append(line)

        # getting the genes
        gene_list = []
        gene_list += line[9].split(",")
        gene_list += line[10].split(",")
        gene_list += line[11].split(",")
        gene_list = [x for x in gene_list if len(x) > 0]
        gene_list = utils.uniquify(gene_list)
        gene_string = ",".join(gene_list)

        bed_line = [line[1], line[2], line[3], line[0], line[-4]]

        # for gained
        if float(line[6]) > cut_off:
            gff_line = [
                line[1],
                line[0],
                "",
                line[2],
                line[3],
                "",
                ".",
                "",
                gene_string,
            ]
            gff_window_line = [
                line[1],
                line[0],
                "",
                int(line[2]) - window,
                int(line[3]) + window,
                "",
                ".",
                "",
                gene_string,
            ]
            gained_gff.append(gff_line)
            gained_window_gff.append(gff_window_line)
            gene_status = name2
            gained_bed.append(bed_line)
        # for lost
        elif float(line[6]) < (-1 * cut_off):
            gff_line = [
                line[1],
                line[0],
                "",
                line[2],
                line[3],
                "",
                ".",
                "",
                gene_string,
            ]
            gff_window_line = [
                line[1],
                line[0],
                "",
                int(line[2]) - window,
                int(line[3]) + window,
                "",
                ".",
                "",
                gene_string,
            ]
            lost_gff.append(gff_line)
            lost_window_gff.append(gff_window_line)
            gene_status = name1
            lost_bed.append(bed_line)
        # for conserved
        else:
            gene_status = "CONSERVED"
            conserved_bed.append(bed_line)

        # now fill in the gene Table
        for gene in gene_list:
            gene_table_line = [
                gene,
                line[0],
                line[1],
                line[2],
                line[3],
                line[6],
                line[7],
                line[8],
                gene_status,
            ]
            gene_table.append(gene_table_line)

    # concat the bed
    full_bed = gained_bed + conserved_bed + lost_bed

    # start writing the output
    # there are the gffs, the bed, the formatted table, and the gene table

    # formatted table
    formatted_filename = os.path.join(
        output_folder,
        "{}_{}_MERGED_{}_RANK_TABLE.txt".format(genome, merge_name,
                                                enhancer_type),
    )
    utils.unparse_table(formatted_rank_table, formatted_filename, "\t")

    # gffs
    gff_folder = utils.format_folder(os.path.join(output_folder, "gff"), True)
    gff_filename_gained = os.path.join(
        gff_folder,
        "{}_{}_{}_ONLY_{}_-0_+0.gff".format(genome, merge_name, name2.upper(),
                                            enhancer_type),
    )
    gff_filename_window_gained = os.path.join(
        gff_folder,
        "{}_{}_{}_ONLY_{}_-{}KB_+{}KB.gff".format(
            genome,
            merge_name,
            name2.upper(),
            enhancer_type,
            str(window // 1000),
            str(window // 1000),
        ),
    )

    gff_filename_lost = os.path.join(
        gff_folder,
        "{}_{}_{}_ONLY_{}_-0_+0.gff".format(genome, merge_name, name1.upper(),
                                            enhancer_type),
    )
    gff_filename_window_lost = os.path.join(
        gff_folder,
        "{}_{}_{}_ONLY_{}_-{}KB_+{}KB.gff".format(
            genome,
            merge_name,
            name1.upper(),
            enhancer_type,
            str(window // 1000),
            str(window // 1000),
        ),
    )

    utils.unparse_table(gained_gff, gff_filename_gained, "\t")
    utils.unparse_table(gained_window_gff, gff_filename_window_gained, "\t")

    utils.unparse_table(lost_gff, gff_filename_lost, "\t")
    utils.unparse_table(lost_window_gff, gff_filename_window_lost, "\t")

    # bed
    bed_filename = os.path.join(
        output_folder, "{}_{}_MERGED_{}.bed".format(genome, merge_name,
                                                    enhancer_type))
    utils.unparse_table(full_bed, bed_filename, "\t")

    # gene_table
    gene_filename = os.path.join(
        output_folder,
        "{}_{}_MERGED_{}_GENE_TABLE.txt".format(genome, merge_name,
                                                enhancer_type),
    )
    utils.unparse_table(gene_table, gene_filename, "\t")

    # finally, move all of the plots to the output folder
    copyfile(
        glob.glob(os.path.join(merge_folder, "{}_ROSE".format(name1),
                               "*.pdf"))[0],
        os.path.join(
            output_folder,
            "{}_{}_MERGED_{}_DELTA.pdf".format(genome, merge_name,
                                               enhancer_type),
        ),
    )

    copyfile(
        glob.glob(
            os.path.join(merge_folder, "{}_ROSE".format(name1),
                         "*RANK_PLOT.png"))[0],
        os.path.join(
            output_folder,
            "{}_{}_MERGED_{}_RANK_PLOT.png".format(genome, merge_name,
                                                   enhancer_type),
        ),
    )

    # now execute the bamPlot_turbo commands
    if plot_bam:
        bam1 = data_dict[name1]["bam"]
        bam2 = data_dict[name2]["bam"]
        bam_string = "{} {}".format(bam1, bam2)
        name_string = "{} {}".format(name1, name2)
        color_string = "0,0,0:100,100,100"

        if len(gained_gff) > 0:
            # gained command
            plot_title = "{}_ONLY_SE".format(name2)
            cmd = (
                "bamPlot_turbo -g {} -b {} -i {} -o {} -n {} -c {} -t {} -r -y UNIFORM -p "
                "MULTIPLE".format(
                    genome,
                    bam_string,
                    gff_filename_gained,
                    output_folder,
                    name_string,
                    color_string,
                    plot_title,
                ))
            os.system(cmd)

            # gained window command
            plot_title = "{}_ONLY_SE_{}KB_WINDOW".format(
                name2, str(window // 1000))
            cmd = (
                "bamPlot_turbo -g {} -b {} -i {} -o {} -n {} -c {} -t {} -r -y UNIFORM -p "
                "MULTIPLE".format(
                    genome,
                    bam_string,
                    gff_filename_window_gained,
                    output_folder,
                    name_string,
                    color_string,
                    plot_title,
                ))
            os.system(cmd)

        if len(lost_gff) > 0:
            # lost command
            plot_title = "{}_ONLY_SE".format(name1)
            cmd = (
                "bamPlot_turbo -g {} -b {} -i {} -o {} -n {} -c {} -t {} -r -y UNIFORM -p "
                "MULTIPLE".format(
                    genome,
                    bam_string,
                    gff_filename_lost,
                    output_folder,
                    name_string,
                    color_string,
                    plot_title,
                ))
            os.system(cmd)

            # lost window command
            plot_title = "{}_ONLY_SE_{}KB_WINDOW".format(
                name1, str(window // 1000))
            cmd = (
                "bamPlot_turbo -g {} -b {} -i {} -o {} -n {} -c {} -t {} -r -y UNIFORM -p "
                "MULTIPLE".format(
                    genome,
                    bam_string,
                    gff_filename_window_lost,
                    output_folder,
                    name_string,
                    color_string,
                    plot_title,
                ))
            os.system(cmd)

    return
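The gained/lost/conserved split above reduces to comparing the delta-rank column (line[6]) against cut_off; the same test in isolation, as a sketch:

def classify_delta(delta_rank, cut_off=1.5):
    # sketch of the classification applied to line[6] above
    if delta_rank > cut_off:
        return "gained"     # stronger in name2
    if delta_rank < -cut_off:
        return "lost"       # stronger in name1
    return "conserved"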