def make_ticks_image(width, interval_size, tick_params):
    minor_ticks_bp, major_ticks_bp = tick_params

    # Make the tick marks
    t = Ticks(minor_tick_mark_interval_size=(minor_ticks_bp / interval_size),
              major_tick_mark_interval_size=(major_ticks_bp / interval_size))

    # Ticks matrix with a height of 50 px and a max black value of 1
    ticks_matrix = make_ticks_matrix(width, 50, 1, t)

    # Write to a file
    ticks_matrix_filename = generate_random_filename()
    with open(ticks_matrix_filename, 'w') as file:
        for row in ticks_matrix:
            file.write("\t".join([str(val) for val in row]) + "\n")

    ticks_image_filename = generate_random_filename(".tiff")

    os.system(
        "/usr/bin/Rscript " + generate_heatmap_location + " " +
        " ".join([ticks_matrix_filename, "gray", ticks_image_filename, "2.2"]))

    remove_files(ticks_matrix_filename)

    return ticks_image_filename
Example #2
    def test_positive_read_three(self, stdout):
        region_filename = generate_random_filename()
        seq_filename = generate_random_filename()

        with open(region_filename, 'w') as file:
            file.write("\t".join(["chr1", "0", "10", "name", "0", "+"]))

        with open(seq_filename, 'w') as file:
            file.write("\t".join(["chr1", "2", "9", "name", "0", "+"]))

        metaplot.main(['three', region_filename, seq_filename])

        # Get the result from stdout by splitting it into lines and converting values to floats where possible
        result = [line for line in stdout.getvalue().split("\n") if line]
        result[0] = result[0].split("\t")

        for i, line in enumerate(result[1:]):
            result[i + 1] = [float(val) for val in line.split()]

        seq_file_basename = seq_filename.split("/")[-1]

        expected = [[
            "Position", seq_file_basename + " 3' sense strand",
            seq_file_basename + " 3' divergent strand"
        ], [-5, 0, 0], [-4, 0, 0], [-3, 0, 0], [-2, 0, 0], [-1, 0, 0],
                    [1, 0, 0], [2, 0, 0], [3, 0, 0], [4, 1, 0], [5, 0, 0]]

        remove_files(region_filename, seq_filename)
        self.assertEqual(result, expected)
Example #3
    def test_complete_run(self, stdout):
        reads_filename = generate_random_filename()

        with open(reads_filename, 'w') as file:
            file.write("chr1\t1\t10\tname\t0\t+\n")
            file.write("chr1\t10\t20\tname\t0\t-\n")

        regions_filename = generate_random_filename()

        with open(regions_filename, 'w') as file:
            file.write("chr1\t0\t20\tname\t0\t+\n")

        metaplot.main(['whole', regions_filename, reads_filename])

        output = stdout.getvalue().split("\n")[1:]

        result = []

        for line in output:
            if line:
                result.append(tuple([float(val) for val in line.split()]))

        remove_files(reads_filename, regions_filename)

        position = list(range(-10, 0)) + list(range(1, 11))
        fw_expected = [0] + [1] * 9 + [0] * 10
        rv_expected = [0] * 10 + [-1] * 10

        expected = list(zip(position, fw_expected, rv_expected))

        self.assertEqual(result, expected)
Example #4
    def test_get_counts(self):

        pause_regions_filename = generate_random_filename()

        with open(pause_regions_filename, 'w') as file:
            file.write(
                "\t".join(["chr1", "100", "250", "positive_gene", "90", "+"]) +
                "\n" + "\t".join(
                    ["chr1", "700", "850", "negative_gene", "1523", "-"]) +
                "\n")

        gene_body_filename = generate_random_filename()

        with open(gene_body_filename, 'w') as file:
            file.write(
                "\t".join(["chr1", "251", "750", "positive_gene", "90", "+"]) +
                "\n" + "\t".join(
                    ["chr1", "200", "700", "negative_gene", "1523", "-"]) +
                "\n")

        blacklisted_sequencing_file = generate_random_filename()

        with open(blacklisted_sequencing_file, 'w') as file:
            file.write(
                "\t".join(["chr1", "80", "220", "5'not_counted", "0", "+"]) +
                "\n" +
                "\t".join(["chr1", "259", "285", "5'not_counted", "0", "+"]) +
                "\n" + "\t".join(["chr1", "132", "220", "5'count", "0", "+"]) +
                "\n" + "\t".join(["chr1", "132", "800", "5'count", "0", "-"]) +
                "\n" + "\t".join(["chr1", "750", "783", "5'count", "0", "-"]) +
                "\n" +
                "\t".join(["chr1", "750", "900", "5'not_counted", "0", "-"]) +
                "\n" +
                "\t".join(["chr1", "500", "600", "5'not_counted", "0", "-"]) +
                "\n")

        indv_gene_counts_dict = truQuant.get_counts_in_paused_region(
            pause_regions_filename, blacklisted_sequencing_file)

        indv_gene_counts_dict = truQuant.get_counts_in_gene_bodies(
            gene_body_filename, blacklisted_sequencing_file,
            indv_gene_counts_dict)

        expected_indv_gene_counts_dict = {
            "positive_gene": {
                "Pause": 1,
                "Body": 1
            },
            "negative_gene": {
                "Pause": 2,
                "Body": 1
            }
        }

        self.assertDictEqual(indv_gene_counts_dict,
                             expected_indv_gene_counts_dict)

        remove_files(pause_regions_filename, gene_body_filename,
                     blacklisted_sequencing_file)
Example #5
    def test_two_sequencing_files(self, stdout):
        reads_filename = generate_random_filename()
        reads_filename_two = generate_random_filename()

        with open(reads_filename, 'w') as file:
            file.write("chr1\t1\t10\tname\t0\t+\n")
            file.write("chr1\t10\t20\tname\t0\t-\n")

        with open(reads_filename_two, 'w') as file:
            file.write("chr1\t1\t10\tname\t0\t-\n")
            file.write("chr1\t10\t20\tname\t0\t+\n")

        regions_filename = generate_random_filename()

        with open(regions_filename, 'w') as file:
            file.write("chr1\t0\t20\tname\t0\t+\n")

        metaplot.main(
            ['whole', regions_filename, reads_filename, reads_filename_two])

        output = stdout.getvalue().split("\n")
        header = output[0].split("\t")

        reads_basename = reads_filename.split("/")[-1]
        reads_basename_two = reads_filename_two.split("/")[-1]

        expected_header = [
            "Position", reads_basename + " whole sense strand",
            reads_basename + " whole divergent strand",
            reads_basename_two + " whole sense strand",
            reads_basename_two + " whole divergent strand"
        ]

        self.assertEqual(header, expected_header)

        result = []

        for line in output[1:]:
            if line:
                result.append(tuple([float(val) for val in line.split()]))

        remove_files(reads_filename, regions_filename, reads_filename_two)

        position = list(range(-10, 0)) + list(range(1, 11))
        fw_expected = [0] + [1] * 9 + [0] * 10
        rv_expected = [0] * 10 + [-1] * 10

        fw_expected_two = [0] * 10 + [1] * 10
        rv_expected_two = [0] + [-1] * 9 + [0] * 10

        expected = list(
            zip(position, fw_expected, rv_expected, fw_expected_two,
                rv_expected_two))

        self.assertEqual(result, expected)
def split_bed_file(bed_file):
    fw_filename = generate_random_filename()
    rv_filename = generate_random_filename()

    with open(fw_filename, 'w') as fw_file:
        with open(rv_filename, 'w') as rv_file:
            with open(bed_file) as file:
                for line in file:
                    if "+" in line:
                        fw_file.write(line)
                    else:
                        rv_file.write(line)

    return fw_filename, rv_filename
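
A minimal usage sketch of split_bed_file (the input filename is hypothetical); the caller is expected to clean up the two temporary files with remove_files once finished:

fw_reads, rv_reads = split_bed_file("reads.bed")  # hypothetical BED6 input
# ... run strand-specific processing on fw_reads and rv_reads ...
remove_files(fw_reads, rv_reads)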
def make_rgb_heatmap(fold_change_matrix_filename, heatmap_params,
                     output_filename_prefix):
    bp_width, width, height, gamma, max_fold_change, interval_size, minor_ticks_bp, major_ticks_bp = heatmap_params

    tick_params = minor_ticks_bp, major_ticks_bp

    only_heatmap_filename = generate_random_filename(extension=".tiff")

    if max_fold_change is not None:
        negative_max_fold_change = -1 * max_fold_change
    else:
        negative_max_fold_change = None

    generate_heatmap(fold_change_matrix_filename,
                     'red/blue',
                     only_heatmap_filename,
                     gamma,
                     negative_max_fold_change,
                     max_fold_change,
                     ticks=None)

    ticks_image_filename = make_ticks_image(width, interval_size, tick_params)

    # Combine the two images together
    output_filename = output_filename_prefix + "_max_" + str(
        max_fold_change) + "_width_" + str(
            bp_width) + "bp_gene_body_fold_change_heatmap"

    combine_images(ticks_image_filename, only_heatmap_filename,
                   output_filename)

    remove_files(fold_change_matrix_filename, ticks_image_filename,
                 only_heatmap_filename)
Example #8
def add_ticks_matrix(old_matrix_filename, ticks):
    matrix = []
    all_values = []
    with open(old_matrix_filename) as file:
        for line in file:
            if line != "":
                matrix.append([float(val) for val in line.split()])
                all_values.extend([float(val) for val in line.split()])

    max_value = max(all_values)
    matrix_width = len(matrix[0])
    matrix_height = len(matrix)

    if ticks:
        ticks_matrix = make_ticks_matrix(matrix_width, 50, max_value, ticks)

        # Add the ticks matrix to the bottom of the matrix
        matrix.extend(ticks_matrix)

    # Now write the new matrix to a file
    new_matrix_filename = generate_random_filename('.matrix')
    with open(new_matrix_filename, 'w') as file:
        for row in matrix:
            output_row = [str(val) for val in row]
            file.write("\t".join(output_row) + "\n")

    return new_matrix_filename
Example #9
    def test_incorrect_number_of_arguments(self):
        with self.assertRaises(SystemExit):
            with Quieter():
                nucleotide_heatmap.parse_args([])

        with self.assertRaises(SystemExit):
            with Quieter():
                nucleotide_heatmap.parse_args(["max_tss_file"])

        with self.assertRaises(SystemExit):
            with Quieter():
                nucleotide_heatmap.parse_args(["max_tss_file", "region_width"])

        with self.assertRaises(SystemExit):
            with Quieter():
                nucleotide_heatmap.parse_args(["max_tss_file", "region_width", "vertical_average", "extra"])

        max_tss_file = generate_random_filename()
        with open(max_tss_file, 'w') as file:
            file.write(
                "\t".join(['chr1', '1', '2', 'name', '0', '+'])
            )

        result = nucleotide_heatmap.parse_args([max_tss_file, '50', '2000', '2'])
        self.assertEqual(result, (max_tss_file, 50, 2000, 2))

        remove_files(max_tss_file)
Example #10
    def test_parse_input(self):
        # No arguments throws error
        with self.assertRaises(SystemExit):
            with Quieter():
                metaplot.parse_input([])

        # Needs a region file and at least one seq file
        regions_file = generate_random_filename()
        with open(regions_file, 'w') as file:
            file.write("\t".join(['chr1', '1', '3', 'name', '0', '+']))

        region_length = 2
        max_threads = multiprocessing.cpu_count()

        # These will work!!
        result = metaplot.parse_input(['five', regions_file, 'seq_file'])
        self.assertEqual(result,
                         ('five', regions_file, ['seq_file'], 2, max_threads))
        result = metaplot.parse_input(['three', regions_file, 'seq_file'])
        self.assertEqual(result,
                         ('three', regions_file, ['seq_file'], 2, max_threads))

        # Test the threading is working
        result = metaplot.parse_input(
            ['five', regions_file, 'seq_file', '-t', '4'])
        self.assertEqual(result, ('five', regions_file, ['seq_file'], 2, 4))

        result = metaplot.parse_input(
            ['three', regions_file, 'seq_file', '--threads', '4'])
        self.assertEqual(result, ('three', regions_file, ['seq_file'], 2, 4))

        remove_files(regions_file)
    def test_arguments(self):
        with self.assertRaises(SystemExit):
            with Quieter():
                sequence_from_region_around_max_tss.parse_args([])

        with self.assertRaises(SystemExit):
            with Quieter():
                sequence_from_region_around_max_tss.parse_args(
                    ["max_tss_file"])

        with self.assertRaises(SystemExit):
            with Quieter():
                sequence_from_region_around_max_tss.parse_args(
                    ["max_tss_file", 'left'])

        max_tss_file = generate_random_filename()

        with open(max_tss_file, 'w') as file:
            file.write(
                "\t".join(["chr16", "53607", "53608", "POLR3K", "0", "-"]) +
                "\n" +
                "\t".join(["chr16", "53872", "53873", "SNRNP25", "0", "+"]) +
                "\n")

        result = sequence_from_region_around_max_tss.parse_args(
            [max_tss_file, '-5', '10'])
        search = [["-", 5], ['+', 10]]
        self.assertEqual(result, (False, max_tss_file, search))

        remove_files(max_tss_file)
def run_coverage(regions_filename,
                 sequencing_filename,
                 output_filename='',
                 flags=None):
    """
    Runs strand specific bedtools coverage to get the number of counts in the sequencing file in the regions file.

    :param regions_filename: filename of the regions of the genome to quantify
    :type regions_filename: str
    :param sequencing_filename: filename of the sequencing data collected
    :type sequencing_filename: str
    :param output_filename: optional name of the output file (will be random if not provided)
    :type output_filename: str
    :param flags: optional flags (like -d for depth at each base)
    :type flags: list
    :return: filename of the resultant bedtools coverage output
    :rtype: str
    """
    if output_filename == '':
        output_filename = generate_random_filename()

    # Convert the flags to a single string
    if flags is not None:
        flag_string = ' '.join(flags)
    else:
        flag_string = ''

    verify_bed_files(regions_filename, sequencing_filename)

    os.system("bedtools coverage -s -nonamecheck " + flag_string + " -a " +
              regions_filename + " -b " + sequencing_filename + " > " +
              output_filename)

    return output_filename
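
A brief usage sketch, assuming bedtools is installed and on the PATH; the two input filenames here are hypothetical BED6 files:

# Per-base depth within each region (the -d flag is passed straight to bedtools coverage)
depth_file = run_coverage("regions.bed", "reads.bed", flags=["-d"])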
Example #13
def get_individual_matrix(regions_filename, seq_file_data, end,
                          repeat_amounts):
    repeat_amount, vertical_averaging = repeat_amounts
    seq_file, norm_factor = seq_file_data

    # 2. Load the 2D list containing the data to be output
    original_matrix = get_original_matrix(regions_filename, seq_file,
                                          norm_factor, end)

    # Expand the matrix using the repeat amounts and write it to a file
    matrix_filename = generate_random_filename(".matrix")

    with open(matrix_filename, 'w') as file:
        for row in original_matrix:
            # Make the row the correct size by repeating each element repeat_amount times
            output_list = []
            for val in row:
                for _ in range(repeat_amount):
                    output_list.append(str(val))

            file.write("\t".join(output_list) + "\n")

    # Do the vertical averaging
    heatmap_matrix = average_matrix(matrix_filename, vertical_averaging)
    remove_files(matrix_filename)

    return heatmap_matrix
    def test_get_regions_file(self):

        max_tss_file = generate_random_filename()

        with open(max_tss_file, 'w') as file:
            file.write(
                "\t".join(["chr16", "53607", "53608", "POLR3K", "0", "-"]) +
                "\n" +
                "\t".join(["chr16", "53872", "53873", "SNRNP25", "0", "+"]) +
                "\n")

        search = [["-", 5], ["+", 5]]

        chrom_sizes = {}
        with open(hg38_chrom_sizes_random_file) as file:
            for line in file:
                chrom, size = line.split()
                chrom_sizes[chrom] = int(size)

        region_file, gene_names = sequence_from_region_around_max_tss.get_regions_file(
            max_tss_file, search, chrom_sizes)

        result = []

        with open(region_file) as file:
            for line in file:
                result.append(line.split())

        expected = [["chr16", "53603", "53613", "POLR3K", "0", "-"],
                    ["chr16", "53867", "53877", "SNRNP25", "0", "+"]]

        self.assertEqual(result, expected)

        remove_files(max_tss_file, region_file)
Example #15
def gather_data(sequencing_file, blacklist_filename, annotated_dataset,
                region_filenames, truQuant_regions_dict):
    paused_region_filename, gene_body_region_filename = region_filenames

    region_data_dict = {}
    # We need to blacklist the data before running the program
    blacklisted_sequencing_filename = generate_random_filename()

    run_subtract(sequencing_file,
                 rna_blacklist_file,
                 blacklist_filename,
                 strand_specific=False,
                 output_filename=blacklisted_sequencing_filename)

    indv_gene_counts_dict = get_counts_in_paused_region(
        paused_region_filename, blacklisted_sequencing_filename)
    get_counts_in_gene_bodies(gene_body_region_filename,
                              blacklisted_sequencing_filename,
                              indv_gene_counts_dict)

    # Only get the region data from the dataset which was annotated
    if annotated_dataset:
        five_prime_counts_dict = build_counts_dict(sequencing_file, "five")

        for gene in truQuant_regions_dict:
            region_data_dict[gene] = get_region_data(
                truQuant_regions_dict[gene]["Pause"], five_prime_counts_dict)

    remove_files(blacklisted_sequencing_filename)

    return sequencing_file, indv_gene_counts_dict, region_data_dict
Example #16
    def test_expand_region(self):
        max_tss_file = generate_random_filename()

        with open(max_tss_file, 'w') as file:
            file.write(
                "\t".join(["chr1", "925739", "925740", "SAMD11", "0", "+"]) + "\n" +
                "\t".join(["chr1", "959255", "959256", "NOC2L", "0", "-"]) + "\n"
            )

        region_width = 20
        expanded_region = nucleotide_heatmap.expand_region(max_tss_file, region_width)

        result = []
        with open(expanded_region) as file:
            for line in file:
                result.append(line.split())

        expected = [
            ["chr1", "925729", "925749", "SAMD11", "0", "+"],
            ["chr1", "959246", "959266", "NOC2L", "0", "-"]
        ]

        self.assertEqual(result, expected)

        remove_files(expanded_region, max_tss_file)
Example #17
    def test_get_pausing_distances_helper(self):
        region_filename = generate_random_filename()

        with open(region_filename, 'w') as file:
            file.write(
                "\t".join(["chr1", "100", "101", "positive_gene", "0", "+"]) + "\n" +
                "\t".join(["chr1", "9999", "10000", "negative_gene", "0", "-"]) + "\n"
            )

        transcripts_dict = {
            "chr1": {
                "+": {
                    100: {200: 3, 300: 1, 800: 1, 283: 1}
                },
                "-": {
                    9999: {9000: 3, 8000: 1, 7050: 1, 6542: 1}
                }
            }
        }

        result = tps_distance_per_gene.get_pausing_distances_helper(region_filename, transcripts_dict, 1)

        expected = {
            "positive_gene": 101,
            "negative_gene": 999
        }

        self.assertDictEqual(result, expected)

        remove_files(region_filename)
Example #18
    def test_arguments(self):
        # Should print the usage
        with self.assertRaises(SystemExit):
            with Quieter():
                tps_distance_per_gene.parse_args([])

        with self.assertRaises(SystemExit):
            with Quieter():
                tps_distance_per_gene.parse_args(["regions filename"])

        regions_file = generate_random_filename()
        with open(regions_file, 'w') as file:
            file.write(
                "\t".join(['chr1', '1', '3', 'name', '0', '+'])
            )

        max_threads = multiprocessing.cpu_count()
        result = tps_distance_per_gene.parse_args([regions_file, 'seq_file'])
        self.assertEqual(result, (regions_file, ['seq_file'], 2, max_threads))

        result = tps_distance_per_gene.parse_args([regions_file, 'seq_file', '-t', '2'])
        self.assertEqual(result, (regions_file, ['seq_file'], 2, 2))

        result = tps_distance_per_gene.parse_args([regions_file, 'seq_file', '--threads', '2'])
        self.assertEqual(result, (regions_file, ['seq_file'], 2, 2))
def main(args):
    numerator_seq_files_data, denominator_seq_files_data, matrix_params, heatmap_params, filenames, max_threads = get_args(
        args)

    # Get the fold change matrix
    fold_change_matrix = get_fold_change_matrix(numerator_seq_files_data,
                                                denominator_seq_files_data,
                                                matrix_params, filenames,
                                                max_threads)

    # Now plot!
    bp_width, width, height, max_log2_fc, interval_size, minor_ticks, major_ticks = heatmap_params
    output_prefix = filenames[-1]

    output_filename = output_prefix + "_max_" + str(max_log2_fc) +  "_width_" + str(bp_width) + \
                      "bp_fold_change_TES_heatmap"

    only_heatmap_filename = generate_random_filename(".tiff")

    negative_log2_value = -1 * max_log2_fc if max_log2_fc else None

    generate_heatmap(fold_change_matrix, 'red/blue', only_heatmap_filename,
                     2.2, negative_log2_value, max_log2_fc)

    tick_params = (minor_ticks, major_ticks)

    ticks_image_filename = make_ticks_image(width, interval_size, tick_params)

    combine_images(ticks_image_filename, only_heatmap_filename,
                   output_filename)

    remove_files(fold_change_matrix, ticks_image_filename,
                 only_heatmap_filename)
Example #20
def read_coverage_file(coverage_file, width):
    # The goal is to make a table with the gene name followed by all of its coverage values
    data = defaultdict(list)

    with open(coverage_file) as file:
        for line in file:
            chrom, left, right, gene_name, score, strand, counts, _, _, _ = line.split()
            data[gene_name].append(counts)

    # Write the dictionary to a file by sorting by gene length
    lines = ["\t".join(data[gene_name]) for gene_name in data]
    num_lines = len(lines)

    sorted_matrix_filename = generate_random_filename(".sorted.matrix")

    with open(sorted_matrix_filename, 'w') as file:
        for line in lines:
            # Need to add 0's to get to the same length
            curr_length = len(line.split())
            append_string = "\t".join(["0"] * (width - curr_length))

            file.write(append_string + "\t" + line + "\n")

    return sorted_matrix_filename, num_lines
def blacklist_extended_gene_bodies(tsr_file, downstream_extension):
    """

    :param tsr_file:
    :type tsr_file: str
    :param blacklist_filename:
    :type blacklist_filename: str
    :param downstream_extension:
    :type downstream_extension: int
    :return:
    """
    annotation_extension = 1000
    percent_for_blacklisting = 0.3
    truQuant_regions_dict = {}

    blacklist_filename = generate_random_filename()

    # 1: Make the regions we are going to be searching for max TSSs in max TSRs
    search_regions_dict, annotations_dict = truQuant.make_search_regions(
        annotation_file, annotation_extension)

    # 2: Make the pause regions and gene bodies
    gene_tsr_dict, flow_through_tsrs = truQuant.map_tsrs_to_search_regions(
        tsr_file, search_regions_dict)
    max_tsrs_dict, non_max_tsrs_dict = truQuant.find_max_tsr_in_search_region(
        gene_tsr_dict)
    _define_pause_regions_and_gene_bodies(max_tsrs_dict, annotations_dict,
                                          truQuant_regions_dict,
                                          downstream_extension)
    _make_blacklisted_regions(blacklist_filename, annotations_dict,
                              max_tsrs_dict, non_max_tsrs_dict,
                              flow_through_tsrs, percent_for_blacklisting)

    return blacklist_filename
Example #22
def map_tsrs_to_search_regions(tsr_filename, search_regions_dict):
    # Maps the TSRs found from tsrFinder to the search regions
    # (5' end of gene extended upstream to the most downstream methionine)

    gene_tsr_dict = defaultdict(list)
    flow_through_tsrs = []

    search_regions_filename = generate_random_filename()

    with open(search_regions_filename, 'w') as file:
        for chrom in search_regions_dict:
            for region in search_regions_dict[chrom]:
                file.write("\t".join([str(val) for val in region]) + "\n")

    mapped_tsrs_filename = generate_random_filename()
    flow_through_tsrs_filename = generate_random_filename()

    os.system("bedtools intersect -s -a " + search_regions_filename + " -b " +
              tsr_filename + " -wa -wb > " + mapped_tsrs_filename)
    os.system("bedtools intersect -s -v -a " + tsr_filename + " -b " +
              search_regions_filename + " -wa > " + flow_through_tsrs_filename)

    with open(mapped_tsrs_filename) as file:
        for line in file:
            gene_name = line.split()[3]
            mapped_tsr = line.split()[6:]
            tsr_chromosome, tsr_left, tsr_right, tsr_read_sum, tsr_strength, tsr_strand, tss_left, tss_right, \
            tss_strength, avg_tss = mapped_tsr

            gene_tsr_dict[gene_name].append([
                tsr_chromosome, tsr_left, tsr_right, tsr_read_sum,
                tsr_strength, tsr_strand, avg_tss
            ])

    with open(flow_through_tsrs_filename) as file:
        for line in file:
            tsr_chromosome, tsr_left, tsr_right, tsr_read_sum, tsr_strength, tsr_strand, tss_left, tss_right, \
            tss_strength, avg_tss = line.split()
            flow_through_tsrs.append([
                tsr_chromosome, tsr_left, tsr_right, tsr_read_sum,
                tsr_strength, tsr_strand, avg_tss
            ])

    remove_files(search_regions_filename, mapped_tsrs_filename,
                 flow_through_tsrs_filename)

    return gene_tsr_dict, flow_through_tsrs
Example #23
    def test_complete_run(self, stdout):
        regions_filename = generate_random_filename()

        with open(regions_filename, 'w') as file:
            file.write(
                "\t".join(["chr1", "100", "101", "positive_gene", "0", "+"]) + "\n" +
                "\t".join(["chr1", "9999", "10000", "negative_gene", "0", "-"]) + "\n"
            )

        sequencing_file = generate_random_filename()

        with open(sequencing_file, 'w') as file:
            file.write(
                "\t".join(["chr1", "100", "201", "name", "0", "+"]) + "\n" +
                "\t".join(["chr1", "100", "301", "name", "0", "+"]) + "\n" +
                "\t".join(["chr1", "100", "801", "name", "0", "+"]) + "\n" +
                "\t".join(["chr1", "100", "201", "name", "0", "+"]) + "\n" +
                "\t".join(["chr1", "100", "201", "name", "0", "+"]) + "\n" +
                "\t".join(["chr1", "100", "284", "name", "0", "+"]) + "\n" +

                "\t".join(["chr1", "9000", "10000", "name", "0", "-"]) + "\n" +
                "\t".join(["chr1", "9000", "10000", "name", "0", "-"]) + "\n" +
                "\t".join(["chr1", "9000", "10000", "name", "0", "-"]) + "\n" +
                "\t".join(["chr1", "8000", "10000", "name", "0", "-"]) + "\n" +
                "\t".join(["chr1", "7050", "10000", "name", "0", "-"]) + "\n" +
                "\t".join(["chr1", "6542", "10000", "name", "0", "-"]) + "\n"
            )

        tps_distance_per_gene.main([regions_filename, sequencing_file])

        result = stdout.getvalue()

        # Eliminate the headers
        result = result.split("\n")[1:]

        # Put the result into a list
        result = [line.split() for line in result if line]

        expected = [
            ["positive_gene", "101"],
            ["negative_gene", "999"]
        ]

        self.assertEqual(result, expected)

        remove_files(regions_filename, sequencing_file)
Example #24
def make_incremented_regions(regions_filename, downstream_distance,
                             interval_size, upstream_distance):
    # Using the regions provided, make incremented regions
    with open(regions_filename) as file:
        regions = []
        for i, line in enumerate(file):
            if i != 0:
                gene_name, chromosome, pause_left, pause_right, strand, total_reads, max_tss, max_tss_five_prime_reads, avg_tss, \
                                    std_tss, gene_body_left, gene_body_right, *_ = line.split()

                if strand == "+":
                    region_left = int(max_tss) - upstream_distance
                    region_right = int(gene_body_right) + downstream_distance
                else:
                    region_left = int(gene_body_left) - downstream_distance
                    region_right = int(max_tss) + upstream_distance

                # Add the region to regions
                regions.append([
                    chromosome, region_left, region_right, gene_name,
                    max_tss_five_prime_reads, strand
                ])

    # Go through all the regions and make the incremented ones
    incremented_regions = []
    for region in regions:
        chromosome, left, right, gene_name, score, strand = region

        if strand == "+":
            # We work from left to right
            for i in range(left + interval_size, right + 1, interval_size):
                # Looping through each interval region
                incremented_regions.append([
                    chromosome, i - interval_size, i, gene_name, score, strand
                ])

        else:
            # We work from right to left
            if left + (interval_size - 1) > 0:
                for i in range(right, left + (interval_size - 1),
                               (-1 * interval_size)):
                    # Looping through each interval region
                    incremented_regions.append([
                        chromosome, i - interval_size, i, gene_name, score,
                        strand
                    ])

    region_intervals_filename = generate_random_filename()

    with open(region_intervals_filename, 'w') as tmp_region_file:
        output_writer = csv.writer(tmp_region_file,
                                   delimiter='\t',
                                   lineterminator='\n')
        for region in incremented_regions:
            output_writer.writerow(region)

    return region_intervals_filename
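
As an illustration of the interval logic above, a hypothetical 100 bp region on the + strand with an interval_size of 25 is walked left to right into four non-overlapping windows:

chromosome, left, right, interval_size = "chr1", 1000, 1100, 25  # illustrative values
intervals = [[chromosome, i - interval_size, i]
             for i in range(left + interval_size, right + 1, interval_size)]
# [['chr1', 1000, 1025], ['chr1', 1025, 1050], ['chr1', 1050, 1075], ['chr1', 1075, 1100]]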
Example #25
    def test_map_tsrs_to_search_regions(self):
        # Test six TSRs:
        # one contained entirely in the search region, one with partial overlap at the 5' end,
        # one with partial overlap at the 3' end, one with no overlap before the search region,
        # one with no overlap after it, and one on the opposite strand

        # Need to define a TSR file and a search regions dict
        search_regions_dict = {
            "chr1": [["chr1", "100", "200", "positive_strand_test", "0", "+"],
                     ["chr1", "500", "600", "negative_strand_test", "0", "-"]]
        }

        tsr_filename = generate_random_filename(".tab")

        additional_columns = [
            "tss_left", "tss_right", "tss_strength", "avg_tss"
        ]

        with open(tsr_filename, 'w') as file:
            file.write("\t".join(["chr1", "40", "60", "no_overlap", "0", "+"] +
                                 additional_columns) + "\n")
            file.write(
                "\t".join(["chr1", "300", "320", "no_overlap2", "0", "+"] +
                          additional_columns) + "\n")
            file.write("\t".join(
                ["chr1", "90", "110", "partial_overlap_5'", "0", "+"] +
                additional_columns) + "\n")
            file.write("\t".join(
                ["chr1", "190", "210", "partial_overlap_3'", "0", "+"] +
                additional_columns) + "\n")
            file.write("\t".join(
                ["chr1", "140", "160", "complete_overlap", "0", "+"] +
                additional_columns) + "\n")
            file.write(
                "\t".join(["chr1", "140", "160", "opposite_strand", "0", "-"] +
                          additional_columns) + "\n")

        gene_tsr_dict, flow_through_tsrs = truQuant.map_tsrs_to_search_regions(
            tsr_filename, search_regions_dict)

        expected_gene_tsr_dict = {
            "positive_strand_test":
            [["chr1", "90", "110", "partial_overlap_5'", "0", "+", "avg_tss"],
             ["chr1", "190", "210", "partial_overlap_3'", "0", "+", "avg_tss"],
             ["chr1", "140", "160", "complete_overlap", "0", "+", "avg_tss"]]
        }

        expected_flow_through_tsrs = [
            ["chr1", "40", "60", "no_overlap", "0", "+", "avg_tss"],
            ["chr1", "300", "320", "no_overlap2", "0", "+", "avg_tss"],
            ["chr1", "140", "160", "opposite_strand", "0", "-", "avg_tss"]
        ]

        self.assertDictEqual(gene_tsr_dict, expected_gene_tsr_dict)
        self.assertEqual(flow_through_tsrs, expected_flow_through_tsrs)

        remove_files(tsr_filename)
def make_incremented_regions(regions_filename, upstream_distance,
                             downstream_distance, interval_size):
    # Using the regions provided, make incremented regions
    with open(regions_filename) as file:
        regions = []
        for line in file:
            chromosome, left, right, gene_name, score, strand = line.split()

            if strand == "+":
                # We use the right position because that is the TES
                region_left = int(right) - upstream_distance
                region_right = int(right) + downstream_distance
            else:
                # We use the left position because that is the TES
                region_left = int(left) - downstream_distance
                region_right = int(left) + upstream_distance

            # Add the region to regions
            regions.append([
                chromosome, region_left, region_right, gene_name, score, strand
            ])

    # Go through all the regions and make the incremented ones
    incremented_regions = []
    for region in regions:
        chromosome, left, right, gene_name, score, strand = region

        if strand == "+":
            # We work from left to right
            for i in range(left + interval_size, right + 1, interval_size):
                # Looping through each interval region
                incremented_regions.append([
                    chromosome, i - interval_size, i, gene_name, score, strand
                ])

        else:
            # We work from right to left
            if left + (interval_size - 1) > 0:
                for i in range(right, left + (interval_size - 1),
                               (-1 * interval_size)):
                    # Looping through each interval region
                    incremented_regions.append([
                        chromosome, i - interval_size, i, gene_name, score,
                        strand
                    ])

    region_intervals_filename = generate_random_filename()

    with open(region_intervals_filename, 'w') as tmp_region_file:
        output_writer = csv.writer(tmp_region_file,
                                   delimiter='\t',
                                   lineterminator='\n')
        for region in incremented_regions:
            output_writer.writerow(region)

    return region_intervals_filename
    def test_with_n_in_sequence(self, stdout):
        region_file = generate_random_filename()

        with open(region_file, 'w') as file:
            file.write("chr1\t1\t11\tname\t0\t+")

        self.assertFalse(self.get_sequence(stdout.getvalue()))

        base_distribution.main([region_file])
        remove_files(region_file)
Example #28
def make_ticks_image(width, tick_params, tick_width):
    # Make the tick marks
    t = Ticks(*tick_params, offset=1, width=tick_width)

    # Ticks matrix with a height of 50 px and a max black value of 1
    ticks_matrix = make_ticks_matrix(width, 50, 1, t)

    # Write to a file
    ticks_matrix_filename = generate_random_filename('.matrix')
    with open(ticks_matrix_filename, 'w') as file:
        for row in ticks_matrix:
            file.write("\t".join([str(val) for val in row]) + "\n")

    ticks_image_filename = generate_random_filename(".tiff")

    os.system("/usr/bin/Rscript " + generate_heatmap_location + " " +
              " ".join([ticks_matrix_filename, "gray", ticks_image_filename, "2.2"]))

    remove_files(ticks_matrix_filename)

    return ticks_image_filename
def make_log_two_fold_change_matrix(numerator_filename, denominator_filename):
    # Divide the first (numerator) matrix by the second (denominator) matrix
    first_matrix = []
    with open(numerator_filename) as file:
        for line in file:
            first_matrix.append([float(val) for val in line.rstrip().split()])

    second_matrix = []
    with open(denominator_filename) as file:
        for line in file:
            second_matrix.append([float(val) for val in line.rstrip().split()])

    # Verify the matrices are the same size
    first_matrix_width = len(first_matrix[0])
    first_matrix_height = len(first_matrix)

    second_matrix_width = len(second_matrix[0])
    second_matrix_height = len(second_matrix)

    if (first_matrix_width != second_matrix_width
            or first_matrix_height != second_matrix_height):
        sys.stderr.write("The matrices are not the same size. Exiting ...")
        sys.exit(1)

    # Now divide the first by the second
    fold_change_matrix = []
    for i in range(first_matrix_height):
        fold_change_matrix.append([0 for _ in range(first_matrix_width)])

    for row in range(first_matrix_height):
        for col in range(first_matrix_width):
            numerator = first_matrix[row][col]
            denominator = second_matrix[row][col]

            if numerator == 0:
                numerator = 1

            if denominator == 0:
                denominator = 1

            # Take the log2 of this value
            log_two_fold_change = math.log2(numerator / denominator)

            fold_change_matrix[row][col] = log_two_fold_change

    # Write to a file
    fold_change_matrix_filename = generate_random_filename(".matrix")
    with open(fold_change_matrix_filename, 'w') as file:
        for row in fold_change_matrix:
            str_row = [str(val) for val in row]
            file.write("\t".join(str_row) + "\n")

    return fold_change_matrix_filename
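
For reference, the per-cell arithmetic reduces to the sketch below (illustrative values; zeros are clamped to 1 before dividing so the log is always defined):

import math

numerator, denominator = 8.0, 0.0            # one cell from each matrix
numerator = numerator if numerator else 1
denominator = denominator if denominator else 1
print(math.log2(numerator / denominator))    # 3.0, i.e. an 8-fold increase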
def get_coverage_files_helper(filename, region_intervals_file):
    # First make the 3' end bed file
    three_prime_end_file = make_read_end_file(filename, 'three')

    # Run coverage on the 3' end bed file
    coverage_file = generate_random_filename()
    run_coverage(region_intervals_file,
                 three_prime_end_file,
                 output_filename=coverage_file)

    remove_files(three_prime_end_file)
    return coverage_file