Exemplo n.º 1
0
 def test_run_process(self):
     # run_process should execute a simple shell command; use `touch`
     # to create a file and then check that it exists on disk
     target = "{0}/tests/data/test_file.txt".format(MODULE_DIR)
     remove_file(target)
     run_process(["touch", target])
     self.assertEqual(os.path.isfile(target), True)
Exemplo n.º 2
0
def intersect_with_bed(input_bam, input_bed, output_file, force_strand=True):
    """
    Given a bam file, intersect with a bed file to leave only those reads
    that correspond to entries in the bed file

    Parameters
    ---------
    input_bam : str
        Path to the .bam file containing the reads
    input_bed : str
        Path to the .bed file containing the intervals
    output_file : str
        Path to the output file
    force_strand : bool
        If set, ensure the reads map to the same strand

    Examples
    ---------
    >>> from bioUtilities.bam import intersect_with_bed
    >>> intersect_with_bed("reads.bam", "exons.bed", "exon_reads.bam")
    """

    # bedtools intersect does the filtering; the -s flag restricts hits
    # to the same strand and is only added when force_strand is requested
    args = ["bedtools", "intersect"]
    if force_strand:
        args.append("-s")
    args.extend(["-abam", input_bam, "-b", input_bed])
    run_process(args, file_for_output=output_file)
Exemplo n.º 3
0
def nm_filter(input_file, output_file, lower_limit = None, upper_limit = None):
    """
    Filter a .bam/.sam by NM (edit distance) tag value

    Parameters
    ---------
    input_file : str
        Path to the file to be filtered
    output_file : str
        Path to the output
    lower_limit : int
        If set, the lower boundary of NM values, for which all reads have to
        be greater than or equal to (defaults to 0)
    upper_limit : int
        If set, the upper boundary of NM values, for which all reads have to
        be less than or equal to (defaults to 10)

    Examples
    ---------
    >>> from bioUtilities.bam import nm_filter
    >>> nm_filter("test.bam", "test_output.bam", lower_limit = 2)
    >>> nm_filter("test.bam", "test_output.bam", upper_limit = 6)
    >>> nm_filter("test.bam", "test_output.bam", lower_limit = 2, upper_limit = 6)
    """

    # use None sentinels so that an explicit limit of 0 is not mistaken
    # for "not specified" (the old truthiness test treated 0 as missing
    # and the guard below could never fire with the 0/10 defaults)
    if lower_limit is None and upper_limit is None:
        raise Exception("\nERROR: You must specify at least one of the lower_limit or upper_limit thresholds.\n")

    if lower_limit is None:
        print("Using the default lower limit of 0.")
        lower_limit = 0
    if upper_limit is None:
        print("Using the default upper limit of 10.")
        upper_limit = 10

    # write to a temporary .sam first when .bam output is requested,
    # since the grep filter below produces plain text
    if output_file[-4:] == ".bam":
        temp_output_file = "{0}.sam".format(output_file[:-4])
    else:
        temp_output_file = output_file

    # keep header lines (^@) plus any read whose NM:i:<n> tag falls in
    # the inclusive range [lower_limit, upper_limit]
    grep_args = ["^@"]
    for nm_value in range(lower_limit, upper_limit + 1):
        grep_args.append("\|\tNM:i:{0}\t".format(nm_value))
    grep_args = "".join(grep_args)
    # NOTE(review): the trailing \t will miss a read whose NM tag is the
    # final field on the line (no trailing tab) — confirm with real data

    # read in the file
    file_reads = run_process(["samtools", "view", "-h", input_file])
    # now run the grep filter, writing the kept reads to the temp file
    run_process(["grep", grep_args], input_to_pipe = file_reads, file_for_output = temp_output_file)

    # if the output file is in bam format, convert the temporary .sam
    # and remove it
    if output_file != temp_output_file:
        samtools_args = ["samtools", "view", "-bh", temp_output_file]
        run_process(samtools_args, file_for_output = output_file)
        remove_file(temp_output_file)
Exemplo n.º 4
0
 def test_mapq_filter_lower_limit(self):
     input_file = "{0}/tests/data/input.bam".format(MODULE_DIR)
     expected_file = "{0}/tests/data/expected_mapq_filter_1.sam".format(MODULE_DIR)
     observed_file = "{0}/tests/data/observed_mapq_filter_1.bam".format(MODULE_DIR)
     mapq_filter(input_file, observed_file, lower_limit = 200)
     # the filter writes .bam; dump it back out as .sam with samtools so
     # the fields can be compared as plain text
     temp_observed = "{0}/tests/data/observed_mapq_filter_1.sam".format(MODULE_DIR)
     run_process(["samtools", "view", observed_file], file_for_output = temp_observed)
     expected = read_many_fields(expected_file, "\t")
     observed = read_many_fields(temp_observed, "\t")
     self.assertEqual(expected, observed)
     remove_file(temp_observed)
     remove_file(observed_file)
Exemplo n.º 5
0
def read_count(input_file):
    """
    Get the number of reads from a .bam/.sam file

    Parameters
    ---------
    input_file : str
        Path to the file to be counted

    Returns
    ---------
    read_count : int
        The number of reads in the specified file

    Examples
    ---------
    >>> from bioUtilities.bam import read_count
    >>> reads = read_count("test.bam")
    >>> print(reads)
    >>> 15
    """

    # `samtools view -c` prints just the read count; pull the first run
    # of digits out of its output and return it as an integer
    count_output = run_process(["samtools", "view", "-c", input_file])
    return int(re.findall("(\d+)", count_output)[0])
Exemplo n.º 6
0
 def test_xt_filter(self):
     input_file = "{0}/tests/data/input.bam".format(MODULE_DIR)
     expected_file = "{0}/tests/data/expected_xt_filter.sam".format(MODULE_DIR)
     observed_file = "{0}/tests/data/observed_xt_filter.bam".format(MODULE_DIR)
     xt_filter(input_file, observed_file, filter="XT:A:U")
     # dump the .bam output back to .sam so the fields can be compared
     # as plain text
     temp_observed = "{0}/tests/data/observed_xt_filter.sam".format(MODULE_DIR)
     run_process(["samtools", "view", observed_file], file_for_output=temp_observed)
     expected = read_many_fields(expected_file, "\t")
     observed = read_many_fields(temp_observed, "\t")
     self.assertEqual(expected, observed)
     remove_file(temp_observed)
     remove_file(observed_file)
Exemplo n.º 7
0
def xt_filter(input_file, output_file, filter=None):
    """
    Filter a .bam/.sam file by the XT tag

    Parameters
    ---------
    input_file : str
        Path to the file to be filtered
    output_file : str
        Path to the output
    filter : str
        Tag value that kept reads must contain, e.g. "XT:A:U"

    Examples
    ---------
    >>> from bioUtilities.bam import xt_filter
    >>> xt_filter("test.bam", "test_xt_filtered.bam", filter = "XT:A:U")
    """

    # validate the *parameter*: the original tested the function object
    # itself ("if not xt_filter"), which is always truthy, so a missing
    # filter was never caught
    if not filter:
        raise Exception('\nXT filter not specified.\n')
    # if the output format is .bam, temporarily create .sam output file
    if output_file[-4:] == ".bam":
        temp_output_file = "{0}.sam".format(output_file[:-4])
    else:
        temp_output_file = output_file

    # dump the reads (with header) as text for grep to filter
    sam_output = run_process(["samtools", "view", "-h", input_file])
    grep_args = []
    # keep header lines
    grep_args.append("^@")
    # keep reads whose XT tag matches the filter
    grep_args.append("\|\t{0}\t".format(filter))
    grep_args = "".join(grep_args)
    # run the filter
    run_process(["grep", grep_args],
                input_to_pipe=sam_output,
                file_for_output=temp_output_file)

    # if wanting to create bam, create bam and delete sam
    if output_file != temp_output_file:
        samtools_args = ["samtools", "view", "-bh", temp_output_file]
        run_process(samtools_args, file_for_output=output_file)
        remove_file(temp_output_file)
Exemplo n.º 8
0
 def test_intersect_with_bed(self):
     input_bed = "{0}/tests/data/input.bed".format(MODULE_DIR)
     input_bam = "{0}/tests/data/input3.bam".format(MODULE_DIR)
     observed_file = "{0}/tests/data/observed_intersect_with_bed.bam".format(MODULE_DIR)
     expected_file = "{0}/tests/data/expected_intersect_with_bed.sam".format(MODULE_DIR)
     remove_file(observed_file)
     intersect_with_bed(input_bam, input_bed, observed_file)
     # dump the .bam output to .sam so it can be compared as text
     observed_sam = "{0}/tests/data/observed_intersect_with_bed.sam".format(MODULE_DIR)
     run_process(["samtools", "view", "-h", observed_file],
                 file_for_output=observed_sam)
     observed = read_many_fields(observed_sam)
     expected = read_many_fields(expected_file)
     self.assertEqual(expected, observed)
     remove_file(observed_file)
     remove_file(observed_sam)
Exemplo n.º 9
0
def line_count(filepath):
    """
    Count the number of lines in a file

    Parameters
    ---------
    filepath : str
        Path to the file

    Returns
    ---------
    line_count : int
        The number of lines in the file

    Examples
    ---------
    >>> from bioUtilities.files import line_count
    >>> line_count("test_file.txt")
    3
    """

    # `wc -l` prints the count followed by the path; extract the digits
    # NOTE(review): the pattern requires whitespace *before* the digits,
    # which assumes wc pads its output — confirm on the target platform
    wc_output = run_process(["wc", "-l", filepath])
    return int(re.findall("\s+(\d+)", wc_output)[0])
Exemplo n.º 10
0
def mapq_filter(input_file, output_file, lower_limit=None, upper_limit=None):
    """
    Filter a .bam/.sam by MAPQ value

    Parameters
    ---------
    input_file : str
        Path to the file to be filtered
    output_file : str
        Path to the output
    lower_limit : int
        If set, the lower boundary of mapq values, for which all reads have to
        be greater than or equal to
    upper_limit : int
        If set, the upper boundary of mapq values, for which all reads have to
        be less than or equal to

    Examples
    ---------
    >>> from bioUtilities.bam import mapq_filter
    >>> mapq_filter("test.bam", "test_output.bam", lower_limit = 100)
    >>> mapq_filter("test.bam", "test_output.bam", upper_limit = 250)
    >>> mapq_filter("test.bam", "test_output.bam", lower_limit = 100, upper_limit = 250)
    """

    # compare against None so that a legitimate limit of 0 is not
    # treated as "not specified" (the old truthiness checks misrouted
    # explicit-zero limits into the wrong branch)
    if lower_limit is None and upper_limit is None:
        raise Exception(
            "ERROR: You must specify at least one of the lower_limit or upper_limit thresholds."
        )

    samtools_args = ["samtools", "view", "-h"]
    # if both thresholds are specified, we want the reads with values between these
    if lower_limit is not None and upper_limit is not None:
        # create a temp file for the intermediate (lower-filtered) reads
        temp_directory = "temp_mapq_filter.{0}".format(random.random())
        create_directory(temp_directory)
        temp_file = "{0}/{1}".format(temp_directory,
                                     output_file.split("/")[-1])
        # first get everything above the lower limit (-q keeps reads
        # with MAPQ >= threshold)
        temp_args = samtools_args.copy()
        temp_args.extend(["-q", lower_limit, input_file])
        run_process(temp_args, file_for_output=temp_file)
        # now get everything below the upper limit. need to account for
        # samtools removing everything below threshold, so when
        # inversing with -U we add 1 to the limit
        temp_args = samtools_args.copy()
        upper_limit = upper_limit + 1
        temp_args.extend(["-q", upper_limit, temp_file, "-U", output_file])
        run_process(temp_args)
        # cleanup the temp files
        remove_directory(temp_directory)
    # if only a lower limit is specified
    elif lower_limit is not None:
        samtools_args.extend(["-bq", lower_limit, input_file])
        run_process(samtools_args, file_for_output=output_file)
    # if only the upper threshold is specified
    else:
        # account for the -U inverse by adding 1
        upper_limit = upper_limit + 1
        samtools_args.extend(
            ["-q", upper_limit, input_file, "-U", output_file])
        run_process(samtools_args)
Exemplo n.º 11
0
def count_interval_reads(input_file,
                         input_bam,
                         output_file,
                         paired_end=False,
                         min_qual=None,
                         min_length=50):
    """
    For each interval in bed format, count the number of reads in the bam file

    Parameters
    ---------
    input_file : str
        Path to the file containing the intervals (.bed or .saf)
    input_bam : str
        Path to the .bam file containing the reads
    output_file : str
        Path to the output file (.bed or .saf)
    paired_end : bool
        If set, pass -p so featureCounts counts fragments rather than reads
    min_qual : int
        If set, minimum mapping quality (-Q) for a read to be counted
    min_length : int
        If set, minimum read length (-d) for a read to be counted

    Dependencies
    ---------
    featureCounts v1.6.4

    Examples
    ---------
    >>> from bioUtilities.bam import count_interval_reads
    >>> count_interval_reads("exon_junctions.bed", "reads.bam", "exon_junction_reads.bed")
    """

    # check that featureCounts command exists
    if not shutil.which('featureCounts'):
        raise Exception('\nERROR: featureCounts must be installed.\n')

    # featureCounts wants .saf (1-based) intervals, so convert a .bed
    # input into a temporary .saf file
    if get_extension(input_file) == ".bed":
        working_input_file = "{0}.saf".format(input_file[:-4])
        # the original called bed_to_saf(old_input_file, input_file):
        # `old_input_file` was never defined (NameError) and the
        # argument order would have overwritten the input file
        bed_to_saf(input_file, working_input_file)
    else:
        working_input_file = input_file

    # featureCounts emits .saf-style output; use a temporary name when
    # the caller asked for .bed
    if get_extension(output_file) == ".bed":
        working_output_file = "{0}.saf".format(output_file[:-4])
    else:
        working_output_file = output_file

    # now can use featureCounts to count reads
    # this returns the file in 'saf' format
    args = ["featureCounts", "-fO", "-F", "SAF", "-g", "ID"]
    if paired_end:
        args.append("-p")
    if min_qual:
        args.extend(["-Q", min_qual])
    if min_length:
        args.extend(["-d", min_length])
    args.extend(
        ["-a", working_input_file, "-o", working_output_file, input_bam])

    # now run the count
    run_process(args)

    # if the output format is bed, convert the 1-based saf output back
    # to bed, skipping featureCounts' two header lines
    # NOTE(review): both start and end are decremented here; BED ends
    # are usually exclusive, so end may not need the -1 — confirm
    # against bed_to_saf's convention
    if get_extension(output_file) == ".bed":
        entries = read_many_fields(working_output_file)[2:]
        with open(output_file, "w") as outfile:
            for entry in entries:
                output = [
                    entry[1],
                    str(int(entry[2]) - 1),
                    str(int(entry[3]) - 1), entry[0], ".", entry[4]
                ]
                output.extend(entry[5:])
                outfile.write("{0}\n".format("\t".join(output)))

    # now clean up the temporary files; the original removed
    # output_file here, deleting the result it had just written
    if working_input_file != input_file:
        remove_file(working_input_file)
    if working_output_file != output_file:
        remove_file(working_output_file)