Example #1
def main(command_line_args=None):
    """

    :param command_line_args:
    """
    VersionDependencies.python_check()

    if not command_line_args:
        command_line_args = sys.argv

    parser = argparse.ArgumentParser(
        description="A package to process Synthetic Lethal Data.\n {0} v{1}".
        format(__package__, __version__),
        formatter_class=RawTextHelpFormatter)

    parser.add_argument('--options_file',
                        action='store',
                        dest='options_file',
                        required=True,
                        help='File containing program parameters.')

    # Convert universal variables intended as boolean from string to boolean.
    args, options_parser = string_to_boolean(Tool_Box.options_file(parser))

    # Check file names and paths for errors
    error_checking(args)

    log = Tool_Box.Logger(args)
    Tool_Box.log_environment_info(log, args, command_line_args)

    start_time = time.time()
    module_name = "Synthetic_Lethal"

    log.info(
        "{0} v{1}; Module: Synthetic Lethal Analysis v{2} Beginning".format(
            __package__, __version__, Synthetic_Lethal.__version__))

    synthetic_lethal = Synthetic_Lethal.SyntheticLethal(log, args)

    if args.TargetSearch:
        synthetic_lethal.fastq_analysis()
    elif args.Statistics:
        synthetic_lethal.statistics()
    else:
        log.error('No module selected to run.')

    warning = "\033[1;31m **See warnings above**\033[m" if log.warning_occurred else ''
    elapsed_time = int(time.time() - start_time)
    log.info(
        "****Völundr {0} complete ({1} seconds, {2} Mb peak memory).****\n{3}".
        format(module_name, elapsed_time, Tool_Box.peak_memory(), warning))
Example #2
    def seg_count_file(self):
        """
        This function parses the tab delimited SegCopy file into a complex dictionary.
        :return:
        """

        # Tracking dictionary built here because the sample keys are available at this point.
        prior_ploidy = {}
        bin_tracking_dict = Tool_Box.VivifiedDictionary()
        line_num = 0
        seg_copy_array = self.array_builder()

        seg_count = list(csv.reader(open(self.input_file), delimiter='\t'))

        for line in seg_count:
            if line_num > 0:
                bin_tracking_dict[line[0]][line_num] = (line[1], line[2])

            elif line_num == 0:  # First line is the header.
                label_list = line
                for i in range(len(label_list)):
                    if i > 2:
                        prior_ploidy[label_list[i]] = [-1, False, 0, 0, 0]
            line_num += 1

        if not eval(self.chrY):
            with suppress(KeyError):
                bin_tracking_dict.pop("chrY")

        return prior_ploidy, bin_tracking_dict, seg_copy_array
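Note: for readers unfamiliar with the SegCopy format, the following is a minimal standalone sketch of the same parsing logic. It assumes the layout inferred from the indexing above (CHR, START, END, then one column per cell) and uses a plain defaultdict in place of Tool_Box.VivifiedDictionary; the column names are hypothetical.

import csv
from collections import defaultdict

def parse_seg_copy(path):
    """Parse a tab-delimited SegCopy file: header CHR/START/END/<cell columns>, one bin per row."""
    prior_ploidy, bin_tracking = {}, defaultdict(dict)
    with open(path) as handle:
        rows = list(csv.reader(handle, delimiter="\t"))
    for label in rows[0][3:]:                       # sample columns start after CHR, START, END
        prior_ploidy[label] = [-1, False, 0, 0, 0]  # same tracking record as seg_count_file()
    for line_num, row in enumerate(rows[1:], start=1):
        bin_tracking[row[0]][line_num] = (row[1], row[2])  # (start, end) keyed by chromosome and line number
    return prior_ploidy, bin_tracking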
Example #3
def main(command_line_args=None):
    """
    Entry point for FASTQ file preprocessing.
    :param command_line_args: optional list of command line arguments; defaults to sys.argv.
    """
    VersionDependencies.python_check()

    if not command_line_args:
        command_line_args = sys.argv
    run_start = datetime.datetime.today().strftime("%a %b %d %H:%M:%S %Y")
    parser = argparse.ArgumentParser(description="A little ditty to manipulate FASTQ files.\n {0} v{1}"
                                     .format(__package__, __version__), formatter_class=argparse.RawTextHelpFormatter)

    parser.add_argument('--options_file', action='store', dest='options_file', required=True,
                        help='File containing program parameters.')

    options_parser = Tool_Box.options_file(parser)
    args = options_parser.parse_args()
    # args, options_parser = string_to_boolean(args, options_parser)
    options_parser.set_defaults(Trim5=0)
    options_parser.set_defaults(Trim3=0)
    options_parser.set_defaults(Minimum_Length=100)
    options_parser.set_defaults(N_Limit=100)
    options_parser.set_defaults(HaloPLEX=False)
    options_parser.set_defaults(ThruPLEX=False)
    options_parser.set_defaults(FASTQ_PreProcess=True)
    args = options_parser.parse_args()

    # Check options file for errors.
    error_checking(args)

    log = Tool_Box.Logger(args)
    Tool_Box.log_environment_info(log, args, command_line_args)
    start_time = time.time()
    module_name = ""

    # Initialize generator to read each FASTQ file
    fastq1 = FASTQ_Tools.FASTQ_Reader(args.FASTQ1, log)
    fastq2 = FASTQ_Tools.FASTQ_Reader(args.FASTQ2, log)
    index1 = FASTQ_Tools.FASTQ_Reader(args.Index1, log)
    index2 = FASTQ_Tools.FASTQ_Reader(args.Index2, log)

    splitter_data = FASTQ_Tools.FastqSplitter(args, log, fastq1, fastq2, index1, index2, paired_end=True)
    new_fastq1, new_fastq2 = splitter_data.file_writer()

    warning = "\033[1;31m **See warnings above**\033[m" if log.warning_occurred else ''
    elapsed_time = int(time.time() - start_time)
    log.info("****FASTQ Preprocessing {0} complete ({1} seconds, {2} Mb peak memory).****"
             .format(module_name, elapsed_time, Tool_Box.peak_memory(), warning))
Example #4
    def data_processing(self):
        """Tally UMT counts and family sizes for each index and write the results to text files."""
        self._log.info("Begin Family Size and UMT Analysis")
        umt_stats_outstring = "UMT\tCount"
        family_size_outstring = "Family\tCount"

        for index_key in self._family_data:
            count_list = []

            for k, v in sorted(Counter(self._family_data[index_key]).items(),
                               key=lambda x: x[1],
                               reverse=True):
                umt_stats_outstring += "\n{0}\t{1}".format(k, v)
                count_list.append(str(v))

            c = dict(Counter(count_list))
            for k in natsort.natsorted(c):
                family_size_outstring += "\n{0}\t{1}".format(k, c[k])

        stats_filename = "{0}{1}_UMT_Stats.txt".format(
            self._args.Working_Folder, self._data_source)
        size_filename = "{0}{1}_Family_Size.txt".format(
            self._args.Working_Folder, self._data_source)

        # Deleting the files if they exist prevents an intermittent "text file busy" OSError seen when
        # running under VirtualBox on Windows.
        Tool_Box.delete([stats_filename, size_filename])

        umt_stats_file = open(stats_filename, 'w')

        family_size_file = open(size_filename, "w")

        umt_stats_file.write(umt_stats_outstring)
        family_size_file.write(family_size_outstring)

        umt_stats_file.close()
        family_size_file.close()

        self._log.info(
            "{0} {1} UMT Family Size and Stats Files Written".format(
                self._args.Job_Name, self._data_source))
Example #5
def string_to_boolean(parser):
    """
    Converts strings to boolean.  Done to keep the eval() function out of the code.
    :param parser:
    :return:
    """

    options_parser = Tool_Box.options_file(parser)
    args = options_parser.parse_args()
    options_parser.set_defaults(PairedEnd=bool(strtobool(args.PairedEnd)))
    options_parser.set_defaults(Build_PhiX_DataFrame=bool(strtobool(args.Build_PhiX_DataFrame)))

    args = options_parser.parse_args()

    return args, options_parser
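Note: the pattern used by string_to_boolean appears throughout these examples: parse the options file once, override the parser defaults with real booleans via strtobool, then re-parse. A minimal self-contained sketch (the option name is hypothetical; note that distutils, and with it strtobool, is removed in Python 3.12):

import argparse
from distutils.util import strtobool  # returns 1/0, so wrap in bool()

parser = argparse.ArgumentParser()
parser.add_argument("--PairedEnd", default="True")  # string value, as read from an options file
args = parser.parse_args([])
parser.set_defaults(PairedEnd=bool(strtobool(args.PairedEnd)))
args = parser.parse_args([])
print(args.PairedEnd)  # True (a bool), not the string "True"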
Example #6
def main():
    """

    """
    VersionDependencies.python_check()

    parser = argparse.ArgumentParser(
        description="A package to process Synthetic Lethal Data.\n {0} v{1}".
        format(__package__, __version__),
        formatter_class=RawTextHelpFormatter)

    parser.add_argument('--options_file',
                        action='store',
                        dest='options_file',
                        required=True,
                        help='File containing program parameters.')

    # Convert strings to int, float, boolean, check file names and paths for errors
    args, log = error_checking(parser)
    start_time = time.time()

    # Initialize program
    synthetic_lethal = Synthetic_Lethal.SyntheticLethal(log, args)

    if args.TargetSearch:
        module_name = "Target Search"
        log.info("{} v{}; Module: {} v{} Beginning".format(
            __package__, __version__, module_name,
            Synthetic_Lethal.__version__))

        synthetic_lethal.fastq_analysis()

    elif args.Statistics:
        module_name = "Statistical Analysis"
        log.info("{} v{}; Module: {} v{} Beginning".format(
            __package__, __version__, module_name,
            Synthetic_Lethal.__version__))
        synthetic_lethal.statistics()

    else:
        module_name = "No module selected"
        log.error('No module selected to run.')

    warning = "\033[1;31m **See warnings above**\033[m" if log.warning_occurred else ''
    elapsed_time = int(time.time() - start_time)
    log.info(
        "****Völundr {0} complete ({1} seconds, {2} Mb peak memory).****\n{3}".
        format(module_name, elapsed_time, Tool_Box.peak_memory(), warning))
Example #7
def string_to_boolean(parser):
    """
    Converts strings to boolean.  Done to keep the eval() function out of the code.
    :param parser:
    :return:
    """
    options_parser = Tool_Box.options_file(parser)
    args = options_parser.parse_args()

    if args.IndelProcessing == "True":
        # Tool_Box.debug_messenger("Pear set to FALSE.")
        options_parser.set_defaults(PEAR=True)
        options_parser.set_defaults(
            Demultiplex=bool(strtobool(args.Demultiplex)))
        options_parser.set_defaults(
            OutputRawData=bool(strtobool(args.OutputRawData)))
        options_parser.set_defaults(
            DeleteConsensusFASTQ=bool(strtobool(args.DeleteConsensusFASTQ)))

    options_parser.set_defaults(
        IndelProcessing=bool(strtobool(args.IndelProcessing)))
    options_parser.set_defaults(Verbose=args.Verbose.upper())

    return options_parser.parse_args()
Example #8
    def file_writer(self):
        """
        Process FASTQ file(s) and write new version(s) suitable for aligners.  Return the new file names.
        :return:
        """

        self.log.info("Begin writing temporary FASTQ files.")
        current_read_count = 0
        file1 = "{0}{1}_R1_processed.fastq.gz".format(self.args.WorkingFolder,
                                                      self.args.Job_Name)
        file2 = "{0}{1}_R2_processed.fastq.gz".format(self.args.WorkingFolder,
                                                      self.args.Job_Name)
        temp_file1 = Writer(self.log, file1)
        temp_file2 = Writer(self.log, file2)
        self.log.info("Writing {0} and {1}".format(file1, file2))
        fastq1_list = []
        fastq2_list = []
        eof = False
        Read = collections.namedtuple('Read', 'name, seq, index, qual')

        # This generator returns objects not lines.
        while not eof:
            try:
                fastq1_read = next(self.fastq1_file.seq_read())
                fastq2_read = next(self.fastq2_file.seq_read())
                if self.index1_file is not None:
                    index1_read = next(self.index1_file.seq_read())
                if self.index2_file is not None:
                    index2_read = next(self.index2_file.seq_read())
            except StopIteration:
                eof = True
                continue

            current_read_count += 1

            # Apply Filters
            trim_5 = int(self.args.Trim5)
            trim_3 = int(self.args.Trim3)
            min_length = int(self.args.Minimum_Length) + trim_5 + trim_3

            # Filter reads based on length and number of N's.
            if (len(fastq1_read.seq) < min_length
                    or len(fastq2_read.seq) < min_length
                    or fastq1_read.seq.count("N") / len(fastq1_read.seq) >=
                    float(self.args.N_Limit)
                    or fastq2_read.seq.count("N") / len(fastq2_read.seq) >=
                    float(self.args.N_Limit)):
                continue

            # Add the UMT's to the header.
            if self.args.HaloPLEX:
                header1 = "{0}|{0}:{1}".format(index1_read.seq,
                                               fastq1_read.name)
                header2 = "{0}|{0}:{1}".format(index1_read.seq,
                                               fastq2_read.name)

                # Fixme: This needs to be exposed to the user.
                # Short HaloPLEX reads have issues; reads <= 100 bases consistently show a 1-2 base error at the 3' end.
                if len(fastq1_read.seq) <= 100:
                    read_trim(fastq1_read, trim5=0, trim3=3)
                if len(fastq2_read.seq) <= 100:
                    read_trim(fastq2_read, trim5=0, trim3=3)

            elif self.args.ThruPLEX:
                # header1 = "{0}|{1}".format(fastq1_read.name.split(":")[-1], fastq1_read.name)
                umt1 = fastq1_read.seq[:6]
                umt2 = fastq2_read.seq[:6]
                header1 = "{0}|{1}:{2}".format(umt1, umt2, fastq1_read.name)
                header2 = "{0}|{1}:{2}".format(umt1, umt2, fastq2_read.name)
                read_trim(fastq1_read, trim5=len(umt1), trim3=0)
                read_trim(fastq2_read, trim5=len(umt2), trim3=0)

            elif self.args.FASTQ_PreProcess:
                # The indices are after the last ":" in the header.
                header1 = "{}:{}+{}".format(fastq1_read.name, index1_read.seq,
                                            index2_read.seq)
                header2 = "{}:{}+{}".format(fastq2_read.name, index1_read.seq,
                                            index2_read.seq)

            else:
                self.log.error("Only HaloPLEX or ThruPLEX currently enabled.")
                raise SystemExit(1)

            # Trim sequences from ends if needed.
            if trim_5 > 0 or trim_3 > 0:
                read_trim(fastq1_read, trim_5, trim_3)
                read_trim(fastq2_read, trim_5, trim_3)

            fastq1_read.name = header1
            fastq2_read.name = header2

            fastq1_list.append(
                Read(fastq1_read.name, fastq1_read.seq, fastq1_read.index,
                     fastq1_read.qual))
            fastq2_list.append(
                Read(fastq2_read.name, fastq2_read.seq, fastq2_read.index,
                     fastq2_read.qual))

            # Block size determined empirically for the UNC Longleaf cluster; may need to be exposed to the user.
            # Writing reads to disk in blocks speeds up the entire process.
            if current_read_count % 1000000 == 0:
                temp_file1.write(fastq1_list)
                temp_file2.write(fastq2_list)
                fastq1_list.clear()
                fastq2_list.clear()

        # Cleans up any writes still needed and closes files
        if fastq1_list:
            temp_file1.write(fastq1_list)
            fastq1_list.clear()
        if fastq2_list:
            temp_file2.write(fastq2_list)
            fastq2_list.clear()
        if temp_file1:
            temp_file1.close()
        if temp_file2:
            temp_file2.close()

        self.log.info("Modified FASTQ file(s) written")
        if self.args.FASTQ_PreProcess:
            Tool_Box.compress_files(file1, self.log)
            Tool_Box.compress_files(file2, self.log)
        return file1, file2
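Note: read_trim is not shown in these excerpts. Based on how Example #16 trims by slicing seq and qual, a plausible helper might look like the sketch below; it is hypothetical, not the project's actual implementation, and assumes a mutable read object with seq and qual attributes.

def read_trim(read, trim5=0, trim3=0):
    """Trim trim5 bases from the 5' end and trim3 bases from the 3' end of a read, in place (hypothetical)."""
    end = len(read.seq) - trim3 if trim3 else None
    read.seq = read.seq[trim5:end]
    read.qual = read.qual[trim5:end]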
Example #9
def main(command_line_args=None):
    """

    :param command_line_args:
    """
    VersionDependencies.python_check()

    if not command_line_args:
        command_line_args = sys.argv

    parser = argparse.ArgumentParser(
        description="A package to process Synthetic Lethal Data.\n {0} v{1}".
        format(__package__, __version__),
        formatter_class=RawTextHelpFormatter)

    parser.add_argument('--options_file',
                        action='store',
                        dest='options_file',
                        required=True,
                        help='File containing program parameters.')

    options_parser = Tool_Box.options_file(parser)
    args = options_parser.parse_args()

    # If we are doing statistical analysis the user will not input an Index_Mismatch value
    if not getattr(args, "Index_Mismatch", False):
        options_parser.add_argument("--Index_Mismatch",
                                    dest="Index_Mismatch",
                                    default=0)
        options_parser.add_argument("--Analyze_Unknowns",
                                    dest="Analyze_Unknowns",
                                    default="False")
        args = options_parser.parse_args()

    log = Tool_Box.Logger(args)
    Tool_Box.log_environment_info(log, args, command_line_args)
    start_time = time.time()
    module_name = "Synthetic_Lethal"

    log.info(
        "{0} v{1}; Module: Synthetic Lethal Analysis v{2} Beginning".format(
            __package__, __version__, Synthetic_Lethal.__version__))

    # Convert universal variables intended as boolean from string to boolean.
    if args.Target_Search == "True":
        options_parser.set_defaults(Target_Search=True)
        options_parser.set_defaults(RevComp=args.RevComp == "True")
        options_parser.set_defaults(
            Delete_Demultiplexed_FASTQ=args.Delete_Demultiplexed_FASTQ == "True")
        options_parser.set_defaults(compress=args.compress == "True")
    else:
        options_parser.set_defaults(Target_Search=False)

    options_parser.set_defaults(Statistics=args.Statistics == "True")
    options_parser.set_defaults(Analyze_Unknowns=args.Analyze_Unknowns == "True")

    args = options_parser.parse_args()

    synthetic_lethal = Synthetic_Lethal.SyntheticLethal(log, args)

    if args.Target_Search:
        synthetic_lethal.fastq_analysis()
    elif args.Statistics:
        synthetic_lethal.statistics()
    else:
        log.error('No module selected to run.')

    warning = "\033[1;31m **See warnings above**\033[m" if log.warning_occurred else ''
    elapsed_time = int(time.time() - start_time)
    log.info(
        "****Volundr {0} complete ({1} seconds, {2} Mb peak memory).****\n{3}".
        format(module_name, elapsed_time, Tool_Box.peak_memory(), warning))
Example #10
def pear_consensus(args, log):
    """
    This will take the input FASTQ files and use PEAR to generate a consensus file.
    :param args:
    :param log:
    :return:
    """
    log.info("Beginning PEAR Consensus")

    fastq_consensus_prefix = "{}{}".format(args.WorkingFolder, args.Job_Name)
    fastq_consensus_file = "{}.assembled.fastq".format(fastq_consensus_prefix)
    discarded_fastq = "{}.discarded.fastq".format(fastq_consensus_prefix)
    r1_unassembled = "{}.unassembled.forward.fastq".format(
        fastq_consensus_prefix)
    r2_unassembled = "{}.unassembled.reverse.fastq".format(
        fastq_consensus_prefix)

    y = "-y {} ".format(args.Memory)
    j = "-j {} ".format(int(args.Spawn) - 1)

    p_value = ''
    if args.PValue:
        p_value = "-p {} ".format(args.PValue)
    min_overlap = ''
    if args.MinOverlap:
        min_overlap = "-v {} ".format(args.MinOverlap)
    quality_threshold = ""
    if args.QualityThreshold:
        quality_threshold = "-q {} ".format(args.QualityThreshold)
    phred_value = ""
    if args.PhredValue:
        phred_value = "-b {} ".format(args.PhredValue)
    test_method = ""
    if args.TestMethod:
        test_method = "-g {}".format(args.TestMethod)
    n = ""
    if args.MinConsensusLength:
        n = "-n {} ".format(args.MinConsensusLength)

    proc = subprocess.run(
        "{}{}Pear{}bin{}./pear -f {} -r {} -o {} {}{}{}{}{}{}{}".format(
            pathlib.Path(__file__).parent.absolute(), os.sep, os.sep, os.sep,
            args.FASTQ1, args.FASTQ2, fastq_consensus_prefix, y, j, n, p_value,
            min_overlap, quality_threshold, phred_value, test_method),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        shell=True)

    if proc.stderr:
        log.error("{}\n{}\n".format(proc.stderr.decode(),
                                    proc.stdout.decode()))
        return
    else:
        log.info(
            "Begin PEAR Output\n"
            "----------------------------------------------------------------------------------------------------------\n{}"
            "\n----------------------------------------------------------------------------------------------------------\n"
            .format(proc.stdout.decode()))

    file_list = [fastq_consensus_file, r1_unassembled, r2_unassembled]

    if os.stat(discarded_fastq).st_size > 0:
        file_list.append(discarded_fastq)
    else:
        Tool_Box.delete([discarded_fastq])

    return file_list
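Note: the subprocess call above assembles a single PEAR command line; with typical option values (hypothetical here) it resembles the following, where -f/-r are the forward/reverse FASTQ files, -o the output prefix, -y memory, -j threads, -n minimum assembled length, -p p-value, -v minimum overlap, -q quality threshold, -b Phred base, and -g the statistical test:

/path/to/package/Pear/bin/./pear -f R1.fastq.gz -r R2.fastq.gz -o /work/JobName -y 4G -j 7 -n 50 -p 0.01 -v 10 -q 20 -b 33 -g 1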
Example #11
    def consensus_demultiplex(self):
        """
        Takes a FASTQ file of consensus reads and identifies each by index.  Handles writing demultiplexed FASTQ if
        user desired.
        """
        self.log.info("Consensus Index Search")
        eof = False
        start_time = time.time()
        split_time = time.time()
        fastq_file_name_list = []
        fastq_data_dict = collections.defaultdict(lambda: collections.defaultdict(list))
        indexed_read_count = 0
        key_counts = []
        while not eof:
            # Debugging Code Block
            if self.args.Verbose == "DEBUG":
                read_limit = 1000000
                if self.read_count > read_limit:
                    if self.args.Demultiplex:
                        for index_name in fastq_data_dict:
                            r1_data = fastq_data_dict[index_name]["R1"]
                            r1, r2 = self.fastq_outfile_dict[index_name]
                            r1.write(r1_data)
                            r1.close()
                            if not self.args.PEAR:
                                r2_data = fastq_data_dict[index_name]["R2"]
                                r2.write(r2_data)
                                r2.close()

                    Tool_Box.debug_messenger("Limiting Reads Here to {}".format(read_limit))
                    eof = True
            fastq2_read = None
            try:
                fastq1_read = next(self.fastq1.seq_read())
                if not self.args.PEAR:
                    fastq2_read = next(self.fastq2.seq_read())

            except StopIteration:
                if self.args.Demultiplex:
                    for index_name in fastq_data_dict:
                        r1_data = fastq_data_dict[index_name]["R1"]
                        r1, r2 = self.fastq_outfile_dict[index_name]
                        r1.write(r1_data)
                        r1.close()
                        if not self.args.PEAR:
                            r2_data = fastq_data_dict[index_name]["R2"]
                            r2.write(r2_data)
                            r2.close()

                eof = True
                continue

            self.read_count += 1
            if self.read_count % 100000 == 0:
                elapsed_time = int(time.time() - start_time)
                block_time = int(time.time() - split_time)
                split_time = time.time()
                self.log.info("Processed {} reads in {} seconds.  Total elapsed time: {} seconds."
                              .format(self.read_count, block_time, elapsed_time))

            # Match read with library index.
            match_found, left_seq, right_seq, index_name, fastq1_read, fastq2_read = \
                self.index_matching(fastq1_read, fastq2_read)

            if match_found:
                indexed_read_count += 1
                locus = self.index_dict[index_name][7]
                phase_key = "{}+{}".format(index_name, locus)
                r2_found = False
                r1_found = False
                if self.args.Platform == "Illumina":
                    # Score the phasing and place the reads in a dictionary.
                    for r2_phase, r1_phase in zip(self.phase_dict[locus]["R2"], self.phase_dict[locus]["R1"]):

                        r2_phase_name = r2_phase[1]
                        r1_phase_name = r1_phase[1]

                        # Tag reads that should not have any phasing.
                        if not r1_phase[0]:
                            self.phase_count[phase_key]["Phase " + r1_phase_name] = -1
                            self.phase_count[phase_key]["Phase " + r2_phase_name] = -1
                            continue
                        else:
                            self.phase_count[phase_key]["Phase " + r1_phase_name] += 0
                            self.phase_count[phase_key]["Phase " + r2_phase_name] += 0

                        # The phasing is the last N nucleotides of the consensus.
                        if r2_phase[0] == Sequence_Magic.rcomp(fastq1_read.seq[-len(r2_phase[0]):]) and not r2_found:
                            self.phase_count[phase_key]["Phase "+r2_phase_name] += 1
                            r2_found = True

                        if r1_phase[0] == fastq1_read.seq[:len(r1_phase[0])] and not r1_found:
                            self.phase_count[phase_key]["Phase "+r1_phase_name] += 1
                            r1_found = True

                    # if no phasing is found then note that.
                    if not r2_found:
                        self.phase_count[phase_key]["No Read 2 Phasing"] += 1
                    if not r1_found:
                        self.phase_count[phase_key]["No Read 1 Phasing"] += 1

                    # The adapters on Gupta Lab AAVS1.1 are reversed, causing the reads to be reversed.
                    # No special handling is currently applied for AAVS1.1; the sequence is appended as-is.
                    self.sequence_dict[index_name].append(fastq1_read.seq)

                elif self.args.Platform == "TruSeq":
                    self.sequence_dict[index_name].append(right_seq)

                elif self.args.Platform == "Ramsden":
                    self.sequence_dict[index_name].append(Sequence_Magic.rcomp(fastq1_read.seq))

                else:
                    self.log.error("--Platform {} not correctly defined.  Edit parameter file and try again"
                                   .format(self.args.Platform))
                    raise SystemExit(1)

                if self.args.Demultiplex:
                    fastq_data_dict[index_name]["R1"].append([fastq1_read.name, fastq1_read.seq, fastq1_read.qual])
                    if not self.args.PEAR:
                        fastq_data_dict[index_name]["R2"].append([fastq2_read.name, fastq2_read.seq, fastq2_read.qual])

                    fastq_file_name_list.append("{}{}_{}_Consensus.fastq"
                                                .format(self.args.WorkingFolder, self.args.Job_Name, index_name))

            elif self.args.Demultiplex and not match_found:
                fastq_data_dict['Unknown']["R1"].append([fastq1_read.name, fastq1_read.seq, fastq1_read.qual])
                fastq_data_dict['Unknown']["R2"].append([fastq1_read.name, fastq1_read.seq, fastq1_read.qual])

                fastq_file_name_list.append("{}{}_Unknown_Consensus.fastq"
                                            .format(self.args.WorkingFolder, self.args.Job_Name))

        if self.args.Demultiplex:
            self.fastq_compress(list(set(fastq_file_name_list)))

        for key in self.sequence_dict:
            key_counts.append(len(self.sequence_dict[key]))

        # The lower limit is used when plotting the data.  Generally the lowest values are just noise.
        if len(key_counts) == 0:
            self.log.error("No Scar Patterns Found")
            raise SystemExit(1)
        lower, upper_limit = stats.norm.interval(0.9, loc=statistics.mean(key_counts), scale=stats.sem(key_counts))
        lower_limit = statistics.mean(key_counts)-lower

        return indexed_read_count, lower_limit
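Note: the final block computes lower_limit as the distance from the mean to the lower bound of a 90% normal interval (roughly 1.64 × SEM), not the bound itself. The same calculation in isolation, with hypothetical per-index read counts:

import statistics
from scipy import stats

counts = [120, 135, 98, 150, 110]                        # hypothetical per-index read counts
mean, sem = statistics.mean(counts), stats.sem(counts)
lower, upper = stats.norm.interval(0.9, loc=mean, scale=sem)
margin = mean - lower                                    # what consensus_demultiplex returns as lower_limit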
Example #12
    def string_conversions(parser):
        """
        Convert True/False statements in parameter file to boolean
        :param parser:
        :return:
        """
        options_parser = Tool_Box.options_file(parser)
        initial_args = options_parser.parse_args()

        options_parser.set_defaults(
            TargetSearch=bool(strtobool(initial_args.TargetSearch)))
        options_parser.set_defaults(
            Statistics=bool(strtobool(initial_args.Statistics)))

        options_parser.set_defaults(Verbose=initial_args.Verbose.upper())

        if initial_args.Statistics == "False":
            options_parser.set_defaults(
                AnchorSeq=initial_args.AnchorSeq.upper())
            options_parser.set_defaults(Analyze_Unknowns=bool(
                strtobool(initial_args.Analyze_Unknowns)))
            options_parser.set_defaults(Delete_Demultiplexed_FASTQ=bool(
                strtobool(initial_args.Delete_Demultiplexed_FASTQ)))
            options_parser.set_defaults(
                RevComp=bool(strtobool(initial_args.RevComp)))
            options_parser.set_defaults(BatchSize=int(initial_args.BatchSize))
            options_parser.set_defaults(
                Target_Mismatch=int(initial_args.Target_Mismatch))
            options_parser.set_defaults(
                MinimumReadLength=int(initial_args.MinimumReadLength))
            options_parser.set_defaults(N_Limit=10)
            options_parser.set_defaults(
                Target_Length=int(initial_args.Target_Length))
            options_parser.set_defaults(
                Target_Start=int(initial_args.Target_Start))
            # options_parser.set_defaults(Index_Mismatch=int(initial_args.Index_Mismatch))
            options_parser.set_defaults(Spawn=int(initial_args.Spawn))
            options_parser.set_defaults(
                Target_Padding=int(initial_args.Target_Padding))
            options_parser.set_defaults(
                Expected_Position=int(initial_args.Expected_Position))
            options_parser.set_defaults(
                AnchorMismatch=int(initial_args.AnchorMismatch))
            options_parser.set_defaults(
                AnchorStart=int(initial_args.AnchorStart))
            options_parser.set_defaults(
                AnchorStop=int(initial_args.AnchorStop))
        else:
            options_parser.set_defaults(
                Write_TDnorm_Log2_sgRNA_Control_File=bool(
                    strtobool(
                        initial_args.Write_TDnorm_Log2_sgRNA_Control_File)))
            options_parser.set_defaults(
                Write_TDnorm_Log2_sgRNA_Sample_File=bool(
                    strtobool(
                        initial_args.Write_TDnorm_Log2_sgRNA_Sample_File)))
            options_parser.set_defaults(Write_Log2_sgRNA_File=bool(
                strtobool(initial_args.Write_Log2_sgRNA_File)))
            options_parser.set_defaults(Write_Permuted_Log2_Data_File=bool(
                strtobool(initial_args.Write_Permuted_Log2_Data_File)))
            options_parser.set_defaults(Bad_sgRNA_Lower_Percentile=float(
                initial_args.Bad_sgRNA_Lower_Percentile))
            options_parser.set_defaults(Bad_sgRNA_Upper_Percentile=float(
                initial_args.Bad_sgRNA_Upper_Percentile))
            options_parser.set_defaults(
                UpperPercentile=float(initial_args.UpperPercentile))
            options_parser.set_defaults(
                LowerPercentile=float(initial_args.LowerPercentile))
            options_parser.set_defaults(
                PermutationCount=int(initial_args.PermutationCount))
            options_parser.set_defaults(Alpha=float(initial_args.Alpha))
            options_parser.set_defaults(
                Target_Mismatch=float(initial_args.Target_Mismatch))
            options_parser.set_defaults(
                UpperGuideLimit=float(initial_args.UpperGuideLimit))
            options_parser.set_defaults(
                LowerGuideLimit=float(initial_args.LowerGuideLimit))

        initial_args = options_parser.parse_args()

        return initial_args
Example #13
def error_checking(parser):
    """
    Check parameter file for errors and return parser object.
    :param parser:
    :return:
    """
    def string_conversions(parser):
        """
        Convert True/False statements in parameter file to boolean
        :param parser:
        :return:
        """
        options_parser = Tool_Box.options_file(parser)
        initial_args = options_parser.parse_args()

        options_parser.set_defaults(
            TargetSearch=bool(strtobool(initial_args.TargetSearch)))
        options_parser.set_defaults(
            Statistics=bool(strtobool(initial_args.Statistics)))

        options_parser.set_defaults(Verbose=initial_args.Verbose.upper())

        if initial_args.Statistics == "False":
            options_parser.set_defaults(
                AnchorSeq=initial_args.AnchorSeq.upper())
            options_parser.set_defaults(Analyze_Unknowns=bool(
                strtobool(initial_args.Analyze_Unknowns)))
            options_parser.set_defaults(Delete_Demultiplexed_FASTQ=bool(
                strtobool(initial_args.Delete_Demultiplexed_FASTQ)))
            options_parser.set_defaults(
                RevComp=bool(strtobool(initial_args.RevComp)))
            options_parser.set_defaults(BatchSize=int(initial_args.BatchSize))
            options_parser.set_defaults(
                Target_Mismatch=int(initial_args.Target_Mismatch))
            options_parser.set_defaults(
                MinimumReadLength=int(initial_args.MinimumReadLength))
            options_parser.set_defaults(N_Limit=10)
            options_parser.set_defaults(
                Target_Length=int(initial_args.Target_Length))
            options_parser.set_defaults(
                Target_Start=int(initial_args.Target_Start))
            # options_parser.set_defaults(Index_Mismatch=int(initial_args.Index_Mismatch))
            options_parser.set_defaults(Spawn=int(initial_args.Spawn))
            options_parser.set_defaults(
                Target_Padding=int(initial_args.Target_Padding))
            options_parser.set_defaults(
                Expected_Position=int(initial_args.Expected_Position))
            options_parser.set_defaults(
                AnchorMismatch=int(initial_args.AnchorMismatch))
            options_parser.set_defaults(
                AnchorStart=int(initial_args.AnchorStart))
            options_parser.set_defaults(
                AnchorStop=int(initial_args.AnchorStop))
        else:
            options_parser.set_defaults(
                Write_TDnorm_Log2_sgRNA_Control_File=bool(
                    strtobool(
                        initial_args.Write_TDnorm_Log2_sgRNA_Control_File)))
            options_parser.set_defaults(
                Write_TDnorm_Log2_sgRNA_Sample_File=bool(
                    strtobool(
                        initial_args.Write_TDnorm_Log2_sgRNA_Sample_File)))
            options_parser.set_defaults(Write_Log2_sgRNA_File=bool(
                strtobool(initial_args.Write_Log2_sgRNA_File)))
            options_parser.set_defaults(Write_Permuted_Log2_Data_File=bool(
                strtobool(initial_args.Write_Permuted_Log2_Data_File)))
            options_parser.set_defaults(Bad_sgRNA_Lower_Percentile=float(
                initial_args.Bad_sgRNA_Lower_Percentile))
            options_parser.set_defaults(Bad_sgRNA_Upper_Percentile=float(
                initial_args.Bad_sgRNA_Upper_Percentile))
            options_parser.set_defaults(
                UpperPercentile=float(initial_args.UpperPercentile))
            options_parser.set_defaults(
                LowerPercentile=float(initial_args.LowerPercentile))
            options_parser.set_defaults(
                PermutationCount=int(initial_args.PermutationCount))
            options_parser.set_defaults(Alpha=float(initial_args.Alpha))
            options_parser.set_defaults(
                Target_Mismatch=float(initial_args.Target_Mismatch))
            options_parser.set_defaults(
                UpperGuideLimit=float(initial_args.UpperGuideLimit))
            options_parser.set_defaults(
                LowerGuideLimit=float(initial_args.LowerGuideLimit))

        initial_args = options_parser.parse_args()

        return initial_args

    args = string_conversions(parser)
    log = Tool_Box.Logger(args)
    Tool_Box.log_environment_info(log, args, sys.argv)

    if not pathlib.Path(args.WorkingFolder).exists():
        print(
            "\033[1;31mERROR:\n\tWorking Folder Path: {} Not Found.  Check Parameter File."
            .format(args.WorkingFolder))
        raise SystemExit(1)

    if args.Statistics:
        if not pathlib.Path(args.DataFiles).exists():
            print(
                "\033[1;31mERROR:\n\t--DataFiles Folder Path: {} Not Found.  Check Parameter File."
                .format(args.DataFiles))
            raise SystemExit(1)

    if not pathlib.Path(args.SampleManifest).exists():
        print(
            "\033[1;31mERROR:\n\t--SampleManifest: {} Not Found.  Check Parameter File."
            .format(args.SampleManifest))
        raise SystemExit(1)

    if not pathlib.Path(args.Master_Index_File).exists():
        print(
            "\033[1;31mERROR:\n\t--Master_Index_File: {} Not Found.  Check Parameter File."
            .format(args.Master_Index_File))
        raise SystemExit(1)

    if not pathlib.Path(args.Target_File).exists():
        print(
            "\033[1;31mERROR:\n\t--Target_File: {} Not Found.  Check Parameter File."
            .format(args.Target_File))
        raise SystemExit(1)

    if args.TargetSearch:
        if getattr(args, "FASTQ1",
                   False) and not pathlib.Path(args.FASTQ1).exists():
            print(
                "\033[1;31mERROR:\n\t--FASTQ1: {} Not Found.  Check Parameter File."
                .format(args.FASTQ1))
            raise SystemExit(1)

        try:
            mime_type1 = magic.from_file(args.FASTQ1, mime=True).decode()

        except AttributeError:
            mime_type1 = magic.from_file(args.FASTQ1, mime=True)

        if "text" in mime_type1 or "gzip" in mime_type1:
            pass
        else:
            log.error(
                "Unsupported FASTQ file-type.  Only TEXT or GZIP Allowed.")
            raise SystemExit(1)

    return args, log
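Note: the try/except around magic.from_file exists because, depending on the installed libmagic binding, the call may return str or bytes. A minimal sketch of the same gzip/text gate (the file path is hypothetical):

import magic

mime = magic.from_file("sample_R1.fastq.gz", mime=True)
if isinstance(mime, bytes):      # some bindings return bytes rather than str
    mime = mime.decode()
if not ("text" in mime or "gzip" in mime):
    raise SystemExit("Unsupported FASTQ file-type.  Only TEXT or GZIP Allowed.")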
Example #14
            plot_data_dict[data_pair[6]][5].append(data_pair[4])

            plot_data_dict[data_pair[6]][6].append(data_pair[5])
            plot_data_dict[data_pair[6]][7].append(color_dict[data_pair[6]])
            count = len(plot_data_dict[data_pair[6]][0])

            if count > 1:
                previous = plot_data_dict[data_pair[6]][0][count - 2]
                plot_data_dict[data_pair[6]][8]\
                    .append(plot_data_dict[data_pair[6]][8][count - 2] + 0.0007 + (0.5*previous) + data_pair[0] * 0.5)

    return plot_data_dict


# This is here to run the module as a stand-alone.
if __name__ == '__main__':
    ToolBox.debug_messenger("Standing Alone")

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('--options_file',
                        action='store',
                        dest='options_file',
                        required=True,
                        help='File containing program parameters.')

    options_parser = ToolBox.options_file(parser)
    args = options_parser.parse_args()

    scarmapperplot(args)
Example #15
    def quality_check(data_bundle, fastq_files):
        """
        Called by the multiprocessing pool.  Examines the read indices and quantifies anchor mismatches and UMT counts.

        :param data_bundle:
        :param fastq_files:
        :return:
        """

        prog_check = data_bundle[0]
        index_list = data_bundle[1]
        file1_anchor_seq = data_bundle[2]
        file2_anchor_seq = data_bundle[3]
        fastq1 = FASTQ_Reader(fastq_files[0])
        fastq2 = FASTQ_Reader(fastq_files[1])

        umt_dict = collections.defaultdict(
            lambda: collections.defaultdict(int))
        anchor_dict = Tool_Box.VivifiedDictionary()
        read_count = 0

        try:
            while True:
                fastq1_read = next(fastq1.seq_read())
                fastq2_read = next(fastq2.seq_read())
                read_count += 1

                if read_count % int(prog_check) == 0:
                    print("      -->Processed {0} reads in file {1} and {2}.".
                          format(read_count, fastq_files[0], fastq_files[1]))

                # Get read index and UMT.
                umt = "{0}{1}".format(
                    fastq1_read.name.split("|")[0],
                    fastq2_read.name.split("|")[1].split(":")[0])
                read_index = fastq1_read.name.split(":")[-1]

                # Quantify anchor lengths.
                unknown_anchor1 = fastq1_read.seq[7:18]
                unknown_anchor2 = fastq2_read.seq[7:18]
                match1 = Levenshtein.distance(file1_anchor_seq,
                                              unknown_anchor1)
                match2 = Levenshtein.distance(file2_anchor_seq,
                                              unknown_anchor2)

                for index in index_list:
                    index_match = Levenshtein.distance(read_index,
                                                       index[0][:6])

                    # Add anchor and UMT data to dictionaries.
                    if index[0] in anchor_dict and index_match < 2:
                        anchor_dict[index[0]]["R1"][match1] += 1
                        anchor_dict[index[0]]["R2"][match2] += 1
                        umt_dict[index[0]][umt] += 1
                        # if umt in umt_dict[index[0]]:
                        #     umt_dict[index[0]][umt] += 1
                        # else:
                        #     umt_dict[index[0]][umt] = 1

                    elif index_match < 2:
                        anchor_dict[
                            index[0]]["R1"] = [0] * len(file1_anchor_seq)
                        anchor_dict[
                            index[0]]["R2"] = [0] * len(file2_anchor_seq)
                        anchor_dict[index[0]]["R1"][match1] += 1
                        anchor_dict[index[0]]["R2"][match2] += 1
                        umt_dict[index[0]][umt] = 1
        except StopIteration:
            return anchor_dict, umt_dict
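Note: index assignment above tolerates at most one mismatch between the read index and the first six bases of each library index. The same test in isolation, using python-Levenshtein (sequences are hypothetical):

import Levenshtein

read_index = "ACGTGA"       # 6 nt index parsed from the read header
library_index = "ACGTGTNN"  # library entry; only its first 6 nt are compared
if Levenshtein.distance(read_index, library_index[:6]) < 2:
    print("read assigned to", library_index)  # zero or one mismatch allowed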
Example #16
    def temp_file_writer(self, limit):
        """
        Write the temporary FASTQ files.  Also create list of temporary BAM file names for use later.
        :return:
        """

        self.log.info("Begin writing temporary FASTQ files.")
        i = 0
        temp_file1 = None
        temp_file2 = None
        fastq_file_list = []
        bam_file_list = []
        read_count = 0
        limit_counter = 0

        while read_count <= self.read_count:
            try:
                # This generator is returning actual reads not lines.
                fastq1_read = next(self.fastq1_file.seq_read())
                fastq2_read = next(self.fastq2_file.seq_read())
                if self.index1_file is not None:
                    fastq3_read = next(self.index1_file.seq_read())
            except StopIteration:
                read_count += 1
                continue

            read_count += 1

            try:
                fastq1_n_frac = fastq1_read.seq.count("N") / len(
                    fastq1_read.seq)
                fastq2_n_frac = fastq2_read.seq.count("N") / len(
                    fastq2_read.seq)
            except ZeroDivisionError:
                continue

            # Apply Filters
            if (len(fastq1_read.seq) < int(self.args.Minimum_Length)
                    or len(fastq2_read.seq) < int(self.args.Minimum_Length)
                    or fastq1_n_frac >= float(self.args.N_Limit)
                    or fastq2_n_frac >= float(self.args.N_Limit)):
                continue

            if limit_counter % limit == 0:
                if temp_file1:
                    temp_file1.close()
                    limit_counter = 0
                if temp_file2:
                    temp_file2.close()

                file1 = "{0}{1}_R1_tmp_{2}.fastq.gz".format(
                    self.args.WorkingFolder, self.args.Job_Name, i)
                file2 = "{0}{1}_R2_tmp_{2}.fastq.gz".format(
                    self.args.WorkingFolder, self.args.Job_Name, i)
                bam_file_list.append("{0}{1}_R1_tmp_{2}.bam".format(
                    self.args.WorkingFolder, self.args.Job_Name, i))
                fastq_file_list.append((file1, file2))
                temp_file1 = Writer(self.log, file1)
                temp_file2 = Writer(self.log, file2)

                self.log.info("Writing {0} and {1}".format(file1, file2))
                i += 1

            limit_counter += 1

            # BAM files are missing the barcodes because of a space in some of the header files.
            # fastq1_read.name = fastq1_read.name.replace(" ", ":")
            # fastq2_read.name = fastq2_read.name.replace(" ", ":")

            # Add the UMT's to the header.
            if self.args.HaloPLEX:
                umi = fastq3_read.seq
                header1 = "{0}|{1}:{2}".format(
                    fastq1_read.name.split(":")[-1], umi, fastq1_read.name)
                header2 = "{0}|{1}:{2}".format(
                    fastq2_read.name.split(":")[-1], umi, fastq2_read.name)

            elif self.args.ThruPLEX:
                # header1 = "{0}|{1}".format(fastq1_read.name.split(":")[-1], fastq1_read.name)
                umt1 = fastq1_read.seq[:6]
                umt2 = fastq2_read.seq[:6]
                header1 = "{0}|{1}:{2}".format(umt1, umt2, fastq1_read.name)
                header2 = "{0}|{1}:{2}".format(umt1, umt2, fastq2_read.name)
            else:
                Tool_Box.debug_messenger(
                    "Only HaloPLEX or ThruPLEX currently enabled.")
                self.log.error("Only HaloPLEX or ThruPLEX currently enabled.")
                raise SystemExit(1)

            # Trim adapter sequences from 5' end if needed.
            if int(self.args.trim) > 0:
                fastq1_read.seq = fastq1_read.seq[int(self.args.trim):]
                fastq1_read.qual = fastq1_read.qual[int(self.args.trim):]
                fastq2_read.seq = fastq2_read.seq[int(self.args.trim):]
                fastq2_read.qual = fastq2_read.qual[int(self.args.trim):]

            fastq1_read.name = header1
            fastq2_read.name = header2

            temp_file1.write(fastq1_read)
            temp_file2.write(fastq2_read)

        if temp_file1:
            temp_file1.close()
        if temp_file2:
            temp_file2.close()

        self.log.info("All temporary FASTQ files written")

        return fastq_file_list, bam_file_list
Example #17
def main(command_line_args=None):
    """
    Let's get this party started.
    :param command_line_args:
    """
    start_time = time.time()
    VersionDependencies.python_check()

    if not command_line_args:
        command_line_args = sys.argv

    run_start = datetime.datetime.today().strftime("%H:%M:%S %Y  %a %b %d")
    parser = argparse.ArgumentParser(
        description=
        "A package to map genomic repair scars at defined loci.\n {} v{}".
        format(__package__, __version__),
        formatter_class=argparse.RawTextHelpFormatter)

    parser.add_argument('--options_file',
                        action='store',
                        dest='options_file',
                        required=True,
                        help='File containing program parameters.')

    # Check options file for errors and return object.
    args = error_checking(string_to_boolean(parser))

    log = Tool_Box.Logger(args)
    Tool_Box.log_environment_info(log, args, command_line_args)

    module_name = ""
    log.info("{} v{}".format(__package__, __version__))

    if args.IndelProcessing:
        file_list = []
        if args.Platform == "Illumina" or args.Platform == "Ramsden" or args.Platform == "TruSeq":
            log.info("Sending FASTQ files to FASTQ preprocessor.")

            if args.PEAR:
                file_list = pear_consensus(args, log)
                if not file_list:
                    log.error("PEAR failed.  Check logs.")
                    raise SystemExit(1)
                fastq_consensus = file_list[0]

                fq1 = FASTQ_Tools.FASTQ_Reader(fastq_consensus, log)
                fq2 = None

            else:
                fq2 = FASTQ_Tools.FASTQ_Reader(args.FASTQ2, log)
                fq1 = FASTQ_Tools.FASTQ_Reader(args.FASTQ1, log)

            sample_manifest = Tool_Box.FileParser.indices(
                log, args.SampleManifest)
            indel_processing = \
                Indel_Processing.DataProcessing(log, args, run_start, __version__,
                                                Target_Mapper.TargetMapper(log, args, sample_manifest), fq1, fq2)

            indel_processing.main_loop()

            # Compress or delete PEAR files.
            if args.PEAR and file_list:
                if args.DeleteConsensusFASTQ:
                    log.info("Deleting PEAR FASTQ Files.")
                    Tool_Box.delete(file_list)
                else:
                    log.info(
                        "Compressing {} FASTQ Files Generated by PEAR.".format(
                            len(file_list)))
                    p = pathos.multiprocessing.Pool(int(args.Spawn))
                    p.starmap(Tool_Box.compress_files,
                              zip(file_list, itertools.repeat(log)))
        else:
            log.error(
                "Only 'Illumina', 'TruSeq' or 'Ramsden' --Platform methods currently allowed."
            )
            raise SystemExit(1)

    elif not args.IndelProcessing:
        # Run frequency file Combine module
        run_start = datetime.datetime.today().strftime("%a %b %d %H:%M:%S %Y")
        log.info("Process Replicates.")
        data_dict = collections.defaultdict(list)
        file_list = [
            f for f in glob.glob("{}*ScarMapper_Frequency.txt".format(
                args.DataFiles, ))
        ]
        file_count = len(file_list)
        page_header = "# ScarMapper File Merge v{}\n# Run: {}\n# Sample Name: {}\n" \
            .format(__version__, run_start, args.SampleName)

        line_num = 0
        index_file = list(csv.reader(open(file_list[0]), delimiter='\t'))
        for line in index_file:
            if not line:
                break
            elif line_num > 3:
                page_header += "{}\n".format(line[0])

            line_num += 1
        page_header += "\n\n"

        for file_name in file_list:
            freq_file_data = Tool_Box.FileParser.indices(log, file_name)

            for row in freq_file_data:
                key = "{}|{}|{}|{}".format(row[3], row[4], row[6], row[8])
                row_data = row[2:]

                if key in data_dict:
                    data_dict[key][0].append(float(row[1]))
                else:
                    data_dict[key] = [[float(row[1])], row_data]

        # Process Data and Write Combined Frequency results file

        plot_data_dict = collections.defaultdict(list)
        label_dict = collections.defaultdict(float)
        output_data_dict = collections.defaultdict(list)
        marker_list = []

        for key, row_list in data_dict.items():
            # Force pattern to be in at least half of the files.
            if len(row_list[0]) / file_count >= 0.5:
                row_string = "\t".join(row_list[1])
                freq = gmean(row_list[0])
                sem = stats.sem(row_list[0])
                freq_results_outstring = "{}\t{}\t{}\n".format(
                    freq, sem, row_string)
                output_key = freq

                # freq is a 17-digit float, so duplicate keys are unlikely; if one occurs, increment it by a
                # small value and check uniqueness again.
                if output_key in output_data_dict:
                    output_key = output_key + 1e-16
                    if output_key in output_data_dict:
                        output_key = output_key + 1e-16

                scar_type = row_list[1][0]
                label_dict[scar_type] += freq

                # Gather up our data for plotting
                lft_del = int(row_list[1][1])
                rt_del = int(row_list[1][2])
                mh_size = int(row_list[1][5])
                ins_size = int(row_list[1][7])

                output_data_dict[output_key] = \
                    [(freq, lft_del, rt_del, mh_size, ins_size, scar_type), freq_results_outstring]

        freq_results_outstring = \
            "{}# Frequency\tSEM\tScar Type\tLeft Deletions\tRight Deletions\tDeletion Size\tMicrohomology\t" \
            "Microhomology Size\tInsertion\tInsertion Size\tLeft Template\tRight Template\tConsensus Left Junction\t" \
            "Consensus Right Junction\tTarget Left Junction\tTarget Right Junction\tConsensus\tTarget Region\n" \
            .format(page_header)

        # Now draw a pretty graph of the data if we are not dealing with a negative control.
        for k in natsort.natsorted(output_data_dict, reverse=True):
            data_list = output_data_dict[k]
            freq_results_outstring += data_list[1]

            freq = data_list[0][0]
            lft_del = data_list[0][1]
            rt_del = data_list[0][2]
            mh_size = data_list[0][3]
            ins_size = data_list[0][4]
            scar_type = data_list[0][5]

            # Plotting all scar patterns is messy.  This provides a cutoff.
            if freq < 0.00025:
                continue

            y_value = freq * 0.5
            lft_ins_width = freq
            rt_ins_width = freq

            # This is gathered up to find the largest value.  Used to set the x-axis limits.
            marker_list.extend([
                lft_del + (mh_size * 0.5), rt_del + (mh_size * 0.5), ins_size
            ])

            # Deletion size includes half the size of any microhomology present.
            lft_del_plot_value = (lft_del + (mh_size * 0.5)) * -1
            rt_del_plot_value = rt_del + (mh_size * 0.5)

            # Insertions are centered on 0 so we need to take half the value for each side.
            lft_ins_plot_value = (ins_size * 0.5) * -1
            rt_ins_plot_value = ins_size * 0.5

            # Scale the width of bars for insertions inside of deletions
            if lft_del + (mh_size * 0.5) != 0:
                lft_ins_width = freq * 0.5
            if rt_del + (mh_size * 0.5) != 0:
                rt_ins_width = freq * 0.5

            if scar_type not in plot_data_dict:
                plot_data_dict[scar_type] = \
                    [[freq], [lft_del_plot_value], [rt_del_plot_value], [lft_ins_plot_value],
                     [rt_ins_plot_value], [lft_ins_width], [rt_ins_width], [y_value]]
            else:
                # Get some previous plot data
                count = len(plot_data_dict[scar_type][0])
                previous_freq = plot_data_dict[scar_type][0][count - 1]
                previous_y = plot_data_dict[scar_type][7][count - 1]

                plot_data_dict[scar_type][0].append(freq)
                plot_data_dict[scar_type][1].append(lft_del_plot_value)
                plot_data_dict[scar_type][2].append(rt_del_plot_value)
                plot_data_dict[scar_type][3].append(lft_ins_plot_value)
                plot_data_dict[scar_type][4].append(rt_ins_plot_value)
                plot_data_dict[scar_type][5].append(lft_ins_width)
                plot_data_dict[scar_type][6].append(rt_ins_width)

                # Use the previous plot data to find the y-value of the current bar.
                plot_data_dict[scar_type][7] \
                    .append(previous_y + 0.002 + (0.5 * previous_freq) + y_value)

        plot_data_dict['Marker'] = [(max(marker_list)) * -1, max(marker_list)]
        # sample_name = "{}.{}".format(args.Job_Name, args.SampleName)

        ScarMapperPlot.scarmapperplot(args,
                                      datafile=None,
                                      sample_name=args.SampleName,
                                      plot_data_dict=plot_data_dict,
                                      label_dict=label_dict)

        freq_results_file = \
            open("{}{}_ScarMapper_Combined_Frequency.txt".format(args.WorkingFolder, args.SampleName), "w")

        freq_results_file.write(freq_results_outstring)
        freq_results_file.close()

    warning = "\033[1;31m **See warnings above**\033[m" if log.warning_occurred else ''
    elapsed_time = int(time.time() - start_time)
    log.info(
        "****ScarMapper {0} complete ({1} seconds, {2} Mb peak memory).****\n{3}".
        format(module_name, elapsed_time, Tool_Box.peak_memory(), warning))

    # All done; exit explicitly, otherwise Python does not release the log file on virtualized Linux.
    exit(0)