Exemplo n.º 1
0
            plot_data_dict[data_pair[6]][5].append(data_pair[4])

            plot_data_dict[data_pair[6]][6].append(data_pair[5])
            plot_data_dict[data_pair[6]][7].append(color_dict[data_pair[6]])
            count = len(plot_data_dict[data_pair[6]][0])

            if count > 1:
                previous = plot_data_dict[data_pair[6]][0][count - 2]
                plot_data_dict[data_pair[6]][8]\
                    .append(plot_data_dict[data_pair[6]][8][count - 2] + 0.0007 + (0.5*previous) + data_pair[0] * 0.5)

    return plot_data_dict


# Stand-alone entry point: parse the options file argument and run the plotter.
if __name__ == '__main__':
    ToolBox.debug_messenger("Standing Alone")

    cli_parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter)
    cli_parser.add_argument(
        '--options_file',
        action='store',
        dest='options_file',
        required=True,
        help='File containing program parameters.')

    # ToolBox.options_file augments the parser with values from the options file.
    run_args = ToolBox.options_file(cli_parser).parse_args()
    scarmapperplot(run_args)
Exemplo n.º 2
0
    def temp_file_writer(self, limit):
        """
        Write the temporary FASTQ files.  Also create list of temporary BAM file names for use later.

        :param limit: Number of read pairs to write into each pair of temporary FASTQ files before
            rolling over to a new pair.
        :return: Tuple of (fastq_file_list, bam_file_list).  fastq_file_list holds (R1, R2) temporary
            file-name pairs; bam_file_list holds the matching temporary BAM file names.
        """

        self.log.info("Begin writing temporary FASTQ files.")
        i = 0
        temp_file1 = None
        temp_file2 = None
        fastq_file_list = []
        bam_file_list = []
        read_count = 0
        limit_counter = 0

        while read_count <= self.read_count:
            try:
                # This generator is returning actual reads not lines.
                fastq1_read = next(self.fastq1_file.seq_read())
                fastq2_read = next(self.fastq2_file.seq_read())
                if self.index1_file is not None:
                    fastq3_read = next(self.index1_file.seq_read())
            except StopIteration:
                read_count += 1
                continue

            read_count += 1

            try:
                fastq1_n_frac = fastq1_read.seq.count("N") / len(
                    fastq1_read.seq)
                fastq2_n_frac = fastq2_read.seq.count("N") / len(
                    fastq2_read.seq)
            except ZeroDivisionError:
                # Zero-length reads cannot be scored; skip the pair.
                continue

            # Apply Filters: drop pairs that are too short or contain too many N's.
            if (len(fastq1_read.seq) < int(self.args.Minimum_Length)
                    or len(fastq2_read.seq) < int(self.args.Minimum_Length)
                    or fastq1_n_frac >= float(self.args.N_Limit)
                    or fastq2_n_frac >= float(self.args.N_Limit)):
                continue

            # Roll over to a fresh pair of temporary files every `limit` reads.
            if limit_counter % limit == 0:
                if temp_file1:
                    temp_file1.close()
                    limit_counter = 0
                if temp_file2:
                    temp_file2.close()

                file1 = "{0}{1}_R1_tmp_{2}.fastq.gz".format(
                    self.args.WorkingFolder, self.args.Job_Name, i)
                file2 = "{0}{1}_R2_tmp_{2}.fastq.gz".format(
                    self.args.WorkingFolder, self.args.Job_Name, i)
                bam_file_list.append("{0}{1}_R1_tmp_{2}.bam".format(
                    self.args.WorkingFolder, self.args.Job_Name, i))
                fastq_file_list.append((file1, file2))
                temp_file1 = Writer(self.log, file1)
                temp_file2 = Writer(self.log, file2)

                self.log.info("Writing {0} and {1}".format(file1, file2))
                i += 1

            limit_counter += 1

            # BAM files are missing the barcodes because of a space in some of the header files.
            # fastq1_read.name = fastq1_read.name.replace(" ", ":")
            # fastq2_read.name = fastq2_read.name.replace(" ", ":")

            # Add the UMT's to the header.
            if self.args.HaloPLEX:
                # NOTE(review): fastq3_read is only bound when index1_file is not None; HaloPLEX runs
                # without an index file would raise NameError here — confirm inputs guarantee the index file.
                umi = fastq3_read.seq
                header1 = "{0}|{1}:{2}".format(
                    fastq1_read.name.split(":")[-1], umi, fastq1_read.name)
                header2 = "{0}|{1}:{2}".format(
                    fastq2_read.name.split(":")[-1], umi, fastq2_read.name)

            elif self.args.ThruPLEX:
                # ThruPLEX UMT's are the first 6 nucleotides of each read.
                umt1 = fastq1_read.seq[:6]
                umt2 = fastq2_read.seq[:6]
                header1 = "{0}|{1}:{2}".format(umt1, umt2, fastq1_read.name)
                header2 = "{0}|{1}:{2}".format(umt1, umt2, fastq2_read.name)
            else:
                Tool_Box.debug_messenger(
                    "Only HaloPLEX or ThruPLEX currently enabled.")
                self.log.error("Only HaloPLEX or ThruPLEX currently enabled.")
                raise SystemExit(1)

            # Trim adapter sequences from 5' end if needed.
            if int(self.args.trim) > 0:
                fastq1_read.seq = fastq1_read.seq[int(self.args.trim):]
                fastq1_read.qual = fastq1_read.qual[int(self.args.trim):]
                fastq2_read.seq = fastq2_read.seq[int(self.args.trim):]
                fastq2_read.qual = fastq2_read.qual[int(self.args.trim):]

            fastq1_read.name = header1
            fastq2_read.name = header2

            # Bug fix: write the processed reads.  The original passed the input file objects
            # (self.fastq1_file / self.fastq2_file) to write(), which discarded the header
            # rewriting and trimming applied above.
            temp_file1.write(fastq1_read)
            temp_file2.write(fastq2_read)

        if temp_file1:
            temp_file1.close()
        if temp_file2:
            temp_file2.close()

        self.log.info("All temporary FASTQ files written")

        return fastq_file_list, bam_file_list
Exemplo n.º 3
0
    def consensus_demultiplex(self):
        """
        Takes a FASTQ file of consensus reads and identifies each by index.  Handles writing demultiplexed FASTQ if
        user desired.

        :return: Tuple of (indexed_read_count, lower_limit) where indexed_read_count is the number of reads matched
            to a library index and lower_limit is a noise cutoff used later when plotting.
        """
        self.log.info("Consensus Index Search")
        eof = False
        start_time = time.time()
        split_time = time.time()
        fastq_file_name_list = []
        fastq_data_dict = collections.defaultdict(lambda: collections.defaultdict(list))
        indexed_read_count = 0
        key_counts = []

        def flush_demultiplexed_files():
            # Write all buffered demultiplexed reads and close the output files.
            for index_name in fastq_data_dict:
                r1_data = fastq_data_dict[index_name]["R1"]
                r1, r2 = self.fastq_outfile_dict[index_name]
                r1.write(r1_data)
                r1.close()
                if not self.args.PEAR:
                    r2_data = fastq_data_dict[index_name]["R2"]
                    r2.write(r2_data)
                    r2.close()

        while not eof:
            # Debugging Code Block: cap the number of reads processed in DEBUG mode.
            if self.args.Verbose == "DEBUG":
                read_limit = 1000000
                if self.read_count > read_limit:
                    if self.args.Demultiplex:
                        flush_demultiplexed_files()

                    Tool_Box.debug_messenger("Limiting Reads Here to {}".format(read_limit))
                    eof = True
            fastq2_read = None
            try:
                fastq1_read = next(self.fastq1.seq_read())
                if not self.args.PEAR:
                    fastq2_read = next(self.fastq2.seq_read())

            except StopIteration:
                # Input exhausted; flush any demultiplexed output before finishing.
                if self.args.Demultiplex:
                    flush_demultiplexed_files()

                eof = True
                continue

            self.read_count += 1
            if self.read_count % 100000 == 0:
                elapsed_time = int(time.time() - start_time)
                block_time = int(time.time() - split_time)
                split_time = time.time()
                self.log.info("Processed {} reads in {} seconds.  Total elapsed time: {} seconds."
                              .format(self.read_count, block_time, elapsed_time))

            # Match read with library index.
            match_found, left_seq, right_seq, index_name, fastq1_read, fastq2_read = \
                self.index_matching(fastq1_read, fastq2_read)

            if match_found:
                indexed_read_count += 1
                locus = self.index_dict[index_name][7]
                phase_key = "{}+{}".format(index_name, locus)
                r2_found = False
                r1_found = False
                if self.args.Platform == "Illumina":
                    # Score the phasing and place the reads in a dictionary.
                    for r2_phase, r1_phase in zip(self.phase_dict[locus]["R2"], self.phase_dict[locus]["R1"]):

                        r2_phase_name = r2_phase[1]
                        r1_phase_name = r1_phase[1]

                        # Tag reads that should not have any phasing.
                        if not r1_phase[0]:
                            self.phase_count[phase_key]["Phase " + r1_phase_name] = -1
                            self.phase_count[phase_key]["Phase " + r2_phase_name] = -1
                            continue
                        else:
                            # "+= 0" touches the defaultdict keys so every phase appears in the
                            # counts even when never observed.
                            self.phase_count[phase_key]["Phase " + r1_phase_name] += 0
                            self.phase_count[phase_key]["Phase " + r2_phase_name] += 0

                        # The phasing is the last N nucleotides of the consensus.
                        if r2_phase[0] == Sequence_Magic.rcomp(fastq1_read.seq[-len(r2_phase[0]):]) and not r2_found:
                            self.phase_count[phase_key]["Phase "+r2_phase_name] += 1
                            r2_found = True

                        if r1_phase[0] == fastq1_read.seq[:len(r1_phase[0])] and not r1_found:
                            self.phase_count[phase_key]["Phase "+r1_phase_name] += 1
                            r1_found = True

                    # if no phasing is found then note that.
                    if not r2_found:
                        self.phase_count[phase_key]["No Read 2 Phasing"] += 1
                    if not r1_found:
                        self.phase_count[phase_key]["No Read 1 Phasing"] += 1

                    # The adapters on Gupta Lab AAVS1.1 are reversed causing the reads to be reversed.
                    # NOTE(review): both arms of this conditional are identical; the AAVS1.1 branch
                    # presumably intended a reverse-complement — confirm against the original intent.
                    if locus == "AAVS1.1":
                        self.sequence_dict[index_name].append(fastq1_read.seq)
                    else:
                        self.sequence_dict[index_name].append(fastq1_read.seq)

                elif self.args.Platform == "TruSeq":
                    self.sequence_dict[index_name].append(right_seq)

                elif self.args.Platform == "Ramsden":
                    self.sequence_dict[index_name].append(Sequence_Magic.rcomp(fastq1_read.seq))

                else:
                    self.log.error("--Platform {} not correctly defined.  Edit parameter file and try again"
                                   .format(self.args.Platform))
                    raise SystemExit(1)

                if self.args.Demultiplex:
                    fastq_data_dict[index_name]["R1"].append([fastq1_read.name, fastq1_read.seq, fastq1_read.qual])
                    if not self.args.PEAR:
                        fastq_data_dict[index_name]["R2"].append([fastq2_read.name, fastq2_read.seq, fastq2_read.qual])

                    fastq_file_name_list.append("{}{}_{}_Consensus.fastq"
                                                .format(self.args.WorkingFolder, self.args.Job_Name, index_name))

            elif self.args.Demultiplex and not match_found:
                # Unmatched reads are routed to the "Unknown" output.
                fastq_data_dict['Unknown']["R1"].append([fastq1_read.name, fastq1_read.seq, fastq1_read.qual])
                # Bug fix: the R2 buffer previously stored the read 1 data again.  Store the actual
                # read 2, and only when one exists (PEAR mode has no read 2), matching the
                # matched-read branch above.
                if not self.args.PEAR:
                    fastq_data_dict['Unknown']["R2"].append([fastq2_read.name, fastq2_read.seq, fastq2_read.qual])

                fastq_file_name_list.append("{}{}_Unknown_Consensus.fastq"
                                            .format(self.args.WorkingFolder, self.args.Job_Name))

        if self.args.Demultiplex:
            self.fastq_compress(list(set(fastq_file_name_list)))

        for key in self.sequence_dict:
            key_counts.append(len(self.sequence_dict[key]))

        # The lower limit is used when plotting the data.  Generally the lowest values are just noise.
        if len(key_counts) == 0:
            self.log.error("No Scar Patterns Found")
            raise SystemExit(1)
        lower, upper_limit = stats.norm.interval(0.9, loc=statistics.mean(key_counts), scale=stats.sem(key_counts))
        # NOTE(review): mean - lower bound is the interval's margin of error, not the bound itself —
        # presumably intentional as a noise cutoff; confirm with the plotting code.
        lower_limit = statistics.mean(key_counts)-lower

        return indexed_read_count, lower_limit