Example #1 (score: 0)
    def calculate_bin_coverage(self, bin):
        """
        Take a single bin, return a matrix. This is passed to a multiprocessing Pool.

        :param bin: Bin should be passed as "Chr19_4343343"
        :return: tuple of (bin, pd.DataFrame) with rows containing NaNs dropped,
            or None if no reads could be parsed/assembled for this bin
        """
        # Get reads from bam file
        parser = BamFileReadParser(self.input_bam_file, 20, self.mbias_read1_5, self.mbias_read1_3,
                                   self.mbias_read2_5, self.mbias_read2_3, self.no_overlap)
        # Split bin into parts, e.g. "Chr19_4343343" -> ("Chr19", 4343343)
        chromosome, bin_location = bin.split("_")
        bin_location = int(bin_location)
        try:
            reads = parser.parse_reads(chromosome, bin_location - self.bin_size, bin_location)
            matrix = parser.create_matrix(reads)
        except Exception as e:
            # Typically means no reads are within this window. Narrowed from
            # `except BaseException` (which also swallowed KeyboardInterrupt /
            # SystemExit and made a trailing bare `except:` clause unreachable).
            logging.debug("No matrix for bin {}: {}".format(bin, e))
            self.bins_no_reads += 1
            return None

        # drop rows of ALL NaN (reads covering no CpG in the window)
        matrix = matrix.dropna(how="all")
        # convert to data_frame of 1s and 0s, drop rows with any NaN
        matrix = matrix.dropna()
        # if matrix is empty, attempt to create it with CpG position correction
        # before giving up
        if len(matrix) == 0:
            original_matrix = matrix.copy()
            reads = parser.correct_cpg_positions(reads)
            try:
                matrix = parser.create_matrix(reads)
            except InvalidIndexError as e:
                logging.error("Invalid Index error when creating matrices at bin {}".format(bin))
                logging.debug(str(e))
                # Return the (empty) pre-correction matrix so callers still get
                # a (bin, matrix) tuple for this bin
                return bin, original_matrix
            except ValueError as e:
                logging.error("Matrix concat error at bin {}".format(bin))
                logging.debug(str(e))

            matrix = matrix.dropna()
            if len(matrix) > 0:
                logging.info("Correction attempt at bin {}: SUCCESS".format(bin))
            else:
                logging.info("Correction attempt at bin {}: FAILED".format(bin))

        return bin, matrix
Example #2 (score: 0)
    def _multiprocess_extract(self, one_bin: str):
        """Function to be used for multiprocessing

        Arguments:
            one_bin {str} -- bin id as "chr7_222222"

        Returns:
            [tuple] -- (bin, matrix); matrix is an empty array if extraction
            failed for any reason
        """
        try:
            read_parser = BamFileReadParser(self.bam_file,
                                            20,
                                            read1_5=self.mbias_read1_5,
                                            read1_3=self.mbias_read1_3,
                                            read2_5=self.mbias_read2_5,
                                            read2_3=self.mbias_read2_3)
            chrom, loc = one_bin.split("_")
            loc = int(loc)
            reads = read_parser.parse_reads(chrom, loc - 100,
                                            loc)  # TODO unhardcode bin size
            matrix = read_parser.create_matrix(reads)
            # Drop reads covering no CpGs, then encode missing values as -1 so
            # the frame can be converted to a compact integer array
            matrix = matrix.dropna(how="all")
            matrix = matrix.fillna(-1)
            matrix = np.array(matrix)
            matrix = matrix.astype('int8')
        except Exception as e:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # are not swallowed inside worker processes; log for diagnosis
            # instead of failing silently.
            logging.debug("Extraction failed for bin {}: {}".format(one_bin, e))
            return (one_bin, np.array([]))

        return (one_bin, matrix)
Example #3 (score: 0)
    def process_bins(self, bin):
        """
        This is the main method and should be called using Pool.map. It takes one bin location and uses the other helper
        functions to get the reads, form the matrix, cluster it with DBSCAN, and output the cluster data as text lines
        ready for writing to a file.

        :param bin: string in this format: "chr19_55555"
        :return: a list of lines representing the cluster data from that bin, or
            None if the bin is malformed, below read-depth requirements, or any
            matrix-building/clustering step fails

        """
        # Malformed bin ids (no "_" separator) are silently skipped
        try:
            chromosome, bin_loc = bin.split("_")
        except ValueError:
            return None
        bin_loc = int(bin_loc)

        # Create bam parser and parse reads for input A over
        # [bin_loc - bin_size, bin_loc)
        bam_parser_A = BamFileReadParser(self.bam_a,
                                         20,
                                         read1_5=self.mbias_read1_5,
                                         read1_3=self.mbias_read1_3,
                                         read2_5=self.mbias_read2_5,
                                         read2_3=self.mbias_read2_3,
                                         no_overlap=self.no_overlap)
        reads_A = bam_parser_A.parse_reads(chromosome, bin_loc - self.bin_size,
                                           bin_loc)

        # In two-file mode, parse the same window from input B
        if not self.single_file_mode:
            bam_parser_B = BamFileReadParser(self.bam_b,
                                             20,
                                             read1_5=self.mbias_read1_5,
                                             read1_3=self.mbias_read1_3,
                                             read2_5=self.mbias_read2_5,
                                             read2_3=self.mbias_read2_3,
                                             no_overlap=self.no_overlap)
            reads_B = bam_parser_B.parse_reads(chromosome,
                                               bin_loc - self.bin_size,
                                               bin_loc)

        # This try/catch block returns None for a bin if any discrepancies in the data format of the bins are detected.
        # The Nones are filtered out during the output of the data
        try:
            # create matrix  drop NA
            # This matrix is actually a pandas dataframe
            matrix_A = bam_parser_A.create_matrix(reads_A).dropna()

            # Attempt to correct CpG Position if necessary (empty matrix after
            # dropna means the raw positions produced no complete rows)
            if len(matrix_A) == 0:
                reads_A = self.attempt_cpg_position_correction(
                    reads_A, bam_parser_A)
                matrix_A = bam_parser_A.create_matrix(reads_A).dropna()
            if not self.single_file_mode:
                matrix_B = bam_parser_B.create_matrix(reads_B).dropna()

                # attempt to correct CpG position in B if necessary
                if len(matrix_B) == 0:
                    reads_B = self.attempt_cpg_position_correction(
                        reads_B, bam_parser_B)
                    matrix_B = bam_parser_B.create_matrix(reads_B).dropna()

        except ValueError as e:
            logging.error(
                "ValueError when creating matrix at bin {}. Stack trace will be below if log level=DEBUG"
                .format(bin))
            logging.debug(str(e))
            return None
        except InvalidIndexError as e:
            logging.error(
                "Invalid Index error when creating matrices at bin {}".format(
                    bin))
            logging.debug(str(e))
            return None

        # if read depths are still not a minimum, skip
        if matrix_A.shape[0] < self.read_depth_req:
            return None
        if not self.single_file_mode:
            if matrix_B.shape[0] < self.read_depth_req:
                return None

        # create labels and add to dataframe

        # If two files label each A and B, otherwise use file_name as label
        if not self.single_file_mode:
            labels_A = ['A'] * len(matrix_A)
            matrix_A['input'] = labels_A
            labels_B = ['B'] * len(matrix_B)
            matrix_B['input'] = labels_B
        else:
            labels_A = [os.path.basename(self.bam_a)] * len(matrix_A)
            matrix_A['input'] = labels_A

        if not self.single_file_mode:
            try:
                # ensure they have the same CpG positions
                # NOTE(review): this overwrites B's column labels with A's —
                # presumably both matrices already have matching column counts
                # when this succeeds; mismatches raise and skip the bin
                matrix_B.columns = matrix_A.columns
                full_matrix = pd.concat([matrix_A, matrix_B], sort=False)
            except ValueError as e:
                logging.error("Matrix concat error in bin {}".format(bin))
                # logging.debug(str(e))
                return None
        else:
            full_matrix = matrix_A

        # Get data without labels for clustering (last column is 'input')
        data_to_cluster = np.array(full_matrix)[:, :-1]

        # Create DBSCAN classifier and cluster add cluster classes to df
        clf = DBSCAN(min_samples=2)
        try:
            labels = clf.fit_predict(data_to_cluster)
        except ValueError as e:
            # log error
            logging.error(
                "ValueError when trying to cluster bin {}".format(bin))
            logging.debug(str(e))
            return None

        full_matrix['class'] = labels

        # Filter out any clusters with less than a minimum
        filtered_matrix = self.filter_data_frame(full_matrix)
        # DBSCAN labels noise points as -1; optionally drop them
        if self.remove_noise:
            filtered_matrix = filtered_matrix[filtered_matrix['class'] != -1]

        # return generate_output_data(filtered_matrix, chromosome, bin_loc)
        return self.generate_individual_matrix_data(filtered_matrix,
                                                    chromosome, bin_loc)