Пример #1
0
    def _multiprocess_extract(self, one_bin: str):
        """Function to be used for multiprocessing
        
        Arguments:
            one_bin {str} -- bin id as "chr7_222222"
        
        Returns:
            [tuple] -- bin, matrix
        """
        try:
            read_parser = BamFileReadParser(self.bam_file,
                                            20,
                                            read1_5=self.mbias_read1_5,
                                            read1_3=self.mbias_read1_3,
                                            read2_5=self.mbias_read2_5,
                                            read2_3=self.mbias_read2_3)
            chrom, loc = one_bin.split("_")
            loc = int(loc)
            reads = read_parser.parse_reads(chrom, loc - 100,
                                            loc)  # TODO unhardcode bin size
            matrix = read_parser.create_matrix(reads)
            matrix = matrix.dropna(how="all")
            # if matrix.shape[0] == 0:
            #     return None
            matrix = matrix.fillna(-1)
            matrix = np.array(matrix)
            matrix = matrix.astype('int8')
        except:  # BAD EXCEPTION
            return (one_bin, np.array([]))

        return (one_bin, matrix)
Пример #2
0
    def get_chromosome_lengths(self):
        """
        Map every reference sequence name in the input bam file to its length.

        :return: Dictionary of chromosome lengths, ex: {"chrX": 222222}
        """
        bam_reader = BamFileReadParser(self.input_bam_file, 20)
        names = bam_reader.OpenBamFile.references
        sizes = bam_reader.OpenBamFile.lengths
        # references and lengths are paired up positionally
        return dict(zip(names, sizes))
Пример #3
0
    def run(self):
        """
        Run the whole pipeline from the command line arguments.

        Steps: parse the arguments, open the bam file, build the list of bin
        names for the requested chromosomes, load or train the imputation
        models when imputation was requested, convert all bins in parallel
        and hand the per-chunk results to ``print_results``.
        """

        # Get command line arguments
        args = self.parse_arguments()
        self.input_bam_file = args.i
        self.chromes = args.c
        self.impute = args.p
        self.outfile = args.o

        # Get a parser and chromosome information
        parser = BamFileReadParser(self.input_bam_file, quality_score=0)
        chromosome_lens = self.get_chromosome_lengths(parser, self.chromes)

        print("got parser")
        # get a list of all the bin names; the per-chromosome bin lists are
        # computed once instead of once per chromosome
        bins_by_chrom = self.generate_bins_list(chromosome_lens)
        bin_name_list = []
        for chrom in chromosome_lens:
            bin_name_list += bins_by_chrom[chrom]

        print("got bin name list")

        # Get PReLIM models
        if args.p:
            # load existing models
            if args.m:
                self.models = self.load_models(args.m)
                print("loaded models")
            # train new models
            else:
                self.models = self.train(parser, bin_name_list)
        else:
            print("not imputing, no need for models")

        # parallelize the imputation on half of the available cores; both
        # values are clamped to 1 so that a single-core machine or a bin list
        # shorter than the worker count cannot produce a zero divisor / step
        num_cpus = max(1, mp.cpu_count() // 2)
        num_bins = len(bin_name_list)
        chunk_size = max(1, num_bins // num_cpus)
        bin_name_chunks = [
            bin_name_list[i:i + chunk_size]
            for i in range(0, num_bins, chunk_size)
        ]

        # the context manager releases the worker processes once map returns
        with mp.Pool(num_cpus) as pool:
            results = pool.map(self.binName2dssFormat_chunk, bin_name_chunks)

        self.print_results(results, self.outfile)
0
    def calculate_bin_coverage(self, bin):
        """
        Take a single bin, return its read matrix. This is passed to a multiprocessing Pool.

        :param bin: Bin should be passed as "Chr19_4343343"
        :return: None when the reads of the bin could not be parsed into a matrix; otherwise a
                 tuple (bin, matrix) where matrix has all rows containing NaNs dropped (it may be empty)
        """
        # Get reads from bam file
        parser = BamFileReadParser(self.input_bam_file, 20, self.mbias_read1_5, self.mbias_read1_3,
                                   self.mbias_read2_5, self.mbias_read2_3, self.no_overlap)
        # Split bin into parts
        chromosome, bin_location = bin.split("_")
        bin_location = int(bin_location)
        try:
            reads = parser.parse_reads(chromosome, bin_location-self.bin_size, bin_location)
            matrix = parser.create_matrix(reads)
        except Exception:
            # Any failure here is counted as "no reads within this window".
            # Exception (not BaseException) is caught so that KeyboardInterrupt
            # and SystemExit still propagate and can stop the worker.
            self.bins_no_reads += 1
            return None

        # drop rows of ALL NaN
        matrix = matrix.dropna(how="all")
        # drop every remaining row that still contains a NaN
        matrix = matrix.dropna()
        # if matrix is empty, attempt to create it with correction before giving up
        if len(matrix) == 0:
            original_matrix = matrix.copy()
            reads = parser.correct_cpg_positions(reads)
            try:
                matrix = parser.create_matrix(reads)
            except InvalidIndexError as e:
                logging.error("Invalid Index error when creating matrices at bin {}".format(bin))
                logging.debug(str(e))
                return bin, original_matrix
            except ValueError as e:
                # matrix keeps its previous (empty) value, so the attempt is
                # reported as FAILED below
                logging.error("Matrix concat error ar bin {}".format(bin))
                logging.debug(str(e))

            matrix = matrix.dropna()
            if len(matrix) > 0:
                logging.info("Correction attempt at bin {}: SUCCESS".format(bin))
            else:
                logging.info("Correction attempt at bin {}: FAILED".format(bin))

        return bin, matrix
Пример #5
0
    def binName2dssFormat_chunk(self, bin_name_chunk):
        """
        Convert every bin of a chunk to DSS format.

        A single bam parser is opened for the whole chunk. For each bin name
        the bin matrix is fetched (with CpG positions), converted with
        ``toDssFormat`` and the converted entries are collected in one flat
        list, in the order of the incoming bin names.

        :param bin_name_chunk: iterable of bin names such as "chr19_55555"
        :return: flat list with the DSS entries of all bins in the chunk
        """
        bam_reader = BamFileReadParser(self.input_bam_file, quality_score=0)
        dss_rows = []

        for name in bin_name_chunk:
            cpg_matrix = self.get_bin_matrix(bam_reader, name, return_cpgs=True)
            # the chromosome is everything before the first underscore
            chrom_name = name.partition("_")[0]
            dss_rows.extend(self.toDssFormat(cpg_matrix, chrom_name))

        return dss_rows
Пример #6
0
    def process_bins(self, bin):
        """
        Worker entry point, meant to be called through Pool.map. For one bin it reads the reads of the input
        file(s), builds the read matrices, clusters the combined matrix with DBSCAN and returns the cluster data
        produced by generate_individual_matrix_data.

        :param bin: string in this format: "chr19_55555"
        :return: the cluster data for the bin, or None when the bin is malformed, fails to form a matrix,
                 has too few reads or cannot be clustered

        """
        # A bin id must consist of exactly two underscore-separated parts
        pieces = bin.split("_")
        if len(pieces) != 2:
            return None
        chromosome, bin_loc = pieces
        bin_loc = int(bin_loc)

        # Settings shared by the parsers of both input files
        parser_options = dict(read1_5=self.mbias_read1_5,
                              read1_3=self.mbias_read1_3,
                              read2_5=self.mbias_read2_5,
                              read2_3=self.mbias_read2_3,
                              no_overlap=self.no_overlap)

        # Create bam parser(s) and parse the reads of the bin window
        parser_a = BamFileReadParser(self.bam_a, 20, **parser_options)
        window_start = bin_loc - self.bin_size
        reads_a = parser_a.parse_reads(chromosome, window_start, bin_loc)

        if not self.single_file_mode:
            parser_b = BamFileReadParser(self.bam_b, 20, **parser_options)
            reads_b = parser_b.parse_reads(chromosome, window_start, bin_loc)

        # Any discrepancy in the data format of the bin makes the bin yield None;
        # the Nones are filtered out when the data is written.
        try:
            # The "matrices" are pandas dataframes; rows with NaN are dropped
            frame_a = parser_a.create_matrix(reads_a).dropna()

            # Retry with corrected CpG positions when nothing survived
            if len(frame_a) == 0:
                reads_a = self.attempt_cpg_position_correction(reads_a, parser_a)
                frame_a = parser_a.create_matrix(reads_a).dropna()

            if not self.single_file_mode:
                frame_b = parser_b.create_matrix(reads_b).dropna()

                # Same retry for the second file
                if len(frame_b) == 0:
                    reads_b = self.attempt_cpg_position_correction(reads_b, parser_b)
                    frame_b = parser_b.create_matrix(reads_b).dropna()

        except ValueError as err:
            message = "ValueError when creating matrix at bin {}. Stack trace will be below if log level=DEBUG"
            logging.error(message.format(bin))
            logging.debug(str(err))
            return None
        except InvalidIndexError as err:
            message = "Invalid Index error when creating matrices at bin {}"
            logging.error(message.format(bin))
            logging.debug(str(err))
            return None

        # Skip the bin when the read depth is still below the minimum
        if frame_a.shape[0] < self.read_depth_req:
            return None

        if self.single_file_mode:
            # Single file: label every read with the file name
            frame_a['input'] = [os.path.basename(self.bam_a)] * len(frame_a)
            combined = frame_a
        else:
            if frame_b.shape[0] < self.read_depth_req:
                return None

            # Two files: label the reads A and B
            frame_a['input'] = ['A'] * len(frame_a)
            frame_b['input'] = ['B'] * len(frame_b)

            try:
                # ensure they have the same CpG positions
                frame_b.columns = frame_a.columns
                combined = pd.concat([frame_a, frame_b], sort=False)
            except ValueError:
                logging.error("Matrix concat error in bin {}".format(bin))
                return None

        # Everything but the trailing label column is clustered
        features = np.array(combined)[:, :-1]

        # Cluster with DBSCAN and store the cluster ids on the dataframe
        clusterer = DBSCAN(min_samples=2)
        try:
            cluster_ids = clusterer.fit_predict(features)
        except ValueError as err:
            logging.error("ValueError when trying to cluster bin {}".format(bin))
            logging.debug(str(err))
            return None

        combined['class'] = cluster_ids

        # Filter the clusters, optionally removing the rows labelled -1
        kept = self.filter_data_frame(combined)
        if self.remove_noise:
            kept = kept[kept['class'] != -1]

        return self.generate_individual_matrix_data(kept, chromosome, bin_loc)