def get_chromosome_lengths(self):
    """
    Get a dictionary containing the lengths of the chromosomes. Uses the BAM file for reference.

    :return: Dictionary of chromosome lengths, ex: {"chrX": 222222}
    """
    parser = BamFileReadParser(self.input_bam_file, 20)
    return dict(zip(parser.OpenBamFile.references, parser.OpenBamFile.lengths))
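
# A minimal sketch (illustration only, not used by this module) of what
# get_chromosome_lengths resolves to, assuming OpenBamFile wraps a
# pysam.AlignmentFile: pysam exposes parallel .references and .lengths
# tuples on an open alignment file. The helper name is hypothetical.
def _sketch_chromosome_lengths(bam_path):
    import pysam
    bam = pysam.AlignmentFile(bam_path, "rb")
    try:
        # Zip the reference names with their lengths, e.g. {"chrX": 222222}
        return dict(zip(bam.references, bam.lengths))
    finally:
        bam.close()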
def attempt_cpg_position_correction(reads, parser: BamFileReadParser):
    """
    Take the reads and a parser object, attempt CpG position correction, and return the corrected reads.

    :param reads: parsed reads from BamFileReadParser
    :param parser: an instance of the BamFileReadParser object
    :return: reads with CpG positions corrected
    """
    corrected_reads = parser.correct_cpg_positions(reads)
    return corrected_reads
def run(self):
    """
    Main entry point. Parses the command line arguments, loads or trains
    PReLIM models if imputation was requested, then converts each bin to
    DSS format in parallel and writes the results to the output file.
    """
    # Get command line arguments
    args = self.parse_arguments()
    self.input_bam_file = args.i
    self.chromes = args.c
    self.impute = args.p
    self.outfile = args.o

    # Get a parser and chromosome information
    parser = BamFileReadParser(self.input_bam_file, quality_score=0)
    chromosome_lens = self.get_chromosome_lengths(parser, self.chromes)
    print("got parser")

    # Get a list of all the bin names. Generate the per-chromosome bin
    # lists once, then collect them into a single flat list.
    bins_by_chrom = self.generate_bins_list(chromosome_lens)
    bin_name_list = []
    for chrom in chromosome_lens:
        bin_name_list += bins_by_chrom[chrom]
    print("got bin name list")

    # Get PReLIM models
    if args.p:
        # Load existing models
        if args.m:
            self.models = self.load_models(args.m)
            print("loaded models")
        # Train new models
        else:
            self.models = self.train(parser, bin_name_list)
    else:
        print("not imputing, no need for models")

    # Parallelize the conversion over half of the available CPUs.
    # The max(1, ...) guards keep both the worker count and the slice
    # step positive on small machines or small bin lists.
    num_cpus = max(1, mp.cpu_count() // 2)
    num_bins = len(bin_name_list)
    chunk_size = max(1, num_bins // num_cpus)
    bin_name_chunks = [
        bin_name_list[i:i + chunk_size]
        for i in range(0, num_bins, chunk_size)
    ]

    pool = mp.Pool(num_cpus)
    results = pool.map(self.binName2dssFormat_chunk, bin_name_chunks)

    self.print_results(results, self.outfile)
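
# A self-contained sketch of the chunking used in run() above, shown in
# isolation so the max(1, ...) guard is easy to test. The helper name is
# hypothetical; the slicing logic matches the method body.
def _sketch_chunk_bins(bin_names, num_workers):
    chunk_size = max(1, len(bin_names) // num_workers)
    return [bin_names[i:i + chunk_size]
            for i in range(0, len(bin_names), chunk_size)]

# Example: with fewer bins than workers the guard avoids a zero step:
# _sketch_chunk_bins(["chr1_100", "chr1_200", "chr1_300"], 2)
# -> [["chr1_100"], ["chr1_200"], ["chr1_300"]]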
def binName2dssFormat_chunk(self, bin_name_chunk):
    """
    Convert a chunk of bin names to DSS-format records. Intended to be
    mapped over by a multiprocessing Pool; each worker creates its own
    parser instance.

    :param bin_name_chunk: list of bin names, ex: ["chr1_1000", ...]
    :return: flat list of DSS-format records for every bin in the chunk
    """
    results = []
    parser = BamFileReadParser(self.input_bam_file, quality_score=0)
    for bin_name in bin_name_chunk:
        # Get the bin matrix
        bin_matrix = self.get_bin_matrix(parser, bin_name, return_cpgs=True)
        # Get the chromosome name, ex: "chr1_1000" -> "chr1"
        chrome = bin_name.split("_")[0]
        # Convert to DSS format and extend the results
        dss_format = self.toDssFormat(bin_matrix, chrome)
        results += dss_format
    return results
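
# A hedged sketch of the per-CpG conversion that toDssFormat performs,
# assuming bin_matrix is a pandas DataFrame of reads (rows) by CpG
# positions (columns) with 1 = methylated, 0 = unmethylated, NaN = not
# covered. DSS expects one record per CpG site: chromosome, position,
# total coverage N, and methylated count X. The helper name and exact
# record layout are illustrative, not this module's confirmed code.
def _sketch_to_dss(bin_matrix, chrom):
    records = []
    for pos in bin_matrix.columns:
        covered = bin_matrix[pos].dropna()          # reads covering this CpG
        records.append((chrom,
                        int(pos),
                        len(covered),               # N: total reads
                        int(covered.sum())))        # X: methylated reads
    return records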
def calculate_bin_coverage(self, bin):
    """
    Take a single bin and return its coverage matrix. This is passed to a
    multiprocessing Pool.

    :param bin: bin passed as a string, ex: "chr19_4343343"
    :return: tuple of (bin, pd.DataFrame) with rows containing NaNs dropped,
        or None if the bin could not be processed
    """
    # Get reads from the bam file
    parser = BamFileReadParser(self.input_bam_file, 20,
                               self.mbias_read1_5, self.mbias_read1_3,
                               self.mbias_read2_5, self.mbias_read2_3,
                               self.no_overlap)

    # Split the bin name into chromosome and location
    chromosome, bin_location = bin.split("_")
    bin_location = int(bin_location)

    try:
        reads = parser.parse_reads(chromosome,
                                   bin_location - self.bin_size,
                                   bin_location)
        matrix = parser.create_matrix(reads)
    except BaseException as e:
        # Usually this means no reads are within the window; count it and
        # skip. BaseException is broad, so unexpected errors land here too
        # and are logged at DEBUG level.
        self.bins_no_reads += 1
        logging.debug("Could not parse bin {}: {}".format(bin, e))
        return None

    # Drop reads covering no CpG in the window (rows of all NaN)
    matrix = matrix.dropna(how="all")
    # Keep only fully observed reads, leaving a matrix of 1s and 0s
    matrix = matrix.dropna()

    # If the matrix is empty, attempt to create it with CpG position
    # correction before giving up
    if len(matrix) == 0:
        original_matrix = matrix.copy()
        reads = parser.correct_cpg_positions(reads)
        try:
            matrix = parser.create_matrix(reads)
        except InvalidIndexError as e:
            logging.error("Invalid index error when creating matrices at bin {}".format(bin))
            logging.debug(str(e))
            return bin, original_matrix
        except ValueError as e:
            logging.error("Matrix concat error at bin {}".format(bin))
            logging.debug(str(e))
        matrix = matrix.dropna()
        if len(matrix) > 0:
            logging.info("Correction attempt at bin {}: SUCCESS".format(bin))
        else:
            logging.info("Correction attempt at bin {}: FAILED".format(bin))

    return bin, matrix
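
# A toy illustration (not part of the pipeline) of the two dropna passes
# above: how="all" removes reads covering no CpG in the window, while the
# bare dropna() then keeps only reads covering every CpG column.
def _sketch_dropna_passes():
    import numpy as np
    import pandas as pd
    m = pd.DataFrame([[1.0, 0.0],
                      [np.nan, np.nan],
                      [1.0, np.nan]])
    m = m.dropna(how="all")   # drops row 1 (covers no CpG)
    m = m.dropna()            # drops row 2 (partially covered)
    return m                  # only the fully covered read remains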
def process_bins(self, bin):
    """
    This is the main method and should be called using Pool.map. It takes
    one bin location and uses the other helper functions to get the reads,
    form the matrix, cluster it with DBSCAN, and output the cluster data
    as text lines ready for writing to a file.

    :param bin: string in this format: "chr19_55555"
    :return: a list of lines representing the cluster data from that bin
    """
    try:
        chromosome, bin_loc = bin.split("_")
    except ValueError:
        return None
    bin_loc = int(bin_loc)

    # Create a bam parser and parse reads for input A
    bam_parser_A = BamFileReadParser(self.bam_a, 20,
                                     read1_5=self.mbias_read1_5,
                                     read1_3=self.mbias_read1_3,
                                     read2_5=self.mbias_read2_5,
                                     read2_3=self.mbias_read2_3,
                                     no_overlap=self.no_overlap)
    reads_A = bam_parser_A.parse_reads(chromosome, bin_loc - self.bin_size, bin_loc)

    if not self.single_file_mode:
        bam_parser_B = BamFileReadParser(self.bam_b, 20,
                                         read1_5=self.mbias_read1_5,
                                         read1_3=self.mbias_read1_3,
                                         read2_5=self.mbias_read2_5,
                                         read2_3=self.mbias_read2_3,
                                         no_overlap=self.no_overlap)
        reads_B = bam_parser_B.parse_reads(chromosome, bin_loc - self.bin_size, bin_loc)

    # This try/except block returns None for a bin if any discrepancies in
    # the data format of the bins are detected. The Nones are filtered out
    # when the data is written.
    try:
        # Create the matrix and drop NAs. This matrix is actually a pandas DataFrame.
        matrix_A = bam_parser_A.create_matrix(reads_A).dropna()

        # Attempt to correct CpG positions if necessary
        if len(matrix_A) == 0:
            reads_A = self.attempt_cpg_position_correction(reads_A, bam_parser_A)
            matrix_A = bam_parser_A.create_matrix(reads_A).dropna()

        if not self.single_file_mode:
            matrix_B = bam_parser_B.create_matrix(reads_B).dropna()

            # Attempt to correct CpG positions in B if necessary
            if len(matrix_B) == 0:
                reads_B = self.attempt_cpg_position_correction(reads_B, bam_parser_B)
                matrix_B = bam_parser_B.create_matrix(reads_B).dropna()

    except ValueError as e:
        logging.error("ValueError when creating matrix at bin {}. "
                      "Stack trace will be below if log level=DEBUG".format(bin))
        logging.debug(str(e))
        return None
    except InvalidIndexError as e:
        logging.error("Invalid index error when creating matrices at bin {}".format(bin))
        logging.debug(str(e))
        return None

    # If read depths still do not meet the minimum, skip the bin
    if matrix_A.shape[0] < self.read_depth_req:
        return None
    if not self.single_file_mode:
        if matrix_B.shape[0] < self.read_depth_req:
            return None

    # Create labels and add them to the dataframes. With two input files,
    # label the reads A and B; otherwise use the file name as the label.
    if not self.single_file_mode:
        labels_A = ['A'] * len(matrix_A)
        matrix_A['input'] = labels_A
        labels_B = ['B'] * len(matrix_B)
        matrix_B['input'] = labels_B
    else:
        labels_A = [os.path.basename(self.bam_a)] * len(matrix_A)
        matrix_A['input'] = labels_A

    if not self.single_file_mode:
        try:
            # Ensure both matrices have the same CpG positions
            matrix_B.columns = matrix_A.columns
            full_matrix = pd.concat([matrix_A, matrix_B], sort=False)
        except ValueError as e:
            logging.error("Matrix concat error in bin {}".format(bin))
            logging.debug(str(e))
            return None
    else:
        full_matrix = matrix_A

    # Get the data without the label column for clustering
    data_to_cluster = np.array(full_matrix)[:, :-1]

    # Create a DBSCAN classifier, cluster, and add the cluster classes to the dataframe
    clf = DBSCAN(min_samples=2)
    try:
        labels = clf.fit_predict(data_to_cluster)
    except ValueError as e:
        logging.error("ValueError when trying to cluster bin {}".format(bin))
        logging.debug(str(e))
        return None
    full_matrix['class'] = labels

    # Filter out any clusters smaller than the minimum
    filtered_matrix = self.filter_data_frame(full_matrix)
    if self.remove_noise:
        filtered_matrix = filtered_matrix[filtered_matrix['class'] != -1]

    return self.generate_individual_matrix_data(filtered_matrix, chromosome, bin_loc)
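
# A self-contained toy illustration of the clustering step in
# process_bins: sklearn's DBSCAN with min_samples=2 groups identical
# methylation patterns into clusters and labels unmatched reads as
# noise (-1). Toy data only; the real input is the per-bin read matrix.
def _sketch_dbscan_clusters():
    import numpy as np
    from sklearn.cluster import DBSCAN
    reads = np.array([[1, 1, 0],
                      [1, 1, 0],
                      [0, 0, 1],
                      [0, 0, 1],
                      [1, 0, 1]])
    labels = DBSCAN(min_samples=2).fit_predict(reads)
    return labels  # expected: [0, 0, 1, 1, -1]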