def main(args=None):
    args = parse_arguments().parse_args(args)

    matrices_name = args.matrix
    threads = args.threads
    matrices_list = cell_name_list(matrices_name)

    bulk_matrix = None

    all_data_collected = False
    thread_done = [False] * threads
    length_index = [None] * threads
    length_index[0] = 0
    matricesPerThread = len(matrices_list) // threads
    queue = [None] * threads
    process = [None] * threads
    for i in range(threads):
        if i < threads - 1:
            matrices_name_list = matrices_list[i * matricesPerThread:(i + 1) * matricesPerThread]
            length_index[i + 1] = length_index[i] + len(matrices_name_list)
        else:
            matrices_name_list = matrices_list[i * matricesPerThread:]

        queue[i] = Queue()
        process[i] = Process(target=create_bulk_matrix, kwargs=dict(
            pMatrixName=matrices_name,
            pMatricesList=matrices_name_list,
            pQueue=queue[i]
        ))
        process[i].start()

    while not all_data_collected:
        for i in range(threads):
            if queue[i] is not None and not queue[i].empty():
                csr_matrix_worker = queue[i].get()
                if bulk_matrix is None:
                    bulk_matrix = csr_matrix_worker
                else:
                    bulk_matrix.matrix += csr_matrix_worker.matrix

                queue[i] = None
                process[i].join()
                process[i].terminate()
                process[i] = None
                thread_done[i] = True
        all_data_collected = True
        for thread in thread_done:
            if not thread:
                all_data_collected = False
        time.sleep(1)

    bulk_matrix.save(args.outFileName)
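# --- Illustrative sketch (not part of scHiCExplorer) ------------------------
# Every tool in this module repeats the same fan-out/fan-in recipe: split the
# list of cell matrices into one chunk per thread, start one multiprocessing
# Process per chunk, and poll the per-process Queue objects until every worker
# has reported back. A minimal, self-contained version of that recipe is shown
# below; `pWorker` and `pCombine` are hypothetical placeholders for the
# tool-specific target function (e.g. create_bulk_matrix) and for the
# reduction step (e.g. summing the returned csr matrices).
from multiprocessing import Process, Queue
import time


def run_in_chunks(pItems, pThreads, pWorker, pCombine):
    # split the work into one chunk per process; the last chunk takes the rest
    chunk_size = max(1, len(pItems) // pThreads)
    chunks = [pItems[i * chunk_size:(i + 1) * chunk_size] for i in range(pThreads - 1)]
    chunks.append(pItems[(pThreads - 1) * chunk_size:])

    queues = [Queue() for _ in chunks]
    processes = [Process(target=pWorker, args=(chunk, queue)) for chunk, queue in zip(chunks, queues)]
    for process in processes:
        process.start()

    # gather: poll the queues until every worker has delivered its result
    result = None
    pending = set(range(len(processes)))
    while pending:
        for i in list(pending):
            if not queues[i].empty():
                result = pCombine(result, queues[i].get())
                processes[i].join()
                pending.discard(i)
        time.sleep(1)
    return result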
def main(args=None):
    args = parse_arguments().parse_args(args)

    matrices_list = cell_name_list(args.matrix)
    print('Filename: {}'.format(args.matrix))
    print('Contains {} single-cell matrices'.format(len(matrices_list)))
    print('The information stored via cooler.info of the first cell is: \n')
    cooler_file = cooler.Cooler(args.matrix + '::' + matrices_list[0])
    if cooler_file.info is not None:
        for key, value in cooler_file.info.items():
            print(key, value)
    print('Chromosomes: {}'.format(cooler_file.chromnames))

    if args.writeOutNames is not None:
        with open(args.writeOutNames, 'w') as file:
            for cell in matrices_list:
                # strip the leading '/cells/' group prefix from the internal cell path
                file.write("{}\n".format(cell[7:]))
def main(args=None):
    args = parse_arguments().parse_args(args)

    matrices_list = cell_name_list(args.matrix)

    # if args.labels and len(matrices_list) != len(args.labels):
    #     log.error("The number of labels does not match the number of matrices.")
    #     exit(0)
    if args.labels:
        label_list = [None] * len(matrices_list)
        with open(args.labels, 'r') as file:
            for line in file.readlines():
                try:
                    matrix_name, label_name = line.strip().split('\t')
                except Exception:
                    matrix_name, label_name = line.strip().split(' ')
                if matrix_name in matrices_list:
                    index = matrices_list.index(matrix_name)
                    label_list[index] = label_name
        args.labels = label_list
    else:
        label_list = [x.split('/')[2].split('.')[0].split('_')[2] for x in matrices_list]
        args.labels = label_list

    num_files = len(matrices_list)
    # note: map() is lazy here and its result is discarded, so this line has no effect
    map(lambda x: os.path.basename(x), matrices_list)
    # initialize results matrix
    results = np.zeros((num_files, num_files), dtype='float')

    rows, cols = np.triu_indices(num_files)
    correlation_opts = {'spearman': spearmanr,
                        'pearson': pearsonr}
    hic_mat_list = []
    max_value = None
    min_value = None
    all_mat = None
    all_nan = []

    # load csr matrices in parallel
    chromosome_indices = None
    cooler_obj = cooler.Cooler(args.matrix + '::' + matrices_list[0])
    binsDataFrame = cooler_obj.bins()[:]
    chromosome_indices = {}
    for chromosome in cooler_obj.chromnames:
        chromosome_indices[chromosome] = np.array(binsDataFrame.index[binsDataFrame['chrom'] == chromosome].tolist())

    threads = args.threads
    all_data_collected = False
    thread_done = [False] * threads
    length_index = [None] * threads
    matrix_list_threads = [None] * threads
    # all_mat_thread = [None] * threads
    length_index[0] = 0
    matricesPerThread = len(matrices_list) // threads
    queue = [None] * threads
    process = [None] * threads
    for i in range(threads):
        if i < threads - 1:
            matrices_name_list = matrices_list[i * matricesPerThread:(i + 1) * matricesPerThread]
            length_index[i + 1] = length_index[i] + len(matrices_name_list)
        else:
            matrices_name_list = matrices_list[i * matricesPerThread:]

        queue[i] = Queue()
        process[i] = Process(target=load_matrix_list, kwargs=dict(
            pMatrixName=args.matrix,
            pMatricesList=matrices_name_list,
            pArgs=args,
            pChromosomeIndices=chromosome_indices,
            pQueue=queue[i]
        ))
        process[i].start()

    fail_flag = False
    time_start = time.time()
    wait_threshold = 60 * 5
    while not all_data_collected:
        for i in range(threads):
            if queue[i] is not None and not queue[i].empty():
                csr_matrix_worker = queue[i].get()
                if isinstance(csr_matrix_worker, str):
                    log.error('{}'.format(csr_matrix_worker))
                    fail_flag = True
                else:
                    matrix_list_threads[i], all_mat_thread = csr_matrix_worker
                    if all_mat is None:
                        all_mat = all_mat_thread
                    else:
                        all_mat += all_mat_thread

                queue[i] = None
                process[i].join()
                process[i].terminate()
                process[i] = None
                thread_done[i] = True
                time_start = time.time()
        all_data_collected = True
        for thread in thread_done:
            if not thread:
                all_data_collected = False
        if time.time() - time_start > wait_threshold:
            log.error('The wait threshold time limit is reached. It seems parts of your data are too large to be passed back '
                      'through Python\'s queues. Please use either a higher number of threads or the `--saveMemory` option if available.')
            for i in range(threads):
                if process[i] is not None:
                    process[i].join()
                    process[i].terminate()
            exit(1)
        time.sleep(1)

    if fail_flag:
        # log.error(fail_message)
        exit(1)

    hic_mat_list = [item for sublist in matrix_list_threads for item in sublist]

    # remove nan bins
    rows_keep = cols_keep = np.delete(list(range(all_mat.shape[1])), all_nan)
    all_mat = all_mat[rows_keep, :][:, cols_keep]

    # make large matrix to correlate by
    # using sparse matrix tricks
    big_mat = None
    for mat in hic_mat_list:
        mat = mat[rows_keep, :][:, cols_keep]
        sample_vector = (mat + all_mat).data - all_mat.data
        if big_mat is None:
            big_mat = sample_vector
        else:
            big_mat = np.vstack([big_mat, sample_vector])

    # take the transpose such that columns represent each of the samples
    big_mat = np.ma.masked_invalid(big_mat).T

    grids = gridspec.GridSpec(num_files, num_files)
    grids.update(wspace=0, hspace=0)
    plt.figure(figsize=(2 * num_files, 2 * num_files))
    plt.rcParams['font.size'] = 8.0

    min_value = int(big_mat.min())
    max_value = int(big_mat.max())
    if (min_value % 2 == 0 and max_value % 2 == 0) or \
            (min_value % 1 == 0 and max_value % 2 == 1):
        # make one value odd and the other even
        max_value += 1

    # if args.log1p:
    #     major_locator = FixedLocator(list(range(min_value, max_value, 2)))
    #     minor_locator = FixedLocator(list(range(min_value, max_value, 1)))

    # parallel correlation computation
    all_data_collected = False
    thread_done = [False] * threads
    matricesPerThread = len(rows) // threads
    queue = [None] * threads
    process = [None] * threads
    for i in range(threads):
        if i < threads - 1:
            start_index = i * matricesPerThread
            end_index = (i + 1) * matricesPerThread
        else:
            start_index = i * matricesPerThread
            end_index = len(rows)

        queue[i] = Queue()
        process[i] = Process(target=compute_correlation, kwargs=dict(
            pCorrelationFunction=correlation_opts[args.method],
            pRows=rows,
            pColumns=cols,
            pBigMatrix=big_mat,
            pIndexStart=start_index,
            pIndexEnd=end_index,
            pResults=results,
            pQueue=queue[i]
        ))
        process[i].start()

    fail_flag = False
    time_start = time.time()
    while not all_data_collected:
        for i in range(threads):
            if queue[i] is not None and not queue[i].empty():
                correlated_result = queue[i].get()
                if isinstance(correlated_result, str):
                    log.error('{}'.format(correlated_result))
                    fail_flag = True
                else:
                    results += correlated_result

                queue[i] = None
                process[i].join()
                process[i].terminate()
                process[i] = None
                thread_done[i] = True
                time_start = time.time()
        all_data_collected = True
        for thread in thread_done:
            if not thread:
                all_data_collected = False
        time.sleep(1)

    if fail_flag:
        exit(1)

    results = results + np.triu(results, 1).T
    plot_correlation(results, args.labels,
                     args.outFileNameHeatmap,
                     args.zMax,
                     args.zMin,
                     args.colorMap,
                     image_format=args.plotFileFormat,
                     pFontSize=args.fontsize,
                     pFigureSize=args.figuresize)
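# --- Illustrative aside (not part of scHiCExplorer) --------------------------
# The "sparse matrix trick" above relies on `all_mat` being the sum of all
# cells, so its non-zero pattern contains the pattern of every single cell.
# `(mat + all_mat).data - all_mat.data` therefore yields the values of `mat`
# expanded onto the common non-zero positions of `all_mat`, which makes the
# per-cell vectors directly comparable row by row. A tiny self-contained
# demonstration with scipy (hypothetical helper, never called anywhere):
def _union_pattern_demo():
    import numpy as np
    from scipy.sparse import csr_matrix

    a = csr_matrix(np.array([[1, 0], [0, 2]]))
    b = csr_matrix(np.array([[0, 3], [0, 4]]))
    union = a + b                             # non-zero pattern: (0,0), (0,1), (1,1)
    vector_a = (a + union).data - union.data  # -> array([1, 0, 2]), a's values on that pattern
    vector_b = (b + union).data - union.data  # -> array([0, 3, 4]), b's values on that pattern
    return vector_a, vector_b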
def main(args=None):
    args = parse_arguments().parse_args(args)

    matrices_name = args.matrix
    threads = args.threads
    matrices_list = cell_name_list(matrices_name)

    all_samples_number = len(matrices_list)

    if args.runChromosomeCheck:
        #####################################################
        # Detect broken chromosomes and remove these matrices
        #####################################################
        keep_matrices_thread = [None] * threads
        all_data_collected = False
        thread_done = [False] * threads
        length_index = [None] * threads
        length_index[0] = 0
        matricesPerThread = len(matrices_list) // threads
        queue = [None] * threads
        process = [None] * threads
        for i in range(threads):
            if i < threads - 1:
                matrices_name_list = matrices_list[i * matricesPerThread:(i + 1) * matricesPerThread]
                length_index[i + 1] = length_index[i] + len(matrices_name_list)
            else:
                matrices_name_list = matrices_list[i * matricesPerThread:]

            queue[i] = Queue()
            process[i] = Process(target=compute_contains_all_chromosomes, kwargs=dict(
                pMatrixName=matrices_name,
                pMatricesList=matrices_name_list,
                pChromosomes=args.chromosomes,
                pQueue=queue[i]
            ))
            process[i].start()

        while not all_data_collected:
            for i in range(threads):
                if queue[i] is not None and not queue[i].empty():
                    worker_result = queue[i].get()
                    keep_matrices_thread[i] = worker_result
                    queue[i] = None
                    process[i].join()
                    process[i].terminate()
                    process[i] = None
                    thread_done[i] = True
            all_data_collected = True
            for thread in thread_done:
                if not thread:
                    all_data_collected = False
            time.sleep(1)

        keep_matrices_chromosome_names = np.array([item for sublist in keep_matrices_thread for item in sublist], dtype=bool)
        matrices_name_chromosome_names = np.array(matrices_list)
        matrices_list = matrices_name_chromosome_names[keep_matrices_chromosome_names]
        matrices_remove = matrices_name_chromosome_names[~keep_matrices_chromosome_names]

    #######################################
    # Compute read coverage and density
    #######################################
    read_coverage_thread = [None] * threads
    sparsity_thread = [None] * threads

    all_data_collected = False
    thread_done = [False] * threads
    length_index = [None] * threads
    length_index[0] = 0
    matricesPerThread = len(matrices_list) // threads
    queue = [None] * threads
    process = [None] * threads
    for i in range(threads):
        if i < threads - 1:
            matrices_name_list = matrices_list[i * matricesPerThread:(i + 1) * matricesPerThread]
            length_index[i + 1] = length_index[i] + len(matrices_name_list)
        else:
            matrices_name_list = matrices_list[i * matricesPerThread:]

        queue[i] = Queue()
        process[i] = Process(target=compute_read_coverage_sparsity, kwargs=dict(
            pMatrixName=matrices_name,
            pMatricesList=matrices_name_list,
            pXDimension=len(matrices_list),
            pMaximumRegionToConsider=args.maximumRegionToConsider,
            pQueue=queue[i]
        ))
        process[i].start()

    while not all_data_collected:
        for i in range(threads):
            if queue[i] is not None and not queue[i].empty():
                worker_result = queue[i].get()
                read_coverage_thread[i] = worker_result[0]
                sparsity_thread[i] = worker_result[1]
                queue[i] = None
                process[i].join()
                process[i].terminate()
                process[i] = None
                thread_done[i] = True
        all_data_collected = True
        for thread in thread_done:
            if not thread:
                all_data_collected = False
        time.sleep(1)

    read_coverage = np.array([item for sublist in read_coverage_thread for item in sublist])
    sparsity = np.array([item for sublist in sparsity_thread for item in sublist])

    plt.close()
    plt.hist(read_coverage, bins=100)
    plt.suptitle('Read coverage of {}'.format(os.path.basename(args.matrix)), fontsize=12)
    plt.grid(True)
    if args.minimumReadCoverage > 0:
        plt.axvline(args.minimumReadCoverage, color='r', linestyle='dashed', linewidth=1)
        plt.title('Matrices with a read coverage < {} are removed.'.format(args.minimumReadCoverage), fontsize=10)
    plt.xlabel('Read coverage')
    plt.ylabel('Frequency')
    plt.savefig(args.outFileNameReadCoverage, dpi=args.dpi)
    plt.close()

    plt.hist(sparsity, bins=100)
    plt.suptitle('Density of {}'.format(os.path.basename(args.matrix)), fontsize=12)
    if args.minimumDensity > 0:
        plt.title('Matrices with a density < {} are removed.'.format(args.minimumDensity), fontsize=10)
    plt.grid(True)
    plt.xlabel('Density')
    plt.ylabel('Frequency')
    if args.minimumDensity > 0:
        plt.axvline(args.minimumDensity, color='r', linestyle='dashed', linewidth=1)
    plt.savefig(args.outFileNameDensity, dpi=args.dpi)
    plt.close()

    mask_read_coverage = read_coverage >= args.minimumReadCoverage
    mask_sparsity = sparsity >= args.minimumDensity
    mask = np.logical_and(mask_read_coverage, mask_sparsity)
    matrices_list_filtered = np.array(matrices_list)[mask]
    sum_read_coverage = np.sum(~mask_read_coverage)
    sum_sparsity = np.sum(~mask_sparsity)

    if not args.plotOnly:
        np.savetxt('accepted_matrices.txt', matrices_list_filtered, fmt="%s")
        np.savetxt('rejected_matrices.txt', np.array(matrices_list)[~mask], fmt="%s")

        if os.path.exists(args.outputScool):
            os.remove(args.outputScool)

        cooler.fileops.cp(args.matrix + '::/bins', args.outputScool + '::/bins')
        cooler.fileops.cp(args.matrix + '::/chroms', args.outputScool + '::/chroms')

        with cooler.util.open_hdf5(args.matrix) as source:
            attributes_dict = {}
            for k, v in source.attrs.items():
                attributes_dict[k] = v

            attributes_dict['ncells'] = len(matrices_list_filtered)
            attributes_dict['creation-date'] = datetime.now().isoformat()
            with h5py.File(args.outputScool, "r+") as f:
                h5 = f['/']
                h5.attrs.update(attributes_dict)

        content_bins_ln = ['chrom', 'start', 'end']
        for matrix in matrices_list_filtered:
            cooler.fileops.cp(args.matrix + '::' + matrix + '/pixels', args.outputScool + '::' + matrix + '/pixels')
            cooler.fileops.cp(args.matrix + '::' + matrix + '/indexes', args.outputScool + '::' + matrix + '/indexes')
            cooler.fileops.ln(args.outputScool + '::' + '/chroms', args.outputScool + '::' + matrix + '/chroms')
            cooler.fileops.ln(args.outputScool + '::' + '/bins/chrom', args.outputScool + '::' + matrix + '/bins/chrom')
            cooler.fileops.ln(args.outputScool + '::' + '/bins/start', args.outputScool + '::' + matrix + '/bins/start')
            cooler.fileops.ln(args.outputScool + '::' + '/bins/end', args.outputScool + '::' + matrix + '/bins/end')

            group_dataset_list = cooler.fileops.ls(args.matrix + '::' + matrix + '/bins/')
            for datatype in group_dataset_list:
                last_element = datatype.split('/')[-1]
                if not (last_element) in content_bins_ln and last_element != '':
                    cooler.fileops.cp(args.matrix + '::' + matrix + '/bins/' + last_element, args.outputScool + '::' + matrix + '/bins/' + last_element)

            with cooler.util.open_hdf5(args.matrix) as source:  # , cooler.util.open_hdf5(args.outputScool + '::' + matrix) as destination:
                attributes_dict = {}
                for k, v in source[matrix].attrs.items():
                    attributes_dict[k] = v
                with h5py.File(args.outputScool, "r+") as f:
                    h5 = f[matrix]
                    h5.attrs.update(attributes_dict)

    ##################
    # Create QC report
    ##################
    header = '# QC report for single-cell Hi-C data generated by scHiCExplorer ' + __version__ + '\n'
    matrix_statistics = 'scHi-C sample contained {} cells:\n'.format(all_samples_number)
    if args.runChromosomeCheck:
        matrices_bad_chromosomes = 'Number of removed matrices containing bad chromosomes {}\n'.format(len(matrices_remove))

    matrices_low_read_coverage = 'Number of removed matrices due to low read coverage (< {}): {}\n'.format(args.minimumReadCoverage, sum_read_coverage)
    matrices_too_sparse = 'Number of removed matrices due to too many zero bins (< {} density, within {} relative genomic distance): {}\n'.format(args.minimumDensity, args.maximumRegionToConsider, sum_sparsity)
    matrix_qc = '{} samples passed the quality control. Note that matrices with a low read coverage may also be matrices with a low density, so the two groups can overlap.'.format(len(matrices_list_filtered))

    with open(args.outFileNameQCReport, 'w') as file:
        file.write(header)
        file.write(matrix_statistics)
        if args.runChromosomeCheck:
            file.write(matrices_bad_chromosomes)
        file.write(matrices_low_read_coverage)
        file.write(matrices_too_sparse)
        file.write(matrix_qc)
def main(args=None):
    args = parse_arguments().parse_args(args)

    if args.region is not None and args.chromosomes is not None:
        raise Exception('--chromosomes and --region are mutually exclusive.')

    matrices_list = cell_name_list(args.matrix)

    columns = 4
    if len(matrices_list) < columns:
        columns = len(matrices_list)
    rows = int(np.ceil(len(matrices_list) / columns))
    if rows < 1:
        rows = 1

    if len(matrices_list) > 12:
        figsize = (5, 5.5)
    elif len(matrices_list) > 8:
        figsize = (5, 4.5)
    elif len(matrices_list) > 4:
        figsize = (5, 4)
    else:
        figsize = (5, 3)

    f, axes = plt.subplots(rows, columns, figsize=figsize)
    title_string = 'Consensus matrices of {}'.format(os.path.basename(args.matrix.split('.scool')[0]))
    if args.chromosomes:
        title_string += ' on chromosome: {}'.format(' '.join(args.chromosomes))
    elif args.region:
        title_string += ' for {}'.format(args.region)
    else:
        title_string += ' on all chromosomes'

    if not args.no_header:
        plt.suptitle(title_string, fontsize=args.fontsize)

    from mpl_toolkits.axes_grid1 import make_axes_locatable
    for i, matrix in enumerate(matrices_list):
        if args.chromosomes is not None and len(args.chromosomes) == 1:
            hic_ma = hm.hiCMatrix(pMatrixFile=args.matrix + '::' + matrix, pChrnameList=args.chromosomes)
        elif args.region is not None:
            hic_ma = hm.hiCMatrix(pMatrixFile=args.matrix + '::' + matrix, pChrnameList=[args.region])
        else:
            hic_ma = hm.hiCMatrix(pMatrixFile=args.matrix + '::' + matrix)
        if args.chromosomes:
            hic_ma.keepOnlyTheseChr(args.chromosomes)

        matrix_data = hic_ma.matrix
        matrix_data = matrix_data.toarray()
        mask = matrix_data == 0
        try:
            matrix_data[mask] = np.nanmin(matrix_data[mask == False])
        except ValueError:
            log.info('Matrix contains only 0. Set all values to {}'.format(np.finfo(float).tiny))
            matrix_data[mask] = np.finfo(float).tiny
        if np.isnan(matrix_data).any() or np.isinf(matrix_data).any():
            mask_nan = np.isnan(matrix_data)
            mask_inf = np.isinf(matrix_data)
            matrix_data[mask_nan] = np.nanmin(matrix_data[mask_nan == False])
            matrix_data[mask_inf] = np.nanmin(matrix_data[mask_inf == False])

        matrix_data += 1
        if args.log1p:
            matrix_data += 1
            norm = LogNorm()
        else:
            norm = None

        if rows == 1:
            im = axes[i % columns].imshow(matrix_data, cmap=args.colorMap, norm=norm)
            axes[i % columns].get_xaxis().set_ticks([])
            axes[i % columns].get_yaxis().set_ticks([])
            axes[i % columns].yaxis.set_visible(False)
            axes[i % columns].set_xlabel(str(matrix.split('/')[-1].split('cluster_')[-1]))
        else:
            im = axes[i // columns, i % columns].imshow(matrix_data, cmap=args.colorMap, norm=norm)
            axes[i // columns, i % columns].get_xaxis().set_ticks([])
            axes[i // columns, i % columns].get_yaxis().set_ticks([])
            axes[i // columns, i % columns].yaxis.set_visible(False)
            axes[i // columns, i % columns].set_xlabel(str(matrix.split('/')[-1].split('cluster_')[-1].split(':')[0]))

    # switch off the unused subplots in the last row
    number_of_plots = len(matrices_list)
    i = -1
    while rows * columns > number_of_plots:
        axes[-1, i].axis('off')
        number_of_plots += 1
        i -= 1

    plt.tight_layout()

    f.subplots_adjust(right=0.8)
    cbar_ax = f.add_axes([0.85, 0.15, 0.05, 0.7])
    f.colorbar(im, cax=cbar_ax)

    plt.savefig(args.outFileName, dpi=args.dpi)
    plt.close()
def main(args=None):
    args = parse_arguments().parse_args(args)
    log.debug(args)
    matrix_file_handler_object_list = []

    matrices_list = cell_name_list(args.matrix)

    if args.action in ['extractToCool', 'extractScool']:
        if args.cellList is not None:
            matrix_list_tmp = []
            with open(args.cellList, 'r') as file:
                for line in file:
                    values = line.strip()
                    log.debug('values {}'.format(values))
                    if not values.startswith('/cells'):
                        values = '/cells/' + values
                    if values in matrices_list:
                        matrix_list_tmp.append(values)

            matrices_list = matrix_list_tmp

    if len(matrices_list) == 0:
        raise OSError('No cells for processing. Terminating.')

    if len(matrices_list) < args.threads:
        args.threads = len(matrices_list)

    matrixFileHandlerInput = MatrixFileHandler(pFileType='cool', pMatrixFile=args.matrix + "::" + matrices_list[0])
    _matrix, cut_intervals_all, nan_bins, \
        distance_counts, correction_factors = matrixFileHandlerInput.load()

    threads = args.threads

    matrixFileHandler_list = [None] * args.threads
    process = [None] * args.threads
    queue = [None] * args.threads
    thread_done = [False] * args.threads

    matricesPerThread = len(matrices_list) // threads
    for i in range(args.threads):
        if i < threads - 1:
            matrices_name_list = matrices_list[i * matricesPerThread:(i + 1) * matricesPerThread]
        else:
            matrices_name_list = matrices_list[i * matricesPerThread:]

        queue[i] = Queue()
        process[i] = Process(target=load_cool_files, kwargs=dict(
            pMatrixName=args.matrix,
            pMatricesList=matrices_name_list,
            pCutIntervals=cut_intervals_all,
            pQueue=queue[i]
        ))
        process[i].start()

    all_data_collected = False
    fail_flag = False
    fail_message = ''
    while not all_data_collected:
        for i in range(threads):
            if queue[i] is not None and not queue[i].empty():
                matrixFileHandler_list[i] = queue[i].get()
                if 'Fail:' in matrixFileHandler_list[i]:
                    fail_flag = True
                    fail_message = matrixFileHandler_list[i][6:]
                queue[i] = None
                process[i].join()
                process[i].terminate()
                process[i] = None
                thread_done[i] = True
        all_data_collected = True
        for thread in thread_done:
            if not thread:
                all_data_collected = False
        time.sleep(1)

    if fail_flag:
        log.error(fail_message)
        exit(1)

    matrix_file_handler_object_list = [item for sublist in matrixFileHandler_list for item in sublist]

    if args.action in ['extractScool', 'update']:
        matrixFileHandler = MatrixFileHandler(pFileType='scool')
        matrixFileHandler.matrixFile.coolObjectsList = matrix_file_handler_object_list
        matrixFileHandler.save(args.outFileName, pSymmetric=True, pApplyCorrection=False)
    else:
        if not os.path.exists(args.outFileName):
            try:
                os.makedirs(args.outFileName)
            except OSError as exc:  # Guard against race condition
                if exc.errno != errno.EEXIST:
                    raise
        for matrixFileHandler in matrix_file_handler_object_list:
            matrixFileHandler.save(args.outFileName + '/' + matrixFileHandler.matrixFile.matrixFileName + '.cool', pApplyCorrection=True, pSymmetric=True)
def main(args=None):
    args = parse_arguments().parse_args(args)

    threads = args.threads

    merged_matrices = [None] * threads

    matrices_list = cell_name_list(args.matrix)

    if len(matrices_list) < threads:
        threads = len(matrices_list)

    all_data_collected = False
    thread_done = [False] * threads
    length_index = [None] * threads
    length_index[0] = 0
    matricesPerThread = len(matrices_list) // threads
    queue = [None] * threads
    process = [None] * threads
    for i in range(threads):
        if i < threads - 1:
            matrices_name_list = matrices_list[i * matricesPerThread:(i + 1) * matricesPerThread]
            length_index[i + 1] = length_index[i] + len(matrices_name_list)
        else:
            matrices_name_list = matrices_list[i * matricesPerThread:]

        queue[i] = Queue()
        process[i] = Process(target=compute_merge, kwargs=dict(
            pMatrixName=args.matrix,
            pMatrixList=matrices_name_list,
            pRunningWindow=args.runningWindow,
            pNumBins=args.numBins,
            pQueue=queue[i]
        ))
        process[i].start()

    fail_flag = False
    fail_message = ''
    while not all_data_collected:
        for i in range(threads):
            if queue[i] is not None and not queue[i].empty():
                # log.debug('i {}'.format(i))
                # log.debug('len(queue) {}'.format(len(queue)))
                # log.debug('len(merged_matrices) {}'.format(len(merged_matrices)))
                merged_matrices[i] = queue[i].get()
                if isinstance(merged_matrices[i][0], str) and merged_matrices[i][0].startswith('Fail: '):
                    fail_flag = True
                    fail_message = merged_matrices[i][0]

                queue[i] = None
                process[i].join()
                process[i].terminate()
                process[i] = None
                thread_done[i] = True
        all_data_collected = True
        for thread in thread_done:
            if not thread:
                all_data_collected = False
        time.sleep(1)

    if fail_flag:
        log.error('{}'.format(fail_message))
        exit(1)

    matrixFileHandlerObjects_list = [item for sublist in merged_matrices for item in sublist]

    matrixFileHandler = MatrixFileHandler(pFileType='scool')
    matrixFileHandler.matrixFile.coolObjectsList = matrixFileHandlerObjects_list
    matrixFileHandler.save(args.outFileName, pSymmetric=True, pApplyCorrection=False)
def main(args=None):
    args = parse_arguments().parse_args(args)

    threads = args.threads
    matrixFileHandler_list = [None] * threads

    matrices_list = cell_name_list(args.matrix)
    if len(matrices_list) < threads:
        threads = len(matrices_list)

    matrixFileHandlerInput = MatrixFileHandler(pFileType='cool', pMatrixFile=args.matrix + "::" + matrices_list[0])
    _matrix, cut_intervals_all, nan_bins, \
        distance_counts, correction_factors = matrixFileHandlerInput.load()

    all_data_collected = False
    thread_done = [False] * threads
    length_index = [None] * threads
    length_index[0] = 0
    matricesPerThread = len(matrices_list) // threads
    queue = [None] * threads
    process = [None] * threads
    print('Threads: ' + str(threads))
    for i in range(threads):
        if i < threads - 1:
            matrices_name_list = matrices_list[i * matricesPerThread:(i + 1) * matricesPerThread]
            length_index[i + 1] = length_index[i] + len(matrices_name_list)
        else:
            matrices_name_list = matrices_list[i * matricesPerThread:]

        queue[i] = Queue()
        process[i] = Process(target=compute_correction, kwargs=dict(
            pMatrixName=args.matrix,
            pMatrixList=matrices_name_list,
            pCutIntervals=cut_intervals_all,
            pQueue=queue[i]
        ))
        process[i].start()

    fail_flag = False
    while not all_data_collected:
        for i in range(threads):
            if queue[i] is not None and not queue[i].empty():
                matrixFileHandler_list[i] = queue[i].get()
                # csr_matrix_worker = queue[i].get()
                if isinstance(matrixFileHandler_list[i], str):
                    log.error('{}'.format(matrixFileHandler_list[i]))
                    fail_flag = True

                queue[i] = None
                process[i].join()
                process[i].terminate()
                process[i] = None
                thread_done[i] = True
        all_data_collected = True
        for thread in thread_done:
            if not thread:
                all_data_collected = False
        time.sleep(1)

    if fail_flag:
        exit(1)

    matrix_file_handler_object_list = [item for sublist in matrixFileHandler_list for item in sublist]

    matrixFileHandler = MatrixFileHandler(pFileType='scool')
    matrixFileHandler.matrixFile.coolObjectsList = matrix_file_handler_object_list
    matrixFileHandler.save(args.outFileName, pSymmetric=True, pApplyCorrection=False)
def main(args=None):
    args = parse_arguments().parse_args(args)

    matrices_name = args.matrix
    threads = args.threads
    matrices_list = cell_name_list(matrices_name)

    read_coverage = [None] * threads

    all_data_collected = False
    thread_done = [False] * threads
    matricesPerThread = len(matrices_list) // threads
    queue = [None] * threads
    process = [None] * threads
    for i in range(threads):
        if i < threads - 1:
            matrices_name_list = matrices_list[i * matricesPerThread:(i + 1) * matricesPerThread]
        else:
            matrices_name_list = matrices_list[i * matricesPerThread:]

        queue[i] = Queue()
        process[i] = Process(target=compute_read_distribution, kwargs=dict(
            pMatrixName=matrices_name,
            pMatricesList=matrices_name_list,
            pMaximalDistance=args.maximalDistance,
            pChromosomes=args.chromosomes,
            pQueue=queue[i]
        ))
        process[i].start()

    while not all_data_collected:
        for i in range(threads):
            if queue[i] is not None and not queue[i].empty():
                read_coverage[i], resolution = queue[i].get()
                queue[i] = None
                process[i].join()
                process[i].terminate()
                process[i] = None
                thread_done[i] = True
        all_data_collected = True
        for thread in thread_done:
            if not thread:
                all_data_collected = False
        time.sleep(1)

    read_distributions = []
    for thread_data in read_coverage:
        for matrix_data in thread_data:
            read_distributions.append(matrix_data)

    read_distributions = np.array(read_distributions)

    clusters = {}
    clusters_svl = {}
    short_range_distance = args.distanceShortRange // resolution
    long_range_distance = args.distanceLongRange // resolution
    with open(args.clusters, 'r') as cluster_file:
        for i, line in enumerate(cluster_file.readlines()):
            line = line.strip()
            line_ = line.split(' ')[1]
            if int(line_) in clusters:
                clusters[int(line_)].append(read_distributions[i])
                clusters_svl[int(line_)].append(np.sum(read_distributions[i][:short_range_distance]) / np.sum(read_distributions[i][short_range_distance:long_range_distance]))
            else:
                clusters[int(line_)] = [read_distributions[i]]
                clusters_svl[int(line_)] = [np.sum(read_distributions[i][:short_range_distance]) / np.sum(read_distributions[i][short_range_distance:long_range_distance])]

    if args.orderBy == 'svl':
        for i, cluster_key in enumerate(clusters.keys()):
            clusters[cluster_key] = np.array(clusters[cluster_key])
            clusters_svl[cluster_key] = np.array(clusters_svl[cluster_key])
            sorted_indices = np.argsort(clusters_svl[cluster_key])
            clusters[cluster_key] = clusters[cluster_key][sorted_indices]

    cluster_to_plot = []

    clusters_list = []
    cluster_size = []
    key_list_cluster = sorted(clusters.keys())
    for i, key in enumerate(key_list_cluster):
        cluster_to_plot = []
        for cluster_item in clusters[key]:
            cluster_to_plot.append(cluster_item)
        clusters_list.append(np.array(cluster_to_plot))
        cluster_size.append(len(cluster_to_plot))

    cluster_size = np.array(cluster_size)
    cluster_size = (1.0 - 0.1) * (cluster_size - min(cluster_size)) / (max(cluster_size) - min(cluster_size)) + (0.1)
    cluster_size = list(cluster_size)

    all_data = None
    index_clusters = []
    cluster_ticks = []
    cluster_ticks_top = []
    ticks_position = []
    for i, cluster in enumerate(clusters_list):
        if all_data is None:
            all_data = cluster
            index_clusters.append(len(cluster))
            ticks_position.append(0 + len(cluster) // 2)
        else:
            all_data = np.append(all_data, cluster, axis=0)
            index_clusters.append(index_clusters[i - 1] + len(cluster))
            ticks_position.append(index_clusters[i - 1] + len(cluster) // 2)

        cluster_ticks.append('Cluster {}: {} cells'.format((i), len(cluster)))
        cluster_ticks_top.append('Cluster {}'.format(i))

    if len(matrices_list) > 1000:
        fig = plt.figure(figsize=(8, 3))
    elif len(matrices_list) > 500:
        fig = plt.figure(figsize=(5, 3))
    elif len(matrices_list) > 250:
        fig = plt.figure(figsize=(4, 3))
    else:
        fig = plt.figure(figsize=(3, 3))

    plt.imshow(all_data.T, cmap=args.colorMap, norm=LogNorm(), aspect="auto")
    for index in index_clusters:
        plt.axvline(index - 1, color='black', linewidth=0.4)

    y_ticks = []
    y_labels = []
    unit = 'MB'
    factor = args.maximalDistance // 10
    if factor >= 1000000:
        unit = 'MB'
    elif factor >= 1000:
        unit = 'kb'
    else:
        unit = 'b'
    for i in range(0, (args.maximalDistance) + 1, resolution):
        if i % (factor) == 0:
            y_ticks.append(i // resolution)
            label = ''
            if factor >= 1000000:
                label = str(i // 1000000)
            elif factor >= 1000:
                label = str(i // 1000)
            else:
                label = str(i)
            y_labels.append(label + unit)

    plt.yticks(ticks=y_ticks, labels=y_labels, fontsize=args.fontsize)
    plt.gca().invert_yaxis()
    if args.ticks:
        plt.xticks(ticks=ticks_position, labels=cluster_ticks_top, rotation=args.rotationX, fontsize=args.fontsize)
    elif args.legend:
        plt.tick_params(
            axis='x',          # changes apply to the x-axis
            which='both',      # both major and minor ticks are affected
            bottom=False,      # ticks along the bottom edge are off
            top=False,         # ticks along the top edge are off
            labelbottom=False)
        if len(cluster_ticks) < 5:
            ncols = 1
        else:
            ncols = 3
        leg = plt.legend(cluster_ticks, loc='upper center', bbox_to_anchor=(0.5, -0.01), fancybox=True, shadow=False, ncol=ncols, fontsize=args.fontsize)
        for item in leg.legendHandles:
            item.set_visible(False)
    else:
        plt.tick_params(
            axis='x',          # changes apply to the x-axis
            which='both',      # both major and minor ticks are affected
            bottom=False,      # ticks along the bottom edge are off
            top=False,         # ticks along the top edge are off
            labelbottom=False)

    fig.autofmt_xdate()
    cbar = plt.colorbar()
    cbar.ax.set_ylabel('% contacts', rotation=270, fontsize=args.fontsize)
    cbar.ax.yaxis.set_label_coords(args.fontsize, 0.5)
    cbar.ax.invert_yaxis()
    cbar.ax.tick_params(labelsize=args.fontsize)

    plt.tight_layout()
    plt.savefig(args.outFileName, dpi=args.dpi)
    plt.close()
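# --- Illustrative aside (not part of scHiCExplorer) --------------------------
# The per-cell ordering value used above ("svl") is the ratio of short-range
# to long-range contacts, computed on the distance-binned read distribution of
# each cell. A minimal stand-alone version of that computation, with
# hypothetical example values (2 Mb / 12 Mb boundaries at 1 Mb resolution):
import numpy as np


def short_vs_long_ratio(pReadDistribution, pShortRange, pLongRange, pResolution):
    short_bins = pShortRange // pResolution
    long_bins = pLongRange // pResolution
    return np.sum(pReadDistribution[:short_bins]) / np.sum(pReadDistribution[short_bins:long_bins])


# Example: contacts per 1 Mb distance bin for one cell
# distribution = np.array([120, 80, 40, 20, 10, 8, 6, 5, 4, 3, 2, 2])
# short_vs_long_ratio(distribution, 2000000, 12000000, 1000000)  # -> 200 / 100 = 2.0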
def main(args=None):
    args = parse_arguments().parse_args(args)

    matrices_name = args.matrix
    threads = args.threads
    matrices_list = cell_name_list(matrices_name)

    if not os.path.exists(args.outputFolder + '/cells'):
        try:
            os.makedirs(args.outputFolder + '/cells')
        except OSError as exc:  # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise

    if threads > len(matrices_list):
        threads = len(matrices_list)

    # load bin ids only once
    cooler_obj = cooler.Cooler(matrices_name + '::' + matrices_list[0])
    bins = cooler_obj.bins()[:]

    all_data_collected = False
    thread_done = [False] * threads
    cell_name_array_thread = [None] * threads
    matricesPerThread = len(matrices_list) // threads
    queue = [None] * threads
    process = [None] * threads
    for i in range(threads):
        if i < threads - 1:
            matrices_name_list = matrices_list[i * matricesPerThread:(i + 1) * matricesPerThread]
        else:
            matrices_name_list = matrices_list[i * matricesPerThread:]

        queue[i] = Queue()
        process[i] = Process(target=convert_files, kwargs=dict(
            pMatrixName=matrices_name,
            pMatricesList=matrices_name_list,
            pBinsDataFrame=bins,
            pOutputFolder=args.outputFolder,
            pFormat=args.format,
            pQueue=queue[i]
        ))
        process[i].start()

    while not all_data_collected:
        for i in range(threads):
            if queue[i] is not None and not queue[i].empty():
                cell_name_array_thread[i] = queue[i].get()
                queue[i] = None
                process[i].join()
                process[i].terminate()
                process[i] = None
                thread_done[i] = True
        all_data_collected = True
        for thread in thread_done:
            if not thread:
                all_data_collected = False
        time.sleep(1)

    for subset in cell_name_array_thread:
        if subset[0] is None:
            exit(1)

    cell_name_array = [item for sublist in cell_name_array_thread for item in sublist]

    # write cell names to file
    with open(args.outputCellNameFile, 'w') as file:
        for cell_name in cell_name_array:
            file.write('{}\n'.format(cell_name))

    # write chromsizes to file
    with open(args.outputChromosomeSize, 'w') as file:
        for chromosome_name, size in cooler_obj.chromsizes.items():
            file.write('{}\t{}\n'.format(chromosome_name, size))
def main(args=None):
    args = parse_arguments().parse_args(args)
    # if args.threads <= 4:
    #     log.error('')
    #     exit(1)
    outputFolder = os.path.dirname(os.path.abspath(args.outFileName)) + '/'
    raw_file_name = os.path.splitext(os.path.basename(args.outFileName))[0]

    if args.numberOfNearestNeighbors is None:
        cooler_obj = cooler.Cooler(args.matrix)
        args.numberOfNearestNeighbors = int(cooler_obj.info['ncells'])

    if args.cell_coloring_type:
        cell_name_cell_type_dict = {}
        cell_type_color_dict = {}
        color_cell_type_dict = {}
        cell_type_counter = 0
        with open(args.cell_coloring_type, 'r') as file:
            for i, line in enumerate(file.readlines()):
                line = line.strip()
                try:
                    cell_name, cell_type = line.split('\t')
                except Exception:
                    cell_name, cell_type = line.split(' ')
                cell_name_cell_type_dict[cell_name] = cell_type
                if cell_type not in cell_type_color_dict:
                    cell_type_color_dict[cell_type] = cell_type_counter
                    color_cell_type_dict[cell_type_counter] = cell_type
                    cell_type_counter += 1

    if args.cell_coloring_batch:
        cell_name_cell_type_dict_batch = {}
        cell_type_color_dict_batch = {}
        color_cell_type_dict_batch = {}
        cell_type_counter_batch = 0
        with open(args.cell_coloring_batch, 'r') as file:
            for i, line in enumerate(file.readlines()):
                line = line.strip()
                try:
                    cell_name, cell_type = line.split('\t')
                except Exception:
                    cell_name, cell_type = line.split(' ')
                cell_name_cell_type_dict_batch[cell_name] = cell_type
                if cell_type not in cell_type_color_dict_batch:
                    cell_type_color_dict_batch[cell_type] = cell_type_counter_batch
                    color_cell_type_dict_batch[cell_type_counter_batch] = cell_type
                    cell_type_counter_batch += 1

    if args.clusterMethod == 'spectral':
        cluster_object = SpectralClustering(n_clusters=args.numberOfClusters, affinity='nearest_neighbors', n_jobs=args.threads, random_state=0)
    elif args.clusterMethod == 'kmeans':
        cluster_object = KMeans(n_clusters=args.numberOfClusters, random_state=0, n_jobs=args.threads, precompute_distances=True)
    elif args.clusterMethod.startswith('agglomerative'):
        for linkage in ['ward', 'complete', 'average', 'single']:
            if linkage in args.clusterMethod:
                cluster_object = AgglomerativeClustering(n_clusters=args.numberOfClusters, linkage=linkage)
                break
    elif args.clusterMethod == 'birch':
        cluster_object = Birch(n_clusters=args.numberOfClusters)
    else:
        log.error('No valid cluster method given: {}'.format(args.clusterMethod))

    umap_params_dict = {}

    if not args.noUMAP:
        for param in vars(args):
            if 'umap_' in param:
                umap_params_dict[param] = vars(args)[param]
        umap_params_dict['umap_random'] = 42

    # log.debug(umap_params_dict)

    if args.saveMemory:
        matrices_list = cell_name_list(args.matrix)
        max_nnz = 0
        for matrix in matrices_list:
            cooler_obj = cooler.Cooler(args.matrix + '::' + matrix)
            nnz = cooler_obj.info['nnz']
            if max_nnz < nnz:
                max_nnz = nnz
        minHash_object = None
        matricesPerRun = int(len(matrices_list) * args.shareOfMatrixToBeTransferred)
        if matricesPerRun < 1:
            matricesPerRun = 1
        chromosome_indices = None
        if args.intraChromosomalContactsOnly:
            cooler_obj = cooler.Cooler(args.matrix + '::' + matrices_list[0])
            binsDataFrame = cooler_obj.bins()[:]
            chromosome_indices = {}
            for chromosome in cooler_obj.chromnames:
                chromosome_indices[chromosome] = np.array(binsDataFrame.index[binsDataFrame['chrom'] == chromosome].tolist())

        for j, i in enumerate(range(0, len(matrices_list), matricesPerRun)):
            if i < len(matrices_list) - 1:
                matrices_share = matrices_list[i:i + matricesPerRun]
            else:
                matrices_share = matrices_list[i:]
            neighborhood_matrix, matrices_list_share = open_and_store_matrix(args.matrix, matrices_share, 0, len(matrices_share), args.chromosomes, args.intraChromosomalContactsOnly, chromosome_indices)
            if minHash_object is None:
                minHash_object = MinHash(n_neighbors=args.numberOfNearestNeighbors, number_of_hash_functions=args.numberOfHashFunctions,
                                         number_of_cores=args.threads, shingle_size=0, fast=args.euclideanModeMinHash,
                                         maxFeatures=int(max_nnz), absolute_numbers=False)

            if j == 0:
                minHash_object.fit(neighborhood_matrix)
            else:
                minHash_object.partial_fit(X=neighborhood_matrix)

        precomputed_graph = minHash_object.kneighbors_graph(mode='distance')
        precomputed_graph = np.nan_to_num(precomputed_graph)
        precomputed_graph.data[np.isinf(precomputed_graph.data)] = 0
        if not args.noPCA:
            pca = PCA(n_components=min(precomputed_graph.shape) - 1)
            precomputed_graph = np.nan_to_num(precomputed_graph.todense())
            precomputed_graph[np.isinf(precomputed_graph)] = 0
            precomputed_graph = pca.fit_transform(precomputed_graph)

            if args.dimensionsPCA:
                args.dimensionsPCA = min(args.dimensionsPCA, precomputed_graph.shape[0])
                precomputed_graph = precomputed_graph[:, :args.dimensionsPCA]
                # cluster_object.fit(precomputed_graph[:, :args.dimensionsPCA])
        if not args.noUMAP:
            if umap_params_dict is None:
                reducer = umap.UMAP()
            else:
                reducer = umap.UMAP(n_neighbors=umap_params_dict['umap_n_neighbors'], n_components=umap_params_dict['umap_n_components'],
                                    metric=umap_params_dict['umap_metric'], n_epochs=umap_params_dict['umap_n_epochs'],
                                    learning_rate=umap_params_dict['umap_learning_rate'], init=umap_params_dict['umap_init'],
                                    min_dist=umap_params_dict['umap_min_dist'], spread=umap_params_dict['umap_spread'],
                                    set_op_mix_ratio=umap_params_dict['umap_set_op_mix_ratio'], local_connectivity=umap_params_dict['umap_local_connectivity'],
                                    repulsion_strength=umap_params_dict['umap_repulsion_strength'], negative_sample_rate=umap_params_dict['umap_negative_sample_rate'],
                                    transform_queue_size=umap_params_dict['umap_transform_queue_size'], a=umap_params_dict['umap_a'], b=umap_params_dict['umap_b'],
                                    angular_rp_forest=umap_params_dict['umap_angular_rp_forest'], target_n_neighbors=umap_params_dict['umap_target_n_neighbors'],
                                    target_metric=umap_params_dict['umap_target_metric'], target_weight=umap_params_dict['umap_target_weight'],
                                    random_state=umap_params_dict['umap_random'], force_approximation_algorithm=umap_params_dict['umap_force_approximation_algorithm'],
                                    verbose=umap_params_dict['umap_verbose'], unique=umap_params_dict['umap_unique'])
            precomputed_graph = reducer.fit_transform(precomputed_graph)
            precomputed_graph = np.nan_to_num(precomputed_graph)
            precomputed_graph[np.isinf(precomputed_graph)] = 0

        try:
            cluster_object.fit(precomputed_graph)
        except Exception:
            cluster_object.fit(precomputed_graph.todense())

        minHashClustering = MinHashClustering(minHashObject=minHash_object, clusteringObject=cluster_object)
        minHashClustering._precomputed_graph = precomputed_graph

    else:
        neighborhood_matrix, matrices_list = create_csr_matrix_all_cells(args.matrix, args.threads, args.chromosomes, outputFolder, raw_file_name, args.intraChromosomalContactsOnly, pDistance=args.distance)

        if args.saveIntermediateRawMatrix:
            save_npz(args.saveIntermediateRawMatrix, neighborhood_matrix)

    if not args.saveMemory:
        minHash_object = MinHash(n_neighbors=args.numberOfNearestNeighbors, number_of_hash_functions=args.numberOfHashFunctions,
                                 number_of_cores=args.threads, shingle_size=5, fast=args.euclideanModeMinHash,
                                 maxFeatures=int(max(neighborhood_matrix.getnnz(1))), absolute_numbers=False, max_bin_size=100000,
                                 minimal_blocks_in_common=100, excess_factor=1, prune_inverse_index=False)
        minHashClustering = MinHashClustering(minHashObject=minHash_object, clusteringObject=cluster_object)
        minHashClustering.fit(X=neighborhood_matrix, pSaveMemory=args.shareOfMatrixToBeTransferred,
                              pPca=(not args.noPCA), pPcaDimensions=args.dimensionsPCA,
                              pUmap=(not args.noUMAP), pUmapDict=umap_params_dict)

    if args.noPCA and args.noUMAP:
        mask = np.isnan(minHashClustering._precomputed_graph.data)
        minHashClustering._precomputed_graph.data[mask] = 0
        mask = np.isinf(minHashClustering._precomputed_graph.data)
        minHashClustering._precomputed_graph.data[mask] = 0

    labels_clustering = minHashClustering.predict(minHashClustering._precomputed_graph, pPca=args.noPCA, pPcaDimensions=args.dimensionsPCA)

    if args.createScatterPlot:
        if args.noPCA and args.noUMAP:
            pca = PCA(n_components=min(minHashClustering._precomputed_graph.shape) - 1)
            neighborhood_matrix_knn = pca.fit_transform(minHashClustering._precomputed_graph.todense())
        else:
            neighborhood_matrix_knn = minHashClustering._precomputed_graph

        # no-op: result of list(set(...)) is discarded
        list(set(labels_clustering))

        colors = process_cmap(args.colorMap)

        try:
            neighborhood_matrix_knn = neighborhood_matrix_knn.toarray()
        except Exception:
            pass

        label_x = 'PC1'
        label_y = 'PC2'
        if not (args.noUMAP):
            label_x = 'UMAP1'
            label_y = 'UMAP2'
        if args.cell_coloring_type:
            if len(colors) < len(cell_type_color_dict):
                log.error('The chosen colormap offers too few colors for the number of cell types.')
                exit(1)

            labels_clustering_cell_type = []
            for cell_name in matrices_list:
                labels_clustering_cell_type.append(cell_type_color_dict[cell_name_cell_type_dict[cell_name]])

            labels_clustering_cell_type = np.array(labels_clustering_cell_type)

            log.debug('labels_clustering_cell_type: {}'.format(len(labels_clustering_cell_type)))
            log.debug('matrices_list: {}'.format(len(matrices_list)))

            plt.figure(figsize=(args.figuresize[0], args.figuresize[1]))

            for i, color in enumerate(colors[:len(cell_type_color_dict)]):
                mask = labels_clustering_cell_type == i
                log.debug('plot cluster: {} {}'.format(color_cell_type_dict[i], np.sum(mask)))
                plt.scatter(neighborhood_matrix_knn[:, 0].T[mask], neighborhood_matrix_knn[:, 1].T[mask], color=color, label=str(color_cell_type_dict[i]), s=20, alpha=0.7)
            plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=args.fontsize)
            plt.xticks([])
            plt.yticks([])
            plt.xlabel(label_x, fontsize=args.fontsize)
            plt.ylabel(label_y, fontsize=args.fontsize)
            if '.' not in args.createScatterPlot:
                args.createScatterPlot += '.png'
            scatter_plot_name = '.'.join(args.createScatterPlot.split('.')[:-1]) + '_cell_color.' + args.createScatterPlot.split('.')[-1]
            plt.tight_layout()
            plt.savefig(scatter_plot_name, dpi=args.dpi)
            plt.close()

            # compute overlap of cell types and computed clusters
            computed_clusters = set(labels_clustering)
            cell_type_amounts_dict = {}
            percentage_threshold = 0.8

            if args.latexTable:
                for threshold in [0.7, 0.8, 0.9]:
                    cell_type_amounts_dict[threshold] = {}
                with open(args.latexTable, 'w') as matches_file:
                    header = '\\begin{table}[!htb]\n\\footnotesize\n\\begin{tabular}{|l'
                    body = '\\hline Cluster '
                    for i in range(len(color_cell_type_dict)):
                        mask_cell_type = labels_clustering_cell_type == i
                        header += '|c'
                        body += '& ' + str(color_cell_type_dict[i]) + ' (' + str(np.sum(mask_cell_type)) + ' cells)'
                    header += '|}\n'
                    body += '\\\\\n'
                    # body = ''
                    for i in computed_clusters:
                        body += '\\hline Cluster ' + str(i)
                        mask_computed_clusters = labels_clustering == i
                        body += ' (' + str(np.sum(mask_computed_clusters)) + ' cells)'
                        for j in range(len(cell_type_color_dict)):
                            mask_cell_type = labels_clustering_cell_type == j
                            mask = mask_computed_clusters & mask_cell_type
                            number_of_matches = np.sum(mask)
                            body += '& ' + str(number_of_matches)

                            if number_of_matches != 1:
                                body += ' cells / '
                            else:
                                body += ' cell / '

                            body += '{:.2f}'.format((number_of_matches / np.sum(mask_computed_clusters)) * 100) + ' \\% '
                            for threshold in [0.7, 0.8, 0.9]:
                                if number_of_matches / np.sum(mask_computed_clusters) >= threshold:
                                    if color_cell_type_dict[j] in cell_type_amounts_dict[threshold]:
                                        cell_type_amounts_dict[threshold][color_cell_type_dict[j]] += number_of_matches
                                    else:
                                        cell_type_amounts_dict[threshold][color_cell_type_dict[j]] = number_of_matches
                                else:
                                    if color_cell_type_dict[j] in cell_type_amounts_dict[threshold]:
                                        continue
                                    else:
                                        cell_type_amounts_dict[threshold][color_cell_type_dict[j]] = 0
                        body += '\\\\\n'
                    body += '\\hline ' + '&' * len(cell_type_color_dict) + '\\\\\n'
                    for threshold in [0.7, 0.8, 0.9]:
                        body += '\\hline Correct identified $>{}\\%$'.format(int(threshold * 100))
                        for i in range(len(cell_type_color_dict)):
                            mask_cell_type = labels_clustering_cell_type == i
                            if color_cell_type_dict[i] in cell_type_amounts_dict[threshold]:
                                body += '& ' + str(cell_type_amounts_dict[threshold][color_cell_type_dict[i]]) + ' / ' + str(np.sum(mask_cell_type)) + ' ('
                                body += '{:.2f}'.format((cell_type_amounts_dict[threshold][color_cell_type_dict[i]] / np.sum(mask_cell_type)) * 100)
                            else:
                                body += '& ' + str(0) + ' / ' + str(np.sum(mask_cell_type)) + ' ('
                                body += '{:.2f}'.format(0 / np.sum(mask_cell_type))
                            body += ' \\%)'
                        body += '\\\\\n'
                    body += '\\hline \n'
                    body += '\\end{tabular}\n\\caption{}\n\\end{table}'
                    matches_file.write(header)
                    matches_file.write(body)
            else:
                with open('matches.txt', 'w') as matches_file:
                    for i in computed_clusters:
                        mask_computed_clusters = labels_clustering == i
                        for j in range(len(cell_type_color_dict)):
                            mask_cell_type = labels_clustering_cell_type == j
                            mask = mask_computed_clusters & mask_cell_type
                            number_of_matches = np.sum(mask)
                            matches_file.write('Computed cluster {} (size: {}) matching with cell type {} (size: {}) {} times. Rate (matches/computed_clusters): {}%\n'.format(
                                i, np.sum(mask_computed_clusters), color_cell_type_dict[j], np.sum(mask_cell_type), number_of_matches, number_of_matches / np.sum(mask_computed_clusters)))

                            if number_of_matches / np.sum(mask_computed_clusters) >= percentage_threshold:
                                if color_cell_type_dict[j] in cell_type_amounts_dict:
                                    cell_type_amounts_dict[color_cell_type_dict[j]] += number_of_matches
                                else:
                                    cell_type_amounts_dict[color_cell_type_dict[j]] = number_of_matches
                        matches_file.write('\n')

                all_detected = 0
                all_possible = 0
                for i in range(len(cell_type_color_dict)):
                    mask_cell_type = labels_clustering_cell_type == i
                    all_possible += np.sum(mask_cell_type)
                    if color_cell_type_dict[i] in cell_type_amounts_dict:
                        all_detected += cell_type_amounts_dict[color_cell_type_dict[i]]
                        cell_type_amounts_dict[color_cell_type_dict[i]] /= np.sum(mask_cell_type)
                    else:
                        cell_type_amounts_dict[color_cell_type_dict[i]] = 0.0
                correct_associated = 0.0
                for cell_iterator in cell_type_color_dict:
                    correct_associated += cell_type_amounts_dict[cell_iterator]

                correct_associated /= len(cell_type_amounts_dict)

                # all_detected /= all_possible
                # correct_associated = ((correct_associated*4) + (all_detected)) / 5
                # correct_associated = correct_associated

                with open('correct_associated', 'w') as file:
                    file.write(str(correct_associated))

        if args.cell_coloring_batch:
            if len(colors) < len(cell_type_color_dict_batch):
                log.error('The chosen colormap offers too few colors for the number of batches.')
                exit(1)

            labels_clustering_cell_type_batch = []
            for cell_name in matrices_list:
                labels_clustering_cell_type_batch.append(cell_type_color_dict_batch[cell_name_cell_type_dict_batch[cell_name]])

            labels_clustering_cell_type_batch = np.array(labels_clustering_cell_type_batch)

            log.debug('labels_clustering_cell_type: {}'.format(len(labels_clustering_cell_type_batch)))
            log.debug('matrices_list: {}'.format(len(matrices_list)))

            plt.figure(figsize=(args.figuresize[0], args.figuresize[1]))

            for i, color in enumerate(colors[:len(cell_type_color_dict_batch)]):
                mask = labels_clustering_cell_type_batch == i
                log.debug('plot cluster: {} {}'.format(color_cell_type_dict_batch[i], np.sum(mask)))
                plt.scatter(neighborhood_matrix_knn[:, 0].T[mask], neighborhood_matrix_knn[:, 1].T[mask], color=color, label=str(color_cell_type_dict_batch[i]), s=20, alpha=0.7)
            plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=args.fontsize)
            plt.xticks([])
            plt.yticks([])
            plt.xlabel(label_x, fontsize=args.fontsize)
            plt.ylabel(label_y, fontsize=args.fontsize)
            if '.' not in args.createScatterPlot:
                args.createScatterPlot += '.png'
            scatter_plot_name = '.'.join(args.createScatterPlot.split('.')[:-1]) + '_cell_color_batch.' + args.createScatterPlot.split('.')[-1]
            plt.tight_layout()
            plt.savefig(scatter_plot_name, dpi=args.dpi)
            plt.close()

        plt.figure(figsize=(args.figuresize[0], args.figuresize[1]))
        for i, color in enumerate(colors[:args.numberOfClusters]):
            mask = labels_clustering == i
            plt.scatter(neighborhood_matrix_knn[:, 0].T[mask], neighborhood_matrix_knn[:, 1].T[mask], color=color, label=str(i), s=20, alpha=0.7)
        plt.legend(fontsize=args.fontsize)
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=args.fontsize)
        plt.xticks([])
        plt.yticks([])
        plt.xlabel(label_x, fontsize=args.fontsize)
        plt.ylabel(label_y, fontsize=args.fontsize)
        if '.' not in args.createScatterPlot:
            args.createScatterPlot += '.png'
        scatter_plot_name = '.'.join(args.createScatterPlot.split('.')[:-1]) + '.' + args.createScatterPlot.split('.')[-1]
        plt.tight_layout()
        plt.savefig(scatter_plot_name, dpi=args.dpi)
        plt.close()

    matrices_cluster = list(zip(matrices_list, labels_clustering))
    np.savetxt(args.outFileName, matrices_cluster, fmt="%s")
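# --- Illustrative aside (not part of scHiCExplorer) --------------------------
# The output written above is a two-column text file: one line per cell with
# the cell name and its cluster label, separated by a space. Tools such as the
# cluster profile plotter read this format back via `line.split(' ')[1]`.
# A minimal, hypothetical reader assuming that layout:
def read_cluster_assignment(pFileName):
    assignment = {}
    with open(pFileName, 'r') as file:
        for line in file:
            cell_name, cluster_id = line.strip().split(' ')
            assignment[cell_name] = int(cluster_id)
    return assignment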
def main(args=None):
    args = parse_arguments().parse_args(args)

    matrices_name = args.matrix
    threads = args.threads
    matrices_list = cell_name_list(matrices_name)

    if args.createSubmatrix is not None and args.regions is None and args.chromosomes is None:
        for matrix in matrices_list[:args.createSubmatrix]:
            cooler.fileops.cp(args.matrix + '::' + matrix, args.outFileName + '::' + matrix)
        exit(0)

    input_count_matrices = len(matrices_list)
    if threads > len(matrices_list):
        threads = len(matrices_list)

    # load bin ids only once
    cooler_obj_external = cooler.Cooler(matrices_name + '::' + matrices_list[0])
    bins = cooler_obj_external.bins()[:]

    # apply the inverted operation if the number of values is less
    # the idea is that for
    #   indices = pixels['bin1_id'].apply(lambda x: x in pListIds)
    # the search time is less if the list pListIds is shorter
    # therefore the drop must be inverted too
    apply_inverted = False
    if args.action == 'keep':
        list_ids = bins.index[bins['chrom'].apply(lambda x: x in args.chromosomes)].tolist()
        list_inverted_logic_ids = bins.index[bins['chrom'].apply(lambda x: x not in args.chromosomes)].tolist()
        bins_new = bins[bins['chrom'].apply(lambda x: x in args.chromosomes)].reset_index()
    else:
        list_ids = bins.index[bins['chrom'].apply(lambda x: x not in args.chromosomes)].tolist()
        list_inverted_logic_ids = bins.index[bins['chrom'].apply(lambda x: x in args.chromosomes)].tolist()
        bins_new = bins[bins['chrom'].apply(lambda x: x not in args.chromosomes)].reset_index()

    if len(list_inverted_logic_ids) < len(list_ids):
        apply_inverted = True
        list_ids = list_inverted_logic_ids

    dict_values = bins_new['index'].to_dict()
    inv_map = {}
    for k, v in dict_values.items():
        if k == v:
            continue
        inv_map[v] = k
    bins_new.drop(['index'], axis=1, inplace=True)

    all_data_collected = False
    thread_done = [False] * threads
    pixels_thread = [None] * threads
    keep_thread = [None] * threads

    matricesPerThread = len(matrices_list) // threads
    queue = [None] * threads
    process = [None] * threads
    for i in range(threads):
        if i < threads - 1:
            matrices_name_list = matrices_list[i * matricesPerThread:(i + 1) * matricesPerThread]
        else:
            matrices_name_list = matrices_list[i * matricesPerThread:]

        queue[i] = Queue()
        process[i] = Process(target=compute_adjust_matrix, kwargs=dict(
            pMatrixName=matrices_name,
            pMatricesList=matrices_name_list,
            pArgs=args,
            pListIds=list_ids,
            pInvertedMap=inv_map,
            pInvertedLogic=apply_inverted,
            pQueue=queue[i]
        ))
        process[i].start()

    while not all_data_collected:
        for i in range(threads):
            if queue[i] is not None and not queue[i].empty():
                pixels_thread[i], keep_thread[i] = queue[i].get()
                queue[i] = None
                process[i].join()
                process[i].terminate()
                process[i] = None
                thread_done[i] = True
        all_data_collected = True
        for thread in thread_done:
            if not thread:
                all_data_collected = False
        time.sleep(1)

    pixels_list = [item for sublist in pixels_thread for item in sublist]
    keep_list = [item for sublist in keep_thread for item in sublist]

    matrices_list = np.array(matrices_list)
    mask = np.array(keep_list)
    matrices_list = matrices_list[mask]

    matrixFileHandler = MatrixFileHandler(pFileType='scool')
    matrixFileHandler.matrixFile.bins = bins_new
    matrixFileHandler.matrixFile.pixel_list = pixels_list
    matrixFileHandler.matrixFile.name_list = matrices_list
    matrixFileHandler.save(args.outFileName, pSymmetric=True, pApplyCorrection=False)

    broken_count = input_count_matrices - np.sum(np.array(keep_list))
    print('Out of {} matrices, {} were removed because they were broken.'.format(input_count_matrices, broken_count))
def main(args=None):
    args = parse_arguments().parse_args(args)

    matrices_name = args.matrix
    threads = args.threads
    matrices_list = cell_name_list(matrices_name)
    if threads > len(matrices_list):
        threads = len(matrices_list)
    compartments_matrix = None

    all_data_collected = False
    thread_done = [False] * threads
    length_index = [None] * threads
    length_index[0] = 0
    matricesPerThread = len(matrices_list) // threads
    queue = [None] * threads
    process = [None] * threads
    for i in range(threads):
        if i < threads - 1:
            matrices_name_list = matrices_list[i * matricesPerThread:(i + 1) * matricesPerThread]
            length_index[i + 1] = length_index[i] + len(matrices_name_list)
        else:
            matrices_name_list = matrices_list[i * matricesPerThread:]

        queue[i] = Queue()
        process[i] = Process(target=open_and_store_matrix, kwargs=dict(
            pMatrixName=matrices_name,
            pMatricesList=matrices_name_list,
            pIndex=length_index[i],
            pXDimension=len(matrices_list),
            pChromosomes=args.chromosomes,
            pNorm=args.norm,
            pExtraTrack=args.extraTrack,
            pHistonMarkType=args.histonMarkType,
            pBinarization=args.binarization,
            pQueue=queue[i]
        ))
        process[i].start()

    while not all_data_collected:
        for i in range(threads):
            if queue[i] is not None and not queue[i].empty():
                compartments_worker = queue[i].get()
                if compartments_matrix is None:
                    compartments_matrix = compartments_worker
                else:
                    compartments_matrix += compartments_worker

                queue[i] = None
                process[i].join()
                process[i].terminate()
                process[i] = None
                thread_done[i] = True
        all_data_collected = True
        for thread in thread_done:
            if not thread:
                all_data_collected = False
        time.sleep(1)

    if args.clusterMethod == 'spectral':
        spectral_clustering = SpectralClustering(n_clusters=args.numberOfClusters, n_jobs=args.threads, random_state=0)
        labels_clustering = spectral_clustering.fit_predict(compartments_matrix)
    elif args.clusterMethod == 'kmeans':
        kmeans_object = KMeans(n_clusters=args.numberOfClusters, random_state=0, n_jobs=args.threads, precompute_distances=True)
        labels_clustering = kmeans_object.fit_predict(compartments_matrix)

    matrices_cluster = list(zip(matrices_list, labels_clustering))
    np.savetxt(args.outFileName, matrices_cluster, fmt="%s")
def main(args=None):
    args = parse_arguments().parse_args(args)

    matrices_name = args.matrix
    threads = args.threads
    matrices_list = cell_name_list(matrices_name)

    svl_matrix = None

    all_data_collected = False
    thread_done = [False] * threads
    length_index = [None] * threads
    length_index[0] = 0
    matricesPerThread = len(matrices_list) // threads
    queue = [None] * threads
    process = [None] * threads
    for i in range(threads):
        if i < threads - 1:
            matrices_name_list = matrices_list[i * matricesPerThread:(i + 1) * matricesPerThread]
            length_index[i + 1] = length_index[i] + len(matrices_name_list)
        else:
            matrices_name_list = matrices_list[i * matricesPerThread:]

        queue[i] = Queue()
        process[i] = Process(target=create_svl_data, kwargs=dict(
            pMatrixName=matrices_name,
            pMatricesList=matrices_name_list,
            pIndex=length_index[i],
            pXDimension=len(matrices_list),
            pDistanceMin=args.distanceShortRange,
            pDistanceMax=args.distanceLongRange,
            pQueue=queue[i]
        ))
        process[i].start()

    while not all_data_collected:
        for i in range(threads):
            if queue[i] is not None and not queue[i].empty():
                csr_matrix_worker = queue[i].get()
                if svl_matrix is None:
                    svl_matrix = csr_matrix_worker
                else:
                    svl_matrix += csr_matrix_worker

                queue[i] = None
                process[i].join()
                process[i].terminate()
                process[i] = None
                thread_done[i] = True
        all_data_collected = True
        for thread in thread_done:
            if not thread:
                all_data_collected = False
        time.sleep(1)

    if args.clusterMethod == 'spectral':
        spectral_clustering = SpectralClustering(n_clusters=args.numberOfClusters, affinity='nearest_neighbors', n_jobs=args.threads, random_state=0)
        labels_clustering = spectral_clustering.fit_predict(svl_matrix)
    elif args.clusterMethod == 'kmeans':
        kmeans_object = KMeans(n_clusters=args.numberOfClusters, random_state=0, n_jobs=args.threads, precompute_distances=True)
        labels_clustering = kmeans_object.fit_predict(svl_matrix)

    matrices_cluster = list(zip(matrices_list, labels_clustering))
    np.savetxt(args.outFileName, matrices_cluster, fmt="%s")